/*
 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */
352934Sktlim@umich.edu
362934Sktlim@umich.edu#include "arch/hsail/generic_types.hh"
372934Sktlim@umich.edu#include "gpu-compute/hsail_code.hh"
382934Sktlim@umich.edu
392934Sktlim@umich.edu// defined in code.cc, but not worth sucking in all of code.h for this
402934Sktlim@umich.edu// at this point
413898Ssaidi@eecs.umich.eduextern const char *segmentNames[];
423898Ssaidi@eecs.umich.edu
433898Ssaidi@eecs.umich.edunamespace HsailISA
443898Ssaidi@eecs.umich.edu{
453898Ssaidi@eecs.umich.edu    template<typename DestDataType, typename AddrRegOperandType>
463898Ssaidi@eecs.umich.edu    void
473898Ssaidi@eecs.umich.edu    LdaInst<DestDataType, AddrRegOperandType>::generateDisassembly()
483898Ssaidi@eecs.umich.edu    {
492934Sktlim@umich.edu        this->disassembly = csprintf("%s_%s %s,%s", this->opcode,
502934Sktlim@umich.edu                                     DestDataType::label,
512934Sktlim@umich.edu                                     this->dest.disassemble(),
522934Sktlim@umich.edu                                     this->addr.disassemble());
532934Sktlim@umich.edu    }
542934Sktlim@umich.edu
552934Sktlim@umich.edu    template<typename DestDataType, typename AddrRegOperandType>
563005Sstever@eecs.umich.edu    void
572934Sktlim@umich.edu    LdaInst<DestDataType, AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
583005Sstever@eecs.umich.edu    {
593005Sstever@eecs.umich.edu        Wavefront *w = gpuDynInst->wavefront();
603304Sstever@eecs.umich.edu
612995Ssaidi@eecs.umich.edu        typedef typename DestDataType::CType CType M5_VAR_USED;
622934Sktlim@umich.edu        const VectorMask &mask = w->get_pred();
632934Sktlim@umich.edu        uint64_t addr_vec[VSZ];
642934Sktlim@umich.edu        this->addr.calcVector(w, addr_vec);
652995Ssaidi@eecs.umich.edu
662934Sktlim@umich.edu        for (int lane = 0; lane < VSZ; ++lane) {
672934Sktlim@umich.edu            if (mask[lane]) {
682934Sktlim@umich.edu                this->dest.set(w, lane, addr_vec[lane]);
692934Sktlim@umich.edu            }
702934Sktlim@umich.edu        }
712995Ssaidi@eecs.umich.edu    }
722934Sktlim@umich.edu
732934Sktlim@umich.edu    template<typename MemDataType, typename DestDataType,
742934Sktlim@umich.edu             typename AddrRegOperandType>
752934Sktlim@umich.edu    void
762934Sktlim@umich.edu    LdInst<MemDataType, DestDataType, AddrRegOperandType>::generateDisassembly()
772995Ssaidi@eecs.umich.edu    {
782934Sktlim@umich.edu        switch (num_dest_operands) {
792934Sktlim@umich.edu          case 1:
802953Sktlim@umich.edu            this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode,
812934Sktlim@umich.edu                                         segmentNames[this->segment],
822934Sktlim@umich.edu                                         MemDataType::label,
833449Shsul@eecs.umich.edu                                         this->dest.disassemble(),
842934Sktlim@umich.edu                                         this->addr.disassemble());
852934Sktlim@umich.edu            break;
862934Sktlim@umich.edu          case 2:
872934Sktlim@umich.edu            this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode,
882934Sktlim@umich.edu                                         segmentNames[this->segment],
893584Ssaidi@eecs.umich.edu                                         MemDataType::label,
903584Ssaidi@eecs.umich.edu                                         this->dest_vect[0].disassemble(),
913584Ssaidi@eecs.umich.edu                                         this->dest_vect[1].disassemble(),
923584Ssaidi@eecs.umich.edu                                         this->addr.disassemble());
933584Ssaidi@eecs.umich.edu            break;
943584Ssaidi@eecs.umich.edu          case 4:
953743Sgblack@eecs.umich.edu            this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s",
963584Ssaidi@eecs.umich.edu                                         this->opcode,
973743Sgblack@eecs.umich.edu                                         segmentNames[this->segment],
983743Sgblack@eecs.umich.edu                                         MemDataType::label,
993743Sgblack@eecs.umich.edu                                         this->dest_vect[0].disassemble(),
1003823Ssaidi@eecs.umich.edu                                         this->dest_vect[1].disassemble(),
1013814Ssaidi@eecs.umich.edu                                         this->dest_vect[2].disassemble(),
1023743Sgblack@eecs.umich.edu                                         this->dest_vect[3].disassemble(),
1033743Sgblack@eecs.umich.edu                                         this->addr.disassemble());
1043584Ssaidi@eecs.umich.edu            break;
1053814Ssaidi@eecs.umich.edu          default:
1063584Ssaidi@eecs.umich.edu            fatal("Bad ld register dest operand, num vector operands: %d \n",
1073745Sgblack@eecs.umich.edu                  num_dest_operands);
1083745Sgblack@eecs.umich.edu            break;
1093745Sgblack@eecs.umich.edu        }
1103584Ssaidi@eecs.umich.edu    }
1113898Ssaidi@eecs.umich.edu
1123898Ssaidi@eecs.umich.edu    static Addr
1133898Ssaidi@eecs.umich.edu    calcPrivAddr(Addr addr, Wavefront *w, int lane, GPUStaticInst *i)
1143584Ssaidi@eecs.umich.edu    {
1153584Ssaidi@eecs.umich.edu        // what is the size of the object we are accessing??
1163584Ssaidi@eecs.umich.edu        // NOTE: the compiler doesn't generate enough information
1173745Sgblack@eecs.umich.edu        // to do this yet..have to just line up all the private
1183745Sgblack@eecs.umich.edu        // work-item spaces back to back for now
1193745Sgblack@eecs.umich.edu        /*
1203584Ssaidi@eecs.umich.edu        StorageElement* se =
1213584Ssaidi@eecs.umich.edu            i->parent->findSymbol(Brig::BrigPrivateSpace, addr);
1223584Ssaidi@eecs.umich.edu        assert(se);
1233584Ssaidi@eecs.umich.edu
1243025Ssaidi@eecs.umich.edu        return w->wfSlotId * w->privSizePerItem * VSZ +
1252934Sktlim@umich.edu            se->offset * VSZ +
1262995Ssaidi@eecs.umich.edu            lane * se->size;
1272995Ssaidi@eecs.umich.edu        */
1283025Ssaidi@eecs.umich.edu
1293025Ssaidi@eecs.umich.edu        // addressing strategy: interleave the private spaces of
1303025Ssaidi@eecs.umich.edu        // work-items in a wave-front on 8 byte granularity.
1313025Ssaidi@eecs.umich.edu        // this won't be perfect coalescing like the spill space
1323025Ssaidi@eecs.umich.edu        // strategy, but it's better than nothing. The spill space
1332934Sktlim@umich.edu        // strategy won't work with private because the same address
1342934Sktlim@umich.edu        // may be accessed by different sized loads/stores.
1352934Sktlim@umich.edu
136        // Note: I'm assuming that the largest load/store to private
137        // is 8 bytes. If it is larger, the stride will have to increase
138
139        Addr addr_div8 = addr / 8;
140        Addr addr_mod8 = addr % 8;
141
142        Addr ret = addr_div8 * 8 * VSZ + lane * 8 + addr_mod8 + w->privBase;
143
144        assert(ret < w->privBase + (w->privSizePerItem * VSZ));
145
146        return ret;
147    }
148
149    template<typename MemDataType, typename DestDataType,
150             typename AddrRegOperandType>
151    void
152    LdInst<MemDataType, DestDataType,
153           AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
154    {
155        Wavefront *w = gpuDynInst->wavefront();
156
157        typedef typename MemDataType::CType MemCType;
158        const VectorMask &mask = w->get_pred();
159
160        // Kernarg references are handled uniquely for now (no Memory Request
161        // is used), so special-case them up front.  Someday we should
162        // make this more realistic, at which we should get rid of this
163        // block and fold this case into the switch below.
164        if (this->segment == Brig::BRIG_SEGMENT_KERNARG) {
165            MemCType val;
166
167            // I assume no vector ld for kernargs
168            assert(num_dest_operands == 1);
169
170            // assuming for the moment that we'll never do register
171            // offsets into kernarg space... just to make life simpler
172            uint64_t address = this->addr.calcUniform();
173
174            val = *(MemCType*)&w->kernelArgs[address];
175
176            DPRINTF(HSAIL, "ld_kernarg [%d] -> %d\n", address, val);
177
178            for (int lane = 0; lane < VSZ; ++lane) {
179                if (mask[lane]) {
180                    this->dest.set(w, lane, val);
181                }
182            }
183
184            return;
185        } else if (this->segment == Brig::BRIG_SEGMENT_ARG) {
186            uint64_t address = this->addr.calcUniform();
187            for (int lane = 0; lane < VSZ; ++lane) {
188                if (mask[lane]) {
189                    MemCType val = w->readCallArgMem<MemCType>(lane, address);
190
191                    DPRINTF(HSAIL, "ld_arg [%d] -> %llu\n", address,
192                            (unsigned long long)val);
193
194                    this->dest.set(w, lane, val);
195                }
196            }
197
198            return;
199        }
200
201        GPUDynInstPtr m = gpuDynInst;
202
203        this->addr.calcVector(w, m->addr);
204
205        m->m_op = Enums::MO_LD;
206        m->m_type = MemDataType::memType;
207        m->v_type = DestDataType::vgprType;
208
209        m->exec_mask = w->execMask();
210        m->statusBitVector = 0;
211        m->equiv = this->equivClass;
212        m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);
213
214        m->scope = getGenericMemoryScope(this->memoryScope);
215
216        if (num_dest_operands == 1) {
217            m->dst_reg = this->dest.regIndex();
218            m->n_reg = 1;
219        } else {
220            m->n_reg = num_dest_operands;
221            for (int i = 0; i < num_dest_operands; ++i) {
222                m->dst_reg_vec[i] = this->dest_vect[i].regIndex();
223            }
224        }
225
226        m->simdId = w->simdId;
227        m->wfSlotId = w->wfSlotId;
228        m->wfDynId = w->wfDynId;
229        m->kern_id = w->kern_id;
230        m->cu_id = w->computeUnit->cu_id;
231        m->latency.init(&w->computeUnit->shader->tick_cnt);
232
233        switch (this->segment) {
234          case Brig::BRIG_SEGMENT_GLOBAL:
235            m->s_type = SEG_GLOBAL;
236            m->pipeId = GLBMEM_PIPE;
237            m->latency.set(w->computeUnit->shader->ticks(1));
238
239            // this is a complete hack to get around a compiler bug
240            // (the compiler currently generates global access for private
241            //  addresses (starting from 0). We need to add the private offset)
242            for (int lane = 0; lane < VSZ; ++lane) {
243                if (m->addr[lane] < w->privSizePerItem) {
244                    if (mask[lane]) {
245                        // what is the size of the object we are accessing?
246                        // find base for for this wavefront
247
248                        // calcPrivAddr will fail if accesses are unaligned
249                        assert(!((sizeof(MemCType) - 1) & m->addr[lane]));
250
251                        Addr privAddr = calcPrivAddr(m->addr[lane], w, lane,
252                                                     this);
253
254                        m->addr[lane] = privAddr;
255                    }
256                }
257            }
258
259            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
260            w->outstanding_reqs_rd_gm++;
261            w->rd_gm_reqs_in_pipe--;
262            break;
263
264          case Brig::BRIG_SEGMENT_SPILL:
265            assert(num_dest_operands == 1);
266            m->s_type = SEG_SPILL;
267            m->pipeId = GLBMEM_PIPE;
268            m->latency.set(w->computeUnit->shader->ticks(1));
269            {
270                for (int lane = 0; lane < VSZ; ++lane) {
271                    //  note: this calculation will NOT WORK if the compiler
272                    //  ever generates loads/stores to the same address with
273                    //  different widths (e.g., a ld_u32 addr and a ld_u16 addr)
274                    if (mask[lane]) {
275                        assert(m->addr[lane] < w->spillSizePerItem);
276
277                        m->addr[lane] = m->addr[lane] * w->spillWidth +
278                                        lane * sizeof(MemCType) + w->spillBase;
279
280                        w->last_addr[lane] = m->addr[lane];
281                    }
282                }
283            }
284
285            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
286            w->outstanding_reqs_rd_gm++;
287            w->rd_gm_reqs_in_pipe--;
288            break;
289
290          case Brig::BRIG_SEGMENT_GROUP:
291            m->s_type = SEG_SHARED;
292            m->pipeId = LDSMEM_PIPE;
293            m->latency.set(w->computeUnit->shader->ticks(24));
294            w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
295            w->outstanding_reqs_rd_lm++;
296            w->rd_lm_reqs_in_pipe--;
297            break;
298
299          case Brig::BRIG_SEGMENT_READONLY:
300            m->s_type = SEG_READONLY;
301            m->pipeId = GLBMEM_PIPE;
302            m->latency.set(w->computeUnit->shader->ticks(1));
303
304            for (int lane = 0; lane < VSZ; ++lane) {
305                if (mask[lane]) {
306                    assert(m->addr[lane] + sizeof(MemCType) <= w->roSize);
307                    m->addr[lane] += w->roBase;
308                }
309            }
310
311            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
312            w->outstanding_reqs_rd_gm++;
313            w->rd_gm_reqs_in_pipe--;
314            break;
315
316          case Brig::BRIG_SEGMENT_PRIVATE:
317            m->s_type = SEG_PRIVATE;
318            m->pipeId = GLBMEM_PIPE;
319            m->latency.set(w->computeUnit->shader->ticks(1));
320            {
321                for (int lane = 0; lane < VSZ; ++lane) {
322                    if (mask[lane]) {
323                        assert(m->addr[lane] < w->privSizePerItem);
324
325                        m->addr[lane] = m->addr[lane] +
326                            lane * sizeof(MemCType) + w->privBase;
327                    }
328                }
329            }
330            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
331            w->outstanding_reqs_rd_gm++;
332            w->rd_gm_reqs_in_pipe--;
333            break;
334
335          default:
336            fatal("Load to unsupported segment %d %llxe\n", this->segment,
337                  m->addr[0]);
338        }
339
340        w->outstanding_reqs++;
341        w->mem_reqs_in_pipe--;
342    }
343
    template<typename OperationType, typename SrcDataType,
             typename AddrRegOperandType>
    void
    StInst<OperationType, SrcDataType,
           AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
    {
        // Issue an HSAIL store: gather per-lane source data into the dynamic
        // instruction's data buffer, translate per-lane addresses for the
        // target segment, and push the request into the appropriate memory
        // pipeline.  Arg-segment stores bypass the memory system entirely.
        Wavefront *w = gpuDynInst->wavefront();

        typedef typename OperationType::CType CType;

        const VectorMask &mask = w->get_pred();

        // arg references are handled uniquely for now (no Memory Request
        // is used), so special-case them up front.  Someday we should
        // make this more realistic, at which we should get rid of this
        // block and fold this case into the switch below.
        if (this->segment == Brig::BRIG_SEGMENT_ARG) {
            uint64_t address = this->addr.calcUniform();

            for (int lane = 0; lane < VSZ; ++lane) {
                if (mask[lane]) {
                    CType data = this->src.template get<CType>(w, lane);
                    DPRINTF(HSAIL, "st_arg [%d] <- %d\n", address, data);
                    w->writeCallArgMem<CType>(lane, address, data);
                }
            }

            return;
        }

        GPUDynInstPtr m = gpuDynInst;

        m->exec_mask = w->execMask();

        this->addr.calcVector(w, m->addr);

        // Stage the store data: a single-operand store fills VSZ slots of
        // d_data; a vector store places operand k at offset k * VSZ.
        if (num_src_operands == 1) {
            for (int lane = 0; lane < VSZ; ++lane) {
                if (mask[lane]) {
                    ((CType*)m->d_data)[lane] =
                        this->src.template get<CType>(w, lane);
                }
            }
        } else {
            for (int k= 0; k < num_src_operands; ++k) {
                for (int lane = 0; lane < VSZ; ++lane) {
                    if (mask[lane]) {
                        ((CType*)m->d_data)[k * VSZ + lane] =
                            this->src_vect[k].template get<CType>(w, lane);
                    }
                }
            }
        }

        m->m_op = Enums::MO_ST;
        m->m_type = OperationType::memType;
        m->v_type = OperationType::vgprType;

        m->statusBitVector = 0;
        m->equiv = this->equivClass;

        if (num_src_operands == 1) {
            m->n_reg = 1;
        } else {
            m->n_reg = num_src_operands;
        }

        m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);

        m->scope = getGenericMemoryScope(this->memoryScope);

        // Identify the request for bookkeeping/statistics.
        m->simdId = w->simdId;
        m->wfSlotId = w->wfSlotId;
        m->wfDynId = w->wfDynId;
        m->kern_id = w->kern_id;
        m->cu_id = w->computeUnit->cu_id;
        m->latency.init(&w->computeUnit->shader->tick_cnt);

        // Per-segment address translation and pipeline selection.
        switch (this->segment) {
          case Brig::BRIG_SEGMENT_GLOBAL:
            m->s_type = SEG_GLOBAL;
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));

            // this is a complete hack to get around a compiler bug
            // (the compiler currently generates global access for private
            //  addresses (starting from 0). We need to add the private offset)
            for (int lane = 0; lane < VSZ; ++lane) {
                if (mask[lane]) {
                    if (m->addr[lane] < w->privSizePerItem) {

                        // calcPrivAddr will fail if accesses are unaligned
                        assert(!((sizeof(CType)-1) & m->addr[lane]));

                        Addr privAddr = calcPrivAddr(m->addr[lane], w, lane,
                                                     this);

                        m->addr[lane] = privAddr;
                    }
                }
            }

            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
            w->outstanding_reqs_wr_gm++;
            w->wr_gm_reqs_in_pipe--;
            break;

          case Brig::BRIG_SEGMENT_SPILL:
            assert(num_src_operands == 1);
            m->s_type = SEG_SPILL;
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));
            {
                // Spill addresses are interleaved across lanes so that
                // same-sized accesses from one wavefront can coalesce.
                for (int lane = 0; lane < VSZ; ++lane) {
                    if (mask[lane]) {
                        assert(m->addr[lane] < w->spillSizePerItem);

                        m->addr[lane] = m->addr[lane] * w->spillWidth +
                                        lane * sizeof(CType) + w->spillBase;
                    }
                }
            }

            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
            w->outstanding_reqs_wr_gm++;
            w->wr_gm_reqs_in_pipe--;
            break;

          case Brig::BRIG_SEGMENT_GROUP:
            m->s_type = SEG_SHARED;
            m->pipeId = LDSMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(24));
            w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
            w->outstanding_reqs_wr_lm++;
            w->wr_lm_reqs_in_pipe--;
            break;

          case Brig::BRIG_SEGMENT_PRIVATE:
            m->s_type = SEG_PRIVATE;
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));
            {
                for (int lane = 0; lane < VSZ; ++lane) {
                    if (mask[lane]) {
                        assert(m->addr[lane] < w->privSizePerItem);
                        m->addr[lane] = m->addr[lane] + lane *
                            sizeof(CType)+w->privBase;
                    }
                }
            }

            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
            w->outstanding_reqs_wr_gm++;
            w->wr_gm_reqs_in_pipe--;
            break;

          default:
            fatal("Store to unsupported segment %d\n", this->segment);
        }

        w->outstanding_reqs++;
        w->mem_reqs_in_pipe--;
    }
507
508    template<typename OperationType, typename SrcDataType,
509             typename AddrRegOperandType>
510    void
511    StInst<OperationType, SrcDataType,
512           AddrRegOperandType>::generateDisassembly()
513    {
514        switch (num_src_operands) {
515          case 1:
516            this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode,
517                                         segmentNames[this->segment],
518                                         OperationType::label,
519                                         this->src.disassemble(),
520                                         this->addr.disassemble());
521            break;
522          case 2:
523            this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode,
524                                         segmentNames[this->segment],
525                                         OperationType::label,
526                                         this->src_vect[0].disassemble(),
527                                         this->src_vect[1].disassemble(),
528                                         this->addr.disassemble());
529            break;
530          case 4:
531            this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s",
532                                         this->opcode,
533                                         segmentNames[this->segment],
534                                         OperationType::label,
535                                         this->src_vect[0].disassemble(),
536                                         this->src_vect[1].disassemble(),
537                                         this->src_vect[2].disassemble(),
538                                         this->src_vect[3].disassemble(),
539                                         this->addr.disassemble());
540            break;
541          default: fatal("Bad ld register src operand, num vector operands: "
542                         "%d \n", num_src_operands);
543            break;
544        }
545    }
546
    template<typename DataType, typename AddrRegOperandType, int NumSrcOperands,
             bool HasDst>
    void
    AtomicInst<DataType, AddrRegOperandType, NumSrcOperands,
        HasDst>::execute(GPUDynInstPtr gpuDynInst)
    {
        // Issue an HSAIL atomic: stage the per-lane source operand(s), fill
        // in the request metadata, and push the request to the global or LDS
        // pipeline.  An atomic counts as both a read and a write for the
        // outstanding-request bookkeeping below.
        typedef typename DataType::CType CType;

        Wavefront *w = gpuDynInst->wavefront();

        GPUDynInstPtr m = gpuDynInst;

        this->addr.calcVector(w, m->addr);

        // First source operand goes in a_data (one value per lane).
        for (int lane = 0; lane < VSZ; ++lane) {
            ((CType *)m->a_data)[lane] =
                this->src[0].template get<CType>(w, lane);
        }

        // load second source operand for CAS
        if (NumSrcOperands > 1) {
            for (int lane = 0; lane < VSZ; ++lane) {
                ((CType*)m->x_data)[lane] =
                    this->src[1].template get<CType>(w, lane);
            }
        }

        // At most two source operands are supported (CAS takes two).
        assert(NumSrcOperands <= 2);

        m->m_op = this->opType;
        m->m_type = DataType::memType;
        m->v_type = DataType::vgprType;

        m->exec_mask = w->execMask();
        m->statusBitVector = 0;
        m->equiv = 0;  // atomics don't have an equivalence class operand
        m->n_reg = 1;
        m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);

        m->scope = getGenericMemoryScope(this->memoryScope);

        // Returning atomics record the destination register the response
        // writes back; non-returning variants have no destination.
        if (HasDst) {
            m->dst_reg = this->dest.regIndex();
        }

        // Identify the request for bookkeeping/statistics.
        m->simdId = w->simdId;
        m->wfSlotId = w->wfSlotId;
        m->wfDynId = w->wfDynId;
        m->kern_id = w->kern_id;
        m->cu_id = w->computeUnit->cu_id;
        m->latency.init(&w->computeUnit->shader->tick_cnt);

        // Only global and group (LDS) segments support atomics here; note
        // both rd and wr counters are updated for each.
        switch (this->segment) {
          case Brig::BRIG_SEGMENT_GLOBAL:
            m->s_type = SEG_GLOBAL;
            m->latency.set(w->computeUnit->shader->ticks(64));
            m->pipeId = GLBMEM_PIPE;

            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
            w->outstanding_reqs_wr_gm++;
            w->wr_gm_reqs_in_pipe--;
            w->outstanding_reqs_rd_gm++;
            w->rd_gm_reqs_in_pipe--;
            break;

          case Brig::BRIG_SEGMENT_GROUP:
            m->s_type = SEG_SHARED;
            m->pipeId = LDSMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(24));
            w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
            w->outstanding_reqs_wr_lm++;
            w->wr_lm_reqs_in_pipe--;
            w->outstanding_reqs_rd_lm++;
            w->rd_lm_reqs_in_pipe--;
            break;

          default:
            fatal("Atomic op to unsupported segment %d\n",
                  this->segment);
        }

        w->outstanding_reqs++;
        w->mem_reqs_in_pipe--;
    }
631
632    const char* atomicOpToString(Brig::BrigAtomicOperation atomicOp);
633
634    template<typename DataType, typename AddrRegOperandType, int NumSrcOperands,
635             bool HasDst>
636    void
637    AtomicInst<DataType, AddrRegOperandType, NumSrcOperands,
638               HasDst>::generateDisassembly()
639    {
640        if (HasDst) {
641            this->disassembly =
642                csprintf("%s_%s_%s_%s %s,%s", this->opcode,
643                         atomicOpToString(this->atomicOperation),
644                         segmentNames[this->segment],
645                         DataType::label, this->dest.disassemble(),
646                         this->addr.disassemble());
647        } else {
648            this->disassembly =
649                csprintf("%s_%s_%s_%s %s", this->opcode,
650                         atomicOpToString(this->atomicOperation),
651                         segmentNames[this->segment],
652                         DataType::label, this->addr.disassemble());
653        }
654
655        for (int i = 0; i < NumSrcOperands; ++i) {
656            this->disassembly += ",";
657            this->disassembly += this->src[i].disassemble();
658        }
659    }
660} // namespace HsailISA
661