/*
 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */

#include "gpu-compute/hsail_code.hh"

// Defined in code.cc; not worth pulling in all of code.h for this
// at this point.
extern const char *segmentNames[];

namespace HsailISA
{
    template<typename DestDataType, typename AddrRegOperandType>
    void
    LdaInst<DestDataType, AddrRegOperandType>::generateDisassembly()
    {
        this->disassembly = csprintf("%s_%s %s,%s", this->opcode,
                                     DestDataType::label,
                                     this->dest.disassemble(),
                                     this->addr.disassemble());
    }

    template<typename DestDataType, typename AddrRegOperandType>
    void
    LdaInst<DestDataType, AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *w = gpuDynInst->wavefront();

        typedef typename DestDataType::CType CType M5_VAR_USED;
        const VectorMask &mask = w->getPred();
        std::vector<Addr> addr_vec;
        addr_vec.resize(w->computeUnit->wfSize(), (Addr)0);
        this->addr.calcVector(w, addr_vec);

        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
            if (mask[lane]) {
                this->dest.set(w, lane, addr_vec[lane]);
            }
        }
        addr_vec.clear();
    }

    template<typename MemDataType, typename DestDataType,
             typename AddrRegOperandType>
    void
    LdInst<MemDataType, DestDataType, AddrRegOperandType>::generateDisassembly()
    {
        switch (num_dest_operands) {
          case 1:
            this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode,
                                         segmentNames[this->segment],
                                         MemDataType::label,
                                         this->dest.disassemble(),
                                         this->addr.disassemble());
            break;
          case 2:
            this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode,
                                         segmentNames[this->segment],
                                         MemDataType::label,
                                         this->dest_vect[0].disassemble(),
                                         this->dest_vect[1].disassemble(),
                                         this->addr.disassemble());
            break;
          case 3:
            this->disassembly = csprintf("%s_%s_%s (%s,%s,%s), %s", this->opcode,
                                         segmentNames[this->segment],
                                         MemDataType::label,
                                         this->dest_vect[0].disassemble(),
                                         this->dest_vect[1].disassemble(),
                                         this->dest_vect[2].disassemble(),
                                         this->addr.disassemble());
            break;
          case 4:
            this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s",
                                         this->opcode,
                                         segmentNames[this->segment],
                                         MemDataType::label,
                                         this->dest_vect[0].disassemble(),
                                         this->dest_vect[1].disassemble(),
                                         this->dest_vect[2].disassemble(),
                                         this->dest_vect[3].disassemble(),
                                         this->addr.disassemble());
            break;
          default:
            fatal("Bad ld register dest operand, num vector operands: %d\n",
                  num_dest_operands);
            break;
        }
    }

    static Addr
    calcPrivAddr(Addr addr, Wavefront *w, int lane, GPUStaticInst *i)
    {
        // What is the size of the object we are accessing?
        // NOTE: the compiler doesn't generate enough information to
        // answer that yet, so for now we just line up all the private
        // work-item spaces back to back.
        /*
        StorageElement* se =
            i->parent->findSymbol(Brig::BrigPrivateSpace, addr);
        assert(se);

        return w->wfSlotId * w->privSizePerItem * w->computeUnit->wfSize() +
            se->offset * w->computeUnit->wfSize() +
            lane * se->size;
        */

        // addressing strategy: interleave the private spaces of
        // work-items in a wave-front on 8 byte granularity.
        // this won't be perfect coalescing like the spill space
        // strategy, but it's better than nothing. The spill space
        // strategy won't work with private because the same address
        // may be accessed by different sized loads/stores.

        // Note: I'm assuming that the largest load/store to private
        // memory is 8 bytes. If it is larger, the stride will have to
        // increase.
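        // Illustrative example (the concrete numbers here are assumed,
        // not taken from the simulator): with wfSize() == 64,
        // privBase == 0x1000, lane == 2, and addr == 13, we get
        // addr_div8 == 1 and addr_mod8 == 5, so
        // ret == 1 * 8 * 64 + 2 * 8 + 5 + 0x1000 == 0x1215. Adjacent
        // lanes accessing the same private address land 8 bytes apart.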

        Addr addr_div8 = addr / 8;
        Addr addr_mod8 = addr % 8;

        Addr ret = addr_div8 * 8 * w->computeUnit->wfSize() + lane * 8 +
            addr_mod8 + w->privBase;

        assert(ret < w->privBase +
               (w->privSizePerItem * w->computeUnit->wfSize()));

        return ret;
    }

    template<typename MemDataType, typename DestDataType,
             typename AddrRegOperandType>
    void
    LdInst<MemDataType, DestDataType,
           AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *w = gpuDynInst->wavefront();

        typedef typename MemDataType::CType MemCType;
        const VectorMask &mask = w->getPred();

        // Kernarg references are handled uniquely for now (no Memory Request
        // is used), so special-case them up front.  Someday we should
        // make this more realistic, at which point we can get rid of this
        // block and fold this case into the switch below.
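        // Because kernargs are uniform across the wavefront, the value is
        // read once from the kernel argument buffer and then broadcast to
        // every lane whose predicate bit is set.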
        if (this->segment == Brig::BRIG_SEGMENT_KERNARG) {
            MemCType val;

            // I assume no vector ld for kernargs
            assert(num_dest_operands == 1);

            // assuming for the moment that we'll never do register
            // offsets into kernarg space... just to make life simpler
            uint64_t address = this->addr.calcUniform();

            val = *(MemCType*)&w->kernelArgs[address];

            DPRINTF(HSAIL, "ld_kernarg [%d] -> %d\n", address, val);

            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                if (mask[lane]) {
                    this->dest.set(w, lane, val);
                }
            }

            return;
        } else if (this->segment == Brig::BRIG_SEGMENT_ARG) {
            uint64_t address = this->addr.calcUniform();
            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                if (mask[lane]) {
                    MemCType val = w->readCallArgMem<MemCType>(lane, address);

                    DPRINTF(HSAIL, "ld_arg [%d] -> %llu\n", address,
                            (unsigned long long)val);

                    this->dest.set(w, lane, val);
                }
            }

            return;
        }

        GPUDynInstPtr m = gpuDynInst;

        this->addr.calcVector(w, m->addr);

        m->m_type = MemDataType::memType;
        m->v_type = DestDataType::vgprType;

        m->exec_mask = w->execMask();
        m->statusBitVector = 0;
        m->equiv = this->equivClass;

        if (num_dest_operands == 1) {
            m->dst_reg = this->dest.regIndex();
            m->n_reg = 1;
        } else {
            m->n_reg = num_dest_operands;
            for (int i = 0; i < num_dest_operands; ++i) {
                m->dst_reg_vec[i] = this->dest_vect[i].regIndex();
            }
        }

        m->simdId = w->simdId;
        m->wfSlotId = w->wfSlotId;
        m->wfDynId = w->wfDynId;
        m->kern_id = w->kernId;
        m->cu_id = w->computeUnit->cu_id;
        m->latency.init(&w->computeUnit->shader->tick_cnt);

        switch (this->segment) {
          case Brig::BRIG_SEGMENT_GLOBAL:
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));

            // This is a complete hack to get around a compiler bug: the
            // compiler currently generates global accesses for private
            // addresses (starting from 0), so we need to add the private
            // offset.
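            // For example (values assumed for illustration only): with
            // privSizePerItem == 0x2000, a global load to address 0x40 is
            // treated as a mislabeled private access and is remapped
            // through calcPrivAddr() below.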
            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                if (m->addr[lane] < w->privSizePerItem) {
                    if (mask[lane]) {
                        // what is the size of the object we are accessing?
                        // find base for this wavefront

                        // calcPrivAddr will fail if accesses are unaligned
                        assert(!((sizeof(MemCType) - 1) & m->addr[lane]));

                        Addr privAddr = calcPrivAddr(m->addr[lane], w, lane,
                                                     this);

                        m->addr[lane] = privAddr;
                    }
                }
            }

            w->computeUnit->globalMemoryPipe.issueRequest(m);
            w->outstandingReqsRdGm++;
            w->rdGmReqsInPipe--;
            break;

          case Brig::BRIG_SEGMENT_SPILL:
            assert(num_dest_operands == 1);
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));
            {
                for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                    //  note: this calculation will NOT WORK if the compiler
                    //  ever generates loads/stores to the same address with
                    //  different widths (e.g., a ld_u32 addr and a ld_u16 addr)
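                    //  layout sketch: per-item spill offset n for lane k
                    //  maps to n * spillWidth + k * sizeof(MemCType) +
                    //  spillBase, so the lanes of a wavefront reading the
                    //  same offset produce consecutive, coalescable
                    //  addresses.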
                    if (mask[lane]) {
                        assert(m->addr[lane] < w->spillSizePerItem);

                        m->addr[lane] = m->addr[lane] * w->spillWidth +
                                        lane * sizeof(MemCType) + w->spillBase;

                        w->lastAddr[lane] = m->addr[lane];
                    }
                }
            }

            w->computeUnit->globalMemoryPipe.issueRequest(m);
            w->outstandingReqsRdGm++;
            w->rdGmReqsInPipe--;
            break;

          case Brig::BRIG_SEGMENT_GROUP:
            m->pipeId = LDSMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(24));
            w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
            w->outstandingReqsRdLm++;
            w->rdLmReqsInPipe--;
            break;

          case Brig::BRIG_SEGMENT_READONLY:
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));

            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                if (mask[lane]) {
                    assert(m->addr[lane] + sizeof(MemCType) <= w->roSize);
                    m->addr[lane] += w->roBase;
                }
            }

            w->computeUnit->globalMemoryPipe.issueRequest(m);
            w->outstandingReqsRdGm++;
            w->rdGmReqsInPipe--;
            break;

          case Brig::BRIG_SEGMENT_PRIVATE:
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));
            {
                for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                    if (mask[lane]) {
                        assert(m->addr[lane] < w->privSizePerItem);

                        m->addr[lane] = m->addr[lane] +
                            lane * sizeof(MemCType) + w->privBase;
                    }
                }
            }
            w->computeUnit->globalMemoryPipe.issueRequest(m);
            w->outstandingReqsRdGm++;
            w->rdGmReqsInPipe--;
            break;

          default:
            fatal("Load to unsupported segment %d %llx\n", this->segment,
                  m->addr[0]);
        }

        w->outstandingReqs++;
        w->memReqsInPipe--;
    }

    template<typename OperationType, typename SrcDataType,
             typename AddrRegOperandType>
    void
    StInst<OperationType, SrcDataType,
           AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *w = gpuDynInst->wavefront();

        typedef typename OperationType::CType CType;

        const VectorMask &mask = w->getPred();

        // Arg references are handled uniquely for now (no Memory Request
        // is used), so special-case them up front.  Someday we should
        // make this more realistic, at which point we can get rid of this
        // block and fold this case into the switch below.
        if (this->segment == Brig::BRIG_SEGMENT_ARG) {
            uint64_t address = this->addr.calcUniform();

            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                if (mask[lane]) {
                    CType data = this->src.template get<CType>(w, lane);
                    DPRINTF(HSAIL, "st_arg [%d] <- %d\n", address, data);
                    w->writeCallArgMem<CType>(lane, address, data);
                }
            }

            return;
        }

        GPUDynInstPtr m = gpuDynInst;

        m->exec_mask = w->execMask();

        this->addr.calcVector(w, m->addr);

        if (num_src_operands == 1) {
            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                if (mask[lane]) {
                    ((CType*)m->d_data)[lane] =
                        this->src.template get<CType>(w, lane);
                }
            }
        } else {
            for (int k = 0; k < num_src_operands; ++k) {
                for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                    if (mask[lane]) {
                        ((CType*)m->d_data)[k * w->computeUnit->wfSize() + lane] =
                            this->src_vect[k].template get<CType>(w, lane);
                    }
                }
            }
        }

        m->m_type = OperationType::memType;
        m->v_type = OperationType::vgprType;

        m->statusBitVector = 0;
        m->equiv = this->equivClass;

        if (num_src_operands == 1) {
            m->n_reg = 1;
        } else {
            m->n_reg = num_src_operands;
        }

        m->simdId = w->simdId;
        m->wfSlotId = w->wfSlotId;
        m->wfDynId = w->wfDynId;
        m->kern_id = w->kernId;
        m->cu_id = w->computeUnit->cu_id;
        m->latency.init(&w->computeUnit->shader->tick_cnt);

        switch (this->segment) {
          case Brig::BRIG_SEGMENT_GLOBAL:
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));

            // This is a complete hack to get around a compiler bug: the
            // compiler currently generates global accesses for private
            // addresses (starting from 0), so we need to add the private
            // offset.
            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                if (mask[lane]) {
                    if (m->addr[lane] < w->privSizePerItem) {

                        // calcPrivAddr will fail if accesses are unaligned
                        assert(!((sizeof(CType) - 1) & m->addr[lane]));

                        Addr privAddr = calcPrivAddr(m->addr[lane], w, lane,
                                                     this);

                        m->addr[lane] = privAddr;
                    }
                }
            }

            w->computeUnit->globalMemoryPipe.issueRequest(m);
            w->outstandingReqsWrGm++;
            w->wrGmReqsInPipe--;
            break;

          case Brig::BRIG_SEGMENT_SPILL:
            assert(num_src_operands == 1);
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));
            {
                for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                    if (mask[lane]) {
                        assert(m->addr[lane] < w->spillSizePerItem);

                        m->addr[lane] = m->addr[lane] * w->spillWidth +
                                        lane * sizeof(CType) + w->spillBase;
                    }
                }
            }

            w->computeUnit->globalMemoryPipe.issueRequest(m);
            w->outstandingReqsWrGm++;
            w->wrGmReqsInPipe--;
            break;

          case Brig::BRIG_SEGMENT_GROUP:
            m->pipeId = LDSMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(24));
            w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
            w->outstandingReqsWrLm++;
            w->wrLmReqsInPipe--;
            break;

          case Brig::BRIG_SEGMENT_PRIVATE:
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));
            {
                for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                    if (mask[lane]) {
                        assert(m->addr[lane] < w->privSizePerItem);
                        m->addr[lane] = m->addr[lane] + lane *
                            sizeof(CType) + w->privBase;
                    }
                }
            }

            w->computeUnit->globalMemoryPipe.issueRequest(m);
            w->outstandingReqsWrGm++;
            w->wrGmReqsInPipe--;
            break;

          default:
            fatal("Store to unsupported segment %d\n", this->segment);
        }

        w->outstandingReqs++;
        w->memReqsInPipe--;
    }

    template<typename OperationType, typename SrcDataType,
             typename AddrRegOperandType>
    void
    StInst<OperationType, SrcDataType,
           AddrRegOperandType>::generateDisassembly()
    {
        switch (num_src_operands) {
          case 1:
            this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode,
                                         segmentNames[this->segment],
                                         OperationType::label,
                                         this->src.disassemble(),
                                         this->addr.disassemble());
            break;
          case 2:
            this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode,
                                         segmentNames[this->segment],
                                         OperationType::label,
                                         this->src_vect[0].disassemble(),
                                         this->src_vect[1].disassemble(),
                                         this->addr.disassemble());
            break;
          case 3:
            this->disassembly = csprintf("%s_%s_%s (%s,%s,%s), %s",
                                         this->opcode,
                                         segmentNames[this->segment],
                                         OperationType::label,
                                         this->src_vect[0].disassemble(),
                                         this->src_vect[1].disassemble(),
                                         this->src_vect[2].disassemble(),
                                         this->addr.disassemble());
            break;
          case 4:
            this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s",
                                         this->opcode,
                                         segmentNames[this->segment],
                                         OperationType::label,
                                         this->src_vect[0].disassemble(),
                                         this->src_vect[1].disassemble(),
                                         this->src_vect[2].disassemble(),
                                         this->src_vect[3].disassemble(),
                                         this->addr.disassemble());
            break;
          default:
            fatal("Bad st register src operand, num vector operands: %d\n",
                  num_src_operands);
            break;
        }
    }

    template<typename DataType, typename AddrRegOperandType, int NumSrcOperands,
             bool HasDst>
    void
    AtomicInst<DataType, AddrRegOperandType, NumSrcOperands,
        HasDst>::execute(GPUDynInstPtr gpuDynInst)
    {
        typedef typename DataType::CType CType;

        Wavefront *w = gpuDynInst->wavefront();

        GPUDynInstPtr m = gpuDynInst;

        this->addr.calcVector(w, m->addr);

        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
            ((CType *)m->a_data)[lane] =
                this->src[0].template get<CType>(w, lane);
        }

        // load second source operand for CAS
        if (NumSrcOperands > 1) {
            for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
                ((CType*)m->x_data)[lane] =
                    this->src[1].template get<CType>(w, lane);
            }
        }

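        // HSAIL atomics take at most two source operands: one value for
        // plain read-modify-write operations, plus the extra operand
        // staged into x_data above for compare-and-swap.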
        assert(NumSrcOperands <= 2);

        m->m_type = DataType::memType;
        m->v_type = DataType::vgprType;

        m->exec_mask = w->execMask();
        m->statusBitVector = 0;
        m->equiv = 0;  // atomics don't have an equivalence class operand
        m->n_reg = 1;

        if (HasDst) {
            m->dst_reg = this->dest.regIndex();
        }

        m->simdId = w->simdId;
        m->wfSlotId = w->wfSlotId;
        m->wfDynId = w->wfDynId;
        m->kern_id = w->kernId;
        m->cu_id = w->computeUnit->cu_id;
        m->latency.init(&w->computeUnit->shader->tick_cnt);

        switch (this->segment) {
          case Brig::BRIG_SEGMENT_GLOBAL:
            m->latency.set(w->computeUnit->shader->ticks(64));
            m->pipeId = GLBMEM_PIPE;

            w->computeUnit->globalMemoryPipe.issueRequest(m);
            w->outstandingReqsWrGm++;
            w->wrGmReqsInPipe--;
            w->outstandingReqsRdGm++;
            w->rdGmReqsInPipe--;
            break;

          case Brig::BRIG_SEGMENT_GROUP:
            m->pipeId = LDSMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(24));
            w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
            w->outstandingReqsWrLm++;
            w->wrLmReqsInPipe--;
            w->outstandingReqsRdLm++;
            w->rdLmReqsInPipe--;
            break;

          default:
            fatal("Atomic op to unsupported segment %d\n",
                  this->segment);
        }

        w->outstandingReqs++;
        w->memReqsInPipe--;
    }

    const char* atomicOpToString(Brig::BrigAtomicOperation atomicOp);

    template<typename DataType, typename AddrRegOperandType, int NumSrcOperands,
             bool HasDst>
    void
    AtomicInst<DataType, AddrRegOperandType, NumSrcOperands,
               HasDst>::generateDisassembly()
    {
        if (HasDst) {
            this->disassembly =
                csprintf("%s_%s_%s_%s %s,%s", this->opcode,
                         atomicOpToString(this->atomicOperation),
                         segmentNames[this->segment],
                         DataType::label, this->dest.disassemble(),
                         this->addr.disassemble());
        } else {
            this->disassembly =
                csprintf("%s_%s_%s_%s %s", this->opcode,
                         atomicOpToString(this->atomicOperation),
                         segmentNames[this->segment],
                         DataType::label, this->addr.disassemble());
        }

        for (int i = 0; i < NumSrcOperands; ++i) {
            this->disassembly += ",";
            this->disassembly += this->src[i].disassemble();
        }
    }
} // namespace HsailISA