1/*
2 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: Marc Orr
34 */
35
36#include <csignal>
37
38#include "arch/hsail/insts/decl.hh"
39#include "arch/hsail/insts/mem.hh"
40
41namespace HsailISA
42{
43    // Pseudo (or magic) instructions are overloaded on the hsail call
44    // instruction, because of its flexible parameter signature.
45
    // To add a new magic instruction:
    // 1. Add an entry to the enum.
    // 2. Implement it in the switch statement below (Call::execPseudoInst).
    // 3. Add a utility function to hsa/hsail-gpu-compute/util/magicinst.h,
    //    so it's easy to call from an OpenCL kernel.
51
52    // This enum should be identical to the enum in
53    // hsa/hsail-gpu-compute/util/magicinst.h
54    enum
55    {
56        MAGIC_PRINT_WF_32 = 0,
57        MAGIC_PRINT_WF_64,
58        MAGIC_PRINT_LANE,
59        MAGIC_PRINT_LANE_64,
60        MAGIC_PRINT_WF_FLOAT,
61        MAGIC_SIM_BREAK,
62        MAGIC_PREF_SUM,
63        MAGIC_REDUCTION,
64        MAGIC_MASKLANE_LOWER,
65        MAGIC_MASKLANE_UPPER,
66        MAGIC_JOIN_WF_BAR,
67        MAGIC_WAIT_WF_BAR,
68        MAGIC_PANIC,
69        MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG,
70        MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG,
71        MAGIC_LOAD_GLOBAL_U32_REG,
72        MAGIC_XACT_CAS_LD,
73        MAGIC_MOST_SIG_THD,
74        MAGIC_MOST_SIG_BROADCAST,
75        MAGIC_PRINT_WFID_32,
76        MAGIC_PRINT_WFID_64
77    };
78
79    void
80    Call::execPseudoInst(Wavefront *w, GPUDynInstPtr gpuDynInst)
81    {
82        const VectorMask &mask = w->getPred();
83
84        int op = 0;
85        bool got_op = false;
86
87        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
88            if (mask[lane]) {
89                int src_val0 = src1.get<int>(w, lane, 0);
90                if (got_op) {
91                    if (src_val0 != op) {
92                        fatal("Multiple magic instructions per PC not "
93                              "supported\n");
94                    }
95                } else {
96                    op = src_val0;
97                    got_op = true;
98                }
99            }
100        }
101
102        switch(op) {
103          case MAGIC_PRINT_WF_32:
104            MagicPrintWF32(w);
105            break;
106          case MAGIC_PRINT_WF_64:
107            MagicPrintWF64(w);
108            break;
109          case MAGIC_PRINT_LANE:
110            MagicPrintLane(w);
111            break;
112          case MAGIC_PRINT_LANE_64:
113            MagicPrintLane64(w);
114            break;
115          case MAGIC_PRINT_WF_FLOAT:
116            MagicPrintWFFloat(w);
117            break;
118          case MAGIC_SIM_BREAK:
119            MagicSimBreak(w);
120            break;
121          case MAGIC_PREF_SUM:
122            MagicPrefixSum(w);
123            break;
124          case MAGIC_REDUCTION:
125            MagicReduction(w);
126            break;
127          case MAGIC_MASKLANE_LOWER:
128            MagicMaskLower(w);
129            break;
130          case MAGIC_MASKLANE_UPPER:
131            MagicMaskUpper(w);
132            break;
133          case MAGIC_JOIN_WF_BAR:
134            MagicJoinWFBar(w);
135            break;
136          case MAGIC_WAIT_WF_BAR:
137            MagicWaitWFBar(w);
138            break;
139          case MAGIC_PANIC:
140            MagicPanic(w);
141            break;
142
143          // atomic instructions
144          case MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG:
145            MagicAtomicNRAddGlobalU32Reg(w, gpuDynInst);
146            break;
147
148          case MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG:
149            MagicAtomicNRAddGroupU32Reg(w, gpuDynInst);
150            break;
151
152          case MAGIC_LOAD_GLOBAL_U32_REG:
153            MagicLoadGlobalU32Reg(w, gpuDynInst);
154            break;
155
156          case MAGIC_XACT_CAS_LD:
157            MagicXactCasLd(w);
158            break;
159
160          case MAGIC_MOST_SIG_THD:
161            MagicMostSigThread(w);
162            break;
163
164          case MAGIC_MOST_SIG_BROADCAST:
165            MagicMostSigBroadcast(w);
166            break;
167
168          case MAGIC_PRINT_WFID_32:
169            MagicPrintWF32ID(w);
170            break;
171
172          case MAGIC_PRINT_WFID_64:
173            MagicPrintWFID64(w);
174            break;
175
176          default: fatal("unrecognized magic instruction: %d\n", op);
177        }
178    }
179
180    void
181    Call::MagicPrintLane(Wavefront *w)
182    {
183    #if TRACING_ON
184        const VectorMask &mask = w->getPred();
185        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
186            if (mask[lane]) {
187                int src_val1 = src1.get<int>(w, lane, 1);
188                int src_val2 = src1.get<int>(w, lane, 2);
189                if (src_val2) {
190                    DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
191                             disassemble(), w->computeUnit->cu_id, w->simdId,
192                             w->wfSlotId, lane, src_val1);
193                } else {
194                    DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
195                             disassemble(), w->computeUnit->cu_id, w->simdId,
196                             w->wfSlotId, lane, src_val1);
197                }
198            }
199        }
200    #endif
201    }
202
203    void
204    Call::MagicPrintLane64(Wavefront *w)
205    {
206    #if TRACING_ON
207        const VectorMask &mask = w->getPred();
208        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
209            if (mask[lane]) {
210                int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
211                int src_val2 = src1.get<int>(w, lane, 2);
212                if (src_val2) {
213                    DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
214                             disassemble(), w->computeUnit->cu_id, w->simdId,
215                             w->wfSlotId, lane, src_val1);
216                } else {
217                    DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
218                             disassemble(), w->computeUnit->cu_id, w->simdId,
219                             w->wfSlotId, lane, src_val1);
220                }
221            }
222        }
223    #endif
224    }
225
226    void
227    Call::MagicPrintWF32(Wavefront *w)
228    {
229    #if TRACING_ON
230        const VectorMask &mask = w->getPred();
231        std::string res_str;
232        res_str = csprintf("krl_prt (%s)\n", disassemble());
233
234        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
235            if (!(lane & 7)) {
236                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
237            }
238
239            if (mask[lane]) {
240                int src_val1 = src1.get<int>(w, lane, 1);
241                int src_val2 = src1.get<int>(w, lane, 2);
242
243                if (src_val2) {
244                    res_str += csprintf("%08x", src_val1);
245                } else {
246                    res_str += csprintf("%08d", src_val1);
247                }
248            } else {
249                res_str += csprintf("xxxxxxxx");
250            }
251
252            if ((lane & 7) == 7) {
253                res_str += csprintf("\n");
254            } else {
255                res_str += csprintf(" ");
256            }
257        }
258
259        res_str += "\n\n";
260        DPRINTFN(res_str.c_str());
261    #endif
262    }
263
264    void
265    Call::MagicPrintWF32ID(Wavefront *w)
266    {
267    #if TRACING_ON
268        const VectorMask &mask = w->getPred();
269        std::string res_str;
270        int src_val3 = -1;
271        res_str = csprintf("krl_prt (%s)\n", disassemble());
272
273        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
274            if (!(lane & 7)) {
275                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
276            }
277
278            if (mask[lane]) {
279                int src_val1 = src1.get<int>(w, lane, 1);
280                int src_val2 = src1.get<int>(w, lane, 2);
281                src_val3 = src1.get<int>(w, lane, 3);
282
283                if (src_val2) {
284                    res_str += csprintf("%08x", src_val1);
285                } else {
286                    res_str += csprintf("%08d", src_val1);
287                }
288            } else {
289                res_str += csprintf("xxxxxxxx");
290            }
291
292            if ((lane & 7) == 7) {
293                res_str += csprintf("\n");
294            } else {
295                res_str += csprintf(" ");
296            }
297        }
298
299        res_str += "\n\n";
300        if (w->wfDynId == src_val3) {
301            DPRINTFN(res_str.c_str());
302        }
303    #endif
304    }
305
306    void
307    Call::MagicPrintWF64(Wavefront *w)
308    {
309    #if TRACING_ON
310        const VectorMask &mask = w->getPred();
311        std::string res_str;
312        res_str = csprintf("krl_prt (%s)\n", disassemble());
313
314        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
315            if (!(lane & 3)) {
316                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
317            }
318
319            if (mask[lane]) {
320                int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
321                int src_val2 = src1.get<int>(w, lane, 2);
322
323                if (src_val2) {
324                    res_str += csprintf("%016x", src_val1);
325                } else {
326                    res_str += csprintf("%016d", src_val1);
327                }
328            } else {
329                res_str += csprintf("xxxxxxxxxxxxxxxx");
330            }
331
332            if ((lane & 3) == 3) {
333                res_str += csprintf("\n");
334            } else {
335                res_str += csprintf(" ");
336            }
337        }
338
339        res_str += "\n\n";
340        DPRINTFN(res_str.c_str());
341    #endif
342    }
343
344    void
345    Call::MagicPrintWFID64(Wavefront *w)
346    {
347    #if TRACING_ON
348        const VectorMask &mask = w->getPred();
349        std::string res_str;
350        int src_val3 = -1;
351        res_str = csprintf("krl_prt (%s)\n", disassemble());
352
353        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
354            if (!(lane & 3)) {
355                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
356            }
357
358            if (mask[lane]) {
359                int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
360                int src_val2 = src1.get<int>(w, lane, 2);
361                src_val3 = src1.get<int>(w, lane, 3);
362
363                if (src_val2) {
364                    res_str += csprintf("%016x", src_val1);
365                } else {
366                    res_str += csprintf("%016d", src_val1);
367                }
368            } else {
369                res_str += csprintf("xxxxxxxxxxxxxxxx");
370            }
371
372            if ((lane & 3) == 3) {
373                res_str += csprintf("\n");
374            } else {
375                res_str += csprintf(" ");
376            }
377        }
378
379        res_str += "\n\n";
380        if (w->wfDynId == src_val3) {
381            DPRINTFN(res_str.c_str());
382        }
383    #endif
384    }
385
386    void
387    Call::MagicPrintWFFloat(Wavefront *w)
388    {
389    #if TRACING_ON
390        const VectorMask &mask = w->getPred();
391        std::string res_str;
392        res_str = csprintf("krl_prt (%s)\n", disassemble());
393
394        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
395            if (!(lane & 7)) {
396                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
397            }
398
399            if (mask[lane]) {
400                float src_val1 = src1.get<float>(w, lane, 1);
401                res_str += csprintf("%08f", src_val1);
402            } else {
403                res_str += csprintf("xxxxxxxx");
404            }
405
406            if ((lane & 7) == 7) {
407                res_str += csprintf("\n");
408            } else {
409                res_str += csprintf(" ");
410            }
411        }
412
413        res_str += "\n\n";
414        DPRINTFN(res_str.c_str());
415    #endif
416    }
417
418    // raises a signal that GDB will catch
419    // when done with the break, type "signal 0" in gdb to continue
420    void
421    Call::MagicSimBreak(Wavefront *w)
422    {
423        std::string res_str;
424        // print out state for this wavefront and then break
425        res_str = csprintf("Breakpoint encountered for wavefront %i\n",
426                           w->wfSlotId);
427
428        res_str += csprintf("  Kern ID: %i\n", w->kernId);
429        res_str += csprintf("  Phase ID: %i\n", w->simdId);
430        res_str += csprintf("  Executing on CU #%i\n", w->computeUnit->cu_id);
431        res_str += csprintf("  Exec mask: ");
432
433        for (int i = w->computeUnit->wfSize() - 1; i >= 0; --i) {
434            if (w->execMask(i))
435                res_str += "1";
436            else
437                res_str += "0";
438
439            if ((i & 7) == 7)
440                res_str += " ";
441        }
442
443        res_str += csprintf("(0x%016llx)\n", w->execMask().to_ullong());
444
445        res_str += "\nHelpful debugging hints:\n";
446        res_str += "   Check out w->s_reg / w->d_reg for register state\n";
447
448        res_str += "\n\n";
449        DPRINTFN(res_str.c_str());
450        fflush(stdout);
451
452        raise(SIGTRAP);
453    }
454
455    void
456    Call::MagicPrefixSum(Wavefront *w)
457    {
458        const VectorMask &mask = w->getPred();
459        int res = 0;
460
461        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
462            if (mask[lane]) {
463                int src_val1 = src1.get<int>(w, lane, 1);
464                dest.set<int>(w, lane, res);
465                res += src_val1;
466            }
467        }
468    }
469
470    void
471    Call::MagicReduction(Wavefront *w)
472    {
473        // reduction magic instruction
474        //   The reduction instruction takes up to 64 inputs (one from
475        //   each thread in a WF) and sums them. It returns the sum to
476        //   each thread in the WF.
477        const VectorMask &mask = w->getPred();
478        int res = 0;
479
480        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
481            if (mask[lane]) {
482                int src_val1 = src1.get<int>(w, lane, 1);
483                res += src_val1;
484            }
485        }
486
487        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
488            if (mask[lane]) {
489                dest.set<int>(w, lane, res);
490            }
491        }
492    }
493
494    void
495    Call::MagicMaskLower(Wavefront *w)
496    {
497        const VectorMask &mask = w->getPred();
498        int res = 0;
499
500        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
501            if (mask[lane]) {
502                int src_val1 = src1.get<int>(w, lane, 1);
503
504                if (src_val1) {
505                    if (lane < (w->computeUnit->wfSize()/2)) {
506                        res = res | ((uint32_t)(1) << lane);
507                    }
508                }
509            }
510        }
511
512        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
513            if (mask[lane]) {
514                dest.set<int>(w, lane, res);
515            }
516        }
517    }
518
519    void
520    Call::MagicMaskUpper(Wavefront *w)
521    {
522        const VectorMask &mask = w->getPred();
523        int res = 0;
524        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
525            if (mask[lane]) {
526                int src_val1 = src1.get<int>(w, lane, 1);
527
528                if (src_val1) {
529                    if (lane >= (w->computeUnit->wfSize()/2)) {
530                        res = res | ((uint32_t)(1) <<
531                                     (lane - (w->computeUnit->wfSize()/2)));
532                    }
533                }
534            }
535        }
536
537        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
538            if (mask[lane]) {
539                dest.set<int>(w, lane, res);
540            }
541        }
542    }
543
544    void
545    Call::MagicJoinWFBar(Wavefront *w)
546    {
547        const VectorMask &mask = w->getPred();
548        int max_cnt = 0;
549
550        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
551            if (mask[lane]) {
552                w->barCnt[lane]++;
553
554                if (w->barCnt[lane] > max_cnt) {
555                    max_cnt = w->barCnt[lane];
556                }
557            }
558        }
559
560        if (max_cnt > w->maxBarCnt) {
561            w->maxBarCnt = max_cnt;
562        }
563    }
564
565    void
566    Call::MagicWaitWFBar(Wavefront *w)
567    {
568        const VectorMask &mask = w->getPred();
569        int max_cnt = 0;
570
571        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
572            if (mask[lane]) {
573                w->barCnt[lane]--;
574            }
575
576            if (w->barCnt[lane] > max_cnt) {
577                max_cnt = w->barCnt[lane];
578            }
579        }
580
581        if (max_cnt < w->maxBarCnt) {
582            w->maxBarCnt = max_cnt;
583        }
584
585        w->instructionBuffer.erase(w->instructionBuffer.begin() + 1,
586                                   w->instructionBuffer.end());
587        if (w->pendingFetch)
588            w->dropFetch = true;
589    }
590
591    void
592    Call::MagicPanic(Wavefront *w)
593    {
594        const VectorMask &mask = w->getPred();
595
596        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
597            if (mask[lane]) {
598                int src_val1 = src1.get<int>(w, lane, 1);
599                panic("OpenCL Code failed assertion #%d. Triggered by lane %s",
600                      src_val1, lane);
601            }
602        }
603    }
604
605    void
606    Call::calcAddr(Wavefront *w, GPUDynInstPtr m)
607    {
608        // the address is in src1 | src2
609        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
610            int src_val1 = src1.get<int>(w, lane, 1);
611            int src_val2 = src1.get<int>(w, lane, 2);
612            Addr addr = (((Addr) src_val1) << 32) | ((Addr) src_val2);
613
614            m->addr[lane] = addr;
615        }
616
617    }
618
619    void
620    Call::MagicAtomicNRAddGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
621    {
622        GPUDynInstPtr m = gpuDynInst;
623
624        calcAddr(w, m);
625
626        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
627            ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 3);
628        }
629
630        setFlag(AtomicNoReturn);
631        setFlag(AtomicAdd);
632        setFlag(NoScope);
633        setFlag(NoOrder);
634        setFlag(GlobalSegment);
635
636        m->m_type = U32::memType;
637        m->v_type = U32::vgprType;
638
639        m->exec_mask = w->execMask();
640        m->statusBitVector = 0;
641        m->equiv = 0;  // atomics don't have an equivalence class operand
642        m->n_reg = 1;
643
644        m->simdId = w->simdId;
645        m->wfSlotId = w->wfSlotId;
646        m->wfDynId = w->wfDynId;
647        m->latency.init(&w->computeUnit->shader->tick_cnt);
648
649        m->pipeId = GLBMEM_PIPE;
650        m->latency.set(w->computeUnit->shader->ticks(64));
651        w->computeUnit->globalMemoryPipe.issueRequest(m);
652        w->outstandingReqsWrGm++;
653        w->wrGmReqsInPipe--;
654        w->outstandingReqsRdGm++;
655        w->rdGmReqsInPipe--;
656        w->outstandingReqs++;
657        w->memReqsInPipe--;
658    }
659
660    void
661    Call::MagicAtomicNRAddGroupU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
662    {
663        GPUDynInstPtr m = gpuDynInst;
664        calcAddr(w, m);
665
666        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
667            ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 1);
668        }
669
670        setFlag(AtomicNoReturn);
671        setFlag(AtomicAdd);
672        setFlag(NoScope);
673        setFlag(NoOrder);
674        setFlag(GlobalSegment);
675
676        m->m_type = U32::memType;
677        m->v_type = U32::vgprType;
678
679        m->exec_mask = w->execMask();
680        m->statusBitVector = 0;
681        m->equiv = 0;  // atomics don't have an equivalence class operand
682        m->n_reg = 1;
683
684        m->simdId = w->simdId;
685        m->wfSlotId = w->wfSlotId;
686        m->wfDynId = w->wfDynId;
687        m->latency.init(&w->computeUnit->shader->tick_cnt);
688
689        m->pipeId = GLBMEM_PIPE;
690        m->latency.set(w->computeUnit->shader->ticks(64));
691        w->computeUnit->globalMemoryPipe.issueRequest(m);
692        w->outstandingReqsWrGm++;
693        w->wrGmReqsInPipe--;
694        w->outstandingReqsRdGm++;
695        w->rdGmReqsInPipe--;
696        w->outstandingReqs++;
697        w->memReqsInPipe--;
698    }
699
700    void
701    Call::MagicLoadGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
702    {
703        GPUDynInstPtr m = gpuDynInst;
704        // calculate the address
705        calcAddr(w, m);
706
707        setFlag(Load);
708        setFlag(NoScope);
709        setFlag(NoOrder);
710        setFlag(GlobalSegment);
711
712        m->m_type = U32::memType;  //MemDataType::memType;
713        m->v_type = U32::vgprType; //DestDataType::vgprType;
714
715        m->exec_mask = w->execMask();
716        m->statusBitVector = 0;
717        m->equiv = 0;
718        m->n_reg = 1;
719
720        // FIXME
721        //m->dst_reg = this->dest.regIndex();
722
723        m->simdId = w->simdId;
724        m->wfSlotId = w->wfSlotId;
725        m->wfDynId = w->wfDynId;
726        m->latency.init(&w->computeUnit->shader->tick_cnt);
727
728        m->pipeId = GLBMEM_PIPE;
729        m->latency.set(w->computeUnit->shader->ticks(1));
730        w->computeUnit->globalMemoryPipe.issueRequest(m);
731        w->outstandingReqsRdGm++;
732        w->rdGmReqsInPipe--;
733        w->outstandingReqs++;
734        w->memReqsInPipe--;
735    }
736
737    void
738    Call::MagicXactCasLd(Wavefront *w)
739    {
740        const VectorMask &mask = w->getPred();
741        int src_val1 = 0;
742
743        for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
744            if (mask[lane]) {
745                src_val1 = src1.get<int>(w, lane, 1);
746                break;
747            }
748        }
749
750        if (!w->computeUnit->xactCasLoadMap.count(src_val1)) {
751            w->computeUnit->xactCasLoadMap[src_val1] = ComputeUnit::waveQueue();
752            w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue.clear();
753        }
754
755        w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue
756            .push_back(ComputeUnit::waveIdentifier(w->simdId, w->wfSlotId));
757    }
758
759    void
760    Call::MagicMostSigThread(Wavefront *w)
761    {
762        const VectorMask &mask = w->getPred();
763        unsigned mst = true;
764
765        for (int lane = w->computeUnit->wfSize() - 1; lane >= 0; --lane) {
766            if (mask[lane]) {
767                dest.set<int>(w, lane, mst);
768                mst = false;
769            }
770        }
771    }
772
773    void
774    Call::MagicMostSigBroadcast(Wavefront *w)
775    {
776        const VectorMask &mask = w->getPred();
777        int res = 0;
778        bool got_res = false;
779
780        for (int lane = w->computeUnit->wfSize() - 1; lane >= 0; --lane) {
781            if (mask[lane]) {
782                if (!got_res) {
783                    res = src1.get<int>(w, lane, 1);
784                    got_res = true;
785                }
786                dest.set<int>(w, lane, res);
787            }
788        }
789    }
790
791} // namespace HsailISA
792