pseudo_inst.cc revision 11308:7d8836fd043d
1/*
2 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: Marc Orr
34 */
35
36#include <csignal>
37
38#include "arch/hsail/insts/decl.hh"
39#include "arch/hsail/insts/mem.hh"
40
41namespace HsailISA
42{
43    // Pseudo (or magic) instructions are overloaded on the hsail call
44    // instruction, because of its flexible parameter signature.
45
    // To add a new magic instruction:
    // 1. Add an entry to the enum.
    // 2. Implement it in the switch statement below
    //    (Call::execPseudoInst).
    // 3. Add a utility function to hsa/hsail-gpu-compute/util/magicinst.h,
    //    so it's easy to call from an OpenCL kernel.
51
52    // This enum should be identical to the enum in
53    // hsa/hsail-gpu-compute/util/magicinst.h
54    enum
55    {
56        MAGIC_PRINT_WF_32 = 0,
57        MAGIC_PRINT_WF_64,
58        MAGIC_PRINT_LANE,
59        MAGIC_PRINT_LANE_64,
60        MAGIC_PRINT_WF_FLOAT,
61        MAGIC_SIM_BREAK,
62        MAGIC_PREF_SUM,
63        MAGIC_REDUCTION,
64        MAGIC_MASKLANE_LOWER,
65        MAGIC_MASKLANE_UPPER,
66        MAGIC_JOIN_WF_BAR,
67        MAGIC_WAIT_WF_BAR,
68        MAGIC_PANIC,
69        MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG,
70        MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG,
71        MAGIC_LOAD_GLOBAL_U32_REG,
72        MAGIC_XACT_CAS_LD,
73        MAGIC_MOST_SIG_THD,
74        MAGIC_MOST_SIG_BROADCAST,
75        MAGIC_PRINT_WFID_32,
76        MAGIC_PRINT_WFID_64
77    };
78
79    void
80    Call::execPseudoInst(Wavefront *w, GPUDynInstPtr gpuDynInst)
81    {
82        const VectorMask &mask = w->get_pred();
83
84        int op = 0;
85        bool got_op = false;
86
87        for (int lane = 0; lane < VSZ; ++lane) {
88            if (mask[lane]) {
89                int src_val0 = src1.get<int>(w, lane, 0);
90                if (got_op) {
91                    if (src_val0 != op) {
92                        fatal("Multiple magic instructions per PC not "
93                              "supported\n");
94                    }
95                } else {
96                    op = src_val0;
97                    got_op = true;
98                }
99            }
100        }
101
102        switch(op) {
103          case MAGIC_PRINT_WF_32:
104            MagicPrintWF32(w);
105            break;
106          case MAGIC_PRINT_WF_64:
107            MagicPrintWF64(w);
108            break;
109          case MAGIC_PRINT_LANE:
110            MagicPrintLane(w);
111            break;
112          case MAGIC_PRINT_LANE_64:
113            MagicPrintLane64(w);
114            break;
115          case MAGIC_PRINT_WF_FLOAT:
116            MagicPrintWFFloat(w);
117            break;
118          case MAGIC_SIM_BREAK:
119            MagicSimBreak(w);
120            break;
121          case MAGIC_PREF_SUM:
122            MagicPrefixSum(w);
123            break;
124          case MAGIC_REDUCTION:
125            MagicReduction(w);
126            break;
127          case MAGIC_MASKLANE_LOWER:
128            MagicMaskLower(w);
129            break;
130          case MAGIC_MASKLANE_UPPER:
131            MagicMaskUpper(w);
132            break;
133          case MAGIC_JOIN_WF_BAR:
134            MagicJoinWFBar(w);
135            break;
136          case MAGIC_WAIT_WF_BAR:
137            MagicWaitWFBar(w);
138            break;
139          case MAGIC_PANIC:
140            MagicPanic(w);
141            break;
142
143          // atomic instructions
144          case MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG:
145            MagicAtomicNRAddGlobalU32Reg(w, gpuDynInst);
146            break;
147
148          case MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG:
149            MagicAtomicNRAddGroupU32Reg(w, gpuDynInst);
150            break;
151
152          case MAGIC_LOAD_GLOBAL_U32_REG:
153            MagicLoadGlobalU32Reg(w, gpuDynInst);
154            break;
155
156          case MAGIC_XACT_CAS_LD:
157            MagicXactCasLd(w);
158            break;
159
160          case MAGIC_MOST_SIG_THD:
161            MagicMostSigThread(w);
162            break;
163
164          case MAGIC_MOST_SIG_BROADCAST:
165            MagicMostSigBroadcast(w);
166            break;
167
168          case MAGIC_PRINT_WFID_32:
169            MagicPrintWF32ID(w);
170            break;
171
172          case MAGIC_PRINT_WFID_64:
173            MagicPrintWFID64(w);
174            break;
175
176          default: fatal("unrecognized magic instruction: %d\n", op);
177        }
178    }
179
180    void
181    Call::MagicPrintLane(Wavefront *w)
182    {
183    #if TRACING_ON
184        const VectorMask &mask = w->get_pred();
185        for (int lane = 0; lane < VSZ; ++lane) {
186            if (mask[lane]) {
187                int src_val1 = src1.get<int>(w, lane, 1);
188                int src_val2 = src1.get<int>(w, lane, 2);
189                if (src_val2) {
190                    DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
191                             disassemble(), w->computeUnit->cu_id, w->simdId,
192                             w->wfSlotId, lane, src_val1);
193                } else {
194                    DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
195                             disassemble(), w->computeUnit->cu_id, w->simdId,
196                             w->wfSlotId, lane, src_val1);
197                }
198            }
199        }
200    #endif
201    }
202
203    void
204    Call::MagicPrintLane64(Wavefront *w)
205    {
206    #if TRACING_ON
207        const VectorMask &mask = w->get_pred();
208        for (int lane = 0; lane < VSZ; ++lane) {
209            if (mask[lane]) {
210                int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
211                int src_val2 = src1.get<int>(w, lane, 2);
212                if (src_val2) {
213                    DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
214                             disassemble(), w->computeUnit->cu_id, w->simdId,
215                             w->wfSlotId, lane, src_val1);
216                } else {
217                    DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
218                             disassemble(), w->computeUnit->cu_id, w->simdId,
219                             w->wfSlotId, lane, src_val1);
220                }
221            }
222        }
223    #endif
224    }
225
226    void
227    Call::MagicPrintWF32(Wavefront *w)
228    {
229    #if TRACING_ON
230        const VectorMask &mask = w->get_pred();
231        std::string res_str;
232        res_str = csprintf("krl_prt (%s)\n", disassemble());
233
234        for (int lane = 0; lane < VSZ; ++lane) {
235            if (!(lane & 7)) {
236                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
237            }
238
239            if (mask[lane]) {
240                int src_val1 = src1.get<int>(w, lane, 1);
241                int src_val2 = src1.get<int>(w, lane, 2);
242
243                if (src_val2) {
244                    res_str += csprintf("%08x", src_val1);
245                } else {
246                    res_str += csprintf("%08d", src_val1);
247                }
248            } else {
249                res_str += csprintf("xxxxxxxx");
250            }
251
252            if ((lane & 7) == 7) {
253                res_str += csprintf("\n");
254            } else {
255                res_str += csprintf(" ");
256            }
257        }
258
259        res_str += "\n\n";
260        DPRINTFN(res_str.c_str());
261    #endif
262    }
263
264    void
265    Call::MagicPrintWF32ID(Wavefront *w)
266    {
267    #if TRACING_ON
268        const VectorMask &mask = w->get_pred();
269        std::string res_str;
270        int src_val3 = -1;
271        res_str = csprintf("krl_prt (%s)\n", disassemble());
272
273        for (int lane = 0; lane < VSZ; ++lane) {
274            if (!(lane & 7)) {
275                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
276            }
277
278            if (mask[lane]) {
279                int src_val1 = src1.get<int>(w, lane, 1);
280                int src_val2 = src1.get<int>(w, lane, 2);
281                src_val3 = src1.get<int>(w, lane, 3);
282
283                if (src_val2) {
284                    res_str += csprintf("%08x", src_val1);
285                } else {
286                    res_str += csprintf("%08d", src_val1);
287                }
288            } else {
289                res_str += csprintf("xxxxxxxx");
290            }
291
292            if ((lane & 7) == 7) {
293                res_str += csprintf("\n");
294            } else {
295                res_str += csprintf(" ");
296            }
297        }
298
299        res_str += "\n\n";
300        if (w->wfDynId == src_val3) {
301            DPRINTFN(res_str.c_str());
302        }
303    #endif
304    }
305
306    void
307    Call::MagicPrintWF64(Wavefront *w)
308    {
309    #if TRACING_ON
310        const VectorMask &mask = w->get_pred();
311        std::string res_str;
312        res_str = csprintf("krl_prt (%s)\n", disassemble());
313
314        for (int lane = 0; lane < VSZ; ++lane) {
315            if (!(lane & 3)) {
316                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
317            }
318
319            if (mask[lane]) {
320                int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
321                int src_val2 = src1.get<int>(w, lane, 2);
322
323                if (src_val2) {
324                    res_str += csprintf("%016x", src_val1);
325                } else {
326                    res_str += csprintf("%016d", src_val1);
327                }
328            } else {
329                res_str += csprintf("xxxxxxxxxxxxxxxx");
330            }
331
332            if ((lane & 3) == 3) {
333                res_str += csprintf("\n");
334            } else {
335                res_str += csprintf(" ");
336            }
337        }
338
339        res_str += "\n\n";
340        DPRINTFN(res_str.c_str());
341    #endif
342    }
343
344    void
345    Call::MagicPrintWFID64(Wavefront *w)
346    {
347    #if TRACING_ON
348        const VectorMask &mask = w->get_pred();
349        std::string res_str;
350        int src_val3 = -1;
351        res_str = csprintf("krl_prt (%s)\n", disassemble());
352
353        for (int lane = 0; lane < VSZ; ++lane) {
354            if (!(lane & 3)) {
355                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
356            }
357
358            if (mask[lane]) {
359                int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
360                int src_val2 = src1.get<int>(w, lane, 2);
361                src_val3 = src1.get<int>(w, lane, 3);
362
363                if (src_val2) {
364                    res_str += csprintf("%016x", src_val1);
365                } else {
366                    res_str += csprintf("%016d", src_val1);
367                }
368            } else {
369                res_str += csprintf("xxxxxxxxxxxxxxxx");
370            }
371
372            if ((lane & 3) == 3) {
373                res_str += csprintf("\n");
374            } else {
375                res_str += csprintf(" ");
376            }
377        }
378
379        res_str += "\n\n";
380        if (w->wfDynId == src_val3) {
381            DPRINTFN(res_str.c_str());
382        }
383    #endif
384    }
385
386    void
387    Call::MagicPrintWFFloat(Wavefront *w)
388    {
389    #if TRACING_ON
390        const VectorMask &mask = w->get_pred();
391        std::string res_str;
392        res_str = csprintf("krl_prt (%s)\n", disassemble());
393
394        for (int lane = 0; lane < VSZ; ++lane) {
395            if (!(lane & 7)) {
396                res_str += csprintf("DB%03d: ", (int)w->wfDynId);
397            }
398
399            if (mask[lane]) {
400                float src_val1 = src1.get<float>(w, lane, 1);
401                res_str += csprintf("%08f", src_val1);
402            } else {
403                res_str += csprintf("xxxxxxxx");
404            }
405
406            if ((lane & 7) == 7) {
407                res_str += csprintf("\n");
408            } else {
409                res_str += csprintf(" ");
410            }
411        }
412
413        res_str += "\n\n";
414        DPRINTFN(res_str.c_str());
415    #endif
416    }
417
418    // raises a signal that GDB will catch
419    // when done with the break, type "signal 0" in gdb to continue
420    void
421    Call::MagicSimBreak(Wavefront *w)
422    {
423        std::string res_str;
424        // print out state for this wavefront and then break
425        res_str = csprintf("Breakpoint encountered for wavefront %i\n",
426                           w->wfSlotId);
427
428        res_str += csprintf("  Kern ID: %i\n", w->kern_id);
429        res_str += csprintf("  Phase ID: %i\n", w->simdId);
430        res_str += csprintf("  Executing on CU #%i\n", w->computeUnit->cu_id);
431        res_str += csprintf("  Exec mask: ");
432
433        for (int i = VSZ - 1; i >= 0; --i) {
434            if (w->execMask(i))
435                res_str += "1";
436            else
437                res_str += "0";
438
439            if ((i & 7) == 7)
440                res_str += " ";
441        }
442
443        res_str += csprintf("(0x%016llx)\n", w->execMask().to_ullong());
444
445        res_str += "\nHelpful debugging hints:\n";
446        res_str += "   Check out w->s_reg / w->d_reg for register state\n";
447
448        res_str += "\n\n";
449        DPRINTFN(res_str.c_str());
450        fflush(stdout);
451
452        raise(SIGTRAP);
453    }
454
455    void
456    Call::MagicPrefixSum(Wavefront *w)
457    {
458        const VectorMask &mask = w->get_pred();
459        int res = 0;
460
461        for (int lane = 0; lane < VSZ; ++lane) {
462            if (mask[lane]) {
463                int src_val1 = src1.get<int>(w, lane, 1);
464                dest.set<int>(w, lane, res);
465                res += src_val1;
466            }
467        }
468    }
469
470    void
471    Call::MagicReduction(Wavefront *w)
472    {
473        // reduction magic instruction
474        //   The reduction instruction takes up to 64 inputs (one from
475        //   each thread in a WF) and sums them. It returns the sum to
476        //   each thread in the WF.
477        const VectorMask &mask = w->get_pred();
478        int res = 0;
479
480        for (int lane = 0; lane < VSZ; ++lane) {
481            if (mask[lane]) {
482                int src_val1 = src1.get<int>(w, lane, 1);
483                res += src_val1;
484            }
485        }
486
487        for (int lane = 0; lane < VSZ; ++lane) {
488            if (mask[lane]) {
489                dest.set<int>(w, lane, res);
490            }
491        }
492    }
493
494    void
495    Call::MagicMaskLower(Wavefront *w)
496    {
497        const VectorMask &mask = w->get_pred();
498        int res = 0;
499
500        for (int lane = 0; lane < VSZ; ++lane) {
501            if (mask[lane]) {
502                int src_val1 = src1.get<int>(w, lane, 1);
503
504                if (src_val1) {
505                    if (lane < (VSZ/2)) {
506                        res = res | ((uint32_t)(1) << lane);
507                    }
508                }
509            }
510        }
511
512        for (int lane = 0; lane < VSZ; ++lane) {
513            if (mask[lane]) {
514                dest.set<int>(w, lane, res);
515            }
516        }
517    }
518
519    void
520    Call::MagicMaskUpper(Wavefront *w)
521    {
522        const VectorMask &mask = w->get_pred();
523        int res = 0;
524        for (int lane = 0; lane < VSZ; ++lane) {
525            if (mask[lane]) {
526                int src_val1 = src1.get<int>(w, lane, 1);
527
528                if (src_val1) {
529                    if (lane >= (VSZ/2)) {
530                        res = res | ((uint32_t)(1) << (lane - (VSZ/2)));
531                    }
532                }
533            }
534        }
535
536        for (int lane = 0; lane < VSZ; ++lane) {
537            if (mask[lane]) {
538                dest.set<int>(w, lane, res);
539            }
540        }
541    }
542
543    void
544    Call::MagicJoinWFBar(Wavefront *w)
545    {
546        const VectorMask &mask = w->get_pred();
547        int max_cnt = 0;
548
549        for (int lane = 0; lane < VSZ; ++lane) {
550            if (mask[lane]) {
551                w->bar_cnt[lane]++;
552
553                if (w->bar_cnt[lane] > max_cnt) {
554                    max_cnt = w->bar_cnt[lane];
555                }
556            }
557        }
558
559        if (max_cnt > w->max_bar_cnt) {
560            w->max_bar_cnt = max_cnt;
561        }
562    }
563
564    void
565    Call::MagicWaitWFBar(Wavefront *w)
566    {
567        const VectorMask &mask = w->get_pred();
568        int max_cnt = 0;
569
570        for (int lane = 0; lane < VSZ; ++lane) {
571            if (mask[lane]) {
572                w->bar_cnt[lane]--;
573            }
574
575            if (w->bar_cnt[lane] > max_cnt) {
576                max_cnt = w->bar_cnt[lane];
577            }
578        }
579
580        if (max_cnt < w->max_bar_cnt) {
581            w->max_bar_cnt = max_cnt;
582        }
583
584        w->instructionBuffer.erase(w->instructionBuffer.begin() + 1,
585                                   w->instructionBuffer.end());
586        if (w->pendingFetch)
587            w->dropFetch = true;
588    }
589
590    void
591    Call::MagicPanic(Wavefront *w)
592    {
593        const VectorMask &mask = w->get_pred();
594
595        for (int lane = 0; lane < VSZ; ++lane) {
596            if (mask[lane]) {
597                int src_val1 = src1.get<int>(w, lane, 1);
598                panic("OpenCL Code failed assertion #%d. Triggered by lane %s",
599                      src_val1, lane);
600            }
601        }
602    }
603
604    void
605    Call::calcAddr(Wavefront *w, GPUDynInstPtr m)
606    {
607        // the address is in src1 | src2
608        for (int lane = 0; lane < VSZ; ++lane) {
609            int src_val1 = src1.get<int>(w, lane, 1);
610            int src_val2 = src1.get<int>(w, lane, 2);
611            Addr addr = (((Addr) src_val1) << 32) | ((Addr) src_val2);
612
613            m->addr[lane] = addr;
614        }
615
616    }
617
618    void
619    Call::MagicAtomicNRAddGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
620    {
621        GPUDynInstPtr m = gpuDynInst;
622
623        calcAddr(w, m);
624
625        for (int lane = 0; lane < VSZ; ++lane) {
626            ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 3);
627        }
628
629        m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET,
630                                        Brig::BRIG_ATOMIC_ADD);
631        m->m_type = U32::memType;
632        m->v_type = U32::vgprType;
633
634        m->exec_mask = w->execMask();
635        m->statusBitVector = 0;
636        m->equiv = 0;  // atomics don't have an equivalence class operand
637        m->n_reg = 1;
638        m->memoryOrder = Enums::MEMORY_ORDER_NONE;
639        m->scope = Enums::MEMORY_SCOPE_NONE;
640
641        m->simdId = w->simdId;
642        m->wfSlotId = w->wfSlotId;
643        m->wfDynId = w->wfDynId;
644        m->latency.init(&w->computeUnit->shader->tick_cnt);
645
646        m->s_type = SEG_GLOBAL;
647        m->pipeId = GLBMEM_PIPE;
648        m->latency.set(w->computeUnit->shader->ticks(64));
649        w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
650        w->outstanding_reqs_wr_gm++;
651        w->wr_gm_reqs_in_pipe--;
652        w->outstanding_reqs_rd_gm++;
653        w->rd_gm_reqs_in_pipe--;
654        w->outstanding_reqs++;
655        w->mem_reqs_in_pipe--;
656    }
657
658    void
659    Call::MagicAtomicNRAddGroupU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
660    {
661        GPUDynInstPtr m = gpuDynInst;
662        calcAddr(w, m);
663
664        for (int lane = 0; lane < VSZ; ++lane) {
665            ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 1);
666        }
667
668        m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET,
669                                        Brig::BRIG_ATOMIC_ADD);
670        m->m_type = U32::memType;
671        m->v_type = U32::vgprType;
672
673        m->exec_mask = w->execMask();
674        m->statusBitVector = 0;
675        m->equiv = 0;  // atomics don't have an equivalence class operand
676        m->n_reg = 1;
677        m->memoryOrder = Enums::MEMORY_ORDER_NONE;
678        m->scope = Enums::MEMORY_SCOPE_NONE;
679
680        m->simdId = w->simdId;
681        m->wfSlotId = w->wfSlotId;
682        m->wfDynId = w->wfDynId;
683        m->latency.init(&w->computeUnit->shader->tick_cnt);
684
685        m->s_type = SEG_GLOBAL;
686        m->pipeId = GLBMEM_PIPE;
687        m->latency.set(w->computeUnit->shader->ticks(64));
688        w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
689        w->outstanding_reqs_wr_gm++;
690        w->wr_gm_reqs_in_pipe--;
691        w->outstanding_reqs_rd_gm++;
692        w->rd_gm_reqs_in_pipe--;
693        w->outstanding_reqs++;
694        w->mem_reqs_in_pipe--;
695    }
696
697    void
698    Call::MagicLoadGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
699    {
700        GPUDynInstPtr m = gpuDynInst;
701        // calculate the address
702        calcAddr(w, m);
703
704        m->m_op = Enums::MO_LD;
705        m->m_type = U32::memType;  //MemDataType::memType;
706        m->v_type = U32::vgprType; //DestDataType::vgprType;
707
708        m->exec_mask = w->execMask();
709        m->statusBitVector = 0;
710        m->equiv = 0;
711        m->n_reg = 1;
712        m->memoryOrder = Enums::MEMORY_ORDER_NONE;
713        m->scope = Enums::MEMORY_SCOPE_NONE;
714
715        // FIXME
716        //m->dst_reg = this->dest.regIndex();
717
718        m->simdId = w->simdId;
719        m->wfSlotId = w->wfSlotId;
720        m->wfDynId = w->wfDynId;
721        m->latency.init(&w->computeUnit->shader->tick_cnt);
722
723        m->s_type = SEG_GLOBAL;
724        m->pipeId = GLBMEM_PIPE;
725        m->latency.set(w->computeUnit->shader->ticks(1));
726        w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
727        w->outstanding_reqs_rd_gm++;
728        w->rd_gm_reqs_in_pipe--;
729        w->outstanding_reqs++;
730        w->mem_reqs_in_pipe--;
731    }
732
733    void
734    Call::MagicXactCasLd(Wavefront *w)
735    {
736        const VectorMask &mask = w->get_pred();
737        int src_val1 = 0;
738
739        for (int lane = 0; lane < VSZ; ++lane) {
740            if (mask[lane]) {
741                src_val1 = src1.get<int>(w, lane, 1);
742                break;
743            }
744        }
745
746        if (!w->computeUnit->xactCasLoadMap.count(src_val1)) {
747            w->computeUnit->xactCasLoadMap[src_val1] = ComputeUnit::waveQueue();
748            w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue.clear();
749        }
750
751        w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue
752            .push_back(ComputeUnit::waveIdentifier(w->simdId, w->wfSlotId));
753    }
754
755    void
756    Call::MagicMostSigThread(Wavefront *w)
757    {
758        const VectorMask &mask = w->get_pred();
759        unsigned mst = true;
760
761        for (int lane = VSZ - 1; lane >= 0; --lane) {
762            if (mask[lane]) {
763                dest.set<int>(w, lane, mst);
764                mst = false;
765            }
766        }
767    }
768
769    void
770    Call::MagicMostSigBroadcast(Wavefront *w)
771    {
772        const VectorMask &mask = w->get_pred();
773        int res = 0;
774        bool got_res = false;
775
776        for (int lane = VSZ - 1; lane >= 0; --lane) {
777            if (mask[lane]) {
778                if (!got_res) {
779                    res = src1.get<int>(w, lane, 1);
780                    got_res = true;
781                }
782                dest.set<int>(w, lane, res);
783            }
784        }
785    }
786
787} // namespace HsailISA
788