Cross Reference: /gem5/src/arch/hsail/insts/pseudo

pseudo_inst.cc (11534:7106f550afad)	pseudo_inst.cc (11639:2e8d4bd8108d)
1/* 2 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 * 33 * Author: Marc Orr 34 / 35 36#include <csignal> 37 38#include "arch/hsail/insts/decl.hh" 39#include "arch/hsail/insts/mem.hh" 40 41namespace HsailISA 42{ 43 // Pseudo (or magic) instructions are overloaded on the hsail call 44 // instruction, because of its flexible parameter signature. 
45 46 // To add a new magic instruction: 47 // 1. Add an entry to the enum. 48 // 2. Implement it in the switch statement below (Call::exec). 49 // 3. Add a utility function to hsa/hsail-gpu-compute/util/magicinst.h, 50 // so its easy to call from an OpenCL kernel. 51 52 // This enum should be identical to the enum in 53 // hsa/hsail-gpu-compute/util/magicinst.h 54 enum 55 { 56 MAGIC_PRINT_WF_32 = 0, 57 MAGIC_PRINT_WF_64, 58 MAGIC_PRINT_LANE, 59 MAGIC_PRINT_LANE_64, 60 MAGIC_PRINT_WF_FLOAT, 61 MAGIC_SIM_BREAK, 62 MAGIC_PREF_SUM, 63 MAGIC_REDUCTION, 64 MAGIC_MASKLANE_LOWER, 65 MAGIC_MASKLANE_UPPER, 66 MAGIC_JOIN_WF_BAR, 67 MAGIC_WAIT_WF_BAR, 68 MAGIC_PANIC, 69 MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG, 70 MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG, 71 MAGIC_LOAD_GLOBAL_U32_REG, 72 MAGIC_XACT_CAS_LD, 73 MAGIC_MOST_SIG_THD, 74 MAGIC_MOST_SIG_BROADCAST, 75 MAGIC_PRINT_WFID_32, 76 MAGIC_PRINT_WFID_64 77 }; 78 79 void 80 Call::execPseudoInst(Wavefront w, GPUDynInstPtr gpuDynInst) 81 {	1/* 2 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 
20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 * 33 * Author: Marc Orr 34 / 35 36#include <csignal> 37 38#include "arch/hsail/insts/decl.hh" 39#include "arch/hsail/insts/mem.hh" 40 41namespace HsailISA 42{ 43 // Pseudo (or magic) instructions are overloaded on the hsail call 44 // instruction, because of its flexible parameter signature. 45 46 // To add a new magic instruction: 47 // 1. Add an entry to the enum. 48 // 2. Implement it in the switch statement below (Call::exec). 49 // 3. Add a utility function to hsa/hsail-gpu-compute/util/magicinst.h, 50 // so its easy to call from an OpenCL kernel. 
51 52 // This enum should be identical to the enum in 53 // hsa/hsail-gpu-compute/util/magicinst.h 54 enum 55 { 56 MAGIC_PRINT_WF_32 = 0, 57 MAGIC_PRINT_WF_64, 58 MAGIC_PRINT_LANE, 59 MAGIC_PRINT_LANE_64, 60 MAGIC_PRINT_WF_FLOAT, 61 MAGIC_SIM_BREAK, 62 MAGIC_PREF_SUM, 63 MAGIC_REDUCTION, 64 MAGIC_MASKLANE_LOWER, 65 MAGIC_MASKLANE_UPPER, 66 MAGIC_JOIN_WF_BAR, 67 MAGIC_WAIT_WF_BAR, 68 MAGIC_PANIC, 69 MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG, 70 MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG, 71 MAGIC_LOAD_GLOBAL_U32_REG, 72 MAGIC_XACT_CAS_LD, 73 MAGIC_MOST_SIG_THD, 74 MAGIC_MOST_SIG_BROADCAST, 75 MAGIC_PRINT_WFID_32, 76 MAGIC_PRINT_WFID_64 77 }; 78 79 void 80 Call::execPseudoInst(Wavefront w, GPUDynInstPtr gpuDynInst) 81 {
82 const VectorMask &mask = w->get_pred();	82 const VectorMask &mask = w->getPred();
83 84 int op = 0; 85 bool got_op = false; 86 87 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 88 if (mask[lane]) { 89 int src_val0 = src1.get<int>(w, lane, 0); 90 if (got_op) { 91 if (src_val0 != op) { 92 fatal("Multiple magic instructions per PC not " 93 "supported\n"); 94 } 95 } else { 96 op = src_val0; 97 got_op = true; 98 } 99 } 100 } 101 102 switch(op) { 103 case MAGIC_PRINT_WF_32: 104 MagicPrintWF32(w); 105 break; 106 case MAGIC_PRINT_WF_64: 107 MagicPrintWF64(w); 108 break; 109 case MAGIC_PRINT_LANE: 110 MagicPrintLane(w); 111 break; 112 case MAGIC_PRINT_LANE_64: 113 MagicPrintLane64(w); 114 break; 115 case MAGIC_PRINT_WF_FLOAT: 116 MagicPrintWFFloat(w); 117 break; 118 case MAGIC_SIM_BREAK: 119 MagicSimBreak(w); 120 break; 121 case MAGIC_PREF_SUM: 122 MagicPrefixSum(w); 123 break; 124 case MAGIC_REDUCTION: 125 MagicReduction(w); 126 break; 127 case MAGIC_MASKLANE_LOWER: 128 MagicMaskLower(w); 129 break; 130 case MAGIC_MASKLANE_UPPER: 131 MagicMaskUpper(w); 132 break; 133 case MAGIC_JOIN_WF_BAR: 134 MagicJoinWFBar(w); 135 break; 136 case MAGIC_WAIT_WF_BAR: 137 MagicWaitWFBar(w); 138 break; 139 case MAGIC_PANIC: 140 MagicPanic(w); 141 break; 142 143 // atomic instructions 144 case MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG: 145 MagicAtomicNRAddGlobalU32Reg(w, gpuDynInst); 146 break; 147 148 case MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG: 149 MagicAtomicNRAddGroupU32Reg(w, gpuDynInst); 150 break; 151 152 case MAGIC_LOAD_GLOBAL_U32_REG: 153 MagicLoadGlobalU32Reg(w, gpuDynInst); 154 break; 155 156 case MAGIC_XACT_CAS_LD: 157 MagicXactCasLd(w); 158 break; 159 160 case MAGIC_MOST_SIG_THD: 161 MagicMostSigThread(w); 162 break; 163 164 case MAGIC_MOST_SIG_BROADCAST: 165 MagicMostSigBroadcast(w); 166 break; 167 168 case MAGIC_PRINT_WFID_32: 169 MagicPrintWF32ID(w); 170 break; 171 172 case MAGIC_PRINT_WFID_64: 173 MagicPrintWFID64(w); 174 break; 175 176 default: fatal("unrecognized magic instruction: %d\n", op); 177 } 178 } 179 180 void 181 
Call::MagicPrintLane(Wavefront w) 182* { 183 #if TRACING_ON	83 84 int op = 0; 85 bool got_op = false; 86 87 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 88 if (mask[lane]) { 89 int src_val0 = src1.get<int>(w, lane, 0); 90 if (got_op) { 91 if (src_val0 != op) { 92 fatal("Multiple magic instructions per PC not " 93 "supported\n"); 94 } 95 } else { 96 op = src_val0; 97 got_op = true; 98 } 99 } 100 } 101 102 switch(op) { 103 case MAGIC_PRINT_WF_32: 104 MagicPrintWF32(w); 105 break; 106 case MAGIC_PRINT_WF_64: 107 MagicPrintWF64(w); 108 break; 109 case MAGIC_PRINT_LANE: 110 MagicPrintLane(w); 111 break; 112 case MAGIC_PRINT_LANE_64: 113 MagicPrintLane64(w); 114 break; 115 case MAGIC_PRINT_WF_FLOAT: 116 MagicPrintWFFloat(w); 117 break; 118 case MAGIC_SIM_BREAK: 119 MagicSimBreak(w); 120 break; 121 case MAGIC_PREF_SUM: 122 MagicPrefixSum(w); 123 break; 124 case MAGIC_REDUCTION: 125 MagicReduction(w); 126 break; 127 case MAGIC_MASKLANE_LOWER: 128 MagicMaskLower(w); 129 break; 130 case MAGIC_MASKLANE_UPPER: 131 MagicMaskUpper(w); 132 break; 133 case MAGIC_JOIN_WF_BAR: 134 MagicJoinWFBar(w); 135 break; 136 case MAGIC_WAIT_WF_BAR: 137 MagicWaitWFBar(w); 138 break; 139 case MAGIC_PANIC: 140 MagicPanic(w); 141 break; 142 143 // atomic instructions 144 case MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG: 145 MagicAtomicNRAddGlobalU32Reg(w, gpuDynInst); 146 break; 147 148 case MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG: 149 MagicAtomicNRAddGroupU32Reg(w, gpuDynInst); 150 break; 151 152 case MAGIC_LOAD_GLOBAL_U32_REG: 153 MagicLoadGlobalU32Reg(w, gpuDynInst); 154 break; 155 156 case MAGIC_XACT_CAS_LD: 157 MagicXactCasLd(w); 158 break; 159 160 case MAGIC_MOST_SIG_THD: 161 MagicMostSigThread(w); 162 break; 163 164 case MAGIC_MOST_SIG_BROADCAST: 165 MagicMostSigBroadcast(w); 166 break; 167 168 case MAGIC_PRINT_WFID_32: 169 MagicPrintWF32ID(w); 170 break; 171 172 case MAGIC_PRINT_WFID_64: 173 MagicPrintWFID64(w); 174 break; 175 176 default: fatal("unrecognized magic instruction: %d\n", 
op); 177 } 178 } 179 180 void 181 Call::MagicPrintLane(Wavefront w) 182* { 183 #if TRACING_ON
184 const VectorMask &mask = w->get_pred();	184 const VectorMask &mask = w->getPred();
185 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 186 if (mask[lane]) { 187 int src_val1 = src1.get<int>(w, lane, 1); 188 int src_val2 = src1.get<int>(w, lane, 2); 189 if (src_val2) { 190 DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n", 191 disassemble(), w->computeUnit->cu_id, w->simdId, 192 w->wfSlotId, lane, src_val1); 193 } else { 194 DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n", 195 disassemble(), w->computeUnit->cu_id, w->simdId, 196 w->wfSlotId, lane, src_val1); 197 } 198 } 199 } 200 #endif 201 } 202 203 void 204 Call::MagicPrintLane64(Wavefront w) 205* { 206 #if TRACING_ON	185 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 186 if (mask[lane]) { 187 int src_val1 = src1.get<int>(w, lane, 1); 188 int src_val2 = src1.get<int>(w, lane, 2); 189 if (src_val2) { 190 DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n", 191 disassemble(), w->computeUnit->cu_id, w->simdId, 192 w->wfSlotId, lane, src_val1); 193 } else { 194 DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n", 195 disassemble(), w->computeUnit->cu_id, w->simdId, 196 w->wfSlotId, lane, src_val1); 197 } 198 } 199 } 200 #endif 201 } 202 203 void 204 Call::MagicPrintLane64(Wavefront w) 205* { 206 #if TRACING_ON
207 const VectorMask &mask = w->get_pred();	207 const VectorMask &mask = w->getPred();
208 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 209 if (mask[lane]) { 210 int64_t src_val1 = src1.get<int64_t>(w, lane, 1); 211 int src_val2 = src1.get<int>(w, lane, 2); 212 if (src_val2) { 213 DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n", 214 disassemble(), w->computeUnit->cu_id, w->simdId, 215 w->wfSlotId, lane, src_val1); 216 } else { 217 DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n", 218 disassemble(), w->computeUnit->cu_id, w->simdId, 219 w->wfSlotId, lane, src_val1); 220 } 221 } 222 } 223 #endif 224 } 225 226 void 227 Call::MagicPrintWF32(Wavefront w) 228* { 229 #if TRACING_ON	208 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 209 if (mask[lane]) { 210 int64_t src_val1 = src1.get<int64_t>(w, lane, 1); 211 int src_val2 = src1.get<int>(w, lane, 2); 212 if (src_val2) { 213 DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n", 214 disassemble(), w->computeUnit->cu_id, w->simdId, 215 w->wfSlotId, lane, src_val1); 216 } else { 217 DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n", 218 disassemble(), w->computeUnit->cu_id, w->simdId, 219 w->wfSlotId, lane, src_val1); 220 } 221 } 222 } 223 #endif 224 } 225 226 void 227 Call::MagicPrintWF32(Wavefront w) 228* { 229 #if TRACING_ON
230 const VectorMask &mask = w->get_pred();	230 const VectorMask &mask = w->getPred();
231 std::string res_str; 232 res_str = csprintf("krl_prt (%s)\n", disassemble()); 233 234 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 235 if (!(lane & 7)) { 236 res_str += csprintf("DB%03d: ", (int)w->wfDynId); 237 } 238 239 if (mask[lane]) { 240 int src_val1 = src1.get<int>(w, lane, 1); 241 int src_val2 = src1.get<int>(w, lane, 2); 242 243 if (src_val2) { 244 res_str += csprintf("%08x", src_val1); 245 } else { 246 res_str += csprintf("%08d", src_val1); 247 } 248 } else { 249 res_str += csprintf("xxxxxxxx"); 250 } 251 252 if ((lane & 7) == 7) { 253 res_str += csprintf("\n"); 254 } else { 255 res_str += csprintf(" "); 256 } 257 } 258 259 res_str += "\n\n"; 260 DPRINTFN(res_str.c_str()); 261 #endif 262 } 263 264 void 265 Call::MagicPrintWF32ID(Wavefront w) 266* { 267 #if TRACING_ON	231 std::string res_str; 232 res_str = csprintf("krl_prt (%s)\n", disassemble()); 233 234 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 235 if (!(lane & 7)) { 236 res_str += csprintf("DB%03d: ", (int)w->wfDynId); 237 } 238 239 if (mask[lane]) { 240 int src_val1 = src1.get<int>(w, lane, 1); 241 int src_val2 = src1.get<int>(w, lane, 2); 242 243 if (src_val2) { 244 res_str += csprintf("%08x", src_val1); 245 } else { 246 res_str += csprintf("%08d", src_val1); 247 } 248 } else { 249 res_str += csprintf("xxxxxxxx"); 250 } 251 252 if ((lane & 7) == 7) { 253 res_str += csprintf("\n"); 254 } else { 255 res_str += csprintf(" "); 256 } 257 } 258 259 res_str += "\n\n"; 260 DPRINTFN(res_str.c_str()); 261 #endif 262 } 263 264 void 265 Call::MagicPrintWF32ID(Wavefront w) 266* { 267 #if TRACING_ON
268 const VectorMask &mask = w->get_pred();	268 const VectorMask &mask = w->getPred();
269 std::string res_str; 270 int src_val3 = -1; 271 res_str = csprintf("krl_prt (%s)\n", disassemble()); 272 273 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 274 if (!(lane & 7)) { 275 res_str += csprintf("DB%03d: ", (int)w->wfDynId); 276 } 277 278 if (mask[lane]) { 279 int src_val1 = src1.get<int>(w, lane, 1); 280 int src_val2 = src1.get<int>(w, lane, 2); 281 src_val3 = src1.get<int>(w, lane, 3); 282 283 if (src_val2) { 284 res_str += csprintf("%08x", src_val1); 285 } else { 286 res_str += csprintf("%08d", src_val1); 287 } 288 } else { 289 res_str += csprintf("xxxxxxxx"); 290 } 291 292 if ((lane & 7) == 7) { 293 res_str += csprintf("\n"); 294 } else { 295 res_str += csprintf(" "); 296 } 297 } 298 299 res_str += "\n\n"; 300 if (w->wfDynId == src_val3) { 301 DPRINTFN(res_str.c_str()); 302 } 303 #endif 304 } 305 306 void 307 Call::MagicPrintWF64(Wavefront w) 308* { 309 #if TRACING_ON	269 std::string res_str; 270 int src_val3 = -1; 271 res_str = csprintf("krl_prt (%s)\n", disassemble()); 272 273 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 274 if (!(lane & 7)) { 275 res_str += csprintf("DB%03d: ", (int)w->wfDynId); 276 } 277 278 if (mask[lane]) { 279 int src_val1 = src1.get<int>(w, lane, 1); 280 int src_val2 = src1.get<int>(w, lane, 2); 281 src_val3 = src1.get<int>(w, lane, 3); 282 283 if (src_val2) { 284 res_str += csprintf("%08x", src_val1); 285 } else { 286 res_str += csprintf("%08d", src_val1); 287 } 288 } else { 289 res_str += csprintf("xxxxxxxx"); 290 } 291 292 if ((lane & 7) == 7) { 293 res_str += csprintf("\n"); 294 } else { 295 res_str += csprintf(" "); 296 } 297 } 298 299 res_str += "\n\n"; 300 if (w->wfDynId == src_val3) { 301 DPRINTFN(res_str.c_str()); 302 } 303 #endif 304 } 305 306 void 307 Call::MagicPrintWF64(Wavefront w) 308* { 309 #if TRACING_ON
310 const VectorMask &mask = w->get_pred();	310 const VectorMask &mask = w->getPred();
311 std::string res_str; 312 res_str = csprintf("krl_prt (%s)\n", disassemble()); 313 314 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 315 if (!(lane & 3)) { 316 res_str += csprintf("DB%03d: ", (int)w->wfDynId); 317 } 318 319 if (mask[lane]) { 320 int64_t src_val1 = src1.get<int64_t>(w, lane, 1); 321 int src_val2 = src1.get<int>(w, lane, 2); 322 323 if (src_val2) { 324 res_str += csprintf("%016x", src_val1); 325 } else { 326 res_str += csprintf("%016d", src_val1); 327 } 328 } else { 329 res_str += csprintf("xxxxxxxxxxxxxxxx"); 330 } 331 332 if ((lane & 3) == 3) { 333 res_str += csprintf("\n"); 334 } else { 335 res_str += csprintf(" "); 336 } 337 } 338 339 res_str += "\n\n"; 340 DPRINTFN(res_str.c_str()); 341 #endif 342 } 343 344 void 345 Call::MagicPrintWFID64(Wavefront w) 346* { 347 #if TRACING_ON	311 std::string res_str; 312 res_str = csprintf("krl_prt (%s)\n", disassemble()); 313 314 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 315 if (!(lane & 3)) { 316 res_str += csprintf("DB%03d: ", (int)w->wfDynId); 317 } 318 319 if (mask[lane]) { 320 int64_t src_val1 = src1.get<int64_t>(w, lane, 1); 321 int src_val2 = src1.get<int>(w, lane, 2); 322 323 if (src_val2) { 324 res_str += csprintf("%016x", src_val1); 325 } else { 326 res_str += csprintf("%016d", src_val1); 327 } 328 } else { 329 res_str += csprintf("xxxxxxxxxxxxxxxx"); 330 } 331 332 if ((lane & 3) == 3) { 333 res_str += csprintf("\n"); 334 } else { 335 res_str += csprintf(" "); 336 } 337 } 338 339 res_str += "\n\n"; 340 DPRINTFN(res_str.c_str()); 341 #endif 342 } 343 344 void 345 Call::MagicPrintWFID64(Wavefront w) 346* { 347 #if TRACING_ON
348 const VectorMask &mask = w->get_pred();	348 const VectorMask &mask = w->getPred();
349 std::string res_str; 350 int src_val3 = -1; 351 res_str = csprintf("krl_prt (%s)\n", disassemble()); 352 353 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 354 if (!(lane & 3)) { 355 res_str += csprintf("DB%03d: ", (int)w->wfDynId); 356 } 357 358 if (mask[lane]) { 359 int64_t src_val1 = src1.get<int64_t>(w, lane, 1); 360 int src_val2 = src1.get<int>(w, lane, 2); 361 src_val3 = src1.get<int>(w, lane, 3); 362 363 if (src_val2) { 364 res_str += csprintf("%016x", src_val1); 365 } else { 366 res_str += csprintf("%016d", src_val1); 367 } 368 } else { 369 res_str += csprintf("xxxxxxxxxxxxxxxx"); 370 } 371 372 if ((lane & 3) == 3) { 373 res_str += csprintf("\n"); 374 } else { 375 res_str += csprintf(" "); 376 } 377 } 378 379 res_str += "\n\n"; 380 if (w->wfDynId == src_val3) { 381 DPRINTFN(res_str.c_str()); 382 } 383 #endif 384 } 385 386 void 387 Call::MagicPrintWFFloat(Wavefront w) 388* { 389 #if TRACING_ON	349 std::string res_str; 350 int src_val3 = -1; 351 res_str = csprintf("krl_prt (%s)\n", disassemble()); 352 353 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 354 if (!(lane & 3)) { 355 res_str += csprintf("DB%03d: ", (int)w->wfDynId); 356 } 357 358 if (mask[lane]) { 359 int64_t src_val1 = src1.get<int64_t>(w, lane, 1); 360 int src_val2 = src1.get<int>(w, lane, 2); 361 src_val3 = src1.get<int>(w, lane, 3); 362 363 if (src_val2) { 364 res_str += csprintf("%016x", src_val1); 365 } else { 366 res_str += csprintf("%016d", src_val1); 367 } 368 } else { 369 res_str += csprintf("xxxxxxxxxxxxxxxx"); 370 } 371 372 if ((lane & 3) == 3) { 373 res_str += csprintf("\n"); 374 } else { 375 res_str += csprintf(" "); 376 } 377 } 378 379 res_str += "\n\n"; 380 if (w->wfDynId == src_val3) { 381 DPRINTFN(res_str.c_str()); 382 } 383 #endif 384 } 385 386 void 387 Call::MagicPrintWFFloat(Wavefront w) 388* { 389 #if TRACING_ON
390 const VectorMask &mask = w->get_pred();	390 const VectorMask &mask = w->getPred();
391 std::string res_str; 392 res_str = csprintf("krl_prt (%s)\n", disassemble()); 393 394 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 395 if (!(lane & 7)) { 396 res_str += csprintf("DB%03d: ", (int)w->wfDynId); 397 } 398 399 if (mask[lane]) { 400 float src_val1 = src1.get<float>(w, lane, 1); 401 res_str += csprintf("%08f", src_val1); 402 } else { 403 res_str += csprintf("xxxxxxxx"); 404 } 405 406 if ((lane & 7) == 7) { 407 res_str += csprintf("\n"); 408 } else { 409 res_str += csprintf(" "); 410 } 411 } 412 413 res_str += "\n\n"; 414 DPRINTFN(res_str.c_str()); 415 #endif 416 } 417 418 // raises a signal that GDB will catch 419 // when done with the break, type "signal 0" in gdb to continue 420 void 421 Call::MagicSimBreak(Wavefront w) 422* { 423 std::string res_str; 424 // print out state for this wavefront and then break 425 res_str = csprintf("Breakpoint encountered for wavefront %i\n", 426 w->wfSlotId); 427	391 std::string res_str; 392 res_str = csprintf("krl_prt (%s)\n", disassemble()); 393 394 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 395 if (!(lane & 7)) { 396 res_str += csprintf("DB%03d: ", (int)w->wfDynId); 397 } 398 399 if (mask[lane]) { 400 float src_val1 = src1.get<float>(w, lane, 1); 401 res_str += csprintf("%08f", src_val1); 402 } else { 403 res_str += csprintf("xxxxxxxx"); 404 } 405 406 if ((lane & 7) == 7) { 407 res_str += csprintf("\n"); 408 } else { 409 res_str += csprintf(" "); 410 } 411 } 412 413 res_str += "\n\n"; 414 DPRINTFN(res_str.c_str()); 415 #endif 416 } 417 418 // raises a signal that GDB will catch 419 // when done with the break, type "signal 0" in gdb to continue 420 void 421 Call::MagicSimBreak(Wavefront w) 422* { 423 std::string res_str; 424 // print out state for this wavefront and then break 425 res_str = csprintf("Breakpoint encountered for wavefront %i\n", 426 w->wfSlotId); 427
428 res_str += csprintf(" Kern ID: %i\n", w->kern_id);	428 res_str += csprintf(" Kern ID: %i\n", w->kernId);
429 res_str += csprintf(" Phase ID: %i\n", w->simdId); 430 res_str += csprintf(" Executing on CU #%i\n", w->computeUnit->cu_id); 431 res_str += csprintf(" Exec mask: "); 432 433 for (int i = w->computeUnit->wfSize() - 1; i >= 0; --i) { 434 if (w->execMask(i)) 435 res_str += "1"; 436 else 437 res_str += "0"; 438 439 if ((i & 7) == 7) 440 res_str += " "; 441 } 442 443 res_str += csprintf("(0x%016llx)\n", w->execMask().to_ullong()); 444 445 res_str += "\nHelpful debugging hints:\n"; 446 res_str += " Check out w->s_reg / w->d_reg for register state\n"; 447 448 res_str += "\n\n"; 449 DPRINTFN(res_str.c_str()); 450 fflush(stdout); 451 452 raise(SIGTRAP); 453 } 454 455 void 456 Call::MagicPrefixSum(Wavefront w) 457* {	429 res_str += csprintf(" Phase ID: %i\n", w->simdId); 430 res_str += csprintf(" Executing on CU #%i\n", w->computeUnit->cu_id); 431 res_str += csprintf(" Exec mask: "); 432 433 for (int i = w->computeUnit->wfSize() - 1; i >= 0; --i) { 434 if (w->execMask(i)) 435 res_str += "1"; 436 else 437 res_str += "0"; 438 439 if ((i & 7) == 7) 440 res_str += " "; 441 } 442 443 res_str += csprintf("(0x%016llx)\n", w->execMask().to_ullong()); 444 445 res_str += "\nHelpful debugging hints:\n"; 446 res_str += " Check out w->s_reg / w->d_reg for register state\n"; 447 448 res_str += "\n\n"; 449 DPRINTFN(res_str.c_str()); 450 fflush(stdout); 451 452 raise(SIGTRAP); 453 } 454 455 void 456 Call::MagicPrefixSum(Wavefront w) 457* {
458 const VectorMask &mask = w->get_pred();	458 const VectorMask &mask = w->getPred();
459 int res = 0; 460 461 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 462 if (mask[lane]) { 463 int src_val1 = src1.get<int>(w, lane, 1); 464 dest.set<int>(w, lane, res); 465 res += src_val1; 466 } 467 } 468 } 469 470 void 471 Call::MagicReduction(Wavefront w) 472* { 473 // reduction magic instruction 474 // The reduction instruction takes up to 64 inputs (one from 475 // each thread in a WF) and sums them. It returns the sum to 476 // each thread in the WF.	459 int res = 0; 460 461 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 462 if (mask[lane]) { 463 int src_val1 = src1.get<int>(w, lane, 1); 464 dest.set<int>(w, lane, res); 465 res += src_val1; 466 } 467 } 468 } 469 470 void 471 Call::MagicReduction(Wavefront w) 472* { 473 // reduction magic instruction 474 // The reduction instruction takes up to 64 inputs (one from 475 // each thread in a WF) and sums them. It returns the sum to 476 // each thread in the WF.
477 const VectorMask &mask = w->get_pred();	477 const VectorMask &mask = w->getPred();
478 int res = 0; 479 480 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 481 if (mask[lane]) { 482 int src_val1 = src1.get<int>(w, lane, 1); 483 res += src_val1; 484 } 485 } 486 487 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 488 if (mask[lane]) { 489 dest.set<int>(w, lane, res); 490 } 491 } 492 } 493 494 void 495 Call::MagicMaskLower(Wavefront w) 496* {	478 int res = 0; 479 480 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 481 if (mask[lane]) { 482 int src_val1 = src1.get<int>(w, lane, 1); 483 res += src_val1; 484 } 485 } 486 487 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 488 if (mask[lane]) { 489 dest.set<int>(w, lane, res); 490 } 491 } 492 } 493 494 void 495 Call::MagicMaskLower(Wavefront w) 496* {
497 const VectorMask &mask = w->get_pred();	497 const VectorMask &mask = w->getPred();
498 int res = 0; 499 500 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 501 if (mask[lane]) { 502 int src_val1 = src1.get<int>(w, lane, 1); 503 504 if (src_val1) { 505 if (lane < (w->computeUnit->wfSize()/2)) { 506 res = res \| ((uint32_t)(1) << lane); 507 } 508 } 509 } 510 } 511 512 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 513 if (mask[lane]) { 514 dest.set<int>(w, lane, res); 515 } 516 } 517 } 518 519 void 520 Call::MagicMaskUpper(Wavefront w) 521* {	498 int res = 0; 499 500 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 501 if (mask[lane]) { 502 int src_val1 = src1.get<int>(w, lane, 1); 503 504 if (src_val1) { 505 if (lane < (w->computeUnit->wfSize()/2)) { 506 res = res \| ((uint32_t)(1) << lane); 507 } 508 } 509 } 510 } 511 512 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 513 if (mask[lane]) { 514 dest.set<int>(w, lane, res); 515 } 516 } 517 } 518 519 void 520 Call::MagicMaskUpper(Wavefront w) 521* {
522 const VectorMask &mask = w->get_pred();	522 const VectorMask &mask = w->getPred();
523 int res = 0; 524 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 525 if (mask[lane]) { 526 int src_val1 = src1.get<int>(w, lane, 1); 527 528 if (src_val1) { 529 if (lane >= (w->computeUnit->wfSize()/2)) { 530 res = res \| ((uint32_t)(1) << 531 (lane - (w->computeUnit->wfSize()/2))); 532 } 533 } 534 } 535 } 536 537 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 538 if (mask[lane]) { 539 dest.set<int>(w, lane, res); 540 } 541 } 542 } 543 544 void 545 Call::MagicJoinWFBar(Wavefront w) 546* {	523 int res = 0; 524 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 525 if (mask[lane]) { 526 int src_val1 = src1.get<int>(w, lane, 1); 527 528 if (src_val1) { 529 if (lane >= (w->computeUnit->wfSize()/2)) { 530 res = res \| ((uint32_t)(1) << 531 (lane - (w->computeUnit->wfSize()/2))); 532 } 533 } 534 } 535 } 536 537 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 538 if (mask[lane]) { 539 dest.set<int>(w, lane, res); 540 } 541 } 542 } 543 544 void 545 Call::MagicJoinWFBar(Wavefront w) 546* {
547 const VectorMask &mask = w->get_pred();	547 const VectorMask &mask = w->getPred();
548 int max_cnt = 0; 549 550 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 551 if (mask[lane]) {	548 int max_cnt = 0; 549 550 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 551 if (mask[lane]) {
552 w->bar_cnt[lane]++;	552 w->barCnt[lane]++;
553	553
554 if (w->bar_cnt[lane] > max_cnt) { 555 max_cnt = w->bar_cnt[lane];	554 if (w->barCnt[lane] > max_cnt) { 555 max_cnt = w->barCnt[lane];
556 } 557 } 558 } 559	556 } 557 } 558 } 559
560 if (max_cnt > w->max_bar_cnt) { 561 w->max_bar_cnt = max_cnt;	560 if (max_cnt > w->maxBarCnt) { 561 w->maxBarCnt = max_cnt;
562 } 563 } 564 565 void 566 Call::MagicWaitWFBar(Wavefront w) 567* {	562 } 563 } 564 565 void 566 Call::MagicWaitWFBar(Wavefront w) 567* {
568 const VectorMask &mask = w->get_pred();	568 const VectorMask &mask = w->getPred();
569 int max_cnt = 0; 570 571 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 572 if (mask[lane]) {	569 int max_cnt = 0; 570 571 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 572 if (mask[lane]) {
573 w->bar_cnt[lane]--;	573 w->barCnt[lane]--;
574 } 575	574 } 575
576 if (w->bar_cnt[lane] > max_cnt) { 577 max_cnt = w->bar_cnt[lane];	576 if (w->barCnt[lane] > max_cnt) { 577 max_cnt = w->barCnt[lane];
578 } 579 } 580	578 } 579 } 580
581 if (max_cnt < w->max_bar_cnt) { 582 w->max_bar_cnt = max_cnt;	581 if (max_cnt < w->maxBarCnt) { 582 w->maxBarCnt = max_cnt;
583 } 584 585 w->instructionBuffer.erase(w->instructionBuffer.begin() + 1, 586 w->instructionBuffer.end()); 587 if (w->pendingFetch) 588 w->dropFetch = true; 589 } 590 591 void 592 Call::MagicPanic(Wavefront w) 593* {	583 } 584 585 w->instructionBuffer.erase(w->instructionBuffer.begin() + 1, 586 w->instructionBuffer.end()); 587 if (w->pendingFetch) 588 w->dropFetch = true; 589 } 590 591 void 592 Call::MagicPanic(Wavefront w) 593* {
594 const VectorMask &mask = w->get_pred();	594 const VectorMask &mask = w->getPred();
595 596 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 597 if (mask[lane]) { 598 int src_val1 = src1.get<int>(w, lane, 1); 599 panic("OpenCL Code failed assertion #%d. Triggered by lane %s", 600 src_val1, lane); 601 } 602 } 603 } 604 605 void 606 Call::calcAddr(Wavefront w, GPUDynInstPtr m) 607* { 608 // the address is in src1 \| src2 609 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 610 int src_val1 = src1.get<int>(w, lane, 1); 611 int src_val2 = src1.get<int>(w, lane, 2); 612 Addr addr = (((Addr) src_val1) << 32) \| ((Addr) src_val2); 613 614 m->addr[lane] = addr; 615 } 616 617 } 618 619 void 620 Call::MagicAtomicNRAddGlobalU32Reg(Wavefront w, GPUDynInstPtr gpuDynInst) 621* { 622 GPUDynInstPtr m = gpuDynInst; 623 624 calcAddr(w, m); 625 626 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 627 ((int)m->a_data)[lane] = src1.get<int>(w, lane, 3); 628* } 629 630 m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET, 631 Brig::BRIG_ATOMIC_ADD); 632 m->m_type = U32::memType; 633 m->v_type = U32::vgprType; 634 635 m->exec_mask = w->execMask(); 636 m->statusBitVector = 0; 637 m->equiv = 0; // atomics don't have an equivalence class operand 638 m->n_reg = 1; 639 m->memoryOrder = Enums::MEMORY_ORDER_NONE; 640 m->scope = Enums::MEMORY_SCOPE_NONE; 641 642 m->simdId = w->simdId; 643 m->wfSlotId = w->wfSlotId; 644 m->wfDynId = w->wfDynId; 645 m->latency.init(&w->computeUnit->shader->tick_cnt); 646 647 m->s_type = SEG_GLOBAL; 648 m->pipeId = GLBMEM_PIPE; 649 m->latency.set(w->computeUnit->shader->ticks(64)); 650 w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);	595 596 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 597 if (mask[lane]) { 598 int src_val1 = src1.get<int>(w, lane, 1); 599 panic("OpenCL Code failed assertion #%d. 
Triggered by lane %s", 600 src_val1, lane); 601 } 602 } 603 } 604 605 void 606 Call::calcAddr(Wavefront w, GPUDynInstPtr m) 607* { 608 // the address is in src1 \| src2 609 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 610 int src_val1 = src1.get<int>(w, lane, 1); 611 int src_val2 = src1.get<int>(w, lane, 2); 612 Addr addr = (((Addr) src_val1) << 32) \| ((Addr) src_val2); 613 614 m->addr[lane] = addr; 615 } 616 617 } 618 619 void 620 Call::MagicAtomicNRAddGlobalU32Reg(Wavefront w, GPUDynInstPtr gpuDynInst) 621* { 622 GPUDynInstPtr m = gpuDynInst; 623 624 calcAddr(w, m); 625 626 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 627 ((int)m->a_data)[lane] = src1.get<int>(w, lane, 3); 628* } 629 630 m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET, 631 Brig::BRIG_ATOMIC_ADD); 632 m->m_type = U32::memType; 633 m->v_type = U32::vgprType; 634 635 m->exec_mask = w->execMask(); 636 m->statusBitVector = 0; 637 m->equiv = 0; // atomics don't have an equivalence class operand 638 m->n_reg = 1; 639 m->memoryOrder = Enums::MEMORY_ORDER_NONE; 640 m->scope = Enums::MEMORY_SCOPE_NONE; 641 642 m->simdId = w->simdId; 643 m->wfSlotId = w->wfSlotId; 644 m->wfDynId = w->wfDynId; 645 m->latency.init(&w->computeUnit->shader->tick_cnt); 646 647 m->s_type = SEG_GLOBAL; 648 m->pipeId = GLBMEM_PIPE; 649 m->latency.set(w->computeUnit->shader->ticks(64)); 650 w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
651 w->outstanding_reqs_wr_gm++; 652 w->wr_gm_reqs_in_pipe--; 653 w->outstanding_reqs_rd_gm++; 654 w->rd_gm_reqs_in_pipe--; 655 w->outstanding_reqs++; 656 w->mem_reqs_in_pipe--;	651 w->outstandingReqsWrGm++; 652 w->wrGmReqsInPipe--; 653 w->outstandingReqsRdGm++; 654 w->rdGmReqsInPipe--; 655 w->outstandingReqs++; 656 w->memReqsInPipe--;
657 } 658 659 void 660 Call::MagicAtomicNRAddGroupU32Reg(Wavefront w, GPUDynInstPtr gpuDynInst) 661* { 662 GPUDynInstPtr m = gpuDynInst; 663 calcAddr(w, m); 664 665 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 666 ((int)m->a_data)[lane] = src1.get<int>(w, lane, 1); 667* } 668 669 m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET, 670 Brig::BRIG_ATOMIC_ADD); 671 m->m_type = U32::memType; 672 m->v_type = U32::vgprType; 673 674 m->exec_mask = w->execMask(); 675 m->statusBitVector = 0; 676 m->equiv = 0; // atomics don't have an equivalence class operand 677 m->n_reg = 1; 678 m->memoryOrder = Enums::MEMORY_ORDER_NONE; 679 m->scope = Enums::MEMORY_SCOPE_NONE; 680 681 m->simdId = w->simdId; 682 m->wfSlotId = w->wfSlotId; 683 m->wfDynId = w->wfDynId; 684 m->latency.init(&w->computeUnit->shader->tick_cnt); 685 686 m->s_type = SEG_GLOBAL; 687 m->pipeId = GLBMEM_PIPE; 688 m->latency.set(w->computeUnit->shader->ticks(64)); 689 w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);	657 } 658 659 void 660 Call::MagicAtomicNRAddGroupU32Reg(Wavefront w, GPUDynInstPtr gpuDynInst) 661* { 662 GPUDynInstPtr m = gpuDynInst; 663 calcAddr(w, m); 664 665 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 666 ((int)m->a_data)[lane] = src1.get<int>(w, lane, 1); 667* } 668 669 m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET, 670 Brig::BRIG_ATOMIC_ADD); 671 m->m_type = U32::memType; 672 m->v_type = U32::vgprType; 673 674 m->exec_mask = w->execMask(); 675 m->statusBitVector = 0; 676 m->equiv = 0; // atomics don't have an equivalence class operand 677 m->n_reg = 1; 678 m->memoryOrder = Enums::MEMORY_ORDER_NONE; 679 m->scope = Enums::MEMORY_SCOPE_NONE; 680 681 m->simdId = w->simdId; 682 m->wfSlotId = w->wfSlotId; 683 m->wfDynId = w->wfDynId; 684 m->latency.init(&w->computeUnit->shader->tick_cnt); 685 686 m->s_type = SEG_GLOBAL; 687 m->pipeId = GLBMEM_PIPE; 688 m->latency.set(w->computeUnit->shader->ticks(64)); 689 
w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
690 w->outstanding_reqs_wr_gm++; 691 w->wr_gm_reqs_in_pipe--; 692 w->outstanding_reqs_rd_gm++; 693 w->rd_gm_reqs_in_pipe--; 694 w->outstanding_reqs++; 695 w->mem_reqs_in_pipe--;	690 w->outstandingReqsWrGm++; 691 w->wrGmReqsInPipe--; 692 w->outstandingReqsRdGm++; 693 w->rdGmReqsInPipe--; 694 w->outstandingReqs++; 695 w->memReqsInPipe--;
696 } 697 698 void 699 Call::MagicLoadGlobalU32Reg(Wavefront w, GPUDynInstPtr gpuDynInst) 700* { 701 GPUDynInstPtr m = gpuDynInst; 702 // calculate the address 703 calcAddr(w, m); 704 705 m->m_op = Enums::MO_LD; 706 m->m_type = U32::memType; //MemDataType::memType; 707 m->v_type = U32::vgprType; //DestDataType::vgprType; 708 709 m->exec_mask = w->execMask(); 710 m->statusBitVector = 0; 711 m->equiv = 0; 712 m->n_reg = 1; 713 m->memoryOrder = Enums::MEMORY_ORDER_NONE; 714 m->scope = Enums::MEMORY_SCOPE_NONE; 715 716 // FIXME 717 //m->dst_reg = this->dest.regIndex(); 718 719 m->simdId = w->simdId; 720 m->wfSlotId = w->wfSlotId; 721 m->wfDynId = w->wfDynId; 722 m->latency.init(&w->computeUnit->shader->tick_cnt); 723 724 m->s_type = SEG_GLOBAL; 725 m->pipeId = GLBMEM_PIPE; 726 m->latency.set(w->computeUnit->shader->ticks(1)); 727 w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);	696 } 697 698 void 699 Call::MagicLoadGlobalU32Reg(Wavefront w, GPUDynInstPtr gpuDynInst) 700* { 701 GPUDynInstPtr m = gpuDynInst; 702 // calculate the address 703 calcAddr(w, m); 704 705 m->m_op = Enums::MO_LD; 706 m->m_type = U32::memType; //MemDataType::memType; 707 m->v_type = U32::vgprType; //DestDataType::vgprType; 708 709 m->exec_mask = w->execMask(); 710 m->statusBitVector = 0; 711 m->equiv = 0; 712 m->n_reg = 1; 713 m->memoryOrder = Enums::MEMORY_ORDER_NONE; 714 m->scope = Enums::MEMORY_SCOPE_NONE; 715 716 // FIXME 717 //m->dst_reg = this->dest.regIndex(); 718 719 m->simdId = w->simdId; 720 m->wfSlotId = w->wfSlotId; 721 m->wfDynId = w->wfDynId; 722 m->latency.init(&w->computeUnit->shader->tick_cnt); 723 724 m->s_type = SEG_GLOBAL; 725 m->pipeId = GLBMEM_PIPE; 726 m->latency.set(w->computeUnit->shader->ticks(1)); 727 w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
728 w->outstanding_reqs_rd_gm++; 729 w->rd_gm_reqs_in_pipe--; 730 w->outstanding_reqs++; 731 w->mem_reqs_in_pipe--;	728 w->outstandingReqsRdGm++; 729 w->rdGmReqsInPipe--; 730 w->outstandingReqs++; 731 w->memReqsInPipe--;
732 } 733 734 void 735 Call::MagicXactCasLd(Wavefront w) 736* {	732 } 733 734 void 735 Call::MagicXactCasLd(Wavefront w) 736* {
737 const VectorMask &mask = w->get_pred();	737 const VectorMask &mask = w->getPred();
738 int src_val1 = 0; 739 740 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 741 if (mask[lane]) { 742 src_val1 = src1.get<int>(w, lane, 1); 743 break; 744 } 745 } 746 747 if (!w->computeUnit->xactCasLoadMap.count(src_val1)) { 748 w->computeUnit->xactCasLoadMap[src_val1] = ComputeUnit::waveQueue(); 749 w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue.clear(); 750 } 751 752 w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue 753 .push_back(ComputeUnit::waveIdentifier(w->simdId, w->wfSlotId)); 754 } 755 756 void 757 Call::MagicMostSigThread(Wavefront w) 758* {	738 int src_val1 = 0; 739 740 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) { 741 if (mask[lane]) { 742 src_val1 = src1.get<int>(w, lane, 1); 743 break; 744 } 745 } 746 747 if (!w->computeUnit->xactCasLoadMap.count(src_val1)) { 748 w->computeUnit->xactCasLoadMap[src_val1] = ComputeUnit::waveQueue(); 749 w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue.clear(); 750 } 751 752 w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue 753 .push_back(ComputeUnit::waveIdentifier(w->simdId, w->wfSlotId)); 754 } 755 756 void 757 Call::MagicMostSigThread(Wavefront w) 758* {
759 const VectorMask &mask = w->get_pred();	759 const VectorMask &mask = w->getPred();
760 unsigned mst = true; 761 762 for (int lane = w->computeUnit->wfSize() - 1; lane >= 0; --lane) { 763 if (mask[lane]) { 764 dest.set<int>(w, lane, mst); 765 mst = false; 766 } 767 } 768 } 769 770 void 771 Call::MagicMostSigBroadcast(Wavefront w) 772* {	760 unsigned mst = true; 761 762 for (int lane = w->computeUnit->wfSize() - 1; lane >= 0; --lane) { 763 if (mask[lane]) { 764 dest.set<int>(w, lane, mst); 765 mst = false; 766 } 767 } 768 } 769 770 void 771 Call::MagicMostSigBroadcast(Wavefront w) 772* {
773 const VectorMask &mask = w->get_pred();	773 const VectorMask &mask = w->getPred();
774 int res = 0; 775 bool got_res = false; 776 777 for (int lane = w->computeUnit->wfSize() - 1; lane >= 0; --lane) { 778 if (mask[lane]) { 779 if (!got_res) { 780 res = src1.get<int>(w, lane, 1); 781 got_res = true; 782 } 783 dest.set<int>(w, lane, res); 784 } 785 } 786 } 787 788} // namespace HsailISA	774 int res = 0; 775 bool got_res = false; 776 777 for (int lane = w->computeUnit->wfSize() - 1; lane >= 0; --lane) { 778 if (mask[lane]) { 779 if (!got_res) { 780 res = src1.get<int>(w, lane, 1); 781 got_res = true; 782 } 783 dest.set<int>(w, lane, res); 784 } 785 } 786 } 787 788} // namespace HsailISA

1/*
2 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: Marc Orr
34 */
35
36#include <csignal>
37
38#include "arch/hsail/insts/decl.hh"
39#include "arch/hsail/insts/mem.hh"
40
41namespace HsailISA
42{
43 // Pseudo (or magic) instructions are overloaded on the hsail call
44 // instruction, because of its flexible parameter signature.
45
46 // To add a new magic instruction:
47 // 1. Add an entry to the enum.
48 // 2. Implement it in the switch statement below (Call::exec).
49 // 3. Add a utility function to hsa/hsail-gpu-compute/util/magicinst.h,
50 // so it's easy to call from an OpenCL kernel.
51
52 // This enum should be identical to the enum in
53 // hsa/hsail-gpu-compute/util/magicinst.h
54 enum
55 {
56 MAGIC_PRINT_WF_32 = 0,
57 MAGIC_PRINT_WF_64,
58 MAGIC_PRINT_LANE,
59 MAGIC_PRINT_LANE_64,
60 MAGIC_PRINT_WF_FLOAT,
61 MAGIC_SIM_BREAK,
62 MAGIC_PREF_SUM,
63 MAGIC_REDUCTION,
64 MAGIC_MASKLANE_LOWER,
65 MAGIC_MASKLANE_UPPER,
66 MAGIC_JOIN_WF_BAR,
67 MAGIC_WAIT_WF_BAR,
68 MAGIC_PANIC,
69 MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG,
70 MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG,
71 MAGIC_LOAD_GLOBAL_U32_REG,
72 MAGIC_XACT_CAS_LD,
73 MAGIC_MOST_SIG_THD,
74 MAGIC_MOST_SIG_BROADCAST,
75 MAGIC_PRINT_WFID_32,
76 MAGIC_PRINT_WFID_64
77 };
78
79 void
80 Call::execPseudoInst(Wavefront *w, GPUDynInstPtr gpuDynInst)
81 {

82 const VectorMask &mask = w->get_pred();

82 const VectorMask &mask = w->getPred();

83
84 int op = 0;
85 bool got_op = false;
86
87 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
88 if (mask[lane]) {
89 int src_val0 = src1.get<int>(w, lane, 0);
90 if (got_op) {
91 if (src_val0 != op) {
92 fatal("Multiple magic instructions per PC not "
93 "supported\n");
94 }
95 } else {
96 op = src_val0;
97 got_op = true;
98 }
99 }
100 }
101
102 switch(op) {
103 case MAGIC_PRINT_WF_32:
104 MagicPrintWF32(w);
105 break;
106 case MAGIC_PRINT_WF_64:
107 MagicPrintWF64(w);
108 break;
109 case MAGIC_PRINT_LANE:
110 MagicPrintLane(w);
111 break;
112 case MAGIC_PRINT_LANE_64:
113 MagicPrintLane64(w);
114 break;
115 case MAGIC_PRINT_WF_FLOAT:
116 MagicPrintWFFloat(w);
117 break;
118 case MAGIC_SIM_BREAK:
119 MagicSimBreak(w);
120 break;
121 case MAGIC_PREF_SUM:
122 MagicPrefixSum(w);
123 break;
124 case MAGIC_REDUCTION:
125 MagicReduction(w);
126 break;
127 case MAGIC_MASKLANE_LOWER:
128 MagicMaskLower(w);
129 break;
130 case MAGIC_MASKLANE_UPPER:
131 MagicMaskUpper(w);
132 break;
133 case MAGIC_JOIN_WF_BAR:
134 MagicJoinWFBar(w);
135 break;
136 case MAGIC_WAIT_WF_BAR:
137 MagicWaitWFBar(w);
138 break;
139 case MAGIC_PANIC:
140 MagicPanic(w);
141 break;
142
143 // atomic instructions
144 case MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG:
145 MagicAtomicNRAddGlobalU32Reg(w, gpuDynInst);
146 break;
147
148 case MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG:
149 MagicAtomicNRAddGroupU32Reg(w, gpuDynInst);
150 break;
151
152 case MAGIC_LOAD_GLOBAL_U32_REG:
153 MagicLoadGlobalU32Reg(w, gpuDynInst);
154 break;
155
156 case MAGIC_XACT_CAS_LD:
157 MagicXactCasLd(w);
158 break;
159
160 case MAGIC_MOST_SIG_THD:
161 MagicMostSigThread(w);
162 break;
163
164 case MAGIC_MOST_SIG_BROADCAST:
165 MagicMostSigBroadcast(w);
166 break;
167
168 case MAGIC_PRINT_WFID_32:
169 MagicPrintWF32ID(w);
170 break;
171
172 case MAGIC_PRINT_WFID_64:
173 MagicPrintWFID64(w);
174 break;
175
176 default: fatal("unrecognized magic instruction: %d\n", op);
177 }
178 }
179
180 void
181 Call::MagicPrintLane(Wavefront *w)
182 {
183 #if TRACING_ON

184 const VectorMask &mask = w->get_pred();

184 const VectorMask &mask = w->getPred();

185 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
186 if (mask[lane]) {
187 int src_val1 = src1.get<int>(w, lane, 1);
188 int src_val2 = src1.get<int>(w, lane, 2);
189 if (src_val2) {
190 DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
191 disassemble(), w->computeUnit->cu_id, w->simdId,
192 w->wfSlotId, lane, src_val1);
193 } else {
194 DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
195 disassemble(), w->computeUnit->cu_id, w->simdId,
196 w->wfSlotId, lane, src_val1);
197 }
198 }
199 }
200 #endif
201 }
202
203 void
204 Call::MagicPrintLane64(Wavefront *w)
205 {
206 #if TRACING_ON

207 const VectorMask &mask = w->get_pred();

207 const VectorMask &mask = w->getPred();

208 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
209 if (mask[lane]) {
210 int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
211 int src_val2 = src1.get<int>(w, lane, 2);
212 if (src_val2) {
213 DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
214 disassemble(), w->computeUnit->cu_id, w->simdId,
215 w->wfSlotId, lane, src_val1);
216 } else {
217 DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
218 disassemble(), w->computeUnit->cu_id, w->simdId,
219 w->wfSlotId, lane, src_val1);
220 }
221 }
222 }
223 #endif
224 }
225
226 void
227 Call::MagicPrintWF32(Wavefront *w)
228 {
229 #if TRACING_ON

230 const VectorMask &mask = w->get_pred();

230 const VectorMask &mask = w->getPred();

231 std::string res_str;
232 res_str = csprintf("krl_prt (%s)\n", disassemble());
233
234 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
235 if (!(lane & 7)) {
236 res_str += csprintf("DB%03d: ", (int)w->wfDynId);
237 }
238
239 if (mask[lane]) {
240 int src_val1 = src1.get<int>(w, lane, 1);
241 int src_val2 = src1.get<int>(w, lane, 2);
242
243 if (src_val2) {
244 res_str += csprintf("%08x", src_val1);
245 } else {
246 res_str += csprintf("%08d", src_val1);
247 }
248 } else {
249 res_str += csprintf("xxxxxxxx");
250 }
251
252 if ((lane & 7) == 7) {
253 res_str += csprintf("\n");
254 } else {
255 res_str += csprintf(" ");
256 }
257 }
258
259 res_str += "\n\n";
260 DPRINTFN(res_str.c_str());
261 #endif
262 }
263
264 void
265 Call::MagicPrintWF32ID(Wavefront *w)
266 {
267 #if TRACING_ON

268 const VectorMask &mask = w->get_pred();

268 const VectorMask &mask = w->getPred();

269 std::string res_str;
270 int src_val3 = -1;
271 res_str = csprintf("krl_prt (%s)\n", disassemble());
272
273 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
274 if (!(lane & 7)) {
275 res_str += csprintf("DB%03d: ", (int)w->wfDynId);
276 }
277
278 if (mask[lane]) {
279 int src_val1 = src1.get<int>(w, lane, 1);
280 int src_val2 = src1.get<int>(w, lane, 2);
281 src_val3 = src1.get<int>(w, lane, 3);
282
283 if (src_val2) {
284 res_str += csprintf("%08x", src_val1);
285 } else {
286 res_str += csprintf("%08d", src_val1);
287 }
288 } else {
289 res_str += csprintf("xxxxxxxx");
290 }
291
292 if ((lane & 7) == 7) {
293 res_str += csprintf("\n");
294 } else {
295 res_str += csprintf(" ");
296 }
297 }
298
299 res_str += "\n\n";
300 if (w->wfDynId == src_val3) {
301 DPRINTFN(res_str.c_str());
302 }
303 #endif
304 }
305
306 void
307 Call::MagicPrintWF64(Wavefront *w)
308 {
309 #if TRACING_ON

310 const VectorMask &mask = w->get_pred();

310 const VectorMask &mask = w->getPred();

311 std::string res_str;
312 res_str = csprintf("krl_prt (%s)\n", disassemble());
313
314 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
315 if (!(lane & 3)) {
316 res_str += csprintf("DB%03d: ", (int)w->wfDynId);
317 }
318
319 if (mask[lane]) {
320 int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
321 int src_val2 = src1.get<int>(w, lane, 2);
322
323 if (src_val2) {
324 res_str += csprintf("%016x", src_val1);
325 } else {
326 res_str += csprintf("%016d", src_val1);
327 }
328 } else {
329 res_str += csprintf("xxxxxxxxxxxxxxxx");
330 }
331
332 if ((lane & 3) == 3) {
333 res_str += csprintf("\n");
334 } else {
335 res_str += csprintf(" ");
336 }
337 }
338
339 res_str += "\n\n";
340 DPRINTFN(res_str.c_str());
341 #endif
342 }
343
344 void
345 Call::MagicPrintWFID64(Wavefront *w)
346 {
347 #if TRACING_ON

348 const VectorMask &mask = w->get_pred();

348 const VectorMask &mask = w->getPred();

349 std::string res_str;
350 int src_val3 = -1;
351 res_str = csprintf("krl_prt (%s)\n", disassemble());
352
353 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
354 if (!(lane & 3)) {
355 res_str += csprintf("DB%03d: ", (int)w->wfDynId);
356 }
357
358 if (mask[lane]) {
359 int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
360 int src_val2 = src1.get<int>(w, lane, 2);
361 src_val3 = src1.get<int>(w, lane, 3);
362
363 if (src_val2) {
364 res_str += csprintf("%016x", src_val1);
365 } else {
366 res_str += csprintf("%016d", src_val1);
367 }
368 } else {
369 res_str += csprintf("xxxxxxxxxxxxxxxx");
370 }
371
372 if ((lane & 3) == 3) {
373 res_str += csprintf("\n");
374 } else {
375 res_str += csprintf(" ");
376 }
377 }
378
379 res_str += "\n\n";
380 if (w->wfDynId == src_val3) {
381 DPRINTFN(res_str.c_str());
382 }
383 #endif
384 }
385
386 void
387 Call::MagicPrintWFFloat(Wavefront *w)
388 {
389 #if TRACING_ON

390 const VectorMask &mask = w->get_pred();

390 const VectorMask &mask = w->getPred();

391 std::string res_str;
392 res_str = csprintf("krl_prt (%s)\n", disassemble());
393
394 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
395 if (!(lane & 7)) {
396 res_str += csprintf("DB%03d: ", (int)w->wfDynId);
397 }
398
399 if (mask[lane]) {
400 float src_val1 = src1.get<float>(w, lane, 1);
401 res_str += csprintf("%08f", src_val1);
402 } else {
403 res_str += csprintf("xxxxxxxx");
404 }
405
406 if ((lane & 7) == 7) {
407 res_str += csprintf("\n");
408 } else {
409 res_str += csprintf(" ");
410 }
411 }
412
413 res_str += "\n\n";
414 DPRINTFN(res_str.c_str());
415 #endif
416 }
417
418 // raises a signal that GDB will catch
419 // when done with the break, type "signal 0" in gdb to continue
420 void
421 Call::MagicSimBreak(Wavefront *w)
422 {
423 std::string res_str;
424 // print out state for this wavefront and then break
425 res_str = csprintf("Breakpoint encountered for wavefront %i\n",
426 w->wfSlotId);
427

428 res_str += csprintf(" Kern ID: %i\n", w->kern_id);

428 res_str += csprintf(" Kern ID: %i\n", w->kernId);

429 res_str += csprintf(" Phase ID: %i\n", w->simdId);
430 res_str += csprintf(" Executing on CU #%i\n", w->computeUnit->cu_id);
431 res_str += csprintf(" Exec mask: ");
432
433 for (int i = w->computeUnit->wfSize() - 1; i >= 0; --i) {
434 if (w->execMask(i))
435 res_str += "1";
436 else
437 res_str += "0";
438
439 if ((i & 7) == 7)
440 res_str += " ";
441 }
442
443 res_str += csprintf("(0x%016llx)\n", w->execMask().to_ullong());
444
445 res_str += "\nHelpful debugging hints:\n";
446 res_str += " Check out w->s_reg / w->d_reg for register state\n";
447
448 res_str += "\n\n";
449 DPRINTFN(res_str.c_str());
450 fflush(stdout);
451
452 raise(SIGTRAP);
453 }
454
455 void
456 Call::MagicPrefixSum(Wavefront *w)
457 {

458 const VectorMask &mask = w->get_pred();

458 const VectorMask &mask = w->getPred();

459 int res = 0;
460
461 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
462 if (mask[lane]) {
463 int src_val1 = src1.get<int>(w, lane, 1);
464 dest.set<int>(w, lane, res);
465 res += src_val1;
466 }
467 }
468 }
469
470 void
471 Call::MagicReduction(Wavefront *w)
472 {
473 // reduction magic instruction
474 // The reduction instruction takes up to 64 inputs (one from
475 // each thread in a WF) and sums them. It returns the sum to
476 // each thread in the WF.

477 const VectorMask &mask = w->get_pred();

477 const VectorMask &mask = w->getPred();

478 int res = 0;
479
480 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
481 if (mask[lane]) {
482 int src_val1 = src1.get<int>(w, lane, 1);
483 res += src_val1;
484 }
485 }
486
487 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
488 if (mask[lane]) {
489 dest.set<int>(w, lane, res);
490 }
491 }
492 }
493
494 void
495 Call::MagicMaskLower(Wavefront *w)
496 {

497 const VectorMask &mask = w->get_pred();

497 const VectorMask &mask = w->getPred();

498 int res = 0;
499
500 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
501 if (mask[lane]) {
502 int src_val1 = src1.get<int>(w, lane, 1);
503
504 if (src_val1) {
505 if (lane < (w->computeUnit->wfSize()/2)) {
506 res = res | ((uint32_t)(1) << lane);
507 }
508 }
509 }
510 }
511
512 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
513 if (mask[lane]) {
514 dest.set<int>(w, lane, res);
515 }
516 }
517 }
518
519 void
520 Call::MagicMaskUpper(Wavefront *w)
521 {

522 const VectorMask &mask = w->get_pred();

522 const VectorMask &mask = w->getPred();

523 int res = 0;
524 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
525 if (mask[lane]) {
526 int src_val1 = src1.get<int>(w, lane, 1);
527
528 if (src_val1) {
529 if (lane >= (w->computeUnit->wfSize()/2)) {
530 res = res | ((uint32_t)(1) <<
531 (lane - (w->computeUnit->wfSize()/2)));
532 }
533 }
534 }
535 }
536
537 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
538 if (mask[lane]) {
539 dest.set<int>(w, lane, res);
540 }
541 }
542 }
543
544 void
545 Call::MagicJoinWFBar(Wavefront *w)
546 {

547 const VectorMask &mask = w->get_pred();

547 const VectorMask &mask = w->getPred();

548 int max_cnt = 0;
549
550 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
551 if (mask[lane]) {

552 w->bar_cnt[lane]++;

552 w->barCnt[lane]++;

553

554 if (w->bar_cnt[lane] > max_cnt) {
555 max_cnt = w->bar_cnt[lane];

554 if (w->barCnt[lane] > max_cnt) {
555 max_cnt = w->barCnt[lane];

556 }
557 }
558 }
559

560 if (max_cnt > w->max_bar_cnt) {
561 w->max_bar_cnt = max_cnt;

560 if (max_cnt > w->maxBarCnt) {
561 w->maxBarCnt = max_cnt;

562 }
563 }
564
565 void
566 Call::MagicWaitWFBar(Wavefront *w)
567 {

568 const VectorMask &mask = w->get_pred();

568 const VectorMask &mask = w->getPred();

569 int max_cnt = 0;
570
571 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
572 if (mask[lane]) {

573 w->bar_cnt[lane]--;

573 w->barCnt[lane]--;

574 }
575

576 if (w->bar_cnt[lane] > max_cnt) {
577 max_cnt = w->bar_cnt[lane];

576 if (w->barCnt[lane] > max_cnt) {
577 max_cnt = w->barCnt[lane];

578 }
579 }
580

581 if (max_cnt < w->max_bar_cnt) {
582 w->max_bar_cnt = max_cnt;

581 if (max_cnt < w->maxBarCnt) {
582 w->maxBarCnt = max_cnt;

583 }
584
585 w->instructionBuffer.erase(w->instructionBuffer.begin() + 1,
586 w->instructionBuffer.end());
587 if (w->pendingFetch)
588 w->dropFetch = true;
589 }
590
591 void
592 Call::MagicPanic(Wavefront *w)
593 {

594 const VectorMask &mask = w->get_pred();

594 const VectorMask &mask = w->getPred();

595
596 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
597 if (mask[lane]) {
598 int src_val1 = src1.get<int>(w, lane, 1);
599 panic("OpenCL Code failed assertion #%d. Triggered by lane %s",
600 src_val1, lane);
601 }
602 }
603 }
604
605 void
606 Call::calcAddr(Wavefront *w, GPUDynInstPtr m)
607 {
608 // the address is in src1 | src2
609 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
610 int src_val1 = src1.get<int>(w, lane, 1);
611 int src_val2 = src1.get<int>(w, lane, 2);
612 Addr addr = (((Addr) src_val1) << 32) | ((Addr) src_val2);
613
614 m->addr[lane] = addr;
615 }
616
617 }
618
619 void
620 Call::MagicAtomicNRAddGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
621 {
622 GPUDynInstPtr m = gpuDynInst;
623
624 calcAddr(w, m);
625
626 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
627 ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 3);
628 }
629
630 m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET,
631 Brig::BRIG_ATOMIC_ADD);
632 m->m_type = U32::memType;
633 m->v_type = U32::vgprType;
634
635 m->exec_mask = w->execMask();
636 m->statusBitVector = 0;
637 m->equiv = 0; // atomics don't have an equivalence class operand
638 m->n_reg = 1;
639 m->memoryOrder = Enums::MEMORY_ORDER_NONE;
640 m->scope = Enums::MEMORY_SCOPE_NONE;
641
642 m->simdId = w->simdId;
643 m->wfSlotId = w->wfSlotId;
644 m->wfDynId = w->wfDynId;
645 m->latency.init(&w->computeUnit->shader->tick_cnt);
646
647 m->s_type = SEG_GLOBAL;
648 m->pipeId = GLBMEM_PIPE;
649 m->latency.set(w->computeUnit->shader->ticks(64));
650 w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);

651 w->outstanding_reqs_wr_gm++;
652 w->wr_gm_reqs_in_pipe--;
653 w->outstanding_reqs_rd_gm++;
654 w->rd_gm_reqs_in_pipe--;
655 w->outstanding_reqs++;
656 w->mem_reqs_in_pipe--;

651 w->outstandingReqsWrGm++;
652 w->wrGmReqsInPipe--;
653 w->outstandingReqsRdGm++;
654 w->rdGmReqsInPipe--;
655 w->outstandingReqs++;
656 w->memReqsInPipe--;

657 }
658
659 void
660 Call::MagicAtomicNRAddGroupU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
661 {
662 GPUDynInstPtr m = gpuDynInst;
663 calcAddr(w, m);
664
665 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
666 ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 1);
667 }
668
669 m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET,
670 Brig::BRIG_ATOMIC_ADD);
671 m->m_type = U32::memType;
672 m->v_type = U32::vgprType;
673
674 m->exec_mask = w->execMask();
675 m->statusBitVector = 0;
676 m->equiv = 0; // atomics don't have an equivalence class operand
677 m->n_reg = 1;
678 m->memoryOrder = Enums::MEMORY_ORDER_NONE;
679 m->scope = Enums::MEMORY_SCOPE_NONE;
680
681 m->simdId = w->simdId;
682 m->wfSlotId = w->wfSlotId;
683 m->wfDynId = w->wfDynId;
684 m->latency.init(&w->computeUnit->shader->tick_cnt);
685
686 m->s_type = SEG_GLOBAL;
687 m->pipeId = GLBMEM_PIPE;
688 m->latency.set(w->computeUnit->shader->ticks(64));
689 w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);

690 w->outstanding_reqs_wr_gm++;
691 w->wr_gm_reqs_in_pipe--;
692 w->outstanding_reqs_rd_gm++;
693 w->rd_gm_reqs_in_pipe--;
694 w->outstanding_reqs++;
695 w->mem_reqs_in_pipe--;

690 w->outstandingReqsWrGm++;
691 w->wrGmReqsInPipe--;
692 w->outstandingReqsRdGm++;
693 w->rdGmReqsInPipe--;
694 w->outstandingReqs++;
695 w->memReqsInPipe--;

696 }
697
698 void
699 Call::MagicLoadGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
700 {
701 GPUDynInstPtr m = gpuDynInst;
702 // calculate the address
703 calcAddr(w, m);
704
705 m->m_op = Enums::MO_LD;
706 m->m_type = U32::memType; //MemDataType::memType;
707 m->v_type = U32::vgprType; //DestDataType::vgprType;
708
709 m->exec_mask = w->execMask();
710 m->statusBitVector = 0;
711 m->equiv = 0;
712 m->n_reg = 1;
713 m->memoryOrder = Enums::MEMORY_ORDER_NONE;
714 m->scope = Enums::MEMORY_SCOPE_NONE;
715
716 // FIXME
717 //m->dst_reg = this->dest.regIndex();
718
719 m->simdId = w->simdId;
720 m->wfSlotId = w->wfSlotId;
721 m->wfDynId = w->wfDynId;
722 m->latency.init(&w->computeUnit->shader->tick_cnt);
723
724 m->s_type = SEG_GLOBAL;
725 m->pipeId = GLBMEM_PIPE;
726 m->latency.set(w->computeUnit->shader->ticks(1));
727 w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);

728 w->outstanding_reqs_rd_gm++;
729 w->rd_gm_reqs_in_pipe--;
730 w->outstanding_reqs++;
731 w->mem_reqs_in_pipe--;

728 w->outstandingReqsRdGm++;
729 w->rdGmReqsInPipe--;
730 w->outstandingReqs++;
731 w->memReqsInPipe--;

732 }
733
734 void
735 Call::MagicXactCasLd(Wavefront *w)
736 {

737 const VectorMask &mask = w->get_pred();

737 const VectorMask &mask = w->getPred();

738 int src_val1 = 0;
739
740 for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
741 if (mask[lane]) {
742 src_val1 = src1.get<int>(w, lane, 1);
743 break;
744 }
745 }
746
747 if (!w->computeUnit->xactCasLoadMap.count(src_val1)) {
748 w->computeUnit->xactCasLoadMap[src_val1] = ComputeUnit::waveQueue();
749 w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue.clear();
750 }
751
752 w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue
753 .push_back(ComputeUnit::waveIdentifier(w->simdId, w->wfSlotId));
754 }
755
756 void
757 Call::MagicMostSigThread(Wavefront *w)
758 {

759 const VectorMask &mask = w->get_pred();

759 const VectorMask &mask = w->getPred();

760 unsigned mst = true;
761
762 for (int lane = w->computeUnit->wfSize() - 1; lane >= 0; --lane) {
763 if (mask[lane]) {
764 dest.set<int>(w, lane, mst);
765 mst = false;
766 }
767 }
768 }
769
770 void
771 Call::MagicMostSigBroadcast(Wavefront *w)
772 {

773 const VectorMask &mask = w->get_pred();

773 const VectorMask &mask = w->getPred();