// -*- mode: c++ -*-

// Copyright (c) 2012-2014 ARM Limited
// All rights reserved
//
// The license below extends only to copyright in the software and shall
// not be construed as granting a license to any other intellectual
// property including but not limited to intellectual property relating
// to a hardware implementation of the functionality of the software
// licensed hereunder. You may use the software subject to the license
// terms below provided that you ensure that this notice is replicated
// unmodified and in its entirety in all distributions of the software,
// modified or unmodified, in source code or in binary form.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met: redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer;
// redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution;
// neither the name of the copyright holders nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Authors: Mbou Eyole
//          Giacomo Gabrielli

let {{

    # Accumulators for the generated C++ text; the generator functions
    # below append to header_output and exec_output.
    header_output = ''
    decoder_output = ''
    exec_output = ''

    # C++ snippet with one %s placeholder for the destination operand;
    # it is appended to the load micro-op's memacc code below.
    zeroSveVecRegUpperPartCode = '''
        ArmISA::ISA::zeroSveVecRegUpperPart(%s,
            ArmStaticInst::getCurSveVecLen<uint64_t>(xc->tcBase()));
    '''

    # Generate the pair of Neon memory access micro-ops for `name`:
    #   - a load,  mnemonic name + 'ld', class MicroNeonLoad64
    #   - a store, mnemonic name + 'st', class MicroNeonStore64
    # both derived from MicroNeonMemOp.  The execute / initiateAcc /
    # completeAcc template expansions are appended to exec_output and the
    # class declarations to header_output.
    #
    # Relies on simd64EnabledCheckCode and on the NeonLoad*64 / NeonStore*64
    # / MicroNeonMemDeclare64 templates, all defined outside this file.
    def mkMemAccMicroOp(name):
        global header_output, decoder_output, exec_output
        SPAlignmentCheckCodeNeon = '''
            if (baseIsSP && bits(XURa, 3, 0) &&
                SPAlignmentCheckEnabled(xc->tcBase())) {
                return std::make_shared<SPAlignmentFault>();
            }
        '''
        eaCode = SPAlignmentCheckCodeNeon + '''
            EA = XURa + imm;
        '''
        memDecl = '''
            const int MaxNumBytes = 16;
            union MemUnion {
                uint8_t bytes[MaxNumBytes];
                uint32_t floatRegBits[MaxNumBytes / 4];
            };
        '''

        # Do endian conversion for all the elements
        convCode = '''
            VReg x = {0, 0};

            x.lo = (((XReg) memUnion.floatRegBits[1]) << 32) |
                (XReg) memUnion.floatRegBits[0];
            x.hi = (((XReg) memUnion.floatRegBits[3]) << 32) |
                (XReg) memUnion.floatRegBits[2];

            const unsigned eCount = 16 / (1 << eSize);

            if (isBigEndian64(xc->tcBase())) {
                for (unsigned i = 0; i < eCount; i++) {
                    switch (eSize) {
                      case 0x3: // 64-bit
                        writeVecElem(&x, (XReg) gtobe(
                            (uint64_t) readVecElem(x, i, eSize)), i, eSize);
                        break;
                      case 0x2: // 32-bit
                        writeVecElem(&x, (XReg) gtobe(
                            (uint32_t) readVecElem(x, i, eSize)), i, eSize);
                        break;
                      case 0x1: // 16-bit
                        writeVecElem(&x, (XReg) gtobe(
                            (uint16_t) readVecElem(x, i, eSize)), i, eSize);
                        break;
                      default: // 8-bit
                        break; // Nothing to do here
                    }
                }
            } else {
                for (unsigned i = 0; i < eCount; i++) {
                    switch (eSize) {
                      case 0x3: // 64-bit
                        writeVecElem(&x, (XReg) gtole(
                            (uint64_t) readVecElem(x, i, eSize)), i, eSize);
                        break;
                      case 0x2: // 32-bit
                        writeVecElem(&x, (XReg) gtole(
                            (uint32_t) readVecElem(x, i, eSize)), i, eSize);
                        break;
                      case 0x1: // 16-bit
                        writeVecElem(&x, (XReg) gtole(
                            (uint16_t) readVecElem(x, i, eSize)), i, eSize);
                        break;
                      default: // 8-bit
                        break; // Nothing to do here
                    }
                }
            }

            memUnion.floatRegBits[0] = (uint32_t) x.lo;
            memUnion.floatRegBits[1] = (uint32_t) (x.lo >> 32);
            memUnion.floatRegBits[2] = (uint32_t) x.hi;
            memUnion.floatRegBits[3] = (uint32_t) (x.hi >> 32);
        '''

        # Offload everything into registers
        regSetCode = ''
        for reg in range(4):
            regSetCode += '''
            AA64FpDestP%(reg)d_uw = gtoh(memUnion.floatRegBits[%(reg)d]);
            ''' % { 'reg' : reg }

        # Pull everything in from registers
        regGetCode = ''
        for reg in range(4):
            regGetCode += '''
            memUnion.floatRegBits[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
            ''' % { 'reg' : reg }

        # Loads convert the memory image first and then write the registers;
        # stores read the registers first and then convert.
        loadMemAccCode = convCode + regSetCode
        storeMemAccCode = regGetCode + convCode

        loadIop = InstObjParams(name + 'ld',
                'MicroNeonLoad64',
                'MicroNeonMemOp',
                { 'mem_decl' : memDecl,
                  'memacc_code' : loadMemAccCode,
                  'ea_code' : simd64EnabledCheckCode + eaCode,
                },
                [ 'IsMicroop', 'IsMemRef', 'IsLoad' ])
        # Only the load gets the SVE upper-part zeroing snippet appended.
        loadIop.snippets["memacc_code"] += zeroSveVecRegUpperPartCode % \
            "AA64FpDest"
        storeIop = InstObjParams(name + 'st',
                'MicroNeonStore64',
                'MicroNeonMemOp',
                { 'mem_decl' : memDecl,
                  'memacc_code' : storeMemAccCode,
                  'ea_code' : simd64EnabledCheckCode + eaCode,
                },
                [ 'IsMicroop', 'IsMemRef', 'IsStore' ])

        exec_output += NeonLoadExecute64.subst(loadIop) + \
            NeonLoadInitiateAcc64.subst(loadIop) + \
            NeonLoadCompleteAcc64.subst(loadIop) + \
            NeonStoreExecute64.subst(storeIop) + \
            NeonStoreInitiateAcc64.subst(storeIop) + \
            NeonStoreCompleteAcc64.subst(storeIop)
        header_output += MicroNeonMemDeclare64.subst(loadIop) + \
            MicroNeonMemDeclare64.subst(storeIop)

    # Generate one data-marshalling micro-op and append its declaration to
    # header_output and its execute method to exec_output.
    #
    #   name    -- selects the code to generate; one of 'deint_neon_uop',
    #              'int_neon_uop', 'unpack_neon_uop', 'pack_neon_uop'
    #   Name    -- class name passed to InstObjParams
    #   numRegs -- number of input vector registers the generated code reads
    #              into its local input[] array (at most 4, the array size)
    #
    # Raises ValueError for any other `name`, so that a misspelt micro-op
    # name is reported here rather than silently generating nothing.
    def mkMarshalMicroOp(name, Name, numRegs=4):
        global header_output, decoder_output, exec_output

        # Reads the AA64FpOp1P<p>V<v> operands into input[]; used by the
        # deint and unpack micro-ops.
        getInputCodeOp1L = ''
        for v in range(numRegs):
            for p in range(4):
                getInputCodeOp1L += '''
            writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)d_uw,
                         %(p)d, 0x2);
            ''' % { 'v' : v, 'p' : p }

        # Same, but for the AA64FpOp1P<p>V<v>S operands; used by the int and
        # pack micro-ops.
        getInputCodeOp1S = ''
        for v in range(numRegs):
            for p in range(4):
                getInputCodeOp1S += '''
            writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)dS_uw,
                         %(p)d, 0x2);
            ''' % { 'v' : v, 'p' : p }

        if name == 'deint_neon_uop':

            eCode = '''
                // input data from scratch area
                VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
                VReg output[2]; // output data to arch. SIMD regs
                VReg temp;
                temp.lo = 0;
                temp.hi = 0;
            '''
            # Save the current contents of the second destination register
            # in temp; they are written back unchanged further down when
            # this step does not produce a second output register.
            for p in range(4):
                eCode += '''
                writeVecElem(&temp, (XReg) AA64FpDestP%(p)dV1L_uw, %(p)d, 0x2);
                ''' % { 'p' : p }
            eCode += getInputCodeOp1L

            # Note that numRegs is not always the same as numStructElems; in
            # particular, for LD1/ST1, numStructElems is 1 but numRegs can be
            # 1, 2, 3 or 4

            eCode += '''
                output[0].lo = 0;
                output[0].hi = 0;
                output[1].lo = 0;
                output[1].hi = 0;

                int eCount = dataSize / (8 << eSize);
                int eSizeBytes = 1 << eSize; // element size in bytes
                int numBytes = step * dataSize / 4;
                int totNumBytes = numRegs * dataSize / 8;

                int structElemNo, pos, a, b;
                XReg data;

                for (int r = 0; r < 2; ++r) {
                    for (int i = 0; i < eCount; ++i) {
                        if (numBytes < totNumBytes) {
                            structElemNo = r + (step * 2);
                            if (numStructElems == 1) {
                                pos = (eSizeBytes * i) +
                                    (eCount * structElemNo * eSizeBytes);
                            } else {
                                pos = (numStructElems * eSizeBytes * i) +
                                    (structElemNo * eSizeBytes);
                            }
                            a = pos / 16;
                            b = (pos % 16) / eSizeBytes;
                            data = (XReg) readVecElem(input[a], (XReg) b,
                                                      eSize);
                            writeVecElem(&output[r], data, i, eSize);
                            numBytes += eSizeBytes;
                        }
                    }
                }
            '''
            for p in range(4):
                eCode += '''
                AA64FpDestP%(p)dV0L_uw = (uint32_t) readVecElem(output[0],
                    %(p)d, 0x2);
                ''' % { 'p' : p }
            eCode += '''
                if ((numRegs % 2 == 0) || (numRegs == 3 && step == 0)) {
            '''
            for p in range(4):
                eCode += '''
                    AA64FpDestP%(p)dV1L_uw = (uint32_t) readVecElem(
                        output[1], %(p)d, 0x2);
                ''' % { 'p' : p }
            eCode += '''
                } else {
            '''
            for p in range(4):
                eCode += '''
                    AA64FpDestP%(p)dV1L_uw = (uint32_t) readVecElem(temp,
                        %(p)d, 0x2);
                ''' % { 'p' : p }
            eCode += '''
                }
            '''

            iop = InstObjParams(name, Name, 'MicroNeonMixOp64',
                                { 'code' : eCode, 'op_class' : 'No_OpClass' },
                                ['IsMicroop'])
            header_output += MicroNeonMixDeclare64.subst(iop)
            exec_output += MicroNeonMixExecute64.subst(iop)

        elif name == 'int_neon_uop':

            eCode = '''
                // input data from arch. SIMD regs
                VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
                VReg output[2]; // output data to scratch area
            '''

            eCode += getInputCodeOp1S

            # Note that numRegs is not always the same as numStructElems; in
            # particular, for LD1/ST1, numStructElems is 1 but numRegs can be
            # 1, 2, 3 or 4

            eCode += '''
                int eCount = dataSize / (8 << eSize);
                int eSizeBytes = 1 << eSize;
                int totNumBytes = numRegs * dataSize / 8;
                int numOutputElems = 128 / (8 << eSize);
                int stepOffset = step * 32;

                for (int i = 0; i < 2; ++i) {
                    output[i].lo = 0;
                    output[i].hi = 0;
                }

                int r = 0, k = 0, i, j;
                XReg data;

                for (int pos = stepOffset; pos < 32 + stepOffset;
                        pos += eSizeBytes) {
                    if (pos < totNumBytes) {
                        if (numStructElems == 1) {
                            i = (pos / eSizeBytes) % eCount;
                            j = pos / (eCount * eSizeBytes);
                        } else {
                            i = pos / (numStructElems * eSizeBytes);
                            j = (pos % (numStructElems * eSizeBytes)) /
                                eSizeBytes;
                        }
                        data = (XReg) readVecElem(input[j], (XReg) i, eSize);
                        writeVecElem(&output[r], data, k, eSize);
                        k++;
                        if (k == numOutputElems){
                            k = 0;
                            ++r;
                        }
                    }
                }
            '''
            for v in range(2):
                for p in range(4):
                    eCode += '''
                AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem(
                    output[%(v)d], %(p)d, 0x2);
                ''' % { 'v': v, 'p': p}

            iop = InstObjParams(name, Name, 'MicroNeonMixOp64',
                                { 'code' : eCode, 'op_class' : 'No_OpClass' },
                                ['IsMicroop'])
            header_output += MicroNeonMixDeclare64.subst(iop)
            exec_output += MicroNeonMixExecute64.subst(iop)

        elif name == 'unpack_neon_uop':

            eCode = '''
                //input data from scratch area
                VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
                //output data to arch. SIMD regs
                VReg output[2] = { {0, 0}, {0, 0} };
            '''

            eCode += getInputCodeOp1L

            # Fill output regs with register data initially.  Note that
            # elements in output register outside indexed lanes are left
            # untouched
            for v in range(2):
                for p in range(4):
                    eCode += '''
                writeVecElem(&output[%(v)d], (XReg) AA64FpDestP%(p)dV%(v)dL_uw,
                             %(p)d, 0x2);
                ''' % { 'v': v, 'p': p}
            eCode += '''
                int eCount = dataSize / (8 << eSize);
                int eCount128 = 128 / (8 << eSize);
                int eSizeBytes = 1 << eSize;
                int totNumBytes = numStructElems * eSizeBytes;
                int numInputElems = eCount128;
                int stepOffset = step * 2 * eSizeBytes;
                int stepLimit = 2 * eSizeBytes;

                int r = 0, i, j;
                XReg data;

                for (int pos = stepOffset; pos < stepLimit + stepOffset;
                        pos += eSizeBytes) {
                    if (pos < totNumBytes) {
                        r = pos / eSizeBytes;
                        j = r / numInputElems;
                        i = r % numInputElems;
                        data = (XReg) readVecElem(input[j], (XReg) i, eSize);

                        if (replicate) {
                            for (int i = 0; i < eCount128; ++i) {
                                if (i < eCount) {
                                    writeVecElem(&output[r % 2], data, i,
                                                 eSize);
                                } else { // zero extend if necessary
                                    writeVecElem(&output[r % 2], (XReg) 0, i,
                                                 eSize);
                                }
                            }
                        } else {
                            writeVecElem(&output[r % 2], data, lane, eSize);
                        }
                    }
                }
            '''
            for v in range(2):
                for p in range(4):
                    eCode += '''
                AA64FpDestP%(p)dV%(v)dL_uw = (uint32_t) readVecElem(
                    output[%(v)d], %(p)d, 0x2);
                ''' % { 'v' : v, 'p' : p }

            iop = InstObjParams(name, Name, 'MicroNeonMixLaneOp64',
                                { 'code' : eCode }, ['IsMicroop'])
            header_output += MicroNeonMixLaneDeclare64.subst(iop)
            exec_output += MicroNeonMixExecute64.subst(iop)

        elif name == 'pack_neon_uop':

            eCode = '''
                // input data from arch. SIMD regs
                VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
                VReg output[2]; // output data to scratch area
            '''

            eCode += getInputCodeOp1S

            eCode += '''
                int eSizeBytes = 1 << eSize;
                int numOutputElems = 128 / (8 << eSize);
                int totNumBytes = numStructElems * eSizeBytes;
                int stepOffset = step * 32;
                int stepLimit = 32;

                int r = 0, i, j;
                XReg data;

                for (int i = 0; i < 2; ++i) {
                    output[i].lo = 0;
                    output[i].hi = 0;
                }

                for (int pos = stepOffset; pos < stepLimit + stepOffset;
                        pos += eSizeBytes) {
                    if (pos < totNumBytes) {
                        r = pos / 16;
                        j = pos / eSizeBytes;
                        i = (pos / eSizeBytes) % numOutputElems;
                        data = (XReg) readVecElem(input[j], lane, eSize);
                        writeVecElem(&output[r % 2], data, i, eSize);
                    }
                }
            '''

            for v in range(2):
                for p in range(4):
                    eCode += '''
                AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem(
                    output[%(v)d], %(p)d, 0x2);
                ''' % { 'v' : v, 'p' : p }

            iop = InstObjParams(name, Name, 'MicroNeonMixLaneOp64',
                                { 'code' : eCode }, ['IsMicroop'])
            header_output += MicroNeonMixLaneDeclare64.subst(iop)
            exec_output += MicroNeonMixExecute64.subst(iop)

        else:
            # An unrecognised name used to fall through and generate
            # nothing; report it at generation time instead.
            raise ValueError(
                "mkMarshalMicroOp: unknown micro-op name '%s'" % name)

    # Generate instructions
    mkMemAccMicroOp('mem_neon_uop')
    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_1Reg', numRegs=1)
    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_2Reg', numRegs=2)
    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_3Reg', numRegs=3)
    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_4Reg', numRegs=4)
    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_1Reg', numRegs=1)
    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_2Reg', numRegs=2)
    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_3Reg', numRegs=3)
    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_4Reg', numRegs=4)
    mkMarshalMicroOp('unpack_neon_uop', 'MicroUnpackNeon64')
    mkMarshalMicroOp('pack_neon_uop', 'MicroPackNeon64')

}};

let {{

    # Declarations and constructors for the four vector load/store
    # instruction classes.  Each InstObjParams is created with an empty
    # code string and no flags.

    iop = InstObjParams('vldmult64', 'VldMult64', 'VldMultOp64', '', [])
    header_output += VMemMultDeclare64.subst(iop)
    decoder_output += VMemMultConstructor64.subst(iop)

    iop = InstObjParams('vstmult64', 'VstMult64', 'VstMultOp64', '', [])
    header_output += VMemMultDeclare64.subst(iop)
    decoder_output += VMemMultConstructor64.subst(iop)

    iop = InstObjParams('vldsingle64', 'VldSingle64', 'VldSingleOp64', '', [])
    header_output += VMemSingleDeclare64.subst(iop)
    decoder_output += VMemSingleConstructor64.subst(iop)

    iop = InstObjParams('vstsingle64', 'VstSingle64', 'VstSingleOp64', '', [])
    header_output += VMemSingleDeclare64.subst(iop)
    decoder_output += VMemSingleConstructor64.subst(iop)

}};