neon64_mem.isa revision 12583:0c047fc2b3e0
1// -*- mode: c++ -*- 2 3// Copyright (c) 2012-2014 ARM Limited 4// All rights reserved 5// 6// The license below extends only to copyright in the software and shall 7// not be construed as granting a license to any other intellectual 8// property including but not limited to intellectual property relating 9// to a hardware implementation of the functionality of the software 10// licensed hereunder. You may use the software subject to the license 11// terms below provided that you ensure that this notice is replicated 12// unmodified and in its entirety in all distributions of the software, 13// modified or unmodified, in source code or in binary form. 14// 15// Redistribution and use in source and binary forms, with or without 16// modification, are permitted provided that the following conditions are 17// met: redistributions of source code must retain the above copyright 18// notice, this list of conditions and the following disclaimer; 19// redistributions in binary form must reproduce the above copyright 20// notice, this list of conditions and the following disclaimer in the 21// documentation and/or other materials provided with the distribution; 22// neither the name of the copyright holders nor the names of its 23// contributors may be used to endorse or promote products derived from 24// this software without specific prior written permission. 25// 26// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 27// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 28// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 29// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 30// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 31// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 32// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 33// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 34// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 35// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 36// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37// 38// Authors: Mbou Eyole 39// Giacomo Gabrielli 40 41let {{ 42 43 header_output = '' 44 decoder_output = '' 45 exec_output = '' 46 47 def mkMemAccMicroOp(name): 48 global header_output, decoder_output, exec_output 49 SPAlignmentCheckCodeNeon = ''' 50 if (baseIsSP && bits(XURa, 3, 0) && 51 SPAlignmentCheckEnabled(xc->tcBase())) { 52 return std::make_shared<SPAlignmentFault>(); 53 } 54 ''' 55 eaCode = SPAlignmentCheckCodeNeon + ''' 56 EA = XURa + imm; 57 ''' 58 memDecl = ''' 59 const int MaxNumBytes = 16; 60 union MemUnion { 61 uint8_t bytes[MaxNumBytes]; 62 uint32_t floatRegBits[MaxNumBytes / 4]; 63 }; 64 ''' 65 66 # Do endian conversion for all the elements 67 convCode = ''' 68 VReg x = {0, 0}; 69 70 x.lo = (((XReg) memUnion.floatRegBits[1]) << 32) | 71 (XReg) memUnion.floatRegBits[0]; 72 x.hi = (((XReg) memUnion.floatRegBits[3]) << 32) | 73 (XReg) memUnion.floatRegBits[2]; 74 75 const unsigned eCount = 16 / (1 << eSize); 76 77 if (isBigEndian64(xc->tcBase())) { 78 for (unsigned i = 0; i < eCount; i++) { 79 switch (eSize) { 80 case 0x3: // 64-bit 81 writeVecElem(&x, (XReg) gtobe( 82 (uint64_t) readVecElem(x, i, eSize)), i, eSize); 83 break; 84 case 0x2: // 32-bit 85 writeVecElem(&x, (XReg) gtobe( 86 (uint32_t) readVecElem(x, i, eSize)), i, eSize); 87 break; 88 case 0x1: // 16-bit 89 writeVecElem(&x, (XReg) gtobe( 90 (uint16_t) readVecElem(x, i, eSize)), i, eSize); 91 break; 92 default: // 8-bit 93 break; // Nothing to do here 94 } 95 } 96 } else { 97 for (unsigned i = 0; i < eCount; i++) { 98 switch (eSize) { 99 case 0x3: // 64-bit 100 writeVecElem(&x, (XReg) gtole( 101 (uint64_t) readVecElem(x, i, eSize)), i, eSize); 102 break; 103 case 0x2: // 32-bit 104 writeVecElem(&x, (XReg) gtole( 105 (uint32_t) readVecElem(x, i, eSize)), i, eSize); 106 break; 107 case 0x1: // 16-bit 108 writeVecElem(&x, (XReg) gtole( 109 (uint16_t) readVecElem(x, i, eSize)), i, eSize); 110 break; 111 default: // 8-bit 112 break; // Nothing to do here 113 } 114 } 115 } 116 117 memUnion.floatRegBits[0] = (uint32_t) x.lo; 118 memUnion.floatRegBits[1] = (uint32_t) (x.lo >> 32); 119 memUnion.floatRegBits[2] = (uint32_t) x.hi; 120 memUnion.floatRegBits[3] = (uint32_t) (x.hi >> 32); 121 ''' 122 123 # Offload everything into registers 124 regSetCode = '' 125 for reg in range(4): 126 regSetCode += ''' 127 AA64FpDestP%(reg)d_uw = gtoh(memUnion.floatRegBits[%(reg)d]); 128 ''' % { 'reg' : reg } 129 130 # Pull everything in from registers 131 regGetCode = '' 132 for reg in range(4): 133 regGetCode += ''' 134 memUnion.floatRegBits[%(reg)d] = htog(AA64FpDestP%(reg)d_uw); 135 ''' % { 'reg' : reg } 136 137 loadMemAccCode = convCode + regSetCode 138 storeMemAccCode = regGetCode + convCode 139 140 loadIop = InstObjParams(name + 'ld', 141 'MicroNeonLoad64', 142 'MicroNeonMemOp', 143 { 'mem_decl' : memDecl, 144 'memacc_code' : loadMemAccCode, 145 'ea_code' : simd64EnabledCheckCode + eaCode, 146 }, 147 [ 'IsMicroop', 'IsMemRef', 'IsLoad' ]) 148 storeIop = InstObjParams(name + 'st', 149 'MicroNeonStore64', 150 'MicroNeonMemOp', 151 { 'mem_decl' : memDecl, 152 'memacc_code' : storeMemAccCode, 153 'ea_code' : simd64EnabledCheckCode + eaCode, 154 }, 155 [ 'IsMicroop', 'IsMemRef', 'IsStore' ]) 156 157 exec_output += NeonLoadExecute64.subst(loadIop) + \ 158 NeonLoadInitiateAcc64.subst(loadIop) + \ 159 NeonLoadCompleteAcc64.subst(loadIop) + \ 160 NeonStoreExecute64.subst(storeIop) + \ 161 NeonStoreInitiateAcc64.subst(storeIop) + \ 162 NeonStoreCompleteAcc64.subst(storeIop) 163 header_output += MicroNeonMemDeclare64.subst(loadIop) + \ 164 MicroNeonMemDeclare64.subst(storeIop) 165 166 def mkMarshalMicroOp(name, Name, numRegs=4): 167 global header_output, decoder_output, exec_output 168 169 getInputCodeOp1L = '' 170 for v in range(numRegs): 171 for p in range(4): 172 getInputCodeOp1L += ''' 173 writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)d_uw, 174 %(p)d, 0x2); 175 ''' % { 'v' : v, 'p' : p } 176 177 getInputCodeOp1S = '' 178 for v in range(numRegs): 179 for p in range(4): 180 getInputCodeOp1S += ''' 181 writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)dS_uw, 182 %(p)d, 0x2); 183 ''' % { 'v' : v, 'p' : p } 184 185 if name == 'deint_neon_uop': 186 187 eCode = ''' 188 // input data from scratch area 189 VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} }; 190 VReg output[2]; // output data to arch. SIMD regs 191 VReg temp; 192 temp.lo = 0; 193 temp.hi = 0; 194 ''' 195 for p in range(4): 196 eCode += ''' 197 writeVecElem(&temp, (XReg) AA64FpDestP%(p)dV1L_uw, %(p)d, 0x2); 198 ''' % { 'p' : p } 199 eCode += getInputCodeOp1L 200 201 # Note that numRegs is not always the same as numStructElems; in 202 # particular, for LD1/ST1, numStructElems is 1 but numRegs can be 203 # 1, 2, 3 or 4 204 205 eCode += ''' 206 output[0].lo = 0; 207 output[0].hi = 0; 208 output[1].lo = 0; 209 output[1].hi = 0; 210 211 int eCount = dataSize / (8 << eSize); 212 int eSizeBytes = 1 << eSize; // element size in bytes 213 int numBytes = step * dataSize / 4; 214 int totNumBytes = numRegs * dataSize / 8; 215 216 int structElemNo, pos, a, b; 217 XReg data; 218 219 for (int r = 0; r < 2; ++r) { 220 for (int i = 0; i < eCount; ++i) { 221 if (numBytes < totNumBytes) { 222 structElemNo = r + (step * 2); 223 if (numStructElems == 1) { 224 pos = (eSizeBytes * i) + 225 (eCount * structElemNo * eSizeBytes); 226 } else { 227 pos = (numStructElems * eSizeBytes * i) + 228 (structElemNo * eSizeBytes); 229 } 230 a = pos / 16; 231 b = (pos % 16) / eSizeBytes; 232 data = (XReg) readVecElem(input[a], (XReg) b, 233 eSize); 234 writeVecElem(&output[r], data, i, eSize); 235 numBytes += eSizeBytes; 236 } 237 } 238 } 239 ''' 240 for p in range(4): 241 eCode += ''' 242 AA64FpDestP%(p)dV0L_uw = (uint32_t) readVecElem(output[0], 243 %(p)d, 0x2); 244 ''' % { 'p' : p } 245 eCode += ''' 246 if ((numRegs % 2 == 0) || (numRegs == 3 && step == 0)) { 247 ''' 248 for p in range(4): 249 eCode += ''' 250 AA64FpDestP%(p)dV1L_uw = (uint32_t) readVecElem( 251 output[1], %(p)d, 0x2); 252 ''' % { 'p' : p } 253 eCode += ''' 254 } else { 255 ''' 256 for p in range(4): 257 eCode += ''' 258 AA64FpDestP%(p)dV1L_uw = (uint32_t) readVecElem(temp, 259 %(p)d, 0x2); 260 ''' % { 'p' : p } 261 eCode += ''' 262 } 263 ''' 264 265 iop = InstObjParams(name, Name, 'MicroNeonMixOp64', 266 { 'code' : eCode, 'op_class' : 'No_OpClass' }, 267 ['IsMicroop']) 268 header_output += MicroNeonMixDeclare64.subst(iop) 269 exec_output += MicroNeonMixExecute64.subst(iop) 270 271 elif name == 'int_neon_uop': 272 273 eCode = ''' 274 // input data from arch. SIMD regs 275 VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} }; 276 VReg output[2]; // output data to scratch area 277 ''' 278 279 eCode += getInputCodeOp1S 280 281 # Note that numRegs is not always the same as numStructElems; in 282 # particular, for LD1/ST1, numStructElems is 1 but numRegs can be 283 # 1, 2, 3 or 4 284 285 eCode += ''' 286 int eCount = dataSize / (8 << eSize); 287 int eSizeBytes = 1 << eSize; 288 int totNumBytes = numRegs * dataSize / 8; 289 int numOutputElems = 128 / (8 << eSize); 290 int stepOffset = step * 32; 291 292 for (int i = 0; i < 2; ++i) { 293 output[i].lo = 0; 294 output[i].hi = 0; 295 } 296 297 int r = 0, k = 0, i, j; 298 XReg data; 299 300 for (int pos = stepOffset; pos < 32 + stepOffset; 301 pos += eSizeBytes) { 302 if (pos < totNumBytes) { 303 if (numStructElems == 1) { 304 i = (pos / eSizeBytes) % eCount; 305 j = pos / (eCount * eSizeBytes); 306 } else { 307 i = pos / (numStructElems * eSizeBytes); 308 j = (pos % (numStructElems * eSizeBytes)) / 309 eSizeBytes; 310 } 311 data = (XReg) readVecElem(input[j], (XReg) i, eSize); 312 writeVecElem(&output[r], data, k, eSize); 313 k++; 314 if (k == numOutputElems){ 315 k = 0; 316 ++r; 317 } 318 } 319 } 320 ''' 321 for v in range(2): 322 for p in range(4): 323 eCode += ''' 324 AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem( 325 output[%(v)d], %(p)d, 0x2); 326 ''' % { 'v': v, 'p': p} 327 328 iop = InstObjParams(name, Name, 'MicroNeonMixOp64', 329 { 'code' : eCode, 'op_class' : 'No_OpClass' }, 330 ['IsMicroop']) 331 header_output += MicroNeonMixDeclare64.subst(iop) 332 exec_output += MicroNeonMixExecute64.subst(iop) 333 334 elif name == 'unpack_neon_uop': 335 336 eCode = ''' 337 //input data from scratch area 338 VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} }; 339 //output data to arch. SIMD regs 340 VReg output[2] = { {0, 0}, {0, 0} }; 341 ''' 342 343 eCode += getInputCodeOp1L 344 345 # Fill output regs with register data initially. Note that 346 # elements in output register outside indexed lanes are left 347 # untouched 348 for v in range(2): 349 for p in range(4): 350 eCode += ''' 351 writeVecElem(&output[%(v)d], (XReg) AA64FpDestP%(p)dV%(v)dL_uw, 352 %(p)d, 0x2); 353 ''' % { 'v': v, 'p': p} 354 eCode += ''' 355 int eCount = dataSize / (8 << eSize); 356 int eCount128 = 128 / (8 << eSize); 357 int eSizeBytes = 1 << eSize; 358 int totNumBytes = numStructElems * eSizeBytes; 359 int numInputElems = eCount128; 360 int stepOffset = step * 2 * eSizeBytes; 361 int stepLimit = 2 * eSizeBytes; 362 363 int r = 0, i, j; 364 XReg data; 365 366 for (int pos = stepOffset; pos < stepLimit + stepOffset; 367 pos += eSizeBytes) { 368 if (pos < totNumBytes) { 369 r = pos / eSizeBytes; 370 j = r / numInputElems; 371 i = r % numInputElems; 372 data = (XReg) readVecElem(input[j], (XReg) i, eSize); 373 374 if (replicate) { 375 for (int i = 0; i < eCount128; ++i) { 376 if (i < eCount) { 377 writeVecElem(&output[r % 2], data, i, 378 eSize); 379 } else { // zero extend if necessary 380 writeVecElem(&output[r % 2], (XReg) 0, i, 381 eSize); 382 } 383 } 384 } else { 385 writeVecElem(&output[r % 2], data, lane, eSize); 386 } 387 } 388 } 389 ''' 390 for v in range(2): 391 for p in range(4): 392 eCode += ''' 393 AA64FpDestP%(p)dV%(v)dL_uw = (uint32_t) readVecElem( 394 output[%(v)d], %(p)d, 0x2); 395 ''' % { 'v' : v, 'p' : p } 396 397 iop = InstObjParams(name, Name, 'MicroNeonMixLaneOp64', 398 { 'code' : eCode }, ['IsMicroop']) 399 header_output += MicroNeonMixLaneDeclare64.subst(iop) 400 exec_output += MicroNeonMixExecute64.subst(iop) 401 402 elif name == 'pack_neon_uop': 403 404 eCode = ''' 405 // input data from arch. SIMD regs 406 VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} }; 407 VReg output[2]; // output data to scratch area 408 ''' 409 410 eCode += getInputCodeOp1S 411 412 eCode += ''' 413 int eSizeBytes = 1 << eSize; 414 int numOutputElems = 128 / (8 << eSize); 415 int totNumBytes = numStructElems * eSizeBytes; 416 int stepOffset = step * 32; 417 int stepLimit = 32; 418 419 int r = 0, i, j; 420 XReg data; 421 422 for (int i = 0; i < 2; ++i) { 423 output[i].lo = 0; 424 output[i].hi = 0; 425 } 426 427 for (int pos = stepOffset; pos < stepLimit + stepOffset; 428 pos += eSizeBytes) { 429 if (pos < totNumBytes) { 430 r = pos / 16; 431 j = pos / eSizeBytes; 432 i = (pos / eSizeBytes) % numOutputElems; 433 data = (XReg) readVecElem(input[j], lane, eSize); 434 writeVecElem(&output[r % 2], data, i, eSize); 435 } 436 } 437 ''' 438 439 for v in range(2): 440 for p in range(4): 441 eCode += ''' 442 AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem( 443 output[%(v)d], %(p)d, 0x2); 444 ''' % { 'v' : v, 'p' : p } 445 446 iop = InstObjParams(name, Name, 'MicroNeonMixLaneOp64', 447 { 'code' : eCode }, ['IsMicroop']) 448 header_output += MicroNeonMixLaneDeclare64.subst(iop) 449 exec_output += MicroNeonMixExecute64.subst(iop) 450 451 # Generate instructions 452 mkMemAccMicroOp('mem_neon_uop') 453 mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_1Reg', numRegs=1) 454 mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_2Reg', numRegs=2) 455 mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_3Reg', numRegs=3) 456 mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_4Reg', numRegs=4) 457 mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_1Reg', numRegs=1) 458 mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_2Reg', numRegs=2) 459 mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_3Reg', numRegs=3) 460 mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_4Reg', numRegs=4) 461 mkMarshalMicroOp('unpack_neon_uop', 'MicroUnpackNeon64') 462 mkMarshalMicroOp('pack_neon_uop', 'MicroPackNeon64') 463 464}}; 465 466let {{ 467 468 iop = InstObjParams('vldmult64', 'VldMult64', 'VldMultOp64', '', []) 469 header_output += VMemMultDeclare64.subst(iop) 470 decoder_output += VMemMultConstructor64.subst(iop) 471 472 iop = InstObjParams('vstmult64', 'VstMult64', 'VstMultOp64', '', []) 473 header_output += VMemMultDeclare64.subst(iop) 474 decoder_output += VMemMultConstructor64.subst(iop) 475 476 iop = InstObjParams('vldsingle64', 'VldSingle64', 'VldSingleOp64', '', []) 477 header_output += VMemSingleDeclare64.subst(iop) 478 decoder_output += VMemSingleConstructor64.subst(iop) 479 480 iop = InstObjParams('vstsingle64', 'VstSingle64', 'VstSingleOp64', '', []) 481 header_output += VMemSingleDeclare64.subst(iop) 482 decoder_output += VMemSingleConstructor64.subst(iop) 483 484}}; 485