neon64_mem.isa revision 10474:799c8ee4ecba
// -*- mode: c++ -*-

// Copyright (c) 2012-2014 ARM Limited
// All rights reserved
//
// The license below extends only to copyright in the software and shall
// not be construed as granting a license to any other intellectual
// property including but not limited to intellectual property relating
// to a hardware implementation of the functionality of the software
// licensed hereunder.  You may use the software subject to the license
// terms below provided that you ensure that this notice is replicated
// unmodified and in its entirety in all distributions of the software,
// modified or unmodified, in source code or in binary form.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met: redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer;
// redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution;
// neither the name of the copyright holders nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Authors: Mbou Eyole
//          Giacomo Gabrielli

let {{

    header_output = ''
    decoder_output = ''
    exec_output = ''

    # Generate the pair of memory-access micro-ops '<name>ld' and
    # '<name>st' (classes MicroNeonLoad64 / MicroNeonStore64, both derived
    # from MicroNeonMemOp) and append their declarations and execute /
    # initiateAcc / completeAcc bodies to header_output and exec_output.
    #
    #   name -- mnemonic prefix; 'ld' and 'st' are appended to it.
    #
    # Both micro-ops share the same effective-address code, the same
    # 16-byte MemUnion declaration and the same per-element endianness
    # conversion; they only differ in the order in which the conversion
    # and the register transfer are performed.
    def mkMemAccMicroOp(name):
        global header_output, decoder_output, exec_output
        # Fault if the base register is SP, any of its low four bits is
        # set and SP alignment checking is enabled.
        SPAlignmentCheckCodeNeon = '''
            if (baseIsSP && bits(XURa, 3, 0) &&
                SPAlignmentCheckEnabled(xc->tcBase())) {
                return std::make_shared<SPAlignmentFault>();
            }
        '''
        eaCode = SPAlignmentCheckCodeNeon + '''
            EA = XURa + imm;
        '''
        # The same 16 bytes viewed either as raw bytes or as four 32-bit
        # words.
        memDecl = '''
            const int MaxNumBytes = 16;
            union MemUnion {
                uint8_t bytes[MaxNumBytes];
                uint32_t floatRegBits[MaxNumBytes / 4];
            };
        '''

        # Do endian conversion for all the elements
        convCode = '''
            VReg x = {0, 0};

            x.lo = (((XReg) memUnion.floatRegBits[1]) << 32) |
                (XReg) memUnion.floatRegBits[0];
            x.hi = (((XReg) memUnion.floatRegBits[3]) << 32) |
                (XReg) memUnion.floatRegBits[2];

            const unsigned eCount = 16 / (1 << eSize);

            if (isBigEndian64(xc->tcBase())) {
                for (unsigned i = 0; i < eCount; i++) {
                    switch (eSize) {
                      case 0x3:  // 64-bit
                        writeVecElem(&x, (XReg) gtobe(
                            (uint64_t) readVecElem(x, i, eSize)), i, eSize);
                        break;
                      case 0x2:  // 32-bit
                        writeVecElem(&x, (XReg) gtobe(
                            (uint32_t) readVecElem(x, i, eSize)), i, eSize);
                        break;
                      case 0x1:  // 16-bit
                        writeVecElem(&x, (XReg) gtobe(
                            (uint16_t) readVecElem(x, i, eSize)), i, eSize);
                        break;
                      default:  // 8-bit
                        break;  // Nothing to do here
                    }
                }
            } else {
                for (unsigned i = 0; i < eCount; i++) {
                    switch (eSize) {
                      case 0x3:  // 64-bit
                        writeVecElem(&x, (XReg) gtole(
                            (uint64_t) readVecElem(x, i, eSize)), i, eSize);
                        break;
                      case 0x2:  // 32-bit
                        writeVecElem(&x, (XReg) gtole(
                            (uint32_t) readVecElem(x, i, eSize)), i, eSize);
                        break;
                      case 0x1:  // 16-bit
                        writeVecElem(&x, (XReg) gtole(
                            (uint16_t) readVecElem(x, i, eSize)), i, eSize);
                        break;
                      default:  // 8-bit
                        break;  // Nothing to do here
                    }
                }
            }

            memUnion.floatRegBits[0] = (uint32_t) x.lo;
            memUnion.floatRegBits[1] = (uint32_t) (x.lo >> 32);
            memUnion.floatRegBits[2] = (uint32_t) x.hi;
            memUnion.floatRegBits[3] = (uint32_t) (x.hi >> 32);
        '''

        # Offload everything into registers
        regSetCode = ''
        for reg in range(4):
            regSetCode += '''
            AA64FpDestP%(reg)d_uw = gtoh(memUnion.floatRegBits[%(reg)d]);
            ''' % { 'reg' : reg }

        # Pull everything in from registers
        regGetCode = ''
        for reg in range(4):
            regGetCode += '''
            memUnion.floatRegBits[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
            ''' % { 'reg' : reg }

        # Loads convert the data held in memUnion and then write it to the
        # destination words; stores read the words first and convert
        # afterwards.
        loadMemAccCode = convCode + regSetCode
        storeMemAccCode = regGetCode + convCode

        loadIop = InstObjParams(name + 'ld',
                'MicroNeonLoad64',
                'MicroNeonMemOp',
            {   'mem_decl' : memDecl,
                'memacc_code' : loadMemAccCode,
                'ea_code' : simd64EnabledCheckCode + eaCode,
            },
            [ 'IsMicroop', 'IsMemRef', 'IsLoad' ])
        storeIop = InstObjParams(name + 'st',
                'MicroNeonStore64',
                'MicroNeonMemOp',
            {   'mem_decl' : memDecl,
                'memacc_code' : storeMemAccCode,
                'ea_code' : simd64EnabledCheckCode + eaCode,
            },
            [ 'IsMicroop', 'IsMemRef', 'IsStore' ])

        exec_output += NeonLoadExecute64.subst(loadIop) + \
            NeonLoadInitiateAcc64.subst(loadIop) + \
            NeonLoadCompleteAcc64.subst(loadIop) + \
            NeonStoreExecute64.subst(storeIop) + \
            NeonStoreInitiateAcc64.subst(storeIop) + \
            NeonStoreCompleteAcc64.subst(storeIop)
        header_output += MicroNeonMemDeclare64.subst(loadIop) + \
            MicroNeonMemDeclare64.subst(storeIop)

    # Generate one data-marshalling micro-op and append its declaration
    # and execute body to header_output and exec_output.
    #
    #   name    -- selects the generated body; one of 'deint_neon_uop',
    #              'int_neon_uop', 'unpack_neon_uop' or 'pack_neon_uop'.
    #              Any other value generates nothing.
    #   Name    -- class name passed to InstObjParams.
    #   numRegs -- number of VReg entries of input[] that are filled from
    #              the Op1 operand words (bound of the Python loops below).
    #
    # Note that the identifier numRegs appearing *inside* the C++ snippets
    # is not substituted from the Python argument; it is left to be
    # resolved in the generated C++ code.
    def mkMarshalMicroOp(name, Name, numRegs=4):
        global header_output, decoder_output, exec_output

        # Gather the four 32-bit words of each source register into
        # input[v], using the operand names without the 'S' suffix ...
        getInputCodeOp1L = ''
        for v in range(numRegs):
            for p in range(4):
                getInputCodeOp1L += '''
            writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)d_uw,
                         %(p)d, 0x2);
            ''' % { 'v' : v, 'p' : p }

        # ... and the same using the operand names with the 'S' suffix.
        getInputCodeOp1S = ''
        for v in range(numRegs):
            for p in range(4):
                getInputCodeOp1S += '''
            writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)dS_uw,
                         %(p)d, 0x2);
            ''' % { 'v' : v, 'p' : p }

        if name == 'deint_neon_uop':

            eCode = '''
                VReg input[4];  // input data from scratch area
                VReg output[2];  // output data to arch. SIMD regs
                VReg temp;
                temp.lo = 0;
                temp.hi = 0;
            '''
            # Save the current contents of the second destination register
            # so that it can be written back unchanged when that register
            # is not produced by this step.
            for p in range(4):
                eCode += '''
                writeVecElem(&temp, (XReg) AA64FpDestP%(p)dV1L_uw, %(p)d, 0x2);
                ''' % { 'p' : p }
            eCode += getInputCodeOp1L

            # Note that numRegs is not always the same as numStructElems; in
            # particular, for LD1/ST1, numStructElems is 1 but numRegs can be
            # 1, 2, 3 or 4

            eCode += '''
                output[0].lo = 0;
                output[0].hi = 0;
                output[1].lo = 0;
                output[1].hi = 0;

                int eCount = dataSize / (8 << eSize);
                int eSizeBytes = 1 << eSize;  // element size in bytes
                int numBytes = step * dataSize / 4;
                int totNumBytes = numRegs * dataSize / 8;

                int structElemNo, pos, a, b;
                XReg data;

                for (int r = 0; r < 2; ++r) {
                    for (int i = 0; i < eCount; ++i) {
                        if (numBytes < totNumBytes) {
                            structElemNo = r + (step * 2);
                            if (numStructElems == 1) {
                                pos = (eSizeBytes * i) +
                                    (eCount * structElemNo * eSizeBytes);
                            } else {
                                pos = (numStructElems * eSizeBytes * i) +
                                    (structElemNo * eSizeBytes);
                            }
                            a = pos / 16;
                            b = (pos % 16) / eSizeBytes;
                            data = (XReg) readVecElem(input[a], (XReg) b,
                                                      eSize);
                            writeVecElem(&output[r], data, i, eSize);
                            numBytes += eSizeBytes;
                        }
                    }
                }
            '''
            for p in range(4):
                eCode += '''
                AA64FpDestP%(p)dV0L_uw = (uint32_t) readVecElem(output[0],
                    %(p)d, 0x2);
                ''' % { 'p' : p }
            eCode += '''
                if ((numRegs % 2 == 0) || (numRegs == 3 && step == 0)) {
            '''
            for p in range(4):
                eCode += '''
                    AA64FpDestP%(p)dV1L_uw = (uint32_t) readVecElem(
                        output[1], %(p)d, 0x2);
                ''' % { 'p' : p }
            eCode += '''
                } else {
            '''
            for p in range(4):
                eCode += '''
                    AA64FpDestP%(p)dV1L_uw = (uint32_t) readVecElem(temp,
                        %(p)d, 0x2);
                ''' % { 'p' : p }
            eCode += '''
                }
            '''

            iop = InstObjParams(name, Name, 'MicroNeonMixOp64',
                                { 'code' : eCode, 'op_class' : 'No_OpClass' },
                                ['IsMicroop'])
            header_output += MicroNeonMixDeclare64.subst(iop)
            exec_output += MicroNeonMixExecute64.subst(iop)

        elif name == 'int_neon_uop':

            eCode = '''
                VReg input[4];  // input data from arch. SIMD regs
                VReg output[2];  // output data to scratch area
            '''

            eCode += getInputCodeOp1S

            # Note that numRegs is not always the same as numStructElems; in
            # particular, for LD1/ST1, numStructElems is 1 but numRegs can be
            # 1, 2, 3 or 4

            eCode += '''
                int eCount = dataSize / (8 << eSize);
                int eSizeBytes = 1 << eSize;
                int totNumBytes = numRegs * dataSize / 8;
                int numOutputElems = 128 / (8 << eSize);
                int stepOffset = step * 32;

                for (int i = 0; i < 2; ++i) {
                    output[i].lo = 0;
                    output[i].hi = 0;
                }

                int r = 0, k = 0, i, j;
                XReg data;

                for (int pos = stepOffset; pos < 32 + stepOffset;
                        pos += eSizeBytes) {
                    if (pos < totNumBytes) {
                        if (numStructElems == 1) {
                            i = (pos / eSizeBytes) % eCount;
                            j = pos / (eCount * eSizeBytes);
                        } else {
                            i = pos / (numStructElems * eSizeBytes);
                            j = (pos % (numStructElems * eSizeBytes)) /
                                eSizeBytes;
                        }
                        data = (XReg) readVecElem(input[j], (XReg) i, eSize);
                        writeVecElem(&output[r], data, k, eSize);
                        k++;
                        if (k == numOutputElems){
                            k = 0;
                            ++r;
                        }
                    }
                }
            '''
            for v in range(2):
                for p in range(4):
                    eCode += '''
                AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem(
                    output[%(v)d], %(p)d, 0x2);
                ''' % { 'v': v, 'p': p}

            iop = InstObjParams(name, Name, 'MicroNeonMixOp64',
                                { 'code' : eCode, 'op_class' : 'No_OpClass' },
                                ['IsMicroop'])
            header_output += MicroNeonMixDeclare64.subst(iop)
            exec_output += MicroNeonMixExecute64.subst(iop)

        elif name == 'unpack_neon_uop':

            eCode = '''
                VReg input[4];  //input data from scratch area
                VReg output[2];  //output data to arch. SIMD regs
            '''

            eCode += getInputCodeOp1L

            # Fill output regs with register data initially.  Note that
            # elements in output register outside indexed lanes are left
            # untouched
            for v in range(2):
                for p in range(4):
                    eCode += '''
                writeVecElem(&output[%(v)d], (XReg) AA64FpDestP%(p)dV%(v)dL_uw,
                             %(p)d, 0x2);
                ''' % { 'v': v, 'p': p}
            eCode += '''
                int eCount = dataSize / (8 << eSize);
                int eCount128 = 128 / (8 << eSize);
                int eSizeBytes = 1 << eSize;
                int totNumBytes = numStructElems * eSizeBytes;
                int numInputElems = eCount128;
                int stepOffset = step * 2 * eSizeBytes;
                int stepLimit = 2 * eSizeBytes;

                int r = 0, i, j;
                XReg data;

                for (int pos = stepOffset; pos < stepLimit + stepOffset;
                        pos += eSizeBytes) {
                    if (pos < totNumBytes) {
                        r = pos / eSizeBytes;
                        j = r / numInputElems;
                        i = r % numInputElems;
                        data = (XReg) readVecElem(input[j], (XReg) i, eSize);

                        if (replicate) {
                            for (int i = 0; i < eCount128; ++i) {
                                if (i < eCount) {
                                    writeVecElem(&output[r % 2], data, i,
                                                 eSize);
                                } else {  // zero extend if necessary
                                    writeVecElem(&output[r % 2], (XReg) 0, i,
                                                 eSize);
                                }
                            }
                        } else {
                            writeVecElem(&output[r % 2], data, lane, eSize);
                        }
                    }
                }
            '''
            for v in range(2):
                for p in range(4):
                    eCode += '''
                AA64FpDestP%(p)dV%(v)dL_uw = (uint32_t) readVecElem(
                    output[%(v)d], %(p)d, 0x2);
                ''' % { 'v' : v, 'p' : p }

            iop = InstObjParams(name, Name, 'MicroNeonMixLaneOp64',
                                { 'code' : eCode }, ['IsMicroop'])
            header_output += MicroNeonMixLaneDeclare64.subst(iop)
            exec_output += MicroNeonMixExecute64.subst(iop)

        elif name == 'pack_neon_uop':

            eCode = '''
                VReg input[4];  // input data from arch. SIMD regs
                VReg output[2];  // output data to scratch area
            '''

            eCode += getInputCodeOp1S

            eCode += '''
                int eSizeBytes = 1 << eSize;
                int numOutputElems = 128 / (8 << eSize);
                int totNumBytes = numStructElems * eSizeBytes;
                int stepOffset = step * 32;
                int stepLimit = 32;

                int r = 0, i, j;
                XReg data;

                for (int i = 0; i < 2; ++i) {
                    output[i].lo = 0;
                    output[i].hi = 0;
                }

                for (int pos = stepOffset; pos < stepLimit + stepOffset;
                        pos += eSizeBytes) {
                    if (pos < totNumBytes) {
                        r = pos / 16;
                        j = pos / eSizeBytes;
                        i = (pos / eSizeBytes) % numOutputElems;
                        data = (XReg) readVecElem(input[j], lane, eSize);
                        writeVecElem(&output[r % 2], data, i, eSize);
                    }
                }
            '''

            for v in range(2):
                for p in range(4):
                    eCode += '''
                AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem(
                    output[%(v)d], %(p)d, 0x2);
                ''' % { 'v' : v, 'p' : p }

            iop = InstObjParams(name, Name, 'MicroNeonMixLaneOp64',
                                { 'code' : eCode }, ['IsMicroop'])
            header_output += MicroNeonMixLaneDeclare64.subst(iop)
            exec_output += MicroNeonMixExecute64.subst(iop)

    # Generate instructions
    mkMemAccMicroOp('mem_neon_uop')
    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_1Reg', numRegs=1)
    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_2Reg', numRegs=2)
    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_3Reg', numRegs=3)
    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_4Reg', numRegs=4)
    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_1Reg', numRegs=1)
    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_2Reg', numRegs=2)
    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_3Reg', numRegs=3)
    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_4Reg', numRegs=4)
    mkMarshalMicroOp('unpack_neon_uop', 'MicroUnpackNeon64')
    mkMarshalMicroOp('pack_neon_uop', 'MicroPackNeon64')

}};

let {{

    # Declarations and constructors of the four macro-op classes
    # (multiple-structure and single-structure loads and stores).

    iop = InstObjParams('vldmult64', 'VldMult64', 'VldMultOp64', '', [])
    header_output += VMemMultDeclare64.subst(iop)
    decoder_output += VMemMultConstructor64.subst(iop)

    iop = InstObjParams('vstmult64', 'VstMult64', 'VstMultOp64', '', [])
    header_output += VMemMultDeclare64.subst(iop)
    decoder_output += VMemMultConstructor64.subst(iop)

    iop = InstObjParams('vldsingle64', 'VldSingle64', 'VldSingleOp64', '', [])
    header_output += VMemSingleDeclare64.subst(iop)
    decoder_output += VMemSingleConstructor64.subst(iop)

    iop = InstObjParams('vstsingle64', 'VstSingle64', 'VstSingleOp64', '', [])
    header_output += VMemSingleDeclare64.subst(iop)
    decoder_output += VMemSingleConstructor64.subst(iop)

}};