vfp.cc revision 10037
/*
 * Copyright (c) 2010-2013 ARM Limited
 * All rights reserved
 *
 * The license below extends only to copyright in the software and shall
 * not be construed as granting a license to any other intellectual
 * property including but not limited to intellectual property relating
 * to a hardware implementation of the functionality of the software
 * licensed hereunder. You may use the software subject to the license
 * terms below provided that you ensure that this notice is replicated
 * unmodified and in its entirety in all distributions of the software,
 * modified or unmodified, in source code or in binary form.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Gabe Black
 */

#include "arch/arm/insts/vfp.hh"

/*
 * The asm statements below are to keep gcc from reordering code. Otherwise
 * the rounding mode might be set after the operation it was intended for, the
 * exception bits read before it, etc.
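 *
 * Each barrier is an empty volatile asm statement that names the relevant
 * variable as both a memory input and a memory output. Since the compiler
 * must assume the asm reads and writes that value, the floating point
 * operation that produces or consumes it cannot be scheduled across the
 * barrier, which keeps it between the fesetround()/feclearexcept() call
 * before it and the fetestexcept() call after it.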
 */

std::string
FpCondCompRegOp::generateDisassembly(
        Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss, "", false);
    printReg(ss, op1);
    ccprintf(ss, ", ");
    printReg(ss, op2);
    ccprintf(ss, ", #%d", defCc);
    ccprintf(ss, ", ");
    printCondition(ss, condCode, true);
    return ss.str();
}

std::string
FpCondSelOp::generateDisassembly(
        Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss, "", false);
    printReg(ss, dest);
    ccprintf(ss, ", ");
    printReg(ss, op1);
    ccprintf(ss, ", ");
    printReg(ss, op2);
    ccprintf(ss, ", ");
    printCondition(ss, condCode, true);
    return ss.str();
}

std::string
FpRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printReg(ss, dest + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op1 + FP_Reg_Base);
    return ss.str();
}

std::string
FpRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printReg(ss, dest + FP_Reg_Base);
    ccprintf(ss, ", #%d", imm);
    return ss.str();
}

std::string
FpRegRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printReg(ss, dest + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op1 + FP_Reg_Base);
    ccprintf(ss, ", #%d", imm);
    return ss.str();
}

std::string
FpRegRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printReg(ss, dest + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op1 + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op2 + FP_Reg_Base);
    return ss.str();
}

std::string
FpRegRegRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printReg(ss, dest + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op1 + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op2 + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op3 + FP_Reg_Base);
    return ss.str();
}

std::string
FpRegRegRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printReg(ss, dest + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op1 + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op2 + FP_Reg_Base);
    ccprintf(ss, ", #%d", imm);
    return ss.str();
}

namespace ArmISA
{

VfpSavedState
prepFpState(uint32_t rMode)
{
    int roundingMode = fegetround();
    feclearexcept(FeAllExceptions);
    switch (rMode) {
      case VfpRoundNearest:
        fesetround(FeRoundNearest);
        break;
      case VfpRoundUpward:
        fesetround(FeRoundUpward);
        break;
      case VfpRoundDown:
        fesetround(FeRoundDown);
        break;
      case VfpRoundZero:
        fesetround(FeRoundZero);
        break;
    }
    return roundingMode;
}

void
finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush, FPSCR mask)
{
    int exceptions = fetestexcept(FeAllExceptions);
    bool underflow = false;
    if ((exceptions & FeInvalid) && mask.ioc) {
        fpscr.ioc = 1;
    }
    if ((exceptions & FeDivByZero) && mask.dzc) {
        fpscr.dzc = 1;
    }
    if ((exceptions & FeOverflow) && mask.ofc) {
        fpscr.ofc = 1;
    }
    if (exceptions & FeUnderflow) {
        underflow = true;
        if (mask.ufc)
            fpscr.ufc = 1;
    }
    if ((exceptions & FeInexact) && !(underflow && flush) && mask.ixc) {
        fpscr.ixc = 1;
    }
    fesetround(state);
}

template <class fpType>
fpType
fixDest(bool flush, bool defaultNan, fpType val, fpType op1)
{
    int fpClass = std::fpclassify(val);
    fpType junk = 0.0;
    if (fpClass == FP_NAN) {
        const bool single = (sizeof(val) == sizeof(float));
        const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan = std::isnan(op1);
        if (!nan || defaultNan) {
            val = bitsToFp(qnan, junk);
        } else if (nan) {
            val = bitsToFp(fpToBits(op1) | qnan, junk);
        }
    } else if (fpClass == FP_SUBNORMAL && flush == 1) {
        // Turn val into a zero with the correct sign;
        uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1);
        val = bitsToFp(fpToBits(val) & bitMask, junk);
        feclearexcept(FeInexact);
        feraiseexcept(FeUnderflow);
    }
    return val;
}

template
float fixDest<float>(bool flush, bool defaultNan, float val, float op1);
template
double fixDest<double>(bool flush, bool defaultNan, double val, double op1);

template <class fpType>
fpType
fixDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2)
{
    int fpClass = std::fpclassify(val);
    fpType junk = 0.0;
    if (fpClass == FP_NAN) {
        const bool single = (sizeof(val) == sizeof(float));
        const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan1 = std::isnan(op1);
        const bool nan2 = std::isnan(op2);
        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
        if ((!nan1 && !nan2) || defaultNan) {
            val = bitsToFp(qnan, junk);
        } else if (signal1) {
            val = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            val = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (nan1) {
            val = op1;
        } else if (nan2) {
            val = op2;
        }
    } else if (fpClass == FP_SUBNORMAL && flush) {
        // Turn val into a zero with the correct sign;
        uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1);
        val = bitsToFp(fpToBits(val) & bitMask, junk);
        feclearexcept(FeInexact);
        feraiseexcept(FeUnderflow);
    }
    return val;
}

template
float fixDest<float>(bool flush, bool defaultNan,
                     float val, float op1, float op2);
template
double fixDest<double>(bool flush, bool defaultNan,
                       double val, double op1, double op2);

template <class fpType>
fpType
fixDivDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2)
{
    fpType mid = fixDest(flush, defaultNan, val, op1, op2);
    const bool single = (sizeof(fpType) == sizeof(float));
    const fpType junk = 0.0;
    if ((single && (val == bitsToFp(0x00800000, junk) ||
                    val == bitsToFp(0x80800000, junk))) ||
        (!single && (val == bitsToFp(ULL(0x0010000000000000), junk) ||
                     val == bitsToFp(ULL(0x8010000000000000), junk)))
        ) {
        __asm__ __volatile__("" : "=m" (op1) : "m" (op1));
        fesetround(FeRoundZero);
        fpType temp = 0.0;
        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
        temp = op1 / op2;
        if (flushToZero(temp)) {
            feraiseexcept(FeUnderflow);
            if (flush) {
                feclearexcept(FeInexact);
                mid = temp;
            }
        }
        __asm__ __volatile__("" :: "m" (temp));
    }
    return mid;
}

template
float fixDivDest<float>(bool flush, bool defaultNan,
                        float val, float op1, float op2);
template
double fixDivDest<double>(bool flush, bool defaultNan,
                          double val, double op1, double op2);

float
fixFpDFpSDest(FPSCR fpscr, double val)
{
    const float junk = 0.0;
    float op1 = 0.0;
    if (std::isnan(val)) {
        uint64_t valBits = fpToBits(val);
        uint32_t op1Bits = bits(valBits, 50, 29) |
                           (mask(9) << 22) |
                           (bits(valBits, 63) << 31);
        op1 = bitsToFp(op1Bits, junk);
    }
    float mid = fixDest(fpscr.fz, fpscr.dn, (float)val, op1);
    if (fpscr.fz && fetestexcept(FeUnderflow | FeInexact) ==
                    (FeUnderflow | FeInexact)) {
        feclearexcept(FeInexact);
    }
    if (mid == bitsToFp(0x00800000, junk) ||
        mid == bitsToFp(0x80800000, junk)) {
        __asm__ __volatile__("" : "=m" (val) : "m" (val));
        fesetround(FeRoundZero);
        float temp = 0.0;
        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
        temp = val;
        if (flushToZero(temp)) {
            feraiseexcept(FeUnderflow);
            if (fpscr.fz) {
                feclearexcept(FeInexact);
                mid = temp;
            }
        }
        __asm__ __volatile__("" :: "m" (temp));
    }
    return mid;
}

double
fixFpSFpDDest(FPSCR fpscr, float val)
{
    const double junk = 0.0;
    double op1 = 0.0;
    if (std::isnan(val)) {
        uint32_t valBits = fpToBits(val);
        uint64_t op1Bits = ((uint64_t)bits(valBits, 21, 0) << 29) |
                           (mask(12) << 51) |
                           ((uint64_t)bits(valBits, 31) << 63);
        op1 = bitsToFp(op1Bits, junk);
    }
    double mid = fixDest(fpscr.fz, fpscr.dn, (double)val, op1);
    if (mid == bitsToFp(ULL(0x0010000000000000), junk) ||
        mid == bitsToFp(ULL(0x8010000000000000), junk)) {
        __asm__ __volatile__("" : "=m" (val) : "m" (val));
        fesetround(FeRoundZero);
        double temp = 0.0;
        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
        temp = val;
        if (flushToZero(temp)) {
            feraiseexcept(FeUnderflow);
            if (fpscr.fz) {
                feclearexcept(FeInexact);
                mid = temp;
            }
        }
        __asm__ __volatile__("" :: "m" (temp));
    }
    return mid;
}

static inline uint16_t
vcvtFpFpH(FPSCR &fpscr, bool flush, bool defaultNan,
          uint32_t rMode, bool ahp, uint64_t opBits, bool isDouble)
{
    uint32_t mWidth;
    uint32_t eWidth;
    uint32_t eHalfRange;
    uint32_t sBitPos;

    if (isDouble) {
        mWidth = 52;
        eWidth = 11;
    } else {
        mWidth = 23;
        eWidth = 8;
    }
    sBitPos = eWidth + mWidth;
    eHalfRange = (1 << (eWidth-1)) - 1;

    // Extract the operand.
    bool neg = bits(opBits, sBitPos);
    uint32_t exponent = bits(opBits, sBitPos-1, mWidth);
    uint64_t oldMantissa = bits(opBits, mWidth-1, 0);
    uint32_t mantissa = oldMantissa >> (mWidth - 10);
    // Do the conversion.
    uint64_t extra = oldMantissa & mask(mWidth - 10);
    if (exponent == mask(eWidth)) {
        if (oldMantissa != 0) {
            // Nans.
            if (bits(mantissa, 9) == 0) {
                // Signalling nan.
                fpscr.ioc = 1;
            }
            if (ahp) {
                mantissa = 0;
                exponent = 0;
                fpscr.ioc = 1;
            } else if (defaultNan) {
                mantissa = (1 << 9);
                exponent = 0x1f;
                neg = false;
            } else {
                exponent = 0x1f;
                mantissa |= (1 << 9);
            }
        } else {
            // Infinities.
            exponent = 0x1F;
            if (ahp) {
                fpscr.ioc = 1;
                mantissa = 0x3ff;
            } else {
                mantissa = 0;
            }
        }
    } else if (exponent == 0 && oldMantissa == 0) {
        // Zero, don't need to do anything.
    } else {
        // Normalized or denormalized numbers.

        bool inexact = (extra != 0);

        if (exponent == 0) {
            // Denormalized.
            // If flush to zero is on, this shouldn't happen.
            assert(!flush);

            // Check for underflow
            if (inexact || fpscr.ufe)
                fpscr.ufc = 1;

            // Handle rounding.
            unsigned mode = rMode;
            if ((mode == VfpRoundUpward && !neg && extra) ||
                (mode == VfpRoundDown && neg && extra) ||
                (mode == VfpRoundNearest &&
                 (extra > (1 << 9) ||
                  (extra == (1 << 9) && bits(mantissa, 0))))) {
                mantissa++;
            }

            // See if the number became normalized after rounding.
            if (mantissa == (1 << 10)) {
                mantissa = 0;
                exponent = 1;
            }
        } else {
            // Normalized.

            // We need to track the dropped bits differently since
            // more can be dropped by denormalizing.
            bool topOne = bits(extra, mWidth - 10 - 1);
            bool restZeros = bits(extra, mWidth - 10 - 2, 0) == 0;

            if (exponent <= (eHalfRange - 15)) {
                // The result is too small. Denormalize.
                mantissa |= (1 << 10);
                while (mantissa && exponent <= (eHalfRange - 15)) {
                    restZeros = restZeros && !topOne;
                    topOne = bits(mantissa, 0);
                    mantissa = mantissa >> 1;
                    exponent++;
                }
                if (topOne || !restZeros)
                    inexact = true;
                exponent = 0;
            } else {
                // Change bias.
                exponent -= (eHalfRange - 15);
            }

            if (exponent == 0 && (inexact || fpscr.ufe)) {
                // Underflow
                fpscr.ufc = 1;
            }

            // Handle rounding.
            unsigned mode = rMode;
            bool nonZero = topOne || !restZeros;
            if ((mode == VfpRoundUpward && !neg && nonZero) ||
                (mode == VfpRoundDown && neg && nonZero) ||
                (mode == VfpRoundNearest && topOne &&
                 (!restZeros || bits(mantissa, 0)))) {
                mantissa++;
            }

            // See if we rounded up and need to bump the exponent.
            if (mantissa == (1 << 10)) {
                mantissa = 0;
                exponent++;
            }

            // Deal with overflow
            if (ahp) {
                if (exponent >= 0x20) {
                    exponent = 0x1f;
                    mantissa = 0x3ff;
                    fpscr.ioc = 1;
                    // Suppress inexact exception.
                    inexact = false;
                }
            } else {
                if (exponent >= 0x1f) {
                    if ((mode == VfpRoundNearest) ||
                        (mode == VfpRoundUpward && !neg) ||
                        (mode == VfpRoundDown && neg)) {
                        // Overflow to infinity.
                        exponent = 0x1f;
                        mantissa = 0;
                    } else {
                        // Overflow to max normal.
                        exponent = 0x1e;
                        mantissa = 0x3ff;
                    }
                    fpscr.ofc = 1;
                    inexact = true;
                }
            }
        }

        if (inexact) {
            fpscr.ixc = 1;
        }
    }
    // Reassemble and install the result.
    uint32_t result = bits(mantissa, 9, 0);
    replaceBits(result, 14, 10, exponent);
    if (neg)
        result |= (1 << 15);
    return result;
}

uint16_t
vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan,
           uint32_t rMode, bool ahp, float op)
{
    uint64_t opBits = fpToBits(op);
    return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp, opBits, false);
}

uint16_t
vcvtFpDFpH(FPSCR &fpscr, bool flush, bool defaultNan,
           uint32_t rMode, bool ahp, double op)
{
    uint64_t opBits = fpToBits(op);
    return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp, opBits, true);
}

static inline uint64_t
vcvtFpHFp(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op, bool isDouble)
{
    uint32_t mWidth;
    uint32_t eWidth;
    uint32_t eHalfRange;
    uint32_t sBitPos;

    if (isDouble) {
        mWidth = 52;
        eWidth = 11;
    } else {
        mWidth = 23;
        eWidth = 8;
    }
    sBitPos = eWidth + mWidth;
    eHalfRange = (1 << (eWidth-1)) - 1;

    // Extract the bitfields.
    bool neg = bits(op, 15);
    uint32_t exponent = bits(op, 14, 10);
    uint64_t mantissa = bits(op, 9, 0);
    // Do the conversion.
    if (exponent == 0) {
        if (mantissa != 0) {
            // Normalize the value.
            exponent = exponent + (eHalfRange - 15) + 1;
            while (mantissa < (1 << 10)) {
                mantissa = mantissa << 1;
                exponent--;
            }
        }
        mantissa = mantissa << (mWidth - 10);
    } else if (exponent == 0x1f && !ahp) {
        // Infinities and nans.
        exponent = mask(eWidth);
        if (mantissa != 0) {
            // Nans.
            mantissa = mantissa << (mWidth - 10);
            if (bits(mantissa, mWidth-1) == 0) {
                // Signalling nan.
                fpscr.ioc = 1;
                mantissa |= (((uint64_t) 1) << (mWidth-1));
            }
            if (defaultNan) {
                mantissa &= ~mask(mWidth-1);
                neg = false;
            }
        }
    } else {
        exponent = exponent + (eHalfRange - 15);
        mantissa = mantissa << (mWidth - 10);
    }
    // Reassemble the result.
    uint64_t result = bits(mantissa, mWidth-1, 0);
    replaceBits(result, sBitPos-1, mWidth, exponent);
    if (neg) {
        result |= (((uint64_t) 1) << sBitPos);
    }
    return result;
}

double
vcvtFpHFpD(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op)
{
    double junk = 0.0;
    uint64_t result;

    result = vcvtFpHFp(fpscr, defaultNan, ahp, op, true);
    return bitsToFp(result, junk);
}

float
vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op)
{
    float junk = 0.0;
    uint64_t result;

    result = vcvtFpHFp(fpscr, defaultNan, ahp, op, false);
    return bitsToFp(result, junk);
}

float
vfpUFixedToFpS(bool flush, bool defaultNan,
               uint64_t val, uint8_t width, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (width == 16)
        val = (uint16_t)val;
    else if (width == 32)
        val = (uint32_t)val;
    else if (width != 64)
        panic("Unsupported width %d", width);
    float scale = powf(2.0, imm);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (float)val, scale);
}

float
vfpSFixedToFpS(bool flush, bool defaultNan,
               int64_t val, uint8_t width, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (width == 16)
        val = sext<16>(val & mask(16));
    else if (width == 32)
        val = sext<32>(val & mask(32));
    else if (width != 64)
        panic("Unsupported width %d", width);

    float scale = powf(2.0, imm);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (float)val, scale);
}


double
vfpUFixedToFpD(bool flush, bool defaultNan,
               uint64_t val, uint8_t width, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (width == 16)
        val = (uint16_t)val;
    else if (width == 32)
        val = (uint32_t)val;
    else if (width != 64)
        panic("Unsupported width %d", width);

    double scale = pow(2.0, imm);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (double)val, scale);
}

double
vfpSFixedToFpD(bool flush, bool defaultNan,
               int64_t val, uint8_t width, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (width == 16)
        val = sext<16>(val & mask(16));
    else if (width == 32)
        val = sext<32>(val & mask(32));
    else if (width != 64)
        panic("Unsupported width %d", width);

    double scale = pow(2.0, imm);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (double)val, scale);
}

// This function implements a magic formula taken from the architecture
// reference manual. It was originally called recip_sqrt_estimate.
static double
recipSqrtEstimate(double a)
{
    int64_t q0, q1, s;
    double r;
    if (a < 0.5) {
        q0 = (int64_t)(a * 512.0);
        r = 1.0 / sqrt(((double)q0 + 0.5) / 512.0);
    } else {
        q1 = (int64_t)(a * 256.0);
        r = 1.0 / sqrt(((double)q1 + 0.5) / 256.0);
    }
    s = (int64_t)(256.0 * r + 0.5);
    return (double)s / 256.0;
}

// This function is only intended for use in Neon instructions because
// it ignores certain bits in the FPSCR.
float
fprSqrtEstimate(FPSCR &fpscr, float op)
{
    const uint32_t qnan = 0x7fc00000;
    float junk = 0.0;
    int fpClass = std::fpclassify(op);
    if (fpClass == FP_NAN) {
        if ((fpToBits(op) & qnan) != qnan)
            fpscr.ioc = 1;
        return bitsToFp(qnan, junk);
    } else if (fpClass == FP_ZERO) {
        fpscr.dzc = 1;
        // Return infinity with the same sign as the operand.
        return bitsToFp((std::signbit(op) << 31) |
                        (0xFF << 23) | (0 << 0), junk);
    } else if (std::signbit(op)) {
        // Set invalid op bit.
        fpscr.ioc = 1;
        return bitsToFp(qnan, junk);
    } else if (fpClass == FP_INFINITE) {
        return 0.0;
    } else {
        uint64_t opBits = fpToBits(op);
        double scaled;
        if (bits(opBits, 23)) {
            scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
                              (ULL(0x3fd) << 52) | (bits(opBits, 31) << 63),
                              (double)0.0);
        } else {
            scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
                              (ULL(0x3fe) << 52) | (bits(opBits, 31) << 63),
                              (double)0.0);
        }
        uint64_t resultExp = (380 - bits(opBits, 30, 23)) / 2;

        uint64_t estimate = fpToBits(recipSqrtEstimate(scaled));

        return bitsToFp((bits(estimate, 63) << 31) |
                        (bits(resultExp, 7, 0) << 23) |
                        (bits(estimate, 51, 29) << 0), junk);
    }
}

uint32_t
unsignedRSqrtEstimate(uint32_t op)
{
    if (bits(op, 31, 30) == 0) {
        return -1;
    } else {
        double dpOp;
        if (bits(op, 31)) {
            dpOp = bitsToFp((ULL(0) << 63) |
                            (ULL(0x3fe) << 52) |
                            (bits((uint64_t)op, 30, 0) << 21) |
                            (0 << 0), (double)0.0);
        } else {
            dpOp = bitsToFp((ULL(0) << 63) |
                            (ULL(0x3fd) << 52) |
                            (bits((uint64_t)op, 29, 0) << 22) |
                            (0 << 0), (double)0.0);
        }
        uint64_t estimate = fpToBits(recipSqrtEstimate(dpOp));
        return (1 << 31) | bits(estimate, 51, 21);
    }
}

// This function implements a magic formula taken from the architecture
// reference manual. It was originally called recip_estimate.

static double
recipEstimate(double a)
{
    int64_t q, s;
    double r;
    q = (int64_t)(a * 512.0);
    r = 1.0 / (((double)q + 0.5) / 512.0);
    s = (int64_t)(256.0 * r + 0.5);
    return (double)s / 256.0;
}

// This function is only intended for use in Neon instructions because
// it ignores certain bits in the FPSCR.
827float 828fpRecipEstimate(FPSCR &fpscr, float op) 829{ 830 const uint32_t qnan = 0x7fc00000; 831 float junk = 0.0; 832 int fpClass = std::fpclassify(op); 833 if (fpClass == FP_NAN) { 834 if ((fpToBits(op) & qnan) != qnan) 835 fpscr.ioc = 1; 836 return bitsToFp(qnan, junk); 837 } else if (fpClass == FP_INFINITE) { 838 return bitsToFp(std::signbit(op) << 31, junk); 839 } else if (fpClass == FP_ZERO) { 840 fpscr.dzc = 1; 841 // Return infinity with the same sign as the operand. 842 return bitsToFp((std::signbit(op) << 31) | 843 (0xFF << 23) | (0 << 0), junk); 844 } else if (fabs(op) >= pow(2.0, 126)) { 845 fpscr.ufc = 1; 846 return bitsToFp(std::signbit(op) << 31, junk); 847 } else { 848 uint64_t opBits = fpToBits(op); 849 double scaled; 850 scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) | 851 (ULL(0x3fe) << 52) | (ULL(0) << 63), 852 (double)0.0); 853 uint64_t resultExp = 253 - bits(opBits, 30, 23); 854 855 uint64_t estimate = fpToBits(recipEstimate(scaled)); 856 857 return bitsToFp((bits(opBits, 31) << 31) | 858 (bits(resultExp, 7, 0) << 23) | 859 (bits(estimate, 51, 29) << 0), junk); 860 } 861} 862 863uint32_t 864unsignedRecipEstimate(uint32_t op) 865{ 866 if (bits(op, 31) == 0) { 867 return -1; 868 } else { 869 double dpOp; 870 dpOp = bitsToFp((ULL(0) << 63) | 871 (ULL(0x3fe) << 52) | 872 (bits((uint64_t)op, 30, 0) << 21) | 873 (0 << 0), (double)0.0); 874 uint64_t estimate = fpToBits(recipEstimate(dpOp)); 875 return (1 << 31) | bits(estimate, 51, 21); 876 } 877} 878 879template <class fpType> 880fpType 881FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan, 882 fpType op1, fpType op2) const 883{ 884 done = true; 885 fpType junk = 0.0; 886 fpType dest = 0.0; 887 const bool single = (sizeof(fpType) == sizeof(float)); 888 const uint64_t qnan = 889 single ? 0x7fc00000 : ULL(0x7ff8000000000000); 890 const bool nan1 = std::isnan(op1); 891 const bool nan2 = std::isnan(op2); 892 const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan); 893 const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan); 894 if (nan1 || nan2) { 895 if (defaultNan) { 896 dest = bitsToFp(qnan, junk); 897 } else if (signal1) { 898 dest = bitsToFp(fpToBits(op1) | qnan, junk); 899 } else if (signal2) { 900 dest = bitsToFp(fpToBits(op2) | qnan, junk); 901 } else if (nan1) { 902 dest = op1; 903 } else if (nan2) { 904 dest = op2; 905 } 906 if (signal1 || signal2) { 907 fpscr.ioc = 1; 908 } 909 } else { 910 done = false; 911 } 912 return dest; 913} 914 915template 916float FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan, 917 float op1, float op2) const; 918template 919double FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan, 920 double op1, double op2) const; 921 922// @TODO remove this function when we've finished switching all FMA code to use the new FPLIB 923template <class fpType> 924fpType 925FpOp::ternaryOp(FPSCR &fpscr, fpType op1, fpType op2, fpType op3, 926 fpType (*func)(fpType, fpType, fpType), 927 bool flush, bool defaultNan, uint32_t rMode) const 928{ 929 const bool single = (sizeof(fpType) == sizeof(float)); 930 fpType junk = 0.0; 931 932 if (flush && (flushToZero(op1, op2) || flushToZero(op3))) 933 fpscr.idc = 1; 934 VfpSavedState state = prepFpState(rMode); 935 __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3), "=m" (state) 936 : "m" (op1), "m" (op2), "m" (op3), "m" (state)); 937 fpType dest = func(op1, op2, op3); 938 __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest)); 939 940 int fpClass = std::fpclassify(dest); 941 // Get NAN behavior right. 
    // Get NAN behavior right. This varies between x86 and ARM.
    if (fpClass == FP_NAN) {
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan1 = std::isnan(op1);
        const bool nan2 = std::isnan(op2);
        const bool nan3 = std::isnan(op3);
        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
        const bool signal3 = nan3 && ((fpToBits(op3) & qnan) != qnan);
        if ((!nan1 && !nan2 && !nan3) || (defaultNan == 1)) {
            dest = bitsToFp(qnan, junk);
        } else if (signal1) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            dest = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (signal3) {
            dest = bitsToFp(fpToBits(op3) | qnan, junk);
        } else if (nan1) {
            dest = op1;
        } else if (nan2) {
            dest = op2;
        } else if (nan3) {
            dest = op3;
        }
    } else if (flush && flushToZero(dest)) {
        feraiseexcept(FeUnderflow);
    } else if ((
                (single && (dest == bitsToFp(0x00800000, junk) ||
                            dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                 (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                  dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before- rounding
         * in ARM and -after- rounding in x86.
         */
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3)
                                 : "m" (op1), "m" (op2), "m" (op3));
        fpType temp = func(op1, op2, op3);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    finishVfp(fpscr, state, flush);
    return dest;
}

template
float FpOp::ternaryOp(FPSCR &fpscr, float op1, float op2, float op3,
                      float (*func)(float, float, float),
                      bool flush, bool defaultNan, uint32_t rMode) const;
template
double FpOp::ternaryOp(FPSCR &fpscr, double op1, double op2, double op3,
                       double (*func)(double, double, double),
                       bool flush, bool defaultNan, uint32_t rMode) const;

template <class fpType>
fpType
FpOp::binaryOp(FPSCR &fpscr, fpType op1, fpType op2,
               fpType (*func)(fpType, fpType),
               bool flush, bool defaultNan, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    if (flush && flushToZero(op1, op2))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (state)
                             : "m" (op1), "m" (op2), "m" (state));
    fpType dest = func(op1, op2);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

    // Get NAN behavior right. This varies between x86 and ARM.
    if (std::isnan(dest)) {
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan1 = std::isnan(op1);
        const bool nan2 = std::isnan(op2);
        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
        if ((!nan1 && !nan2) || (defaultNan == 1)) {
            dest = bitsToFp(qnan, junk);
        } else if (signal1) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            dest = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (nan1) {
            dest = op1;
        } else if (nan2) {
            dest = op2;
        }
    } else if (flush && flushToZero(dest)) {
        feraiseexcept(FeUnderflow);
    } else if ((
                (single && (dest == bitsToFp(0x00800000, junk) ||
                            dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                 (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                  dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before- rounding
         * in ARM and -after- rounding in x86.
         */
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2)
                                 : "m" (op1), "m" (op2));
        fpType temp = func(op1, op2);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    finishVfp(fpscr, state, flush);
    return dest;
}

template
float FpOp::binaryOp(FPSCR &fpscr, float op1, float op2,
                     float (*func)(float, float),
                     bool flush, bool defaultNan, uint32_t rMode) const;
template
double FpOp::binaryOp(FPSCR &fpscr, double op1, double op2,
                      double (*func)(double, double),
                      bool flush, bool defaultNan, uint32_t rMode) const;

template <class fpType>
fpType
FpOp::unaryOp(FPSCR &fpscr, fpType op1, fpType (*func)(fpType),
              bool flush, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    if (flush && flushToZero(op1))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (state)
                             : "m" (op1), "m" (state));
    fpType dest = func(op1);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

    // Get NAN behavior right. This varies between x86 and ARM.
    if (std::isnan(dest)) {
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan = std::isnan(op1);
        if (!nan || fpscr.dn == 1) {
            dest = bitsToFp(qnan, junk);
        } else if (nan) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        }
    } else if (flush && flushToZero(dest)) {
        feraiseexcept(FeUnderflow);
    } else if ((
                (single && (dest == bitsToFp(0x00800000, junk) ||
                            dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                 (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                  dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before- rounding
         * in ARM and -after- rounding in x86.
         */
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1) : "m" (op1));
        fpType temp = func(op1);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    finishVfp(fpscr, state, flush);
    return dest;
}

template
float FpOp::unaryOp(FPSCR &fpscr, float op1, float (*func)(float),
                    bool flush, uint32_t rMode) const;
template
double FpOp::unaryOp(FPSCR &fpscr, double op1, double (*func)(double),
                     bool flush, uint32_t rMode) const;

IntRegIndex
VfpMacroOp::addStride(IntRegIndex idx, unsigned stride)
{
    if (wide) {
        stride *= 2;
    }
    unsigned offset = idx % 8;
    idx = (IntRegIndex)(idx - offset);
    offset += stride;
    idx = (IntRegIndex)(idx + (offset % 8));
    return idx;
}

void
VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1, IntRegIndex &op2)
{
    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
    assert(!inScalarBank(dest));
    dest = addStride(dest, stride);
    op1 = addStride(op1, stride);
    if (!inScalarBank(op2)) {
        op2 = addStride(op2, stride);
    }
}

void
VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1)
{
    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
    assert(!inScalarBank(dest));
    dest = addStride(dest, stride);
    if (!inScalarBank(op1)) {
        op1 = addStride(op1, stride);
    }
}

void
VfpMacroOp::nextIdxs(IntRegIndex &dest)
{
    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
    assert(!inScalarBank(dest));
    dest = addStride(dest, stride);
}

}