/*
 * Copyright (c) 2010-2013, 2019 ARM Limited
 * All rights reserved
 *
 * The license below extends only to copyright in the software and shall
 * not be construed as granting a license to any other intellectual
 * property including but not limited to intellectual property relating
 * to a hardware implementation of the functionality of the software
 * licensed hereunder. You may use the software subject to the license
 * terms below provided that you ensure that this notice is replicated
 * unmodified and in its entirety in all distributions of the software,
 * modified or unmodified, in source code or in binary form.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Gabe Black
 */

#include "arch/arm/insts/vfp.hh"

/*
 * The asm statements below are to keep gcc from reordering code. Otherwise
 * the rounding mode might be set after the operation it was intended for,
 * the exception bits read before it, etc.
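 *
 * For example, a barrier such as
 *     __asm__ __volatile__("" : "=m" (x) : "m" (x));
 * (the pattern used throughout this file) forces x through memory and
 * keeps the compiler from moving the computation of x across the
 * surrounding fesetround()/fetestexcept() calls.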
 */

std::string
FpCondCompRegOp::generateDisassembly(
        Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss, "", false);
    printIntReg(ss, op1);
    ccprintf(ss, ", ");
    printIntReg(ss, op2);
    ccprintf(ss, ", #%d", defCc);
    ccprintf(ss, ", ");
    printCondition(ss, condCode, true);
    return ss.str();
}

std::string
FpCondSelOp::generateDisassembly(
        Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss, "", false);
    printIntReg(ss, dest);
    ccprintf(ss, ", ");
    printIntReg(ss, op1);
    ccprintf(ss, ", ");
    printIntReg(ss, op2);
    ccprintf(ss, ", ");
    printCondition(ss, condCode, true);
    return ss.str();
}

std::string
FpRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printFloatReg(ss, dest);
    ss << ", ";
    printFloatReg(ss, op1);
    return ss.str();
}

std::string
FpRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printFloatReg(ss, dest);
    ccprintf(ss, ", #%d", imm);
    return ss.str();
}

std::string
FpRegRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printFloatReg(ss, dest);
    ss << ", ";
    printFloatReg(ss, op1);
    ccprintf(ss, ", #%d", imm);
    return ss.str();
}

std::string
FpRegRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printFloatReg(ss, dest);
    ss << ", ";
    printFloatReg(ss, op1);
    ss << ", ";
    printFloatReg(ss, op2);
    return ss.str();
}

std::string
FpRegRegRegCondOp::generateDisassembly(Addr pc, const SymbolTable *symtab)
    const
{
    std::stringstream ss;
    printMnemonic(ss);
    printCondition(ss, cond);
    printFloatReg(ss, dest);
    ss << ", ";
    printFloatReg(ss, op1);
    ss << ", ";
    printFloatReg(ss, op2);
    return ss.str();
}

std::string
FpRegRegRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printFloatReg(ss, dest);
    ss << ", ";
    printFloatReg(ss, op1);
    ss << ", ";
    printFloatReg(ss, op2);
    ss << ", ";
    printFloatReg(ss, op3);
    return ss.str();
}

std::string
FpRegRegRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printFloatReg(ss, dest);
    ss << ", ";
    printFloatReg(ss, op1);
    ss << ", ";
    printFloatReg(ss, op2);
    ccprintf(ss, ", #%d", imm);
    return ss.str();
}

namespace ArmISA
{
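
// Save the host FP environment and install the rounding mode the FPSCR
// asks for. Paired with finishVfp() below, which translates any host
// exceptions raised in between into FPSCR cumulative-flag bits (subject
// to the exception mask) and restores the saved host rounding mode.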
VfpSavedState
prepFpState(uint32_t rMode)
{
    int roundingMode = fegetround();
    feclearexcept(FeAllExceptions);
    switch (rMode) {
      case VfpRoundNearest:
        fesetround(FeRoundNearest);
        break;
      case VfpRoundUpward:
        fesetround(FeRoundUpward);
        break;
      case VfpRoundDown:
        fesetround(FeRoundDown);
        break;
      case VfpRoundZero:
        fesetround(FeRoundZero);
        break;
    }
    return roundingMode;
}

void
finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush, FPSCR mask)
{
    int exceptions = fetestexcept(FeAllExceptions);
    bool underflow = false;
    if ((exceptions & FeInvalid) && mask.ioc) {
        fpscr.ioc = 1;
    }
    if ((exceptions & FeDivByZero) && mask.dzc) {
        fpscr.dzc = 1;
    }
    if ((exceptions & FeOverflow) && mask.ofc) {
        fpscr.ofc = 1;
    }
    if (exceptions & FeUnderflow) {
        underflow = true;
        if (mask.ufc)
            fpscr.ufc = 1;
    }
    if ((exceptions & FeInexact) && !(underflow && flush) && mask.ixc) {
        fpscr.ixc = 1;
    }
    fesetround(state);
}

// Fix up a host result so its NaN and denormal handling matches ARM:
// quiet the NaN (or substitute the default NaN), and flush subnormal
// results to a signed zero with Underflow raised when flush-to-zero
// mode is on.
template <class fpType>
fpType
fixDest(bool flush, bool defaultNan, fpType val, fpType op1)
{
    int fpClass = std::fpclassify(val);
    fpType junk = 0.0;
    if (fpClass == FP_NAN) {
        const bool single = (sizeof(val) == sizeof(float));
        const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan = std::isnan(op1);
        if (!nan || defaultNan) {
            val = bitsToFp(qnan, junk);
        } else if (nan) {
            val = bitsToFp(fpToBits(op1) | qnan, junk);
        }
    } else if (fpClass == FP_SUBNORMAL && flush) {
        // Turn val into a zero with the correct sign.
        uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1);
        val = bitsToFp(fpToBits(val) & bitMask, junk);
        feclearexcept(FeInexact);
        feraiseexcept(FeUnderflow);
    }
    return val;
}

template
float fixDest<float>(bool flush, bool defaultNan, float val, float op1);
template
double fixDest<double>(bool flush, bool defaultNan, double val, double op1);

template <class fpType>
fpType
fixDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2)
{
    int fpClass = std::fpclassify(val);
    fpType junk = 0.0;
    if (fpClass == FP_NAN) {
        const bool single = (sizeof(val) == sizeof(float));
        const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan1 = std::isnan(op1);
        const bool nan2 = std::isnan(op2);
        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
        if ((!nan1 && !nan2) || defaultNan) {
            val = bitsToFp(qnan, junk);
        } else if (signal1) {
            val = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            val = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (nan1) {
            val = op1;
        } else if (nan2) {
            val = op2;
        }
    } else if (fpClass == FP_SUBNORMAL && flush) {
        // Turn val into a zero with the correct sign.
        uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1);
        val = bitsToFp(fpToBits(val) & bitMask, junk);
        feclearexcept(FeInexact);
        feraiseexcept(FeUnderflow);
    }
    return val;
}

template
float fixDest<float>(bool flush, bool defaultNan,
                     float val, float op1, float op2);
template
double fixDest<double>(bool flush, bool defaultNan,
                       double val, double op1, double op2);
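
// Division results sitting exactly at the smallest normal need extra
// care: ARM detects underflow before rounding while x86 detects it
// after. Recompute the quotient toward zero to see whether the
// pre-rounding value was subnormal, and flush/underflow accordingly.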
template <class fpType>
fpType
fixDivDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2)
{
    fpType mid = fixDest(flush, defaultNan, val, op1, op2);
    const bool single = (sizeof(fpType) == sizeof(float));
    const fpType junk = 0.0;
    if ((single && (val == bitsToFp(0x00800000, junk) ||
                    val == bitsToFp(0x80800000, junk))) ||
        (!single && (val == bitsToFp(ULL(0x0010000000000000), junk) ||
                     val == bitsToFp(ULL(0x8010000000000000), junk)))
       ) {
        __asm__ __volatile__("" : "=m" (op1) : "m" (op1));
        fesetround(FeRoundZero);
        fpType temp = 0.0;
        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
        temp = op1 / op2;
        if (flushToZero(temp)) {
            feraiseexcept(FeUnderflow);
            if (flush) {
                feclearexcept(FeInexact);
                mid = temp;
            }
        }
        __asm__ __volatile__("" :: "m" (temp));
    }
    return mid;
}

template
float fixDivDest<float>(bool flush, bool defaultNan,
                        float val, float op1, float op2);
template
double fixDivDest<double>(bool flush, bool defaultNan,
                          double val, double op1, double op2);

// Narrow a double result to single precision. If the input is a NaN,
// construct a single-precision NaN carrying its payload so fixDest()
// can propagate it, then apply the same smallest-normal underflow
// correction as fixDivDest().
float
fixFpDFpSDest(FPSCR fpscr, double val)
{
    const float junk = 0.0;
    float op1 = 0.0;
    if (std::isnan(val)) {
        uint64_t valBits = fpToBits(val);
        uint32_t op1Bits = bits(valBits, 50, 29) |
                           (mask(9) << 22) |
                           (bits(valBits, 63) << 31);
        op1 = bitsToFp(op1Bits, junk);
    }
    float mid = fixDest(fpscr.fz, fpscr.dn, (float)val, op1);
    if (fpscr.fz && fetestexcept(FeUnderflow | FeInexact) ==
                    (FeUnderflow | FeInexact)) {
        feclearexcept(FeInexact);
    }
    if (mid == bitsToFp(0x00800000, junk) ||
        mid == bitsToFp(0x80800000, junk)) {
        __asm__ __volatile__("" : "=m" (val) : "m" (val));
        fesetround(FeRoundZero);
        float temp = 0.0;
        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
        temp = val;
        if (flushToZero(temp)) {
            feraiseexcept(FeUnderflow);
            if (fpscr.fz) {
                feclearexcept(FeInexact);
                mid = temp;
            }
        }
        __asm__ __volatile__("" :: "m" (temp));
    }
    return mid;
}

// Widen a single result to double precision, mirroring fixFpDFpSDest().
double
fixFpSFpDDest(FPSCR fpscr, float val)
{
    const double junk = 0.0;
    double op1 = 0.0;
    if (std::isnan(val)) {
        uint32_t valBits = fpToBits(val);
        uint64_t op1Bits = ((uint64_t)bits(valBits, 21, 0) << 29) |
                           (mask(12) << 51) |
                           ((uint64_t)bits(valBits, 31) << 63);
        op1 = bitsToFp(op1Bits, junk);
    }
    double mid = fixDest(fpscr.fz, fpscr.dn, (double)val, op1);
    if (mid == bitsToFp(ULL(0x0010000000000000), junk) ||
        mid == bitsToFp(ULL(0x8010000000000000), junk)) {
        __asm__ __volatile__("" : "=m" (val) : "m" (val));
        fesetround(FeRoundZero);
        double temp = 0.0;
        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
        temp = val;
        if (flushToZero(temp)) {
            feraiseexcept(FeUnderflow);
            if (fpscr.fz) {
                feclearexcept(FeInexact);
                mid = temp;
            }
        }
        __asm__ __volatile__("" :: "m" (temp));
    }
    return mid;
}
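
// Convert a single- or double-precision operand (passed as raw bits) to
// an ARM half-precision value, honoring the alternative half-precision
// (AHP) format, the default-NaN setting, and the rounding mode, and
// setting cumulative FPSCR exception bits as it goes. Denormal inputs
// are assumed to have been flushed already when flush-to-zero is on
// (note the assert below).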
static inline uint16_t
vcvtFpFpH(FPSCR &fpscr, bool flush, bool defaultNan,
          uint32_t rMode, bool ahp, uint64_t opBits, bool isDouble)
{
    uint32_t mWidth;
    uint32_t eWidth;
    uint32_t eHalfRange;
    uint32_t sBitPos;

    if (isDouble) {
        mWidth = 52;
        eWidth = 11;
    } else {
        mWidth = 23;
        eWidth = 8;
    }
    sBitPos = eWidth + mWidth;
    eHalfRange = (1 << (eWidth - 1)) - 1;

    // Extract the operand.
    bool neg = bits(opBits, sBitPos);
    uint32_t exponent = bits(opBits, sBitPos - 1, mWidth);
    uint64_t oldMantissa = bits(opBits, mWidth - 1, 0);
    uint32_t mantissa = oldMantissa >> (mWidth - 10);
    // Do the conversion.
    uint64_t extra = oldMantissa & mask(mWidth - 10);
    if (exponent == mask(eWidth)) {
        if (oldMantissa != 0) {
            // NaNs.
            if (bits(mantissa, 9) == 0) {
                // Signalling NaN.
                fpscr.ioc = 1;
            }
            if (ahp) {
                mantissa = 0;
                exponent = 0;
                fpscr.ioc = 1;
            } else if (defaultNan) {
                mantissa = (1 << 9);
                exponent = 0x1f;
                neg = false;
            } else {
                exponent = 0x1f;
                mantissa |= (1 << 9);
            }
        } else {
            // Infinities.
            exponent = 0x1f;
            if (ahp) {
                fpscr.ioc = 1;
                mantissa = 0x3ff;
            } else {
                mantissa = 0;
            }
        }
    } else if (exponent == 0 && oldMantissa == 0) {
        // Zero, don't need to do anything.
    } else {
        // Normalized or denormalized numbers.

        bool inexact = (extra != 0);

        if (exponent == 0) {
            // Denormalized.
            // If flush to zero is on, this shouldn't happen.
            assert(!flush);

            // Check for underflow.
            if (inexact || fpscr.ufe)
                fpscr.ufc = 1;

            // Handle rounding.
            unsigned mode = rMode;
            if ((mode == VfpRoundUpward && !neg && extra) ||
                (mode == VfpRoundDown && neg && extra) ||
                (mode == VfpRoundNearest &&
                 (extra > (1 << 9) ||
                  (extra == (1 << 9) && bits(mantissa, 0))))) {
                mantissa++;
            }

            // See if the number became normalized after rounding.
            if (mantissa == (1 << 10)) {
                mantissa = 0;
                exponent = 1;
            }
        } else {
            // Normalized.

            // We need to track the dropped bits differently since
            // more can be dropped by denormalizing.
            bool topOne = bits(extra, mWidth - 10 - 1);
            bool restZeros = bits(extra, mWidth - 10 - 2, 0) == 0;

            if (exponent <= (eHalfRange - 15)) {
                // The result is too small. Denormalize.
                mantissa |= (1 << 10);
                while (mantissa && exponent <= (eHalfRange - 15)) {
                    restZeros = restZeros && !topOne;
                    topOne = bits(mantissa, 0);
                    mantissa = mantissa >> 1;
                    exponent++;
                }
                if (topOne || !restZeros)
                    inexact = true;
                exponent = 0;
            } else {
                // Change bias.
                exponent -= (eHalfRange - 15);
            }

            if (exponent == 0 && (inexact || fpscr.ufe)) {
                // Underflow.
                fpscr.ufc = 1;
            }

            // Handle rounding.
            unsigned mode = rMode;
            bool nonZero = topOne || !restZeros;
            if ((mode == VfpRoundUpward && !neg && nonZero) ||
                (mode == VfpRoundDown && neg && nonZero) ||
                (mode == VfpRoundNearest && topOne &&
                 (!restZeros || bits(mantissa, 0)))) {
                mantissa++;
            }

            // See if we rounded up and need to bump the exponent.
            if (mantissa == (1 << 10)) {
                mantissa = 0;
                exponent++;
            }

            // Deal with overflow.
            if (ahp) {
                if (exponent >= 0x20) {
                    exponent = 0x1f;
                    mantissa = 0x3ff;
                    fpscr.ioc = 1;
                    // Suppress inexact exception.
                    inexact = false;
                }
            } else {
                if (exponent >= 0x1f) {
                    if ((mode == VfpRoundNearest) ||
                        (mode == VfpRoundUpward && !neg) ||
                        (mode == VfpRoundDown && neg)) {
                        // Overflow to infinity.
                        exponent = 0x1f;
                        mantissa = 0;
                    } else {
                        // Overflow to max normal.
                        exponent = 0x1e;
                        mantissa = 0x3ff;
                    }
                    fpscr.ofc = 1;
                    inexact = true;
                }
            }
        }

        if (inexact) {
            fpscr.ixc = 1;
        }
    }
    // Reassemble and install the result.
    uint32_t result = bits(mantissa, 9, 0);
    replaceBits(result, 14, 10, exponent);
    if (neg)
        result |= (1 << 15);
    return result;
}

uint16_t
vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan,
           uint32_t rMode, bool ahp, float op)
{
    uint64_t opBits = fpToBits(op);
    return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp, opBits, false);
}

uint16_t
vcvtFpDFpH(FPSCR &fpscr, bool flush, bool defaultNan,
           uint32_t rMode, bool ahp, double op)
{
    uint64_t opBits = fpToBits(op);
    return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp, opBits, true);
}
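
// The reverse direction: expand an ARM half-precision value to single-
// or double-precision bits, normalizing half-precision denormals and
// quieting signalling NaNs. Every half-precision value is exactly
// representable in the wider formats, so no rounding is needed.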
static inline uint64_t
vcvtFpHFp(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op, bool isDouble)
{
    uint32_t mWidth;
    uint32_t eWidth;
    uint32_t eHalfRange;
    uint32_t sBitPos;

    if (isDouble) {
        mWidth = 52;
        eWidth = 11;
    } else {
        mWidth = 23;
        eWidth = 8;
    }
    sBitPos = eWidth + mWidth;
    eHalfRange = (1 << (eWidth - 1)) - 1;

    // Extract the bitfields.
    bool neg = bits(op, 15);
    uint32_t exponent = bits(op, 14, 10);
    uint64_t mantissa = bits(op, 9, 0);
    // Do the conversion.
    if (exponent == 0) {
        if (mantissa != 0) {
            // Normalize the value.
            exponent = exponent + (eHalfRange - 15) + 1;
            while (mantissa < (1 << 10)) {
                mantissa = mantissa << 1;
                exponent--;
            }
        }
        mantissa = mantissa << (mWidth - 10);
    } else if (exponent == 0x1f && !ahp) {
        // Infinities and NaNs.
        exponent = mask(eWidth);
        if (mantissa != 0) {
            // NaNs.
            mantissa = mantissa << (mWidth - 10);
            if (bits(mantissa, mWidth - 1) == 0) {
                // Signalling NaN.
                fpscr.ioc = 1;
                mantissa |= (((uint64_t)1) << (mWidth - 1));
            }
            if (defaultNan) {
                mantissa &= ~mask(mWidth - 1);
                neg = false;
            }
        }
    } else {
        exponent = exponent + (eHalfRange - 15);
        mantissa = mantissa << (mWidth - 10);
    }
    // Reassemble the result.
    uint64_t result = bits(mantissa, mWidth - 1, 0);
    replaceBits(result, sBitPos - 1, mWidth, exponent);
    if (neg) {
        result |= (((uint64_t)1) << sBitPos);
    }
    return result;
}

double
vcvtFpHFpD(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op)
{
    double junk = 0.0;
    uint64_t result;

    result = vcvtFpHFp(fpscr, defaultNan, ahp, op, true);
    return bitsToFp(result, junk);
}

float
vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op)
{
    float junk = 0.0;
    uint64_t result;

    result = vcvtFpHFp(fpscr, defaultNan, ahp, op, false);
    return bitsToFp(result, junk);
}
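
// Fixed-point to floating-point conversions: interpret val as a 16-,
// 32-, or 64-bit fixed-point number with imm fraction bits and divide
// by 2^imm under round-to-nearest, letting fixDivDest() patch up NaN,
// flush-to-zero, and underflow corner cases.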
float
vfpUFixedToFpS(bool flush, bool defaultNan,
               uint64_t val, uint8_t width, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (width == 16)
        val = (uint16_t)val;
    else if (width == 32)
        val = (uint32_t)val;
    else if (width != 64)
        panic("Unsupported width %d", width);
    float scale = powf(2.0, imm);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (float)val, scale);
}

float
vfpSFixedToFpS(bool flush, bool defaultNan,
               int64_t val, uint8_t width, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (width == 16)
        val = sext<16>(val & mask(16));
    else if (width == 32)
        val = sext<32>(val & mask(32));
    else if (width != 64)
        panic("Unsupported width %d", width);

    float scale = powf(2.0, imm);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (float)val, scale);
}

double
vfpUFixedToFpD(bool flush, bool defaultNan,
               uint64_t val, uint8_t width, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (width == 16)
        val = (uint16_t)val;
    else if (width == 32)
        val = (uint32_t)val;
    else if (width != 64)
        panic("Unsupported width %d", width);

    double scale = pow(2.0, imm);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (double)val, scale);
}

double
vfpSFixedToFpD(bool flush, bool defaultNan,
               int64_t val, uint8_t width, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (width == 16)
        val = sext<16>(val & mask(16));
    else if (width == 32)
        val = sext<32>(val & mask(32));
    else if (width != 64)
        panic("Unsupported width %d", width);

    double scale = pow(2.0, imm);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (double)val, scale);
}

// This function implements a magic formula taken from the architecture
// reference manual. It was originally called recip_sqrt_estimate.
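// For example, recipSqrtEstimate(0.5) computes q1 = 128,
// r = 1/sqrt(128.5/256) ~= 1.41146, and returns 361/256 = 1.41015625,
// an estimate of 1/sqrt(0.5) ~= 1.41421 with 8 fractional bits of
// precision.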
static double
recipSqrtEstimate(double a)
{
    int64_t q0, q1, s;
    double r;
    if (a < 0.5) {
        q0 = (int64_t)(a * 512.0);
        r = 1.0 / sqrt(((double)q0 + 0.5) / 512.0);
    } else {
        q1 = (int64_t)(a * 256.0);
        r = 1.0 / sqrt(((double)q1 + 0.5) / 256.0);
    }
    s = (int64_t)(256.0 * r + 0.5);
    return (double)s / 256.0;
}

// This function is only intended for use in Neon instructions because
// it ignores certain bits in the FPSCR.
float
fprSqrtEstimate(FPSCR &fpscr, float op)
{
    const uint32_t qnan = 0x7fc00000;
    float junk = 0.0;
    int fpClass = std::fpclassify(op);
    if (fpClass == FP_NAN) {
        if ((fpToBits(op) & qnan) != qnan)
            fpscr.ioc = 1;
        return bitsToFp(qnan, junk);
    } else if (fpClass == FP_ZERO) {
        fpscr.dzc = 1;
        // Return infinity with the same sign as the operand.
        return bitsToFp((std::signbit(op) << 31) |
                        (0xFF << 23) | (0 << 0), junk);
    } else if (std::signbit(op)) {
        // Set invalid op bit.
        fpscr.ioc = 1;
        return bitsToFp(qnan, junk);
    } else if (fpClass == FP_INFINITE) {
        return 0.0;
    } else {
        uint64_t opBits = fpToBits(op);
        double scaled;
        if (bits(opBits, 23)) {
            scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
                              (ULL(0x3fd) << 52) | (bits(opBits, 31) << 63),
                              (double)0.0);
        } else {
            scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
                              (ULL(0x3fe) << 52) | (bits(opBits, 31) << 63),
                              (double)0.0);
        }
        uint64_t resultExp = (380 - bits(opBits, 30, 23)) / 2;

        uint64_t estimate = fpToBits(recipSqrtEstimate(scaled));

        return bitsToFp((bits(estimate, 63) << 31) |
                        (bits(resultExp, 7, 0) << 23) |
                        (bits(estimate, 51, 29) << 0), junk);
    }
}

uint32_t
unsignedRSqrtEstimate(uint32_t op)
{
    if (bits(op, 31, 30) == 0) {
        return -1;
    } else {
        double dpOp;
        if (bits(op, 31)) {
            dpOp = bitsToFp((ULL(0) << 63) |
                            (ULL(0x3fe) << 52) |
                            (bits((uint64_t)op, 30, 0) << 21) |
                            (0 << 0), (double)0.0);
        } else {
            dpOp = bitsToFp((ULL(0) << 63) |
                            (ULL(0x3fd) << 52) |
                            (bits((uint64_t)op, 29, 0) << 22) |
                            (0 << 0), (double)0.0);
        }
        uint64_t estimate = fpToBits(recipSqrtEstimate(dpOp));
        return (1 << 31) | bits(estimate, 51, 21);
    }
}

// This function implements a magic formula taken from the architecture
// reference manual. It was originally called recip_estimate.
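// For example, recipEstimate(0.5) computes q = 256, r = 512/256.5
// ~= 1.99610, and returns 511/256 = 1.99609375, approximating
// 1/0.5 = 2 with 8 fractional bits of precision.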
static double
recipEstimate(double a)
{
    int64_t q, s;
    double r;
    q = (int64_t)(a * 512.0);
    r = 1.0 / (((double)q + 0.5) / 512.0);
    s = (int64_t)(256.0 * r + 0.5);
    return (double)s / 256.0;
}

// This function is only intended for use in Neon instructions because
// it ignores certain bits in the FPSCR.
float
fpRecipEstimate(FPSCR &fpscr, float op)
{
    const uint32_t qnan = 0x7fc00000;
    float junk = 0.0;
    int fpClass = std::fpclassify(op);
    if (fpClass == FP_NAN) {
        if ((fpToBits(op) & qnan) != qnan)
            fpscr.ioc = 1;
        return bitsToFp(qnan, junk);
    } else if (fpClass == FP_INFINITE) {
        return bitsToFp(std::signbit(op) << 31, junk);
    } else if (fpClass == FP_ZERO) {
        fpscr.dzc = 1;
        // Return infinity with the same sign as the operand.
        return bitsToFp((std::signbit(op) << 31) |
                        (0xFF << 23) | (0 << 0), junk);
    } else if (fabs(op) >= pow(2.0, 126)) {
        fpscr.ufc = 1;
        return bitsToFp(std::signbit(op) << 31, junk);
    } else {
        uint64_t opBits = fpToBits(op);
        double scaled;
        scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
                          (ULL(0x3fe) << 52) | (ULL(0) << 63),
                          (double)0.0);
        uint64_t resultExp = 253 - bits(opBits, 30, 23);

        uint64_t estimate = fpToBits(recipEstimate(scaled));

        return bitsToFp((bits(opBits, 31) << 31) |
                        (bits(resultExp, 7, 0) << 23) |
                        (bits(estimate, 51, 29) << 0), junk);
    }
}

uint32_t
unsignedRecipEstimate(uint32_t op)
{
    if (bits(op, 31) == 0) {
        return -1;
    } else {
        double dpOp;
        dpOp = bitsToFp((ULL(0) << 63) |
                        (ULL(0x3fe) << 52) |
                        (bits((uint64_t)op, 30, 0) << 21) |
                        (0 << 0), (double)0.0);
        uint64_t estimate = fpToBits(recipEstimate(dpOp));
        return (1 << 31) | bits(estimate, 51, 21);
    }
}

FPSCR
fpStandardFPSCRValue(const FPSCR &fpscr)
{
    FPSCR new_fpscr(0);
    new_fpscr.ahp = fpscr.ahp;
    new_fpscr.dn = 1;
    new_fpscr.fz = 1;
    new_fpscr.fz16 = fpscr.fz16;
    return new_fpscr;
}

// Check the operands for NaNs and, if any are present, compute the ARM
// result (the default NaN or the propagated operand) and set done;
// otherwise clear done so the caller performs the real operation.
template <class fpType>
fpType
FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
                  fpType op1, fpType op2) const
{
    done = true;
    fpType junk = 0.0;
    fpType dest = 0.0;
    const bool single = (sizeof(fpType) == sizeof(float));
    const uint64_t qnan =
        single ? 0x7fc00000 : ULL(0x7ff8000000000000);
    const bool nan1 = std::isnan(op1);
    const bool nan2 = std::isnan(op2);
    const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
    const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
    if (nan1 || nan2) {
        if (defaultNan) {
            dest = bitsToFp(qnan, junk);
        } else if (signal1) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            dest = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (nan1) {
            dest = op1;
        } else if (nan2) {
            dest = op2;
        }
        if (signal1 || signal2) {
            fpscr.ioc = 1;
        }
    } else {
        done = false;
    }
    return dest;
}

template
float FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
                        float op1, float op2) const;
template
double FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
                         double op1, double op2) const;
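
// ternaryOp, binaryOp, and unaryOp below share one shape: flush denormal
// inputs if requested, run the host operation between compiler barriers
// under the ARM rounding mode, then correct the cases where host (x86)
// and ARM semantics differ: NaN propagation, flush-to-zero, and underflow
// being detected before rounding on ARM but after rounding on x86.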

// @TODO remove this function when we've finished switching all FMA code
// to use the new FPLIB.
template <class fpType>
fpType
FpOp::ternaryOp(FPSCR &fpscr, fpType op1, fpType op2, fpType op3,
                fpType (*func)(fpType, fpType, fpType),
                bool flush, bool defaultNan, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    if (flush && (flushToZero(op1, op2) || flushToZero(op3)))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3),
                               "=m" (state)
                             : "m" (op1), "m" (op2), "m" (op3), "m" (state));
    fpType dest = func(op1, op2, op3);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

    int fpClass = std::fpclassify(dest);
    // Get NaN behavior right. This varies between x86 and ARM.
    if (fpClass == FP_NAN) {
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan1 = std::isnan(op1);
        const bool nan2 = std::isnan(op2);
        const bool nan3 = std::isnan(op3);
        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
        const bool signal3 = nan3 && ((fpToBits(op3) & qnan) != qnan);
        if ((!nan1 && !nan2 && !nan3) || defaultNan) {
            dest = bitsToFp(qnan, junk);
        } else if (signal1) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            dest = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (signal3) {
            dest = bitsToFp(fpToBits(op3) | qnan, junk);
        } else if (nan1) {
            dest = op1;
        } else if (nan2) {
            dest = op2;
        } else if (nan3) {
            dest = op3;
        }
    } else if (flush && flushToZero(dest)) {
        feraiseexcept(FeUnderflow);
    } else if ((
                (single && (dest == bitsToFp(0x00800000, junk) ||
                            dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                 (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                  dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before-
         * rounding in ARM and -after- rounding in x86.
         */
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3)
                                 : "m" (op1), "m" (op2), "m" (op3));
        fpType temp = func(op1, op2, op3);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    finishVfp(fpscr, state, flush);
    return dest;
}

template
float FpOp::ternaryOp(FPSCR &fpscr, float op1, float op2, float op3,
                      float (*func)(float, float, float),
                      bool flush, bool defaultNan, uint32_t rMode) const;
template
double FpOp::ternaryOp(FPSCR &fpscr, double op1, double op2, double op3,
                       double (*func)(double, double, double),
                       bool flush, bool defaultNan, uint32_t rMode) const;

template <class fpType>
fpType
FpOp::binaryOp(FPSCR &fpscr, fpType op1, fpType op2,
               fpType (*func)(fpType, fpType),
               bool flush, bool defaultNan, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    if (flush && flushToZero(op1, op2))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (state)
                             : "m" (op1), "m" (op2), "m" (state));
    fpType dest = func(op1, op2);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

    // Get NaN behavior right. This varies between x86 and ARM.
    if (std::isnan(dest)) {
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan1 = std::isnan(op1);
        const bool nan2 = std::isnan(op2);
        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
        if ((!nan1 && !nan2) || defaultNan) {
            dest = bitsToFp(qnan, junk);
        } else if (signal1) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            dest = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (nan1) {
            dest = op1;
        } else if (nan2) {
            dest = op2;
        }
    } else if (flush && flushToZero(dest)) {
        feraiseexcept(FeUnderflow);
    } else if ((
                (single && (dest == bitsToFp(0x00800000, junk) ||
                            dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                 (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                  dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before-
         * rounding in ARM and -after- rounding in x86.
         */
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2)
                                 : "m" (op1), "m" (op2));
        fpType temp = func(op1, op2);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    finishVfp(fpscr, state, flush);
    return dest;
}

template
float FpOp::binaryOp(FPSCR &fpscr, float op1, float op2,
                     float (*func)(float, float),
                     bool flush, bool defaultNan, uint32_t rMode) const;
template
double FpOp::binaryOp(FPSCR &fpscr, double op1, double op2,
                      double (*func)(double, double),
                      bool flush, bool defaultNan, uint32_t rMode) const;

template <class fpType>
fpType
FpOp::unaryOp(FPSCR &fpscr, fpType op1, fpType (*func)(fpType),
              bool flush, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    if (flush && flushToZero(op1))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (state)
                             : "m" (op1), "m" (state));
    fpType dest = func(op1);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

    // Get NaN behavior right. This varies between x86 and ARM.
    if (std::isnan(dest)) {
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan = std::isnan(op1);
        if (!nan || fpscr.dn == 1) {
            dest = bitsToFp(qnan, junk);
        } else if (nan) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        }
    } else if (flush && flushToZero(dest)) {
        feraiseexcept(FeUnderflow);
    } else if ((
                (single && (dest == bitsToFp(0x00800000, junk) ||
                            dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                 (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                  dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before-
         * rounding in ARM and -after- rounding in x86.
         */
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1) : "m" (op1));
        fpType temp = func(op1);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    finishVfp(fpscr, state, flush);
    return dest;
}

template
float FpOp::unaryOp(FPSCR &fpscr, float op1, float (*func)(float),
                    bool flush, uint32_t rMode) const;
template
double FpOp::unaryOp(FPSCR &fpscr, double op1, double (*func)(double),
                     bool flush, uint32_t rMode) const;
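
// Advance a VFP register index by the short-vector stride (doubled for
// double-precision operands, which occupy register pairs), wrapping
// within the index's eight-register bank as the FPSCR LEN/STRIDE vector
// semantics require.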
IntRegIndex
VfpMacroOp::addStride(IntRegIndex idx, unsigned stride)
{
    if (wide) {
        stride *= 2;
    }
    unsigned offset = idx % 8;
    idx = (IntRegIndex)(idx - offset);
    offset += stride;
    idx = (IntRegIndex)(idx + (offset % 8));
    return idx;
}

void
VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1, IntRegIndex &op2)
{
    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
    assert(!inScalarBank(dest));
    dest = addStride(dest, stride);
    op1 = addStride(op1, stride);
    if (!inScalarBank(op2)) {
        op2 = addStride(op2, stride);
    }
}

void
VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1)
{
    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
    assert(!inScalarBank(dest));
    dest = addStride(dest, stride);
    if (!inScalarBank(op1)) {
        op1 = addStride(op1, stride);
    }
}

void
VfpMacroOp::nextIdxs(IntRegIndex &dest)
{
    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
    assert(!inScalarBank(dest));
    dest = addStride(dest, stride);
}

}