// vfp.cc, revision 9918
1/* 2 * Copyright (c) 2010 ARM Limited 3 * All rights reserved 4 * 5 * The license below extends only to copyright in the software and shall 6 * not be construed as granting a license to any other intellectual 7 * property including but not limited to intellectual property relating 8 * to a hardware implementation of the functionality of the software 9 * licensed hereunder. You may use the software subject to the license 10 * terms below provided that you ensure that this notice is replicated 11 * unmodified and in its entirety in all distributions of the software, 12 * modified or unmodified, in source code or in binary form. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions are 16 * met: redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer; 18 * redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution; 21 * neither the name of the copyright holders nor the names of its 22 * contributors may be used to endorse or promote products derived from 23 * this software without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 28 * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 29 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 30 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 31 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 32 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 33 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 34 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 35 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 36 * 37 * Authors: Gabe Black 38 */ 39 40#include "arch/arm/insts/vfp.hh" 41 42/* 43 * The asm statements below are to keep gcc from reordering code. Otherwise 44 * the rounding mode might be set after the operation it was intended for, the 45 * exception bits read before it, etc. 46 */ 47 48std::string 49FpRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const 50{ 51 std::stringstream ss; 52 printMnemonic(ss); 53 printReg(ss, dest + FP_Reg_Base); 54 ss << ", "; 55 printReg(ss, op1 + FP_Reg_Base); 56 return ss.str(); 57} 58 59std::string 60FpRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const 61{ 62 std::stringstream ss; 63 printMnemonic(ss); 64 printReg(ss, dest + FP_Reg_Base); 65 ccprintf(ss, ", #%d", imm); 66 return ss.str(); 67} 68 69std::string 70FpRegRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const 71{ 72 std::stringstream ss; 73 printMnemonic(ss); 74 printReg(ss, dest + FP_Reg_Base); 75 ss << ", "; 76 printReg(ss, op1 + FP_Reg_Base); 77 ccprintf(ss, ", #%d", imm); 78 return ss.str(); 79} 80 81std::string 82FpRegRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const 83{ 84 std::stringstream ss; 85 printMnemonic(ss); 86 printReg(ss, dest + FP_Reg_Base); 87 ss << ", "; 88 printReg(ss, op1 + FP_Reg_Base); 89 ss << ", "; 90 printReg(ss, op2 + FP_Reg_Base); 91 return ss.str(); 92} 93 94std::string 95FpRegRegRegImmOp::generateDisassembly(Addr pc, 
const SymbolTable *symtab) const 96{ 97 std::stringstream ss; 98 printMnemonic(ss); 99 printReg(ss, dest + FP_Reg_Base); 100 ss << ", "; 101 printReg(ss, op1 + FP_Reg_Base); 102 ss << ", "; 103 printReg(ss, op2 + FP_Reg_Base); 104 ccprintf(ss, ", #%d", imm); 105 return ss.str(); 106} 107 108namespace ArmISA 109{ 110 111VfpSavedState 112prepFpState(uint32_t rMode) 113{ 114 int roundingMode = fegetround(); 115 feclearexcept(FeAllExceptions); 116 switch (rMode) { 117 case VfpRoundNearest: 118 fesetround(FeRoundNearest); 119 break; 120 case VfpRoundUpward: 121 fesetround(FeRoundUpward); 122 break; 123 case VfpRoundDown: 124 fesetround(FeRoundDown); 125 break; 126 case VfpRoundZero: 127 fesetround(FeRoundZero); 128 break; 129 } 130 return roundingMode; 131} 132 133void 134finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush) 135{ 136 int exceptions = fetestexcept(FeAllExceptions); 137 bool underflow = false; 138 if (exceptions & FeInvalid) { 139 fpscr.ioc = 1; 140 } 141 if (exceptions & FeDivByZero) { 142 fpscr.dzc = 1; 143 } 144 if (exceptions & FeOverflow) { 145 fpscr.ofc = 1; 146 } 147 if (exceptions & FeUnderflow) { 148 underflow = true; 149 fpscr.ufc = 1; 150 } 151 if ((exceptions & FeInexact) && !(underflow && flush)) { 152 fpscr.ixc = 1; 153 } 154 fesetround(state); 155} 156 157template <class fpType> 158fpType 159fixDest(bool flush, bool defaultNan, fpType val, fpType op1) 160{ 161 int fpClass = std::fpclassify(val); 162 fpType junk = 0.0; 163 if (fpClass == FP_NAN) { 164 const bool single = (sizeof(val) == sizeof(float)); 165 const uint64_t qnan = single ? 
0x7fc00000 : ULL(0x7ff8000000000000); 166 const bool nan = std::isnan(op1); 167 if (!nan || defaultNan) { 168 val = bitsToFp(qnan, junk); 169 } else if (nan) { 170 val = bitsToFp(fpToBits(op1) | qnan, junk); 171 } 172 } else if (fpClass == FP_SUBNORMAL && flush == 1) { 173 // Turn val into a zero with the correct sign; 174 uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1); 175 val = bitsToFp(fpToBits(val) & bitMask, junk); 176 feclearexcept(FeInexact); 177 feraiseexcept(FeUnderflow); 178 } 179 return val; 180} 181 182template 183float fixDest<float>(bool flush, bool defaultNan, float val, float op1); 184template 185double fixDest<double>(bool flush, bool defaultNan, double val, double op1); 186 187template <class fpType> 188fpType 189fixDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2) 190{ 191 int fpClass = std::fpclassify(val); 192 fpType junk = 0.0; 193 if (fpClass == FP_NAN) { 194 const bool single = (sizeof(val) == sizeof(float)); 195 const uint64_t qnan = single ? 
0x7fc00000 : ULL(0x7ff8000000000000); 196 const bool nan1 = std::isnan(op1); 197 const bool nan2 = std::isnan(op2); 198 const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan); 199 const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan); 200 if ((!nan1 && !nan2) || defaultNan) { 201 val = bitsToFp(qnan, junk); 202 } else if (signal1) { 203 val = bitsToFp(fpToBits(op1) | qnan, junk); 204 } else if (signal2) { 205 val = bitsToFp(fpToBits(op2) | qnan, junk); 206 } else if (nan1) { 207 val = op1; 208 } else if (nan2) { 209 val = op2; 210 } 211 } else if (fpClass == FP_SUBNORMAL && flush) { 212 // Turn val into a zero with the correct sign; 213 uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1); 214 val = bitsToFp(fpToBits(val) & bitMask, junk); 215 feclearexcept(FeInexact); 216 feraiseexcept(FeUnderflow); 217 } 218 return val; 219} 220 221template 222float fixDest<float>(bool flush, bool defaultNan, 223 float val, float op1, float op2); 224template 225double fixDest<double>(bool flush, bool defaultNan, 226 double val, double op1, double op2); 227 228template <class fpType> 229fpType 230fixDivDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2) 231{ 232 fpType mid = fixDest(flush, defaultNan, val, op1, op2); 233 const bool single = (sizeof(fpType) == sizeof(float)); 234 const fpType junk = 0.0; 235 if ((single && (val == bitsToFp(0x00800000, junk) || 236 val == bitsToFp(0x80800000, junk))) || 237 (!single && (val == bitsToFp(ULL(0x0010000000000000), junk) || 238 val == bitsToFp(ULL(0x8010000000000000), junk))) 239 ) { 240 __asm__ __volatile__("" : "=m" (op1) : "m" (op1)); 241 fesetround(FeRoundZero); 242 fpType temp = 0.0; 243 __asm__ __volatile__("" : "=m" (temp) : "m" (temp)); 244 temp = op1 / op2; 245 if (flushToZero(temp)) { 246 feraiseexcept(FeUnderflow); 247 if (flush) { 248 feclearexcept(FeInexact); 249 mid = temp; 250 } 251 } 252 __asm__ __volatile__("" :: "m" (temp)); 253 } 254 return mid; 255} 256 257template 258float 
fixDivDest<float>(bool flush, bool defaultNan, 259 float val, float op1, float op2); 260template 261double fixDivDest<double>(bool flush, bool defaultNan, 262 double val, double op1, double op2); 263 264float 265fixFpDFpSDest(FPSCR fpscr, double val) 266{ 267 const float junk = 0.0; 268 float op1 = 0.0; 269 if (std::isnan(val)) { 270 uint64_t valBits = fpToBits(val); 271 uint32_t op1Bits = bits(valBits, 50, 29) | 272 (mask(9) << 22) | 273 (bits(valBits, 63) << 31); 274 op1 = bitsToFp(op1Bits, junk); 275 } 276 float mid = fixDest(fpscr.fz, fpscr.dn, (float)val, op1); 277 if (fpscr.fz && fetestexcept(FeUnderflow | FeInexact) == 278 (FeUnderflow | FeInexact)) { 279 feclearexcept(FeInexact); 280 } 281 if (mid == bitsToFp(0x00800000, junk) || 282 mid == bitsToFp(0x80800000, junk)) { 283 __asm__ __volatile__("" : "=m" (val) : "m" (val)); 284 fesetround(FeRoundZero); 285 float temp = 0.0; 286 __asm__ __volatile__("" : "=m" (temp) : "m" (temp)); 287 temp = val; 288 if (flushToZero(temp)) { 289 feraiseexcept(FeUnderflow); 290 if (fpscr.fz) { 291 feclearexcept(FeInexact); 292 mid = temp; 293 } 294 } 295 __asm__ __volatile__("" :: "m" (temp)); 296 } 297 return mid; 298} 299 300double 301fixFpSFpDDest(FPSCR fpscr, float val) 302{ 303 const double junk = 0.0; 304 double op1 = 0.0; 305 if (std::isnan(val)) { 306 uint32_t valBits = fpToBits(val); 307 uint64_t op1Bits = ((uint64_t)bits(valBits, 21, 0) << 29) | 308 (mask(12) << 51) | 309 ((uint64_t)bits(valBits, 31) << 63); 310 op1 = bitsToFp(op1Bits, junk); 311 } 312 double mid = fixDest(fpscr.fz, fpscr.dn, (double)val, op1); 313 if (mid == bitsToFp(ULL(0x0010000000000000), junk) || 314 mid == bitsToFp(ULL(0x8010000000000000), junk)) { 315 __asm__ __volatile__("" : "=m" (val) : "m" (val)); 316 fesetround(FeRoundZero); 317 double temp = 0.0; 318 __asm__ __volatile__("" : "=m" (temp) : "m" (temp)); 319 temp = val; 320 if (flushToZero(temp)) { 321 feraiseexcept(FeUnderflow); 322 if (fpscr.fz) { 323 feclearexcept(FeInexact); 324 mid = 
temp; 325 } 326 } 327 __asm__ __volatile__("" :: "m" (temp)); 328 } 329 return mid; 330} 331 332uint16_t 333vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan, 334 uint32_t rMode, bool ahp, float op) 335{ 336 uint32_t opBits = fpToBits(op); 337 // Extract the operand. 338 bool neg = bits(opBits, 31); 339 uint32_t exponent = bits(opBits, 30, 23); 340 uint32_t oldMantissa = bits(opBits, 22, 0); 341 uint32_t mantissa = oldMantissa >> (23 - 10); 342 // Do the conversion. 343 uint32_t extra = oldMantissa & mask(23 - 10); 344 if (exponent == 0xff) { 345 if (oldMantissa != 0) { 346 // Nans. 347 if (bits(mantissa, 9) == 0) { 348 // Signalling nan. 349 fpscr.ioc = 1; 350 } 351 if (ahp) { 352 mantissa = 0; 353 exponent = 0; 354 fpscr.ioc = 1; 355 } else if (defaultNan) { 356 mantissa = (1 << 9); 357 exponent = 0x1f; 358 neg = false; 359 } else { 360 exponent = 0x1f; 361 mantissa |= (1 << 9); 362 } 363 } else { 364 // Infinities. 365 exponent = 0x1F; 366 if (ahp) { 367 fpscr.ioc = 1; 368 mantissa = 0x3ff; 369 } else { 370 mantissa = 0; 371 } 372 } 373 } else if (exponent == 0 && oldMantissa == 0) { 374 // Zero, don't need to do anything. 375 } else { 376 // Normalized or denormalized numbers. 377 378 bool inexact = (extra != 0); 379 380 if (exponent == 0) { 381 // Denormalized. 382 383 // If flush to zero is on, this shouldn't happen. 384 assert(!flush); 385 386 // Check for underflow 387 if (inexact || fpscr.ufe) 388 fpscr.ufc = 1; 389 390 // Handle rounding. 391 unsigned mode = rMode; 392 if ((mode == VfpRoundUpward && !neg && extra) || 393 (mode == VfpRoundDown && neg && extra) || 394 (mode == VfpRoundNearest && 395 (extra > (1 << 9) || 396 (extra == (1 << 9) && bits(mantissa, 0))))) { 397 mantissa++; 398 } 399 400 // See if the number became normalized after rounding. 401 if (mantissa == (1 << 10)) { 402 mantissa = 0; 403 exponent = 1; 404 } 405 } else { 406 // Normalized. 
407 408 // We need to track the dropped bits differently since 409 // more can be dropped by denormalizing. 410 bool topOne = bits(extra, 12); 411 bool restZeros = bits(extra, 11, 0) == 0; 412 413 if (exponent <= (127 - 15)) { 414 // The result is too small. Denormalize. 415 mantissa |= (1 << 10); 416 while (mantissa && exponent <= (127 - 15)) { 417 restZeros = restZeros && !topOne; 418 topOne = bits(mantissa, 0); 419 mantissa = mantissa >> 1; 420 exponent++; 421 } 422 if (topOne || !restZeros) 423 inexact = true; 424 exponent = 0; 425 } else { 426 // Change bias. 427 exponent -= (127 - 15); 428 } 429 430 if (exponent == 0 && (inexact || fpscr.ufe)) { 431 // Underflow 432 fpscr.ufc = 1; 433 } 434 435 // Handle rounding. 436 unsigned mode = rMode; 437 bool nonZero = topOne || !restZeros; 438 if ((mode == VfpRoundUpward && !neg && nonZero) || 439 (mode == VfpRoundDown && neg && nonZero) || 440 (mode == VfpRoundNearest && topOne && 441 (!restZeros || bits(mantissa, 0)))) { 442 mantissa++; 443 } 444 445 // See if we rounded up and need to bump the exponent. 446 if (mantissa == (1 << 10)) { 447 mantissa = 0; 448 exponent++; 449 } 450 451 // Deal with overflow 452 if (ahp) { 453 if (exponent >= 0x20) { 454 exponent = 0x1f; 455 mantissa = 0x3ff; 456 fpscr.ioc = 1; 457 // Supress inexact exception. 458 inexact = false; 459 } 460 } else { 461 if (exponent >= 0x1f) { 462 if ((mode == VfpRoundNearest) || 463 (mode == VfpRoundUpward && !neg) || 464 (mode == VfpRoundDown && neg)) { 465 // Overflow to infinity. 466 exponent = 0x1f; 467 mantissa = 0; 468 } else { 469 // Overflow to max normal. 470 exponent = 0x1e; 471 mantissa = 0x3ff; 472 } 473 fpscr.ofc = 1; 474 inexact = true; 475 } 476 } 477 } 478 479 if (inexact) { 480 fpscr.ixc = 1; 481 } 482 } 483 // Reassemble and install the result. 
484 uint32_t result = bits(mantissa, 9, 0); 485 replaceBits(result, 14, 10, exponent); 486 if (neg) 487 result |= (1 << 15); 488 return result; 489} 490 491float 492vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op) 493{ 494 float junk = 0.0; 495 // Extract the bitfields. 496 bool neg = bits(op, 15); 497 uint32_t exponent = bits(op, 14, 10); 498 uint32_t mantissa = bits(op, 9, 0); 499 // Do the conversion. 500 if (exponent == 0) { 501 if (mantissa != 0) { 502 // Normalize the value. 503 exponent = exponent + (127 - 15) + 1; 504 while (mantissa < (1 << 10)) { 505 mantissa = mantissa << 1; 506 exponent--; 507 } 508 } 509 mantissa = mantissa << (23 - 10); 510 } else if (exponent == 0x1f && !ahp) { 511 // Infinities and nans. 512 exponent = 0xff; 513 if (mantissa != 0) { 514 // Nans. 515 mantissa = mantissa << (23 - 10); 516 if (bits(mantissa, 22) == 0) { 517 // Signalling nan. 518 fpscr.ioc = 1; 519 mantissa |= (1 << 22); 520 } 521 if (defaultNan) { 522 mantissa &= ~mask(22); 523 neg = false; 524 } 525 } 526 } else { 527 exponent = exponent + (127 - 15); 528 mantissa = mantissa << (23 - 10); 529 } 530 // Reassemble the result. 531 uint32_t result = bits(mantissa, 22, 0); 532 replaceBits(result, 30, 23, exponent); 533 if (neg) 534 result |= (1 << 31); 535 return bitsToFp(result, junk); 536} 537 538uint64_t 539vfpFpSToFixed(float val, bool isSigned, bool half, 540 uint8_t imm, bool rzero) 541{ 542 int rmode = rzero ? 
FeRoundZero : fegetround(); 543 __asm__ __volatile__("" : "=m" (rmode) : "m" (rmode)); 544 fesetround(FeRoundNearest); 545 val = val * powf(2.0, imm); 546 __asm__ __volatile__("" : "=m" (val) : "m" (val)); 547 fesetround(rmode); 548 feclearexcept(FeAllExceptions); 549 __asm__ __volatile__("" : "=m" (val) : "m" (val)); 550 float origVal = val; 551 val = rintf(val); 552 int fpType = std::fpclassify(val); 553 if (fpType == FP_SUBNORMAL || fpType == FP_NAN) { 554 if (fpType == FP_NAN) { 555 feraiseexcept(FeInvalid); 556 } 557 val = 0.0; 558 } else if (origVal != val) { 559 switch (rmode) { 560 case FeRoundNearest: 561 if (origVal - val > 0.5) 562 val += 1.0; 563 else if (val - origVal > 0.5) 564 val -= 1.0; 565 break; 566 case FeRoundDown: 567 if (origVal < val) 568 val -= 1.0; 569 break; 570 case FeRoundUpward: 571 if (origVal > val) 572 val += 1.0; 573 break; 574 } 575 feraiseexcept(FeInexact); 576 } 577 578 if (isSigned) { 579 if (half) { 580 if ((double)val < (int16_t)(1 << 15)) { 581 feraiseexcept(FeInvalid); 582 feclearexcept(FeInexact); 583 return (int16_t)(1 << 15); 584 } 585 if ((double)val > (int16_t)mask(15)) { 586 feraiseexcept(FeInvalid); 587 feclearexcept(FeInexact); 588 return (int16_t)mask(15); 589 } 590 return (int16_t)val; 591 } else { 592 if ((double)val < (int32_t)(1 << 31)) { 593 feraiseexcept(FeInvalid); 594 feclearexcept(FeInexact); 595 return (int32_t)(1 << 31); 596 } 597 if ((double)val > (int32_t)mask(31)) { 598 feraiseexcept(FeInvalid); 599 feclearexcept(FeInexact); 600 return (int32_t)mask(31); 601 } 602 return (int32_t)val; 603 } 604 } else { 605 if (half) { 606 if ((double)val < 0) { 607 feraiseexcept(FeInvalid); 608 feclearexcept(FeInexact); 609 return 0; 610 } 611 if ((double)val > (mask(16))) { 612 feraiseexcept(FeInvalid); 613 feclearexcept(FeInexact); 614 return mask(16); 615 } 616 return (uint16_t)val; 617 } else { 618 if ((double)val < 0) { 619 feraiseexcept(FeInvalid); 620 feclearexcept(FeInexact); 621 return 0; 622 } 623 if 
((double)val > (mask(32))) { 624 feraiseexcept(FeInvalid); 625 feclearexcept(FeInexact); 626 return mask(32); 627 } 628 return (uint32_t)val; 629 } 630 } 631} 632 633float 634vfpUFixedToFpS(bool flush, bool defaultNan, 635 uint32_t val, bool half, uint8_t imm) 636{ 637 fesetround(FeRoundNearest); 638 if (half) 639 val = (uint16_t)val; 640 float scale = powf(2.0, imm); 641 __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); 642 feclearexcept(FeAllExceptions); 643 __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); 644 return fixDivDest(flush, defaultNan, val / scale, (float)val, scale); 645} 646 647float 648vfpSFixedToFpS(bool flush, bool defaultNan, 649 int32_t val, bool half, uint8_t imm) 650{ 651 fesetround(FeRoundNearest); 652 if (half) 653 val = sext<16>(val & mask(16)); 654 float scale = powf(2.0, imm); 655 __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); 656 feclearexcept(FeAllExceptions); 657 __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); 658 return fixDivDest(flush, defaultNan, val / scale, (float)val, scale); 659} 660 661uint64_t 662vfpFpDToFixed(double val, bool isSigned, bool half, 663 uint8_t imm, bool rzero) 664{ 665 int rmode = rzero ? 
FeRoundZero : fegetround(); 666 fesetround(FeRoundNearest); 667 val = val * pow(2.0, imm); 668 __asm__ __volatile__("" : "=m" (val) : "m" (val)); 669 fesetround(rmode); 670 feclearexcept(FeAllExceptions); 671 __asm__ __volatile__("" : "=m" (val) : "m" (val)); 672 double origVal = val; 673 val = rint(val); 674 int fpType = std::fpclassify(val); 675 if (fpType == FP_SUBNORMAL || fpType == FP_NAN) { 676 if (fpType == FP_NAN) { 677 feraiseexcept(FeInvalid); 678 } 679 val = 0.0; 680 } else if (origVal != val) { 681 switch (rmode) { 682 case FeRoundNearest: 683 if (origVal - val > 0.5) 684 val += 1.0; 685 else if (val - origVal > 0.5) 686 val -= 1.0; 687 break; 688 case FeRoundDown: 689 if (origVal < val) 690 val -= 1.0; 691 break; 692 case FeRoundUpward: 693 if (origVal > val) 694 val += 1.0; 695 break; 696 } 697 feraiseexcept(FeInexact); 698 } 699 if (isSigned) { 700 if (half) { 701 if (val < (int16_t)(1 << 15)) { 702 feraiseexcept(FeInvalid); 703 feclearexcept(FeInexact); 704 return (int16_t)(1 << 15); 705 } 706 if (val > (int16_t)mask(15)) { 707 feraiseexcept(FeInvalid); 708 feclearexcept(FeInexact); 709 return (int16_t)mask(15); 710 } 711 return (int16_t)val; 712 } else { 713 if (val < (int32_t)(1 << 31)) { 714 feraiseexcept(FeInvalid); 715 feclearexcept(FeInexact); 716 return (int32_t)(1 << 31); 717 } 718 if (val > (int32_t)mask(31)) { 719 feraiseexcept(FeInvalid); 720 feclearexcept(FeInexact); 721 return (int32_t)mask(31); 722 } 723 return (int32_t)val; 724 } 725 } else { 726 if (half) { 727 if (val < 0) { 728 feraiseexcept(FeInvalid); 729 feclearexcept(FeInexact); 730 return 0; 731 } 732 if (val > mask(16)) { 733 feraiseexcept(FeInvalid); 734 feclearexcept(FeInexact); 735 return mask(16); 736 } 737 return (uint16_t)val; 738 } else { 739 if (val < 0) { 740 feraiseexcept(FeInvalid); 741 feclearexcept(FeInexact); 742 return 0; 743 } 744 if (val > mask(32)) { 745 feraiseexcept(FeInvalid); 746 feclearexcept(FeInexact); 747 return mask(32); 748 } 749 return 
(uint32_t)val; 750 } 751 } 752} 753 754double 755vfpUFixedToFpD(bool flush, bool defaultNan, 756 uint32_t val, bool half, uint8_t imm) 757{ 758 fesetround(FeRoundNearest); 759 if (half) 760 val = (uint16_t)val; 761 double scale = pow(2.0, imm); 762 __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); 763 feclearexcept(FeAllExceptions); 764 __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); 765 return fixDivDest(flush, defaultNan, val / scale, (double)val, scale); 766} 767 768double 769vfpSFixedToFpD(bool flush, bool defaultNan, 770 int32_t val, bool half, uint8_t imm) 771{ 772 fesetround(FeRoundNearest); 773 if (half) 774 val = sext<16>(val & mask(16)); 775 double scale = pow(2.0, imm); 776 __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); 777 feclearexcept(FeAllExceptions); 778 __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); 779 return fixDivDest(flush, defaultNan, val / scale, (double)val, scale); 780} 781 782// This function implements a magic formula taken from the architecture 783// reference manual. It was originally called recip_sqrt_estimate. 784static double 785recipSqrtEstimate(double a) 786{ 787 int64_t q0, q1, s; 788 double r; 789 if (a < 0.5) { 790 q0 = (int64_t)(a * 512.0); 791 r = 1.0 / sqrt(((double)q0 + 0.5) / 512.0); 792 } else { 793 q1 = (int64_t)(a * 256.0); 794 r = 1.0 / sqrt(((double)q1 + 0.5) / 256.0); 795 } 796 s = (int64_t)(256.0 * r + 0.5); 797 return (double)s / 256.0; 798} 799 800// This function is only intended for use in Neon instructions because 801// it ignores certain bits in the FPSCR. 802float 803fprSqrtEstimate(FPSCR &fpscr, float op) 804{ 805 const uint32_t qnan = 0x7fc00000; 806 float junk = 0.0; 807 int fpClass = std::fpclassify(op); 808 if (fpClass == FP_NAN) { 809 if ((fpToBits(op) & qnan) != qnan) 810 fpscr.ioc = 1; 811 return bitsToFp(qnan, junk); 812 } else if (fpClass == FP_ZERO) { 813 fpscr.dzc = 1; 814 // Return infinity with the same sign as the operand. 
815 return bitsToFp((std::signbit(op) << 31) | 816 (0xFF << 23) | (0 << 0), junk); 817 } else if (std::signbit(op)) { 818 // Set invalid op bit. 819 fpscr.ioc = 1; 820 return bitsToFp(qnan, junk); 821 } else if (fpClass == FP_INFINITE) { 822 return 0.0; 823 } else { 824 uint64_t opBits = fpToBits(op); 825 double scaled; 826 if (bits(opBits, 23)) { 827 scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) | 828 (ULL(0x3fd) << 52) | (bits(opBits, 31) << 63), 829 (double)0.0); 830 } else { 831 scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) | 832 (ULL(0x3fe) << 52) | (bits(opBits, 31) << 63), 833 (double)0.0); 834 } 835 uint64_t resultExp = (380 - bits(opBits, 30, 23)) / 2; 836 837 uint64_t estimate = fpToBits(recipSqrtEstimate(scaled)); 838 839 return bitsToFp((bits(estimate, 63) << 31) | 840 (bits(resultExp, 7, 0) << 23) | 841 (bits(estimate, 51, 29) << 0), junk); 842 } 843} 844 845uint32_t 846unsignedRSqrtEstimate(uint32_t op) 847{ 848 if (bits(op, 31, 30) == 0) { 849 return -1; 850 } else { 851 double dpOp; 852 if (bits(op, 31)) { 853 dpOp = bitsToFp((ULL(0) << 63) | 854 (ULL(0x3fe) << 52) | 855 (bits((uint64_t)op, 30, 0) << 21) | 856 (0 << 0), (double)0.0); 857 } else { 858 dpOp = bitsToFp((ULL(0) << 63) | 859 (ULL(0x3fd) << 52) | 860 (bits((uint64_t)op, 29, 0) << 22) | 861 (0 << 0), (double)0.0); 862 } 863 uint64_t estimate = fpToBits(recipSqrtEstimate(dpOp)); 864 return (1 << 31) | bits(estimate, 51, 21); 865 } 866} 867 868// This function implements a magic formula taken from the architecture 869// reference manual. It was originally called recip_estimate. 870 871static double 872recipEstimate(double a) 873{ 874 int64_t q, s; 875 double r; 876 q = (int64_t)(a * 512.0); 877 r = 1.0 / (((double)q + 0.5) / 512.0); 878 s = (int64_t)(256.0 * r + 0.5); 879 return (double)s / 256.0; 880} 881 882// This function is only intended for use in Neon instructions because 883// it ignores certain bits in the FPSCR. 
884float 885fpRecipEstimate(FPSCR &fpscr, float op) 886{ 887 const uint32_t qnan = 0x7fc00000; 888 float junk = 0.0; 889 int fpClass = std::fpclassify(op); 890 if (fpClass == FP_NAN) { 891 if ((fpToBits(op) & qnan) != qnan) 892 fpscr.ioc = 1; 893 return bitsToFp(qnan, junk); 894 } else if (fpClass == FP_INFINITE) { 895 return bitsToFp(std::signbit(op) << 31, junk); 896 } else if (fpClass == FP_ZERO) { 897 fpscr.dzc = 1; 898 // Return infinity with the same sign as the operand. 899 return bitsToFp((std::signbit(op) << 31) | 900 (0xFF << 23) | (0 << 0), junk); 901 } else if (fabs(op) >= pow(2.0, 126)) { 902 fpscr.ufc = 1; 903 return bitsToFp(std::signbit(op) << 31, junk); 904 } else { 905 uint64_t opBits = fpToBits(op); 906 double scaled; 907 scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) | 908 (ULL(0x3fe) << 52) | (ULL(0) << 63), 909 (double)0.0); 910 uint64_t resultExp = 253 - bits(opBits, 30, 23); 911 912 uint64_t estimate = fpToBits(recipEstimate(scaled)); 913 914 return bitsToFp((bits(opBits, 31) << 31) | 915 (bits(resultExp, 7, 0) << 23) | 916 (bits(estimate, 51, 29) << 0), junk); 917 } 918} 919 920uint32_t 921unsignedRecipEstimate(uint32_t op) 922{ 923 if (bits(op, 31) == 0) { 924 return -1; 925 } else { 926 double dpOp; 927 dpOp = bitsToFp((ULL(0) << 63) | 928 (ULL(0x3fe) << 52) | 929 (bits((uint64_t)op, 30, 0) << 21) | 930 (0 << 0), (double)0.0); 931 uint64_t estimate = fpToBits(recipEstimate(dpOp)); 932 return (1 << 31) | bits(estimate, 51, 21); 933 } 934} 935 936template <class fpType> 937fpType 938FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan, 939 fpType op1, fpType op2) const 940{ 941 done = true; 942 fpType junk = 0.0; 943 fpType dest = 0.0; 944 const bool single = (sizeof(fpType) == sizeof(float)); 945 const uint64_t qnan = 946 single ? 
0x7fc00000 : ULL(0x7ff8000000000000); 947 const bool nan1 = std::isnan(op1); 948 const bool nan2 = std::isnan(op2); 949 const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan); 950 const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan); 951 if (nan1 || nan2) { 952 if (defaultNan) { 953 dest = bitsToFp(qnan, junk); 954 } else if (signal1) { 955 dest = bitsToFp(fpToBits(op1) | qnan, junk); 956 } else if (signal2) { 957 dest = bitsToFp(fpToBits(op2) | qnan, junk); 958 } else if (nan1) { 959 dest = op1; 960 } else if (nan2) { 961 dest = op2; 962 } 963 if (signal1 || signal2) { 964 fpscr.ioc = 1; 965 } 966 } else { 967 done = false; 968 } 969 return dest; 970} 971 972template 973float FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan, 974 float op1, float op2) const; 975template 976double FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan, 977 double op1, double op2) const; 978 979template <class fpType> 980fpType 981FpOp::binaryOp(FPSCR &fpscr, fpType op1, fpType op2, 982 fpType (*func)(fpType, fpType), 983 bool flush, bool defaultNan, uint32_t rMode) const 984{ 985 const bool single = (sizeof(fpType) == sizeof(float)); 986 fpType junk = 0.0; 987 988 if (flush && flushToZero(op1, op2)) 989 fpscr.idc = 1; 990 VfpSavedState state = prepFpState(rMode); 991 __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (state) 992 : "m" (op1), "m" (op2), "m" (state)); 993 fpType dest = func(op1, op2); 994 __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest)); 995 996 // Get NAN behavior right. This varies between x86 and ARM. 997 if (std::isnan(dest)) { 998 const uint64_t qnan = 999 single ? 
0x7fc00000 : ULL(0x7ff8000000000000); 1000 const bool nan1 = std::isnan(op1); 1001 const bool nan2 = std::isnan(op2); 1002 const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan); 1003 const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan); 1004 if ((!nan1 && !nan2) || (defaultNan == 1)) { 1005 dest = bitsToFp(qnan, junk); 1006 } else if (signal1) { 1007 dest = bitsToFp(fpToBits(op1) | qnan, junk); 1008 } else if (signal2) { 1009 dest = bitsToFp(fpToBits(op2) | qnan, junk); 1010 } else if (nan1) { 1011 dest = op1; 1012 } else if (nan2) { 1013 dest = op2; 1014 } 1015 } else if (flush && flushToZero(dest)) { 1016 feraiseexcept(FeUnderflow); 1017 } else if (( 1018 (single && (dest == bitsToFp(0x00800000, junk) || 1019 dest == bitsToFp(0x80800000, junk))) || 1020 (!single && 1021 (dest == bitsToFp(ULL(0x0010000000000000), junk) || 1022 dest == bitsToFp(ULL(0x8010000000000000), junk))) 1023 ) && rMode != VfpRoundZero) { 1024 /* 1025 * Correct for the fact that underflow is detected -before- rounding 1026 * in ARM and -after- rounding in x86. 
1027 */ 1028 fesetround(FeRoundZero); 1029 __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2) 1030 : "m" (op1), "m" (op2)); 1031 fpType temp = func(op1, op2); 1032 __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp)); 1033 if (flush && flushToZero(temp)) { 1034 dest = temp; 1035 } 1036 } 1037 finishVfp(fpscr, state, flush); 1038 return dest; 1039} 1040 1041template 1042float FpOp::binaryOp(FPSCR &fpscr, float op1, float op2, 1043 float (*func)(float, float), 1044 bool flush, bool defaultNan, uint32_t rMode) const; 1045template 1046double FpOp::binaryOp(FPSCR &fpscr, double op1, double op2, 1047 double (*func)(double, double), 1048 bool flush, bool defaultNan, uint32_t rMode) const; 1049 1050template <class fpType> 1051fpType 1052FpOp::unaryOp(FPSCR &fpscr, fpType op1, fpType (*func)(fpType), 1053 bool flush, uint32_t rMode) const 1054{ 1055 const bool single = (sizeof(fpType) == sizeof(float)); 1056 fpType junk = 0.0; 1057 1058 if (flush && flushToZero(op1)) 1059 fpscr.idc = 1; 1060 VfpSavedState state = prepFpState(rMode); 1061 __asm__ __volatile__ ("" : "=m" (op1), "=m" (state) 1062 : "m" (op1), "m" (state)); 1063 fpType dest = func(op1); 1064 __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest)); 1065 1066 // Get NAN behavior right. This varies between x86 and ARM. 1067 if (std::isnan(dest)) { 1068 const uint64_t qnan = 1069 single ? 
0x7fc00000 : ULL(0x7ff8000000000000); 1070 const bool nan = std::isnan(op1); 1071 if (!nan || fpscr.dn == 1) { 1072 dest = bitsToFp(qnan, junk); 1073 } else if (nan) { 1074 dest = bitsToFp(fpToBits(op1) | qnan, junk); 1075 } 1076 } else if (flush && flushToZero(dest)) { 1077 feraiseexcept(FeUnderflow); 1078 } else if (( 1079 (single && (dest == bitsToFp(0x00800000, junk) || 1080 dest == bitsToFp(0x80800000, junk))) || 1081 (!single && 1082 (dest == bitsToFp(ULL(0x0010000000000000), junk) || 1083 dest == bitsToFp(ULL(0x8010000000000000), junk))) 1084 ) && rMode != VfpRoundZero) { 1085 /* 1086 * Correct for the fact that underflow is detected -before- rounding 1087 * in ARM and -after- rounding in x86. 1088 */ 1089 fesetround(FeRoundZero); 1090 __asm__ __volatile__ ("" : "=m" (op1) : "m" (op1)); 1091 fpType temp = func(op1); 1092 __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp)); 1093 if (flush && flushToZero(temp)) { 1094 dest = temp; 1095 } 1096 } 1097 finishVfp(fpscr, state, flush); 1098 return dest; 1099} 1100 1101template 1102float FpOp::unaryOp(FPSCR &fpscr, float op1, float (*func)(float), 1103 bool flush, uint32_t rMode) const; 1104template 1105double FpOp::unaryOp(FPSCR &fpscr, double op1, double (*func)(double), 1106 bool flush, uint32_t rMode) const; 1107 1108IntRegIndex 1109VfpMacroOp::addStride(IntRegIndex idx, unsigned stride) 1110{ 1111 if (wide) { 1112 stride *= 2; 1113 } 1114 unsigned offset = idx % 8; 1115 idx = (IntRegIndex)(idx - offset); 1116 offset += stride; 1117 idx = (IntRegIndex)(idx + (offset % 8)); 1118 return idx; 1119} 1120 1121void 1122VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1, IntRegIndex &op2) 1123{ 1124 unsigned stride = (machInst.fpscrStride == 0) ? 
1 : 2; 1125 assert(!inScalarBank(dest)); 1126 dest = addStride(dest, stride); 1127 op1 = addStride(op1, stride); 1128 if (!inScalarBank(op2)) { 1129 op2 = addStride(op2, stride); 1130 } 1131} 1132 1133void 1134VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1) 1135{ 1136 unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2; 1137 assert(!inScalarBank(dest)); 1138 dest = addStride(dest, stride); 1139 if (!inScalarBank(op1)) { 1140 op1 = addStride(op1, stride); 1141 } 1142} 1143 1144void 1145VfpMacroOp::nextIdxs(IntRegIndex &dest) 1146{ 1147 unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2; 1148 assert(!inScalarBank(dest)); 1149 dest = addStride(dest, stride); 1150} 1151 1152} 1153