vfp.cc (11671:520509f3e66c → 12104:edd63f9c6184)
/*
 * Copyright (c) 2010-2013 ARM Limited
 * All rights reserved
 *
 * The license below extends only to copyright in the software and shall
 * not be construed as granting a license to any other intellectual
 * property including but not limited to intellectual property relating
 * to a hardware implementation of the functionality of the software
 * licensed hereunder. You may use the software subject to the license
 * terms below provided that you ensure that this notice is replicated
 * unmodified and in its entirety in all distributions of the software,
 * modified or unmodified, in source code or in binary form.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Gabe Black
 */

#include "arch/arm/insts/vfp.hh"

/*
 * The asm statements below are to keep gcc from reordering code. Otherwise
 * the rounding mode might be set after the operation it was intended for, the
 * exception bits read before it, etc.
 */
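
/*
 * Illustration (not part of the original source): the barrier idiom used
 * throughout this file is an empty asm statement with a memory operand. It
 * forces gcc to treat the named variable as read and written in memory at
 * that point, so the computation cannot be hoisted above the fesetround()
 * call or sunk below the fetestexcept() call. A minimal sketch of the
 * pattern:
 *
 *     fesetround(FeRoundZero);
 *     __asm__ __volatile__("" : "=m" (op1) : "m" (op1));   // pin the input
 *     fpType temp = op1 / op2;       // runs under the new rounding mode
 *     __asm__ __volatile__("" : "=m" (temp) : "m" (temp)); // pin the result
 *     int excepts = fetestexcept(FeAllExceptions); // flags read afterwards
 */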

std::string
FpCondCompRegOp::generateDisassembly(
        Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss, "", false);
    printIntReg(ss, op1);
    ccprintf(ss, ", ");
    printIntReg(ss, op2);
    ccprintf(ss, ", #%d", defCc);
    ccprintf(ss, ", ");
    printCondition(ss, condCode, true);
    return ss.str();
}

std::string
FpCondSelOp::generateDisassembly(
        Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss, "", false);
    printIntReg(ss, dest);
    ccprintf(ss, ", ");
    printIntReg(ss, op1);
    ccprintf(ss, ", ");
    printIntReg(ss, op2);
    ccprintf(ss, ", ");
    printCondition(ss, condCode, true);
    return ss.str();
}

std::string
FpRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printFloatReg(ss, dest);
    ss << ", ";
    printFloatReg(ss, op1);
    return ss.str();
}

std::string
FpRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printFloatReg(ss, dest);
    ccprintf(ss, ", #%d", imm);
    return ss.str();
}

std::string
FpRegRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printFloatReg(ss, dest);
    ss << ", ";
    printFloatReg(ss, op1);
    ccprintf(ss, ", #%d", imm);
    return ss.str();
}

std::string
FpRegRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printFloatReg(ss, dest);
    ss << ", ";
    printFloatReg(ss, op1);
    ss << ", ";
    printFloatReg(ss, op2);
    return ss.str();
}

std::string
FpRegRegRegCondOp::generateDisassembly(Addr pc, const SymbolTable *symtab)
    const
{
    std::stringstream ss;
    printMnemonic(ss);
    printCondition(ss, cond);
    printFloatReg(ss, dest);
    ss << ", ";
    printFloatReg(ss, op1);
    ss << ", ";
    printFloatReg(ss, op2);
    return ss.str();
}

std::string
FpRegRegRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printFloatReg(ss, dest);
    ss << ", ";
    printFloatReg(ss, op1);
    ss << ", ";
    printFloatReg(ss, op2);
    ss << ", ";
    printFloatReg(ss, op3);
    return ss.str();
}

std::string
FpRegRegRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printFloatReg(ss, dest);
    ss << ", ";
    printFloatReg(ss, op1);
    ss << ", ";
    printFloatReg(ss, op2);
    ccprintf(ss, ", #%d", imm);
    return ss.str();
}

namespace ArmISA
{

VfpSavedState
prepFpState(uint32_t rMode)
{
    int roundingMode = fegetround();
    feclearexcept(FeAllExceptions);
    switch (rMode) {
      case VfpRoundNearest:
        fesetround(FeRoundNearest);
        break;
      case VfpRoundUpward:
        fesetround(FeRoundUpward);
        break;
      case VfpRoundDown:
        fesetround(FeRoundDown);
        break;
      case VfpRoundZero:
        fesetround(FeRoundZero);
        break;
    }
    return roundingMode;
}

void
finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush, FPSCR mask)
{
    int exceptions = fetestexcept(FeAllExceptions);
    bool underflow = false;
    if ((exceptions & FeInvalid) && mask.ioc) {
        fpscr.ioc = 1;
    }
    if ((exceptions & FeDivByZero) && mask.dzc) {
        fpscr.dzc = 1;
    }
    if ((exceptions & FeOverflow) && mask.ofc) {
        fpscr.ofc = 1;
    }
    if (exceptions & FeUnderflow) {
        underflow = true;
        if (mask.ufc)
            fpscr.ufc = 1;
    }
    if ((exceptions & FeInexact) && !(underflow && flush) && mask.ixc) {
        fpscr.ixc = 1;
    }
    fesetround(state);
}
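
/*
 * Usage sketch (illustration, not part of the original source): the FP
 * helpers below bracket the emulated operation between prepFpState() and
 * finishVfp(), so the host FPU briefly runs with the guest rounding mode
 * and the accrued host exception flags are folded into the guest FPSCR:
 *
 *     VfpSavedState state = prepFpState(rMode); // set guest rounding mode
 *     fpType dest = func(op1, op2);             // the emulated operation
 *     finishVfp(fpscr, state, flush);           // harvest flags, restore
 *                                               // the host rounding mode
 */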

template <class fpType>
fpType
fixDest(bool flush, bool defaultNan, fpType val, fpType op1)
{
    int fpClass = std::fpclassify(val);
    fpType junk = 0.0;
    if (fpClass == FP_NAN) {
        const bool single = (sizeof(val) == sizeof(float));
        const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan = std::isnan(op1);
        if (!nan || defaultNan) {
            val = bitsToFp(qnan, junk);
        } else if (nan) {
            val = bitsToFp(fpToBits(op1) | qnan, junk);
        }
    } else if (fpClass == FP_SUBNORMAL && flush == 1) {
        // Turn val into a zero with the correct sign.
        uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1);
        val = bitsToFp(fpToBits(val) & bitMask, junk);
        feclearexcept(FeInexact);
        feraiseexcept(FeUnderflow);
    }
    return val;
}

template
float fixDest<float>(bool flush, bool defaultNan, float val, float op1);
template
double fixDest<double>(bool flush, bool defaultNan, double val, double op1);

template <class fpType>
fpType
fixDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2)
{
    int fpClass = std::fpclassify(val);
    fpType junk = 0.0;
    if (fpClass == FP_NAN) {
        const bool single = (sizeof(val) == sizeof(float));
        const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan1 = std::isnan(op1);
        const bool nan2 = std::isnan(op2);
        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
        if ((!nan1 && !nan2) || defaultNan) {
            val = bitsToFp(qnan, junk);
        } else if (signal1) {
            val = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            val = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (nan1) {
            val = op1;
        } else if (nan2) {
            val = op2;
        }
    } else if (fpClass == FP_SUBNORMAL && flush) {
        // Turn val into a zero with the correct sign.
        uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1);
        val = bitsToFp(fpToBits(val) & bitMask, junk);
        feclearexcept(FeInexact);
        feraiseexcept(FeUnderflow);
    }
    return val;
}

template
float fixDest<float>(bool flush, bool defaultNan,
        float val, float op1, float op2);
template
double fixDest<double>(bool flush, bool defaultNan,
        double val, double op1, double op2);

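/*
 * Worked example (illustration, not part of the original source): for
 * single precision qnan is 0x7fc00000, so a signalling NaN input such as
 * 0x7f800001 propagates as fpToBits(op1) | qnan = 0x7fc00001: it is
 * quieted by setting the top fraction bit while its payload is kept.
 * Signalling NaNs take priority over quiet ones, and op1 over op2, which
 * matches the ARM NaN propagation rules when default NaN mode is off.
 */
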
template <class fpType>
fpType
fixDivDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2)
{
    fpType mid = fixDest(flush, defaultNan, val, op1, op2);
    const bool single = (sizeof(fpType) == sizeof(float));
    const fpType junk = 0.0;
    if ((single && (val == bitsToFp(0x00800000, junk) ||
                    val == bitsToFp(0x80800000, junk))) ||
        (!single && (val == bitsToFp(ULL(0x0010000000000000), junk) ||
                     val == bitsToFp(ULL(0x8010000000000000), junk)))
       ) {
        __asm__ __volatile__("" : "=m" (op1) : "m" (op1));
        fesetround(FeRoundZero);
        fpType temp = 0.0;
        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
        temp = op1 / op2;
        if (flushToZero(temp)) {
            feraiseexcept(FeUnderflow);
            if (flush) {
                feclearexcept(FeInexact);
                mid = temp;
            }
        }
        __asm__ __volatile__("" :: "m" (temp));
    }
    return mid;
}

template
float fixDivDest<float>(bool flush, bool defaultNan,
        float val, float op1, float op2);
template
double fixDivDest<double>(bool flush, bool defaultNan,
        double val, double op1, double op2);

float
fixFpDFpSDest(FPSCR fpscr, double val)
{
    const float junk = 0.0;
    float op1 = 0.0;
    if (std::isnan(val)) {
        uint64_t valBits = fpToBits(val);
        uint32_t op1Bits = bits(valBits, 50, 29) |
                           (mask(9) << 22) |
                           (bits(valBits, 63) << 31);
        op1 = bitsToFp(op1Bits, junk);
    }
    float mid = fixDest(fpscr.fz, fpscr.dn, (float)val, op1);
    if (fpscr.fz && fetestexcept(FeUnderflow | FeInexact) ==
                    (FeUnderflow | FeInexact)) {
        feclearexcept(FeInexact);
    }
    if (mid == bitsToFp(0x00800000, junk) ||
        mid == bitsToFp(0x80800000, junk)) {
        __asm__ __volatile__("" : "=m" (val) : "m" (val));
        fesetround(FeRoundZero);
        float temp = 0.0;
        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
        temp = val;
        if (flushToZero(temp)) {
            feraiseexcept(FeUnderflow);
            if (fpscr.fz) {
                feclearexcept(FeInexact);
                mid = temp;
            }
        }
        __asm__ __volatile__("" :: "m" (temp));
    }
    return mid;
}

double
fixFpSFpDDest(FPSCR fpscr, float val)
{
    const double junk = 0.0;
    double op1 = 0.0;
    if (std::isnan(val)) {
        uint32_t valBits = fpToBits(val);
        uint64_t op1Bits = ((uint64_t)bits(valBits, 21, 0) << 29) |
                           (mask(12) << 51) |
                           ((uint64_t)bits(valBits, 31) << 63);
        op1 = bitsToFp(op1Bits, junk);
    }
    double mid = fixDest(fpscr.fz, fpscr.dn, (double)val, op1);
    if (mid == bitsToFp(ULL(0x0010000000000000), junk) ||
        mid == bitsToFp(ULL(0x8010000000000000), junk)) {
        __asm__ __volatile__("" : "=m" (val) : "m" (val));
        fesetround(FeRoundZero);
        double temp = 0.0;
        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
        temp = val;
        if (flushToZero(temp)) {
            feraiseexcept(FeUnderflow);
            if (fpscr.fz) {
                feclearexcept(FeInexact);
                mid = temp;
            }
        }
        __asm__ __volatile__("" :: "m" (temp));
    }
    return mid;
}

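/*
 * Note (illustration, not part of the original source): the bit surgery in
 * the two converters above preserves NaN payloads across the width change.
 * Narrowing moves the top 22 fraction bits of the double NaN (bits 50:29)
 * into the single fraction (bits 21:0) and forces bits 30:22 to ones;
 * widening does the inverse, e.g. the single NaN 0x7fc00001 becomes the
 * double NaN 0x7ff8000020000000.
 */
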
static inline uint16_t
vcvtFpFpH(FPSCR &fpscr, bool flush, bool defaultNan,
          uint32_t rMode, bool ahp, uint64_t opBits, bool isDouble)
{
    uint32_t mWidth;
    uint32_t eWidth;
    uint32_t eHalfRange;
    uint32_t sBitPos;

    if (isDouble) {
        mWidth = 52;
        eWidth = 11;
    } else {
        mWidth = 23;
        eWidth = 8;
    }
    sBitPos = eWidth + mWidth;
    eHalfRange = (1 << (eWidth-1)) - 1;

    // Extract the operand.
    bool neg = bits(opBits, sBitPos);
    uint32_t exponent = bits(opBits, sBitPos-1, mWidth);
    uint64_t oldMantissa = bits(opBits, mWidth-1, 0);
    uint32_t mantissa = oldMantissa >> (mWidth - 10);
    // Do the conversion.
    uint64_t extra = oldMantissa & mask(mWidth - 10);
    if (exponent == mask(eWidth)) {
        if (oldMantissa != 0) {
            // Nans.
            if (bits(mantissa, 9) == 0) {
                // Signalling nan.
                fpscr.ioc = 1;
            }
            if (ahp) {
                mantissa = 0;
                exponent = 0;
                fpscr.ioc = 1;
            } else if (defaultNan) {
                mantissa = (1 << 9);
                exponent = 0x1f;
                neg = false;
            } else {
                exponent = 0x1f;
                mantissa |= (1 << 9);
            }
        } else {
            // Infinities.
            exponent = 0x1F;
            if (ahp) {
                fpscr.ioc = 1;
                mantissa = 0x3ff;
            } else {
                mantissa = 0;
            }
        }
    } else if (exponent == 0 && oldMantissa == 0) {
        // Zero, don't need to do anything.
    } else {
        // Normalized or denormalized numbers.

        bool inexact = (extra != 0);

        if (exponent == 0) {
            // Denormalized.
            // If flush to zero is on, this shouldn't happen.
            assert(!flush);

            // Check for underflow
            if (inexact || fpscr.ufe)
                fpscr.ufc = 1;

            // Handle rounding.
            unsigned mode = rMode;
            if ((mode == VfpRoundUpward && !neg && extra) ||
                (mode == VfpRoundDown && neg && extra) ||
                (mode == VfpRoundNearest &&
                 (extra > (1 << 9) ||
                  (extra == (1 << 9) && bits(mantissa, 0))))) {
                mantissa++;
            }

            // See if the number became normalized after rounding.
            if (mantissa == (1 << 10)) {
                mantissa = 0;
                exponent = 1;
            }
        } else {
            // Normalized.

            // We need to track the dropped bits differently since
            // more can be dropped by denormalizing.
            bool topOne = bits(extra, mWidth - 10 - 1);
            bool restZeros = bits(extra, mWidth - 10 - 2, 0) == 0;

            if (exponent <= (eHalfRange - 15)) {
                // The result is too small. Denormalize.
                mantissa |= (1 << 10);
                while (mantissa && exponent <= (eHalfRange - 15)) {
                    restZeros = restZeros && !topOne;
                    topOne = bits(mantissa, 0);
                    mantissa = mantissa >> 1;
                    exponent++;
                }
                if (topOne || !restZeros)
                    inexact = true;
                exponent = 0;
            } else {
                // Change bias.
                exponent -= (eHalfRange - 15);
            }

            if (exponent == 0 && (inexact || fpscr.ufe)) {
                // Underflow
                fpscr.ufc = 1;
            }

            // Handle rounding.
            unsigned mode = rMode;
            bool nonZero = topOne || !restZeros;
            if ((mode == VfpRoundUpward && !neg && nonZero) ||
                (mode == VfpRoundDown && neg && nonZero) ||
                (mode == VfpRoundNearest && topOne &&
                 (!restZeros || bits(mantissa, 0)))) {
                mantissa++;
            }

            // See if we rounded up and need to bump the exponent.
            if (mantissa == (1 << 10)) {
                mantissa = 0;
                exponent++;
            }

            // Deal with overflow
            if (ahp) {
                if (exponent >= 0x20) {
                    exponent = 0x1f;
                    mantissa = 0x3ff;
                    fpscr.ioc = 1;
                    // Suppress inexact exception.
                    inexact = false;
                }
            } else {
                if (exponent >= 0x1f) {
                    if ((mode == VfpRoundNearest) ||
                        (mode == VfpRoundUpward && !neg) ||
                        (mode == VfpRoundDown && neg)) {
                        // Overflow to infinity.
                        exponent = 0x1f;
                        mantissa = 0;
                    } else {
                        // Overflow to max normal.
                        exponent = 0x1e;
                        mantissa = 0x3ff;
                    }
                    fpscr.ofc = 1;
                    inexact = true;
                }
            }
        }

        if (inexact) {
            fpscr.ixc = 1;
        }
    }
    // Reassemble and install the result.
    uint32_t result = bits(mantissa, 9, 0);
    replaceBits(result, 14, 10, exponent);
    if (neg)
        result |= (1 << 15);
    return result;
}

uint16_t
vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan,
           uint32_t rMode, bool ahp, float op)
{
    uint64_t opBits = fpToBits(op);
    return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp, opBits, false);
}

uint16_t
vcvtFpDFpH(FPSCR &fpscr, bool flush, bool defaultNan,
           uint32_t rMode, bool ahp, double op)
{
    uint64_t opBits = fpToBits(op);
    return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp, opBits, true);
}
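
/*
 * Worked example (illustration, not part of the original source):
 * vcvtFpSFpH on 1.0f (0x3f800000) takes the normalized path: the exponent
 * 127 is rebiased by eHalfRange - 15 = 112 down to 15, no fraction bits
 * are dropped, so nothing rounds and the result is the half-precision
 * 1.0, 0x3c00.
 */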

static inline uint64_t
vcvtFpHFp(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op, bool isDouble)
{
    uint32_t mWidth;
    uint32_t eWidth;
    uint32_t eHalfRange;
    uint32_t sBitPos;

    if (isDouble) {
        mWidth = 52;
        eWidth = 11;
    } else {
        mWidth = 23;
        eWidth = 8;
    }
    sBitPos = eWidth + mWidth;
    eHalfRange = (1 << (eWidth-1)) - 1;

    // Extract the bitfields.
    bool neg = bits(op, 15);
    uint32_t exponent = bits(op, 14, 10);
    uint64_t mantissa = bits(op, 9, 0);
    // Do the conversion.
    if (exponent == 0) {
        if (mantissa != 0) {
            // Normalize the value.
            exponent = exponent + (eHalfRange - 15) + 1;
            while (mantissa < (1 << 10)) {
                mantissa = mantissa << 1;
                exponent--;
            }
        }
        mantissa = mantissa << (mWidth - 10);
    } else if (exponent == 0x1f && !ahp) {
        // Infinities and nans.
        exponent = mask(eWidth);
        if (mantissa != 0) {
            // Nans.
            mantissa = mantissa << (mWidth - 10);
            if (bits(mantissa, mWidth-1) == 0) {
                // Signalling nan.
                fpscr.ioc = 1;
                mantissa |= (((uint64_t) 1) << (mWidth-1));
            }
            if (defaultNan) {
                mantissa &= ~mask(mWidth-1);
                neg = false;
            }
        }
    } else {
        exponent = exponent + (eHalfRange - 15);
        mantissa = mantissa << (mWidth - 10);
    }
    // Reassemble the result.
    uint64_t result = bits(mantissa, mWidth-1, 0);
    replaceBits(result, sBitPos-1, mWidth, exponent);
    if (neg) {
        result |= (((uint64_t) 1) << sBitPos);
    }
    return result;
}

double
vcvtFpHFpD(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op)
{
    double junk = 0.0;
    uint64_t result;

    result = vcvtFpHFp(fpscr, defaultNan, ahp, op, true);
    return bitsToFp(result, junk);
}

float
vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op)
{
    float junk = 0.0;
    uint64_t result;

    result = vcvtFpHFp(fpscr, defaultNan, ahp, op, false);
    return bitsToFp(result, junk);
}
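
/*
 * Worked example (illustration, not part of the original source): going
 * the other way, vcvtFpHFpS on the half 0x3c00 rebiases the exponent 15 by
 * eHalfRange - 15 = 112 back up to 127 and shifts the fraction up by
 * mWidth - 10 = 13 bits, reassembling the single 0x3f800000, i.e. 1.0f.
 */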

float
vfpUFixedToFpS(bool flush, bool defaultNan,
               uint64_t val, uint8_t width, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (width == 16)
        val = (uint16_t)val;
    else if (width == 32)
        val = (uint32_t)val;
    else if (width != 64)
        panic("Unsupported width %d", width);
    float scale = powf(2.0, imm);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (float)val, scale);
}

float
vfpSFixedToFpS(bool flush, bool defaultNan,
               int64_t val, uint8_t width, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (width == 16)
        val = sext<16>(val & mask(16));
    else if (width == 32)
        val = sext<32>(val & mask(32));
    else if (width != 64)
        panic("Unsupported width %d", width);

    float scale = powf(2.0, imm);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (float)val, scale);
}


double
vfpUFixedToFpD(bool flush, bool defaultNan,
               uint64_t val, uint8_t width, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (width == 16)
        val = (uint16_t)val;
    else if (width == 32)
        val = (uint32_t)val;
    else if (width != 64)
        panic("Unsupported width %d", width);

    double scale = pow(2.0, imm);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (double)val, scale);
}

double
vfpSFixedToFpD(bool flush, bool defaultNan,
               int64_t val, uint8_t width, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (width == 16)
        val = sext<16>(val & mask(16));
    else if (width == 32)
        val = sext<32>(val & mask(32));
    else if (width != 64)
        panic("Unsupported width %d", width);

    double scale = pow(2.0, imm);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (double)val, scale);
}

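/*
 * Worked example (illustration, not part of the original source): these
 * helpers treat imm as the number of fraction bits. For
 * vfpUFixedToFpS(false, false, 0x200, 16, 4) the scale is 2^4 = 16 and the
 * result is 512 / 16 = 32.0f; fixDivDest() then applies the usual NaN and
 * underflow corrections to the quotient.
 */
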
// This function implements a magic formula taken from the architecture
// reference manual. It was originally called recip_sqrt_estimate.
static double
recipSqrtEstimate(double a)
{
    int64_t q0, q1, s;
    double r;
    if (a < 0.5) {
        q0 = (int64_t)(a * 512.0);
        r = 1.0 / sqrt(((double)q0 + 0.5) / 512.0);
    } else {
        q1 = (int64_t)(a * 256.0);
        r = 1.0 / sqrt(((double)q1 + 0.5) / 256.0);
    }
    s = (int64_t)(256.0 * r + 0.5);
    return (double)s / 256.0;
}

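/*
 * Worked example (illustration, not part of the original source): for
 * a = 0.25 the q0 branch is taken: q0 = 128, r = 1/sqrt(128.5/512)
 * ~= 1.9961, s = 511, and the estimate returned is 511/256 = 1.99609375,
 * an 8-bit approximation of the exact 1/sqrt(0.25) = 2.0.
 */
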
// This function is only intended for use in Neon instructions because
// it ignores certain bits in the FPSCR.
float
fprSqrtEstimate(FPSCR &fpscr, float op)
{
    const uint32_t qnan = 0x7fc00000;
    float junk = 0.0;
    int fpClass = std::fpclassify(op);
    if (fpClass == FP_NAN) {
        if ((fpToBits(op) & qnan) != qnan)
            fpscr.ioc = 1;
        return bitsToFp(qnan, junk);
    } else if (fpClass == FP_ZERO) {
        fpscr.dzc = 1;
        // Return infinity with the same sign as the operand.
        return bitsToFp((std::signbit(op) << 31) |
                        (0xFF << 23) | (0 << 0), junk);
    } else if (std::signbit(op)) {
        // Set invalid op bit.
        fpscr.ioc = 1;
        return bitsToFp(qnan, junk);
    } else if (fpClass == FP_INFINITE) {
        return 0.0;
    } else {
        uint64_t opBits = fpToBits(op);
        double scaled;
        if (bits(opBits, 23)) {
            scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
                              (ULL(0x3fd) << 52) | (bits(opBits, 31) << 63),
                              (double)0.0);
        } else {
            scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
                              (ULL(0x3fe) << 52) | (bits(opBits, 31) << 63),
                              (double)0.0);
        }
        uint64_t resultExp = (380 - bits(opBits, 30, 23)) / 2;

        uint64_t estimate = fpToBits(recipSqrtEstimate(scaled));

        return bitsToFp((bits(estimate, 63) << 31) |
                        (bits(resultExp, 7, 0) << 23) |
                        (bits(estimate, 51, 29) << 0), junk);
    }
}

uint32_t
unsignedRSqrtEstimate(uint32_t op)
{
    if (bits(op, 31, 30) == 0) {
        return -1;
    } else {
        double dpOp;
        if (bits(op, 31)) {
            dpOp = bitsToFp((ULL(0) << 63) |
                            (ULL(0x3fe) << 52) |
                            (bits((uint64_t)op, 30, 0) << 21) |
                            (0 << 0), (double)0.0);
        } else {
            dpOp = bitsToFp((ULL(0) << 63) |
                            (ULL(0x3fd) << 52) |
                            (bits((uint64_t)op, 29, 0) << 22) |
                            (0 << 0), (double)0.0);
        }
        uint64_t estimate = fpToBits(recipSqrtEstimate(dpOp));
        return (1 << 31) | bits(estimate, 51, 21);
    }
}

// This function implements a magic formula taken from the architecture
// reference manual. It was originally called recip_estimate.
static double
recipEstimate(double a)
{
    int64_t q, s;
    double r;
    q = (int64_t)(a * 512.0);
    r = 1.0 / (((double)q + 0.5) / 512.0);
    s = (int64_t)(256.0 * r + 0.5);
    return (double)s / 256.0;
}

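/*
 * Worked example (illustration, not part of the original source): for
 * a = 0.5, q = 256, r = 512/256.5 ~= 1.9961, s = 511, so the estimate is
 * 511/256 = 1.99609375 against the exact reciprocal 2.0, again accurate
 * to about one part in 256.
 */
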
// This function is only intended for use in Neon instructions because
// it ignores certain bits in the FPSCR.
float
fpRecipEstimate(FPSCR &fpscr, float op)
{
    const uint32_t qnan = 0x7fc00000;
    float junk = 0.0;
    int fpClass = std::fpclassify(op);
    if (fpClass == FP_NAN) {
        if ((fpToBits(op) & qnan) != qnan)
            fpscr.ioc = 1;
        return bitsToFp(qnan, junk);
    } else if (fpClass == FP_INFINITE) {
        return bitsToFp(std::signbit(op) << 31, junk);
    } else if (fpClass == FP_ZERO) {
        fpscr.dzc = 1;
        // Return infinity with the same sign as the operand.
        return bitsToFp((std::signbit(op) << 31) |
                        (0xFF << 23) | (0 << 0), junk);
    } else if (fabs(op) >= pow(2.0, 126)) {
        fpscr.ufc = 1;
        return bitsToFp(std::signbit(op) << 31, junk);
    } else {
        uint64_t opBits = fpToBits(op);
        double scaled;
        scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
                          (ULL(0x3fe) << 52) | (ULL(0) << 63),
                          (double)0.0);
        uint64_t resultExp = 253 - bits(opBits, 30, 23);

        uint64_t estimate = fpToBits(recipEstimate(scaled));

        return bitsToFp((bits(opBits, 31) << 31) |
                        (bits(resultExp, 7, 0) << 23) |
                        (bits(estimate, 51, 29) << 0), junk);
    }
}

uint32_t
unsignedRecipEstimate(uint32_t op)
{
    if (bits(op, 31) == 0) {
        return -1;
    } else {
        double dpOp;
        dpOp = bitsToFp((ULL(0) << 63) |
                        (ULL(0x3fe) << 52) |
                        (bits((uint64_t)op, 30, 0) << 21) |
                        (0 << 0), (double)0.0);
        uint64_t estimate = fpToBits(recipEstimate(dpOp));
        return (1 << 31) | bits(estimate, 51, 21);
    }
}

template <class fpType>
fpType
FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
                  fpType op1, fpType op2) const
{
    done = true;
    fpType junk = 0.0;
    fpType dest = 0.0;
    const bool single = (sizeof(fpType) == sizeof(float));
    const uint64_t qnan =
        single ? 0x7fc00000 : ULL(0x7ff8000000000000);
    const bool nan1 = std::isnan(op1);
    const bool nan2 = std::isnan(op2);
    const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
    const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
    if (nan1 || nan2) {
        if (defaultNan) {
            dest = bitsToFp(qnan, junk);
        } else if (signal1) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            dest = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (nan1) {
            dest = op1;
        } else if (nan2) {
            dest = op2;
        }
        if (signal1 || signal2) {
            fpscr.ioc = 1;
        }
    } else {
        done = false;
    }
    return dest;
}

template
float FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
                        float op1, float op2) const;
template
double FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
                         double op1, double op2) const;

// @TODO remove this function when we've finished switching all FMA code to
// use the new FPLIB
template <class fpType>
fpType
FpOp::ternaryOp(FPSCR &fpscr, fpType op1, fpType op2, fpType op3,
                fpType (*func)(fpType, fpType, fpType),
                bool flush, bool defaultNan, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    if (flush && (flushToZero(op1, op2) || flushToZero(op3)))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3), "=m" (state)
                             : "m" (op1), "m" (op2), "m" (op3), "m" (state));
    fpType dest = func(op1, op2, op3);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

    int fpClass = std::fpclassify(dest);
    // Get NAN behavior right. This varies between x86 and ARM.
    if (fpClass == FP_NAN) {
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan1 = std::isnan(op1);
        const bool nan2 = std::isnan(op2);
        const bool nan3 = std::isnan(op3);
        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
        const bool signal3 = nan3 && ((fpToBits(op3) & qnan) != qnan);
        if ((!nan1 && !nan2 && !nan3) || (defaultNan == 1)) {
            dest = bitsToFp(qnan, junk);
        } else if (signal1) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            dest = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (signal3) {
            dest = bitsToFp(fpToBits(op3) | qnan, junk);
        } else if (nan1) {
            dest = op1;
        } else if (nan2) {
            dest = op2;
        } else if (nan3) {
            dest = op3;
        }
    } else if (flush && flushToZero(dest)) {
        feraiseexcept(FeUnderflow);
    } else if ((
                (single && (dest == bitsToFp(0x00800000, junk) ||
                            dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                 (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                  dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before- rounding
         * in ARM and -after- rounding in x86.
         */
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3)
                                 : "m" (op1), "m" (op2), "m" (op3));
        fpType temp = func(op1, op2, op3);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    finishVfp(fpscr, state, flush);
    return dest;
}

template
float FpOp::ternaryOp(FPSCR &fpscr, float op1, float op2, float op3,
                      float (*func)(float, float, float),
                      bool flush, bool defaultNan, uint32_t rMode) const;
template
double FpOp::ternaryOp(FPSCR &fpscr, double op1, double op2, double op3,
                       double (*func)(double, double, double),
                       bool flush, bool defaultNan, uint32_t rMode) const;

template <class fpType>
fpType
FpOp::binaryOp(FPSCR &fpscr, fpType op1, fpType op2,
               fpType (*func)(fpType, fpType),
               bool flush, bool defaultNan, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    if (flush && flushToZero(op1, op2))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (state)
                             : "m" (op1), "m" (op2), "m" (state));
    fpType dest = func(op1, op2);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

    // Get NAN behavior right. This varies between x86 and ARM.
    if (std::isnan(dest)) {
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan1 = std::isnan(op1);
        const bool nan2 = std::isnan(op2);
        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
        if ((!nan1 && !nan2) || (defaultNan == 1)) {
            dest = bitsToFp(qnan, junk);
        } else if (signal1) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            dest = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (nan1) {
            dest = op1;
        } else if (nan2) {
            dest = op2;
        }
    } else if (flush && flushToZero(dest)) {
        feraiseexcept(FeUnderflow);
    } else if ((
                (single && (dest == bitsToFp(0x00800000, junk) ||
                            dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                 (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                  dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before- rounding
         * in ARM and -after- rounding in x86.
         */
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2)
                                 : "m" (op1), "m" (op2));
        fpType temp = func(op1, op2);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    finishVfp(fpscr, state, flush);
    return dest;
}

template
float FpOp::binaryOp(FPSCR &fpscr, float op1, float op2,
                     float (*func)(float, float),
                     bool flush, bool defaultNan, uint32_t rMode) const;
template
double FpOp::binaryOp(FPSCR &fpscr, double op1, double op2,
                      double (*func)(double, double),
                      bool flush, bool defaultNan, uint32_t rMode) const;

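/*
 * Usage sketch (illustration, not part of the original source): the ISA
 * description supplies the raw operation as a plain function pointer, so
 * inside an FpOp instruction's execute() an add might look like the
 * following, with fpAddS a hypothetical helper:
 *
 *     static float fpAddS(float a, float b) { return a + b; }
 *     ...
 *     float dest = binaryOp(fpscr, op1, op2, fpAddS,
 *                           fpscr.fz, fpscr.dn, fpscr.rMode);
 */
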
template <class fpType>
fpType
FpOp::unaryOp(FPSCR &fpscr, fpType op1, fpType (*func)(fpType),
              bool flush, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    if (flush && flushToZero(op1))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (state)
                             : "m" (op1), "m" (state));
    fpType dest = func(op1);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

    // Get NAN behavior right. This varies between x86 and ARM.
    if (std::isnan(dest)) {
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan = std::isnan(op1);
        if (!nan || fpscr.dn == 1) {
            dest = bitsToFp(qnan, junk);
        } else if (nan) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        }
    } else if (flush && flushToZero(dest)) {
        feraiseexcept(FeUnderflow);
    } else if ((
                (single && (dest == bitsToFp(0x00800000, junk) ||
                            dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                 (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                  dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before- rounding
         * in ARM and -after- rounding in x86.
         */
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1) : "m" (op1));
        fpType temp = func(op1);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    finishVfp(fpscr, state, flush);
    return dest;
}

template
float FpOp::unaryOp(FPSCR &fpscr, float op1, float (*func)(float),
                    bool flush, uint32_t rMode) const;
template
double FpOp::unaryOp(FPSCR &fpscr, double op1, double (*func)(double),
                     bool flush, uint32_t rMode) const;

IntRegIndex
VfpMacroOp::addStride(IntRegIndex idx, unsigned stride)
{
    if (wide) {
        stride *= 2;
    }
    unsigned offset = idx % 8;
    idx = (IntRegIndex)(idx - offset);
    offset += stride;
    idx = (IntRegIndex)(idx + (offset % 8));
    return idx;
}

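/*
 * Worked example (illustration, not part of the original source):
 * addStride() wraps within a bank of eight registers. For idx = 10 and
 * stride = 2 with wide false: offset = 10 % 8 = 2, the bank base is 8,
 * offset grows to 4, and the result is 8 + (4 % 8) = 12. With wide true
 * the stride doubles to 4 and the same inputs give 8 + (6 % 8) = 14.
 */
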
void
VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1, IntRegIndex &op2)
{
    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
    assert(!inScalarBank(dest));
    dest = addStride(dest, stride);
    op1 = addStride(op1, stride);
    if (!inScalarBank(op2)) {
        op2 = addStride(op2, stride);
    }
}

void
VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1)
{
    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
    assert(!inScalarBank(dest));
    dest = addStride(dest, stride);
    if (!inScalarBank(op1)) {
        op1 = addStride(op1, stride);
    }
}

void
VfpMacroOp::nextIdxs(IntRegIndex &dest)
{
    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
    assert(!inScalarBank(dest));
    dest = addStride(dest, stride);
}

}
165 ccprintf(ss, ", #%d", imm);
166 return ss.str();
167}
168
169namespace ArmISA
170{
171
172VfpSavedState
173prepFpState(uint32_t rMode)
174{
175 int roundingMode = fegetround();
176 feclearexcept(FeAllExceptions);
177 switch (rMode) {
178 case VfpRoundNearest:
179 fesetround(FeRoundNearest);
180 break;
181 case VfpRoundUpward:
182 fesetround(FeRoundUpward);
183 break;
184 case VfpRoundDown:
185 fesetround(FeRoundDown);
186 break;
187 case VfpRoundZero:
188 fesetround(FeRoundZero);
189 break;
190 }
191 return roundingMode;
192}
193
194void
195finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush, FPSCR mask)
196{
197 int exceptions = fetestexcept(FeAllExceptions);
198 bool underflow = false;
199 if ((exceptions & FeInvalid) && mask.ioc) {
200 fpscr.ioc = 1;
201 }
202 if ((exceptions & FeDivByZero) && mask.dzc) {
203 fpscr.dzc = 1;
204 }
205 if ((exceptions & FeOverflow) && mask.ofc) {
206 fpscr.ofc = 1;
207 }
208 if (exceptions & FeUnderflow) {
209 underflow = true;
210 if (mask.ufc)
211 fpscr.ufc = 1;
212 }
213 if ((exceptions & FeInexact) && !(underflow && flush) && mask.ixc) {
214 fpscr.ixc = 1;
215 }
216 fesetround(state);
217}
218
219template <class fpType>
220fpType
221fixDest(bool flush, bool defaultNan, fpType val, fpType op1)
222{
223 int fpClass = std::fpclassify(val);
224 fpType junk = 0.0;
225 if (fpClass == FP_NAN) {
226 const bool single = (sizeof(val) == sizeof(float));
227 const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
228 const bool nan = std::isnan(op1);
229 if (!nan || defaultNan) {
230 val = bitsToFp(qnan, junk);
231 } else if (nan) {
232 val = bitsToFp(fpToBits(op1) | qnan, junk);
233 }
234 } else if (fpClass == FP_SUBNORMAL && flush == 1) {
235 // Turn val into a zero with the correct sign;
236 uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1);
237 val = bitsToFp(fpToBits(val) & bitMask, junk);
238 feclearexcept(FeInexact);
239 feraiseexcept(FeUnderflow);
240 }
241 return val;
242}
243
244template
245float fixDest<float>(bool flush, bool defaultNan, float val, float op1);
246template
247double fixDest<double>(bool flush, bool defaultNan, double val, double op1);
248
249template <class fpType>
250fpType
251fixDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2)
252{
253 int fpClass = std::fpclassify(val);
254 fpType junk = 0.0;
255 if (fpClass == FP_NAN) {
256 const bool single = (sizeof(val) == sizeof(float));
257 const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
258 const bool nan1 = std::isnan(op1);
259 const bool nan2 = std::isnan(op2);
260 const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
261 const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
262 if ((!nan1 && !nan2) || defaultNan) {
263 val = bitsToFp(qnan, junk);
264 } else if (signal1) {
265 val = bitsToFp(fpToBits(op1) | qnan, junk);
266 } else if (signal2) {
267 val = bitsToFp(fpToBits(op2) | qnan, junk);
268 } else if (nan1) {
269 val = op1;
270 } else if (nan2) {
271 val = op2;
272 }
273 } else if (fpClass == FP_SUBNORMAL && flush) {
274 // Turn val into a zero with the correct sign;
275 uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1);
276 val = bitsToFp(fpToBits(val) & bitMask, junk);
277 feclearexcept(FeInexact);
278 feraiseexcept(FeUnderflow);
279 }
280 return val;
281}
282
283template
284float fixDest<float>(bool flush, bool defaultNan,
285 float val, float op1, float op2);
286template
287double fixDest<double>(bool flush, bool defaultNan,
288 double val, double op1, double op2);
289
290template <class fpType>
291fpType
292fixDivDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2)
293{
294 fpType mid = fixDest(flush, defaultNan, val, op1, op2);
295 const bool single = (sizeof(fpType) == sizeof(float));
296 const fpType junk = 0.0;
297 if ((single && (val == bitsToFp(0x00800000, junk) ||
298 val == bitsToFp(0x80800000, junk))) ||
299 (!single && (val == bitsToFp(ULL(0x0010000000000000), junk) ||
300 val == bitsToFp(ULL(0x8010000000000000), junk)))
301 ) {
302 __asm__ __volatile__("" : "=m" (op1) : "m" (op1));
303 fesetround(FeRoundZero);
304 fpType temp = 0.0;
305 __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
306 temp = op1 / op2;
307 if (flushToZero(temp)) {
308 feraiseexcept(FeUnderflow);
309 if (flush) {
310 feclearexcept(FeInexact);
311 mid = temp;
312 }
313 }
314 __asm__ __volatile__("" :: "m" (temp));
315 }
316 return mid;
317}
318
319template
320float fixDivDest<float>(bool flush, bool defaultNan,
321 float val, float op1, float op2);
322template
323double fixDivDest<double>(bool flush, bool defaultNan,
324 double val, double op1, double op2);
325
326float
327fixFpDFpSDest(FPSCR fpscr, double val)
328{
329 const float junk = 0.0;
330 float op1 = 0.0;
331 if (std::isnan(val)) {
332 uint64_t valBits = fpToBits(val);
333 uint32_t op1Bits = bits(valBits, 50, 29) |
334 (mask(9) << 22) |
335 (bits(valBits, 63) << 31);
336 op1 = bitsToFp(op1Bits, junk);
337 }
338 float mid = fixDest(fpscr.fz, fpscr.dn, (float)val, op1);
339 if (fpscr.fz && fetestexcept(FeUnderflow | FeInexact) ==
340 (FeUnderflow | FeInexact)) {
341 feclearexcept(FeInexact);
342 }
343 if (mid == bitsToFp(0x00800000, junk) ||
344 mid == bitsToFp(0x80800000, junk)) {
345 __asm__ __volatile__("" : "=m" (val) : "m" (val));
346 fesetround(FeRoundZero);
347 float temp = 0.0;
348 __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
349 temp = val;
350 if (flushToZero(temp)) {
351 feraiseexcept(FeUnderflow);
352 if (fpscr.fz) {
353 feclearexcept(FeInexact);
354 mid = temp;
355 }
356 }
357 __asm__ __volatile__("" :: "m" (temp));
358 }
359 return mid;
360}
361
362double
363fixFpSFpDDest(FPSCR fpscr, float val)
364{
365 const double junk = 0.0;
366 double op1 = 0.0;
367 if (std::isnan(val)) {
368 uint32_t valBits = fpToBits(val);
369 uint64_t op1Bits = ((uint64_t)bits(valBits, 21, 0) << 29) |
370 (mask(12) << 51) |
371 ((uint64_t)bits(valBits, 31) << 63);
372 op1 = bitsToFp(op1Bits, junk);
373 }
374 double mid = fixDest(fpscr.fz, fpscr.dn, (double)val, op1);
375 if (mid == bitsToFp(ULL(0x0010000000000000), junk) ||
376 mid == bitsToFp(ULL(0x8010000000000000), junk)) {
377 __asm__ __volatile__("" : "=m" (val) : "m" (val));
378 fesetround(FeRoundZero);
379 double temp = 0.0;
380 __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
381 temp = val;
382 if (flushToZero(temp)) {
383 feraiseexcept(FeUnderflow);
384 if (fpscr.fz) {
385 feclearexcept(FeInexact);
386 mid = temp;
387 }
388 }
389 __asm__ __volatile__("" :: "m" (temp));
390 }
391 return mid;
392}
393
394static inline uint16_t
395vcvtFpFpH(FPSCR &fpscr, bool flush, bool defaultNan,
396 uint32_t rMode, bool ahp, uint64_t opBits, bool isDouble)
397{
398 uint32_t mWidth;
399 uint32_t eWidth;
400 uint32_t eHalfRange;
401 uint32_t sBitPos;
402
403 if (isDouble) {
404 mWidth = 52;
405 eWidth = 11;
406 } else {
407 mWidth = 23;
408 eWidth = 8;
409 }
410 sBitPos = eWidth + mWidth;
411 eHalfRange = (1 << (eWidth-1)) - 1;
412
413 // Extract the operand.
414 bool neg = bits(opBits, sBitPos);
415 uint32_t exponent = bits(opBits, sBitPos-1, mWidth);
416 uint64_t oldMantissa = bits(opBits, mWidth-1, 0);
417 uint32_t mantissa = oldMantissa >> (mWidth - 10);
418 // Do the conversion.
419 uint64_t extra = oldMantissa & mask(mWidth - 10);
420 if (exponent == mask(eWidth)) {
421 if (oldMantissa != 0) {
422 // Nans.
423 if (bits(mantissa, 9) == 0) {
424 // Signalling nan.
425 fpscr.ioc = 1;
426 }
427 if (ahp) {
428 mantissa = 0;
429 exponent = 0;
430 fpscr.ioc = 1;
431 } else if (defaultNan) {
432 mantissa = (1 << 9);
433 exponent = 0x1f;
434 neg = false;
435 } else {
436 exponent = 0x1f;
437 mantissa |= (1 << 9);
438 }
439 } else {
440 // Infinities.
441 exponent = 0x1F;
442 if (ahp) {
443 fpscr.ioc = 1;
444 mantissa = 0x3ff;
445 } else {
446 mantissa = 0;
447 }
448 }
449 } else if (exponent == 0 && oldMantissa == 0) {
450 // Zero, don't need to do anything.
451 } else {
452 // Normalized or denormalized numbers.
453
454 bool inexact = (extra != 0);
455
456 if (exponent == 0) {
457 // Denormalized.
458 // If flush to zero is on, this shouldn't happen.
459 assert(!flush);
460
461 // Check for underflow
462 if (inexact || fpscr.ufe)
463 fpscr.ufc = 1;
464
465 // Handle rounding.
466 unsigned mode = rMode;
467 if ((mode == VfpRoundUpward && !neg && extra) ||
468 (mode == VfpRoundDown && neg && extra) ||
469 (mode == VfpRoundNearest &&
470 (extra > (1 << 9) ||
471 (extra == (1 << 9) && bits(mantissa, 0))))) {
472 mantissa++;
473 }
474
475 // See if the number became normalized after rounding.
476 if (mantissa == (1 << 10)) {
477 mantissa = 0;
478 exponent = 1;
479 }
480 } else {
481 // Normalized.
482
483 // We need to track the dropped bits differently since
484 // more can be dropped by denormalizing.
485 bool topOne = bits(extra, mWidth - 10 - 1);
486 bool restZeros = bits(extra, mWidth - 10 - 2, 0) == 0;
487
488 if (exponent <= (eHalfRange - 15)) {
489 // The result is too small. Denormalize.
490 mantissa |= (1 << 10);
491 while (mantissa && exponent <= (eHalfRange - 15)) {
492 restZeros = restZeros && !topOne;
493 topOne = bits(mantissa, 0);
494 mantissa = mantissa >> 1;
495 exponent++;
496 }
497 if (topOne || !restZeros)
498 inexact = true;
499 exponent = 0;
500 } else {
501 // Change bias.
502 exponent -= (eHalfRange - 15);
503 }
504
505 if (exponent == 0 && (inexact || fpscr.ufe)) {
506 // Underflow
507 fpscr.ufc = 1;
508 }
509
510 // Handle rounding.
511 unsigned mode = rMode;
512 bool nonZero = topOne || !restZeros;
513 if ((mode == VfpRoundUpward && !neg && nonZero) ||
514 (mode == VfpRoundDown && neg && nonZero) ||
515 (mode == VfpRoundNearest && topOne &&
516 (!restZeros || bits(mantissa, 0)))) {
517 mantissa++;
518 }
519
520 // See if we rounded up and need to bump the exponent.
521 if (mantissa == (1 << 10)) {
522 mantissa = 0;
523 exponent++;
524 }
525
526 // Deal with overflow
527 if (ahp) {
528 if (exponent >= 0x20) {
529 exponent = 0x1f;
530 mantissa = 0x3ff;
531 fpscr.ioc = 1;
532 // Supress inexact exception.
533 inexact = false;
534 }
535 } else {
536 if (exponent >= 0x1f) {
537 if ((mode == VfpRoundNearest) ||
538 (mode == VfpRoundUpward && !neg) ||
539 (mode == VfpRoundDown && neg)) {
540 // Overflow to infinity.
541 exponent = 0x1f;
542 mantissa = 0;
543 } else {
544 // Overflow to max normal.
545 exponent = 0x1e;
546 mantissa = 0x3ff;
547 }
548 fpscr.ofc = 1;
549 inexact = true;
550 }
551 }
552 }
553
554 if (inexact) {
555 fpscr.ixc = 1;
556 }
557 }
558 // Reassemble and install the result.
559 uint32_t result = bits(mantissa, 9, 0);
560 replaceBits(result, 14, 10, exponent);
561 if (neg)
562 result |= (1 << 15);
563 return result;
564}
565
566uint16_t
567vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan,
568 uint32_t rMode, bool ahp, float op)
569{
570 uint64_t opBits = fpToBits(op);
571 return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp, opBits, false);
572}
573
574uint16_t
575vcvtFpDFpH(FPSCR &fpscr, bool flush, bool defaultNan,
576 uint32_t rMode, bool ahp, double op)
577{
578 uint64_t opBits = fpToBits(op);
579 return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp, opBits, true);
580}
581
582static inline uint64_t
583vcvtFpHFp(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op, bool isDouble)
584{
585 uint32_t mWidth;
586 uint32_t eWidth;
587 uint32_t eHalfRange;
588 uint32_t sBitPos;
589
590 if (isDouble) {
591 mWidth = 52;
592 eWidth = 11;
593 } else {
594 mWidth = 23;
595 eWidth = 8;
596 }
597 sBitPos = eWidth + mWidth;
598 eHalfRange = (1 << (eWidth-1)) - 1;
599
600 // Extract the bitfields.
601 bool neg = bits(op, 15);
602 uint32_t exponent = bits(op, 14, 10);
603 uint64_t mantissa = bits(op, 9, 0);
604 // Do the conversion.
605 if (exponent == 0) {
606 if (mantissa != 0) {
607 // Normalize the value.
608 exponent = exponent + (eHalfRange - 15) + 1;
609 while (mantissa < (1 << 10)) {
610 mantissa = mantissa << 1;
611 exponent--;
612 }
613 }
614 mantissa = mantissa << (mWidth - 10);
615 } else if (exponent == 0x1f && !ahp) {
616 // Infinities and nans.
617 exponent = mask(eWidth);
618 if (mantissa != 0) {
619 // Nans.
620 mantissa = mantissa << (mWidth - 10);
621 if (bits(mantissa, mWidth-1) == 0) {
622 // Signalling nan.
623 fpscr.ioc = 1;
624 mantissa |= (((uint64_t) 1) << (mWidth-1));
625 }
626 if (defaultNan) {
627 mantissa &= ~mask(mWidth-1);
628 neg = false;
629 }
630 }
631 } else {
632 exponent = exponent + (eHalfRange - 15);
633 mantissa = mantissa << (mWidth - 10);
634 }
635 // Reassemble the result.
636 uint64_t result = bits(mantissa, mWidth-1, 0);
637 replaceBits(result, sBitPos-1, mWidth, exponent);
638 if (neg) {
639 result |= (((uint64_t) 1) << sBitPos);
640 }
641 return result;
642}
643
644double
645vcvtFpHFpD(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op)
646{
647 double junk = 0.0;
648 uint64_t result;
649
650 result = vcvtFpHFp(fpscr, defaultNan, ahp, op, true);
651 return bitsToFp(result, junk);
652}
653
654float
655vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op)
656{
657 float junk = 0.0;
658 uint64_t result;
659
660 result = vcvtFpHFp(fpscr, defaultNan, ahp, op, false);
661 return bitsToFp(result, junk);
662}
663
664float
665vfpUFixedToFpS(bool flush, bool defaultNan,
666 uint64_t val, uint8_t width, uint8_t imm)
667{
668 fesetround(FeRoundNearest);
669 if (width == 16)
670 val = (uint16_t)val;
671 else if (width == 32)
672 val = (uint32_t)val;
673 else if (width != 64)
674 panic("Unsupported width %d", width);
675 float scale = powf(2.0, imm);
676 __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
677 feclearexcept(FeAllExceptions);
678 __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
679 return fixDivDest(flush, defaultNan, val / scale, (float)val, scale);
680}
681
682float
683vfpSFixedToFpS(bool flush, bool defaultNan,
684 int64_t val, uint8_t width, uint8_t imm)
685{
686 fesetround(FeRoundNearest);
687 if (width == 16)
688 val = sext<16>(val & mask(16));
689 else if (width == 32)
690 val = sext<32>(val & mask(32));
691 else if (width != 64)
692 panic("Unsupported width %d", width);
693
694 float scale = powf(2.0, imm);
695 __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
696 feclearexcept(FeAllExceptions);
697 __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
698 return fixDivDest(flush, defaultNan, val / scale, (float)val, scale);
699}
700
701
702double
703vfpUFixedToFpD(bool flush, bool defaultNan,
704 uint64_t val, uint8_t width, uint8_t imm)
705{
706 fesetround(FeRoundNearest);
707 if (width == 16)
708 val = (uint16_t)val;
709 else if (width == 32)
710 val = (uint32_t)val;
711 else if (width != 64)
712 panic("Unsupported width %d", width);
713
714 double scale = pow(2.0, imm);
715 __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
716 feclearexcept(FeAllExceptions);
717 __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
718 return fixDivDest(flush, defaultNan, val / scale, (double)val, scale);
719}
720
721double
722vfpSFixedToFpD(bool flush, bool defaultNan,
723 int64_t val, uint8_t width, uint8_t imm)
724{
725 fesetround(FeRoundNearest);
726 if (width == 16)
727 val = sext<16>(val & mask(16));
728 else if (width == 32)
729 val = sext<32>(val & mask(32));
730 else if (width != 64)
731 panic("Unsupported width %d", width);
732
733 double scale = pow(2.0, imm);
734 __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
735 feclearexcept(FeAllExceptions);
736 __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
737 return fixDivDest(flush, defaultNan, val / scale, (double)val, scale);
738}
739
740// This function implements a magic formula taken from the architecture
741// reference manual. It was originally called recip_sqrt_estimate.
742static double
743recipSqrtEstimate(double a)
744{
745 int64_t q0, q1, s;
746 double r;
747 if (a < 0.5) {
748 q0 = (int64_t)(a * 512.0);
749 r = 1.0 / sqrt(((double)q0 + 0.5) / 512.0);
750 } else {
751 q1 = (int64_t)(a * 256.0);
752 r = 1.0 / sqrt(((double)q1 + 0.5) / 256.0);
753 }
754 s = (int64_t)(256.0 * r + 0.5);
755 return (double)s / 256.0;
756}

// This function is only intended for use in Neon instructions because
// it ignores certain bits in the FPSCR.
float
fprSqrtEstimate(FPSCR &fpscr, float op)
{
    const uint32_t qnan = 0x7fc00000;
    float junk = 0.0;
    int fpClass = std::fpclassify(op);
    if (fpClass == FP_NAN) {
        if ((fpToBits(op) & qnan) != qnan)
            fpscr.ioc = 1;
        return bitsToFp(qnan, junk);
    } else if (fpClass == FP_ZERO) {
        fpscr.dzc = 1;
        // Return infinity with the same sign as the operand.
        return bitsToFp(((uint32_t)std::signbit(op) << 31) |
                        (0xFF << 23) | (0 << 0), junk);
    } else if (std::signbit(op)) {
        // Set invalid op bit.
        fpscr.ioc = 1;
        return bitsToFp(qnan, junk);
    } else if (fpClass == FP_INFINITE) {
        return 0.0;
    } else {
        uint64_t opBits = fpToBits(op);
        double scaled;
        if (bits(opBits, 23)) {
            scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
                              (ULL(0x3fd) << 52) | (bits(opBits, 31) << 63),
                              (double)0.0);
        } else {
            scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
                              (ULL(0x3fe) << 52) | (bits(opBits, 31) << 63),
                              (double)0.0);
        }
        uint64_t resultExp = (380 - bits(opBits, 30, 23)) / 2;

        uint64_t estimate = fpToBits(recipSqrtEstimate(scaled));

        return bitsToFp((bits(estimate, 63) << 31) |
                        (bits(resultExp, 7, 0) << 23) |
                        (bits(estimate, 51, 29) << 0), junk);
    }
}
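
// Worked example of the bit manipulation above (editorial): for
// op == 4.0f (biased exponent 129, fraction 0), the exponent's low bit
// selects the 0x3fd double exponent, so scaled == 0.25.
// recipSqrtEstimate(0.25) == 1.99609375 and
// resultExp == (380 - 129) / 2 == 125, so the packed result is
// 1.99609375 * 2^(125 - 127) == 0.4990234375 ~= 1/sqrt(4.0).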

uint32_t
unsignedRSqrtEstimate(uint32_t op)
{
    if (bits(op, 31, 30) == 0) {
        return -1;
    } else {
        double dpOp;
        if (bits(op, 31)) {
            dpOp = bitsToFp((ULL(0) << 63) |
                            (ULL(0x3fe) << 52) |
                            (bits((uint64_t)op, 30, 0) << 21) |
                            (0 << 0), (double)0.0);
        } else {
            dpOp = bitsToFp((ULL(0) << 63) |
                            (ULL(0x3fd) << 52) |
                            (bits((uint64_t)op, 29, 0) << 22) |
                            (0 << 0), (double)0.0);
        }
        uint64_t estimate = fpToBits(recipSqrtEstimate(dpOp));
        return (1U << 31) | bits(estimate, 51, 21);
    }
}
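
// Example (editorial): unsignedRSqrtEstimate(0x80000000) maps the input
// to dpOp == 0.5, computes recipSqrtEstimate(0.5) == 1.41015625, and
// packs the top 31 fraction bits under a set bit 31, returning
// 0xB4800000 (1.41015625, reading bit 31 as the integer bit). Inputs
// below 0x40000000 saturate to all ones.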

// This function implements a magic formula taken from the architecture
// reference manual. It was originally called recip_estimate.
static double
recipEstimate(double a)
{
    int64_t q, s;
    double r;
    q = (int64_t)(a * 512.0);
    r = 1.0 / (((double)q + 0.5) / 512.0);
    s = (int64_t)(256.0 * r + 0.5);
    return (double)s / 256.0;
}
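
// Worked example (editorial): for a == 0.5, q == 256,
// r == 512/256.5 ~= 1.99610, and s == 511, so the function returns
// 511/256 == 1.99609375 as the estimate of 1/0.5.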

// This function is only intended for use in Neon instructions because
// it ignores certain bits in the FPSCR.
float
fpRecipEstimate(FPSCR &fpscr, float op)
{
    const uint32_t qnan = 0x7fc00000;
    float junk = 0.0;
    int fpClass = std::fpclassify(op);
    if (fpClass == FP_NAN) {
        if ((fpToBits(op) & qnan) != qnan)
            fpscr.ioc = 1;
        return bitsToFp(qnan, junk);
    } else if (fpClass == FP_INFINITE) {
        return bitsToFp((uint32_t)std::signbit(op) << 31, junk);
    } else if (fpClass == FP_ZERO) {
        fpscr.dzc = 1;
        // Return infinity with the same sign as the operand.
        return bitsToFp(((uint32_t)std::signbit(op) << 31) |
                        (0xFF << 23) | (0 << 0), junk);
    } else if (fabs(op) >= pow(2.0, 126)) {
        fpscr.ufc = 1;
        return bitsToFp((uint32_t)std::signbit(op) << 31, junk);
    } else {
        uint64_t opBits = fpToBits(op);
        double scaled;
        scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
                          (ULL(0x3fe) << 52) | (ULL(0) << 63),
                          (double)0.0);
        uint64_t resultExp = 253 - bits(opBits, 30, 23);

        uint64_t estimate = fpToBits(recipEstimate(scaled));

        return bitsToFp((bits(opBits, 31) << 31) |
                        (bits(resultExp, 7, 0) << 23) |
                        (bits(estimate, 51, 29) << 0), junk);
    }
}
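
// Worked example (editorial): for op == 2.0f (biased exponent 128,
// fraction 0), scaled == 0.5, recipEstimate(0.5) == 1.99609375, and
// resultExp == 253 - 128 == 125, so the packed result is
// 1.99609375 * 2^(125 - 127) == 0.4990234375 ~= 1/2.0.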

uint32_t
unsignedRecipEstimate(uint32_t op)
{
    if (bits(op, 31) == 0) {
        return -1;
    } else {
        double dpOp;
        dpOp = bitsToFp((ULL(0) << 63) |
                        (ULL(0x3fe) << 52) |
                        (bits((uint64_t)op, 30, 0) << 21) |
                        (0 << 0), (double)0.0);
        uint64_t estimate = fpToBits(recipEstimate(dpOp));
        return (1U << 31) | bits(estimate, 51, 21);
    }
}
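
// Example (editorial): unsignedRecipEstimate(0x80000000) maps the input
// to dpOp == 0.5, estimates 1/0.5 as 1.99609375, and returns 0xFF800000
// (1.99609375, reading bit 31 as the integer bit). Inputs below
// 0x80000000 saturate to all ones.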

template <class fpType>
fpType
FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
                  fpType op1, fpType op2) const
{
    done = true;
    fpType junk = 0.0;
    fpType dest = 0.0;
    const bool single = (sizeof(fpType) == sizeof(float));
    const uint64_t qnan =
        single ? 0x7fc00000 : ULL(0x7ff8000000000000);
    const bool nan1 = std::isnan(op1);
    const bool nan2 = std::isnan(op2);
    const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
    const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
    if (nan1 || nan2) {
        if (defaultNan) {
            dest = bitsToFp(qnan, junk);
        } else if (signal1) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            dest = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (nan1) {
            dest = op1;
        } else if (nan2) {
            dest = op2;
        }
        if (signal1 || signal2) {
            fpscr.ioc = 1;
        }
    } else {
        done = false;
    }
    return dest;
}

template
float FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
                        float op1, float op2) const;
template
double FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
                         double op1, double op2) const;
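
// Example of the propagation priority above (editorial): with
// defaultNan == false, op1 a quiet NaN (0x7fc00001) and op2 a signaling
// NaN (0x7f800001), the signaling operand wins; the result is op2
// quieted (0x7fc00001) and fpscr.ioc is set. With defaultNan == true the
// result would be the default quiet NaN (0x7fc00000) instead.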

// @todo Remove this function when we've finished switching all the FMA
// code over to the new FPLIB.
template <class fpType>
fpType
FpOp::ternaryOp(FPSCR &fpscr, fpType op1, fpType op2, fpType op3,
                fpType (*func)(fpType, fpType, fpType),
                bool flush, bool defaultNan, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    if (flush && (flushToZero(op1, op2) || flushToZero(op3)))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3), "=m" (state)
                             : "m" (op1), "m" (op2), "m" (op3), "m" (state));
    fpType dest = func(op1, op2, op3);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

    int fpClass = std::fpclassify(dest);
    // Get the NaN behavior right. This varies between x86 and ARM.
    if (fpClass == FP_NAN) {
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan1 = std::isnan(op1);
        const bool nan2 = std::isnan(op2);
        const bool nan3 = std::isnan(op3);
        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
        const bool signal3 = nan3 && ((fpToBits(op3) & qnan) != qnan);
        if ((!nan1 && !nan2 && !nan3) || defaultNan) {
            dest = bitsToFp(qnan, junk);
        } else if (signal1) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            dest = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (signal3) {
            dest = bitsToFp(fpToBits(op3) | qnan, junk);
        } else if (nan1) {
            dest = op1;
        } else if (nan2) {
            dest = op2;
        } else if (nan3) {
            dest = op3;
        }
    } else if (flush && flushToZero(dest)) {
        feraiseexcept(FeUnderflow);
    } else if ((
                (single && (dest == bitsToFp(0x00800000, junk) ||
                            dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                 (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                  dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before- rounding
         * in ARM and -after- rounding in x86.
         */
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3)
                                 : "m" (op1), "m" (op2), "m" (op3));
        fpType temp = func(op1, op2, op3);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    finishVfp(fpscr, state, flush);
    return dest;
}

template
float FpOp::ternaryOp(FPSCR &fpscr, float op1, float op2, float op3,
                      float (*func)(float, float, float),
                      bool flush, bool defaultNan, uint32_t rMode) const;
template
double FpOp::ternaryOp(FPSCR &fpscr, double op1, double op2, double op3,
                       double (*func)(double, double, double),
                       bool flush, bool defaultNan, uint32_t rMode) const;

template <class fpType>
fpType
FpOp::binaryOp(FPSCR &fpscr, fpType op1, fpType op2,
               fpType (*func)(fpType, fpType),
               bool flush, bool defaultNan, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    if (flush && flushToZero(op1, op2))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (state)
                             : "m" (op1), "m" (op2), "m" (state));
    fpType dest = func(op1, op2);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

    // Get the NaN behavior right. This varies between x86 and ARM.
    if (std::isnan(dest)) {
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan1 = std::isnan(op1);
        const bool nan2 = std::isnan(op2);
        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
        if ((!nan1 && !nan2) || defaultNan) {
            dest = bitsToFp(qnan, junk);
        } else if (signal1) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            dest = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (nan1) {
            dest = op1;
        } else if (nan2) {
            dest = op2;
        }
    } else if (flush && flushToZero(dest)) {
        feraiseexcept(FeUnderflow);
    } else if ((
                (single && (dest == bitsToFp(0x00800000, junk) ||
                            dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                 (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                  dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before- rounding
         * in ARM and -after- rounding in x86.
         */
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2)
                                 : "m" (op1), "m" (op2));
        fpType temp = func(op1, op2);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    finishVfp(fpscr, state, flush);
    return dest;
}

template
float FpOp::binaryOp(FPSCR &fpscr, float op1, float op2,
                     float (*func)(float, float),
                     bool flush, bool defaultNan, uint32_t rMode) const;
template
double FpOp::binaryOp(FPSCR &fpscr, double op1, double op2,
                      double (*func)(double, double),
                      bool flush, bool defaultNan, uint32_t rMode) const;
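
// Typical use (an editorial sketch; `fpAdd` stands for any
// float (*)(float, float) helper, and the fz/dn/rMode FPSCR bitfields
// are assumed to exist as they do elsewhere in this file):
//   float dest = binaryOp(fpscr, op1, op2, fpAdd,
//                         fpscr.fz, fpscr.dn, fpscr.rMode);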

template <class fpType>
fpType
FpOp::unaryOp(FPSCR &fpscr, fpType op1, fpType (*func)(fpType),
              bool flush, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    if (flush && flushToZero(op1))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (state)
                             : "m" (op1), "m" (state));
    fpType dest = func(op1);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

    // Get the NaN behavior right. This varies between x86 and ARM.
    if (std::isnan(dest)) {
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan = std::isnan(op1);
        if (!nan || fpscr.dn == 1) {
            dest = bitsToFp(qnan, junk);
        } else {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        }
    } else if (flush && flushToZero(dest)) {
        feraiseexcept(FeUnderflow);
    } else if ((
                (single && (dest == bitsToFp(0x00800000, junk) ||
                            dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                 (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                  dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before- rounding
         * in ARM and -after- rounding in x86.
         */
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1) : "m" (op1));
        fpType temp = func(op1);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    finishVfp(fpscr, state, flush);
    return dest;
}

template
float FpOp::unaryOp(FPSCR &fpscr, float op1, float (*func)(float),
                    bool flush, uint32_t rMode) const;
template
double FpOp::unaryOp(FPSCR &fpscr, double op1, double (*func)(double),
                     bool flush, uint32_t rMode) const;

IntRegIndex
VfpMacroOp::addStride(IntRegIndex idx, unsigned stride)
{
    if (wide) {
        stride *= 2;
    }
    unsigned offset = idx % 8;
    idx = (IntRegIndex)(idx - offset);
    offset += stride;
    idx = (IntRegIndex)(idx + (offset % 8));
    return idx;
}
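
// Example (editorial): register indices advance within an 8-register
// bank and wrap at the bank boundary; with stride == 1, addStride takes
// index 6 to 7, and index 7 back to 0 of the same bank
// (offset 7 + 1 == 8 wraps to 0). With wide == true the stride is
// doubled so the indices step over register pairs.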

void
VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1, IntRegIndex &op2)
{
    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
    assert(!inScalarBank(dest));
    dest = addStride(dest, stride);
    op1 = addStride(op1, stride);
    if (!inScalarBank(op2)) {
        op2 = addStride(op2, stride);
    }
}

void
VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1)
{
    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
    assert(!inScalarBank(dest));
    dest = addStride(dest, stride);
    if (!inScalarBank(op1)) {
        op1 = addStride(op1, stride);
    }
}

void
VfpMacroOp::nextIdxs(IntRegIndex &dest)
{
    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
    assert(!inScalarBank(dest));
    dest = addStride(dest, stride);
}

}