vfp.cc (11671:520509f3e66c → 12104:edd63f9c6184)
/*
 * Copyright (c) 2010-2013 ARM Limited
 * All rights reserved
 *
 * The license below extends only to copyright in the software and shall
 * not be construed as granting a license to any other intellectual
 * property including but not limited to intellectual property relating
 * to a hardware implementation of the functionality of the software
 * licensed hereunder. You may use the software subject to the license
 * terms below provided that you ensure that this notice is replicated
 * unmodified and in its entirety in all distributions of the software,
 * modified or unmodified, in source code or in binary form.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Gabe Black
 */

#include "arch/arm/insts/vfp.hh"

/*
 * The asm statements below are to keep gcc from reordering code. Otherwise
 * the rounding mode might be set after the operation it was intended for, the
 * exception bits read before it, etc.
 */
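
/*
 * Illustration (not part of the original source): the barrier idiom used
 * throughout this file is an empty asm statement with a memory operand. It
 * forces gcc to treat the named variable as read and written in memory at
 * that point, so the computation cannot be hoisted above the fesetround()
 * call or sunk below the fetestexcept() call. A minimal sketch of the
 * pattern:
 *
 *     fesetround(FeRoundZero);
 *     __asm__ __volatile__("" : "=m" (op1) : "m" (op1));   // pin the input
 *     fpType temp = op1 / op2;       // runs under the new rounding mode
 *     __asm__ __volatile__("" : "=m" (temp) : "m" (temp)); // pin the result
 *     int excepts = fetestexcept(FeAllExceptions); // flags read afterwards
 */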

std::string
FpCondCompRegOp::generateDisassembly(
        Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss, "", false);
    printIntReg(ss, op1);
    ccprintf(ss, ", ");
    printIntReg(ss, op2);
    ccprintf(ss, ", #%d", defCc);
    ccprintf(ss, ", ");
    printCondition(ss, condCode, true);
    return ss.str();
}

std::string
FpCondSelOp::generateDisassembly(
        Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss, "", false);
    printIntReg(ss, dest);
    ccprintf(ss, ", ");
    printIntReg(ss, op1);
    ccprintf(ss, ", ");
    printIntReg(ss, op2);
    ccprintf(ss, ", ");
    printCondition(ss, condCode, true);
    return ss.str();
}

std::string
FpRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printFloatReg(ss, dest);
    ss << ", ";
    printFloatReg(ss, op1);
    return ss.str();
}

std::string
FpRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printFloatReg(ss, dest);
    ccprintf(ss, ", #%d", imm);
    return ss.str();
}

std::string
FpRegRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printFloatReg(ss, dest);
    ss << ", ";
    printFloatReg(ss, op1);
    ccprintf(ss, ", #%d", imm);
    return ss.str();
}

std::string
FpRegRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printFloatReg(ss, dest);
    ss << ", ";
    printFloatReg(ss, op1);
    ss << ", ";
    printFloatReg(ss, op2);
    return ss.str();
}

std::string
FpRegRegRegCondOp::generateDisassembly(Addr pc, const SymbolTable *symtab)
    const
{
    std::stringstream ss;
    printMnemonic(ss);
    printCondition(ss, cond);
    printFloatReg(ss, dest);
    ss << ", ";
    printFloatReg(ss, op1);
    ss << ", ";
    printFloatReg(ss, op2);
    return ss.str();
}

std::string
FpRegRegRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printFloatReg(ss, dest);
    ss << ", ";
    printFloatReg(ss, op1);
    ss << ", ";
    printFloatReg(ss, op2);
    ss << ", ";
    printFloatReg(ss, op3);
    return ss.str();
}

std::string
FpRegRegRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printFloatReg(ss, dest);
    ss << ", ";
    printFloatReg(ss, op1);
    ss << ", ";
    printFloatReg(ss, op2);
    ccprintf(ss, ", #%d", imm);
    return ss.str();
}

namespace ArmISA
{

VfpSavedState
prepFpState(uint32_t rMode)
{
    int roundingMode = fegetround();
    feclearexcept(FeAllExceptions);
    switch (rMode) {
      case VfpRoundNearest:
        fesetround(FeRoundNearest);
        break;
      case VfpRoundUpward:
        fesetround(FeRoundUpward);
        break;
      case VfpRoundDown:
        fesetround(FeRoundDown);
        break;
      case VfpRoundZero:
        fesetround(FeRoundZero);
        break;
    }
    return roundingMode;
}

void
finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush, FPSCR mask)
{
    int exceptions = fetestexcept(FeAllExceptions);
    bool underflow = false;
    if ((exceptions & FeInvalid) && mask.ioc) {
        fpscr.ioc = 1;
    }
    if ((exceptions & FeDivByZero) && mask.dzc) {
        fpscr.dzc = 1;
    }
    if ((exceptions & FeOverflow) && mask.ofc) {
        fpscr.ofc = 1;
    }
    if (exceptions & FeUnderflow) {
        underflow = true;
        if (mask.ufc)
            fpscr.ufc = 1;
    }
    if ((exceptions & FeInexact) && !(underflow && flush) && mask.ixc) {
        fpscr.ixc = 1;
    }
    fesetround(state);
}
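
/*
 * Usage sketch (illustration, not part of the original source): the FP
 * helpers below bracket the emulated operation between prepFpState() and
 * finishVfp(), so the host FPU briefly runs with the guest rounding mode
 * and the accrued host exception flags are folded into the guest FPSCR:
 *
 *     VfpSavedState state = prepFpState(rMode); // set guest rounding mode
 *     fpType dest = func(op1, op2);             // the emulated operation
 *     finishVfp(fpscr, state, flush);           // harvest flags, restore
 *                                               // the host rounding mode
 */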

template <class fpType>
fpType
fixDest(bool flush, bool defaultNan, fpType val, fpType op1)
{
    int fpClass = std::fpclassify(val);
    fpType junk = 0.0;
    if (fpClass == FP_NAN) {
        const bool single = (sizeof(val) == sizeof(float));
        const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan = std::isnan(op1);
        if (!nan || defaultNan) {
            val = bitsToFp(qnan, junk);
        } else if (nan) {
            val = bitsToFp(fpToBits(op1) | qnan, junk);
        }
    } else if (fpClass == FP_SUBNORMAL && flush == 1) {
        // Turn val into a zero with the correct sign.
        uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1);
        val = bitsToFp(fpToBits(val) & bitMask, junk);
        feclearexcept(FeInexact);
        feraiseexcept(FeUnderflow);
    }
    return val;
}

template
float fixDest<float>(bool flush, bool defaultNan, float val, float op1);
template
double fixDest<double>(bool flush, bool defaultNan, double val, double op1);

template <class fpType>
fpType
fixDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2)
{
    int fpClass = std::fpclassify(val);
    fpType junk = 0.0;
    if (fpClass == FP_NAN) {
        const bool single = (sizeof(val) == sizeof(float));
        const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan1 = std::isnan(op1);
        const bool nan2 = std::isnan(op2);
        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
        if ((!nan1 && !nan2) || defaultNan) {
            val = bitsToFp(qnan, junk);
        } else if (signal1) {
            val = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            val = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (nan1) {
            val = op1;
        } else if (nan2) {
            val = op2;
        }
    } else if (fpClass == FP_SUBNORMAL && flush) {
        // Turn val into a zero with the correct sign.
        uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1);
        val = bitsToFp(fpToBits(val) & bitMask, junk);
        feclearexcept(FeInexact);
        feraiseexcept(FeUnderflow);
    }
    return val;
}

template
float fixDest<float>(bool flush, bool defaultNan,
        float val, float op1, float op2);
template
double fixDest<double>(bool flush, bool defaultNan,
        double val, double op1, double op2);

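/*
 * Worked example (illustration, not part of the original source): for
 * single precision qnan is 0x7fc00000, so a signalling NaN input such as
 * 0x7f800001 propagates as fpToBits(op1) | qnan = 0x7fc00001: it is
 * quieted by setting the top fraction bit while its payload is kept.
 * Signalling NaNs take priority over quiet ones, and op1 over op2, which
 * matches the ARM NaN propagation rules when default NaN mode is off.
 */
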
template <class fpType>
fpType
fixDivDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2)
{
    fpType mid = fixDest(flush, defaultNan, val, op1, op2);
    const bool single = (sizeof(fpType) == sizeof(float));
    const fpType junk = 0.0;
    if ((single && (val == bitsToFp(0x00800000, junk) ||
                    val == bitsToFp(0x80800000, junk))) ||
        (!single && (val == bitsToFp(ULL(0x0010000000000000), junk) ||
                     val == bitsToFp(ULL(0x8010000000000000), junk)))
       ) {
        __asm__ __volatile__("" : "=m" (op1) : "m" (op1));
        fesetround(FeRoundZero);
        fpType temp = 0.0;
        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
        temp = op1 / op2;
        if (flushToZero(temp)) {
            feraiseexcept(FeUnderflow);
            if (flush) {
                feclearexcept(FeInexact);
                mid = temp;
            }
        }
        __asm__ __volatile__("" :: "m" (temp));
    }
    return mid;
}

template
float fixDivDest<float>(bool flush, bool defaultNan,
        float val, float op1, float op2);
template
double fixDivDest<double>(bool flush, bool defaultNan,
        double val, double op1, double op2);

float
fixFpDFpSDest(FPSCR fpscr, double val)
{
    const float junk = 0.0;
    float op1 = 0.0;
    if (std::isnan(val)) {
        uint64_t valBits = fpToBits(val);
        uint32_t op1Bits = bits(valBits, 50, 29) |
                           (mask(9) << 22) |
                           (bits(valBits, 63) << 31);
        op1 = bitsToFp(op1Bits, junk);
    }
    float mid = fixDest(fpscr.fz, fpscr.dn, (float)val, op1);
    if (fpscr.fz && fetestexcept(FeUnderflow | FeInexact) ==
                    (FeUnderflow | FeInexact)) {
        feclearexcept(FeInexact);
    }
    if (mid == bitsToFp(0x00800000, junk) ||
        mid == bitsToFp(0x80800000, junk)) {
        __asm__ __volatile__("" : "=m" (val) : "m" (val));
        fesetround(FeRoundZero);
        float temp = 0.0;
        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
        temp = val;
        if (flushToZero(temp)) {
            feraiseexcept(FeUnderflow);
            if (fpscr.fz) {
                feclearexcept(FeInexact);
                mid = temp;
            }
        }
        __asm__ __volatile__("" :: "m" (temp));
    }
    return mid;
}

double
fixFpSFpDDest(FPSCR fpscr, float val)
{
    const double junk = 0.0;
    double op1 = 0.0;
    if (std::isnan(val)) {
        uint32_t valBits = fpToBits(val);
        uint64_t op1Bits = ((uint64_t)bits(valBits, 21, 0) << 29) |
                           (mask(12) << 51) |
                           ((uint64_t)bits(valBits, 31) << 63);
        op1 = bitsToFp(op1Bits, junk);
    }
    double mid = fixDest(fpscr.fz, fpscr.dn, (double)val, op1);
    if (mid == bitsToFp(ULL(0x0010000000000000), junk) ||
        mid == bitsToFp(ULL(0x8010000000000000), junk)) {
        __asm__ __volatile__("" : "=m" (val) : "m" (val));
        fesetround(FeRoundZero);
        double temp = 0.0;
        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
        temp = val;
        if (flushToZero(temp)) {
            feraiseexcept(FeUnderflow);
            if (fpscr.fz) {
                feclearexcept(FeInexact);
                mid = temp;
            }
        }
        __asm__ __volatile__("" :: "m" (temp));
    }
    return mid;
}

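/*
 * Note (illustration, not part of the original source): the bit surgery in
 * the two converters above preserves NaN payloads across the width change.
 * Narrowing moves the top 22 fraction bits of the double NaN (bits 50:29)
 * into the single fraction (bits 21:0) and forces bits 30:22 to ones;
 * widening does the inverse, e.g. the single NaN 0x7fc00001 becomes the
 * double NaN 0x7ff8000020000000.
 */
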
static inline uint16_t
vcvtFpFpH(FPSCR &fpscr, bool flush, bool defaultNan,
          uint32_t rMode, bool ahp, uint64_t opBits, bool isDouble)
{
    uint32_t mWidth;
    uint32_t eWidth;
    uint32_t eHalfRange;
    uint32_t sBitPos;

    if (isDouble) {
        mWidth = 52;
        eWidth = 11;
    } else {
        mWidth = 23;
        eWidth = 8;
    }
    sBitPos = eWidth + mWidth;
    eHalfRange = (1 << (eWidth-1)) - 1;

    // Extract the operand.
    bool neg = bits(opBits, sBitPos);
    uint32_t exponent = bits(opBits, sBitPos-1, mWidth);
    uint64_t oldMantissa = bits(opBits, mWidth-1, 0);
    uint32_t mantissa = oldMantissa >> (mWidth - 10);
    // Do the conversion.
    uint64_t extra = oldMantissa & mask(mWidth - 10);
    if (exponent == mask(eWidth)) {
        if (oldMantissa != 0) {
            // Nans.
            if (bits(mantissa, 9) == 0) {
                // Signalling nan.
                fpscr.ioc = 1;
            }
            if (ahp) {
                mantissa = 0;
                exponent = 0;
                fpscr.ioc = 1;
            } else if (defaultNan) {
                mantissa = (1 << 9);
                exponent = 0x1f;
                neg = false;
            } else {
                exponent = 0x1f;
                mantissa |= (1 << 9);
            }
        } else {
            // Infinities.
            exponent = 0x1F;
            if (ahp) {
                fpscr.ioc = 1;
                mantissa = 0x3ff;
            } else {
                mantissa = 0;
            }
        }
    } else if (exponent == 0 && oldMantissa == 0) {
        // Zero, don't need to do anything.
    } else {
        // Normalized or denormalized numbers.

        bool inexact = (extra != 0);

        if (exponent == 0) {
            // Denormalized.
            // If flush to zero is on, this shouldn't happen.
            assert(!flush);

            // Check for underflow
            if (inexact || fpscr.ufe)
                fpscr.ufc = 1;

            // Handle rounding.
            unsigned mode = rMode;
            if ((mode == VfpRoundUpward && !neg && extra) ||
                (mode == VfpRoundDown && neg && extra) ||
                (mode == VfpRoundNearest &&
                 (extra > (1 << 9) ||
                  (extra == (1 << 9) && bits(mantissa, 0))))) {
                mantissa++;
            }

            // See if the number became normalized after rounding.
            if (mantissa == (1 << 10)) {
                mantissa = 0;
                exponent = 1;
            }
        } else {
            // Normalized.

            // We need to track the dropped bits differently since
            // more can be dropped by denormalizing.
            bool topOne = bits(extra, mWidth - 10 - 1);
            bool restZeros = bits(extra, mWidth - 10 - 2, 0) == 0;

            if (exponent <= (eHalfRange - 15)) {
                // The result is too small. Denormalize.
                mantissa |= (1 << 10);
                while (mantissa && exponent <= (eHalfRange - 15)) {
                    restZeros = restZeros && !topOne;
                    topOne = bits(mantissa, 0);
                    mantissa = mantissa >> 1;
                    exponent++;
                }
                if (topOne || !restZeros)
                    inexact = true;
                exponent = 0;
            } else {
                // Change bias.
                exponent -= (eHalfRange - 15);
            }

            if (exponent == 0 && (inexact || fpscr.ufe)) {
                // Underflow
                fpscr.ufc = 1;
            }

            // Handle rounding.
            unsigned mode = rMode;
            bool nonZero = topOne || !restZeros;
            if ((mode == VfpRoundUpward && !neg && nonZero) ||
                (mode == VfpRoundDown && neg && nonZero) ||
                (mode == VfpRoundNearest && topOne &&
                 (!restZeros || bits(mantissa, 0)))) {
                mantissa++;
            }

            // See if we rounded up and need to bump the exponent.
            if (mantissa == (1 << 10)) {
                mantissa = 0;
                exponent++;
            }

            // Deal with overflow
            if (ahp) {
                if (exponent >= 0x20) {
                    exponent = 0x1f;
                    mantissa = 0x3ff;
                    fpscr.ioc = 1;
                    // Suppress inexact exception.
                    inexact = false;
                }
            } else {
                if (exponent >= 0x1f) {
                    if ((mode == VfpRoundNearest) ||
                        (mode == VfpRoundUpward && !neg) ||
                        (mode == VfpRoundDown && neg)) {
                        // Overflow to infinity.
                        exponent = 0x1f;
                        mantissa = 0;
                    } else {
                        // Overflow to max normal.
                        exponent = 0x1e;
                        mantissa = 0x3ff;
                    }
                    fpscr.ofc = 1;
                    inexact = true;
                }
            }
        }

        if (inexact) {
            fpscr.ixc = 1;
        }
    }
    // Reassemble and install the result.
    uint32_t result = bits(mantissa, 9, 0);
    replaceBits(result, 14, 10, exponent);
    if (neg)
        result |= (1 << 15);
    return result;
}

uint16_t
vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan,
           uint32_t rMode, bool ahp, float op)
{
    uint64_t opBits = fpToBits(op);
    return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp, opBits, false);
}

uint16_t
vcvtFpDFpH(FPSCR &fpscr, bool flush, bool defaultNan,
           uint32_t rMode, bool ahp, double op)
{
    uint64_t opBits = fpToBits(op);
    return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp, opBits, true);
}
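
/*
 * Worked example (illustration, not part of the original source):
 * vcvtFpSFpH on 1.0f (0x3f800000) takes the normalized path: the exponent
 * 127 is rebiased by eHalfRange - 15 = 112 down to 15, no fraction bits
 * are dropped, so nothing rounds and the result is the half-precision
 * 1.0, 0x3c00.
 */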

static inline uint64_t
vcvtFpHFp(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op, bool isDouble)
{
    uint32_t mWidth;
    uint32_t eWidth;
    uint32_t eHalfRange;
    uint32_t sBitPos;

    if (isDouble) {
        mWidth = 52;
        eWidth = 11;
    } else {
        mWidth = 23;
        eWidth = 8;
    }
    sBitPos = eWidth + mWidth;
    eHalfRange = (1 << (eWidth-1)) - 1;

    // Extract the bitfields.
    bool neg = bits(op, 15);
    uint32_t exponent = bits(op, 14, 10);
    uint64_t mantissa = bits(op, 9, 0);
    // Do the conversion.
    if (exponent == 0) {
        if (mantissa != 0) {
            // Normalize the value.
            exponent = exponent + (eHalfRange - 15) + 1;
            while (mantissa < (1 << 10)) {
                mantissa = mantissa << 1;
                exponent--;
            }
        }
        mantissa = mantissa << (mWidth - 10);
    } else if (exponent == 0x1f && !ahp) {
        // Infinities and nans.
        exponent = mask(eWidth);
        if (mantissa != 0) {
            // Nans.
            mantissa = mantissa << (mWidth - 10);
            if (bits(mantissa, mWidth-1) == 0) {
                // Signalling nan.
                fpscr.ioc = 1;
                mantissa |= (((uint64_t) 1) << (mWidth-1));
            }
            if (defaultNan) {
                mantissa &= ~mask(mWidth-1);
                neg = false;
            }
        }
    } else {
        exponent = exponent + (eHalfRange - 15);
        mantissa = mantissa << (mWidth - 10);
    }
    // Reassemble the result.
    uint64_t result = bits(mantissa, mWidth-1, 0);
    replaceBits(result, sBitPos-1, mWidth, exponent);
    if (neg) {
        result |= (((uint64_t) 1) << sBitPos);
    }
    return result;
}

double
vcvtFpHFpD(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op)
{
    double junk = 0.0;
    uint64_t result;

    result = vcvtFpHFp(fpscr, defaultNan, ahp, op, true);
    return bitsToFp(result, junk);
}

float
vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op)
{
    float junk = 0.0;
    uint64_t result;

    result = vcvtFpHFp(fpscr, defaultNan, ahp, op, false);
    return bitsToFp(result, junk);
}
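
/*
 * Worked example (illustration, not part of the original source): going
 * the other way, vcvtFpHFpS on the half 0x3c00 rebiases the exponent 15 by
 * eHalfRange - 15 = 112 back up to 127 and shifts the fraction up by
 * mWidth - 10 = 13 bits, reassembling the single 0x3f800000, i.e. 1.0f.
 */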

float
vfpUFixedToFpS(bool flush, bool defaultNan,
               uint64_t val, uint8_t width, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (width == 16)
        val = (uint16_t)val;
    else if (width == 32)
        val = (uint32_t)val;
    else if (width != 64)
        panic("Unsupported width %d", width);
    float scale = powf(2.0, imm);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (float)val, scale);
}

float
vfpSFixedToFpS(bool flush, bool defaultNan,
               int64_t val, uint8_t width, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (width == 16)
        val = sext<16>(val & mask(16));
    else if (width == 32)
        val = sext<32>(val & mask(32));
    else if (width != 64)
        panic("Unsupported width %d", width);

    float scale = powf(2.0, imm);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (float)val, scale);
}


double
vfpUFixedToFpD(bool flush, bool defaultNan,
               uint64_t val, uint8_t width, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (width == 16)
        val = (uint16_t)val;
    else if (width == 32)
        val = (uint32_t)val;
    else if (width != 64)
        panic("Unsupported width %d", width);

    double scale = pow(2.0, imm);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (double)val, scale);
}

double
vfpSFixedToFpD(bool flush, bool defaultNan,
               int64_t val, uint8_t width, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (width == 16)
        val = sext<16>(val & mask(16));
    else if (width == 32)
        val = sext<32>(val & mask(32));
    else if (width != 64)
        panic("Unsupported width %d", width);

    double scale = pow(2.0, imm);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (double)val, scale);
}

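/*
 * Worked example (illustration, not part of the original source): these
 * helpers treat imm as the number of fraction bits. For
 * vfpUFixedToFpS(false, false, 0x200, 16, 4) the scale is 2^4 = 16 and the
 * result is 512 / 16 = 32.0f; fixDivDest() then applies the usual NaN and
 * underflow corrections to the quotient.
 */
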
// This function implements a magic formula taken from the architecture
// reference manual. It was originally called recip_sqrt_estimate.
static double
recipSqrtEstimate(double a)
{
    int64_t q0, q1, s;
    double r;
    if (a < 0.5) {
        q0 = (int64_t)(a * 512.0);
        r = 1.0 / sqrt(((double)q0 + 0.5) / 512.0);
    } else {
        q1 = (int64_t)(a * 256.0);
        r = 1.0 / sqrt(((double)q1 + 0.5) / 256.0);
    }
    s = (int64_t)(256.0 * r + 0.5);
    return (double)s / 256.0;
}

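/*
 * Worked example (illustration, not part of the original source): for
 * a = 0.25 the q0 branch is taken: q0 = 128, r = 1/sqrt(128.5/512)
 * ~= 1.9961, s = 511, and the estimate returned is 511/256 = 1.99609375,
 * an 8-bit approximation of the exact 1/sqrt(0.25) = 2.0.
 */
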
// This function is only intended for use in Neon instructions because
// it ignores certain bits in the FPSCR.
float
fprSqrtEstimate(FPSCR &fpscr, float op)
{
    const uint32_t qnan = 0x7fc00000;
    float junk = 0.0;
    int fpClass = std::fpclassify(op);
    if (fpClass == FP_NAN) {
        if ((fpToBits(op) & qnan) != qnan)
            fpscr.ioc = 1;
        return bitsToFp(qnan, junk);
    } else if (fpClass == FP_ZERO) {
        fpscr.dzc = 1;
        // Return infinity with the same sign as the operand.
        return bitsToFp((std::signbit(op) << 31) |
                        (0xFF << 23) | (0 << 0), junk);
    } else if (std::signbit(op)) {
        // Set invalid op bit.
        fpscr.ioc = 1;
        return bitsToFp(qnan, junk);
    } else if (fpClass == FP_INFINITE) {
        return 0.0;
    } else {
        uint64_t opBits = fpToBits(op);
        double scaled;
        if (bits(opBits, 23)) {
            scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
                              (ULL(0x3fd) << 52) | (bits(opBits, 31) << 63),
                              (double)0.0);
        } else {
            scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
                              (ULL(0x3fe) << 52) | (bits(opBits, 31) << 63),
                              (double)0.0);
        }
        uint64_t resultExp = (380 - bits(opBits, 30, 23)) / 2;

        uint64_t estimate = fpToBits(recipSqrtEstimate(scaled));

        return bitsToFp((bits(estimate, 63) << 31) |
                        (bits(resultExp, 7, 0) << 23) |
                        (bits(estimate, 51, 29) << 0), junk);
    }
}

uint32_t
unsignedRSqrtEstimate(uint32_t op)
{
    if (bits(op, 31, 30) == 0) {
        return -1;
    } else {
        double dpOp;
        if (bits(op, 31)) {
            dpOp = bitsToFp((ULL(0) << 63) |
                            (ULL(0x3fe) << 52) |
                            (bits((uint64_t)op, 30, 0) << 21) |
                            (0 << 0), (double)0.0);
        } else {
            dpOp = bitsToFp((ULL(0) << 63) |
                            (ULL(0x3fd) << 52) |
                            (bits((uint64_t)op, 29, 0) << 22) |
                            (0 << 0), (double)0.0);
        }
        uint64_t estimate = fpToBits(recipSqrtEstimate(dpOp));
        return (1 << 31) | bits(estimate, 51, 21);
    }
}

// This function implements a magic formula taken from the architecture
// reference manual. It was originally called recip_estimate.
static double
recipEstimate(double a)
{
    int64_t q, s;
    double r;
    q = (int64_t)(a * 512.0);
    r = 1.0 / (((double)q + 0.5) / 512.0);
    s = (int64_t)(256.0 * r + 0.5);
    return (double)s / 256.0;
}

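/*
 * Worked example (illustration, not part of the original source): for
 * a = 0.5, q = 256, r = 512/256.5 ~= 1.9961, s = 511, so the estimate is
 * 511/256 = 1.99609375 against the exact reciprocal 2.0, again accurate
 * to about one part in 256.
 */
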
// This function is only intended for use in Neon instructions because
// it ignores certain bits in the FPSCR.
float
fpRecipEstimate(FPSCR &fpscr, float op)
{
    const uint32_t qnan = 0x7fc00000;
    float junk = 0.0;
    int fpClass = std::fpclassify(op);
    if (fpClass == FP_NAN) {
        if ((fpToBits(op) & qnan) != qnan)
            fpscr.ioc = 1;
        return bitsToFp(qnan, junk);
    } else if (fpClass == FP_INFINITE) {
        return bitsToFp(std::signbit(op) << 31, junk);
    } else if (fpClass == FP_ZERO) {
        fpscr.dzc = 1;
        // Return infinity with the same sign as the operand.
        return bitsToFp((std::signbit(op) << 31) |
                        (0xFF << 23) | (0 << 0), junk);
    } else if (fabs(op) >= pow(2.0, 126)) {
        fpscr.ufc = 1;
        return bitsToFp(std::signbit(op) << 31, junk);
    } else {
        uint64_t opBits = fpToBits(op);
        double scaled;
        scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
                          (ULL(0x3fe) << 52) | (ULL(0) << 63),
                          (double)0.0);
        uint64_t resultExp = 253 - bits(opBits, 30, 23);

        uint64_t estimate = fpToBits(recipEstimate(scaled));

        return bitsToFp((bits(opBits, 31) << 31) |
                        (bits(resultExp, 7, 0) << 23) |
                        (bits(estimate, 51, 29) << 0), junk);
    }
}

uint32_t
unsignedRecipEstimate(uint32_t op)
{
    if (bits(op, 31) == 0) {
        return -1;
    } else {
        double dpOp;
        dpOp = bitsToFp((ULL(0) << 63) |
                        (ULL(0x3fe) << 52) |
                        (bits((uint64_t)op, 30, 0) << 21) |
                        (0 << 0), (double)0.0);
        uint64_t estimate = fpToBits(recipEstimate(dpOp));
        return (1 << 31) | bits(estimate, 51, 21);
    }
}

template <class fpType>
fpType
FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
                  fpType op1, fpType op2) const
{
    done = true;
    fpType junk = 0.0;
    fpType dest = 0.0;
    const bool single = (sizeof(fpType) == sizeof(float));
    const uint64_t qnan =
        single ? 0x7fc00000 : ULL(0x7ff8000000000000);
    const bool nan1 = std::isnan(op1);
    const bool nan2 = std::isnan(op2);
    const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
    const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
    if (nan1 || nan2) {
        if (defaultNan) {
            dest = bitsToFp(qnan, junk);
        } else if (signal1) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            dest = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (nan1) {
            dest = op1;
        } else if (nan2) {
            dest = op2;
        }
        if (signal1 || signal2) {
            fpscr.ioc = 1;
        }
    } else {
        done = false;
    }
    return dest;
}

template
float FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
                        float op1, float op2) const;
template
double FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
                         double op1, double op2) const;

// @TODO remove this function when we've finished switching all FMA code to
// use the new FPLIB
template <class fpType>
fpType
FpOp::ternaryOp(FPSCR &fpscr, fpType op1, fpType op2, fpType op3,
                fpType (*func)(fpType, fpType, fpType),
                bool flush, bool defaultNan, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    if (flush && (flushToZero(op1, op2) || flushToZero(op3)))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3), "=m" (state)
                             : "m" (op1), "m" (op2), "m" (op3), "m" (state));
    fpType dest = func(op1, op2, op3);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

    int fpClass = std::fpclassify(dest);
    // Get NAN behavior right. This varies between x86 and ARM.
    if (fpClass == FP_NAN) {
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan1 = std::isnan(op1);
        const bool nan2 = std::isnan(op2);
        const bool nan3 = std::isnan(op3);
        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
        const bool signal3 = nan3 && ((fpToBits(op3) & qnan) != qnan);
        if ((!nan1 && !nan2 && !nan3) || (defaultNan == 1)) {
            dest = bitsToFp(qnan, junk);
        } else if (signal1) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            dest = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (signal3) {
            dest = bitsToFp(fpToBits(op3) | qnan, junk);
        } else if (nan1) {
            dest = op1;
        } else if (nan2) {
            dest = op2;
        } else if (nan3) {
            dest = op3;
        }
    } else if (flush && flushToZero(dest)) {
        feraiseexcept(FeUnderflow);
    } else if ((
                (single && (dest == bitsToFp(0x00800000, junk) ||
                            dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                 (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                  dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before- rounding
         * in ARM and -after- rounding in x86.
         */
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3)
                                 : "m" (op1), "m" (op2), "m" (op3));
        fpType temp = func(op1, op2, op3);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    finishVfp(fpscr, state, flush);
    return dest;
}

template
float FpOp::ternaryOp(FPSCR &fpscr, float op1, float op2, float op3,
                      float (*func)(float, float, float),
                      bool flush, bool defaultNan, uint32_t rMode) const;
template
double FpOp::ternaryOp(FPSCR &fpscr, double op1, double op2, double op3,
                       double (*func)(double, double, double),
                       bool flush, bool defaultNan, uint32_t rMode) const;

template <class fpType>
fpType
FpOp::binaryOp(FPSCR &fpscr, fpType op1, fpType op2,
               fpType (*func)(fpType, fpType),
               bool flush, bool defaultNan, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    if (flush && flushToZero(op1, op2))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (state)
                             : "m" (op1), "m" (op2), "m" (state));
    fpType dest = func(op1, op2);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

    // Get NAN behavior right. This varies between x86 and ARM.
    if (std::isnan(dest)) {
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan1 = std::isnan(op1);
        const bool nan2 = std::isnan(op2);
        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
        if ((!nan1 && !nan2) || (defaultNan == 1)) {
            dest = bitsToFp(qnan, junk);
        } else if (signal1) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            dest = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (nan1) {
            dest = op1;
        } else if (nan2) {
            dest = op2;
        }
    } else if (flush && flushToZero(dest)) {
        feraiseexcept(FeUnderflow);
    } else if ((
                (single && (dest == bitsToFp(0x00800000, junk) ||
                            dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                 (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                  dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before- rounding
         * in ARM and -after- rounding in x86.
         */
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2)
                                 : "m" (op1), "m" (op2));
        fpType temp = func(op1, op2);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    finishVfp(fpscr, state, flush);
    return dest;
}

template
float FpOp::binaryOp(FPSCR &fpscr, float op1, float op2,
                     float (*func)(float, float),
                     bool flush, bool defaultNan, uint32_t rMode) const;
template
double FpOp::binaryOp(FPSCR &fpscr, double op1, double op2,
                      double (*func)(double, double),
                      bool flush, bool defaultNan, uint32_t rMode) const;

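/*
 * Usage sketch (illustration, not part of the original source): the ISA
 * description supplies the raw operation as a plain function pointer, so
 * inside an FpOp instruction's execute() an add might look like the
 * following, with fpAddS a hypothetical helper:
 *
 *     static float fpAddS(float a, float b) { return a + b; }
 *     ...
 *     float dest = binaryOp(fpscr, op1, op2, fpAddS,
 *                           fpscr.fz, fpscr.dn, fpscr.rMode);
 */
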
template <class fpType>
fpType
FpOp::unaryOp(FPSCR &fpscr, fpType op1, fpType (*func)(fpType),
              bool flush, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    if (flush && flushToZero(op1))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (state)
                             : "m" (op1), "m" (state));
    fpType dest = func(op1);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

    // Get NAN behavior right. This varies between x86 and ARM.
    if (std::isnan(dest)) {
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan = std::isnan(op1);
        if (!nan || fpscr.dn == 1) {
            dest = bitsToFp(qnan, junk);
        } else if (nan) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        }
    } else if (flush && flushToZero(dest)) {
        feraiseexcept(FeUnderflow);
    } else if ((
                (single && (dest == bitsToFp(0x00800000, junk) ||
                            dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                 (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                  dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before- rounding
         * in ARM and -after- rounding in x86.
         */
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1) : "m" (op1));
        fpType temp = func(op1);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    finishVfp(fpscr, state, flush);
    return dest;
}

template
float FpOp::unaryOp(FPSCR &fpscr, float op1, float (*func)(float),
                    bool flush, uint32_t rMode) const;
template
double FpOp::unaryOp(FPSCR &fpscr, double op1, double (*func)(double),
                     bool flush, uint32_t rMode) const;

IntRegIndex
VfpMacroOp::addStride(IntRegIndex idx, unsigned stride)
{
    if (wide) {
        stride *= 2;
    }
    unsigned offset = idx % 8;
    idx = (IntRegIndex)(idx - offset);
    offset += stride;
    idx = (IntRegIndex)(idx + (offset % 8));
    return idx;
}

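/*
 * Worked example (illustration, not part of the original source):
 * addStride() wraps within a bank of eight registers. For idx = 10 and
 * stride = 2 with wide false: offset = 10 % 8 = 2, the bank base is 8,
 * offset grows to 4, and the result is 8 + (4 % 8) = 12. With wide true
 * the stride doubles to 4 and the same inputs give 8 + (6 % 8) = 14.
 */
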
void
VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1, IntRegIndex &op2)
{
    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
    assert(!inScalarBank(dest));
    dest = addStride(dest, stride);
    op1 = addStride(op1, stride);
    if (!inScalarBank(op2)) {
        op2 = addStride(op2, stride);
    }
}

void
VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1)
{
    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
    assert(!inScalarBank(dest));
    dest = addStride(dest, stride);
    if (!inScalarBank(op1)) {
        op1 = addStride(op1, stride);
    }
}

void
VfpMacroOp::nextIdxs(IntRegIndex &dest)
{
    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
    assert(!inScalarBank(dest));
    dest = addStride(dest, stride);
}

}
165 ccprintf(ss, ", #%d", imm);
166 return ss.str();
167}
168
169namespace ArmISA
170{
171
172VfpSavedState
173prepFpState(uint32_t rMode)
174{
175 int roundingMode = fegetround();
176 feclearexcept(FeAllExceptions);
177 switch (rMode) {
178 case VfpRoundNearest:
179 fesetround(FeRoundNearest);
180 break;
181 case VfpRoundUpward:
182 fesetround(FeRoundUpward);
183 break;
184 case VfpRoundDown:
185 fesetround(FeRoundDown);
186 break;
187 case VfpRoundZero:
188 fesetround(FeRoundZero);
189 break;
190 }
191 return roundingMode;
192}
193
194void
195finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush, FPSCR mask)
196{
197 int exceptions = fetestexcept(FeAllExceptions);
198 bool underflow = false;
199 if ((exceptions & FeInvalid) && mask.ioc) {
200 fpscr.ioc = 1;
201 }
202 if ((exceptions & FeDivByZero) && mask.dzc) {
203 fpscr.dzc = 1;
204 }
205 if ((exceptions & FeOverflow) && mask.ofc) {
206 fpscr.ofc = 1;
207 }
208 if (exceptions & FeUnderflow) {
209 underflow = true;
210 if (mask.ufc)
211 fpscr.ufc = 1;
212 }
213 if ((exceptions & FeInexact) && !(underflow && flush) && mask.ixc) {
214 fpscr.ixc = 1;
215 }
216 fesetround(state);
217}
218
219template <class fpType>
220fpType
221fixDest(bool flush, bool defaultNan, fpType val, fpType op1)
222{
223 int fpClass = std::fpclassify(val);
224 fpType junk = 0.0;
225 if (fpClass == FP_NAN) {
226 const bool single = (sizeof(val) == sizeof(float));
227 const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
228 const bool nan = std::isnan(op1);
229 if (!nan || defaultNan) {
230 val = bitsToFp(qnan, junk);
231 } else if (nan) {
232 val = bitsToFp(fpToBits(op1) | qnan, junk);
233 }
234 } else if (fpClass == FP_SUBNORMAL && flush == 1) {
235 // Turn val into a zero with the correct sign;
236 uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1);
237 val = bitsToFp(fpToBits(val) & bitMask, junk);
238 feclearexcept(FeInexact);
239 feraiseexcept(FeUnderflow);
240 }
241 return val;
242}
243
244template
245float fixDest<float>(bool flush, bool defaultNan, float val, float op1);
246template
247double fixDest<double>(bool flush, bool defaultNan, double val, double op1);
248
249template <class fpType>
250fpType
251fixDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2)
252{
253 int fpClass = std::fpclassify(val);
254 fpType junk = 0.0;
255 if (fpClass == FP_NAN) {
256 const bool single = (sizeof(val) == sizeof(float));
257 const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
258 const bool nan1 = std::isnan(op1);
259 const bool nan2 = std::isnan(op2);
260 const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
261 const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
262 if ((!nan1 && !nan2) || defaultNan) {
263 val = bitsToFp(qnan, junk);
264 } else if (signal1) {
265 val = bitsToFp(fpToBits(op1) | qnan, junk);
266 } else if (signal2) {
267 val = bitsToFp(fpToBits(op2) | qnan, junk);
268 } else if (nan1) {
269 val = op1;
270 } else if (nan2) {
271 val = op2;
272 }
273 } else if (fpClass == FP_SUBNORMAL && flush) {
274 // Turn val into a zero with the correct sign;
275 uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1);
276 val = bitsToFp(fpToBits(val) & bitMask, junk);
277 feclearexcept(FeInexact);
278 feraiseexcept(FeUnderflow);
279 }
280 return val;
281}
282
283template
284float fixDest<float>(bool flush, bool defaultNan,
285 float val, float op1, float op2);
286template
287double fixDest<double>(bool flush, bool defaultNan,
288 double val, double op1, double op2);
289
290template <class fpType>
291fpType
292fixDivDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2)
293{
294 fpType mid = fixDest(flush, defaultNan, val, op1, op2);
295 const bool single = (sizeof(fpType) == sizeof(float));
296 const fpType junk = 0.0;
297 if ((single && (val == bitsToFp(0x00800000, junk) ||
298 val == bitsToFp(0x80800000, junk))) ||
299 (!single && (val == bitsToFp(ULL(0x0010000000000000), junk) ||
300 val == bitsToFp(ULL(0x8010000000000000), junk)))
301 ) {
302 __asm__ __volatile__("" : "=m" (op1) : "m" (op1));
303 fesetround(FeRoundZero);
304 fpType temp = 0.0;
305 __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
306 temp = op1 / op2;
307 if (flushToZero(temp)) {
308 feraiseexcept(FeUnderflow);
309 if (flush) {
310 feclearexcept(FeInexact);
311 mid = temp;
312 }
313 }
314 __asm__ __volatile__("" :: "m" (temp));
315 }
316 return mid;
317}
318
319template
320float fixDivDest<float>(bool flush, bool defaultNan,
321 float val, float op1, float op2);
322template
323double fixDivDest<double>(bool flush, bool defaultNan,
324 double val, double op1, double op2);
325
326float
327fixFpDFpSDest(FPSCR fpscr, double val)
328{
329 const float junk = 0.0;
330 float op1 = 0.0;
331 if (std::isnan(val)) {
332 uint64_t valBits = fpToBits(val);
333 uint32_t op1Bits = bits(valBits, 50, 29) |
334 (mask(9) << 22) |
335 (bits(valBits, 63) << 31);
336 op1 = bitsToFp(op1Bits, junk);
337 }
338 float mid = fixDest(fpscr.fz, fpscr.dn, (float)val, op1);
339 if (fpscr.fz && fetestexcept(FeUnderflow | FeInexact) ==
340 (FeUnderflow | FeInexact)) {
341 feclearexcept(FeInexact);
342 }
343 if (mid == bitsToFp(0x00800000, junk) ||
344 mid == bitsToFp(0x80800000, junk)) {
345 __asm__ __volatile__("" : "=m" (val) : "m" (val));
346 fesetround(FeRoundZero);
347 float temp = 0.0;
348 __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
349 temp = val;
350 if (flushToZero(temp)) {
351 feraiseexcept(FeUnderflow);
352 if (fpscr.fz) {
353 feclearexcept(FeInexact);
354 mid = temp;
355 }
356 }
357 __asm__ __volatile__("" :: "m" (temp));
358 }
359 return mid;
360}
361
362double
363fixFpSFpDDest(FPSCR fpscr, float val)
364{
365 const double junk = 0.0;
366 double op1 = 0.0;
367 if (std::isnan(val)) {
368 uint32_t valBits = fpToBits(val);
369 uint64_t op1Bits = ((uint64_t)bits(valBits, 21, 0) << 29) |
370 (mask(12) << 51) |
371 ((uint64_t)bits(valBits, 31) << 63);
372 op1 = bitsToFp(op1Bits, junk);
373 }
374 double mid = fixDest(fpscr.fz, fpscr.dn, (double)val, op1);
375 if (mid == bitsToFp(ULL(0x0010000000000000), junk) ||
376 mid == bitsToFp(ULL(0x8010000000000000), junk)) {
377 __asm__ __volatile__("" : "=m" (val) : "m" (val));
378 fesetround(FeRoundZero);
379 double temp = 0.0;
380 __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
381 temp = val;
382 if (flushToZero(temp)) {
383 feraiseexcept(FeUnderflow);
384 if (fpscr.fz) {
385 feclearexcept(FeInexact);
386 mid = temp;
387 }
388 }
389 __asm__ __volatile__("" :: "m" (temp));
390 }
391 return mid;
392}
393
394static inline uint16_t
395vcvtFpFpH(FPSCR &fpscr, bool flush, bool defaultNan,
396 uint32_t rMode, bool ahp, uint64_t opBits, bool isDouble)
397{
398 uint32_t mWidth;
399 uint32_t eWidth;
400 uint32_t eHalfRange;
401 uint32_t sBitPos;
402
403 if (isDouble) {
404 mWidth = 52;
405 eWidth = 11;
406 } else {
407 mWidth = 23;
408 eWidth = 8;
409 }
410 sBitPos = eWidth + mWidth;
411 eHalfRange = (1 << (eWidth-1)) - 1;
412
413 // Extract the operand.
414 bool neg = bits(opBits, sBitPos);
415 uint32_t exponent = bits(opBits, sBitPos-1, mWidth);
416 uint64_t oldMantissa = bits(opBits, mWidth-1, 0);
417 uint32_t mantissa = oldMantissa >> (mWidth - 10);
418 // Do the conversion.
419 uint64_t extra = oldMantissa & mask(mWidth - 10);
420 if (exponent == mask(eWidth)) {
421 if (oldMantissa != 0) {
422 // Nans.
423 if (bits(mantissa, 9) == 0) {
424 // Signalling nan.
425 fpscr.ioc = 1;
426 }
427 if (ahp) {
428 mantissa = 0;
429 exponent = 0;
430 fpscr.ioc = 1;
431 } else if (defaultNan) {
432 mantissa = (1 << 9);
433 exponent = 0x1f;
434 neg = false;
435 } else {
436 exponent = 0x1f;
437 mantissa |= (1 << 9);
438 }
439 } else {
440 // Infinities.
441 exponent = 0x1F;
442 if (ahp) {
443 fpscr.ioc = 1;
444 mantissa = 0x3ff;
445 } else {
446 mantissa = 0;
447 }
448 }
449 } else if (exponent == 0 && oldMantissa == 0) {
450 // Zero, don't need to do anything.
451 } else {
452 // Normalized or denormalized numbers.
453
454 bool inexact = (extra != 0);
455
456 if (exponent == 0) {
457 // Denormalized.
458 // If flush to zero is on, this shouldn't happen.
459 assert(!flush);
460
461 // Check for underflow
462 if (inexact || fpscr.ufe)
463 fpscr.ufc = 1;
464
465 // Handle rounding.
466 unsigned mode = rMode;
467 if ((mode == VfpRoundUpward && !neg && extra) ||
468 (mode == VfpRoundDown && neg && extra) ||
469 (mode == VfpRoundNearest &&
470 (extra > (1 << 9) ||
471 (extra == (1 << 9) && bits(mantissa, 0))))) {
472 mantissa++;
473 }
474
475 // See if the number became normalized after rounding.
476 if (mantissa == (1 << 10)) {
477 mantissa = 0;
478 exponent = 1;
479 }
480 } else {
481 // Normalized.
482
483 // We need to track the dropped bits differently since
484 // more can be dropped by denormalizing.
485 bool topOne = bits(extra, mWidth - 10 - 1);
486 bool restZeros = bits(extra, mWidth - 10 - 2, 0) == 0;
487
488 if (exponent <= (eHalfRange - 15)) {
489 // The result is too small. Denormalize.
490 mantissa |= (1 << 10);
491 while (mantissa && exponent <= (eHalfRange - 15)) {
492 restZeros = restZeros && !topOne;
493 topOne = bits(mantissa, 0);
494 mantissa = mantissa >> 1;
495 exponent++;
496 }
497 if (topOne || !restZeros)
498 inexact = true;
499 exponent = 0;
500 } else {
501 // Change bias.
502 exponent -= (eHalfRange - 15);
503 }
504
505 if (exponent == 0 && (inexact || fpscr.ufe)) {
506 // Underflow
507 fpscr.ufc = 1;
508 }
509
510 // Handle rounding.
511 unsigned mode = rMode;
512 bool nonZero = topOne || !restZeros;
513 if ((mode == VfpRoundUpward && !neg && nonZero) ||
514 (mode == VfpRoundDown && neg && nonZero) ||
515 (mode == VfpRoundNearest && topOne &&
516 (!restZeros || bits(mantissa, 0)))) {
517 mantissa++;
518 }
519
520 // See if we rounded up and need to bump the exponent.
521 if (mantissa == (1 << 10)) {
522 mantissa = 0;
523 exponent++;
524 }
525
526 // Deal with overflow
527 if (ahp) {
528 if (exponent >= 0x20) {
529 exponent = 0x1f;
530 mantissa = 0x3ff;
531 fpscr.ioc = 1;
532 // Supress inexact exception.
533 inexact = false;
534 }
535 } else {
536 if (exponent >= 0x1f) {
537 if ((mode == VfpRoundNearest) ||
538 (mode == VfpRoundUpward && !neg) ||
539 (mode == VfpRoundDown && neg)) {
540 // Overflow to infinity.
541 exponent = 0x1f;
542 mantissa = 0;
543 } else {
544 // Overflow to max normal.
545 exponent = 0x1e;
546 mantissa = 0x3ff;
547 }
548 fpscr.ofc = 1;
549 inexact = true;
550 }
551 }
552 }
553
554 if (inexact) {
555 fpscr.ixc = 1;
556 }
557 }
558 // Reassemble and install the result.
559 uint32_t result = bits(mantissa, 9, 0);
560 replaceBits(result, 14, 10, exponent);
561 if (neg)
562 result |= (1 << 15);
563 return result;
564}
565
566uint16_t
567vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan,
568 uint32_t rMode, bool ahp, float op)
569{
570 uint64_t opBits = fpToBits(op);
571 return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp, opBits, false);
572}
573
574uint16_t
575vcvtFpDFpH(FPSCR &fpscr, bool flush, bool defaultNan,
576 uint32_t rMode, bool ahp, double op)
577{
578 uint64_t opBits = fpToBits(op);
579 return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp, opBits, true);
580}
581
582static inline uint64_t
583vcvtFpHFp(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op, bool isDouble)
584{
585 uint32_t mWidth;
586 uint32_t eWidth;
587 uint32_t eHalfRange;
588 uint32_t sBitPos;
589
590 if (isDouble) {
591 mWidth = 52;
592 eWidth = 11;
593 } else {
594 mWidth = 23;
595 eWidth = 8;
596 }
597 sBitPos = eWidth + mWidth;
598 eHalfRange = (1 << (eWidth-1)) - 1;
599
600 // Extract the bitfields.
601 bool neg = bits(op, 15);
602 uint32_t exponent = bits(op, 14, 10);
603 uint64_t mantissa = bits(op, 9, 0);
604 // Do the conversion.
605 if (exponent == 0) {
606 if (mantissa != 0) {
607 // Normalize the value.
608 exponent = exponent + (eHalfRange - 15) + 1;
609 while (mantissa < (1 << 10)) {
610 mantissa = mantissa << 1;
611 exponent--;
612 }
613 }
614 mantissa = mantissa << (mWidth - 10);
615 } else if (exponent == 0x1f && !ahp) {
616 // Infinities and nans.
617 exponent = mask(eWidth);
618 if (mantissa != 0) {
619 // Nans.
620 mantissa = mantissa << (mWidth - 10);
621 if (bits(mantissa, mWidth-1) == 0) {
622 // Signalling nan.
623 fpscr.ioc = 1;
624 mantissa |= (((uint64_t) 1) << (mWidth-1));
625 }
626 if (defaultNan) {
627 mantissa &= ~mask(mWidth-1);
628 neg = false;
629 }
630 }
631 } else {
632 exponent = exponent + (eHalfRange - 15);
633 mantissa = mantissa << (mWidth - 10);
634 }
635 // Reassemble the result.
636 uint64_t result = bits(mantissa, mWidth-1, 0);
637 replaceBits(result, sBitPos-1, mWidth, exponent);
638 if (neg) {
639 result |= (((uint64_t) 1) << sBitPos);
640 }
641 return result;
642}
643
644double
645vcvtFpHFpD(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op)
646{
647 double junk = 0.0;
648 uint64_t result;
649
650 result = vcvtFpHFp(fpscr, defaultNan, ahp, op, true);
651 return bitsToFp(result, junk);
652}
653
654float
655vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op)
656{
657 float junk = 0.0;
658 uint64_t result;
659
660 result = vcvtFpHFp(fpscr, defaultNan, ahp, op, false);
661 return bitsToFp(result, junk);
662}
663
664float
665vfpUFixedToFpS(bool flush, bool defaultNan,
666 uint64_t val, uint8_t width, uint8_t imm)
667{
668 fesetround(FeRoundNearest);
669 if (width == 16)
670 val = (uint16_t)val;
671 else if (width == 32)
672 val = (uint32_t)val;
673 else if (width != 64)
674 panic("Unsupported width %d", width);
675 float scale = powf(2.0, imm);
676 __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
677 feclearexcept(FeAllExceptions);
678 __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
679 return fixDivDest(flush, defaultNan, val / scale, (float)val, scale);
680}
681
682float
683vfpSFixedToFpS(bool flush, bool defaultNan,
684 int64_t val, uint8_t width, uint8_t imm)
685{
686 fesetround(FeRoundNearest);
687 if (width == 16)
688 val = sext<16>(val & mask(16));
689 else if (width == 32)
690 val = sext<32>(val & mask(32));
691 else if (width != 64)
692 panic("Unsupported width %d", width);
693
694 float scale = powf(2.0, imm);
695 __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
696 feclearexcept(FeAllExceptions);
697 __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
698 return fixDivDest(flush, defaultNan, val / scale, (float)val, scale);
699}
700
701
702double
703vfpUFixedToFpD(bool flush, bool defaultNan,
704 uint64_t val, uint8_t width, uint8_t imm)
705{
706 fesetround(FeRoundNearest);
707 if (width == 16)
708 val = (uint16_t)val;
709 else if (width == 32)
710 val = (uint32_t)val;
711 else if (width != 64)
712 panic("Unsupported width %d", width);
713
714 double scale = pow(2.0, imm);
715 __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
716 feclearexcept(FeAllExceptions);
717 __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
718 return fixDivDest(flush, defaultNan, val / scale, (double)val, scale);
719}
720
721double
722vfpSFixedToFpD(bool flush, bool defaultNan,
723 int64_t val, uint8_t width, uint8_t imm)
724{
725 fesetround(FeRoundNearest);
726 if (width == 16)
727 val = sext<16>(val & mask(16));
728 else if (width == 32)
729 val = sext<32>(val & mask(32));
730 else if (width != 64)
731 panic("Unsupported width %d", width);
732
733 double scale = pow(2.0, imm);
734 __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
735 feclearexcept(FeAllExceptions);
736 __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
737 return fixDivDest(flush, defaultNan, val / scale, (double)val, scale);
738}
739
740// This function implements a magic formula taken from the architecture
741// reference manual. It was originally called recip_sqrt_estimate.
742static double
743recipSqrtEstimate(double a)
744{
745 int64_t q0, q1, s;
746 double r;
747 if (a < 0.5) {
748 q0 = (int64_t)(a * 512.0);
749 r = 1.0 / sqrt(((double)q0 + 0.5) / 512.0);
750 } else {
751 q1 = (int64_t)(a * 256.0);
752 r = 1.0 / sqrt(((double)q1 + 0.5) / 256.0);
753 }
754 s = (int64_t)(256.0 * r + 0.5);
755 return (double)s / 256.0;
756}

// This function is only intended for use in Neon instructions because
// it ignores certain bits in the FPSCR.
float
fprSqrtEstimate(FPSCR &fpscr, float op)
{
    const uint32_t qnan = 0x7fc00000;
    float junk = 0.0;
    int fpClass = std::fpclassify(op);
    if (fpClass == FP_NAN) {
        if ((fpToBits(op) & qnan) != qnan)
            fpscr.ioc = 1;
        return bitsToFp(qnan, junk);
    } else if (fpClass == FP_ZERO) {
        fpscr.dzc = 1;
        // Return infinity with the same sign as the operand.
        return bitsToFp(((uint32_t)std::signbit(op) << 31) |
                        (0xFF << 23) | (0 << 0), junk);
    } else if (std::signbit(op)) {
        // Set invalid op bit.
        fpscr.ioc = 1;
        return bitsToFp(qnan, junk);
    } else if (fpClass == FP_INFINITE) {
        return 0.0;
    } else {
        uint64_t opBits = fpToBits(op);
        double scaled;
        if (bits(opBits, 23)) {
            scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
                              (ULL(0x3fd) << 52) | (bits(opBits, 31) << 63),
                              (double)0.0);
        } else {
            scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
                              (ULL(0x3fe) << 52) | (bits(opBits, 31) << 63),
                              (double)0.0);
        }
        uint64_t resultExp = (380 - bits(opBits, 30, 23)) / 2;

        uint64_t estimate = fpToBits(recipSqrtEstimate(scaled));

        return bitsToFp((bits(estimate, 63) << 31) |
                        (bits(resultExp, 7, 0) << 23) |
                        (bits(estimate, 51, 29) << 0), junk);
    }
}
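
// Worked example of the bit manipulation above (editorial): for
// op == 4.0f (biased exponent 129, fraction 0), the exponent's low bit
// selects the 0x3fd double exponent, so scaled == 0.25.
// recipSqrtEstimate(0.25) == 1.99609375 and
// resultExp == (380 - 129) / 2 == 125, so the packed result is
// 1.99609375 * 2^(125 - 127) == 0.4990234375 ~= 1/sqrt(4.0).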

uint32_t
unsignedRSqrtEstimate(uint32_t op)
{
    if (bits(op, 31, 30) == 0) {
        return -1;
    } else {
        double dpOp;
        if (bits(op, 31)) {
            dpOp = bitsToFp((ULL(0) << 63) |
                            (ULL(0x3fe) << 52) |
                            (bits((uint64_t)op, 30, 0) << 21) |
                            (0 << 0), (double)0.0);
        } else {
            dpOp = bitsToFp((ULL(0) << 63) |
                            (ULL(0x3fd) << 52) |
                            (bits((uint64_t)op, 29, 0) << 22) |
                            (0 << 0), (double)0.0);
        }
        uint64_t estimate = fpToBits(recipSqrtEstimate(dpOp));
        return (1U << 31) | bits(estimate, 51, 21);
    }
}
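
// Example (editorial): unsignedRSqrtEstimate(0x80000000) maps the input
// to dpOp == 0.5, computes recipSqrtEstimate(0.5) == 1.41015625, and
// packs the top 31 fraction bits under a set bit 31, returning
// 0xB4800000 (1.41015625, reading bit 31 as the integer bit). Inputs
// below 0x40000000 saturate to all ones.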

// This function implements a magic formula taken from the architecture
// reference manual. It was originally called recip_estimate.
static double
recipEstimate(double a)
{
    int64_t q, s;
    double r;
    q = (int64_t)(a * 512.0);
    r = 1.0 / (((double)q + 0.5) / 512.0);
    s = (int64_t)(256.0 * r + 0.5);
    return (double)s / 256.0;
}
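
// Worked example (editorial): for a == 0.5, q == 256,
// r == 512/256.5 ~= 1.99610, and s == 511, so the function returns
// 511/256 == 1.99609375 as the estimate of 1/0.5.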

// This function is only intended for use in Neon instructions because
// it ignores certain bits in the FPSCR.
float
fpRecipEstimate(FPSCR &fpscr, float op)
{
    const uint32_t qnan = 0x7fc00000;
    float junk = 0.0;
    int fpClass = std::fpclassify(op);
    if (fpClass == FP_NAN) {
        if ((fpToBits(op) & qnan) != qnan)
            fpscr.ioc = 1;
        return bitsToFp(qnan, junk);
    } else if (fpClass == FP_INFINITE) {
        return bitsToFp((uint32_t)std::signbit(op) << 31, junk);
    } else if (fpClass == FP_ZERO) {
        fpscr.dzc = 1;
        // Return infinity with the same sign as the operand.
        return bitsToFp(((uint32_t)std::signbit(op) << 31) |
                        (0xFF << 23) | (0 << 0), junk);
    } else if (fabs(op) >= pow(2.0, 126)) {
        fpscr.ufc = 1;
        return bitsToFp((uint32_t)std::signbit(op) << 31, junk);
    } else {
        uint64_t opBits = fpToBits(op);
        double scaled;
        scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
                          (ULL(0x3fe) << 52) | (ULL(0) << 63),
                          (double)0.0);
        uint64_t resultExp = 253 - bits(opBits, 30, 23);

        uint64_t estimate = fpToBits(recipEstimate(scaled));

        return bitsToFp((bits(opBits, 31) << 31) |
                        (bits(resultExp, 7, 0) << 23) |
                        (bits(estimate, 51, 29) << 0), junk);
    }
}
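
// Worked example (editorial): for op == 2.0f (biased exponent 128,
// fraction 0), scaled == 0.5, recipEstimate(0.5) == 1.99609375, and
// resultExp == 253 - 128 == 125, so the packed result is
// 1.99609375 * 2^(125 - 127) == 0.4990234375 ~= 1/2.0.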

uint32_t
unsignedRecipEstimate(uint32_t op)
{
    if (bits(op, 31) == 0) {
        return -1;
    } else {
        double dpOp;
        dpOp = bitsToFp((ULL(0) << 63) |
                        (ULL(0x3fe) << 52) |
                        (bits((uint64_t)op, 30, 0) << 21) |
                        (0 << 0), (double)0.0);
        uint64_t estimate = fpToBits(recipEstimate(dpOp));
        return (1U << 31) | bits(estimate, 51, 21);
    }
}
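
// Example (editorial): unsignedRecipEstimate(0x80000000) maps the input
// to dpOp == 0.5, estimates 1/0.5 as 1.99609375, and returns 0xFF800000
// (1.99609375, reading bit 31 as the integer bit). Inputs below
// 0x80000000 saturate to all ones.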

template <class fpType>
fpType
FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
                  fpType op1, fpType op2) const
{
    done = true;
    fpType junk = 0.0;
    fpType dest = 0.0;
    const bool single = (sizeof(fpType) == sizeof(float));
    const uint64_t qnan =
        single ? 0x7fc00000 : ULL(0x7ff8000000000000);
    const bool nan1 = std::isnan(op1);
    const bool nan2 = std::isnan(op2);
    const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
    const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
    if (nan1 || nan2) {
        if (defaultNan) {
            dest = bitsToFp(qnan, junk);
        } else if (signal1) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            dest = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (nan1) {
            dest = op1;
        } else if (nan2) {
            dest = op2;
        }
        if (signal1 || signal2) {
            fpscr.ioc = 1;
        }
    } else {
        done = false;
    }
    return dest;
}

template
float FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
                        float op1, float op2) const;
template
double FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
                         double op1, double op2) const;
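
// Example of the propagation priority above (editorial): with
// defaultNan == false, op1 a quiet NaN (0x7fc00001) and op2 a signaling
// NaN (0x7f800001), the signaling operand wins; the result is op2
// quieted (0x7fc00001) and fpscr.ioc is set. With defaultNan == true the
// result would be the default quiet NaN (0x7fc00000) instead.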

// @todo Remove this function when we've finished switching all the FMA
// code over to the new FPLIB.
template <class fpType>
fpType
FpOp::ternaryOp(FPSCR &fpscr, fpType op1, fpType op2, fpType op3,
                fpType (*func)(fpType, fpType, fpType),
                bool flush, bool defaultNan, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    if (flush && (flushToZero(op1, op2) || flushToZero(op3)))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3), "=m" (state)
                             : "m" (op1), "m" (op2), "m" (op3), "m" (state));
    fpType dest = func(op1, op2, op3);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

    int fpClass = std::fpclassify(dest);
    // Get the NaN behavior right. This varies between x86 and ARM.
    if (fpClass == FP_NAN) {
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan1 = std::isnan(op1);
        const bool nan2 = std::isnan(op2);
        const bool nan3 = std::isnan(op3);
        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
        const bool signal3 = nan3 && ((fpToBits(op3) & qnan) != qnan);
        if ((!nan1 && !nan2 && !nan3) || defaultNan) {
            dest = bitsToFp(qnan, junk);
        } else if (signal1) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            dest = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (signal3) {
            dest = bitsToFp(fpToBits(op3) | qnan, junk);
        } else if (nan1) {
            dest = op1;
        } else if (nan2) {
            dest = op2;
        } else if (nan3) {
            dest = op3;
        }
    } else if (flush && flushToZero(dest)) {
        feraiseexcept(FeUnderflow);
    } else if ((
                (single && (dest == bitsToFp(0x00800000, junk) ||
                            dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                 (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                  dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before- rounding
         * in ARM and -after- rounding in x86.
         */
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3)
                                 : "m" (op1), "m" (op2), "m" (op3));
        fpType temp = func(op1, op2, op3);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    finishVfp(fpscr, state, flush);
    return dest;
}

template
float FpOp::ternaryOp(FPSCR &fpscr, float op1, float op2, float op3,
                      float (*func)(float, float, float),
                      bool flush, bool defaultNan, uint32_t rMode) const;
template
double FpOp::ternaryOp(FPSCR &fpscr, double op1, double op2, double op3,
                       double (*func)(double, double, double),
                       bool flush, bool defaultNan, uint32_t rMode) const;

template <class fpType>
fpType
FpOp::binaryOp(FPSCR &fpscr, fpType op1, fpType op2,
               fpType (*func)(fpType, fpType),
               bool flush, bool defaultNan, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    if (flush && flushToZero(op1, op2))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (state)
                             : "m" (op1), "m" (op2), "m" (state));
    fpType dest = func(op1, op2);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

    // Get the NaN behavior right. This varies between x86 and ARM.
    if (std::isnan(dest)) {
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan1 = std::isnan(op1);
        const bool nan2 = std::isnan(op2);
        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
        if ((!nan1 && !nan2) || defaultNan) {
            dest = bitsToFp(qnan, junk);
        } else if (signal1) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            dest = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (nan1) {
            dest = op1;
        } else if (nan2) {
            dest = op2;
        }
    } else if (flush && flushToZero(dest)) {
        feraiseexcept(FeUnderflow);
    } else if ((
                (single && (dest == bitsToFp(0x00800000, junk) ||
                            dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                 (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                  dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before- rounding
         * in ARM and -after- rounding in x86.
         */
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2)
                                 : "m" (op1), "m" (op2));
        fpType temp = func(op1, op2);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    finishVfp(fpscr, state, flush);
    return dest;
}

template
float FpOp::binaryOp(FPSCR &fpscr, float op1, float op2,
                     float (*func)(float, float),
                     bool flush, bool defaultNan, uint32_t rMode) const;
template
double FpOp::binaryOp(FPSCR &fpscr, double op1, double op2,
                      double (*func)(double, double),
                      bool flush, bool defaultNan, uint32_t rMode) const;
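
// Typical use (an editorial sketch; `fpAdd` stands for any
// float (*)(float, float) helper, and the fz/dn/rMode FPSCR bitfields
// are assumed to exist as they do elsewhere in this file):
//   float dest = binaryOp(fpscr, op1, op2, fpAdd,
//                         fpscr.fz, fpscr.dn, fpscr.rMode);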

template <class fpType>
fpType
FpOp::unaryOp(FPSCR &fpscr, fpType op1, fpType (*func)(fpType),
              bool flush, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    if (flush && flushToZero(op1))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (state)
                             : "m" (op1), "m" (state));
    fpType dest = func(op1);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

    // Get the NaN behavior right. This varies between x86 and ARM.
    if (std::isnan(dest)) {
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan = std::isnan(op1);
        if (!nan || fpscr.dn == 1) {
            dest = bitsToFp(qnan, junk);
        } else {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        }
    } else if (flush && flushToZero(dest)) {
        feraiseexcept(FeUnderflow);
    } else if ((
                (single && (dest == bitsToFp(0x00800000, junk) ||
                            dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                 (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                  dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before- rounding
         * in ARM and -after- rounding in x86.
         */
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1) : "m" (op1));
        fpType temp = func(op1);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    finishVfp(fpscr, state, flush);
    return dest;
}

template
float FpOp::unaryOp(FPSCR &fpscr, float op1, float (*func)(float),
                    bool flush, uint32_t rMode) const;
template
double FpOp::unaryOp(FPSCR &fpscr, double op1, double (*func)(double),
                     bool flush, uint32_t rMode) const;

IntRegIndex
VfpMacroOp::addStride(IntRegIndex idx, unsigned stride)
{
    if (wide) {
        stride *= 2;
    }
    unsigned offset = idx % 8;
    idx = (IntRegIndex)(idx - offset);
    offset += stride;
    idx = (IntRegIndex)(idx + (offset % 8));
    return idx;
}
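
// Example (editorial): register indices advance within an 8-register
// bank and wrap at the bank boundary; with stride == 1, addStride takes
// index 6 to 7, and index 7 back to 0 of the same bank
// (offset 7 + 1 == 8 wraps to 0). With wide == true the stride is
// doubled so the indices step over register pairs.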

void
VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1, IntRegIndex &op2)
{
    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
    assert(!inScalarBank(dest));
    dest = addStride(dest, stride);
    op1 = addStride(op1, stride);
    if (!inScalarBank(op2)) {
        op2 = addStride(op2, stride);
    }
}

void
VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1)
{
    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
    assert(!inScalarBank(dest));
    dest = addStride(dest, stride);
    if (!inScalarBank(op1)) {
        op1 = addStride(op1, stride);
    }
}

void
VfpMacroOp::nextIdxs(IntRegIndex &dest)
{
    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
    assert(!inScalarBank(dest));
    dest = addStride(dest, stride);
}

}