vfp.cc revision 9918:2c7219e2d999
1/*
2 * Copyright (c) 2010 ARM Limited
3 * All rights reserved
4 *
5 * The license below extends only to copyright in the software and shall
6 * not be construed as granting a license to any other intellectual
7 * property including but not limited to intellectual property relating
8 * to a hardware implementation of the functionality of the software
9 * licensed hereunder.  You may use the software subject to the license
10 * terms below provided that you ensure that this notice is replicated
11 * unmodified and in its entirety in all distributions of the software,
12 * modified or unmodified, in source code or in binary form.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions are
16 * met: redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer;
18 * redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution;
21 * neither the name of the copyright holders nor the names of its
22 * contributors may be used to endorse or promote products derived from
23 * this software without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
26 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
27 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
28 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
29 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
30 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
31 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
32 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
33 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
34 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
35 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 *
37 * Authors: Gabe Black
38 */
39
40#include "arch/arm/insts/vfp.hh"
41
42/*
43 * The asm statements below are to keep gcc from reordering code. Otherwise
44 * the rounding mode might be set after the operation it was intended for, the
45 * exception bits read before it, etc.
46 */
47
48std::string
49FpRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
50{
51    std::stringstream ss;
52    printMnemonic(ss);
53    printReg(ss, dest + FP_Reg_Base);
54    ss << ", ";
55    printReg(ss, op1 + FP_Reg_Base);
56    return ss.str();
57}
58
59std::string
60FpRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
61{
62    std::stringstream ss;
63    printMnemonic(ss);
64    printReg(ss, dest + FP_Reg_Base);
65    ccprintf(ss, ", #%d", imm);
66    return ss.str();
67}
68
69std::string
70FpRegRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
71{
72    std::stringstream ss;
73    printMnemonic(ss);
74    printReg(ss, dest + FP_Reg_Base);
75    ss << ", ";
76    printReg(ss, op1 + FP_Reg_Base);
77    ccprintf(ss, ", #%d", imm);
78    return ss.str();
79}
80
81std::string
82FpRegRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
83{
84    std::stringstream ss;
85    printMnemonic(ss);
86    printReg(ss, dest + FP_Reg_Base);
87    ss << ", ";
88    printReg(ss, op1 + FP_Reg_Base);
89    ss << ", ";
90    printReg(ss, op2 + FP_Reg_Base);
91    return ss.str();
92}
93
94std::string
95FpRegRegRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
96{
97    std::stringstream ss;
98    printMnemonic(ss);
99    printReg(ss, dest + FP_Reg_Base);
100    ss << ", ";
101    printReg(ss, op1 + FP_Reg_Base);
102    ss << ", ";
103    printReg(ss, op2 + FP_Reg_Base);
104    ccprintf(ss, ", #%d", imm);
105    return ss.str();
106}
107
108namespace ArmISA
109{
110
111VfpSavedState
112prepFpState(uint32_t rMode)
113{
114    int roundingMode = fegetround();
115    feclearexcept(FeAllExceptions);
116    switch (rMode) {
117      case VfpRoundNearest:
118        fesetround(FeRoundNearest);
119        break;
120      case VfpRoundUpward:
121        fesetround(FeRoundUpward);
122        break;
123      case VfpRoundDown:
124        fesetround(FeRoundDown);
125        break;
126      case VfpRoundZero:
127        fesetround(FeRoundZero);
128        break;
129    }
130    return roundingMode;
131}
132
133void
134finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush)
135{
136    int exceptions = fetestexcept(FeAllExceptions);
137    bool underflow = false;
138    if (exceptions & FeInvalid) {
139        fpscr.ioc = 1;
140    }
141    if (exceptions & FeDivByZero) {
142        fpscr.dzc = 1;
143    }
144    if (exceptions & FeOverflow) {
145        fpscr.ofc = 1;
146    }
147    if (exceptions & FeUnderflow) {
148        underflow = true;
149        fpscr.ufc = 1;
150    }
151    if ((exceptions & FeInexact) && !(underflow && flush)) {
152        fpscr.ixc = 1;
153    }
154    fesetround(state);
155}
156
157template <class fpType>
158fpType
159fixDest(bool flush, bool defaultNan, fpType val, fpType op1)
160{
161    int fpClass = std::fpclassify(val);
162    fpType junk = 0.0;
163    if (fpClass == FP_NAN) {
164        const bool single = (sizeof(val) == sizeof(float));
165        const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
166        const bool nan = std::isnan(op1);
167        if (!nan || defaultNan) {
168            val = bitsToFp(qnan, junk);
169        } else if (nan) {
170            val = bitsToFp(fpToBits(op1) | qnan, junk);
171        }
172    } else if (fpClass == FP_SUBNORMAL && flush == 1) {
173        // Turn val into a zero with the correct sign;
174        uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1);
175        val = bitsToFp(fpToBits(val) & bitMask, junk);
176        feclearexcept(FeInexact);
177        feraiseexcept(FeUnderflow);
178    }
179    return val;
180}
181
182template
183float fixDest<float>(bool flush, bool defaultNan, float val, float op1);
184template
185double fixDest<double>(bool flush, bool defaultNan, double val, double op1);
186
187template <class fpType>
188fpType
189fixDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2)
190{
191    int fpClass = std::fpclassify(val);
192    fpType junk = 0.0;
193    if (fpClass == FP_NAN) {
194        const bool single = (sizeof(val) == sizeof(float));
195        const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
196        const bool nan1 = std::isnan(op1);
197        const bool nan2 = std::isnan(op2);
198        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
199        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
200        if ((!nan1 && !nan2) || defaultNan) {
201            val = bitsToFp(qnan, junk);
202        } else if (signal1) {
203            val = bitsToFp(fpToBits(op1) | qnan, junk);
204        } else if (signal2) {
205            val = bitsToFp(fpToBits(op2) | qnan, junk);
206        } else if (nan1) {
207            val = op1;
208        } else if (nan2) {
209            val = op2;
210        }
211    } else if (fpClass == FP_SUBNORMAL && flush) {
212        // Turn val into a zero with the correct sign;
213        uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1);
214        val = bitsToFp(fpToBits(val) & bitMask, junk);
215        feclearexcept(FeInexact);
216        feraiseexcept(FeUnderflow);
217    }
218    return val;
219}
220
221template
222float fixDest<float>(bool flush, bool defaultNan,
223                     float val, float op1, float op2);
224template
225double fixDest<double>(bool flush, bool defaultNan,
226                       double val, double op1, double op2);
227
// Post-process a division result. Beyond fixDest(), this detects the
// case where the quotient rounded up to the smallest normalized value:
// the division is redone in round-toward-zero to see whether the exact
// result is really subnormal and should underflow (and, in
// flush-to-zero mode, be flushed).
template <class fpType>
fpType
fixDivDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2)
{
    fpType mid = fixDest(flush, defaultNan, val, op1, op2);
    const bool single = (sizeof(fpType) == sizeof(float));
    const fpType junk = 0.0;
    // Check for +/- smallest normalized value (0x00800000 single,
    // 0x0010000000000000 double, plus their negations).
    if ((single && (val == bitsToFp(0x00800000, junk) ||
                    val == bitsToFp(0x80800000, junk))) ||
        (!single && (val == bitsToFp(ULL(0x0010000000000000), junk) ||
                     val == bitsToFp(ULL(0x8010000000000000), junk)))
        ) {
        // The asm barriers keep the compiler from moving the divide
        // across the rounding mode change (see comment at top of file).
        __asm__ __volatile__("" : "=m" (op1) : "m" (op1));
        fesetround(FeRoundZero);
        fpType temp = 0.0;
        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
        temp = op1 / op2;
        if (flushToZero(temp)) {
            feraiseexcept(FeUnderflow);
            if (flush) {
                feclearexcept(FeInexact);
                mid = temp;
            }
        }
        __asm__ __volatile__("" :: "m" (temp));
    }
    return mid;
}

template
float fixDivDest<float>(bool flush, bool defaultNan,
                        float val, float op1, float op2);
template
double fixDivDest<double>(bool flush, bool defaultNan,
                          double val, double op1, double op2);
263
// Narrow a double precision result to single precision, propagating
// NaN payloads, honoring flush-to-zero/default-NaN settings from
// fpscr, and correcting results that rounded into the smallest
// normalized float.
float
fixFpDFpSDest(FPSCR fpscr, double val)
{
    const float junk = 0.0;
    float op1 = 0.0;
    if (std::isnan(val)) {
        uint64_t valBits = fpToBits(val);
        // Repack the double's sign and top payload bits into single
        // precision NaN form so fixDest can propagate them.
        uint32_t op1Bits = bits(valBits, 50, 29) |
                           (mask(9) << 22) |
                           (bits(valBits, 63) << 31);
        op1 = bitsToFp(op1Bits, junk);
    }
    float mid = fixDest(fpscr.fz, fpscr.dn, (float)val, op1);
    // In flush-to-zero mode an underflow that was also inexact reports
    // only underflow.
    if (fpscr.fz && fetestexcept(FeUnderflow | FeInexact) ==
                    (FeUnderflow | FeInexact)) {
        feclearexcept(FeInexact);
    }
    // If the narrowing rounded to +/- the smallest normalized float,
    // redo the conversion toward zero to see whether the exact result
    // is subnormal and should underflow/flush.
    if (mid == bitsToFp(0x00800000, junk) ||
        mid == bitsToFp(0x80800000, junk)) {
        // asm barriers pin the conversion around the rounding mode
        // change (see comment at top of file).
        __asm__ __volatile__("" : "=m" (val) : "m" (val));
        fesetround(FeRoundZero);
        float temp = 0.0;
        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
        temp = val;
        if (flushToZero(temp)) {
            feraiseexcept(FeUnderflow);
            if (fpscr.fz) {
                feclearexcept(FeInexact);
                mid = temp;
            }
        }
        __asm__ __volatile__("" :: "m" (temp));
    }
    return mid;
}
299
// Widen a single precision result to double precision, propagating NaN
// payloads and correcting results that land exactly on the smallest
// normalized double.
// NOTE(review): unlike fixFpDFpSDest there is no fz-based FeInexact
// cleanup after fixDest here — presumably because widening can't be
// inexact; confirm against the ARM ARM.
double
fixFpSFpDDest(FPSCR fpscr, float val)
{
    const double junk = 0.0;
    double op1 = 0.0;
    if (std::isnan(val)) {
        uint32_t valBits = fpToBits(val);
        // Repack the float's sign and payload bits into double
        // precision NaN form so fixDest can propagate them.
        uint64_t op1Bits = ((uint64_t)bits(valBits, 21, 0) << 29) |
                           (mask(12) << 51) |
                           ((uint64_t)bits(valBits, 31) << 63);
        op1 = bitsToFp(op1Bits, junk);
    }
    double mid = fixDest(fpscr.fz, fpscr.dn, (double)val, op1);
    // If the result is +/- the smallest normalized double, redo the
    // conversion toward zero to check for a flushable subnormal.
    if (mid == bitsToFp(ULL(0x0010000000000000), junk) ||
        mid == bitsToFp(ULL(0x8010000000000000), junk)) {
        // asm barriers pin the conversion around the rounding mode
        // change (see comment at top of file).
        __asm__ __volatile__("" : "=m" (val) : "m" (val));
        fesetround(FeRoundZero);
        double temp = 0.0;
        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
        temp = val;
        if (flushToZero(temp)) {
            feraiseexcept(FeUnderflow);
            if (fpscr.fz) {
                feclearexcept(FeInexact);
                mid = temp;
            }
        }
        __asm__ __volatile__("" :: "m" (temp));
    }
    return mid;
}
331
// Convert a single precision float to half precision, returned as the
// raw 16 bit encoding. rMode selects the VFP rounding mode, ahp
// selects the ARM alternative half precision format (no infinities or
// NaNs), and flush indicates flush-to-zero (denormal inputs are
// asserted against). Exception flags accumulate into fpscr.
uint16_t
vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan,
           uint32_t rMode, bool ahp, float op)
{
    uint32_t opBits = fpToBits(op);
    // Extract the operand.
    bool neg = bits(opBits, 31);
    uint32_t exponent = bits(opBits, 30, 23);
    uint32_t oldMantissa = bits(opBits, 22, 0);
    // Keep the top 10 mantissa bits; 'extra' holds the 13 dropped bits.
    uint32_t mantissa = oldMantissa >> (23 - 10);
    // Do the conversion.
    uint32_t extra = oldMantissa & mask(23 - 10);
    if (exponent == 0xff) {
        if (oldMantissa != 0) {
            // Nans.
            if (bits(mantissa, 9) == 0) {
                // Signalling nan.
                fpscr.ioc = 1;
            }
            if (ahp) {
                // Alternative format has no NaN encoding.
                mantissa = 0;
                exponent = 0;
                fpscr.ioc = 1;
            } else if (defaultNan) {
                mantissa = (1 << 9);
                exponent = 0x1f;
                neg = false;
            } else {
                // Propagate the payload, quieted.
                exponent = 0x1f;
                mantissa |= (1 << 9);
            }
        } else {
            // Infinities.
            exponent = 0x1F;
            if (ahp) {
                // Alternative format has no infinity; saturate.
                fpscr.ioc = 1;
                mantissa = 0x3ff;
            } else {
                mantissa = 0;
            }
        }
    } else if (exponent == 0 && oldMantissa == 0) {
        // Zero, don't need to do anything.
    } else {
        // Normalized or denormalized numbers.

        bool inexact = (extra != 0);

        if (exponent == 0) {
            // Denormalized.

            // If flush to zero is on, this shouldn't happen.
            assert(!flush);

            // Check for underflow
            if (inexact || fpscr.ufe)
                fpscr.ufc = 1;

            // Handle rounding.
            // NOTE(review): 'extra' is 13 bits wide here, so the
            // round-to-nearest midpoint would be (1 << 12), not
            // (1 << 9) — confirm against the ARM ARM pseudocode.
            unsigned mode = rMode;
            if ((mode == VfpRoundUpward && !neg && extra) ||
                (mode == VfpRoundDown && neg && extra) ||
                (mode == VfpRoundNearest &&
                 (extra > (1 << 9) ||
                  (extra == (1 << 9) && bits(mantissa, 0))))) {
                mantissa++;
            }

            // See if the number became normalized after rounding.
            if (mantissa == (1 << 10)) {
                mantissa = 0;
                exponent = 1;
            }
        } else {
            // Normalized.

            // We need to track the dropped bits differently since
            // more can be dropped by denormalizing.
            bool topOne = bits(extra, 12);
            bool restZeros = bits(extra, 11, 0) == 0;

            if (exponent <= (127 - 15)) {
                // The result is too small. Denormalize.
                mantissa |= (1 << 10);
                while (mantissa && exponent <= (127 - 15)) {
                    restZeros = restZeros && !topOne;
                    topOne = bits(mantissa, 0);
                    mantissa = mantissa >> 1;
                    exponent++;
                }
                if (topOne || !restZeros)
                    inexact = true;
                exponent = 0;
            } else {
                // Change bias.
                exponent -= (127 - 15);
            }

            if (exponent == 0 && (inexact || fpscr.ufe)) {
                // Underflow
                fpscr.ufc = 1;
            }

            // Handle rounding.
            unsigned mode = rMode;
            bool nonZero = topOne || !restZeros;
            if ((mode == VfpRoundUpward && !neg && nonZero) ||
                (mode == VfpRoundDown && neg && nonZero) ||
                (mode == VfpRoundNearest && topOne &&
                 (!restZeros || bits(mantissa, 0)))) {
                mantissa++;
            }

            // See if we rounded up and need to bump the exponent.
            if (mantissa == (1 << 10)) {
                mantissa = 0;
                exponent++;
            }

            // Deal with overflow
            if (ahp) {
                if (exponent >= 0x20) {
                    // Alternative format saturates to max magnitude.
                    exponent = 0x1f;
                    mantissa = 0x3ff;
                    fpscr.ioc = 1;
                    // Suppress inexact exception.
                    inexact = false;
                }
            } else {
                if (exponent >= 0x1f) {
                    if ((mode == VfpRoundNearest) ||
                        (mode == VfpRoundUpward && !neg) ||
                        (mode == VfpRoundDown && neg)) {
                        // Overflow to infinity.
                        exponent = 0x1f;
                        mantissa = 0;
                    } else {
                        // Overflow to max normal.
                        exponent = 0x1e;
                        mantissa = 0x3ff;
                    }
                    fpscr.ofc = 1;
                    inexact = true;
                }
            }
        }

        if (inexact) {
            fpscr.ixc = 1;
        }
    }
    // Reassemble and install the result.
    uint32_t result = bits(mantissa, 9, 0);
    replaceBits(result, 14, 10, exponent);
    if (neg)
        result |= (1 << 15);
    return result;
}
490
// Convert a half precision value (raw 16 bit encoding) to a single
// precision float. ahp selects the ARM alternative half precision
// format, where exponent 0x1f encodes an ordinary number instead of
// infinity/NaN. Sets fpscr.ioc for signalling NaN inputs. The
// conversion itself is exact, so no rounding mode is needed.
float
vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op)
{
    float junk = 0.0;
    // Extract the bitfields.
    bool neg = bits(op, 15);
    uint32_t exponent = bits(op, 14, 10);
    uint32_t mantissa = bits(op, 9, 0);
    // Do the conversion.
    if (exponent == 0) {
        if (mantissa != 0) {
            // Normalize the value.
            exponent = exponent + (127 - 15) + 1;
            while (mantissa < (1 << 10)) {
                mantissa = mantissa << 1;
                exponent--;
            }
        }
        mantissa = mantissa << (23 - 10);
    } else if (exponent == 0x1f && !ahp) {
        // Infinities and nans.
        exponent = 0xff;
        if (mantissa != 0) {
            // Nans.
            mantissa = mantissa << (23 - 10);
            if (bits(mantissa, 22) == 0) {
                // Signalling nan.
                fpscr.ioc = 1;
                // Quiet the NaN by setting the top mantissa bit.
                mantissa |= (1 << 22);
            }
            if (defaultNan) {
                // Drop the payload, keeping only the quiet bit.
                mantissa &= ~mask(22);
                neg = false;
            }
        }
    } else {
        // Normal number: rebias exponent, widen mantissa.
        exponent = exponent + (127 - 15);
        mantissa = mantissa << (23 - 10);
    }
    // Reassemble the result.
    uint32_t result = bits(mantissa, 22, 0);
    replaceBits(result, 30, 23, exponent);
    if (neg)
        result |= (1 << 31);
    return bitsToFp(result, junk);
}
537
// Convert a single precision float to a fixed point value with 'imm'
// fractional bits. 'half' selects a 16 bit (vs 32 bit) result,
// 'isSigned' a signed result, and 'rzero' forces round-toward-zero
// instead of the current host rounding mode. Out-of-range values
// saturate, raising FeInvalid and suppressing FeInexact.
uint64_t
vfpFpSToFixed(float val, bool isSigned, bool half,
              uint8_t imm, bool rzero)
{
    int rmode = rzero ? FeRoundZero : fegetround();
    __asm__ __volatile__("" : "=m" (rmode) : "m" (rmode));
    // Scale by 2^imm in round-to-nearest, then restore the target
    // rounding mode for the actual conversion. The asm barriers keep
    // the compiler from reordering around the mode changes (see
    // comment at top of file).
    fesetround(FeRoundNearest);
    val = val * powf(2.0, imm);
    __asm__ __volatile__("" : "=m" (val) : "m" (val));
    fesetround(rmode);
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (val) : "m" (val));
    float origVal = val;
    val = rintf(val);
    int fpType = std::fpclassify(val);
    if (fpType == FP_SUBNORMAL || fpType == FP_NAN) {
        // NaN converts to zero with invalid; subnormal rounds to zero.
        if (fpType == FP_NAN) {
            feraiseexcept(FeInvalid);
        }
        val = 0.0;
    } else if (origVal != val) {
        // rintf rounded; nudge the result if it went the wrong way for
        // the selected mode (defensive against host rint behavior).
        switch (rmode) {
          case FeRoundNearest:
            if (origVal - val > 0.5)
                val += 1.0;
            else if (val - origVal > 0.5)
                val -= 1.0;
            break;
          case FeRoundDown:
            if (origVal < val)
                val -= 1.0;
            break;
          case FeRoundUpward:
            if (origVal > val)
                val += 1.0;
            break;
        }
        feraiseexcept(FeInexact);
    }

    // Comparisons are done in double so large floats aren't distorted.
    if (isSigned) {
        if (half) {
            // Saturate to [-2^15, 2^15 - 1].
            if ((double)val < (int16_t)(1 << 15)) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return (int16_t)(1 << 15);
            }
            if ((double)val > (int16_t)mask(15)) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return (int16_t)mask(15);
            }
            return (int16_t)val;
        } else {
            // Saturate to [-2^31, 2^31 - 1].
            if ((double)val < (int32_t)(1 << 31)) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return (int32_t)(1 << 31);
            }
            if ((double)val > (int32_t)mask(31)) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return (int32_t)mask(31);
            }
            return (int32_t)val;
        }
    } else {
        if (half) {
            // Saturate to [0, 2^16 - 1].
            if ((double)val < 0) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return 0;
            }
            if ((double)val > (mask(16))) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return mask(16);
            }
            return (uint16_t)val;
        } else {
            // Saturate to [0, 2^32 - 1].
            if ((double)val < 0) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return 0;
            }
            if ((double)val > (mask(32))) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return mask(32);
            }
            return (uint32_t)val;
        }
    }
}
632
// Convert an unsigned fixed point value with 'imm' fractional bits to
// single precision. 'half' selects a 16 bit source.
float
vfpUFixedToFpS(bool flush, bool defaultNan,
        uint32_t val, bool half, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (half)
        val = (uint16_t)val;
    float scale = powf(2.0, imm);
    // asm barriers keep the exception state manipulation ordered
    // around the computation of 'scale' (see comment at top of file).
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (float)val, scale);
}
646
// Convert a signed fixed point value with 'imm' fractional bits to
// single precision. 'half' sign-extends from a 16 bit source.
float
vfpSFixedToFpS(bool flush, bool defaultNan,
        int32_t val, bool half, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (half)
        val = sext<16>(val & mask(16));
    float scale = powf(2.0, imm);
    // asm barriers keep the exception state manipulation ordered
    // around the computation of 'scale' (see comment at top of file).
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (float)val, scale);
}
660
// Convert a double precision float to a fixed point value with 'imm'
// fractional bits. Mirrors vfpFpSToFixed: 'half'/'isSigned' select the
// result width and signedness, 'rzero' forces round-toward-zero.
// Out-of-range values saturate with FeInvalid, suppressing FeInexact.
uint64_t
vfpFpDToFixed(double val, bool isSigned, bool half,
              uint8_t imm, bool rzero)
{
    int rmode = rzero ? FeRoundZero : fegetround();
    // Scale by 2^imm in round-to-nearest, then restore the target
    // rounding mode for the conversion itself.
    fesetround(FeRoundNearest);
    val = val * pow(2.0, imm);
    __asm__ __volatile__("" : "=m" (val) : "m" (val));
    fesetround(rmode);
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (val) : "m" (val));
    double origVal = val;
    val = rint(val);
    int fpType = std::fpclassify(val);
    if (fpType == FP_SUBNORMAL || fpType == FP_NAN) {
        // NaN converts to zero with invalid; subnormal rounds to zero.
        if (fpType == FP_NAN) {
            feraiseexcept(FeInvalid);
        }
        val = 0.0;
    } else if (origVal != val) {
        // rint rounded; nudge the result if it went the wrong way for
        // the selected mode (defensive against host rint behavior).
        switch (rmode) {
          case FeRoundNearest:
            if (origVal - val > 0.5)
                val += 1.0;
            else if (val - origVal > 0.5)
                val -= 1.0;
            break;
          case FeRoundDown:
            if (origVal < val)
                val -= 1.0;
            break;
          case FeRoundUpward:
            if (origVal > val)
                val += 1.0;
            break;
        }
        feraiseexcept(FeInexact);
    }
    if (isSigned) {
        if (half) {
            // Saturate to [-2^15, 2^15 - 1].
            if (val < (int16_t)(1 << 15)) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return (int16_t)(1 << 15);
            }
            if (val > (int16_t)mask(15)) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return (int16_t)mask(15);
            }
            return (int16_t)val;
        } else {
            // Saturate to [-2^31, 2^31 - 1].
            if (val < (int32_t)(1 << 31)) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return (int32_t)(1 << 31);
            }
            if (val > (int32_t)mask(31)) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return (int32_t)mask(31);
            }
            return (int32_t)val;
        }
    } else {
        if (half) {
            // Saturate to [0, 2^16 - 1].
            if (val < 0) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return 0;
            }
            if (val > mask(16)) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return mask(16);
            }
            return (uint16_t)val;
        } else {
            // Saturate to [0, 2^32 - 1].
            if (val < 0) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return 0;
            }
            if (val > mask(32)) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return mask(32);
            }
            return (uint32_t)val;
        }
    }
}
753
// Convert an unsigned fixed point value with 'imm' fractional bits to
// double precision. 'half' selects a 16 bit source.
double
vfpUFixedToFpD(bool flush, bool defaultNan,
        uint32_t val, bool half, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (half)
        val = (uint16_t)val;
    double scale = pow(2.0, imm);
    // asm barriers keep the exception state manipulation ordered
    // around the computation of 'scale' (see comment at top of file).
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (double)val, scale);
}
767
// Convert a signed fixed point value with 'imm' fractional bits to
// double precision. 'half' sign-extends from a 16 bit source.
double
vfpSFixedToFpD(bool flush, bool defaultNan,
        int32_t val, bool half, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (half)
        val = sext<16>(val & mask(16));
    double scale = pow(2.0, imm);
    // asm barriers keep the exception state manipulation ordered
    // around the computation of 'scale' (see comment at top of file).
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (double)val, scale);
}
781
// Magic formula from the architecture reference manual, originally
// called recip_sqrt_estimate: take 1/sqrt() of the midpoint of the
// operand's 1/512 (or 1/256) wide bucket and round the result to
// 8 fractional bits.
static double
recipSqrtEstimate(double a)
{
    double root;
    if (a < 0.5) {
        const int64_t idx = (int64_t)(a * 512.0);
        root = 1.0 / sqrt(((double)idx + 0.5) / 512.0);
    } else {
        const int64_t idx = (int64_t)(a * 256.0);
        root = 1.0 / sqrt(((double)idx + 0.5) / 256.0);
    }
    // Quantize to the nearest multiple of 1/256.
    const int64_t fixed = (int64_t)(256.0 * root + 0.5);
    return (double)fixed / 256.0;
}
799
// This function is only intended for use in Neon instructions because
// it ignores certain bits in the FPSCR.
// Computes the VRSQRTE reciprocal square root estimate of 'op',
// handling NaN/zero/negative/infinite specially per the architecture.
float
fprSqrtEstimate(FPSCR &fpscr, float op)
{
    const uint32_t qnan = 0x7fc00000;
    float junk = 0.0;
    int fpClass = std::fpclassify(op);
    if (fpClass == FP_NAN) {
        // Signalling NaNs additionally set the invalid op flag.
        if ((fpToBits(op) & qnan) != qnan)
            fpscr.ioc = 1;
        return bitsToFp(qnan, junk);
    } else if (fpClass == FP_ZERO) {
        fpscr.dzc = 1;
        // Return infinity with the same sign as the operand.
        return bitsToFp((std::signbit(op) << 31) |
                       (0xFF << 23) | (0 << 0), junk);
    } else if (std::signbit(op)) {
        // Set invalid op bit.
        fpscr.ioc = 1;
        return bitsToFp(qnan, junk);
    } else if (fpClass == FP_INFINITE) {
        return 0.0;
    } else {
        // Repack the mantissa into a double scaled into [0.25, 1.0),
        // choosing exponent 0x3fd or 0x3fe based on the LSB of the
        // original exponent (bit 23).
        uint64_t opBits = fpToBits(op);
        double scaled;
        if (bits(opBits, 23)) {
            scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
                              (ULL(0x3fd) << 52) | (bits(opBits, 31) << 63),
                              (double)0.0);
        } else {
            scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
                              (ULL(0x3fe) << 52) | (bits(opBits, 31) << 63),
                              (double)0.0);
        }
        // Halve and rebias the exponent for the square root.
        uint64_t resultExp = (380 - bits(opBits, 30, 23)) / 2;

        uint64_t estimate = fpToBits(recipSqrtEstimate(scaled));

        // Reassemble: estimate's sign, computed exponent, and the top
        // mantissa bits of the estimate.
        return bitsToFp((bits(estimate, 63) << 31) |
                        (bits(resultExp, 7, 0) << 23) |
                        (bits(estimate, 51, 29) << 0), junk);
    }
}
844
845uint32_t
846unsignedRSqrtEstimate(uint32_t op)
847{
848    if (bits(op, 31, 30) == 0) {
849        return -1;
850    } else {
851        double dpOp;
852        if (bits(op, 31)) {
853            dpOp = bitsToFp((ULL(0) << 63) |
854                            (ULL(0x3fe) << 52) |
855                            (bits((uint64_t)op, 30, 0) << 21) |
856                            (0 << 0), (double)0.0);
857        } else {
858            dpOp = bitsToFp((ULL(0) << 63) |
859                            (ULL(0x3fd) << 52) |
860                            (bits((uint64_t)op, 29, 0) << 22) |
861                            (0 << 0), (double)0.0);
862        }
863        uint64_t estimate = fpToBits(recipSqrtEstimate(dpOp));
864        return (1 << 31) | bits(estimate, 51, 21);
865    }
866}
867
// Magic formula from the architecture reference manual, originally
// called recip_estimate: take the reciprocal of the midpoint of the
// operand's 1/512 wide bucket and round the result to 8 fractional
// bits.
static double
recipEstimate(double a)
{
    const int64_t idx = (int64_t)(a * 512.0);
    const double recip = 1.0 / (((double)idx + 0.5) / 512.0);
    // Quantize to the nearest multiple of 1/256.
    const int64_t fixed = (int64_t)(256.0 * recip + 0.5);
    return (double)fixed / 256.0;
}
881
// This function is only intended for use in Neon instructions because
// it ignores certain bits in the FPSCR.
// Computes the VRECPE reciprocal estimate of 'op', handling
// NaN/infinity/zero/large-magnitude operands specially.
float
fpRecipEstimate(FPSCR &fpscr, float op)
{
    const uint32_t qnan = 0x7fc00000;
    float junk = 0.0;
    int fpClass = std::fpclassify(op);
    if (fpClass == FP_NAN) {
        // Signalling NaNs additionally set the invalid op flag.
        if ((fpToBits(op) & qnan) != qnan)
            fpscr.ioc = 1;
        return bitsToFp(qnan, junk);
    } else if (fpClass == FP_INFINITE) {
        // 1/inf: signed zero.
        return bitsToFp(std::signbit(op) << 31, junk);
    } else if (fpClass == FP_ZERO) {
        fpscr.dzc = 1;
        // Return infinity with the same sign as the operand.
        return bitsToFp((std::signbit(op) << 31) |
                       (0xFF << 23) | (0 << 0), junk);
    } else if (fabs(op) >= pow(2.0, 126)) {
        // Result would be subnormal; underflow to signed zero.
        fpscr.ufc = 1;
        return bitsToFp(std::signbit(op) << 31, junk);
    } else {
        // Repack the mantissa into a double scaled into [0.5, 1.0).
        uint64_t opBits = fpToBits(op);
        double scaled;
        scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
                          (ULL(0x3fe) << 52) | (ULL(0) << 63),
                          (double)0.0);
        // Negate and rebias the exponent for the reciprocal.
        uint64_t resultExp = 253 - bits(opBits, 30, 23);

        uint64_t estimate = fpToBits(recipEstimate(scaled));

        // Reassemble: operand's sign, computed exponent, and the top
        // mantissa bits of the estimate.
        return bitsToFp((bits(opBits, 31) << 31) |
                        (bits(resultExp, 7, 0) << 23) |
                        (bits(estimate, 51, 29) << 0), junk);
    }
}
919
920uint32_t
921unsignedRecipEstimate(uint32_t op)
922{
923    if (bits(op, 31) == 0) {
924        return -1;
925    } else {
926        double dpOp;
927        dpOp = bitsToFp((ULL(0) << 63) |
928                        (ULL(0x3fe) << 52) |
929                        (bits((uint64_t)op, 30, 0) << 21) |
930                        (0 << 0), (double)0.0);
931        uint64_t estimate = fpToBits(recipEstimate(dpOp));
932        return (1 << 31) | bits(estimate, 51, 21);
933    }
934}
935
936template <class fpType>
937fpType
938FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
939                  fpType op1, fpType op2) const
940{
941    done = true;
942    fpType junk = 0.0;
943    fpType dest = 0.0;
944    const bool single = (sizeof(fpType) == sizeof(float));
945    const uint64_t qnan =
946        single ? 0x7fc00000 : ULL(0x7ff8000000000000);
947    const bool nan1 = std::isnan(op1);
948    const bool nan2 = std::isnan(op2);
949    const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
950    const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
951    if (nan1 || nan2) {
952        if (defaultNan) {
953            dest = bitsToFp(qnan, junk);
954        }  else if (signal1) {
955            dest = bitsToFp(fpToBits(op1) | qnan, junk);
956        } else if (signal2) {
957            dest = bitsToFp(fpToBits(op2) | qnan, junk);
958        } else if (nan1) {
959            dest = op1;
960        } else if (nan2) {
961            dest = op2;
962        }
963        if (signal1 || signal2) {
964            fpscr.ioc = 1;
965        }
966    } else {
967        done = false;
968    }
969    return dest;
970}
971
972template
973float FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
974                        float op1, float op2) const;
975template
976double FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
977                         double op1, double op2) const;
978
// Apply a binary floating point operation with ARM VFP semantics on top
// of the host's FP hardware: set the host rounding mode, run func, then
// correct the cases where the host and ARM disagree (NaN propagation,
// flush-to-zero, and underflow detection order).
template <class fpType>
fpType
FpOp::binaryOp(FPSCR &fpscr, fpType op1, fpType op2,
               fpType (*func)(fpType, fpType),
               bool flush, bool defaultNan, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    // Flush denormal inputs to zero and record the input-denormal flag.
    if (flush && flushToZero(op1, op2))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    // Empty asm with memory operands: a compiler barrier keeping the
    // call to func from being reordered or constant-folded across the
    // host FP state manipulation around it.
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (state)
                             : "m" (op1), "m" (op2), "m" (state));
    fpType dest = func(op1, op2);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

    // Get NAN behavior right. This varies between x86 and ARM.
    if (std::isnan(dest)) {
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan1 = std::isnan(op1);
        const bool nan2 = std::isnan(op2);
        // Signaling NaNs have the quiet bit clear.
        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
        if ((!nan1 && !nan2) || (defaultNan == 1)) {
            // NaN generated by the operation itself, or default-NaN
            // mode: return the default quiet NaN.
            dest = bitsToFp(qnan, junk);
        } else if (signal1) {
            // Propagate the first signaling NaN, quieted.
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            dest = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (nan1) {
            dest = op1;
        } else if (nan2) {
            dest = op2;
        }
    } else if (flush && flushToZero(dest)) {
        // The denormal result was flushed; signal underflow explicitly.
        feraiseexcept(FeUnderflow);
    } else if ((
                (single && (dest == bitsToFp(0x00800000, junk) ||
                     dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                    (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                     dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before- rounding
         * in ARM and -after- rounding in x86.
         */
        // dest is exactly +/- the smallest normal, so it may have been
        // rounded up from a denormal. Redo the operation rounding
        // toward zero; if that result flushes, ARM would have seen an
        // underflow, so keep the flushed value instead.
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2)
                                 : "m" (op1), "m" (op2));
        fpType temp = func(op1, op2);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    // Restore the host FP state saved above and update fpscr from the
    // accumulated host exception flags (see finishVfp).
    finishVfp(fpscr, state, flush);
    return dest;
}

template
float FpOp::binaryOp(FPSCR &fpscr, float op1, float op2,
                     float (*func)(float, float),
                     bool flush, bool defaultNan, uint32_t rMode) const;
template
double FpOp::binaryOp(FPSCR &fpscr, double op1, double op2,
                      double (*func)(double, double),
                      bool flush, bool defaultNan, uint32_t rMode) const;
1049
// Apply a unary floating point operation with ARM VFP semantics on top
// of the host's FP hardware; same structure as binaryOp, except the
// default-NaN choice comes from fpscr.dn rather than a parameter.
template <class fpType>
fpType
FpOp::unaryOp(FPSCR &fpscr, fpType op1, fpType (*func)(fpType),
              bool flush, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    // Flush a denormal input to zero and record the input-denormal flag.
    if (flush && flushToZero(op1))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    // Empty asm with memory operands: a compiler barrier keeping the
    // call to func from being reordered or constant-folded across the
    // host FP state manipulation around it.
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (state)
                             : "m" (op1), "m" (state));
    fpType dest = func(op1);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

    // Get NAN behavior right. This varies between x86 and ARM.
    if (std::isnan(dest)) {
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan = std::isnan(op1);
        // Operation-generated NaN or default-NaN mode: return the
        // default quiet NaN; otherwise propagate the input NaN quieted.
        if (!nan || fpscr.dn == 1) {
            dest = bitsToFp(qnan, junk);
        } else if (nan) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        }
    } else if (flush && flushToZero(dest)) {
        // The denormal result was flushed; signal underflow explicitly.
        feraiseexcept(FeUnderflow);
    } else if ((
                (single && (dest == bitsToFp(0x00800000, junk) ||
                     dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                    (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                     dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before- rounding
         * in ARM and -after- rounding in x86.
         */
        // dest is exactly +/- the smallest normal, so it may have been
        // rounded up from a denormal. Redo the operation rounding
        // toward zero; if that result flushes, keep the flushed value.
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1) : "m" (op1));
        fpType temp = func(op1);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    // Restore the host FP state saved above and update fpscr from the
    // accumulated host exception flags (see finishVfp).
    finishVfp(fpscr, state, flush);
    return dest;
}

template
float FpOp::unaryOp(FPSCR &fpscr, float op1, float (*func)(float),
                    bool flush, uint32_t rMode) const;
template
double FpOp::unaryOp(FPSCR &fpscr, double op1, double (*func)(double),
                     bool flush, uint32_t rMode) const;
1107
1108IntRegIndex
1109VfpMacroOp::addStride(IntRegIndex idx, unsigned stride)
1110{
1111    if (wide) {
1112        stride *= 2;
1113    }
1114    unsigned offset = idx % 8;
1115    idx = (IntRegIndex)(idx - offset);
1116    offset += stride;
1117    idx = (IntRegIndex)(idx + (offset % 8));
1118    return idx;
1119}
1120
1121void
1122VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1, IntRegIndex &op2)
1123{
1124    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
1125    assert(!inScalarBank(dest));
1126    dest = addStride(dest, stride);
1127    op1 = addStride(op1, stride);
1128    if (!inScalarBank(op2)) {
1129        op2 = addStride(op2, stride);
1130    }
1131}
1132
1133void
1134VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1)
1135{
1136    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
1137    assert(!inScalarBank(dest));
1138    dest = addStride(dest, stride);
1139    if (!inScalarBank(op1)) {
1140        op1 = addStride(op1, stride);
1141    }
1142}
1143
1144void
1145VfpMacroOp::nextIdxs(IntRegIndex &dest)
1146{
1147    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
1148    assert(!inScalarBank(dest));
1149    dest = addStride(dest, stride);
1150}
1151
1152}
1153