vfp.cc revision 12104:edd63f9c6184
1/*
2 * Copyright (c) 2010-2013 ARM Limited
3 * All rights reserved
4 *
5 * The license below extends only to copyright in the software and shall
6 * not be construed as granting a license to any other intellectual
7 * property including but not limited to intellectual property relating
8 * to a hardware implementation of the functionality of the software
9 * licensed hereunder.  You may use the software subject to the license
10 * terms below provided that you ensure that this notice is replicated
11 * unmodified and in its entirety in all distributions of the software,
12 * modified or unmodified, in source code or in binary form.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions are
16 * met: redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer;
18 * redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution;
21 * neither the name of the copyright holders nor the names of its
22 * contributors may be used to endorse or promote products derived from
23 * this software without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
26 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
27 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
28 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
29 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
30 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
31 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
32 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
33 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
34 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
35 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 *
37 * Authors: Gabe Black
38 */
39
40#include "arch/arm/insts/vfp.hh"
41
42/*
43 * The asm statements below are to keep gcc from reordering code. Otherwise
44 * the rounding mode might be set after the operation it was intended for, the
45 * exception bits read before it, etc.
46 */
47
48std::string
49FpCondCompRegOp::generateDisassembly(
50        Addr pc, const SymbolTable *symtab) const
51{
52    std::stringstream ss;
53    printMnemonic(ss, "", false);
54    printIntReg(ss, op1);
55    ccprintf(ss, ", ");
56    printIntReg(ss, op2);
57    ccprintf(ss, ", #%d", defCc);
58    ccprintf(ss, ", ");
59    printCondition(ss, condCode, true);
60    return ss.str();
61}
62
63std::string
64FpCondSelOp::generateDisassembly(
65        Addr pc, const SymbolTable *symtab) const
66{
67    std::stringstream ss;
68    printMnemonic(ss, "", false);
69    printIntReg(ss, dest);
70    ccprintf(ss, ", ");
71    printIntReg(ss, op1);
72    ccprintf(ss, ", ");
73    printIntReg(ss, op2);
74    ccprintf(ss, ", ");
75    printCondition(ss, condCode, true);
76    return ss.str();
77}
78
79std::string
80FpRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
81{
82    std::stringstream ss;
83    printMnemonic(ss);
84    printFloatReg(ss, dest);
85    ss << ", ";
86    printFloatReg(ss, op1);
87    return ss.str();
88}
89
90std::string
91FpRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
92{
93    std::stringstream ss;
94    printMnemonic(ss);
95    printFloatReg(ss, dest);
96    ccprintf(ss, ", #%d", imm);
97    return ss.str();
98}
99
100std::string
101FpRegRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
102{
103    std::stringstream ss;
104    printMnemonic(ss);
105    printFloatReg(ss, dest);
106    ss << ", ";
107    printFloatReg(ss, op1);
108    ccprintf(ss, ", #%d", imm);
109    return ss.str();
110}
111
112std::string
113FpRegRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
114{
115    std::stringstream ss;
116    printMnemonic(ss);
117    printFloatReg(ss, dest);
118    ss << ", ";
119    printFloatReg(ss, op1);
120    ss << ", ";
121    printFloatReg(ss, op2);
122    return ss.str();
123}
124
125std::string
126FpRegRegRegCondOp::generateDisassembly(Addr pc, const SymbolTable *symtab)
127    const
128{
129    std::stringstream ss;
130    printMnemonic(ss);
131    printCondition(ss, cond);
132    printFloatReg(ss, dest);
133    ss << ", ";
134    printFloatReg(ss, op1);
135    ss << ", ";
136    printFloatReg(ss, op2);
137    return ss.str();
138}
139
140std::string
141FpRegRegRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
142{
143    std::stringstream ss;
144    printMnemonic(ss);
145    printFloatReg(ss, dest);
146    ss << ", ";
147    printFloatReg(ss, op1);
148    ss << ", ";
149    printFloatReg(ss, op2);
150    ss << ", ";
151    printFloatReg(ss, op3);
152    return ss.str();
153}
154
155std::string
156FpRegRegRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
157{
158    std::stringstream ss;
159    printMnemonic(ss);
160    printFloatReg(ss, dest);
161    ss << ", ";
162    printFloatReg(ss, op1);
163    ss << ", ";
164    printFloatReg(ss, op2);
165    ccprintf(ss, ", #%d", imm);
166    return ss.str();
167}
168
169namespace ArmISA
170{
171
172VfpSavedState
173prepFpState(uint32_t rMode)
174{
175    int roundingMode = fegetround();
176    feclearexcept(FeAllExceptions);
177    switch (rMode) {
178      case VfpRoundNearest:
179        fesetround(FeRoundNearest);
180        break;
181      case VfpRoundUpward:
182        fesetround(FeRoundUpward);
183        break;
184      case VfpRoundDown:
185        fesetround(FeRoundDown);
186        break;
187      case VfpRoundZero:
188        fesetround(FeRoundZero);
189        break;
190    }
191    return roundingMode;
192}
193
194void
195finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush, FPSCR mask)
196{
197    int exceptions = fetestexcept(FeAllExceptions);
198    bool underflow = false;
199    if ((exceptions & FeInvalid) && mask.ioc) {
200        fpscr.ioc = 1;
201    }
202    if ((exceptions & FeDivByZero) && mask.dzc) {
203        fpscr.dzc = 1;
204    }
205    if ((exceptions & FeOverflow) && mask.ofc) {
206        fpscr.ofc = 1;
207    }
208    if (exceptions & FeUnderflow) {
209        underflow = true;
210        if (mask.ufc)
211            fpscr.ufc = 1;
212    }
213    if ((exceptions & FeInexact) && !(underflow && flush) && mask.ixc) {
214        fpscr.ixc = 1;
215    }
216    fesetround(state);
217}
218
219template <class fpType>
220fpType
221fixDest(bool flush, bool defaultNan, fpType val, fpType op1)
222{
223    int fpClass = std::fpclassify(val);
224    fpType junk = 0.0;
225    if (fpClass == FP_NAN) {
226        const bool single = (sizeof(val) == sizeof(float));
227        const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
228        const bool nan = std::isnan(op1);
229        if (!nan || defaultNan) {
230            val = bitsToFp(qnan, junk);
231        } else if (nan) {
232            val = bitsToFp(fpToBits(op1) | qnan, junk);
233        }
234    } else if (fpClass == FP_SUBNORMAL && flush == 1) {
235        // Turn val into a zero with the correct sign;
236        uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1);
237        val = bitsToFp(fpToBits(val) & bitMask, junk);
238        feclearexcept(FeInexact);
239        feraiseexcept(FeUnderflow);
240    }
241    return val;
242}
243
244template
245float fixDest<float>(bool flush, bool defaultNan, float val, float op1);
246template
247double fixDest<double>(bool flush, bool defaultNan, double val, double op1);
248
249template <class fpType>
250fpType
251fixDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2)
252{
253    int fpClass = std::fpclassify(val);
254    fpType junk = 0.0;
255    if (fpClass == FP_NAN) {
256        const bool single = (sizeof(val) == sizeof(float));
257        const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
258        const bool nan1 = std::isnan(op1);
259        const bool nan2 = std::isnan(op2);
260        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
261        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
262        if ((!nan1 && !nan2) || defaultNan) {
263            val = bitsToFp(qnan, junk);
264        } else if (signal1) {
265            val = bitsToFp(fpToBits(op1) | qnan, junk);
266        } else if (signal2) {
267            val = bitsToFp(fpToBits(op2) | qnan, junk);
268        } else if (nan1) {
269            val = op1;
270        } else if (nan2) {
271            val = op2;
272        }
273    } else if (fpClass == FP_SUBNORMAL && flush) {
274        // Turn val into a zero with the correct sign;
275        uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1);
276        val = bitsToFp(fpToBits(val) & bitMask, junk);
277        feclearexcept(FeInexact);
278        feraiseexcept(FeUnderflow);
279    }
280    return val;
281}
282
283template
284float fixDest<float>(bool flush, bool defaultNan,
285                     float val, float op1, float op2);
286template
287double fixDest<double>(bool flush, bool defaultNan,
288                       double val, double op1, double op2);
289
290template <class fpType>
291fpType
292fixDivDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2)
293{
294    fpType mid = fixDest(flush, defaultNan, val, op1, op2);
295    const bool single = (sizeof(fpType) == sizeof(float));
296    const fpType junk = 0.0;
297    if ((single && (val == bitsToFp(0x00800000, junk) ||
298                    val == bitsToFp(0x80800000, junk))) ||
299        (!single && (val == bitsToFp(ULL(0x0010000000000000), junk) ||
300                     val == bitsToFp(ULL(0x8010000000000000), junk)))
301        ) {
302        __asm__ __volatile__("" : "=m" (op1) : "m" (op1));
303        fesetround(FeRoundZero);
304        fpType temp = 0.0;
305        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
306        temp = op1 / op2;
307        if (flushToZero(temp)) {
308            feraiseexcept(FeUnderflow);
309            if (flush) {
310                feclearexcept(FeInexact);
311                mid = temp;
312            }
313        }
314        __asm__ __volatile__("" :: "m" (temp));
315    }
316    return mid;
317}
318
319template
320float fixDivDest<float>(bool flush, bool defaultNan,
321                        float val, float op1, float op2);
322template
323double fixDivDest<double>(bool flush, bool defaultNan,
324                          double val, double op1, double op2);
325
326float
327fixFpDFpSDest(FPSCR fpscr, double val)
328{
329    const float junk = 0.0;
330    float op1 = 0.0;
331    if (std::isnan(val)) {
332        uint64_t valBits = fpToBits(val);
333        uint32_t op1Bits = bits(valBits, 50, 29) |
334                           (mask(9) << 22) |
335                           (bits(valBits, 63) << 31);
336        op1 = bitsToFp(op1Bits, junk);
337    }
338    float mid = fixDest(fpscr.fz, fpscr.dn, (float)val, op1);
339    if (fpscr.fz && fetestexcept(FeUnderflow | FeInexact) ==
340                    (FeUnderflow | FeInexact)) {
341        feclearexcept(FeInexact);
342    }
343    if (mid == bitsToFp(0x00800000, junk) ||
344        mid == bitsToFp(0x80800000, junk)) {
345        __asm__ __volatile__("" : "=m" (val) : "m" (val));
346        fesetround(FeRoundZero);
347        float temp = 0.0;
348        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
349        temp = val;
350        if (flushToZero(temp)) {
351            feraiseexcept(FeUnderflow);
352            if (fpscr.fz) {
353                feclearexcept(FeInexact);
354                mid = temp;
355            }
356        }
357        __asm__ __volatile__("" :: "m" (temp));
358    }
359    return mid;
360}
361
362double
363fixFpSFpDDest(FPSCR fpscr, float val)
364{
365    const double junk = 0.0;
366    double op1 = 0.0;
367    if (std::isnan(val)) {
368        uint32_t valBits = fpToBits(val);
369        uint64_t op1Bits = ((uint64_t)bits(valBits, 21, 0) << 29) |
370                           (mask(12) << 51) |
371                           ((uint64_t)bits(valBits, 31) << 63);
372        op1 = bitsToFp(op1Bits, junk);
373    }
374    double mid = fixDest(fpscr.fz, fpscr.dn, (double)val, op1);
375    if (mid == bitsToFp(ULL(0x0010000000000000), junk) ||
376        mid == bitsToFp(ULL(0x8010000000000000), junk)) {
377        __asm__ __volatile__("" : "=m" (val) : "m" (val));
378        fesetround(FeRoundZero);
379        double temp = 0.0;
380        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
381        temp = val;
382        if (flushToZero(temp)) {
383            feraiseexcept(FeUnderflow);
384            if (fpscr.fz) {
385                feclearexcept(FeInexact);
386                mid = temp;
387            }
388        }
389        __asm__ __volatile__("" :: "m" (temp));
390    }
391    return mid;
392}
393
394static inline uint16_t
395vcvtFpFpH(FPSCR &fpscr, bool flush, bool defaultNan,
396          uint32_t rMode, bool ahp, uint64_t opBits, bool isDouble)
397{
398    uint32_t mWidth;
399    uint32_t eWidth;
400    uint32_t eHalfRange;
401    uint32_t sBitPos;
402
403    if (isDouble) {
404        mWidth = 52;
405        eWidth = 11;
406    } else {
407        mWidth = 23;
408        eWidth = 8;
409    }
410    sBitPos    = eWidth + mWidth;
411    eHalfRange = (1 << (eWidth-1)) - 1;
412
413    // Extract the operand.
414    bool neg = bits(opBits, sBitPos);
415    uint32_t exponent = bits(opBits, sBitPos-1, mWidth);
416    uint64_t oldMantissa = bits(opBits, mWidth-1, 0);
417    uint32_t mantissa = oldMantissa >> (mWidth - 10);
418    // Do the conversion.
419    uint64_t extra = oldMantissa & mask(mWidth - 10);
420    if (exponent == mask(eWidth)) {
421        if (oldMantissa != 0) {
422            // Nans.
423            if (bits(mantissa, 9) == 0) {
424                // Signalling nan.
425                fpscr.ioc = 1;
426            }
427            if (ahp) {
428                mantissa = 0;
429                exponent = 0;
430                fpscr.ioc = 1;
431            } else if (defaultNan) {
432                mantissa = (1 << 9);
433                exponent = 0x1f;
434                neg = false;
435            } else {
436                exponent = 0x1f;
437                mantissa |= (1 << 9);
438            }
439        } else {
440            // Infinities.
441            exponent = 0x1F;
442            if (ahp) {
443                fpscr.ioc = 1;
444                mantissa = 0x3ff;
445            } else {
446                mantissa = 0;
447            }
448        }
449    } else if (exponent == 0 && oldMantissa == 0) {
450        // Zero, don't need to do anything.
451    } else {
452        // Normalized or denormalized numbers.
453
454        bool inexact = (extra != 0);
455
456        if (exponent == 0) {
457            // Denormalized.
458            // If flush to zero is on, this shouldn't happen.
459            assert(!flush);
460
461            // Check for underflow
462            if (inexact || fpscr.ufe)
463                fpscr.ufc = 1;
464
465            // Handle rounding.
466            unsigned mode = rMode;
467            if ((mode == VfpRoundUpward && !neg && extra) ||
468                (mode == VfpRoundDown && neg && extra) ||
469                (mode == VfpRoundNearest &&
470                 (extra > (1 << 9) ||
471                  (extra == (1 << 9) && bits(mantissa, 0))))) {
472                mantissa++;
473            }
474
475            // See if the number became normalized after rounding.
476            if (mantissa == (1 << 10)) {
477                mantissa = 0;
478                exponent = 1;
479            }
480        } else {
481            // Normalized.
482
483            // We need to track the dropped bits differently since
484            // more can be dropped by denormalizing.
485            bool topOne = bits(extra, mWidth - 10 - 1);
486            bool restZeros = bits(extra, mWidth - 10 - 2, 0) == 0;
487
488            if (exponent <= (eHalfRange - 15)) {
489                // The result is too small. Denormalize.
490                mantissa |= (1 << 10);
491                while (mantissa && exponent <= (eHalfRange - 15)) {
492                    restZeros = restZeros && !topOne;
493                    topOne = bits(mantissa, 0);
494                    mantissa = mantissa >> 1;
495                    exponent++;
496                }
497                if (topOne || !restZeros)
498                    inexact = true;
499                exponent = 0;
500            } else {
501                // Change bias.
502                exponent -= (eHalfRange - 15);
503            }
504
505            if (exponent == 0 && (inexact || fpscr.ufe)) {
506                // Underflow
507                fpscr.ufc = 1;
508            }
509
510            // Handle rounding.
511            unsigned mode = rMode;
512            bool nonZero = topOne || !restZeros;
513            if ((mode == VfpRoundUpward && !neg && nonZero) ||
514                (mode == VfpRoundDown && neg && nonZero) ||
515                (mode == VfpRoundNearest && topOne &&
516                 (!restZeros || bits(mantissa, 0)))) {
517                mantissa++;
518            }
519
520            // See if we rounded up and need to bump the exponent.
521            if (mantissa == (1 << 10)) {
522                mantissa = 0;
523                exponent++;
524            }
525
526            // Deal with overflow
527            if (ahp) {
528                if (exponent >= 0x20) {
529                    exponent = 0x1f;
530                    mantissa = 0x3ff;
531                    fpscr.ioc = 1;
532                    // Supress inexact exception.
533                    inexact = false;
534                }
535            } else {
536                if (exponent >= 0x1f) {
537                    if ((mode == VfpRoundNearest) ||
538                        (mode == VfpRoundUpward && !neg) ||
539                        (mode == VfpRoundDown && neg)) {
540                        // Overflow to infinity.
541                        exponent = 0x1f;
542                        mantissa = 0;
543                    } else {
544                        // Overflow to max normal.
545                        exponent = 0x1e;
546                        mantissa = 0x3ff;
547                    }
548                    fpscr.ofc = 1;
549                    inexact = true;
550                }
551            }
552        }
553
554        if (inexact) {
555            fpscr.ixc = 1;
556        }
557    }
558    // Reassemble and install the result.
559    uint32_t result = bits(mantissa, 9, 0);
560    replaceBits(result, 14, 10, exponent);
561    if (neg)
562        result |= (1 << 15);
563    return result;
564}
565
566uint16_t
567vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan,
568           uint32_t rMode, bool ahp, float op)
569{
570    uint64_t opBits = fpToBits(op);
571    return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp, opBits, false);
572}
573
574uint16_t
575vcvtFpDFpH(FPSCR &fpscr, bool flush, bool defaultNan,
576           uint32_t rMode, bool ahp, double op)
577{
578    uint64_t opBits = fpToBits(op);
579    return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp, opBits, true);
580}
581
582static inline uint64_t
583vcvtFpHFp(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op, bool isDouble)
584{
585    uint32_t mWidth;
586    uint32_t eWidth;
587    uint32_t eHalfRange;
588    uint32_t sBitPos;
589
590    if (isDouble) {
591        mWidth = 52;
592        eWidth = 11;
593    } else {
594        mWidth = 23;
595        eWidth = 8;
596    }
597    sBitPos    = eWidth + mWidth;
598    eHalfRange = (1 << (eWidth-1)) - 1;
599
600    // Extract the bitfields.
601    bool neg = bits(op, 15);
602    uint32_t exponent = bits(op, 14, 10);
603    uint64_t mantissa = bits(op, 9, 0);
604    // Do the conversion.
605    if (exponent == 0) {
606        if (mantissa != 0) {
607            // Normalize the value.
608            exponent = exponent + (eHalfRange - 15) + 1;
609            while (mantissa < (1 << 10)) {
610                mantissa = mantissa << 1;
611                exponent--;
612            }
613        }
614        mantissa = mantissa << (mWidth - 10);
615    } else if (exponent == 0x1f && !ahp) {
616        // Infinities and nans.
617        exponent = mask(eWidth);
618        if (mantissa != 0) {
619            // Nans.
620            mantissa = mantissa << (mWidth - 10);
621            if (bits(mantissa, mWidth-1) == 0) {
622                // Signalling nan.
623                fpscr.ioc = 1;
624                mantissa |= (((uint64_t) 1) << (mWidth-1));
625            }
626            if (defaultNan) {
627                mantissa &= ~mask(mWidth-1);
628                neg = false;
629            }
630        }
631    } else {
632        exponent = exponent + (eHalfRange - 15);
633        mantissa = mantissa << (mWidth - 10);
634    }
635    // Reassemble the result.
636    uint64_t result = bits(mantissa, mWidth-1, 0);
637    replaceBits(result, sBitPos-1, mWidth, exponent);
638    if (neg) {
639        result |= (((uint64_t) 1) << sBitPos);
640    }
641    return result;
642}
643
644double
645vcvtFpHFpD(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op)
646{
647    double junk = 0.0;
648    uint64_t result;
649
650    result = vcvtFpHFp(fpscr, defaultNan, ahp, op, true);
651    return bitsToFp(result, junk);
652}
653
654float
655vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op)
656{
657    float junk = 0.0;
658    uint64_t result;
659
660    result = vcvtFpHFp(fpscr, defaultNan, ahp, op, false);
661    return bitsToFp(result, junk);
662}
663
664float
665vfpUFixedToFpS(bool flush, bool defaultNan,
666        uint64_t val, uint8_t width, uint8_t imm)
667{
668    fesetround(FeRoundNearest);
669    if (width == 16)
670        val = (uint16_t)val;
671    else if (width == 32)
672        val = (uint32_t)val;
673    else if (width != 64)
674        panic("Unsupported width %d", width);
675    float scale = powf(2.0, imm);
676    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
677    feclearexcept(FeAllExceptions);
678    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
679    return fixDivDest(flush, defaultNan, val / scale, (float)val, scale);
680}
681
682float
683vfpSFixedToFpS(bool flush, bool defaultNan,
684        int64_t val, uint8_t width, uint8_t imm)
685{
686    fesetround(FeRoundNearest);
687    if (width == 16)
688        val = sext<16>(val & mask(16));
689    else if (width == 32)
690        val = sext<32>(val & mask(32));
691    else if (width != 64)
692        panic("Unsupported width %d", width);
693
694    float scale = powf(2.0, imm);
695    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
696    feclearexcept(FeAllExceptions);
697    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
698    return fixDivDest(flush, defaultNan, val / scale, (float)val, scale);
699}
700
701
702double
703vfpUFixedToFpD(bool flush, bool defaultNan,
704        uint64_t val, uint8_t width, uint8_t imm)
705{
706    fesetround(FeRoundNearest);
707    if (width == 16)
708        val = (uint16_t)val;
709    else if (width == 32)
710        val = (uint32_t)val;
711    else if (width != 64)
712        panic("Unsupported width %d", width);
713
714    double scale = pow(2.0, imm);
715    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
716    feclearexcept(FeAllExceptions);
717    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
718    return fixDivDest(flush, defaultNan, val / scale, (double)val, scale);
719}
720
721double
722vfpSFixedToFpD(bool flush, bool defaultNan,
723        int64_t val, uint8_t width, uint8_t imm)
724{
725    fesetround(FeRoundNearest);
726    if (width == 16)
727        val = sext<16>(val & mask(16));
728    else if (width == 32)
729        val = sext<32>(val & mask(32));
730    else if (width != 64)
731        panic("Unsupported width %d", width);
732
733    double scale = pow(2.0, imm);
734    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
735    feclearexcept(FeAllExceptions);
736    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
737    return fixDivDest(flush, defaultNan, val / scale, (double)val, scale);
738}
739
// Reciprocal square root estimate, following a magic formula taken from
// the architecture reference manual, where it is called
// recip_sqrt_estimate. The input is quantised to steps of 1/512 below 0.5
// and 1/256 otherwise; the result is rounded to a multiple of 1/256.
static double
recipSqrtEstimate(double a)
{
    const double steps = (a < 0.5) ? 512.0 : 256.0;
    // Truncate to the quantisation step, then evaluate at its midpoint.
    const int64_t quantised = (int64_t)(a * steps);
    const double r = 1.0 / sqrt(((double)quantised + 0.5) / steps);
    const int64_t rounded = (int64_t)(256.0 * r + 0.5);
    return (double)rounded / 256.0;
}
757
758// This function is only intended for use in Neon instructions because
759// it ignores certain bits in the FPSCR.
760float
761fprSqrtEstimate(FPSCR &fpscr, float op)
762{
763    const uint32_t qnan = 0x7fc00000;
764    float junk = 0.0;
765    int fpClass = std::fpclassify(op);
766    if (fpClass == FP_NAN) {
767        if ((fpToBits(op) & qnan) != qnan)
768            fpscr.ioc = 1;
769        return bitsToFp(qnan, junk);
770    } else if (fpClass == FP_ZERO) {
771        fpscr.dzc = 1;
772        // Return infinity with the same sign as the operand.
773        return bitsToFp((std::signbit(op) << 31) |
774                       (0xFF << 23) | (0 << 0), junk);
775    } else if (std::signbit(op)) {
776        // Set invalid op bit.
777        fpscr.ioc = 1;
778        return bitsToFp(qnan, junk);
779    } else if (fpClass == FP_INFINITE) {
780        return 0.0;
781    } else {
782        uint64_t opBits = fpToBits(op);
783        double scaled;
784        if (bits(opBits, 23)) {
785            scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
786                              (ULL(0x3fd) << 52) | (bits(opBits, 31) << 63),
787                              (double)0.0);
788        } else {
789            scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
790                              (ULL(0x3fe) << 52) | (bits(opBits, 31) << 63),
791                              (double)0.0);
792        }
793        uint64_t resultExp = (380 - bits(opBits, 30, 23)) / 2;
794
795        uint64_t estimate = fpToBits(recipSqrtEstimate(scaled));
796
797        return bitsToFp((bits(estimate, 63) << 31) |
798                        (bits(resultExp, 7, 0) << 23) |
799                        (bits(estimate, 51, 29) << 0), junk);
800    }
801}
802
803uint32_t
804unsignedRSqrtEstimate(uint32_t op)
805{
806    if (bits(op, 31, 30) == 0) {
807        return -1;
808    } else {
809        double dpOp;
810        if (bits(op, 31)) {
811            dpOp = bitsToFp((ULL(0) << 63) |
812                            (ULL(0x3fe) << 52) |
813                            (bits((uint64_t)op, 30, 0) << 21) |
814                            (0 << 0), (double)0.0);
815        } else {
816            dpOp = bitsToFp((ULL(0) << 63) |
817                            (ULL(0x3fd) << 52) |
818                            (bits((uint64_t)op, 29, 0) << 22) |
819                            (0 << 0), (double)0.0);
820        }
821        uint64_t estimate = fpToBits(recipSqrtEstimate(dpOp));
822        return (1 << 31) | bits(estimate, 51, 21);
823    }
824}
825
// Reciprocal estimate, following a magic formula taken from the
// architecture reference manual, where it is called recip_estimate. The
// input is quantised to steps of 1/512 and the result is rounded to a
// multiple of 1/256.
static double
recipEstimate(double a)
{
    // Truncate to the quantisation step, then invert its midpoint.
    const int64_t quantised = (int64_t)(a * 512.0);
    const double r = 1.0 / (((double)quantised + 0.5) / 512.0);
    const int64_t rounded = (int64_t)(256.0 * r + 0.5);
    return (double)rounded / 256.0;
}
839
840// This function is only intended for use in Neon instructions because
841// it ignores certain bits in the FPSCR.
842float
843fpRecipEstimate(FPSCR &fpscr, float op)
844{
845    const uint32_t qnan = 0x7fc00000;
846    float junk = 0.0;
847    int fpClass = std::fpclassify(op);
848    if (fpClass == FP_NAN) {
849        if ((fpToBits(op) & qnan) != qnan)
850            fpscr.ioc = 1;
851        return bitsToFp(qnan, junk);
852    } else if (fpClass == FP_INFINITE) {
853        return bitsToFp(std::signbit(op) << 31, junk);
854    } else if (fpClass == FP_ZERO) {
855        fpscr.dzc = 1;
856        // Return infinity with the same sign as the operand.
857        return bitsToFp((std::signbit(op) << 31) |
858                       (0xFF << 23) | (0 << 0), junk);
859    } else if (fabs(op) >= pow(2.0, 126)) {
860        fpscr.ufc = 1;
861        return bitsToFp(std::signbit(op) << 31, junk);
862    } else {
863        uint64_t opBits = fpToBits(op);
864        double scaled;
865        scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
866                          (ULL(0x3fe) << 52) | (ULL(0) << 63),
867                          (double)0.0);
868        uint64_t resultExp = 253 - bits(opBits, 30, 23);
869
870        uint64_t estimate = fpToBits(recipEstimate(scaled));
871
872        return bitsToFp((bits(opBits, 31) << 31) |
873                        (bits(resultExp, 7, 0) << 23) |
874                        (bits(estimate, 51, 29) << 0), junk);
875    }
876}
877
878uint32_t
879unsignedRecipEstimate(uint32_t op)
880{
881    if (bits(op, 31) == 0) {
882        return -1;
883    } else {
884        double dpOp;
885        dpOp = bitsToFp((ULL(0) << 63) |
886                        (ULL(0x3fe) << 52) |
887                        (bits((uint64_t)op, 30, 0) << 21) |
888                        (0 << 0), (double)0.0);
889        uint64_t estimate = fpToBits(recipEstimate(dpOp));
890        return (1 << 31) | bits(estimate, 51, 21);
891    }
892}
893
894template <class fpType>
895fpType
896FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
897                  fpType op1, fpType op2) const
898{
899    done = true;
900    fpType junk = 0.0;
901    fpType dest = 0.0;
902    const bool single = (sizeof(fpType) == sizeof(float));
903    const uint64_t qnan =
904        single ? 0x7fc00000 : ULL(0x7ff8000000000000);
905    const bool nan1 = std::isnan(op1);
906    const bool nan2 = std::isnan(op2);
907    const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
908    const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
909    if (nan1 || nan2) {
910        if (defaultNan) {
911            dest = bitsToFp(qnan, junk);
912        }  else if (signal1) {
913            dest = bitsToFp(fpToBits(op1) | qnan, junk);
914        } else if (signal2) {
915            dest = bitsToFp(fpToBits(op2) | qnan, junk);
916        } else if (nan1) {
917            dest = op1;
918        } else if (nan2) {
919            dest = op2;
920        }
921        if (signal1 || signal2) {
922            fpscr.ioc = 1;
923        }
924    } else {
925        done = false;
926    }
927    return dest;
928}
929
930template
931float FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
932                        float op1, float op2) const;
933template
934double FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
935                         double op1, double op2) const;
936
937// @TODO remove this function when we've finished switching all FMA code to use the new FPLIB
938template <class fpType>
939fpType
940FpOp::ternaryOp(FPSCR &fpscr, fpType op1, fpType op2, fpType op3,
941                fpType (*func)(fpType, fpType, fpType),
942                bool flush, bool defaultNan, uint32_t rMode) const
943{
944    const bool single = (sizeof(fpType) == sizeof(float));
945    fpType junk = 0.0;
946
947    if (flush && (flushToZero(op1, op2) || flushToZero(op3)))
948        fpscr.idc = 1;
949    VfpSavedState state = prepFpState(rMode);
950    __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3), "=m" (state)
951                             :  "m" (op1),  "m" (op2),  "m" (op3),  "m" (state));
952    fpType dest = func(op1, op2, op3);
953    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));
954
955    int fpClass = std::fpclassify(dest);
956    // Get NAN behavior right. This varies between x86 and ARM.
957    if (fpClass == FP_NAN) {
958        const uint64_t qnan =
959            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
960        const bool nan1 = std::isnan(op1);
961        const bool nan2 = std::isnan(op2);
962        const bool nan3 = std::isnan(op3);
963        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
964        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
965        const bool signal3 = nan3 && ((fpToBits(op3) & qnan) != qnan);
966        if ((!nan1 && !nan2 && !nan3) || (defaultNan == 1)) {
967            dest = bitsToFp(qnan, junk);
968        } else if (signal1) {
969            dest = bitsToFp(fpToBits(op1) | qnan, junk);
970        } else if (signal2) {
971            dest = bitsToFp(fpToBits(op2) | qnan, junk);
972        } else if (signal3) {
973            dest = bitsToFp(fpToBits(op3) | qnan, junk);
974        } else if (nan1) {
975            dest = op1;
976        } else if (nan2) {
977            dest = op2;
978        } else if (nan3) {
979            dest = op3;
980        }
981    } else if (flush && flushToZero(dest)) {
982        feraiseexcept(FeUnderflow);
983    } else if ((
984                (single && (dest == bitsToFp(0x00800000, junk) ||
985                     dest == bitsToFp(0x80800000, junk))) ||
986                (!single &&
987                    (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
988                     dest == bitsToFp(ULL(0x8010000000000000), junk)))
989               ) && rMode != VfpRoundZero) {
990        /*
991         * Correct for the fact that underflow is detected -before- rounding
992         * in ARM and -after- rounding in x86.
993         */
994        fesetround(FeRoundZero);
995        __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3)
996                                 :  "m" (op1),  "m" (op2),  "m" (op3));
997        fpType temp = func(op1, op2, op2);
998        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
999        if (flush && flushToZero(temp)) {
1000            dest = temp;
1001        }
1002    }
1003    finishVfp(fpscr, state, flush);
1004    return dest;
1005}
1006
1007template
1008float FpOp::ternaryOp(FPSCR &fpscr, float op1, float op2, float op3,
1009                      float (*func)(float, float, float),
1010                      bool flush, bool defaultNan, uint32_t rMode) const;
1011template
1012double FpOp::ternaryOp(FPSCR &fpscr, double op1, double op2, double op3,
1013                       double (*func)(double, double, double),
1014                       bool flush, bool defaultNan, uint32_t rMode) const;
1015
1016template <class fpType>
1017fpType
1018FpOp::binaryOp(FPSCR &fpscr, fpType op1, fpType op2,
1019               fpType (*func)(fpType, fpType),
1020               bool flush, bool defaultNan, uint32_t rMode) const
1021{
1022    const bool single = (sizeof(fpType) == sizeof(float));
1023    fpType junk = 0.0;
1024
1025    if (flush && flushToZero(op1, op2))
1026        fpscr.idc = 1;
1027    VfpSavedState state = prepFpState(rMode);
1028    __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (state)
1029                             : "m" (op1), "m" (op2), "m" (state));
1030    fpType dest = func(op1, op2);
1031    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));
1032
1033    // Get NAN behavior right. This varies between x86 and ARM.
1034    if (std::isnan(dest)) {
1035        const uint64_t qnan =
1036            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
1037        const bool nan1 = std::isnan(op1);
1038        const bool nan2 = std::isnan(op2);
1039        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
1040        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
1041        if ((!nan1 && !nan2) || (defaultNan == 1)) {
1042            dest = bitsToFp(qnan, junk);
1043        } else if (signal1) {
1044            dest = bitsToFp(fpToBits(op1) | qnan, junk);
1045        } else if (signal2) {
1046            dest = bitsToFp(fpToBits(op2) | qnan, junk);
1047        } else if (nan1) {
1048            dest = op1;
1049        } else if (nan2) {
1050            dest = op2;
1051        }
1052    } else if (flush && flushToZero(dest)) {
1053        feraiseexcept(FeUnderflow);
1054    } else if ((
1055                (single && (dest == bitsToFp(0x00800000, junk) ||
1056                     dest == bitsToFp(0x80800000, junk))) ||
1057                (!single &&
1058                    (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
1059                     dest == bitsToFp(ULL(0x8010000000000000), junk)))
1060               ) && rMode != VfpRoundZero) {
1061        /*
1062         * Correct for the fact that underflow is detected -before- rounding
1063         * in ARM and -after- rounding in x86.
1064         */
1065        fesetround(FeRoundZero);
1066        __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2)
1067                                 : "m" (op1), "m" (op2));
1068        fpType temp = func(op1, op2);
1069        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
1070        if (flush && flushToZero(temp)) {
1071            dest = temp;
1072        }
1073    }
1074    finishVfp(fpscr, state, flush);
1075    return dest;
1076}
1077
1078template
1079float FpOp::binaryOp(FPSCR &fpscr, float op1, float op2,
1080                     float (*func)(float, float),
1081                     bool flush, bool defaultNan, uint32_t rMode) const;
1082template
1083double FpOp::binaryOp(FPSCR &fpscr, double op1, double op2,
1084                      double (*func)(double, double),
1085                      bool flush, bool defaultNan, uint32_t rMode) const;
1086
1087template <class fpType>
1088fpType
1089FpOp::unaryOp(FPSCR &fpscr, fpType op1, fpType (*func)(fpType),
1090              bool flush, uint32_t rMode) const
1091{
1092    const bool single = (sizeof(fpType) == sizeof(float));
1093    fpType junk = 0.0;
1094
1095    if (flush && flushToZero(op1))
1096        fpscr.idc = 1;
1097    VfpSavedState state = prepFpState(rMode);
1098    __asm__ __volatile__ ("" : "=m" (op1), "=m" (state)
1099                             : "m" (op1), "m" (state));
1100    fpType dest = func(op1);
1101    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));
1102
1103    // Get NAN behavior right. This varies between x86 and ARM.
1104    if (std::isnan(dest)) {
1105        const uint64_t qnan =
1106            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
1107        const bool nan = std::isnan(op1);
1108        if (!nan || fpscr.dn == 1) {
1109            dest = bitsToFp(qnan, junk);
1110        } else if (nan) {
1111            dest = bitsToFp(fpToBits(op1) | qnan, junk);
1112        }
1113    } else if (flush && flushToZero(dest)) {
1114        feraiseexcept(FeUnderflow);
1115    } else if ((
1116                (single && (dest == bitsToFp(0x00800000, junk) ||
1117                     dest == bitsToFp(0x80800000, junk))) ||
1118                (!single &&
1119                    (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
1120                     dest == bitsToFp(ULL(0x8010000000000000), junk)))
1121               ) && rMode != VfpRoundZero) {
1122        /*
1123         * Correct for the fact that underflow is detected -before- rounding
1124         * in ARM and -after- rounding in x86.
1125         */
1126        fesetround(FeRoundZero);
1127        __asm__ __volatile__ ("" : "=m" (op1) : "m" (op1));
1128        fpType temp = func(op1);
1129        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
1130        if (flush && flushToZero(temp)) {
1131            dest = temp;
1132        }
1133    }
1134    finishVfp(fpscr, state, flush);
1135    return dest;
1136}
1137
1138template
1139float FpOp::unaryOp(FPSCR &fpscr, float op1, float (*func)(float),
1140                    bool flush, uint32_t rMode) const;
1141template
1142double FpOp::unaryOp(FPSCR &fpscr, double op1, double (*func)(double),
1143                     bool flush, uint32_t rMode) const;
1144
1145IntRegIndex
1146VfpMacroOp::addStride(IntRegIndex idx, unsigned stride)
1147{
1148    if (wide) {
1149        stride *= 2;
1150    }
1151    unsigned offset = idx % 8;
1152    idx = (IntRegIndex)(idx - offset);
1153    offset += stride;
1154    idx = (IntRegIndex)(idx + (offset % 8));
1155    return idx;
1156}
1157
1158void
1159VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1, IntRegIndex &op2)
1160{
1161    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
1162    assert(!inScalarBank(dest));
1163    dest = addStride(dest, stride);
1164    op1 = addStride(op1, stride);
1165    if (!inScalarBank(op2)) {
1166        op2 = addStride(op2, stride);
1167    }
1168}
1169
1170void
1171VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1)
1172{
1173    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
1174    assert(!inScalarBank(dest));
1175    dest = addStride(dest, stride);
1176    if (!inScalarBank(op1)) {
1177        op1 = addStride(op1, stride);
1178    }
1179}
1180
1181void
1182VfpMacroOp::nextIdxs(IntRegIndex &dest)
1183{
1184    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
1185    assert(!inScalarBank(dest));
1186    dest = addStride(dest, stride);
1187}
1188
1189}
1190