vfp.cc revision 10037:5cac77888310
/*
 * Copyright (c) 2010-2013 ARM Limited
 * All rights reserved
 *
 * The license below extends only to copyright in the software and shall
 * not be construed as granting a license to any other intellectual
 * property including but not limited to intellectual property relating
 * to a hardware implementation of the functionality of the software
 * licensed hereunder.  You may use the software subject to the license
 * terms below provided that you ensure that this notice is replicated
 * unmodified and in its entirety in all distributions of the software,
 * modified or unmodified, in source code or in binary form.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Gabe Black
 */

#include "arch/arm/insts/vfp.hh"

/*
 * The asm statements below keep gcc from reordering code. Otherwise the
 * rounding mode might be set after the operation it was intended for, the
 * exception bits might be read before it, etc.
 */

std::string
FpCondCompRegOp::generateDisassembly(
        Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss, "", false);
    printReg(ss, op1);
    ccprintf(ss, ", ");
    printReg(ss, op2);
    ccprintf(ss, ", #%d", defCc);
    ccprintf(ss, ", ");
    printCondition(ss, condCode, true);
    return ss.str();
}

std::string
FpCondSelOp::generateDisassembly(
        Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss, "", false);
    printReg(ss, dest);
    ccprintf(ss, ", ");
    printReg(ss, op1);
    ccprintf(ss, ", ");
    printReg(ss, op2);
    ccprintf(ss, ", ");
    printCondition(ss, condCode, true);
    return ss.str();
}

std::string
FpRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printReg(ss, dest + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op1 + FP_Reg_Base);
    return ss.str();
}

std::string
FpRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printReg(ss, dest + FP_Reg_Base);
    ccprintf(ss, ", #%d", imm);
    return ss.str();
}

std::string
FpRegRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printReg(ss, dest + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op1 + FP_Reg_Base);
    ccprintf(ss, ", #%d", imm);
    return ss.str();
}

std::string
FpRegRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printReg(ss, dest + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op1 + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op2 + FP_Reg_Base);
    return ss.str();
}

std::string
FpRegRegRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printReg(ss, dest + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op1 + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op2 + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op3 + FP_Reg_Base);
    return ss.str();
}

std::string
FpRegRegRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printReg(ss, dest + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op1 + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op2 + FP_Reg_Base);
    ccprintf(ss, ", #%d", imm);
    return ss.str();
}

namespace ArmISA
{

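// Capture the host's current rounding mode, clear any pending host FP
// exception flags, and install the rounding mode the emulated instruction
// asked for. The saved host rounding mode is returned so finishVfp() can
// restore it afterwards.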
VfpSavedState
prepFpState(uint32_t rMode)
{
    int roundingMode = fegetround();
    feclearexcept(FeAllExceptions);
    switch (rMode) {
      case VfpRoundNearest:
        fesetround(FeRoundNearest);
        break;
      case VfpRoundUpward:
        fesetround(FeRoundUpward);
        break;
      case VfpRoundDown:
        fesetround(FeRoundDown);
        break;
      case VfpRoundZero:
        fesetround(FeRoundZero);
        break;
    }
    return roundingMode;
}

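// Copy the host FP exception flags raised by the emulated operation into
// the FPSCR cumulative exception bits (subject to the mask), then restore
// the host rounding mode saved by prepFpState(). An inexact result that
// comes from flushing an underflowed result to zero does not set IXC.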
void
finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush, FPSCR mask)
{
    int exceptions = fetestexcept(FeAllExceptions);
    bool underflow = false;
    if ((exceptions & FeInvalid) && mask.ioc) {
        fpscr.ioc = 1;
    }
    if ((exceptions & FeDivByZero) && mask.dzc) {
        fpscr.dzc = 1;
    }
    if ((exceptions & FeOverflow) && mask.ofc) {
        fpscr.ofc = 1;
    }
    if (exceptions & FeUnderflow) {
        underflow = true;
        if (mask.ufc)
            fpscr.ufc = 1;
    }
    if ((exceptions & FeInexact) && !(underflow && flush) && mask.ixc) {
        fpscr.ixc = 1;
    }
    fesetround(state);
}

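// Fix up a result the host FPU computed so it matches ARM semantics. If
// the result is a NaN, replace it with either the default QNaN or a
// quieted copy of the input NaN, since x86 and ARM disagree on which NaN
// propagates. If flush-to-zero is enabled and the result is a denormal,
// flush it to a signed zero and report underflow instead of inexact.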
template <class fpType>
fpType
fixDest(bool flush, bool defaultNan, fpType val, fpType op1)
{
    int fpClass = std::fpclassify(val);
    fpType junk = 0.0;
    if (fpClass == FP_NAN) {
        const bool single = (sizeof(val) == sizeof(float));
        const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan = std::isnan(op1);
        if (!nan || defaultNan) {
            val = bitsToFp(qnan, junk);
        } else if (nan) {
            val = bitsToFp(fpToBits(op1) | qnan, junk);
        }
    } else if (fpClass == FP_SUBNORMAL && flush) {
        // Turn val into a zero with the correct sign.
        uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1);
        val = bitsToFp(fpToBits(val) & bitMask, junk);
        feclearexcept(FeInexact);
        feraiseexcept(FeUnderflow);
    }
    return val;
}

template
float fixDest<float>(bool flush, bool defaultNan, float val, float op1);
template
double fixDest<double>(bool flush, bool defaultNan, double val, double op1);

template <class fpType>
fpType
fixDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2)
{
    int fpClass = std::fpclassify(val);
    fpType junk = 0.0;
    if (fpClass == FP_NAN) {
        const bool single = (sizeof(val) == sizeof(float));
        const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan1 = std::isnan(op1);
        const bool nan2 = std::isnan(op2);
        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
        if ((!nan1 && !nan2) || defaultNan) {
            val = bitsToFp(qnan, junk);
        } else if (signal1) {
            val = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            val = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (nan1) {
            val = op1;
        } else if (nan2) {
            val = op2;
        }
    } else if (fpClass == FP_SUBNORMAL && flush) {
        // Turn val into a zero with the correct sign.
        uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1);
        val = bitsToFp(fpToBits(val) & bitMask, junk);
        feclearexcept(FeInexact);
        feraiseexcept(FeUnderflow);
    }
    return val;
}

template
float fixDest<float>(bool flush, bool defaultNan,
                     float val, float op1, float op2);
template
double fixDest<double>(bool flush, bool defaultNan,
                       double val, double op1, double op2);

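// Like fixDest(), but also handles a quotient that landed exactly on the
// minimum normal value. ARM detects underflow before rounding while the
// host detects it after, so the division is redone in round-to-zero mode
// to see whether it really underflowed.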
template <class fpType>
fpType
fixDivDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2)
{
    fpType mid = fixDest(flush, defaultNan, val, op1, op2);
    const bool single = (sizeof(fpType) == sizeof(float));
    const fpType junk = 0.0;
    if ((single && (val == bitsToFp(0x00800000, junk) ||
                    val == bitsToFp(0x80800000, junk))) ||
        (!single && (val == bitsToFp(ULL(0x0010000000000000), junk) ||
                     val == bitsToFp(ULL(0x8010000000000000), junk)))
        ) {
        __asm__ __volatile__("" : "=m" (op1) : "m" (op1));
        fesetround(FeRoundZero);
        fpType temp = 0.0;
        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
        temp = op1 / op2;
        if (flushToZero(temp)) {
            feraiseexcept(FeUnderflow);
            if (flush) {
                feclearexcept(FeInexact);
                mid = temp;
            }
        }
        __asm__ __volatile__("" :: "m" (temp));
    }
    return mid;
}

template
float fixDivDest<float>(bool flush, bool defaultNan,
                        float val, float op1, float op2);
template
double fixDivDest<double>(bool flush, bool defaultNan,
                          double val, double op1, double op2);

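// Convert a double to single precision with ARM semantics. If the input
// is a NaN, an equivalent single precision NaN carrying the top payload
// bits is built first so fixDest() can propagate it correctly.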
float
fixFpDFpSDest(FPSCR fpscr, double val)
{
    const float junk = 0.0;
    float op1 = 0.0;
    if (std::isnan(val)) {
        uint64_t valBits = fpToBits(val);
        uint32_t op1Bits = bits(valBits, 50, 29) |
                           (mask(9) << 22) |
                           (bits(valBits, 63) << 31);
        op1 = bitsToFp(op1Bits, junk);
    }
    float mid = fixDest(fpscr.fz, fpscr.dn, (float)val, op1);
    if (fpscr.fz && fetestexcept(FeUnderflow | FeInexact) ==
                    (FeUnderflow | FeInexact)) {
        feclearexcept(FeInexact);
    }
    if (mid == bitsToFp(0x00800000, junk) ||
        mid == bitsToFp(0x80800000, junk)) {
        __asm__ __volatile__("" : "=m" (val) : "m" (val));
        fesetround(FeRoundZero);
        float temp = 0.0;
        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
        temp = val;
        if (flushToZero(temp)) {
            feraiseexcept(FeUnderflow);
            if (fpscr.fz) {
                feclearexcept(FeInexact);
                mid = temp;
            }
        }
        __asm__ __volatile__("" :: "m" (temp));
    }
    return mid;
}

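// Convert a single to double precision, the mirror image of
// fixFpDFpSDest() above.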
double
fixFpSFpDDest(FPSCR fpscr, float val)
{
    const double junk = 0.0;
    double op1 = 0.0;
    if (std::isnan(val)) {
        uint32_t valBits = fpToBits(val);
        uint64_t op1Bits = ((uint64_t)bits(valBits, 21, 0) << 29) |
                           (mask(12) << 51) |
                           ((uint64_t)bits(valBits, 31) << 63);
        op1 = bitsToFp(op1Bits, junk);
    }
    double mid = fixDest(fpscr.fz, fpscr.dn, (double)val, op1);
    if (mid == bitsToFp(ULL(0x0010000000000000), junk) ||
        mid == bitsToFp(ULL(0x8010000000000000), junk)) {
        __asm__ __volatile__("" : "=m" (val) : "m" (val));
        fesetround(FeRoundZero);
        double temp = 0.0;
        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
        temp = val;
        if (flushToZero(temp)) {
            feraiseexcept(FeUnderflow);
            if (fpscr.fz) {
                feclearexcept(FeInexact);
                mid = temp;
            }
        }
        __asm__ __volatile__("" :: "m" (temp));
    }
    return mid;
}

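// Convert a single or double precision value (passed as raw bits in
// opBits) to half precision, implementing rounding, denormalization,
// overflow, and NaN handling by hand on the bit patterns. ahp selects the
// ARM alternative half-precision format, which has no infinities or NaNs.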
static inline uint16_t
vcvtFpFpH(FPSCR &fpscr, bool flush, bool defaultNan,
          uint32_t rMode, bool ahp, uint64_t opBits, bool isDouble)
{
    uint32_t mWidth;
    uint32_t eWidth;
    uint32_t eHalfRange;
    uint32_t sBitPos;

    if (isDouble) {
        mWidth = 52;
        eWidth = 11;
    } else {
        mWidth = 23;
        eWidth = 8;
    }
    sBitPos    = eWidth + mWidth;
    eHalfRange = (1 << (eWidth-1)) - 1;

    // Extract the operand.
    bool neg = bits(opBits, sBitPos);
    uint32_t exponent = bits(opBits, sBitPos-1, mWidth);
    uint64_t oldMantissa = bits(opBits, mWidth-1, 0);
    uint32_t mantissa = oldMantissa >> (mWidth - 10);
    // Do the conversion.
    uint64_t extra = oldMantissa & mask(mWidth - 10);
    if (exponent == mask(eWidth)) {
        if (oldMantissa != 0) {
            // NaNs.
            if (bits(mantissa, 9) == 0) {
                // Signalling NaN.
                fpscr.ioc = 1;
            }
            if (ahp) {
                mantissa = 0;
                exponent = 0;
                fpscr.ioc = 1;
            } else if (defaultNan) {
                mantissa = (1 << 9);
                exponent = 0x1f;
                neg = false;
            } else {
                exponent = 0x1f;
                mantissa |= (1 << 9);
            }
        } else {
            // Infinities.
            exponent = 0x1F;
            if (ahp) {
                fpscr.ioc = 1;
                mantissa = 0x3ff;
            } else {
                mantissa = 0;
            }
        }
    } else if (exponent == 0 && oldMantissa == 0) {
        // Zero, don't need to do anything.
    } else {
        // Normalized or denormalized numbers.

        bool inexact = (extra != 0);

        if (exponent == 0) {
            // Denormalized.
            // If flush to zero is on, this shouldn't happen.
            assert(!flush);

            // Check for underflow
            if (inexact || fpscr.ufe)
                fpscr.ufc = 1;

            // Handle rounding.
            unsigned mode = rMode;
            if ((mode == VfpRoundUpward && !neg && extra) ||
                (mode == VfpRoundDown && neg && extra) ||
                (mode == VfpRoundNearest &&
                 (extra > (1 << 9) ||
                  (extra == (1 << 9) && bits(mantissa, 0))))) {
                mantissa++;
            }

            // See if the number became normalized after rounding.
            if (mantissa == (1 << 10)) {
                mantissa = 0;
                exponent = 1;
            }
        } else {
            // Normalized.

            // We need to track the dropped bits differently since
            // more can be dropped by denormalizing.
            bool topOne = bits(extra, mWidth - 10 - 1);
            bool restZeros = bits(extra, mWidth - 10 - 2, 0) == 0;

            if (exponent <= (eHalfRange - 15)) {
                // The result is too small. Denormalize.
                mantissa |= (1 << 10);
                while (mantissa && exponent <= (eHalfRange - 15)) {
                    restZeros = restZeros && !topOne;
                    topOne = bits(mantissa, 0);
                    mantissa = mantissa >> 1;
                    exponent++;
                }
                if (topOne || !restZeros)
                    inexact = true;
                exponent = 0;
            } else {
                // Change bias.
                exponent -= (eHalfRange - 15);
            }

            if (exponent == 0 && (inexact || fpscr.ufe)) {
                // Underflow
                fpscr.ufc = 1;
            }

            // Handle rounding.
            unsigned mode = rMode;
            bool nonZero = topOne || !restZeros;
            if ((mode == VfpRoundUpward && !neg && nonZero) ||
                (mode == VfpRoundDown && neg && nonZero) ||
                (mode == VfpRoundNearest && topOne &&
                 (!restZeros || bits(mantissa, 0)))) {
                mantissa++;
            }

            // See if we rounded up and need to bump the exponent.
            if (mantissa == (1 << 10)) {
                mantissa = 0;
                exponent++;
            }

            // Deal with overflow
            if (ahp) {
                if (exponent >= 0x20) {
                    exponent = 0x1f;
                    mantissa = 0x3ff;
                    fpscr.ioc = 1;
                    // Suppress the inexact exception.
                    inexact = false;
                }
            } else {
                if (exponent >= 0x1f) {
                    if ((mode == VfpRoundNearest) ||
                        (mode == VfpRoundUpward && !neg) ||
                        (mode == VfpRoundDown && neg)) {
                        // Overflow to infinity.
                        exponent = 0x1f;
                        mantissa = 0;
                    } else {
                        // Overflow to max normal.
                        exponent = 0x1e;
                        mantissa = 0x3ff;
                    }
                    fpscr.ofc = 1;
                    inexact = true;
                }
            }
        }

        if (inexact) {
            fpscr.ixc = 1;
        }
    }
    // Reassemble and return the result.
    uint32_t result = bits(mantissa, 9, 0);
    replaceBits(result, 14, 10, exponent);
    if (neg)
        result |= (1 << 15);
    return result;
}

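// Single and double precision entry points for the conversion above.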
uint16_t
vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan,
           uint32_t rMode, bool ahp, float op)
{
    uint64_t opBits = fpToBits(op);
    return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp, opBits, false);
}

uint16_t
vcvtFpDFpH(FPSCR &fpscr, bool flush, bool defaultNan,
           uint32_t rMode, bool ahp, double op)
{
    uint64_t opBits = fpToBits(op);
    return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp, opBits, true);
}

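// Convert a half precision value to single or double precision. Every
// half precision value is exactly representable in the wider formats, so
// no rounding is needed: denormals are normalized, NaNs are quieted or
// replaced with the default NaN, and the exponent is rebiased.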
static inline uint64_t
vcvtFpHFp(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op, bool isDouble)
{
    uint32_t mWidth;
    uint32_t eWidth;
    uint32_t eHalfRange;
    uint32_t sBitPos;

    if (isDouble) {
        mWidth = 52;
        eWidth = 11;
    } else {
        mWidth = 23;
        eWidth = 8;
    }
    sBitPos    = eWidth + mWidth;
    eHalfRange = (1 << (eWidth-1)) - 1;

    // Extract the bitfields.
    bool neg = bits(op, 15);
    uint32_t exponent = bits(op, 14, 10);
    uint64_t mantissa = bits(op, 9, 0);
    // Do the conversion.
    if (exponent == 0) {
        if (mantissa != 0) {
            // Normalize the value.
            exponent = exponent + (eHalfRange - 15) + 1;
            while (mantissa < (1 << 10)) {
                mantissa = mantissa << 1;
                exponent--;
            }
        }
        mantissa = mantissa << (mWidth - 10);
    } else if (exponent == 0x1f && !ahp) {
        // Infinities and NaNs.
        exponent = mask(eWidth);
        if (mantissa != 0) {
            // NaNs.
            mantissa = mantissa << (mWidth - 10);
            if (bits(mantissa, mWidth-1) == 0) {
                // Signalling NaN.
                fpscr.ioc = 1;
                mantissa |= (((uint64_t) 1) << (mWidth-1));
            }
            if (defaultNan) {
                mantissa &= ~mask(mWidth-1);
                neg = false;
            }
        }
    } else {
        exponent = exponent + (eHalfRange - 15);
        mantissa = mantissa << (mWidth - 10);
    }
    // Reassemble the result.
    uint64_t result = bits(mantissa, mWidth-1, 0);
    replaceBits(result, sBitPos-1, mWidth, exponent);
    if (neg) {
        result |= (((uint64_t) 1) << sBitPos);
    }
    return result;
}

double
vcvtFpHFpD(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op)
{
    double junk = 0.0;
    uint64_t result;

    result = vcvtFpHFp(fpscr, defaultNan, ahp, op, true);
    return bitsToFp(result, junk);
}

float
vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op)
{
    float junk = 0.0;
    uint64_t result;

    result = vcvtFpHFp(fpscr, defaultNan, ahp, op, false);
    return bitsToFp(result, junk);
}

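// Convert fixed point values to floating point. The value is treated as
// an integer scaled by 2^imm, so the conversion is an integer-to-float
// conversion followed by a division by 2^imm, with fixDivDest() applying
// the ARM underflow semantics to the quotient.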
float
vfpUFixedToFpS(bool flush, bool defaultNan,
        uint64_t val, uint8_t width, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (width == 16)
        val = (uint16_t)val;
    else if (width == 32)
        val = (uint32_t)val;
    else if (width != 64)
        panic("Unsupported width %d", width);
    float scale = powf(2.0, imm);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (float)val, scale);
}

float
vfpSFixedToFpS(bool flush, bool defaultNan,
        int64_t val, uint8_t width, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (width == 16)
        val = sext<16>(val & mask(16));
    else if (width == 32)
        val = sext<32>(val & mask(32));
    else if (width != 64)
        panic("Unsupported width %d", width);

    float scale = powf(2.0, imm);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (float)val, scale);
}


double
vfpUFixedToFpD(bool flush, bool defaultNan,
        uint64_t val, uint8_t width, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (width == 16)
        val = (uint16_t)val;
    else if (width == 32)
        val = (uint32_t)val;
    else if (width != 64)
        panic("Unsupported width %d", width);

    double scale = pow(2.0, imm);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (double)val, scale);
}

double
vfpSFixedToFpD(bool flush, bool defaultNan,
        int64_t val, uint8_t width, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (width == 16)
        val = sext<16>(val & mask(16));
    else if (width == 32)
        val = sext<32>(val & mask(32));
    else if (width != 64)
        panic("Unsupported width %d", width);

    double scale = pow(2.0, imm);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (double)val, scale);
}

// This function implements a magic formula taken from the architecture
// reference manual. It was originally called recip_sqrt_estimate.
static double
recipSqrtEstimate(double a)
{
    int64_t q0, q1, s;
    double r;
    if (a < 0.5) {
        q0 = (int64_t)(a * 512.0);
        r = 1.0 / sqrt(((double)q0 + 0.5) / 512.0);
    } else {
        q1 = (int64_t)(a * 256.0);
        r = 1.0 / sqrt(((double)q1 + 0.5) / 256.0);
    }
    s = (int64_t)(256.0 * r + 0.5);
    return (double)s / 256.0;
}

// This function is only intended for use in Neon instructions because
// it ignores certain bits in the FPSCR.
float
fprSqrtEstimate(FPSCR &fpscr, float op)
{
    const uint32_t qnan = 0x7fc00000;
    float junk = 0.0;
    int fpClass = std::fpclassify(op);
    if (fpClass == FP_NAN) {
        if ((fpToBits(op) & qnan) != qnan)
            fpscr.ioc = 1;
        return bitsToFp(qnan, junk);
    } else if (fpClass == FP_ZERO) {
        fpscr.dzc = 1;
        // Return infinity with the same sign as the operand.
        return bitsToFp((std::signbit(op) << 31) |
                       (0xFF << 23) | (0 << 0), junk);
    } else if (std::signbit(op)) {
        // Set invalid op bit.
        fpscr.ioc = 1;
        return bitsToFp(qnan, junk);
    } else if (fpClass == FP_INFINITE) {
        return 0.0;
    } else {
        uint64_t opBits = fpToBits(op);
        double scaled;
        if (bits(opBits, 23)) {
            scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
                              (ULL(0x3fd) << 52) | (bits(opBits, 31) << 63),
                              (double)0.0);
        } else {
            scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
                              (ULL(0x3fe) << 52) | (bits(opBits, 31) << 63),
                              (double)0.0);
        }
        uint64_t resultExp = (380 - bits(opBits, 30, 23)) / 2;

        uint64_t estimate = fpToBits(recipSqrtEstimate(scaled));

        return bitsToFp((bits(estimate, 63) << 31) |
                        (bits(resultExp, 7, 0) << 23) |
                        (bits(estimate, 51, 29) << 0), junk);
    }
}

uint32_t
unsignedRSqrtEstimate(uint32_t op)
{
    if (bits(op, 31, 30) == 0) {
        return -1;
    } else {
        double dpOp;
        if (bits(op, 31)) {
            dpOp = bitsToFp((ULL(0) << 63) |
                            (ULL(0x3fe) << 52) |
                            (bits((uint64_t)op, 30, 0) << 21) |
                            (0 << 0), (double)0.0);
        } else {
            dpOp = bitsToFp((ULL(0) << 63) |
                            (ULL(0x3fd) << 52) |
                            (bits((uint64_t)op, 29, 0) << 22) |
                            (0 << 0), (double)0.0);
        }
        uint64_t estimate = fpToBits(recipSqrtEstimate(dpOp));
        return (1 << 31) | bits(estimate, 51, 21);
    }
}

// This function implements a magic formula taken from the architecture
// reference manual. It was originally called recip_estimate.
static double
recipEstimate(double a)
{
    int64_t q, s;
    double r;
    q = (int64_t)(a * 512.0);
    r = 1.0 / (((double)q + 0.5) / 512.0);
    s = (int64_t)(256.0 * r + 0.5);
    return (double)s / 256.0;
}

// This function is only intended for use in Neon instructions because
// it ignores certain bits in the FPSCR.
float
fpRecipEstimate(FPSCR &fpscr, float op)
{
    const uint32_t qnan = 0x7fc00000;
    float junk = 0.0;
    int fpClass = std::fpclassify(op);
    if (fpClass == FP_NAN) {
        if ((fpToBits(op) & qnan) != qnan)
            fpscr.ioc = 1;
        return bitsToFp(qnan, junk);
    } else if (fpClass == FP_INFINITE) {
        return bitsToFp(std::signbit(op) << 31, junk);
    } else if (fpClass == FP_ZERO) {
        fpscr.dzc = 1;
        // Return infinity with the same sign as the operand.
        return bitsToFp((std::signbit(op) << 31) |
                       (0xFF << 23) | (0 << 0), junk);
    } else if (fabs(op) >= pow(2.0, 126)) {
        fpscr.ufc = 1;
        return bitsToFp(std::signbit(op) << 31, junk);
    } else {
        uint64_t opBits = fpToBits(op);
        double scaled;
        scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
                          (ULL(0x3fe) << 52) | (ULL(0) << 63),
                          (double)0.0);
        uint64_t resultExp = 253 - bits(opBits, 30, 23);

        uint64_t estimate = fpToBits(recipEstimate(scaled));

        return bitsToFp((bits(opBits, 31) << 31) |
                        (bits(resultExp, 7, 0) << 23) |
                        (bits(estimate, 51, 29) << 0), junk);
    }
}

uint32_t
unsignedRecipEstimate(uint32_t op)
{
    if (bits(op, 31) == 0) {
        return -1;
    } else {
        double dpOp;
        dpOp = bitsToFp((ULL(0) << 63) |
                        (ULL(0x3fe) << 52) |
                        (bits((uint64_t)op, 30, 0) << 21) |
                        (0 << 0), (double)0.0);
        uint64_t estimate = fpToBits(recipEstimate(dpOp));
        return (1 << 31) | bits(estimate, 51, 21);
    }
}

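// Handle the NaN cases of a two operand instruction. If either operand is
// a NaN the result is chosen according to the ARM propagation rules (a
// signalling NaN takes priority and is quieted; defaultNan overrides
// both) and done is set; otherwise done is left false and the caller
// performs the actual operation.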
template <class fpType>
fpType
FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
                  fpType op1, fpType op2) const
{
    done = true;
    fpType junk = 0.0;
    fpType dest = 0.0;
    const bool single = (sizeof(fpType) == sizeof(float));
    const uint64_t qnan =
        single ? 0x7fc00000 : ULL(0x7ff8000000000000);
    const bool nan1 = std::isnan(op1);
    const bool nan2 = std::isnan(op2);
    const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
    const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
    if (nan1 || nan2) {
        if (defaultNan) {
            dest = bitsToFp(qnan, junk);
        } else if (signal1) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            dest = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (nan1) {
            dest = op1;
        } else if (nan2) {
            dest = op2;
        }
        if (signal1 || signal2) {
            fpscr.ioc = 1;
        }
    } else {
        done = false;
    }
    return dest;
}

template
float FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
                        float op1, float op2) const;
template
double FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
                         double op1, double op2) const;

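// Perform a three operand floating point operation with ARM semantics on
// the host FPU: flush denormal inputs if requested, set up the rounding
// mode, run the operation, then patch up NaN propagation and
// before-rounding underflow detection, both of which differ between ARM
// and x86. binaryOp() and unaryOp() below follow the same pattern.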
// @TODO remove this function when we've finished switching all FMA code
// to use the new FPLIB
template <class fpType>
fpType
FpOp::ternaryOp(FPSCR &fpscr, fpType op1, fpType op2, fpType op3,
                fpType (*func)(fpType, fpType, fpType),
                bool flush, bool defaultNan, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    if (flush && (flushToZero(op1, op2) || flushToZero(op3)))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3), "=m" (state)
                             :  "m" (op1),  "m" (op2),  "m" (op3),  "m" (state));
    fpType dest = func(op1, op2, op3);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

    int fpClass = std::fpclassify(dest);
    // Get NaN behavior right. This varies between x86 and ARM.
    if (fpClass == FP_NAN) {
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan1 = std::isnan(op1);
        const bool nan2 = std::isnan(op2);
        const bool nan3 = std::isnan(op3);
        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
        const bool signal3 = nan3 && ((fpToBits(op3) & qnan) != qnan);
        if ((!nan1 && !nan2 && !nan3) || (defaultNan == 1)) {
            dest = bitsToFp(qnan, junk);
        } else if (signal1) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            dest = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (signal3) {
            dest = bitsToFp(fpToBits(op3) | qnan, junk);
        } else if (nan1) {
            dest = op1;
        } else if (nan2) {
            dest = op2;
        } else if (nan3) {
            dest = op3;
        }
    } else if (flush && flushToZero(dest)) {
        feraiseexcept(FeUnderflow);
    } else if ((
                (single && (dest == bitsToFp(0x00800000, junk) ||
                     dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                    (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                     dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before- rounding
         * in ARM and -after- rounding in x86.
         */
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3)
                                 :  "m" (op1),  "m" (op2),  "m" (op3));
        fpType temp = func(op1, op2, op3);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    finishVfp(fpscr, state, flush);
    return dest;
}

template
float FpOp::ternaryOp(FPSCR &fpscr, float op1, float op2, float op3,
                      float (*func)(float, float, float),
                      bool flush, bool defaultNan, uint32_t rMode) const;
template
double FpOp::ternaryOp(FPSCR &fpscr, double op1, double op2, double op3,
                       double (*func)(double, double, double),
                       bool flush, bool defaultNan, uint32_t rMode) const;

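// Two operand version of the pattern above.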
template <class fpType>
fpType
FpOp::binaryOp(FPSCR &fpscr, fpType op1, fpType op2,
               fpType (*func)(fpType, fpType),
               bool flush, bool defaultNan, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    if (flush && flushToZero(op1, op2))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (state)
                             : "m" (op1), "m" (op2), "m" (state));
    fpType dest = func(op1, op2);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

    // Get NaN behavior right. This varies between x86 and ARM.
    if (std::isnan(dest)) {
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan1 = std::isnan(op1);
        const bool nan2 = std::isnan(op2);
        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
        if ((!nan1 && !nan2) || (defaultNan == 1)) {
            dest = bitsToFp(qnan, junk);
        } else if (signal1) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            dest = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (nan1) {
            dest = op1;
        } else if (nan2) {
            dest = op2;
        }
    } else if (flush && flushToZero(dest)) {
        feraiseexcept(FeUnderflow);
    } else if ((
                (single && (dest == bitsToFp(0x00800000, junk) ||
                     dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                    (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                     dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before- rounding
         * in ARM and -after- rounding in x86.
         */
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2)
                                 : "m" (op1), "m" (op2));
        fpType temp = func(op1, op2);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    finishVfp(fpscr, state, flush);
    return dest;
}

template
float FpOp::binaryOp(FPSCR &fpscr, float op1, float op2,
                     float (*func)(float, float),
                     bool flush, bool defaultNan, uint32_t rMode) const;
template
double FpOp::binaryOp(FPSCR &fpscr, double op1, double op2,
                      double (*func)(double, double),
                      bool flush, bool defaultNan, uint32_t rMode) const;

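// One operand version of the pattern above. NaN handling follows FPSCR.DN
// directly since there is only a single input to propagate.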
template <class fpType>
fpType
FpOp::unaryOp(FPSCR &fpscr, fpType op1, fpType (*func)(fpType),
              bool flush, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    if (flush && flushToZero(op1))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (state)
                             : "m" (op1), "m" (state));
    fpType dest = func(op1);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

1089    if (std::isnan(dest)) {
1090        const uint64_t qnan =
1091            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
1092        const bool nan = std::isnan(op1);
1093        if (!nan || fpscr.dn == 1) {
1094            dest = bitsToFp(qnan, junk);
1095        } else if (nan) {
1096            dest = bitsToFp(fpToBits(op1) | qnan, junk);
1097        }
1098    } else if (flush && flushToZero(dest)) {
1099        feraiseexcept(FeUnderflow);
1100    } else if ((
1101                (single && (dest == bitsToFp(0x00800000, junk) ||
1102                     dest == bitsToFp(0x80800000, junk))) ||
1103                (!single &&
1104                    (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
1105                     dest == bitsToFp(ULL(0x8010000000000000), junk)))
1106               ) && rMode != VfpRoundZero) {
1107        /*
1108         * Correct for the fact that underflow is detected -before- rounding
1109         * in ARM and -after- rounding in x86.
1110         */
1111        fesetround(FeRoundZero);
1112        __asm__ __volatile__ ("" : "=m" (op1) : "m" (op1));
1113        fpType temp = func(op1);
1114        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
1115        if (flush && flushToZero(temp)) {
1116            dest = temp;
1117        }
1118    }
1119    finishVfp(fpscr, state, flush);
1120    return dest;
1121}
1122
1123template
1124float FpOp::unaryOp(FPSCR &fpscr, float op1, float (*func)(float),
1125                    bool flush, uint32_t rMode) const;
1126template
1127double FpOp::unaryOp(FPSCR &fpscr, double op1, double (*func)(double),
1128                     bool flush, uint32_t rMode) const;
1129
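// Advance a VFP register index by the given stride, wrapping within its
// bank of eight registers (wide, i.e. double precision, operations
// advance twice as fast). Only the offset within the bank wraps; the bank
// itself is preserved.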
IntRegIndex
VfpMacroOp::addStride(IntRegIndex idx, unsigned stride)
{
    if (wide) {
        stride *= 2;
    }
    unsigned offset = idx % 8;
    idx = (IntRegIndex)(idx - offset);
    offset += stride;
    idx = (IntRegIndex)(idx + (offset % 8));
    return idx;
}

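// Step the register indexes to the next iteration of a VFP vector (macro)
// operation. The stride comes from the FPSCR stride field carried in the
// instruction, and operands in the scalar bank stay fixed across
// iterations.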
void
VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1, IntRegIndex &op2)
{
    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
    assert(!inScalarBank(dest));
    dest = addStride(dest, stride);
    op1 = addStride(op1, stride);
    if (!inScalarBank(op2)) {
        op2 = addStride(op2, stride);
    }
}

void
VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1)
{
    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
    assert(!inScalarBank(dest));
    dest = addStride(dest, stride);
    if (!inScalarBank(op1)) {
        op1 = addStride(op1, stride);
    }
}

void
VfpMacroOp::nextIdxs(IntRegIndex &dest)
{
    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
    assert(!inScalarBank(dest));
    dest = addStride(dest, stride);
}

}