// vfp.cc (gem5, repository revision 7430)
1/*
2 * Copyright (c) 2010 ARM Limited
3 * All rights reserved
4 *
5 * The license below extends only to copyright in the software and shall
6 * not be construed as granting a license to any other intellectual
7 * property including but not limited to intellectual property relating
8 * to a hardware implementation of the functionality of the software
9 * licensed hereunder.  You may use the software subject to the license
10 * terms below provided that you ensure that this notice is replicated
11 * unmodified and in its entirety in all distributions of the software,
12 * modified or unmodified, in source code or in binary form.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions are
16 * met: redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer;
18 * redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution;
21 * neither the name of the copyright holders nor the names of its
22 * contributors may be used to endorse or promote products derived from
23 * this software without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
26 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
27 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
28 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
29 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
30 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
31 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
32 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
33 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
34 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
35 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 *
37 * Authors: Gabe Black
38 */
39
40#include "arch/arm/insts/vfp.hh"
41
42std::string
43FpRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
44{
45    std::stringstream ss;
46    printMnemonic(ss);
47    printReg(ss, dest + FP_Base_DepTag);
48    ss << ", ";
49    printReg(ss, op1 + FP_Base_DepTag);
50    return ss.str();
51}
52
53std::string
54FpRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
55{
56    std::stringstream ss;
57    printMnemonic(ss);
58    printReg(ss, dest + FP_Base_DepTag);
59    ccprintf(ss, ", #%d", imm);
60    return ss.str();
61}
62
63std::string
64FpRegRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
65{
66    std::stringstream ss;
67    printMnemonic(ss);
68    printReg(ss, dest + FP_Base_DepTag);
69    ss << ", ";
70    printReg(ss, op1 + FP_Base_DepTag);
71    ccprintf(ss, ", #%d", imm);
72    return ss.str();
73}
74
75std::string
76FpRegRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
77{
78    std::stringstream ss;
79    printMnemonic(ss);
80    printReg(ss, dest + FP_Base_DepTag);
81    ss << ", ";
82    printReg(ss, op1 + FP_Base_DepTag);
83    ss << ", ";
84    printReg(ss, op2 + FP_Base_DepTag);
85    return ss.str();
86}
87
88namespace ArmISA
89{
90
91VfpSavedState
92prepFpState(uint32_t rMode)
93{
94    int roundingMode = fegetround();
95    feclearexcept(FeAllExceptions);
96    switch (rMode) {
97      case VfpRoundNearest:
98        fesetround(FeRoundNearest);
99        break;
100      case VfpRoundUpward:
101        fesetround(FeRoundUpward);
102        break;
103      case VfpRoundDown:
104        fesetround(FeRoundDown);
105        break;
106      case VfpRoundZero:
107        fesetround(FeRoundZero);
108        break;
109    }
110    return roundingMode;
111}
112
113void
114finishVfp(FPSCR &fpscr, VfpSavedState state)
115{
116    int exceptions = fetestexcept(FeAllExceptions);
117    bool underflow = false;
118    if (exceptions & FeInvalid) {
119        fpscr.ioc = 1;
120    }
121    if (exceptions & FeDivByZero) {
122        fpscr.dzc = 1;
123    }
124    if (exceptions & FeOverflow) {
125        fpscr.ofc = 1;
126    }
127    if (exceptions & FeUnderflow) {
128        underflow = true;
129        fpscr.ufc = 1;
130    }
131    if ((exceptions & FeInexact) && !(underflow && fpscr.fz)) {
132        fpscr.ixc = 1;
133    }
134    fesetround(state);
135}
136
137template <class fpType>
138fpType
139fixDest(FPSCR fpscr, fpType val, fpType op1)
140{
141    int fpClass = std::fpclassify(val);
142    fpType junk = 0.0;
143    if (fpClass == FP_NAN) {
144        const bool single = (sizeof(val) == sizeof(float));
145        const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
146        const bool nan = std::isnan(op1);
147        if (!nan || (fpscr.dn == 1)) {
148            val = bitsToFp(qnan, junk);
149        } else if (nan) {
150            val = bitsToFp(fpToBits(op1) | qnan, junk);
151        }
152    } else if (fpClass == FP_SUBNORMAL && fpscr.fz == 1) {
153        // Turn val into a zero with the correct sign;
154        uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1);
155        val = bitsToFp(fpToBits(val) & bitMask, junk);
156        feclearexcept(FeInexact);
157        feraiseexcept(FeUnderflow);
158    }
159    return val;
160}
161
162template
163float fixDest<float>(FPSCR fpscr, float val, float op1);
164template
165double fixDest<double>(FPSCR fpscr, double val, double op1);
166
167template <class fpType>
168fpType
169fixDest(FPSCR fpscr, fpType val, fpType op1, fpType op2)
170{
171    int fpClass = std::fpclassify(val);
172    fpType junk = 0.0;
173    if (fpClass == FP_NAN) {
174        const bool single = (sizeof(val) == sizeof(float));
175        const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
176        const bool nan1 = std::isnan(op1);
177        const bool nan2 = std::isnan(op2);
178        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
179        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
180        if ((!nan1 && !nan2) || (fpscr.dn == 1)) {
181            val = bitsToFp(qnan, junk);
182        } else if (signal1) {
183            val = bitsToFp(fpToBits(op1) | qnan, junk);
184        } else if (signal2) {
185            val = bitsToFp(fpToBits(op2) | qnan, junk);
186        } else if (nan1) {
187            val = op1;
188        } else if (nan2) {
189            val = op2;
190        }
191    } else if (fpClass == FP_SUBNORMAL && fpscr.fz == 1) {
192        // Turn val into a zero with the correct sign;
193        uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1);
194        val = bitsToFp(fpToBits(val) & bitMask, junk);
195        feclearexcept(FeInexact);
196        feraiseexcept(FeUnderflow);
197    }
198    return val;
199}
200
201template
202float fixDest<float>(FPSCR fpscr, float val, float op1, float op2);
203template
204double fixDest<double>(FPSCR fpscr, double val, double op1, double op2);
205
/*
 * Fix the result of a VFP division. fixDest() handles NaN propagation and
 * flush-to-zero; the extra work here handles results that landed exactly
 * on the smallest normalized magnitude (the four bit patterns checked
 * below are +/-FLT_MIN and +/-DBL_MIN). On ARM, underflow is detected
 * before rounding; on x86, after. A result rounded up to the minimum
 * normal may therefore have underflowed without the host noticing, so the
 * division is redone in round-to-zero mode to expose that case.
 */
template <class fpType>
fpType
fixDivDest(FPSCR fpscr, fpType val, fpType op1, fpType op2)
{
    // Apply the common NaN/denormal fixups first.
    fpType mid = fixDest(fpscr, val, op1, op2);
    const bool single = (sizeof(fpType) == sizeof(float));
    const fpType junk = 0.0;
    if ((single && (val == bitsToFp(0x00800000, junk) ||
                    val == bitsToFp(0x80800000, junk))) ||
        (!single && (val == bitsToFp(ULL(0x0010000000000000), junk) ||
                     val == bitsToFp(ULL(0x8010000000000000), junk)))
        ) {
        // The empty asm statements are compiler barriers: they force the
        // values through memory so the compiler cannot constant-fold or
        // reorder the FP operations around the fesetround() call.
        __asm__ __volatile__("" : "=m" (op1) : "m" (op1));
        fesetround(FeRoundZero);
        fpType temp = 0.0;
        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
        // Redo the division truncating towards zero; if that result
        // flushes, the pre-rounding value had underflowed.
        temp = op1 / op2;
        if (flushToZero(temp)) {
            feraiseexcept(FeUnderflow);
            if (fpscr.fz) {
                // FZ mode: suppress inexact and return the flushed value.
                feclearexcept(FeInexact);
                mid = temp;
            }
        }
        __asm__ __volatile__("" :: "m" (temp));
    }
    return mid;
}

template
float fixDivDest<float>(FPSCR fpscr, float val, float op1, float op2);
template
double fixDivDest<double>(FPSCR fpscr, double val, double op1, double op2);
239
/*
 * Convert a double-precision value to single precision with ARM
 * semantics: carry NaN payloads across, honor flush-to-zero, and correct
 * for underflow being detected before rounding on ARM but after rounding
 * on the x86 host.
 */
float
fixFpDFpSDest(FPSCR fpscr, double val)
{
    const float junk = 0.0;
    float op1 = 0.0;
    if (std::isnan(val)) {
        uint64_t valBits = fpToBits(val);
        // Build the single-precision image of the double NaN: the top 22
        // payload bits (valBits[50:29]), an all-ones exponent plus quiet
        // bit (mask(9) << 22 sets bits 30..22), and the original sign.
        uint32_t op1Bits = bits(valBits, 50, 29) |
                           (mask(9) << 22) |
                           (bits(valBits, 63) << 31);
        op1 = bitsToFp(op1Bits, junk);
    }
    float mid = fixDest(fpscr, (float)val, op1);
    // In flush-to-zero mode an underflow suppresses the inexact flag.
    if (fpscr.fz && fetestexcept(FeUnderflow | FeInexact) ==
                    (FeUnderflow | FeInexact)) {
        feclearexcept(FeInexact);
    }
    // Result landed exactly on +/-FLT_MIN: it may have underflowed before
    // rounding without the host reporting it. Redo the conversion
    // truncating towards zero to find out.
    if (mid == bitsToFp(0x00800000, junk) ||
        mid == bitsToFp(0x80800000, junk)) {
        // Empty asm statements are compiler barriers around fesetround().
        __asm__ __volatile__("" : "=m" (val) : "m" (val));
        fesetround(FeRoundZero);
        float temp = 0.0;
        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
        // The narrowing assignment performs the double->float conversion
        // under round-to-zero.
        temp = val;
        if (flushToZero(temp)) {
            feraiseexcept(FeUnderflow);
            if (fpscr.fz) {
                feclearexcept(FeInexact);
                mid = temp;
            }
        }
        __asm__ __volatile__("" :: "m" (temp));
    }
    return mid;
}
275
/*
 * Convert a single-precision value to double precision with ARM
 * semantics, carrying NaN payloads across. Widening a float to double is
 * always exact, so unlike fixFpDFpSDest() there is no inexact/underflow
 * cleanup after the conversion itself.
 */
double
fixFpSFpDDest(FPSCR fpscr, float val)
{
    const double junk = 0.0;
    double op1 = 0.0;
    if (std::isnan(val)) {
        uint32_t valBits = fpToBits(val);
        // Build the double-precision image of the float NaN: payload bits
        // shifted into place, an all-ones exponent plus quiet bit
        // (mask(12) << 51 sets bits 62..51), and the original sign.
        uint64_t op1Bits = ((uint64_t)bits(valBits, 21, 0) << 29) |
                           (mask(12) << 51) |
                           ((uint64_t)bits(valBits, 31) << 63);
        op1 = bitsToFp(op1Bits, junk);
    }
    double mid = fixDest(fpscr, (double)val, op1);
    // NOTE(review): this +/-DBL_MIN re-check mirrors fixFpDFpSDest(), but
    // it looks unreachable here — the smallest nonzero float magnitude
    // (2^-149) is far above DBL_MIN (2^-1022), so a widened float can
    // never equal these patterns. Presumably kept for symmetry; confirm.
    if (mid == bitsToFp(ULL(0x0010000000000000), junk) ||
        mid == bitsToFp(ULL(0x8010000000000000), junk)) {
        // Empty asm statements are compiler barriers around fesetround().
        __asm__ __volatile__("" : "=m" (val) : "m" (val));
        fesetround(FeRoundZero);
        double temp = 0.0;
        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
        temp = val;
        if (flushToZero(temp)) {
            feraiseexcept(FeUnderflow);
            if (fpscr.fz) {
                feclearexcept(FeInexact);
                mid = temp;
            }
        }
        __asm__ __volatile__("" :: "m" (temp));
    }
    return mid;
}
307
/*
 * Convert a single-precision value to half precision and install the
 * 16-bit result in either the top (top == true) or bottom halfword of
 * dest's bit pattern, returning the combined 32-bit pattern as a float.
 * Implements both IEEE half precision and the ARM alternative
 * half-precision (AHP) format, updating FPSCR cumulative flags.
 */
float
vcvtFpSFpH(FPSCR &fpscr, float op, float dest, bool top)
{
    float junk = 0.0;
    uint32_t destBits = fpToBits(dest);
    uint32_t opBits = fpToBits(op);
    // Extract the operand.
    bool neg = bits(opBits, 31);
    uint32_t exponent = bits(opBits, 30, 23);
    uint32_t oldMantissa = bits(opBits, 22, 0);
    // Keep the top 10 mantissa bits for the half-precision mantissa...
    uint32_t mantissa = oldMantissa >> (23 - 10);
    // Do the conversion.
    // ...and the 13 dropped low bits for rounding decisions.
    uint32_t extra = oldMantissa & mask(23 - 10);
    if (exponent == 0xff) {
        if (oldMantissa != 0) {
            // Nans.
            if (bits(mantissa, 9) == 0) {
                // Signalling nan.
                fpscr.ioc = 1;
            }
            if (fpscr.ahp) {
                // AHP format has no NaNs; producing one is invalid.
                mantissa = 0;
                exponent = 0;
                fpscr.ioc = 1;
            } else if (fpscr.dn) {
                // Default NaN mode: positive quiet NaN, payload cleared.
                mantissa = (1 << 9);
                exponent = 0x1f;
                neg = false;
            } else {
                // Propagate the (truncated) payload, quieted.
                exponent = 0x1f;
                mantissa |= (1 << 9);
            }
        } else {
            // Infinities.
            exponent = 0x1F;
            if (fpscr.ahp) {
                // AHP format has no infinities: saturate and flag invalid.
                fpscr.ioc = 1;
                mantissa = 0x3ff;
            } else {
                mantissa = 0;
            }
        }
    } else if (exponent == 0 && oldMantissa == 0) {
        // Zero, don't need to do anything.
    } else {
        // Normalized or denormalized numbers.

        bool inexact = (extra != 0);

        if (exponent == 0) {
            // Denormalized.

            // If flush to zero is on, this shouldn't happen.
            assert(fpscr.fz == 0);

            // Check for underflow
            if (inexact || fpscr.ufe)
                fpscr.ufc = 1;

            // Handle rounding.
            // NOTE(review): extra holds the 13 dropped bits, so the
            // round-to-nearest midpoint would be expected at (1 << 12);
            // confirm the (1 << 9) threshold against the ARM ARM.
            unsigned mode = fpscr.rMode;
            if ((mode == VfpRoundUpward && !neg && extra) ||
                (mode == VfpRoundDown && neg && extra) ||
                (mode == VfpRoundNearest &&
                 (extra > (1 << 9) ||
                  (extra == (1 << 9) && bits(mantissa, 0))))) {
                mantissa++;
            }

            // See if the number became normalized after rounding.
            if (mantissa == (1 << 10)) {
                mantissa = 0;
                exponent = 1;
            }
        } else {
            // Normalized.

            // We need to track the dropped bits differently since
            // more can be dropped by denormalizing.
            bool topOne = bits(extra, 12);
            bool restZeros = bits(extra, 11, 0) == 0;

            if (exponent <= (127 - 15)) {
                // The result is too small. Denormalize.
                // Restore the implicit leading one, then shift right until
                // the exponent is in range, tracking every dropped bit.
                mantissa |= (1 << 10);
                while (mantissa && exponent <= (127 - 15)) {
                    restZeros = restZeros && !topOne;
                    topOne = bits(mantissa, 0);
                    mantissa = mantissa >> 1;
                    exponent++;
                }
                if (topOne || !restZeros)
                    inexact = true;
                exponent = 0;
            } else {
                // Change bias.
                exponent -= (127 - 15);
            }

            if (exponent == 0 && (inexact || fpscr.ufe)) {
                // Underflow
                fpscr.ufc = 1;
            }

            // Handle rounding: round up in the appropriate directed mode,
            // or in round-to-nearest when the dropped bits exceed (or tie
            // with an odd mantissa at) the halfway point.
            unsigned mode = fpscr.rMode;
            bool nonZero = topOne || !restZeros;
            if ((mode == VfpRoundUpward && !neg && nonZero) ||
                (mode == VfpRoundDown && neg && nonZero) ||
                (mode == VfpRoundNearest && topOne &&
                 (!restZeros || bits(mantissa, 0)))) {
                mantissa++;
            }

            // See if we rounded up and need to bump the exponent.
            if (mantissa == (1 << 10)) {
                mantissa = 0;
                exponent++;
            }

            // Deal with overflow
            if (fpscr.ahp) {
                // AHP: exponent 0x1f is a normal number; saturate there.
                if (exponent >= 0x20) {
                    exponent = 0x1f;
                    mantissa = 0x3ff;
                    fpscr.ioc = 1;
                    // Suppress the inexact exception.
                    inexact = false;
                }
            } else {
                if (exponent >= 0x1f) {
                    if ((mode == VfpRoundNearest) ||
                        (mode == VfpRoundUpward && !neg) ||
                        (mode == VfpRoundDown && neg)) {
                        // Overflow to infinity.
                        exponent = 0x1f;
                        mantissa = 0;
                    } else {
                        // Overflow to max normal.
                        exponent = 0x1e;
                        mantissa = 0x3ff;
                    }
                    fpscr.ofc = 1;
                    inexact = true;
                }
            }
        }

        if (inexact) {
            fpscr.ixc = 1;
        }
    }
    // Reassemble and install the result.
    uint32_t result = bits(mantissa, 9, 0);
    replaceBits(result, 14, 10, exponent);
    if (neg)
        result |= (1 << 15);
    if (top)
        replaceBits(destBits, 31, 16, result);
    else
        replaceBits(destBits, 15, 0, result);
    return bitsToFp(destBits, junk);
}
471
/*
 * Convert a half-precision value, taken from the top (top == true) or
 * bottom halfword of op's bit pattern, to single precision. The widening
 * is exact: every half-precision value (including denormals, which are
 * normalized here) is representable in single precision. In AHP mode
 * exponent 0x1f is a large normal number, not infinity/NaN, so the
 * special-case branch is skipped.
 */
float
vcvtFpHFpS(FPSCR &fpscr, float op, bool top)
{
    float junk = 0.0;
    uint32_t opBits = fpToBits(op);
    // Extract the operand.
    if (top)
        opBits = bits(opBits, 31, 16);
    else
        opBits = bits(opBits, 15, 0);
    // Extract the bitfields.
    bool neg = bits(opBits, 15);
    uint32_t exponent = bits(opBits, 14, 10);
    uint32_t mantissa = bits(opBits, 9, 0);
    // Do the conversion.
    if (exponent == 0) {
        if (mantissa != 0) {
            // Normalize the value: shift the mantissa up until its
            // implicit leading one is in bit 10, adjusting the (rebased)
            // exponent to match.
            exponent = exponent + (127 - 15) + 1;
            while (mantissa < (1 << 10)) {
                mantissa = mantissa << 1;
                exponent--;
            }
        }
        mantissa = mantissa << (23 - 10);
    } else if (exponent == 0x1f && !fpscr.ahp) {
        // Infinities and nans.
        exponent = 0xff;
        if (mantissa != 0) {
            // Nans.
            mantissa = mantissa << (23 - 10);
            if (bits(mantissa, 22) == 0) {
                // Signalling nan.
                fpscr.ioc = 1;
                mantissa |= (1 << 22);
            }
            if (fpscr.dn) {
                // Default NaN mode: clear payload and sign.
                mantissa &= ~mask(22);
                neg = false;
            }
        }
    } else {
        // Normal number: rebias the exponent and widen the mantissa.
        exponent = exponent + (127 - 15);
        mantissa = mantissa << (23 - 10);
    }
    // Reassemble the result.
    uint32_t result = bits(mantissa, 22, 0);
    replaceBits(result, 30, 23, exponent);
    if (neg)
        result |= (1 << 31);
    return bitsToFp(result, junk);
}
524
/*
 * Convert a single-precision value to a fixed-point integer with imm
 * fraction bits. half selects a 16-bit result, isSigned a signed one,
 * and rzero forces round-towards-zero instead of the current host
 * rounding mode. Out-of-range values saturate and raise Invalid;
 * saturation suppresses Inexact.
 */
uint64_t
vfpFpSToFixed(float val, bool isSigned, bool half,
              uint8_t imm, bool rzero)
{
    int rmode = rzero ? FeRoundZero : fegetround();
    // Empty asm statements act as compiler barriers so the scaling,
    // rounding-mode changes, and flag manipulation below stay in order.
    __asm__ __volatile__("" : "=m" (rmode) : "m" (rmode));
    // Scale by 2^imm in round-to-nearest, then switch to the requested
    // mode for the integer rounding step.
    fesetround(FeRoundNearest);
    val = val * powf(2.0, imm);
    __asm__ __volatile__("" : "=m" (val) : "m" (val));
    fesetround(rmode);
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (val) : "m" (val));
    float origVal = val;
    val = rintf(val);
    int fpType = std::fpclassify(val);
    if (fpType == FP_SUBNORMAL || fpType == FP_NAN) {
        // NaNs convert to zero and raise Invalid; subnormals to zero.
        if (fpType == FP_NAN) {
            feraiseexcept(FeInvalid);
        }
        val = 0.0;
    } else if (origVal != val) {
        // NOTE(review): with a conforming rintf() these per-mode
        // corrections should never trigger (rint already honors the
        // current rounding mode); presumably a guard against host
        // rounding quirks — confirm before removing.
        switch (rmode) {
          case FeRoundNearest:
            if (origVal - val > 0.5)
                val += 1.0;
            else if (val - origVal > 0.5)
                val -= 1.0;
            break;
          case FeRoundDown:
            if (origVal < val)
                val -= 1.0;
            break;
          case FeRoundUpward:
            if (origVal > val)
                val += 1.0;
            break;
        }
        feraiseexcept(FeInexact);
    }

    // Saturate to the destination range. Comparisons are done in double
    // so every 32-bit bound is represented exactly. (int16_t)(1 << 15)
    // and (int32_t)(1 << 31) are the most negative representable values.
    if (isSigned) {
        if (half) {
            if ((double)val < (int16_t)(1 << 15)) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return (int16_t)(1 << 15);
            }
            if ((double)val > (int16_t)mask(15)) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return (int16_t)mask(15);
            }
            return (int16_t)val;
        } else {
            if ((double)val < (int32_t)(1 << 31)) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return (int32_t)(1 << 31);
            }
            if ((double)val > (int32_t)mask(31)) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return (int32_t)mask(31);
            }
            return (int32_t)val;
        }
    } else {
        if (half) {
            if ((double)val < 0) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return 0;
            }
            if ((double)val > (mask(16))) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return mask(16);
            }
            return (uint16_t)val;
        } else {
            if ((double)val < 0) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return 0;
            }
            if ((double)val > (mask(32))) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return mask(32);
            }
            return (uint32_t)val;
        }
    }
}
619
/*
 * Convert an unsigned fixed-point value with imm fraction bits to single
 * precision (round-to-nearest). half restricts the input to its low 16
 * bits. fixDivDest() applies the ARM underflow/flush-to-zero fixups to
 * the dividing scale operation.
 */
float
vfpUFixedToFpS(FPSCR fpscr, uint32_t val, bool half, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (half)
        val = (uint16_t)val;
    float scale = powf(2.0, imm);
    // Barriers keep the powf() call and flag clearing from being
    // reordered around each other by the compiler.
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(fpscr, val / scale, (float)val, scale);
}
632
/*
 * Convert a signed fixed-point value with imm fraction bits to single
 * precision (round-to-nearest). half sign-extends from the low 16 bits.
 */
float
vfpSFixedToFpS(FPSCR fpscr, int32_t val, bool half, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (half)
        val = sext<16>(val & mask(16));
    float scale = powf(2.0, imm);
    // Barriers keep the powf() call and flag clearing from being
    // reordered around each other by the compiler.
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(fpscr, val / scale, (float)val, scale);
}
645
/*
 * Convert a double-precision value to a fixed-point integer with imm
 * fraction bits. Mirrors vfpFpSToFixed(): half selects a 16-bit result,
 * isSigned a signed one, and rzero forces round-towards-zero. Values out
 * of range saturate, raising Invalid and suppressing Inexact.
 */
uint64_t
vfpFpDToFixed(double val, bool isSigned, bool half,
              uint8_t imm, bool rzero)
{
    int rmode = rzero ? FeRoundZero : fegetround();
    // Scale by 2^imm in round-to-nearest, then switch to the requested
    // mode for the integer rounding. The empty asm statements are
    // compiler barriers keeping these steps in program order.
    fesetround(FeRoundNearest);
    val = val * pow(2.0, imm);
    __asm__ __volatile__("" : "=m" (val) : "m" (val));
    fesetround(rmode);
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (val) : "m" (val));
    double origVal = val;
    val = rint(val);
    int fpType = std::fpclassify(val);
    if (fpType == FP_SUBNORMAL || fpType == FP_NAN) {
        // NaNs convert to zero and raise Invalid; subnormals to zero.
        if (fpType == FP_NAN) {
            feraiseexcept(FeInvalid);
        }
        val = 0.0;
    } else if (origVal != val) {
        // NOTE(review): with a conforming rint() these per-mode
        // corrections should never trigger; presumably a guard against
        // host rounding quirks — confirm before removing.
        switch (rmode) {
          case FeRoundNearest:
            if (origVal - val > 0.5)
                val += 1.0;
            else if (val - origVal > 0.5)
                val -= 1.0;
            break;
          case FeRoundDown:
            if (origVal < val)
                val -= 1.0;
            break;
          case FeRoundUpward:
            if (origVal > val)
                val += 1.0;
            break;
        }
        feraiseexcept(FeInexact);
    }
    // Saturate to the destination range; (int16_t)(1 << 15) and
    // (int32_t)(1 << 31) are the most negative representable values.
    if (isSigned) {
        if (half) {
            if (val < (int16_t)(1 << 15)) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return (int16_t)(1 << 15);
            }
            if (val > (int16_t)mask(15)) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return (int16_t)mask(15);
            }
            return (int16_t)val;
        } else {
            if (val < (int32_t)(1 << 31)) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return (int32_t)(1 << 31);
            }
            if (val > (int32_t)mask(31)) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return (int32_t)mask(31);
            }
            return (int32_t)val;
        }
    } else {
        if (half) {
            if (val < 0) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return 0;
            }
            if (val > mask(16)) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return mask(16);
            }
            return (uint16_t)val;
        } else {
            if (val < 0) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return 0;
            }
            if (val > mask(32)) {
                feraiseexcept(FeInvalid);
                feclearexcept(FeInexact);
                return mask(32);
            }
            return (uint32_t)val;
        }
    }
}
738
/*
 * Convert an unsigned fixed-point value with imm fraction bits to double
 * precision (round-to-nearest). half restricts the input to its low 16
 * bits.
 */
double
vfpUFixedToFpD(FPSCR fpscr, uint32_t val, bool half, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (half)
        val = (uint16_t)val;
    double scale = pow(2.0, imm);
    // Barriers keep the pow() call and flag clearing from being
    // reordered around each other by the compiler.
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(fpscr, val / scale, (double)val, scale);
}
751
/*
 * Convert a signed fixed-point value with imm fraction bits to double
 * precision (round-to-nearest). half sign-extends from the low 16 bits.
 */
double
vfpSFixedToFpD(FPSCR fpscr, int32_t val, bool half, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (half)
        val = sext<16>(val & mask(16));
    double scale = pow(2.0, imm);
    // Barriers keep the pow() call and flag clearing from being
    // reordered around each other by the compiler.
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(fpscr, val / scale, (double)val, scale);
}
764
/*
 * Run a binary FP operation func(op1, op2) under the VFP rounding mode
 * rMode, then patch the host result to match ARM semantics: input
 * flush-to-zero (raising IDC), ARM NaN selection, output flush-to-zero,
 * and the ARM-detects-underflow-before-rounding correction. Exception
 * flags are folded into fpscr and the host FP state restored.
 */
template <class fpType>
fpType
FpOp::binaryOp(FPSCR &fpscr, fpType op1, fpType op2,
               fpType (*func)(fpType, fpType),
               bool flush, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    // Flushing a denormal input raises the input-denormal flag.
    if (flush && flushToZero(op1, op2))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    // Empty asm statements are compiler barriers: they force the operands
    // through memory so the call to func() can't be folded or moved
    // across the rounding-mode setup.
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (state)
                             : "m" (op1), "m" (op2), "m" (state));
    fpType dest = func(op1, op2);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

    int fpClass = std::fpclassify(dest);
    // Get NAN behavior right. This varies between x86 and ARM.
    if (fpClass == FP_NAN) {
        // NOTE(review): this declaration shadows the `single` above with
        // the same value; harmless but could be removed.
        const bool single = (sizeof(fpType) == sizeof(float));
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan1 = std::isnan(op1);
        const bool nan2 = std::isnan(op2);
        // A NaN is signalling if its quiet bit(s) are not all set.
        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
        // ARM priority: default NaN, quieted signalling op1/op2, then
        // quiet op1/op2.
        if ((!nan1 && !nan2) || (fpscr.dn == 1)) {
            dest = bitsToFp(qnan, junk);
        } else if (signal1) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            dest = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (nan1) {
            dest = op1;
        } else if (nan2) {
            dest = op2;
        }
    } else if (flush && flushToZero(dest)) {
        feraiseexcept(FeUnderflow);
    } else if ((
                // Result landed exactly on the minimum normal magnitude
                // (+/-FLT_MIN or +/-DBL_MIN)...
                (single && (dest == bitsToFp(0x00800000, junk) ||
                     dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                    (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                     dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before- rounding
         * in ARM and -after- rounding in x86.
         */
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2)
                                 : "m" (op1), "m" (op2));
        fpType temp = func(op1, op2);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    finishVfp(fpscr, state);
    return dest;
}

template
float FpOp::binaryOp(FPSCR &fpscr, float op1, float op2,
                     float (*func)(float, float),
                     bool flush, uint32_t rMode) const;
template
double FpOp::binaryOp(FPSCR &fpscr, double op1, double op2,
                      double (*func)(double, double),
                      bool flush, uint32_t rMode) const;
837
/*
 * Run a unary FP operation func(op1) under the VFP rounding mode rMode,
 * then patch the host result to match ARM semantics. Unary counterpart
 * of binaryOp(); see there for the per-step rationale.
 */
template <class fpType>
fpType
FpOp::unaryOp(FPSCR &fpscr, fpType op1, fpType (*func)(fpType),
              bool flush, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    // Flushing a denormal input raises the input-denormal flag.
    if (flush && flushToZero(op1))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    // Empty asm statements are compiler barriers keeping the func() call
    // from being folded or moved across the rounding-mode setup.
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (state)
                             : "m" (op1), "m" (state));
    fpType dest = func(op1);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

    int fpClass = std::fpclassify(dest);
    // Get NAN behavior right. This varies between x86 and ARM.
    if (fpClass == FP_NAN) {
        // NOTE(review): this declaration shadows the `single` above with
        // the same value; harmless but could be removed.
        const bool single = (sizeof(fpType) == sizeof(float));
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan = std::isnan(op1);
        if (!nan || fpscr.dn == 1) {
            // NaN out of a non-NaN operand, or default-NaN mode.
            dest = bitsToFp(qnan, junk);
        } else if (nan) {
            // Propagate the operand's NaN payload, quieted.
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        }
    } else if (flush && flushToZero(dest)) {
        feraiseexcept(FeUnderflow);
    } else if ((
                // Result landed exactly on the minimum normal magnitude.
                (single && (dest == bitsToFp(0x00800000, junk) ||
                     dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                    (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                     dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before- rounding
         * in ARM and -after- rounding in x86.
         */
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1) : "m" (op1));
        fpType temp = func(op1);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    finishVfp(fpscr, state);
    return dest;
}

template
float FpOp::unaryOp(FPSCR &fpscr, float op1, float (*func)(float),
                    bool flush, uint32_t rMode) const;
template
double FpOp::unaryOp(FPSCR &fpscr, double op1, double (*func)(double),
                     bool flush, uint32_t rMode) const;
897
898IntRegIndex
899VfpMacroOp::addStride(IntRegIndex idx, unsigned stride)
900{
901    if (wide) {
902        stride *= 2;
903    }
904    unsigned offset = idx % 8;
905    idx = (IntRegIndex)(idx - offset);
906    offset += stride;
907    idx = (IntRegIndex)(idx + (offset % 8));
908    return idx;
909}
910
911void
912VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1, IntRegIndex &op2)
913{
914    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
915    assert(!inScalarBank(dest));
916    dest = addStride(dest, stride);
917    op1 = addStride(op1, stride);
918    if (!inScalarBank(op2)) {
919        op2 = addStride(op2, stride);
920    }
921}
922
923void
924VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1)
925{
926    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
927    assert(!inScalarBank(dest));
928    dest = addStride(dest, stride);
929    if (!inScalarBank(op1)) {
930        op1 = addStride(op1, stride);
931    }
932}
933
934void
935VfpMacroOp::nextIdxs(IntRegIndex &dest)
936{
937    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
938    assert(!inScalarBank(dest));
939    dest = addStride(dest, stride);
940}
941
942}
943