1// Copyright (c) 2009 The Regents of The University of Michigan
2// Copyright (c) 2015 Advanced Micro Devices, Inc.
3//
4// All rights reserved.
5//
6// Redistribution and use in source and binary forms, with or without
7// modification, are permitted provided that the following conditions are
8// met: redistributions of source code must retain the above copyright
9// notice, this list of conditions and the following disclaimer;
10// redistributions in binary form must reproduce the above copyright
11// notice, this list of conditions and the following disclaimer in the
12// documentation and/or other materials provided with the distribution;
13// neither the name of the copyright holders nor the names of its
14// contributors may be used to endorse or promote products derived from
15// this software without specific prior written permission.
16//
17// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28//
29// Authors: Gabe Black
30
// Template for the execute() method shared by the register and immediate
// forms of a media microop. The parser fills in the operand declarations,
// operand reads, the operation body and the operand write-back.
def template MediaOpExecute {{
        Fault %(class_name)s::execute(ExecContext *xc,
                Trace::InstRecord *traceData) const
        {
            Fault fault = NoFault;

            %(op_decl)s;
            %(op_rd)s;

            %(code)s;

            // Write the resulting state to the execution context, but only
            // if the operation body did not raise a fault.
            if(fault == NoFault)
            {
                %(op_wb)s;
            }
            return fault;
        }
}};
50
// Class declaration template for a media microop whose second source is a
// register (_src2). The matching constructor body is produced by
// MediaOpRegConstructor and execute() by MediaOpExecute.
def template MediaOpRegDeclare {{
    class %(class_name)s : public %(base_class)s
    {
      public:
        %(class_name)s(ExtMachInst _machInst,
                const char * instMnem, uint64_t setFlags,
                InstRegIndex _src1, InstRegIndex _src2, InstRegIndex _dest,
                uint8_t _srcSize, uint8_t _destSize, uint16_t _ext);

        Fault execute(ExecContext *, Trace::InstRecord *) const;
    };
}};
63
// Class declaration template for a media microop whose second source is an
// immediate (_imm8) instead of a register. The matching constructor body is
// produced by MediaOpImmConstructor and execute() by MediaOpExecute.
def template MediaOpImmDeclare {{

    class %(class_name)s : public %(base_class)s
    {
      public:
        %(class_name)s(ExtMachInst _machInst,
                const char * instMnem, uint64_t setFlags,
                InstRegIndex _src1, uint16_t _imm8, InstRegIndex _dest,
                uint8_t _srcSize, uint8_t _destSize, uint16_t _ext);

        Fault execute(ExecContext *, Trace::InstRecord *) const;
    };
}};
77
// Constructor template for the register form of a media microop. All
// arguments are forwarded to the base class together with the mnemonic and
// op class; the body only holds the parser generated constructor code.
def template MediaOpRegConstructor {{
    %(class_name)s::%(class_name)s(
            ExtMachInst machInst, const char * instMnem, uint64_t setFlags,
            InstRegIndex _src1, InstRegIndex _src2, InstRegIndex _dest,
            uint8_t _srcSize, uint8_t _destSize, uint16_t _ext) :
        %(base_class)s(machInst, "%(mnemonic)s", instMnem, setFlags,
                _src1, _src2, _dest, _srcSize, _destSize, _ext,
                %(op_class)s)
    {
        %(constructor)s;
    }
}};
90
// Constructor template for the immediate form of a media microop. Identical
// to the register form except that the second source is the immediate
// _imm8, which is forwarded to the base class in its place.
def template MediaOpImmConstructor {{
    %(class_name)s::%(class_name)s(
            ExtMachInst machInst, const char * instMnem, uint64_t setFlags,
            InstRegIndex _src1, uint16_t _imm8, InstRegIndex _dest,
            uint8_t _srcSize, uint8_t _destSize, uint16_t _ext) :
        %(base_class)s(machInst, "%(mnemonic)s", instMnem, setFlags,
                _src1, _imm8, _dest, _srcSize, _destSize, _ext,
                %(op_class)s)
    {
        %(constructor)s;
    }
}};
103
104let {{
105    # Make these empty strings so that concatenating onto
106    # them will always work.
107    header_output = ""
108    decoder_output = ""
109    exec_output = ""
110
111    immTemplates = (
112            MediaOpImmDeclare,
113            MediaOpImmConstructor,
114            MediaOpExecute)
115
116    regTemplates = (
117            MediaOpRegDeclare,
118            MediaOpRegConstructor,
119            MediaOpExecute)
120
121    class MediaOpMeta(type):
122        def buildCppClasses(self, name, Name, suffix, code):
123
124            # Globals to stick the output in
125            global header_output
126            global decoder_output
127            global exec_output
128
129            # If op2 is used anywhere, make register and immediate versions
130            # of this code.
131            matcher = re.compile(r"(?<!\w)(?P<prefix>s?)op2(?P<typeQual>_[^\W_]+)?")
132            match = matcher.search(code)
133            if match:
134                typeQual = ""
135                if match.group("typeQual"):
136                    typeQual = match.group("typeQual")
137                src2_name = "%sFpSrcReg2%s" % (match.group("prefix"), typeQual)
138                self.buildCppClasses(name, Name, suffix,
139                        matcher.sub(src2_name, code))
140                self.buildCppClasses(name + "i", Name, suffix + "Imm",
141                        matcher.sub("imm8", code))
142                return
143
144            base = "X86ISA::MediaOp"
145
146            # If imm8 shows up in the code, use the immediate templates, if
147            # not, hopefully the register ones will be correct.
148            matcher = re.compile("(?<!\w)imm8(?!\w)")
149            if matcher.search(code):
150                base += "Imm"
151                templates = immTemplates
152            else:
153                base += "Reg"
154                templates = regTemplates
155
156            # Get everything ready for the substitution
157            opt_args = []
158            if self.op_class:
159                opt_args.append(self.op_class)
160            iop = InstObjParams(name, Name + suffix, base, {"code" : code},
161                                opt_args)
162
163            # Generate the actual code (finally!)
164            header_output += templates[0].subst(iop)
165            decoder_output += templates[1].subst(iop)
166            exec_output += templates[2].subst(iop)
167
168
169        def __new__(mcls, Name, bases, dict):
170            abstract = False
171            name = Name.lower()
172            if "abstract" in dict:
173                abstract = dict['abstract']
174                del dict['abstract']
175            if not "op_class" in dict:
176                dict["op_class"] = None
177
178            cls = super(MediaOpMeta, mcls).__new__(mcls, Name, bases, dict)
179            if not abstract:
180                cls.className = Name
181                cls.base_mnemonic = name
182                code = cls.code
183
184                # Set up the C++ classes
185                mcls.buildCppClasses(cls, name, Name, "", code)
186
187                # Hook into the microassembler dict
188                global microopClasses
189                microopClasses[name] = cls
190
191                # If op2 is used anywhere, make register and immediate versions
192                # of this code.
193                matcher = re.compile(r"op2(?P<typeQual>_[^\W_]+)?")
194                if matcher.search(code):
195                    microopClasses[name + 'i'] = cls
196            return cls
197
198
199    class MediaOp(X86Microop):
200        __metaclass__ = MediaOpMeta
201        # This class itself doesn't act as a microop
202        abstract = True
203
204        def __init__(self, dest, src1, op2,
205                size = None, destSize = None, srcSize = None, ext = None):
206            self.dest = dest
207            self.src1 = src1
208            self.op2 = op2
209            if size is not None:
210                self.srcSize = size
211                self.destSize = size
212            if srcSize is not None:
213                self.srcSize = srcSize
214            if destSize is not None:
215                self.destSize = destSize
216            if self.srcSize is None:
217                raise Exception, "Source size not set."
218            if self.destSize is None:
219                raise Exception, "Dest size not set."
220            if ext is None:
221                self.ext = 0
222            else:
223                self.ext = ext
224
225        def getAllocator(self, microFlags):
226            className = self.className
227            if self.mnemonic == self.base_mnemonic + 'i':
228                className += "Imm"
229            allocator = '''new %(class_name)s(machInst, macrocodeBlock,
230                    %(flags)s, %(src1)s, %(op2)s, %(dest)s,
231                    %(srcSize)s, %(destSize)s, %(ext)s)''' % {
232                "class_name" : className,
233                "flags" : self.microFlagsText(microFlags),
234                "src1" : self.src1, "op2" : self.op2,
235                "dest" : self.dest,
236                "srcSize" : self.srcSize,
237                "destSize" : self.destSize,
238                "ext" : self.ext}
239            return allocator
240
241    class Mov2int(MediaOp):
242        def __init__(self, dest, src1, src2 = 0, \
243                size = None, destSize = None, srcSize = None, ext = None):
244            super(Mov2int, self).__init__(dest, src1,\
245                    src2, size, destSize, srcSize, ext)
246        op_class = 'SimdMiscOp'
247        code = '''
248            int items = sizeof(double) / srcSize;
249            int offset = imm8;
250            if (bits(src1, 0) && (ext & 0x1))
251                offset -= items;
252            if (offset >= 0 && offset < items) {
253                uint64_t fpSrcReg1 =
254                    bits(FpSrcReg1_uqw,
255                            (offset + 1) * srcSize * 8 - 1,
256                            (offset + 0) * srcSize * 8);
257                DestReg = merge(0, fpSrcReg1, destSize);
258            } else {
259                DestReg = DestReg;
260            }
261        '''
262
263    class Mov2fp(MediaOp):
264        def __init__(self, dest, src1, src2 = 0, \
265                size = None, destSize = None, srcSize = None, ext = None):
266            super(Mov2fp, self).__init__(dest, src1,\
267                    src2, size, destSize, srcSize, ext)
268        op_class = 'SimdMiscOp'
269        code = '''
270            int items = sizeof(double) / destSize;
271            int offset = imm8;
272            if (bits(dest, 0) && (ext & 0x1))
273                offset -= items;
274            if (offset >= 0 && offset < items) {
275                uint64_t srcReg1 = pick(SrcReg1, 0, srcSize);
276                FpDestReg_uqw =
277                    insertBits(FpDestReg_uqw,
278                            (offset + 1) * destSize * 8 - 1,
279                            (offset + 0) * destSize * 8, srcReg1);
280            } else {
281                FpDestReg_uqw = FpDestReg_uqw;
282            }
283        '''
284
285    class Movsign(MediaOp):
286        def __init__(self, dest, src, \
287                size = None, destSize = None, srcSize = None, ext = None):
288            super(Movsign, self).__init__(dest, src,\
289                    "InstRegIndex(0)", size, destSize, srcSize, ext)
290        op_class = 'SimdMiscOp'
291        code = '''
292            int items = sizeof(double) / srcSize;
293            uint64_t result = 0;
294            int offset = (ext & 0x1) ? items : 0;
295            for (int i = 0; i < items; i++) {
296                uint64_t picked =
297                    bits(FpSrcReg1_uqw, (i + 1) * 8 * srcSize - 1);
298                result = insertBits(result, i + offset, i + offset, picked);
299            }
300            DestReg = DestReg | result;
301        '''
302
303    class Maskmov(MediaOp):
304        op_class = 'SimdMiscOp'
305        code = '''
306            assert(srcSize == destSize);
307            int size = srcSize;
308            int sizeBits = size * 8;
309            int items = numItems(size);
310            uint64_t result = FpDestReg_uqw;
311
312            for (int i = 0; i < items; i++) {
313                int hiIndex = (i + 1) * sizeBits - 1;
314                int loIndex = (i + 0) * sizeBits;
315                uint64_t arg1Bits = bits(FpSrcReg1_uqw, hiIndex, loIndex);
316                if (bits(FpSrcReg2_uqw, hiIndex))
317                    result = insertBits(result, hiIndex, loIndex, arg1Bits);
318            }
319            FpDestReg_uqw = result;
320        '''
321
322    class shuffle(MediaOp):
323        op_class = 'SimdMiscOp'
324        code = '''
325            assert(srcSize == destSize);
326            int size = srcSize;
327            int sizeBits = size * 8;
328            int items = sizeof(double) / size;
329            int options;
330            int optionBits;
331            if (size == 8) {
332                options = 2;
333                optionBits = 1;
334            } else {
335                options = 4;
336                optionBits = 2;
337            }
338
339            uint64_t result = 0;
340            uint8_t sel = ext;
341
342            for (int i = 0; i < items; i++) {
343                uint64_t resBits;
344                uint8_t lsel = sel & mask(optionBits);
345                if (lsel * size >= sizeof(double)) {
346                    lsel -= options / 2;
347                    resBits = bits(FpSrcReg2_uqw,
348                            (lsel + 1) * sizeBits - 1,
349                            (lsel + 0) * sizeBits);
350                }  else {
351                    resBits = bits(FpSrcReg1_uqw,
352                            (lsel + 1) * sizeBits - 1,
353                            (lsel + 0) * sizeBits);
354                }
355
356                sel >>= optionBits;
357
358                int hiIndex = (i + 1) * sizeBits - 1;
359                int loIndex = (i + 0) * sizeBits;
360                result = insertBits(result, hiIndex, loIndex, resBits);
361            }
362            FpDestReg_uqw = result;
363        '''
364
365    class Unpack(MediaOp):
366        op_class = 'SimdMiscOp'
367        code = '''
368            assert(srcSize == destSize);
369            int size = destSize;
370            int items = (sizeof(double) / size) / 2;
371            int offset = ext ? items : 0;
372            uint64_t result = 0;
373            for (int i = 0; i < items; i++) {
374                uint64_t pickedLow =
375                    bits(FpSrcReg1_uqw, (i + offset + 1) * 8 * size - 1,
376                                        (i + offset) * 8 * size);
377                result = insertBits(result,
378                                    (2 * i + 1) * 8 * size - 1,
379                                    (2 * i + 0) * 8 * size,
380                                    pickedLow);
381                uint64_t pickedHigh =
382                    bits(FpSrcReg2_uqw, (i + offset + 1) * 8 * size - 1,
383                                        (i + offset) * 8 * size);
384                result = insertBits(result,
385                                    (2 * i + 2) * 8 * size - 1,
386                                    (2 * i + 1) * 8 * size,
387                                    pickedHigh);
388            }
389            FpDestReg_uqw = result;
390        '''
391
392    class Pack(MediaOp):
393        op_class = 'SimdMiscOp'
394        code = '''
395            assert(srcSize == destSize * 2);
396            int items = (sizeof(double) / destSize);
397            int destBits = destSize * 8;
398            int srcBits = srcSize * 8;
399            uint64_t result = 0;
400            int i;
401            for (i = 0; i < items / 2; i++) {
402                uint64_t picked =
403                    bits(FpSrcReg1_uqw, (i + 1) * srcBits - 1,
404                                        (i + 0) * srcBits);
405                unsigned signBit = bits(picked, srcBits - 1);
406                uint64_t overflow = bits(picked, srcBits - 1, destBits - 1);
407
408                // Handle saturation.
409                if (signBit) {
410                    if (overflow != mask(destBits - srcBits + 1)) {
411                        if (signedOp())
412                            picked = (ULL(1) << (destBits - 1));
413                        else
414                            picked = 0;
415                    }
416                } else {
417                    if (overflow != 0) {
418                        if (signedOp())
419                            picked = mask(destBits - 1);
420                        else
421                            picked = mask(destBits);
422                    }
423                }
424                result = insertBits(result,
425                                    (i + 1) * destBits - 1,
426                                    (i + 0) * destBits,
427                                    picked);
428            }
429            for (;i < items; i++) {
430                uint64_t picked =
431                    bits(FpSrcReg2_uqw, (i - items + 1) * srcBits - 1,
432                                        (i - items + 0) * srcBits);
433                unsigned signBit = bits(picked, srcBits - 1);
434                uint64_t overflow = bits(picked, srcBits - 1, destBits - 1);
435
436                // Handle saturation.
437                if (signBit) {
438                    if (overflow != mask(destBits - srcBits + 1)) {
439                        if (signedOp())
440                            picked = (ULL(1) << (destBits - 1));
441                        else
442                            picked = 0;
443                    }
444                } else {
445                    if (overflow != 0) {
446                        if (signedOp())
447                            picked = mask(destBits - 1);
448                        else
449                            picked = mask(destBits);
450                    }
451                }
452                result = insertBits(result,
453                                    (i + 1) * destBits - 1,
454                                    (i + 0) * destBits,
455                                    picked);
456            }
457            FpDestReg_uqw = result;
458        '''
459
460    class Mxor(MediaOp):
461        def __init__(self, dest, src1, src2):
462            super(Mxor, self).__init__(dest, src1, src2, 1)
463        op_class = 'SimdAluOp'
464        code = '''
465            FpDestReg_uqw = FpSrcReg1_uqw ^ FpSrcReg2_uqw;
466        '''
467
468    class Mor(MediaOp):
469        def __init__(self, dest, src1, src2):
470            super(Mor, self).__init__(dest, src1, src2, 1)
471        op_class = 'SimdAluOp'
472        code = '''
473            FpDestReg_uqw = FpSrcReg1_uqw | FpSrcReg2_uqw;
474        '''
475
476    class Mand(MediaOp):
477        def __init__(self, dest, src1, src2):
478            super(Mand, self).__init__(dest, src1, src2, 1)
479        op_class = 'SimdAluOp'
480        code = '''
481            FpDestReg_uqw = FpSrcReg1_uqw & FpSrcReg2_uqw;
482        '''
483
484    class Mandn(MediaOp):
485        def __init__(self, dest, src1, src2):
486            super(Mandn, self).__init__(dest, src1, src2, 1)
487        op_class = 'SimdAluOp'
488        code = '''
489            FpDestReg_uqw = ~FpSrcReg1_uqw & FpSrcReg2_uqw;
490        '''
491
492    class Mminf(MediaOp):
493        op_class = 'SimdFloatCmpOp'
494        code = '''
495            union floatInt
496            {
497                float f;
498                uint32_t i;
499            };
500            union doubleInt
501            {
502                double d;
503                uint64_t i;
504            };
505
506            assert(srcSize == destSize);
507            int size = srcSize;
508            int sizeBits = size * 8;
509            assert(srcSize == 4 || srcSize == 8);
510            int items = numItems(size);
511            uint64_t result = FpDestReg_uqw;
512
513            for (int i = 0; i < items; i++) {
514                double arg1, arg2;
515                int hiIndex = (i + 1) * sizeBits - 1;
516                int loIndex = (i + 0) * sizeBits;
517                uint64_t arg1Bits = bits(FpSrcReg1_uqw, hiIndex, loIndex);
518                uint64_t arg2Bits = bits(FpSrcReg2_uqw, hiIndex, loIndex);
519
520                if (size == 4) {
521                    floatInt fi;
522                    fi.i = arg1Bits;
523                    arg1 = fi.f;
524                    fi.i = arg2Bits;
525                    arg2 = fi.f;
526                } else {
527                    doubleInt di;
528                    di.i = arg1Bits;
529                    arg1 = di.d;
530                    di.i = arg2Bits;
531                    arg2 = di.d;
532                }
533
534                if (arg1 < arg2) {
535                    result = insertBits(result, hiIndex, loIndex, arg1Bits);
536                } else {
537                    result = insertBits(result, hiIndex, loIndex, arg2Bits);
538                }
539            }
540            FpDestReg_uqw = result;
541        '''
542
543    class Mmaxf(MediaOp):
544        op_class = 'SimdFloatCmpOp'
545        code = '''
546            union floatInt
547            {
548                float f;
549                uint32_t i;
550            };
551            union doubleInt
552            {
553                double d;
554                uint64_t i;
555            };
556
557            assert(srcSize == destSize);
558            int size = srcSize;
559            int sizeBits = size * 8;
560            assert(srcSize == 4 || srcSize == 8);
561            int items = numItems(size);
562            uint64_t result = FpDestReg_uqw;
563
564            for (int i = 0; i < items; i++) {
565                double arg1, arg2;
566                int hiIndex = (i + 1) * sizeBits - 1;
567                int loIndex = (i + 0) * sizeBits;
568                uint64_t arg1Bits = bits(FpSrcReg1_uqw, hiIndex, loIndex);
569                uint64_t arg2Bits = bits(FpSrcReg2_uqw, hiIndex, loIndex);
570
571                if (size == 4) {
572                    floatInt fi;
573                    fi.i = arg1Bits;
574                    arg1 = fi.f;
575                    fi.i = arg2Bits;
576                    arg2 = fi.f;
577                } else {
578                    doubleInt di;
579                    di.i = arg1Bits;
580                    arg1 = di.d;
581                    di.i = arg2Bits;
582                    arg2 = di.d;
583                }
584
585                if (arg1 > arg2) {
586                    result = insertBits(result, hiIndex, loIndex, arg1Bits);
587                } else {
588                    result = insertBits(result, hiIndex, loIndex, arg2Bits);
589                }
590            }
591            FpDestReg_uqw = result;
592        '''
593
594    class Mmini(MediaOp):
595        op_class = 'SimdCmpOp'
596        code = '''
597
598            assert(srcSize == destSize);
599            int size = srcSize;
600            int sizeBits = size * 8;
601            int items = numItems(size);
602            uint64_t result = FpDestReg_uqw;
603
604            for (int i = 0; i < items; i++) {
605                int hiIndex = (i + 1) * sizeBits - 1;
606                int loIndex = (i + 0) * sizeBits;
607                uint64_t arg1Bits = bits(FpSrcReg1_uqw, hiIndex, loIndex);
608                int64_t arg1 = arg1Bits |
609                    (0 - (arg1Bits & (ULL(1) << (sizeBits - 1))));
610                uint64_t arg2Bits = bits(FpSrcReg2_uqw, hiIndex, loIndex);
611                int64_t arg2 = arg2Bits |
612                    (0 - (arg2Bits & (ULL(1) << (sizeBits - 1))));
613                uint64_t resBits;
614
615                if (signedOp()) {
616                    if (arg1 < arg2) {
617                        resBits = arg1Bits;
618                    } else {
619                        resBits = arg2Bits;
620                    }
621                } else {
622                    if (arg1Bits < arg2Bits) {
623                        resBits = arg1Bits;
624                    } else {
625                        resBits = arg2Bits;
626                    }
627                }
628                result = insertBits(result, hiIndex, loIndex, resBits);
629            }
630            FpDestReg_uqw = result;
631        '''
632
633    class Mmaxi(MediaOp):
634        op_class = 'SimdCmpOp'
635        code = '''
636
637            assert(srcSize == destSize);
638            int size = srcSize;
639            int sizeBits = size * 8;
640            int items = numItems(size);
641            uint64_t result = FpDestReg_uqw;
642
643            for (int i = 0; i < items; i++) {
644                int hiIndex = (i + 1) * sizeBits - 1;
645                int loIndex = (i + 0) * sizeBits;
646                uint64_t arg1Bits = bits(FpSrcReg1_uqw, hiIndex, loIndex);
647                int64_t arg1 = arg1Bits |
648                    (0 - (arg1Bits & (ULL(1) << (sizeBits - 1))));
649                uint64_t arg2Bits = bits(FpSrcReg2_uqw, hiIndex, loIndex);
650                int64_t arg2 = arg2Bits |
651                    (0 - (arg2Bits & (ULL(1) << (sizeBits - 1))));
652                uint64_t resBits;
653
654                if (signedOp()) {
655                    if (arg1 > arg2) {
656                        resBits = arg1Bits;
657                    } else {
658                        resBits = arg2Bits;
659                    }
660                } else {
661                    if (arg1Bits > arg2Bits) {
662                        resBits = arg1Bits;
663                    } else {
664                        resBits = arg2Bits;
665                    }
666                }
667                result = insertBits(result, hiIndex, loIndex, resBits);
668            }
669            FpDestReg_uqw = result;
670        '''
671
672    class Msqrt(MediaOp):
673        op_class = 'SimdFloatSqrtOp'
674        def __init__(self, dest, src, \
675                size = None, destSize = None, srcSize = None, ext = None):
676            super(Msqrt, self).__init__(dest, src,\
677                    "InstRegIndex(0)", size, destSize, srcSize, ext)
678        code = '''
679            union floatInt
680            {
681                float f;
682                uint32_t i;
683            };
684            union doubleInt
685            {
686                double d;
687                uint64_t i;
688            };
689
690            assert(srcSize == destSize);
691            int size = srcSize;
692            int sizeBits = size * 8;
693            assert(srcSize == 4 || srcSize == 8);
694            int items = numItems(size);
695            uint64_t result = FpDestReg_uqw;
696
697            for (int i = 0; i < items; i++) {
698                int hiIndex = (i + 1) * sizeBits - 1;
699                int loIndex = (i + 0) * sizeBits;
700                uint64_t argBits = bits(FpSrcReg1_uqw, hiIndex, loIndex);
701
702                if (size == 4) {
703                    floatInt fi;
704                    fi.i = argBits;
705                    fi.f = sqrt(fi.f);
706                    argBits = fi.i;
707                } else {
708                    doubleInt di;
709                    di.i = argBits;
710                    di.d = sqrt(di.d);
711                    argBits = di.i;
712                }
713                result = insertBits(result, hiIndex, loIndex, argBits);
714            }
715            FpDestReg_uqw = result;
716        '''
717
718    # compute approximate reciprocal --- single-precision only
719    class Mrcp(MediaOp):
720        def __init__(self, dest, src, \
721                size = None, destSize = None, srcSize = None, ext = None):
722            super(Mrcp, self).__init__(dest, src,\
723                    "InstRegIndex(0)", size, destSize, srcSize, ext)
724        op_class = 'SimdFloatAluOp'
725        code = '''
726            union floatInt
727            {
728                float f;
729                uint32_t i;
730            };
731
732            assert(srcSize == 4);  // ISA defines single-precision only
733            assert(srcSize == destSize);
734            const int size = 4;
735            const int sizeBits = size * 8;
736            int items = numItems(size);
737            uint64_t result = FpDestReg_uqw;
738
739            for (int i = 0; i < items; i++) {
740                int hiIndex = (i + 1) * sizeBits - 1;
741                int loIndex = (i + 0) * sizeBits;
742                uint64_t argBits = bits(FpSrcReg1_uqw, hiIndex, loIndex);
743
744                floatInt fi;
745                fi.i = argBits;
746                // This is more accuracy than HW provides, but oh well
747                fi.f = 1.0 / fi.f;
748                argBits = fi.i;
749                result = insertBits(result, hiIndex, loIndex, argBits);
750            }
751            FpDestReg_uqw = result;
752        '''
753
754    class Maddf(MediaOp):
755        op_class = 'SimdFloatAddOp'
756        code = '''
757            union floatInt
758            {
759                float f;
760                uint32_t i;
761            };
762            union doubleInt
763            {
764                double d;
765                uint64_t i;
766            };
767
768            assert(srcSize == destSize);
769            int size = srcSize;
770            int sizeBits = size * 8;
771            assert(srcSize == 4 || srcSize == 8);
772            int items = numItems(size);
773            uint64_t result = FpDestReg_uqw;
774
775            for (int i = 0; i < items; i++) {
776                int hiIndex = (i + 1) * sizeBits - 1;
777                int loIndex = (i + 0) * sizeBits;
778                uint64_t arg1Bits = bits(FpSrcReg1_uqw, hiIndex, loIndex);
779                uint64_t arg2Bits = bits(FpSrcReg2_uqw, hiIndex, loIndex);
780                uint64_t resBits;
781
782                if (size == 4) {
783                    floatInt arg1, arg2, res;
784                    arg1.i = arg1Bits;
785                    arg2.i = arg2Bits;
786                    res.f = arg1.f + arg2.f;
787                    resBits = res.i;
788                } else {
789                    doubleInt arg1, arg2, res;
790                    arg1.i = arg1Bits;
791                    arg2.i = arg2Bits;
792                    res.d = arg1.d + arg2.d;
793                    resBits = res.i;
794                }
795
796                result = insertBits(result, hiIndex, loIndex, resBits);
797            }
798            FpDestReg_uqw = result;
799        '''
800
801    class Msubf(MediaOp):
        # Lane-wise floating point subtraction microop.
        #
        # The C++ snippet requires srcSize == destSize and a lane size of 4 or
        # 8 bytes (both asserted). For each of the numItems(size) lanes it
        # reinterprets the lane's bits from the two source registers as float
        # (4 bytes) or double (8 bytes) through a union, computes
        # source1 - source2, and inserts the raw result bits into the same
        # lane of the destination. The working value starts out as the
        # current destination register, so bits outside the lanes visited by
        # the loop keep their previous contents.
802        op_class = 'SimdFloatAddOp'
803        code = '''
804            union floatInt
805            {
806                float f;
807                uint32_t i;
808            };
809            union doubleInt
810            {
811                double d;
812                uint64_t i;
813            };
814
815            assert(srcSize == destSize);
816            int size = srcSize;
817            int sizeBits = size * 8;
818            assert(srcSize == 4 || srcSize == 8);
819            int items = numItems(size);
820            uint64_t result = FpDestReg_uqw;
821
822            for (int i = 0; i < items; i++) {
823                int hiIndex = (i + 1) * sizeBits - 1;
824                int loIndex = (i + 0) * sizeBits;
825                uint64_t arg1Bits = bits(FpSrcReg1_uqw, hiIndex, loIndex);
826                uint64_t arg2Bits = bits(FpSrcReg2_uqw, hiIndex, loIndex);
827                uint64_t resBits;
828
829                if (size == 4) {
830                    floatInt arg1, arg2, res;
831                    arg1.i = arg1Bits;
832                    arg2.i = arg2Bits;
833                    res.f = arg1.f - arg2.f;
834                    resBits = res.i;
835                } else {
836                    doubleInt arg1, arg2, res;
837                    arg1.i = arg1Bits;
838                    arg2.i = arg2Bits;
839                    res.d = arg1.d - arg2.d;
840                    resBits = res.i;
841                }
842
843                result = insertBits(result, hiIndex, loIndex, resBits);
844            }
845            FpDestReg_uqw = result;
846        '''
847
848    class Mmulf(MediaOp):
        # Lane-wise floating point multiplication microop.
        #
        # The C++ snippet requires srcSize == destSize and a lane size of 4 or
        # 8 bytes (both asserted). For each of the numItems(size) lanes it
        # reinterprets the lane's bits from the two source registers as float
        # (4 bytes) or double (8 bytes) through a union, multiplies them, and
        # inserts the raw result bits into the same lane of the destination.
        # The working value starts out as the current destination register,
        # so bits outside the lanes visited by the loop keep their previous
        # contents.
849        op_class = 'SimdFloatMultOp'
850        code = '''
851            union floatInt
852            {
853                float f;
854                uint32_t i;
855            };
856            union doubleInt
857            {
858                double d;
859                uint64_t i;
860            };
861
862            assert(srcSize == destSize);
863            int size = srcSize;
864            int sizeBits = size * 8;
865            assert(srcSize == 4 || srcSize == 8);
866            int items = numItems(size);
867            uint64_t result = FpDestReg_uqw;
868
869            for (int i = 0; i < items; i++) {
870                int hiIndex = (i + 1) * sizeBits - 1;
871                int loIndex = (i + 0) * sizeBits;
872                uint64_t arg1Bits = bits(FpSrcReg1_uqw, hiIndex, loIndex);
873                uint64_t arg2Bits = bits(FpSrcReg2_uqw, hiIndex, loIndex);
874                uint64_t resBits;
875
876                if (size == 4) {
877                    floatInt arg1, arg2, res;
878                    arg1.i = arg1Bits;
879                    arg2.i = arg2Bits;
880                    res.f = arg1.f * arg2.f;
881                    resBits = res.i;
882                } else {
883                    doubleInt arg1, arg2, res;
884                    arg1.i = arg1Bits;
885                    arg2.i = arg2Bits;
886                    res.d = arg1.d * arg2.d;
887                    resBits = res.i;
888                }
889
890                result = insertBits(result, hiIndex, loIndex, resBits);
891            }
892            FpDestReg_uqw = result;
893        '''
894
895    class Mdivf(MediaOp):
        # Lane-wise floating point division microop.
        #
        # The C++ snippet requires srcSize == destSize and a lane size of 4 or
        # 8 bytes (both asserted). For each of the numItems(size) lanes it
        # reinterprets the lane's bits from the two source registers as float
        # (4 bytes) or double (8 bytes) through a union, computes
        # source1 / source2, and inserts the raw result bits into the same
        # lane of the destination. The snippet contains no explicit check for
        # a zero divisor; the lane simply receives whatever the host division
        # produces. The working value starts out as the current destination
        # register, so bits outside the lanes visited by the loop keep their
        # previous contents.
896        op_class = 'SimdFloatDivOp'
897        code = '''
898            union floatInt
899            {
900                float f;
901                uint32_t i;
902            };
903            union doubleInt
904            {
905                double d;
906                uint64_t i;
907            };
908
909            assert(srcSize == destSize);
910            int size = srcSize;
911            int sizeBits = size * 8;
912            assert(srcSize == 4 || srcSize == 8);
913            int items = numItems(size);
914            uint64_t result = FpDestReg_uqw;
915
916            for (int i = 0; i < items; i++) {
917                int hiIndex = (i + 1) * sizeBits - 1;
918                int loIndex = (i + 0) * sizeBits;
919                uint64_t arg1Bits = bits(FpSrcReg1_uqw, hiIndex, loIndex);
920                uint64_t arg2Bits = bits(FpSrcReg2_uqw, hiIndex, loIndex);
921                uint64_t resBits;
922
923                if (size == 4) {
924                    floatInt arg1, arg2, res;
925                    arg1.i = arg1Bits;
926                    arg2.i = arg2Bits;
927                    res.f = arg1.f / arg2.f;
928                    resBits = res.i;
929                } else {
930                    doubleInt arg1, arg2, res;
931                    arg1.i = arg1Bits;
932                    arg2.i = arg2Bits;
933                    res.d = arg1.d / arg2.d;
934                    resBits = res.i;
935                }
936
937                result = insertBits(result, hiIndex, loIndex, resBits);
938            }
939            FpDestReg_uqw = result;
940        '''
941
942    class Maddi(MediaOp):
        # Lane-wise integer addition microop with optional saturation.
        #
        # Requires srcSize == destSize (asserted). Each lane of sizeBits bits
        # is extracted from both sources and added as an unsigned 64-bit
        # value; insertBits() then stores the sum into the destination lane.
        #
        # When bit 1 of ext is set (ext & 0x2) the lane saturates:
        #  - signedOp(): overflow is detected when both inputs have the same
        #    top bit and the sum's top bit differs. If the sum's top bit is 0
        #    (the inputs had it set) the lane becomes a value with only the
        #    top bit set; otherwise it becomes mask(sizeBits - 1).
        #  - otherwise: if findCarry() reports a carry out of the lane, the
        #    lane becomes mask(sizeBits).
        # mask(), findCarry() and signedOp() are defined outside this block.
943        op_class = 'SimdAddOp'
944        code = '''
945            assert(srcSize == destSize);
946            int size = srcSize;
947            int sizeBits = size * 8;
948            int items = numItems(size);
949            uint64_t result = FpDestReg_uqw;
950
951            for (int i = 0; i < items; i++) {
952                int hiIndex = (i + 1) * sizeBits - 1;
953                int loIndex = (i + 0) * sizeBits;
954                uint64_t arg1Bits = bits(FpSrcReg1_uqw, hiIndex, loIndex);
955                uint64_t arg2Bits = bits(FpSrcReg2_uqw, hiIndex, loIndex);
956                uint64_t resBits = arg1Bits + arg2Bits;
957
958                if (ext & 0x2) {
959                    if (signedOp()) {
960                        int arg1Sign = bits(arg1Bits, sizeBits - 1);
961                        int arg2Sign = bits(arg2Bits, sizeBits - 1);
962                        int resSign = bits(resBits, sizeBits - 1);
963                        if ((arg1Sign == arg2Sign) && (arg1Sign != resSign)) {
964                            if (resSign == 0)
965                                resBits = (ULL(1) << (sizeBits - 1));
966                            else
967                                resBits = mask(sizeBits - 1);
968                        }
969                    } else {
970                        if (findCarry(sizeBits, resBits, arg1Bits, arg2Bits))
971                            resBits = mask(sizeBits);
972                    }
973                }
974
975                result = insertBits(result, hiIndex, loIndex, resBits);
976            }
977            FpDestReg_uqw = result;
978        '''
979
980    class Msubi(MediaOp):
        # Lane-wise integer subtraction microop with optional saturation.
        #
        # Requires srcSize == destSize (asserted). Each lane of sizeBits bits
        # is extracted from both sources and source1 - source2 is computed as
        # an unsigned 64-bit value; insertBits() then stores it into the
        # destination lane.
        #
        # When bit 1 of ext is set (ext & 0x2) the lane saturates:
        #  - signedOp(): the top bit of the second operand is inverted before
        #    the comparison, so the overflow test has the same shape as the
        #    one used for addition (both "signs" equal, result sign differs).
        #    On overflow the lane becomes a value with only the top bit set
        #    when the result's top bit is 0, and mask(sizeBits - 1) otherwise.
        #  - otherwise: the lane becomes 0 when the second operand is larger
        #    than the first. Failing that, it becomes mask(sizeBits) when
        #    findCarry() on the first operand and the complemented second
        #    operand reports no carry.
        # NOTE(review): whether that last branch can ever fire depends on
        # findCarry(), which is defined outside this block -- confirm.
981        op_class = 'SimdAddOp'
982        code = '''
983            assert(srcSize == destSize);
984            int size = srcSize;
985            int sizeBits = size * 8;
986            int items = numItems(size);
987            uint64_t result = FpDestReg_uqw;
988
989            for (int i = 0; i < items; i++) {
990                int hiIndex = (i + 1) * sizeBits - 1;
991                int loIndex = (i + 0) * sizeBits;
992                uint64_t arg1Bits = bits(FpSrcReg1_uqw, hiIndex, loIndex);
993                uint64_t arg2Bits = bits(FpSrcReg2_uqw, hiIndex, loIndex);
994                uint64_t resBits = arg1Bits - arg2Bits;
995
996                if (ext & 0x2) {
997                    if (signedOp()) {
998                        int arg1Sign = bits(arg1Bits, sizeBits - 1);
999                        int arg2Sign = !bits(arg2Bits, sizeBits - 1);
1000                        int resSign = bits(resBits, sizeBits - 1);
1001                        if ((arg1Sign == arg2Sign) && (arg1Sign != resSign)) {
1002                            if (resSign == 0)
1003                                resBits = (ULL(1) << (sizeBits - 1));
1004                            else
1005                                resBits = mask(sizeBits - 1);
1006                        }
1007                    } else {
1008                        if (arg2Bits > arg1Bits) {
1009                            resBits = 0;
1010                        } else if (!findCarry(sizeBits, resBits,
1011                                             arg1Bits, ~arg2Bits)) {
1012                            resBits = mask(sizeBits);
1013                        }
1014                    }
1015                }
1016
1017                result = insertBits(result, hiIndex, loIndex, resBits);
1018            }
1019            FpDestReg_uqw = result;
1020        '''
1021
1022    class Mmuli(MediaOp):
        # Lane-wise integer multiplication microop.
        #
        # Source lanes are srcSize bytes wide and destination lanes destSize
        # bytes wide, with destSize >= srcSize and at most 64 destination bits
        # per lane (both asserted). The loop runs over numItems(destSize)
        # lanes.
        #
        # Source lane selection:
        #  - ext & 16 clear: source lane i occupies bits
        #    [i * srcBits, (i + 1) * srcBits - 1], i.e. lanes are packed.
        #  - ext & 16 set, ext & 32 set: the offset moves lane i to start at
        #    bit i * destBits (the low srcBits of each destBits-wide slot).
        #  - ext & 16 set, ext & 32 clear: as above plus srcBits, i.e. lane i
        #    starts at bit i * destBits + srcBits.
        #
        # signedOp() sign-extends both operands from srcBits before
        # multiplying; otherwise the raw lane values are multiplied. If
        # ext & 0x4 is set, 1 << (destBits - 1) is added to the product.
        # If multHi() is true the product is shifted right by destBits before
        # insertBits() stores it in destination lane i.
        # NOTE(review): the assert permits destBits == 64, in which case the
        # multHi() shift would be by the full width of the 64-bit value --
        # confirm no caller combines the two.
1023        op_class = 'SimdMultOp'
1024        code = '''
1025            int srcBits = srcSize * 8;
1026            int destBits = destSize * 8;
1027            assert(destBits <= 64);
1028            assert(destSize >= srcSize);
1029            int items = numItems(destSize);
1030            uint64_t result = FpDestReg_uqw;
1031
1032            for (int i = 0; i < items; i++) {
1033                int offset = 0;
1034                if (ext & 16) {
1035                    if (ext & 32)
1036                        offset = i * (destBits - srcBits);
1037                    else
1038                        offset = i * (destBits - srcBits) + srcBits;
1039                }
1040                int srcHiIndex = (i + 1) * srcBits - 1 + offset;
1041                int srcLoIndex = (i + 0) * srcBits + offset;
1042                uint64_t arg1Bits = bits(FpSrcReg1_uqw, srcHiIndex, srcLoIndex);
1043                uint64_t arg2Bits = bits(FpSrcReg2_uqw, srcHiIndex, srcLoIndex);
1044                uint64_t resBits;
1045
1046                if (signedOp()) {
1047                    int64_t arg1 = arg1Bits |
1048                        (0 - (arg1Bits & (ULL(1) << (srcBits - 1))));
1049                    int64_t arg2 = arg2Bits |
1050                        (0 - (arg2Bits & (ULL(1) << (srcBits - 1))));
1051                    resBits = (uint64_t)(arg1 * arg2);
1052                } else {
1053                    resBits = arg1Bits * arg2Bits;
1054                }
1055
1056                if (ext & 0x4)
1057                    resBits += (ULL(1) << (destBits - 1));
1058
1059                if (multHi())
1060                    resBits >>= destBits;
1061
1062                int destHiIndex = (i + 1) * destBits - 1;
1063                int destLoIndex = (i + 0) * destBits;
1064                result = insertBits(result, destHiIndex, destLoIndex, resBits);
1065            }
1066            FpDestReg_uqw = result;
1067        '''
1068
1069    class Mavg(MediaOp):
        # Lane-wise integer average microop.
        #
        # Requires srcSize == destSize (asserted). For each of the
        # numItems(size) lanes the two source lane values are treated as
        # unsigned, summed with an extra 1 and halved, i.e.
        # (a + b + 1) / 2, and the result is inserted into the same lane of
        # the destination. The working value starts out as the current
        # destination register.
1070        op_class = 'SimdAddOp'
1071        code = '''
1072            assert(srcSize == destSize);
1073            int size = srcSize;
1074            int sizeBits = size * 8;
1075            int items = numItems(size);
1076            uint64_t result = FpDestReg_uqw;
1077
1078            for (int i = 0; i < items; i++) {
1079                int hiIndex = (i + 1) * sizeBits - 1;
1080                int loIndex = (i + 0) * sizeBits;
1081                uint64_t arg1Bits = bits(FpSrcReg1_uqw, hiIndex, loIndex);
1082                uint64_t arg2Bits = bits(FpSrcReg2_uqw, hiIndex, loIndex);
1083                uint64_t resBits = (arg1Bits + arg2Bits + 1) / 2;
1084
1085                result = insertBits(result, hiIndex, loIndex, resBits);
1086            }
1087            FpDestReg_uqw = result;
1088        '''
1089
1090    class Msad(MediaOp):
        # Sum of absolute differences microop.
        #
        # Iterates over sizeof(double) / srcSize source lanes (note: this block
        # does not use numItems(), unlike its siblings). For each lane the
        # difference source1 - source2 is taken as a signed 64-bit value, made
        # non-negative, and accumulated. The destination register is then
        # overwritten with the sum masked by mask(destSize * 8); the previous
        # destination contents are not merged in.
1091        op_class = 'SimdAddOp'
1092        code = '''
1093            int srcBits = srcSize * 8;
1094            int items = sizeof(double) / srcSize;
1095
1096            uint64_t sum = 0;
1097            for (int i = 0; i < items; i++) {
1098                int hiIndex = (i + 1) * srcBits - 1;
1099                int loIndex = (i + 0) * srcBits;
1100                uint64_t arg1Bits = bits(FpSrcReg1_uqw, hiIndex, loIndex);
1101                uint64_t arg2Bits = bits(FpSrcReg2_uqw, hiIndex, loIndex);
1102                int64_t resBits = arg1Bits - arg2Bits;
1103                if (resBits < 0)
1104                    resBits = -resBits;
1105                sum += resBits;
1106            }
1107            FpDestReg_uqw = sum & mask(destSize * 8);
1108        '''
1109
1110    class Msrl(MediaOp):
        # Lane-wise logical right shift microop.
        #
        # Requires srcSize == destSize (asserted). The shift count is read
        # once from op2 and applied to every lane of the first source. A
        # count of sizeBits or more yields a zero lane; otherwise the lane is
        # shifted right and masked with mask(sizeBits - shiftAmt). Results are
        # inserted into the matching lanes of the destination, whose other
        # bits keep their previous contents.
1111        op_class = 'SimdShiftOp'
1112        code = '''
1113
1114            assert(srcSize == destSize);
1115            int size = srcSize;
1116            int sizeBits = size * 8;
1117            int items = numItems(size);
1118            uint64_t shiftAmt = op2_uqw;
1119            uint64_t result = FpDestReg_uqw;
1120
1121            for (int i = 0; i < items; i++) {
1122                int hiIndex = (i + 1) * sizeBits - 1;
1123                int loIndex = (i + 0) * sizeBits;
1124                uint64_t arg1Bits = bits(FpSrcReg1_uqw, hiIndex, loIndex);
1125                uint64_t resBits;
1126                if (shiftAmt >= sizeBits) {
1127                    resBits = 0;
1128                } else {
1129                    resBits = (arg1Bits >> shiftAmt) &
1130                        mask(sizeBits - shiftAmt);
1131                }
1132
1133                result = insertBits(result, hiIndex, loIndex, resBits);
1134            }
1135            FpDestReg_uqw = result;
1136        '''
1137
1138    class Msra(MediaOp):
        # Lane-wise arithmetic right shift microop.
        #
        # Requires srcSize == destSize (asserted). The shift count is read
        # once from op2 and applied to every lane of the first source.
        #  - Count >= sizeBits: the lane becomes mask(sizeBits) if its top bit
        #    was set and 0 otherwise.
        #  - Otherwise: the lane is shifted right, then every bit above the
        #    position where the original top bit landed
        #    (sizeBits - 1 - shiftAmt) is set when that bit is 1, which
        #    replicates the sign into the vacated bits.
        # Results are inserted into the matching lanes of the destination.
1139        op_class = 'SimdShiftOp'
1140        code = '''
1141
1142            assert(srcSize == destSize);
1143            int size = srcSize;
1144            int sizeBits = size * 8;
1145            int items = numItems(size);
1146            uint64_t shiftAmt = op2_uqw;
1147            uint64_t result = FpDestReg_uqw;
1148
1149            for (int i = 0; i < items; i++) {
1150                int hiIndex = (i + 1) * sizeBits - 1;
1151                int loIndex = (i + 0) * sizeBits;
1152                uint64_t arg1Bits = bits(FpSrcReg1_uqw, hiIndex, loIndex);
1153                uint64_t resBits;
1154                if (shiftAmt >= sizeBits) {
1155                    if (bits(arg1Bits, sizeBits - 1))
1156                        resBits = mask(sizeBits);
1157                    else
1158                        resBits = 0;
1159                } else {
1160                    resBits = (arg1Bits >> shiftAmt);
1161                    resBits = resBits |
1162                        (0 - (resBits & (ULL(1) << (sizeBits - 1 - shiftAmt))));
1163                }
1164
1165                result = insertBits(result, hiIndex, loIndex, resBits);
1166            }
1167            FpDestReg_uqw = result;
1168        '''
1169
1170    class Msll(MediaOp):
        # Lane-wise logical left shift microop.
        #
        # Requires srcSize == destSize (asserted). The shift count is read
        # once from op2 and applied to every lane of the first source. A
        # count of sizeBits or more yields a zero lane; otherwise the lane
        # value is shifted left. The shifted value is handed to insertBits()
        # for the lane's bit range without an explicit mask here; presumably
        # insertBits() discards the bits above the lane -- confirm against
        # its definition.
1171        op_class = 'SimdShiftOp'
1172        code = '''
1173
1174            assert(srcSize == destSize);
1175            int size = srcSize;
1176            int sizeBits = size * 8;
1177            int items = numItems(size);
1178            uint64_t shiftAmt = op2_uqw;
1179            uint64_t result = FpDestReg_uqw;
1180
1181            for (int i = 0; i < items; i++) {
1182                int hiIndex = (i + 1) * sizeBits - 1;
1183                int loIndex = (i + 0) * sizeBits;
1184                uint64_t arg1Bits = bits(FpSrcReg1_uqw, hiIndex, loIndex);
1185                uint64_t resBits;
1186                if (shiftAmt >= sizeBits) {
1187                    resBits = 0;
1188                } else {
1189                    resBits = (arg1Bits << shiftAmt);
1190                }
1191
1192                result = insertBits(result, hiIndex, loIndex, resBits);
1193            }
1194            FpDestReg_uqw = result;
1195        '''
1196
1197    class Cvtf2i(MediaOp):
1198        def __init__(self, dest, src, \
1199                size = None, destSize = None, srcSize = None, ext = None):
1200            super(Cvtf2i, self).__init__(dest, src,\
1201                    "InstRegIndex(0)", size, destSize, srcSize, ext)
1202        op_class = 'SimdFloatCvtOp'
1203        code = '''
1204            union floatInt
1205            {
1206                float f;
1207                uint32_t i;
1208            };
1209            union doubleInt
1210            {
1211                double d;
1212                uint64_t i;
1213            };
1214
1215            assert(destSize == 4 || destSize == 8);
1216            assert(srcSize == 4 || srcSize == 8);
1217            int srcSizeBits = srcSize * 8;
1218            int destSizeBits = destSize * 8;
1219            int items;
1220            int srcStart = 0;
1221            int destStart = 0;
1222            if (srcSize == 2 * destSize) {
1223                items = numItems(srcSize);
1224                if (ext & 0x2)
1225                    destStart = destSizeBits * items;
1226            } else if (destSize == 2 * srcSize) {
1227                items = numItems(destSize);
1228                if (ext & 0x2)
1229                    srcStart = srcSizeBits * items;
1230            } else {
1231                items = numItems(destSize);
1232            }
1233            uint64_t result = FpDestReg_uqw;
1234
1235            for (int i = 0; i < items; i++) {
1236                int srcHiIndex = srcStart + (i + 1) * srcSizeBits - 1;
1237                int srcLoIndex = srcStart + (i + 0) * srcSizeBits;
1238                uint64_t argBits = bits(FpSrcReg1_uqw, srcHiIndex, srcLoIndex);
1239                double arg;
1240
1241                if (srcSize == 4) {
1242                    floatInt fi;
1243                    fi.i = argBits;
1244                    arg = fi.f;
1245                } else {
1246                    doubleInt di;
1247                    di.i = argBits;
1248                    arg = di.d;
1249                }
1250
1251                if (ext & 0x4) {
1252                    if (arg >= 0)
1253                        arg += 0.5;
1254                    else
1255                        arg -= 0.5;
1256                }
1257
1258                if (destSize == 4) {
1259                    int32_t i_arg = (int32_t)arg;
1260                    argBits = *((uint32_t*)&i_arg);
1261                } else {
1262                    int64_t i_arg = (int64_t)arg;
1263                    argBits = *((uint64_t*)&i_arg);
1264                }
1265                int destHiIndex = destStart + (i + 1) * destSizeBits - 1;
1266                int destLoIndex = destStart + (i + 0) * destSizeBits;
1267                result = insertBits(result, destHiIndex, destLoIndex, argBits);
1268            }
1269            FpDestReg_uqw = result;
1270        '''
1271
1272    class Cvti2f(MediaOp):
        # Signed integer to floating point conversion microop.
        #
        # The constructor takes a destination and a single source and passes
        # the literal "InstRegIndex(0)" to MediaOp.__init__ in the position
        # sibling microops use for their second source.
        #
        # Source and destination lanes are each 4 or 8 bytes (asserted).
        #  - Source twice as wide as destination: numItems(srcSize) lanes are
        #    converted; with ext & 0x2 the results are written starting at bit
        #    destSizeBits * items instead of bit 0.
        #  - Destination twice as wide as source: numItems(destSize) lanes are
        #    converted; with ext & 0x2 the inputs are read starting at bit
        #    srcSizeBits * items instead of bit 0.
        #  - Equal widths: numItems(destSize) lanes, no offsets.
        #
        # Each source lane is sign-extended from srcSizeBits to a signed
        # 64-bit value, converted to double, and then stored as the raw bits
        # of a float (4-byte destination) or double (8-byte destination).
1273        def __init__(self, dest, src, \
1274                size = None, destSize = None, srcSize = None, ext = None):
1275            super(Cvti2f, self).__init__(dest, src,\
1276                    "InstRegIndex(0)", size, destSize, srcSize, ext)
1277        op_class = 'SimdFloatCvtOp'
1278        code = '''
1279            union floatInt
1280            {
1281                float f;
1282                uint32_t i;
1283            };
1284            union doubleInt
1285            {
1286                double d;
1287                uint64_t i;
1288            };
1289
1290            assert(destSize == 4 || destSize == 8);
1291            assert(srcSize == 4 || srcSize == 8);
1292            int srcSizeBits = srcSize * 8;
1293            int destSizeBits = destSize * 8;
1294            int items;
1295            int srcStart = 0;
1296            int destStart = 0;
1297            if (srcSize == 2 * destSize) {
1298                items = numItems(srcSize);
1299                if (ext & 0x2)
1300                    destStart = destSizeBits * items;
1301            } else if (destSize == 2 * srcSize) {
1302                items = numItems(destSize);
1303                if (ext & 0x2)
1304                    srcStart = srcSizeBits * items;
1305            } else {
1306                items = numItems(destSize);
1307            }
1308            uint64_t result = FpDestReg_uqw;
1309
1310            for (int i = 0; i < items; i++) {
1311                int srcHiIndex = srcStart + (i + 1) * srcSizeBits - 1;
1312                int srcLoIndex = srcStart + (i + 0) * srcSizeBits;
1313                uint64_t argBits = bits(FpSrcReg1_uqw, srcHiIndex, srcLoIndex);
1314
1315                int64_t sArg = argBits |
1316                    (0 - (argBits & (ULL(1) << (srcSizeBits - 1))));
1317                double arg = sArg;
1318
1319                if (destSize == 4) {
1320                    floatInt fi;
1321                    fi.f = arg;
1322                    argBits = fi.i;
1323                } else {
1324                    doubleInt di;
1325                    di.d = arg;
1326                    argBits = di.i;
1327                }
1328                int destHiIndex = destStart + (i + 1) * destSizeBits - 1;
1329                int destLoIndex = destStart + (i + 0) * destSizeBits;
1330                result = insertBits(result, destHiIndex, destLoIndex, argBits);
1331            }
1332            FpDestReg_uqw = result;
1333        '''
1334
1335    class Cvtf2f(MediaOp):
        # Floating point to floating point conversion microop (float <->
        # double, or a same-width pass through double).
        #
        # The constructor takes a destination and a single source and passes
        # the literal "InstRegIndex(0)" to MediaOp.__init__ in the position
        # sibling microops use for their second source.
        #
        # Source and destination lanes are each 4 or 8 bytes (asserted).
        #  - Source twice as wide as destination: numItems(srcSize) lanes are
        #    converted; with ext & 0x2 the results are written starting at bit
        #    destSizeBits * items instead of bit 0.
        #  - Destination twice as wide as source: numItems(destSize) lanes are
        #    converted; with ext & 0x2 the inputs are read starting at bit
        #    srcSizeBits * items instead of bit 0.
        #  - Equal widths: numItems(destSize) lanes, no offsets.
        #
        # Each source lane is reinterpreted as float or double according to
        # srcSize, widened to double, then assigned to a float or double
        # according to destSize; the raw bits of that value are inserted into
        # the destination lane.
1336        def __init__(self, dest, src, \
1337                size = None, destSize = None, srcSize = None, ext = None):
1338            super(Cvtf2f, self).__init__(dest, src,\
1339                    "InstRegIndex(0)", size, destSize, srcSize, ext)
1340        op_class = 'SimdFloatCvtOp'
1341        code = '''
1342            union floatInt
1343            {
1344                float f;
1345                uint32_t i;
1346            };
1347            union doubleInt
1348            {
1349                double d;
1350                uint64_t i;
1351            };
1352
1353            assert(destSize == 4 || destSize == 8);
1354            assert(srcSize == 4 || srcSize == 8);
1355            int srcSizeBits = srcSize * 8;
1356            int destSizeBits = destSize * 8;
1357            int items;
1358            int srcStart = 0;
1359            int destStart = 0;
1360            if (srcSize == 2 * destSize) {
1361                items = numItems(srcSize);
1362                if (ext & 0x2)
1363                    destStart = destSizeBits * items;
1364            } else if (destSize == 2 * srcSize) {
1365                items = numItems(destSize);
1366                if (ext & 0x2)
1367                    srcStart = srcSizeBits * items;
1368            } else {
1369                items = numItems(destSize);
1370            }
1371            uint64_t result = FpDestReg_uqw;
1372
1373            for (int i = 0; i < items; i++) {
1374                int srcHiIndex = srcStart + (i + 1) * srcSizeBits - 1;
1375                int srcLoIndex = srcStart + (i + 0) * srcSizeBits;
1376                uint64_t argBits = bits(FpSrcReg1_uqw, srcHiIndex, srcLoIndex);
1377                double arg;
1378
1379                if (srcSize == 4) {
1380                    floatInt fi;
1381                    fi.i = argBits;
1382                    arg = fi.f;
1383                } else {
1384                    doubleInt di;
1385                    di.i = argBits;
1386                    arg = di.d;
1387                }
1388                if (destSize == 4) {
1389                    floatInt fi;
1390                    fi.f = arg;
1391                    argBits = fi.i;
1392                } else {
1393                    doubleInt di;
1394                    di.d = arg;
1395                    argBits = di.i;
1396                }
1397                int destHiIndex = destStart + (i + 1) * destSizeBits - 1;
1398                int destLoIndex = destStart + (i + 0) * destSizeBits;
1399                result = insertBits(result, destHiIndex, destLoIndex, argBits);
1400            }
1401            FpDestReg_uqw = result;
1402        '''
1403
1404    class Mcmpi2r(MediaOp):
1405        op_class = 'SimdCvtOp'
1406        code = '''
1407            union floatInt
1408            {
1409                float f;
1410                uint32_t i;
1411            };
1412            union doubleInt
1413            {
1414                double d;
1415                uint64_t i;
1416            };
1417
1418            assert(srcSize == destSize);
1419            int size = srcSize;
1420            int sizeBits = size * 8;
1421            int items = numItems(size);
1422            uint64_t result = FpDestReg_uqw;
1423
1424            for (int i = 0; i < items; i++) {
1425                int hiIndex = (i + 1) * sizeBits - 1;
1426                int loIndex = (i + 0) * sizeBits;
1427                uint64_t arg1Bits = bits(FpSrcReg1_uqw, hiIndex, loIndex);
1428                int64_t arg1 = arg1Bits |
1429                    (0 - (arg1Bits & (ULL(1) << (sizeBits - 1))));
1430                uint64_t arg2Bits = bits(FpSrcReg2_uqw, hiIndex, loIndex);
1431                int64_t arg2 = arg2Bits |
1432                    (0 - (arg2Bits & (ULL(1) << (sizeBits - 1))));
1433
1434                uint64_t resBits = 0;
1435                if (((ext & 0x2) == 0 && arg1 == arg2) ||
1436                    ((ext & 0x2) == 0x2 && arg1 > arg2))
1437                    resBits = mask(sizeBits);
1438
1439                result = insertBits(result, hiIndex, loIndex, resBits);
1440            }
1441            FpDestReg_uqw = result;
1442        '''
1443
1444    class Mcmpf2r(MediaOp):
        # Lane-wise floating point compare microop producing a per-lane mask.
        #
        # Requires srcSize == destSize (asserted). Each lane of both sources
        # is reinterpreted as float when size is 4 and as double otherwise,
        # then widened to double. The predicate is selected by
        # ext & mask(3); "unordered" below means at least one operand is NaN:
        #   0: equal and ordered
        #   1: less than and ordered
        #   2: less than or equal and ordered
        #   3: unordered
        #   4: not equal, or unordered
        #   5: not less than, or unordered
        #   6: not less than or equal, or unordered
        #   7: ordered
        # A lane for which the predicate holds becomes mask(sizeBits);
        # otherwise it becomes 0.
1445        op_class = 'SimdFloatCvtOp'
1446        code = '''
1447            union floatInt
1448            {
1449                float f;
1450                uint32_t i;
1451            };
1452            union doubleInt
1453            {
1454                double d;
1455                uint64_t i;
1456            };
1457
1458            assert(srcSize == destSize);
1459            int size = srcSize;
1460            int sizeBits = size * 8;
1461            int items = numItems(size);
1462            uint64_t result = FpDestReg_uqw;
1463
1464            for (int i = 0; i < items; i++) {
1465                int hiIndex = (i + 1) * sizeBits - 1;
1466                int loIndex = (i + 0) * sizeBits;
1467                uint64_t arg1Bits = bits(FpSrcReg1_uqw, hiIndex, loIndex);
1468                uint64_t arg2Bits = bits(FpSrcReg2_uqw, hiIndex, loIndex);
1469                double arg1, arg2;
1470
1471                if (size == 4) {
1472                    floatInt fi;
1473                    fi.i = arg1Bits;
1474                    arg1 = fi.f;
1475                    fi.i = arg2Bits;
1476                    arg2 = fi.f;
1477                } else {
1478                    doubleInt di;
1479                    di.i = arg1Bits;
1480                    arg1 = di.d;
1481                    di.i = arg2Bits;
1482                    arg2 = di.d;
1483                }
1484
1485                uint64_t resBits = 0;
1486                bool nanop = std::isnan(arg1) || std::isnan(arg2);
1487                switch (ext & mask(3)) {
1488                  case 0:
1489                    if (arg1 == arg2 && !nanop)
1490                        resBits = mask(sizeBits);
1491                    break;
1492                  case 1:
1493                    if (arg1 < arg2 && !nanop)
1494                        resBits = mask(sizeBits);
1495                    break;
1496                  case 2:
1497                    if (arg1 <= arg2 && !nanop)
1498                        resBits = mask(sizeBits);
1499                    break;
1500                  case 3:
1501                    if (nanop)
1502                        resBits = mask(sizeBits);
1503                    break;
1504                  case 4:
1505                    if (arg1 != arg2 || nanop)
1506                        resBits = mask(sizeBits);
1507                    break;
1508                  case 5:
1509                    if (!(arg1 < arg2) || nanop)
1510                        resBits = mask(sizeBits);
1511                    break;
1512                  case 6:
1513                    if (!(arg1 <= arg2) || nanop)
1514                        resBits = mask(sizeBits);
1515                    break;
1516                  case 7:
1517                    if (!nanop)
1518                        resBits = mask(sizeBits);
1519                    break;
1520                };
1521
1522                result = insertBits(result, hiIndex, loIndex, resBits);
1523            }
1524            FpDestReg_uqw = result;
1525        '''
1526
1527    class Mcmpf2rf(MediaOp):
        # Scalar floating point compare microop that reports its result in
        # the condition flags rather than in a register.
        #
        # The constructor takes two sources and passes the literal
        # "InstRegIndex(0)" to MediaOp.__init__ in the position sibling
        # microops use for their destination.
        #
        # Requires srcSize == destSize and a size of 4 or 8 bytes (asserted).
        # Only the lowest sizeBits of each source are compared, as float or
        # double. SF, AF, ZF and PF are first cleared in ccFlagBits and OF and
        # CF in cfofBits; then:
        #  - either operand NaN: ZF, PF and CF are set;
        #  - source1 < source2:  CF is set;
        #  - source1 == source2: ZF is set;
        #  - otherwise all six flags stay clear.
1528        def __init__(self, src1, src2,\
1529                size = None, destSize = None, srcSize = None, ext = None):
1530            super(Mcmpf2rf, self).__init__("InstRegIndex(0)", src1,\
1531                    src2, size, destSize, srcSize, ext)
1532        op_class = 'SimdFloatCvtOp'
1533        code = '''
1534            union floatInt
1535            {
1536                float f;
1537                uint32_t i;
1538            };
1539            union doubleInt
1540            {
1541                double d;
1542                uint64_t i;
1543            };
1544
1545            assert(srcSize == destSize);
1546            assert(srcSize == 4 || srcSize == 8);
1547            int size = srcSize;
1548            int sizeBits = size * 8;
1549
1550            double arg1, arg2;
1551            uint64_t arg1Bits = bits(FpSrcReg1_uqw, sizeBits - 1, 0);
1552            uint64_t arg2Bits = bits(FpSrcReg2_uqw, sizeBits - 1, 0);
1553            if (size == 4) {
1554                floatInt fi;
1555                fi.i = arg1Bits;
1556                arg1 = fi.f;
1557                fi.i = arg2Bits;
1558                arg2 = fi.f;
1559            } else {
1560                doubleInt di;
1561                di.i = arg1Bits;
1562                arg1 = di.d;
1563                di.i = arg2Bits;
1564                arg2 = di.d;
1565            }
1566
1567            //               ZF PF CF
1568            // Unordered      1  1  1
1569            // Greater than   0  0  0
1570            // Less than      0  0  1
1571            // Equal          1  0  0
1572            //           OF = SF = AF = 0
1573            ccFlagBits = ccFlagBits & ~(SFBit | AFBit | ZFBit | PFBit);
1574            cfofBits   = cfofBits   & ~(OFBit | CFBit);
1575
1576            if (std::isnan(arg1) || std::isnan(arg2)) {
1577                ccFlagBits = ccFlagBits | (ZFBit | PFBit);
1578                cfofBits = cfofBits | CFBit;
1579            }
1580            else if(arg1 < arg2)
1581                cfofBits = cfofBits | CFBit;
1582            else if(arg1 == arg2)
1583                ccFlagBits = ccFlagBits | ZFBit;
1584        '''
1585
1586    class Emms(MediaOp):
        # Microop that writes 0xFFFF to FTW.
        #
        # The constructor takes no operands from the caller. It passes
        # 'InstRegIndex(MISCREG_FTW)' as the first argument of
        # MediaOp.__init__, 'InstRegIndex(0)' for the next two, and 2 as the
        # fourth positional argument (the slot sibling constructors in this
        # file fill with size).
1587        op_class = 'FloatMiscOp'
1588        def __init__(self):
1589            super(Emms, self).__init__('InstRegIndex(MISCREG_FTW)',
1590                    'InstRegIndex(0)', 'InstRegIndex(0)', 2)
1591        code = 'FTW = 0xFFFF;'
1592}};
1593