neon64.isa revision 12038:619bc4100aa8
1// -*- mode: c++ -*-
2
3// Copyright (c) 2012-2013, 2015 ARM Limited
4// All rights reserved
5//
6// The license below extends only to copyright in the software and shall
7// not be construed as granting a license to any other intellectual
8// property including but not limited to intellectual property relating
9// to a hardware implementation of the functionality of the software
10// licensed hereunder.  You may use the software subject to the license
11// terms below provided that you ensure that this notice is replicated
12// unmodified and in its entirety in all distributions of the software,
13// modified or unmodified, in source code or in binary form.
14//
15// Redistribution and use in source and binary forms, with or without
16// modification, are permitted provided that the following conditions are
17// met: redistributions of source code must retain the above copyright
18// notice, this list of conditions and the following disclaimer;
19// redistributions in binary form must reproduce the above copyright
20// notice, this list of conditions and the following disclaimer in the
21// documentation and/or other materials provided with the distribution;
22// neither the name of the copyright holders nor the names of its
23// contributors may be used to endorse or promote products derived from
24// this software without specific prior written permission.
25//
26// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
30// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
31// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
32// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
33// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
34// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
35// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
36// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37//
38// Authors: Giacomo Gabrielli
39//          Mbou Eyole
40
41let {{
42
43    header_output = ""
44    exec_output = ""
45    decoders = { 'Generic' : {} }
46
47    # FP types (FP operations always work with unsigned representations)
48    floatTypes = ("uint32_t", "uint64_t")
49    smallFloatTypes = ("uint32_t",)
50
51    def threeEqualRegInstX(name, Name, opClass, types, rCount, op,
52                           readDest=False, pairwise=False, scalar=False,
53                           byElem=False, decoder='Generic'):
54        assert (not pairwise) or ((not byElem) and (not scalar))
55        global header_output, exec_output, decoders
56        eWalkCode = simd64EnabledCheckCode + '''
57        RegVect srcReg1, destReg;
58        '''
59        if byElem:
60            # 2nd register operand has to be read fully
61            eWalkCode += '''
62        FullRegVect srcReg2;
63        '''
64        else:
65            eWalkCode += '''
66        RegVect srcReg2;
67        '''
68        for reg in range(rCount):
69            eWalkCode += '''
70        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
71        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
72        ''' % { "reg" : reg }
73            if readDest:
74                eWalkCode += '''
75        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
76        ''' % { "reg" : reg }
77        if byElem:
78            # 2nd operand has to be read fully
79            for reg in range(rCount, 4):
80                eWalkCode += '''
81        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
82        ''' % { "reg" : reg }
83        readDestCode = ''
84        if readDest:
85            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
86        if pairwise:
87            eWalkCode += '''
88        for (unsigned i = 0; i < eCount; i++) {
89            Element srcElem1 = gtoh(2 * i < eCount ?
90                                    srcReg1.elements[2 * i] :
91                                    srcReg2.elements[2 * i - eCount]);
92            Element srcElem2 = gtoh(2 * i < eCount ?
93                                    srcReg1.elements[2 * i + 1] :
94                                    srcReg2.elements[2 * i + 1 - eCount]);
95            Element destElem;
96            %(readDest)s
97            %(op)s
98            destReg.elements[i] = htog(destElem);
99        }
100        ''' % { "op" : op, "readDest" : readDestCode }
101        else:
102            scalarCheck = '''
103            if (i != 0) {
104                destReg.elements[i] = 0;
105                continue;
106            }
107            '''
108            eWalkCode += '''
109        for (unsigned i = 0; i < eCount; i++) {
110            %(scalarCheck)s
111            Element srcElem1 = gtoh(srcReg1.elements[i]);
112            Element srcElem2 = gtoh(srcReg2.elements[%(src2Index)s]);
113            Element destElem;
114            %(readDest)s
115            %(op)s
116            destReg.elements[i] = htog(destElem);
117        }
118        ''' % { "op" : op, "readDest" : readDestCode,
119                "scalarCheck" : scalarCheck if scalar else "",
120                "src2Index" : "imm" if byElem else "i" }
121        for reg in range(rCount):
122            eWalkCode += '''
123        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
124        ''' % { "reg" : reg }
125        if rCount < 4:  # zero upper half
126            for reg in range(rCount, 4):
127                eWalkCode += '''
128        AA64FpDestP%(reg)d_uw = 0;
129        ''' % { "reg" : reg }
130        iop = InstObjParams(name, Name,
131                            "DataX2RegImmOp" if byElem else "DataX2RegOp",
132                            { "code": eWalkCode,
133                              "r_count": rCount,
134                              "op_class": opClass }, [])
135        if byElem:
136            header_output += NeonX2RegImmOpDeclare.subst(iop)
137        else:
138            header_output += NeonX2RegOpDeclare.subst(iop)
139        exec_output += NeonXEqualRegOpExecute.subst(iop)
140        for type in types:
141            substDict = { "targs" : type,
142                          "class_name" : Name }
143            exec_output += NeonXExecDeclare.subst(substDict)
144
145    def threeUnequalRegInstX(name, Name, opClass, types, op,
146                             bigSrc1, bigSrc2, bigDest, readDest, scalar=False,
147                             byElem=False, hi=False):
148        assert not (scalar and hi)
149        global header_output, exec_output
150        src1Cnt = src2Cnt = destCnt = 2
151        src1Prefix = src2Prefix = destPrefix = ''
152        if bigSrc1:
153            src1Cnt = 4
154            src1Prefix = 'Big'
155        if bigSrc2:
156            src2Cnt = 4
157            src2Prefix = 'Big'
158        if bigDest:
159            destCnt = 4
160            destPrefix = 'Big'
161        if byElem:
162            src2Prefix = 'Full'
163        eWalkCode = simd64EnabledCheckCode + '''
164        %sRegVect srcReg1;
165        %sRegVect srcReg2;
166        %sRegVect destReg;
167        ''' % (src1Prefix, src2Prefix, destPrefix)
168        srcReg1 = 0
169        if hi and not bigSrc1:  # long/widening operations
170            srcReg1 = 2
171        for reg in range(src1Cnt):
172            eWalkCode += '''
173        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(srcReg1)d_uw);
174        ''' % { "reg" : reg, "srcReg1" : srcReg1 }
175            srcReg1 += 1
176        srcReg2 = 0
177        if (not byElem) and (hi and not bigSrc2):  # long/widening operations
178            srcReg2 = 2
179        for reg in range(src2Cnt):
180            eWalkCode += '''
181        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(srcReg2)d_uw);
182        ''' % { "reg" : reg, "srcReg2" : srcReg2 }
183            srcReg2 += 1
184        if byElem:
185            # 2nd operand has to be read fully
186            for reg in range(src2Cnt, 4):
187                eWalkCode += '''
188        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
189        ''' % { "reg" : reg }
190        if readDest:
191            for reg in range(destCnt):
192                eWalkCode += '''
193        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
194        ''' % { "reg" : reg }
195        readDestCode = ''
196        if readDest:
197            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
198        scalarCheck = '''
199            if (i != 0) {
200                destReg.elements[i] = 0;
201                continue;
202            }
203            '''
204        eWalkCode += '''
205        for (unsigned i = 0; i < eCount; i++) {
206            %(scalarCheck)s
207            %(src1Prefix)sElement srcElem1 = gtoh(srcReg1.elements[i]);
208            %(src1Prefix)sElement srcElem2 = gtoh(srcReg2.elements[%(src2Index)s]);
209            %(destPrefix)sElement destElem;
210            %(readDest)s
211            %(op)s
212            destReg.elements[i] = htog(destElem);
213        }
214        ''' % { "op" : op, "readDest" : readDestCode,
215                "src1Prefix" : src1Prefix, "src2Prefix" : src2Prefix,
216                "destPrefix" : destPrefix,
217                "scalarCheck" : scalarCheck if scalar else "",
218                "src2Index" : "imm" if byElem else "i" }
219        destReg = 0
220        if hi and not bigDest:
221            # narrowing operations
222            destReg = 2
223        for reg in range(destCnt):
224            eWalkCode += '''
225        AA64FpDestP%(destReg)d_uw = gtoh(destReg.regs[%(reg)d]);
226        ''' % { "reg" : reg, "destReg": destReg }
227            destReg += 1
228        if destCnt < 4 and not hi:  # zero upper half
229            for reg in range(destCnt, 4):
230                eWalkCode += '''
231        AA64FpDestP%(reg)d_uw = 0;
232        ''' % { "reg" : reg }
233        iop = InstObjParams(name, Name,
234                            "DataX2RegImmOp" if byElem else "DataX2RegOp",
235                            { "code": eWalkCode,
236                              "r_count": 2,
237                              "op_class": opClass }, [])
238        if byElem:
239            header_output += NeonX2RegImmOpDeclare.subst(iop)
240        else:
241            header_output += NeonX2RegOpDeclare.subst(iop)
242        exec_output += NeonXUnequalRegOpExecute.subst(iop)
243        for type in types:
244            substDict = { "targs" : type,
245                          "class_name" : Name }
246            exec_output += NeonXExecDeclare.subst(substDict)
247
248    def threeRegNarrowInstX(name, Name, opClass, types, op, readDest=False,
249                            scalar=False, byElem=False, hi=False):
250        assert not byElem
251        threeUnequalRegInstX(name, Name, opClass, types, op,
252                             True, True, False, readDest, scalar, byElem, hi)
253
254    def threeRegLongInstX(name, Name, opClass, types, op, readDest=False,
255                          scalar=False, byElem=False, hi=False):
256        threeUnequalRegInstX(name, Name, opClass, types, op,
257                             False, False, True, readDest, scalar, byElem, hi)
258
259    def threeRegWideInstX(name, Name, opClass, types, op, readDest=False,
260                          scalar=False, byElem=False, hi=False):
261        assert not byElem
262        threeUnequalRegInstX(name, Name, opClass, types, op,
263                             True, False, True, readDest, scalar, byElem, hi)
264
265    def twoEqualRegInstX(name, Name, opClass, types, rCount, op,
266                         readDest=False, scalar=False, byElem=False,
267                         hasImm=False, isDup=False):
268        global header_output, exec_output
269        assert (not isDup) or byElem
270        if byElem:
271            hasImm = True
272        if isDup:
273            eWalkCode = simd64EnabledCheckCode + '''
274        FullRegVect srcReg1;
275        RegVect destReg;
276        '''
277        else:
278            eWalkCode = simd64EnabledCheckCode + '''
279        RegVect srcReg1, destReg;
280        '''
281        for reg in range(4 if isDup else rCount):
282            eWalkCode += '''
283        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
284        ''' % { "reg" : reg }
285            if readDest:
286                eWalkCode += '''
287        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
288        ''' % { "reg" : reg }
289        readDestCode = ''
290        if readDest:
291            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
292        scalarCheck = '''
293            if (i != 0) {
294                destReg.elements[i] = 0;
295                continue;
296            }
297            '''
298        eWalkCode += '''
299        for (unsigned i = 0; i < eCount; i++) {
300            %(scalarCheck)s
301            unsigned j = i;
302            Element srcElem1 = gtoh(srcReg1.elements[%(src1Index)s]);
303            Element destElem;
304            %(readDest)s
305            %(op)s
306            destReg.elements[j] = htog(destElem);
307        }
308        ''' % { "op" : op, "readDest" : readDestCode,
309                "scalarCheck" : scalarCheck if scalar else "",
310                "src1Index" : "imm" if byElem else "i" }
311        for reg in range(rCount):
312            eWalkCode += '''
313        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
314        ''' % { "reg" : reg }
315        if rCount < 4:  # zero upper half
316            for reg in range(rCount, 4):
317                eWalkCode += '''
318        AA64FpDestP%(reg)d_uw = 0;
319        ''' % { "reg" : reg }
320        iop = InstObjParams(name, Name,
321                            "DataX1RegImmOp" if hasImm else "DataX1RegOp",
322                            { "code": eWalkCode,
323                              "r_count": rCount,
324                              "op_class": opClass }, [])
325        if hasImm:
326            header_output += NeonX1RegImmOpDeclare.subst(iop)
327        else:
328            header_output += NeonX1RegOpDeclare.subst(iop)
329        exec_output += NeonXEqualRegOpExecute.subst(iop)
330        for type in types:
331            substDict = { "targs" : type,
332                          "class_name" : Name }
333            exec_output += NeonXExecDeclare.subst(substDict)
334
335    def twoRegLongInstX(name, Name, opClass, types, op, readDest=False,
336                        hi=False, hasImm=False):
337        global header_output, exec_output
338        eWalkCode = simd64EnabledCheckCode + '''
339        RegVect srcReg1;
340        BigRegVect destReg;
341        '''
342        destReg = 0 if not hi else 2
343        for reg in range(2):
344            eWalkCode += '''
345        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(destReg)d_uw);
346        ''' % { "reg" : reg, "destReg": destReg }
347            destReg += 1
348        destReg = 0 if not hi else 2
349        if readDest:
350            for reg in range(4):
351                eWalkCode += '''
352        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
353        ''' % { "reg" : reg }
354                destReg += 1
355        readDestCode = ''
356        if readDest:
357            readDestCode = 'destReg = gtoh(destReg.elements[i]);'
358        eWalkCode += '''
359        for (unsigned i = 0; i < eCount; i++) {
360            Element srcElem1 = gtoh(srcReg1.elements[i]);
361            BigElement destElem;
362            %(readDest)s
363            %(op)s
364            destReg.elements[i] = htog(destElem);
365        }
366        ''' % { "op" : op, "readDest" : readDestCode }
367        for reg in range(4):
368            eWalkCode += '''
369        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
370        ''' % { "reg" : reg }
371        iop = InstObjParams(name, Name,
372                            "DataX1RegImmOp" if hasImm else "DataX1RegOp",
373                            { "code": eWalkCode,
374                              "r_count": 2,
375                              "op_class": opClass }, [])
376        if hasImm:
377            header_output += NeonX1RegImmOpDeclare.subst(iop)
378        else:
379            header_output += NeonX1RegOpDeclare.subst(iop)
380        exec_output += NeonXUnequalRegOpExecute.subst(iop)
381        for type in types:
382            substDict = { "targs" : type,
383                          "class_name" : Name }
384            exec_output += NeonXExecDeclare.subst(substDict)
385
386    def twoRegNarrowInstX(name, Name, opClass, types, op, readDest=False,
387                          scalar=False, hi=False, hasImm=False):
388        global header_output, exec_output
389        eWalkCode = simd64EnabledCheckCode + '''
390        BigRegVect srcReg1;
391        RegVect destReg;
392        '''
393        for reg in range(4):
394            eWalkCode += '''
395        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
396        ''' % { "reg" : reg }
397        if readDest:
398            for reg in range(2):
399                eWalkCode += '''
400        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
401        ''' % { "reg" : reg }
402        else:
403            eWalkCode += '''
404        destReg.elements[0] = 0;
405        ''' % { "reg" : reg }
406        readDestCode = ''
407        if readDest:
408            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
409        scalarCheck = '''
410            if (i != 0) {
411                destReg.elements[i] = 0;
412                continue;
413            }
414            '''
415        eWalkCode += '''
416        for (unsigned i = 0; i < eCount; i++) {
417            %(scalarCheck)s
418            BigElement srcElem1 = gtoh(srcReg1.elements[i]);
419            Element destElem;
420            %(readDest)s
421            %(op)s
422            destReg.elements[i] = htog(destElem);
423        }
424        ''' % { "op" : op, "readDest" : readDestCode,
425                "scalarCheck" : scalarCheck if scalar else "" }
426        destReg = 0 if not hi else 2
427        for reg in range(2):
428            eWalkCode += '''
429        AA64FpDestP%(destReg)d_uw = gtoh(destReg.regs[%(reg)d]);
430        ''' % { "reg" : reg, "destReg": destReg }
431            destReg += 1
432        if not hi:
433            for reg in range(2, 4):  # zero upper half
434                eWalkCode += '''
435        AA64FpDestP%(reg)d_uw = 0;
436        ''' % { "reg" : reg }
437        iop = InstObjParams(name, Name,
438                            "DataX1RegImmOp" if hasImm else "DataX1RegOp",
439                            { "code": eWalkCode,
440                              "r_count": 2,
441                              "op_class": opClass }, [])
442        if hasImm:
443            header_output += NeonX1RegImmOpDeclare.subst(iop)
444        else:
445            header_output += NeonX1RegOpDeclare.subst(iop)
446        exec_output += NeonXUnequalRegOpExecute.subst(iop)
447        for type in types:
448            substDict = { "targs" : type,
449                          "class_name" : Name }
450            exec_output += NeonXExecDeclare.subst(substDict)
451
452    def threeRegScrambleInstX(name, Name, opClass, types, rCount, op):
453        global header_output, exec_output
454        eWalkCode = simd64EnabledCheckCode + '''
455        RegVect srcReg1, srcReg2, destReg;
456        '''
457        for reg in range(rCount):
458            eWalkCode += '''
459        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
460        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
461        ''' % { "reg" : reg }
462        eWalkCode += op
463        for reg in range(rCount):
464            eWalkCode += '''
465        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
466        ''' % { "reg" : reg }
467        if rCount < 4:
468            for reg in range(rCount, 4):
469                eWalkCode += '''
470        AA64FpDestP%(reg)d_uw = 0;
471        ''' % { "reg" : reg }
472        iop = InstObjParams(name, Name,
473                            "DataX2RegOp",
474                            { "code": eWalkCode,
475                              "r_count": rCount,
476                              "op_class": opClass }, [])
477        header_output += NeonX2RegOpDeclare.subst(iop)
478        exec_output += NeonXEqualRegOpExecute.subst(iop)
479        for type in types:
480            substDict = { "targs" : type,
481                          "class_name" : Name }
482            exec_output += NeonXExecDeclare.subst(substDict)
483
484    def insFromVecElemInstX(name, Name, opClass, types, rCount):
485        global header_output, exec_output
486        eWalkCode = simd64EnabledCheckCode + '''
487        FullRegVect srcReg1;
488        RegVect destReg;
489        '''
490        for reg in range(4):
491            eWalkCode += '''
492        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
493        ''' % { "reg" : reg }
494        for reg in range(rCount):
495            eWalkCode += '''
496        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
497        ''' % { "reg" : reg }
498        eWalkCode += '''
499        Element srcElem1 = gtoh(srcReg1.elements[imm2]);
500        Element destElem = srcElem1;
501        destReg.elements[imm1] = htog(destElem);
502        '''
503        for reg in range(rCount):
504            eWalkCode += '''
505        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
506        ''' % { "reg" : reg }
507        iop = InstObjParams(name, Name,
508                            "DataX1Reg2ImmOp",
509                            { "code": eWalkCode,
510                              "r_count": rCount,
511                              "op_class": opClass }, [])
512        header_output += NeonX1Reg2ImmOpDeclare.subst(iop)
513        exec_output += NeonXEqualRegOpExecute.subst(iop)
514        for type in types:
515            substDict = { "targs" : type,
516                          "class_name" : Name }
517            exec_output += NeonXExecDeclare.subst(substDict)
518
519    def twoRegPairwiseScInstX(name, Name, opClass, types, rCount, op):
520        global header_output, exec_output
521        eWalkCode = simd64EnabledCheckCode + '''
522        RegVect srcReg1, destReg;
523        '''
524        for reg in range(rCount):
525            eWalkCode += '''
526        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
527        ''' % { "reg" : reg }
528        eWalkCode += '''
529        Element srcElem1 = gtoh(srcReg1.elements[0]);
530        Element srcElem2 = gtoh(srcReg1.elements[1]);
531        Element destElem;
532        %(op)s
533        destReg.elements[0] = htog(destElem);
534        ''' % { "op" : op }
535        destCnt = rCount / 2
536        for reg in range(destCnt):
537            eWalkCode += '''
538        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
539        ''' % { "reg" : reg }
540        for reg in range(destCnt, 4):  # zero upper half
541            eWalkCode += '''
542        AA64FpDestP%(reg)d_uw = 0;
543        ''' % { "reg" : reg }
544        iop = InstObjParams(name, Name,
545                            "DataX1RegOp",
546                            { "code": eWalkCode,
547                              "r_count": rCount,
548                              "op_class": opClass }, [])
549        header_output += NeonX1RegOpDeclare.subst(iop)
550        exec_output += NeonXEqualRegOpExecute.subst(iop)
551        for type in types:
552            substDict = { "targs" : type,
553                          "class_name" : Name }
554            exec_output += NeonXExecDeclare.subst(substDict)
555
556    def twoRegAcrossInstX(name, Name, opClass, types, rCount, op,
557                          doubleDest=False, long=False):
558        global header_output, exec_output
559        destPrefix = "Big" if long else ""
560        eWalkCode = simd64EnabledCheckCode + '''
561        RegVect srcReg1;
562        %sRegVect destReg;
563        ''' % destPrefix
564        for reg in range(rCount):
565            eWalkCode += '''
566        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
567        ''' % { "reg" : reg }
568        eWalkCode += '''
569        destReg.regs[0] = 0;
570        %(destPrefix)sElement destElem = 0;
571        for (unsigned i = 0; i < eCount; i++) {
572            Element srcElem1 = gtoh(srcReg1.elements[i]);
573            if (i == 0) {
574                destElem = srcElem1;
575            } else {
576                %(op)s
577            }
578        }
579        destReg.elements[0] = htog(destElem);
580        ''' % { "op" : op, "destPrefix" : destPrefix }
581        destCnt = 2 if doubleDest else 1
582        for reg in range(destCnt):
583            eWalkCode += '''
584        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
585        ''' % { "reg" : reg }
586        for reg in range(destCnt, 4):  # zero upper half
587            eWalkCode += '''
588        AA64FpDestP%(reg)d_uw = 0;
589        ''' % { "reg" : reg }
590        iop = InstObjParams(name, Name,
591                            "DataX1RegOp",
592                            { "code": eWalkCode,
593                              "r_count": rCount,
594                              "op_class": opClass }, [])
595        header_output += NeonX1RegOpDeclare.subst(iop)
596        if long:
597            exec_output += NeonXUnequalRegOpExecute.subst(iop)
598        else:
599            exec_output += NeonXEqualRegOpExecute.subst(iop)
600        for type in types:
601            substDict = { "targs" : type,
602                          "class_name" : Name }
603            exec_output += NeonXExecDeclare.subst(substDict)
604
605    def twoRegCondenseInstX(name, Name, opClass, types, rCount, op,
606                            readDest=False):
607        global header_output, exec_output
608        eWalkCode = simd64EnabledCheckCode + '''
609        RegVect srcRegs;
610        BigRegVect destReg;
611        '''
612        for reg in range(rCount):
613            eWalkCode += '''
614        srcRegs.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
615        ''' % { "reg" : reg }
616            if readDest:
617                eWalkCode += '''
618        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
619        ''' % { "reg" : reg }
620        readDestCode = ''
621        if readDest:
622            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
623        eWalkCode += '''
624        for (unsigned i = 0; i < eCount / 2; i++) {
625            Element srcElem1 = gtoh(srcRegs.elements[2 * i]);
626            Element srcElem2 = gtoh(srcRegs.elements[2 * i + 1]);
627            BigElement destElem;
628            %(readDest)s
629            %(op)s
630            destReg.elements[i] = htog(destElem);
631        }
632        ''' % { "op" : op, "readDest" : readDestCode }
633        for reg in range(rCount):
634            eWalkCode += '''
635        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
636        ''' % { "reg" : reg }
637        if rCount < 4:  # zero upper half
638            for reg in range(rCount, 4):
639                eWalkCode += '''
640        AA64FpDestP%(reg)d_uw = 0;
641        ''' % { "reg" : reg }
642        iop = InstObjParams(name, Name,
643                            "DataX1RegOp",
644                            { "code": eWalkCode,
645                              "r_count": rCount,
646                              "op_class": opClass }, [])
647        header_output += NeonX1RegOpDeclare.subst(iop)
648        exec_output += NeonXUnequalRegOpExecute.subst(iop)
649        for type in types:
650            substDict = { "targs" : type,
651                          "class_name" : Name }
652            exec_output += NeonXExecDeclare.subst(substDict)
653
654    def oneRegImmInstX(name, Name, opClass, types, rCount, op, readDest=False):
655        global header_output, exec_output
656        eWalkCode = simd64EnabledCheckCode + '''
657        RegVect destReg;
658        '''
659        if readDest:
660            for reg in range(rCount):
661                eWalkCode += '''
662        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
663        ''' % { "reg" : reg }
664        readDestCode = ''
665        if readDest:
666            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
667        eWalkCode += '''
668        for (unsigned i = 0; i < eCount; i++) {
669            Element destElem;
670            %(readDest)s
671            %(op)s
672            destReg.elements[i] = htog(destElem);
673        }
674        ''' % { "op" : op, "readDest" : readDestCode }
675        for reg in range(rCount):
676            eWalkCode += '''
677        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
678        ''' % { "reg" : reg }
679        if rCount < 4:  # zero upper half
680            for reg in range(rCount, 4):
681                eWalkCode += '''
682        AA64FpDestP%(reg)d_uw = 0;
683        ''' % { "reg" : reg }
684        iop = InstObjParams(name, Name,
685                            "DataXImmOnlyOp",
686                            { "code": eWalkCode,
687                              "r_count": rCount,
688                              "op_class": opClass }, [])
689        header_output += NeonX1RegImmOnlyOpDeclare.subst(iop)
690        exec_output += NeonXEqualRegOpExecute.subst(iop)
691        for type in types:
692            substDict = { "targs" : type,
693                          "class_name" : Name }
694            exec_output += NeonXExecDeclare.subst(substDict)
695
696    def dupGprInstX(name, Name, opClass, types, rCount, gprSpec):
697        global header_output, exec_output
698        eWalkCode = simd64EnabledCheckCode + '''
699        RegVect destReg;
700        for (unsigned i = 0; i < eCount; i++) {
701            destReg.elements[i] = htog((Element) %sOp1);
702        }
703        ''' % gprSpec
704        for reg in range(rCount):
705            eWalkCode += '''
706        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
707        ''' % { "reg" : reg }
708        if rCount < 4:  # zero upper half
709            for reg in range(rCount, 4):
710                eWalkCode += '''
711        AA64FpDestP%(reg)d_uw = 0;
712        ''' % { "reg" : reg }
713        iop = InstObjParams(name, Name,
714                            "DataX1RegOp",
715                            { "code": eWalkCode,
716                              "r_count": rCount,
717                              "op_class": opClass }, [])
718        header_output += NeonX1RegOpDeclare.subst(iop)
719        exec_output += NeonXEqualRegOpExecute.subst(iop)
720        for type in types:
721            substDict = { "targs" : type,
722                          "class_name" : Name }
723            exec_output += NeonXExecDeclare.subst(substDict)
724
725    def extInstX(name, Name, opClass, types, rCount, op):
726        global header_output, exec_output
727        eWalkCode = simd64EnabledCheckCode + '''
728        RegVect srcReg1, srcReg2, destReg;
729        '''
730        for reg in range(rCount):
731            eWalkCode += '''
732        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
733        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
734        ''' % { "reg" : reg }
735        eWalkCode += op
736        for reg in range(rCount):
737            eWalkCode += '''
738        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
739        ''' % { "reg" : reg }
740        if rCount < 4:  # zero upper half
741            for reg in range(rCount, 4):
742                eWalkCode += '''
743        AA64FpDestP%(reg)d_uw = 0;
744        ''' % { "reg" : reg }
745        iop = InstObjParams(name, Name,
746                            "DataX2RegImmOp",
747                            { "code": eWalkCode,
748                              "r_count": rCount,
749                              "op_class": opClass }, [])
750        header_output += NeonX2RegImmOpDeclare.subst(iop)
751        exec_output += NeonXEqualRegOpExecute.subst(iop)
752        for type in types:
753            substDict = { "targs" : type,
754                          "class_name" : Name }
755            exec_output += NeonXExecDeclare.subst(substDict)
756
757    def insFromGprInstX(name, Name, opClass, types, rCount, gprSpec):
758        global header_output, exec_output
759        eWalkCode = simd64EnabledCheckCode + '''
760        RegVect destReg;
761        '''
762        for reg in range(rCount):
763            eWalkCode += '''
764        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
765        ''' % { "reg" : reg }
766        eWalkCode += '''
767        destReg.elements[imm] = htog((Element) %sOp1);
768        ''' % gprSpec
769        for reg in range(rCount):
770            eWalkCode += '''
771        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
772        ''' % { "reg" : reg }
773        iop = InstObjParams(name, Name,
774                            "DataX1RegImmOp",
775                            { "code": eWalkCode,
776                              "r_count": rCount,
777                              "op_class": opClass }, [])
778        header_output += NeonX1RegImmOpDeclare.subst(iop)
779        exec_output += NeonXEqualRegOpExecute.subst(iop)
780        for type in types:
781            substDict = { "targs" : type,
782                          "class_name" : Name }
783            exec_output += NeonXExecDeclare.subst(substDict)
784
785    def insToGprInstX(name, Name, opClass, types, rCount, gprSpec,
786                      signExt=False):
787        global header_output, exec_output
788        eWalkCode = simd64EnabledCheckCode + '''
789        FullRegVect srcReg;
790        '''
791        for reg in range(4):
792            eWalkCode += '''
793        srcReg.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
794        ''' % { "reg" : reg }
795        if signExt:
796            eWalkCode += '''
797        %sDest = sext<sizeof(Element) * 8>(srcReg.elements[imm]);
798        ''' % gprSpec
799        else:
800            eWalkCode += '''
801        %sDest = srcReg.elements[imm];
802        ''' % gprSpec
803        iop = InstObjParams(name, Name,
804                            "DataX1RegImmOp",
805                            { "code": eWalkCode,
806                              "r_count": rCount,
807                              "op_class": opClass }, [])
808        header_output += NeonX1RegImmOpDeclare.subst(iop)
809        exec_output += NeonXEqualRegOpExecute.subst(iop)
810        for type in types:
811            substDict = { "targs" : type,
812                          "class_name" : Name }
813            exec_output += NeonXExecDeclare.subst(substDict)
814
815    def tbxTblInstX(name, Name, opClass, types, length, isTbl, rCount):
816        global header_output, decoder_output, exec_output
817        code = simd64EnabledCheckCode + '''
818        union
819        {
820            uint8_t bytes[64];
821            FloatRegBits regs[16];
822        } table;
823
824        union
825        {
826            uint8_t bytes[%(rCount)d * 4];
827            FloatRegBits regs[%(rCount)d];
828        } destReg, srcReg2;
829
830        const unsigned length = %(length)d;
831        const bool isTbl = %(isTbl)s;
832        ''' % { "rCount" : rCount, "length" : length, "isTbl" : isTbl }
833        for reg in range(rCount):
834            code += '''
835        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
836        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
837        ''' % { "reg" : reg }
838        for reg in range(16):
839            if reg < length * 4:
840                code += '''
841        table.regs[%(reg)d] = htog(AA64FpOp1P%(p)dV%(v)dS_uw);
842        ''' % { "reg" : reg, "p" : reg % 4, "v" : reg / 4 }
843            else:
844                code += '''
845        table.regs[%(reg)d] = 0;
846        ''' % { "reg" : reg }
847        code += '''
848        for (unsigned i = 0; i < sizeof(destReg); i++) {
849            uint8_t index = srcReg2.bytes[i];
850            if (index < 16 * length) {
851                destReg.bytes[i] = table.bytes[index];
852            } else {
853                if (isTbl)
854                    destReg.bytes[i] = 0;
855                // else destReg.bytes[i] unchanged
856            }
857        }
858        '''
859        for reg in range(rCount):
860            code += '''
861        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
862        ''' % { "reg" : reg }
863        if rCount < 4:  # zero upper half
864            for reg in range(rCount, 4):
865                code += '''
866        AA64FpDestP%(reg)d_uw = 0;
867        ''' % { "reg" : reg }
868        iop = InstObjParams(name, Name,
869                            "DataX2RegOp",
870                            { "code": code,
871                              "r_count": rCount,
872                              "op_class": opClass }, [])
873        header_output += NeonX2RegOpDeclare.subst(iop)
874        exec_output += NeonXEqualRegOpExecute.subst(iop)
875        for type in types:
876            substDict = { "targs" : type,
877                          "class_name" : Name }
878            exec_output += NeonXExecDeclare.subst(substDict)
879
880    # ABS
881    absCode = '''
882            if (srcElem1 < 0) {
883                destElem = -srcElem1;
884            } else {
885                destElem = srcElem1;
886            }
887    '''
888    twoEqualRegInstX("abs", "AbsDX", "SimdAluOp", signedTypes, 2, absCode)
889    twoEqualRegInstX("abs", "AbsQX", "SimdAluOp", signedTypes, 4, absCode)
890    # ADD
891    addCode = "destElem = srcElem1 + srcElem2;"
892    threeEqualRegInstX("add", "AddDX", "SimdAddOp", unsignedTypes, 2, addCode)
893    threeEqualRegInstX("add", "AddQX", "SimdAddOp", unsignedTypes, 4, addCode)
894    # ADDHN, ADDHN2
895    addhnCode = '''
896            destElem = ((BigElement)srcElem1 + (BigElement)srcElem2) >>
897                        (sizeof(Element) * 8);
898    '''
899    threeRegNarrowInstX("addhn", "AddhnX", "SimdAddOp", smallUnsignedTypes,
900                        addhnCode)
901    threeRegNarrowInstX("addhn2", "Addhn2X", "SimdAddOp", smallUnsignedTypes,
902                        addhnCode, hi=True)
903    # ADDP (scalar)
904    twoRegPairwiseScInstX("addp", "AddpScQX", "SimdAddOp", ("uint64_t",), 4,
905                          addCode)
906    # ADDP (vector)
907    threeEqualRegInstX("addp", "AddpDX", "SimdAddOp", smallUnsignedTypes, 2,
908                       addCode, pairwise=True)
909    threeEqualRegInstX("addp", "AddpQX", "SimdAddOp", unsignedTypes, 4,
910                       addCode, pairwise=True)
911    # ADDV
912    # Note: SimdAddOp can be a bit optimistic here
913    addAcrossCode = "destElem += srcElem1;"
914    twoRegAcrossInstX("addv", "AddvDX", "SimdAddOp", ("uint8_t", "uint16_t"),
915                      2, addAcrossCode)
916    twoRegAcrossInstX("addv", "AddvQX", "SimdAddOp", smallUnsignedTypes, 4,
917                      addAcrossCode)
918    # AND
919    andCode = "destElem = srcElem1 & srcElem2;"
920    threeEqualRegInstX("and", "AndDX", "SimdAluOp", ("uint64_t",), 2, andCode)
921    threeEqualRegInstX("and", "AndQX", "SimdAluOp", ("uint64_t",), 4, andCode)
922    # BIC (immediate)
923    bicImmCode = "destElem &= ~imm;"
924    oneRegImmInstX("bic", "BicImmDX", "SimdAluOp", ("uint64_t",), 2,
925                   bicImmCode, True)
926    oneRegImmInstX("bic", "BicImmQX", "SimdAluOp", ("uint64_t",), 4,
927                   bicImmCode, True)
928    # BIC (register)
929    bicCode = "destElem = srcElem1 & ~srcElem2;"
930    threeEqualRegInstX("bic", "BicDX", "SimdAluOp", ("uint64_t",), 2, bicCode)
931    threeEqualRegInstX("bic", "BicQX", "SimdAluOp", ("uint64_t",), 4, bicCode)
932    # BIF
933    bifCode = "destElem = (destElem & srcElem2) | (srcElem1 & ~srcElem2);"
934    threeEqualRegInstX("bif", "BifDX", "SimdAluOp", ("uint64_t",), 2, bifCode,
935                       True)
936    threeEqualRegInstX("bif", "BifQX", "SimdAluOp", ("uint64_t",), 4, bifCode,
937                       True)
938    # BIT
939    bitCode = "destElem = (srcElem1 & srcElem2) | (destElem & ~srcElem2);"
940    threeEqualRegInstX("bit", "BitDX", "SimdAluOp", ("uint64_t",), 2, bitCode,
941                       True)
942    threeEqualRegInstX("bit", "BitQX", "SimdAluOp", ("uint64_t",), 4, bitCode,
943                       True)
944    # BSL
945    bslCode = "destElem = (srcElem1 & destElem) | (srcElem2 & ~destElem);"
946    threeEqualRegInstX("bsl", "BslDX", "SimdAluOp", ("uint64_t",), 2, bslCode,
947                       True)
948    threeEqualRegInstX("bsl", "BslQX", "SimdAluOp", ("uint64_t",), 4, bslCode,
949                       True)
950    # CLS
951    clsCode = '''
952            unsigned count = 0;
953            if (srcElem1 < 0) {
954                srcElem1 <<= 1;
955                while (srcElem1 < 0 && count < sizeof(Element) * 8 - 1) {
956                    count++;
957                    srcElem1 <<= 1;
958                }
959            } else {
960                srcElem1 <<= 1;
961                while (srcElem1 >= 0 && count < sizeof(Element) * 8 - 1) {
962                    count++;
963                    srcElem1 <<= 1;
964                }
965            }
966            destElem = count;
967    '''
968    twoEqualRegInstX("cls", "ClsDX", "SimdAluOp", smallSignedTypes, 2, clsCode)
969    twoEqualRegInstX("cls", "ClsQX", "SimdAluOp", smallSignedTypes, 4, clsCode)
970    # CLZ
971    clzCode = '''
972            unsigned count = 0;
973            while (srcElem1 >= 0 && count < sizeof(Element) * 8) {
974                count++;
975                srcElem1 <<= 1;
976            }
977            destElem = count;
978    '''
979    twoEqualRegInstX("clz", "ClzDX", "SimdAluOp", smallSignedTypes, 2, clzCode)
980    twoEqualRegInstX("clz", "ClzQX", "SimdAluOp", smallSignedTypes, 4, clzCode)
981    # CMEQ (register)
982    cmeqCode = "destElem = (srcElem1 == srcElem2) ? (Element)(-1) : 0;"
983    threeEqualRegInstX("cmeq", "CmeqDX", "SimdCmpOp", unsignedTypes, 2,
984                       cmeqCode)
985    threeEqualRegInstX("cmeq", "CmeqQX", "SimdCmpOp", unsignedTypes, 4,
986                       cmeqCode)
987    # CMEQ (zero)
988    cmeqZeroCode = "destElem = (srcElem1 == 0) ? (Element)(-1) : 0;"
989    twoEqualRegInstX("cmeq", "CmeqZeroDX", "SimdCmpOp", signedTypes, 2,
990                     cmeqZeroCode)
991    twoEqualRegInstX("cmeq", "CmeqZeroQX", "SimdCmpOp", signedTypes, 4,
992                     cmeqZeroCode)
993    # CMGE (register)
994    cmgeCode = "destElem = (srcElem1 >= srcElem2) ? (Element)(-1) : 0;"
995    threeEqualRegInstX("cmge", "CmgeDX", "SimdCmpOp", signedTypes, 2, cmgeCode)
996    threeEqualRegInstX("cmge", "CmgeQX", "SimdCmpOp", signedTypes, 4, cmgeCode)
997    # CMGE (zero)
998    cmgeZeroCode = "destElem = (srcElem1 >= 0) ? (Element)(-1) : 0;"
999    twoEqualRegInstX("cmge", "CmgeZeroDX", "SimdCmpOp", signedTypes, 2,
1000                     cmgeZeroCode)
1001    twoEqualRegInstX("cmge", "CmgeZeroQX", "SimdCmpOp", signedTypes, 4,
1002                     cmgeZeroCode)
1003    # CMGT (register)
1004    cmgtCode = "destElem = (srcElem1 > srcElem2) ? (Element)(-1) : 0;"
1005    threeEqualRegInstX("cmgt", "CmgtDX", "SimdCmpOp", signedTypes, 2, cmgtCode)
1006    threeEqualRegInstX("cmgt", "CmgtQX", "SimdCmpOp", signedTypes, 4, cmgtCode)
1007    # CMGT (zero)
1008    cmgtZeroCode = "destElem = (srcElem1 > 0) ? (Element)(-1) : 0;"
1009    twoEqualRegInstX("cmgt", "CmgtZeroDX", "SimdCmpOp", signedTypes, 2,
1010                     cmgtZeroCode)
1011    twoEqualRegInstX("cmgt", "CmgtZeroQX", "SimdCmpOp", signedTypes, 4,
1012                     cmgtZeroCode)
1013    # CMHI (register)
1014    threeEqualRegInstX("cmhi", "CmhiDX", "SimdCmpOp", unsignedTypes, 2,
1015                       cmgtCode)
1016    threeEqualRegInstX("cmhi", "CmhiQX", "SimdCmpOp", unsignedTypes, 4,
1017                       cmgtCode)
1018    # CMHS (register)
1019    threeEqualRegInstX("cmhs", "CmhsDX", "SimdCmpOp", unsignedTypes, 2,
1020                       cmgeCode)
1021    threeEqualRegInstX("cmhs", "CmhsQX", "SimdCmpOp", unsignedTypes, 4,
1022                       cmgeCode)
1023    # CMLE (zero)
1024    cmleZeroCode = "destElem = (srcElem1 <= 0) ? (Element)(-1) : 0;"
1025    twoEqualRegInstX("cmle", "CmleZeroDX", "SimdCmpOp", signedTypes, 2,
1026                     cmleZeroCode)
1027    twoEqualRegInstX("cmle", "CmleZeroQX", "SimdCmpOp", signedTypes, 4,
1028                     cmleZeroCode)
1029    # CMLT (zero)
1030    cmltZeroCode = "destElem = (srcElem1 < 0) ? (Element)(-1) : 0;"
1031    twoEqualRegInstX("cmlt", "CmltZeroDX", "SimdCmpOp", signedTypes, 2,
1032                     cmltZeroCode)
1033    twoEqualRegInstX("cmlt", "CmltZeroQX", "SimdCmpOp", signedTypes, 4,
1034                     cmltZeroCode)
1035    # CMTST (register)
1036    tstCode = "destElem = (srcElem1 & srcElem2) ? (Element)(-1) : 0;"
1037    threeEqualRegInstX("cmtst", "CmtstDX", "SimdAluOp", unsignedTypes, 2,
1038                       tstCode)
1039    threeEqualRegInstX("cmtst", "CmtstQX", "SimdAluOp", unsignedTypes, 4,
1040                       tstCode)
1041    # CNT
1042    cntCode = '''
1043            unsigned count = 0;
1044            while (srcElem1 && count < sizeof(Element) * 8) {
1045                count += srcElem1 & 0x1;
1046                srcElem1 >>= 1;
1047            }
1048            destElem = count;
1049    '''
1050    twoEqualRegInstX("cnt", "CntDX", "SimdAluOp", ("uint8_t",), 2, cntCode)
1051    twoEqualRegInstX("cnt", "CntQX", "SimdAluOp", ("uint8_t",), 4, cntCode)
1052    # DUP (element)
1053    dupCode = "destElem = srcElem1;"
1054    twoEqualRegInstX("dup", "DupElemDX", "SimdMiscOp", smallUnsignedTypes, 2,
1055                     dupCode, isDup=True, byElem=True)
1056    twoEqualRegInstX("dup", "DupElemQX", "SimdMiscOp", unsignedTypes, 4,
1057                     dupCode, isDup=True, byElem=True)
1058    twoEqualRegInstX("dup", "DupElemScX", "SimdMiscOp", unsignedTypes, 4,
1059                     dupCode, isDup=True, byElem=True, scalar=True)
1060    # DUP (general register)
1061    dupGprInstX("dup", "DupGprWDX", "SimdMiscOp", smallUnsignedTypes, 2, 'W')
1062    dupGprInstX("dup", "DupGprWQX", "SimdMiscOp", smallUnsignedTypes, 4, 'W')
1063    dupGprInstX("dup", "DupGprXQX", "SimdMiscOp", ("uint64_t",), 4, 'X')
1064    # EOR
1065    eorCode = "destElem = srcElem1 ^ srcElem2;"
1066    threeEqualRegInstX("eor", "EorDX", "SimdAluOp", ("uint64_t",), 2, eorCode)
1067    threeEqualRegInstX("eor", "EorQX", "SimdAluOp", ("uint64_t",), 4, eorCode)
1068    # EXT
1069    extCode = '''
1070            for (unsigned i = 0; i < eCount; i++) {
1071                unsigned index = i + imm;
1072                if (index < eCount) {
1073                    destReg.elements[i] = srcReg1.elements[index];
1074                } else {
1075                    index -= eCount;
1076                    if (index >= eCount) {
1077                        fault = std::make_shared<UndefinedInstruction>(
1078                                      machInst, false, mnemonic);
1079                    } else {
1080                        destReg.elements[i] = srcReg2.elements[index];
1081                    }
1082                }
1083            }
1084    '''
1085    extInstX("Ext", "ExtDX", "SimdMiscOp", ("uint8_t",), 2, extCode)
1086    extInstX("Ext", "ExtQX", "SimdMiscOp", ("uint8_t",), 4, extCode)
1087    # FABD
1088    fpOp = '''
1089            FPSCR fpscr = (FPSCR) FpscrExc;
1090            destElem = %s;
1091            FpscrExc = fpscr;
1092    '''
1093    fabdCode = fpOp % "fplibAbs<Element>(fplibSub(srcElem1, srcElem2, fpscr))"
1094    threeEqualRegInstX("fabd", "FabdDX", "SimdFloatAddOp", smallFloatTypes, 2,
1095                       fabdCode)
1096    threeEqualRegInstX("fabd", "FabdQX", "SimdFloatAddOp", floatTypes, 4,
1097                       fabdCode)
1098    threeEqualRegInstX("fabd", "FabdScX", "SimdFloatAddOp", floatTypes, 4,
1099                       fabdCode, scalar=True)
1100    # FABS
1101    fabsCode = fpOp % "fplibAbs<Element>(srcElem1)"
1102    twoEqualRegInstX("Abs", "FabsDX", "SimdFloatAluOp", smallFloatTypes, 2,
1103                     fabsCode)
1104    twoEqualRegInstX("Abs", "FabsQX", "SimdFloatAluOp", floatTypes, 4,
1105                     fabsCode)
1106    # FACGE
1107    fpCmpAbsOp = fpOp % ("fplibCompare%s<Element>(fplibAbs<Element>(srcElem1),"
1108                         " fplibAbs<Element>(srcElem2), fpscr) ? -1 : 0")
1109    facgeCode = fpCmpAbsOp % "GE"
1110    threeEqualRegInstX("facge", "FacgeDX", "SimdFloatCmpOp", smallFloatTypes,
1111                       2, facgeCode)
1112    threeEqualRegInstX("facge", "FacgeQX", "SimdFloatCmpOp", floatTypes, 4,
1113                       facgeCode)
1114    threeEqualRegInstX("facge", "FacgeScX", "SimdFloatCmpOp", floatTypes, 4,
1115                       facgeCode, scalar=True)
1116    # FACGT
1117    facgtCode = fpCmpAbsOp % "GT"
1118    threeEqualRegInstX("facgt", "FacgtDX", "SimdFloatCmpOp", smallFloatTypes,
1119                       2, facgtCode)
1120    threeEqualRegInstX("facgt", "FacgtQX", "SimdFloatCmpOp", floatTypes, 4,
1121                       facgtCode)
1122    threeEqualRegInstX("facgt", "FacgtScX", "SimdFloatCmpOp", floatTypes, 4,
1123                       facgtCode, scalar=True)
1124    # FADD
1125    fpBinOp = fpOp % "fplib%s<Element>(srcElem1, srcElem2, fpscr)"
1126    faddCode = fpBinOp % "Add"
1127    threeEqualRegInstX("fadd", "FaddDX", "SimdFloatAddOp", smallFloatTypes, 2,
1128                       faddCode)
1129    threeEqualRegInstX("fadd", "FaddQX", "SimdFloatAddOp", floatTypes, 4,
1130                       faddCode)
1131    # FADDP (scalar)
1132    twoRegPairwiseScInstX("faddp", "FaddpScDX", "SimdFloatAddOp",
1133                          ("uint32_t",), 2, faddCode)
1134    twoRegPairwiseScInstX("faddp", "FaddpScQX", "SimdFloatAddOp",
1135                          ("uint64_t",), 4, faddCode)
1136    # FADDP (vector)
1137    threeEqualRegInstX("faddp", "FaddpDX", "SimdFloatAddOp", smallFloatTypes,
1138                       2, faddCode, pairwise=True)
1139    threeEqualRegInstX("faddp", "FaddpQX", "SimdFloatAddOp", floatTypes, 4,
1140                       faddCode, pairwise=True)
1141    # FCMEQ (register)
1142    fpCmpOp = fpOp % ("fplibCompare%s<Element>(srcElem1, srcElem2, fpscr) ?"
1143                      " -1 : 0")
1144    fcmeqCode = fpCmpOp % "EQ"
1145    threeEqualRegInstX("fcmeq", "FcmeqDX", "SimdFloatCmpOp", smallFloatTypes,
1146                       2, fcmeqCode)
1147    threeEqualRegInstX("fcmeq", "FcmeqQX", "SimdFloatCmpOp", floatTypes, 4,
1148                       fcmeqCode)
1149    threeEqualRegInstX("fcmeq", "FcmeqScX", "SimdFloatCmpOp", floatTypes, 4,
1150                       fcmeqCode, scalar=True)
1151    # FCMEQ (zero)
1152    fpCmpZeroOp = fpOp % "fplibCompare%s<Element>(srcElem1, 0, fpscr) ? -1 : 0"
1153    fcmeqZeroCode = fpCmpZeroOp % "EQ"
1154    twoEqualRegInstX("fcmeq", "FcmeqZeroDX", "SimdFloatCmpOp", smallFloatTypes,
1155                     2, fcmeqZeroCode)
1156    twoEqualRegInstX("fcmeq", "FcmeqZeroQX", "SimdFloatCmpOp", floatTypes, 4,
1157                     fcmeqZeroCode)
1158    twoEqualRegInstX("fcmeq", "FcmeqZeroScX", "SimdFloatCmpOp", floatTypes, 4,
1159                     fcmeqZeroCode, scalar=True)
1160    # FCMGE (register)
1161    fcmgeCode = fpCmpOp % "GE"
1162    threeEqualRegInstX("fcmge", "FcmgeDX", "SimdFloatCmpOp", smallFloatTypes,
1163                       2, fcmgeCode)
1164    threeEqualRegInstX("fcmge", "FcmgeQX", "SimdFloatCmpOp", floatTypes, 4,
1165                       fcmgeCode)
1166    threeEqualRegInstX("fcmge", "FcmgeScX", "SimdFloatCmpOp", floatTypes, 4,
1167                       fcmgeCode, scalar=True)
1168    # FCMGE (zero)
1169    fcmgeZeroCode = fpCmpZeroOp % "GE"
1170    twoEqualRegInstX("fcmge", "FcmgeZeroDX", "SimdFloatCmpOp", smallFloatTypes,
1171                     2, fcmgeZeroCode)
1172    twoEqualRegInstX("fcmge", "FcmgeZeroQX", "SimdFloatCmpOp", floatTypes, 4,
1173                     fcmgeZeroCode)
1174    twoEqualRegInstX("fcmge", "FcmgeZeroScX", "SimdFloatCmpOp", floatTypes, 4,
1175                     fcmgeZeroCode, scalar=True)
1176    # FCMGT (register)
1177    fcmgtCode = fpCmpOp % "GT"
1178    threeEqualRegInstX("fcmgt", "FcmgtDX", "SimdFloatCmpOp", smallFloatTypes,
1179                       2, fcmgtCode)
1180    threeEqualRegInstX("fcmgt", "FcmgtQX", "SimdFloatCmpOp", floatTypes, 4,
1181                       fcmgtCode)
1182    threeEqualRegInstX("fcmgt", "FcmgtScX", "SimdFloatCmpOp", floatTypes, 4,
1183                       fcmgtCode, scalar=True)
1184    # FCMGT (zero)
1185    fcmgtZeroCode = fpCmpZeroOp % "GT"
1186    twoEqualRegInstX("fcmgt", "FcmgtZeroDX", "SimdFloatCmpOp", smallFloatTypes,
1187                     2, fcmgtZeroCode)
1188    twoEqualRegInstX("fcmgt", "FcmgtZeroQX", "SimdFloatCmpOp", floatTypes, 4,
1189                     fcmgtZeroCode)
1190    twoEqualRegInstX("fcmgt", "FcmgtZeroScX", "SimdFloatCmpOp", floatTypes, 4,
1191                     fcmgtZeroCode, scalar=True)
1192    # FCMLE (zero)
1193    fpCmpRevZeroOp = fpOp % ("fplibCompare%s<Element>(0, srcElem1, fpscr) ?"
1194                             " -1 : 0")
1195    fcmleZeroCode = fpCmpRevZeroOp % "GE"
1196    twoEqualRegInstX("fcmle", "FcmleZeroDX", "SimdFloatCmpOp", smallFloatTypes,
1197                     2, fcmleZeroCode)
1198    twoEqualRegInstX("fcmle", "FcmleZeroQX", "SimdFloatCmpOp", floatTypes, 4,
1199                     fcmleZeroCode)
1200    twoEqualRegInstX("fcmle", "FcmleZeroScX", "SimdFloatCmpOp", floatTypes, 4,
1201                     fcmleZeroCode, scalar=True)
1202    # FCMLT (zero)
1203    fcmltZeroCode = fpCmpRevZeroOp % "GT"
1204    twoEqualRegInstX("fcmlt", "FcmltZeroDX", "SimdFloatCmpOp", smallFloatTypes,
1205                     2, fcmltZeroCode)
1206    twoEqualRegInstX("fcmlt", "FcmltZeroQX", "SimdFloatCmpOp", floatTypes, 4,
1207                     fcmltZeroCode)
1208    twoEqualRegInstX("fcmlt", "FcmltZeroScX", "SimdFloatCmpOp", floatTypes, 4,
1209                     fcmltZeroCode, scalar=True)
1210    # FCVTAS
1211    fcvtCode = fpOp % ("fplibFPToFixed<Element, Element>("
1212                       "srcElem1, %s, %s, %s, fpscr)")
1213    fcvtasCode = fcvtCode % ("0", "false", "FPRounding_TIEAWAY")
1214    twoEqualRegInstX("fcvtas", "FcvtasDX", "SimdCvtOp", smallFloatTypes, 2,
1215                     fcvtasCode)
1216    twoEqualRegInstX("fcvtas", "FcvtasQX", "SimdCvtOp", floatTypes, 4,
1217                     fcvtasCode)
1218    twoEqualRegInstX("fcvtas", "FcvtasScX", "SimdCvtOp", floatTypes, 4,
1219                     fcvtasCode, scalar=True)
1220    # FCVTAU
1221    fcvtauCode = fcvtCode % ("0", "true", "FPRounding_TIEAWAY")
1222    twoEqualRegInstX("fcvtau", "FcvtauDX", "SimdCvtOp", smallFloatTypes, 2,
1223                     fcvtauCode)
1224    twoEqualRegInstX("fcvtau", "FcvtauQX", "SimdCvtOp", floatTypes, 4,
1225                     fcvtauCode)
1226    twoEqualRegInstX("fcvtau", "FcvtauScX", "SimdCvtOp", floatTypes, 4,
1227                     fcvtauCode, scalar=True)
1228    # FCVTL, FCVTL2
1229    fcvtlCode = fpOp % ("fplibConvert<Element, BigElement>("
1230                        "srcElem1, FPCRRounding(fpscr), fpscr)")
1231    twoRegLongInstX("fcvtl", "FcvtlX", "SimdCvtOp", ("uint16_t", "uint32_t"),
1232                    fcvtlCode)
1233    twoRegLongInstX("fcvtl", "Fcvtl2X", "SimdCvtOp", ("uint16_t", "uint32_t"),
1234                    fcvtlCode, hi=True)
1235    # FCVTMS
1236    fcvtmsCode = fcvtCode % ("0", "false", "FPRounding_NEGINF")
1237    twoEqualRegInstX("fcvtms", "FcvtmsDX", "SimdCvtOp", smallFloatTypes, 2,
1238                     fcvtmsCode)
1239    twoEqualRegInstX("fcvtms", "FcvtmsQX", "SimdCvtOp", floatTypes, 4,
1240                     fcvtmsCode)
1241    twoEqualRegInstX("fcvtms", "FcvtmsScX", "SimdCvtOp", floatTypes, 4,
1242                     fcvtmsCode, scalar=True)
1243    # FCVTMU
1244    fcvtmuCode = fcvtCode % ("0", "true", "FPRounding_NEGINF")
1245    twoEqualRegInstX("fcvtmu", "FcvtmuDX", "SimdCvtOp", smallFloatTypes, 2,
1246                     fcvtmuCode)
1247    twoEqualRegInstX("fcvtmu", "FcvtmuQX", "SimdCvtOp", floatTypes, 4,
1248                     fcvtmuCode)
1249    twoEqualRegInstX("fcvtmu", "FcvtmuScX", "SimdCvtOp", floatTypes, 4,
1250                     fcvtmuCode, scalar=True)
1251    # FCVTN, FCVTN2
1252    fcvtnCode = fpOp % ("fplibConvert<BigElement, Element>("
1253                        "srcElem1, FPCRRounding(fpscr), fpscr)")
1254    twoRegNarrowInstX("fcvtn", "FcvtnX", "SimdCvtOp",
1255                      ("uint16_t", "uint32_t"), fcvtnCode)
1256    twoRegNarrowInstX("fcvtn", "Fcvtn2X", "SimdCvtOp",
1257                      ("uint16_t", "uint32_t"), fcvtnCode, hi=True)
1258    # FCVTNS
1259    fcvtnsCode = fcvtCode % ("0", "false", "FPRounding_TIEEVEN")
1260    twoEqualRegInstX("fcvtns", "FcvtnsDX", "SimdCvtOp", smallFloatTypes, 2,
1261                     fcvtnsCode)
1262    twoEqualRegInstX("fcvtns", "FcvtnsQX", "SimdCvtOp", floatTypes, 4,
1263                     fcvtnsCode)
1264    twoEqualRegInstX("fcvtns", "FcvtnsScX", "SimdCvtOp", floatTypes, 4,
1265                     fcvtnsCode, scalar=True)
1266    # FCVTNU
1267    fcvtnuCode = fcvtCode % ("0", "true", "FPRounding_TIEEVEN")
1268    twoEqualRegInstX("fcvtnu", "FcvtnuDX", "SimdCvtOp", smallFloatTypes, 2,
1269                     fcvtnuCode)
1270    twoEqualRegInstX("fcvtnu", "FcvtnuQX", "SimdCvtOp", floatTypes, 4,
1271                     fcvtnuCode)
1272    twoEqualRegInstX("fcvtnu", "FcvtnuScX", "SimdCvtOp", floatTypes, 4,
1273                     fcvtnuCode, scalar=True)
1274    # FCVTPS
1275    fcvtpsCode = fcvtCode % ("0", "false", "FPRounding_POSINF")
1276    twoEqualRegInstX("fcvtps", "FcvtpsDX", "SimdCvtOp", smallFloatTypes, 2,
1277                     fcvtpsCode)
1278    twoEqualRegInstX("fcvtps", "FcvtpsQX", "SimdCvtOp", floatTypes, 4,
1279                     fcvtpsCode)
1280    twoEqualRegInstX("fcvtps", "FcvtpsScX", "SimdCvtOp", floatTypes, 4,
1281                     fcvtpsCode, scalar=True)
1282    # FCVTPU
1283    fcvtpuCode = fcvtCode % ("0", "true", "FPRounding_POSINF")
1284    twoEqualRegInstX("fcvtpu", "FcvtpuDX", "SimdCvtOp", smallFloatTypes, 2,
1285                     fcvtpuCode)
1286    twoEqualRegInstX("fcvtpu", "FcvtpuQX", "SimdCvtOp", floatTypes, 4,
1287                     fcvtpuCode)
1288    twoEqualRegInstX("fcvtpu", "FcvtpuScX", "SimdCvtOp", floatTypes, 4,
1289                     fcvtpuCode, scalar=True)
1290    # FCVTXN, FCVTXN2
1291    fcvtxnCode = fpOp % ("fplibConvert<BigElement, Element>("
1292                         "srcElem1, FPRounding_ODD, fpscr)")
1293    twoRegNarrowInstX("fcvtxn", "FcvtxnX", "SimdCvtOp", smallFloatTypes,
1294                      fcvtxnCode)
1295    twoRegNarrowInstX("fcvtxn", "Fcvtxn2X", "SimdCvtOp", smallFloatTypes,
1296                      fcvtxnCode, hi=True)
1297    twoRegNarrowInstX("fcvtxn", "FcvtxnScX", "SimdCvtOp", smallFloatTypes,
1298                      fcvtxnCode, scalar=True)
1299    # FCVTZS (fixed-point)
1300    fcvtzsCode = fcvtCode % ("imm", "false", "FPRounding_ZERO")
1301    twoEqualRegInstX("fcvtzs", "FcvtzsFixedDX", "SimdCvtOp", smallFloatTypes,
1302                     2, fcvtzsCode, hasImm=True)
1303    twoEqualRegInstX("fcvtzs", "FcvtzsFixedQX", "SimdCvtOp", floatTypes, 4,
1304                     fcvtzsCode, hasImm=True)
1305    twoEqualRegInstX("fcvtzs", "FcvtzsFixedScX", "SimdCvtOp", floatTypes, 4,
1306                     fcvtzsCode, hasImm=True, scalar=True)
1307    # FCVTZS (integer)
1308    fcvtzsIntCode = fcvtCode % ("0", "false", "FPRounding_ZERO")
1309    twoEqualRegInstX("fcvtzs", "FcvtzsIntDX", "SimdCvtOp", smallFloatTypes,
1310                     2, fcvtzsIntCode)
1311    twoEqualRegInstX("fcvtzs", "FcvtzsIntQX", "SimdCvtOp", floatTypes, 4,
1312                     fcvtzsIntCode)
1313    twoEqualRegInstX("fcvtzs", "FcvtzsIntScX", "SimdCvtOp", floatTypes, 4,
1314                     fcvtzsIntCode, scalar=True)
1315    # FCVTZU (fixed-point)
1316    fcvtzuCode = fcvtCode % ("imm", "true", "FPRounding_ZERO")
1317    twoEqualRegInstX("fcvtzu", "FcvtzuFixedDX", "SimdCvtOp", smallFloatTypes,
1318                     2, fcvtzuCode, hasImm=True)
1319    twoEqualRegInstX("fcvtzu", "FcvtzuFixedQX", "SimdCvtOp", floatTypes, 4,
1320                     fcvtzuCode, hasImm=True)
1321    twoEqualRegInstX("fcvtzu", "FcvtzuFixedScX", "SimdCvtOp", floatTypes, 4,
1322                     fcvtzuCode, hasImm=True, scalar=True)
1323    # FCVTZU (integer)
1324    fcvtzuIntCode = fcvtCode % ("0", "true", "FPRounding_ZERO")
1325    twoEqualRegInstX("fcvtzu", "FcvtzuIntDX", "SimdCvtOp", smallFloatTypes, 2,
1326                     fcvtzuIntCode)
1327    twoEqualRegInstX("fcvtzu", "FcvtzuIntQX", "SimdCvtOp", floatTypes, 4,
1328                     fcvtzuIntCode)
1329    twoEqualRegInstX("fcvtzu", "FcvtzuIntScX", "SimdCvtOp", floatTypes, 4,
1330                     fcvtzuIntCode, scalar=True)
1331    # FDIV
1332    fdivCode = fpBinOp % "Div"
1333    threeEqualRegInstX("fdiv", "FdivDX", "SimdFloatDivOp", smallFloatTypes, 2,
1334                       fdivCode)
1335    threeEqualRegInstX("fdiv", "FdivQX", "SimdFloatDivOp", floatTypes, 4,
1336                       fdivCode)
1337    # FMAX
1338    fmaxCode = fpBinOp % "Max"
1339    threeEqualRegInstX("fmax", "FmaxDX", "SimdFloatCmpOp", smallFloatTypes, 2,
1340                       fmaxCode)
1341    threeEqualRegInstX("fmax", "FmaxQX", "SimdFloatCmpOp", floatTypes, 4,
1342                       fmaxCode)
1343    # FMAXNM -- fpBinOp filled in with "MaxNum"; fmaxnmCode is reused by FMAXNMP
1344    fmaxnmCode = fpBinOp % "MaxNum"
1345    threeEqualRegInstX("fmaxnm", "FmaxnmDX", "SimdFloatCmpOp", smallFloatTypes,
1346                       2, fmaxnmCode)
1347    threeEqualRegInstX("fmaxnm", "FmaxnmQX", "SimdFloatCmpOp", floatTypes, 4,
1348                       fmaxnmCode)
1349    # FMAXNMP (scalar) -- same fmaxnmCode, emitted via twoRegPairwiseScInstX
1350    twoRegPairwiseScInstX("fmaxnmp", "FmaxnmpScDX", "SimdFloatCmpOp",
1351                          ("uint32_t",), 2, fmaxnmCode)
1352    twoRegPairwiseScInstX("fmaxnmp", "FmaxnmpScQX", "SimdFloatCmpOp",
1353                          ("uint64_t",), 4, fmaxnmCode)
1354    # FMAXNMP (vector) -- same fmaxnmCode, emitted with pairwise=True
1355    threeEqualRegInstX("fmaxnmp", "FmaxnmpDX", "SimdFloatCmpOp",
1356                       smallFloatTypes, 2, fmaxnmCode, pairwise=True)
1357    threeEqualRegInstX("fmaxnmp", "FmaxnmpQX", "SimdFloatCmpOp", floatTypes, 4,
1358                       fmaxnmCode, pairwise=True)
1359    # FMAXNMV -- fpAcrossOp calls fplib<Op><Element>(destElem, srcElem1, fpscr); reused by FMAXV/FMINNMV/FMINV
1360    # Note: SimdFloatCmpOp can be a bit optimistic here
1361    fpAcrossOp = fpOp % "fplib%s<Element>(destElem, srcElem1, fpscr)"
1362    fmaxnmAcrossCode = fpAcrossOp % "MaxNum"
1363    twoRegAcrossInstX("fmaxnmv", "FmaxnmvQX", "SimdFloatCmpOp", ("uint32_t",),
1364                      4, fmaxnmAcrossCode)
1365    # FMAXP (scalar) -- fmaxCode is not defined in this section; it comes from earlier in the file
1366    twoRegPairwiseScInstX("fmaxp", "FmaxpScDX", "SimdFloatCmpOp",
1367                          ("uint32_t",), 2, fmaxCode)
1368    twoRegPairwiseScInstX("fmaxp", "FmaxpScQX", "SimdFloatCmpOp",
1369                          ("uint64_t",), 4, fmaxCode)
1370    # FMAXP (vector) -- same fmaxCode, emitted with pairwise=True
1371    threeEqualRegInstX("fmaxp", "FmaxpDX", "SimdFloatCmpOp", smallFloatTypes,
1372                       2, fmaxCode, pairwise=True)
1373    threeEqualRegInstX("fmaxp", "FmaxpQX", "SimdFloatCmpOp", floatTypes, 4,
1374                       fmaxCode, pairwise=True)
1375    # FMAXV -- fpAcrossOp filled in with "Max"; only a uint32_t Q-register form is generated
1376    # Note: SimdFloatCmpOp can be a bit optimistic here
1377    fmaxAcrossCode = fpAcrossOp % "Max"
1378    twoRegAcrossInstX("fmaxv", "FmaxvQX", "SimdFloatCmpOp", ("uint32_t",), 4,
1379                      fmaxAcrossCode)
1380    # FMIN -- fpBinOp filled in with "Min"; fminCode is reused by both FMINP forms below
1381    fminCode = fpBinOp % "Min"
1382    threeEqualRegInstX("fmin", "FminDX", "SimdFloatCmpOp", smallFloatTypes, 2,
1383                       fminCode)
1384    threeEqualRegInstX("fmin", "FminQX", "SimdFloatCmpOp", floatTypes, 4,
1385                       fminCode)
1386    # FMINNM -- fpBinOp filled in with "MinNum"; fminnmCode is reused by both FMINNMP forms
1387    fminnmCode = fpBinOp % "MinNum"
1388    threeEqualRegInstX("fminnm", "FminnmDX", "SimdFloatCmpOp", smallFloatTypes,
1389                       2, fminnmCode)
1390    threeEqualRegInstX("fminnm", "FminnmQX", "SimdFloatCmpOp", floatTypes, 4,
1391                       fminnmCode)
1392    # FMINNMP (scalar) -- same fminnmCode, emitted via twoRegPairwiseScInstX
1393    twoRegPairwiseScInstX("fminnmp", "FminnmpScDX", "SimdFloatCmpOp",
1394                          ("uint32_t",), 2, fminnmCode)
1395    twoRegPairwiseScInstX("fminnmp", "FminnmpScQX", "SimdFloatCmpOp",
1396                          ("uint64_t",), 4, fminnmCode)
1397    # FMINNMP (vector) -- same fminnmCode, emitted with pairwise=True
1398    threeEqualRegInstX("fminnmp", "FminnmpDX", "SimdFloatCmpOp",
1399                       smallFloatTypes, 2, fminnmCode, pairwise=True)
1400    threeEqualRegInstX("fminnmp", "FminnmpQX", "SimdFloatCmpOp", floatTypes, 4,
1401                       fminnmCode, pairwise=True)
1402    # FMINNMV -- fpAcrossOp (defined under FMAXNMV) filled in with "MinNum"
1403    # Note: SimdFloatCmpOp can be a bit optimistic here
1404    fminnmAcrossCode = fpAcrossOp % "MinNum"
1405    twoRegAcrossInstX("fminnmv", "FminnmvQX", "SimdFloatCmpOp", ("uint32_t",),
1406                      4, fminnmAcrossCode)
1407    # FMINP (scalar) -- reuses fminCode from FMIN
1408    twoRegPairwiseScInstX("fminp", "FminpScDX", "SimdFloatCmpOp",
1409                          ("uint32_t",), 2, fminCode)
1410    twoRegPairwiseScInstX("fminp", "FminpScQX", "SimdFloatCmpOp",
1411                          ("uint64_t",), 4, fminCode)
1412    # FMINP (vector) -- reuses fminCode, emitted with pairwise=True
1413    threeEqualRegInstX("fminp", "FminpDX", "SimdFloatCmpOp", smallFloatTypes,
1414                       2, fminCode, pairwise=True)
1415    threeEqualRegInstX("fminp", "FminpQX", "SimdFloatCmpOp", floatTypes, 4,
1416                       fminCode, pairwise=True)
1417    # FMINV -- fpAcrossOp filled in with "Min"; only a uint32_t Q-register form is generated
1418    # Note: SimdFloatCmpOp can be a bit optimistic here
1419    fminAcrossCode = fpAcrossOp % "Min"
1420    twoRegAcrossInstX("fminv", "FminvQX", "SimdFloatCmpOp", ("uint32_t",), 4,
1421                      fminAcrossCode)
1422    # FMLA (by element) -- fplibMulAdd(destElem, srcElem1, srcElem2); the positional True presumably requests a destination read (confirm in threeEqualRegInstX)
1423    fmlaCode = fpOp % ("fplibMulAdd<Element>("
1424                       "destElem, srcElem1, srcElem2, fpscr)")
1425    threeEqualRegInstX("fmla", "FmlaElemDX", "SimdFloatMultAccOp",
1426                       smallFloatTypes, 2, fmlaCode, True, byElem=True)
1427    threeEqualRegInstX("fmla", "FmlaElemQX", "SimdFloatMultAccOp", floatTypes,
1428                       4, fmlaCode, True, byElem=True)
1429    threeEqualRegInstX("fmla", "FmlaElemScX", "SimdFloatMultAccOp", floatTypes,
1430                       4, fmlaCode, True, byElem=True, scalar=True)
1431    # FMLA (vector) -- same fmlaCode without byElem
1432    threeEqualRegInstX("fmla", "FmlaDX", "SimdFloatMultAccOp", smallFloatTypes,
1433                       2, fmlaCode, True)
1434    threeEqualRegInstX("fmla", "FmlaQX", "SimdFloatMultAccOp", floatTypes, 4,
1435                       fmlaCode, True)
1436    # FMLS (by element) -- as FMLA, but srcElem1 is negated with fplibNeg before the fused multiply-add
1437    fmlsCode = fpOp % ("fplibMulAdd<Element>(destElem,"
1438                       " fplibNeg<Element>(srcElem1), srcElem2, fpscr)")
1439    threeEqualRegInstX("fmls", "FmlsElemDX", "SimdFloatMultAccOp",
1440                       smallFloatTypes, 2, fmlsCode, True, byElem=True)
1441    threeEqualRegInstX("fmls", "FmlsElemQX", "SimdFloatMultAccOp", floatTypes,
1442                       4, fmlsCode, True, byElem=True)
1443    threeEqualRegInstX("fmls", "FmlsElemScX", "SimdFloatMultAccOp", floatTypes,
1444                       4, fmlsCode, True, byElem=True, scalar=True)
1445    # FMLS (vector) -- same fmlsCode without byElem
1446    threeEqualRegInstX("fmls", "FmlsDX", "SimdFloatMultAccOp", smallFloatTypes,
1447                       2, fmlsCode, True)
1448    threeEqualRegInstX("fmls", "FmlsQX", "SimdFloatMultAccOp", floatTypes, 4,
1449                       fmlsCode, True)
1450    # FMOV -- generated through oneRegImmInstX; the element code is simply destElem = imm
1451    fmovCode = 'destElem = imm;'
1452    oneRegImmInstX("fmov", "FmovDX", "SimdMiscOp", smallFloatTypes, 2,
1453                   fmovCode)
1454    oneRegImmInstX("fmov", "FmovQX", "SimdMiscOp", floatTypes, 4, fmovCode)
1455    # FMUL (by element) -- fpBinOp filled in with "Mul"; D, Q and scalar by-element forms
1456    fmulCode = fpBinOp % "Mul"
1457    threeEqualRegInstX("fmul", "FmulElemDX", "SimdFloatMultOp",
1458                       smallFloatTypes, 2, fmulCode, byElem=True)
1459    threeEqualRegInstX("fmul", "FmulElemQX", "SimdFloatMultOp", floatTypes, 4,
1460                       fmulCode, byElem=True)
1461    threeEqualRegInstX("fmul", "FmulElemScX", "SimdFloatMultOp", floatTypes, 4,
1462                       fmulCode, byElem=True, scalar=True)
1463    # FMUL (vector) -- same fmulCode without byElem
1464    threeEqualRegInstX("fmul", "FmulDX", "SimdFloatMultOp", smallFloatTypes, 2,
1465                       fmulCode)
1466    threeEqualRegInstX("fmul", "FmulQX", "SimdFloatMultOp", floatTypes, 4,
1467                       fmulCode)
1468    # FMULX -- fpBinOp filled in with "MulX"; D, Q and scalar (scalar=True) forms
1469    fmulxCode = fpBinOp % "MulX"
1470    threeEqualRegInstX("fmulx", "FmulxDX", "SimdFloatMultOp", smallFloatTypes,
1471                       2, fmulxCode)
1472    threeEqualRegInstX("fmulx", "FmulxQX", "SimdFloatMultOp", floatTypes, 4,
1473                       fmulxCode)
1474    threeEqualRegInstX("fmulx", "FmulxScX", "SimdFloatMultOp", floatTypes, 4,
1475                       fmulxCode, scalar=True)
1476    # FMULX (by element) -- same fmulxCode, emitted with byElem=True
1477    threeEqualRegInstX("fmulx", "FmulxElemDX", "SimdFloatMultOp",
1478                       smallFloatTypes, 2, fmulxCode, byElem=True)
1479    threeEqualRegInstX("fmulx", "FmulxElemQX", "SimdFloatMultOp", floatTypes,
1480                       4, fmulxCode, byElem=True)
1481    threeEqualRegInstX("fmulx", "FmulxElemScX", "SimdFloatMultOp", floatTypes,
1482                       4, fmulxCode, byElem=True, scalar=True)
1483    # FNEG
1484    fnegCode = fpOp % "fplibNeg<Element>(srcElem1)"
1485    twoEqualRegInstX("Neg", "FnegDX", "SimdFloatAluOp", smallFloatTypes, 2,
1486                     fnegCode)
1487    twoEqualRegInstX("Neg", "FnegQX", "SimdFloatAluOp", floatTypes, 4,
1488                     fnegCode)
1489    # FRECPE -- fplibRecipEstimate(srcElem1, fpscr); D, Q and scalar forms
1490    frecpeCode = fpOp % "fplibRecipEstimate<Element>(srcElem1, fpscr)"
1491    twoEqualRegInstX("frecpe", "FrecpeDX", "SimdFloatMultAccOp",
1492                     smallFloatTypes, 2, frecpeCode)
1493    twoEqualRegInstX("frecpe", "FrecpeQX", "SimdFloatMultAccOp", floatTypes, 4,
1494                     frecpeCode)
1495    twoEqualRegInstX("frecpe", "FrecpeScX", "SimdFloatMultAccOp", floatTypes,
1496                     4, frecpeCode, scalar=True)
1497    # FRECPS -- fpBinOp filled in with "RecipStepFused"; D, Q and scalar forms
1498    frecpsCode = fpBinOp % "RecipStepFused"
1499    threeEqualRegInstX("frecps", "FrecpsDX", "SimdFloatMultAccOp",
1500                       smallFloatTypes, 2, frecpsCode)
1501    threeEqualRegInstX("frecps", "FrecpsQX", "SimdFloatMultAccOp", floatTypes,
1502                       4, frecpsCode)
1503    threeEqualRegInstX("frecps", "FrecpsScX", "SimdFloatMultAccOp", floatTypes,
1504                       4, frecpsCode, scalar=True)
1505    # FRECPX -- fplibRecpX(srcElem1, fpscr); only a scalar form is generated
1506    frecpxCode = fpOp % "fplibRecpX<Element>(srcElem1, fpscr)"
1507    twoEqualRegInstX("frecpx", "FrecpxX", "SimdFloatMultAccOp", floatTypes, 4,
1508                     frecpxCode, scalar=True)
1509    # FRINTA -- frintCode is the shared fplibRoundInt template; its two %s slots take a rounding mode and a boolean literal
1510    frintCode = fpOp % "fplibRoundInt<Element>(srcElem1, %s, %s, fpscr)"
1511    frintaCode = frintCode % ("FPRounding_TIEAWAY", "false")
1512    twoEqualRegInstX("frinta", "FrintaDX", "SimdCvtOp", smallFloatTypes, 2,
1513                     frintaCode)
1514    twoEqualRegInstX("frinta", "FrintaQX", "SimdCvtOp", floatTypes, 4,
1515                     frintaCode)
1516    # FRINTI -- rounding mode comes from FPCRRounding(fpscr); boolean slot is "false"
1517    frintiCode = frintCode % ("FPCRRounding(fpscr)", "false")
1518    twoEqualRegInstX("frinti", "FrintiDX", "SimdCvtOp", smallFloatTypes, 2,
1519                     frintiCode)
1520    twoEqualRegInstX("frinti", "FrintiQX", "SimdCvtOp", floatTypes, 4,
1521                     frintiCode)
1522    # FRINTM -- FPRounding_NEGINF
1523    frintmCode = frintCode % ("FPRounding_NEGINF", "false")
1524    twoEqualRegInstX("frintm", "FrintmDX", "SimdCvtOp", smallFloatTypes, 2,
1525                     frintmCode)
1526    twoEqualRegInstX("frintm", "FrintmQX", "SimdCvtOp", floatTypes, 4,
1527                     frintmCode)
1528    # FRINTN -- FPRounding_TIEEVEN
1529    frintnCode = frintCode % ("FPRounding_TIEEVEN", "false")
1530    twoEqualRegInstX("frintn", "FrintnDX", "SimdCvtOp", smallFloatTypes, 2,
1531                     frintnCode)
1532    twoEqualRegInstX("frintn", "FrintnQX", "SimdCvtOp", floatTypes, 4,
1533                     frintnCode)
1534    # FRINTP -- FPRounding_POSINF
1535    frintpCode = frintCode % ("FPRounding_POSINF", "false")
1536    twoEqualRegInstX("frintp", "FrintpDX", "SimdCvtOp", smallFloatTypes, 2,
1537                     frintpCode)
1538    twoEqualRegInstX("frintp", "FrintpQX", "SimdCvtOp", floatTypes, 4,
1539                     frintpCode)
1540    # FRINTX -- same rounding source as FRINTI, but the only variant passing "true" in the boolean slot
1541    frintxCode = frintCode % ("FPCRRounding(fpscr)", "true")
1542    twoEqualRegInstX("frintx", "FrintxDX", "SimdCvtOp", smallFloatTypes, 2,
1543                     frintxCode)
1544    twoEqualRegInstX("frintx", "FrintxQX", "SimdCvtOp", floatTypes, 4,
1545                     frintxCode)
1546    # FRINTZ -- FPRounding_ZERO
1547    frintzCode = frintCode % ("FPRounding_ZERO", "false")
1548    twoEqualRegInstX("frintz", "FrintzDX", "SimdCvtOp", smallFloatTypes, 2,
1549                     frintzCode)
1550    twoEqualRegInstX("frintz", "FrintzQX", "SimdCvtOp", floatTypes, 4,
1551                     frintzCode)
1552    # FRSQRTE -- fplibRSqrtEstimate(srcElem1, fpscr); D, Q and scalar forms
1553    frsqrteCode = fpOp % "fplibRSqrtEstimate<Element>(srcElem1, fpscr)"
1554    twoEqualRegInstX("frsqrte", "FrsqrteDX", "SimdFloatSqrtOp",
1555                     smallFloatTypes, 2, frsqrteCode)
1556    twoEqualRegInstX("frsqrte", "FrsqrteQX", "SimdFloatSqrtOp", floatTypes, 4,
1557                     frsqrteCode)
1558    twoEqualRegInstX("frsqrte", "FrsqrteScX", "SimdFloatSqrtOp", floatTypes, 4,
1559                     frsqrteCode, scalar=True)
1560    # FRSQRTS -- fpBinOp filled in with "RSqrtStepFused"; D, Q and scalar forms
1561    frsqrtsCode = fpBinOp % "RSqrtStepFused"
1562    threeEqualRegInstX("frsqrts", "FrsqrtsDX", "SimdFloatMiscOp",
1563                       smallFloatTypes, 2, frsqrtsCode)
1564    threeEqualRegInstX("frsqrts", "FrsqrtsQX", "SimdFloatMiscOp", floatTypes,
1565                       4, frsqrtsCode)
1566    threeEqualRegInstX("frsqrts", "FrsqrtsScX", "SimdFloatMiscOp", floatTypes,
1567                       4, frsqrtsCode, scalar=True)
1568    # FSQRT -- fplibSqrt(srcElem1, fpscr)
1569    fsqrtCode = fpOp % "fplibSqrt<Element>(srcElem1, fpscr)"
1570    twoEqualRegInstX("fsqrt", "FsqrtDX", "SimdFloatSqrtOp", smallFloatTypes, 2,
1571                     fsqrtCode)
1572    twoEqualRegInstX("fsqrt", "FsqrtQX", "SimdFloatSqrtOp", floatTypes, 4,
1573                     fsqrtCode)
1574    # FSUB -- fpBinOp filled in with "Sub"
1575    fsubCode = fpBinOp % "Sub"
1576    threeEqualRegInstX("fsub", "FsubDX", "SimdFloatAddOp", smallFloatTypes, 2,
1577                       fsubCode)
1578    threeEqualRegInstX("fsub", "FsubQX", "SimdFloatAddOp", floatTypes, 4,
1579                       fsubCode)
1580    # INS (element) -- no element code is passed; insFromVecElemInstX supplies the behaviour
1581    insFromVecElemInstX("ins", "InsElemX", "SimdMiscOp", unsignedTypes, 4)
1582    # INS (general register) -- one class per trailing tag: 'W' with smallUnsignedTypes, 'X' with unsignedTypes
1583    insFromGprInstX("ins", "InsGprWX", "SimdMiscOp", smallUnsignedTypes, 4,
1584                    'W')
1585    insFromGprInstX("ins", "InsGprXX", "SimdMiscOp", unsignedTypes, 4, 'X')
1586    # MLA (by element) -- destElem += srcElem1 * srcElem2; by-element forms list only 16- and 32-bit types
1587    mlaCode = "destElem += srcElem1 * srcElem2;"
1588    threeEqualRegInstX("mla", "MlaElemDX", "SimdMultAccOp",
1589                       ("uint16_t", "uint32_t"), 2, mlaCode, True, byElem=True)
1590    threeEqualRegInstX("mla", "MlaElemQX", "SimdMultAccOp",
1591                       ("uint16_t", "uint32_t"), 4, mlaCode, True, byElem=True)
1592    # MLA (vector) -- same mlaCode over smallUnsignedTypes; positional True presumably requests a destination read (confirm)
1593    threeEqualRegInstX("mla", "MlaDX", "SimdMultAccOp", smallUnsignedTypes, 2,
1594                       mlaCode, True)
1595    threeEqualRegInstX("mla", "MlaQX", "SimdMultAccOp", smallUnsignedTypes, 4,
1596                       mlaCode, True)
1597    # MLS (by element) -- destElem -= srcElem1 * srcElem2; same type lists as MLA (by element)
1598    mlsCode = "destElem -= srcElem1 * srcElem2;"
1599    threeEqualRegInstX("mls", "MlsElemDX", "SimdMultAccOp",
1600                       ("uint16_t", "uint32_t"), 2, mlsCode, True, byElem=True)
1601    threeEqualRegInstX("mls", "MlsElemQX", "SimdMultAccOp",
1602                       ("uint16_t", "uint32_t"), 4, mlsCode, True, byElem=True)
1603    # MLS (vector) -- same mlsCode over smallUnsignedTypes
1604    threeEqualRegInstX("mls", "MlsDX", "SimdMultAccOp", smallUnsignedTypes, 2,
1605                       mlsCode, True)
1606    threeEqualRegInstX("mls", "MlsQX", "SimdMultAccOp", smallUnsignedTypes, 4,
1607                       mlsCode, True)
1608    # MOV (element) -> alias to INS (element)
1609    # MOV (from general) -> alias to INS (general register)
1610    # MOV (scalar) -> alias to DUP (element)
1611    # MOV (to general) -> alias to UMOV
1612    # MOV (vector) -> alias to ORR (register)
1613    # MOVI -- destElem = imm, generated only for the uint64_t element type
1614    movImmCode = "destElem = imm;"
1615    oneRegImmInstX("movi", "MoviDX", "SimdMiscOp", ("uint64_t",), 2,
1616                   movImmCode)
1617    oneRegImmInstX("movi", "MoviQX", "SimdMiscOp", ("uint64_t",), 4,
1618                   movImmCode)
1619    # MUL (by element) -- destElem = srcElem1 * srcElem2; by-element forms list only 16- and 32-bit types
1620    mulCode = "destElem = srcElem1 * srcElem2;"
1621    threeEqualRegInstX("mul", "MulElemDX", "SimdMultOp",
1622                       ("uint16_t", "uint32_t"), 2, mulCode, byElem=True)
1623    threeEqualRegInstX("mul", "MulElemQX", "SimdMultOp",
1624                       ("uint16_t", "uint32_t"), 4, mulCode, byElem=True)
1625    # MUL (vector) -- same mulCode over smallUnsignedTypes
1626    threeEqualRegInstX("mul", "MulDX", "SimdMultOp", smallUnsignedTypes, 2,
1627                       mulCode)
1628    threeEqualRegInstX("mul", "MulQX", "SimdMultOp", smallUnsignedTypes, 4,
1629                       mulCode)
1630    # MVN -- bitwise complement of the source, generated only for uint64_t elements
1631    mvnCode = "destElem = ~srcElem1;"
1632    twoEqualRegInstX("mvn", "MvnDX", "SimdAluOp", ("uint64_t",), 2, mvnCode)
1633    twoEqualRegInstX("mvn", "MvnQX", "SimdAluOp", ("uint64_t",), 4, mvnCode)
1634    # MVNI -- bitwise complement of the immediate
1635    mvniCode = "destElem = ~imm;"
1636    oneRegImmInstX("mvni", "MvniDX", "SimdAluOp", ("uint64_t",), 2, mvniCode)
1637    oneRegImmInstX("mvni", "MvniQX", "SimdAluOp", ("uint64_t",), 4, mvniCode)
1638    # NEG -- integer negate over signedTypes (FNEG, earlier in this file, is the floating-point form)
1639    negCode = "destElem = -srcElem1;"
1640    twoEqualRegInstX("neg", "NegDX", "SimdAluOp", signedTypes, 2, negCode)
1641    twoEqualRegInstX("neg", "NegQX", "SimdAluOp", signedTypes, 4, negCode)
1642    # NOT -> alias to MVN
1643    # ORN -- OR of srcElem1 with the complement of srcElem2
1644    ornCode = "destElem = srcElem1 | ~srcElem2;"
1645    threeEqualRegInstX("orn", "OrnDX", "SimdAluOp", ("uint64_t",), 2, ornCode)
1646    threeEqualRegInstX("orn", "OrnQX", "SimdAluOp", ("uint64_t",), 4, ornCode)
1647    # ORR (immediate) -- ORs imm into the existing destElem, hence the extra positional True (presumably a destination read; confirm)
1648    orrImmCode = "destElem |= imm;"
1649    oneRegImmInstX("orr", "OrrImmDX", "SimdAluOp", ("uint64_t",), 2,
1650                   orrImmCode, True)
1651    oneRegImmInstX("orr", "OrrImmQX", "SimdAluOp", ("uint64_t",), 4,
1652                   orrImmCode, True)
1653    # ORR (register) -- plain OR of the two sources
1654    orrCode = "destElem = srcElem1 | srcElem2;"
1655    threeEqualRegInstX("orr", "OrrDX", "SimdAluOp", ("uint64_t",), 2, orrCode)
1656    threeEqualRegInstX("orr", "OrrQX", "SimdAluOp", ("uint64_t",), 4, orrCode)
1657    # PMUL -- polynomial multiply: XORs (srcElem1 << j) into destElem for every set bit j of srcElem2; uint8_t only
1658    pmulCode = '''
1659            destElem = 0;
1660            for (unsigned j = 0; j < sizeof(Element) * 8; j++) {
1661                if (bits(srcElem2, j))
1662                    destElem ^= srcElem1 << j;
1663            }
1664    '''
1665    threeEqualRegInstX("pmul", "PmulDX", "SimdMultOp", ("uint8_t",), 2,
1666                       pmulCode)
1667    threeEqualRegInstX("pmul", "PmulQX", "SimdMultOp", ("uint8_t",), 4,
1668                       pmulCode)
1669    # PMULL, PMULL2
1670    # Note: 64-bit PMULL is not available (Crypto. Extension)
1671    pmullCode = '''
1672            destElem = 0;
1673            for (unsigned j = 0; j < sizeof(Element) * 8; j++) {
1674                if (bits(srcElem2, j))
1675                    destElem ^= (BigElement)srcElem1 << j;
1676            }
1677    '''
1678    threeRegLongInstX("pmull", "PmullX", "SimdMultOp", ("uint8_t",), pmullCode)
1679    threeRegLongInstX("pmull", "Pmull2X", "SimdMultOp", ("uint8_t",),
1680                      pmullCode, hi=True)
1681    # RADDHN, RADDHN2 -- sum in BigElement, add 1 << (Element bits - 1) for rounding, then keep the part above the Element width
1682    raddhnCode = '''
1683            destElem = ((BigElement)srcElem1 + (BigElement)srcElem2 +
1684                        ((BigElement)1 << (sizeof(Element) * 8 - 1))) >>
1685                       (sizeof(Element) * 8);
1686    '''
1687    threeRegNarrowInstX("raddhn", "RaddhnX", "SimdAddOp", smallUnsignedTypes,
1688                        raddhnCode)
1689    threeRegNarrowInstX("raddhn2", "Raddhn2X", "SimdAddOp", smallUnsignedTypes,
1690                        raddhnCode, hi=True)
1691    # RBIT -- reverses the bit order of each element: the bit shifted out of temp on step i lands at position (width - 1 - i); uint8_t only
1692    rbitCode = '''
1693            destElem = 0;
1694            Element temp = srcElem1;
1695            for (int i = 0; i < 8 * sizeof(Element); i++) {
1696                destElem = destElem  | ((temp & 0x1) <<
1697                                        (8 * sizeof(Element) - 1 - i));
1698                temp >>= 1;
1699            }
1700    '''
1701    twoEqualRegInstX("rbit", "RbitDX", "SimdAluOp", ("uint8_t",), 2, rbitCode)
1702    twoEqualRegInstX("rbit", "RbitQX", "SimdAluOp", ("uint8_t",), 4, rbitCode)
1703    # REV16 -- 2-byte groups: groupSize = 2 / sizeof(Element); i and j are not declared here and presumably come from the enclosing template (confirm)
1704    rev16Code = '''
1705            destElem = srcElem1;
1706            unsigned groupSize = ((1 << 1) / sizeof(Element));
1707            unsigned reverseMask = (groupSize - 1);
1708            j = i ^ reverseMask;
1709    '''
1710    twoEqualRegInstX("rev16", "Rev16DX", "SimdAluOp", ("uint8_t",), 2,
1711                     rev16Code)
1712    twoEqualRegInstX("rev16", "Rev16QX", "SimdAluOp", ("uint8_t",), 4,
1713                     rev16Code)
1714    # REV32 -- same scheme with 4-byte groups ((1 << 2) bytes); 8- and 16-bit elements
1715    rev32Code = '''
1716            destElem = srcElem1;
1717            unsigned groupSize = ((1 << 2) / sizeof(Element));
1718            unsigned reverseMask = (groupSize - 1);
1719            j = i ^ reverseMask;
1720    '''
1721    twoEqualRegInstX("rev32", "Rev32DX", "SimdAluOp", ("uint8_t", "uint16_t"),
1722                     2, rev32Code)
1723    twoEqualRegInstX("rev32", "Rev32QX", "SimdAluOp", ("uint8_t", "uint16_t"),
1724                     4, rev32Code)
1725    # REV64 -- same scheme with 8-byte groups ((1 << 3) bytes); all of smallUnsignedTypes
1726    rev64Code = '''
1727            destElem = srcElem1;
1728            unsigned groupSize = ((1 << 3) / sizeof(Element));
1729            unsigned reverseMask = (groupSize - 1);
1730            j = i ^ reverseMask;
1731    '''
1732    twoEqualRegInstX("rev64", "Rev64DX", "SimdAluOp", smallUnsignedTypes, 2,
1733                     rev64Code)
1734    twoEqualRegInstX("rev64", "Rev64QX", "SimdAluOp", smallUnsignedTypes, 4,
1735                     rev64Code)
1736    # RSHRN, RSHRN2 -- rounding right shift: adds bit (imm - 1) of the source to the shifted value; the shift is split as (imm - 1) then 1; note the test is '>' here but '>=' in SHRN
1737    rshrnCode = '''
1738            if (imm > sizeof(srcElem1) * 8) {
1739                destElem = 0;
1740            } else if (imm) {
1741                Element rBit = bits(srcElem1, imm - 1);
1742                destElem = ((srcElem1 >> (imm - 1)) >> 1) + rBit;
1743            } else {
1744                destElem = srcElem1;
1745            }
1746    '''
1747    twoRegNarrowInstX("rshrn", "RshrnX", "SimdShiftOp", smallUnsignedTypes,
1748                      rshrnCode, hasImm=True)
1749    twoRegNarrowInstX("rshrn2", "Rshrn2X", "SimdShiftOp", smallUnsignedTypes,
1750                      rshrnCode, hasImm=True, hi=True)
1751    # RSUBHN, RSUBHN2 -- as RADDHN with a subtraction; NOTE(review): uses smallTypes where RADDHN uses smallUnsignedTypes -- confirm intended
1752    rsubhnCode = '''
1753            destElem = ((BigElement)srcElem1 - (BigElement)srcElem2 +
1754                        ((BigElement)1 << (sizeof(Element) * 8 - 1))) >>
1755                       (sizeof(Element) * 8);
1756    '''
1757    threeRegNarrowInstX("rsubhn", "RsubhnX", "SimdAddOp", smallTypes,
1758                        rsubhnCode)
1759    threeRegNarrowInstX("rsubhn2", "Rsubhn2X", "SimdAddOp", smallTypes,
1760                        rsubhnCode, hi=True)
1761    # SABA -- accumulates |srcElem1 - srcElem2| into destElem (larger minus smaller); called with the positional True used by accumulating ops
1762    abaCode = '''
1763            destElem += (srcElem1 > srcElem2) ? (srcElem1 - srcElem2) :
1764                                                (srcElem2 - srcElem1);
1765    '''
1766    threeEqualRegInstX("saba", "SabaDX", "SimdAddAccOp", smallSignedTypes, 2,
1767                       abaCode, True)
1768    threeEqualRegInstX("saba", "SabaQX", "SimdAddAccOp", smallSignedTypes, 4,
1769                       abaCode, True)
1770    # SABAL, SABAL2 -- as SABA, but both operands are widened to BigElement before subtracting
1771    abalCode = '''
1772            destElem += (srcElem1 > srcElem2) ?
1773                ((BigElement)srcElem1 - (BigElement)srcElem2) :
1774                ((BigElement)srcElem2 - (BigElement)srcElem1);
1775    '''
1776    threeRegLongInstX("sabal", "SabalX", "SimdAddAccOp", smallSignedTypes,
1777                      abalCode, True)
1778    threeRegLongInstX("sabal2", "Sabal2X", "SimdAddAccOp", smallSignedTypes,
1779                      abalCode, True, hi=True)
1780    # SABD -- absolute difference written with '=' (no accumulation), so no positional True is passed
1781    abdCode = '''
1782            destElem = (srcElem1 > srcElem2) ? (srcElem1 - srcElem2) :
1783                                               (srcElem2 - srcElem1);
1784    '''
1785    threeEqualRegInstX("sabd", "SabdDX", "SimdAddOp", smallSignedTypes, 2,
1786                       abdCode)
1787    threeEqualRegInstX("sabd", "SabdQX", "SimdAddOp", smallSignedTypes, 4,
1788                       abdCode)
1789    # SABDL, SABDL2
1790    abdlCode = '''
1791            destElem = (srcElem1 > srcElem2) ?
1792                ((BigElement)srcElem1 - (BigElement)srcElem2) :
1793                ((BigElement)srcElem2 - (BigElement)srcElem1);
1794    '''
1795    threeRegLongInstX("sabdl", "SabdlX", "SimdAddAccOp", smallSignedTypes,
1796                      abdlCode, True)
1797    threeRegLongInstX("sabdl2", "Sabdl2X", "SimdAddAccOp", smallSignedTypes,
1798                      abdlCode, True, hi=True)
1799    # SADALP -- accumulates the widened sum of srcElem1 and srcElem2 into destElem (positional True passed)
1800    adalpCode = "destElem += (BigElement)srcElem1 + (BigElement)srcElem2;"
1801    twoRegCondenseInstX("sadalp", "SadalpDX", "SimdAddOp", smallSignedTypes, 2,
1802                        adalpCode, True)
1803    twoRegCondenseInstX("sadalp", "SadalpQX", "SimdAddOp", smallSignedTypes, 4,
1804                        adalpCode, True)
1805    # SADDL, SADDL2 -- widened sum; addlwCode is shared with SADDLP and SADDW below
1806    addlwCode = "destElem = (BigElement)srcElem1 + (BigElement)srcElem2;"
1807    threeRegLongInstX("saddl", "SaddlX", "SimdAddAccOp", smallSignedTypes,
1808                      addlwCode)
1809    threeRegLongInstX("saddl2", "Saddl2X", "SimdAddAccOp", smallSignedTypes,
1810                      addlwCode, hi=True)
1811    # SADDLP -- same addlwCode, emitted via twoRegCondenseInstX
1812    twoRegCondenseInstX("saddlp", "SaddlpDX", "SimdAddOp", smallSignedTypes, 2,
1813                        addlwCode)
1814    twoRegCondenseInstX("saddlp", "SaddlpQX", "SimdAddOp", smallSignedTypes, 4,
1815                        addlwCode)
1816    # SADDLV -- adds each widened source element into destElem; the int32_t Q form additionally passes doubleDest=True
1817    # Note: SimdAddOp can be a bit optimistic here
1818    addAcrossLongCode = "destElem += (BigElement)srcElem1;"
1819    twoRegAcrossInstX("saddlv", "SaddlvDX", "SimdAddOp", ("int8_t", "int16_t"),
1820                      2, addAcrossLongCode, long=True)
1821    twoRegAcrossInstX("saddlv", "SaddlvQX", "SimdAddOp", ("int8_t", "int16_t"),
1822                      4, addAcrossLongCode, long=True)
1823    twoRegAcrossInstX("saddlv", "SaddlvBQX", "SimdAddOp", ("int32_t",), 4,
1824                      addAcrossLongCode, doubleDest=True, long=True)
1825    # SADDW, SADDW2 -- same addlwCode, emitted via threeRegWideInstX
1826    threeRegWideInstX("saddw", "SaddwX", "SimdAddAccOp", smallSignedTypes,
1827                      addlwCode)
1828    threeRegWideInstX("saddw2", "Saddw2X", "SimdAddAccOp", smallSignedTypes,
1829                      addlwCode, hi=True)
1830    # SCVTF (fixed-point) -- fplibFixedToFP on srcElem1 cast to int%d_t, with imm as the second argument; %d is filled with 32 or 64 per variant
1831    scvtfFixedCode = fpOp % ("fplibFixedToFP<Element>((int%d_t) srcElem1, imm,"
1832                             " false, FPCRRounding(fpscr), fpscr)")
1833    twoEqualRegInstX("scvtf", "ScvtfFixedDX", "SimdCvtOp", smallFloatTypes, 2,
1834                     scvtfFixedCode % 32, hasImm=True)
1835    twoEqualRegInstX("scvtf", "ScvtfFixedSQX", "SimdCvtOp", smallFloatTypes, 4,
1836                     scvtfFixedCode % 32, hasImm=True)
1837    twoEqualRegInstX("scvtf", "ScvtfFixedDQX", "SimdCvtOp", ("uint64_t",), 4,
1838                     scvtfFixedCode % 64, hasImm=True)
1839    twoEqualRegInstX("scvtf", "ScvtfFixedScSX", "SimdCvtOp", smallFloatTypes,
1840                     4, scvtfFixedCode % 32, hasImm=True, scalar=True)
1841    twoEqualRegInstX("scvtf", "ScvtfFixedScDX", "SimdCvtOp", ("uint64_t",), 4,
1842                     scvtfFixedCode % 64, hasImm=True, scalar=True)
1843    # SCVTF (integer) -- same call with a literal 0 in place of imm, and no hasImm
1844    scvtfIntCode = fpOp % ("fplibFixedToFP<Element>((int%d_t) srcElem1, 0,"
1845                           " false, FPCRRounding(fpscr), fpscr)")
1846    twoEqualRegInstX("scvtf", "ScvtfIntDX", "SimdCvtOp", smallFloatTypes, 2,
1847                     scvtfIntCode % 32)
1848    twoEqualRegInstX("scvtf", "ScvtfIntSQX", "SimdCvtOp", smallFloatTypes, 4,
1849                     scvtfIntCode % 32)
1850    twoEqualRegInstX("scvtf", "ScvtfIntDQX", "SimdCvtOp", ("uint64_t",), 4,
1851                     scvtfIntCode % 64)
1852    twoEqualRegInstX("scvtf", "ScvtfIntScSX", "SimdCvtOp", smallFloatTypes, 4,
1853                     scvtfIntCode % 32, scalar=True)
1854    twoEqualRegInstX("scvtf", "ScvtfIntScDX", "SimdCvtOp", ("uint64_t",), 4,
1855                     scvtfIntCode % 64, scalar=True)
1856    # SHADD -- halving add: each operand is halved after clearing its low bit, and the carry of the two low bits is added back
1857    haddCode = '''
1858            Element carryBit =
1859                (((unsigned)srcElem1 & 0x1) +
1860                 ((unsigned)srcElem2 & 0x1)) >> 1;
1861            // Use division instead of a shift to ensure the sign extension works
1862            // right. The compiler will figure out if it can be a shift. Mask the
1863            // inputs so they get truncated correctly.
1864            destElem = (((srcElem1 & ~(Element)1) / 2) +
1865                        ((srcElem2 & ~(Element)1) / 2)) + carryBit;
1866    '''
1867    threeEqualRegInstX("shadd", "ShaddDX", "SimdAddOp", smallSignedTypes, 2,
1868                       haddCode)
1869    threeEqualRegInstX("shadd", "ShaddQX", "SimdAddOp", smallSignedTypes, 4,
1870                       haddCode)
1871    # SHL -- left shift by imm; when imm >= the element width the shift is done as (width - 1) then 1 instead of one full-width shift
1872    shlCode = '''
1873            if (imm >= sizeof(Element) * 8)
1874                destElem = (srcElem1 << (sizeof(Element) * 8 - 1)) << 1;
1875            else
1876                destElem = srcElem1 << imm;
1877    '''
1878    twoEqualRegInstX("shl", "ShlDX", "SimdShiftOp", unsignedTypes, 2, shlCode,
1879                     hasImm=True)
1880    twoEqualRegInstX("shl", "ShlQX", "SimdShiftOp", unsignedTypes, 4, shlCode,
1881                     hasImm=True)
1882    # SHLL, SHLL2
1883    shllCode = "destElem = ((BigElement)srcElem1) << (sizeof(Element) * 8);"
1884    twoRegLongInstX("shll", "ShllX", "SimdShiftOp", smallTypes, shllCode)
1885    twoRegLongInstX("shll", "Shll2X", "SimdShiftOp", smallTypes, shllCode,
1886                    hi=True)
1887    # SHRN, SHRN2 -- right shift by imm with no rounding; yields 0 once imm reaches the width of srcElem1
1888    shrnCode = '''
1889            if (imm >= sizeof(srcElem1) * 8) {
1890                destElem = 0;
1891            } else {
1892                destElem = srcElem1 >> imm;
1893            }
1894    '''
1895    twoRegNarrowInstX("shrn", "ShrnX", "SimdShiftOp", smallUnsignedTypes,
1896                      shrnCode, hasImm=True)
1897    twoRegNarrowInstX("shrn2", "Shrn2X", "SimdShiftOp", smallUnsignedTypes,
1898                      shrnCode, hasImm=True, hi=True)
1899    # SHSUB -- halving subtract: mirrors SHADD, subtracting a borrow derived from the two low bits
1900    hsubCode = '''
1901            Element borrowBit =
1902                (((srcElem1 & 0x1) - (srcElem2 & 0x1)) >> 1) & 0x1;
1903            // Use division instead of a shift to ensure the sign extension works
1904            // right. The compiler will figure out if it can be a shift. Mask the
1905            // inputs so they get truncated correctly.
1906            destElem = (((srcElem1 & ~(Element)1) / 2) -
1907                        ((srcElem2 & ~(Element)1) / 2)) - borrowBit;
1908    '''
1909    threeEqualRegInstX("shsub", "ShsubDX", "SimdAddOp", smallSignedTypes, 2,
1910                       hsubCode)
1911    threeEqualRegInstX("shsub", "ShsubQX", "SimdAddOp", smallSignedTypes, 4,
1912                       hsubCode)
1913    # SLI -- shift-left-insert: ORs (srcElem1 << imm) with destElem & mask(imm); destElem is left unchanged when imm >= element width
1914    sliCode = '''
1915            if (imm >= sizeof(Element) * 8)
1916                destElem = destElem;
1917            else
1918                destElem = (srcElem1 << imm) | (destElem & mask(imm));
1919    '''
1920    twoEqualRegInstX("sli", "SliDX", "SimdShiftOp", unsignedTypes, 2, sliCode,
1921                     True, hasImm=True)
1922    twoEqualRegInstX("sli", "SliQX", "SimdShiftOp", unsignedTypes, 4, sliCode,
1923                     True, hasImm=True)
1924    # SMAX -- larger of the two source elements; maxCode is reused by SMAXP
1925    maxCode = "destElem = (srcElem1 > srcElem2) ? srcElem1 : srcElem2;"
1926    threeEqualRegInstX("smax", "SmaxDX", "SimdCmpOp", smallSignedTypes, 2,
1927                       maxCode)
1928    threeEqualRegInstX("smax", "SmaxQX", "SimdCmpOp", smallSignedTypes, 4,
1929                       maxCode)
1930    # SMAXP -- same maxCode, emitted with pairwise=True
1931    threeEqualRegInstX("smaxp", "SmaxpDX", "SimdCmpOp", smallSignedTypes, 2,
1932                       maxCode, pairwise=True)
1933    threeEqualRegInstX("smaxp", "SmaxpQX", "SimdCmpOp", smallSignedTypes, 4,
1934                       maxCode, pairwise=True)
1935    # SMAXV -- running maximum seeded when i == 0 (i presumably is the template's element index; confirm); the D form omits int32_t
1936    maxAcrossCode = '''
1937            if (i == 0 || srcElem1 > destElem)
1938                destElem = srcElem1;
1939    '''
1940    twoRegAcrossInstX("smaxv", "SmaxvDX", "SimdCmpOp", ("int8_t", "int16_t"),
1941                      2, maxAcrossCode)
1942    twoRegAcrossInstX("smaxv", "SmaxvQX", "SimdCmpOp", smallSignedTypes, 4,
1943                      maxAcrossCode)
1944    # SMIN -- smaller of the two source elements; minCode is reused by SMINP
1945    minCode = "destElem = (srcElem1 < srcElem2) ? srcElem1 : srcElem2;"
1946    threeEqualRegInstX("smin", "SminDX", "SimdCmpOp", smallSignedTypes, 2,
1947                       minCode)
1948    threeEqualRegInstX("smin", "SminQX", "SimdCmpOp", smallSignedTypes, 4,
1949                       minCode)
1950    # SMINP -- same minCode, emitted with pairwise=True
1951    threeEqualRegInstX("sminp", "SminpDX", "SimdCmpOp", smallSignedTypes, 2,
1952                       minCode, pairwise=True)
1953    threeEqualRegInstX("sminp", "SminpQX", "SimdCmpOp", smallSignedTypes, 4,
1954                       minCode, pairwise=True)
1955    # SMINV -- running minimum seeded when i == 0, mirroring SMAXV; the D form omits int32_t
1956    minAcrossCode = '''
1957            if (i == 0 || srcElem1 < destElem)
1958                destElem = srcElem1;
1959    '''
1960    twoRegAcrossInstX("sminv", "SminvDX", "SimdCmpOp", ("int8_t", "int16_t"),
1961                      2, minAcrossCode)
1962    twoRegAcrossInstX("sminv", "SminvQX", "SimdCmpOp", smallSignedTypes, 4,
1963                      minAcrossCode)
1964
1965    split('exec')  # NOTE(review): presumably starts a new chunk of the generated 'exec' output -- confirm against split()'s definition
1966
1967    # SMLAL, SMLAL2 (by element)
1968    mlalCode = "destElem += (BigElement)srcElem1 * (BigElement)srcElem2;"
1969    threeRegLongInstX("smlal", "SmlalElemX", "SimdMultAccOp",
1970                      ("int16_t", "int32_t"), mlalCode, True, byElem=True)
1971    threeRegLongInstX("smlal", "SmlalElem2X", "SimdMultAccOp",
1972                      ("int16_t", "int32_t"), mlalCode, True, byElem=True,
1973                      hi=True)
1974    # SMLAL, SMLAL2 (vector)
1975    threeRegLongInstX("smlal", "SmlalX", "SimdMultAccOp", smallSignedTypes,
1976                      mlalCode, True)
1977    threeRegLongInstX("smlal", "Smlal2X", "SimdMultAccOp", smallSignedTypes,
1978                      mlalCode, True, hi=True)
1979    # SMLSL, SMLSL2 (by element)
        # Signed multiply-subtract long: both sources are cast to BigElement
        # before multiplying and the product is subtracted from destElem.
        # NOTE(review): the by-element variants here use smallSignedTypes,
        # whereas the SMLAL by-element variants use ("int16_t", "int32_t") --
        # confirm whether the difference is intentional.
1980    mlslCode = "destElem -= (BigElement)srcElem1 * (BigElement)srcElem2;"
1981    threeRegLongInstX("smlsl", "SmlslElemX", "SimdMultAccOp", smallSignedTypes,
1982                      mlslCode, True, byElem=True)
1983    threeRegLongInstX("smlsl", "SmlslElem2X", "SimdMultAccOp",
1984                      smallSignedTypes, mlslCode, True, byElem=True, hi=True)
1985    # SMLSL, SMLSL2 (vector)
        # Same snippet, without byElem; hi=True is used for the "2" class.
1986    threeRegLongInstX("smlsl", "SmlslX", "SimdMultAccOp", smallSignedTypes,
1987                      mlslCode, True)
1988    threeRegLongInstX("smlsl", "Smlsl2X", "SimdMultAccOp", smallSignedTypes,
1989                      mlslCode, True, hi=True)
1990    # SMOV
        # Two variants are generated: the 'W' one for int8_t/int16_t sources
        # and the 'X' one for the smallSignedTypes list.
        # NOTE(review): 'W'/'X' presumably name the destination general-purpose
        # register width and the trailing True presumably requests sign
        # extension -- confirm against the insToGprInstX signature.
1991    insToGprInstX("smov", "SmovWX", "SimdMiscOp", ("int8_t", "int16_t"), 4,
1992                  'W', True)
1993    insToGprInstX("smov", "SmovXX", "SimdMiscOp", smallSignedTypes, 4, 'X',
1994                  True)
1995    # SMULL, SMULL2 (by element)
        # Signed multiply long: both sources are cast to BigElement before
        # multiplying and the product overwrites destElem (no accumulation, so
        # no positional True is passed after the snippet).
1996    mullCode = "destElem = (BigElement)srcElem1 * (BigElement)srcElem2;"
1997    threeRegLongInstX("smull", "SmullElemX", "SimdMultOp", smallSignedTypes,
1998                      mullCode, byElem=True)
1999    threeRegLongInstX("smull", "SmullElem2X", "SimdMultOp", smallSignedTypes,
2000                      mullCode, byElem=True, hi=True)
2001    # SMULL, SMULL2 (vector)
        # Same snippet, without byElem; hi=True is used for the "2" class.
2002    threeRegLongInstX("smull", "SmullX", "SimdMultOp", smallSignedTypes,
2003                      mullCode)
2004    threeRegLongInstX("smull", "Smull2X", "SimdMultOp", smallSignedTypes,
2005                      mullCode, hi=True)
2006    # SQABS
        # Signed saturating absolute value. When the source is the most
        # negative Element value the snippet sets fpscr.qc and writes the
        # bitwise complement of the source (the most positive value);
        # otherwise it negates negative inputs and copies non-negative ones.
        # The FpscrQc operand is read at the start and written back at the end.
2007    sqabsCode = '''
2008        FPSCR fpscr = (FPSCR) FpscrQc;
2009        if (srcElem1 == (Element)(std::numeric_limits<Element>::min())) {
2010            fpscr.qc = 1;
2011            destElem = ~srcElem1;
2012        } else if (srcElem1 < 0) {
2013            destElem = -srcElem1;
2014        } else {
2015            destElem = srcElem1;
2016        }
2017        FpscrQc = fpscr;
2018    '''
        # The 2-register variant uses smallSignedTypes; the 4-register and
        # scalar variants use signedTypes.
2019    twoEqualRegInstX("sqabs", "SqabsDX", "SimdAluOp", smallSignedTypes, 2,
2020                     sqabsCode)
2021    twoEqualRegInstX("sqabs", "SqabsQX", "SimdAluOp", signedTypes, 4,
2022                     sqabsCode)
2023    twoEqualRegInstX("sqabs", "SqabsScX", "SimdAluOp", signedTypes, 4,
2024                     sqabsCode, scalar=True)
2025    # SQADD
        # Signed saturating add. The snippet adds first, then treats the result
        # as overflowed when both sources share a sign and the result's sign
        # differs. On overflow it sets fpscr.qc and writes the minimum Element
        # value, subtracting 1 when the wrapped result was negative (i.e. both
        # sources were non-negative), which is intended to yield the maximum.
2026    sqaddCode = '''
2027            destElem = srcElem1 + srcElem2;
2028            FPSCR fpscr = (FPSCR) FpscrQc;
2029            bool negDest = (destElem < 0);
2030            bool negSrc1 = (srcElem1 < 0);
2031            bool negSrc2 = (srcElem2 < 0);
2032            if ((negDest != negSrc1) && (negSrc1 == negSrc2)) {
2033                destElem = std::numeric_limits<Element>::min();
2034                if (negDest)
2035                    destElem -= 1;
2036                fpscr.qc = 1;
2037            }
2038            FpscrQc = fpscr;
2039    '''
        # The 2-register variant uses smallSignedTypes; the 4-register and
        # scalar variants use signedTypes.
2040    threeEqualRegInstX("sqadd", "SqaddDX", "SimdAddOp", smallSignedTypes, 2,
2041                       sqaddCode)
2042    threeEqualRegInstX("sqadd", "SqaddQX", "SimdAddOp", signedTypes, 4,
2043                       sqaddCode)
2044    threeEqualRegInstX("sqadd", "SqaddScX", "SimdAddOp", signedTypes, 4,
2045                       sqaddCode, scalar=True)
2046    # SQDMLAL, SQDMLAL2 (by element)
2047    qdmlalCode = '''
2048        FPSCR fpscr = (FPSCR) FpscrQc;
2049        BigElement midElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
2050        Element maxNeg = std::numeric_limits<Element>::min();
2051        Element halfNeg = maxNeg / 2;
2052        if ((srcElem1 == maxNeg && srcElem2 == maxNeg) ||
2053            (srcElem1 == halfNeg && srcElem2 == maxNeg) ||
2054            (srcElem1 == maxNeg && srcElem2 == halfNeg)) {
2055            midElem = ~((BigElement)maxNeg << (sizeof(Element) * 8));
2056            fpscr.qc = 1;
2057        }
2058        bool negPreDest = ltz(destElem);
2059        destElem += midElem;
2060        bool negDest = ltz(destElem);
2061        bool negMid = ltz(midElem);
2062        if (negPreDest == negMid && negMid != negDest) {
2063            destElem = mask(sizeof(BigElement) * 8 - 1);
2064            if (negPreDest)
2065                destElem = ~destElem;
2066            fpscr.qc = 1;
2067        }
2068        FpscrQc = fpscr;
2069    '''
2070    threeRegLongInstX("sqdmlal", "SqdmlalElemX", "SimdMultAccOp",
2071                      ("int16_t", "int32_t"), qdmlalCode, True, byElem=True)
2072    threeRegLongInstX("sqdmlal", "SqdmlalElem2X", "SimdMultAccOp",
2073                      ("int16_t", "int32_t"), qdmlalCode, True, byElem=True,
2074                      hi=True)
2075    threeRegLongInstX("sqdmlal", "SqdmlalElemScX", "SimdMultAccOp",
2076                      ("int16_t", "int32_t"), qdmlalCode, True, byElem=True,
2077                      scalar=True)
2078    # SQDMLAL, SQDMLAL2 (vector)
2079    threeRegLongInstX("sqdmlal", "SqdmlalX", "SimdMultAccOp",
2080                      ("int16_t", "int32_t"), qdmlalCode, True)
2081    threeRegLongInstX("sqdmlal", "Sqdmlal2X", "SimdMultAccOp",
2082                      ("int16_t", "int32_t"), qdmlalCode, True, hi=True)
2083    threeRegLongInstX("sqdmlal", "SqdmlalScX", "SimdMultAccOp",
2084                      ("int16_t", "int32_t"), qdmlalCode, True, scalar=True)
2085    # SQDMLSL, SQDMLSL2 (by element)
2086    qdmlslCode = '''
2087        FPSCR fpscr = (FPSCR) FpscrQc;
2088        BigElement midElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
2089        Element maxNeg = std::numeric_limits<Element>::min();
2090        Element halfNeg = maxNeg / 2;
2091        if ((srcElem1 == maxNeg && srcElem2 == maxNeg) ||
2092            (srcElem1 == halfNeg && srcElem2 == maxNeg) ||
2093            (srcElem1 == maxNeg && srcElem2 == halfNeg)) {
2094            midElem = ~((BigElement)maxNeg << (sizeof(Element) * 8));
2095            fpscr.qc = 1;
2096        }
2097        bool negPreDest = ltz(destElem);
2098        destElem -= midElem;
2099        bool negDest = ltz(destElem);
2100        bool posMid = ltz((BigElement)-midElem);
2101        if (negPreDest == posMid && posMid != negDest) {
2102            destElem = mask(sizeof(BigElement) * 8 - 1);
2103            if (negPreDest)
2104                destElem = ~destElem;
2105            fpscr.qc = 1;
2106        }
2107        FpscrQc = fpscr;
2108    '''
2109    threeRegLongInstX("sqdmlsl", "SqdmlslElemX", "SimdMultAccOp",
2110                      ("int16_t", "int32_t"), qdmlslCode, True, byElem=True)
2111    threeRegLongInstX("sqdmlsl", "SqdmlslElem2X", "SimdMultAccOp",
2112                      ("int16_t", "int32_t"), qdmlslCode, True, byElem=True,
2113                      hi=True)
2114    threeRegLongInstX("sqdmlsl", "SqdmlslElemScX", "SimdMultAccOp",
2115                      ("int16_t", "int32_t"), qdmlslCode, True, byElem=True,
2116                      scalar=True)
2117    # SQDMLSL, SQDMLSL2 (vector)
2118    threeRegLongInstX("sqdmlsl", "SqdmlslX", "SimdMultAccOp",
2119                      ("int16_t", "int32_t"), qdmlslCode, True)
2120    threeRegLongInstX("sqdmlsl", "Sqdmlsl2X", "SimdMultAccOp",
2121                      ("int16_t", "int32_t"), qdmlslCode, True, hi=True)
2122    threeRegLongInstX("sqdmlsl", "SqdmlslScX", "SimdMultAccOp",
2123                      ("int16_t", "int32_t"), qdmlslCode, True, scalar=True)
2124    # SQDMULH (by element)
        # Signed saturating doubling multiply returning the high half: the
        # doubled 64-bit product is shifted right by the Element width. When
        # both sources equal 1 << (Element bits - 1), i.e. the most negative
        # value, the snippet instead writes the complement of the source (the
        # most positive value) and sets fpscr.qc.
2125    sqdmulhCode = '''
2126            FPSCR fpscr = (FPSCR) FpscrQc;
2127            destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2) >>
2128                       (sizeof(Element) * 8);
2129            if (srcElem1 == srcElem2 &&
2130                    srcElem1 == (Element)((Element)1 <<
2131                        (sizeof(Element) * 8 - 1))) {
2132                destElem = ~srcElem1;
2133                fpscr.qc = 1;
2134            }
2135            FpscrQc = fpscr;
2136    '''
2137    threeEqualRegInstX("sqdmulh", "SqdmulhElemDX", "SimdMultOp",
2138                       ("int16_t", "int32_t"), 2, sqdmulhCode, byElem=True)
2139    threeEqualRegInstX("sqdmulh", "SqdmulhElemQX", "SimdMultOp",
2140                       ("int16_t", "int32_t"), 4, sqdmulhCode, byElem=True)
2141    threeEqualRegInstX("sqdmulh", "SqdmulhElemScX", "SimdMultOp",
2142                       ("int16_t", "int32_t"), 4, sqdmulhCode, byElem=True,
2143                       scalar=True)
2144    # SQDMULH (vector)
        # Same snippet, without byElem, for the 2-register, 4-register and
        # scalar classes.
2145    threeEqualRegInstX("sqdmulh", "SqdmulhDX", "SimdMultOp",
2146                       ("int16_t", "int32_t"), 2, sqdmulhCode)
2147    threeEqualRegInstX("sqdmulh", "SqdmulhQX", "SimdMultOp",
2148                       ("int16_t", "int32_t"), 4, sqdmulhCode)
2149    threeEqualRegInstX("sqdmulh", "SqdmulhScX", "SimdMultOp",
2150                       ("int16_t", "int32_t"), 4, sqdmulhCode, scalar=True)
2151    # SQDMULL, SQDMULL2 (by element)
        # Signed saturating doubling multiply long: destElem receives the
        # doubled product. When both sources equal 1 << (Element bits - 1),
        # i.e. the most negative value, the snippet instead writes the
        # complement of that value shifted into the upper half of BigElement
        # and sets fpscr.qc.
        # NOTE(review): a positional True follows the snippet in every call
        # although the snippet only assigns destElem; the SMULL calls omit it.
        # If that argument is readDest it may be unnecessary here -- confirm
        # against the threeRegLongInstX signature.
2152    qdmullCode = '''
2153        FPSCR fpscr = (FPSCR) FpscrQc;
2154        destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
2155        if (srcElem1 == srcElem2 &&
2156                srcElem1 == (Element)((Element)1 <<
2157                    (Element)(sizeof(Element) * 8 - 1))) {
2158            destElem = ~((BigElement)srcElem1 << (sizeof(Element) * 8));
2159            fpscr.qc = 1;
2160        }
2161        FpscrQc = fpscr;
2162    '''
2163    threeRegLongInstX("sqdmull", "SqdmullElemX", "SimdMultOp",
2164                      ("int16_t", "int32_t"), qdmullCode, True, byElem=True)
2165    threeRegLongInstX("sqdmull", "SqdmullElem2X", "SimdMultOp",
2166                      ("int16_t", "int32_t"), qdmullCode, True, byElem=True,
2167                      hi=True)
2168    threeRegLongInstX("sqdmull", "SqdmullElemScX", "SimdMultOp",
2169                      ("int16_t", "int32_t"), qdmullCode, True, byElem=True,
2170                      scalar=True)
2171    # SQDMULL, SQDMULL2 (vector)
        # Same snippet, without byElem, for the plain, "2" (hi=True) and
        # scalar classes.
2172    threeRegLongInstX("sqdmull", "SqdmullX", "SimdMultOp",
2173                      ("int16_t", "int32_t"), qdmullCode, True)
2174    threeRegLongInstX("sqdmull", "Sqdmull2X", "SimdMultOp",
2175                      ("int16_t", "int32_t"), qdmullCode, True, hi=True)
2176    threeRegLongInstX("sqdmull", "SqdmullScX", "SimdMultOp",
2177                      ("int16_t", "int32_t"), qdmullCode, True, scalar=True)
2178    # SQNEG
        # Signed saturating negate. When the source is the most negative
        # Element value the snippet sets fpscr.qc and writes the bitwise
        # complement of the source (the most positive value); every other
        # input is simply negated.
2179    sqnegCode = '''
2180        FPSCR fpscr = (FPSCR) FpscrQc;
2181        if (srcElem1 == (Element)(std::numeric_limits<Element>::min())) {
2182            fpscr.qc = 1;
2183            destElem = ~srcElem1;
2184        } else {
2185            destElem = -srcElem1;
2186        }
2187        FpscrQc = fpscr;
2188    '''
        # The 2-register variant uses smallSignedTypes; the 4-register and
        # scalar variants use signedTypes.
2189    twoEqualRegInstX("sqneg", "SqnegDX", "SimdAluOp", smallSignedTypes, 2,
2190                     sqnegCode)
2191    twoEqualRegInstX("sqneg", "SqnegQX", "SimdAluOp", signedTypes, 4,
2192                     sqnegCode)
2193    twoEqualRegInstX("sqneg", "SqnegScX", "SimdAluOp", signedTypes, 4,
2194                     sqnegCode, scalar=True)
2195    # SQRDMULH (by element)
2196    sqrdmulhCode = '''
2197            FPSCR fpscr = (FPSCR) FpscrQc;
2198            destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2 +
2199                        ((int64_t)1 << (sizeof(Element) * 8 - 1))) >>
2200                       (sizeof(Element) * 8);
2201            Element maxNeg = std::numeric_limits<Element>::min();
2202            Element halfNeg = maxNeg / 2;
2203            if ((srcElem1 == maxNeg && srcElem2 == maxNeg) ||
2204                (srcElem1 == halfNeg && srcElem2 == maxNeg) ||
2205                (srcElem1 == maxNeg && srcElem2 == halfNeg)) {
2206                if (destElem < 0) {
2207                    destElem = mask(sizeof(Element) * 8 - 1);
2208                } else {
2209                    destElem = std::numeric_limits<Element>::min();
2210                }
2211                fpscr.qc = 1;
2212            }
2213            FpscrQc = fpscr;
2214    '''
2215    threeEqualRegInstX("sqrdmulh", "SqrdmulhElemDX", "SimdMultOp",
2216                       ("int16_t", "int32_t"), 2, sqrdmulhCode, byElem=True)
2217    threeEqualRegInstX("sqrdmulh", "SqrdmulhElemQX", "SimdMultOp",
2218                       ("int16_t", "int32_t"), 4, sqrdmulhCode, byElem=True)
2219    threeEqualRegInstX("sqrdmulh", "SqrdmulhElemScX", "SimdMultOp",
2220                       ("int16_t", "int32_t"), 4, sqrdmulhCode, byElem=True,
2221                       scalar=True)
2222    # SQRDMULH (vector)
2223    threeEqualRegInstX("sqrdmulh", "SqrdmulhDX", "SimdMultOp",
2224                       ("int16_t", "int32_t"), 2, sqrdmulhCode)
2225    threeEqualRegInstX("sqrdmulh", "SqrdmulhQX", "SimdMultOp",
2226                       ("int16_t", "int32_t"), 4, sqrdmulhCode)
2227    threeEqualRegInstX("sqrdmulh", "SqrdmulhScX", "SimdMultOp",
2228                       ("int16_t", "int32_t"), 4, sqrdmulhCode, scalar=True)
2229    # SQRSHL
        # Signed saturating rounding shift left by register. The shift amount
        # is the low byte of srcElem2 interpreted as signed:
        #   * negative: shift right by its magnitude, adding a rounding bit
        #     taken from the last bit shifted out; shifts of at least the
        #     Element width start from 0 and are sign-filled for negative
        #     sources before the rounding bit is added;
        #   * positive: shift left, saturating to the most positive/negative
        #     value (and setting fpscr.qc) when significant bits would be lost;
        #   * zero: copy the source.
2230    sqrshlCode = '''
2231            int16_t shiftAmt = (int8_t)srcElem2;
2232            FPSCR fpscr = (FPSCR) FpscrQc;
2233            if (shiftAmt < 0) {
2234                shiftAmt = -shiftAmt;
2235                Element rBit = 0;
2236                if (shiftAmt <= sizeof(Element) * 8)
2237                    rBit = bits(srcElem1, shiftAmt - 1);
2238                if (shiftAmt > sizeof(Element) * 8 && srcElem1 < 0)
2239                    rBit = 1;
2240                if (shiftAmt >= sizeof(Element) * 8) {
2241                    shiftAmt = sizeof(Element) * 8 - 1;
2242                    destElem = 0;
2243                } else {
2244                    destElem = (srcElem1 >> shiftAmt);
2245                }
2246                // Make sure the right shift sign extended when it should.
2247                if (srcElem1 < 0 && destElem >= 0) {
2248                    destElem |= -((Element)1 << (sizeof(Element) * 8 -
2249                                                 1 - shiftAmt));
2250                }
2251                destElem += rBit;
2252            } else if (shiftAmt > 0) {
2253                bool sat = false;
2254                if (shiftAmt >= sizeof(Element) * 8) {
2255                    if (srcElem1 != 0)
2256                        sat = true;
2257                    else
2258                        destElem = 0;
2259                } else {
2260                    if (bits((uint64_t) srcElem1, sizeof(Element) * 8 - 1,
2261                                sizeof(Element) * 8 - 1 - shiftAmt) !=
2262                            ((srcElem1 < 0) ? mask(shiftAmt + 1) : 0)) {
2263                        sat = true;
2264                    } else {
2265                        destElem = srcElem1 << shiftAmt;
2266                    }
2267                }
2268                if (sat) {
2269                    fpscr.qc = 1;
2270                    destElem = mask(sizeof(Element) * 8 - 1);
2271                    if (srcElem1 < 0)
2272                        destElem = ~destElem;
2273                }
2274            } else {
2275                destElem = srcElem1;
2276            }
2277            FpscrQc = fpscr;
2278    '''
        # The 2-register variant uses smallSignedTypes; the 4-register and
        # scalar variants use signedTypes.
2279    threeEqualRegInstX("sqrshl", "SqrshlDX", "SimdCmpOp", smallSignedTypes, 2,
2280                       sqrshlCode)
2281    threeEqualRegInstX("sqrshl", "SqrshlQX", "SimdCmpOp", signedTypes, 4,
2282                       sqrshlCode)
2283    threeEqualRegInstX("sqrshl", "SqrshlScX", "SimdCmpOp", signedTypes, 4,
2284                       sqrshlCode, scalar=True)
2285    # SQRSHRN, SQRSHRN2
        # Signed saturating rounded shift right narrow (immediate):
        #   * imm larger than the source width: result 0, with fpscr.qc set
        #     unless the source is 0 or -1;
        #   * non-zero imm: shift right by imm, re-apply sign extension, add
        #     the rounding bit (the last bit shifted out) and saturate to the
        #     signed Element range if the value does not survive narrowing;
        #   * imm == 0: saturating narrow of the unshifted source.
2286    sqrshrnCode = '''
2287            FPSCR fpscr = (FPSCR) FpscrQc;
2288            if (imm > sizeof(srcElem1) * 8) {
2289                if (srcElem1 != 0 && srcElem1 != -1)
2290                    fpscr.qc = 1;
2291                destElem = 0;
2292            } else if (imm) {
2293                BigElement mid = (srcElem1 >> (imm - 1));
2294                uint64_t rBit = mid & 0x1;
2295                mid >>= 1;
2296                mid |= -(mid & ((BigElement)1 <<
2297                            (sizeof(BigElement) * 8 - 1 - imm)));
2298                mid += rBit;
2299                if (mid != (Element)mid) {
2300                    destElem = mask(sizeof(Element) * 8 - 1);
2301                    if (srcElem1 < 0)
2302                        destElem = ~destElem;
2303                    fpscr.qc = 1;
2304                } else {
2305                    destElem = mid;
2306                }
2307            } else {
2308                if (srcElem1 != (Element)srcElem1) {
2309                    destElem = mask(sizeof(Element) * 8 - 1);
2310                    if (srcElem1 < 0)
2311                        destElem = ~destElem;
2312                    fpscr.qc = 1;
2313                } else {
2314                    destElem = srcElem1;
2315                }
2316            }
2317            FpscrQc = fpscr;
2318    '''
        # NOTE(review): the hi=True variant is registered under the mnemonic
        # "sqrshrn2", whereas e.g. the SQRSHRUN and SQXTN "2" variants reuse
        # the base mnemonic -- confirm which convention is intended.
2319    twoRegNarrowInstX("sqrshrn", "SqrshrnX", "SimdShiftOp", smallSignedTypes,
2320                      sqrshrnCode, hasImm=True)
2321    twoRegNarrowInstX("sqrshrn2", "Sqrshrn2X", "SimdShiftOp", smallSignedTypes,
2322                      sqrshrnCode, hasImm=True, hi=True)
2323    twoRegNarrowInstX("sqrshrn", "SqrshrnScX", "SimdShiftOp", smallSignedTypes,
2324                      sqrshrnCode, hasImm=True, scalar=True)
2325    # SQRSHRUN, SQRSHRUN2
        # Signed saturating rounded shift right, narrowing to an unsigned
        # result (immediate):
        #   * imm larger than the source width: result 0, with fpscr.qc set
        #     for any non-zero source;
        #   * non-zero imm: shift right by imm, re-apply sign extension, add
        #     the rounding bit; if any bit above the Element width is set the
        #     result saturates to 0 (negative source) or all ones (positive
        #     source) and fpscr.qc is set;
        #   * imm == 0: negative sources saturate to 0 with fpscr.qc set,
        #     others are copied.
2326    sqrshrunCode = '''
2327            FPSCR fpscr = (FPSCR) FpscrQc;
2328            if (imm > sizeof(srcElem1) * 8) {
2329                if (srcElem1 != 0)
2330                    fpscr.qc = 1;
2331                destElem = 0;
2332            } else if (imm) {
2333                BigElement mid = (srcElem1 >> (imm - 1));
2334                uint64_t rBit = mid & 0x1;
2335                mid >>= 1;
2336                mid |= -(mid & ((BigElement)1 <<
2337                                (sizeof(BigElement) * 8 - 1 - imm)));
2338                mid += rBit;
2339                if (bits(mid, sizeof(BigElement) * 8 - 1,
2340                              sizeof(Element) * 8) != 0) {
2341                    if (srcElem1 < 0) {
2342                        destElem = 0;
2343                    } else {
2344                        destElem = mask(sizeof(Element) * 8);
2345                    }
2346                    fpscr.qc = 1;
2347                } else {
2348                    destElem = mid;
2349                }
2350            } else {
2351                if (srcElem1 < 0) {
2352                    fpscr.qc = 1;
2353                    destElem = 0;
2354                } else {
2355                    destElem = srcElem1;
2356                }
2357            }
2358            FpscrQc = fpscr;
2359    '''
2360    twoRegNarrowInstX("sqrshrun", "SqrshrunX", "SimdShiftOp", smallSignedTypes,
2361                      sqrshrunCode, hasImm=True)
2362    twoRegNarrowInstX("sqrshrun", "Sqrshrun2X", "SimdShiftOp",
2363                      smallSignedTypes, sqrshrunCode, hasImm=True, hi=True)
2364    twoRegNarrowInstX("sqrshrun", "SqrshrunScX", "SimdShiftOp",
2365                      smallSignedTypes, sqrshrunCode, hasImm=True, scalar=True)
2366    # SQSHL (immediate)
        # Signed saturating shift left by immediate:
        #   * imm at least the Element width: non-zero sources saturate (most
        #     positive for positive sources, most negative otherwise) with
        #     fpscr.qc set; a zero source gives 0;
        #   * non-zero imm: shift left, then saturate the same way if the top
        #     imm + 1 source bits are neither all zeros nor all ones;
        #   * imm == 0: copy the source.
2367    sqshlImmCode = '''
2368            FPSCR fpscr = (FPSCR) FpscrQc;
2369            if (imm >= sizeof(Element) * 8) {
2370                if (srcElem1 != 0) {
2371                    destElem = std::numeric_limits<Element>::min();
2372                    if (srcElem1 > 0)
2373                        destElem = ~destElem;
2374                    fpscr.qc = 1;
2375                } else {
2376                    destElem = 0;
2377                }
2378            } else if (imm) {
2379                destElem = (srcElem1 << imm);
2380                uint64_t topBits = bits((uint64_t)srcElem1,
2381                                        sizeof(Element) * 8 - 1,
2382                                        sizeof(Element) * 8 - 1 - imm);
2383                if (topBits != 0 && topBits != mask(imm + 1)) {
2384                    destElem = std::numeric_limits<Element>::min();
2385                    if (srcElem1 > 0)
2386                        destElem = ~destElem;
2387                    fpscr.qc = 1;
2388                }
2389            } else {
2390                destElem = srcElem1;
2391            }
2392            FpscrQc = fpscr;
2393    '''
        # The 2-register variant uses smallSignedTypes; the 4-register and
        # scalar variants use signedTypes.
2394    twoEqualRegInstX("sqshl", "SqshlImmDX", "SimdAluOp", smallSignedTypes, 2,
2395                     sqshlImmCode, hasImm=True)
2396    twoEqualRegInstX("sqshl", "SqshlImmQX", "SimdAluOp", signedTypes, 4,
2397                     sqshlImmCode, hasImm=True)
2398    twoEqualRegInstX("sqshl", "SqshlImmScX", "SimdAluOp", signedTypes, 4,
2399                     sqshlImmCode, hasImm=True, scalar=True)
2400    # SQSHL (register)
        # Signed saturating shift left by register. The shift amount is the
        # low byte of srcElem2 interpreted as signed:
        #   * negative: shift right by its magnitude without rounding; shifts
        #     of at least the Element width start from 0 and are sign-filled
        #     for negative sources;
        #   * positive: shift left, saturating to the most positive/negative
        #     value (and setting fpscr.qc) when significant bits would be lost;
        #   * zero: copy the source.
2401    sqshlCode = '''
2402            int16_t shiftAmt = (int8_t)srcElem2;
2403            FPSCR fpscr = (FPSCR) FpscrQc;
2404            if (shiftAmt < 0) {
2405                shiftAmt = -shiftAmt;
2406                if (shiftAmt >= sizeof(Element) * 8) {
2407                    shiftAmt = sizeof(Element) * 8 - 1;
2408                    destElem = 0;
2409                } else {
2410                    destElem = (srcElem1 >> shiftAmt);
2411                }
2412                // Make sure the right shift sign extended when it should.
2413                if (srcElem1 < 0 && destElem >= 0) {
2414                    destElem |= -((Element)1 << (sizeof(Element) * 8 -
2415                                                 1 - shiftAmt));
2416                }
2417            } else if (shiftAmt > 0) {
2418                bool sat = false;
2419                if (shiftAmt >= sizeof(Element) * 8) {
2420                    if (srcElem1 != 0)
2421                        sat = true;
2422                    else
2423                        destElem = 0;
2424                } else {
2425                    if (bits((uint64_t) srcElem1, sizeof(Element) * 8 - 1,
2426                                sizeof(Element) * 8 - 1 - shiftAmt) !=
2427                            ((srcElem1 < 0) ? mask(shiftAmt + 1) : 0)) {
2428                        sat = true;
2429                    } else {
2430                        destElem = srcElem1 << shiftAmt;
2431                    }
2432                }
2433                if (sat) {
2434                    fpscr.qc = 1;
2435                    destElem = mask(sizeof(Element) * 8 - 1);
2436                    if (srcElem1 < 0)
2437                        destElem = ~destElem;
2438                }
2439            } else {
2440                destElem = srcElem1;
2441            }
2442            FpscrQc = fpscr;
2443    '''
        # The 2-register variant uses smallSignedTypes; the 4-register and
        # scalar variants use signedTypes.
2444    threeEqualRegInstX("sqshl", "SqshlDX", "SimdAluOp", smallSignedTypes, 2,
2445                       sqshlCode)
2446    threeEqualRegInstX("sqshl", "SqshlQX", "SimdAluOp", signedTypes, 4,
2447                       sqshlCode)
2448    threeEqualRegInstX("sqshl", "SqshlScX", "SimdAluOp", signedTypes, 4,
2449                       sqshlCode, scalar=True)
2450    # SQSHLU
        # Signed saturating shift left by immediate with an unsigned result.
        # In every branch a negative source saturates to 0 with fpscr.qc set.
        # For non-negative sources:
        #   * imm at least the Element width: a positive source saturates to
        #     all ones with fpscr.qc set; zero stays 0;
        #   * non-zero imm: shift left, saturating to all ones (fpscr.qc set)
        #     if any of the top imm source bits is set;
        #   * imm == 0: copy the source.
2451    sqshluCode = '''
2452            FPSCR fpscr = (FPSCR) FpscrQc;
2453            if (imm >= sizeof(Element) * 8) {
2454                if (srcElem1 < 0) {
2455                    destElem = 0;
2456                    fpscr.qc = 1;
2457                } else if (srcElem1 > 0) {
2458                    destElem = mask(sizeof(Element) * 8);
2459                    fpscr.qc = 1;
2460                } else {
2461                    destElem = 0;
2462                }
2463            } else if (imm) {
2464                destElem = (srcElem1 << imm);
2465                uint64_t topBits = bits((uint64_t)srcElem1,
2466                                        sizeof(Element) * 8 - 1,
2467                                        sizeof(Element) * 8 - imm);
2468                if (srcElem1 < 0) {
2469                    destElem = 0;
2470                    fpscr.qc = 1;
2471                } else if (topBits != 0) {
2472                    destElem = mask(sizeof(Element) * 8);
2473                    fpscr.qc = 1;
2474                }
2475            } else {
2476                if (srcElem1 < 0) {
2477                    fpscr.qc = 1;
2478                    destElem = 0;
2479                } else {
2480                    destElem = srcElem1;
2481                }
2482            }
2483            FpscrQc = fpscr;
2484    '''
        # The 2-register variant uses smallSignedTypes; the 4-register and
        # scalar variants use signedTypes.
2485    twoEqualRegInstX("sqshlu", "SqshluDX", "SimdAluOp", smallSignedTypes, 2,
2486                     sqshluCode, hasImm=True)
2487    twoEqualRegInstX("sqshlu", "SqshluQX", "SimdAluOp", signedTypes, 4,
2488                     sqshluCode, hasImm=True)
2489    twoEqualRegInstX("sqshlu", "SqshluScX", "SimdAluOp", signedTypes, 4,
2490                     sqshluCode, hasImm=True, scalar=True)
2491    # SQSHRN, SQSHRN2
        # Signed saturating shift right narrow (immediate, no rounding):
        #   * imm larger than the source width: result 0, with fpscr.qc set
        #     unless the source is 0 or -1;
        #   * non-zero imm: shift right by imm, re-apply sign extension and
        #     saturate to the signed Element range if the value does not
        #     survive narrowing;
        #   * imm == 0: the source is assigned without a saturation check.
2492    sqshrnCode = '''
2493        FPSCR fpscr = (FPSCR) FpscrQc;
2494        if (imm > sizeof(srcElem1) * 8) {
2495            if (srcElem1 != 0 && srcElem1 != -1)
2496                fpscr.qc = 1;
2497            destElem = 0;
2498        } else if (imm) {
2499            BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
2500            mid |= -(mid & ((BigElement)1 <<
2501                        (sizeof(BigElement) * 8 - 1 - imm)));
2502            if (mid != (Element)mid) {
2503                destElem = mask(sizeof(Element) * 8 - 1);
2504                if (srcElem1 < 0)
2505                    destElem = ~destElem;
2506                fpscr.qc = 1;
2507            } else {
2508                destElem = mid;
2509            }
2510        } else {
2511            destElem = srcElem1;
2512        }
2513        FpscrQc = fpscr;
2514    '''
        # NOTE(review): the hi=True variant is registered under the mnemonic
        # "sqshrn2", whereas e.g. the SQSHRUN and SQXTN "2" variants reuse the
        # base mnemonic -- confirm which convention is intended.
2515    twoRegNarrowInstX("sqshrn", "SqshrnX", "SimdShiftOp", smallSignedTypes,
2516                      sqshrnCode, hasImm=True)
2517    twoRegNarrowInstX("sqshrn2", "Sqshrn2X", "SimdShiftOp", smallSignedTypes,
2518                      sqshrnCode, hasImm=True, hi=True)
2519    twoRegNarrowInstX("sqshrn", "SqshrnScX", "SimdShiftOp", smallSignedTypes,
2520                      sqshrnCode, hasImm=True, scalar=True)
2521    # SQSHRUN, SQSHRUN2
        # Signed saturating shift right, narrowing to an unsigned result
        # (immediate, no rounding):
        #   * imm larger than the source width: result 0, with fpscr.qc set
        #     for any non-zero source;
        #   * non-zero imm: shift right by imm; if any bit above the Element
        #     width is set the result saturates to 0 (negative source) or all
        #     ones (otherwise) and fpscr.qc is set;
        #   * imm == 0: the source is assigned without a saturation check.
2522    sqshrunCode = '''
2523            FPSCR fpscr = (FPSCR) FpscrQc;
2524            if (imm > sizeof(srcElem1) * 8) {
2525                if (srcElem1 != 0)
2526                    fpscr.qc = 1;
2527                destElem = 0;
2528            } else if (imm) {
2529                BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
2530                if (bits(mid, sizeof(BigElement) * 8 - 1,
2531                              sizeof(Element) * 8) != 0) {
2532                    if (srcElem1 < 0) {
2533                        destElem = 0;
2534                    } else {
2535                        destElem = mask(sizeof(Element) * 8);
2536                    }
2537                    fpscr.qc = 1;
2538                } else {
2539                    destElem = mid;
2540                }
2541            } else {
2542                destElem = srcElem1;
2543            }
2544            FpscrQc = fpscr;
2545    '''
2546    twoRegNarrowInstX("sqshrun", "SqshrunX", "SimdShiftOp", smallSignedTypes,
2547                      sqshrunCode, hasImm=True)
2548    twoRegNarrowInstX("sqshrun", "Sqshrun2X", "SimdShiftOp", smallSignedTypes,
2549                      sqshrunCode, hasImm=True, hi=True)
2550    twoRegNarrowInstX("sqshrun", "SqshrunScX", "SimdShiftOp", smallSignedTypes,
2551                      sqshrunCode, hasImm=True, scalar=True)
2552    # SQSUB
        # Signed saturating subtract. The snippet subtracts first, then treats
        # the result as overflowed when the sources have opposite signs
        # (negSrc1 == posSrc2) and the result's sign differs from srcElem1's.
        # On overflow it sets fpscr.qc and writes the minimum Element value,
        # subtracting 1 when the wrapped result was negative, which is
        # intended to yield the maximum.
2553    sqsubCode = '''
2554            destElem = srcElem1 - srcElem2;
2555            FPSCR fpscr = (FPSCR) FpscrQc;
2556            bool negDest = (destElem < 0);
2557            bool negSrc1 = (srcElem1 < 0);
2558            bool posSrc2 = (srcElem2 >= 0);
2559            if ((negDest != negSrc1) && (negSrc1 == posSrc2)) {
2560                destElem = std::numeric_limits<Element>::min();
2561                if (negDest)
2562                    destElem -= 1;
2563                fpscr.qc = 1;
2564            }
2565            FpscrQc = fpscr;
2566    '''
        # The 2-register variant uses smallSignedTypes; the 4-register and
        # scalar variants use signedTypes.
2567    threeEqualRegInstX("sqsub", "SqsubDX", "SimdAddOp", smallSignedTypes, 2,
2568                       sqsubCode)
2569    threeEqualRegInstX("sqsub", "SqsubQX", "SimdAddOp", signedTypes, 4,
2570                       sqsubCode)
2571    threeEqualRegInstX("sqsub", "SqsubScX", "SimdAddOp", signedTypes, 4,
2572                       sqsubCode, scalar=True)
2573    # SQXTN, SQXTN2
        # Signed saturating extract narrow. The source is assigned to the
        # narrower destElem; if widening destElem back to BigElement does not
        # reproduce the source, fpscr.qc is set and destElem becomes the most
        # positive Element value, or its complement for a negative source.
2574    sqxtnCode = '''
2575            FPSCR fpscr = (FPSCR) FpscrQc;
2576            destElem = srcElem1;
2577            if ((BigElement)destElem != srcElem1) {
2578                fpscr.qc = 1;
2579                destElem = mask(sizeof(Element) * 8 - 1);
2580                if (srcElem1 < 0)
2581                    destElem = ~destElem;
2582            }
2583            FpscrQc = fpscr;
2584    '''
2585    twoRegNarrowInstX("sqxtn", "SqxtnX", "SimdMiscOp", smallSignedTypes,
2586                      sqxtnCode)
2587    twoRegNarrowInstX("sqxtn", "Sqxtn2X", "SimdMiscOp", smallSignedTypes,
2588                      sqxtnCode, hi=True)
2589    twoRegNarrowInstX("sqxtn", "SqxtnScX", "SimdMiscOp", smallSignedTypes,
2590                      sqxtnCode, scalar=True)
2591    # SQXTUN, SQXTUN2
        # Signed saturating extract narrow to an unsigned result. The source
        # is assigned to the narrower destElem; if the source is negative, or
        # the low Element-width bits of destElem do not reproduce it, fpscr.qc
        # is set and destElem becomes all ones, or 0 for a negative source.
2592    sqxtunCode = '''
2593            FPSCR fpscr = (FPSCR) FpscrQc;
2594            destElem = srcElem1;
2595            if (srcElem1 < 0 ||
2596                    ((BigElement)destElem & mask(sizeof(Element) * 8)) != srcElem1) {
2597                fpscr.qc = 1;
2598                destElem = mask(sizeof(Element) * 8);
2599                if (srcElem1 < 0)
2600                    destElem = ~destElem;
2601            }
2602            FpscrQc = fpscr;
2603    '''
2604    twoRegNarrowInstX("sqxtun", "SqxtunX", "SimdMiscOp", smallSignedTypes,
2605                      sqxtunCode)
2606    twoRegNarrowInstX("sqxtun", "Sqxtun2X", "SimdMiscOp", smallSignedTypes,
2607                      sqxtunCode, hi=True)
2608    twoRegNarrowInstX("sqxtun", "SqxtunScX", "SimdMiscOp", smallSignedTypes,
2609                      sqxtunCode, scalar=True)
2610    # SRHADD
        # Signed rounding halving add. Each source is halved separately (after
        # clearing its low bit) so the sum cannot overflow Element; carryBit
        # re-adds the contribution of the two dropped low bits plus the
        # rounding increment: (bit0(src1) + bit0(src2) + 1) >> 1.
2611    rhaddCode = '''
2612            Element carryBit =
2613                (((unsigned)srcElem1 & 0x1) +
2614                 ((unsigned)srcElem2 & 0x1) + 1) >> 1;
2615            // Use division instead of a shift to ensure the sign extension works
2616            // right. The compiler will figure out if it can be a shift. Mask the
2617            // inputs so they get truncated correctly.
2618            destElem = (((srcElem1 & ~(Element)1) / 2) +
2619                        ((srcElem2 & ~(Element)1) / 2)) + carryBit;
2620    '''
2621    threeEqualRegInstX("srhadd", "SrhaddDX", "SimdAddOp", smallSignedTypes, 2,
2622                       rhaddCode)
2623    threeEqualRegInstX("srhadd", "SrhaddQX", "SimdAddOp", smallSignedTypes, 4,
2624                       rhaddCode)
2625    # SRI
        # Shift right and insert. For imm below the Element width the source
        # is shifted right by imm and OR-ed into destElem, whose bits outside
        # the low (width - imm) positions are preserved; for larger imm
        # destElem is left unchanged (the self-assignment is a no-op).
        # NOTE(review): the positional True after the snippet is presumably
        # readDest (the snippet reads destElem) -- confirm against the
        # twoEqualRegInstX signature.
2626    sriCode = '''
2627            if (imm >= sizeof(Element) * 8)
2628                destElem = destElem;
2629            else
2630                destElem = (srcElem1 >> imm) |
2631                    (destElem & ~mask(sizeof(Element) * 8 - imm));
2632    '''
2633    twoEqualRegInstX("sri", "SriDX", "SimdShiftOp", unsignedTypes, 2, sriCode,
2634                     True, hasImm=True)
2635    twoEqualRegInstX("sri", "SriQX", "SimdShiftOp", unsignedTypes, 4, sriCode,
2636                     True, hasImm=True)
2637    # SRSHL
        # Signed rounding shift left by register (no saturation, FPSCR is not
        # touched). The shift amount is the low byte of srcElem2 interpreted
        # as signed:
        #   * negative: shift right by its magnitude, adding a rounding bit
        #     taken from the last bit shifted out; shifts of at least the
        #     Element width start from 0 and are sign-filled when ltz(src);
        #   * positive: shift left, yielding 0 for shifts of at least the
        #     Element width;
        #   * zero: copy the source.
        # Sign tests go through the ltz() helper rather than "< 0".
2638    rshlCode = '''
2639            int16_t shiftAmt = (int8_t)srcElem2;
2640            if (shiftAmt < 0) {
2641                shiftAmt = -shiftAmt;
2642                Element rBit = 0;
2643                if (shiftAmt <= sizeof(Element) * 8)
2644                    rBit = bits(srcElem1, shiftAmt - 1);
2645                if (shiftAmt > sizeof(Element) * 8 && ltz(srcElem1))
2646                    rBit = 1;
2647                if (shiftAmt >= sizeof(Element) * 8) {
2648                    shiftAmt = sizeof(Element) * 8 - 1;
2649                    destElem = 0;
2650                } else {
2651                    destElem = (srcElem1 >> shiftAmt);
2652                }
2653                // Make sure the right shift sign extended when it should.
2654                if (ltz(srcElem1) && !ltz(destElem)) {
2655                    destElem |= -((Element)1 << (sizeof(Element) * 8 -
2656                                                 1 - shiftAmt));
2657                }
2658                destElem += rBit;
2659            } else if (shiftAmt > 0) {
2660                if (shiftAmt >= sizeof(Element) * 8) {
2661                    destElem = 0;
2662                } else {
2663                    destElem = srcElem1 << shiftAmt;
2664                }
2665            } else {
2666                destElem = srcElem1;
2667            }
2668    '''
        # Both register-count variants use the signedTypes list.
2669    threeEqualRegInstX("srshl", "SrshlDX", "SimdShiftOp", signedTypes, 2,
2670                       rshlCode)
2671    threeEqualRegInstX("srshl", "SrshlQX", "SimdShiftOp", signedTypes, 4,
2672                       rshlCode)
2673    # SRSHR
        # Rounding shift right by immediate.  For a non-zero imm the result
        # is srcElem1 shifted right by imm plus bit (imm - 1) of the source.
        # The shift is written as (imm - 1) followed by 1, presumably so that
        # imm equal to the element width never becomes a single full-width
        # shift.  imm larger than the element width gives 0; imm == 0 copies
        # the source.  rshrCode is reused further down for URSHR.
2674    rshrCode = '''
2675            if (imm > sizeof(srcElem1) * 8) {
2676                destElem = 0;
2677            } else if (imm) {
2678                Element rBit = bits(srcElem1, imm - 1);
2679                destElem = ((srcElem1 >> (imm - 1)) >> 1) + rBit;
2680            } else {
2681                destElem = srcElem1;
2682            }
2683    '''
2684    twoEqualRegInstX("srshr", "SrshrDX", "SimdShiftOp", signedTypes, 2,
2685                     rshrCode, hasImm=True)
2686    twoEqualRegInstX("srshr", "SrshrQX", "SimdShiftOp", signedTypes, 4,
2687                     rshrCode, hasImm=True)
2688    # SRSRA
        # Rounding shift right by immediate with accumulate: the rounded
        # shift result ((srcElem1 >> imm) plus bit imm - 1 of the source) is
        # added to the existing destElem rather than replacing it.  An imm
        # above the element width adds 0; imm == 0 adds the source unchanged.
        # destElem is read, so the positional True is presumably the helper's
        # "read destination" flag -- TODO confirm.  Reused below for URSRA.
2689    rsraCode = '''
2690            if (imm > sizeof(srcElem1) * 8) {
2691                destElem += 0;
2692            } else if (imm) {
2693                Element rBit = bits(srcElem1, imm - 1);
2694                destElem += ((srcElem1 >> (imm - 1)) >> 1) + rBit;
2695            } else {
2696                destElem += srcElem1;
2697            }
2698    '''
2699    twoEqualRegInstX("srsra", "SrsraDX", "SimdShiftOp", signedTypes, 2,
2700                     rsraCode, True, hasImm=True)
2701    twoEqualRegInstX("srsra", "SrsraQX", "SimdShiftOp", signedTypes, 4,
2702                     rsraCode, True, hasImm=True)
2703    # SSHL
        # Register-controlled shift without rounding.  The shift amount is
        # the low byte of srcElem2 interpreted as signed: negative amounts
        # shift right (the result is sign-filled by hand when ltz(srcElem1)
        # holds but the shifted value is not negative), non-negative amounts
        # shift left.  Amounts of a full element width or more produce 0
        # before the sign fix-up instead of being handed to the C++ shift
        # operators.  shlCode is reused further down for USHL.
2704    shlCode = '''
2705            int16_t shiftAmt = (int8_t)srcElem2;
2706            if (shiftAmt < 0) {
2707                shiftAmt = -shiftAmt;
2708                if (shiftAmt >= sizeof(Element) * 8) {
2709                    shiftAmt = sizeof(Element) * 8 - 1;
2710                    destElem = 0;
2711                } else {
2712                    destElem = (srcElem1 >> shiftAmt);
2713                }
2714                // Make sure the right shift sign extended when it should.
2715                if (ltz(srcElem1) && !ltz(destElem)) {
2716                    destElem |= -((Element)1 << (sizeof(Element) * 8 -
2717                                                 1 - shiftAmt));
2718                }
2719            } else {
2720                if (shiftAmt >= sizeof(Element) * 8) {
2721                    destElem = 0;
2722                } else {
2723                    destElem = srcElem1 << shiftAmt;
2724                }
2725            }
2726    '''
2727    threeEqualRegInstX("sshl", "SshlDX", "SimdShiftOp", signedTypes, 2,
2728                       shlCode)
2729    threeEqualRegInstX("sshl", "SshlQX", "SimdShiftOp", signedTypes, 4,
2730                       shlCode)
2731    # SSHLL, SSHLL2
        # Widening shift left by immediate: the source element is cast to
        # BigElement before shifting; an imm of at least the destination
        # element width yields 0.  Both registrations use the mnemonic
        # "sshll"; the second passes hi=True (presumably selecting the upper
        # half of the source register -- confirm in twoRegLongInstX).
        # shllCode is reused further down for USHLL.
2732    shllCode = '''
2733            if (imm >= sizeof(destElem) * 8) {
2734                destElem = 0;
2735            } else {
2736                destElem = (BigElement)srcElem1 << imm;
2737            }
2738    '''
2739    twoRegLongInstX("sshll", "SshllX", "SimdShiftOp", smallSignedTypes,
2740                    shllCode, hasImm=True)
2741    twoRegLongInstX("sshll", "Sshll2X", "SimdShiftOp", smallSignedTypes,
2742                    shllCode, hasImm=True, hi=True)
2743    # SSHR
        # Shift right by immediate.  When imm reaches the element width the
        # result is -1 if ltz(srcElem1) holds and 0 otherwise; smaller imm
        # values use the C++ >> operator directly.
        # shrCode is reused further down for USHR with unsigned types.
2744    shrCode = '''
2745            if (imm >= sizeof(srcElem1) * 8) {
2746                if (ltz(srcElem1))
2747                    destElem = -1;
2748                else
2749                    destElem = 0;
2750            } else {
2751                destElem = srcElem1 >> imm;
2752            }
2753    '''
2754    twoEqualRegInstX("sshr", "SshrDX", "SimdShiftOp", signedTypes, 2, shrCode,
2755                     hasImm=True)
2756    twoEqualRegInstX("sshr", "SshrQX", "SimdShiftOp", signedTypes, 4, shrCode,
2757                     hasImm=True)
2758    # SSRA
2759    sraCode = '''
2760            Element mid;;
2761            if (imm >= sizeof(srcElem1) * 8) {
2762                mid = ltz(srcElem1) ? -1 : 0;
2763            } else {
2764                mid = srcElem1 >> imm;
2765                if (ltz(srcElem1) && !ltz(mid)) {
2766                    mid |= -(mid & ((Element)1 <<
2767                                    (sizeof(Element) * 8 - 1 - imm)));
2768                }
2769            }
2770            destElem += mid;
2771    '''
2772    twoEqualRegInstX("ssra", "SsraDX", "SimdShiftOp", signedTypes, 2, sraCode,
2773                     True, hasImm=True)
2774    twoEqualRegInstX("ssra", "SsraQX", "SimdShiftOp", signedTypes, 4, sraCode,
2775                     True, hasImm=True)
2776    # SSUBL
        # Widening subtract: both operands are cast to BigElement before the
        # subtraction.  The same snippet serves the "long" helper (SSUBL) and
        # the "wide" helper (SSUBW); the *2 variants pass hi=True (presumably
        # selecting the upper half of the narrow source -- confirm in the
        # helpers).  sublwCode is reused further down for USUBL / USUBW.
2777    sublwCode = "destElem = (BigElement)srcElem1 - (BigElement)srcElem2;"
2778    threeRegLongInstX("ssubl", "SsublX", "SimdAddOp", smallSignedTypes,
2779                      sublwCode)
2780    threeRegLongInstX("ssubl2", "Ssubl2X", "SimdAddOp", smallSignedTypes,
2781                      sublwCode, hi=True)
2782    # SSUBW
2783    threeRegWideInstX("ssubw", "SsubwX", "SimdAddOp", smallSignedTypes,
2784                      sublwCode)
2785    threeRegWideInstX("ssubw2", "Ssubw2X", "SimdAddOp", smallSignedTypes,
2786                      sublwCode, hi=True)
2787    # SUB
        # Plain element-wise subtraction, registered over unsignedTypes for
        # the DX (size argument 2) and QX (size argument 4) classes.
2788    subCode = "destElem = srcElem1 - srcElem2;"
2789    threeEqualRegInstX("sub", "SubDX", "SimdAddOp", unsignedTypes, 2, subCode)
2790    threeEqualRegInstX("sub", "SubQX", "SimdAddOp", unsignedTypes, 4, subCode)
2791    # SUBHN, SUBHN2
        # Subtract and keep the high half: the difference is computed in
        # BigElement and shifted right by the width of Element, so destElem
        # receives the upper Element-sized part of the wide result.
        # SUBHN2 passes hi=True (presumably writing the upper half of the
        # destination register -- confirm in threeRegNarrowInstX).
2792    subhnCode = '''
2793            destElem = ((BigElement)srcElem1 - (BigElement)srcElem2) >>
2794                        (sizeof(Element) * 8);
2795    '''
2796    threeRegNarrowInstX("subhn", "SubhnX", "SimdAddOp", smallUnsignedTypes,
2797                        subhnCode)
2798    threeRegNarrowInstX("subhn2", "Subhn2X", "SimdAddOp", smallUnsignedTypes,
2799                        subhnCode, hi=True)
2800    # SUQADD
        # Saturating accumulate where the destination's top bit is treated as
        # a sign bit while the comparisons on srcElem1 are done in the
        # (unsigned) Element type the helper is instantiated with.
        #   * destElem non-negative: saturate to the largest value with a
        #     clear top bit if the sum sets the top bit or wraps.
        #   * destElem negative: only when the source exceeds |destElem| can
        #     the sum overflow, so only then is the top bit of the sum
        #     checked.
        # Saturation sets fpscr.qc, which is written back through FpscrQc.
        # destElem is read, so the positional True is presumably the helper's
        # "read destination" flag -- TODO confirm.
2801    suqaddCode = '''
2802            FPSCR fpscr = (FPSCR) FpscrQc;
2803            Element tmp = destElem + srcElem1;
2804            if (bits(destElem, sizeof(Element) * 8 - 1) == 0) {
2805                if (bits(tmp, sizeof(Element) * 8 - 1) == 1 ||
2806                        tmp < srcElem1 || tmp < destElem) {
2807                    destElem = (((Element) 1) << (sizeof(Element) * 8 - 1)) - 1;
2808                    fpscr.qc = 1;
2809                } else {
2810                    destElem = tmp;
2811                }
2812            } else {
2813                Element absDestElem = (~destElem) + 1;
2814                if (absDestElem < srcElem1) {
2815                    // Still check for positive sat., no need to check for negative sat.
2816                    if (bits(tmp, sizeof(Element) * 8 - 1) == 1) {
2817                        destElem = (((Element) 1) << (sizeof(Element) * 8 - 1)) - 1;
2818                        fpscr.qc = 1;
2819                    } else {
2820                        destElem = tmp;
2821                    }
2822                } else {
2823                    destElem = tmp;
2824                }
2825            }
2826            FpscrQc = fpscr;
2827    '''
2828    twoEqualRegInstX("suqadd", "SuqaddDX", "SimdAddOp", smallUnsignedTypes, 2,
2829                     suqaddCode, True)
2830    twoEqualRegInstX("suqadd", "SuqaddQX", "SimdAddOp", unsignedTypes, 4,
2831                     suqaddCode, True)
2832    twoEqualRegInstX("suqadd", "SuqaddScX", "SimdAddOp", unsignedTypes, 4,
2833                     suqaddCode, True, scalar=True)
2834    # SXTL -> alias to SSHLL
2835    # TBL
        # Table lookups over uint8_t elements.  Each line registers one class
        # TblNDX / TblNQX; the integer 1-4 matches the N in the class name
        # (presumably the number of table registers) and the final 2 / 4
        # pairs with DX / QX.  TBL passes the string "true" and TBX "false"
        # (presumably whether out-of-range lanes are zeroed or left as they
        # were -- confirm in tbxTblInstX).
2836    tbxTblInstX("tbl", "Tbl1DX", "SimdMiscOp", ("uint8_t",), 1, "true", 2)
2837    tbxTblInstX("tbl", "Tbl1QX", "SimdMiscOp", ("uint8_t",), 1, "true", 4)
2838    tbxTblInstX("tbl", "Tbl2DX", "SimdMiscOp", ("uint8_t",), 2, "true", 2)
2839    tbxTblInstX("tbl", "Tbl2QX", "SimdMiscOp", ("uint8_t",), 2, "true", 4)
2840    tbxTblInstX("tbl", "Tbl3DX", "SimdMiscOp", ("uint8_t",), 3, "true", 2)
2841    tbxTblInstX("tbl", "Tbl3QX", "SimdMiscOp", ("uint8_t",), 3, "true", 4)
2842    tbxTblInstX("tbl", "Tbl4DX", "SimdMiscOp", ("uint8_t",), 4, "true", 2)
2843    tbxTblInstX("tbl", "Tbl4QX", "SimdMiscOp", ("uint8_t",), 4, "true", 4)
2844    # TBX
2845    tbxTblInstX("tbx", "Tbx1DX", "SimdMiscOp", ("uint8_t",), 1, "false", 2)
2846    tbxTblInstX("tbx", "Tbx1QX", "SimdMiscOp", ("uint8_t",), 1, "false", 4)
2847    tbxTblInstX("tbx", "Tbx2DX", "SimdMiscOp", ("uint8_t",), 2, "false", 2)
2848    tbxTblInstX("tbx", "Tbx2QX", "SimdMiscOp", ("uint8_t",), 2, "false", 4)
2849    tbxTblInstX("tbx", "Tbx3DX", "SimdMiscOp", ("uint8_t",), 3, "false", 2)
2850    tbxTblInstX("tbx", "Tbx3QX", "SimdMiscOp", ("uint8_t",), 3, "false", 4)
2851    tbxTblInstX("tbx", "Tbx4DX", "SimdMiscOp", ("uint8_t",), 4, "false", 2)
2852    tbxTblInstX("tbx", "Tbx4QX", "SimdMiscOp", ("uint8_t",), 4, "false", 4)
2853    # TRN1
        # trnCode is a %-template whose single %s becomes the C++ initialiser
        # of "part": "0" for TRN1, "1" for TRN2.  For each pair index i the
        # generated loop writes srcReg1 element (2i + part) to destination
        # element 2i and srcReg2 element (2i + part) to element 2i + 1.
2854    trnCode = '''
2855        unsigned part = %s;
2856        for (unsigned i = 0; i < eCount / 2; i++) {
2857            destReg.elements[2 * i] = srcReg1.elements[2 * i + part];
2858            destReg.elements[2 * i + 1] = srcReg2.elements[2 * i + part];
2859        }
2860    '''
2861    threeRegScrambleInstX("trn1", "Trn1DX", "SimdAluOp", smallUnsignedTypes, 2,
2862                          trnCode % "0")
2863    threeRegScrambleInstX("trn1", "Trn1QX", "SimdAluOp", unsignedTypes, 4,
2864                          trnCode % "0")
2865    # TRN2
2866    threeRegScrambleInstX("trn2", "Trn2DX", "SimdAluOp", smallUnsignedTypes, 2,
2867                          trnCode % "1")
2868    threeRegScrambleInstX("trn2", "Trn2QX", "SimdAluOp", unsignedTypes, 4,
2869                          trnCode % "1")
2870    # UABA
        # Unsigned registrations that reuse abaCode / abalCode / abdCode /
        # abdlCode, all defined earlier in the file (not shown in this part),
        # instantiated here over smallUnsignedTypes.
2871    threeEqualRegInstX("uaba", "UabaDX", "SimdAddAccOp", smallUnsignedTypes, 2,
2872                       abaCode, True)
2873    threeEqualRegInstX("uaba", "UabaQX", "SimdAddAccOp", smallUnsignedTypes, 4,
2874                       abaCode, True)
2875    # UABAL, UABAL2
2876    threeRegLongInstX("uabal", "UabalX", "SimdAddAccOp", smallUnsignedTypes,
2877                      abalCode, True)
2878    threeRegLongInstX("uabal2", "Uabal2X", "SimdAddAccOp", smallUnsignedTypes,
2879                      abalCode, True, hi=True)
2880    # UABD
2881    threeEqualRegInstX("uabd", "UabdDX", "SimdAddOp", smallUnsignedTypes, 2,
2882                       abdCode)
2883    threeEqualRegInstX("uabd", "UabdQX", "SimdAddOp", smallUnsignedTypes, 4,
2884                       abdCode)
2885    # UABDL, UABDL2
        # NOTE(review): UABDL uses the same "SimdAddAccOp" class and
        # positional True as the accumulating UABAL above, whereas UABD does
        # not; abdlCode is not visible here -- confirm this is intended.
2886    threeRegLongInstX("uabdl", "UabdlX", "SimdAddAccOp", smallUnsignedTypes,
2887                      abdlCode, True)
2888    threeRegLongInstX("uabdl2", "Uabdl2X", "SimdAddAccOp", smallUnsignedTypes,
2889                      abdlCode, True, hi=True)
2890    # UADALP
        # Unsigned add-family registrations reusing adalpCode, addlwCode and
        # addAcrossLongCode, all defined earlier in the file (not shown in
        # this part).
2891    twoRegCondenseInstX("uadalp", "UadalpDX", "SimdAddOp", smallUnsignedTypes,
2892                        2, adalpCode, True)
2893    twoRegCondenseInstX("uadalp", "UadalpQX", "SimdAddOp", smallUnsignedTypes,
2894                        4, adalpCode, True)
2895    # UADDL, UADDL2
2896    threeRegLongInstX("uaddl", "UaddlX", "SimdAddAccOp", smallUnsignedTypes,
2897                      addlwCode)
2898    threeRegLongInstX("uaddl2", "Uaddl2X", "SimdAddAccOp", smallUnsignedTypes,
2899                      addlwCode, hi=True)
2900    # UADDLP
2901    twoRegCondenseInstX("uaddlp", "UaddlpDX", "SimdAddOp", smallUnsignedTypes,
2902                        2, addlwCode)
2903    twoRegCondenseInstX("uaddlp", "UaddlpQX", "SimdAddOp", smallUnsignedTypes,
2904                        4, addlwCode)
2905    # UADDLV
        # The DX / QX forms cover 8- and 16-bit elements; 32-bit elements get
        # a separate class (UaddlvBQX) that additionally passes
        # doubleDest=True.
2906    twoRegAcrossInstX("uaddlv", "UaddlvDX", "SimdAddOp",
2907                      ("uint8_t", "uint16_t"), 2, addAcrossLongCode, long=True)
2908    twoRegAcrossInstX("uaddlv", "UaddlvQX", "SimdAddOp",
2909                      ("uint8_t", "uint16_t"), 4, addAcrossLongCode, long=True)
2910    twoRegAcrossInstX("uaddlv", "UaddlvBQX", "SimdAddOp", ("uint32_t",), 4,
2911                      addAcrossLongCode, doubleDest=True, long=True)
2912    # UADDW
2913    threeRegWideInstX("uaddw", "UaddwX", "SimdAddAccOp", smallUnsignedTypes,
2914                      addlwCode)
2915    threeRegWideInstX("uaddw2", "Uaddw2X", "SimdAddAccOp", smallUnsignedTypes,
2916                      addlwCode, hi=True)
2917    # UCVTF (fixed-point)
        # fpOp is a %-template defined earlier in the file (not shown in this
        # part); the expression substituted here calls fplibFixedToFP with
        # the immediate as its second argument, a literal true as its third
        # (presumably the "unsigned" selector -- confirm in fplib) and the
        # rounding mode taken from FPCRRounding(fpscr).
2918    ucvtfFixedCode = fpOp % ("fplibFixedToFP<Element>(srcElem1, imm, true,"
2919                             " FPCRRounding(fpscr), fpscr)")
2920    twoEqualRegInstX("ucvtf", "UcvtfFixedDX", "SimdCvtOp", smallFloatTypes, 2,
2921                     ucvtfFixedCode, hasImm=True)
2922    twoEqualRegInstX("ucvtf", "UcvtfFixedQX", "SimdCvtOp", floatTypes, 4,
2923                     ucvtfFixedCode, hasImm=True)
2924    twoEqualRegInstX("ucvtf", "UcvtfFixedScX", "SimdCvtOp", floatTypes, 4,
2925                     ucvtfFixedCode, hasImm=True, scalar=True)
2926    # UCVTF (integer)
        # Same call as the fixed-point form but with a constant 0 in place of
        # the immediate, and registered without hasImm.
2927    ucvtfIntCode = fpOp % ("fplibFixedToFP<Element>(srcElem1, 0, true,"
2928                           " FPCRRounding(fpscr), fpscr)")
2929    twoEqualRegInstX("ucvtf", "UcvtfIntDX", "SimdCvtOp", smallFloatTypes, 2,
2930                     ucvtfIntCode)
2931    twoEqualRegInstX("ucvtf", "UcvtfIntQX", "SimdCvtOp", floatTypes, 4,
2932                     ucvtfIntCode)
2933    twoEqualRegInstX("ucvtf", "UcvtfIntScX", "SimdCvtOp", floatTypes, 4,
2934                     ucvtfIntCode, scalar=True)
2935    # UHADD
        # Unsigned halving add/sub and min/max registrations.  haddCode,
        # hsubCode, maxCode, minCode, maxAcrossCode and minAcrossCode are
        # defined earlier in the file (not shown in this part) and are
        # instantiated here with unsigned element types.
2936    threeEqualRegInstX("uhadd", "UhaddDX", "SimdAddOp", smallUnsignedTypes, 2,
2937                       haddCode)
2938    threeEqualRegInstX("uhadd", "UhaddQX", "SimdAddOp", smallUnsignedTypes, 4,
2939                       haddCode)
2940    # UHSUB
2941    threeEqualRegInstX("uhsub", "UhsubDX", "SimdAddOp", smallUnsignedTypes, 2,
2942                       hsubCode)
2943    threeEqualRegInstX("uhsub", "UhsubQX", "SimdAddOp", smallUnsignedTypes, 4,
2944                       hsubCode)
2945    # UMAX
2946    threeEqualRegInstX("umax", "UmaxDX", "SimdCmpOp", smallUnsignedTypes, 2,
2947                       maxCode)
2948    threeEqualRegInstX("umax", "UmaxQX", "SimdCmpOp", smallUnsignedTypes, 4,
2949                       maxCode)
2950    # UMAXP
        # Same maxCode as UMAX, registered with pairwise=True.
2951    threeEqualRegInstX("umaxp", "UmaxpDX", "SimdCmpOp", smallUnsignedTypes, 2,
2952                       maxCode, pairwise=True)
2953    threeEqualRegInstX("umaxp", "UmaxpQX", "SimdCmpOp", smallUnsignedTypes, 4,
2954                       maxCode, pairwise=True)
2955    # UMAXV
        # The DX form is limited to 8- and 16-bit elements; the QX form takes
        # all of smallUnsignedTypes.
2956    twoRegAcrossInstX("umaxv", "UmaxvDX", "SimdCmpOp", ("uint8_t", "uint16_t"),
2957                      2, maxAcrossCode)
2958    twoRegAcrossInstX("umaxv", "UmaxvQX", "SimdCmpOp", smallUnsignedTypes, 4,
2959                      maxAcrossCode)
2960    # UMIN
2961    threeEqualRegInstX("umin", "UminDX", "SimdCmpOp", smallUnsignedTypes, 2,
2962                       minCode)
2963    threeEqualRegInstX("umin", "UminQX", "SimdCmpOp", smallUnsignedTypes, 4,
2964                       minCode)
2965    # UMINP
        # Same minCode as UMIN, registered with pairwise=True.
2966    threeEqualRegInstX("uminp", "UminpDX", "SimdCmpOp", smallUnsignedTypes, 2,
2967                       minCode, pairwise=True)
2968    threeEqualRegInstX("uminp", "UminpQX", "SimdCmpOp", smallUnsignedTypes, 4,
2969                       minCode, pairwise=True)
2970    # UMINV
        # The DX form is limited to 8- and 16-bit elements; the QX form takes
        # all of smallUnsignedTypes.
2971    twoRegAcrossInstX("uminv", "UminvDX", "SimdCmpOp", ("uint8_t", "uint16_t"),
2972                      2, minAcrossCode)
2973    twoRegAcrossInstX("uminv", "UminvQX", "SimdCmpOp", smallUnsignedTypes, 4,
2974                      minAcrossCode)
2975    # UMLAL (by element)
        # Unsigned widening multiply registrations reusing mlalCode, mlslCode
        # and mullCode, defined earlier in the file (not shown in this part).
        # The multiply-accumulate forms pass a positional True (presumably
        # the helper's "read destination" flag -- TODO confirm); by-element
        # forms add byElem=True and the second-half forms add hi=True.
        # NOTE(review): the hi=True classes (e.g. Umlal2X, UmullElem2X) are
        # registered under the plain mnemonic ("umlal", "umull"), unlike
        # e.g. "uabal2" / "uaddl2" elsewhere in this file -- confirm whether
        # the "2" suffix should appear in the mnemonic.
2976    threeRegLongInstX("umlal", "UmlalElemX", "SimdMultAccOp",
2977                      smallUnsignedTypes, mlalCode, True, byElem=True)
2978    threeRegLongInstX("umlal", "UmlalElem2X", "SimdMultAccOp",
2979                      smallUnsignedTypes, mlalCode, True, byElem=True, hi=True)
2980    # UMLAL (vector)
2981    threeRegLongInstX("umlal", "UmlalX", "SimdMultAccOp", smallUnsignedTypes,
2982                      mlalCode, True)
2983    threeRegLongInstX("umlal", "Umlal2X", "SimdMultAccOp", smallUnsignedTypes,
2984                      mlalCode, True, hi=True)
2985    # UMLSL (by element)
2986    threeRegLongInstX("umlsl", "UmlslElemX", "SimdMultAccOp",
2987                      smallUnsignedTypes, mlslCode, True, byElem=True)
2988    threeRegLongInstX("umlsl", "UmlslElem2X", "SimdMultAccOp",
2989                      smallUnsignedTypes, mlslCode, True, byElem=True, hi=True)
2990    # UMLSL (vector)
2991    threeRegLongInstX("umlsl", "UmlslX", "SimdMultAccOp", smallUnsignedTypes,
2992                      mlslCode, True)
2993    threeRegLongInstX("umlsl", "Umlsl2X", "SimdMultAccOp", smallUnsignedTypes,
2994                      mlslCode, True, hi=True)
2995    # UMOV
        # Two classes: UmovWX over smallUnsignedTypes with 'W', and UmovXX
        # over uint64_t only with 'X' (presumably the 32-bit vs 64-bit
        # general-purpose destination -- confirm in insToGprInstX).
2996    insToGprInstX("umov", "UmovWX", "SimdMiscOp", smallUnsignedTypes, 4, 'W')
2997    insToGprInstX("umov", "UmovXX", "SimdMiscOp", ("uint64_t",), 4, 'X')
2998    # UMULL, UMULL2 (by element)
2999    threeRegLongInstX("umull", "UmullElemX", "SimdMultOp", smallUnsignedTypes,
3000                      mullCode, byElem=True)
3001    threeRegLongInstX("umull", "UmullElem2X", "SimdMultOp", smallUnsignedTypes,
3002                      mullCode, byElem=True, hi=True)
3003    # UMULL, UMULL2 (vector)
3004    threeRegLongInstX("umull", "UmullX", "SimdMultOp", smallUnsignedTypes,
3005                      mullCode)
3006    threeRegLongInstX("umull", "Umull2X", "SimdMultOp", smallUnsignedTypes,
3007                      mullCode, hi=True)
3008    # UQADD
        # Unsigned saturating add: the sum is computed in Element, and a
        # result smaller than either operand is taken as wrap-around, in
        # which case destElem becomes (Element)(-1) and fpscr.qc is set.
        # The QC state is read from and written back to FpscrQc.
3009    uqaddCode = '''
3010            destElem = srcElem1 + srcElem2;
3011            FPSCR fpscr = (FPSCR) FpscrQc;
3012            if (destElem < srcElem1 || destElem < srcElem2) {
3013                destElem = (Element)(-1);
3014                fpscr.qc = 1;
3015            }
3016            FpscrQc = fpscr;
3017    '''
3018    threeEqualRegInstX("uqadd", "UqaddDX", "SimdAddOp", smallUnsignedTypes, 2,
3019                       uqaddCode)
3020    threeEqualRegInstX("uqadd", "UqaddQX", "SimdAddOp", unsignedTypes, 4,
3021                       uqaddCode)
3022    threeEqualRegInstX("uqadd", "UqaddScX", "SimdAddOp", unsignedTypes, 4,
3023                       uqaddCode, scalar=True)
3024    # UQRSHL
3025    uqrshlCode = '''
3026            int16_t shiftAmt = (int8_t)srcElem2;
3027            FPSCR fpscr = (FPSCR) FpscrQc;
3028            if (shiftAmt < 0) {
3029                shiftAmt = -shiftAmt;
3030                Element rBit = 0;
3031                if (shiftAmt <= sizeof(Element) * 8)
3032                    rBit = bits(srcElem1, shiftAmt - 1);
3033                if (shiftAmt >= sizeof(Element) * 8) {
3034                    shiftAmt = sizeof(Element) * 8 - 1;
3035                    destElem = 0;
3036                } else {
3037                    destElem = (srcElem1 >> shiftAmt);
3038                }
3039                destElem += rBit;
3040            } else {
3041                if (shiftAmt >= sizeof(Element) * 8) {
3042                    if (srcElem1 != 0) {
3043                        destElem = mask(sizeof(Element) * 8);
3044                        fpscr.qc = 1;
3045                    } else {
3046                        destElem = 0;
3047                    }
3048                } else {
3049                    if (bits(srcElem1, sizeof(Element) * 8 - 1,
3050                                sizeof(Element) * 8 - shiftAmt)) {
3051                        destElem = mask(sizeof(Element) * 8);
3052                        fpscr.qc = 1;
3053                    } else {
3054                        destElem = srcElem1 << shiftAmt;
3055                    }
3056                }
3057            }
3058            FpscrQc = fpscr;
3059    '''
3060    threeEqualRegInstX("uqrshl", "UqrshlDX", "SimdCmpOp", smallUnsignedTypes,
3061                       2, uqrshlCode)
3062    threeEqualRegInstX("uqrshl", "UqrshlQX", "SimdCmpOp", unsignedTypes, 4,
3063                       uqrshlCode)
3064    threeEqualRegInstX("uqrshl", "UqrshlScX", "SimdCmpOp", unsignedTypes, 4,
3065                       uqrshlCode, scalar=True)
3066    # UQRSHRN
3067    uqrshrnCode = '''
3068            FPSCR fpscr = (FPSCR) FpscrQc;
3069            if (imm > sizeof(srcElem1) * 8) {
3070                if (srcElem1 != 0)
3071                    fpscr.qc = 1;
3072                destElem = 0;
3073            } else if (imm) {
3074                BigElement mid = (srcElem1 >> (imm - 1));
3075                uint64_t rBit = mid & 0x1;
3076                mid >>= 1;
3077                mid += rBit;
3078                if (mid != (Element)mid) {
3079                    destElem = mask(sizeof(Element) * 8);
3080                    fpscr.qc = 1;
3081                } else {
3082                    destElem = mid;
3083                }
3084            } else {
3085                if (srcElem1 != (Element)srcElem1) {
3086                    destElem = mask(sizeof(Element) * 8 - 1);
3087                    fpscr.qc = 1;
3088                } else {
3089                    destElem = srcElem1;
3090                }
3091            }
3092            FpscrQc = fpscr;
3093    '''
3094    twoRegNarrowInstX("uqrshrn", "UqrshrnX", "SimdShiftOp", smallUnsignedTypes,
3095                      uqrshrnCode, hasImm=True)
3096    twoRegNarrowInstX("uqrshrn2", "Uqrshrn2X", "SimdShiftOp",
3097                      smallUnsignedTypes, uqrshrnCode, hasImm=True, hi=True)
3098    twoRegNarrowInstX("uqrshrn", "UqrshrnScX", "SimdShiftOp",
3099                      smallUnsignedTypes, uqrshrnCode, hasImm=True,
3100                      scalar=True)
3101    # UQSHL (immediate)
        # Unsigned saturating shift left by immediate.
        #   * imm at or above the element width: a non-zero source saturates
        #     to mask(width) and sets QC, a zero source gives 0;
        #   * non-zero imm: shift left, then saturate (and set QC) if any of
        #     the top imm bits of the source was set;
        #   * imm == 0: copy the source.
        # The QC state is read from and written back to FpscrQc.
3102    uqshlImmCode = '''
3103            FPSCR fpscr = (FPSCR) FpscrQc;
3104            if (imm >= sizeof(Element) * 8) {
3105                if (srcElem1 != 0) {
3106                    destElem = mask(sizeof(Element) * 8);
3107                    fpscr.qc = 1;
3108                } else {
3109                    destElem = 0;
3110                }
3111            } else if (imm) {
3112                destElem = (srcElem1 << imm);
3113                uint64_t topBits = bits((uint64_t)srcElem1,
3114                                        sizeof(Element) * 8 - 1,
3115                                        sizeof(Element) * 8 - imm);
3116                if (topBits != 0) {
3117                    destElem = mask(sizeof(Element) * 8);
3118                    fpscr.qc = 1;
3119                }
3120            } else {
3121                destElem = srcElem1;
3122            }
3123            FpscrQc = fpscr;
3124    '''
3125    twoEqualRegInstX("uqshl", "UqshlImmDX", "SimdAluOp", smallUnsignedTypes, 2,
3126                     uqshlImmCode, hasImm=True)
3127    twoEqualRegInstX("uqshl", "UqshlImmQX", "SimdAluOp", unsignedTypes, 4,
3128                     uqshlImmCode, hasImm=True)
3129    twoEqualRegInstX("uqshl", "UqshlImmScX", "SimdAluOp", unsignedTypes, 4,
3130                     uqshlImmCode, hasImm=True, scalar=True)
3131    # UQSHL (register)
        # Unsigned saturating shift by register, without rounding.  The shift
        # amount is the low byte of srcElem2 interpreted as signed:
        #   * negative: shift right by -shiftAmt (0 once the amount reaches
        #     the element width);
        #   * positive: shift left; if any bit that would be shifted out is
        #     set (or the amount reaches the element width with a non-zero
        #     source) the result saturates to mask(width) and QC is set;
        #   * zero: copy the source.
        # The QC state is read from and written back to FpscrQc.
3132    uqshlCode = '''
3133            int16_t shiftAmt = (int8_t)srcElem2;
3134            FPSCR fpscr = (FPSCR) FpscrQc;
3135            if (shiftAmt < 0) {
3136                shiftAmt = -shiftAmt;
3137                if (shiftAmt >= sizeof(Element) * 8) {
3138                    shiftAmt = sizeof(Element) * 8 - 1;
3139                    destElem = 0;
3140                } else {
3141                    destElem = (srcElem1 >> shiftAmt);
3142                }
3143            } else if (shiftAmt > 0) {
3144                if (shiftAmt >= sizeof(Element) * 8) {
3145                    if (srcElem1 != 0) {
3146                        destElem = mask(sizeof(Element) * 8);
3147                        fpscr.qc = 1;
3148                    } else {
3149                        destElem = 0;
3150                    }
3151                } else {
3152                    if (bits(srcElem1, sizeof(Element) * 8 - 1,
3153                                sizeof(Element) * 8 - shiftAmt)) {
3154                        destElem = mask(sizeof(Element) * 8);
3155                        fpscr.qc = 1;
3156                    } else {
3157                        destElem = srcElem1 << shiftAmt;
3158                    }
3159                }
3160            } else {
3161                destElem = srcElem1;
3162            }
3163            FpscrQc = fpscr;
3164    '''
3165    threeEqualRegInstX("uqshl", "UqshlDX", "SimdAluOp", smallUnsignedTypes, 2,
3166                       uqshlCode)
3167    threeEqualRegInstX("uqshl", "UqshlQX", "SimdAluOp", unsignedTypes, 4,
3168                       uqshlCode)
3169    threeEqualRegInstX("uqshl", "UqshlScX", "SimdAluOp", unsignedTypes, 4,
3170                       uqshlCode, scalar=True)
3171    # UQSHRN, UQSHRN2
3172    uqshrnCode = '''
3173            FPSCR fpscr = (FPSCR) FpscrQc;
3174            if (imm > sizeof(srcElem1) * 8) {
3175                if (srcElem1 != 0)
3176                    fpscr.qc = 1;
3177                destElem = 0;
3178            } else if (imm) {
3179                BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
3180                if (mid != (Element)mid) {
3181                    destElem = mask(sizeof(Element) * 8);
3182                    fpscr.qc = 1;
3183                } else {
3184                    destElem = mid;
3185                }
3186            } else {
3187                destElem = srcElem1;
3188            }
3189            FpscrQc = fpscr;
3190    '''
3191    twoRegNarrowInstX("uqshrn", "UqshrnX", "SimdShiftOp", smallUnsignedTypes,
3192                      uqshrnCode, hasImm=True)
3193    twoRegNarrowInstX("uqshrn2", "Uqshrn2X", "SimdShiftOp", smallUnsignedTypes,
3194                      uqshrnCode, hasImm=True, hi=True)
3195    twoRegNarrowInstX("uqshrn", "UqshrnScX", "SimdShiftOp", smallUnsignedTypes,
3196                      uqshrnCode, hasImm=True, scalar=True)
3197    # UQSUB
        # Unsigned saturating subtract: the difference is computed in
        # Element, and a result larger than the minuend is taken as
        # wrap-around, in which case destElem becomes 0 and fpscr.qc is set.
        # The QC state is read from and written back to FpscrQc.
3198    uqsubCode = '''
3199            destElem = srcElem1 - srcElem2;
3200            FPSCR fpscr = (FPSCR) FpscrQc;
3201            if (destElem > srcElem1) {
3202                destElem = 0;
3203                fpscr.qc = 1;
3204            }
3205            FpscrQc = fpscr;
3206    '''
3207    threeEqualRegInstX("uqsub", "UqsubDX", "SimdAddOp", smallUnsignedTypes, 2,
3208                       uqsubCode)
3209    threeEqualRegInstX("uqsub", "UqsubQX", "SimdAddOp", unsignedTypes, 4,
3210                       uqsubCode)
3211    threeEqualRegInstX("uqsub", "UqsubScX", "SimdAddOp", unsignedTypes, 4,
3212                       uqsubCode, scalar=True)
3213    # UQXTN
        # Unsigned saturating narrow: the source is assigned to the narrower
        # destElem and widened back to BigElement; if the round trip changes
        # the value, destElem saturates to mask(sizeof(Element) * 8) and
        # fpscr.qc is set.  All three registrations use the mnemonic "uqxtn";
        # the second passes hi=True and the third scalar=True.
3214    uqxtnCode = '''
3215            FPSCR fpscr = (FPSCR) FpscrQc;
3216            destElem = srcElem1;
3217            if ((BigElement)destElem != srcElem1) {
3218                fpscr.qc = 1;
3219                destElem = mask(sizeof(Element) * 8);
3220            }
3221            FpscrQc = fpscr;
3222    '''
3223    twoRegNarrowInstX("uqxtn", "UqxtnX", "SimdMiscOp", smallUnsignedTypes,
3224                      uqxtnCode)
3225    twoRegNarrowInstX("uqxtn", "Uqxtn2X", "SimdMiscOp", smallUnsignedTypes,
3226                      uqxtnCode, hi=True)
3227    twoRegNarrowInstX("uqxtn", "UqxtnScX", "SimdMiscOp", smallUnsignedTypes,
3228                      uqxtnCode, scalar=True)
3229    # URECPE
        # Delegates entirely to the C++ helper unsignedRecipEstimate();
        # registered for uint32_t elements only.
3230    urecpeCode = "destElem = unsignedRecipEstimate(srcElem1);"
3231    twoEqualRegInstX("urecpe", "UrecpeDX", "SimdMultAccOp", ("uint32_t",), 2,
3232                     urecpeCode)
3233    twoEqualRegInstX("urecpe", "UrecpeQX", "SimdMultAccOp", ("uint32_t",), 4,
3234                     urecpeCode)
3235    # URHADD
        # Reuses rhaddCode (defined earlier in the file) with unsigned types.
3236    threeEqualRegInstX("urhadd", "UrhaddDX", "SimdAddOp", smallUnsignedTypes,
3237                       2, rhaddCode)
3238    threeEqualRegInstX("urhadd", "UrhaddQX", "SimdAddOp", smallUnsignedTypes,
3239                       4, rhaddCode)
3240    # URSHL
        # Reuses rshlCode from the SRSHL section with unsigned types.
3241    threeEqualRegInstX("urshl", "UrshlDX", "SimdShiftOp", unsignedTypes, 2,
3242                       rshlCode)
3243    threeEqualRegInstX("urshl", "UrshlQX", "SimdShiftOp", unsignedTypes, 4,
3244                       rshlCode)
3245    # URSHR
        # Reuses rshrCode from the SRSHR section with unsigned types.
3246    twoEqualRegInstX("urshr", "UrshrDX", "SimdShiftOp", unsignedTypes, 2,
3247                     rshrCode, hasImm=True)
3248    twoEqualRegInstX("urshr", "UrshrQX", "SimdShiftOp", unsignedTypes, 4,
3249                     rshrCode, hasImm=True)
3250    # URSQRTE
        # Delegates entirely to the C++ helper unsignedRSqrtEstimate();
        # registered for uint32_t elements only.
3251    ursqrteCode = "destElem = unsignedRSqrtEstimate(srcElem1);"
3252    twoEqualRegInstX("ursqrte", "UrsqrteDX", "SimdSqrtOp", ("uint32_t",), 2,
3253                     ursqrteCode)
3254    twoEqualRegInstX("ursqrte", "UrsqrteQX", "SimdSqrtOp", ("uint32_t",), 4,
3255                     ursqrteCode)
3256    # URSRA
        # Reuses rsraCode from the SRSRA section with unsigned types.
3257    twoEqualRegInstX("ursra", "UrsraDX", "SimdShiftOp", unsignedTypes, 2,
3258                     rsraCode, True, hasImm=True)
3259    twoEqualRegInstX("ursra", "UrsraQX", "SimdShiftOp", unsignedTypes, 4,
3260                     rsraCode, True, hasImm=True)
3261    # USHL
        # Reuses shlCode from the SSHL section with unsigned types.
3262    threeEqualRegInstX("ushl", "UshlDX", "SimdShiftOp", unsignedTypes, 2,
3263                       shlCode)
3264    threeEqualRegInstX("ushl", "UshlQX", "SimdShiftOp", unsignedTypes, 4,
3265                       shlCode)
3266    # USHLL, USHLL2
        # Reuses shllCode from the SSHLL section with unsigned types; both
        # registrations use the mnemonic "ushll".
3267    twoRegLongInstX("ushll", "UshllX", "SimdShiftOp", smallUnsignedTypes,
3268                    shllCode, hasImm=True)
3269    twoRegLongInstX("ushll", "Ushll2X", "SimdShiftOp", smallUnsignedTypes,
3270                    shllCode, hi=True, hasImm=True)
3271    # USHR
        # Reuses shrCode from the SSHR section with unsigned types.
3272    twoEqualRegInstX("ushr", "UshrDX", "SimdShiftOp", unsignedTypes, 2,
3273                     shrCode, hasImm=True)
3274    twoEqualRegInstX("ushr", "UshrQX", "SimdShiftOp", unsignedTypes, 4,
3275                     shrCode, hasImm=True)
3276    # USQADD
        # Saturating accumulate where the source's top bit is treated as a
        # sign bit and the destination is compared as an unsigned Element.
        #   * source non-negative: a sum smaller than either operand is taken
        #     as wrap-around and saturates to (Element)(-1);
        #   * source negative: if its magnitude (two's complement negation)
        #     exceeds destElem the result saturates to 0.
        # Saturation sets fpscr.qc, which is written back through FpscrQc.
        # destElem is read, so the positional True is presumably the helper's
        # "read destination" flag -- TODO confirm.
3277    usqaddCode = '''
3278            FPSCR fpscr = (FPSCR) FpscrQc;
3279            Element tmp = destElem + srcElem1;
3280            if (bits(srcElem1, sizeof(Element) * 8 - 1) == 0) {
3281                if (tmp < srcElem1 || tmp < destElem) {
3282                    destElem = (Element)(-1);
3283                    fpscr.qc = 1;
3284                } else {
3285                    destElem = tmp;
3286                }
3287            } else {
3288                Element absSrcElem1 = (~srcElem1) + 1;
3289                if (absSrcElem1 > destElem) {
3290                    destElem = 0;
3291                    fpscr.qc = 1;
3292                } else {
3293                    destElem = tmp;
3294                }
3295            }
3296            FpscrQc = fpscr;
3297    '''
3298    twoEqualRegInstX("usqadd", "UsqaddDX", "SimdAddOp", smallUnsignedTypes, 2,
3299                     usqaddCode, True)
3300    twoEqualRegInstX("usqadd", "UsqaddQX", "SimdAddOp", unsignedTypes, 4,
3301                     usqaddCode, True)
3302    twoEqualRegInstX("usqadd", "UsqaddScX", "SimdAddOp", unsignedTypes, 4,
3303                     usqaddCode, True, scalar=True)
3304    # USRA
        # Reuses sraCode from the SSRA section with unsigned types.
3305    twoEqualRegInstX("usra", "UsraDX", "SimdShiftOp", unsignedTypes, 2,
3306                     sraCode, True, hasImm=True)
3307    twoEqualRegInstX("usra", "UsraQX", "SimdShiftOp", unsignedTypes, 4,
3308                     sraCode, True, hasImm=True)
3309    # USUBL
        # USUBL / USUBW reuse sublwCode from the SSUBL section with unsigned
        # types; the *2 variants pass hi=True.
3310    threeRegLongInstX("usubl", "UsublX", "SimdAddOp", smallUnsignedTypes,
3311                      sublwCode)
3312    threeRegLongInstX("usubl2", "Usubl2X", "SimdAddOp", smallUnsignedTypes,
3313                      sublwCode, hi=True)
3314    # USUBW
3315    threeRegWideInstX("usubw", "UsubwX", "SimdAddOp", smallUnsignedTypes,
3316                      sublwCode)
3317    threeRegWideInstX("usubw2", "Usubw2X", "SimdAddOp", smallUnsignedTypes,
3318                      sublwCode, hi=True)
3319    # UXTL -> alias to USHLL
3320    # UZP1
3321    uzpCode = '''
3322        unsigned part = %s;
3323        for (unsigned i = 0; i < eCount / 2; i++) {
3324            destReg.elements[i] = srcReg1.elements[2 * i + part];
3325            destReg.elements[eCount / 2 + i] = srcReg2.elements[2 * i + part];
3326        }
3327    '''
3328    threeRegScrambleInstX("Uzp1", "Uzp1DX", "SimdAluOp", smallUnsignedTypes, 2,
3329                          uzpCode % "0")
3330    threeRegScrambleInstX("Uzp1", "Uzp1QX", "SimdAluOp", unsignedTypes, 4,
3331                          uzpCode % "0")
3332    # UZP2
3333    threeRegScrambleInstX("Uzp2", "Uzp2DX", "SimdAluOp", smallUnsignedTypes, 2,
3334                          uzpCode % "1")
3335    threeRegScrambleInstX("Uzp2", "Uzp2QX", "SimdAluOp", unsignedTypes, 4,
3336                          uzpCode % "1")
3337    # XTN, XTN2
3338    xtnCode = "destElem = srcElem1;"
3339    twoRegNarrowInstX("Xtn", "XtnX", "SimdMiscOp", smallUnsignedTypes, xtnCode)
3340    twoRegNarrowInstX("Xtn", "Xtn2X", "SimdMiscOp", smallUnsignedTypes,
3341                      xtnCode, hi=True)
3342    # ZIP1
        # zipCode is a C++ template with one %s slot ("base"). For each i below
        # eCount / 2 it writes srcReg1.elements[base + i] to the even
        # destination slot 2 * i and srcReg2.elements[base + i] to the odd
        # slot 2 * i + 1. ZIP1 substitutes base = 0 (lower source halves) and
        # ZIP2 substitutes base = eCount / 2 (upper source halves).
3343    zipCode = '''
3344        unsigned base = %s;
3345        for (unsigned i = 0; i < eCount / 2; i++) {
3346            destReg.elements[2 * i] = srcReg1.elements[base + i];
3347            destReg.elements[2 * i + 1] = srcReg2.elements[base + i];
3348        }
3349    '''
3350    threeRegScrambleInstX("zip1", "Zip1DX", "SimdAluOp", smallUnsignedTypes, 2,
3351                          zipCode % "0")
3352    threeRegScrambleInstX("zip1", "Zip1QX", "SimdAluOp", unsignedTypes, 4,
3353                          zipCode % "0")
3354    # ZIP2
3355    threeRegScrambleInstX("zip2", "Zip2DX", "SimdAluOp", smallUnsignedTypes, 2,
3356                          zipCode % "eCount / 2")
3357    threeRegScrambleInstX("zip2", "Zip2QX", "SimdAluOp", unsignedTypes, 4,
3358                          zipCode % "eCount / 2")
3359
3360    for decoderFlavour, type_dict in decoders.iteritems():
3361        header_output += '''
3362        class %(decoder_flavour)sDecoder {
3363        public:
3364        ''' % { "decoder_flavour" : decoderFlavour }
3365        for type,name in type_dict.iteritems():
3366            header_output += '''
3367            template<typename Elem> using %(type)s = %(new_name)s<Elem>;''' % {
3368               "type" : type, "new_name" : name
3369            }
3370        header_output += '''
3371        };'''
3372}};
3373