neon64.isa revision 10474:799c8ee4ecba
1// -*- mode: c++ -*-
2
3// Copyright (c) 2012-2013 ARM Limited
4// All rights reserved
5//
6// The license below extends only to copyright in the software and shall
7// not be construed as granting a license to any other intellectual
8// property including but not limited to intellectual property relating
9// to a hardware implementation of the functionality of the software
10// licensed hereunder.  You may use the software subject to the license
11// terms below provided that you ensure that this notice is replicated
12// unmodified and in its entirety in all distributions of the software,
13// modified or unmodified, in source code or in binary form.
14//
15// Redistribution and use in source and binary forms, with or without
16// modification, are permitted provided that the following conditions are
17// met: redistributions of source code must retain the above copyright
18// notice, this list of conditions and the following disclaimer;
19// redistributions in binary form must reproduce the above copyright
20// notice, this list of conditions and the following disclaimer in the
21// documentation and/or other materials provided with the distribution;
22// neither the name of the copyright holders nor the names of its
23// contributors may be used to endorse or promote products derived from
24// this software without specific prior written permission.
25//
26// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
30// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
31// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
32// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
33// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
34// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
35// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
36// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37//
38// Authors: Giacomo Gabrielli
39//          Mbou Eyole
40
41let {{
42
43    header_output = ""
44    exec_output = ""
45
46    # FP types (FP operations always work with unsigned representations)
47    floatTypes = ("uint32_t", "uint64_t")
48    smallFloatTypes = ("uint32_t",)
49
50    def threeEqualRegInstX(name, Name, opClass, types, rCount, op,
51                           readDest=False, pairwise=False, scalar=False,
52                           byElem=False):
53        assert (not pairwise) or ((not byElem) and (not scalar))
54        global header_output, exec_output
55        eWalkCode = simd64EnabledCheckCode + '''
56        RegVect srcReg1, destReg;
57        '''
58        if byElem:
59            # 2nd register operand has to be read fully
60            eWalkCode += '''
61        FullRegVect srcReg2;
62        '''
63        else:
64            eWalkCode += '''
65        RegVect srcReg2;
66        '''
67        for reg in range(rCount):
68            eWalkCode += '''
69        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
70        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
71        ''' % { "reg" : reg }
72            if readDest:
73                eWalkCode += '''
74        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
75        ''' % { "reg" : reg }
76        if byElem:
77            # 2nd operand has to be read fully
78            for reg in range(rCount, 4):
79                eWalkCode += '''
80        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
81        ''' % { "reg" : reg }
82        readDestCode = ''
83        if readDest:
84            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
85        if pairwise:
86            eWalkCode += '''
87        for (unsigned i = 0; i < eCount; i++) {
88            Element srcElem1 = gtoh(2 * i < eCount ?
89                                    srcReg1.elements[2 * i] :
90                                    srcReg2.elements[2 * i - eCount]);
91            Element srcElem2 = gtoh(2 * i < eCount ?
92                                    srcReg1.elements[2 * i + 1] :
93                                    srcReg2.elements[2 * i + 1 - eCount]);
94            Element destElem;
95            %(readDest)s
96            %(op)s
97            destReg.elements[i] = htog(destElem);
98        }
99        ''' % { "op" : op, "readDest" : readDestCode }
100        else:
101            scalarCheck = '''
102            if (i != 0) {
103                destReg.elements[i] = 0;
104                continue;
105            }
106            '''
107            eWalkCode += '''
108        for (unsigned i = 0; i < eCount; i++) {
109            %(scalarCheck)s
110            Element srcElem1 = gtoh(srcReg1.elements[i]);
111            Element srcElem2 = gtoh(srcReg2.elements[%(src2Index)s]);
112            Element destElem;
113            %(readDest)s
114            %(op)s
115            destReg.elements[i] = htog(destElem);
116        }
117        ''' % { "op" : op, "readDest" : readDestCode,
118                "scalarCheck" : scalarCheck if scalar else "",
119                "src2Index" : "imm" if byElem else "i" }
120        for reg in range(rCount):
121            eWalkCode += '''
122        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
123        ''' % { "reg" : reg }
124        if rCount < 4:  # zero upper half
125            for reg in range(rCount, 4):
126                eWalkCode += '''
127        AA64FpDestP%(reg)d_uw = 0;
128        ''' % { "reg" : reg }
129        iop = InstObjParams(name, Name,
130                            "DataX2RegImmOp" if byElem else "DataX2RegOp",
131                            { "code": eWalkCode,
132                              "r_count": rCount,
133                              "op_class": opClass }, [])
134        if byElem:
135            header_output += NeonX2RegImmOpDeclare.subst(iop)
136        else:
137            header_output += NeonX2RegOpDeclare.subst(iop)
138        exec_output += NeonXEqualRegOpExecute.subst(iop)
139        for type in types:
140            substDict = { "targs" : type,
141                          "class_name" : Name }
142            exec_output += NeonXExecDeclare.subst(substDict)
143
144    def threeUnequalRegInstX(name, Name, opClass, types, op,
145                             bigSrc1, bigSrc2, bigDest, readDest, scalar=False,
146                             byElem=False, hi=False):
147        assert not (scalar and hi)
148        global header_output, exec_output
149        src1Cnt = src2Cnt = destCnt = 2
150        src1Prefix = src2Prefix = destPrefix = ''
151        if bigSrc1:
152            src1Cnt = 4
153            src1Prefix = 'Big'
154        if bigSrc2:
155            src2Cnt = 4
156            src2Prefix = 'Big'
157        if bigDest:
158            destCnt = 4
159            destPrefix = 'Big'
160        if byElem:
161            src2Prefix = 'Full'
162        eWalkCode = simd64EnabledCheckCode + '''
163        %sRegVect srcReg1;
164        %sRegVect srcReg2;
165        %sRegVect destReg;
166        ''' % (src1Prefix, src2Prefix, destPrefix)
167        srcReg1 = 0
168        if hi and not bigSrc1:  # long/widening operations
169            srcReg1 = 2
170        for reg in range(src1Cnt):
171            eWalkCode += '''
172        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(srcReg1)d_uw);
173        ''' % { "reg" : reg, "srcReg1" : srcReg1 }
174            srcReg1 += 1
175        srcReg2 = 0
176        if (not byElem) and (hi and not bigSrc2):  # long/widening operations
177            srcReg2 = 2
178        for reg in range(src2Cnt):
179            eWalkCode += '''
180        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(srcReg2)d_uw);
181        ''' % { "reg" : reg, "srcReg2" : srcReg2 }
182            srcReg2 += 1
183        if byElem:
184            # 2nd operand has to be read fully
185            for reg in range(src2Cnt, 4):
186                eWalkCode += '''
187        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
188        ''' % { "reg" : reg }
189        if readDest:
190            for reg in range(destCnt):
191                eWalkCode += '''
192        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
193        ''' % { "reg" : reg }
194        readDestCode = ''
195        if readDest:
196            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
197        scalarCheck = '''
198            if (i != 0) {
199                destReg.elements[i] = 0;
200                continue;
201            }
202            '''
203        eWalkCode += '''
204        for (unsigned i = 0; i < eCount; i++) {
205            %(scalarCheck)s
206            %(src1Prefix)sElement srcElem1 = gtoh(srcReg1.elements[i]);
207            %(src1Prefix)sElement srcElem2 = gtoh(srcReg2.elements[%(src2Index)s]);
208            %(destPrefix)sElement destElem;
209            %(readDest)s
210            %(op)s
211            destReg.elements[i] = htog(destElem);
212        }
213        ''' % { "op" : op, "readDest" : readDestCode,
214                "src1Prefix" : src1Prefix, "src2Prefix" : src2Prefix,
215                "destPrefix" : destPrefix,
216                "scalarCheck" : scalarCheck if scalar else "",
217                "src2Index" : "imm" if byElem else "i" }
218        destReg = 0
219        if hi and not bigDest:
220            # narrowing operations
221            destReg = 2
222        for reg in range(destCnt):
223            eWalkCode += '''
224        AA64FpDestP%(destReg)d_uw = gtoh(destReg.regs[%(reg)d]);
225        ''' % { "reg" : reg, "destReg": destReg }
226            destReg += 1
227        if destCnt < 4 and not hi:  # zero upper half
228            for reg in range(destCnt, 4):
229                eWalkCode += '''
230        AA64FpDestP%(reg)d_uw = 0;
231        ''' % { "reg" : reg }
232        iop = InstObjParams(name, Name,
233                            "DataX2RegImmOp" if byElem else "DataX2RegOp",
234                            { "code": eWalkCode,
235                              "r_count": 2,
236                              "op_class": opClass }, [])
237        if byElem:
238            header_output += NeonX2RegImmOpDeclare.subst(iop)
239        else:
240            header_output += NeonX2RegOpDeclare.subst(iop)
241        exec_output += NeonXUnequalRegOpExecute.subst(iop)
242        for type in types:
243            substDict = { "targs" : type,
244                          "class_name" : Name }
245            exec_output += NeonXExecDeclare.subst(substDict)
246
247    def threeRegNarrowInstX(name, Name, opClass, types, op, readDest=False,
248                            scalar=False, byElem=False, hi=False):
249        assert not byElem
250        threeUnequalRegInstX(name, Name, opClass, types, op,
251                             True, True, False, readDest, scalar, byElem, hi)
252
253    def threeRegLongInstX(name, Name, opClass, types, op, readDest=False,
254                          scalar=False, byElem=False, hi=False):
255        threeUnequalRegInstX(name, Name, opClass, types, op,
256                             False, False, True, readDest, scalar, byElem, hi)
257
258    def threeRegWideInstX(name, Name, opClass, types, op, readDest=False,
259                          scalar=False, byElem=False, hi=False):
260        assert not byElem
261        threeUnequalRegInstX(name, Name, opClass, types, op,
262                             True, False, True, readDest, scalar, byElem, hi)
263
264    def twoEqualRegInstX(name, Name, opClass, types, rCount, op,
265                         readDest=False, scalar=False, byElem=False,
266                         hasImm=False, isDup=False):
267        global header_output, exec_output
268        assert (not isDup) or byElem
269        if byElem:
270            hasImm = True
271        if isDup:
272            eWalkCode = simd64EnabledCheckCode + '''
273        FullRegVect srcReg1;
274        RegVect destReg;
275        '''
276        else:
277            eWalkCode = simd64EnabledCheckCode + '''
278        RegVect srcReg1, destReg;
279        '''
280        for reg in range(4 if isDup else rCount):
281            eWalkCode += '''
282        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
283        ''' % { "reg" : reg }
284            if readDest:
285                eWalkCode += '''
286        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
287        ''' % { "reg" : reg }
288        readDestCode = ''
289        if readDest:
290            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
291        scalarCheck = '''
292            if (i != 0) {
293                destReg.elements[i] = 0;
294                continue;
295            }
296            '''
297        eWalkCode += '''
298        for (unsigned i = 0; i < eCount; i++) {
299            %(scalarCheck)s
300            unsigned j = i;
301            Element srcElem1 = gtoh(srcReg1.elements[%(src1Index)s]);
302            Element destElem;
303            %(readDest)s
304            %(op)s
305            destReg.elements[j] = htog(destElem);
306        }
307        ''' % { "op" : op, "readDest" : readDestCode,
308                "scalarCheck" : scalarCheck if scalar else "",
309                "src1Index" : "imm" if byElem else "i" }
310        for reg in range(rCount):
311            eWalkCode += '''
312        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
313        ''' % { "reg" : reg }
314        if rCount < 4:  # zero upper half
315            for reg in range(rCount, 4):
316                eWalkCode += '''
317        AA64FpDestP%(reg)d_uw = 0;
318        ''' % { "reg" : reg }
319        iop = InstObjParams(name, Name,
320                            "DataX1RegImmOp" if hasImm else "DataX1RegOp",
321                            { "code": eWalkCode,
322                              "r_count": rCount,
323                              "op_class": opClass }, [])
324        if hasImm:
325            header_output += NeonX1RegImmOpDeclare.subst(iop)
326        else:
327            header_output += NeonX1RegOpDeclare.subst(iop)
328        exec_output += NeonXEqualRegOpExecute.subst(iop)
329        for type in types:
330            substDict = { "targs" : type,
331                          "class_name" : Name }
332            exec_output += NeonXExecDeclare.subst(substDict)
333
334    def twoRegLongInstX(name, Name, opClass, types, op, readDest=False,
335                        hi=False, hasImm=False):
336        global header_output, exec_output
337        eWalkCode = simd64EnabledCheckCode + '''
338        RegVect srcReg1;
339        BigRegVect destReg;
340        '''
341        destReg = 0 if not hi else 2
342        for reg in range(2):
343            eWalkCode += '''
344        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(destReg)d_uw);
345        ''' % { "reg" : reg, "destReg": destReg }
346            destReg += 1
347        destReg = 0 if not hi else 2
348        if readDest:
349            for reg in range(4):
350                eWalkCode += '''
351        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
352        ''' % { "reg" : reg }
353                destReg += 1
354        readDestCode = ''
355        if readDest:
356            readDestCode = 'destReg = gtoh(destReg.elements[i]);'
357        eWalkCode += '''
358        for (unsigned i = 0; i < eCount; i++) {
359            Element srcElem1 = gtoh(srcReg1.elements[i]);
360            BigElement destElem;
361            %(readDest)s
362            %(op)s
363            destReg.elements[i] = htog(destElem);
364        }
365        ''' % { "op" : op, "readDest" : readDestCode }
366        for reg in range(4):
367            eWalkCode += '''
368        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
369        ''' % { "reg" : reg }
370        iop = InstObjParams(name, Name,
371                            "DataX1RegImmOp" if hasImm else "DataX1RegOp",
372                            { "code": eWalkCode,
373                              "r_count": 2,
374                              "op_class": opClass }, [])
375        if hasImm:
376            header_output += NeonX1RegImmOpDeclare.subst(iop)
377        else:
378            header_output += NeonX1RegOpDeclare.subst(iop)
379        exec_output += NeonXUnequalRegOpExecute.subst(iop)
380        for type in types:
381            substDict = { "targs" : type,
382                          "class_name" : Name }
383            exec_output += NeonXExecDeclare.subst(substDict)
384
385    def twoRegNarrowInstX(name, Name, opClass, types, op, readDest=False,
386                          scalar=False, hi=False, hasImm=False):
387        global header_output, exec_output
388        eWalkCode = simd64EnabledCheckCode + '''
389        BigRegVect srcReg1;
390        RegVect destReg;
391        '''
392        for reg in range(4):
393            eWalkCode += '''
394        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
395        ''' % { "reg" : reg }
396        if readDest:
397            for reg in range(2):
398                eWalkCode += '''
399        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
400        ''' % { "reg" : reg }
401        else:
402            eWalkCode += '''
403        destReg.elements[0] = 0;
404        ''' % { "reg" : reg }
405        readDestCode = ''
406        if readDest:
407            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
408        scalarCheck = '''
409            if (i != 0) {
410                destReg.elements[i] = 0;
411                continue;
412            }
413            '''
414        eWalkCode += '''
415        for (unsigned i = 0; i < eCount; i++) {
416            %(scalarCheck)s
417            BigElement srcElem1 = gtoh(srcReg1.elements[i]);
418            Element destElem;
419            %(readDest)s
420            %(op)s
421            destReg.elements[i] = htog(destElem);
422        }
423        ''' % { "op" : op, "readDest" : readDestCode,
424                "scalarCheck" : scalarCheck if scalar else "" }
425        destReg = 0 if not hi else 2
426        for reg in range(2):
427            eWalkCode += '''
428        AA64FpDestP%(destReg)d_uw = gtoh(destReg.regs[%(reg)d]);
429        ''' % { "reg" : reg, "destReg": destReg }
430            destReg += 1
431        if not hi:
432            for reg in range(2, 4):  # zero upper half
433                eWalkCode += '''
434        AA64FpDestP%(reg)d_uw = 0;
435        ''' % { "reg" : reg }
436        iop = InstObjParams(name, Name,
437                            "DataX1RegImmOp" if hasImm else "DataX1RegOp",
438                            { "code": eWalkCode,
439                              "r_count": 2,
440                              "op_class": opClass }, [])
441        if hasImm:
442            header_output += NeonX1RegImmOpDeclare.subst(iop)
443        else:
444            header_output += NeonX1RegOpDeclare.subst(iop)
445        exec_output += NeonXUnequalRegOpExecute.subst(iop)
446        for type in types:
447            substDict = { "targs" : type,
448                          "class_name" : Name }
449            exec_output += NeonXExecDeclare.subst(substDict)
450
451    def threeRegScrambleInstX(name, Name, opClass, types, rCount, op):
452        global header_output, exec_output
453        eWalkCode = simd64EnabledCheckCode + '''
454        RegVect srcReg1, srcReg2, destReg;
455        '''
456        for reg in range(rCount):
457            eWalkCode += '''
458        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
459        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
460        ''' % { "reg" : reg }
461        eWalkCode += op
462        for reg in range(rCount):
463            eWalkCode += '''
464        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
465        ''' % { "reg" : reg }
466        if rCount < 4:
467            for reg in range(rCount, 4):
468                eWalkCode += '''
469        AA64FpDestP%(reg)d_uw = 0;
470        ''' % { "reg" : reg }
471        iop = InstObjParams(name, Name,
472                            "DataX2RegOp",
473                            { "code": eWalkCode,
474                              "r_count": rCount,
475                              "op_class": opClass }, [])
476        header_output += NeonX2RegOpDeclare.subst(iop)
477        exec_output += NeonXEqualRegOpExecute.subst(iop)
478        for type in types:
479            substDict = { "targs" : type,
480                          "class_name" : Name }
481            exec_output += NeonXExecDeclare.subst(substDict)
482
483    def insFromVecElemInstX(name, Name, opClass, types, rCount):
484        global header_output, exec_output
485        eWalkCode = simd64EnabledCheckCode + '''
486        FullRegVect srcReg1;
487        RegVect destReg;
488        '''
489        for reg in range(4):
490            eWalkCode += '''
491        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
492        ''' % { "reg" : reg }
493        for reg in range(rCount):
494            eWalkCode += '''
495        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
496        ''' % { "reg" : reg }
497        eWalkCode += '''
498        Element srcElem1 = gtoh(srcReg1.elements[imm2]);
499        Element destElem = srcElem1;
500        destReg.elements[imm1] = htog(destElem);
501        '''
502        for reg in range(rCount):
503            eWalkCode += '''
504        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
505        ''' % { "reg" : reg }
506        iop = InstObjParams(name, Name,
507                            "DataX1Reg2ImmOp",
508                            { "code": eWalkCode,
509                              "r_count": rCount,
510                              "op_class": opClass }, [])
511        header_output += NeonX1Reg2ImmOpDeclare.subst(iop)
512        exec_output += NeonXEqualRegOpExecute.subst(iop)
513        for type in types:
514            substDict = { "targs" : type,
515                          "class_name" : Name }
516            exec_output += NeonXExecDeclare.subst(substDict)
517
518    def twoRegPairwiseScInstX(name, Name, opClass, types, rCount, op):
519        global header_output, exec_output
520        eWalkCode = simd64EnabledCheckCode + '''
521        RegVect srcReg1, destReg;
522        '''
523        for reg in range(rCount):
524            eWalkCode += '''
525        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
526        ''' % { "reg" : reg }
527        eWalkCode += '''
528        Element srcElem1 = gtoh(srcReg1.elements[0]);
529        Element srcElem2 = gtoh(srcReg1.elements[1]);
530        Element destElem;
531        %(op)s
532        destReg.elements[0] = htog(destElem);
533        ''' % { "op" : op }
534        destCnt = rCount / 2
535        for reg in range(destCnt):
536            eWalkCode += '''
537        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
538        ''' % { "reg" : reg }
539        for reg in range(destCnt, 4):  # zero upper half
540            eWalkCode += '''
541        AA64FpDestP%(reg)d_uw = 0;
542        ''' % { "reg" : reg }
543        iop = InstObjParams(name, Name,
544                            "DataX1RegOp",
545                            { "code": eWalkCode,
546                              "r_count": rCount,
547                              "op_class": opClass }, [])
548        header_output += NeonX1RegOpDeclare.subst(iop)
549        exec_output += NeonXEqualRegOpExecute.subst(iop)
550        for type in types:
551            substDict = { "targs" : type,
552                          "class_name" : Name }
553            exec_output += NeonXExecDeclare.subst(substDict)
554
555    def twoRegAcrossInstX(name, Name, opClass, types, rCount, op,
556                          doubleDest=False, long=False):
557        global header_output, exec_output
558        destPrefix = "Big" if long else ""
559        eWalkCode = simd64EnabledCheckCode + '''
560        RegVect srcReg1;
561        %sRegVect destReg;
562        ''' % destPrefix
563        for reg in range(rCount):
564            eWalkCode += '''
565        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
566        ''' % { "reg" : reg }
567        eWalkCode += '''
568        destReg.regs[0] = 0;
569        %(destPrefix)sElement destElem = 0;
570        for (unsigned i = 0; i < eCount; i++) {
571            Element srcElem1 = gtoh(srcReg1.elements[i]);
572            if (i == 0) {
573                destElem = srcElem1;
574            } else {
575                %(op)s
576            }
577        }
578        destReg.elements[0] = htog(destElem);
579        ''' % { "op" : op, "destPrefix" : destPrefix }
580        destCnt = 2 if doubleDest else 1
581        for reg in range(destCnt):
582            eWalkCode += '''
583        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
584        ''' % { "reg" : reg }
585        for reg in range(destCnt, 4):  # zero upper half
586            eWalkCode += '''
587        AA64FpDestP%(reg)d_uw = 0;
588        ''' % { "reg" : reg }
589        iop = InstObjParams(name, Name,
590                            "DataX1RegOp",
591                            { "code": eWalkCode,
592                              "r_count": rCount,
593                              "op_class": opClass }, [])
594        header_output += NeonX1RegOpDeclare.subst(iop)
595        if long:
596            exec_output += NeonXUnequalRegOpExecute.subst(iop)
597        else:
598            exec_output += NeonXEqualRegOpExecute.subst(iop)
599        for type in types:
600            substDict = { "targs" : type,
601                          "class_name" : Name }
602            exec_output += NeonXExecDeclare.subst(substDict)
603
604    def twoRegCondenseInstX(name, Name, opClass, types, rCount, op,
605                            readDest=False):
606        global header_output, exec_output
607        eWalkCode = simd64EnabledCheckCode + '''
608        RegVect srcRegs;
609        BigRegVect destReg;
610        '''
611        for reg in range(rCount):
612            eWalkCode += '''
613        srcRegs.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
614        ''' % { "reg" : reg }
615            if readDest:
616                eWalkCode += '''
617        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
618        ''' % { "reg" : reg }
619        readDestCode = ''
620        if readDest:
621            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
622        eWalkCode += '''
623        for (unsigned i = 0; i < eCount / 2; i++) {
624            Element srcElem1 = gtoh(srcRegs.elements[2 * i]);
625            Element srcElem2 = gtoh(srcRegs.elements[2 * i + 1]);
626            BigElement destElem;
627            %(readDest)s
628            %(op)s
629            destReg.elements[i] = htog(destElem);
630        }
631        ''' % { "op" : op, "readDest" : readDestCode }
632        for reg in range(rCount):
633            eWalkCode += '''
634        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
635        ''' % { "reg" : reg }
636        if rCount < 4:  # zero upper half
637            for reg in range(rCount, 4):
638                eWalkCode += '''
639        AA64FpDestP%(reg)d_uw = 0;
640        ''' % { "reg" : reg }
641        iop = InstObjParams(name, Name,
642                            "DataX1RegOp",
643                            { "code": eWalkCode,
644                              "r_count": rCount,
645                              "op_class": opClass }, [])
646        header_output += NeonX1RegOpDeclare.subst(iop)
647        exec_output += NeonXUnequalRegOpExecute.subst(iop)
648        for type in types:
649            substDict = { "targs" : type,
650                          "class_name" : Name }
651            exec_output += NeonXExecDeclare.subst(substDict)
652
653    def oneRegImmInstX(name, Name, opClass, types, rCount, op, readDest=False):
654        global header_output, exec_output
655        eWalkCode = simd64EnabledCheckCode + '''
656        RegVect destReg;
657        '''
658        if readDest:
659            for reg in range(rCount):
660                eWalkCode += '''
661        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
662        ''' % { "reg" : reg }
663        readDestCode = ''
664        if readDest:
665            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
666        eWalkCode += '''
667        for (unsigned i = 0; i < eCount; i++) {
668            Element destElem;
669            %(readDest)s
670            %(op)s
671            destReg.elements[i] = htog(destElem);
672        }
673        ''' % { "op" : op, "readDest" : readDestCode }
674        for reg in range(rCount):
675            eWalkCode += '''
676        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
677        ''' % { "reg" : reg }
678        if rCount < 4:  # zero upper half
679            for reg in range(rCount, 4):
680                eWalkCode += '''
681        AA64FpDestP%(reg)d_uw = 0;
682        ''' % { "reg" : reg }
683        iop = InstObjParams(name, Name,
684                            "DataXImmOnlyOp",
685                            { "code": eWalkCode,
686                              "r_count": rCount,
687                              "op_class": opClass }, [])
688        header_output += NeonX1RegImmOnlyOpDeclare.subst(iop)
689        exec_output += NeonXEqualRegOpExecute.subst(iop)
690        for type in types:
691            substDict = { "targs" : type,
692                          "class_name" : Name }
693            exec_output += NeonXExecDeclare.subst(substDict)
694
695    def dupGprInstX(name, Name, opClass, types, rCount, gprSpec):
696        global header_output, exec_output
697        eWalkCode = simd64EnabledCheckCode + '''
698        RegVect destReg;
699        for (unsigned i = 0; i < eCount; i++) {
700            destReg.elements[i] = htog((Element) %sOp1);
701        }
702        ''' % gprSpec
703        for reg in range(rCount):
704            eWalkCode += '''
705        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
706        ''' % { "reg" : reg }
707        if rCount < 4:  # zero upper half
708            for reg in range(rCount, 4):
709                eWalkCode += '''
710        AA64FpDestP%(reg)d_uw = 0;
711        ''' % { "reg" : reg }
712        iop = InstObjParams(name, Name,
713                            "DataX1RegOp",
714                            { "code": eWalkCode,
715                              "r_count": rCount,
716                              "op_class": opClass }, [])
717        header_output += NeonX1RegOpDeclare.subst(iop)
718        exec_output += NeonXEqualRegOpExecute.subst(iop)
719        for type in types:
720            substDict = { "targs" : type,
721                          "class_name" : Name }
722            exec_output += NeonXExecDeclare.subst(substDict)
723
724    def extInstX(name, Name, opClass, types, rCount, op):
725        global header_output, exec_output
726        eWalkCode = simd64EnabledCheckCode + '''
727        RegVect srcReg1, srcReg2, destReg;
728        '''
729        for reg in range(rCount):
730            eWalkCode += '''
731        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
732        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
733        ''' % { "reg" : reg }
734        eWalkCode += op
735        for reg in range(rCount):
736            eWalkCode += '''
737        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
738        ''' % { "reg" : reg }
739        if rCount < 4:  # zero upper half
740            for reg in range(rCount, 4):
741                eWalkCode += '''
742        AA64FpDestP%(reg)d_uw = 0;
743        ''' % { "reg" : reg }
744        iop = InstObjParams(name, Name,
745                            "DataX2RegImmOp",
746                            { "code": eWalkCode,
747                              "r_count": rCount,
748                              "op_class": opClass }, [])
749        header_output += NeonX2RegImmOpDeclare.subst(iop)
750        exec_output += NeonXEqualRegOpExecute.subst(iop)
751        for type in types:
752            substDict = { "targs" : type,
753                          "class_name" : Name }
754            exec_output += NeonXExecDeclare.subst(substDict)
755
756    def insFromGprInstX(name, Name, opClass, types, rCount, gprSpec):
757        global header_output, exec_output
758        eWalkCode = simd64EnabledCheckCode + '''
759        RegVect destReg;
760        '''
761        for reg in range(rCount):
762            eWalkCode += '''
763        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
764        ''' % { "reg" : reg }
765        eWalkCode += '''
766        destReg.elements[imm] = htog((Element) %sOp1);
767        ''' % gprSpec
768        for reg in range(rCount):
769            eWalkCode += '''
770        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
771        ''' % { "reg" : reg }
772        iop = InstObjParams(name, Name,
773                            "DataX1RegImmOp",
774                            { "code": eWalkCode,
775                              "r_count": rCount,
776                              "op_class": opClass }, [])
777        header_output += NeonX1RegImmOpDeclare.subst(iop)
778        exec_output += NeonXEqualRegOpExecute.subst(iop)
779        for type in types:
780            substDict = { "targs" : type,
781                          "class_name" : Name }
782            exec_output += NeonXExecDeclare.subst(substDict)
783
784    def insToGprInstX(name, Name, opClass, types, rCount, gprSpec,
785                      signExt=False):
786        global header_output, exec_output
787        eWalkCode = simd64EnabledCheckCode + '''
788        FullRegVect srcReg;
789        '''
790        for reg in range(4):
791            eWalkCode += '''
792        srcReg.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
793        ''' % { "reg" : reg }
794        if signExt:
795            eWalkCode += '''
796        %sDest = sext<sizeof(Element) * 8>(srcReg.elements[imm]);
797        ''' % gprSpec
798        else:
799            eWalkCode += '''
800        %sDest = srcReg.elements[imm];
801        ''' % gprSpec
802        iop = InstObjParams(name, Name,
803                            "DataX1RegImmOp",
804                            { "code": eWalkCode,
805                              "r_count": rCount,
806                              "op_class": opClass }, [])
807        header_output += NeonX1RegImmOpDeclare.subst(iop)
808        exec_output += NeonXEqualRegOpExecute.subst(iop)
809        for type in types:
810            substDict = { "targs" : type,
811                          "class_name" : Name }
812            exec_output += NeonXExecDeclare.subst(substDict)
813
814    def tbxTblInstX(name, Name, opClass, types, length, isTbl, rCount):
815        global header_output, decoder_output, exec_output
816        code = simd64EnabledCheckCode + '''
817        union
818        {
819            uint8_t bytes[64];
820            FloatRegBits regs[16];
821        } table;
822
823        union
824        {
825            uint8_t bytes[%(rCount)d * 4];
826            FloatRegBits regs[%(rCount)d];
827        } destReg, srcReg2;
828
829        const unsigned length = %(length)d;
830        const bool isTbl = %(isTbl)s;
831        ''' % { "rCount" : rCount, "length" : length, "isTbl" : isTbl }
832        for reg in range(rCount):
833            code += '''
834        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
835        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
836        ''' % { "reg" : reg }
837        for reg in range(16):
838            if reg < length * 4:
839                code += '''
840        table.regs[%(reg)d] = htog(AA64FpOp1P%(p)dV%(v)dS_uw);
841        ''' % { "reg" : reg, "p" : reg % 4, "v" : reg / 4 }
842            else:
843                code += '''
844        table.regs[%(reg)d] = 0;
845        ''' % { "reg" : reg }
846        code += '''
847        for (unsigned i = 0; i < sizeof(destReg); i++) {
848            uint8_t index = srcReg2.bytes[i];
849            if (index < 16 * length) {
850                destReg.bytes[i] = table.bytes[index];
851            } else {
852                if (isTbl)
853                    destReg.bytes[i] = 0;
854                // else destReg.bytes[i] unchanged
855            }
856        }
857        '''
858        for reg in range(rCount):
859            code += '''
860        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
861        ''' % { "reg" : reg }
862        if rCount < 4:  # zero upper half
863            for reg in range(rCount, 4):
864                code += '''
865        AA64FpDestP%(reg)d_uw = 0;
866        ''' % { "reg" : reg }
867        iop = InstObjParams(name, Name,
868                            "DataX2RegOp",
869                            { "code": code,
870                              "r_count": rCount,
871                              "op_class": opClass }, [])
872        header_output += NeonX2RegOpDeclare.subst(iop)
873        exec_output += NeonXEqualRegOpExecute.subst(iop)
874        for type in types:
875            substDict = { "targs" : type,
876                          "class_name" : Name }
877            exec_output += NeonXExecDeclare.subst(substDict)
878
    # ABS
    # Element-wise absolute value: negative inputs are negated, everything
    # else is copied through.
    absCode = '''
            if (srcElem1 < 0) {
                destElem = -srcElem1;
            } else {
                destElem = srcElem1;
            }
    '''
    # Throughout this list the "...DX" class names are registered with a
    # count of 2 and the "...QX" names with 4 (presumably the helper's
    # rCount, as in the *InstX helpers above -- confirm against the helper
    # signatures, which are outside this chunk).
    twoEqualRegInstX("abs", "AbsDX", "SimdAluOp", signedTypes, 2, absCode)
    twoEqualRegInstX("abs", "AbsQX", "SimdAluOp", signedTypes, 4, absCode)
    # ADD
    # addCode is reused by the ADDP forms below.
    addCode = "destElem = srcElem1 + srcElem2;"
    threeEqualRegInstX("add", "AddDX", "SimdAddOp", unsignedTypes, 2, addCode)
    threeEqualRegInstX("add", "AddQX", "SimdAddOp", unsignedTypes, 4, addCode)
    # ADDHN, ADDHN2
    # The sum is formed in BigElement and shifted right by the bit width of
    # Element before being assigned to destElem.
    addhnCode = '''
            destElem = ((BigElement)srcElem1 + (BigElement)srcElem2) >>
                        (sizeof(Element) * 8);
    '''
    threeRegNarrowInstX("addhn", "AddhnX", "SimdAddOp", smallUnsignedTypes,
                        addhnCode)
    # The "2" form passes hi=True and carries its own mnemonic.
    threeRegNarrowInstX("addhn2", "Addhn2X", "SimdAddOp", smallUnsignedTypes,
                        addhnCode, hi=True)
    # ADDP (scalar)
    twoRegPairwiseScInstX("addp", "AddpScQX", "SimdAddOp", ("uint64_t",), 4,
                          addCode)
    # ADDP (vector)
    # Same addCode as ADD; pairwise=True selects the pairwise form of the
    # helper.
    threeEqualRegInstX("addp", "AddpDX", "SimdAddOp", smallUnsignedTypes, 2,
                       addCode, pairwise=True)
    threeEqualRegInstX("addp", "AddpQX", "SimdAddOp", unsignedTypes, 4,
                       addCode, pairwise=True)
    # ADDV
    # Note: SimdAddOp can be a bit optimistic here
    # Accumulating snippet: each source element is added into destElem.
    addAcrossCode = "destElem += srcElem1;"
    twoRegAcrossInstX("addv", "AddvDX", "SimdAddOp", ("uint8_t", "uint16_t"),
                      2, addAcrossCode)
    twoRegAcrossInstX("addv", "AddvQX", "SimdAddOp", smallUnsignedTypes, 4,
                      addAcrossCode)
    # AND
    # The bitwise operations are registered for uint64_t elements only.
    andCode = "destElem = srcElem1 & srcElem2;"
    threeEqualRegInstX("and", "AndDX", "SimdAluOp", ("uint64_t",), 2, andCode)
    threeEqualRegInstX("and", "AndQX", "SimdAluOp", ("uint64_t",), 4, andCode)
    # BIC (immediate)
    # The snippet updates destElem in place (&=).  The trailing True is a
    # positional flag of oneRegImmInstX whose name is not visible in this
    # chunk; presumably it makes the helper load the destination first --
    # confirm against the helper.
    bicImmCode = "destElem &= ~imm;"
    oneRegImmInstX("bic", "BicImmDX", "SimdAluOp", ("uint64_t",), 2,
                   bicImmCode, True)
    oneRegImmInstX("bic", "BicImmQX", "SimdAluOp", ("uint64_t",), 4,
                   bicImmCode, True)
    # BIC (register)
    bicCode = "destElem = srcElem1 & ~srcElem2;"
    threeEqualRegInstX("bic", "BicDX", "SimdAluOp", ("uint64_t",), 2, bicCode)
    threeEqualRegInstX("bic", "BicQX", "SimdAluOp", ("uint64_t",), 4, bicCode)
    # BIF
    # BIF, BIT and BSL all read the previous destElem value; each passes an
    # extra positional True to threeEqualRegInstX (presumably the flag that
    # makes the destination an input as well -- confirm against the helper).
    # BIF keeps destElem where srcElem2 bits are set, srcElem1 elsewhere.
    bifCode = "destElem = (destElem & srcElem2) | (srcElem1 & ~srcElem2);"
    threeEqualRegInstX("bif", "BifDX", "SimdAluOp", ("uint64_t",), 2, bifCode,
                       True)
    threeEqualRegInstX("bif", "BifQX", "SimdAluOp", ("uint64_t",), 4, bifCode,
                       True)
    # BIT
    # Takes srcElem1 where srcElem2 bits are set, keeps destElem elsewhere.
    bitCode = "destElem = (srcElem1 & srcElem2) | (destElem & ~srcElem2);"
    threeEqualRegInstX("bit", "BitDX", "SimdAluOp", ("uint64_t",), 2, bitCode,
                       True)
    threeEqualRegInstX("bit", "BitQX", "SimdAluOp", ("uint64_t",), 4, bitCode,
                       True)
    # BSL
    # The old destElem is the selector: srcElem1 where its bits are set,
    # srcElem2 elsewhere.
    bslCode = "destElem = (srcElem1 & destElem) | (srcElem2 & ~destElem);"
    threeEqualRegInstX("bsl", "BslDX", "SimdAluOp", ("uint64_t",), 2, bslCode,
                       True)
    threeEqualRegInstX("bsl", "BslQX", "SimdAluOp", ("uint64_t",), 4, bslCode,
                       True)
    # CLS
    # Counts how many bits directly below the top bit have the same value as
    # the top bit.  The element is shifted left once up front, then the loop
    # keeps shifting while the sign stays the same; the count is capped at
    # (element width - 1).
    clsCode = '''
            unsigned count = 0;
            if (srcElem1 < 0) {
                srcElem1 <<= 1;
                while (srcElem1 < 0 && count < sizeof(Element) * 8 - 1) {
                    count++;
                    srcElem1 <<= 1;
                }
            } else {
                srcElem1 <<= 1;
                while (srcElem1 >= 0 && count < sizeof(Element) * 8 - 1) {
                    count++;
                    srcElem1 <<= 1;
                }
            }
            destElem = count;
    '''
    twoEqualRegInstX("cls", "ClsDX", "SimdAluOp", smallSignedTypes, 2, clsCode)
    twoEqualRegInstX("cls", "ClsQX", "SimdAluOp", smallSignedTypes, 4, clsCode)
    # CLZ
    # Counts leading zeros by shifting left while the value is non-negative,
    # up to the full element width.  The ">= 0" test stands in for "top bit
    # clear", which is why the signed type list is used here.
    clzCode = '''
            unsigned count = 0;
            while (srcElem1 >= 0 && count < sizeof(Element) * 8) {
                count++;
                srcElem1 <<= 1;
            }
            destElem = count;
    '''
    twoEqualRegInstX("clz", "ClzDX", "SimdAluOp", smallSignedTypes, 2, clzCode)
    twoEqualRegInstX("clz", "ClzQX", "SimdAluOp", smallSignedTypes, 4, clzCode)
    # All integer compares below produce (Element)(-1) when the condition
    # holds and 0 otherwise.
    # CMEQ (register)
    cmeqCode = "destElem = (srcElem1 == srcElem2) ? (Element)(-1) : 0;"
    threeEqualRegInstX("cmeq", "CmeqDX", "SimdCmpOp", unsignedTypes, 2,
                       cmeqCode)
    threeEqualRegInstX("cmeq", "CmeqQX", "SimdCmpOp", unsignedTypes, 4,
                       cmeqCode)
    # CMEQ (zero)
    cmeqZeroCode = "destElem = (srcElem1 == 0) ? (Element)(-1) : 0;"
    twoEqualRegInstX("cmeq", "CmeqZeroDX", "SimdCmpOp", signedTypes, 2,
                     cmeqZeroCode)
    twoEqualRegInstX("cmeq", "CmeqZeroQX", "SimdCmpOp", signedTypes, 4,
                     cmeqZeroCode)
    # CMGE (register)
    # cmgeCode is reused by CMHS below with the unsigned type list.
    cmgeCode = "destElem = (srcElem1 >= srcElem2) ? (Element)(-1) : 0;"
    threeEqualRegInstX("cmge", "CmgeDX", "SimdCmpOp", signedTypes, 2, cmgeCode)
    threeEqualRegInstX("cmge", "CmgeQX", "SimdCmpOp", signedTypes, 4, cmgeCode)
    # CMGE (zero)
    cmgeZeroCode = "destElem = (srcElem1 >= 0) ? (Element)(-1) : 0;"
    twoEqualRegInstX("cmge", "CmgeZeroDX", "SimdCmpOp", signedTypes, 2,
                     cmgeZeroCode)
    twoEqualRegInstX("cmge", "CmgeZeroQX", "SimdCmpOp", signedTypes, 4,
                     cmgeZeroCode)
    # CMGT (register)
    # cmgtCode is reused by CMHI below with the unsigned type list.
    cmgtCode = "destElem = (srcElem1 > srcElem2) ? (Element)(-1) : 0;"
    threeEqualRegInstX("cmgt", "CmgtDX", "SimdCmpOp", signedTypes, 2, cmgtCode)
    threeEqualRegInstX("cmgt", "CmgtQX", "SimdCmpOp", signedTypes, 4, cmgtCode)
    # CMGT (zero)
    cmgtZeroCode = "destElem = (srcElem1 > 0) ? (Element)(-1) : 0;"
    twoEqualRegInstX("cmgt", "CmgtZeroDX", "SimdCmpOp", signedTypes, 2,
                     cmgtZeroCode)
    twoEqualRegInstX("cmgt", "CmgtZeroQX", "SimdCmpOp", signedTypes, 4,
                     cmgtZeroCode)
    # CMHI (register)
    # Same ">" snippet as CMGT, instantiated for unsigned element types.
    threeEqualRegInstX("cmhi", "CmhiDX", "SimdCmpOp", unsignedTypes, 2,
                       cmgtCode)
    threeEqualRegInstX("cmhi", "CmhiQX", "SimdCmpOp", unsignedTypes, 4,
                       cmgtCode)
    # CMHS (register)
    # Same ">=" snippet as CMGE, instantiated for unsigned element types.
    threeEqualRegInstX("cmhs", "CmhsDX", "SimdCmpOp", unsignedTypes, 2,
                       cmgeCode)
    threeEqualRegInstX("cmhs", "CmhsQX", "SimdCmpOp", unsignedTypes, 4,
                       cmgeCode)
    # CMLE (zero)
    cmleZeroCode = "destElem = (srcElem1 <= 0) ? (Element)(-1) : 0;"
    twoEqualRegInstX("cmle", "CmleZeroDX", "SimdCmpOp", signedTypes, 2,
                     cmleZeroCode)
    twoEqualRegInstX("cmle", "CmleZeroQX", "SimdCmpOp", signedTypes, 4,
                     cmleZeroCode)
    # CMLT (zero)
    cmltZeroCode = "destElem = (srcElem1 < 0) ? (Element)(-1) : 0;"
    twoEqualRegInstX("cmlt", "CmltZeroDX", "SimdCmpOp", signedTypes, 2,
                     cmltZeroCode)
    twoEqualRegInstX("cmlt", "CmltZeroQX", "SimdCmpOp", signedTypes, 4,
                     cmltZeroCode)
    # CMTST (register)
    # True when the two operands share at least one set bit.  Registered
    # with the SimdAluOp class rather than SimdCmpOp.
    tstCode = "destElem = (srcElem1 & srcElem2) ? (Element)(-1) : 0;"
    threeEqualRegInstX("cmtst", "CmtstDX", "SimdAluOp", unsignedTypes, 2,
                       tstCode)
    threeEqualRegInstX("cmtst", "CmtstQX", "SimdAluOp", unsignedTypes, 4,
                       tstCode)
    # CNT
    # Population count: adds the low bit to count and shifts right until the
    # element is zero.  Only registered for uint8_t elements.
    cntCode = '''
            unsigned count = 0;
            while (srcElem1 && count < sizeof(Element) * 8) {
                count += srcElem1 & 0x1;
                srcElem1 >>= 1;
            }
            destElem = count;
    '''
    twoEqualRegInstX("cnt", "CntDX", "SimdAluOp", ("uint8_t",), 2, cntCode)
    twoEqualRegInstX("cnt", "CntQX", "SimdAluOp", ("uint8_t",), 4, cntCode)
    # DUP (element)
    # The snippet is a plain copy; the isDup/byElem (and scalar) keyword
    # flags select the helper's behaviour, which is defined outside this
    # chunk.
    dupCode = "destElem = srcElem1;"
    twoEqualRegInstX("dup", "DupElemDX", "SimdMiscOp", smallUnsignedTypes, 2,
                     dupCode, isDup=True, byElem=True)
    twoEqualRegInstX("dup", "DupElemQX", "SimdMiscOp", unsignedTypes, 4,
                     dupCode, isDup=True, byElem=True)
    twoEqualRegInstX("dup", "DupElemScX", "SimdMiscOp", unsignedTypes, 4,
                     dupCode, isDup=True, byElem=True, scalar=True)
    # DUP (general register)
    # The last argument ('W' or 'X') is the GPR prefix handed to
    # dupGprInstX; the 'X' form is only registered for uint64_t elements.
    dupGprInstX("dup", "DupGprWDX", "SimdMiscOp", smallUnsignedTypes, 2, 'W')
    dupGprInstX("dup", "DupGprWQX", "SimdMiscOp", smallUnsignedTypes, 4, 'W')
    dupGprInstX("dup", "DupGprXQX", "SimdMiscOp", ("uint64_t",), 4, 'X')
    # EOR
    eorCode = "destElem = srcElem1 ^ srcElem2;"
    threeEqualRegInstX("eor", "EorDX", "SimdAluOp", ("uint64_t",), 2, eorCode)
    threeEqualRegInstX("eor", "EorQX", "SimdAluOp", ("uint64_t",), 4, eorCode)
1067    # EXT
1068    extCode = '''
1069            for (unsigned i = 0; i < eCount; i++) {
1070                unsigned index = i + imm;
1071                if (index < eCount) {
1072                    destReg.elements[i] = srcReg1.elements[index];
1073                } else {
1074                    index -= eCount;
1075                    if (index >= eCount) {
1076                        fault = std::make_shared<UndefinedInstruction>(
1077                                      machInst, false, mnemonic);
1078                    } else {
1079                        destReg.elements[i] = srcReg2.elements[index];
1080                    }
1081                }
1082            }
1083    '''
1084    extInstX("Ext", "ExtDX", "SimdMiscOp", ("uint8_t",), 2, extCode)
1085    extInstX("Ext", "ExtQX", "SimdMiscOp", ("uint8_t",), 4, extCode)
    # FABD
    # fpOp is the wrapper shared by the floating-point snippets that follow:
    # it copies FpscrExc into a local FPSCR, assigns destElem from the
    # expression substituted for %s, and writes fpscr back to FpscrExc.
    fpOp = '''
            FPSCR fpscr = (FPSCR) FpscrExc;
            destElem = %s;
            FpscrExc = fpscr;
    '''
    # Absolute difference: fplibAbs applied to the result of fplibSub.
    fabdCode = fpOp % "fplibAbs<Element>(fplibSub(srcElem1, srcElem2, fpscr))"
    threeEqualRegInstX("fabd", "FabdDX", "SimdFloatAddOp", smallFloatTypes, 2,
                       fabdCode)
    threeEqualRegInstX("fabd", "FabdQX", "SimdFloatAddOp", floatTypes, 4,
                       fabdCode)
    # Scalar form: same snippet, registered with scalar=True.
    threeEqualRegInstX("fabd", "FabdScX", "SimdFloatAddOp", floatTypes, 4,
                       fabdCode, scalar=True)
1099    # FABS
1100    fabsCode = fpOp % "fplibAbs<Element>(srcElem1)"
1101    twoEqualRegInstX("Abs", "FabsDX", "SimdFloatAluOp", smallFloatTypes, 2,
1102                     fabsCode)
1103    twoEqualRegInstX("Abs", "FabsQX", "SimdFloatAluOp", floatTypes, 4,
1104                     fabsCode)
    # FACGE
    # Compare of absolute values.  The "%s" inside the substituted text
    # survives the first "%" and is filled in afterwards with the comparison
    # suffix (GE or GT).  The result is -1 when the compare holds, else 0.
    fpCmpAbsOp = fpOp % ("fplibCompare%s<Element>(fplibAbs<Element>(srcElem1),"
                         " fplibAbs<Element>(srcElem2), fpscr) ? -1 : 0")
    facgeCode = fpCmpAbsOp % "GE"
    threeEqualRegInstX("facge", "FacgeDX", "SimdFloatCmpOp", smallFloatTypes,
                       2, facgeCode)
    threeEqualRegInstX("facge", "FacgeQX", "SimdFloatCmpOp", floatTypes, 4,
                       facgeCode)
    threeEqualRegInstX("facge", "FacgeScX", "SimdFloatCmpOp", floatTypes, 4,
                       facgeCode, scalar=True)
    # FACGT
    facgtCode = fpCmpAbsOp % "GT"
    threeEqualRegInstX("facgt", "FacgtDX", "SimdFloatCmpOp", smallFloatTypes,
                       2, facgtCode)
    threeEqualRegInstX("facgt", "FacgtQX", "SimdFloatCmpOp", floatTypes, 4,
                       facgtCode)
    threeEqualRegInstX("facgt", "FacgtScX", "SimdFloatCmpOp", floatTypes, 4,
                       facgtCode, scalar=True)
    # FADD
    # fpBinOp is the two-operand fplib call template; its "%s" is the fplib
    # function suffix (Add here; Div, Max, Min, ... further down).
    fpBinOp = fpOp % "fplib%s<Element>(srcElem1, srcElem2, fpscr)"
    faddCode = fpBinOp % "Add"
    threeEqualRegInstX("fadd", "FaddDX", "SimdFloatAddOp", smallFloatTypes, 2,
                       faddCode)
    threeEqualRegInstX("fadd", "FaddQX", "SimdFloatAddOp", floatTypes, 4,
                       faddCode)
    # FADDP (scalar)
    # The scalar pairwise forms are registered with a single unsigned
    # integer type name (uint32_t for the D form, uint64_t for the Q form).
    twoRegPairwiseScInstX("faddp", "FaddpScDX", "SimdFloatAddOp",
                          ("uint32_t",), 2, faddCode)
    twoRegPairwiseScInstX("faddp", "FaddpScQX", "SimdFloatAddOp",
                          ("uint64_t",), 4, faddCode)
    # FADDP (vector)
    threeEqualRegInstX("faddp", "FaddpDX", "SimdFloatAddOp", smallFloatTypes,
                       2, faddCode, pairwise=True)
    threeEqualRegInstX("faddp", "FaddpQX", "SimdFloatAddOp", floatTypes, 4,
                       faddCode, pairwise=True)
    # FCMEQ (register)
    # Register-register compare template; the remaining "%s" is the
    # fplibCompare suffix (EQ, GE or GT).  Result is -1 when true, else 0.
    fpCmpOp = fpOp % ("fplibCompare%s<Element>(srcElem1, srcElem2, fpscr) ?"
                      " -1 : 0")
    fcmeqCode = fpCmpOp % "EQ"
    threeEqualRegInstX("fcmeq", "FcmeqDX", "SimdFloatCmpOp", smallFloatTypes,
                       2, fcmeqCode)
    threeEqualRegInstX("fcmeq", "FcmeqQX", "SimdFloatCmpOp", floatTypes, 4,
                       fcmeqCode)
    threeEqualRegInstX("fcmeq", "FcmeqScX", "SimdFloatCmpOp", floatTypes, 4,
                       fcmeqCode, scalar=True)
    # FCMEQ (zero)
    # Compare-against-zero template: srcElem1 is the first operand and the
    # literal 0 the second.
    fpCmpZeroOp = fpOp % "fplibCompare%s<Element>(srcElem1, 0, fpscr) ? -1 : 0"
    fcmeqZeroCode = fpCmpZeroOp % "EQ"
    twoEqualRegInstX("fcmeq", "FcmeqZeroDX", "SimdFloatCmpOp", smallFloatTypes,
                     2, fcmeqZeroCode)
    twoEqualRegInstX("fcmeq", "FcmeqZeroQX", "SimdFloatCmpOp", floatTypes, 4,
                     fcmeqZeroCode)
    twoEqualRegInstX("fcmeq", "FcmeqZeroScX", "SimdFloatCmpOp", floatTypes, 4,
                     fcmeqZeroCode, scalar=True)
    # FCMGE (register)
    fcmgeCode = fpCmpOp % "GE"
    threeEqualRegInstX("fcmge", "FcmgeDX", "SimdFloatCmpOp", smallFloatTypes,
                       2, fcmgeCode)
    threeEqualRegInstX("fcmge", "FcmgeQX", "SimdFloatCmpOp", floatTypes, 4,
                       fcmgeCode)
    threeEqualRegInstX("fcmge", "FcmgeScX", "SimdFloatCmpOp", floatTypes, 4,
                       fcmgeCode, scalar=True)
    # FCMGE (zero)
    fcmgeZeroCode = fpCmpZeroOp % "GE"
    twoEqualRegInstX("fcmge", "FcmgeZeroDX", "SimdFloatCmpOp", smallFloatTypes,
                     2, fcmgeZeroCode)
    twoEqualRegInstX("fcmge", "FcmgeZeroQX", "SimdFloatCmpOp", floatTypes, 4,
                     fcmgeZeroCode)
    twoEqualRegInstX("fcmge", "FcmgeZeroScX", "SimdFloatCmpOp", floatTypes, 4,
                     fcmgeZeroCode, scalar=True)
    # FCMGT (register)
    fcmgtCode = fpCmpOp % "GT"
    threeEqualRegInstX("fcmgt", "FcmgtDX", "SimdFloatCmpOp", smallFloatTypes,
                       2, fcmgtCode)
    threeEqualRegInstX("fcmgt", "FcmgtQX", "SimdFloatCmpOp", floatTypes, 4,
                       fcmgtCode)
    threeEqualRegInstX("fcmgt", "FcmgtScX", "SimdFloatCmpOp", floatTypes, 4,
                       fcmgtCode, scalar=True)
    # FCMGT (zero)
    fcmgtZeroCode = fpCmpZeroOp % "GT"
    twoEqualRegInstX("fcmgt", "FcmgtZeroDX", "SimdFloatCmpOp", smallFloatTypes,
                     2, fcmgtZeroCode)
    twoEqualRegInstX("fcmgt", "FcmgtZeroQX", "SimdFloatCmpOp", floatTypes, 4,
                     fcmgtZeroCode)
    twoEqualRegInstX("fcmgt", "FcmgtZeroScX", "SimdFloatCmpOp", floatTypes, 4,
                     fcmgtZeroCode, scalar=True)
    # FCMLE (zero)
    # Reversed template: 0 is the first operand and srcElem1 the second, so
    # "less or equal" is expressed as fplibCompareGE(0, x) and "less than"
    # as fplibCompareGT(0, x).
    fpCmpRevZeroOp = fpOp % ("fplibCompare%s<Element>(0, srcElem1, fpscr) ?"
                             " -1 : 0")
    fcmleZeroCode = fpCmpRevZeroOp % "GE"
    twoEqualRegInstX("fcmle", "FcmleZeroDX", "SimdFloatCmpOp", smallFloatTypes,
                     2, fcmleZeroCode)
    twoEqualRegInstX("fcmle", "FcmleZeroQX", "SimdFloatCmpOp", floatTypes, 4,
                     fcmleZeroCode)
    twoEqualRegInstX("fcmle", "FcmleZeroScX", "SimdFloatCmpOp", floatTypes, 4,
                     fcmleZeroCode, scalar=True)
    # FCMLT (zero)
    fcmltZeroCode = fpCmpRevZeroOp % "GT"
    twoEqualRegInstX("fcmlt", "FcmltZeroDX", "SimdFloatCmpOp", smallFloatTypes,
                     2, fcmltZeroCode)
    twoEqualRegInstX("fcmlt", "FcmltZeroQX", "SimdFloatCmpOp", floatTypes, 4,
                     fcmltZeroCode)
    twoEqualRegInstX("fcmlt", "FcmltZeroScX", "SimdFloatCmpOp", floatTypes, 4,
                     fcmltZeroCode, scalar=True)
    # FCVTAS
    # fcvtCode is the shared float-to-fixed conversion template.  Its three
    # remaining "%s" slots are filled, in order, with: "0" for the integer
    # conversions or "imm" for the fixed-point ones; "false" for the signed
    # (...S) or "true" for the unsigned (...U) instructions; and the
    # FPRounding_* constant.  (Exact parameter meanings are those of
    # fplibFPToFixed, which is defined outside this file -- confirm there.)
    fcvtCode = fpOp % ("fplibFPToFixed<Element, Element>("
                       "srcElem1, %s, %s, %s, fpscr)")
    fcvtasCode = fcvtCode % ("0", "false", "FPRounding_TIEAWAY")
    twoEqualRegInstX("fcvtas", "FcvtasDX", "SimdCvtOp", smallFloatTypes, 2,
                     fcvtasCode)
    twoEqualRegInstX("fcvtas", "FcvtasQX", "SimdCvtOp", floatTypes, 4,
                     fcvtasCode)
    twoEqualRegInstX("fcvtas", "FcvtasScX", "SimdCvtOp", floatTypes, 4,
                     fcvtasCode, scalar=True)
    # FCVTAU
    fcvtauCode = fcvtCode % ("0", "true", "FPRounding_TIEAWAY")
    twoEqualRegInstX("fcvtau", "FcvtauDX", "SimdCvtOp", smallFloatTypes, 2,
                     fcvtauCode)
    twoEqualRegInstX("fcvtau", "FcvtauQX", "SimdCvtOp", floatTypes, 4,
                     fcvtauCode)
    twoEqualRegInstX("fcvtau", "FcvtauScX", "SimdCvtOp", floatTypes, 4,
                     fcvtauCode, scalar=True)
1227    # FCVTL, FCVTL2
1228    fcvtlCode = fpOp % ("fplibConvert<Element, BigElement>("
1229                        "srcElem1, FPCRRounding(fpscr), fpscr)")
1230    twoRegLongInstX("fcvtl", "FcvtlX", "SimdCvtOp", ("uint16_t", "uint32_t"),
1231                    fcvtlCode)
1232    twoRegLongInstX("fcvtl", "Fcvtl2X", "SimdCvtOp", ("uint16_t", "uint32_t"),
1233                    fcvtlCode, hi=True)
    # FCVTMS
    # Integer conversion ("0"), signed ("false"), FPRounding_NEGINF.
    fcvtmsCode = fcvtCode % ("0", "false", "FPRounding_NEGINF")
    twoEqualRegInstX("fcvtms", "FcvtmsDX", "SimdCvtOp", smallFloatTypes, 2,
                     fcvtmsCode)
    twoEqualRegInstX("fcvtms", "FcvtmsQX", "SimdCvtOp", floatTypes, 4,
                     fcvtmsCode)
    twoEqualRegInstX("fcvtms", "FcvtmsScX", "SimdCvtOp", floatTypes, 4,
                     fcvtmsCode, scalar=True)
    # FCVTMU
    # Same as FCVTMS but with the unsigned flag set ("true").
    fcvtmuCode = fcvtCode % ("0", "true", "FPRounding_NEGINF")
    twoEqualRegInstX("fcvtmu", "FcvtmuDX", "SimdCvtOp", smallFloatTypes, 2,
                     fcvtmuCode)
    twoEqualRegInstX("fcvtmu", "FcvtmuQX", "SimdCvtOp", floatTypes, 4,
                     fcvtmuCode)
    twoEqualRegInstX("fcvtmu", "FcvtmuScX", "SimdCvtOp", floatTypes, 4,
                     fcvtmuCode, scalar=True)
1250    # FCVTN, FCVTN2
1251    fcvtnCode = fpOp % ("fplibConvert<BigElement, Element>("
1252                        "srcElem1, FPCRRounding(fpscr), fpscr)")
1253    twoRegNarrowInstX("fcvtn", "FcvtnX", "SimdCvtOp",
1254                      ("uint16_t", "uint32_t"), fcvtnCode)
1255    twoRegNarrowInstX("fcvtn", "Fcvtn2X", "SimdCvtOp",
1256                      ("uint16_t", "uint32_t"), fcvtnCode, hi=True)
    # FCVTNS
    # Integer conversion ("0"), signed ("false"), FPRounding_TIEEVEN.
    fcvtnsCode = fcvtCode % ("0", "false", "FPRounding_TIEEVEN")
    twoEqualRegInstX("fcvtns", "FcvtnsDX", "SimdCvtOp", smallFloatTypes, 2,
                     fcvtnsCode)
    twoEqualRegInstX("fcvtns", "FcvtnsQX", "SimdCvtOp", floatTypes, 4,
                     fcvtnsCode)
    twoEqualRegInstX("fcvtns", "FcvtnsScX", "SimdCvtOp", floatTypes, 4,
                     fcvtnsCode, scalar=True)
    # FCVTNU
    # Same as FCVTNS but with the unsigned flag set ("true").
    fcvtnuCode = fcvtCode % ("0", "true", "FPRounding_TIEEVEN")
    twoEqualRegInstX("fcvtnu", "FcvtnuDX", "SimdCvtOp", smallFloatTypes, 2,
                     fcvtnuCode)
    twoEqualRegInstX("fcvtnu", "FcvtnuQX", "SimdCvtOp", floatTypes, 4,
                     fcvtnuCode)
    twoEqualRegInstX("fcvtnu", "FcvtnuScX", "SimdCvtOp", floatTypes, 4,
                     fcvtnuCode, scalar=True)
    # FCVTPS
    # Integer conversion ("0"), signed ("false"), FPRounding_POSINF.
    fcvtpsCode = fcvtCode % ("0", "false", "FPRounding_POSINF")
    twoEqualRegInstX("fcvtps", "FcvtpsDX", "SimdCvtOp", smallFloatTypes, 2,
                     fcvtpsCode)
    twoEqualRegInstX("fcvtps", "FcvtpsQX", "SimdCvtOp", floatTypes, 4,
                     fcvtpsCode)
    twoEqualRegInstX("fcvtps", "FcvtpsScX", "SimdCvtOp", floatTypes, 4,
                     fcvtpsCode, scalar=True)
    # FCVTPU
    # Same as FCVTPS but with the unsigned flag set ("true").
    fcvtpuCode = fcvtCode % ("0", "true", "FPRounding_POSINF")
    twoEqualRegInstX("fcvtpu", "FcvtpuDX", "SimdCvtOp", smallFloatTypes, 2,
                     fcvtpuCode)
    twoEqualRegInstX("fcvtpu", "FcvtpuQX", "SimdCvtOp", floatTypes, 4,
                     fcvtpuCode)
    twoEqualRegInstX("fcvtpu", "FcvtpuScX", "SimdCvtOp", floatTypes, 4,
                     fcvtpuCode, scalar=True)
1289    # FCVTXN, FCVTXN2
1290    fcvtxnCode = fpOp % ("fplibConvert<BigElement, Element>("
1291                         "srcElem1, FPRounding_ODD, fpscr)")
1292    twoRegNarrowInstX("fcvtxn", "FcvtxnX", "SimdCvtOp", smallFloatTypes,
1293                      fcvtxnCode)
1294    twoRegNarrowInstX("fcvtxn", "Fcvtxn2X", "SimdCvtOp", smallFloatTypes,
1295                      fcvtxnCode, hi=True)
1296    twoRegNarrowInstX("fcvtxn", "FcvtxnScX", "SimdCvtOp", smallFloatTypes,
1297                      fcvtxnCode, scalar=True)
    # FCVTZS (fixed-point)
    # Fixed-point forms pass "imm" as the first fcvtCode argument and are
    # registered with hasImm=True; all FCVTZ* forms use FPRounding_ZERO.
    fcvtzsCode = fcvtCode % ("imm", "false", "FPRounding_ZERO")
    twoEqualRegInstX("fcvtzs", "FcvtzsFixedDX", "SimdCvtOp", smallFloatTypes,
                     2, fcvtzsCode, hasImm=True)
    twoEqualRegInstX("fcvtzs", "FcvtzsFixedQX", "SimdCvtOp", floatTypes, 4,
                     fcvtzsCode, hasImm=True)
    twoEqualRegInstX("fcvtzs", "FcvtzsFixedScX", "SimdCvtOp", floatTypes, 4,
                     fcvtzsCode, hasImm=True, scalar=True)
    # FCVTZS (integer)
    # Integer forms pass the literal "0" instead of imm.
    fcvtzsIntCode = fcvtCode % ("0", "false", "FPRounding_ZERO")
    twoEqualRegInstX("fcvtzs", "FcvtzsIntDX", "SimdCvtOp", smallFloatTypes,
                     2, fcvtzsIntCode)
    twoEqualRegInstX("fcvtzs", "FcvtzsIntQX", "SimdCvtOp", floatTypes, 4,
                     fcvtzsIntCode)
    twoEqualRegInstX("fcvtzs", "FcvtzsIntScX", "SimdCvtOp", floatTypes, 4,
                     fcvtzsIntCode, scalar=True)
    # FCVTZU (fixed-point)
    # Unsigned counterpart of FCVTZS (fixed-point): second argument "true".
    fcvtzuCode = fcvtCode % ("imm", "true", "FPRounding_ZERO")
    twoEqualRegInstX("fcvtzu", "FcvtzuFixedDX", "SimdCvtOp", smallFloatTypes,
                     2, fcvtzuCode, hasImm=True)
    twoEqualRegInstX("fcvtzu", "FcvtzuFixedQX", "SimdCvtOp", floatTypes, 4,
                     fcvtzuCode, hasImm=True)
    twoEqualRegInstX("fcvtzu", "FcvtzuFixedScX", "SimdCvtOp", floatTypes, 4,
                     fcvtzuCode, hasImm=True, scalar=True)
    # FCVTZU (integer)
    fcvtzuIntCode = fcvtCode % ("0", "true", "FPRounding_ZERO")
    twoEqualRegInstX("fcvtzu", "FcvtzuIntDX", "SimdCvtOp", smallFloatTypes, 2,
                     fcvtzuIntCode)
    twoEqualRegInstX("fcvtzu", "FcvtzuIntQX", "SimdCvtOp", floatTypes, 4,
                     fcvtzuIntCode)
    twoEqualRegInstX("fcvtzu", "FcvtzuIntScX", "SimdCvtOp", floatTypes, 4,
                     fcvtzuIntCode, scalar=True)
    # FDIV
    # fpBinOp expands to fplib<Suffix><Element>(srcElem1, srcElem2, fpscr).
    fdivCode = fpBinOp % "Div"
    threeEqualRegInstX("fdiv", "FdivDX", "SimdFloatDivOp", smallFloatTypes, 2,
                       fdivCode)
    threeEqualRegInstX("fdiv", "FdivQX", "SimdFloatDivOp", floatTypes, 4,
                       fdivCode)
    # FMAX
    # fmaxCode is reused by the FMAXP forms below.
    fmaxCode = fpBinOp % "Max"
    threeEqualRegInstX("fmax", "FmaxDX", "SimdFloatCmpOp", smallFloatTypes, 2,
                       fmaxCode)
    threeEqualRegInstX("fmax", "FmaxQX", "SimdFloatCmpOp", floatTypes, 4,
                       fmaxCode)
    # FMAXNM
    # fmaxnmCode is reused by the FMAXNMP forms below.
    fmaxnmCode = fpBinOp % "MaxNum"
    threeEqualRegInstX("fmaxnm", "FmaxnmDX", "SimdFloatCmpOp", smallFloatTypes,
                       2, fmaxnmCode)
    threeEqualRegInstX("fmaxnm", "FmaxnmQX", "SimdFloatCmpOp", floatTypes, 4,
                       fmaxnmCode)
    # FMAXNMP (scalar)
    twoRegPairwiseScInstX("fmaxnmp", "FmaxnmpScDX", "SimdFloatCmpOp",
                          ("uint32_t",), 2, fmaxnmCode)
    twoRegPairwiseScInstX("fmaxnmp", "FmaxnmpScQX", "SimdFloatCmpOp",
                          ("uint64_t",), 4, fmaxnmCode)
    # FMAXNMP (vector)
    threeEqualRegInstX("fmaxnmp", "FmaxnmpDX", "SimdFloatCmpOp",
                       smallFloatTypes, 2, fmaxnmCode, pairwise=True)
    threeEqualRegInstX("fmaxnmp", "FmaxnmpQX", "SimdFloatCmpOp", floatTypes, 4,
                       fmaxnmCode, pairwise=True)
    # FMAXNMV
    # Note: SimdFloatCmpOp can be a bit optimistic here
    # fpAcrossOp is the across-lanes template: destElem is combined with
    # each srcElem1 via fplib<Suffix>.  It is also used by the other
    # across-lanes min/max definitions.  Only a Q form with uint32_t is
    # registered.
    fpAcrossOp = fpOp % "fplib%s<Element>(destElem, srcElem1, fpscr)"
    fmaxnmAcrossCode = fpAcrossOp % "MaxNum"
    twoRegAcrossInstX("fmaxnmv", "FmaxnmvQX", "SimdFloatCmpOp", ("uint32_t",),
                      4, fmaxnmAcrossCode)
    # FMAXP (scalar)
    twoRegPairwiseScInstX("fmaxp", "FmaxpScDX", "SimdFloatCmpOp",
                          ("uint32_t",), 2, fmaxCode)
    twoRegPairwiseScInstX("fmaxp", "FmaxpScQX", "SimdFloatCmpOp",
                          ("uint64_t",), 4, fmaxCode)
    # FMAXP (vector)
    threeEqualRegInstX("fmaxp", "FmaxpDX", "SimdFloatCmpOp", smallFloatTypes,
                       2, fmaxCode, pairwise=True)
    threeEqualRegInstX("fmaxp", "FmaxpQX", "SimdFloatCmpOp", floatTypes, 4,
                       fmaxCode, pairwise=True)
    # FMAXV
    # Note: SimdFloatCmpOp can be a bit optimistic here
    fmaxAcrossCode = fpAcrossOp % "Max"
    twoRegAcrossInstX("fmaxv", "FmaxvQX", "SimdFloatCmpOp", ("uint32_t",), 4,
                      fmaxAcrossCode)
1378                      fmaxAcrossCode)
# FMIN: fpBinOp specialised with "Min"; fminCode is reused by FMINP.
fminCode = fpBinOp % "Min"
for clsName, elemTypes, sizeArg in (("FminDX", smallFloatTypes, 2),
                                    ("FminQX", floatTypes, 4)):
    threeEqualRegInstX("fmin", clsName, "SimdFloatCmpOp", elemTypes,
                       sizeArg, fminCode)
# FMINNM: fpBinOp specialised with "MinNum".
fminnmCode = fpBinOp % "MinNum"
for clsName, elemTypes, sizeArg in (("FminnmDX", smallFloatTypes, 2),
                                    ("FminnmQX", floatTypes, 4)):
    threeEqualRegInstX("fminnm", clsName, "SimdFloatCmpOp", elemTypes,
                       sizeArg, fminnmCode)
# FMINNMP (scalar): one class per element width.
for clsName, elemTypes, sizeArg in (("FminnmpScDX", ("uint32_t",), 2),
                                    ("FminnmpScQX", ("uint64_t",), 4)):
    twoRegPairwiseScInstX("fminnmp", clsName, "SimdFloatCmpOp", elemTypes,
                          sizeArg, fminnmCode)
# FMINNMP (vector): same element code, registered with pairwise=True.
for clsName, elemTypes, sizeArg in (("FminnmpDX", smallFloatTypes, 2),
                                    ("FminnmpQX", floatTypes, 4)):
    threeEqualRegInstX("fminnmp", clsName, "SimdFloatCmpOp", elemTypes,
                       sizeArg, fminnmCode, pairwise=True)
# FMINNMV
# Note: SimdFloatCmpOp can be a bit optimistic here
# Only a Q-register class over uint32_t elements is registered.
fminnmAcrossCode = fpAcrossOp % "MinNum"
twoRegAcrossInstX("fminnmv", "FminnmvQX", "SimdFloatCmpOp",
                  ("uint32_t",), 4, fminnmAcrossCode)
# FMINP (scalar): reuses fminCode, one class per element width.
for clsName, elemTypes, sizeArg in (("FminpScDX", ("uint32_t",), 2),
                                    ("FminpScQX", ("uint64_t",), 4)):
    twoRegPairwiseScInstX("fminp", clsName, "SimdFloatCmpOp", elemTypes,
                          sizeArg, fminCode)
# FMINP (vector): registered with pairwise=True.
for clsName, elemTypes, sizeArg in (("FminpDX", smallFloatTypes, 2),
                                    ("FminpQX", floatTypes, 4)):
    threeEqualRegInstX("fminp", clsName, "SimdFloatCmpOp", elemTypes,
                       sizeArg, fminCode, pairwise=True)
# FMINV
# Note: SimdFloatCmpOp can be a bit optimistic here
# Only a Q-register class over uint32_t elements is registered.
fminAcrossCode = fpAcrossOp % "Min"
twoRegAcrossInstX("fminv", "FminvQX", "SimdFloatCmpOp",
                  ("uint32_t",), 4, fminAcrossCode)
# FMLA (by element) and FMLA (vector)
# Every form shares fmlaCode and passes the same positional True after the
# code string (presumably the read-destination flag, since fplibMulAdd
# takes destElem as an input -- confirm against threeEqualRegInstX).
fmlaCode = fpOp % ("fplibMulAdd<Element>(destElem, "
                   "srcElem1, srcElem2, fpscr)")
for clsName, elemTypes, sizeArg, extraArgs in (
        # by element
        ("FmlaElemDX", smallFloatTypes, 2, {"byElem": True}),
        ("FmlaElemQX", floatTypes, 4, {"byElem": True}),
        ("FmlaElemScX", floatTypes, 4, {"byElem": True, "scalar": True}),
        # vector
        ("FmlaDX", smallFloatTypes, 2, {}),
        ("FmlaQX", floatTypes, 4, {})):
    threeEqualRegInstX("fmla", clsName, "SimdFloatMultAccOp", elemTypes,
                       sizeArg, fmlaCode, True, **extraArgs)
# FMLS (by element) and FMLS (vector)
# Same shape as FMLA, but srcElem1 is wrapped in fplibNeg before the
# fplibMulAdd call.
fmlsCode = fpOp % ("fplibMulAdd<Element>(destElem, fplibNeg<Element>(srcElem1),"
                   " srcElem2, fpscr)")
for clsName, elemTypes, sizeArg, extraArgs in (
        # by element
        ("FmlsElemDX", smallFloatTypes, 2, {"byElem": True}),
        ("FmlsElemQX", floatTypes, 4, {"byElem": True}),
        ("FmlsElemScX", floatTypes, 4, {"byElem": True, "scalar": True}),
        # vector
        ("FmlsDX", smallFloatTypes, 2, {}),
        ("FmlsQX", floatTypes, 4, {})):
    threeEqualRegInstX("fmls", clsName, "SimdFloatMultAccOp", elemTypes,
                       sizeArg, fmlsCode, True, **extraArgs)
# FMOV: each destination element is assigned the immediate.
fmovCode = "destElem = imm;"
for clsName, elemTypes, sizeArg in (("FmovDX", smallFloatTypes, 2),
                                    ("FmovQX", floatTypes, 4)):
    oneRegImmInstX("fmov", clsName, "SimdMiscOp", elemTypes, sizeArg,
                   fmovCode)
# FMUL (by element) and FMUL (vector): fpBinOp specialised with "Mul".
fmulCode = fpBinOp % "Mul"
for clsName, elemTypes, sizeArg, extraArgs in (
        # by element
        ("FmulElemDX", smallFloatTypes, 2, {"byElem": True}),
        ("FmulElemQX", floatTypes, 4, {"byElem": True}),
        ("FmulElemScX", floatTypes, 4, {"byElem": True, "scalar": True}),
        # vector
        ("FmulDX", smallFloatTypes, 2, {}),
        ("FmulQX", floatTypes, 4, {})):
    threeEqualRegInstX("fmul", clsName, "SimdFloatMultOp", elemTypes,
                       sizeArg, fmulCode, **extraArgs)
# FMULX and FMULX (by element): fpBinOp specialised with "MulX".
fmulxCode = fpBinOp % "MulX"
for clsName, elemTypes, sizeArg, extraArgs in (
        # vector and scalar
        ("FmulxDX", smallFloatTypes, 2, {}),
        ("FmulxQX", floatTypes, 4, {}),
        ("FmulxScX", floatTypes, 4, {"scalar": True}),
        # by element
        ("FmulxElemDX", smallFloatTypes, 2, {"byElem": True}),
        ("FmulxElemQX", floatTypes, 4, {"byElem": True}),
        ("FmulxElemScX", floatTypes, 4, {"byElem": True, "scalar": True})):
    threeEqualRegInstX("fmulx", clsName, "SimdFloatMultOp", elemTypes,
                       sizeArg, fmulxCode, **extraArgs)
# FNEG
# Each destination element is fplibNeg of the source element.
# The name argument is "fneg": every other entry in this table passes the
# lowercase A64 mnemonic as its first argument, and the previous "Neg"
# did not follow that convention.
fnegCode = fpOp % "fplibNeg<Element>(srcElem1)"
twoEqualRegInstX("fneg", "FnegDX", "SimdFloatAluOp", smallFloatTypes, 2,
                 fnegCode)
twoEqualRegInstX("fneg", "FnegQX", "SimdFloatAluOp", floatTypes, 4,
                 fnegCode)
# FRECPE: D, Q and scalar forms all call fplibRecipEstimate.
frecpeCode = fpOp % "fplibRecipEstimate<Element>(srcElem1, fpscr)"
for clsName, elemTypes, sizeArg, extraArgs in (
        ("FrecpeDX", smallFloatTypes, 2, {}),
        ("FrecpeQX", floatTypes, 4, {}),
        ("FrecpeScX", floatTypes, 4, {"scalar": True})):
    twoEqualRegInstX("frecpe", clsName, "SimdFloatMultAccOp", elemTypes,
                     sizeArg, frecpeCode, **extraArgs)
# FRECPS: fpBinOp specialised with "RecipStepFused".
frecpsCode = fpBinOp % "RecipStepFused"
for clsName, elemTypes, sizeArg, extraArgs in (
        ("FrecpsDX", smallFloatTypes, 2, {}),
        ("FrecpsQX", floatTypes, 4, {}),
        ("FrecpsScX", floatTypes, 4, {"scalar": True})):
    threeEqualRegInstX("frecps", clsName, "SimdFloatMultAccOp", elemTypes,
                       sizeArg, frecpsCode, **extraArgs)
# FRECPX: only a scalar class is registered.
frecpxCode = fpOp % "fplibRecpX<Element>(srcElem1, fpscr)"
twoEqualRegInstX("frecpx", "FrecpxX", "SimdFloatMultAccOp",
                 floatTypes, 4, frecpxCode, scalar=True)
# FRINTA, FRINTI, FRINTM, FRINTN, FRINTP, FRINTX, FRINTZ
# All variants call fplibRoundInt and differ only in the two arguments
# substituted into frintCode: the rounding-mode expression and a
# "true"/"false" literal (only FRINTX passes "true").
frintCode = fpOp % "fplibRoundInt<Element>(srcElem1, %s, %s, fpscr)"
frintaCode = frintCode % ("FPRounding_TIEAWAY", "false")
frintiCode = frintCode % ("FPCRRounding(fpscr)", "false")
frintmCode = frintCode % ("FPRounding_NEGINF", "false")
frintnCode = frintCode % ("FPRounding_TIEEVEN", "false")
frintpCode = frintCode % ("FPRounding_POSINF", "false")
frintxCode = frintCode % ("FPCRRounding(fpscr)", "true")
frintzCode = frintCode % ("FPRounding_ZERO", "false")
# Registration order is unchanged: per variant, the D form then the Q form.
for mnem, dClsName, qClsName, roundCode in (
        ("frinta", "FrintaDX", "FrintaQX", frintaCode),
        ("frinti", "FrintiDX", "FrintiQX", frintiCode),
        ("frintm", "FrintmDX", "FrintmQX", frintmCode),
        ("frintn", "FrintnDX", "FrintnQX", frintnCode),
        ("frintp", "FrintpDX", "FrintpQX", frintpCode),
        ("frintx", "FrintxDX", "FrintxQX", frintxCode),
        ("frintz", "FrintzDX", "FrintzQX", frintzCode)):
    twoEqualRegInstX(mnem, dClsName, "SimdCvtOp", smallFloatTypes, 2,
                     roundCode)
    twoEqualRegInstX(mnem, qClsName, "SimdCvtOp", floatTypes, 4,
                     roundCode)
# FRSQRTE: D, Q and scalar forms all call fplibRSqrtEstimate.
frsqrteCode = fpOp % "fplibRSqrtEstimate<Element>(srcElem1, fpscr)"
for clsName, elemTypes, sizeArg, extraArgs in (
        ("FrsqrteDX", smallFloatTypes, 2, {}),
        ("FrsqrteQX", floatTypes, 4, {}),
        ("FrsqrteScX", floatTypes, 4, {"scalar": True})):
    twoEqualRegInstX("frsqrte", clsName, "SimdFloatSqrtOp", elemTypes,
                     sizeArg, frsqrteCode, **extraArgs)
# FRSQRTS: fpBinOp specialised with "RSqrtStepFused".
frsqrtsCode = fpBinOp % "RSqrtStepFused"
for clsName, elemTypes, sizeArg, extraArgs in (
        ("FrsqrtsDX", smallFloatTypes, 2, {}),
        ("FrsqrtsQX", floatTypes, 4, {}),
        ("FrsqrtsScX", floatTypes, 4, {"scalar": True})):
    threeEqualRegInstX("frsqrts", clsName, "SimdFloatMiscOp", elemTypes,
                       sizeArg, frsqrtsCode, **extraArgs)
# FSQRT: each destination element is fplibSqrt of the source element.
fsqrtCode = fpOp % "fplibSqrt<Element>(srcElem1, fpscr)"
for clsName, elemTypes, sizeArg in (("FsqrtDX", smallFloatTypes, 2),
                                    ("FsqrtQX", floatTypes, 4)):
    twoEqualRegInstX("fsqrt", clsName, "SimdFloatSqrtOp", elemTypes,
                     sizeArg, fsqrtCode)
# FSUB: fpBinOp specialised with "Sub".
fsubCode = fpBinOp % "Sub"
for clsName, elemTypes, sizeArg in (("FsubDX", smallFloatTypes, 2),
                                    ("FsubQX", floatTypes, 4)):
    threeEqualRegInstX("fsub", clsName, "SimdFloatAddOp", elemTypes,
                       sizeArg, fsubCode)
# INS (element)
insFromVecElemInstX("ins", "InsElemX", "SimdMiscOp",
                    unsignedTypes, 4)
# INS (general register): one class per register-width letter; the last
# argument is that letter ('W' or 'X').
for clsName, elemTypes, gprLetter in (("InsGprWX", smallUnsignedTypes, "W"),
                                      ("InsGprXX", unsignedTypes, "X")):
    insFromGprInstX("ins", clsName, "SimdMiscOp", elemTypes, 4, gprLetter)
# MLA (by element) and MLA (vector)
# mlaCode accumulates into destElem; every form passes the same positional
# True after the code string.
mlaCode = "destElem += srcElem1 * srcElem2;"
for clsName, elemTypes, sizeArg, extraArgs in (
        # by element: 16- and 32-bit elements only
        ("MlaElemDX", ("uint16_t", "uint32_t"), 2, {"byElem": True}),
        ("MlaElemQX", ("uint16_t", "uint32_t"), 4, {"byElem": True}),
        # vector
        ("MlaDX", smallUnsignedTypes, 2, {}),
        ("MlaQX", smallUnsignedTypes, 4, {})):
    threeEqualRegInstX("mla", clsName, "SimdMultAccOp", elemTypes, sizeArg,
                       mlaCode, True, **extraArgs)
# MLS (by element) and MLS (vector)
# mlsCode subtracts the product from destElem; every form passes the same
# positional True after the code string.
mlsCode = "destElem -= srcElem1 * srcElem2;"
for clsName, elemTypes, sizeArg, extraArgs in (
        # by element: 16- and 32-bit elements only
        ("MlsElemDX", ("uint16_t", "uint32_t"), 2, {"byElem": True}),
        ("MlsElemQX", ("uint16_t", "uint32_t"), 4, {"byElem": True}),
        # vector
        ("MlsDX", smallUnsignedTypes, 2, {}),
        ("MlsQX", smallUnsignedTypes, 4, {})):
    threeEqualRegInstX("mls", clsName, "SimdMultAccOp", elemTypes, sizeArg,
                       mlsCode, True, **extraArgs)
# MOV (element) -> alias to INS (element)
# MOV (from general) -> alias to INS (general register)
# MOV (scalar) -> alias to DUP (element)
# MOV (to general) -> alias to UMOV
# MOV (vector) -> alias to ORR (register)
# MOVI: registered for uint64_t elements only; each element is assigned
# the immediate.
movImmCode = "destElem = imm;"
for clsName, sizeArg in (("MoviDX", 2), ("MoviQX", 4)):
    oneRegImmInstX("movi", clsName, "SimdMiscOp", ("uint64_t",), sizeArg,
                   movImmCode)
# MUL (by element) and MUL (vector)
mulCode = "destElem = srcElem1 * srcElem2;"
for clsName, elemTypes, sizeArg, extraArgs in (
        # by element: 16- and 32-bit elements only
        ("MulElemDX", ("uint16_t", "uint32_t"), 2, {"byElem": True}),
        ("MulElemQX", ("uint16_t", "uint32_t"), 4, {"byElem": True}),
        # vector
        ("MulDX", smallUnsignedTypes, 2, {}),
        ("MulQX", smallUnsignedTypes, 4, {})):
    threeEqualRegInstX("mul", clsName, "SimdMultOp", elemTypes, sizeArg,
                       mulCode, **extraArgs)
# MVN: bitwise complement of the source, registered for uint64_t elements.
mvnCode = "destElem = ~srcElem1;"
for clsName, sizeArg in (("MvnDX", 2), ("MvnQX", 4)):
    twoEqualRegInstX("mvn", clsName, "SimdAluOp", ("uint64_t",), sizeArg,
                     mvnCode)
# MVNI: bitwise complement of the immediate.
mvniCode = "destElem = ~imm;"
for clsName, sizeArg in (("MvniDX", 2), ("MvniQX", 4)):
    oneRegImmInstX("mvni", clsName, "SimdAluOp", ("uint64_t",), sizeArg,
                   mvniCode)
# NEG: integer negate; both forms are registered over signedTypes.
negCode = "destElem = -srcElem1;"
for clsName, sizeArg in (("NegDX", 2), ("NegQX", 4)):
    twoEqualRegInstX("neg", clsName, "SimdAluOp", signedTypes, sizeArg,
                     negCode)
# NOT -> alias to MVN
# ORN: OR of srcElem1 with the complement of srcElem2.
ornCode = "destElem = srcElem1 | ~srcElem2;"
for clsName, sizeArg in (("OrnDX", 2), ("OrnQX", 4)):
    threeEqualRegInstX("orn", clsName, "SimdAluOp", ("uint64_t",), sizeArg,
                       ornCode)
# ORR (immediate): ORs the immediate into destElem; the trailing positional
# True is passed through unchanged.
orrImmCode = "destElem |= imm;"
for clsName, sizeArg in (("OrrImmDX", 2), ("OrrImmQX", 4)):
    oneRegImmInstX("orr", clsName, "SimdAluOp", ("uint64_t",), sizeArg,
                   orrImmCode, True)
# ORR (register)
orrCode = "destElem = srcElem1 | srcElem2;"
for clsName, sizeArg in (("OrrDX", 2), ("OrrQX", 4)):
    threeEqualRegInstX("orr", clsName, "SimdAluOp", ("uint64_t",), sizeArg,
                       orrCode)
# PMUL: for each set bit j of srcElem2, XORs srcElem1 << j into destElem.
# Registered for uint8_t elements only.
pmulCode = '''
            destElem = 0;
            for (unsigned j = 0; j < sizeof(Element) * 8; j++) {
                if (bits(srcElem2, j))
                    destElem ^= srcElem1 << j;
            }
    '''
for clsName, sizeArg in (("PmulDX", 2), ("PmulQX", 4)):
    threeEqualRegInstX("pmul", clsName, "SimdMultOp", ("uint8_t",), sizeArg,
                       pmulCode)
# PMULL, PMULL2
# Note: 64-bit PMULL is not available (Crypto. Extension)
# Same XOR-of-shifts loop as PMUL, but srcElem1 is widened to BigElement
# before shifting so the full product is kept.
pmullCode = '''
            destElem = 0;
            for (unsigned j = 0; j < sizeof(Element) * 8; j++) {
                if (bits(srcElem2, j))
                    destElem ^= (BigElement)srcElem1 << j;
            }
    '''
threeRegLongInstX("pmull", "PmullX", "SimdMultOp", ("uint8_t",), pmullCode)
# The hi=True form is registered under "pmull2", matching how the other
# second-half forms in this table are named (raddhn2, sabal2, saddl2, ...).
threeRegLongInstX("pmull2", "Pmull2X", "SimdMultOp", ("uint8_t",),
                  pmullCode, hi=True)
# RADDHN, RADDHN2
# Adds the widened sources plus a rounding constant (1 shifted to the top
# bit position of Element), then shifts right by the width of Element.
raddhnCode = '''
            destElem = ((BigElement)srcElem1 + (BigElement)srcElem2 +
                        ((BigElement)1 << (sizeof(Element) * 8 - 1))) >>
                       (sizeof(Element) * 8);
    '''
for mnem, clsName, extraArgs in (("raddhn", "RaddhnX", {}),
                                 ("raddhn2", "Raddhn2X", {"hi": True})):
    threeRegNarrowInstX(mnem, clsName, "SimdAddOp", smallUnsignedTypes,
                        raddhnCode, **extraArgs)
# RBIT: builds destElem by moving bit i of the source to bit
# (width - 1 - i). Registered for uint8_t elements only.
rbitCode = '''
            destElem = 0;
            Element temp = srcElem1;
            for (int i = 0; i < 8 * sizeof(Element); i++) {
                destElem = destElem  | ((temp & 0x1) <<
                                        (8 * sizeof(Element) - 1 - i));
                temp >>= 1;
            }
    '''
for clsName, sizeArg in (("RbitDX", 2), ("RbitQX", 4)):
    twoEqualRegInstX("rbit", clsName, "SimdAluOp", ("uint8_t",), sizeArg,
                     rbitCode)
# REV16, REV32, REV64
# The three snippets differ only in the shift that sets the group size in
# bytes (1 << 1, 1 << 2, 1 << 3), so they are generated from one template.
# The snippet rewrites the index j as i XOR (groupSize - 1).
revCodeTemplate = '''
            destElem = srcElem1;
            unsigned groupSize = ((1 << %d) / sizeof(Element));
            unsigned reverseMask = (groupSize - 1);
            j = i ^ reverseMask;
    '''
# REV16
rev16Code = revCodeTemplate % 1
for clsName, sizeArg in (("Rev16DX", 2), ("Rev16QX", 4)):
    twoEqualRegInstX("rev16", clsName, "SimdAluOp", ("uint8_t",), sizeArg,
                     rev16Code)
# REV32
rev32Code = revCodeTemplate % 2
for clsName, sizeArg in (("Rev32DX", 2), ("Rev32QX", 4)):
    twoEqualRegInstX("rev32", clsName, "SimdAluOp",
                     ("uint8_t", "uint16_t"), sizeArg, rev32Code)
# REV64
rev64Code = revCodeTemplate % 3
for clsName, sizeArg in (("Rev64DX", 2), ("Rev64QX", 4)):
    twoEqualRegInstX("rev64", clsName, "SimdAluOp", smallUnsignedTypes,
                     sizeArg, rev64Code)
# RSHRN, RSHRN2
# Rounding right shift: bit (imm - 1) of the source is added back after
# the shift. A shift larger than the source width yields 0 and a zero
# shift copies the source.
rshrnCode = '''
            if (imm > sizeof(srcElem1) * 8) {
                destElem = 0;
            } else if (imm) {
                Element rBit = bits(srcElem1, imm - 1);
                destElem = ((srcElem1 >> (imm - 1)) >> 1) + rBit;
            } else {
                destElem = srcElem1;
            }
    '''
for mnem, clsName, extraArgs in (("rshrn", "RshrnX", {}),
                                 ("rshrn2", "Rshrn2X", {"hi": True})):
    twoRegNarrowInstX(mnem, clsName, "SimdShiftOp", smallUnsignedTypes,
                      rshrnCode, hasImm=True, **extraArgs)
# RSUBHN, RSUBHN2
# Subtracts the widened sources, adds a rounding constant (1 shifted to
# the top bit position of Element), then shifts right by the width of
# Element.
rsubhnCode = '''
            destElem = ((BigElement)srcElem1 - (BigElement)srcElem2 +
                        ((BigElement)1 << (sizeof(Element) * 8 - 1))) >>
                       (sizeof(Element) * 8);
    '''
for mnem, clsName, extraArgs in (("rsubhn", "RsubhnX", {}),
                                 ("rsubhn2", "Rsubhn2X", {"hi": True})):
    threeRegNarrowInstX(mnem, clsName, "SimdAddOp", smallTypes,
                        rsubhnCode, **extraArgs)
# SABA: adds the absolute difference of the sources to destElem; both
# forms pass the positional True after the code string.
abaCode = '''
            destElem += (srcElem1 > srcElem2) ? (srcElem1 - srcElem2) :
                                                (srcElem2 - srcElem1);
    '''
for clsName, sizeArg in (("SabaDX", 2), ("SabaQX", 4)):
    threeEqualRegInstX("saba", clsName, "SimdAddAccOp", smallSignedTypes,
                       sizeArg, abaCode, True)
# SABAL, SABAL2: as SABA, but the sources are widened to BigElement before
# subtracting.
abalCode = '''
            destElem += (srcElem1 > srcElem2) ?
                ((BigElement)srcElem1 - (BigElement)srcElem2) :
                ((BigElement)srcElem2 - (BigElement)srcElem1);
    '''
for mnem, clsName, extraArgs in (("sabal", "SabalX", {}),
                                 ("sabal2", "Sabal2X", {"hi": True})):
    threeRegLongInstX(mnem, clsName, "SimdAddAccOp", smallSignedTypes,
                      abalCode, True, **extraArgs)
# SABD: destElem is the absolute difference of the two sources.
abdCode = '''
            destElem = (srcElem1 > srcElem2) ? (srcElem1 - srcElem2) :
                                               (srcElem2 - srcElem1);
    '''
for clsName, sizeArg in (("SabdDX", 2), ("SabdQX", 4)):
    threeEqualRegInstX("sabd", clsName, "SimdAddOp", smallSignedTypes,
                       sizeArg, abdCode)
# SABDL, SABDL2: absolute difference of the widened sources.
# NOTE(review): abdlCode overwrites destElem, yet these forms use
# "SimdAddAccOp" and pass the same positional True as the accumulating
# SABAL forms -- confirm whether that is intended.
abdlCode = '''
            destElem = (srcElem1 > srcElem2) ?
                ((BigElement)srcElem1 - (BigElement)srcElem2) :
                ((BigElement)srcElem2 - (BigElement)srcElem1);
    '''
for mnem, clsName, extraArgs in (("sabdl", "SabdlX", {}),
                                 ("sabdl2", "Sabdl2X", {"hi": True})):
    threeRegLongInstX(mnem, clsName, "SimdAddAccOp", smallSignedTypes,
                      abdlCode, True, **extraArgs)
# SADALP: adds the widened sum of the two source elements to destElem;
# both forms pass the positional True after the code string.
adalpCode = "destElem += (BigElement)srcElem1 + (BigElement)srcElem2;"
for clsName, sizeArg in (("SadalpDX", 2), ("SadalpQX", 4)):
    twoRegCondenseInstX("sadalp", clsName, "SimdAddOp", smallSignedTypes,
                        sizeArg, adalpCode, True)
# SADDL, SADDL2: widened sum of the two sources; addlwCode is shared with
# SADDLP below.
addlwCode = "destElem = (BigElement)srcElem1 + (BigElement)srcElem2;"
for mnem, clsName, extraArgs in (("saddl", "SaddlX", {}),
                                 ("saddl2", "Saddl2X", {"hi": True})):
    threeRegLongInstX(mnem, clsName, "SimdAddAccOp", smallSignedTypes,
                      addlwCode, **extraArgs)
# SADDLP
for clsName, sizeArg in (("SaddlpDX", 2), ("SaddlpQX", 4)):
    twoRegCondenseInstX("saddlp", clsName, "SimdAddOp", smallSignedTypes,
                        sizeArg, addlwCode)
# SADDLV
# Note: SimdAddOp can be a bit optimistic here
# Accumulates each widened source element into destElem. The int32_t
# class is registered separately with doubleDest=True.
addAcrossLongCode = "destElem += (BigElement)srcElem1;"
for clsName, elemTypes, sizeArg, extraArgs in (
        ("SaddlvDX", ("int8_t", "int16_t"), 2, {}),
        ("SaddlvQX", ("int8_t", "int16_t"), 4, {}),
        ("SaddlvBQX", ("int32_t",), 4, {"doubleDest": True})):
    twoRegAcrossInstX("saddlv", clsName, "SimdAddOp", elemTypes, sizeArg,
                      addAcrossLongCode, long=True, **extraArgs)
# SADDW, SADDW2: reuses addlwCode (defined with SADDL above).
for mnem, clsName, extraArgs in (("saddw", "SaddwX", {}),
                                 ("saddw2", "Saddw2X", {"hi": True})):
    threeRegWideInstX(mnem, clsName, "SimdAddAccOp", smallSignedTypes,
                      addlwCode, **extraArgs)
# SCVTF (fixed-point)
# The snippet keeps a "%d" placeholder for the width of the signed integer
# cast applied to srcElem1; it is filled with 32 or 64 per class.
scvtfFixedCode = fpOp % ("fplibFixedToFP<Element>((int%d_t) srcElem1, imm, "
                         "false, FPCRRounding(fpscr), fpscr)")
for clsName, elemTypes, sizeArg, intBits, extraArgs in (
        ("ScvtfFixedDX", smallFloatTypes, 2, 32, {}),
        ("ScvtfFixedSQX", smallFloatTypes, 4, 32, {}),
        ("ScvtfFixedDQX", ("uint64_t",), 4, 64, {}),
        ("ScvtfFixedScSX", smallFloatTypes, 4, 32, {"scalar": True}),
        ("ScvtfFixedScDX", ("uint64_t",), 4, 64, {"scalar": True})):
    twoEqualRegInstX("scvtf", clsName, "SimdCvtOp", elemTypes, sizeArg,
                     scvtfFixedCode % intBits, hasImm=True, **extraArgs)
# SCVTF (integer)
# Same call as the fixed-point form, but with a literal 0 in place of imm
# and no hasImm flag.
scvtfIntCode = fpOp % ("fplibFixedToFP<Element>((int%d_t) srcElem1, 0, "
                       "false, FPCRRounding(fpscr), fpscr)")
for clsName, elemTypes, sizeArg, intBits, extraArgs in (
        ("ScvtfIntDX", smallFloatTypes, 2, 32, {}),
        ("ScvtfIntSQX", smallFloatTypes, 4, 32, {}),
        ("ScvtfIntDQX", ("uint64_t",), 4, 64, {}),
        ("ScvtfIntScSX", smallFloatTypes, 4, 32, {"scalar": True}),
        ("ScvtfIntScDX", ("uint64_t",), 4, 64, {"scalar": True})):
    twoEqualRegInstX("scvtf", clsName, "SimdCvtOp", elemTypes, sizeArg,
                     scvtfIntCode % intBits, **extraArgs)
# SHADD: halves each source (with bit 0 masked off) before adding, then
# adds back the carry produced by the two low bits.
haddCode = '''
            Element carryBit =
                (((unsigned)srcElem1 & 0x1) +
                 ((unsigned)srcElem2 & 0x1)) >> 1;
            // Use division instead of a shift to ensure the sign extension works
            // right. The compiler will figure out if it can be a shift. Mask the
            // inputs so they get truncated correctly.
            destElem = (((srcElem1 & ~(Element)1) / 2) +
                        ((srcElem2 & ~(Element)1) / 2)) + carryBit;
    '''
for clsName, sizeArg in (("ShaddDX", 2), ("ShaddQX", 4)):
    threeEqualRegInstX("shadd", clsName, "SimdAddOp", smallSignedTypes,
                       sizeArg, haddCode)
# SHL: left shift by the immediate. A shift of the full element width or
# more is performed as two shifts (width - 1, then 1) instead of one.
shlCode = '''
            if (imm >= sizeof(Element) * 8)
                destElem = (srcElem1 << (sizeof(Element) * 8 - 1)) << 1;
            else
                destElem = srcElem1 << imm;
    '''
for clsName, sizeArg in (("ShlDX", 2), ("ShlQX", 4)):
    twoEqualRegInstX("shl", clsName, "SimdShiftOp", unsignedTypes, sizeArg,
                     shlCode, hasImm=True)
# SHLL, SHLL2
# Widens the source to BigElement and shifts it left by the width of
# Element.
shllCode = "destElem = ((BigElement)srcElem1) << (sizeof(Element) * 8);"
twoRegLongInstX("shll", "ShllX", "SimdShiftOp", smallTypes, shllCode)
# The hi=True form is registered under "shll2", matching how the other
# second-half forms in this table are named (shrn2, rshrn2, saddl2, ...).
twoRegLongInstX("shll2", "Shll2X", "SimdShiftOp", smallTypes, shllCode,
                hi=True)
# SHRN, SHRN2: right shift by the immediate; a shift of the full source
# width or more yields 0.
shrnCode = '''
            if (imm >= sizeof(srcElem1) * 8) {
                destElem = 0;
            } else {
                destElem = srcElem1 >> imm;
            }
    '''
for mnem, clsName, extraArgs in (("shrn", "ShrnX", {}),
                                 ("shrn2", "Shrn2X", {"hi": True})):
    twoRegNarrowInstX(mnem, clsName, "SimdShiftOp", smallUnsignedTypes,
                      shrnCode, hasImm=True, **extraArgs)
# SHSUB: halves each source (with bit 0 masked off) before subtracting,
# then subtracts the borrow derived from the two low bits.
hsubCode = '''
            Element borrowBit =
                (((srcElem1 & 0x1) - (srcElem2 & 0x1)) >> 1) & 0x1;
            // Use division instead of a shift to ensure the sign extension works
            // right. The compiler will figure out if it can be a shift. Mask the
            // inputs so they get truncated correctly.
            destElem = (((srcElem1 & ~(Element)1) / 2) -
                        ((srcElem2 & ~(Element)1) / 2)) - borrowBit;
    '''
for clsName, sizeArg in (("ShsubDX", 2), ("ShsubQX", 4)):
    threeEqualRegInstX("shsub", clsName, "SimdAddOp", smallSignedTypes,
                       sizeArg, hsubCode)
# SLI: shifts the source left by imm and ORs it with destElem & mask(imm).
# A shift of the full element width or more leaves destElem unchanged.
# Both forms pass the positional True after the code string.
sliCode = '''
            if (imm >= sizeof(Element) * 8)
                destElem = destElem;
            else
                destElem = (srcElem1 << imm) | (destElem & mask(imm));
    '''
for clsName, sizeArg in (("SliDX", 2), ("SliQX", 4)):
    twoEqualRegInstX("sli", clsName, "SimdShiftOp", unsignedTypes, sizeArg,
                     sliCode, True, hasImm=True)
# SMAX: destElem is the larger of the two sources.
maxCode = "destElem = (srcElem1 > srcElem2) ? srcElem1 : srcElem2;"
for clsName, sizeArg in (("SmaxDX", 2), ("SmaxQX", 4)):
    threeEqualRegInstX("smax", clsName, "SimdCmpOp", smallSignedTypes,
                       sizeArg, maxCode)
# SMAXP: same element code, registered with pairwise=True.
for clsName, sizeArg in (("SmaxpDX", 2), ("SmaxpQX", 4)):
    threeEqualRegInstX("smaxp", clsName, "SimdCmpOp", smallSignedTypes,
                       sizeArg, maxCode, pairwise=True)
# SMAXV: destElem takes the first element (i == 0) and then any larger one.
# The D form is registered for int8_t and int16_t only.
maxAcrossCode = '''
            if (i == 0 || srcElem1 > destElem)
                destElem = srcElem1;
    '''
for clsName, elemTypes, sizeArg in (
        ("SmaxvDX", ("int8_t", "int16_t"), 2),
        ("SmaxvQX", smallSignedTypes, 4)):
    twoRegAcrossInstX("smaxv", clsName, "SimdCmpOp", elemTypes, sizeArg,
                      maxAcrossCode)
# SMIN: destElem is the smaller of the two sources.
minCode = "destElem = (srcElem1 < srcElem2) ? srcElem1 : srcElem2;"
for clsName, sizeArg in (("SminDX", 2), ("SminQX", 4)):
    threeEqualRegInstX("smin", clsName, "SimdCmpOp", smallSignedTypes,
                       sizeArg, minCode)
# SMINP: same element code, registered with pairwise=True.
for clsName, sizeArg in (("SminpDX", 2), ("SminpQX", 4)):
    threeEqualRegInstX("sminp", clsName, "SimdCmpOp", smallSignedTypes,
                       sizeArg, minCode, pairwise=True)
# SMINV: destElem takes the first element (i == 0) and then any smaller
# one. The D form is registered for int8_t and int16_t only.
minAcrossCode = '''
            if (i == 0 || srcElem1 < destElem)
                destElem = srcElem1;
    '''
for clsName, elemTypes, sizeArg in (
        ("SminvDX", ("int8_t", "int16_t"), 2),
        ("SminvQX", smallSignedTypes, 4)):
    twoRegAcrossInstX("sminv", clsName, "SimdCmpOp", elemTypes, sizeArg,
                      minAcrossCode)
1963
# NOTE(review): split() is defined outside this section; presumably it
# starts a new chunk of generated 'exec' output -- confirm. Its position
# between SMINV and SMLAL is kept.
split('exec')

# SMLAL, SMLAL2 (by element)
# mlalCode accumulates the widened product into destElem; every form
# passes the positional True after the code string. The hi=True forms are
# registered under "smlal2", matching how the other second-half forms in
# this table are named (sabal2, saddl2, saddw2, ...).
mlalCode = "destElem += (BigElement)srcElem1 * (BigElement)srcElem2;"
threeRegLongInstX("smlal", "SmlalElemX", "SimdMultAccOp",
                  ("int16_t", "int32_t"), mlalCode, True, byElem=True)
threeRegLongInstX("smlal2", "SmlalElem2X", "SimdMultAccOp",
                  ("int16_t", "int32_t"), mlalCode, True, byElem=True,
                  hi=True)
# SMLAL, SMLAL2 (vector)
threeRegLongInstX("smlal", "SmlalX", "SimdMultAccOp", smallSignedTypes,
                  mlalCode, True)
threeRegLongInstX("smlal2", "Smlal2X", "SimdMultAccOp", smallSignedTypes,
                  mlalCode, True, hi=True)
# SMLSL, SMLSL2 (by element)
# mlslCode subtracts the widened product from destElem; every form passes
# the positional True after the code string. The hi=True forms are
# registered under "smlsl2", matching how the other second-half forms in
# this table are named (sabal2, saddl2, saddw2, ...).
# NOTE(review): the by-element forms use smallSignedTypes whereas the
# SMLAL by-element forms use ("int16_t", "int32_t") -- confirm whether the
# int8_t instantiation is needed.
mlslCode = "destElem -= (BigElement)srcElem1 * (BigElement)srcElem2;"
threeRegLongInstX("smlsl", "SmlslElemX", "SimdMultAccOp", smallSignedTypes,
                  mlslCode, True, byElem=True)
threeRegLongInstX("smlsl2", "SmlslElem2X", "SimdMultAccOp",
                  smallSignedTypes, mlslCode, True, byElem=True, hi=True)
# SMLSL, SMLSL2 (vector)
threeRegLongInstX("smlsl", "SmlslX", "SimdMultAccOp", smallSignedTypes,
                  mlslCode, True)
threeRegLongInstX("smlsl2", "Smlsl2X", "SimdMultAccOp", smallSignedTypes,
                  mlslCode, True, hi=True)
1989    # SMOV
    # Two classes are registered: SmovWX for 8- and 16-bit elements with 'W',
    # and SmovXX for all of smallSignedTypes with 'X'.  'W'/'X' presumably
    # name the general-purpose destination width and the trailing True
    # presumably requests sign extension -- confirm in insToGprInstX.
1990    insToGprInstX("smov", "SmovWX", "SimdMiscOp", ("int8_t", "int16_t"), 4,
1991                  'W', True)
1992    insToGprInstX("smov", "SmovXX", "SimdMiscOp", smallSignedTypes, 4, 'X',
1993                  True)
1994    # SMULL, SMULL2 (by element)
    # Widening multiply: both sources are cast to BigElement and destElem is
    # overwritten with the product.  Unlike the accumulating SMLAL/SMLSL
    # calls, no positional True follows the snippet.
1995    mullCode = "destElem = (BigElement)srcElem1 * (BigElement)srcElem2;"
1996    threeRegLongInstX("smull", "SmullElemX", "SimdMultOp", smallSignedTypes,
1997                      mullCode, byElem=True)
1998    threeRegLongInstX("smull", "SmullElem2X", "SimdMultOp", smallSignedTypes,
1999                      mullCode, byElem=True, hi=True)
2000    # SMULL, SMULL2 (vector)
    # Same snippet without byElem; the "2" class adds hi=True.
2001    threeRegLongInstX("smull", "SmullX", "SimdMultOp", smallSignedTypes,
2002                      mullCode)
2003    threeRegLongInstX("smull", "Smull2X", "SimdMultOp", smallSignedTypes,
2004                      mullCode, hi=True)
2005    # SQABS
    # Saturating absolute value.  When srcElem1 has only its top bit set (the
    # most negative Element value) the snippet sets fpscr.qc and writes
    # ~srcElem1; otherwise it writes -srcElem1 for negative inputs and
    # srcElem1 unchanged for the rest.  FpscrQc is read into a local FPSCR at
    # the start and written back at the end.
2006    sqabsCode = '''
2007        FPSCR fpscr = (FPSCR) FpscrQc;
2008        if (srcElem1 == (Element)((Element)1 << (sizeof(Element) * 8 - 1))) {
2009            fpscr.qc = 1;
2010            destElem = ~srcElem1;
2011        } else if (srcElem1 < 0) {
2012            destElem = -srcElem1;
2013        } else {
2014            destElem = srcElem1;
2015        }
2016        FpscrQc = fpscr;
2017    '''
    # The DX class is given smallSignedTypes; the QX and scalar classes are
    # given signedTypes.
2018    twoEqualRegInstX("sqabs", "SqabsDX", "SimdAluOp", smallSignedTypes, 2,
2019                     sqabsCode)
2020    twoEqualRegInstX("sqabs", "SqabsQX", "SimdAluOp", signedTypes, 4,
2021                     sqabsCode)
2022    twoEqualRegInstX("sqabs", "SqabsScX", "SimdAluOp", signedTypes, 4,
2023                     sqabsCode, scalar=True)
2024    # SQADD
    # Saturating add.  The sum is computed first; overflow is detected when
    # both sources have the same sign and the sum's sign differs from it.
    # On overflow destElem is set to 1 << (bits - 1) (the most negative
    # value) and, when the wrapped sum was negative (i.e. both sources were
    # non-negative), decremented by one, which is intended to wrap to the
    # largest positive value.  fpscr.qc is set in either case.
2025    sqaddCode = '''
2026            destElem = srcElem1 + srcElem2;
2027            FPSCR fpscr = (FPSCR) FpscrQc;
2028            bool negDest = (destElem < 0);
2029            bool negSrc1 = (srcElem1 < 0);
2030            bool negSrc2 = (srcElem2 < 0);
2031            if ((negDest != negSrc1) && (negSrc1 == negSrc2)) {
2032                destElem = (Element)1 << (sizeof(Element) * 8 - 1);
2033                if (negDest)
2034                    destElem -= 1;
2035                fpscr.qc = 1;
2036            }
2037            FpscrQc = fpscr;
2038    '''
    # The DX class is given smallSignedTypes; the QX and scalar classes are
    # given signedTypes.
2039    threeEqualRegInstX("sqadd", "SqaddDX", "SimdAddOp", smallSignedTypes, 2,
2040                       sqaddCode)
2041    threeEqualRegInstX("sqadd", "SqaddQX", "SimdAddOp", signedTypes, 4,
2042                       sqaddCode)
2043    threeEqualRegInstX("sqadd", "SqaddScX", "SimdAddOp", signedTypes, 4,
2044                       sqaddCode, scalar=True)
2045    # SQDMLAL, SQDMLAL2 (by element)
2046    qdmlalCode = '''
2047        FPSCR fpscr = (FPSCR) FpscrQc;
2048        BigElement midElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
2049        Element maxNeg = (Element)1 << (sizeof(Element) * 8 - 1);
2050        Element halfNeg = maxNeg / 2;
2051        if ((srcElem1 == maxNeg && srcElem2 == maxNeg) ||
2052            (srcElem1 == halfNeg && srcElem2 == maxNeg) ||
2053            (srcElem1 == maxNeg && srcElem2 == halfNeg)) {
2054            midElem = ~((BigElement)maxNeg << (sizeof(Element) * 8));
2055            fpscr.qc = 1;
2056        }
2057        bool negPreDest = ltz(destElem);
2058        destElem += midElem;
2059        bool negDest = ltz(destElem);
2060        bool negMid = ltz(midElem);
2061        if (negPreDest == negMid && negMid != negDest) {
2062            destElem = mask(sizeof(BigElement) * 8 - 1);
2063            if (negPreDest)
2064                destElem = ~destElem;
2065            fpscr.qc = 1;
2066        }
2067        FpscrQc = fpscr;
2068    '''
2069    threeRegLongInstX("sqdmlal", "SqdmlalElemX", "SimdMultAccOp",
2070                      ("int16_t", "int32_t"), qdmlalCode, True, byElem=True)
2071    threeRegLongInstX("sqdmlal", "SqdmlalElem2X", "SimdMultAccOp",
2072                      ("int16_t", "int32_t"), qdmlalCode, True, byElem=True,
2073                      hi=True)
2074    threeRegLongInstX("sqdmlal", "SqdmlalElemScX", "SimdMultAccOp",
2075                      ("int16_t", "int32_t"), qdmlalCode, True, byElem=True,
2076                      scalar=True)
2077    # SQDMLAL, SQDMLAL2 (vector)
2078    threeRegLongInstX("sqdmlal", "SqdmlalX", "SimdMultAccOp",
2079                      ("int16_t", "int32_t"), qdmlalCode, True)
2080    threeRegLongInstX("sqdmlal", "Sqdmlal2X", "SimdMultAccOp",
2081                      ("int16_t", "int32_t"), qdmlalCode, True, hi=True)
2082    threeRegLongInstX("sqdmlal", "SqdmlalScX", "SimdMultAccOp",
2083                      ("int16_t", "int32_t"), qdmlalCode, True, scalar=True)
2084    # SQDMLSL, SQDMLSL2 (by element)
2085    qdmlslCode = '''
2086        FPSCR fpscr = (FPSCR) FpscrQc;
2087        BigElement midElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
2088        Element maxNeg = (Element)1 << (sizeof(Element) * 8 - 1);
2089        Element halfNeg = maxNeg / 2;
2090        if ((srcElem1 == maxNeg && srcElem2 == maxNeg) ||
2091            (srcElem1 == halfNeg && srcElem2 == maxNeg) ||
2092            (srcElem1 == maxNeg && srcElem2 == halfNeg)) {
2093            midElem = ~((BigElement)maxNeg << (sizeof(Element) * 8));
2094            fpscr.qc = 1;
2095        }
2096        bool negPreDest = ltz(destElem);
2097        destElem -= midElem;
2098        bool negDest = ltz(destElem);
2099        bool posMid = ltz((BigElement)-midElem);
2100        if (negPreDest == posMid && posMid != negDest) {
2101            destElem = mask(sizeof(BigElement) * 8 - 1);
2102            if (negPreDest)
2103                destElem = ~destElem;
2104            fpscr.qc = 1;
2105        }
2106        FpscrQc = fpscr;
2107    '''
2108    threeRegLongInstX("sqdmlsl", "SqdmlslElemX", "SimdMultAccOp",
2109                      ("int16_t", "int32_t"), qdmlslCode, True, byElem=True)
2110    threeRegLongInstX("sqdmlsl", "SqdmlslElem2X", "SimdMultAccOp",
2111                      ("int16_t", "int32_t"), qdmlslCode, True, byElem=True,
2112                      hi=True)
2113    threeRegLongInstX("sqdmlsl", "SqdmlslElemScX", "SimdMultAccOp",
2114                      ("int16_t", "int32_t"), qdmlslCode, True, byElem=True,
2115                      scalar=True)
2116    # SQDMLSL, SQDMLSL2 (vector)
2117    threeRegLongInstX("sqdmlsl", "SqdmlslX", "SimdMultAccOp",
2118                      ("int16_t", "int32_t"), qdmlslCode, True)
2119    threeRegLongInstX("sqdmlsl", "Sqdmlsl2X", "SimdMultAccOp",
2120                      ("int16_t", "int32_t"), qdmlslCode, True, hi=True)
2121    threeRegLongInstX("sqdmlsl", "SqdmlslScX", "SimdMultAccOp",
2122                      ("int16_t", "int32_t"), qdmlslCode, True, scalar=True)
2123    # SQDMULH (by element)
    # Saturating doubling multiply, high half.  destElem receives the doubled
    # 64-bit product shifted right by the Element width.  When both sources
    # equal the most negative Element value (only the top bit set), destElem
    # is replaced by ~srcElem1 and fpscr.qc is set.
2124    sqdmulhCode = '''
2125            FPSCR fpscr = (FPSCR) FpscrQc;
2126            destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2) >>
2127                       (sizeof(Element) * 8);
2128            if (srcElem1 == srcElem2 &&
2129                    srcElem1 == (Element)((Element)1 <<
2130                        (sizeof(Element) * 8 - 1))) {
2131                destElem = ~srcElem1;
2132                fpscr.qc = 1;
2133            }
2134            FpscrQc = fpscr;
2135    '''
    # All classes are registered for the 16- and 32-bit element types only.
2136    threeEqualRegInstX("sqdmulh", "SqdmulhElemDX", "SimdMultOp",
2137                       ("int16_t", "int32_t"), 2, sqdmulhCode, byElem=True)
2138    threeEqualRegInstX("sqdmulh", "SqdmulhElemQX", "SimdMultOp",
2139                       ("int16_t", "int32_t"), 4, sqdmulhCode, byElem=True)
2140    threeEqualRegInstX("sqdmulh", "SqdmulhElemScX", "SimdMultOp",
2141                       ("int16_t", "int32_t"), 4, sqdmulhCode, byElem=True,
2142                       scalar=True)
2143    # SQDMULH (vector)
    # Same snippet without byElem.
2144    threeEqualRegInstX("sqdmulh", "SqdmulhDX", "SimdMultOp",
2145                       ("int16_t", "int32_t"), 2, sqdmulhCode)
2146    threeEqualRegInstX("sqdmulh", "SqdmulhQX", "SimdMultOp",
2147                       ("int16_t", "int32_t"), 4, sqdmulhCode)
2148    threeEqualRegInstX("sqdmulh", "SqdmulhScX", "SimdMultOp",
2149                       ("int16_t", "int32_t"), 4, sqdmulhCode, scalar=True)
2150    # SQDMULL, SQDMULL2 (by element)
    # Saturating doubling multiply long.  destElem receives the doubled
    # product.  When both sources equal the most negative Element value,
    # destElem is replaced by ~((BigElement)srcElem1 << Element bits) and
    # fpscr.qc is set.
    # NOTE(review): unlike the SMULL calls, these pass a positional True after
    # the snippet (as the accumulating SMLAL/SQDMLAL calls do) even though the
    # snippet never reads destElem -- confirm that it is needed.
2151    qdmullCode = '''
2152        FPSCR fpscr = (FPSCR) FpscrQc;
2153        destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
2154        if (srcElem1 == srcElem2 &&
2155                srcElem1 == (Element)((Element)1 <<
2156                    (Element)(sizeof(Element) * 8 - 1))) {
2157            destElem = ~((BigElement)srcElem1 << (sizeof(Element) * 8));
2158            fpscr.qc = 1;
2159        }
2160        FpscrQc = fpscr;
2161    '''
2162    threeRegLongInstX("sqdmull", "SqdmullElemX", "SimdMultOp",
2163                      ("int16_t", "int32_t"), qdmullCode, True, byElem=True)
2164    threeRegLongInstX("sqdmull", "SqdmullElem2X", "SimdMultOp",
2165                      ("int16_t", "int32_t"), qdmullCode, True, byElem=True,
2166                      hi=True)
2167    threeRegLongInstX("sqdmull", "SqdmullElemScX", "SimdMultOp",
2168                      ("int16_t", "int32_t"), qdmullCode, True, byElem=True,
2169                      scalar=True)
2170    # SQDMULL, SQDMULL2 (vector)
    # Same snippet without byElem, again for the 16- and 32-bit types only.
2171    threeRegLongInstX("sqdmull", "SqdmullX", "SimdMultOp",
2172                      ("int16_t", "int32_t"), qdmullCode, True)
2173    threeRegLongInstX("sqdmull", "Sqdmull2X", "SimdMultOp",
2174                      ("int16_t", "int32_t"), qdmullCode, True, hi=True)
2175    threeRegLongInstX("sqdmull", "SqdmullScX", "SimdMultOp",
2176                      ("int16_t", "int32_t"), qdmullCode, True, scalar=True)
2177    # SQNEG
    # Saturating negate.  When srcElem1 has only its top bit set (the most
    # negative Element value) the snippet sets fpscr.qc and writes ~srcElem1;
    # every other input is simply negated.
2178    sqnegCode = '''
2179        FPSCR fpscr = (FPSCR) FpscrQc;
2180        if (srcElem1 == (Element)((Element)1 << (sizeof(Element) * 8 - 1))) {
2181            fpscr.qc = 1;
2182            destElem = ~srcElem1;
2183        } else {
2184            destElem = -srcElem1;
2185        }
2186        FpscrQc = fpscr;
2187    '''
    # The DX class is given smallSignedTypes; the QX and scalar classes are
    # given signedTypes.
2188    twoEqualRegInstX("sqneg", "SqnegDX", "SimdAluOp", smallSignedTypes, 2,
2189                     sqnegCode)
2190    twoEqualRegInstX("sqneg", "SqnegQX", "SimdAluOp", signedTypes, 4,
2191                     sqnegCode)
2192    twoEqualRegInstX("sqneg", "SqnegScX", "SimdAluOp", signedTypes, 4,
2193                     sqnegCode, scalar=True)
2194    # SQRDMULH (by element)
2195    sqrdmulhCode = '''
2196            FPSCR fpscr = (FPSCR) FpscrQc;
2197            destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2 +
2198                        ((int64_t)1 << (sizeof(Element) * 8 - 1))) >>
2199                       (sizeof(Element) * 8);
2200            Element maxNeg = (Element)1 << (sizeof(Element) * 8 - 1);
2201            Element halfNeg = maxNeg / 2;
2202            if ((srcElem1 == maxNeg && srcElem2 == maxNeg) ||
2203                (srcElem1 == halfNeg && srcElem2 == maxNeg) ||
2204                (srcElem1 == maxNeg && srcElem2 == halfNeg)) {
2205                if (destElem < 0) {
2206                    destElem = mask(sizeof(Element) * 8 - 1);
2207                } else {
2208                    destElem = (Element)1 << (sizeof(Element) * 8 - 1);
2209                }
2210                fpscr.qc = 1;
2211            }
2212            FpscrQc = fpscr;
2213    '''
2214    threeEqualRegInstX("sqrdmulh", "SqrdmulhElemDX", "SimdMultOp",
2215                       ("int16_t", "int32_t"), 2, sqrdmulhCode, byElem=True)
2216    threeEqualRegInstX("sqrdmulh", "SqrdmulhElemQX", "SimdMultOp",
2217                       ("int16_t", "int32_t"), 4, sqrdmulhCode, byElem=True)
2218    threeEqualRegInstX("sqrdmulh", "SqrdmulhElemScX", "SimdMultOp",
2219                       ("int16_t", "int32_t"), 4, sqrdmulhCode, byElem=True,
2220                       scalar=True)
2221    # SQRDMULH (vector)
2222    threeEqualRegInstX("sqrdmulh", "SqrdmulhDX", "SimdMultOp",
2223                       ("int16_t", "int32_t"), 2, sqrdmulhCode)
2224    threeEqualRegInstX("sqrdmulh", "SqrdmulhQX", "SimdMultOp",
2225                       ("int16_t", "int32_t"), 4, sqrdmulhCode)
2226    threeEqualRegInstX("sqrdmulh", "SqrdmulhScX", "SimdMultOp",
2227                       ("int16_t", "int32_t"), 4, sqrdmulhCode, scalar=True)
2228    # SQRSHL
    # Saturating rounding shift by a register amount.  shiftAmt is srcElem2
    # truncated to a signed 8-bit value.
    #  * shiftAmt < 0: right shift by -shiftAmt with rounding.  rBit is the
    #    last bit shifted out (forced to 1 for a negative source when the
    #    amount exceeds the element width).  Amounts of the element width or
    #    more start from 0; the sign fix-up then ORs in the sign bits for a
    #    negative source, and rBit is added last.
    #  * shiftAmt > 0: left shift.  Saturation occurs when the amount is at
    #    least the element width and the source is non-zero, or when the top
    #    shiftAmt + 1 bits of the source are not all equal to its sign.  On
    #    saturation fpscr.qc is set and destElem becomes mask(bits - 1),
    #    complemented for a negative source.
    #  * shiftAmt == 0: plain copy.
2229    sqrshlCode = '''
2230            int16_t shiftAmt = (int8_t)srcElem2;
2231            FPSCR fpscr = (FPSCR) FpscrQc;
2232            if (shiftAmt < 0) {
2233                shiftAmt = -shiftAmt;
2234                Element rBit = 0;
2235                if (shiftAmt <= sizeof(Element) * 8)
2236                    rBit = bits(srcElem1, shiftAmt - 1);
2237                if (shiftAmt > sizeof(Element) * 8 && srcElem1 < 0)
2238                    rBit = 1;
2239                if (shiftAmt >= sizeof(Element) * 8) {
2240                    shiftAmt = sizeof(Element) * 8 - 1;
2241                    destElem = 0;
2242                } else {
2243                    destElem = (srcElem1 >> shiftAmt);
2244                }
2245                // Make sure the right shift sign extended when it should.
2246                if (srcElem1 < 0 && destElem >= 0) {
2247                    destElem |= -((Element)1 << (sizeof(Element) * 8 -
2248                                                 1 - shiftAmt));
2249                }
2250                destElem += rBit;
2251            } else if (shiftAmt > 0) {
2252                bool sat = false;
2253                if (shiftAmt >= sizeof(Element) * 8) {
2254                    if (srcElem1 != 0)
2255                        sat = true;
2256                    else
2257                        destElem = 0;
2258                } else {
2259                    if (bits((uint64_t) srcElem1, sizeof(Element) * 8 - 1,
2260                                sizeof(Element) * 8 - 1 - shiftAmt) !=
2261                            ((srcElem1 < 0) ? mask(shiftAmt + 1) : 0)) {
2262                        sat = true;
2263                    } else {
2264                        destElem = srcElem1 << shiftAmt;
2265                    }
2266                }
2267                if (sat) {
2268                    fpscr.qc = 1;
2269                    destElem = mask(sizeof(Element) * 8 - 1);
2270                    if (srcElem1 < 0)
2271                        destElem = ~destElem;
2272                }
2273            } else {
2274                destElem = srcElem1;
2275            }
2276            FpscrQc = fpscr;
2277    '''
    # The DX class is given smallSignedTypes; the QX and scalar classes are
    # given signedTypes.
2278    threeEqualRegInstX("sqrshl", "SqrshlDX", "SimdCmpOp", smallSignedTypes, 2,
2279                       sqrshlCode)
2280    threeEqualRegInstX("sqrshl", "SqrshlQX", "SimdCmpOp", signedTypes, 4,
2281                       sqrshlCode)
2282    threeEqualRegInstX("sqrshl", "SqrshlScX", "SimdCmpOp", signedTypes, 4,
2283                       sqrshlCode, scalar=True)
2284    # SQRSHRN, SQRSHRN2
    # Saturating rounding shift right by immediate, narrowing to Element.
    #  * imm greater than the bit width of srcElem1: destElem is 0 and
    #    fpscr.qc is set unless the source is 0 or -1.
    #  * imm non-zero: the source is shifted right by imm in two steps so the
    #    last bit shifted out (rBit) can be captured, the sign bits are
    #    restored with the OR of the negated top bit, and rBit is added.  If
    #    the result does not survive a round trip through Element, destElem
    #    becomes mask(bits - 1) (complemented for a negative source) and
    #    fpscr.qc is set.
    #  * imm == 0: only the range check / saturation is applied.
2285    sqrshrnCode = '''
2286            FPSCR fpscr = (FPSCR) FpscrQc;
2287            if (imm > sizeof(srcElem1) * 8) {
2288                if (srcElem1 != 0 && srcElem1 != -1)
2289                    fpscr.qc = 1;
2290                destElem = 0;
2291            } else if (imm) {
2292                BigElement mid = (srcElem1 >> (imm - 1));
2293                uint64_t rBit = mid & 0x1;
2294                mid >>= 1;
2295                mid |= -(mid & ((BigElement)1 <<
2296                            (sizeof(BigElement) * 8 - 1 - imm)));
2297                mid += rBit;
2298                if (mid != (Element)mid) {
2299                    destElem = mask(sizeof(Element) * 8 - 1);
2300                    if (srcElem1 < 0)
2301                        destElem = ~destElem;
2302                    fpscr.qc = 1;
2303                } else {
2304                    destElem = mid;
2305                }
2306            } else {
2307                if (srcElem1 != (Element)srcElem1) {
2308                    destElem = mask(sizeof(Element) * 8 - 1);
2309                    if (srcElem1 < 0)
2310                        destElem = ~destElem;
2311                    fpscr.qc = 1;
2312                } else {
2313                    destElem = srcElem1;
2314                }
2315            }
2316            FpscrQc = fpscr;
2317    '''
    # hasImm=True is passed for every class; the hi=True class is registered
    # under the "sqrshrn2" mnemonic string.
2318    twoRegNarrowInstX("sqrshrn", "SqrshrnX", "SimdShiftOp", smallSignedTypes,
2319                      sqrshrnCode, hasImm=True)
2320    twoRegNarrowInstX("sqrshrn2", "Sqrshrn2X", "SimdShiftOp", smallSignedTypes,
2321                      sqrshrnCode, hasImm=True, hi=True)
2322    twoRegNarrowInstX("sqrshrn", "SqrshrnScX", "SimdShiftOp", smallSignedTypes,
2323                      sqrshrnCode, hasImm=True, scalar=True)
2324    # SQRSHRUN, SQRSHRUN2
    # Saturating rounding shift right by immediate, narrowing a signed source
    # to an unsigned range.
    #  * imm greater than the bit width of srcElem1: destElem is 0 and
    #    fpscr.qc is set for any non-zero source.
    #  * imm non-zero: rounding shift as in the signed variant; if any bit of
    #    the result at or above the Element width is set, destElem becomes 0
    #    for a negative source or mask(bits) otherwise, and fpscr.qc is set.
    #  * imm == 0: a negative source gives 0 with fpscr.qc set; anything else
    #    is copied.
    # NOTE(review): the hi=True class is registered with the mnemonic string
    # "sqrshrun", whereas the SQRSHRN/SQSHRN hi classes use "sqrshrn2" /
    # "sqshrn2" -- confirm which spelling the disassembly should show.
2325    sqrshrunCode = '''
2326            FPSCR fpscr = (FPSCR) FpscrQc;
2327            if (imm > sizeof(srcElem1) * 8) {
2328                if (srcElem1 != 0)
2329                    fpscr.qc = 1;
2330                destElem = 0;
2331            } else if (imm) {
2332                BigElement mid = (srcElem1 >> (imm - 1));
2333                uint64_t rBit = mid & 0x1;
2334                mid >>= 1;
2335                mid |= -(mid & ((BigElement)1 <<
2336                                (sizeof(BigElement) * 8 - 1 - imm)));
2337                mid += rBit;
2338                if (bits(mid, sizeof(BigElement) * 8 - 1,
2339                              sizeof(Element) * 8) != 0) {
2340                    if (srcElem1 < 0) {
2341                        destElem = 0;
2342                    } else {
2343                        destElem = mask(sizeof(Element) * 8);
2344                    }
2345                    fpscr.qc = 1;
2346                } else {
2347                    destElem = mid;
2348                }
2349            } else {
2350                if (srcElem1 < 0) {
2351                    fpscr.qc = 1;
2352                    destElem = 0;
2353                } else {
2354                    destElem = srcElem1;
2355                }
2356            }
2357            FpscrQc = fpscr;
2358    '''
2359    twoRegNarrowInstX("sqrshrun", "SqrshrunX", "SimdShiftOp", smallSignedTypes,
2360                      sqrshrunCode, hasImm=True)
2361    twoRegNarrowInstX("sqrshrun", "Sqrshrun2X", "SimdShiftOp",
2362                      smallSignedTypes, sqrshrunCode, hasImm=True, hi=True)
2363    twoRegNarrowInstX("sqrshrun", "SqrshrunScX", "SimdShiftOp",
2364                      smallSignedTypes, sqrshrunCode, hasImm=True, scalar=True)
2365    # SQSHL (immediate)
    # Saturating shift left by immediate.
    #  * imm at least the element width: a zero source gives 0; any other
    #    source saturates.
    #  * imm non-zero: the shifted value is kept unless the top imm + 1 bits
    #    of the source are neither all zero nor all one, in which case it
    #    saturates.
    #  * imm == 0: plain copy.
    # Saturation writes 1 << (bits - 1), complemented for a positive source,
    # and sets fpscr.qc.
2366    sqshlImmCode = '''
2367            FPSCR fpscr = (FPSCR) FpscrQc;
2368            if (imm >= sizeof(Element) * 8) {
2369                if (srcElem1 != 0) {
2370                    destElem = (Element)1 << (sizeof(Element) * 8 - 1);
2371                    if (srcElem1 > 0)
2372                        destElem = ~destElem;
2373                    fpscr.qc = 1;
2374                } else {
2375                    destElem = 0;
2376                }
2377            } else if (imm) {
2378                destElem = (srcElem1 << imm);
2379                uint64_t topBits = bits((uint64_t)srcElem1,
2380                                        sizeof(Element) * 8 - 1,
2381                                        sizeof(Element) * 8 - 1 - imm);
2382                if (topBits != 0 && topBits != mask(imm + 1)) {
2383                    destElem = (Element)1 << (sizeof(Element) * 8 - 1);
2384                    if (srcElem1 > 0)
2385                        destElem = ~destElem;
2386                    fpscr.qc = 1;
2387                }
2388            } else {
2389                destElem = srcElem1;
2390            }
2391            FpscrQc = fpscr;
2392    '''
    # The DX class is given smallSignedTypes; the QX and scalar classes are
    # given signedTypes.  Every class passes hasImm=True.
2393    twoEqualRegInstX("sqshl", "SqshlImmDX", "SimdAluOp", smallSignedTypes, 2,
2394                     sqshlImmCode, hasImm=True)
2395    twoEqualRegInstX("sqshl", "SqshlImmQX", "SimdAluOp", signedTypes, 4,
2396                     sqshlImmCode, hasImm=True)
2397    twoEqualRegInstX("sqshl", "SqshlImmScX", "SimdAluOp", signedTypes, 4,
2398                     sqshlImmCode, hasImm=True, scalar=True)
2399    # SQSHL (register)
    # Saturating shift by a register amount, without rounding.  shiftAmt is
    # srcElem2 truncated to a signed 8-bit value.
    #  * shiftAmt < 0: right shift by -shiftAmt.  Amounts of the element
    #    width or more start from 0; the sign fix-up then ORs in the sign
    #    bits for a negative source.
    #  * shiftAmt > 0: left shift.  Saturation occurs when the amount is at
    #    least the element width and the source is non-zero, or when the top
    #    shiftAmt + 1 bits of the source are not all equal to its sign.  On
    #    saturation fpscr.qc is set and destElem becomes mask(bits - 1),
    #    complemented for a negative source.
    #  * shiftAmt == 0: plain copy.
2400    sqshlCode = '''
2401            int16_t shiftAmt = (int8_t)srcElem2;
2402            FPSCR fpscr = (FPSCR) FpscrQc;
2403            if (shiftAmt < 0) {
2404                shiftAmt = -shiftAmt;
2405                if (shiftAmt >= sizeof(Element) * 8) {
2406                    shiftAmt = sizeof(Element) * 8 - 1;
2407                    destElem = 0;
2408                } else {
2409                    destElem = (srcElem1 >> shiftAmt);
2410                }
2411                // Make sure the right shift sign extended when it should.
2412                if (srcElem1 < 0 && destElem >= 0) {
2413                    destElem |= -((Element)1 << (sizeof(Element) * 8 -
2414                                                 1 - shiftAmt));
2415                }
2416            } else if (shiftAmt > 0) {
2417                bool sat = false;
2418                if (shiftAmt >= sizeof(Element) * 8) {
2419                    if (srcElem1 != 0)
2420                        sat = true;
2421                    else
2422                        destElem = 0;
2423                } else {
2424                    if (bits((uint64_t) srcElem1, sizeof(Element) * 8 - 1,
2425                                sizeof(Element) * 8 - 1 - shiftAmt) !=
2426                            ((srcElem1 < 0) ? mask(shiftAmt + 1) : 0)) {
2427                        sat = true;
2428                    } else {
2429                        destElem = srcElem1 << shiftAmt;
2430                    }
2431                }
2432                if (sat) {
2433                    fpscr.qc = 1;
2434                    destElem = mask(sizeof(Element) * 8 - 1);
2435                    if (srcElem1 < 0)
2436                        destElem = ~destElem;
2437                }
2438            } else {
2439                destElem = srcElem1;
2440            }
2441            FpscrQc = fpscr;
2442    '''
    # The DX class is given smallSignedTypes; the QX and scalar classes are
    # given signedTypes.
2443    threeEqualRegInstX("sqshl", "SqshlDX", "SimdAluOp", smallSignedTypes, 2,
2444                       sqshlCode)
2445    threeEqualRegInstX("sqshl", "SqshlQX", "SimdAluOp", signedTypes, 4,
2446                       sqshlCode)
2447    threeEqualRegInstX("sqshl", "SqshlScX", "SimdAluOp", signedTypes, 4,
2448                       sqshlCode, scalar=True)
2449    # SQSHLU
    # Saturating shift left by immediate of a signed source into an unsigned
    # range.  In every branch a negative source gives 0 with fpscr.qc set.
    #  * imm at least the element width: a positive source saturates to
    #    mask(bits) with fpscr.qc set; zero stays 0.
    #  * imm non-zero: the shifted value is kept unless any of the top imm
    #    bits of the source is set, in which case destElem becomes mask(bits)
    #    and fpscr.qc is set.
    #  * imm == 0: non-negative sources are copied.
2450    sqshluCode = '''
2451            FPSCR fpscr = (FPSCR) FpscrQc;
2452            if (imm >= sizeof(Element) * 8) {
2453                if (srcElem1 < 0) {
2454                    destElem = 0;
2455                    fpscr.qc = 1;
2456                } else if (srcElem1 > 0) {
2457                    destElem = mask(sizeof(Element) * 8);
2458                    fpscr.qc = 1;
2459                } else {
2460                    destElem = 0;
2461                }
2462            } else if (imm) {
2463                destElem = (srcElem1 << imm);
2464                uint64_t topBits = bits((uint64_t)srcElem1,
2465                                        sizeof(Element) * 8 - 1,
2466                                        sizeof(Element) * 8 - imm);
2467                if (srcElem1 < 0) {
2468                    destElem = 0;
2469                    fpscr.qc = 1;
2470                } else if (topBits != 0) {
2471                    destElem = mask(sizeof(Element) * 8);
2472                    fpscr.qc = 1;
2473                }
2474            } else {
2475                if (srcElem1 < 0) {
2476                    fpscr.qc = 1;
2477                    destElem = 0;
2478                } else {
2479                    destElem = srcElem1;
2480                }
2481            }
2482            FpscrQc = fpscr;
2483    '''
    # The DX class is given smallSignedTypes; the QX and scalar classes are
    # given signedTypes.  Every class passes hasImm=True.
2484    twoEqualRegInstX("sqshlu", "SqshluDX", "SimdAluOp", smallSignedTypes, 2,
2485                     sqshluCode, hasImm=True)
2486    twoEqualRegInstX("sqshlu", "SqshluQX", "SimdAluOp", signedTypes, 4,
2487                     sqshluCode, hasImm=True)
2488    twoEqualRegInstX("sqshlu", "SqshluScX", "SimdAluOp", signedTypes, 4,
2489                     sqshluCode, hasImm=True, scalar=True)
2490    # SQSHRN, SQSHRN2
    # Saturating shift right by immediate (no rounding), narrowing to
    # Element.
    #  * imm greater than the bit width of srcElem1: destElem is 0 and
    #    fpscr.qc is set unless the source is 0 or -1.
    #  * imm non-zero: the source is shifted right by imm (in two steps), the
    #    sign bits are restored, and the result is range-checked by a round
    #    trip through Element.  Out-of-range results become mask(bits - 1)
    #    (complemented for a negative source) with fpscr.qc set.
    #  * imm == 0: the source is copied.
    # NOTE(review): the imm == 0 branch copies without the range check that
    # the rounding variant (sqrshrnCode) performs -- confirm imm is never 0
    # here.
2491    sqshrnCode = '''
2492        FPSCR fpscr = (FPSCR) FpscrQc;
2493        if (imm > sizeof(srcElem1) * 8) {
2494            if (srcElem1 != 0 && srcElem1 != -1)
2495                fpscr.qc = 1;
2496            destElem = 0;
2497        } else if (imm) {
2498            BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
2499            mid |= -(mid & ((BigElement)1 <<
2500                        (sizeof(BigElement) * 8 - 1 - imm)));
2501            if (mid != (Element)mid) {
2502                destElem = mask(sizeof(Element) * 8 - 1);
2503                if (srcElem1 < 0)
2504                    destElem = ~destElem;
2505                fpscr.qc = 1;
2506            } else {
2507                destElem = mid;
2508            }
2509        } else {
2510            destElem = srcElem1;
2511        }
2512        FpscrQc = fpscr;
2513    '''
2514    twoRegNarrowInstX("sqshrn", "SqshrnX", "SimdShiftOp", smallSignedTypes,
2515                      sqshrnCode, hasImm=True)
2516    twoRegNarrowInstX("sqshrn2", "Sqshrn2X", "SimdShiftOp", smallSignedTypes,
2517                      sqshrnCode, hasImm=True, hi=True)
2518    twoRegNarrowInstX("sqshrn", "SqshrnScX", "SimdShiftOp", smallSignedTypes,
2519                      sqshrnCode, hasImm=True, scalar=True)
2520    # SQSHRUN, SQSHRUN2
    # Saturating shift right by immediate (no rounding), narrowing a signed
    # source to an unsigned range.
    #  * imm greater than the bit width of srcElem1: destElem is 0 and
    #    fpscr.qc is set for any non-zero source.
    #  * imm non-zero: the source is shifted right by imm; if any bit of the
    #    result at or above the Element width is set, destElem becomes 0 for
    #    a negative source or mask(bits) otherwise, and fpscr.qc is set.
    #  * imm == 0: the source is copied.
2521    sqshrunCode = '''
2522            FPSCR fpscr = (FPSCR) FpscrQc;
2523            if (imm > sizeof(srcElem1) * 8) {
2524                if (srcElem1 != 0)
2525                    fpscr.qc = 1;
2526                destElem = 0;
2527            } else if (imm) {
2528                BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
2529                if (bits(mid, sizeof(BigElement) * 8 - 1,
2530                              sizeof(Element) * 8) != 0) {
2531                    if (srcElem1 < 0) {
2532                        destElem = 0;
2533                    } else {
2534                        destElem = mask(sizeof(Element) * 8);
2535                    }
2536                    fpscr.qc = 1;
2537                } else {
2538                    destElem = mid;
2539                }
2540            } else {
2541                destElem = srcElem1;
2542            }
2543            FpscrQc = fpscr;
2544    '''
    # All three classes share the "sqshrun" mnemonic string, including the
    # hi=True one.
2545    twoRegNarrowInstX("sqshrun", "SqshrunX", "SimdShiftOp", smallSignedTypes,
2546                      sqshrunCode, hasImm=True)
2547    twoRegNarrowInstX("sqshrun", "Sqshrun2X", "SimdShiftOp", smallSignedTypes,
2548                      sqshrunCode, hasImm=True, hi=True)
2549    twoRegNarrowInstX("sqshrun", "SqshrunScX", "SimdShiftOp", smallSignedTypes,
2550                      sqshrunCode, hasImm=True, scalar=True)
2551    # SQSUB
    # Saturating subtract.  The difference is computed first; overflow is
    # detected when srcElem1 is negative exactly when srcElem2 is
    # non-negative (the operands have opposite signs) and the difference's
    # sign differs from srcElem1's.  On overflow destElem is set to
    # 1 << (bits - 1) (the most negative value) and, when the wrapped
    # difference was negative, decremented by one, which is intended to wrap
    # to the largest positive value.  fpscr.qc is set in either case.
2552    sqsubCode = '''
2553            destElem = srcElem1 - srcElem2;
2554            FPSCR fpscr = (FPSCR) FpscrQc;
2555            bool negDest = (destElem < 0);
2556            bool negSrc1 = (srcElem1 < 0);
2557            bool posSrc2 = (srcElem2 >= 0);
2558            if ((negDest != negSrc1) && (negSrc1 == posSrc2)) {
2559                destElem = (Element)1 << (sizeof(Element) * 8 - 1);
2560                if (negDest)
2561                    destElem -= 1;
2562                fpscr.qc = 1;
2563            }
2564            FpscrQc = fpscr;
2565    '''
    # The DX class is given smallSignedTypes; the QX and scalar classes are
    # given signedTypes.
2566    threeEqualRegInstX("sqsub", "SqsubDX", "SimdAddOp", smallSignedTypes, 2,
2567                       sqsubCode)
2568    threeEqualRegInstX("sqsub", "SqsubQX", "SimdAddOp", signedTypes, 4,
2569                       sqsubCode)
2570    threeEqualRegInstX("sqsub", "SqsubScX", "SimdAddOp", signedTypes, 4,
2571                       sqsubCode, scalar=True)
2572    # SQXTN, SQXTN2
    # Saturating narrow.  The source is assigned to destElem and then
    # compared with destElem widened back to BigElement.  If the values
    # differ, fpscr.qc is set and destElem becomes mask(bits - 1),
    # complemented for a negative source.
2573    sqxtnCode = '''
2574            FPSCR fpscr = (FPSCR) FpscrQc;
2575            destElem = srcElem1;
2576            if ((BigElement)destElem != srcElem1) {
2577                fpscr.qc = 1;
2578                destElem = mask(sizeof(Element) * 8 - 1);
2579                if (srcElem1 < 0)
2580                    destElem = ~destElem;
2581            }
2582            FpscrQc = fpscr;
2583    '''
    # All three classes share the "sqxtn" mnemonic string, including the
    # hi=True one.
2584    twoRegNarrowInstX("sqxtn", "SqxtnX", "SimdMiscOp", smallSignedTypes,
2585                      sqxtnCode)
2586    twoRegNarrowInstX("sqxtn", "Sqxtn2X", "SimdMiscOp", smallSignedTypes,
2587                      sqxtnCode, hi=True)
2588    twoRegNarrowInstX("sqxtn", "SqxtnScX", "SimdMiscOp", smallSignedTypes,
2589                      sqxtnCode, scalar=True)
2590    # SQXTUN, SQXTUN2
    # Saturating narrow of a signed source to an unsigned range.  Saturation
    # happens when the source is negative or when the low Element-width bits
    # of the narrowed value differ from the source.  fpscr.qc is then set and
    # destElem becomes mask(bits), complemented for a negative source.
2591    sqxtunCode = '''
2592            FPSCR fpscr = (FPSCR) FpscrQc;
2593            destElem = srcElem1;
2594            if (srcElem1 < 0 ||
2595                    ((BigElement)destElem & mask(sizeof(Element) * 8)) != srcElem1) {
2596                fpscr.qc = 1;
2597                destElem = mask(sizeof(Element) * 8);
2598                if (srcElem1 < 0)
2599                    destElem = ~destElem;
2600            }
2601            FpscrQc = fpscr;
2602    '''
    # All three classes share the "sqxtun" mnemonic string, including the
    # hi=True one.
2603    twoRegNarrowInstX("sqxtun", "SqxtunX", "SimdMiscOp", smallSignedTypes,
2604                      sqxtunCode)
2605    twoRegNarrowInstX("sqxtun", "Sqxtun2X", "SimdMiscOp", smallSignedTypes,
2606                      sqxtunCode, hi=True)
2607    twoRegNarrowInstX("sqxtun", "SqxtunScX", "SimdMiscOp", smallSignedTypes,
2608                      sqxtunCode, scalar=True)
2609    # SRHADD
    # Rounding halving add.  carryBit is (lsb(srcElem1) + lsb(srcElem2) + 1)
    # >> 1, i.e. the contribution of the two low bits plus the rounding
    # increment.  Each source has its low bit cleared and is halved, and the
    # two halves plus carryBit form the result, so the full-width sum is
    # never computed.
2610    rhaddCode = '''
2611            Element carryBit =
2612                (((unsigned)srcElem1 & 0x1) +
2613                 ((unsigned)srcElem2 & 0x1) + 1) >> 1;
2614            // Use division instead of a shift to ensure the sign extension works
2615            // right. The compiler will figure out if it can be a shift. Mask the
2616            // inputs so they get truncated correctly.
2617            destElem = (((srcElem1 & ~(Element)1) / 2) +
2618                        ((srcElem2 & ~(Element)1) / 2)) + carryBit;
2619    '''
    # Both classes are registered for smallSignedTypes.
2620    threeEqualRegInstX("srhadd", "SrhaddDX", "SimdAddOp", smallSignedTypes, 2,
2621                       rhaddCode)
2622    threeEqualRegInstX("srhadd", "SrhaddQX", "SimdAddOp", smallSignedTypes, 4,
2623                       rhaddCode)
2624    # SRI
    # Shift right and insert.  For imm of the element width or more destElem
    # is left unchanged.  Otherwise the source shifted right by imm is OR-ed
    # with destElem masked by ~mask(bits - imm), so the top imm bits of the
    # old destination are kept.
    # The classes are registered for unsignedTypes; the positional True after
    # the snippet presumably marks destElem as an input as well -- confirm in
    # twoEqualRegInstX.
2625    sriCode = '''
2626            if (imm >= sizeof(Element) * 8)
2627                destElem = destElem;
2628            else
2629                destElem = (srcElem1 >> imm) |
2630                    (destElem & ~mask(sizeof(Element) * 8 - imm));
2631    '''
2632    twoEqualRegInstX("sri", "SriDX", "SimdShiftOp", unsignedTypes, 2, sriCode,
2633                     True, hasImm=True)
2634    twoEqualRegInstX("sri", "SriQX", "SimdShiftOp", unsignedTypes, 4, sriCode,
2635                     True, hasImm=True)
2636    # SRSHL
    # Rounding shift by a register amount, without saturation (FpscrQc is not
    # touched).  shiftAmt is srcElem2 truncated to a signed 8-bit value.
    #  * shiftAmt < 0: right shift by -shiftAmt with rounding.  rBit is the
    #    last bit shifted out (forced to 1 when the amount exceeds the element
    #    width and ltz(srcElem1) holds).  Amounts of the element width or more
    #    start from 0; the sign fix-up then ORs in the sign bits, and rBit is
    #    added last.
    #  * shiftAmt > 0: left shift; amounts of the element width or more give
    #    0.
    #  * shiftAmt == 0: plain copy.
    # Sign tests go through ltz() rather than a direct `< 0` comparison.
2637    rshlCode = '''
2638            int16_t shiftAmt = (int8_t)srcElem2;
2639            if (shiftAmt < 0) {
2640                shiftAmt = -shiftAmt;
2641                Element rBit = 0;
2642                if (shiftAmt <= sizeof(Element) * 8)
2643                    rBit = bits(srcElem1, shiftAmt - 1);
2644                if (shiftAmt > sizeof(Element) * 8 && ltz(srcElem1))
2645                    rBit = 1;
2646                if (shiftAmt >= sizeof(Element) * 8) {
2647                    shiftAmt = sizeof(Element) * 8 - 1;
2648                    destElem = 0;
2649                } else {
2650                    destElem = (srcElem1 >> shiftAmt);
2651                }
2652                // Make sure the right shift sign extended when it should.
2653                if (ltz(srcElem1) && !ltz(destElem)) {
2654                    destElem |= -((Element)1 << (sizeof(Element) * 8 -
2655                                                 1 - shiftAmt));
2656                }
2657                destElem += rBit;
2658            } else if (shiftAmt > 0) {
2659                if (shiftAmt >= sizeof(Element) * 8) {
2660                    destElem = 0;
2661                } else {
2662                    destElem = srcElem1 << shiftAmt;
2663                }
2664            } else {
2665                destElem = srcElem1;
2666            }
2667    '''
    # Both classes are registered for signedTypes.
2668    threeEqualRegInstX("srshl", "SrshlDX", "SimdShiftOp", signedTypes, 2,
2669                       rshlCode)
2670    threeEqualRegInstX("srshl", "SrshlQX", "SimdShiftOp", signedTypes, 4,
2671                       rshlCode)
2672    # SRSHR
2673    rshrCode = '''
2674            if (imm > sizeof(srcElem1) * 8) {
2675                destElem = 0;
2676            } else if (imm) {
2677                Element rBit = bits(srcElem1, imm - 1);
2678                destElem = ((srcElem1 >> (imm - 1)) >> 1) + rBit;
2679            } else {
2680                destElem = srcElem1;
2681            }
2682    '''
2683    twoEqualRegInstX("srshr", "SrshrDX", "SimdShiftOp", signedTypes, 2,
2684                     rshrCode, hasImm=True)
2685    twoEqualRegInstX("srshr", "SrshrQX", "SimdShiftOp", signedTypes, 4,
2686                     rshrCode, hasImm=True)
2687    # SRSRA
2688    rsraCode = '''
2689            if (imm > sizeof(srcElem1) * 8) {
2690                destElem += 0;
2691            } else if (imm) {
2692                Element rBit = bits(srcElem1, imm - 1);
2693                destElem += ((srcElem1 >> (imm - 1)) >> 1) + rBit;
2694            } else {
2695                destElem += srcElem1;
2696            }
2697    '''
2698    twoEqualRegInstX("srsra", "SrsraDX", "SimdShiftOp", signedTypes, 2,
2699                     rsraCode, True, hasImm=True)
2700    twoEqualRegInstX("srsra", "SrsraQX", "SimdShiftOp", signedTypes, 4,
2701                     rsraCode, True, hasImm=True)
2702    # SSHL
2703    shlCode = '''
2704            int16_t shiftAmt = (int8_t)srcElem2;
2705            if (shiftAmt < 0) {
2706                shiftAmt = -shiftAmt;
2707                if (shiftAmt >= sizeof(Element) * 8) {
2708                    shiftAmt = sizeof(Element) * 8 - 1;
2709                    destElem = 0;
2710                } else {
2711                    destElem = (srcElem1 >> shiftAmt);
2712                }
2713                // Make sure the right shift sign extended when it should.
2714                if (ltz(srcElem1) && !ltz(destElem)) {
2715                    destElem |= -((Element)1 << (sizeof(Element) * 8 -
2716                                                 1 - shiftAmt));
2717                }
2718            } else {
2719                if (shiftAmt >= sizeof(Element) * 8) {
2720                    destElem = 0;
2721                } else {
2722                    destElem = srcElem1 << shiftAmt;
2723                }
2724            }
2725    '''
2726    threeEqualRegInstX("sshl", "SshlDX", "SimdShiftOp", signedTypes, 2,
2727                       shlCode)
2728    threeEqualRegInstX("sshl", "SshlQX", "SimdShiftOp", signedTypes, 4,
2729                       shlCode)
2730    # SSHLL, SSHLL2
2731    shllCode = '''
2732            if (imm >= sizeof(destElem) * 8) {
2733                destElem = 0;
2734            } else {
2735                destElem = (BigElement)srcElem1 << imm;
2736            }
2737    '''
2738    twoRegLongInstX("sshll", "SshllX", "SimdShiftOp", smallSignedTypes,
2739                    shllCode, hasImm=True)
2740    twoRegLongInstX("sshll", "Sshll2X", "SimdShiftOp", smallSignedTypes,
2741                    shllCode, hasImm=True, hi=True)
2742    # SSHR
2743    shrCode = '''
2744            if (imm >= sizeof(srcElem1) * 8) {
2745                if (ltz(srcElem1))
2746                    destElem = -1;
2747                else
2748                    destElem = 0;
2749            } else {
2750                destElem = srcElem1 >> imm;
2751            }
2752    '''
2753    twoEqualRegInstX("sshr", "SshrDX", "SimdShiftOp", signedTypes, 2, shrCode,
2754                     hasImm=True)
2755    twoEqualRegInstX("sshr", "SshrQX", "SimdShiftOp", signedTypes, 4, shrCode,
2756                     hasImm=True)
2757    # SSRA
2758    sraCode = '''
2759            Element mid;;
2760            if (imm >= sizeof(srcElem1) * 8) {
2761                mid = ltz(srcElem1) ? -1 : 0;
2762            } else {
2763                mid = srcElem1 >> imm;
2764                if (ltz(srcElem1) && !ltz(mid)) {
2765                    mid |= -(mid & ((Element)1 <<
2766                                    (sizeof(Element) * 8 - 1 - imm)));
2767                }
2768            }
2769            destElem += mid;
2770    '''
2771    twoEqualRegInstX("ssra", "SsraDX", "SimdShiftOp", signedTypes, 2, sraCode,
2772                     True, hasImm=True)
2773    twoEqualRegInstX("ssra", "SsraQX", "SimdShiftOp", signedTypes, 4, sraCode,
2774                     True, hasImm=True)
2775    # SSUBL
2776    sublwCode = "destElem = (BigElement)srcElem1 - (BigElement)srcElem2;"
2777    threeRegLongInstX("ssubl", "SsublX", "SimdAddOp", smallSignedTypes,
2778                      sublwCode)
2779    threeRegLongInstX("ssubl2", "Ssubl2X", "SimdAddOp", smallSignedTypes,
2780                      sublwCode, hi=True)
2781    # SSUBW
2782    threeRegWideInstX("ssubw", "SsubwX", "SimdAddOp", smallSignedTypes,
2783                      sublwCode)
2784    threeRegWideInstX("ssubw2", "Ssubw2X", "SimdAddOp", smallSignedTypes,
2785                      sublwCode, hi=True)
2786    # SUB
2787    subCode = "destElem = srcElem1 - srcElem2;"
2788    threeEqualRegInstX("sub", "SubDX", "SimdAddOp", unsignedTypes, 2, subCode)
2789    threeEqualRegInstX("sub", "SubQX", "SimdAddOp", unsignedTypes, 4, subCode)
2790    # SUBHN, SUBHN2
2791    subhnCode = '''
2792            destElem = ((BigElement)srcElem1 - (BigElement)srcElem2) >>
2793                        (sizeof(Element) * 8);
2794    '''
2795    threeRegNarrowInstX("subhn", "SubhnX", "SimdAddOp", smallUnsignedTypes,
2796                        subhnCode)
2797    threeRegNarrowInstX("subhn2", "Subhn2X", "SimdAddOp", smallUnsignedTypes,
2798                        subhnCode, hi=True)
2799    # SUQADD
2800    suqaddCode = '''
2801            FPSCR fpscr = (FPSCR) FpscrQc;
2802            Element tmp = destElem + srcElem1;
2803            if (bits(destElem, sizeof(Element) * 8 - 1) == 0) {
2804                if (bits(tmp, sizeof(Element) * 8 - 1) == 1 ||
2805                        tmp < srcElem1 || tmp < destElem) {
2806                    destElem = (((Element) 1) << (sizeof(Element) * 8 - 1)) - 1;
2807                    fpscr.qc = 1;
2808                } else {
2809                    destElem = tmp;
2810                }
2811            } else {
2812                Element absDestElem = (~destElem) + 1;
2813                if (absDestElem < srcElem1) {
2814                    // Still check for positive sat., no need to check for negative sat.
2815                    if (bits(tmp, sizeof(Element) * 8 - 1) == 1) {
2816                        destElem = (((Element) 1) << (sizeof(Element) * 8 - 1)) - 1;
2817                        fpscr.qc = 1;
2818                    } else {
2819                        destElem = tmp;
2820                    }
2821                } else {
2822                    destElem = tmp;
2823                }
2824            }
2825            FpscrQc = fpscr;
2826    '''
2827    twoEqualRegInstX("suqadd", "SuqaddDX", "SimdAddOp", smallUnsignedTypes, 2,
2828                     suqaddCode, True)
2829    twoEqualRegInstX("suqadd", "SuqaddQX", "SimdAddOp", unsignedTypes, 4,
2830                     suqaddCode, True)
2831    twoEqualRegInstX("suqadd", "SuqaddScX", "SimdAddOp", unsignedTypes, 4,
2832                     suqaddCode, True, scalar=True)
2833    # SXTL -> alias to SSHLL
2834    # TBL
2835    tbxTblInstX("tbl", "Tbl1DX", "SimdMiscOp", ("uint8_t",), 1, "true", 2)
2836    tbxTblInstX("tbl", "Tbl1QX", "SimdMiscOp", ("uint8_t",), 1, "true", 4)
2837    tbxTblInstX("tbl", "Tbl2DX", "SimdMiscOp", ("uint8_t",), 2, "true", 2)
2838    tbxTblInstX("tbl", "Tbl2QX", "SimdMiscOp", ("uint8_t",), 2, "true", 4)
2839    tbxTblInstX("tbl", "Tbl3DX", "SimdMiscOp", ("uint8_t",), 3, "true", 2)
2840    tbxTblInstX("tbl", "Tbl3QX", "SimdMiscOp", ("uint8_t",), 3, "true", 4)
2841    tbxTblInstX("tbl", "Tbl4DX", "SimdMiscOp", ("uint8_t",), 4, "true", 2)
2842    tbxTblInstX("tbl", "Tbl4QX", "SimdMiscOp", ("uint8_t",), 4, "true", 4)
2843    # TBX
2844    tbxTblInstX("tbx", "Tbx1DX", "SimdMiscOp", ("uint8_t",), 1, "false", 2)
2845    tbxTblInstX("tbx", "Tbx1QX", "SimdMiscOp", ("uint8_t",), 1, "false", 4)
2846    tbxTblInstX("tbx", "Tbx2DX", "SimdMiscOp", ("uint8_t",), 2, "false", 2)
2847    tbxTblInstX("tbx", "Tbx2QX", "SimdMiscOp", ("uint8_t",), 2, "false", 4)
2848    tbxTblInstX("tbx", "Tbx3DX", "SimdMiscOp", ("uint8_t",), 3, "false", 2)
2849    tbxTblInstX("tbx", "Tbx3QX", "SimdMiscOp", ("uint8_t",), 3, "false", 4)
2850    tbxTblInstX("tbx", "Tbx4DX", "SimdMiscOp", ("uint8_t",), 4, "false", 2)
2851    tbxTblInstX("tbx", "Tbx4QX", "SimdMiscOp", ("uint8_t",), 4, "false", 4)
2852    # TRN1
2853    trnCode = '''
2854        unsigned part = %s;
2855        for (unsigned i = 0; i < eCount / 2; i++) {
2856            destReg.elements[2 * i] = srcReg1.elements[2 * i + part];
2857            destReg.elements[2 * i + 1] = srcReg2.elements[2 * i + part];
2858        }
2859    '''
2860    threeRegScrambleInstX("trn1", "Trn1DX", "SimdAluOp", smallUnsignedTypes, 2,
2861                          trnCode % "0")
2862    threeRegScrambleInstX("trn1", "Trn1QX", "SimdAluOp", unsignedTypes, 4,
2863                          trnCode % "0")
2864    # TRN2
2865    threeRegScrambleInstX("trn2", "Trn2DX", "SimdAluOp", smallUnsignedTypes, 2,
2866                          trnCode % "1")
2867    threeRegScrambleInstX("trn2", "Trn2QX", "SimdAluOp", unsignedTypes, 4,
2868                          trnCode % "1")
2869    # UABA
2870    threeEqualRegInstX("uaba", "UabaDX", "SimdAddAccOp", smallUnsignedTypes, 2,
2871                       abaCode, True)
2872    threeEqualRegInstX("uaba", "UabaQX", "SimdAddAccOp", smallUnsignedTypes, 4,
2873                       abaCode, True)
2874    # UABAL, UABAL2
2875    threeRegLongInstX("uabal", "UabalX", "SimdAddAccOp", smallUnsignedTypes,
2876                      abalCode, True)
2877    threeRegLongInstX("uabal2", "Uabal2X", "SimdAddAccOp", smallUnsignedTypes,
2878                      abalCode, True, hi=True)
2879    # UABD
2880    threeEqualRegInstX("uabd", "UabdDX", "SimdAddOp", smallUnsignedTypes, 2,
2881                       abdCode)
2882    threeEqualRegInstX("uabd", "UabdQX", "SimdAddOp", smallUnsignedTypes, 4,
2883                       abdCode)
2884    # UABDL, UABDL2
2885    threeRegLongInstX("uabdl", "UabdlX", "SimdAddAccOp", smallUnsignedTypes,
2886                      abdlCode, True)
2887    threeRegLongInstX("uabdl2", "Uabdl2X", "SimdAddAccOp", smallUnsignedTypes,
2888                      abdlCode, True, hi=True)
2889    # UADALP
2890    twoRegCondenseInstX("uadalp", "UadalpDX", "SimdAddOp", smallUnsignedTypes,
2891                        2, adalpCode, True)
2892    twoRegCondenseInstX("uadalp", "UadalpQX", "SimdAddOp", smallUnsignedTypes,
2893                        4, adalpCode, True)
2894    # UADDL, UADDL2
2895    threeRegLongInstX("uaddl", "UaddlX", "SimdAddAccOp", smallUnsignedTypes,
2896                      addlwCode)
2897    threeRegLongInstX("uaddl2", "Uaddl2X", "SimdAddAccOp", smallUnsignedTypes,
2898                      addlwCode, hi=True)
2899    # UADDLP
2900    twoRegCondenseInstX("uaddlp", "UaddlpDX", "SimdAddOp", smallUnsignedTypes,
2901                        2, addlwCode)
2902    twoRegCondenseInstX("uaddlp", "UaddlpQX", "SimdAddOp", smallUnsignedTypes,
2903                        4, addlwCode)
2904    # UADDLV
2905    twoRegAcrossInstX("uaddlv", "UaddlvDX", "SimdAddOp",
2906                      ("uint8_t", "uint16_t"), 2, addAcrossLongCode, long=True)
2907    twoRegAcrossInstX("uaddlv", "UaddlvQX", "SimdAddOp",
2908                      ("uint8_t", "uint16_t"), 4, addAcrossLongCode, long=True)
2909    twoRegAcrossInstX("uaddlv", "UaddlvBQX", "SimdAddOp", ("uint32_t",), 4,
2910                      addAcrossLongCode, doubleDest=True, long=True)
2911    # UADDW
2912    threeRegWideInstX("uaddw", "UaddwX", "SimdAddAccOp", smallUnsignedTypes,
2913                      addlwCode)
2914    threeRegWideInstX("uaddw2", "Uaddw2X", "SimdAddAccOp", smallUnsignedTypes,
2915                      addlwCode, hi=True)
2916    # UCVTF (fixed-point)
2917    ucvtfFixedCode = fpOp % ("fplibFixedToFP<Element>(srcElem1, imm, true,"
2918                             " FPCRRounding(fpscr), fpscr)")
2919    twoEqualRegInstX("ucvtf", "UcvtfFixedDX", "SimdCvtOp", smallFloatTypes, 2,
2920                     ucvtfFixedCode, hasImm=True)
2921    twoEqualRegInstX("ucvtf", "UcvtfFixedQX", "SimdCvtOp", floatTypes, 4,
2922                     ucvtfFixedCode, hasImm=True)
2923    twoEqualRegInstX("ucvtf", "UcvtfFixedScX", "SimdCvtOp", floatTypes, 4,
2924                     ucvtfFixedCode, hasImm=True, scalar=True)
2925    # UCVTF (integer)
2926    ucvtfIntCode = fpOp % ("fplibFixedToFP<Element>(srcElem1, 0, true,"
2927                           " FPCRRounding(fpscr), fpscr)")
2928    twoEqualRegInstX("ucvtf", "UcvtfIntDX", "SimdCvtOp", smallFloatTypes, 2,
2929                     ucvtfIntCode)
2930    twoEqualRegInstX("ucvtf", "UcvtfIntQX", "SimdCvtOp", floatTypes, 4,
2931                     ucvtfIntCode)
2932    twoEqualRegInstX("ucvtf", "UcvtfIntScX", "SimdCvtOp", floatTypes, 4,
2933                     ucvtfIntCode, scalar=True)
2934    # UHADD
2935    threeEqualRegInstX("uhadd", "UhaddDX", "SimdAddOp", smallUnsignedTypes, 2,
2936                       haddCode)
2937    threeEqualRegInstX("uhadd", "UhaddQX", "SimdAddOp", smallUnsignedTypes, 4,
2938                       haddCode)
2939    # UHSUB
2940    threeEqualRegInstX("uhsub", "UhsubDX", "SimdAddOp", smallUnsignedTypes, 2,
2941                       hsubCode)
2942    threeEqualRegInstX("uhsub", "UhsubQX", "SimdAddOp", smallUnsignedTypes, 4,
2943                       hsubCode)
2944    # UMAX
2945    threeEqualRegInstX("umax", "UmaxDX", "SimdCmpOp", smallUnsignedTypes, 2,
2946                       maxCode)
2947    threeEqualRegInstX("umax", "UmaxQX", "SimdCmpOp", smallUnsignedTypes, 4,
2948                       maxCode)
2949    # UMAXP
2950    threeEqualRegInstX("umaxp", "UmaxpDX", "SimdCmpOp", smallUnsignedTypes, 2,
2951                       maxCode, pairwise=True)
2952    threeEqualRegInstX("umaxp", "UmaxpQX", "SimdCmpOp", smallUnsignedTypes, 4,
2953                       maxCode, pairwise=True)
2954    # UMAXV
2955    twoRegAcrossInstX("umaxv", "UmaxvDX", "SimdCmpOp", ("uint8_t", "uint16_t"),
2956                      2, maxAcrossCode)
2957    twoRegAcrossInstX("umaxv", "UmaxvQX", "SimdCmpOp", smallUnsignedTypes, 4,
2958                      maxAcrossCode)
2959    # UMIN
2960    threeEqualRegInstX("umin", "UminDX", "SimdCmpOp", smallUnsignedTypes, 2,
2961                       minCode)
2962    threeEqualRegInstX("umin", "UminQX", "SimdCmpOp", smallUnsignedTypes, 4,
2963                       minCode)
2964    # UMINP
2965    threeEqualRegInstX("uminp", "UminpDX", "SimdCmpOp", smallUnsignedTypes, 2,
2966                       minCode, pairwise=True)
2967    threeEqualRegInstX("uminp", "UminpQX", "SimdCmpOp", smallUnsignedTypes, 4,
2968                       minCode, pairwise=True)
2969    # UMINV
2970    twoRegAcrossInstX("uminv", "UminvDX", "SimdCmpOp", ("uint8_t", "uint16_t"),
2971                      2, minAcrossCode)
2972    twoRegAcrossInstX("uminv", "UminvQX", "SimdCmpOp", smallUnsignedTypes, 4,
2973                      minAcrossCode)
2974    # UMLAL (by element)
2975    threeRegLongInstX("umlal", "UmlalElemX", "SimdMultAccOp",
2976                      smallUnsignedTypes, mlalCode, True, byElem=True)
2977    threeRegLongInstX("umlal", "UmlalElem2X", "SimdMultAccOp",
2978                      smallUnsignedTypes, mlalCode, True, byElem=True, hi=True)
2979    # UMLAL (vector)
2980    threeRegLongInstX("umlal", "UmlalX", "SimdMultAccOp", smallUnsignedTypes,
2981                      mlalCode, True)
2982    threeRegLongInstX("umlal", "Umlal2X", "SimdMultAccOp", smallUnsignedTypes,
2983                      mlalCode, True, hi=True)
2984    # UMLSL (by element)
2985    threeRegLongInstX("umlsl", "UmlslElemX", "SimdMultAccOp",
2986                      smallUnsignedTypes, mlslCode, True, byElem=True)
2987    threeRegLongInstX("umlsl", "UmlslElem2X", "SimdMultAccOp",
2988                      smallUnsignedTypes, mlslCode, True, byElem=True, hi=True)
2989    # UMLSL (vector)
2990    threeRegLongInstX("umlsl", "UmlslX", "SimdMultAccOp", smallUnsignedTypes,
2991                      mlslCode, True)
2992    threeRegLongInstX("umlsl", "Umlsl2X", "SimdMultAccOp", smallUnsignedTypes,
2993                      mlslCode, True, hi=True)
2994    # UMOV
2995    insToGprInstX("umov", "UmovWX", "SimdMiscOp", smallUnsignedTypes, 4, 'W')
2996    insToGprInstX("umov", "UmovXX", "SimdMiscOp", ("uint64_t",), 4, 'X')
2997    # UMULL, UMULL2 (by element)
2998    threeRegLongInstX("umull", "UmullElemX", "SimdMultOp", smallUnsignedTypes,
2999                      mullCode, byElem=True)
3000    threeRegLongInstX("umull", "UmullElem2X", "SimdMultOp", smallUnsignedTypes,
3001                      mullCode, byElem=True, hi=True)
3002    # UMULL, UMULL2 (vector)
3003    threeRegLongInstX("umull", "UmullX", "SimdMultOp", smallUnsignedTypes,
3004                      mullCode)
3005    threeRegLongInstX("umull", "Umull2X", "SimdMultOp", smallUnsignedTypes,
3006                      mullCode, hi=True)
3007    # UQADD
3008    uqaddCode = '''
3009            destElem = srcElem1 + srcElem2;
3010            FPSCR fpscr = (FPSCR) FpscrQc;
3011            if (destElem < srcElem1 || destElem < srcElem2) {
3012                destElem = (Element)(-1);
3013                fpscr.qc = 1;
3014            }
3015            FpscrQc = fpscr;
3016    '''
3017    threeEqualRegInstX("uqadd", "UqaddDX", "SimdAddOp", smallUnsignedTypes, 2,
3018                       uqaddCode)
3019    threeEqualRegInstX("uqadd", "UqaddQX", "SimdAddOp", unsignedTypes, 4,
3020                       uqaddCode)
3021    threeEqualRegInstX("uqadd", "UqaddScX", "SimdAddOp", unsignedTypes, 4,
3022                       uqaddCode, scalar=True)
3023    # UQRSHL
3024    uqrshlCode = '''
3025            int16_t shiftAmt = (int8_t)srcElem2;
3026            FPSCR fpscr = (FPSCR) FpscrQc;
3027            if (shiftAmt < 0) {
3028                shiftAmt = -shiftAmt;
3029                Element rBit = 0;
3030                if (shiftAmt <= sizeof(Element) * 8)
3031                    rBit = bits(srcElem1, shiftAmt - 1);
3032                if (shiftAmt >= sizeof(Element) * 8) {
3033                    shiftAmt = sizeof(Element) * 8 - 1;
3034                    destElem = 0;
3035                } else {
3036                    destElem = (srcElem1 >> shiftAmt);
3037                }
3038                destElem += rBit;
3039            } else {
3040                if (shiftAmt >= sizeof(Element) * 8) {
3041                    if (srcElem1 != 0) {
3042                        destElem = mask(sizeof(Element) * 8);
3043                        fpscr.qc = 1;
3044                    } else {
3045                        destElem = 0;
3046                    }
3047                } else {
3048                    if (bits(srcElem1, sizeof(Element) * 8 - 1,
3049                                sizeof(Element) * 8 - shiftAmt)) {
3050                        destElem = mask(sizeof(Element) * 8);
3051                        fpscr.qc = 1;
3052                    } else {
3053                        destElem = srcElem1 << shiftAmt;
3054                    }
3055                }
3056            }
3057            FpscrQc = fpscr;
3058    '''
3059    threeEqualRegInstX("uqrshl", "UqrshlDX", "SimdCmpOp", smallUnsignedTypes,
3060                       2, uqrshlCode)
3061    threeEqualRegInstX("uqrshl", "UqrshlQX", "SimdCmpOp", unsignedTypes, 4,
3062                       uqrshlCode)
3063    threeEqualRegInstX("uqrshl", "UqrshlScX", "SimdCmpOp", unsignedTypes, 4,
3064                       uqrshlCode, scalar=True)
3065    # UQRSHRN
3066    uqrshrnCode = '''
3067            FPSCR fpscr = (FPSCR) FpscrQc;
3068            if (imm > sizeof(srcElem1) * 8) {
3069                if (srcElem1 != 0)
3070                    fpscr.qc = 1;
3071                destElem = 0;
3072            } else if (imm) {
3073                BigElement mid = (srcElem1 >> (imm - 1));
3074                uint64_t rBit = mid & 0x1;
3075                mid >>= 1;
3076                mid += rBit;
3077                if (mid != (Element)mid) {
3078                    destElem = mask(sizeof(Element) * 8);
3079                    fpscr.qc = 1;
3080                } else {
3081                    destElem = mid;
3082                }
3083            } else {
3084                if (srcElem1 != (Element)srcElem1) {
3085                    destElem = mask(sizeof(Element) * 8 - 1);
3086                    fpscr.qc = 1;
3087                } else {
3088                    destElem = srcElem1;
3089                }
3090            }
3091            FpscrQc = fpscr;
3092    '''
3093    twoRegNarrowInstX("uqrshrn", "UqrshrnX", "SimdShiftOp", smallUnsignedTypes,
3094                      uqrshrnCode, hasImm=True)
3095    twoRegNarrowInstX("uqrshrn2", "Uqrshrn2X", "SimdShiftOp",
3096                      smallUnsignedTypes, uqrshrnCode, hasImm=True, hi=True)
3097    twoRegNarrowInstX("uqrshrn", "UqrshrnScX", "SimdShiftOp",
3098                      smallUnsignedTypes, uqrshrnCode, hasImm=True,
3099                      scalar=True)
3100    # UQSHL (immediate)
3101    uqshlImmCode = '''
3102            FPSCR fpscr = (FPSCR) FpscrQc;
3103            if (imm >= sizeof(Element) * 8) {
3104                if (srcElem1 != 0) {
3105                    destElem = mask(sizeof(Element) * 8);
3106                    fpscr.qc = 1;
3107                } else {
3108                    destElem = 0;
3109                }
3110            } else if (imm) {
3111                destElem = (srcElem1 << imm);
3112                uint64_t topBits = bits((uint64_t)srcElem1,
3113                                        sizeof(Element) * 8 - 1,
3114                                        sizeof(Element) * 8 - imm);
3115                if (topBits != 0) {
3116                    destElem = mask(sizeof(Element) * 8);
3117                    fpscr.qc = 1;
3118                }
3119            } else {
3120                destElem = srcElem1;
3121            }
3122            FpscrQc = fpscr;
3123    '''
3124    twoEqualRegInstX("uqshl", "UqshlImmDX", "SimdAluOp", smallUnsignedTypes, 2,
3125                     uqshlImmCode, hasImm=True)
3126    twoEqualRegInstX("uqshl", "UqshlImmQX", "SimdAluOp", unsignedTypes, 4,
3127                     uqshlImmCode, hasImm=True)
3128    twoEqualRegInstX("uqshl", "UqshlImmScX", "SimdAluOp", unsignedTypes, 4,
3129                     uqshlImmCode, hasImm=True, scalar=True)
3130    # UQSHL (register)
3131    uqshlCode = '''
3132            int16_t shiftAmt = (int8_t)srcElem2;
3133            FPSCR fpscr = (FPSCR) FpscrQc;
3134            if (shiftAmt < 0) {
3135                shiftAmt = -shiftAmt;
3136                if (shiftAmt >= sizeof(Element) * 8) {
3137                    shiftAmt = sizeof(Element) * 8 - 1;
3138                    destElem = 0;
3139                } else {
3140                    destElem = (srcElem1 >> shiftAmt);
3141                }
3142            } else if (shiftAmt > 0) {
3143                if (shiftAmt >= sizeof(Element) * 8) {
3144                    if (srcElem1 != 0) {
3145                        destElem = mask(sizeof(Element) * 8);
3146                        fpscr.qc = 1;
3147                    } else {
3148                        destElem = 0;
3149                    }
3150                } else {
3151                    if (bits(srcElem1, sizeof(Element) * 8 - 1,
3152                                sizeof(Element) * 8 - shiftAmt)) {
3153                        destElem = mask(sizeof(Element) * 8);
3154                        fpscr.qc = 1;
3155                    } else {
3156                        destElem = srcElem1 << shiftAmt;
3157                    }
3158                }
3159            } else {
3160                destElem = srcElem1;
3161            }
3162            FpscrQc = fpscr;
3163    '''
3164    threeEqualRegInstX("uqshl", "UqshlDX", "SimdAluOp", smallUnsignedTypes, 2,
3165                       uqshlCode)
3166    threeEqualRegInstX("uqshl", "UqshlQX", "SimdAluOp", unsignedTypes, 4,
3167                       uqshlCode)
3168    threeEqualRegInstX("uqshl", "UqshlScX", "SimdAluOp", unsignedTypes, 4,
3169                       uqshlCode, scalar=True)
3170    # UQSHRN, UQSHRN2
3171    uqshrnCode = '''
3172            FPSCR fpscr = (FPSCR) FpscrQc;
3173            if (imm > sizeof(srcElem1) * 8) {
3174                if (srcElem1 != 0)
3175                    fpscr.qc = 1;
3176                destElem = 0;
3177            } else if (imm) {
3178                BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
3179                if (mid != (Element)mid) {
3180                    destElem = mask(sizeof(Element) * 8);
3181                    fpscr.qc = 1;
3182                } else {
3183                    destElem = mid;
3184                }
3185            } else {
3186                destElem = srcElem1;
3187            }
3188            FpscrQc = fpscr;
3189    '''
3190    twoRegNarrowInstX("uqshrn", "UqshrnX", "SimdShiftOp", smallUnsignedTypes,
3191                      uqshrnCode, hasImm=True)
3192    twoRegNarrowInstX("uqshrn2", "Uqshrn2X", "SimdShiftOp", smallUnsignedTypes,
3193                      uqshrnCode, hasImm=True, hi=True)
3194    twoRegNarrowInstX("uqshrn", "UqshrnScX", "SimdShiftOp", smallUnsignedTypes,
3195                      uqshrnCode, hasImm=True, scalar=True)
3196    # UQSUB
3197    uqsubCode = '''
3198            destElem = srcElem1 - srcElem2;
3199            FPSCR fpscr = (FPSCR) FpscrQc;
3200            if (destElem > srcElem1) {
3201                destElem = 0;
3202                fpscr.qc = 1;
3203            }
3204            FpscrQc = fpscr;
3205    '''
3206    threeEqualRegInstX("uqsub", "UqsubDX", "SimdAddOp", smallUnsignedTypes, 2,
3207                       uqsubCode)
3208    threeEqualRegInstX("uqsub", "UqsubQX", "SimdAddOp", unsignedTypes, 4,
3209                       uqsubCode)
3210    threeEqualRegInstX("uqsub", "UqsubScX", "SimdAddOp", unsignedTypes, 4,
3211                       uqsubCode, scalar=True)
3212    # UQXTN
3213    uqxtnCode = '''
3214            FPSCR fpscr = (FPSCR) FpscrQc;
3215            destElem = srcElem1;
3216            if ((BigElement)destElem != srcElem1) {
3217                fpscr.qc = 1;
3218                destElem = mask(sizeof(Element) * 8);
3219            }
3220            FpscrQc = fpscr;
3221    '''
3222    twoRegNarrowInstX("uqxtn", "UqxtnX", "SimdMiscOp", smallUnsignedTypes,
3223                      uqxtnCode)
3224    twoRegNarrowInstX("uqxtn", "Uqxtn2X", "SimdMiscOp", smallUnsignedTypes,
3225                      uqxtnCode, hi=True)
3226    twoRegNarrowInstX("uqxtn", "UqxtnScX", "SimdMiscOp", smallUnsignedTypes,
3227                      uqxtnCode, scalar=True)
3228    # URECPE
3229    urecpeCode = "destElem = unsignedRecipEstimate(srcElem1);"
3230    twoEqualRegInstX("urecpe", "UrecpeDX", "SimdMultAccOp", ("uint32_t",), 2,
3231                     urecpeCode)
3232    twoEqualRegInstX("urecpe", "UrecpeQX", "SimdMultAccOp", ("uint32_t",), 4,
3233                     urecpeCode)
3234    # URHADD
3235    threeEqualRegInstX("urhadd", "UrhaddDX", "SimdAddOp", smallUnsignedTypes,
3236                       2, rhaddCode)
3237    threeEqualRegInstX("urhadd", "UrhaddQX", "SimdAddOp", smallUnsignedTypes,
3238                       4, rhaddCode)
3239    # URSHL
3240    threeEqualRegInstX("urshl", "UrshlDX", "SimdShiftOp", unsignedTypes, 2,
3241                       rshlCode)
3242    threeEqualRegInstX("urshl", "UrshlQX", "SimdShiftOp", unsignedTypes, 4,
3243                       rshlCode)
3244    # URSHR
3245    twoEqualRegInstX("urshr", "UrshrDX", "SimdShiftOp", unsignedTypes, 2,
3246                     rshrCode, hasImm=True)
3247    twoEqualRegInstX("urshr", "UrshrQX", "SimdShiftOp", unsignedTypes, 4,
3248                     rshrCode, hasImm=True)
3249    # URSQRTE
3250    ursqrteCode = "destElem = unsignedRSqrtEstimate(srcElem1);"
3251    twoEqualRegInstX("ursqrte", "UrsqrteDX", "SimdSqrtOp", ("uint32_t",), 2,
3252                     ursqrteCode)
3253    twoEqualRegInstX("ursqrte", "UrsqrteQX", "SimdSqrtOp", ("uint32_t",), 4,
3254                     ursqrteCode)
3255    # URSRA
3256    twoEqualRegInstX("ursra", "UrsraDX", "SimdShiftOp", unsignedTypes, 2,
3257                     rsraCode, True, hasImm=True)
3258    twoEqualRegInstX("ursra", "UrsraQX", "SimdShiftOp", unsignedTypes, 4,
3259                     rsraCode, True, hasImm=True)
3260    # USHL
3261    threeEqualRegInstX("ushl", "UshlDX", "SimdShiftOp", unsignedTypes, 2,
3262                       shlCode)
3263    threeEqualRegInstX("ushl", "UshlQX", "SimdShiftOp", unsignedTypes, 4,
3264                       shlCode)
3265    # USHLL, USHLL2
3266    twoRegLongInstX("ushll", "UshllX", "SimdShiftOp", smallUnsignedTypes,
3267                    shllCode, hasImm=True)
3268    twoRegLongInstX("ushll", "Ushll2X", "SimdShiftOp", smallUnsignedTypes,
3269                    shllCode, hi=True, hasImm=True)
3270    # USHR
3271    twoEqualRegInstX("ushr", "UshrDX", "SimdShiftOp", unsignedTypes, 2,
3272                     shrCode, hasImm=True)
3273    twoEqualRegInstX("ushr", "UshrQX", "SimdShiftOp", unsignedTypes, 4,
3274                     shrCode, hasImm=True)
3275    # USQADD
3276    usqaddCode = '''
3277            FPSCR fpscr = (FPSCR) FpscrQc;
3278            Element tmp = destElem + srcElem1;
3279            if (bits(srcElem1, sizeof(Element) * 8 - 1) == 0) {
3280                if (tmp < srcElem1 || tmp < destElem) {
3281                    destElem = (Element)(-1);
3282                    fpscr.qc = 1;
3283                } else {
3284                    destElem = tmp;
3285                }
3286            } else {
3287                Element absSrcElem1 = (~srcElem1) + 1;
3288                if (absSrcElem1 > destElem) {
3289                    destElem = 0;
3290                    fpscr.qc = 1;
3291                } else {
3292                    destElem = tmp;
3293                }
3294            }
3295            FpscrQc = fpscr;
3296    '''
3297    twoEqualRegInstX("usqadd", "UsqaddDX", "SimdAddOp", smallUnsignedTypes, 2,
3298                     usqaddCode, True)
3299    twoEqualRegInstX("usqadd", "UsqaddQX", "SimdAddOp", unsignedTypes, 4,
3300                     usqaddCode, True)
3301    twoEqualRegInstX("usqadd", "UsqaddScX", "SimdAddOp", unsignedTypes, 4,
3302                     usqaddCode, True, scalar=True)
3303    # USRA
3304    twoEqualRegInstX("usra", "UsraDX", "SimdShiftOp", unsignedTypes, 2,
3305                     sraCode, True, hasImm=True)
3306    twoEqualRegInstX("usra", "UsraQX", "SimdShiftOp", unsignedTypes, 4,
3307                     sraCode, True, hasImm=True)
3308    # USUBL
3309    threeRegLongInstX("usubl", "UsublX", "SimdAddOp", smallUnsignedTypes,
3310                      sublwCode)
3311    threeRegLongInstX("usubl2", "Usubl2X", "SimdAddOp", smallUnsignedTypes,
3312                      sublwCode, hi=True)
3313    # USUBW
3314    threeRegWideInstX("usubw", "UsubwX", "SimdAddOp", smallUnsignedTypes,
3315                      sublwCode)
3316    threeRegWideInstX("usubw2", "Usubw2X", "SimdAddOp", smallUnsignedTypes,
3317                      sublwCode, hi=True)
3318    # UXTL -> alias to USHLL
3319    # UZP1
3320    uzpCode = '''
3321        unsigned part = %s;
3322        for (unsigned i = 0; i < eCount / 2; i++) {
3323            destReg.elements[i] = srcReg1.elements[2 * i + part];
3324            destReg.elements[eCount / 2 + i] = srcReg2.elements[2 * i + part];
3325        }
3326    '''
3327    threeRegScrambleInstX("Uzp1", "Uzp1DX", "SimdAluOp", smallUnsignedTypes, 2,
3328                          uzpCode % "0")
3329    threeRegScrambleInstX("Uzp1", "Uzp1QX", "SimdAluOp", unsignedTypes, 4,
3330                          uzpCode % "0")
3331    # UZP2
3332    threeRegScrambleInstX("Uzp2", "Uzp2DX", "SimdAluOp", smallUnsignedTypes, 2,
3333                          uzpCode % "1")
3334    threeRegScrambleInstX("Uzp2", "Uzp2QX", "SimdAluOp", unsignedTypes, 4,
3335                          uzpCode % "1")
3336    # XTN, XTN2
3337    xtnCode = "destElem = srcElem1;"
3338    twoRegNarrowInstX("Xtn", "XtnX", "SimdMiscOp", smallUnsignedTypes, xtnCode)
3339    twoRegNarrowInstX("Xtn", "Xtn2X", "SimdMiscOp", smallUnsignedTypes,
3340                      xtnCode, hi=True)
3341    # ZIP1
3342    zipCode = '''
3343        unsigned base = %s;
3344        for (unsigned i = 0; i < eCount / 2; i++) {
3345            destReg.elements[2 * i] = srcReg1.elements[base + i];
3346            destReg.elements[2 * i + 1] = srcReg2.elements[base + i];
3347        }
3348    '''
3349    threeRegScrambleInstX("zip1", "Zip1DX", "SimdAluOp", smallUnsignedTypes, 2,
3350                          zipCode % "0")
3351    threeRegScrambleInstX("zip1", "Zip1QX", "SimdAluOp", unsignedTypes, 4,
3352                          zipCode % "0")
3353    # ZIP2
3354    threeRegScrambleInstX("zip2", "Zip2DX", "SimdAluOp", smallUnsignedTypes, 2,
3355                          zipCode % "eCount / 2")
3356    threeRegScrambleInstX("zip2", "Zip2QX", "SimdAluOp", unsignedTypes, 4,
3357                          zipCode % "eCount / 2")
3358
3359}};
3360