neon64.isa revision 13544:0b4e5446167c
1// -*- mode: c++ -*-
2
3// Copyright (c) 2012-2013, 2015-2018 ARM Limited
4// All rights reserved
5//
6// The license below extends only to copyright in the software and shall
7// not be construed as granting a license to any other intellectual
8// property including but not limited to intellectual property relating
9// to a hardware implementation of the functionality of the software
10// licensed hereunder.  You may use the software subject to the license
11// terms below provided that you ensure that this notice is replicated
12// unmodified and in its entirety in all distributions of the software,
13// modified or unmodified, in source code or in binary form.
14//
15// Redistribution and use in source and binary forms, with or without
16// modification, are permitted provided that the following conditions are
17// met: redistributions of source code must retain the above copyright
18// notice, this list of conditions and the following disclaimer;
19// redistributions in binary form must reproduce the above copyright
20// notice, this list of conditions and the following disclaimer in the
21// documentation and/or other materials provided with the distribution;
22// neither the name of the copyright holders nor the names of its
23// contributors may be used to endorse or promote products derived from
24// this software without specific prior written permission.
25//
26// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
30// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
31// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
32// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
33// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
34// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
35// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
36// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37//
38// Authors: Giacomo Gabrielli
39//          Mbou Eyole
40
41let {{
42
43    header_output = ""
44    exec_output = ""
45    decoders = { 'Generic' : {} }
46
47    # FP types (FP operations always work with unsigned representations)
48    floatTypes = ("uint16_t", "uint32_t", "uint64_t")
49    smallFloatTypes = ("uint32_t",)
50
51    def threeEqualRegInstX(name, Name, opClass, types, rCount, op,
52                           readDest=False, pairwise=False, scalar=False,
53                           byElem=False, decoder='Generic'):
54        assert (not pairwise) or ((not byElem) and (not scalar))
55        global header_output, exec_output, decoders
56        eWalkCode = simd64EnabledCheckCode + '''
57        RegVect srcReg1, destReg;
58        '''
59        if byElem:
60            # 2nd register operand has to be read fully
61            eWalkCode += '''
62        FullRegVect srcReg2;
63        '''
64        else:
65            eWalkCode += '''
66        RegVect srcReg2;
67        '''
68        for reg in range(rCount):
69            eWalkCode += '''
70        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
71        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
72        ''' % { "reg" : reg }
73            if readDest:
74                eWalkCode += '''
75        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
76        ''' % { "reg" : reg }
77        if byElem:
78            # 2nd operand has to be read fully
79            for reg in range(rCount, 4):
80                eWalkCode += '''
81        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
82        ''' % { "reg" : reg }
83        readDestCode = ''
84        if readDest:
85            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
86        if pairwise:
87            eWalkCode += '''
88        for (unsigned i = 0; i < eCount; i++) {
89            Element srcElem1 = gtoh(2 * i < eCount ?
90                                    srcReg1.elements[2 * i] :
91                                    srcReg2.elements[2 * i - eCount]);
92            Element srcElem2 = gtoh(2 * i < eCount ?
93                                    srcReg1.elements[2 * i + 1] :
94                                    srcReg2.elements[2 * i + 1 - eCount]);
95            Element destElem;
96            %(readDest)s
97            %(op)s
98            destReg.elements[i] = htog(destElem);
99        }
100        ''' % { "op" : op, "readDest" : readDestCode }
101        else:
102            scalarCheck = '''
103            if (i != 0) {
104                destReg.elements[i] = 0;
105                continue;
106            }
107            '''
108            eWalkCode += '''
109        for (unsigned i = 0; i < eCount; i++) {
110            %(scalarCheck)s
111            Element srcElem1 = gtoh(srcReg1.elements[i]);
112            Element srcElem2 = gtoh(srcReg2.elements[%(src2Index)s]);
113            Element destElem;
114            %(readDest)s
115            %(op)s
116            destReg.elements[i] = htog(destElem);
117        }
118        ''' % { "op" : op, "readDest" : readDestCode,
119                "scalarCheck" : scalarCheck if scalar else "",
120                "src2Index" : "imm" if byElem else "i" }
121        for reg in range(rCount):
122            eWalkCode += '''
123        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
124        ''' % { "reg" : reg }
125        if rCount < 4:  # zero upper half
126            for reg in range(rCount, 4):
127                eWalkCode += '''
128        AA64FpDestP%(reg)d_uw = 0;
129        ''' % { "reg" : reg }
130        iop = InstObjParams(name, Name,
131                            "DataX2RegImmOp" if byElem else "DataX2RegOp",
132                            { "code": eWalkCode,
133                              "r_count": rCount,
134                              "op_class": opClass }, [])
135        if byElem:
136            header_output += NeonX2RegImmOpDeclare.subst(iop)
137        else:
138            header_output += NeonX2RegOpDeclare.subst(iop)
139        exec_output += NeonXEqualRegOpExecute.subst(iop)
140        for type in types:
141            substDict = { "targs" : type,
142                          "class_name" : Name }
143            exec_output += NeonXExecDeclare.subst(substDict)
144
145    def threeUnequalRegInstX(name, Name, opClass, types, op,
146                             bigSrc1, bigSrc2, bigDest, readDest, scalar=False,
147                             byElem=False, hi=False):
148        assert not (scalar and hi)
149        global header_output, exec_output
150        src1Cnt = src2Cnt = destCnt = 2
151        src1Prefix = src2Prefix = destPrefix = ''
152        if bigSrc1:
153            src1Cnt = 4
154            src1Prefix = 'Big'
155        if bigSrc2:
156            src2Cnt = 4
157            src2Prefix = 'Big'
158        if bigDest:
159            destCnt = 4
160            destPrefix = 'Big'
161        if byElem:
162            src2Prefix = 'Full'
163        eWalkCode = simd64EnabledCheckCode + '''
164        %sRegVect srcReg1;
165        %sRegVect srcReg2;
166        %sRegVect destReg;
167        ''' % (src1Prefix, src2Prefix, destPrefix)
168        srcReg1 = 0
169        if hi and not bigSrc1:  # long/widening operations
170            srcReg1 = 2
171        for reg in range(src1Cnt):
172            eWalkCode += '''
173        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(srcReg1)d_uw);
174        ''' % { "reg" : reg, "srcReg1" : srcReg1 }
175            srcReg1 += 1
176        srcReg2 = 0
177        if (not byElem) and (hi and not bigSrc2):  # long/widening operations
178            srcReg2 = 2
179        for reg in range(src2Cnt):
180            eWalkCode += '''
181        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(srcReg2)d_uw);
182        ''' % { "reg" : reg, "srcReg2" : srcReg2 }
183            srcReg2 += 1
184        if byElem:
185            # 2nd operand has to be read fully
186            for reg in range(src2Cnt, 4):
187                eWalkCode += '''
188        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
189        ''' % { "reg" : reg }
190        if readDest:
191            for reg in range(destCnt):
192                eWalkCode += '''
193        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
194        ''' % { "reg" : reg }
195        readDestCode = ''
196        if readDest:
197            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
198        scalarCheck = '''
199            if (i != 0) {
200                destReg.elements[i] = 0;
201                continue;
202            }
203            '''
204        eWalkCode += '''
205        for (unsigned i = 0; i < eCount; i++) {
206            %(scalarCheck)s
207            %(src1Prefix)sElement srcElem1 = gtoh(srcReg1.elements[i]);
208            %(src1Prefix)sElement srcElem2 = gtoh(srcReg2.elements[%(src2Index)s]);
209            %(destPrefix)sElement destElem;
210            %(readDest)s
211            %(op)s
212            destReg.elements[i] = htog(destElem);
213        }
214        ''' % { "op" : op, "readDest" : readDestCode,
215                "src1Prefix" : src1Prefix, "src2Prefix" : src2Prefix,
216                "destPrefix" : destPrefix,
217                "scalarCheck" : scalarCheck if scalar else "",
218                "src2Index" : "imm" if byElem else "i" }
219        destReg = 0
220        if hi and not bigDest:
221            # narrowing operations
222            destReg = 2
223        for reg in range(destCnt):
224            eWalkCode += '''
225        AA64FpDestP%(destReg)d_uw = gtoh(destReg.regs[%(reg)d]);
226        ''' % { "reg" : reg, "destReg": destReg }
227            destReg += 1
228        if destCnt < 4:
229            if hi:  # Explicitly merge with lower half
230                for reg in range(0, destCnt):
231                    eWalkCode += '''
232        AA64FpDestP%(reg)d_uw = AA64FpDestP%(reg)d_uw;''' % { "reg" : reg }
233            else:  # zero upper half
234                for reg in range(destCnt, 4):
235                    eWalkCode += '''
236        AA64FpDestP%(reg)d_uw = 0;''' % { "reg" : reg }
237
238        iop = InstObjParams(name, Name,
239                            "DataX2RegImmOp" if byElem else "DataX2RegOp",
240                            { "code": eWalkCode,
241                              "r_count": 2,
242                              "op_class": opClass }, [])
243        if byElem:
244            header_output += NeonX2RegImmOpDeclare.subst(iop)
245        else:
246            header_output += NeonX2RegOpDeclare.subst(iop)
247        exec_output += NeonXUnequalRegOpExecute.subst(iop)
248        for type in types:
249            substDict = { "targs" : type,
250                          "class_name" : Name }
251            exec_output += NeonXExecDeclare.subst(substDict)
252
253    def threeRegNarrowInstX(name, Name, opClass, types, op, readDest=False,
254                            scalar=False, byElem=False, hi=False):
255        assert not byElem
256        threeUnequalRegInstX(name, Name, opClass, types, op,
257                             True, True, False, readDest, scalar, byElem, hi)
258
259    def threeRegLongInstX(name, Name, opClass, types, op, readDest=False,
260                          scalar=False, byElem=False, hi=False):
261        threeUnequalRegInstX(name, Name, opClass, types, op,
262                             False, False, True, readDest, scalar, byElem, hi)
263
264    def threeRegWideInstX(name, Name, opClass, types, op, readDest=False,
265                          scalar=False, byElem=False, hi=False):
266        assert not byElem
267        threeUnequalRegInstX(name, Name, opClass, types, op,
268                             True, False, True, readDest, scalar, byElem, hi)
269
270    def twoEqualRegInstX(name, Name, opClass, types, rCount, op,
271                         readDest=False, scalar=False, byElem=False,
272                         hasImm=False, isDup=False):
273        global header_output, exec_output
274        assert (not isDup) or byElem
275        if byElem:
276            hasImm = True
277        if isDup:
278            eWalkCode = simd64EnabledCheckCode + '''
279        FullRegVect srcReg1;
280        RegVect destReg;
281        '''
282        else:
283            eWalkCode = simd64EnabledCheckCode + '''
284        RegVect srcReg1, destReg;
285        '''
286        for reg in range(4 if isDup else rCount):
287            eWalkCode += '''
288        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
289        ''' % { "reg" : reg }
290            if readDest:
291                eWalkCode += '''
292        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
293        ''' % { "reg" : reg }
294        readDestCode = ''
295        if readDest:
296            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
297        scalarCheck = '''
298            if (i != 0) {
299                destReg.elements[i] = 0;
300                continue;
301            }
302            '''
303        eWalkCode += '''
304        for (unsigned i = 0; i < eCount; i++) {
305            %(scalarCheck)s
306            unsigned j = i;
307            Element srcElem1 = gtoh(srcReg1.elements[%(src1Index)s]);
308            Element destElem;
309            %(readDest)s
310            %(op)s
311            destReg.elements[j] = htog(destElem);
312        }
313        ''' % { "op" : op, "readDest" : readDestCode,
314                "scalarCheck" : scalarCheck if scalar else "",
315                "src1Index" : "imm" if byElem else "i" }
316        for reg in range(rCount):
317            eWalkCode += '''
318        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
319        ''' % { "reg" : reg }
320        if rCount < 4:  # zero upper half
321            for reg in range(rCount, 4):
322                eWalkCode += '''
323        AA64FpDestP%(reg)d_uw = 0;
324        ''' % { "reg" : reg }
325        iop = InstObjParams(name, Name,
326                            "DataX1RegImmOp" if hasImm else "DataX1RegOp",
327                            { "code": eWalkCode,
328                              "r_count": rCount,
329                              "op_class": opClass }, [])
330        if hasImm:
331            header_output += NeonX1RegImmOpDeclare.subst(iop)
332        else:
333            header_output += NeonX1RegOpDeclare.subst(iop)
334        exec_output += NeonXEqualRegOpExecute.subst(iop)
335        for type in types:
336            substDict = { "targs" : type,
337                          "class_name" : Name }
338            exec_output += NeonXExecDeclare.subst(substDict)
339
340    def twoRegLongInstX(name, Name, opClass, types, op, readDest=False,
341                        hi=False, hasImm=False):
342        global header_output, exec_output
343        eWalkCode = simd64EnabledCheckCode + '''
344        RegVect srcReg1;
345        BigRegVect destReg;
346        '''
347        destReg = 0 if not hi else 2
348        for reg in range(2):
349            eWalkCode += '''
350        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(destReg)d_uw);
351        ''' % { "reg" : reg, "destReg": destReg }
352            destReg += 1
353        destReg = 0 if not hi else 2
354        if readDest:
355            for reg in range(4):
356                eWalkCode += '''
357        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
358        ''' % { "reg" : reg }
359                destReg += 1
360        readDestCode = ''
361        if readDest:
362            readDestCode = 'destReg = gtoh(destReg.elements[i]);'
363        eWalkCode += '''
364        for (unsigned i = 0; i < eCount; i++) {
365            Element srcElem1 = gtoh(srcReg1.elements[i]);
366            BigElement destElem;
367            %(readDest)s
368            %(op)s
369            destReg.elements[i] = htog(destElem);
370        }
371        ''' % { "op" : op, "readDest" : readDestCode }
372        for reg in range(4):
373            eWalkCode += '''
374        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
375        ''' % { "reg" : reg }
376        iop = InstObjParams(name, Name,
377                            "DataX1RegImmOp" if hasImm else "DataX1RegOp",
378                            { "code": eWalkCode,
379                              "r_count": 2,
380                              "op_class": opClass }, [])
381        if hasImm:
382            header_output += NeonX1RegImmOpDeclare.subst(iop)
383        else:
384            header_output += NeonX1RegOpDeclare.subst(iop)
385        exec_output += NeonXUnequalRegOpExecute.subst(iop)
386        for type in types:
387            substDict = { "targs" : type,
388                          "class_name" : Name }
389            exec_output += NeonXExecDeclare.subst(substDict)
390
391    def twoRegNarrowInstX(name, Name, opClass, types, op, readDest=False,
392                          scalar=False, hi=False, hasImm=False):
393        global header_output, exec_output
394        eWalkCode = simd64EnabledCheckCode + '''
395        BigRegVect srcReg1;
396        RegVect destReg;
397        '''
398        for reg in range(4):
399            eWalkCode += '''
400        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
401        ''' % { "reg" : reg }
402        if readDest:
403            for reg in range(2):
404                eWalkCode += '''
405        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
406        ''' % { "reg" : reg }
407        else:
408            eWalkCode += '''
409        destReg.elements[0] = 0;
410        ''' % { "reg" : reg }
411        readDestCode = ''
412        if readDest:
413            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
414        scalarCheck = '''
415            if (i != 0) {
416                destReg.elements[i] = 0;
417                continue;
418            }
419            '''
420        eWalkCode += '''
421        for (unsigned i = 0; i < eCount; i++) {
422            %(scalarCheck)s
423            BigElement srcElem1 = gtoh(srcReg1.elements[i]);
424            Element destElem;
425            %(readDest)s
426            %(op)s
427            destReg.elements[i] = htog(destElem);
428        }
429        ''' % { "op" : op, "readDest" : readDestCode,
430                "scalarCheck" : scalarCheck if scalar else "" }
431        destReg = 0 if not hi else 2
432        for reg in range(2):
433            eWalkCode += '''
434        AA64FpDestP%(destReg)d_uw = gtoh(destReg.regs[%(reg)d]);
435        ''' % { "reg" : reg, "destReg": destReg }
436            destReg += 1
437        if hi:
438            for reg in range(0, 2):  # Explicitly merge with the lower half
439                eWalkCode += '''
440        AA64FpDestP%(reg)d_uw = AA64FpDestP%(reg)d_uw;''' % { "reg" : reg }
441        else:
442            for reg in range(2, 4):  # zero upper half
443                eWalkCode += '''
444        AA64FpDestP%(reg)d_uw = 0;
445        ''' % { "reg" : reg }
446
447        iop = InstObjParams(name, Name,
448                            "DataX1RegImmOp" if hasImm else "DataX1RegOp",
449                            { "code": eWalkCode,
450                              "r_count": 2,
451                              "op_class": opClass }, [])
452        if hasImm:
453            header_output += NeonX1RegImmOpDeclare.subst(iop)
454        else:
455            header_output += NeonX1RegOpDeclare.subst(iop)
456        exec_output += NeonXUnequalRegOpExecute.subst(iop)
457        for type in types:
458            substDict = { "targs" : type,
459                          "class_name" : Name }
460            exec_output += NeonXExecDeclare.subst(substDict)
461
462    def threeRegScrambleInstX(name, Name, opClass, types, rCount, op):
463        global header_output, exec_output
464        eWalkCode = simd64EnabledCheckCode + '''
465        RegVect srcReg1, srcReg2, destReg;
466        '''
467        for reg in range(rCount):
468            eWalkCode += '''
469        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
470        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
471        ''' % { "reg" : reg }
472        eWalkCode += op
473        for reg in range(rCount):
474            eWalkCode += '''
475        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
476        ''' % { "reg" : reg }
477        if rCount < 4:
478            for reg in range(rCount, 4):
479                eWalkCode += '''
480        AA64FpDestP%(reg)d_uw = 0;
481        ''' % { "reg" : reg }
482        iop = InstObjParams(name, Name,
483                            "DataX2RegOp",
484                            { "code": eWalkCode,
485                              "r_count": rCount,
486                              "op_class": opClass }, [])
487        header_output += NeonX2RegOpDeclare.subst(iop)
488        exec_output += NeonXEqualRegOpExecute.subst(iop)
489        for type in types:
490            substDict = { "targs" : type,
491                          "class_name" : Name }
492            exec_output += NeonXExecDeclare.subst(substDict)
493
494    def insFromVecElemInstX(name, Name, opClass, types, rCount):
495        global header_output, exec_output
496        eWalkCode = simd64EnabledCheckCode + '''
497        FullRegVect srcReg1;
498        RegVect destReg;
499        '''
500        for reg in range(4):
501            eWalkCode += '''
502        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
503        ''' % { "reg" : reg }
504        for reg in range(rCount):
505            eWalkCode += '''
506        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
507        ''' % { "reg" : reg }
508        eWalkCode += '''
509        Element srcElem1 = gtoh(srcReg1.elements[imm2]);
510        Element destElem = srcElem1;
511        destReg.elements[imm1] = htog(destElem);
512        '''
513        for reg in range(rCount):
514            eWalkCode += '''
515        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
516        ''' % { "reg" : reg }
517        iop = InstObjParams(name, Name,
518                            "DataX1Reg2ImmOp",
519                            { "code": eWalkCode,
520                              "r_count": rCount,
521                              "op_class": opClass }, [])
522        header_output += NeonX1Reg2ImmOpDeclare.subst(iop)
523        exec_output += NeonXEqualRegOpExecute.subst(iop)
524        for type in types:
525            substDict = { "targs" : type,
526                          "class_name" : Name }
527            exec_output += NeonXExecDeclare.subst(substDict)
528
529    def twoRegPairwiseScInstX(name, Name, opClass, types, rCount, op):
530        global header_output, exec_output
531        eWalkCode = simd64EnabledCheckCode + '''
532        RegVect srcReg1, destReg;
533        '''
534        for reg in range(rCount):
535            eWalkCode += '''
536        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
537        ''' % { "reg" : reg }
538        eWalkCode += '''
539        Element srcElem1 = gtoh(srcReg1.elements[0]);
540        Element srcElem2 = gtoh(srcReg1.elements[1]);
541        Element destElem;
542        %(op)s
543        destReg.elements[0] = htog(destElem);
544        ''' % { "op" : op }
545        destCnt = rCount / 2
546        for reg in range(destCnt):
547            eWalkCode += '''
548        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
549        ''' % { "reg" : reg }
550        for reg in range(destCnt, 4):  # zero upper half
551            eWalkCode += '''
552        AA64FpDestP%(reg)d_uw = 0;
553        ''' % { "reg" : reg }
554        iop = InstObjParams(name, Name,
555                            "DataX1RegOp",
556                            { "code": eWalkCode,
557                              "r_count": rCount,
558                              "op_class": opClass }, [])
559        header_output += NeonX1RegOpDeclare.subst(iop)
560        exec_output += NeonXEqualRegOpExecute.subst(iop)
561        for type in types:
562            substDict = { "targs" : type,
563                          "class_name" : Name }
564            exec_output += NeonXExecDeclare.subst(substDict)
565
566    def twoRegAcrossInstX(name, Name, opClass, types, rCount, op,
567                          doubleDest=False, long=False):
568        global header_output, exec_output
569        destPrefix = "Big" if long else ""
570        eWalkCode = simd64EnabledCheckCode + '''
571        RegVect srcReg1;
572        %sRegVect destReg;
573        ''' % destPrefix
574        for reg in range(rCount):
575            eWalkCode += '''
576        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
577        ''' % { "reg" : reg }
578        eWalkCode += '''
579        destReg.regs[0] = 0;
580        %(destPrefix)sElement destElem = 0;
581        for (unsigned i = 0; i < eCount; i++) {
582            Element srcElem1 = gtoh(srcReg1.elements[i]);
583            if (i == 0) {
584                destElem = srcElem1;
585            } else {
586                %(op)s
587            }
588        }
589        destReg.elements[0] = htog(destElem);
590        ''' % { "op" : op, "destPrefix" : destPrefix }
591        destCnt = 2 if doubleDest else 1
592        for reg in range(destCnt):
593            eWalkCode += '''
594        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
595        ''' % { "reg" : reg }
596        for reg in range(destCnt, 4):  # zero upper half
597            eWalkCode += '''
598        AA64FpDestP%(reg)d_uw = 0;
599        ''' % { "reg" : reg }
600        iop = InstObjParams(name, Name,
601                            "DataX1RegOp",
602                            { "code": eWalkCode,
603                              "r_count": rCount,
604                              "op_class": opClass }, [])
605        header_output += NeonX1RegOpDeclare.subst(iop)
606        if long:
607            exec_output += NeonXUnequalRegOpExecute.subst(iop)
608        else:
609            exec_output += NeonXEqualRegOpExecute.subst(iop)
610        for type in types:
611            substDict = { "targs" : type,
612                          "class_name" : Name }
613            exec_output += NeonXExecDeclare.subst(substDict)
614
615    def twoRegCondenseInstX(name, Name, opClass, types, rCount, op,
616                            readDest=False):
617        global header_output, exec_output
618        eWalkCode = simd64EnabledCheckCode + '''
619        RegVect srcRegs;
620        BigRegVect destReg;
621        '''
622        for reg in range(rCount):
623            eWalkCode += '''
624        srcRegs.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
625        ''' % { "reg" : reg }
626            if readDest:
627                eWalkCode += '''
628        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
629        ''' % { "reg" : reg }
630        readDestCode = ''
631        if readDest:
632            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
633        eWalkCode += '''
634        for (unsigned i = 0; i < eCount / 2; i++) {
635            Element srcElem1 = gtoh(srcRegs.elements[2 * i]);
636            Element srcElem2 = gtoh(srcRegs.elements[2 * i + 1]);
637            BigElement destElem;
638            %(readDest)s
639            %(op)s
640            destReg.elements[i] = htog(destElem);
641        }
642        ''' % { "op" : op, "readDest" : readDestCode }
643        for reg in range(rCount):
644            eWalkCode += '''
645        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
646        ''' % { "reg" : reg }
647        if rCount < 4:  # zero upper half
648            for reg in range(rCount, 4):
649                eWalkCode += '''
650        AA64FpDestP%(reg)d_uw = 0;
651        ''' % { "reg" : reg }
652        iop = InstObjParams(name, Name,
653                            "DataX1RegOp",
654                            { "code": eWalkCode,
655                              "r_count": rCount,
656                              "op_class": opClass }, [])
657        header_output += NeonX1RegOpDeclare.subst(iop)
658        exec_output += NeonXUnequalRegOpExecute.subst(iop)
659        for type in types:
660            substDict = { "targs" : type,
661                          "class_name" : Name }
662            exec_output += NeonXExecDeclare.subst(substDict)
663
664    def oneRegImmInstX(name, Name, opClass, types, rCount, op, readDest=False):
665        global header_output, exec_output
666        eWalkCode = simd64EnabledCheckCode + '''
667        RegVect destReg;
668        '''
669        if readDest:
670            for reg in range(rCount):
671                eWalkCode += '''
672        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
673        ''' % { "reg" : reg }
674        readDestCode = ''
675        if readDest:
676            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
677        eWalkCode += '''
678        for (unsigned i = 0; i < eCount; i++) {
679            Element destElem;
680            %(readDest)s
681            %(op)s
682            destReg.elements[i] = htog(destElem);
683        }
684        ''' % { "op" : op, "readDest" : readDestCode }
685        for reg in range(rCount):
686            eWalkCode += '''
687        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
688        ''' % { "reg" : reg }
689        if rCount < 4:  # zero upper half
690            for reg in range(rCount, 4):
691                eWalkCode += '''
692        AA64FpDestP%(reg)d_uw = 0;
693        ''' % { "reg" : reg }
694        iop = InstObjParams(name, Name,
695                            "DataXImmOnlyOp",
696                            { "code": eWalkCode,
697                              "r_count": rCount,
698                              "op_class": opClass }, [])
699        header_output += NeonX1RegImmOnlyOpDeclare.subst(iop)
700        exec_output += NeonXEqualRegOpExecute.subst(iop)
701        for type in types:
702            substDict = { "targs" : type,
703                          "class_name" : Name }
704            exec_output += NeonXExecDeclare.subst(substDict)
705
706    def dupGprInstX(name, Name, opClass, types, rCount, gprSpec):
707        global header_output, exec_output
708        eWalkCode = simd64EnabledCheckCode + '''
709        RegVect destReg;
710        for (unsigned i = 0; i < eCount; i++) {
711            destReg.elements[i] = htog((Element) %sOp1);
712        }
713        ''' % gprSpec
714        for reg in range(rCount):
715            eWalkCode += '''
716        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
717        ''' % { "reg" : reg }
718        if rCount < 4:  # zero upper half
719            for reg in range(rCount, 4):
720                eWalkCode += '''
721        AA64FpDestP%(reg)d_uw = 0;
722        ''' % { "reg" : reg }
723        iop = InstObjParams(name, Name,
724                            "DataX1RegOp",
725                            { "code": eWalkCode,
726                              "r_count": rCount,
727                              "op_class": opClass }, [])
728        header_output += NeonX1RegOpDeclare.subst(iop)
729        exec_output += NeonXEqualRegOpExecute.subst(iop)
730        for type in types:
731            substDict = { "targs" : type,
732                          "class_name" : Name }
733            exec_output += NeonXExecDeclare.subst(substDict)
734
735    def extInstX(name, Name, opClass, types, rCount, op):
736        global header_output, exec_output
737        eWalkCode = simd64EnabledCheckCode + '''
738        RegVect srcReg1, srcReg2, destReg;
739        '''
740        for reg in range(rCount):
741            eWalkCode += '''
742        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
743        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
744        ''' % { "reg" : reg }
745        eWalkCode += op
746        for reg in range(rCount):
747            eWalkCode += '''
748        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
749        ''' % { "reg" : reg }
750        if rCount < 4:  # zero upper half
751            for reg in range(rCount, 4):
752                eWalkCode += '''
753        AA64FpDestP%(reg)d_uw = 0;
754        ''' % { "reg" : reg }
755        iop = InstObjParams(name, Name,
756                            "DataX2RegImmOp",
757                            { "code": eWalkCode,
758                              "r_count": rCount,
759                              "op_class": opClass }, [])
760        header_output += NeonX2RegImmOpDeclare.subst(iop)
761        exec_output += NeonXEqualRegOpExecute.subst(iop)
762        for type in types:
763            substDict = { "targs" : type,
764                          "class_name" : Name }
765            exec_output += NeonXExecDeclare.subst(substDict)
766
767    def insFromGprInstX(name, Name, opClass, types, rCount, gprSpec):
768        global header_output, exec_output
769        eWalkCode = simd64EnabledCheckCode + '''
770        RegVect destReg;
771        '''
772        for reg in range(rCount):
773            eWalkCode += '''
774        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
775        ''' % { "reg" : reg }
776        eWalkCode += '''
777        destReg.elements[imm] = htog((Element) %sOp1);
778        ''' % gprSpec
779        for reg in range(rCount):
780            eWalkCode += '''
781        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
782        ''' % { "reg" : reg }
783        iop = InstObjParams(name, Name,
784                            "DataX1RegImmOp",
785                            { "code": eWalkCode,
786                              "r_count": rCount,
787                              "op_class": opClass }, [])
788        header_output += NeonX1RegImmOpDeclare.subst(iop)
789        exec_output += NeonXEqualRegOpExecute.subst(iop)
790        for type in types:
791            substDict = { "targs" : type,
792                          "class_name" : Name }
793            exec_output += NeonXExecDeclare.subst(substDict)
794
795    def insToGprInstX(name, Name, opClass, types, rCount, gprSpec,
796                      signExt=False):
797        global header_output, exec_output
798        eWalkCode = simd64EnabledCheckCode + '''
799        FullRegVect srcReg;
800        '''
801        for reg in range(4):
802            eWalkCode += '''
803        srcReg.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
804        ''' % { "reg" : reg }
805        if signExt:
806            eWalkCode += '''
807        %sDest = sext<sizeof(Element) * 8>(srcReg.elements[imm]);
808        ''' % gprSpec
809        else:
810            eWalkCode += '''
811        %sDest = srcReg.elements[imm];
812        ''' % gprSpec
813        iop = InstObjParams(name, Name,
814                            "DataX1RegImmOp",
815                            { "code": eWalkCode,
816                              "r_count": rCount,
817                              "op_class": opClass }, [])
818        header_output += NeonX1RegImmOpDeclare.subst(iop)
819        exec_output += NeonXEqualRegOpExecute.subst(iop)
820        for type in types:
821            substDict = { "targs" : type,
822                          "class_name" : Name }
823            exec_output += NeonXExecDeclare.subst(substDict)
824
825    def tbxTblInstX(name, Name, opClass, types, length, isTbl, rCount):
826        global header_output, decoder_output, exec_output
827        code = simd64EnabledCheckCode + '''
828        union
829        {
830            uint8_t bytes[64];
831            uint32_t regs[16];
832        } table;
833
834        union
835        {
836            uint8_t bytes[%(rCount)d * 4];
837            uint32_t regs[%(rCount)d];
838        } destReg, srcReg2;
839
840        const unsigned length = %(length)d;
841        const bool isTbl = %(isTbl)s;
842        ''' % { "rCount" : rCount, "length" : length, "isTbl" : isTbl }
843        for reg in range(rCount):
844            code += '''
845        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
846        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
847        ''' % { "reg" : reg }
848        for reg in range(16):
849            if reg < length * 4:
850                code += '''
851        table.regs[%(reg)d] = htog(AA64FpOp1P%(p)dV%(v)dS_uw);
852        ''' % { "reg" : reg, "p" : reg % 4, "v" : reg / 4 }
853            else:
854                code += '''
855        table.regs[%(reg)d] = 0;
856        ''' % { "reg" : reg }
857        code += '''
858        for (unsigned i = 0; i < sizeof(destReg); i++) {
859            uint8_t index = srcReg2.bytes[i];
860            if (index < 16 * length) {
861                destReg.bytes[i] = table.bytes[index];
862            } else {
863                if (isTbl)
864                    destReg.bytes[i] = 0;
865                // else destReg.bytes[i] unchanged
866            }
867        }
868        '''
869        for reg in range(rCount):
870            code += '''
871        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
872        ''' % { "reg" : reg }
873        if rCount < 4:  # zero upper half
874            for reg in range(rCount, 4):
875                code += '''
876        AA64FpDestP%(reg)d_uw = 0;
877        ''' % { "reg" : reg }
878        iop = InstObjParams(name, Name,
879                            "DataX2RegOp",
880                            { "code": code,
881                              "r_count": rCount,
882                              "op_class": opClass }, [])
883        header_output += NeonX2RegOpDeclare.subst(iop)
884        exec_output += NeonXEqualRegOpExecute.subst(iop)
885        for type in types:
886            substDict = { "targs" : type,
887                          "class_name" : Name }
888            exec_output += NeonXExecDeclare.subst(substDict)
889
890    # ABS
891    absCode = '''
892            if (srcElem1 < 0) {
893                destElem = -srcElem1;
894            } else {
895                destElem = srcElem1;
896            }
897    '''
898    twoEqualRegInstX("abs", "AbsDX", "SimdAluOp", signedTypes, 2, absCode)
899    twoEqualRegInstX("abs", "AbsQX", "SimdAluOp", signedTypes, 4, absCode)
900    # ADD
901    addCode = "destElem = srcElem1 + srcElem2;"
902    threeEqualRegInstX("add", "AddDX", "SimdAddOp", unsignedTypes, 2, addCode)
903    threeEqualRegInstX("add", "AddQX", "SimdAddOp", unsignedTypes, 4, addCode)
904    # ADDHN, ADDHN2
905    addhnCode = '''
906            destElem = ((BigElement)srcElem1 + (BigElement)srcElem2) >>
907                        (sizeof(Element) * 8);
908    '''
909    threeRegNarrowInstX("addhn", "AddhnX", "SimdAddOp", smallUnsignedTypes,
910                        addhnCode)
911    threeRegNarrowInstX("addhn2", "Addhn2X", "SimdAddOp", smallUnsignedTypes,
912                        addhnCode, hi=True)
913    # ADDP (scalar)
914    twoRegPairwiseScInstX("addp", "AddpScQX", "SimdAddOp", ("uint64_t",), 4,
915                          addCode)
916    # ADDP (vector)
917    threeEqualRegInstX("addp", "AddpDX", "SimdAddOp", smallUnsignedTypes, 2,
918                       addCode, pairwise=True)
919    threeEqualRegInstX("addp", "AddpQX", "SimdAddOp", unsignedTypes, 4,
920                       addCode, pairwise=True)
921    # ADDV
922    # Note: SimdAddOp can be a bit optimistic here
923    addAcrossCode = "destElem += srcElem1;"
924    twoRegAcrossInstX("addv", "AddvDX", "SimdAddOp", ("uint8_t", "uint16_t"),
925                      2, addAcrossCode)
926    twoRegAcrossInstX("addv", "AddvQX", "SimdAddOp", smallUnsignedTypes, 4,
927                      addAcrossCode)
928    # AND
929    andCode = "destElem = srcElem1 & srcElem2;"
930    threeEqualRegInstX("and", "AndDX", "SimdAluOp", ("uint64_t",), 2, andCode)
931    threeEqualRegInstX("and", "AndQX", "SimdAluOp", ("uint64_t",), 4, andCode)
932    # BIC (immediate)
933    bicImmCode = "destElem &= ~imm;"
934    oneRegImmInstX("bic", "BicImmDX", "SimdAluOp", ("uint64_t",), 2,
935                   bicImmCode, True)
936    oneRegImmInstX("bic", "BicImmQX", "SimdAluOp", ("uint64_t",), 4,
937                   bicImmCode, True)
938    # BIC (register)
939    bicCode = "destElem = srcElem1 & ~srcElem2;"
940    threeEqualRegInstX("bic", "BicDX", "SimdAluOp", ("uint64_t",), 2, bicCode)
941    threeEqualRegInstX("bic", "BicQX", "SimdAluOp", ("uint64_t",), 4, bicCode)
942    # BIF
943    bifCode = "destElem = (destElem & srcElem2) | (srcElem1 & ~srcElem2);"
944    threeEqualRegInstX("bif", "BifDX", "SimdAluOp", ("uint64_t",), 2, bifCode,
945                       True)
946    threeEqualRegInstX("bif", "BifQX", "SimdAluOp", ("uint64_t",), 4, bifCode,
947                       True)
948    # BIT
949    bitCode = "destElem = (srcElem1 & srcElem2) | (destElem & ~srcElem2);"
950    threeEqualRegInstX("bit", "BitDX", "SimdAluOp", ("uint64_t",), 2, bitCode,
951                       True)
952    threeEqualRegInstX("bit", "BitQX", "SimdAluOp", ("uint64_t",), 4, bitCode,
953                       True)
954    # BSL
955    bslCode = "destElem = (srcElem1 & destElem) | (srcElem2 & ~destElem);"
956    threeEqualRegInstX("bsl", "BslDX", "SimdAluOp", ("uint64_t",), 2, bslCode,
957                       True)
958    threeEqualRegInstX("bsl", "BslQX", "SimdAluOp", ("uint64_t",), 4, bslCode,
959                       True)
960    # CLS
961    clsCode = '''
962            unsigned count = 0;
963            if (srcElem1 < 0) {
964                srcElem1 <<= 1;
965                while (srcElem1 < 0 && count < sizeof(Element) * 8 - 1) {
966                    count++;
967                    srcElem1 <<= 1;
968                }
969            } else {
970                srcElem1 <<= 1;
971                while (srcElem1 >= 0 && count < sizeof(Element) * 8 - 1) {
972                    count++;
973                    srcElem1 <<= 1;
974                }
975            }
976            destElem = count;
977    '''
978    twoEqualRegInstX("cls", "ClsDX", "SimdAluOp", smallSignedTypes, 2, clsCode)
979    twoEqualRegInstX("cls", "ClsQX", "SimdAluOp", smallSignedTypes, 4, clsCode)
980    # CLZ
981    clzCode = '''
982            unsigned count = 0;
983            while (srcElem1 >= 0 && count < sizeof(Element) * 8) {
984                count++;
985                srcElem1 <<= 1;
986            }
987            destElem = count;
988    '''
989    twoEqualRegInstX("clz", "ClzDX", "SimdAluOp", smallSignedTypes, 2, clzCode)
990    twoEqualRegInstX("clz", "ClzQX", "SimdAluOp", smallSignedTypes, 4, clzCode)
991    # CMEQ (register)
992    cmeqCode = "destElem = (srcElem1 == srcElem2) ? (Element)(-1) : 0;"
993    threeEqualRegInstX("cmeq", "CmeqDX", "SimdCmpOp", unsignedTypes, 2,
994                       cmeqCode)
995    threeEqualRegInstX("cmeq", "CmeqQX", "SimdCmpOp", unsignedTypes, 4,
996                       cmeqCode)
997    # CMEQ (zero)
998    cmeqZeroCode = "destElem = (srcElem1 == 0) ? (Element)(-1) : 0;"
999    twoEqualRegInstX("cmeq", "CmeqZeroDX", "SimdCmpOp", signedTypes, 2,
1000                     cmeqZeroCode)
1001    twoEqualRegInstX("cmeq", "CmeqZeroQX", "SimdCmpOp", signedTypes, 4,
1002                     cmeqZeroCode)
1003    # CMGE (register)
1004    cmgeCode = "destElem = (srcElem1 >= srcElem2) ? (Element)(-1) : 0;"
1005    threeEqualRegInstX("cmge", "CmgeDX", "SimdCmpOp", signedTypes, 2, cmgeCode)
1006    threeEqualRegInstX("cmge", "CmgeQX", "SimdCmpOp", signedTypes, 4, cmgeCode)
1007    # CMGE (zero)
1008    cmgeZeroCode = "destElem = (srcElem1 >= 0) ? (Element)(-1) : 0;"
1009    twoEqualRegInstX("cmge", "CmgeZeroDX", "SimdCmpOp", signedTypes, 2,
1010                     cmgeZeroCode)
1011    twoEqualRegInstX("cmge", "CmgeZeroQX", "SimdCmpOp", signedTypes, 4,
1012                     cmgeZeroCode)
1013    # CMGT (register)
1014    cmgtCode = "destElem = (srcElem1 > srcElem2) ? (Element)(-1) : 0;"
1015    threeEqualRegInstX("cmgt", "CmgtDX", "SimdCmpOp", signedTypes, 2, cmgtCode)
1016    threeEqualRegInstX("cmgt", "CmgtQX", "SimdCmpOp", signedTypes, 4, cmgtCode)
1017    # CMGT (zero)
1018    cmgtZeroCode = "destElem = (srcElem1 > 0) ? (Element)(-1) : 0;"
1019    twoEqualRegInstX("cmgt", "CmgtZeroDX", "SimdCmpOp", signedTypes, 2,
1020                     cmgtZeroCode)
1021    twoEqualRegInstX("cmgt", "CmgtZeroQX", "SimdCmpOp", signedTypes, 4,
1022                     cmgtZeroCode)
1023    # CMHI (register)
1024    threeEqualRegInstX("cmhi", "CmhiDX", "SimdCmpOp", unsignedTypes, 2,
1025                       cmgtCode)
1026    threeEqualRegInstX("cmhi", "CmhiQX", "SimdCmpOp", unsignedTypes, 4,
1027                       cmgtCode)
1028    # CMHS (register)
1029    threeEqualRegInstX("cmhs", "CmhsDX", "SimdCmpOp", unsignedTypes, 2,
1030                       cmgeCode)
1031    threeEqualRegInstX("cmhs", "CmhsQX", "SimdCmpOp", unsignedTypes, 4,
1032                       cmgeCode)
1033    # CMLE (zero)
1034    cmleZeroCode = "destElem = (srcElem1 <= 0) ? (Element)(-1) : 0;"
1035    twoEqualRegInstX("cmle", "CmleZeroDX", "SimdCmpOp", signedTypes, 2,
1036                     cmleZeroCode)
1037    twoEqualRegInstX("cmle", "CmleZeroQX", "SimdCmpOp", signedTypes, 4,
1038                     cmleZeroCode)
1039    # CMLT (zero)
1040    cmltZeroCode = "destElem = (srcElem1 < 0) ? (Element)(-1) : 0;"
1041    twoEqualRegInstX("cmlt", "CmltZeroDX", "SimdCmpOp", signedTypes, 2,
1042                     cmltZeroCode)
1043    twoEqualRegInstX("cmlt", "CmltZeroQX", "SimdCmpOp", signedTypes, 4,
1044                     cmltZeroCode)
1045    # CMTST (register)
1046    tstCode = "destElem = (srcElem1 & srcElem2) ? (Element)(-1) : 0;"
1047    threeEqualRegInstX("cmtst", "CmtstDX", "SimdAluOp", unsignedTypes, 2,
1048                       tstCode)
1049    threeEqualRegInstX("cmtst", "CmtstQX", "SimdAluOp", unsignedTypes, 4,
1050                       tstCode)
1051    # CNT
1052    cntCode = '''
1053            unsigned count = 0;
1054            while (srcElem1 && count < sizeof(Element) * 8) {
1055                count += srcElem1 & 0x1;
1056                srcElem1 >>= 1;
1057            }
1058            destElem = count;
1059    '''
1060    twoEqualRegInstX("cnt", "CntDX", "SimdAluOp", ("uint8_t",), 2, cntCode)
1061    twoEqualRegInstX("cnt", "CntQX", "SimdAluOp", ("uint8_t",), 4, cntCode)
1062    # DUP (element)
1063    dupCode = "destElem = srcElem1;"
1064    twoEqualRegInstX("dup", "DupElemDX", "SimdMiscOp", smallUnsignedTypes, 2,
1065                     dupCode, isDup=True, byElem=True)
1066    twoEqualRegInstX("dup", "DupElemQX", "SimdMiscOp", unsignedTypes, 4,
1067                     dupCode, isDup=True, byElem=True)
1068    twoEqualRegInstX("dup", "DupElemScX", "SimdMiscOp", unsignedTypes, 4,
1069                     dupCode, isDup=True, byElem=True, scalar=True)
1070    # DUP (general register)
1071    dupGprInstX("dup", "DupGprWDX", "SimdMiscOp", smallUnsignedTypes, 2, 'W')
1072    dupGprInstX("dup", "DupGprWQX", "SimdMiscOp", smallUnsignedTypes, 4, 'W')
1073    dupGprInstX("dup", "DupGprXQX", "SimdMiscOp", ("uint64_t",), 4, 'X')
1074    # EOR
1075    eorCode = "destElem = srcElem1 ^ srcElem2;"
1076    threeEqualRegInstX("eor", "EorDX", "SimdAluOp", ("uint64_t",), 2, eorCode)
1077    threeEqualRegInstX("eor", "EorQX", "SimdAluOp", ("uint64_t",), 4, eorCode)
1078    # EXT
1079    extCode = '''
1080            for (unsigned i = 0; i < eCount; i++) {
1081                unsigned index = i + imm;
1082                if (index < eCount) {
1083                    destReg.elements[i] = srcReg1.elements[index];
1084                } else {
1085                    index -= eCount;
1086                    if (index >= eCount) {
1087                        fault = std::make_shared<UndefinedInstruction>(
1088                                      machInst, false, mnemonic);
1089                    } else {
1090                        destReg.elements[i] = srcReg2.elements[index];
1091                    }
1092                }
1093            }
1094    '''
1095    extInstX("Ext", "ExtDX", "SimdMiscOp", ("uint8_t",), 2, extCode)
1096    extInstX("Ext", "ExtQX", "SimdMiscOp", ("uint8_t",), 4, extCode)
1097    # FABD
1098    fpOp = '''
1099            FPSCR fpscr = (FPSCR) FpscrExc;
1100            destElem = %s;
1101            FpscrExc = fpscr;
1102    '''
1103    fabdCode = fpOp % "fplibAbs<Element>(fplibSub(srcElem1, srcElem2, fpscr))"
1104    threeEqualRegInstX("fabd", "FabdDX", "SimdFloatAddOp", smallFloatTypes, 2,
1105                       fabdCode)
1106    threeEqualRegInstX("fabd", "FabdQX", "SimdFloatAddOp", floatTypes, 4,
1107                       fabdCode)
1108    threeEqualRegInstX("fabd", "FabdScX", "SimdFloatAddOp", floatTypes, 4,
1109                       fabdCode, scalar=True)
1110    # FABS
1111    fabsCode = fpOp % "fplibAbs<Element>(srcElem1)"
1112    twoEqualRegInstX("Abs", "FabsDX", "SimdFloatAluOp", smallFloatTypes, 2,
1113                     fabsCode)
1114    twoEqualRegInstX("Abs", "FabsQX", "SimdFloatAluOp", floatTypes, 4,
1115                     fabsCode)
1116    # FACGE
1117    fpCmpAbsOp = fpOp % ("fplibCompare%s<Element>(fplibAbs<Element>(srcElem1),"
1118                         " fplibAbs<Element>(srcElem2), fpscr) ? -1 : 0")
1119    facgeCode = fpCmpAbsOp % "GE"
1120    threeEqualRegInstX("facge", "FacgeDX", "SimdFloatCmpOp", smallFloatTypes,
1121                       2, facgeCode)
1122    threeEqualRegInstX("facge", "FacgeQX", "SimdFloatCmpOp", floatTypes, 4,
1123                       facgeCode)
1124    threeEqualRegInstX("facge", "FacgeScX", "SimdFloatCmpOp", floatTypes, 4,
1125                       facgeCode, scalar=True)
1126    # FACGT
1127    facgtCode = fpCmpAbsOp % "GT"
1128    threeEqualRegInstX("facgt", "FacgtDX", "SimdFloatCmpOp", smallFloatTypes,
1129                       2, facgtCode)
1130    threeEqualRegInstX("facgt", "FacgtQX", "SimdFloatCmpOp", floatTypes, 4,
1131                       facgtCode)
1132    threeEqualRegInstX("facgt", "FacgtScX", "SimdFloatCmpOp", floatTypes, 4,
1133                       facgtCode, scalar=True)
1134    # FADD
1135    fpBinOp = fpOp % "fplib%s<Element>(srcElem1, srcElem2, fpscr)"
1136    faddCode = fpBinOp % "Add"
1137    threeEqualRegInstX("fadd", "FaddDX", "SimdFloatAddOp", smallFloatTypes, 2,
1138                       faddCode)
1139    threeEqualRegInstX("fadd", "FaddQX", "SimdFloatAddOp", floatTypes, 4,
1140                       faddCode)
1141    # FADDP (scalar)
1142    twoRegPairwiseScInstX("faddp", "FaddpScDX", "SimdFloatAddOp",
1143                          ("uint32_t",), 2, faddCode)
1144    twoRegPairwiseScInstX("faddp", "FaddpScQX", "SimdFloatAddOp",
1145                          ("uint64_t",), 4, faddCode)
1146    # FADDP (vector)
1147    threeEqualRegInstX("faddp", "FaddpDX", "SimdFloatAddOp", smallFloatTypes,
1148                       2, faddCode, pairwise=True)
1149    threeEqualRegInstX("faddp", "FaddpQX", "SimdFloatAddOp", floatTypes, 4,
1150                       faddCode, pairwise=True)
1151    # FCMEQ (register)
1152    fpCmpOp = fpOp % ("fplibCompare%s<Element>(srcElem1, srcElem2, fpscr) ?"
1153                      " -1 : 0")
1154    fcmeqCode = fpCmpOp % "EQ"
1155    threeEqualRegInstX("fcmeq", "FcmeqDX", "SimdFloatCmpOp", smallFloatTypes,
1156                       2, fcmeqCode)
1157    threeEqualRegInstX("fcmeq", "FcmeqQX", "SimdFloatCmpOp", floatTypes, 4,
1158                       fcmeqCode)
1159    threeEqualRegInstX("fcmeq", "FcmeqScX", "SimdFloatCmpOp", floatTypes, 4,
1160                       fcmeqCode, scalar=True)
1161    # FCMEQ (zero)
1162    fpCmpZeroOp = fpOp % "fplibCompare%s<Element>(srcElem1, 0, fpscr) ? -1 : 0"
1163    fcmeqZeroCode = fpCmpZeroOp % "EQ"
1164    twoEqualRegInstX("fcmeq", "FcmeqZeroDX", "SimdFloatCmpOp", smallFloatTypes,
1165                     2, fcmeqZeroCode)
1166    twoEqualRegInstX("fcmeq", "FcmeqZeroQX", "SimdFloatCmpOp", floatTypes, 4,
1167                     fcmeqZeroCode)
1168    twoEqualRegInstX("fcmeq", "FcmeqZeroScX", "SimdFloatCmpOp", floatTypes, 4,
1169                     fcmeqZeroCode, scalar=True)
1170    # FCMGE (register)
1171    fcmgeCode = fpCmpOp % "GE"
1172    threeEqualRegInstX("fcmge", "FcmgeDX", "SimdFloatCmpOp", smallFloatTypes,
1173                       2, fcmgeCode)
1174    threeEqualRegInstX("fcmge", "FcmgeQX", "SimdFloatCmpOp", floatTypes, 4,
1175                       fcmgeCode)
1176    threeEqualRegInstX("fcmge", "FcmgeScX", "SimdFloatCmpOp", floatTypes, 4,
1177                       fcmgeCode, scalar=True)
1178    # FCMGE (zero)
1179    fcmgeZeroCode = fpCmpZeroOp % "GE"
1180    twoEqualRegInstX("fcmge", "FcmgeZeroDX", "SimdFloatCmpOp", smallFloatTypes,
1181                     2, fcmgeZeroCode)
1182    twoEqualRegInstX("fcmge", "FcmgeZeroQX", "SimdFloatCmpOp", floatTypes, 4,
1183                     fcmgeZeroCode)
1184    twoEqualRegInstX("fcmge", "FcmgeZeroScX", "SimdFloatCmpOp", floatTypes, 4,
1185                     fcmgeZeroCode, scalar=True)
1186    # FCMGT (register)
1187    fcmgtCode = fpCmpOp % "GT"
1188    threeEqualRegInstX("fcmgt", "FcmgtDX", "SimdFloatCmpOp", smallFloatTypes,
1189                       2, fcmgtCode)
1190    threeEqualRegInstX("fcmgt", "FcmgtQX", "SimdFloatCmpOp", floatTypes, 4,
1191                       fcmgtCode)
1192    threeEqualRegInstX("fcmgt", "FcmgtScX", "SimdFloatCmpOp", floatTypes, 4,
1193                       fcmgtCode, scalar=True)
1194    # FCMGT (zero)
1195    fcmgtZeroCode = fpCmpZeroOp % "GT"
1196    twoEqualRegInstX("fcmgt", "FcmgtZeroDX", "SimdFloatCmpOp", smallFloatTypes,
1197                     2, fcmgtZeroCode)
1198    twoEqualRegInstX("fcmgt", "FcmgtZeroQX", "SimdFloatCmpOp", floatTypes, 4,
1199                     fcmgtZeroCode)
1200    twoEqualRegInstX("fcmgt", "FcmgtZeroScX", "SimdFloatCmpOp", floatTypes, 4,
1201                     fcmgtZeroCode, scalar=True)
1202    # FCMLE (zero)
1203    fpCmpRevZeroOp = fpOp % ("fplibCompare%s<Element>(0, srcElem1, fpscr) ?"
1204                             " -1 : 0")
1205    fcmleZeroCode = fpCmpRevZeroOp % "GE"
1206    twoEqualRegInstX("fcmle", "FcmleZeroDX", "SimdFloatCmpOp", smallFloatTypes,
1207                     2, fcmleZeroCode)
1208    twoEqualRegInstX("fcmle", "FcmleZeroQX", "SimdFloatCmpOp", floatTypes, 4,
1209                     fcmleZeroCode)
1210    twoEqualRegInstX("fcmle", "FcmleZeroScX", "SimdFloatCmpOp", floatTypes, 4,
1211                     fcmleZeroCode, scalar=True)
1212    # FCMLT (zero)
1213    fcmltZeroCode = fpCmpRevZeroOp % "GT"
1214    twoEqualRegInstX("fcmlt", "FcmltZeroDX", "SimdFloatCmpOp", smallFloatTypes,
1215                     2, fcmltZeroCode)
1216    twoEqualRegInstX("fcmlt", "FcmltZeroQX", "SimdFloatCmpOp", floatTypes, 4,
1217                     fcmltZeroCode)
1218    twoEqualRegInstX("fcmlt", "FcmltZeroScX", "SimdFloatCmpOp", floatTypes, 4,
1219                     fcmltZeroCode, scalar=True)
1220    # FCVTAS
1221    fcvtCode = fpOp % ("fplibFPToFixed<Element, Element>("
1222                       "srcElem1, %s, %s, %s, fpscr)")
1223    fcvtasCode = fcvtCode % ("0", "false", "FPRounding_TIEAWAY")
1224    twoEqualRegInstX("fcvtas", "FcvtasDX", "SimdCvtOp", smallFloatTypes, 2,
1225                     fcvtasCode)
1226    twoEqualRegInstX("fcvtas", "FcvtasQX", "SimdCvtOp", floatTypes, 4,
1227                     fcvtasCode)
1228    twoEqualRegInstX("fcvtas", "FcvtasScX", "SimdCvtOp", floatTypes, 4,
1229                     fcvtasCode, scalar=True)
1230    # FCVTAU
1231    fcvtauCode = fcvtCode % ("0", "true", "FPRounding_TIEAWAY")
1232    twoEqualRegInstX("fcvtau", "FcvtauDX", "SimdCvtOp", smallFloatTypes, 2,
1233                     fcvtauCode)
1234    twoEqualRegInstX("fcvtau", "FcvtauQX", "SimdCvtOp", floatTypes, 4,
1235                     fcvtauCode)
1236    twoEqualRegInstX("fcvtau", "FcvtauScX", "SimdCvtOp", floatTypes, 4,
1237                     fcvtauCode, scalar=True)
1238    # FCVTL, FCVTL2
1239    fcvtlCode = fpOp % ("fplibConvert<Element, BigElement>("
1240                        "srcElem1, FPCRRounding(fpscr), fpscr)")
1241    twoRegLongInstX("fcvtl", "FcvtlX", "SimdCvtOp", ("uint16_t", "uint32_t"),
1242                    fcvtlCode)
1243    twoRegLongInstX("fcvtl", "Fcvtl2X", "SimdCvtOp", ("uint16_t", "uint32_t"),
1244                    fcvtlCode, hi=True)
1245    # FCVTMS
1246    fcvtmsCode = fcvtCode % ("0", "false", "FPRounding_NEGINF")
1247    twoEqualRegInstX("fcvtms", "FcvtmsDX", "SimdCvtOp", smallFloatTypes, 2,
1248                     fcvtmsCode)
1249    twoEqualRegInstX("fcvtms", "FcvtmsQX", "SimdCvtOp", floatTypes, 4,
1250                     fcvtmsCode)
1251    twoEqualRegInstX("fcvtms", "FcvtmsScX", "SimdCvtOp", floatTypes, 4,
1252                     fcvtmsCode, scalar=True)
1253    # FCVTMU
1254    fcvtmuCode = fcvtCode % ("0", "true", "FPRounding_NEGINF")
1255    twoEqualRegInstX("fcvtmu", "FcvtmuDX", "SimdCvtOp", smallFloatTypes, 2,
1256                     fcvtmuCode)
1257    twoEqualRegInstX("fcvtmu", "FcvtmuQX", "SimdCvtOp", floatTypes, 4,
1258                     fcvtmuCode)
1259    twoEqualRegInstX("fcvtmu", "FcvtmuScX", "SimdCvtOp", floatTypes, 4,
1260                     fcvtmuCode, scalar=True)
1261    # FCVTN, FCVTN2
1262    fcvtnCode = fpOp % ("fplibConvert<BigElement, Element>("
1263                        "srcElem1, FPCRRounding(fpscr), fpscr)")
1264    twoRegNarrowInstX("fcvtn", "FcvtnX", "SimdCvtOp",
1265                      ("uint16_t", "uint32_t"), fcvtnCode)
1266    twoRegNarrowInstX("fcvtn", "Fcvtn2X", "SimdCvtOp",
1267                      ("uint16_t", "uint32_t"), fcvtnCode, hi=True)
1268    # FCVTNS
1269    fcvtnsCode = fcvtCode % ("0", "false", "FPRounding_TIEEVEN")
1270    twoEqualRegInstX("fcvtns", "FcvtnsDX", "SimdCvtOp", smallFloatTypes, 2,
1271                     fcvtnsCode)
1272    twoEqualRegInstX("fcvtns", "FcvtnsQX", "SimdCvtOp", floatTypes, 4,
1273                     fcvtnsCode)
1274    twoEqualRegInstX("fcvtns", "FcvtnsScX", "SimdCvtOp", floatTypes, 4,
1275                     fcvtnsCode, scalar=True)
1276    # FCVTNU
1277    fcvtnuCode = fcvtCode % ("0", "true", "FPRounding_TIEEVEN")
1278    twoEqualRegInstX("fcvtnu", "FcvtnuDX", "SimdCvtOp", smallFloatTypes, 2,
1279                     fcvtnuCode)
1280    twoEqualRegInstX("fcvtnu", "FcvtnuQX", "SimdCvtOp", floatTypes, 4,
1281                     fcvtnuCode)
1282    twoEqualRegInstX("fcvtnu", "FcvtnuScX", "SimdCvtOp", floatTypes, 4,
1283                     fcvtnuCode, scalar=True)
1284    # FCVTPS
1285    fcvtpsCode = fcvtCode % ("0", "false", "FPRounding_POSINF")
1286    twoEqualRegInstX("fcvtps", "FcvtpsDX", "SimdCvtOp", smallFloatTypes, 2,
1287                     fcvtpsCode)
1288    twoEqualRegInstX("fcvtps", "FcvtpsQX", "SimdCvtOp", floatTypes, 4,
1289                     fcvtpsCode)
1290    twoEqualRegInstX("fcvtps", "FcvtpsScX", "SimdCvtOp", floatTypes, 4,
1291                     fcvtpsCode, scalar=True)
1292    # FCVTPU
1293    fcvtpuCode = fcvtCode % ("0", "true", "FPRounding_POSINF")
1294    twoEqualRegInstX("fcvtpu", "FcvtpuDX", "SimdCvtOp", smallFloatTypes, 2,
1295                     fcvtpuCode)
1296    twoEqualRegInstX("fcvtpu", "FcvtpuQX", "SimdCvtOp", floatTypes, 4,
1297                     fcvtpuCode)
1298    twoEqualRegInstX("fcvtpu", "FcvtpuScX", "SimdCvtOp", floatTypes, 4,
1299                     fcvtpuCode, scalar=True)
1300    # FCVTXN, FCVTXN2
1301    fcvtxnCode = fpOp % ("fplibConvert<BigElement, Element>("
1302                         "srcElem1, FPRounding_ODD, fpscr)")
1303    twoRegNarrowInstX("fcvtxn", "FcvtxnX", "SimdCvtOp", smallFloatTypes,
1304                      fcvtxnCode)
1305    twoRegNarrowInstX("fcvtxn", "Fcvtxn2X", "SimdCvtOp", smallFloatTypes,
1306                      fcvtxnCode, hi=True)
1307    twoRegNarrowInstX("fcvtxn", "FcvtxnScX", "SimdCvtOp", smallFloatTypes,
1308                      fcvtxnCode, scalar=True)
1309    # FCVTZS (fixed-point)
1310    fcvtzsCode = fcvtCode % ("imm", "false", "FPRounding_ZERO")
1311    twoEqualRegInstX("fcvtzs", "FcvtzsFixedDX", "SimdCvtOp", smallFloatTypes,
1312                     2, fcvtzsCode, hasImm=True)
1313    twoEqualRegInstX("fcvtzs", "FcvtzsFixedQX", "SimdCvtOp", floatTypes, 4,
1314                     fcvtzsCode, hasImm=True)
1315    twoEqualRegInstX("fcvtzs", "FcvtzsFixedScX", "SimdCvtOp", floatTypes, 4,
1316                     fcvtzsCode, hasImm=True, scalar=True)
1317    # FCVTZS (integer)
1318    fcvtzsIntCode = fcvtCode % ("0", "false", "FPRounding_ZERO")
1319    twoEqualRegInstX("fcvtzs", "FcvtzsIntDX", "SimdCvtOp", smallFloatTypes,
1320                     2, fcvtzsIntCode)
1321    twoEqualRegInstX("fcvtzs", "FcvtzsIntQX", "SimdCvtOp", floatTypes, 4,
1322                     fcvtzsIntCode)
1323    twoEqualRegInstX("fcvtzs", "FcvtzsIntScX", "SimdCvtOp", floatTypes, 4,
1324                     fcvtzsIntCode, scalar=True)
1325    # FCVTZU (fixed-point)
1326    fcvtzuCode = fcvtCode % ("imm", "true", "FPRounding_ZERO")
1327    twoEqualRegInstX("fcvtzu", "FcvtzuFixedDX", "SimdCvtOp", smallFloatTypes,
1328                     2, fcvtzuCode, hasImm=True)
1329    twoEqualRegInstX("fcvtzu", "FcvtzuFixedQX", "SimdCvtOp", floatTypes, 4,
1330                     fcvtzuCode, hasImm=True)
1331    twoEqualRegInstX("fcvtzu", "FcvtzuFixedScX", "SimdCvtOp", floatTypes, 4,
1332                     fcvtzuCode, hasImm=True, scalar=True)
1333    # FCVTZU (integer)
1334    fcvtzuIntCode = fcvtCode % ("0", "true", "FPRounding_ZERO")
1335    twoEqualRegInstX("fcvtzu", "FcvtzuIntDX", "SimdCvtOp", smallFloatTypes, 2,
1336                     fcvtzuIntCode)
1337    twoEqualRegInstX("fcvtzu", "FcvtzuIntQX", "SimdCvtOp", floatTypes, 4,
1338                     fcvtzuIntCode)
1339    twoEqualRegInstX("fcvtzu", "FcvtzuIntScX", "SimdCvtOp", floatTypes, 4,
1340                     fcvtzuIntCode, scalar=True)
1341    # FDIV
1342    fdivCode = fpBinOp % "Div"
1343    threeEqualRegInstX("fdiv", "FdivDX", "SimdFloatDivOp", smallFloatTypes, 2,
1344                       fdivCode)
1345    threeEqualRegInstX("fdiv", "FdivQX", "SimdFloatDivOp", floatTypes, 4,
1346                       fdivCode)
1347    # FMAX
    # Floating-point maximum family (FMAX, FMAXNM, FMAXNMP, FMAXNMV, FMAXP,
    # FMAXV). Each code string is made by %-substituting an fplib operation
    # name ("Max" / "MaxNum") into a template and is then registered once per
    # variant (DX forms pass 2, QX forms pass 4).
    # NOTE(review): fpBinOp and fpOp are defined outside this section; the
    # 2 / 4 argument presumably selects the D / Q register width - confirm
    # against the helper definitions.
1348    fmaxCode = fpBinOp % "Max"
1349    threeEqualRegInstX("fmax", "FmaxDX", "SimdFloatCmpOp", smallFloatTypes, 2,
1350                       fmaxCode)
1351    threeEqualRegInstX("fmax", "FmaxQX", "SimdFloatCmpOp", floatTypes, 4,
1352                       fmaxCode)
1353    # FMAXNM
1354    fmaxnmCode = fpBinOp % "MaxNum"
1355    threeEqualRegInstX("fmaxnm", "FmaxnmDX", "SimdFloatCmpOp", smallFloatTypes,
1356                       2, fmaxnmCode)
1357    threeEqualRegInstX("fmaxnm", "FmaxnmQX", "SimdFloatCmpOp", floatTypes, 4,
1358                       fmaxnmCode)
1359    # FMAXNMP (scalar)
    # The scalar pairwise forms reuse the vector code string; the D form is
    # registered for uint32_t only and the Q form for uint64_t only.
1360    twoRegPairwiseScInstX("fmaxnmp", "FmaxnmpScDX", "SimdFloatCmpOp",
1361                          ("uint32_t",), 2, fmaxnmCode)
1362    twoRegPairwiseScInstX("fmaxnmp", "FmaxnmpScQX", "SimdFloatCmpOp",
1363                          ("uint64_t",), 4, fmaxnmCode)
1364    # FMAXNMP (vector)
1365    threeEqualRegInstX("fmaxnmp", "FmaxnmpDX", "SimdFloatCmpOp",
1366                       smallFloatTypes, 2, fmaxnmCode, pairwise=True)
1367    threeEqualRegInstX("fmaxnmp", "FmaxnmpQX", "SimdFloatCmpOp", floatTypes, 4,
1368                       fmaxnmCode, pairwise=True)
1369    # FMAXNMV
1370    # Note: SimdFloatCmpOp can be a bit optimistic here
    # fpAcrossOp keeps one %s open for the fplib operation name; the generated
    # call takes destElem and srcElem1. It is reused by the FMAXV, FMINNMV and
    # FMINV registrations further down.
1371    fpAcrossOp = fpOp % "fplib%s<Element>(destElem, srcElem1, fpscr)"
1372    fmaxnmAcrossCode = fpAcrossOp % "MaxNum"
1373    twoRegAcrossInstX("fmaxnmv", "FmaxnmvQX", "SimdFloatCmpOp", ("uint32_t",),
1374                      4, fmaxnmAcrossCode)
1375    # FMAXP (scalar)
1376    twoRegPairwiseScInstX("fmaxp", "FmaxpScDX", "SimdFloatCmpOp",
1377                          ("uint32_t",), 2, fmaxCode)
1378    twoRegPairwiseScInstX("fmaxp", "FmaxpScQX", "SimdFloatCmpOp",
1379                          ("uint64_t",), 4, fmaxCode)
1380    # FMAXP (vector)
1381    threeEqualRegInstX("fmaxp", "FmaxpDX", "SimdFloatCmpOp", smallFloatTypes,
1382                       2, fmaxCode, pairwise=True)
1383    threeEqualRegInstX("fmaxp", "FmaxpQX", "SimdFloatCmpOp", floatTypes, 4,
1384                       fmaxCode, pairwise=True)
1385    # FMAXV
1386    # Note: SimdFloatCmpOp can be a bit optimistic here
1387    fmaxAcrossCode = fpAcrossOp % "Max"
1388    twoRegAcrossInstX("fmaxv", "FmaxvQX", "SimdFloatCmpOp", ("uint32_t",), 4,
1389                      fmaxAcrossCode)
1390    # FMIN
    # Floating-point minimum family (FMIN, FMINNM, FMINNMP, FMINNMV, FMINP,
    # FMINV). The registrations mirror the FMAX family, substituting the
    # "Min" / "MinNum" fplib operation names into the same templates.
1391    fminCode = fpBinOp % "Min"
1392    threeEqualRegInstX("fmin", "FminDX", "SimdFloatCmpOp", smallFloatTypes, 2,
1393                       fminCode)
1394    threeEqualRegInstX("fmin", "FminQX", "SimdFloatCmpOp", floatTypes, 4,
1395                       fminCode)
1396    # FMINNM
1397    fminnmCode = fpBinOp % "MinNum"
1398    threeEqualRegInstX("fminnm", "FminnmDX", "SimdFloatCmpOp", smallFloatTypes,
1399                       2, fminnmCode)
1400    threeEqualRegInstX("fminnm", "FminnmQX", "SimdFloatCmpOp", floatTypes, 4,
1401                       fminnmCode)
1402    # FMINNMP (scalar)
1403    twoRegPairwiseScInstX("fminnmp", "FminnmpScDX", "SimdFloatCmpOp",
1404                          ("uint32_t",), 2, fminnmCode)
1405    twoRegPairwiseScInstX("fminnmp", "FminnmpScQX", "SimdFloatCmpOp",
1406                          ("uint64_t",), 4, fminnmCode)
1407    # FMINNMP (vector)
1408    threeEqualRegInstX("fminnmp", "FminnmpDX", "SimdFloatCmpOp",
1409                       smallFloatTypes, 2, fminnmCode, pairwise=True)
1410    threeEqualRegInstX("fminnmp", "FminnmpQX", "SimdFloatCmpOp", floatTypes, 4,
1411                       fminnmCode, pairwise=True)
1412    # FMINNMV
1413    # Note: SimdFloatCmpOp can be a bit optimistic here
    # fpAcrossOp is the across-lanes template built in the FMAXNMV section.
1414    fminnmAcrossCode = fpAcrossOp % "MinNum"
1415    twoRegAcrossInstX("fminnmv", "FminnmvQX", "SimdFloatCmpOp", ("uint32_t",),
1416                      4, fminnmAcrossCode)
1417    # FMINP (scalar)
1418    twoRegPairwiseScInstX("fminp", "FminpScDX", "SimdFloatCmpOp",
1419                          ("uint32_t",), 2, fminCode)
1420    twoRegPairwiseScInstX("fminp", "FminpScQX", "SimdFloatCmpOp",
1421                          ("uint64_t",), 4, fminCode)
1422    # FMINP (vector)
1423    threeEqualRegInstX("fminp", "FminpDX", "SimdFloatCmpOp", smallFloatTypes,
1424                       2, fminCode, pairwise=True)
1425    threeEqualRegInstX("fminp", "FminpQX", "SimdFloatCmpOp", floatTypes, 4,
1426                       fminCode, pairwise=True)
1427    # FMINV
1428    # Note: SimdFloatCmpOp can be a bit optimistic here
1429    fminAcrossCode = fpAcrossOp % "Min"
1430    twoRegAcrossInstX("fminv", "FminvQX", "SimdFloatCmpOp", ("uint32_t",), 4,
1431                      fminAcrossCode)
1432    # FMLA (by element)
    # Multiply-accumulate: the generated call hands destElem, srcElem1 and
    # srcElem2 to fplibMulAdd. The by-element and vector forms share one code
    # string and differ only in the byElem / scalar keyword arguments.
    # NOTE(review): the positional True after the code string is presumably
    # the "read destination" flag (the code consumes destElem) - confirm
    # against threeEqualRegInstX.
1433    fmlaCode = fpOp % ("fplibMulAdd<Element>("
1434                       "destElem, srcElem1, srcElem2, fpscr)")
1435    threeEqualRegInstX("fmla", "FmlaElemDX", "SimdFloatMultAccOp",
1436                       smallFloatTypes, 2, fmlaCode, True, byElem=True)
1437    threeEqualRegInstX("fmla", "FmlaElemQX", "SimdFloatMultAccOp", floatTypes,
1438                       4, fmlaCode, True, byElem=True)
1439    threeEqualRegInstX("fmla", "FmlaElemScX", "SimdFloatMultAccOp", floatTypes,
1440                       4, fmlaCode, True, byElem=True, scalar=True)
1441    # FMLA (vector)
1442    threeEqualRegInstX("fmla", "FmlaDX", "SimdFloatMultAccOp", smallFloatTypes,
1443                       2, fmlaCode, True)
1444    threeEqualRegInstX("fmla", "FmlaQX", "SimdFloatMultAccOp", floatTypes, 4,
1445                       fmlaCode, True)
1446    # FMLS (by element)
    # Same call as FMLA except that srcElem1 is wrapped in fplibNeg first.
1447    fmlsCode = fpOp % ("fplibMulAdd<Element>(destElem,"
1448                       " fplibNeg<Element>(srcElem1), srcElem2, fpscr)")
1449    threeEqualRegInstX("fmls", "FmlsElemDX", "SimdFloatMultAccOp",
1450                       smallFloatTypes, 2, fmlsCode, True, byElem=True)
1451    threeEqualRegInstX("fmls", "FmlsElemQX", "SimdFloatMultAccOp", floatTypes,
1452                       4, fmlsCode, True, byElem=True)
1453    threeEqualRegInstX("fmls", "FmlsElemScX", "SimdFloatMultAccOp", floatTypes,
1454                       4, fmlsCode, True, byElem=True, scalar=True)
1455    # FMLS (vector)
1456    threeEqualRegInstX("fmls", "FmlsDX", "SimdFloatMultAccOp", smallFloatTypes,
1457                       2, fmlsCode, True)
1458    threeEqualRegInstX("fmls", "FmlsQX", "SimdFloatMultAccOp", floatTypes, 4,
1459                       fmlsCode, True)
1460    # FMOV
    # Immediate move: the generated code assigns imm to destElem.
1461    fmovCode = 'destElem = imm;'
1462    oneRegImmInstX("fmov", "FmovDX", "SimdMiscOp", smallFloatTypes, 2,
1463                   fmovCode)
1464    oneRegImmInstX("fmov", "FmovQX", "SimdMiscOp", floatTypes, 4, fmovCode)
1465    # FMUL (by element)
    # The by-element, scalar by-element and vector forms all share fmulCode.
1466    fmulCode = fpBinOp % "Mul"
1467    threeEqualRegInstX("fmul", "FmulElemDX", "SimdFloatMultOp",
1468                       smallFloatTypes, 2, fmulCode, byElem=True)
1469    threeEqualRegInstX("fmul", "FmulElemQX", "SimdFloatMultOp", floatTypes, 4,
1470                       fmulCode, byElem=True)
1471    threeEqualRegInstX("fmul", "FmulElemScX", "SimdFloatMultOp", floatTypes, 4,
1472                       fmulCode, byElem=True, scalar=True)
1473    # FMUL (vector)
1474    threeEqualRegInstX("fmul", "FmulDX", "SimdFloatMultOp", smallFloatTypes, 2,
1475                       fmulCode)
1476    threeEqualRegInstX("fmul", "FmulQX", "SimdFloatMultOp", floatTypes, 4,
1477                       fmulCode)
1478    # FMULX
    # Unlike FMUL, FMULX also registers a scalar (non by-element) form.
1479    fmulxCode = fpBinOp % "MulX"
1480    threeEqualRegInstX("fmulx", "FmulxDX", "SimdFloatMultOp", smallFloatTypes,
1481                       2, fmulxCode)
1482    threeEqualRegInstX("fmulx", "FmulxQX", "SimdFloatMultOp", floatTypes, 4,
1483                       fmulxCode)
1484    threeEqualRegInstX("fmulx", "FmulxScX", "SimdFloatMultOp", floatTypes, 4,
1485                       fmulxCode, scalar=True)
1486    # FMULX (by element)
1487    threeEqualRegInstX("fmulx", "FmulxElemDX", "SimdFloatMultOp",
1488                       smallFloatTypes, 2, fmulxCode, byElem=True)
1489    threeEqualRegInstX("fmulx", "FmulxElemQX", "SimdFloatMultOp", floatTypes,
1490                       4, fmulxCode, byElem=True)
1491    threeEqualRegInstX("fmulx", "FmulxElemScX", "SimdFloatMultOp", floatTypes,
1492                       4, fmulxCode, byElem=True, scalar=True)
1493    # FNEG
1494    fnegCode = fpOp % "fplibNeg<Element>(srcElem1)"
1495    twoEqualRegInstX("Neg", "FnegDX", "SimdFloatAluOp", smallFloatTypes, 2,
1496                     fnegCode)
1497    twoEqualRegInstX("Neg", "FnegQX", "SimdFloatAluOp", floatTypes, 4,
1498                     fnegCode)
1499    # FRECPE
    # Reciprocal estimate / step / exponent. FRECPE and FRECPS register D, Q
    # and scalar forms; FRECPX is registered in scalar form only.
1500    frecpeCode = fpOp % "fplibRecipEstimate<Element>(srcElem1, fpscr)"
1501    twoEqualRegInstX("frecpe", "FrecpeDX", "SimdFloatMultAccOp",
1502                     smallFloatTypes, 2, frecpeCode)
1503    twoEqualRegInstX("frecpe", "FrecpeQX", "SimdFloatMultAccOp", floatTypes, 4,
1504                     frecpeCode)
1505    twoEqualRegInstX("frecpe", "FrecpeScX", "SimdFloatMultAccOp", floatTypes,
1506                     4, frecpeCode, scalar=True)
1507    # FRECPS
1508    frecpsCode = fpBinOp % "RecipStepFused"
1509    threeEqualRegInstX("frecps", "FrecpsDX", "SimdFloatMultAccOp",
1510                       smallFloatTypes, 2, frecpsCode)
1511    threeEqualRegInstX("frecps", "FrecpsQX", "SimdFloatMultAccOp", floatTypes,
1512                       4, frecpsCode)
1513    threeEqualRegInstX("frecps", "FrecpsScX", "SimdFloatMultAccOp", floatTypes,
1514                       4, frecpsCode, scalar=True)
1515    # FRECPX
1516    frecpxCode = fpOp % "fplibRecpX<Element>(srcElem1, fpscr)"
1517    twoEqualRegInstX("frecpx", "FrecpxX", "SimdFloatMultAccOp", floatTypes, 4,
1518                     frecpxCode, scalar=True)
1519    # FRINTA
    # Round-to-integral family. frintCode leaves two %s slots in the
    # fplibRoundInt call: the rounding mode and a boolean flag. FRINTI and
    # FRINTX take the mode from FPCRRounding(fpscr); the others hard-code it.
    # Only FRINTX passes "true" for the flag (presumably the "exact" /
    # inexact-signalling switch - confirm against fplibRoundInt).
1520    frintCode = fpOp % "fplibRoundInt<Element>(srcElem1, %s, %s, fpscr)"
1521    frintaCode = frintCode % ("FPRounding_TIEAWAY", "false")
1522    twoEqualRegInstX("frinta", "FrintaDX", "SimdCvtOp", smallFloatTypes, 2,
1523                     frintaCode)
1524    twoEqualRegInstX("frinta", "FrintaQX", "SimdCvtOp", floatTypes, 4,
1525                     frintaCode)
1526    # FRINTI
1527    frintiCode = frintCode % ("FPCRRounding(fpscr)", "false")
1528    twoEqualRegInstX("frinti", "FrintiDX", "SimdCvtOp", smallFloatTypes, 2,
1529                     frintiCode)
1530    twoEqualRegInstX("frinti", "FrintiQX", "SimdCvtOp", floatTypes, 4,
1531                     frintiCode)
1532    # FRINTM
1533    frintmCode = frintCode % ("FPRounding_NEGINF", "false")
1534    twoEqualRegInstX("frintm", "FrintmDX", "SimdCvtOp", smallFloatTypes, 2,
1535                     frintmCode)
1536    twoEqualRegInstX("frintm", "FrintmQX", "SimdCvtOp", floatTypes, 4,
1537                     frintmCode)
1538    # FRINTN
1539    frintnCode = frintCode % ("FPRounding_TIEEVEN", "false")
1540    twoEqualRegInstX("frintn", "FrintnDX", "SimdCvtOp", smallFloatTypes, 2,
1541                     frintnCode)
1542    twoEqualRegInstX("frintn", "FrintnQX", "SimdCvtOp", floatTypes, 4,
1543                     frintnCode)
1544    # FRINTP
1545    frintpCode = frintCode % ("FPRounding_POSINF", "false")
1546    twoEqualRegInstX("frintp", "FrintpDX", "SimdCvtOp", smallFloatTypes, 2,
1547                     frintpCode)
1548    twoEqualRegInstX("frintp", "FrintpQX", "SimdCvtOp", floatTypes, 4,
1549                     frintpCode)
1550    # FRINTX
1551    frintxCode = frintCode % ("FPCRRounding(fpscr)", "true")
1552    twoEqualRegInstX("frintx", "FrintxDX", "SimdCvtOp", smallFloatTypes, 2,
1553                     frintxCode)
1554    twoEqualRegInstX("frintx", "FrintxQX", "SimdCvtOp", floatTypes, 4,
1555                     frintxCode)
1556    # FRINTZ
1557    frintzCode = frintCode % ("FPRounding_ZERO", "false")
1558    twoEqualRegInstX("frintz", "FrintzDX", "SimdCvtOp", smallFloatTypes, 2,
1559                     frintzCode)
1560    twoEqualRegInstX("frintz", "FrintzQX", "SimdCvtOp", floatTypes, 4,
1561                     frintzCode)
1562    # FRSQRTE
    # Reciprocal square-root estimate / step, square root and subtract.
    # FRSQRTE and FRSQRTS register D, Q and scalar forms; FSQRT and FSUB
    # register only the D and Q vector forms here.
1563    frsqrteCode = fpOp % "fplibRSqrtEstimate<Element>(srcElem1, fpscr)"
1564    twoEqualRegInstX("frsqrte", "FrsqrteDX", "SimdFloatSqrtOp",
1565                     smallFloatTypes, 2, frsqrteCode)
1566    twoEqualRegInstX("frsqrte", "FrsqrteQX", "SimdFloatSqrtOp", floatTypes, 4,
1567                     frsqrteCode)
1568    twoEqualRegInstX("frsqrte", "FrsqrteScX", "SimdFloatSqrtOp", floatTypes, 4,
1569                     frsqrteCode, scalar=True)
1570    # FRSQRTS
1571    frsqrtsCode = fpBinOp % "RSqrtStepFused"
1572    threeEqualRegInstX("frsqrts", "FrsqrtsDX", "SimdFloatMiscOp",
1573                       smallFloatTypes, 2, frsqrtsCode)
1574    threeEqualRegInstX("frsqrts", "FrsqrtsQX", "SimdFloatMiscOp", floatTypes,
1575                       4, frsqrtsCode)
1576    threeEqualRegInstX("frsqrts", "FrsqrtsScX", "SimdFloatMiscOp", floatTypes,
1577                       4, frsqrtsCode, scalar=True)
1578    # FSQRT
1579    fsqrtCode = fpOp % "fplibSqrt<Element>(srcElem1, fpscr)"
1580    twoEqualRegInstX("fsqrt", "FsqrtDX", "SimdFloatSqrtOp", smallFloatTypes, 2,
1581                     fsqrtCode)
1582    twoEqualRegInstX("fsqrt", "FsqrtQX", "SimdFloatSqrtOp", floatTypes, 4,
1583                     fsqrtCode)
1584    # FSUB
1585    fsubCode = fpBinOp % "Sub"
1586    threeEqualRegInstX("fsub", "FsubDX", "SimdFloatAddOp", smallFloatTypes, 2,
1587                       fsubCode)
1588    threeEqualRegInstX("fsub", "FsubQX", "SimdFloatAddOp", floatTypes, 4,
1589                       fsubCode)
1590    # INS (element)
    # Element inserts take no code string; the helpers generate the body.
1591    insFromVecElemInstX("ins", "InsElemX", "SimdMiscOp", unsignedTypes, 4)
1592    # INS (general register)
    # The trailing 'W' / 'X' argument distinguishes the two general-register
    # forms; the W form is limited to smallUnsignedTypes.
1593    insFromGprInstX("ins", "InsGprWX", "SimdMiscOp", smallUnsignedTypes, 4,
1594                    'W')
1595    insFromGprInstX("ins", "InsGprXX", "SimdMiscOp", unsignedTypes, 4, 'X')
1596    # MLA (by element)
    # Integer multiply-accumulate. By-element forms are registered only for
    # 16- and 32-bit elements; vector forms use smallUnsignedTypes. The code
    # updates destElem in place, and every registration passes True after the
    # code string (presumably "read destination" - confirm against helper).
1597    mlaCode = "destElem += srcElem1 * srcElem2;"
1598    threeEqualRegInstX("mla", "MlaElemDX", "SimdMultAccOp",
1599                       ("uint16_t", "uint32_t"), 2, mlaCode, True, byElem=True)
1600    threeEqualRegInstX("mla", "MlaElemQX", "SimdMultAccOp",
1601                       ("uint16_t", "uint32_t"), 4, mlaCode, True, byElem=True)
1602    # MLA (vector)
1603    threeEqualRegInstX("mla", "MlaDX", "SimdMultAccOp", smallUnsignedTypes, 2,
1604                       mlaCode, True)
1605    threeEqualRegInstX("mla", "MlaQX", "SimdMultAccOp", smallUnsignedTypes, 4,
1606                       mlaCode, True)
1607    # MLS (by element)
    # Same layout as MLA with a subtracting update.
1608    mlsCode = "destElem -= srcElem1 * srcElem2;"
1609    threeEqualRegInstX("mls", "MlsElemDX", "SimdMultAccOp",
1610                       ("uint16_t", "uint32_t"), 2, mlsCode, True, byElem=True)
1611    threeEqualRegInstX("mls", "MlsElemQX", "SimdMultAccOp",
1612                       ("uint16_t", "uint32_t"), 4, mlsCode, True, byElem=True)
1613    # MLS (vector)
1614    threeEqualRegInstX("mls", "MlsDX", "SimdMultAccOp", smallUnsignedTypes, 2,
1615                       mlsCode, True)
1616    threeEqualRegInstX("mls", "MlsQX", "SimdMultAccOp", smallUnsignedTypes, 4,
1617                       mlsCode, True)
1618    # MOV (element) -> alias to INS (element)
1619    # MOV (from general) -> alias to INS (general register)
1620    # MOV (scalar) -> alias to DUP (element)
1621    # MOV (to general) -> alias to UMOV
1622    # MOV (vector) -> alias to ORR (register)
1623    # MOVI
    # Immediate and bitwise operations. MOVI, MVN, MVNI, ORN and ORR are
    # registered for uint64_t elements only; MUL and NEG use sized type lists.
1624    movImmCode = "destElem = imm;"
1625    oneRegImmInstX("movi", "MoviDX", "SimdMiscOp", ("uint64_t",), 2,
1626                   movImmCode)
1627    oneRegImmInstX("movi", "MoviQX", "SimdMiscOp", ("uint64_t",), 4,
1628                   movImmCode)
1629    # MUL (by element)
    # By-element multiply is registered for 16- and 32-bit elements only.
1630    mulCode = "destElem = srcElem1 * srcElem2;"
1631    threeEqualRegInstX("mul", "MulElemDX", "SimdMultOp",
1632                       ("uint16_t", "uint32_t"), 2, mulCode, byElem=True)
1633    threeEqualRegInstX("mul", "MulElemQX", "SimdMultOp",
1634                       ("uint16_t", "uint32_t"), 4, mulCode, byElem=True)
1635    # MUL (vector)
1636    threeEqualRegInstX("mul", "MulDX", "SimdMultOp", smallUnsignedTypes, 2,
1637                       mulCode)
1638    threeEqualRegInstX("mul", "MulQX", "SimdMultOp", smallUnsignedTypes, 4,
1639                       mulCode)
1640    # MVN
1641    mvnCode = "destElem = ~srcElem1;"
1642    twoEqualRegInstX("mvn", "MvnDX", "SimdAluOp", ("uint64_t",), 2, mvnCode)
1643    twoEqualRegInstX("mvn", "MvnQX", "SimdAluOp", ("uint64_t",), 4, mvnCode)
1644    # MVNI
1645    mvniCode = "destElem = ~imm;"
1646    oneRegImmInstX("mvni", "MvniDX", "SimdAluOp", ("uint64_t",), 2, mvniCode)
1647    oneRegImmInstX("mvni", "MvniQX", "SimdAluOp", ("uint64_t",), 4, mvniCode)
1648    # NEG
1649    negCode = "destElem = -srcElem1;"
1650    twoEqualRegInstX("neg", "NegDX", "SimdAluOp", signedTypes, 2, negCode)
1651    twoEqualRegInstX("neg", "NegQX", "SimdAluOp", signedTypes, 4, negCode)
1652    # NOT -> alias to MVN
1653    # ORN
1654    ornCode = "destElem = srcElem1 | ~srcElem2;"
1655    threeEqualRegInstX("orn", "OrnDX", "SimdAluOp", ("uint64_t",), 2, ornCode)
1656    threeEqualRegInstX("orn", "OrnQX", "SimdAluOp", ("uint64_t",), 4, ornCode)
1657    # ORR (immediate)
    # The immediate form ORs into the existing destElem, so it passes True
    # after the code string, unlike MOVI / MVNI which overwrite destElem.
1658    orrImmCode = "destElem |= imm;"
1659    oneRegImmInstX("orr", "OrrImmDX", "SimdAluOp", ("uint64_t",), 2,
1660                   orrImmCode, True)
1661    oneRegImmInstX("orr", "OrrImmQX", "SimdAluOp", ("uint64_t",), 4,
1662                   orrImmCode, True)
1663    # ORR (register)
1664    orrCode = "destElem = srcElem1 | srcElem2;"
1665    threeEqualRegInstX("orr", "OrrDX", "SimdAluOp", ("uint64_t",), 2, orrCode)
1666    threeEqualRegInstX("orr", "OrrQX", "SimdAluOp", ("uint64_t",), 4, orrCode)
1667    # PMUL
1668    pmulCode = '''
1669            destElem = 0;
1670            for (unsigned j = 0; j < sizeof(Element) * 8; j++) {
1671                if (bits(srcElem2, j))
1672                    destElem ^= srcElem1 << j;
1673            }
1674    '''
1675    threeEqualRegInstX("pmul", "PmulDX", "SimdMultOp", ("uint8_t",), 2,
1676                       pmulCode)
1677    threeEqualRegInstX("pmul", "PmulQX", "SimdMultOp", ("uint8_t",), 4,
1678                       pmulCode)
1679    # PMULL, PMULL2
1680    # Note: 64-bit PMULL is not available (Crypto. Extension)
1681    pmullCode = '''
1682            destElem = 0;
1683            for (unsigned j = 0; j < sizeof(Element) * 8; j++) {
1684                if (bits(srcElem2, j))
1685                    destElem ^= (BigElement)srcElem1 << j;
1686            }
1687    '''
1688    threeRegLongInstX("pmull", "PmullX", "SimdMultOp", ("uint8_t",), pmullCode)
1689    threeRegLongInstX("pmull", "Pmull2X", "SimdMultOp", ("uint8_t",),
1690                      pmullCode, hi=True)
1691    # RADDHN, RADDHN2
    # Rounding add, keep high half: the sum is formed in BigElement, a
    # rounding constant of 1 << (element bits - 1) is added, and the result
    # is shifted right by the element width.
1692    raddhnCode = '''
1693            destElem = ((BigElement)srcElem1 + (BigElement)srcElem2 +
1694                        ((BigElement)1 << (sizeof(Element) * 8 - 1))) >>
1695                       (sizeof(Element) * 8);
1696    '''
1697    threeRegNarrowInstX("raddhn", "RaddhnX", "SimdAddOp", smallUnsignedTypes,
1698                        raddhnCode)
1699    threeRegNarrowInstX("raddhn2", "Raddhn2X", "SimdAddOp", smallUnsignedTypes,
1700                        raddhnCode, hi=True)
1701    # RBIT
    # Bit reversal within each element: bit i of the source (taken from the
    # low bit of a right-shifting copy) is placed at position width - 1 - i.
    # Registered for uint8_t elements only.
1702    rbitCode = '''
1703            destElem = 0;
1704            Element temp = srcElem1;
1705            for (int i = 0; i < 8 * sizeof(Element); i++) {
1706                destElem = destElem  | ((temp & 0x1) <<
1707                                        (8 * sizeof(Element) - 1 - i));
1708                temp >>= 1;
1709            }
1710    '''
1711    twoEqualRegInstX("rbit", "RbitDX", "SimdAluOp", ("uint8_t",), 2, rbitCode)
1712    twoEqualRegInstX("rbit", "RbitQX", "SimdAluOp", ("uint8_t",), 4, rbitCode)
1713    # REV16
    # Element reversal within 2-, 4- and 8-byte containers. The three code
    # strings differ only in the container size (1 << 1, 1 << 2, 1 << 3
    # bytes): groupSize is the number of elements per container and XORing
    # the index with groupSize - 1 mirrors it inside its group.
    # NOTE(review): i and j are not declared in these snippets; presumably
    # the twoEqualRegInstX template supplies them as source / destination
    # element indices - confirm against the helper.
1714    rev16Code = '''
1715            destElem = srcElem1;
1716            unsigned groupSize = ((1 << 1) / sizeof(Element));
1717            unsigned reverseMask = (groupSize - 1);
1718            j = i ^ reverseMask;
1719    '''
1720    twoEqualRegInstX("rev16", "Rev16DX", "SimdAluOp", ("uint8_t",), 2,
1721                     rev16Code)
1722    twoEqualRegInstX("rev16", "Rev16QX", "SimdAluOp", ("uint8_t",), 4,
1723                     rev16Code)
1724    # REV32
1725    rev32Code = '''
1726            destElem = srcElem1;
1727            unsigned groupSize = ((1 << 2) / sizeof(Element));
1728            unsigned reverseMask = (groupSize - 1);
1729            j = i ^ reverseMask;
1730    '''
1731    twoEqualRegInstX("rev32", "Rev32DX", "SimdAluOp", ("uint8_t", "uint16_t"),
1732                     2, rev32Code)
1733    twoEqualRegInstX("rev32", "Rev32QX", "SimdAluOp", ("uint8_t", "uint16_t"),
1734                     4, rev32Code)
1735    # REV64
1736    rev64Code = '''
1737            destElem = srcElem1;
1738            unsigned groupSize = ((1 << 3) / sizeof(Element));
1739            unsigned reverseMask = (groupSize - 1);
1740            j = i ^ reverseMask;
1741    '''
1742    twoEqualRegInstX("rev64", "Rev64DX", "SimdAluOp", smallUnsignedTypes, 2,
1743                     rev64Code)
1744    twoEqualRegInstX("rev64", "Rev64QX", "SimdAluOp", smallUnsignedTypes, 4,
1745                     rev64Code)
1746    # RSHRN, RSHRN2
    # Rounding shift right narrow. A shift larger than the source width gives
    # 0; a non-zero shift adds the last bit shifted out (rBit) to the shifted
    # value, performing the shift as (imm - 1) followed by 1; a zero shift
    # copies the source unchanged.
1747    rshrnCode = '''
1748            if (imm > sizeof(srcElem1) * 8) {
1749                destElem = 0;
1750            } else if (imm) {
1751                Element rBit = bits(srcElem1, imm - 1);
1752                destElem = ((srcElem1 >> (imm - 1)) >> 1) + rBit;
1753            } else {
1754                destElem = srcElem1;
1755            }
1756    '''
1757    twoRegNarrowInstX("rshrn", "RshrnX", "SimdShiftOp", smallUnsignedTypes,
1758                      rshrnCode, hasImm=True)
1759    twoRegNarrowInstX("rshrn2", "Rshrn2X", "SimdShiftOp", smallUnsignedTypes,
1760                      rshrnCode, hasImm=True, hi=True)
1761    # RSUBHN, RSUBHN2
    # Rounding subtract, keep high half; same rounding constant and final
    # shift as RADDHN.
    # NOTE(review): registered with smallTypes whereas RADDHN uses
    # smallUnsignedTypes - confirm the difference is intentional.
1762    rsubhnCode = '''
1763            destElem = ((BigElement)srcElem1 - (BigElement)srcElem2 +
1764                        ((BigElement)1 << (sizeof(Element) * 8 - 1))) >>
1765                       (sizeof(Element) * 8);
1766    '''
1767    threeRegNarrowInstX("rsubhn", "RsubhnX", "SimdAddOp", smallTypes,
1768                        rsubhnCode)
1769    threeRegNarrowInstX("rsubhn2", "Rsubhn2X", "SimdAddOp", smallTypes,
1770                        rsubhnCode, hi=True)
1771    # SABA
    # Signed absolute-difference family. The difference is always computed as
    # larger minus smaller; SABA / SABAL add it to destElem, SABD / SABDL
    # overwrite destElem. The long forms cast both operands to BigElement
    # before subtracting.
1772    abaCode = '''
1773            destElem += (srcElem1 > srcElem2) ? (srcElem1 - srcElem2) :
1774                                                (srcElem2 - srcElem1);
1775    '''
1776    threeEqualRegInstX("saba", "SabaDX", "SimdAddAccOp", smallSignedTypes, 2,
1777                       abaCode, True)
1778    threeEqualRegInstX("saba", "SabaQX", "SimdAddAccOp", smallSignedTypes, 4,
1779                       abaCode, True)
1780    # SABAL, SABAL2
1781    abalCode = '''
1782            destElem += (srcElem1 > srcElem2) ?
1783                ((BigElement)srcElem1 - (BigElement)srcElem2) :
1784                ((BigElement)srcElem2 - (BigElement)srcElem1);
1785    '''
1786    threeRegLongInstX("sabal", "SabalX", "SimdAddAccOp", smallSignedTypes,
1787                      abalCode, True)
1788    threeRegLongInstX("sabal2", "Sabal2X", "SimdAddAccOp", smallSignedTypes,
1789                      abalCode, True, hi=True)
1790    # SABD
1791    abdCode = '''
1792            destElem = (srcElem1 > srcElem2) ? (srcElem1 - srcElem2) :
1793                                               (srcElem2 - srcElem1);
1794    '''
1795    threeEqualRegInstX("sabd", "SabdDX", "SimdAddOp", smallSignedTypes, 2,
1796                       abdCode)
1797    threeEqualRegInstX("sabd", "SabdQX", "SimdAddOp", smallSignedTypes, 4,
1798                       abdCode)
1799    # SABDL, SABDL2
    # NOTE(review): abdlCode overwrites destElem, yet these registrations use
    # SimdAddAccOp and pass True after the code string, unlike SABD above.
    # If that True means "read destination" it looks unnecessary here -
    # confirm against threeRegLongInstX before changing.
1800    abdlCode = '''
1801            destElem = (srcElem1 > srcElem2) ?
1802                ((BigElement)srcElem1 - (BigElement)srcElem2) :
1803                ((BigElement)srcElem2 - (BigElement)srcElem1);
1804    '''
1805    threeRegLongInstX("sabdl", "SabdlX", "SimdAddAccOp", smallSignedTypes,
1806                      abdlCode, True)
1807    threeRegLongInstX("sabdl2", "Sabdl2X", "SimdAddAccOp", smallSignedTypes,
1808                      abdlCode, True, hi=True)
1809    # SADALP
    # Signed widening adds. adalpCode accumulates the widened pair sum into
    # destElem; addlwCode overwrites destElem and is shared by SADDL, SADDLP
    # and SADDW.
1810    adalpCode = "destElem += (BigElement)srcElem1 + (BigElement)srcElem2;"
1811    twoRegCondenseInstX("sadalp", "SadalpDX", "SimdAddOp", smallSignedTypes, 2,
1812                        adalpCode, True)
1813    twoRegCondenseInstX("sadalp", "SadalpQX", "SimdAddOp", smallSignedTypes, 4,
1814                        adalpCode, True)
1815    # SADDL, SADDL2
1816    addlwCode = "destElem = (BigElement)srcElem1 + (BigElement)srcElem2;"
1817    threeRegLongInstX("saddl", "SaddlX", "SimdAddAccOp", smallSignedTypes,
1818                      addlwCode)
1819    threeRegLongInstX("saddl2", "Saddl2X", "SimdAddAccOp", smallSignedTypes,
1820                      addlwCode, hi=True)
1821    # SADDLP
1822    twoRegCondenseInstX("saddlp", "SaddlpDX", "SimdAddOp", smallSignedTypes, 2,
1823                        addlwCode)
1824    twoRegCondenseInstX("saddlp", "SaddlpQX", "SimdAddOp", smallSignedTypes, 4,
1825                        addlwCode)
1826    # SADDLV
1827    # Note: SimdAddOp can be a bit optimistic here
    # The across-lanes code adds each widened srcElem1 into destElem. The
    # int32_t case is registered separately (SaddlvBQX) with doubleDest=True.
1828    addAcrossLongCode = "destElem += (BigElement)srcElem1;"
1829    twoRegAcrossInstX("saddlv", "SaddlvDX", "SimdAddOp", ("int8_t", "int16_t"),
1830                      2, addAcrossLongCode, long=True)
1831    twoRegAcrossInstX("saddlv", "SaddlvQX", "SimdAddOp", ("int8_t", "int16_t"),
1832                      4, addAcrossLongCode, long=True)
1833    twoRegAcrossInstX("saddlv", "SaddlvBQX", "SimdAddOp", ("int32_t",), 4,
1834                      addAcrossLongCode, doubleDest=True, long=True)
1835    # SADDW, SADDW2
1836    threeRegWideInstX("saddw", "SaddwX", "SimdAddAccOp", smallSignedTypes,
1837                      addlwCode)
1838    threeRegWideInstX("saddw2", "Saddw2X", "SimdAddAccOp", smallSignedTypes,
1839                      addlwCode, hi=True)
1840    # SCVTF (fixed-point)
    # Signed integer to floating-point conversion. Both templates keep a %d
    # slot for the width of the signed cast applied to srcElem1 (32 or 64),
    # filled in per registration. The fixed-point form passes imm as the
    # second fplibFixedToFP argument; the integer form passes 0 there.
1841    scvtfFixedCode = fpOp % ("fplibFixedToFP<Element>((int%d_t) srcElem1, imm,"
1842                             " false, FPCRRounding(fpscr), fpscr)")
1843    twoEqualRegInstX("scvtf", "ScvtfFixedDX", "SimdCvtOp", smallFloatTypes, 2,
1844                     scvtfFixedCode % 32, hasImm=True)
1845    twoEqualRegInstX("scvtf", "ScvtfFixedSQX", "SimdCvtOp", smallFloatTypes, 4,
1846                     scvtfFixedCode % 32, hasImm=True)
1847    twoEqualRegInstX("scvtf", "ScvtfFixedDQX", "SimdCvtOp", ("uint64_t",), 4,
1848                     scvtfFixedCode % 64, hasImm=True)
1849    twoEqualRegInstX("scvtf", "ScvtfFixedScSX", "SimdCvtOp", smallFloatTypes,
1850                     4, scvtfFixedCode % 32, hasImm=True, scalar=True)
1851    twoEqualRegInstX("scvtf", "ScvtfFixedScDX", "SimdCvtOp", ("uint64_t",), 4,
1852                     scvtfFixedCode % 64, hasImm=True, scalar=True)
1853    # SCVTF (integer)
1854    scvtfIntCode = fpOp % ("fplibFixedToFP<Element>((int%d_t) srcElem1, 0,"
1855                           " false, FPCRRounding(fpscr), fpscr)")
1856    twoEqualRegInstX("scvtf", "ScvtfIntDX", "SimdCvtOp", smallFloatTypes, 2,
1857                     scvtfIntCode % 32)
1858    twoEqualRegInstX("scvtf", "ScvtfIntSQX", "SimdCvtOp", smallFloatTypes, 4,
1859                     scvtfIntCode % 32)
1860    twoEqualRegInstX("scvtf", "ScvtfIntDQX", "SimdCvtOp", ("uint64_t",), 4,
1861                     scvtfIntCode % 64)
1862    twoEqualRegInstX("scvtf", "ScvtfIntScSX", "SimdCvtOp", smallFloatTypes, 4,
1863                     scvtfIntCode % 32, scalar=True)
1864    twoEqualRegInstX("scvtf", "ScvtfIntScDX", "SimdCvtOp", ("uint64_t",), 4,
1865                     scvtfIntCode % 64, scalar=True)
1866    # SHADD
    # Halving add: each operand has its low bit cleared and is halved
    # separately, then the carry out of the two low bits is added back, so
    # the full-width sum is never formed.
1867    haddCode = '''
1868            Element carryBit =
1869                (((unsigned)srcElem1 & 0x1) +
1870                 ((unsigned)srcElem2 & 0x1)) >> 1;
1871            // Use division instead of a shift to ensure the sign extension works
1872            // right. The compiler will figure out if it can be a shift. Mask the
1873            // inputs so they get truncated correctly.
1874            destElem = (((srcElem1 & ~(Element)1) / 2) +
1875                        ((srcElem2 & ~(Element)1) / 2)) + carryBit;
1876    '''
1877    threeEqualRegInstX("shadd", "ShaddDX", "SimdAddOp", smallSignedTypes, 2,
1878                       haddCode)
1879    threeEqualRegInstX("shadd", "ShaddQX", "SimdAddOp", smallSignedTypes, 4,
1880                       haddCode)
1881    # SHL
    # Shift left by immediate. A shift of at least the element width is
    # performed as (width - 1) followed by 1 rather than as a single shift.
1882    shlCode = '''
1883            if (imm >= sizeof(Element) * 8)
1884                destElem = (srcElem1 << (sizeof(Element) * 8 - 1)) << 1;
1885            else
1886                destElem = srcElem1 << imm;
1887    '''
1888    twoEqualRegInstX("shl", "ShlDX", "SimdShiftOp", unsignedTypes, 2, shlCode,
1889                     hasImm=True)
1890    twoEqualRegInstX("shl", "ShlQX", "SimdShiftOp", unsignedTypes, 4, shlCode,
1891                     hasImm=True)
1892    # SHLL, SHLL2
1893    shllCode = "destElem = ((BigElement)srcElem1) << (sizeof(Element) * 8);"
1894    twoRegLongInstX("shll", "ShllX", "SimdShiftOp", smallTypes, shllCode)
1895    twoRegLongInstX("shll", "Shll2X", "SimdShiftOp", smallTypes, shllCode,
1896                    hi=True)
1897    # SHRN, SHRN2
    # Shift right narrow: a shift of at least the source width gives 0,
    # otherwise a plain right shift (no rounding, unlike RSHRN).
1898    shrnCode = '''
1899            if (imm >= sizeof(srcElem1) * 8) {
1900                destElem = 0;
1901            } else {
1902                destElem = srcElem1 >> imm;
1903            }
1904    '''
1905    twoRegNarrowInstX("shrn", "ShrnX", "SimdShiftOp", smallUnsignedTypes,
1906                      shrnCode, hasImm=True)
1907    twoRegNarrowInstX("shrn2", "Shrn2X", "SimdShiftOp", smallUnsignedTypes,
1908                      shrnCode, hasImm=True, hi=True)
1909    # SHSUB
    # Halving subtract: mirrors the SHADD code, subtracting the halved
    # operands and then a borrow bit derived from the two low bits.
1910    hsubCode = '''
1911            Element borrowBit =
1912                (((srcElem1 & 0x1) - (srcElem2 & 0x1)) >> 1) & 0x1;
1913            // Use division instead of a shift to ensure the sign extension works
1914            // right. The compiler will figure out if it can be a shift. Mask the
1915            // inputs so they get truncated correctly.
1916            destElem = (((srcElem1 & ~(Element)1) / 2) -
1917                        ((srcElem2 & ~(Element)1) / 2)) - borrowBit;
1918    '''
1919    threeEqualRegInstX("shsub", "ShsubDX", "SimdAddOp", smallSignedTypes, 2,
1920                       hsubCode)
1921    threeEqualRegInstX("shsub", "ShsubQX", "SimdAddOp", smallSignedTypes, 4,
1922                       hsubCode)
1923    # SLI
    # Shift left and insert: the shifted source is ORed with the destination
    # bits selected by mask(imm) (presumably the low imm bits - confirm
    # against the mask() helper). A shift of at least the element width
    # leaves destElem unchanged. The code reads destElem, so True is passed
    # after the code string.
1924    sliCode = '''
1925            if (imm >= sizeof(Element) * 8)
1926                destElem = destElem;
1927            else
1928                destElem = (srcElem1 << imm) | (destElem & mask(imm));
1929    '''
1930    twoEqualRegInstX("sli", "SliDX", "SimdShiftOp", unsignedTypes, 2, sliCode,
1931                     True, hasImm=True)
1932    twoEqualRegInstX("sli", "SliQX", "SimdShiftOp", unsignedTypes, 4, sliCode,
1933                     True, hasImm=True)
1934    # SMAX
    # Signed integer max / min. maxCode and minCode are shared by the vector
    # and pairwise forms; the across-lanes forms use separate code that keeps
    # a running result in destElem.
1935    maxCode = "destElem = (srcElem1 > srcElem2) ? srcElem1 : srcElem2;"
1936    threeEqualRegInstX("smax", "SmaxDX", "SimdCmpOp", smallSignedTypes, 2,
1937                       maxCode)
1938    threeEqualRegInstX("smax", "SmaxQX", "SimdCmpOp", smallSignedTypes, 4,
1939                       maxCode)
1940    # SMAXP
1941    threeEqualRegInstX("smaxp", "SmaxpDX", "SimdCmpOp", smallSignedTypes, 2,
1942                       maxCode, pairwise=True)
1943    threeEqualRegInstX("smaxp", "SmaxpQX", "SimdCmpOp", smallSignedTypes, 4,
1944                       maxCode, pairwise=True)
1945    # SMAXV
    # destElem is seeded when i == 0 and replaced whenever a larger srcElem1
    # is seen. NOTE(review): i is not declared in the snippet; presumably the
    # twoRegAcrossInstX template supplies it as the element index - confirm.
    # The D form is registered for 8- and 16-bit elements only.
1946    maxAcrossCode = '''
1947            if (i == 0 || srcElem1 > destElem)
1948                destElem = srcElem1;
1949    '''
1950    twoRegAcrossInstX("smaxv", "SmaxvDX", "SimdCmpOp", ("int8_t", "int16_t"),
1951                      2, maxAcrossCode)
1952    twoRegAcrossInstX("smaxv", "SmaxvQX", "SimdCmpOp", smallSignedTypes, 4,
1953                      maxAcrossCode)
1954    # SMIN
1955    minCode = "destElem = (srcElem1 < srcElem2) ? srcElem1 : srcElem2;"
1956    threeEqualRegInstX("smin", "SminDX", "SimdCmpOp", smallSignedTypes, 2,
1957                       minCode)
1958    threeEqualRegInstX("smin", "SminQX", "SimdCmpOp", smallSignedTypes, 4,
1959                       minCode)
1960    # SMINP
1961    threeEqualRegInstX("sminp", "SminpDX", "SimdCmpOp", smallSignedTypes, 2,
1962                       minCode, pairwise=True)
1963    threeEqualRegInstX("sminp", "SminpQX", "SimdCmpOp", smallSignedTypes, 4,
1964                       minCode, pairwise=True)
1965    # SMINV
1966    minAcrossCode = '''
1967            if (i == 0 || srcElem1 < destElem)
1968                destElem = srcElem1;
1969    '''
1970    twoRegAcrossInstX("sminv", "SminvDX", "SimdCmpOp", ("int8_t", "int16_t"),
1971                      2, minAcrossCode)
1972    twoRegAcrossInstX("sminv", "SminvQX", "SimdCmpOp", smallSignedTypes, 4,
1973                      minAcrossCode)
1974
    # NOTE(review): split() is defined outside this section; presumably it
    # asks the ISA parser to start a new 'exec' output section here so the
    # generated code is spread across several files - confirm.
1975    split('exec')
1976
1977    # SMLAL, SMLAL2 (by element)
        # Widening multiply-accumulate: both sources are cast to BigElement
        # before the multiply and the product is added to destElem.  The
        # variants passing hi=True are the "2" forms (SmlalElem2X, Smlal2X).
        # NOTE(review): the positional True after the code argument is
        # presumably the "destination is also read" flag, since the snippet
        # uses += -- confirm against threeRegLongInstX.
1978    mlalCode = "destElem += (BigElement)srcElem1 * (BigElement)srcElem2;"
1979    threeRegLongInstX("smlal", "SmlalElemX", "SimdMultAccOp",
1980                      ("int16_t", "int32_t"), mlalCode, True, byElem=True)
1981    threeRegLongInstX("smlal", "SmlalElem2X", "SimdMultAccOp",
1982                      ("int16_t", "int32_t"), mlalCode, True, byElem=True,
1983                      hi=True)
1984    # SMLAL, SMLAL2 (vector)
        # Same mlalCode snippet without byElem, over smallSignedTypes.
1985    threeRegLongInstX("smlal", "SmlalX", "SimdMultAccOp", smallSignedTypes,
1986                      mlalCode, True)
1987    threeRegLongInstX("smlal", "Smlal2X", "SimdMultAccOp", smallSignedTypes,
1988                      mlalCode, True, hi=True)
1989    # SMLSL, SMLSL2 (by element)
        # Widening multiply-subtract: both sources are cast to BigElement,
        # multiplied, and the product is subtracted from destElem.
        # NOTE(review): the by-element forms here are generated over
        # smallSignedTypes, whereas the SMLAL by-element forms pass
        # ("int16_t", "int32_t") -- confirm which list is intended.
1990    mlslCode = "destElem -= (BigElement)srcElem1 * (BigElement)srcElem2;"
1991    threeRegLongInstX("smlsl", "SmlslElemX", "SimdMultAccOp", smallSignedTypes,
1992                      mlslCode, True, byElem=True)
1993    threeRegLongInstX("smlsl", "SmlslElem2X", "SimdMultAccOp",
1994                      smallSignedTypes, mlslCode, True, byElem=True, hi=True)
1995    # SMLSL, SMLSL2 (vector)
        # Same mlslCode snippet without byElem; hi=True marks the "2" form.
1996    threeRegLongInstX("smlsl", "SmlslX", "SimdMultAccOp", smallSignedTypes,
1997                      mlslCode, True)
1998    threeRegLongInstX("smlsl", "Smlsl2X", "SimdMultAccOp", smallSignedTypes,
1999                      mlslCode, True, hi=True)
2000    # SMOV
        # Two generated variants: SmovWX ('W', int8_t and int16_t elements)
        # and SmovXX ('X', all of smallSignedTypes).
        # NOTE(review): the trailing True presumably requests sign extension
        # of the moved element -- confirm against insToGprInstX.
2001    insToGprInstX("smov", "SmovWX", "SimdMiscOp", ("int8_t", "int16_t"), 4,
2002                  'W', True)
2003    insToGprInstX("smov", "SmovXX", "SimdMiscOp", smallSignedTypes, 4, 'X',
2004                  True)
2005    # SMULL, SMULL2 (by element)
        # Widening multiply: both sources are cast to BigElement before the
        # multiply and the product overwrites destElem (no accumulation, and
        # no positional True is passed after the code argument here).
2006    mullCode = "destElem = (BigElement)srcElem1 * (BigElement)srcElem2;"
2007    threeRegLongInstX("smull", "SmullElemX", "SimdMultOp", smallSignedTypes,
2008                      mullCode, byElem=True)
2009    threeRegLongInstX("smull", "SmullElem2X", "SimdMultOp", smallSignedTypes,
2010                      mullCode, byElem=True, hi=True)
2011    # SMULL, SMULL2 (vector)
        # Same mullCode snippet without byElem; hi=True marks the "2" form.
2012    threeRegLongInstX("smull", "SmullX", "SimdMultOp", smallSignedTypes,
2013                      mullCode)
2014    threeRegLongInstX("smull", "Smull2X", "SimdMultOp", smallSignedTypes,
2015                      mullCode, hi=True)
2016    # SQABS
        # Saturating absolute value.  The most negative Element has no
        # positive counterpart, so for that input the snippet sets fpscr.qc
        # and writes ~srcElem1 (all bits inverted, i.e. the largest positive
        # value).  Other negative inputs are negated; non-negative inputs are
        # copied.  FpscrQc is read at the start and written back at the end.
        # The size-2 form covers smallSignedTypes; the size-4 and scalar
        # forms cover signedTypes.
2017    sqabsCode = '''
2018        FPSCR fpscr = (FPSCR) FpscrQc;
2019        if (srcElem1 == (Element)(std::numeric_limits<Element>::min())) {
2020            fpscr.qc = 1;
2021            destElem = ~srcElem1;
2022        } else if (srcElem1 < 0) {
2023            destElem = -srcElem1;
2024        } else {
2025            destElem = srcElem1;
2026        }
2027        FpscrQc = fpscr;
2028    '''
2029    twoEqualRegInstX("sqabs", "SqabsDX", "SimdAluOp", smallSignedTypes, 2,
2030                     sqabsCode)
2031    twoEqualRegInstX("sqabs", "SqabsQX", "SimdAluOp", signedTypes, 4,
2032                     sqabsCode)
2033    twoEqualRegInstX("sqabs", "SqabsScX", "SimdAluOp", signedTypes, 4,
2034                     sqabsCode, scalar=True)
2035    # SQADD
        # Saturating add.  The plain sum is computed first; overflow is
        # detected when both sources share a sign and the sum's sign differs
        # from it.  On overflow destElem is set to the minimum value and, if
        # the wrapped sum was negative (i.e. the true sum was too large), it
        # is decremented by one so that it wraps to the maximum value.
        # fpscr.qc is set on overflow and FpscrQc is always written back.
2036    sqaddCode = '''
2037            destElem = srcElem1 + srcElem2;
2038            FPSCR fpscr = (FPSCR) FpscrQc;
2039            bool negDest = (destElem < 0);
2040            bool negSrc1 = (srcElem1 < 0);
2041            bool negSrc2 = (srcElem2 < 0);
2042            if ((negDest != negSrc1) && (negSrc1 == negSrc2)) {
2043                destElem = std::numeric_limits<Element>::min();
2044                if (negDest)
2045                    destElem -= 1;
2046                fpscr.qc = 1;
2047            }
2048            FpscrQc = fpscr;
2049    '''
2050    threeEqualRegInstX("sqadd", "SqaddDX", "SimdAddOp", smallSignedTypes, 2,
2051                       sqaddCode)
2052    threeEqualRegInstX("sqadd", "SqaddQX", "SimdAddOp", signedTypes, 4,
2053                       sqaddCode)
2054    threeEqualRegInstX("sqadd", "SqaddScX", "SimdAddOp", signedTypes, 4,
2055                       sqaddCode, scalar=True)
2056    # SQDMLAL, SQDMLAL2 (by element)
2057    qdmlalCode = '''
2058        FPSCR fpscr = (FPSCR) FpscrQc;
2059        BigElement midElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
2060        Element maxNeg = std::numeric_limits<Element>::min();
2061        Element halfNeg = maxNeg / 2;
2062        if ((srcElem1 == maxNeg && srcElem2 == maxNeg) ||
2063            (srcElem1 == halfNeg && srcElem2 == maxNeg) ||
2064            (srcElem1 == maxNeg && srcElem2 == halfNeg)) {
2065            midElem = ~((BigElement)maxNeg << (sizeof(Element) * 8));
2066            fpscr.qc = 1;
2067        }
2068        bool negPreDest = ltz(destElem);
2069        destElem += midElem;
2070        bool negDest = ltz(destElem);
2071        bool negMid = ltz(midElem);
2072        if (negPreDest == negMid && negMid != negDest) {
2073            destElem = mask(sizeof(BigElement) * 8 - 1);
2074            if (negPreDest)
2075                destElem = ~destElem;
2076            fpscr.qc = 1;
2077        }
2078        FpscrQc = fpscr;
2079    '''
2080    threeRegLongInstX("sqdmlal", "SqdmlalElemX", "SimdMultAccOp",
2081                      ("int16_t", "int32_t"), qdmlalCode, True, byElem=True)
2082    threeRegLongInstX("sqdmlal", "SqdmlalElem2X", "SimdMultAccOp",
2083                      ("int16_t", "int32_t"), qdmlalCode, True, byElem=True,
2084                      hi=True)
2085    threeRegLongInstX("sqdmlal", "SqdmlalElemScX", "SimdMultAccOp",
2086                      ("int16_t", "int32_t"), qdmlalCode, True, byElem=True,
2087                      scalar=True)
2088    # SQDMLAL, SQDMLAL2 (vector)
2089    threeRegLongInstX("sqdmlal", "SqdmlalX", "SimdMultAccOp",
2090                      ("int16_t", "int32_t"), qdmlalCode, True)
2091    threeRegLongInstX("sqdmlal", "Sqdmlal2X", "SimdMultAccOp",
2092                      ("int16_t", "int32_t"), qdmlalCode, True, hi=True)
2093    threeRegLongInstX("sqdmlal", "SqdmlalScX", "SimdMultAccOp",
2094                      ("int16_t", "int32_t"), qdmlalCode, True, scalar=True)
2095    # SQDMLSL, SQDMLSL2 (by element)
2096    qdmlslCode = '''
2097        FPSCR fpscr = (FPSCR) FpscrQc;
2098        BigElement midElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
2099        Element maxNeg = std::numeric_limits<Element>::min();
2100        Element halfNeg = maxNeg / 2;
2101        if ((srcElem1 == maxNeg && srcElem2 == maxNeg) ||
2102            (srcElem1 == halfNeg && srcElem2 == maxNeg) ||
2103            (srcElem1 == maxNeg && srcElem2 == halfNeg)) {
2104            midElem = ~((BigElement)maxNeg << (sizeof(Element) * 8));
2105            fpscr.qc = 1;
2106        }
2107        bool negPreDest = ltz(destElem);
2108        destElem -= midElem;
2109        bool negDest = ltz(destElem);
2110        bool posMid = ltz((BigElement)-midElem);
2111        if (negPreDest == posMid && posMid != negDest) {
2112            destElem = mask(sizeof(BigElement) * 8 - 1);
2113            if (negPreDest)
2114                destElem = ~destElem;
2115            fpscr.qc = 1;
2116        }
2117        FpscrQc = fpscr;
2118    '''
2119    threeRegLongInstX("sqdmlsl", "SqdmlslElemX", "SimdMultAccOp",
2120                      ("int16_t", "int32_t"), qdmlslCode, True, byElem=True)
2121    threeRegLongInstX("sqdmlsl", "SqdmlslElem2X", "SimdMultAccOp",
2122                      ("int16_t", "int32_t"), qdmlslCode, True, byElem=True,
2123                      hi=True)
2124    threeRegLongInstX("sqdmlsl", "SqdmlslElemScX", "SimdMultAccOp",
2125                      ("int16_t", "int32_t"), qdmlslCode, True, byElem=True,
2126                      scalar=True)
2127    # SQDMLSL, SQDMLSL2 (vector)
2128    threeRegLongInstX("sqdmlsl", "SqdmlslX", "SimdMultAccOp",
2129                      ("int16_t", "int32_t"), qdmlslCode, True)
2130    threeRegLongInstX("sqdmlsl", "Sqdmlsl2X", "SimdMultAccOp",
2131                      ("int16_t", "int32_t"), qdmlslCode, True, hi=True)
2132    threeRegLongInstX("sqdmlsl", "SqdmlslScX", "SimdMultAccOp",
2133                      ("int16_t", "int32_t"), qdmlslCode, True, scalar=True)
2134    # SQDMULH (by element)
        # Saturating doubling multiply returning the high half: the doubled
        # 64-bit product is shifted right by the Element width.  The single
        # special case is both sources being equal to the value with only the
        # top bit set; destElem then becomes ~srcElem1 and fpscr.qc is set.
        # FpscrQc is read at the start and written back at the end.
2135    sqdmulhCode = '''
2136            FPSCR fpscr = (FPSCR) FpscrQc;
2137            destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2) >>
2138                       (sizeof(Element) * 8);
2139            if (srcElem1 == srcElem2 &&
2140                    srcElem1 == (Element)((Element)1 <<
2141                        (sizeof(Element) * 8 - 1))) {
2142                destElem = ~srcElem1;
2143                fpscr.qc = 1;
2144            }
2145            FpscrQc = fpscr;
2146    '''
2147    threeEqualRegInstX("sqdmulh", "SqdmulhElemDX", "SimdMultOp",
2148                       ("int16_t", "int32_t"), 2, sqdmulhCode, byElem=True)
2149    threeEqualRegInstX("sqdmulh", "SqdmulhElemQX", "SimdMultOp",
2150                       ("int16_t", "int32_t"), 4, sqdmulhCode, byElem=True)
2151    threeEqualRegInstX("sqdmulh", "SqdmulhElemScX", "SimdMultOp",
2152                       ("int16_t", "int32_t"), 4, sqdmulhCode, byElem=True,
2153                       scalar=True)
2154    # SQDMULH (vector)
        # Same sqdmulhCode snippet without byElem.
2155    threeEqualRegInstX("sqdmulh", "SqdmulhDX", "SimdMultOp",
2156                       ("int16_t", "int32_t"), 2, sqdmulhCode)
2157    threeEqualRegInstX("sqdmulh", "SqdmulhQX", "SimdMultOp",
2158                       ("int16_t", "int32_t"), 4, sqdmulhCode)
2159    threeEqualRegInstX("sqdmulh", "SqdmulhScX", "SimdMultOp",
2160                       ("int16_t", "int32_t"), 4, sqdmulhCode, scalar=True)
2161    # SQDMULL, SQDMULL2 (by element)
        # Saturating doubling multiply long: destElem is twice the product of
        # the sources.  When both sources equal the value with only the top
        # Element bit set, destElem is instead set to
        # ~((BigElement)srcElem1 << element-width) and fpscr.qc is set.
        # FpscrQc is read at the start and written back at the end.
2162    qdmullCode = '''
2163        FPSCR fpscr = (FPSCR) FpscrQc;
2164        destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
2165        if (srcElem1 == srcElem2 &&
2166                srcElem1 == (Element)((Element)1 <<
2167                    (Element)(sizeof(Element) * 8 - 1))) {
2168            destElem = ~((BigElement)srcElem1 << (sizeof(Element) * 8));
2169            fpscr.qc = 1;
2170        }
2171        FpscrQc = fpscr;
2172    '''
2173    threeRegLongInstX("sqdmull", "SqdmullElemX", "SimdMultOp",
2174                      ("int16_t", "int32_t"), qdmullCode, True, byElem=True)
2175    threeRegLongInstX("sqdmull", "SqdmullElem2X", "SimdMultOp",
2176                      ("int16_t", "int32_t"), qdmullCode, True, byElem=True,
2177                      hi=True)
2178    threeRegLongInstX("sqdmull", "SqdmullElemScX", "SimdMultOp",
2179                      ("int16_t", "int32_t"), qdmullCode, True, byElem=True,
2180                      scalar=True)
2181    # SQDMULL, SQDMULL2 (vector)
        # Same qdmullCode snippet without byElem.
2182    threeRegLongInstX("sqdmull", "SqdmullX", "SimdMultOp",
2183                      ("int16_t", "int32_t"), qdmullCode, True)
2184    threeRegLongInstX("sqdmull", "Sqdmull2X", "SimdMultOp",
2185                      ("int16_t", "int32_t"), qdmullCode, True, hi=True)
2186    threeRegLongInstX("sqdmull", "SqdmullScX", "SimdMultOp",
2187                      ("int16_t", "int32_t"), qdmullCode, True, scalar=True)
2188    # SQNEG
        # Saturating negate.  For the most negative Element value the snippet
        # sets fpscr.qc and writes ~srcElem1 (the largest positive value);
        # every other input is simply negated.  FpscrQc is read at the start
        # and written back at the end.
2189    sqnegCode = '''
2190        FPSCR fpscr = (FPSCR) FpscrQc;
2191        if (srcElem1 == (Element)(std::numeric_limits<Element>::min())) {
2192            fpscr.qc = 1;
2193            destElem = ~srcElem1;
2194        } else {
2195            destElem = -srcElem1;
2196        }
2197        FpscrQc = fpscr;
2198    '''
2199    twoEqualRegInstX("sqneg", "SqnegDX", "SimdAluOp", smallSignedTypes, 2,
2200                     sqnegCode)
2201    twoEqualRegInstX("sqneg", "SqnegQX", "SimdAluOp", signedTypes, 4,
2202                     sqnegCode)
2203    twoEqualRegInstX("sqneg", "SqnegScX", "SimdAluOp", signedTypes, 4,
2204                     sqnegCode, scalar=True)
2205    # SQRDMULH (by element)
2206    sqrdmulhCode = '''
2207            FPSCR fpscr = (FPSCR) FpscrQc;
2208            destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2 +
2209                        ((int64_t)1 << (sizeof(Element) * 8 - 1))) >>
2210                       (sizeof(Element) * 8);
2211            Element maxNeg = std::numeric_limits<Element>::min();
2212            Element halfNeg = maxNeg / 2;
2213            if ((srcElem1 == maxNeg && srcElem2 == maxNeg) ||
2214                (srcElem1 == halfNeg && srcElem2 == maxNeg) ||
2215                (srcElem1 == maxNeg && srcElem2 == halfNeg)) {
2216                if (destElem < 0) {
2217                    destElem = mask(sizeof(Element) * 8 - 1);
2218                } else {
2219                    destElem = std::numeric_limits<Element>::min();
2220                }
2221                fpscr.qc = 1;
2222            }
2223            FpscrQc = fpscr;
2224    '''
2225    threeEqualRegInstX("sqrdmulh", "SqrdmulhElemDX", "SimdMultOp",
2226                       ("int16_t", "int32_t"), 2, sqrdmulhCode, byElem=True)
2227    threeEqualRegInstX("sqrdmulh", "SqrdmulhElemQX", "SimdMultOp",
2228                       ("int16_t", "int32_t"), 4, sqrdmulhCode, byElem=True)
2229    threeEqualRegInstX("sqrdmulh", "SqrdmulhElemScX", "SimdMultOp",
2230                       ("int16_t", "int32_t"), 4, sqrdmulhCode, byElem=True,
2231                       scalar=True)
2232    # SQRDMULH (vector)
2233    threeEqualRegInstX("sqrdmulh", "SqrdmulhDX", "SimdMultOp",
2234                       ("int16_t", "int32_t"), 2, sqrdmulhCode)
2235    threeEqualRegInstX("sqrdmulh", "SqrdmulhQX", "SimdMultOp",
2236                       ("int16_t", "int32_t"), 4, sqrdmulhCode)
2237    threeEqualRegInstX("sqrdmulh", "SqrdmulhScX", "SimdMultOp",
2238                       ("int16_t", "int32_t"), 4, sqrdmulhCode, scalar=True)
2239    # SQRSHL
        # Saturating rounding shift by a per-element register amount.  The
        # shift amount is the low byte of srcElem2 interpreted as signed.
        #   * Negative amount: rounding right shift.  rBit is the last bit
        #     shifted out (or 1 when the shift exceeds the element width and
        #     the source is negative); the shift is clamped for amounts of
        #     at least the element width, the sign bits are re-inserted by
        #     hand, and rBit is added.
        #   * Positive amount: left shift that saturates (towards the sign of
        #     the source) and sets fpscr.qc if any significant bit would be
        #     shifted out.
        #   * Zero: the source is copied.
        # FpscrQc is read at the start and written back at the end.
2240    sqrshlCode = '''
2241            int16_t shiftAmt = (int8_t)srcElem2;
2242            FPSCR fpscr = (FPSCR) FpscrQc;
2243            if (shiftAmt < 0) {
2244                shiftAmt = -shiftAmt;
2245                Element rBit = 0;
2246                if (shiftAmt <= sizeof(Element) * 8)
2247                    rBit = bits(srcElem1, shiftAmt - 1);
2248                if (shiftAmt > sizeof(Element) * 8 && srcElem1 < 0)
2249                    rBit = 1;
2250                if (shiftAmt >= sizeof(Element) * 8) {
2251                    shiftAmt = sizeof(Element) * 8 - 1;
2252                    destElem = 0;
2253                } else {
2254                    destElem = (srcElem1 >> shiftAmt);
2255                }
2256                // Make sure the right shift sign extended when it should.
2257                if (srcElem1 < 0 && destElem >= 0) {
2258                    destElem |= -((Element)1 << (sizeof(Element) * 8 -
2259                                                 1 - shiftAmt));
2260                }
2261                destElem += rBit;
2262            } else if (shiftAmt > 0) {
2263                bool sat = false;
2264                if (shiftAmt >= sizeof(Element) * 8) {
2265                    if (srcElem1 != 0)
2266                        sat = true;
2267                    else
2268                        destElem = 0;
2269                } else {
2270                    if (bits((uint64_t) srcElem1, sizeof(Element) * 8 - 1,
2271                                sizeof(Element) * 8 - 1 - shiftAmt) !=
2272                            ((srcElem1 < 0) ? mask(shiftAmt + 1) : 0)) {
2273                        sat = true;
2274                    } else {
2275                        destElem = srcElem1 << shiftAmt;
2276                    }
2277                }
2278                if (sat) {
2279                    fpscr.qc = 1;
2280                    destElem = mask(sizeof(Element) * 8 - 1);
2281                    if (srcElem1 < 0)
2282                        destElem = ~destElem;
2283                }
2284            } else {
2285                destElem = srcElem1;
2286            }
2287            FpscrQc = fpscr;
2288    '''
2289    threeEqualRegInstX("sqrshl", "SqrshlDX", "SimdCmpOp", smallSignedTypes, 2,
2290                       sqrshlCode)
2291    threeEqualRegInstX("sqrshl", "SqrshlQX", "SimdCmpOp", signedTypes, 4,
2292                       sqrshlCode)
2293    threeEqualRegInstX("sqrshl", "SqrshlScX", "SimdCmpOp", signedTypes, 4,
2294                       sqrshlCode, scalar=True)
2295    # SQRSHRN, SQRSHRN2
        # Saturating rounding shift right by an immediate, narrowing.
        #   * imm larger than the source width: result is 0; fpscr.qc is set
        #     unless the source is 0 or -1.
        #   * imm non-zero: the source is shifted in two steps, the last bit
        #     shifted out (rBit) is added back for rounding, and the sign is
        #     re-extended by hand.  If the value does not survive a round
        #     trip through Element it is clamped towards the source's sign
        #     and fpscr.qc is set.
        #   * imm zero: the source is narrowed with the same clamp check.
        # FpscrQc is read at the start and written back at the end.
2296    sqrshrnCode = '''
2297            FPSCR fpscr = (FPSCR) FpscrQc;
2298            if (imm > sizeof(srcElem1) * 8) {
2299                if (srcElem1 != 0 && srcElem1 != -1)
2300                    fpscr.qc = 1;
2301                destElem = 0;
2302            } else if (imm) {
2303                BigElement mid = (srcElem1 >> (imm - 1));
2304                uint64_t rBit = mid & 0x1;
2305                mid >>= 1;
2306                mid |= -(mid & ((BigElement)1 <<
2307                            (sizeof(BigElement) * 8 - 1 - imm)));
2308                mid += rBit;
2309                if (mid != (Element)mid) {
2310                    destElem = mask(sizeof(Element) * 8 - 1);
2311                    if (srcElem1 < 0)
2312                        destElem = ~destElem;
2313                    fpscr.qc = 1;
2314                } else {
2315                    destElem = mid;
2316                }
2317            } else {
2318                if (srcElem1 != (Element)srcElem1) {
2319                    destElem = mask(sizeof(Element) * 8 - 1);
2320                    if (srcElem1 < 0)
2321                        destElem = ~destElem;
2322                    fpscr.qc = 1;
2323                } else {
2324                    destElem = srcElem1;
2325                }
2326            }
2327            FpscrQc = fpscr;
2328    '''
2329    twoRegNarrowInstX("sqrshrn", "SqrshrnX", "SimdShiftOp", smallSignedTypes,
2330                      sqrshrnCode, hasImm=True)
2331    twoRegNarrowInstX("sqrshrn2", "Sqrshrn2X", "SimdShiftOp", smallSignedTypes,
2332                      sqrshrnCode, hasImm=True, hi=True)
2333    twoRegNarrowInstX("sqrshrn", "SqrshrnScX", "SimdShiftOp", smallSignedTypes,
2334                      sqrshrnCode, hasImm=True, scalar=True)
2335    # SQRSHRUN, SQRSHRUN2
        # Rounding shift right by an immediate, narrowing, with the result
        # clamped to an unsigned range.
        #   * imm larger than the source width: result is 0; fpscr.qc is set
        #     when the source is non-zero.
        #   * imm non-zero: same two-step rounding shift as sqrshrnCode.  If
        #     any bit above the Element width is set in the shifted value,
        #     the result is clamped to 0 for a negative source or to all ones
        #     otherwise, and fpscr.qc is set.
        #   * imm zero: a negative source clamps to 0 with fpscr.qc set;
        #     otherwise the source is copied.
        # FpscrQc is read at the start and written back at the end.
2336    sqrshrunCode = '''
2337            FPSCR fpscr = (FPSCR) FpscrQc;
2338            if (imm > sizeof(srcElem1) * 8) {
2339                if (srcElem1 != 0)
2340                    fpscr.qc = 1;
2341                destElem = 0;
2342            } else if (imm) {
2343                BigElement mid = (srcElem1 >> (imm - 1));
2344                uint64_t rBit = mid & 0x1;
2345                mid >>= 1;
2346                mid |= -(mid & ((BigElement)1 <<
2347                                (sizeof(BigElement) * 8 - 1 - imm)));
2348                mid += rBit;
2349                if (bits(mid, sizeof(BigElement) * 8 - 1,
2350                              sizeof(Element) * 8) != 0) {
2351                    if (srcElem1 < 0) {
2352                        destElem = 0;
2353                    } else {
2354                        destElem = mask(sizeof(Element) * 8);
2355                    }
2356                    fpscr.qc = 1;
2357                } else {
2358                    destElem = mid;
2359                }
2360            } else {
2361                if (srcElem1 < 0) {
2362                    fpscr.qc = 1;
2363                    destElem = 0;
2364                } else {
2365                    destElem = srcElem1;
2366                }
2367            }
2368            FpscrQc = fpscr;
2369    '''
2370    twoRegNarrowInstX("sqrshrun", "SqrshrunX", "SimdShiftOp", smallSignedTypes,
2371                      sqrshrunCode, hasImm=True)
2372    twoRegNarrowInstX("sqrshrun", "Sqrshrun2X", "SimdShiftOp",
2373                      smallSignedTypes, sqrshrunCode, hasImm=True, hi=True)
2374    twoRegNarrowInstX("sqrshrun", "SqrshrunScX", "SimdShiftOp",
2375                      smallSignedTypes, sqrshrunCode, hasImm=True, scalar=True)
2376    # SQSHL (immediate)
        # Saturating left shift by an immediate.
        #   * imm at least the element width: any non-zero source saturates
        #     (to the minimum for negative, to ~minimum for positive sources)
        #     and sets fpscr.qc; a zero source gives 0.
        #   * imm non-zero: the shifted value is kept only if the top imm + 1
        #     bits of the source are all zeros or all ones; otherwise the
        #     result saturates as above and fpscr.qc is set.
        #   * imm zero: the source is copied.
        # FpscrQc is read at the start and written back at the end.
2377    sqshlImmCode = '''
2378            FPSCR fpscr = (FPSCR) FpscrQc;
2379            if (imm >= sizeof(Element) * 8) {
2380                if (srcElem1 != 0) {
2381                    destElem = std::numeric_limits<Element>::min();
2382                    if (srcElem1 > 0)
2383                        destElem = ~destElem;
2384                    fpscr.qc = 1;
2385                } else {
2386                    destElem = 0;
2387                }
2388            } else if (imm) {
2389                destElem = (srcElem1 << imm);
2390                uint64_t topBits = bits((uint64_t)srcElem1,
2391                                        sizeof(Element) * 8 - 1,
2392                                        sizeof(Element) * 8 - 1 - imm);
2393                if (topBits != 0 && topBits != mask(imm + 1)) {
2394                    destElem = std::numeric_limits<Element>::min();
2395                    if (srcElem1 > 0)
2396                        destElem = ~destElem;
2397                    fpscr.qc = 1;
2398                }
2399            } else {
2400                destElem = srcElem1;
2401            }
2402            FpscrQc = fpscr;
2403    '''
2404    twoEqualRegInstX("sqshl", "SqshlImmDX", "SimdAluOp", smallSignedTypes, 2,
2405                     sqshlImmCode, hasImm=True)
2406    twoEqualRegInstX("sqshl", "SqshlImmQX", "SimdAluOp", signedTypes, 4,
2407                     sqshlImmCode, hasImm=True)
2408    twoEqualRegInstX("sqshl", "SqshlImmScX", "SimdAluOp", signedTypes, 4,
2409                     sqshlImmCode, hasImm=True, scalar=True)
2410    # SQSHL (register)
        # Saturating shift by a per-element register amount.  The shift
        # amount is the low byte of srcElem2 interpreted as signed.
        #   * Negative amount: plain (non-rounding) right shift, clamped for
        #     amounts of at least the element width, with the sign bits
        #     re-inserted by hand.
        #   * Positive amount: left shift that saturates (towards the sign of
        #     the source) and sets fpscr.qc if any significant bit would be
        #     shifted out.
        #   * Zero: the source is copied.
        # FpscrQc is read at the start and written back at the end.
2411    sqshlCode = '''
2412            int16_t shiftAmt = (int8_t)srcElem2;
2413            FPSCR fpscr = (FPSCR) FpscrQc;
2414            if (shiftAmt < 0) {
2415                shiftAmt = -shiftAmt;
2416                if (shiftAmt >= sizeof(Element) * 8) {
2417                    shiftAmt = sizeof(Element) * 8 - 1;
2418                    destElem = 0;
2419                } else {
2420                    destElem = (srcElem1 >> shiftAmt);
2421                }
2422                // Make sure the right shift sign extended when it should.
2423                if (srcElem1 < 0 && destElem >= 0) {
2424                    destElem |= -((Element)1 << (sizeof(Element) * 8 -
2425                                                 1 - shiftAmt));
2426                }
2427            } else if (shiftAmt > 0) {
2428                bool sat = false;
2429                if (shiftAmt >= sizeof(Element) * 8) {
2430                    if (srcElem1 != 0)
2431                        sat = true;
2432                    else
2433                        destElem = 0;
2434                } else {
2435                    if (bits((uint64_t) srcElem1, sizeof(Element) * 8 - 1,
2436                                sizeof(Element) * 8 - 1 - shiftAmt) !=
2437                            ((srcElem1 < 0) ? mask(shiftAmt + 1) : 0)) {
2438                        sat = true;
2439                    } else {
2440                        destElem = srcElem1 << shiftAmt;
2441                    }
2442                }
2443                if (sat) {
2444                    fpscr.qc = 1;
2445                    destElem = mask(sizeof(Element) * 8 - 1);
2446                    if (srcElem1 < 0)
2447                        destElem = ~destElem;
2448                }
2449            } else {
2450                destElem = srcElem1;
2451            }
2452            FpscrQc = fpscr;
2453    '''
2454    threeEqualRegInstX("sqshl", "SqshlDX", "SimdAluOp", smallSignedTypes, 2,
2455                       sqshlCode)
2456    threeEqualRegInstX("sqshl", "SqshlQX", "SimdAluOp", signedTypes, 4,
2457                       sqshlCode)
2458    threeEqualRegInstX("sqshl", "SqshlScX", "SimdAluOp", signedTypes, 4,
2459                       sqshlCode, scalar=True)
2460    # SQSHLU
        # Left shift of a signed source by an immediate with the result
        # clamped to an unsigned range.  In every branch a negative source
        # yields 0 with fpscr.qc set.  For non-negative sources:
        #   * imm at least the element width: a positive source yields all
        #     ones with fpscr.qc set; zero yields 0.
        #   * imm non-zero: the shifted value is kept unless any of the top
        #     imm source bits is set, in which case the result is all ones
        #     and fpscr.qc is set.
        #   * imm zero: the source is copied.
        # FpscrQc is read at the start and written back at the end.
2461    sqshluCode = '''
2462            FPSCR fpscr = (FPSCR) FpscrQc;
2463            if (imm >= sizeof(Element) * 8) {
2464                if (srcElem1 < 0) {
2465                    destElem = 0;
2466                    fpscr.qc = 1;
2467                } else if (srcElem1 > 0) {
2468                    destElem = mask(sizeof(Element) * 8);
2469                    fpscr.qc = 1;
2470                } else {
2471                    destElem = 0;
2472                }
2473            } else if (imm) {
2474                destElem = (srcElem1 << imm);
2475                uint64_t topBits = bits((uint64_t)srcElem1,
2476                                        sizeof(Element) * 8 - 1,
2477                                        sizeof(Element) * 8 - imm);
2478                if (srcElem1 < 0) {
2479                    destElem = 0;
2480                    fpscr.qc = 1;
2481                } else if (topBits != 0) {
2482                    destElem = mask(sizeof(Element) * 8);
2483                    fpscr.qc = 1;
2484                }
2485            } else {
2486                if (srcElem1 < 0) {
2487                    fpscr.qc = 1;
2488                    destElem = 0;
2489                } else {
2490                    destElem = srcElem1;
2491                }
2492            }
2493            FpscrQc = fpscr;
2494    '''
2495    twoEqualRegInstX("sqshlu", "SqshluDX", "SimdAluOp", smallSignedTypes, 2,
2496                     sqshluCode, hasImm=True)
2497    twoEqualRegInstX("sqshlu", "SqshluQX", "SimdAluOp", signedTypes, 4,
2498                     sqshluCode, hasImm=True)
2499    twoEqualRegInstX("sqshlu", "SqshluScX", "SimdAluOp", signedTypes, 4,
2500                     sqshluCode, hasImm=True, scalar=True)
2501    # SQSHRN, SQSHRN2
        # Saturating (non-rounding) shift right by an immediate, narrowing.
        #   * imm larger than the source width: result is 0; fpscr.qc is set
        #     unless the source is 0 or -1.
        #   * imm non-zero: the source is shifted in two steps and the sign
        #     is re-extended by hand.  If the value does not survive a round
        #     trip through Element it is clamped towards the source's sign
        #     and fpscr.qc is set.
        #   * imm zero: the source is assigned to destElem directly.
        # NOTE(review): unlike sqrshrnCode, the imm == 0 branch performs no
        # saturation check -- confirm imm == 0 cannot be encoded.
        # FpscrQc is read at the start and written back at the end.
2502    sqshrnCode = '''
2503        FPSCR fpscr = (FPSCR) FpscrQc;
2504        if (imm > sizeof(srcElem1) * 8) {
2505            if (srcElem1 != 0 && srcElem1 != -1)
2506                fpscr.qc = 1;
2507            destElem = 0;
2508        } else if (imm) {
2509            BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
2510            mid |= -(mid & ((BigElement)1 <<
2511                        (sizeof(BigElement) * 8 - 1 - imm)));
2512            if (mid != (Element)mid) {
2513                destElem = mask(sizeof(Element) * 8 - 1);
2514                if (srcElem1 < 0)
2515                    destElem = ~destElem;
2516                fpscr.qc = 1;
2517            } else {
2518                destElem = mid;
2519            }
2520        } else {
2521            destElem = srcElem1;
2522        }
2523        FpscrQc = fpscr;
2524    '''
2525    twoRegNarrowInstX("sqshrn", "SqshrnX", "SimdShiftOp", smallSignedTypes,
2526                      sqshrnCode, hasImm=True)
2527    twoRegNarrowInstX("sqshrn2", "Sqshrn2X", "SimdShiftOp", smallSignedTypes,
2528                      sqshrnCode, hasImm=True, hi=True)
2529    twoRegNarrowInstX("sqshrn", "SqshrnScX", "SimdShiftOp", smallSignedTypes,
2530                      sqshrnCode, hasImm=True, scalar=True)
2531    # SQSHRUN, SQSHRUN2
        # Non-rounding shift right by an immediate, narrowing, with the
        # result clamped to an unsigned range.
        #   * imm larger than the source width: result is 0; fpscr.qc is set
        #     when the source is non-zero.
        #   * imm non-zero: the source is shifted in two steps.  If any bit
        #     above the Element width is set in the shifted value, the result
        #     is clamped to 0 for a negative source or to all ones otherwise,
        #     and fpscr.qc is set.
        #   * imm zero: the source is assigned to destElem directly.
        # FpscrQc is read at the start and written back at the end.
2532    sqshrunCode = '''
2533            FPSCR fpscr = (FPSCR) FpscrQc;
2534            if (imm > sizeof(srcElem1) * 8) {
2535                if (srcElem1 != 0)
2536                    fpscr.qc = 1;
2537                destElem = 0;
2538            } else if (imm) {
2539                BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
2540                if (bits(mid, sizeof(BigElement) * 8 - 1,
2541                              sizeof(Element) * 8) != 0) {
2542                    if (srcElem1 < 0) {
2543                        destElem = 0;
2544                    } else {
2545                        destElem = mask(sizeof(Element) * 8);
2546                    }
2547                    fpscr.qc = 1;
2548                } else {
2549                    destElem = mid;
2550                }
2551            } else {
2552                destElem = srcElem1;
2553            }
2554            FpscrQc = fpscr;
2555    '''
2556    twoRegNarrowInstX("sqshrun", "SqshrunX", "SimdShiftOp", smallSignedTypes,
2557                      sqshrunCode, hasImm=True)
2558    twoRegNarrowInstX("sqshrun", "Sqshrun2X", "SimdShiftOp", smallSignedTypes,
2559                      sqshrunCode, hasImm=True, hi=True)
2560    twoRegNarrowInstX("sqshrun", "SqshrunScX", "SimdShiftOp", smallSignedTypes,
2561                      sqshrunCode, hasImm=True, scalar=True)
2562    # SQSUB
        # Saturating subtract.  The plain difference is computed first;
        # overflow is detected when srcElem1 is negative exactly when
        # srcElem2 is non-negative (i.e. the operands have opposite signs)
        # and the difference's sign differs from srcElem1's.  On overflow
        # destElem is set to the minimum value and, if the wrapped difference
        # was negative, decremented by one so that it wraps to the maximum.
        # fpscr.qc is set on overflow and FpscrQc is always written back.
2563    sqsubCode = '''
2564            destElem = srcElem1 - srcElem2;
2565            FPSCR fpscr = (FPSCR) FpscrQc;
2566            bool negDest = (destElem < 0);
2567            bool negSrc1 = (srcElem1 < 0);
2568            bool posSrc2 = (srcElem2 >= 0);
2569            if ((negDest != negSrc1) && (negSrc1 == posSrc2)) {
2570                destElem = std::numeric_limits<Element>::min();
2571                if (negDest)
2572                    destElem -= 1;
2573                fpscr.qc = 1;
2574            }
2575            FpscrQc = fpscr;
2576    '''
2577    threeEqualRegInstX("sqsub", "SqsubDX", "SimdAddOp", smallSignedTypes, 2,
2578                       sqsubCode)
2579    threeEqualRegInstX("sqsub", "SqsubQX", "SimdAddOp", signedTypes, 4,
2580                       sqsubCode)
2581    threeEqualRegInstX("sqsub", "SqsubScX", "SimdAddOp", signedTypes, 4,
2582                       sqsubCode, scalar=True)
2583    # SQXTN, SQXTN2
        # Saturating narrow.  The source is assigned to the narrower
        # destElem; if widening destElem back to BigElement does not
        # reproduce the source, fpscr.qc is set and destElem is clamped to
        # the largest positive value, or to its bitwise complement for a
        # negative source.  FpscrQc is always written back.
2584    sqxtnCode = '''
2585            FPSCR fpscr = (FPSCR) FpscrQc;
2586            destElem = srcElem1;
2587            if ((BigElement)destElem != srcElem1) {
2588                fpscr.qc = 1;
2589                destElem = mask(sizeof(Element) * 8 - 1);
2590                if (srcElem1 < 0)
2591                    destElem = ~destElem;
2592            }
2593            FpscrQc = fpscr;
2594    '''
2595    twoRegNarrowInstX("sqxtn", "SqxtnX", "SimdMiscOp", smallSignedTypes,
2596                      sqxtnCode)
2597    twoRegNarrowInstX("sqxtn", "Sqxtn2X", "SimdMiscOp", smallSignedTypes,
2598                      sqxtnCode, hi=True)
2599    twoRegNarrowInstX("sqxtn", "SqxtnScX", "SimdMiscOp", smallSignedTypes,
2600                      sqxtnCode, scalar=True)
2601    # SQXTUN, SQXTUN2
        # Narrow a signed source with the result clamped to an unsigned
        # range.  The source is assigned to destElem; if the source is
        # negative, or the low Element-width bits of destElem do not
        # reproduce the source, fpscr.qc is set and destElem becomes all ones
        # (inverted to 0 when the source is negative).  FpscrQc is always
        # written back.
2602    sqxtunCode = '''
2603            FPSCR fpscr = (FPSCR) FpscrQc;
2604            destElem = srcElem1;
2605            if (srcElem1 < 0 ||
2606                    ((BigElement)destElem & mask(sizeof(Element) * 8)) != srcElem1) {
2607                fpscr.qc = 1;
2608                destElem = mask(sizeof(Element) * 8);
2609                if (srcElem1 < 0)
2610                    destElem = ~destElem;
2611            }
2612            FpscrQc = fpscr;
2613    '''
2614    twoRegNarrowInstX("sqxtun", "SqxtunX", "SimdMiscOp", smallSignedTypes,
2615                      sqxtunCode)
2616    twoRegNarrowInstX("sqxtun", "Sqxtun2X", "SimdMiscOp", smallSignedTypes,
2617                      sqxtunCode, hi=True)
2618    twoRegNarrowInstX("sqxtun", "SqxtunScX", "SimdMiscOp", smallSignedTypes,
2619                      sqxtunCode, scalar=True)
2620    # SRHADD
        # Rounding halving add, computed without forming the full sum: each
        # source has its low bit cleared and is halved separately, and
        # carryBit -- ((low bit of src1) + (low bit of src2) + 1) >> 1 -- is
        # added to account for the dropped low bits plus rounding.
2621    rhaddCode = '''
2622            Element carryBit =
2623                (((unsigned)srcElem1 & 0x1) +
2624                 ((unsigned)srcElem2 & 0x1) + 1) >> 1;
2625            // Use division instead of a shift to ensure the sign extension works
2626            // right. The compiler will figure out if it can be a shift. Mask the
2627            // inputs so they get truncated correctly.
2628            destElem = (((srcElem1 & ~(Element)1) / 2) +
2629                        ((srcElem2 & ~(Element)1) / 2)) + carryBit;
2630    '''
2631    threeEqualRegInstX("srhadd", "SrhaddDX", "SimdAddOp", smallSignedTypes, 2,
2632                       rhaddCode)
2633    threeEqualRegInstX("srhadd", "SrhaddQX", "SimdAddOp", smallSignedTypes, 4,
2634                       rhaddCode)
2635    # SRI
        # Shift right and insert.  When imm is at least the element width the
        # destination is left unchanged (self-assignment).  Otherwise the
        # source shifted right by imm is OR-ed with the destination bits
        # selected by ~mask(width - imm), i.e. the destination bits above the
        # inserted field are preserved.  Generated over unsignedTypes.
        # NOTE(review): the positional True after the code argument is
        # presumably the "destination is also read" flag -- confirm against
        # twoEqualRegInstX.
2636    sriCode = '''
2637            if (imm >= sizeof(Element) * 8)
2638                destElem = destElem;
2639            else
2640                destElem = (srcElem1 >> imm) |
2641                    (destElem & ~mask(sizeof(Element) * 8 - imm));
2642    '''
2643    twoEqualRegInstX("sri", "SriDX", "SimdShiftOp", unsignedTypes, 2, sriCode,
2644                     True, hasImm=True)
2645    twoEqualRegInstX("sri", "SriQX", "SimdShiftOp", unsignedTypes, 4, sriCode,
2646                     True, hasImm=True)
2647    # SRSHL
        # Rounding shift by a per-element register amount (no saturation and
        # no FpscrQc access).  The shift amount is the low byte of srcElem2
        # interpreted as signed.
        #   * Negative amount: rounding right shift.  rBit is the last bit
        #     shifted out (or 1 when the shift exceeds the element width and
        #     ltz(srcElem1) holds); the shift is clamped for amounts of at
        #     least the element width, the sign bits are re-inserted by hand,
        #     and rBit is added.
        #   * Positive amount: plain left shift, or 0 when the amount is at
        #     least the element width.
        #   * Zero: the source is copied.
        # Sign tests go through ltz() rather than a direct `< 0` comparison.
2648    rshlCode = '''
2649            int16_t shiftAmt = (int8_t)srcElem2;
2650            if (shiftAmt < 0) {
2651                shiftAmt = -shiftAmt;
2652                Element rBit = 0;
2653                if (shiftAmt <= sizeof(Element) * 8)
2654                    rBit = bits(srcElem1, shiftAmt - 1);
2655                if (shiftAmt > sizeof(Element) * 8 && ltz(srcElem1))
2656                    rBit = 1;
2657                if (shiftAmt >= sizeof(Element) * 8) {
2658                    shiftAmt = sizeof(Element) * 8 - 1;
2659                    destElem = 0;
2660                } else {
2661                    destElem = (srcElem1 >> shiftAmt);
2662                }
2663                // Make sure the right shift sign extended when it should.
2664                if (ltz(srcElem1) && !ltz(destElem)) {
2665                    destElem |= -((Element)1 << (sizeof(Element) * 8 -
2666                                                 1 - shiftAmt));
2667                }
2668                destElem += rBit;
2669            } else if (shiftAmt > 0) {
2670                if (shiftAmt >= sizeof(Element) * 8) {
2671                    destElem = 0;
2672                } else {
2673                    destElem = srcElem1 << shiftAmt;
2674                }
2675            } else {
2676                destElem = srcElem1;
2677            }
2678    '''
2679    threeEqualRegInstX("srshl", "SrshlDX", "SimdShiftOp", signedTypes, 2,
2680                       rshlCode)
2681    threeEqualRegInstX("srshl", "SrshlQX", "SimdShiftOp", signedTypes, 4,
2682                       rshlCode)
2683    # SRSHR: rshrCode shifts right by imm (as imm - 1, then 1) and adds bit (imm - 1) of the source as rounding; imm above the element width gives 0, imm == 0 copies srcElem1.
2684    rshrCode = '''
2685            if (imm > sizeof(srcElem1) * 8) {
2686                destElem = 0;
2687            } else if (imm) {
2688                Element rBit = bits(srcElem1, imm - 1);
2689                destElem = ((srcElem1 >> (imm - 1)) >> 1) + rBit;
2690            } else {
2691                destElem = srcElem1;
2692            }
2693    '''
2694    twoEqualRegInstX("srshr", "SrshrDX", "SimdShiftOp", signedTypes, 2,
2695                     rshrCode, hasImm=True)
2696    twoEqualRegInstX("srshr", "SrshrQX", "SimdShiftOp", signedTypes, 4,
2697                     rshrCode, hasImm=True)
2698    # SRSRA: rsraCode computes the same rounded right shift as rshrCode but adds the result to destElem instead of overwriting it.
2699    rsraCode = '''
2700            if (imm > sizeof(srcElem1) * 8) {
2701                destElem += 0;
2702            } else if (imm) {
2703                Element rBit = bits(srcElem1, imm - 1);
2704                destElem += ((srcElem1 >> (imm - 1)) >> 1) + rBit;
2705            } else {
2706                destElem += srcElem1;
2707            }
2708    '''
2709    twoEqualRegInstX("srsra", "SrsraDX", "SimdShiftOp", signedTypes, 2,
2710                     rsraCode, True, hasImm=True)
2711    twoEqualRegInstX("srsra", "SrsraQX", "SimdShiftOp", signedTypes, 4,
2712                     rsraCode, True, hasImm=True)
2713    # SSHL: shlCode shifts by the signed low byte of srcElem2: negative amounts shift right (with an explicit sign-extension fix-up), others shift left, giving 0 once the amount reaches the element width.
2714    shlCode = '''
2715            int16_t shiftAmt = (int8_t)srcElem2;
2716            if (shiftAmt < 0) {
2717                shiftAmt = -shiftAmt;
2718                if (shiftAmt >= sizeof(Element) * 8) {
2719                    shiftAmt = sizeof(Element) * 8 - 1;
2720                    destElem = 0;
2721                } else {
2722                    destElem = (srcElem1 >> shiftAmt);
2723                }
2724                // Make sure the right shift sign extended when it should.
2725                if (ltz(srcElem1) && !ltz(destElem)) {
2726                    destElem |= -((Element)1 << (sizeof(Element) * 8 -
2727                                                 1 - shiftAmt));
2728                }
2729            } else {
2730                if (shiftAmt >= sizeof(Element) * 8) {
2731                    destElem = 0;
2732                } else {
2733                    destElem = srcElem1 << shiftAmt;
2734                }
2735            }
2736    '''
2737    threeEqualRegInstX("sshl", "SshlDX", "SimdShiftOp", signedTypes, 2,
2738                       shlCode)
2739    threeEqualRegInstX("sshl", "SshlQX", "SimdShiftOp", signedTypes, 4,
2740                       shlCode)
2741    # SSHLL, SSHLL2: shllCode casts srcElem1 to BigElement before shifting left by imm; imm >= the destElem width gives 0. The "2" form passes hi=True.
2742    shllCode = '''
2743            if (imm >= sizeof(destElem) * 8) {
2744                destElem = 0;
2745            } else {
2746                destElem = (BigElement)srcElem1 << imm;
2747            }
2748    '''
2749    twoRegLongInstX("sshll", "SshllX", "SimdShiftOp", smallSignedTypes,
2750                    shllCode, hasImm=True)
2751    twoRegLongInstX("sshll", "Sshll2X", "SimdShiftOp", smallSignedTypes,
2752                    shllCode, hasImm=True, hi=True)
2753    # SSHR: shrCode shifts right by imm; for imm >= element width the result is -1 when ltz(srcElem1) holds and 0 otherwise.
2754    shrCode = '''
2755            if (imm >= sizeof(srcElem1) * 8) {
2756                if (ltz(srcElem1))
2757                    destElem = -1;
2758                else
2759                    destElem = 0;
2760            } else {
2761                destElem = srcElem1 >> imm;
2762            }
2763    '''
2764    twoEqualRegInstX("sshr", "SshrDX", "SimdShiftOp", signedTypes, 2, shrCode,
2765                     hasImm=True)
2766    twoEqualRegInstX("sshr", "SshrQX", "SimdShiftOp", signedTypes, 4, shrCode,
2767                     hasImm=True)
2768    # SSRA
2769    sraCode = '''
2770            Element mid;;
2771            if (imm >= sizeof(srcElem1) * 8) {
2772                mid = ltz(srcElem1) ? -1 : 0;
2773            } else {
2774                mid = srcElem1 >> imm;
2775                if (ltz(srcElem1) && !ltz(mid)) {
2776                    mid |= -(mid & ((Element)1 <<
2777                                    (sizeof(Element) * 8 - 1 - imm)));
2778                }
2779            }
2780            destElem += mid;
2781    '''
2782    twoEqualRegInstX("ssra", "SsraDX", "SimdShiftOp", signedTypes, 2, sraCode,
2783                     True, hasImm=True)
2784    twoEqualRegInstX("ssra", "SsraQX", "SimdShiftOp", signedTypes, 4, sraCode,
2785                     True, hasImm=True)
2786    # SSUBL: sublwCode casts both operands to BigElement before subtracting; the "2" form passes hi=True.
2787    sublwCode = "destElem = (BigElement)srcElem1 - (BigElement)srcElem2;"
2788    threeRegLongInstX("ssubl", "SsublX", "SimdAddOp", smallSignedTypes,
2789                      sublwCode)
2790    threeRegLongInstX("ssubl2", "Ssubl2X", "SimdAddOp", smallSignedTypes,
2791                      sublwCode, hi=True)
2792    # SSUBW: reuses sublwCode, registered through threeRegWideInstX instead of threeRegLongInstX.
2793    threeRegWideInstX("ssubw", "SsubwX", "SimdAddOp", smallSignedTypes,
2794                      sublwCode)
2795    threeRegWideInstX("ssubw2", "Ssubw2X", "SimdAddOp", smallSignedTypes,
2796                      sublwCode, hi=True)
2797    # SUB: element-wise srcElem1 - srcElem2, registered with the unsigned type list.
2798    subCode = "destElem = srcElem1 - srcElem2;"
2799    threeEqualRegInstX("sub", "SubDX", "SimdAddOp", unsignedTypes, 2, subCode)
2800    threeEqualRegInstX("sub", "SubQX", "SimdAddOp", unsignedTypes, 4, subCode)
2801    # SUBHN, SUBHN2: subhnCode subtracts in BigElement and shifts the difference right by the bit width of Element.
2802    subhnCode = '''
2803            destElem = ((BigElement)srcElem1 - (BigElement)srcElem2) >>
2804                        (sizeof(Element) * 8);
2805    '''
2806    threeRegNarrowInstX("subhn", "SubhnX", "SimdAddOp", smallUnsignedTypes,
2807                        subhnCode)
2808    threeRegNarrowInstX("subhn2", "Subhn2X", "SimdAddOp", smallUnsignedTypes,
2809                        subhnCode, hi=True)
2810    # SUQADD: suqaddCode adds srcElem1 to destElem; on overflow it writes (1 << (width - 1)) - 1 and sets fpscr.qc. The top bit of destElem selects which overflow checks run.
2811    suqaddCode = '''
2812            FPSCR fpscr = (FPSCR) FpscrQc;
2813            Element tmp = destElem + srcElem1;
2814            if (bits(destElem, sizeof(Element) * 8 - 1) == 0) {
2815                if (bits(tmp, sizeof(Element) * 8 - 1) == 1 ||
2816                        tmp < srcElem1 || tmp < destElem) {
2817                    destElem = (((Element) 1) << (sizeof(Element) * 8 - 1)) - 1;
2818                    fpscr.qc = 1;
2819                } else {
2820                    destElem = tmp;
2821                }
2822            } else {
2823                Element absDestElem = (~destElem) + 1;
2824                if (absDestElem < srcElem1) {
2825                    // Still check for positive sat., no need to check for negative sat.
2826                    if (bits(tmp, sizeof(Element) * 8 - 1) == 1) {
2827                        destElem = (((Element) 1) << (sizeof(Element) * 8 - 1)) - 1;
2828                        fpscr.qc = 1;
2829                    } else {
2830                        destElem = tmp;
2831                    }
2832                } else {
2833                    destElem = tmp;
2834                }
2835            }
2836            FpscrQc = fpscr;
2837    '''
2838    twoEqualRegInstX("suqadd", "SuqaddDX", "SimdAddOp", smallUnsignedTypes, 2,
2839                     suqaddCode, True)
2840    twoEqualRegInstX("suqadd", "SuqaddQX", "SimdAddOp", unsignedTypes, 4,
2841                     suqaddCode, True)
2842    twoEqualRegInstX("suqadd", "SuqaddScX", "SimdAddOp", unsignedTypes, 4,
2843                     suqaddCode, True, scalar=True)
2844    # SXTL -> alias to SSHLL
2845    # TBL: the fifth argument runs 1..4 in step with the TblN class name; the sixth argument is the string "true".
2846    tbxTblInstX("tbl", "Tbl1DX", "SimdMiscOp", ("uint8_t",), 1, "true", 2)
2847    tbxTblInstX("tbl", "Tbl1QX", "SimdMiscOp", ("uint8_t",), 1, "true", 4)
2848    tbxTblInstX("tbl", "Tbl2DX", "SimdMiscOp", ("uint8_t",), 2, "true", 2)
2849    tbxTblInstX("tbl", "Tbl2QX", "SimdMiscOp", ("uint8_t",), 2, "true", 4)
2850    tbxTblInstX("tbl", "Tbl3DX", "SimdMiscOp", ("uint8_t",), 3, "true", 2)
2851    tbxTblInstX("tbl", "Tbl3QX", "SimdMiscOp", ("uint8_t",), 3, "true", 4)
2852    tbxTblInstX("tbl", "Tbl4DX", "SimdMiscOp", ("uint8_t",), 4, "true", 2)
2853    tbxTblInstX("tbl", "Tbl4QX", "SimdMiscOp", ("uint8_t",), 4, "true", 4)
2854    # TBX: same set of registrations as TBL but passing the string "false" as the sixth argument.
2855    tbxTblInstX("tbx", "Tbx1DX", "SimdMiscOp", ("uint8_t",), 1, "false", 2)
2856    tbxTblInstX("tbx", "Tbx1QX", "SimdMiscOp", ("uint8_t",), 1, "false", 4)
2857    tbxTblInstX("tbx", "Tbx2DX", "SimdMiscOp", ("uint8_t",), 2, "false", 2)
2858    tbxTblInstX("tbx", "Tbx2QX", "SimdMiscOp", ("uint8_t",), 2, "false", 4)
2859    tbxTblInstX("tbx", "Tbx3DX", "SimdMiscOp", ("uint8_t",), 3, "false", 2)
2860    tbxTblInstX("tbx", "Tbx3QX", "SimdMiscOp", ("uint8_t",), 3, "false", 4)
2861    tbxTblInstX("tbx", "Tbx4DX", "SimdMiscOp", ("uint8_t",), 4, "false", 2)
2862    tbxTblInstX("tbx", "Tbx4QX", "SimdMiscOp", ("uint8_t",), 4, "false", 4)
2863    # TRN1: trnCode is a %-template whose %s sets "part" (0 here). dest[2i] takes srcReg1[2i + part] and dest[2i + 1] takes srcReg2[2i + part].
2864    trnCode = '''
2865        unsigned part = %s;
2866        for (unsigned i = 0; i < eCount / 2; i++) {
2867            destReg.elements[2 * i] = srcReg1.elements[2 * i + part];
2868            destReg.elements[2 * i + 1] = srcReg2.elements[2 * i + part];
2869        }
2870    '''
2871    threeRegScrambleInstX("trn1", "Trn1DX", "SimdAluOp", smallUnsignedTypes, 2,
2872                          trnCode % "0")
2873    threeRegScrambleInstX("trn1", "Trn1QX", "SimdAluOp", unsignedTypes, 4,
2874                          trnCode % "0")
2875    # TRN2: trnCode with part = 1.
2876    threeRegScrambleInstX("trn2", "Trn2DX", "SimdAluOp", smallUnsignedTypes, 2,
2877                          trnCode % "1")
2878    threeRegScrambleInstX("trn2", "Trn2QX", "SimdAluOp", unsignedTypes, 4,
2879                          trnCode % "1")
2880    # UABA: unsigned-typed registration of abaCode (defined elsewhere). NOTE(review): the positional True presumably marks destElem as an input -- confirm against threeEqualRegInstX.
2881    threeEqualRegInstX("uaba", "UabaDX", "SimdAddAccOp", smallUnsignedTypes, 2,
2882                       abaCode, True)
2883    threeEqualRegInstX("uaba", "UabaQX", "SimdAddAccOp", smallUnsignedTypes, 4,
2884                       abaCode, True)
2885    # UABAL, UABAL2: unsigned-typed registration of abalCode (defined elsewhere); the "2" form passes hi=True.
2886    threeRegLongInstX("uabal", "UabalX", "SimdAddAccOp", smallUnsignedTypes,
2887                      abalCode, True)
2888    threeRegLongInstX("uabal2", "Uabal2X", "SimdAddAccOp", smallUnsignedTypes,
2889                      abalCode, True, hi=True)
2890    # UABD: unsigned-typed registration of abdCode (defined elsewhere).
2891    threeEqualRegInstX("uabd", "UabdDX", "SimdAddOp", smallUnsignedTypes, 2,
2892                       abdCode)
2893    threeEqualRegInstX("uabd", "UabdQX", "SimdAddOp", smallUnsignedTypes, 4,
2894                       abdCode)
2895    # UABDL, UABDL2: unsigned-typed registration of abdlCode (defined elsewhere); the "2" form passes hi=True.
2896    threeRegLongInstX("uabdl", "UabdlX", "SimdAddAccOp", smallUnsignedTypes,
2897                      abdlCode, True)
2898    threeRegLongInstX("uabdl2", "Uabdl2X", "SimdAddAccOp", smallUnsignedTypes,
2899                      abdlCode, True, hi=True)
2900    # UADALP: unsigned-typed registration of adalpCode (defined elsewhere) through twoRegCondenseInstX.
2901    twoRegCondenseInstX("uadalp", "UadalpDX", "SimdAddOp", smallUnsignedTypes,
2902                        2, adalpCode, True)
2903    twoRegCondenseInstX("uadalp", "UadalpQX", "SimdAddOp", smallUnsignedTypes,
2904                        4, adalpCode, True)
2905    # UADDL, UADDL2: unsigned-typed registration of addlwCode (defined elsewhere); the "2" form passes hi=True.
2906    threeRegLongInstX("uaddl", "UaddlX", "SimdAddAccOp", smallUnsignedTypes,
2907                      addlwCode)
2908    threeRegLongInstX("uaddl2", "Uaddl2X", "SimdAddAccOp", smallUnsignedTypes,
2909                      addlwCode, hi=True)
2910    # UADDLP: addlwCode again, registered through twoRegCondenseInstX.
2911    twoRegCondenseInstX("uaddlp", "UaddlpDX", "SimdAddOp", smallUnsignedTypes,
2912                        2, addlwCode)
2913    twoRegCondenseInstX("uaddlp", "UaddlpQX", "SimdAddOp", smallUnsignedTypes,
2914                        4, addlwCode)
2915    # UADDLV: addAcrossLongCode (defined elsewhere) with long=True; uint32_t gets its own UaddlvBQX class with doubleDest=True.
2916    twoRegAcrossInstX("uaddlv", "UaddlvDX", "SimdAddOp",
2917                      ("uint8_t", "uint16_t"), 2, addAcrossLongCode, long=True)
2918    twoRegAcrossInstX("uaddlv", "UaddlvQX", "SimdAddOp",
2919                      ("uint8_t", "uint16_t"), 4, addAcrossLongCode, long=True)
2920    twoRegAcrossInstX("uaddlv", "UaddlvBQX", "SimdAddOp", ("uint32_t",), 4,
2921                      addAcrossLongCode, doubleDest=True, long=True)
2922    # UADDW: addlwCode again, registered through threeRegWideInstX; the "2" form passes hi=True.
2923    threeRegWideInstX("uaddw", "UaddwX", "SimdAddAccOp", smallUnsignedTypes,
2924                      addlwCode)
2925    threeRegWideInstX("uaddw2", "Uaddw2X", "SimdAddAccOp", smallUnsignedTypes,
2926                      addlwCode, hi=True)
2927    # UCVTF (fixed-point): fills the fpOp template (defined elsewhere) with a fplibFixedToFP<Element> call that passes imm as its second argument.
2928    ucvtfFixedCode = fpOp % ("fplibFixedToFP<Element>(srcElem1, imm, true,"
2929                             " FPCRRounding(fpscr), fpscr)")
2930    twoEqualRegInstX("ucvtf", "UcvtfFixedDX", "SimdCvtOp", smallFloatTypes, 2,
2931                     ucvtfFixedCode, hasImm=True)
2932    twoEqualRegInstX("ucvtf", "UcvtfFixedQX", "SimdCvtOp", floatTypes, 4,
2933                     ucvtfFixedCode, hasImm=True)
2934    twoEqualRegInstX("ucvtf", "UcvtfFixedScX", "SimdCvtOp", floatTypes, 4,
2935                     ucvtfFixedCode, hasImm=True, scalar=True)
2936    # UCVTF (integer): the same fplibFixedToFP<Element> call with 0 in place of imm, registered without hasImm.
2937    ucvtfIntCode = fpOp % ("fplibFixedToFP<Element>(srcElem1, 0, true,"
2938                           " FPCRRounding(fpscr), fpscr)")
2939    twoEqualRegInstX("ucvtf", "UcvtfIntDX", "SimdCvtOp", smallFloatTypes, 2,
2940                     ucvtfIntCode)
2941    twoEqualRegInstX("ucvtf", "UcvtfIntQX", "SimdCvtOp", floatTypes, 4,
2942                     ucvtfIntCode)
2943    twoEqualRegInstX("ucvtf", "UcvtfIntScX", "SimdCvtOp", floatTypes, 4,
2944                     ucvtfIntCode, scalar=True)
2945    # UHADD: unsigned-typed registration of haddCode (defined elsewhere).
2946    threeEqualRegInstX("uhadd", "UhaddDX", "SimdAddOp", smallUnsignedTypes, 2,
2947                       haddCode)
2948    threeEqualRegInstX("uhadd", "UhaddQX", "SimdAddOp", smallUnsignedTypes, 4,
2949                       haddCode)
2950    # UHSUB: unsigned-typed registration of hsubCode (defined elsewhere).
2951    threeEqualRegInstX("uhsub", "UhsubDX", "SimdAddOp", smallUnsignedTypes, 2,
2952                       hsubCode)
2953    threeEqualRegInstX("uhsub", "UhsubQX", "SimdAddOp", smallUnsignedTypes, 4,
2954                       hsubCode)
2955    # UMAX: unsigned-typed registration of maxCode (defined elsewhere).
2956    threeEqualRegInstX("umax", "UmaxDX", "SimdCmpOp", smallUnsignedTypes, 2,
2957                       maxCode)
2958    threeEqualRegInstX("umax", "UmaxQX", "SimdCmpOp", smallUnsignedTypes, 4,
2959                       maxCode)
2960    # UMAXP: maxCode again, registered with pairwise=True.
2961    threeEqualRegInstX("umaxp", "UmaxpDX", "SimdCmpOp", smallUnsignedTypes, 2,
2962                       maxCode, pairwise=True)
2963    threeEqualRegInstX("umaxp", "UmaxpQX", "SimdCmpOp", smallUnsignedTypes, 4,
2964                       maxCode, pairwise=True)
2965    # UMAXV: maxAcrossCode (defined elsewhere); the DX form is limited to uint8_t and uint16_t.
2966    twoRegAcrossInstX("umaxv", "UmaxvDX", "SimdCmpOp", ("uint8_t", "uint16_t"),
2967                      2, maxAcrossCode)
2968    twoRegAcrossInstX("umaxv", "UmaxvQX", "SimdCmpOp", smallUnsignedTypes, 4,
2969                      maxAcrossCode)
2970    # UMIN: unsigned-typed registration of minCode (defined elsewhere).
2971    threeEqualRegInstX("umin", "UminDX", "SimdCmpOp", smallUnsignedTypes, 2,
2972                       minCode)
2973    threeEqualRegInstX("umin", "UminQX", "SimdCmpOp", smallUnsignedTypes, 4,
2974                       minCode)
2975    # UMINP: minCode again, registered with pairwise=True.
2976    threeEqualRegInstX("uminp", "UminpDX", "SimdCmpOp", smallUnsignedTypes, 2,
2977                       minCode, pairwise=True)
2978    threeEqualRegInstX("uminp", "UminpQX", "SimdCmpOp", smallUnsignedTypes, 4,
2979                       minCode, pairwise=True)
2980    # UMINV: minAcrossCode (defined elsewhere); the DX form is limited to uint8_t and uint16_t.
2981    twoRegAcrossInstX("uminv", "UminvDX", "SimdCmpOp", ("uint8_t", "uint16_t"),
2982                      2, minAcrossCode)
2983    twoRegAcrossInstX("uminv", "UminvQX", "SimdCmpOp", smallUnsignedTypes, 4,
2984                      minAcrossCode)
2985    # UMLAL (by element): mlalCode (defined elsewhere) registered with byElem=True. NOTE(review): the hi=True class reuses the mnemonic "umlal" rather than "umlal2" -- confirm this is intended.
2986    threeRegLongInstX("umlal", "UmlalElemX", "SimdMultAccOp",
2987                      smallUnsignedTypes, mlalCode, True, byElem=True)
2988    threeRegLongInstX("umlal", "UmlalElem2X", "SimdMultAccOp",
2989                      smallUnsignedTypes, mlalCode, True, byElem=True, hi=True)
2990    # UMLAL (vector): mlalCode without byElem.
2991    threeRegLongInstX("umlal", "UmlalX", "SimdMultAccOp", smallUnsignedTypes,
2992                      mlalCode, True)
2993    threeRegLongInstX("umlal", "Umlal2X", "SimdMultAccOp", smallUnsignedTypes,
2994                      mlalCode, True, hi=True)
2995    # UMLSL (by element): mlslCode (defined elsewhere) registered with byElem=True.
2996    threeRegLongInstX("umlsl", "UmlslElemX", "SimdMultAccOp",
2997                      smallUnsignedTypes, mlslCode, True, byElem=True)
2998    threeRegLongInstX("umlsl", "UmlslElem2X", "SimdMultAccOp",
2999                      smallUnsignedTypes, mlslCode, True, byElem=True, hi=True)
3000    # UMLSL (vector): mlslCode without byElem.
3001    threeRegLongInstX("umlsl", "UmlslX", "SimdMultAccOp", smallUnsignedTypes,
3002                      mlslCode, True)
3003    threeRegLongInstX("umlsl", "Umlsl2X", "SimdMultAccOp", smallUnsignedTypes,
3004                      mlslCode, True, hi=True)
3005    # UMOV: registered through insToGprInstX; the last argument is 'W' for the small unsigned types and 'X' for uint64_t.
3006    insToGprInstX("umov", "UmovWX", "SimdMiscOp", smallUnsignedTypes, 4, 'W')
3007    insToGprInstX("umov", "UmovXX", "SimdMiscOp", ("uint64_t",), 4, 'X')
3008    # UMULL, UMULL2 (by element): mullCode (defined elsewhere) registered with byElem=True.
3009    threeRegLongInstX("umull", "UmullElemX", "SimdMultOp", smallUnsignedTypes,
3010                      mullCode, byElem=True)
3011    threeRegLongInstX("umull", "UmullElem2X", "SimdMultOp", smallUnsignedTypes,
3012                      mullCode, byElem=True, hi=True)
3013    # UMULL, UMULL2 (vector): mullCode without byElem.
3014    threeRegLongInstX("umull", "UmullX", "SimdMultOp", smallUnsignedTypes,
3015                      mullCode)
3016    threeRegLongInstX("umull", "Umull2X", "SimdMultOp", smallUnsignedTypes,
3017                      mullCode, hi=True)
3018    # UQADD: uqaddCode adds the operands; a sum smaller than either operand is treated as wrap-around, so destElem becomes (Element)(-1) and fpscr.qc is set.
3019    uqaddCode = '''
3020            destElem = srcElem1 + srcElem2;
3021            FPSCR fpscr = (FPSCR) FpscrQc;
3022            if (destElem < srcElem1 || destElem < srcElem2) {
3023                destElem = (Element)(-1);
3024                fpscr.qc = 1;
3025            }
3026            FpscrQc = fpscr;
3027    '''
3028    threeEqualRegInstX("uqadd", "UqaddDX", "SimdAddOp", smallUnsignedTypes, 2,
3029                       uqaddCode)
3030    threeEqualRegInstX("uqadd", "UqaddQX", "SimdAddOp", unsignedTypes, 4,
3031                       uqaddCode)
3032    threeEqualRegInstX("uqadd", "UqaddScX", "SimdAddOp", unsignedTypes, 4,
3033                       uqaddCode, scalar=True)
3034    # UQRSHL
3035    uqrshlCode = '''
3036            int16_t shiftAmt = (int8_t)srcElem2;
3037            FPSCR fpscr = (FPSCR) FpscrQc;
3038            if (shiftAmt < 0) {
3039                shiftAmt = -shiftAmt;
3040                Element rBit = 0;
3041                if (shiftAmt <= sizeof(Element) * 8)
3042                    rBit = bits(srcElem1, shiftAmt - 1);
3043                if (shiftAmt >= sizeof(Element) * 8) {
3044                    shiftAmt = sizeof(Element) * 8 - 1;
3045                    destElem = 0;
3046                } else {
3047                    destElem = (srcElem1 >> shiftAmt);
3048                }
3049                destElem += rBit;
3050            } else {
3051                if (shiftAmt >= sizeof(Element) * 8) {
3052                    if (srcElem1 != 0) {
3053                        destElem = mask(sizeof(Element) * 8);
3054                        fpscr.qc = 1;
3055                    } else {
3056                        destElem = 0;
3057                    }
3058                } else {
3059                    if (bits(srcElem1, sizeof(Element) * 8 - 1,
3060                                sizeof(Element) * 8 - shiftAmt)) {
3061                        destElem = mask(sizeof(Element) * 8);
3062                        fpscr.qc = 1;
3063                    } else {
3064                        destElem = srcElem1 << shiftAmt;
3065                    }
3066                }
3067            }
3068            FpscrQc = fpscr;
3069    '''
3070    threeEqualRegInstX("uqrshl", "UqrshlDX", "SimdCmpOp", smallUnsignedTypes,
3071                       2, uqrshlCode)
3072    threeEqualRegInstX("uqrshl", "UqrshlQX", "SimdCmpOp", unsignedTypes, 4,
3073                       uqrshlCode)
3074    threeEqualRegInstX("uqrshl", "UqrshlScX", "SimdCmpOp", unsignedTypes, 4,
3075                       uqrshlCode, scalar=True)
3076    # UQRSHRN
3077    uqrshrnCode = '''
3078            FPSCR fpscr = (FPSCR) FpscrQc;
3079            if (imm > sizeof(srcElem1) * 8) {
3080                if (srcElem1 != 0)
3081                    fpscr.qc = 1;
3082                destElem = 0;
3083            } else if (imm) {
3084                BigElement mid = (srcElem1 >> (imm - 1));
3085                uint64_t rBit = mid & 0x1;
3086                mid >>= 1;
3087                mid += rBit;
3088                if (mid != (Element)mid) {
3089                    destElem = mask(sizeof(Element) * 8);
3090                    fpscr.qc = 1;
3091                } else {
3092                    destElem = mid;
3093                }
3094            } else {
3095                if (srcElem1 != (Element)srcElem1) {
3096                    destElem = mask(sizeof(Element) * 8 - 1);
3097                    fpscr.qc = 1;
3098                } else {
3099                    destElem = srcElem1;
3100                }
3101            }
3102            FpscrQc = fpscr;
3103    '''
3104    twoRegNarrowInstX("uqrshrn", "UqrshrnX", "SimdShiftOp", smallUnsignedTypes,
3105                      uqrshrnCode, hasImm=True)
3106    twoRegNarrowInstX("uqrshrn2", "Uqrshrn2X", "SimdShiftOp",
3107                      smallUnsignedTypes, uqrshrnCode, hasImm=True, hi=True)
3108    twoRegNarrowInstX("uqrshrn", "UqrshrnScX", "SimdShiftOp",
3109                      smallUnsignedTypes, uqrshrnCode, hasImm=True,
3110                      scalar=True)
3111    # UQSHL (immediate): uqshlImmCode shifts left by imm; if any of the top imm bits of the source are set (or imm >= width with a non-zero source) it saturates to mask(width) and sets fpscr.qc. imm == 0 copies srcElem1.
3112    uqshlImmCode = '''
3113            FPSCR fpscr = (FPSCR) FpscrQc;
3114            if (imm >= sizeof(Element) * 8) {
3115                if (srcElem1 != 0) {
3116                    destElem = mask(sizeof(Element) * 8);
3117                    fpscr.qc = 1;
3118                } else {
3119                    destElem = 0;
3120                }
3121            } else if (imm) {
3122                destElem = (srcElem1 << imm);
3123                uint64_t topBits = bits((uint64_t)srcElem1,
3124                                        sizeof(Element) * 8 - 1,
3125                                        sizeof(Element) * 8 - imm);
3126                if (topBits != 0) {
3127                    destElem = mask(sizeof(Element) * 8);
3128                    fpscr.qc = 1;
3129                }
3130            } else {
3131                destElem = srcElem1;
3132            }
3133            FpscrQc = fpscr;
3134    '''
3135    twoEqualRegInstX("uqshl", "UqshlImmDX", "SimdAluOp", smallUnsignedTypes, 2,
3136                     uqshlImmCode, hasImm=True)
3137    twoEqualRegInstX("uqshl", "UqshlImmQX", "SimdAluOp", unsignedTypes, 4,
3138                     uqshlImmCode, hasImm=True)
3139    twoEqualRegInstX("uqshl", "UqshlImmScX", "SimdAluOp", unsignedTypes, 4,
3140                     uqshlImmCode, hasImm=True, scalar=True)
3141    # UQSHL (register): uqshlCode shifts by the signed low byte of srcElem2. Negative amounts shift right (0 once the amount reaches the width); positive amounts shift left, saturating to mask(width) and setting fpscr.qc when set bits would be shifted out; zero copies srcElem1.
3142    uqshlCode = '''
3143            int16_t shiftAmt = (int8_t)srcElem2;
3144            FPSCR fpscr = (FPSCR) FpscrQc;
3145            if (shiftAmt < 0) {
3146                shiftAmt = -shiftAmt;
3147                if (shiftAmt >= sizeof(Element) * 8) {
3148                    shiftAmt = sizeof(Element) * 8 - 1;
3149                    destElem = 0;
3150                } else {
3151                    destElem = (srcElem1 >> shiftAmt);
3152                }
3153            } else if (shiftAmt > 0) {
3154                if (shiftAmt >= sizeof(Element) * 8) {
3155                    if (srcElem1 != 0) {
3156                        destElem = mask(sizeof(Element) * 8);
3157                        fpscr.qc = 1;
3158                    } else {
3159                        destElem = 0;
3160                    }
3161                } else {
3162                    if (bits(srcElem1, sizeof(Element) * 8 - 1,
3163                                sizeof(Element) * 8 - shiftAmt)) {
3164                        destElem = mask(sizeof(Element) * 8);
3165                        fpscr.qc = 1;
3166                    } else {
3167                        destElem = srcElem1 << shiftAmt;
3168                    }
3169                }
3170            } else {
3171                destElem = srcElem1;
3172            }
3173            FpscrQc = fpscr;
3174    '''
3175    threeEqualRegInstX("uqshl", "UqshlDX", "SimdAluOp", smallUnsignedTypes, 2,
3176                       uqshlCode)
3177    threeEqualRegInstX("uqshl", "UqshlQX", "SimdAluOp", unsignedTypes, 4,
3178                       uqshlCode)
3179    threeEqualRegInstX("uqshl", "UqshlScX", "SimdAluOp", unsignedTypes, 4,
3180                       uqshlCode, scalar=True)
3181    # UQSHRN, UQSHRN2: uqshrnCode shifts the wide source right by imm (no rounding) and narrows; a value that does not survive the cast to Element saturates to mask(width of Element) and sets fpscr.qc. NOTE(review): the imm == 0 branch assigns srcElem1 without a saturation check.
3182    uqshrnCode = '''
3183            FPSCR fpscr = (FPSCR) FpscrQc;
3184            if (imm > sizeof(srcElem1) * 8) {
3185                if (srcElem1 != 0)
3186                    fpscr.qc = 1;
3187                destElem = 0;
3188            } else if (imm) {
3189                BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
3190                if (mid != (Element)mid) {
3191                    destElem = mask(sizeof(Element) * 8);
3192                    fpscr.qc = 1;
3193                } else {
3194                    destElem = mid;
3195                }
3196            } else {
3197                destElem = srcElem1;
3198            }
3199            FpscrQc = fpscr;
3200    '''
3201    twoRegNarrowInstX("uqshrn", "UqshrnX", "SimdShiftOp", smallUnsignedTypes,
3202                      uqshrnCode, hasImm=True)
3203    twoRegNarrowInstX("uqshrn2", "Uqshrn2X", "SimdShiftOp", smallUnsignedTypes,
3204                      uqshrnCode, hasImm=True, hi=True)
3205    twoRegNarrowInstX("uqshrn", "UqshrnScX", "SimdShiftOp", smallUnsignedTypes,
3206                      uqshrnCode, hasImm=True, scalar=True)
3207    # UQSUB: uqsubCode subtracts the operands; a difference larger than srcElem1 is treated as wrap-around, so destElem becomes 0 and fpscr.qc is set.
3208    uqsubCode = '''
3209            destElem = srcElem1 - srcElem2;
3210            FPSCR fpscr = (FPSCR) FpscrQc;
3211            if (destElem > srcElem1) {
3212                destElem = 0;
3213                fpscr.qc = 1;
3214            }
3215            FpscrQc = fpscr;
3216    '''
3217    threeEqualRegInstX("uqsub", "UqsubDX", "SimdAddOp", smallUnsignedTypes, 2,
3218                       uqsubCode)
3219    threeEqualRegInstX("uqsub", "UqsubQX", "SimdAddOp", unsignedTypes, 4,
3220                       uqsubCode)
3221    threeEqualRegInstX("uqsub", "UqsubScX", "SimdAddOp", unsignedTypes, 4,
3222                       uqsubCode, scalar=True)
3223    # UQXTN: uqxtnCode assigns srcElem1 to destElem; if widening destElem back to BigElement does not reproduce srcElem1 it saturates to mask(width of Element) and sets fpscr.qc.
3224    uqxtnCode = '''
3225            FPSCR fpscr = (FPSCR) FpscrQc;
3226            destElem = srcElem1;
3227            if ((BigElement)destElem != srcElem1) {
3228                fpscr.qc = 1;
3229                destElem = mask(sizeof(Element) * 8);
3230            }
3231            FpscrQc = fpscr;
3232    '''
3233    twoRegNarrowInstX("uqxtn", "UqxtnX", "SimdMiscOp", smallUnsignedTypes,
3234                      uqxtnCode)
3235    twoRegNarrowInstX("uqxtn", "Uqxtn2X", "SimdMiscOp", smallUnsignedTypes,
3236                      uqxtnCode, hi=True)
3237    twoRegNarrowInstX("uqxtn", "UqxtnScX", "SimdMiscOp", smallUnsignedTypes,
3238                      uqxtnCode, scalar=True)
3239    # URECPE: calls unsignedRecipEstimate(srcElem1); registered for uint32_t only.
3240    urecpeCode = "destElem = unsignedRecipEstimate(srcElem1);"
3241    twoEqualRegInstX("urecpe", "UrecpeDX", "SimdMultAccOp", ("uint32_t",), 2,
3242                     urecpeCode)
3243    twoEqualRegInstX("urecpe", "UrecpeQX", "SimdMultAccOp", ("uint32_t",), 4,
3244                     urecpeCode)
3245    # URHADD: unsigned-typed registration of rhaddCode (the snippet also used for SRHADD).
3246    threeEqualRegInstX("urhadd", "UrhaddDX", "SimdAddOp", smallUnsignedTypes,
3247                       2, rhaddCode)
3248    threeEqualRegInstX("urhadd", "UrhaddQX", "SimdAddOp", smallUnsignedTypes,
3249                       4, rhaddCode)
3250    # URSHL: unsigned-typed registration of rshlCode (the snippet also used for SRSHL).
3251    threeEqualRegInstX("urshl", "UrshlDX", "SimdShiftOp", unsignedTypes, 2,
3252                       rshlCode)
3253    threeEqualRegInstX("urshl", "UrshlQX", "SimdShiftOp", unsignedTypes, 4,
3254                       rshlCode)
3255    # URSHR: unsigned-typed registration of rshrCode (the snippet also used for SRSHR).
3256    twoEqualRegInstX("urshr", "UrshrDX", "SimdShiftOp", unsignedTypes, 2,
3257                     rshrCode, hasImm=True)
3258    twoEqualRegInstX("urshr", "UrshrQX", "SimdShiftOp", unsignedTypes, 4,
3259                     rshrCode, hasImm=True)
3260    # URSQRTE: calls unsignedRSqrtEstimate(srcElem1); registered for uint32_t only.
3261    ursqrteCode = "destElem = unsignedRSqrtEstimate(srcElem1);"
3262    twoEqualRegInstX("ursqrte", "UrsqrteDX", "SimdSqrtOp", ("uint32_t",), 2,
3263                     ursqrteCode)
3264    twoEqualRegInstX("ursqrte", "UrsqrteQX", "SimdSqrtOp", ("uint32_t",), 4,
3265                     ursqrteCode)
3266    # URSRA: unsigned-typed registration of rsraCode (the snippet also used for SRSRA).
3267    twoEqualRegInstX("ursra", "UrsraDX", "SimdShiftOp", unsignedTypes, 2,
3268                     rsraCode, True, hasImm=True)
3269    twoEqualRegInstX("ursra", "UrsraQX", "SimdShiftOp", unsignedTypes, 4,
3270                     rsraCode, True, hasImm=True)
3271    # USHL: unsigned-typed registration of shlCode (the snippet also used for SSHL).
3272    threeEqualRegInstX("ushl", "UshlDX", "SimdShiftOp", unsignedTypes, 2,
3273                       shlCode)
3274    threeEqualRegInstX("ushl", "UshlQX", "SimdShiftOp", unsignedTypes, 4,
3275                       shlCode)
3276    # USHLL, USHLL2: unsigned-typed registration of shllCode (the snippet also used for SSHLL); the "2" form passes hi=True.
3277    twoRegLongInstX("ushll", "UshllX", "SimdShiftOp", smallUnsignedTypes,
3278                    shllCode, hasImm=True)
3279    twoRegLongInstX("ushll", "Ushll2X", "SimdShiftOp", smallUnsignedTypes,
3280                    shllCode, hi=True, hasImm=True)
3281    # USHR: unsigned-typed registration of shrCode (the snippet also used for SSHR).
3282    twoEqualRegInstX("ushr", "UshrDX", "SimdShiftOp", unsignedTypes, 2,
3283                     shrCode, hasImm=True)
3284    twoEqualRegInstX("ushr", "UshrQX", "SimdShiftOp", unsignedTypes, 4,
3285                     shrCode, hasImm=True)
3286    # USQADD: usqaddCode adds srcElem1 to destElem. With the top bit of srcElem1 clear, wrap-around saturates to (Element)(-1); with it set, the result clamps to 0 when the two's-complement negation of srcElem1 exceeds destElem. Either saturation sets fpscr.qc.
3287    usqaddCode = '''
3288            FPSCR fpscr = (FPSCR) FpscrQc;
3289            Element tmp = destElem + srcElem1;
3290            if (bits(srcElem1, sizeof(Element) * 8 - 1) == 0) {
3291                if (tmp < srcElem1 || tmp < destElem) {
3292                    destElem = (Element)(-1);
3293                    fpscr.qc = 1;
3294                } else {
3295                    destElem = tmp;
3296                }
3297            } else {
3298                Element absSrcElem1 = (~srcElem1) + 1;
3299                if (absSrcElem1 > destElem) {
3300                    destElem = 0;
3301                    fpscr.qc = 1;
3302                } else {
3303                    destElem = tmp;
3304                }
3305            }
3306            FpscrQc = fpscr;
3307    '''
3308    twoEqualRegInstX("usqadd", "UsqaddDX", "SimdAddOp", smallUnsignedTypes, 2,
3309                     usqaddCode, True)
3310    twoEqualRegInstX("usqadd", "UsqaddQX", "SimdAddOp", unsignedTypes, 4,
3311                     usqaddCode, True)
3312    twoEqualRegInstX("usqadd", "UsqaddScX", "SimdAddOp", unsignedTypes, 4,
3313                     usqaddCode, True, scalar=True)
3314    # USRA: unsigned-typed registration of sraCode (the snippet also used for SSRA).
3315    twoEqualRegInstX("usra", "UsraDX", "SimdShiftOp", unsignedTypes, 2,
3316                     sraCode, True, hasImm=True)
3317    twoEqualRegInstX("usra", "UsraQX", "SimdShiftOp", unsignedTypes, 4,
3318                     sraCode, True, hasImm=True)
3319    # USUBL: unsigned-typed registration of sublwCode (the snippet also used for SSUBL); the "2" form passes hi=True.
3320    threeRegLongInstX("usubl", "UsublX", "SimdAddOp", smallUnsignedTypes,
3321                      sublwCode)
3322    threeRegLongInstX("usubl2", "Usubl2X", "SimdAddOp", smallUnsignedTypes,
3323                      sublwCode, hi=True)
3324    # USUBW: sublwCode again, registered through threeRegWideInstX.
3325    threeRegWideInstX("usubw", "UsubwX", "SimdAddOp", smallUnsignedTypes,
3326                      sublwCode)
3327    threeRegWideInstX("usubw2", "Usubw2X", "SimdAddOp", smallUnsignedTypes,
3328                      sublwCode, hi=True)
3329    # UXTL -> alias to USHLL
3330    # UZP1
3331    uzpCode = '''
3332        unsigned part = %s;
3333        for (unsigned i = 0; i < eCount / 2; i++) {
3334            destReg.elements[i] = srcReg1.elements[2 * i + part];
3335            destReg.elements[eCount / 2 + i] = srcReg2.elements[2 * i + part];
3336        }
3337    '''
3338    threeRegScrambleInstX("Uzp1", "Uzp1DX", "SimdAluOp", smallUnsignedTypes, 2,
3339                          uzpCode % "0")
3340    threeRegScrambleInstX("Uzp1", "Uzp1QX", "SimdAluOp", unsignedTypes, 4,
3341                          uzpCode % "0")
3342    # UZP2
3343    threeRegScrambleInstX("Uzp2", "Uzp2DX", "SimdAluOp", smallUnsignedTypes, 2,
3344                          uzpCode % "1")
3345    threeRegScrambleInstX("Uzp2", "Uzp2QX", "SimdAluOp", unsignedTypes, 4,
3346                          uzpCode % "1")
3347    # XTN, XTN2
3348    xtnCode = "destElem = srcElem1;"
3349    twoRegNarrowInstX("Xtn", "XtnX", "SimdMiscOp", smallUnsignedTypes, xtnCode)
3350    twoRegNarrowInstX("Xtn", "Xtn2X", "SimdMiscOp", smallUnsignedTypes,
3351                      xtnCode, hi=True)
3352    # ZIP1
3353    zipCode = '''
3354        unsigned base = %s;
3355        for (unsigned i = 0; i < eCount / 2; i++) {
3356            destReg.elements[2 * i] = srcReg1.elements[base + i];
3357            destReg.elements[2 * i + 1] = srcReg2.elements[base + i];
3358        }
3359    '''
3360    threeRegScrambleInstX("zip1", "Zip1DX", "SimdAluOp", smallUnsignedTypes, 2,
3361                          zipCode % "0")
3362    threeRegScrambleInstX("zip1", "Zip1QX", "SimdAluOp", unsignedTypes, 4,
3363                          zipCode % "0")
3364    # ZIP2
3365    threeRegScrambleInstX("zip2", "Zip2DX", "SimdAluOp", smallUnsignedTypes, 2,
3366                          zipCode % "eCount / 2")
3367    threeRegScrambleInstX("zip2", "Zip2QX", "SimdAluOp", unsignedTypes, 4,
3368                          zipCode % "eCount / 2")
3369
3370    for decoderFlavour, type_dict in decoders.iteritems():
3371        header_output += '''
3372        class %(decoder_flavour)sDecoder {
3373        public:
3374        ''' % { "decoder_flavour" : decoderFlavour }
3375        for type,name in type_dict.iteritems():
3376            header_output += '''
3377            template<typename Elem> using %(type)s = %(new_name)s<Elem>;''' % {
3378               "type" : type, "new_name" : name
3379            }
3380        header_output += '''
3381        };'''
3382}};
3383