1// -*- mode: c++ -*-
2
3// Copyright (c) 2012-2013, 2015-2018 ARM Limited
4// All rights reserved
5//
6// The license below extends only to copyright in the software and shall
7// not be construed as granting a license to any other intellectual
8// property including but not limited to intellectual property relating
9// to a hardware implementation of the functionality of the software
10// licensed hereunder.  You may use the software subject to the license
11// terms below provided that you ensure that this notice is replicated
12// unmodified and in its entirety in all distributions of the software,
13// modified or unmodified, in source code or in binary form.
14//
15// Redistribution and use in source and binary forms, with or without
16// modification, are permitted provided that the following conditions are
17// met: redistributions of source code must retain the above copyright
18// notice, this list of conditions and the following disclaimer;
19// redistributions in binary form must reproduce the above copyright
20// notice, this list of conditions and the following disclaimer in the
21// documentation and/or other materials provided with the distribution;
22// neither the name of the copyright holders nor the names of its
23// contributors may be used to endorse or promote products derived from
24// this software without specific prior written permission.
25//
26// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
30// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
31// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
32// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
33// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
34// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
35// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
36// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37//
38// Authors: Giacomo Gabrielli
39//          Mbou Eyole
40
41let {{
42
43    header_output = ""
44    exec_output = ""
45    decoders = { 'Generic' : {} }
46
47    # FP types (FP operations always work with unsigned representations)
48    floatTypes = ("uint16_t", "uint32_t", "uint64_t")
49    smallFloatTypes = ("uint32_t",)
50
51    zeroSveVecRegUpperPartCode = '''
52        TheISA::ISA::zeroSveVecRegUpperPart(%s,
53            ArmStaticInst::getCurSveVecLen<uint64_t>(xc->tcBase()));
54    '''
55
56    def threeEqualRegInstX(name, Name, opClass, types, rCount, op,
57                           readDest=False, pairwise=False, scalar=False,
58                           byElem=False, decoder='Generic'):
59        assert (not pairwise) or ((not byElem) and (not scalar))
60        global header_output, exec_output, decoders
61        eWalkCode = simd64EnabledCheckCode + '''
62        RegVect srcReg1, destReg;
63        '''
64        if byElem:
65            # 2nd register operand has to be read fully
66            eWalkCode += '''
67        FullRegVect srcReg2;
68        '''
69        else:
70            eWalkCode += '''
71        RegVect srcReg2;
72        '''
73        for reg in range(rCount):
74            eWalkCode += '''
75        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
76        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
77        ''' % { "reg" : reg }
78            if readDest:
79                eWalkCode += '''
80        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
81        ''' % { "reg" : reg }
82        if byElem:
83            # 2nd operand has to be read fully
84            for reg in range(rCount, 4):
85                eWalkCode += '''
86        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
87        ''' % { "reg" : reg }
88        readDestCode = ''
89        if readDest:
90            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
91        if pairwise:
92            eWalkCode += '''
93        for (unsigned i = 0; i < eCount; i++) {
94            Element srcElem1 = gtoh(2 * i < eCount ?
95                                    srcReg1.elements[2 * i] :
96                                    srcReg2.elements[2 * i - eCount]);
97            Element srcElem2 = gtoh(2 * i < eCount ?
98                                    srcReg1.elements[2 * i + 1] :
99                                    srcReg2.elements[2 * i + 1 - eCount]);
100            Element destElem;
101            %(readDest)s
102            %(op)s
103            destReg.elements[i] = htog(destElem);
104        }
105        ''' % { "op" : op, "readDest" : readDestCode }
106        else:
107            scalarCheck = '''
108            if (i != 0) {
109                destReg.elements[i] = 0;
110                continue;
111            }
112            '''
113            eWalkCode += '''
114        for (unsigned i = 0; i < eCount; i++) {
115            %(scalarCheck)s
116            Element srcElem1 = gtoh(srcReg1.elements[i]);
117            Element srcElem2 = gtoh(srcReg2.elements[%(src2Index)s]);
118            Element destElem;
119            %(readDest)s
120            %(op)s
121            destReg.elements[i] = htog(destElem);
122        }
123        ''' % { "op" : op, "readDest" : readDestCode,
124                "scalarCheck" : scalarCheck if scalar else "",
125                "src2Index" : "imm" if byElem else "i" }
126        for reg in range(rCount):
127            eWalkCode += '''
128        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
129        ''' % { "reg" : reg }
130        if rCount < 4:  # zero upper half
131            for reg in range(rCount, 4):
132                eWalkCode += '''
133        AA64FpDestP%(reg)d_uw = 0;
134        ''' % { "reg" : reg }
135        iop = InstObjParams(name, Name,
136                            "DataX2RegImmOp" if byElem else "DataX2RegOp",
137                            { "code": eWalkCode,
138                              "r_count": rCount,
139                              "op_class": opClass }, [])
140        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
141        if byElem:
142            header_output += NeonX2RegImmOpDeclare.subst(iop)
143        else:
144            header_output += NeonX2RegOpDeclare.subst(iop)
145        exec_output += NeonXEqualRegOpExecute.subst(iop)
146        for type in types:
147            substDict = { "targs" : type,
148                          "class_name" : Name }
149            exec_output += NeonXExecDeclare.subst(substDict)
150
151    def threeUnequalRegInstX(name, Name, opClass, types, op,
152                             bigSrc1, bigSrc2, bigDest, readDest, scalar=False,
153                             byElem=False, hi=False):
154        assert not (scalar and hi)
155        global header_output, exec_output
156        src1Cnt = src2Cnt = destCnt = 2
157        src1Prefix = src2Prefix = destPrefix = ''
158        if bigSrc1:
159            src1Cnt = 4
160            src1Prefix = 'Big'
161        if bigSrc2:
162            src2Cnt = 4
163            src2Prefix = 'Big'
164        if bigDest:
165            destCnt = 4
166            destPrefix = 'Big'
167        if byElem:
168            src2Prefix = 'Full'
169        eWalkCode = simd64EnabledCheckCode + '''
170        %sRegVect srcReg1;
171        %sRegVect srcReg2;
172        %sRegVect destReg;
173        ''' % (src1Prefix, src2Prefix, destPrefix)
174        srcReg1 = 0
175        if hi and not bigSrc1:  # long/widening operations
176            srcReg1 = 2
177        for reg in range(src1Cnt):
178            eWalkCode += '''
179        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(srcReg1)d_uw);
180        ''' % { "reg" : reg, "srcReg1" : srcReg1 }
181            srcReg1 += 1
182        srcReg2 = 0
183        if (not byElem) and (hi and not bigSrc2):  # long/widening operations
184            srcReg2 = 2
185        for reg in range(src2Cnt):
186            eWalkCode += '''
187        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(srcReg2)d_uw);
188        ''' % { "reg" : reg, "srcReg2" : srcReg2 }
189            srcReg2 += 1
190        if byElem:
191            # 2nd operand has to be read fully
192            for reg in range(src2Cnt, 4):
193                eWalkCode += '''
194        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
195        ''' % { "reg" : reg }
196        if readDest:
197            for reg in range(destCnt):
198                eWalkCode += '''
199        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
200        ''' % { "reg" : reg }
201        readDestCode = ''
202        if readDest:
203            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
204        scalarCheck = '''
205            if (i != 0) {
206                destReg.elements[i] = 0;
207                continue;
208            }
209            '''
210        eWalkCode += '''
211        for (unsigned i = 0; i < eCount; i++) {
212            %(scalarCheck)s
213            %(src1Prefix)sElement srcElem1 = gtoh(srcReg1.elements[i]);
214            %(src1Prefix)sElement srcElem2 = gtoh(srcReg2.elements[%(src2Index)s]);
215            %(destPrefix)sElement destElem;
216            %(readDest)s
217            %(op)s
218            destReg.elements[i] = htog(destElem);
219        }
220        ''' % { "op" : op, "readDest" : readDestCode,
221                "src1Prefix" : src1Prefix, "src2Prefix" : src2Prefix,
222                "destPrefix" : destPrefix,
223                "scalarCheck" : scalarCheck if scalar else "",
224                "src2Index" : "imm" if byElem else "i" }
225        destReg = 0
226        if hi and not bigDest:
227            # narrowing operations
228            destReg = 2
229        for reg in range(destCnt):
230            eWalkCode += '''
231        AA64FpDestP%(destReg)d_uw = gtoh(destReg.regs[%(reg)d]);
232        ''' % { "reg" : reg, "destReg": destReg }
233            destReg += 1
234        if destCnt < 4:
235            if hi:  # Explicitly merge with lower half
236                for reg in range(0, destCnt):
237                    eWalkCode += '''
238        AA64FpDestP%(reg)d_uw = AA64FpDestP%(reg)d_uw;''' % { "reg" : reg }
239            else:  # zero upper half
240                for reg in range(destCnt, 4):
241                    eWalkCode += '''
242        AA64FpDestP%(reg)d_uw = 0;''' % { "reg" : reg }
243
244        iop = InstObjParams(name, Name,
245                            "DataX2RegImmOp" if byElem else "DataX2RegOp",
246                            { "code": eWalkCode,
247                              "r_count": 2,
248                              "op_class": opClass }, [])
249        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
250        if byElem:
251            header_output += NeonX2RegImmOpDeclare.subst(iop)
252        else:
253            header_output += NeonX2RegOpDeclare.subst(iop)
254        exec_output += NeonXUnequalRegOpExecute.subst(iop)
255        for type in types:
256            substDict = { "targs" : type,
257                          "class_name" : Name }
258            exec_output += NeonXExecDeclare.subst(substDict)
259
260    def threeRegNarrowInstX(name, Name, opClass, types, op, readDest=False,
261                            scalar=False, byElem=False, hi=False):
262        assert not byElem
263        threeUnequalRegInstX(name, Name, opClass, types, op,
264                             True, True, False, readDest, scalar, byElem, hi)
265
266    def threeRegLongInstX(name, Name, opClass, types, op, readDest=False,
267                          scalar=False, byElem=False, hi=False):
268        threeUnequalRegInstX(name, Name, opClass, types, op,
269                             False, False, True, readDest, scalar, byElem, hi)
270
271    def threeRegWideInstX(name, Name, opClass, types, op, readDest=False,
272                          scalar=False, byElem=False, hi=False):
273        assert not byElem
274        threeUnequalRegInstX(name, Name, opClass, types, op,
275                             True, False, True, readDest, scalar, byElem, hi)
276
277    def twoEqualRegInstX(name, Name, opClass, types, rCount, op,
278                         readDest=False, scalar=False, byElem=False,
279                         hasImm=False, isDup=False):
280        global header_output, exec_output
281        assert (not isDup) or byElem
282        if byElem:
283            hasImm = True
284        if isDup:
285            eWalkCode = simd64EnabledCheckCode + '''
286        FullRegVect srcReg1;
287        RegVect destReg;
288        '''
289        else:
290            eWalkCode = simd64EnabledCheckCode + '''
291        RegVect srcReg1, destReg;
292        '''
293        for reg in range(4 if isDup else rCount):
294            eWalkCode += '''
295        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
296        ''' % { "reg" : reg }
297            if readDest:
298                eWalkCode += '''
299        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
300        ''' % { "reg" : reg }
301        readDestCode = ''
302        if readDest:
303            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
304        scalarCheck = '''
305            if (i != 0) {
306                destReg.elements[i] = 0;
307                continue;
308            }
309            '''
310        eWalkCode += '''
311        for (unsigned i = 0; i < eCount; i++) {
312            %(scalarCheck)s
313            unsigned j = i;
314            Element srcElem1 = gtoh(srcReg1.elements[%(src1Index)s]);
315            Element destElem;
316            %(readDest)s
317            %(op)s
318            destReg.elements[j] = htog(destElem);
319        }
320        ''' % { "op" : op, "readDest" : readDestCode,
321                "scalarCheck" : scalarCheck if scalar else "",
322                "src1Index" : "imm" if byElem else "i" }
323        for reg in range(rCount):
324            eWalkCode += '''
325        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
326        ''' % { "reg" : reg }
327        if rCount < 4:  # zero upper half
328            for reg in range(rCount, 4):
329                eWalkCode += '''
330        AA64FpDestP%(reg)d_uw = 0;
331        ''' % { "reg" : reg }
332        iop = InstObjParams(name, Name,
333                            "DataX1RegImmOp" if hasImm else "DataX1RegOp",
334                            { "code": eWalkCode,
335                              "r_count": rCount,
336                              "op_class": opClass }, [])
337        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
338        if hasImm:
339            header_output += NeonX1RegImmOpDeclare.subst(iop)
340        else:
341            header_output += NeonX1RegOpDeclare.subst(iop)
342        exec_output += NeonXEqualRegOpExecute.subst(iop)
343        for type in types:
344            substDict = { "targs" : type,
345                          "class_name" : Name }
346            exec_output += NeonXExecDeclare.subst(substDict)
347
348    def twoRegLongInstX(name, Name, opClass, types, op, readDest=False,
349                        hi=False, hasImm=False):
350        global header_output, exec_output
351        eWalkCode = simd64EnabledCheckCode + '''
352        RegVect srcReg1;
353        BigRegVect destReg;
354        '''
355        destReg = 0 if not hi else 2
356        for reg in range(2):
357            eWalkCode += '''
358        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(destReg)d_uw);
359        ''' % { "reg" : reg, "destReg": destReg }
360            destReg += 1
361        destReg = 0 if not hi else 2
362        if readDest:
363            for reg in range(4):
364                eWalkCode += '''
365        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
366        ''' % { "reg" : reg }
367                destReg += 1
368        readDestCode = ''
369        if readDest:
370            readDestCode = 'destReg = gtoh(destReg.elements[i]);'
371        eWalkCode += '''
372        for (unsigned i = 0; i < eCount; i++) {
373            Element srcElem1 = gtoh(srcReg1.elements[i]);
374            BigElement destElem;
375            %(readDest)s
376            %(op)s
377            destReg.elements[i] = htog(destElem);
378        }
379        ''' % { "op" : op, "readDest" : readDestCode }
380        for reg in range(4):
381            eWalkCode += '''
382        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
383        ''' % { "reg" : reg }
384        iop = InstObjParams(name, Name,
385                            "DataX1RegImmOp" if hasImm else "DataX1RegOp",
386                            { "code": eWalkCode,
387                              "r_count": 2,
388                              "op_class": opClass }, [])
389        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
390        if hasImm:
391            header_output += NeonX1RegImmOpDeclare.subst(iop)
392        else:
393            header_output += NeonX1RegOpDeclare.subst(iop)
394        exec_output += NeonXUnequalRegOpExecute.subst(iop)
395        for type in types:
396            substDict = { "targs" : type,
397                          "class_name" : Name }
398            exec_output += NeonXExecDeclare.subst(substDict)
399
400    def twoRegNarrowInstX(name, Name, opClass, types, op, readDest=False,
401                          scalar=False, hi=False, hasImm=False):
402        global header_output, exec_output
403        eWalkCode = simd64EnabledCheckCode + '''
404        BigRegVect srcReg1;
405        RegVect destReg;
406        '''
407        for reg in range(4):
408            eWalkCode += '''
409        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
410        ''' % { "reg" : reg }
411        if readDest:
412            for reg in range(2):
413                eWalkCode += '''
414        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
415        ''' % { "reg" : reg }
416        else:
417            eWalkCode += '''
418        destReg.elements[0] = 0;
419        ''' % { "reg" : reg }
420        readDestCode = ''
421        if readDest:
422            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
423        scalarCheck = '''
424            if (i != 0) {
425                destReg.elements[i] = 0;
426                continue;
427            }
428            '''
429        eWalkCode += '''
430        for (unsigned i = 0; i < eCount; i++) {
431            %(scalarCheck)s
432            BigElement srcElem1 = gtoh(srcReg1.elements[i]);
433            Element destElem;
434            %(readDest)s
435            %(op)s
436            destReg.elements[i] = htog(destElem);
437        }
438        ''' % { "op" : op, "readDest" : readDestCode,
439                "scalarCheck" : scalarCheck if scalar else "" }
440        destReg = 0 if not hi else 2
441        for reg in range(2):
442            eWalkCode += '''
443        AA64FpDestP%(destReg)d_uw = gtoh(destReg.regs[%(reg)d]);
444        ''' % { "reg" : reg, "destReg": destReg }
445            destReg += 1
446        if hi:
447            for reg in range(0, 2):  # Explicitly merge with the lower half
448                eWalkCode += '''
449        AA64FpDestP%(reg)d_uw = AA64FpDestP%(reg)d_uw;''' % { "reg" : reg }
450        else:
451            for reg in range(2, 4):  # zero upper half
452                eWalkCode += '''
453        AA64FpDestP%(reg)d_uw = 0;
454        ''' % { "reg" : reg }
455
456        iop = InstObjParams(name, Name,
457                            "DataX1RegImmOp" if hasImm else "DataX1RegOp",
458                            { "code": eWalkCode,
459                              "r_count": 2,
460                              "op_class": opClass }, [])
461        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
462        if hasImm:
463            header_output += NeonX1RegImmOpDeclare.subst(iop)
464        else:
465            header_output += NeonX1RegOpDeclare.subst(iop)
466        exec_output += NeonXUnequalRegOpExecute.subst(iop)
467        for type in types:
468            substDict = { "targs" : type,
469                          "class_name" : Name }
470            exec_output += NeonXExecDeclare.subst(substDict)
471
472    def threeRegScrambleInstX(name, Name, opClass, types, rCount, op):
473        global header_output, exec_output
474        eWalkCode = simd64EnabledCheckCode + '''
475        RegVect srcReg1, srcReg2, destReg;
476        '''
477        for reg in range(rCount):
478            eWalkCode += '''
479        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
480        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
481        ''' % { "reg" : reg }
482        eWalkCode += op
483        for reg in range(rCount):
484            eWalkCode += '''
485        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
486        ''' % { "reg" : reg }
487        if rCount < 4:
488            for reg in range(rCount, 4):
489                eWalkCode += '''
490        AA64FpDestP%(reg)d_uw = 0;
491        ''' % { "reg" : reg }
492        iop = InstObjParams(name, Name,
493                            "DataX2RegOp",
494                            { "code": eWalkCode,
495                              "r_count": rCount,
496                              "op_class": opClass }, [])
497        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
498        header_output += NeonX2RegOpDeclare.subst(iop)
499        exec_output += NeonXEqualRegOpExecute.subst(iop)
500        for type in types:
501            substDict = { "targs" : type,
502                          "class_name" : Name }
503            exec_output += NeonXExecDeclare.subst(substDict)
504
505    def insFromVecElemInstX(name, Name, opClass, types, rCount):
506        global header_output, exec_output
507        eWalkCode = simd64EnabledCheckCode + '''
508        FullRegVect srcReg1;
509        RegVect destReg;
510        '''
511        for reg in range(4):
512            eWalkCode += '''
513        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
514        ''' % { "reg" : reg }
515        for reg in range(rCount):
516            eWalkCode += '''
517        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
518        ''' % { "reg" : reg }
519        eWalkCode += '''
520        Element srcElem1 = gtoh(srcReg1.elements[imm2]);
521        Element destElem = srcElem1;
522        destReg.elements[imm1] = htog(destElem);
523        '''
524        for reg in range(rCount):
525            eWalkCode += '''
526        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
527        ''' % { "reg" : reg }
528        iop = InstObjParams(name, Name,
529                            "DataX1Reg2ImmOp",
530                            { "code": eWalkCode,
531                              "r_count": rCount,
532                              "op_class": opClass }, [])
533        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
534        header_output += NeonX1Reg2ImmOpDeclare.subst(iop)
535        exec_output += NeonXEqualRegOpExecute.subst(iop)
536        for type in types:
537            substDict = { "targs" : type,
538                          "class_name" : Name }
539            exec_output += NeonXExecDeclare.subst(substDict)
540
541    def twoRegPairwiseScInstX(name, Name, opClass, types, rCount, op):
542        global header_output, exec_output
543        eWalkCode = simd64EnabledCheckCode + '''
544        RegVect srcReg1, destReg;
545        '''
546        for reg in range(rCount):
547            eWalkCode += '''
548        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
549        ''' % { "reg" : reg }
550        eWalkCode += '''
551        Element srcElem1 = gtoh(srcReg1.elements[0]);
552        Element srcElem2 = gtoh(srcReg1.elements[1]);
553        Element destElem;
554        %(op)s
555        destReg.elements[0] = htog(destElem);
556        ''' % { "op" : op }
557        destCnt = rCount / 2
558        for reg in range(destCnt):
559            eWalkCode += '''
560        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
561        ''' % { "reg" : reg }
562        for reg in range(destCnt, 4):  # zero upper half
563            eWalkCode += '''
564        AA64FpDestP%(reg)d_uw = 0;
565        ''' % { "reg" : reg }
566        iop = InstObjParams(name, Name,
567                            "DataX1RegOp",
568                            { "code": eWalkCode,
569                              "r_count": rCount,
570                              "op_class": opClass }, [])
571        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
572        header_output += NeonX1RegOpDeclare.subst(iop)
573        exec_output += NeonXEqualRegOpExecute.subst(iop)
574        for type in types:
575            substDict = { "targs" : type,
576                          "class_name" : Name }
577            exec_output += NeonXExecDeclare.subst(substDict)
578
579    def twoRegAcrossInstX(name, Name, opClass, types, rCount, op,
580                          doubleDest=False, long=False):
581        global header_output, exec_output
582        destPrefix = "Big" if long else ""
583        eWalkCode = simd64EnabledCheckCode + '''
584        RegVect srcReg1;
585        %sRegVect destReg;
586        ''' % destPrefix
587        for reg in range(rCount):
588            eWalkCode += '''
589        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
590        ''' % { "reg" : reg }
591        eWalkCode += '''
592        destReg.regs[0] = 0;
593        %(destPrefix)sElement destElem = 0;
594        for (unsigned i = 0; i < eCount; i++) {
595            Element srcElem1 = gtoh(srcReg1.elements[i]);
596            if (i == 0) {
597                destElem = srcElem1;
598            } else {
599                %(op)s
600            }
601        }
602        destReg.elements[0] = htog(destElem);
603        ''' % { "op" : op, "destPrefix" : destPrefix }
604        destCnt = 2 if doubleDest else 1
605        for reg in range(destCnt):
606            eWalkCode += '''
607        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
608        ''' % { "reg" : reg }
609        for reg in range(destCnt, 4):  # zero upper half
610            eWalkCode += '''
611        AA64FpDestP%(reg)d_uw = 0;
612        ''' % { "reg" : reg }
613        iop = InstObjParams(name, Name,
614                            "DataX1RegOp",
615                            { "code": eWalkCode,
616                              "r_count": rCount,
617                              "op_class": opClass }, [])
618        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
619        header_output += NeonX1RegOpDeclare.subst(iop)
620        if long:
621            exec_output += NeonXUnequalRegOpExecute.subst(iop)
622        else:
623            exec_output += NeonXEqualRegOpExecute.subst(iop)
624        for type in types:
625            substDict = { "targs" : type,
626                          "class_name" : Name }
627            exec_output += NeonXExecDeclare.subst(substDict)
628
629    def twoRegCondenseInstX(name, Name, opClass, types, rCount, op,
630                            readDest=False):
631        global header_output, exec_output
632        eWalkCode = simd64EnabledCheckCode + '''
633        RegVect srcRegs;
634        BigRegVect destReg;
635        '''
636        for reg in range(rCount):
637            eWalkCode += '''
638        srcRegs.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
639        ''' % { "reg" : reg }
640            if readDest:
641                eWalkCode += '''
642        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
643        ''' % { "reg" : reg }
644        readDestCode = ''
645        if readDest:
646            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
647        eWalkCode += '''
648        for (unsigned i = 0; i < eCount / 2; i++) {
649            Element srcElem1 = gtoh(srcRegs.elements[2 * i]);
650            Element srcElem2 = gtoh(srcRegs.elements[2 * i + 1]);
651            BigElement destElem;
652            %(readDest)s
653            %(op)s
654            destReg.elements[i] = htog(destElem);
655        }
656        ''' % { "op" : op, "readDest" : readDestCode }
657        for reg in range(rCount):
658            eWalkCode += '''
659        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
660        ''' % { "reg" : reg }
661        if rCount < 4:  # zero upper half
662            for reg in range(rCount, 4):
663                eWalkCode += '''
664        AA64FpDestP%(reg)d_uw = 0;
665        ''' % { "reg" : reg }
666        iop = InstObjParams(name, Name,
667                            "DataX1RegOp",
668                            { "code": eWalkCode,
669                              "r_count": rCount,
670                              "op_class": opClass }, [])
671        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
672        header_output += NeonX1RegOpDeclare.subst(iop)
673        exec_output += NeonXUnequalRegOpExecute.subst(iop)
674        for type in types:
675            substDict = { "targs" : type,
676                          "class_name" : Name }
677            exec_output += NeonXExecDeclare.subst(substDict)
678
679    def oneRegImmInstX(name, Name, opClass, types, rCount, op, readDest=False):
680        global header_output, exec_output
681        eWalkCode = simd64EnabledCheckCode + '''
682        RegVect destReg;
683        '''
684        if readDest:
685            for reg in range(rCount):
686                eWalkCode += '''
687        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
688        ''' % { "reg" : reg }
689        readDestCode = ''
690        if readDest:
691            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
692        eWalkCode += '''
693        for (unsigned i = 0; i < eCount; i++) {
694            Element destElem;
695            %(readDest)s
696            %(op)s
697            destReg.elements[i] = htog(destElem);
698        }
699        ''' % { "op" : op, "readDest" : readDestCode }
700        for reg in range(rCount):
701            eWalkCode += '''
702        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
703        ''' % { "reg" : reg }
704        if rCount < 4:  # zero upper half
705            for reg in range(rCount, 4):
706                eWalkCode += '''
707        AA64FpDestP%(reg)d_uw = 0;
708        ''' % { "reg" : reg }
709        iop = InstObjParams(name, Name,
710                            "DataXImmOnlyOp",
711                            { "code": eWalkCode,
712                              "r_count": rCount,
713                              "op_class": opClass }, [])
714        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
715        header_output += NeonX1RegImmOnlyOpDeclare.subst(iop)
716        exec_output += NeonXEqualRegOpExecute.subst(iop)
717        for type in types:
718            substDict = { "targs" : type,
719                          "class_name" : Name }
720            exec_output += NeonXExecDeclare.subst(substDict)
721
722    def dupGprInstX(name, Name, opClass, types, rCount, gprSpec):
723        global header_output, exec_output
724        eWalkCode = simd64EnabledCheckCode + '''
725        RegVect destReg;
726        for (unsigned i = 0; i < eCount; i++) {
727            destReg.elements[i] = htog((Element) %sOp1);
728        }
729        ''' % gprSpec
730        for reg in range(rCount):
731            eWalkCode += '''
732        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
733        ''' % { "reg" : reg }
734        if rCount < 4:  # zero upper half
735            for reg in range(rCount, 4):
736                eWalkCode += '''
737        AA64FpDestP%(reg)d_uw = 0;
738        ''' % { "reg" : reg }
739        iop = InstObjParams(name, Name,
740                            "DataX1RegOp",
741                            { "code": eWalkCode,
742                              "r_count": rCount,
743                              "op_class": opClass }, [])
744        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
745        header_output += NeonX1RegOpDeclare.subst(iop)
746        exec_output += NeonXEqualRegOpExecute.subst(iop)
747        for type in types:
748            substDict = { "targs" : type,
749                          "class_name" : Name }
750            exec_output += NeonXExecDeclare.subst(substDict)
751
752    def extInstX(name, Name, opClass, types, rCount, op):
753        global header_output, exec_output
754        eWalkCode = simd64EnabledCheckCode + '''
755        RegVect srcReg1, srcReg2, destReg;
756        '''
757        for reg in range(rCount):
758            eWalkCode += '''
759        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
760        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
761        ''' % { "reg" : reg }
762        eWalkCode += op
763        for reg in range(rCount):
764            eWalkCode += '''
765        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
766        ''' % { "reg" : reg }
767        if rCount < 4:  # zero upper half
768            for reg in range(rCount, 4):
769                eWalkCode += '''
770        AA64FpDestP%(reg)d_uw = 0;
771        ''' % { "reg" : reg }
772        iop = InstObjParams(name, Name,
773                            "DataX2RegImmOp",
774                            { "code": eWalkCode,
775                              "r_count": rCount,
776                              "op_class": opClass }, [])
777        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
778        header_output += NeonX2RegImmOpDeclare.subst(iop)
779        exec_output += NeonXEqualRegOpExecute.subst(iop)
780        for type in types:
781            substDict = { "targs" : type,
782                          "class_name" : Name }
783            exec_output += NeonXExecDeclare.subst(substDict)
784
785    def insFromGprInstX(name, Name, opClass, types, rCount, gprSpec):
786        global header_output, exec_output
787        eWalkCode = simd64EnabledCheckCode + '''
788        RegVect destReg;
789        '''
790        for reg in range(rCount):
791            eWalkCode += '''
792        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
793        ''' % { "reg" : reg }
794        eWalkCode += '''
795        destReg.elements[imm] = htog((Element) %sOp1);
796        ''' % gprSpec
797        for reg in range(rCount):
798            eWalkCode += '''
799        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
800        ''' % { "reg" : reg }
801        iop = InstObjParams(name, Name,
802                            "DataX1RegImmOp",
803                            { "code": eWalkCode,
804                              "r_count": rCount,
805                              "op_class": opClass }, [])
806        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
807        header_output += NeonX1RegImmOpDeclare.subst(iop)
808        exec_output += NeonXEqualRegOpExecute.subst(iop)
809        for type in types:
810            substDict = { "targs" : type,
811                          "class_name" : Name }
812            exec_output += NeonXExecDeclare.subst(substDict)
813
814    def insToGprInstX(name, Name, opClass, types, rCount, gprSpec,
815                      signExt=False):
816        global header_output, exec_output
817        eWalkCode = simd64EnabledCheckCode + '''
818        FullRegVect srcReg;
819        '''
820        for reg in range(4):
821            eWalkCode += '''
822        srcReg.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
823        ''' % { "reg" : reg }
824        if signExt:
825            eWalkCode += '''
826        %sDest = sext<sizeof(Element) * 8>(srcReg.elements[imm]);
827        ''' % gprSpec
828        else:
829            eWalkCode += '''
830        %sDest = srcReg.elements[imm];
831        ''' % gprSpec
832        iop = InstObjParams(name, Name,
833                            "DataX1RegImmOp",
834                            { "code": eWalkCode,
835                              "r_count": rCount,
836                              "op_class": opClass }, [])
837        header_output += NeonX1RegImmOpDeclare.subst(iop)
838        exec_output += NeonXEqualRegOpExecute.subst(iop)
839        for type in types:
840            substDict = { "targs" : type,
841                          "class_name" : Name }
842            exec_output += NeonXExecDeclare.subst(substDict)
843
844    def tbxTblInstX(name, Name, opClass, types, length, isTbl, rCount):
845        global header_output, decoder_output, exec_output
846        code = simd64EnabledCheckCode + '''
847        union
848        {
849            uint8_t bytes[64];
850            uint32_t regs[16];
851        } table;
852
853        union
854        {
855            uint8_t bytes[%(rCount)d * 4];
856            uint32_t regs[%(rCount)d];
857        } destReg, srcReg2;
858
859        const unsigned length = %(length)d;
860        const bool isTbl = %(isTbl)s;
861        ''' % { "rCount" : rCount, "length" : length, "isTbl" : isTbl }
862        for reg in range(rCount):
863            code += '''
864        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
865        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
866        ''' % { "reg" : reg }
867        for reg in range(16):
868            if reg < length * 4:
869                code += '''
870        table.regs[%(reg)d] = htog(AA64FpOp1P%(p)dV%(v)dS_uw);
871        ''' % { "reg" : reg, "p" : reg % 4, "v" : reg / 4 }
872            else:
873                code += '''
874        table.regs[%(reg)d] = 0;
875        ''' % { "reg" : reg }
876        code += '''
877        for (unsigned i = 0; i < sizeof(destReg); i++) {
878            uint8_t index = srcReg2.bytes[i];
879            if (index < 16 * length) {
880                destReg.bytes[i] = table.bytes[index];
881            } else {
882                if (isTbl)
883                    destReg.bytes[i] = 0;
884                // else destReg.bytes[i] unchanged
885            }
886        }
887        '''
888        for reg in range(rCount):
889            code += '''
890        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
891        ''' % { "reg" : reg }
892        if rCount < 4:  # zero upper half
893            for reg in range(rCount, 4):
894                code += '''
895        AA64FpDestP%(reg)d_uw = 0;
896        ''' % { "reg" : reg }
897        iop = InstObjParams(name, Name,
898                            "DataX2RegOp",
899                            { "code": code,
900                              "r_count": rCount,
901                              "op_class": opClass }, [])
902        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
903        header_output += NeonX2RegOpDeclare.subst(iop)
904        exec_output += NeonXEqualRegOpExecute.subst(iop)
905        for type in types:
906            substDict = { "targs" : type,
907                          "class_name" : Name }
908            exec_output += NeonXExecDeclare.subst(substDict)
909
910    # ABS
911    absCode = '''
912            if (srcElem1 < 0) {
913                destElem = -srcElem1;
914            } else {
915                destElem = srcElem1;
916            }
917    '''
918    twoEqualRegInstX("abs", "AbsDX", "SimdAluOp", signedTypes, 2, absCode)
919    twoEqualRegInstX("abs", "AbsQX", "SimdAluOp", signedTypes, 4, absCode)
920    # ADD
921    addCode = "destElem = srcElem1 + srcElem2;"
922    threeEqualRegInstX("add", "AddDX", "SimdAddOp", unsignedTypes, 2, addCode)
923    threeEqualRegInstX("add", "AddQX", "SimdAddOp", unsignedTypes, 4, addCode)
924    # ADDHN, ADDHN2
925    addhnCode = '''
926            destElem = ((BigElement)srcElem1 + (BigElement)srcElem2) >>
927                        (sizeof(Element) * 8);
928    '''
929    threeRegNarrowInstX("addhn", "AddhnX", "SimdAddOp", smallUnsignedTypes,
930                        addhnCode)
931    threeRegNarrowInstX("addhn2", "Addhn2X", "SimdAddOp", smallUnsignedTypes,
932                        addhnCode, hi=True)
933    # ADDP (scalar)
934    twoRegPairwiseScInstX("addp", "AddpScQX", "SimdAddOp", ("uint64_t",), 4,
935                          addCode)
936    # ADDP (vector)
937    threeEqualRegInstX("addp", "AddpDX", "SimdAddOp", smallUnsignedTypes, 2,
938                       addCode, pairwise=True)
939    threeEqualRegInstX("addp", "AddpQX", "SimdAddOp", unsignedTypes, 4,
940                       addCode, pairwise=True)
941    # ADDV
942    # Note: SimdAddOp can be a bit optimistic here
943    addAcrossCode = "destElem += srcElem1;"
944    twoRegAcrossInstX("addv", "AddvDX", "SimdAddOp", ("uint8_t", "uint16_t"),
945                      2, addAcrossCode)
946    twoRegAcrossInstX("addv", "AddvQX", "SimdAddOp", smallUnsignedTypes, 4,
947                      addAcrossCode)
948    # AND
949    andCode = "destElem = srcElem1 & srcElem2;"
950    threeEqualRegInstX("and", "AndDX", "SimdAluOp", ("uint64_t",), 2, andCode)
951    threeEqualRegInstX("and", "AndQX", "SimdAluOp", ("uint64_t",), 4, andCode)
952    # BIC (immediate)
953    bicImmCode = "destElem &= ~imm;"
954    oneRegImmInstX("bic", "BicImmDX", "SimdAluOp", ("uint64_t",), 2,
955                   bicImmCode, True)
956    oneRegImmInstX("bic", "BicImmQX", "SimdAluOp", ("uint64_t",), 4,
957                   bicImmCode, True)
958    # BIC (register)
959    bicCode = "destElem = srcElem1 & ~srcElem2;"
960    threeEqualRegInstX("bic", "BicDX", "SimdAluOp", ("uint64_t",), 2, bicCode)
961    threeEqualRegInstX("bic", "BicQX", "SimdAluOp", ("uint64_t",), 4, bicCode)
962    # BIF
963    bifCode = "destElem = (destElem & srcElem2) | (srcElem1 & ~srcElem2);"
964    threeEqualRegInstX("bif", "BifDX", "SimdAluOp", ("uint64_t",), 2, bifCode,
965                       True)
966    threeEqualRegInstX("bif", "BifQX", "SimdAluOp", ("uint64_t",), 4, bifCode,
967                       True)
968    # BIT
969    bitCode = "destElem = (srcElem1 & srcElem2) | (destElem & ~srcElem2);"
970    threeEqualRegInstX("bit", "BitDX", "SimdAluOp", ("uint64_t",), 2, bitCode,
971                       True)
972    threeEqualRegInstX("bit", "BitQX", "SimdAluOp", ("uint64_t",), 4, bitCode,
973                       True)
974    # BSL
975    bslCode = "destElem = (srcElem1 & destElem) | (srcElem2 & ~destElem);"
976    threeEqualRegInstX("bsl", "BslDX", "SimdAluOp", ("uint64_t",), 2, bslCode,
977                       True)
978    threeEqualRegInstX("bsl", "BslQX", "SimdAluOp", ("uint64_t",), 4, bslCode,
979                       True)
980    # CLS
981    clsCode = '''
982            unsigned count = 0;
983            if (srcElem1 < 0) {
984                srcElem1 <<= 1;
985                while (srcElem1 < 0 && count < sizeof(Element) * 8 - 1) {
986                    count++;
987                    srcElem1 <<= 1;
988                }
989            } else {
990                srcElem1 <<= 1;
991                while (srcElem1 >= 0 && count < sizeof(Element) * 8 - 1) {
992                    count++;
993                    srcElem1 <<= 1;
994                }
995            }
996            destElem = count;
997    '''
998    twoEqualRegInstX("cls", "ClsDX", "SimdAluOp", smallSignedTypes, 2, clsCode)
999    twoEqualRegInstX("cls", "ClsQX", "SimdAluOp", smallSignedTypes, 4, clsCode)
1000    # CLZ
1001    clzCode = '''
1002            unsigned count = 0;
1003            while (srcElem1 >= 0 && count < sizeof(Element) * 8) {
1004                count++;
1005                srcElem1 <<= 1;
1006            }
1007            destElem = count;
1008    '''
1009    twoEqualRegInstX("clz", "ClzDX", "SimdAluOp", smallSignedTypes, 2, clzCode)
1010    twoEqualRegInstX("clz", "ClzQX", "SimdAluOp", smallSignedTypes, 4, clzCode)
1011    # CMEQ (register)
1012    cmeqCode = "destElem = (srcElem1 == srcElem2) ? (Element)(-1) : 0;"
1013    threeEqualRegInstX("cmeq", "CmeqDX", "SimdCmpOp", unsignedTypes, 2,
1014                       cmeqCode)
1015    threeEqualRegInstX("cmeq", "CmeqQX", "SimdCmpOp", unsignedTypes, 4,
1016                       cmeqCode)
1017    # CMEQ (zero)
1018    cmeqZeroCode = "destElem = (srcElem1 == 0) ? (Element)(-1) : 0;"
1019    twoEqualRegInstX("cmeq", "CmeqZeroDX", "SimdCmpOp", signedTypes, 2,
1020                     cmeqZeroCode)
1021    twoEqualRegInstX("cmeq", "CmeqZeroQX", "SimdCmpOp", signedTypes, 4,
1022                     cmeqZeroCode)
1023    # CMGE (register)
1024    cmgeCode = "destElem = (srcElem1 >= srcElem2) ? (Element)(-1) : 0;"
1025    threeEqualRegInstX("cmge", "CmgeDX", "SimdCmpOp", signedTypes, 2, cmgeCode)
1026    threeEqualRegInstX("cmge", "CmgeQX", "SimdCmpOp", signedTypes, 4, cmgeCode)
1027    # CMGE (zero)
1028    cmgeZeroCode = "destElem = (srcElem1 >= 0) ? (Element)(-1) : 0;"
1029    twoEqualRegInstX("cmge", "CmgeZeroDX", "SimdCmpOp", signedTypes, 2,
1030                     cmgeZeroCode)
1031    twoEqualRegInstX("cmge", "CmgeZeroQX", "SimdCmpOp", signedTypes, 4,
1032                     cmgeZeroCode)
1033    # CMGT (register)
1034    cmgtCode = "destElem = (srcElem1 > srcElem2) ? (Element)(-1) : 0;"
1035    threeEqualRegInstX("cmgt", "CmgtDX", "SimdCmpOp", signedTypes, 2, cmgtCode)
1036    threeEqualRegInstX("cmgt", "CmgtQX", "SimdCmpOp", signedTypes, 4, cmgtCode)
1037    # CMGT (zero)
1038    cmgtZeroCode = "destElem = (srcElem1 > 0) ? (Element)(-1) : 0;"
1039    twoEqualRegInstX("cmgt", "CmgtZeroDX", "SimdCmpOp", signedTypes, 2,
1040                     cmgtZeroCode)
1041    twoEqualRegInstX("cmgt", "CmgtZeroQX", "SimdCmpOp", signedTypes, 4,
1042                     cmgtZeroCode)
1043    # CMHI (register)
1044    threeEqualRegInstX("cmhi", "CmhiDX", "SimdCmpOp", unsignedTypes, 2,
1045                       cmgtCode)
1046    threeEqualRegInstX("cmhi", "CmhiQX", "SimdCmpOp", unsignedTypes, 4,
1047                       cmgtCode)
1048    # CMHS (register)
1049    threeEqualRegInstX("cmhs", "CmhsDX", "SimdCmpOp", unsignedTypes, 2,
1050                       cmgeCode)
1051    threeEqualRegInstX("cmhs", "CmhsQX", "SimdCmpOp", unsignedTypes, 4,
1052                       cmgeCode)
1053    # CMLE (zero)
1054    cmleZeroCode = "destElem = (srcElem1 <= 0) ? (Element)(-1) : 0;"
1055    twoEqualRegInstX("cmle", "CmleZeroDX", "SimdCmpOp", signedTypes, 2,
1056                     cmleZeroCode)
1057    twoEqualRegInstX("cmle", "CmleZeroQX", "SimdCmpOp", signedTypes, 4,
1058                     cmleZeroCode)
1059    # CMLT (zero)
1060    cmltZeroCode = "destElem = (srcElem1 < 0) ? (Element)(-1) : 0;"
1061    twoEqualRegInstX("cmlt", "CmltZeroDX", "SimdCmpOp", signedTypes, 2,
1062                     cmltZeroCode)
1063    twoEqualRegInstX("cmlt", "CmltZeroQX", "SimdCmpOp", signedTypes, 4,
1064                     cmltZeroCode)
1065    # CMTST (register)
1066    tstCode = "destElem = (srcElem1 & srcElem2) ? (Element)(-1) : 0;"
1067    threeEqualRegInstX("cmtst", "CmtstDX", "SimdAluOp", unsignedTypes, 2,
1068                       tstCode)
1069    threeEqualRegInstX("cmtst", "CmtstQX", "SimdAluOp", unsignedTypes, 4,
1070                       tstCode)
1071    # CNT
1072    cntCode = '''
1073            unsigned count = 0;
1074            while (srcElem1 && count < sizeof(Element) * 8) {
1075                count += srcElem1 & 0x1;
1076                srcElem1 >>= 1;
1077            }
1078            destElem = count;
1079    '''
1080    twoEqualRegInstX("cnt", "CntDX", "SimdAluOp", ("uint8_t",), 2, cntCode)
1081    twoEqualRegInstX("cnt", "CntQX", "SimdAluOp", ("uint8_t",), 4, cntCode)
1082    # DUP (element)
1083    dupCode = "destElem = srcElem1;"
1084    twoEqualRegInstX("dup", "DupElemDX", "SimdMiscOp", smallUnsignedTypes, 2,
1085                     dupCode, isDup=True, byElem=True)
1086    twoEqualRegInstX("dup", "DupElemQX", "SimdMiscOp", unsignedTypes, 4,
1087                     dupCode, isDup=True, byElem=True)
1088    twoEqualRegInstX("dup", "DupElemScX", "SimdMiscOp", unsignedTypes, 4,
1089                     dupCode, isDup=True, byElem=True, scalar=True)
1090    # DUP (general register)
1091    dupGprInstX("dup", "DupGprWDX", "SimdMiscOp", smallUnsignedTypes, 2, 'W')
1092    dupGprInstX("dup", "DupGprWQX", "SimdMiscOp", smallUnsignedTypes, 4, 'W')
1093    dupGprInstX("dup", "DupGprXQX", "SimdMiscOp", ("uint64_t",), 4, 'X')
1094    # EOR
1095    eorCode = "destElem = srcElem1 ^ srcElem2;"
1096    threeEqualRegInstX("eor", "EorDX", "SimdAluOp", ("uint64_t",), 2, eorCode)
1097    threeEqualRegInstX("eor", "EorQX", "SimdAluOp", ("uint64_t",), 4, eorCode)
1098    # EXT
1099    extCode = '''
1100            for (unsigned i = 0; i < eCount; i++) {
1101                unsigned index = i + imm;
1102                if (index < eCount) {
1103                    destReg.elements[i] = srcReg1.elements[index];
1104                } else {
1105                    index -= eCount;
1106                    if (index >= eCount) {
1107                        fault = std::make_shared<UndefinedInstruction>(
1108                                      machInst, false, mnemonic);
1109                    } else {
1110                        destReg.elements[i] = srcReg2.elements[index];
1111                    }
1112                }
1113            }
1114    '''
1115    extInstX("Ext", "ExtDX", "SimdMiscOp", ("uint8_t",), 2, extCode)
1116    extInstX("Ext", "ExtQX", "SimdMiscOp", ("uint8_t",), 4, extCode)
1117    # FABD
1118    fpOp = '''
1119            FPSCR fpscr = (FPSCR) FpscrExc;
1120            destElem = %s;
1121            FpscrExc = fpscr;
1122    '''
1123    fabdCode = fpOp % "fplibAbs<Element>(fplibSub(srcElem1, srcElem2, fpscr))"
1124    threeEqualRegInstX("fabd", "FabdDX", "SimdFloatAddOp", smallFloatTypes, 2,
1125                       fabdCode)
1126    threeEqualRegInstX("fabd", "FabdQX", "SimdFloatAddOp", floatTypes, 4,
1127                       fabdCode)
1128    threeEqualRegInstX("fabd", "FabdScX", "SimdFloatAddOp", floatTypes, 4,
1129                       fabdCode, scalar=True)
1130    # FABS
1131    fabsCode = fpOp % "fplibAbs<Element>(srcElem1)"
1132    twoEqualRegInstX("Abs", "FabsDX", "SimdFloatAluOp", smallFloatTypes, 2,
1133                     fabsCode)
1134    twoEqualRegInstX("Abs", "FabsQX", "SimdFloatAluOp", floatTypes, 4,
1135                     fabsCode)
1136    # FACGE
1137    fpCmpAbsOp = fpOp % ("fplibCompare%s<Element>(fplibAbs<Element>(srcElem1),"
1138                         " fplibAbs<Element>(srcElem2), fpscr) ? -1 : 0")
1139    facgeCode = fpCmpAbsOp % "GE"
1140    threeEqualRegInstX("facge", "FacgeDX", "SimdFloatCmpOp", smallFloatTypes,
1141                       2, facgeCode)
1142    threeEqualRegInstX("facge", "FacgeQX", "SimdFloatCmpOp", floatTypes, 4,
1143                       facgeCode)
1144    threeEqualRegInstX("facge", "FacgeScX", "SimdFloatCmpOp", floatTypes, 4,
1145                       facgeCode, scalar=True)
1146    # FACGT
1147    facgtCode = fpCmpAbsOp % "GT"
1148    threeEqualRegInstX("facgt", "FacgtDX", "SimdFloatCmpOp", smallFloatTypes,
1149                       2, facgtCode)
1150    threeEqualRegInstX("facgt", "FacgtQX", "SimdFloatCmpOp", floatTypes, 4,
1151                       facgtCode)
1152    threeEqualRegInstX("facgt", "FacgtScX", "SimdFloatCmpOp", floatTypes, 4,
1153                       facgtCode, scalar=True)
1154    # FADD
1155    fpBinOp = fpOp % "fplib%s<Element>(srcElem1, srcElem2, fpscr)"
1156    faddCode = fpBinOp % "Add"
1157    threeEqualRegInstX("fadd", "FaddDX", "SimdFloatAddOp", smallFloatTypes, 2,
1158                       faddCode)
1159    threeEqualRegInstX("fadd", "FaddQX", "SimdFloatAddOp", floatTypes, 4,
1160                       faddCode)
1161    # FADDP (scalar)
1162    twoRegPairwiseScInstX("faddp", "FaddpScDX", "SimdFloatAddOp",
1163                          ("uint32_t",), 2, faddCode)
1164    twoRegPairwiseScInstX("faddp", "FaddpScQX", "SimdFloatAddOp",
1165                          ("uint64_t",), 4, faddCode)
1166    # FADDP (vector)
1167    threeEqualRegInstX("faddp", "FaddpDX", "SimdFloatAddOp", smallFloatTypes,
1168                       2, faddCode, pairwise=True)
1169    threeEqualRegInstX("faddp", "FaddpQX", "SimdFloatAddOp", floatTypes, 4,
1170                       faddCode, pairwise=True)
1171    # FCMEQ (register)
1172    fpCmpOp = fpOp % ("fplibCompare%s<Element>(srcElem1, srcElem2, fpscr) ?"
1173                      " -1 : 0")
1174    fcmeqCode = fpCmpOp % "EQ"
1175    threeEqualRegInstX("fcmeq", "FcmeqDX", "SimdFloatCmpOp", smallFloatTypes,
1176                       2, fcmeqCode)
1177    threeEqualRegInstX("fcmeq", "FcmeqQX", "SimdFloatCmpOp", floatTypes, 4,
1178                       fcmeqCode)
1179    threeEqualRegInstX("fcmeq", "FcmeqScX", "SimdFloatCmpOp", floatTypes, 4,
1180                       fcmeqCode, scalar=True)
1181    # FCMEQ (zero)
1182    fpCmpZeroOp = fpOp % "fplibCompare%s<Element>(srcElem1, 0, fpscr) ? -1 : 0"
1183    fcmeqZeroCode = fpCmpZeroOp % "EQ"
1184    twoEqualRegInstX("fcmeq", "FcmeqZeroDX", "SimdFloatCmpOp", smallFloatTypes,
1185                     2, fcmeqZeroCode)
1186    twoEqualRegInstX("fcmeq", "FcmeqZeroQX", "SimdFloatCmpOp", floatTypes, 4,
1187                     fcmeqZeroCode)
1188    twoEqualRegInstX("fcmeq", "FcmeqZeroScX", "SimdFloatCmpOp", floatTypes, 4,
1189                     fcmeqZeroCode, scalar=True)
1190    # FCMGE (register)
1191    fcmgeCode = fpCmpOp % "GE"
1192    threeEqualRegInstX("fcmge", "FcmgeDX", "SimdFloatCmpOp", smallFloatTypes,
1193                       2, fcmgeCode)
1194    threeEqualRegInstX("fcmge", "FcmgeQX", "SimdFloatCmpOp", floatTypes, 4,
1195                       fcmgeCode)
1196    threeEqualRegInstX("fcmge", "FcmgeScX", "SimdFloatCmpOp", floatTypes, 4,
1197                       fcmgeCode, scalar=True)
1198    # FCMGE (zero)
1199    fcmgeZeroCode = fpCmpZeroOp % "GE"
1200    twoEqualRegInstX("fcmge", "FcmgeZeroDX", "SimdFloatCmpOp", smallFloatTypes,
1201                     2, fcmgeZeroCode)
1202    twoEqualRegInstX("fcmge", "FcmgeZeroQX", "SimdFloatCmpOp", floatTypes, 4,
1203                     fcmgeZeroCode)
1204    twoEqualRegInstX("fcmge", "FcmgeZeroScX", "SimdFloatCmpOp", floatTypes, 4,
1205                     fcmgeZeroCode, scalar=True)
1206    # FCMGT (register)
1207    fcmgtCode = fpCmpOp % "GT"
1208    threeEqualRegInstX("fcmgt", "FcmgtDX", "SimdFloatCmpOp", smallFloatTypes,
1209                       2, fcmgtCode)
1210    threeEqualRegInstX("fcmgt", "FcmgtQX", "SimdFloatCmpOp", floatTypes, 4,
1211                       fcmgtCode)
1212    threeEqualRegInstX("fcmgt", "FcmgtScX", "SimdFloatCmpOp", floatTypes, 4,
1213                       fcmgtCode, scalar=True)
1214    # FCMGT (zero)
1215    fcmgtZeroCode = fpCmpZeroOp % "GT"
1216    twoEqualRegInstX("fcmgt", "FcmgtZeroDX", "SimdFloatCmpOp", smallFloatTypes,
1217                     2, fcmgtZeroCode)
1218    twoEqualRegInstX("fcmgt", "FcmgtZeroQX", "SimdFloatCmpOp", floatTypes, 4,
1219                     fcmgtZeroCode)
1220    twoEqualRegInstX("fcmgt", "FcmgtZeroScX", "SimdFloatCmpOp", floatTypes, 4,
1221                     fcmgtZeroCode, scalar=True)
1222    # FCMLE (zero)
1223    fpCmpRevZeroOp = fpOp % ("fplibCompare%s<Element>(0, srcElem1, fpscr) ?"
1224                             " -1 : 0")
1225    fcmleZeroCode = fpCmpRevZeroOp % "GE"
1226    twoEqualRegInstX("fcmle", "FcmleZeroDX", "SimdFloatCmpOp", smallFloatTypes,
1227                     2, fcmleZeroCode)
1228    twoEqualRegInstX("fcmle", "FcmleZeroQX", "SimdFloatCmpOp", floatTypes, 4,
1229                     fcmleZeroCode)
1230    twoEqualRegInstX("fcmle", "FcmleZeroScX", "SimdFloatCmpOp", floatTypes, 4,
1231                     fcmleZeroCode, scalar=True)
1232    # FCMLT (zero)
1233    fcmltZeroCode = fpCmpRevZeroOp % "GT"
1234    twoEqualRegInstX("fcmlt", "FcmltZeroDX", "SimdFloatCmpOp", smallFloatTypes,
1235                     2, fcmltZeroCode)
1236    twoEqualRegInstX("fcmlt", "FcmltZeroQX", "SimdFloatCmpOp", floatTypes, 4,
1237                     fcmltZeroCode)
1238    twoEqualRegInstX("fcmlt", "FcmltZeroScX", "SimdFloatCmpOp", floatTypes, 4,
1239                     fcmltZeroCode, scalar=True)
1240    # FCVTAS
1241    fcvtCode = fpOp % ("fplibFPToFixed<Element, Element>("
1242                       "srcElem1, %s, %s, %s, fpscr)")
1243    fcvtasCode = fcvtCode % ("0", "false", "FPRounding_TIEAWAY")
1244    twoEqualRegInstX("fcvtas", "FcvtasDX", "SimdCvtOp", smallFloatTypes, 2,
1245                     fcvtasCode)
1246    twoEqualRegInstX("fcvtas", "FcvtasQX", "SimdCvtOp", floatTypes, 4,
1247                     fcvtasCode)
1248    twoEqualRegInstX("fcvtas", "FcvtasScX", "SimdCvtOp", floatTypes, 4,
1249                     fcvtasCode, scalar=True)
1250    # FCVTAU
1251    fcvtauCode = fcvtCode % ("0", "true", "FPRounding_TIEAWAY")
1252    twoEqualRegInstX("fcvtau", "FcvtauDX", "SimdCvtOp", smallFloatTypes, 2,
1253                     fcvtauCode)
1254    twoEqualRegInstX("fcvtau", "FcvtauQX", "SimdCvtOp", floatTypes, 4,
1255                     fcvtauCode)
1256    twoEqualRegInstX("fcvtau", "FcvtauScX", "SimdCvtOp", floatTypes, 4,
1257                     fcvtauCode, scalar=True)
1258    # FCVTL, FCVTL2
1259    fcvtlCode = fpOp % ("fplibConvert<Element, BigElement>("
1260                        "srcElem1, FPCRRounding(fpscr), fpscr)")
1261    twoRegLongInstX("fcvtl", "FcvtlX", "SimdCvtOp", ("uint16_t", "uint32_t"),
1262                    fcvtlCode)
1263    twoRegLongInstX("fcvtl", "Fcvtl2X", "SimdCvtOp", ("uint16_t", "uint32_t"),
1264                    fcvtlCode, hi=True)
1265    # FCVTMS
1266    fcvtmsCode = fcvtCode % ("0", "false", "FPRounding_NEGINF")
1267    twoEqualRegInstX("fcvtms", "FcvtmsDX", "SimdCvtOp", smallFloatTypes, 2,
1268                     fcvtmsCode)
1269    twoEqualRegInstX("fcvtms", "FcvtmsQX", "SimdCvtOp", floatTypes, 4,
1270                     fcvtmsCode)
1271    twoEqualRegInstX("fcvtms", "FcvtmsScX", "SimdCvtOp", floatTypes, 4,
1272                     fcvtmsCode, scalar=True)
1273    # FCVTMU
1274    fcvtmuCode = fcvtCode % ("0", "true", "FPRounding_NEGINF")
1275    twoEqualRegInstX("fcvtmu", "FcvtmuDX", "SimdCvtOp", smallFloatTypes, 2,
1276                     fcvtmuCode)
1277    twoEqualRegInstX("fcvtmu", "FcvtmuQX", "SimdCvtOp", floatTypes, 4,
1278                     fcvtmuCode)
1279    twoEqualRegInstX("fcvtmu", "FcvtmuScX", "SimdCvtOp", floatTypes, 4,
1280                     fcvtmuCode, scalar=True)
1281    # FCVTN, FCVTN2
1282    fcvtnCode = fpOp % ("fplibConvert<BigElement, Element>("
1283                        "srcElem1, FPCRRounding(fpscr), fpscr)")
1284    twoRegNarrowInstX("fcvtn", "FcvtnX", "SimdCvtOp",
1285                      ("uint16_t", "uint32_t"), fcvtnCode)
1286    twoRegNarrowInstX("fcvtn", "Fcvtn2X", "SimdCvtOp",
1287                      ("uint16_t", "uint32_t"), fcvtnCode, hi=True)
1288    # FCVTNS
1289    fcvtnsCode = fcvtCode % ("0", "false", "FPRounding_TIEEVEN")
1290    twoEqualRegInstX("fcvtns", "FcvtnsDX", "SimdCvtOp", smallFloatTypes, 2,
1291                     fcvtnsCode)
1292    twoEqualRegInstX("fcvtns", "FcvtnsQX", "SimdCvtOp", floatTypes, 4,
1293                     fcvtnsCode)
1294    twoEqualRegInstX("fcvtns", "FcvtnsScX", "SimdCvtOp", floatTypes, 4,
1295                     fcvtnsCode, scalar=True)
1296    # FCVTNU
1297    fcvtnuCode = fcvtCode % ("0", "true", "FPRounding_TIEEVEN")
1298    twoEqualRegInstX("fcvtnu", "FcvtnuDX", "SimdCvtOp", smallFloatTypes, 2,
1299                     fcvtnuCode)
1300    twoEqualRegInstX("fcvtnu", "FcvtnuQX", "SimdCvtOp", floatTypes, 4,
1301                     fcvtnuCode)
1302    twoEqualRegInstX("fcvtnu", "FcvtnuScX", "SimdCvtOp", floatTypes, 4,
1303                     fcvtnuCode, scalar=True)
1304    # FCVTPS
1305    fcvtpsCode = fcvtCode % ("0", "false", "FPRounding_POSINF")
1306    twoEqualRegInstX("fcvtps", "FcvtpsDX", "SimdCvtOp", smallFloatTypes, 2,
1307                     fcvtpsCode)
1308    twoEqualRegInstX("fcvtps", "FcvtpsQX", "SimdCvtOp", floatTypes, 4,
1309                     fcvtpsCode)
1310    twoEqualRegInstX("fcvtps", "FcvtpsScX", "SimdCvtOp", floatTypes, 4,
1311                     fcvtpsCode, scalar=True)
1312    # FCVTPU
1313    fcvtpuCode = fcvtCode % ("0", "true", "FPRounding_POSINF")
1314    twoEqualRegInstX("fcvtpu", "FcvtpuDX", "SimdCvtOp", smallFloatTypes, 2,
1315                     fcvtpuCode)
1316    twoEqualRegInstX("fcvtpu", "FcvtpuQX", "SimdCvtOp", floatTypes, 4,
1317                     fcvtpuCode)
1318    twoEqualRegInstX("fcvtpu", "FcvtpuScX", "SimdCvtOp", floatTypes, 4,
1319                     fcvtpuCode, scalar=True)
1320    # FCVTXN, FCVTXN2
1321    fcvtxnCode = fpOp % ("fplibConvert<BigElement, Element>("
1322                         "srcElem1, FPRounding_ODD, fpscr)")
1323    twoRegNarrowInstX("fcvtxn", "FcvtxnX", "SimdCvtOp", smallFloatTypes,
1324                      fcvtxnCode)
1325    twoRegNarrowInstX("fcvtxn", "Fcvtxn2X", "SimdCvtOp", smallFloatTypes,
1326                      fcvtxnCode, hi=True)
1327    twoRegNarrowInstX("fcvtxn", "FcvtxnScX", "SimdCvtOp", smallFloatTypes,
1328                      fcvtxnCode, scalar=True)
1329    # FCVTZS (fixed-point)
1330    fcvtzsCode = fcvtCode % ("imm", "false", "FPRounding_ZERO")
1331    twoEqualRegInstX("fcvtzs", "FcvtzsFixedDX", "SimdCvtOp", smallFloatTypes,
1332                     2, fcvtzsCode, hasImm=True)
1333    twoEqualRegInstX("fcvtzs", "FcvtzsFixedQX", "SimdCvtOp", floatTypes, 4,
1334                     fcvtzsCode, hasImm=True)
1335    twoEqualRegInstX("fcvtzs", "FcvtzsFixedScX", "SimdCvtOp", floatTypes, 4,
1336                     fcvtzsCode, hasImm=True, scalar=True)
1337    # FCVTZS (integer)
1338    fcvtzsIntCode = fcvtCode % ("0", "false", "FPRounding_ZERO")
1339    twoEqualRegInstX("fcvtzs", "FcvtzsIntDX", "SimdCvtOp", smallFloatTypes,
1340                     2, fcvtzsIntCode)
1341    twoEqualRegInstX("fcvtzs", "FcvtzsIntQX", "SimdCvtOp", floatTypes, 4,
1342                     fcvtzsIntCode)
1343    twoEqualRegInstX("fcvtzs", "FcvtzsIntScX", "SimdCvtOp", floatTypes, 4,
1344                     fcvtzsIntCode, scalar=True)
1345    # FCVTZU (fixed-point)
1346    fcvtzuCode = fcvtCode % ("imm", "true", "FPRounding_ZERO")
1347    twoEqualRegInstX("fcvtzu", "FcvtzuFixedDX", "SimdCvtOp", smallFloatTypes,
1348                     2, fcvtzuCode, hasImm=True)
1349    twoEqualRegInstX("fcvtzu", "FcvtzuFixedQX", "SimdCvtOp", floatTypes, 4,
1350                     fcvtzuCode, hasImm=True)
1351    twoEqualRegInstX("fcvtzu", "FcvtzuFixedScX", "SimdCvtOp", floatTypes, 4,
1352                     fcvtzuCode, hasImm=True, scalar=True)
1353    # FCVTZU (integer)
1354    fcvtzuIntCode = fcvtCode % ("0", "true", "FPRounding_ZERO")
1355    twoEqualRegInstX("fcvtzu", "FcvtzuIntDX", "SimdCvtOp", smallFloatTypes, 2,
1356                     fcvtzuIntCode)
1357    twoEqualRegInstX("fcvtzu", "FcvtzuIntQX", "SimdCvtOp", floatTypes, 4,
1358                     fcvtzuIntCode)
1359    twoEqualRegInstX("fcvtzu", "FcvtzuIntScX", "SimdCvtOp", floatTypes, 4,
1360                     fcvtzuIntCode, scalar=True)
1361    # FDIV
1362    fdivCode = fpBinOp % "Div"
1363    threeEqualRegInstX("fdiv", "FdivDX", "SimdFloatDivOp", smallFloatTypes, 2,
1364                       fdivCode)
1365    threeEqualRegInstX("fdiv", "FdivQX", "SimdFloatDivOp", floatTypes, 4,
1366                       fdivCode)
1367    # FMAX
1368    fmaxCode = fpBinOp % "Max"
1369    threeEqualRegInstX("fmax", "FmaxDX", "SimdFloatCmpOp", smallFloatTypes, 2,
1370                       fmaxCode)
1371    threeEqualRegInstX("fmax", "FmaxQX", "SimdFloatCmpOp", floatTypes, 4,
1372                       fmaxCode)
1373    # FMAXNM
1374    fmaxnmCode = fpBinOp % "MaxNum"
1375    threeEqualRegInstX("fmaxnm", "FmaxnmDX", "SimdFloatCmpOp", smallFloatTypes,
1376                       2, fmaxnmCode)
1377    threeEqualRegInstX("fmaxnm", "FmaxnmQX", "SimdFloatCmpOp", floatTypes, 4,
1378                       fmaxnmCode)
1379    # FMAXNMP (scalar)
1380    twoRegPairwiseScInstX("fmaxnmp", "FmaxnmpScDX", "SimdFloatCmpOp",
1381                          ("uint32_t",), 2, fmaxnmCode)
1382    twoRegPairwiseScInstX("fmaxnmp", "FmaxnmpScQX", "SimdFloatCmpOp",
1383                          ("uint64_t",), 4, fmaxnmCode)
1384    # FMAXNMP (vector)
1385    threeEqualRegInstX("fmaxnmp", "FmaxnmpDX", "SimdFloatCmpOp",
1386                       smallFloatTypes, 2, fmaxnmCode, pairwise=True)
1387    threeEqualRegInstX("fmaxnmp", "FmaxnmpQX", "SimdFloatCmpOp", floatTypes, 4,
1388                       fmaxnmCode, pairwise=True)
1389    # FMAXNMV
1390    # Note: SimdFloatCmpOp can be a bit optimistic here
1391    fpAcrossOp = fpOp % "fplib%s<Element>(destElem, srcElem1, fpscr)"
1392    fmaxnmAcrossCode = fpAcrossOp % "MaxNum"
1393    twoRegAcrossInstX("fmaxnmv", "FmaxnmvQX", "SimdFloatCmpOp", ("uint32_t",),
1394                      4, fmaxnmAcrossCode)
1395    # FMAXP (scalar)
1396    twoRegPairwiseScInstX("fmaxp", "FmaxpScDX", "SimdFloatCmpOp",
1397                          ("uint32_t",), 2, fmaxCode)
1398    twoRegPairwiseScInstX("fmaxp", "FmaxpScQX", "SimdFloatCmpOp",
1399                          ("uint64_t",), 4, fmaxCode)
1400    # FMAXP (vector)
1401    threeEqualRegInstX("fmaxp", "FmaxpDX", "SimdFloatCmpOp", smallFloatTypes,
1402                       2, fmaxCode, pairwise=True)
1403    threeEqualRegInstX("fmaxp", "FmaxpQX", "SimdFloatCmpOp", floatTypes, 4,
1404                       fmaxCode, pairwise=True)
1405    # FMAXV
1406    # Note: SimdFloatCmpOp can be a bit optimistic here
1407    fmaxAcrossCode = fpAcrossOp % "Max"
1408    twoRegAcrossInstX("fmaxv", "FmaxvQX", "SimdFloatCmpOp", ("uint32_t",), 4,
1409                      fmaxAcrossCode)
1410    # FMIN
1411    fminCode = fpBinOp % "Min"
1412    threeEqualRegInstX("fmin", "FminDX", "SimdFloatCmpOp", smallFloatTypes, 2,
1413                       fminCode)
1414    threeEqualRegInstX("fmin", "FminQX", "SimdFloatCmpOp", floatTypes, 4,
1415                       fminCode)
1416    # FMINNM
1417    fminnmCode = fpBinOp % "MinNum"
1418    threeEqualRegInstX("fminnm", "FminnmDX", "SimdFloatCmpOp", smallFloatTypes,
1419                       2, fminnmCode)
1420    threeEqualRegInstX("fminnm", "FminnmQX", "SimdFloatCmpOp", floatTypes, 4,
1421                       fminnmCode)
1422    # FMINNMP (scalar)
1423    twoRegPairwiseScInstX("fminnmp", "FminnmpScDX", "SimdFloatCmpOp",
1424                          ("uint32_t",), 2, fminnmCode)
1425    twoRegPairwiseScInstX("fminnmp", "FminnmpScQX", "SimdFloatCmpOp",
1426                          ("uint64_t",), 4, fminnmCode)
1427    # FMINNMP (vector)
1428    threeEqualRegInstX("fminnmp", "FminnmpDX", "SimdFloatCmpOp",
1429                       smallFloatTypes, 2, fminnmCode, pairwise=True)
1430    threeEqualRegInstX("fminnmp", "FminnmpQX", "SimdFloatCmpOp", floatTypes, 4,
1431                       fminnmCode, pairwise=True)
1432    # FMINNMV
1433    # Note: SimdFloatCmpOp can be a bit optimistic here
1434    fminnmAcrossCode = fpAcrossOp % "MinNum"
1435    twoRegAcrossInstX("fminnmv", "FminnmvQX", "SimdFloatCmpOp", ("uint32_t",),
1436                      4, fminnmAcrossCode)
1437    # FMINP (scalar)
1438    twoRegPairwiseScInstX("fminp", "FminpScDX", "SimdFloatCmpOp",
1439                          ("uint32_t",), 2, fminCode)
1440    twoRegPairwiseScInstX("fminp", "FminpScQX", "SimdFloatCmpOp",
1441                          ("uint64_t",), 4, fminCode)
1442    # FMINP (vector)
1443    threeEqualRegInstX("fminp", "FminpDX", "SimdFloatCmpOp", smallFloatTypes,
1444                       2, fminCode, pairwise=True)
1445    threeEqualRegInstX("fminp", "FminpQX", "SimdFloatCmpOp", floatTypes, 4,
1446                       fminCode, pairwise=True)
1447    # FMINV
1448    # Note: SimdFloatCmpOp can be a bit optimistic here
1449    fminAcrossCode = fpAcrossOp % "Min"
1450    twoRegAcrossInstX("fminv", "FminvQX", "SimdFloatCmpOp", ("uint32_t",), 4,
1451                      fminAcrossCode)
1452    # FMLA (by element)
1453    fmlaCode = fpOp % ("fplibMulAdd<Element>("
1454                       "destElem, srcElem1, srcElem2, fpscr)")
1455    threeEqualRegInstX("fmla", "FmlaElemDX", "SimdFloatMultAccOp",
1456                       smallFloatTypes, 2, fmlaCode, True, byElem=True)
1457    threeEqualRegInstX("fmla", "FmlaElemQX", "SimdFloatMultAccOp", floatTypes,
1458                       4, fmlaCode, True, byElem=True)
1459    threeEqualRegInstX("fmla", "FmlaElemScX", "SimdFloatMultAccOp", floatTypes,
1460                       4, fmlaCode, True, byElem=True, scalar=True)
1461    # FMLA (vector)
1462    threeEqualRegInstX("fmla", "FmlaDX", "SimdFloatMultAccOp", smallFloatTypes,
1463                       2, fmlaCode, True)
1464    threeEqualRegInstX("fmla", "FmlaQX", "SimdFloatMultAccOp", floatTypes, 4,
1465                       fmlaCode, True)
1466    # FMLS (by element)
1467    fmlsCode = fpOp % ("fplibMulAdd<Element>(destElem,"
1468                       " fplibNeg<Element>(srcElem1), srcElem2, fpscr)")
1469    threeEqualRegInstX("fmls", "FmlsElemDX", "SimdFloatMultAccOp",
1470                       smallFloatTypes, 2, fmlsCode, True, byElem=True)
1471    threeEqualRegInstX("fmls", "FmlsElemQX", "SimdFloatMultAccOp", floatTypes,
1472                       4, fmlsCode, True, byElem=True)
1473    threeEqualRegInstX("fmls", "FmlsElemScX", "SimdFloatMultAccOp", floatTypes,
1474                       4, fmlsCode, True, byElem=True, scalar=True)
1475    # FMLS (vector)
1476    threeEqualRegInstX("fmls", "FmlsDX", "SimdFloatMultAccOp", smallFloatTypes,
1477                       2, fmlsCode, True)
1478    threeEqualRegInstX("fmls", "FmlsQX", "SimdFloatMultAccOp", floatTypes, 4,
1479                       fmlsCode, True)
1480    # FMOV
1481    fmovCode = 'destElem = imm;'
1482    oneRegImmInstX("fmov", "FmovDX", "SimdMiscOp", smallFloatTypes, 2,
1483                   fmovCode)
1484    oneRegImmInstX("fmov", "FmovQX", "SimdMiscOp", floatTypes, 4, fmovCode)
1485    # FMUL (by element)
1486    fmulCode = fpBinOp % "Mul"
1487    threeEqualRegInstX("fmul", "FmulElemDX", "SimdFloatMultOp",
1488                       smallFloatTypes, 2, fmulCode, byElem=True)
1489    threeEqualRegInstX("fmul", "FmulElemQX", "SimdFloatMultOp", floatTypes, 4,
1490                       fmulCode, byElem=True)
1491    threeEqualRegInstX("fmul", "FmulElemScX", "SimdFloatMultOp", floatTypes, 4,
1492                       fmulCode, byElem=True, scalar=True)
1493    # FMUL (vector)
1494    threeEqualRegInstX("fmul", "FmulDX", "SimdFloatMultOp", smallFloatTypes, 2,
1495                       fmulCode)
1496    threeEqualRegInstX("fmul", "FmulQX", "SimdFloatMultOp", floatTypes, 4,
1497                       fmulCode)
1498    # FMULX
1499    fmulxCode = fpBinOp % "MulX"
1500    threeEqualRegInstX("fmulx", "FmulxDX", "SimdFloatMultOp", smallFloatTypes,
1501                       2, fmulxCode)
1502    threeEqualRegInstX("fmulx", "FmulxQX", "SimdFloatMultOp", floatTypes, 4,
1503                       fmulxCode)
1504    threeEqualRegInstX("fmulx", "FmulxScX", "SimdFloatMultOp", floatTypes, 4,
1505                       fmulxCode, scalar=True)
1506    # FMULX (by element)
1507    threeEqualRegInstX("fmulx", "FmulxElemDX", "SimdFloatMultOp",
1508                       smallFloatTypes, 2, fmulxCode, byElem=True)
1509    threeEqualRegInstX("fmulx", "FmulxElemQX", "SimdFloatMultOp", floatTypes,
1510                       4, fmulxCode, byElem=True)
1511    threeEqualRegInstX("fmulx", "FmulxElemScX", "SimdFloatMultOp", floatTypes,
1512                       4, fmulxCode, byElem=True, scalar=True)
1513    # FNEG
1514    fnegCode = fpOp % "fplibNeg<Element>(srcElem1)"
1515    twoEqualRegInstX("Neg", "FnegDX", "SimdFloatAluOp", smallFloatTypes, 2,
1516                     fnegCode)
1517    twoEqualRegInstX("Neg", "FnegQX", "SimdFloatAluOp", floatTypes, 4,
1518                     fnegCode)
1519    # FRECPE
1520    frecpeCode = fpOp % "fplibRecipEstimate<Element>(srcElem1, fpscr)"
1521    twoEqualRegInstX("frecpe", "FrecpeDX", "SimdFloatMultAccOp",
1522                     smallFloatTypes, 2, frecpeCode)
1523    twoEqualRegInstX("frecpe", "FrecpeQX", "SimdFloatMultAccOp", floatTypes, 4,
1524                     frecpeCode)
1525    twoEqualRegInstX("frecpe", "FrecpeScX", "SimdFloatMultAccOp", floatTypes,
1526                     4, frecpeCode, scalar=True)
1527    # FRECPS
1528    frecpsCode = fpBinOp % "RecipStepFused"
1529    threeEqualRegInstX("frecps", "FrecpsDX", "SimdFloatMultAccOp",
1530                       smallFloatTypes, 2, frecpsCode)
1531    threeEqualRegInstX("frecps", "FrecpsQX", "SimdFloatMultAccOp", floatTypes,
1532                       4, frecpsCode)
1533    threeEqualRegInstX("frecps", "FrecpsScX", "SimdFloatMultAccOp", floatTypes,
1534                       4, frecpsCode, scalar=True)
1535    # FRECPX
1536    frecpxCode = fpOp % "fplibRecpX<Element>(srcElem1, fpscr)"
1537    twoEqualRegInstX("frecpx", "FrecpxX", "SimdFloatMultAccOp", floatTypes, 4,
1538                     frecpxCode, scalar=True)
1539    # FRINTA
1540    frintCode = fpOp % "fplibRoundInt<Element>(srcElem1, %s, %s, fpscr)"
1541    frintaCode = frintCode % ("FPRounding_TIEAWAY", "false")
1542    twoEqualRegInstX("frinta", "FrintaDX", "SimdCvtOp", smallFloatTypes, 2,
1543                     frintaCode)
1544    twoEqualRegInstX("frinta", "FrintaQX", "SimdCvtOp", floatTypes, 4,
1545                     frintaCode)
1546    # FRINTI
1547    frintiCode = frintCode % ("FPCRRounding(fpscr)", "false")
1548    twoEqualRegInstX("frinti", "FrintiDX", "SimdCvtOp", smallFloatTypes, 2,
1549                     frintiCode)
1550    twoEqualRegInstX("frinti", "FrintiQX", "SimdCvtOp", floatTypes, 4,
1551                     frintiCode)
1552    # FRINTM
1553    frintmCode = frintCode % ("FPRounding_NEGINF", "false")
1554    twoEqualRegInstX("frintm", "FrintmDX", "SimdCvtOp", smallFloatTypes, 2,
1555                     frintmCode)
1556    twoEqualRegInstX("frintm", "FrintmQX", "SimdCvtOp", floatTypes, 4,
1557                     frintmCode)
1558    # FRINTN
1559    frintnCode = frintCode % ("FPRounding_TIEEVEN", "false")
1560    twoEqualRegInstX("frintn", "FrintnDX", "SimdCvtOp", smallFloatTypes, 2,
1561                     frintnCode)
1562    twoEqualRegInstX("frintn", "FrintnQX", "SimdCvtOp", floatTypes, 4,
1563                     frintnCode)
1564    # FRINTP
1565    frintpCode = frintCode % ("FPRounding_POSINF", "false")
1566    twoEqualRegInstX("frintp", "FrintpDX", "SimdCvtOp", smallFloatTypes, 2,
1567                     frintpCode)
1568    twoEqualRegInstX("frintp", "FrintpQX", "SimdCvtOp", floatTypes, 4,
1569                     frintpCode)
1570    # FRINTX
1571    frintxCode = frintCode % ("FPCRRounding(fpscr)", "true")
1572    twoEqualRegInstX("frintx", "FrintxDX", "SimdCvtOp", smallFloatTypes, 2,
1573                     frintxCode)
1574    twoEqualRegInstX("frintx", "FrintxQX", "SimdCvtOp", floatTypes, 4,
1575                     frintxCode)
1576    # FRINTZ
1577    frintzCode = frintCode % ("FPRounding_ZERO", "false")
1578    twoEqualRegInstX("frintz", "FrintzDX", "SimdCvtOp", smallFloatTypes, 2,
1579                     frintzCode)
1580    twoEqualRegInstX("frintz", "FrintzQX", "SimdCvtOp", floatTypes, 4,
1581                     frintzCode)
1582    # FRSQRTE
1583    frsqrteCode = fpOp % "fplibRSqrtEstimate<Element>(srcElem1, fpscr)"
1584    twoEqualRegInstX("frsqrte", "FrsqrteDX", "SimdFloatSqrtOp",
1585                     smallFloatTypes, 2, frsqrteCode)
1586    twoEqualRegInstX("frsqrte", "FrsqrteQX", "SimdFloatSqrtOp", floatTypes, 4,
1587                     frsqrteCode)
1588    twoEqualRegInstX("frsqrte", "FrsqrteScX", "SimdFloatSqrtOp", floatTypes, 4,
1589                     frsqrteCode, scalar=True)
1590    # FRSQRTS
1591    frsqrtsCode = fpBinOp % "RSqrtStepFused"
1592    threeEqualRegInstX("frsqrts", "FrsqrtsDX", "SimdFloatMiscOp",
1593                       smallFloatTypes, 2, frsqrtsCode)
1594    threeEqualRegInstX("frsqrts", "FrsqrtsQX", "SimdFloatMiscOp", floatTypes,
1595                       4, frsqrtsCode)
1596    threeEqualRegInstX("frsqrts", "FrsqrtsScX", "SimdFloatMiscOp", floatTypes,
1597                       4, frsqrtsCode, scalar=True)
1598    # FSQRT
1599    fsqrtCode = fpOp % "fplibSqrt<Element>(srcElem1, fpscr)"
1600    twoEqualRegInstX("fsqrt", "FsqrtDX", "SimdFloatSqrtOp", smallFloatTypes, 2,
1601                     fsqrtCode)
1602    twoEqualRegInstX("fsqrt", "FsqrtQX", "SimdFloatSqrtOp", floatTypes, 4,
1603                     fsqrtCode)
1604    # FSUB
1605    fsubCode = fpBinOp % "Sub"
1606    threeEqualRegInstX("fsub", "FsubDX", "SimdFloatAddOp", smallFloatTypes, 2,
1607                       fsubCode)
1608    threeEqualRegInstX("fsub", "FsubQX", "SimdFloatAddOp", floatTypes, 4,
1609                       fsubCode)
1610    # INS (element)
1611    insFromVecElemInstX("ins", "InsElemX", "SimdMiscOp", unsignedTypes, 4)
1612    # INS (general register)
1613    insFromGprInstX("ins", "InsGprWX", "SimdMiscOp", smallUnsignedTypes, 4,
1614                    'W')
1615    insFromGprInstX("ins", "InsGprXX", "SimdMiscOp", unsignedTypes, 4, 'X')
1616    # MLA (by element)
1617    mlaCode = "destElem += srcElem1 * srcElem2;"
1618    threeEqualRegInstX("mla", "MlaElemDX", "SimdMultAccOp",
1619                       ("uint16_t", "uint32_t"), 2, mlaCode, True, byElem=True)
1620    threeEqualRegInstX("mla", "MlaElemQX", "SimdMultAccOp",
1621                       ("uint16_t", "uint32_t"), 4, mlaCode, True, byElem=True)
1622    # MLA (vector)
1623    threeEqualRegInstX("mla", "MlaDX", "SimdMultAccOp", smallUnsignedTypes, 2,
1624                       mlaCode, True)
1625    threeEqualRegInstX("mla", "MlaQX", "SimdMultAccOp", smallUnsignedTypes, 4,
1626                       mlaCode, True)
1627    # MLS (by element)
1628    mlsCode = "destElem -= srcElem1 * srcElem2;"
1629    threeEqualRegInstX("mls", "MlsElemDX", "SimdMultAccOp",
1630                       ("uint16_t", "uint32_t"), 2, mlsCode, True, byElem=True)
1631    threeEqualRegInstX("mls", "MlsElemQX", "SimdMultAccOp",
1632                       ("uint16_t", "uint32_t"), 4, mlsCode, True, byElem=True)
1633    # MLS (vector)
1634    threeEqualRegInstX("mls", "MlsDX", "SimdMultAccOp", smallUnsignedTypes, 2,
1635                       mlsCode, True)
1636    threeEqualRegInstX("mls", "MlsQX", "SimdMultAccOp", smallUnsignedTypes, 4,
1637                       mlsCode, True)
1638    # MOV (element) -> alias to INS (element)
1639    # MOV (from general) -> alias to INS (general register)
1640    # MOV (scalar) -> alias to DUP (element)
1641    # MOV (to general) -> alias to UMOV
1642    # MOV (vector) -> alias to ORR (register)
1643    # MOVI
1644    movImmCode = "destElem = imm;"
1645    oneRegImmInstX("movi", "MoviDX", "SimdMiscOp", ("uint64_t",), 2,
1646                   movImmCode)
1647    oneRegImmInstX("movi", "MoviQX", "SimdMiscOp", ("uint64_t",), 4,
1648                   movImmCode)
1649    # MUL (by element)
1650    mulCode = "destElem = srcElem1 * srcElem2;"
1651    threeEqualRegInstX("mul", "MulElemDX", "SimdMultOp",
1652                       ("uint16_t", "uint32_t"), 2, mulCode, byElem=True)
1653    threeEqualRegInstX("mul", "MulElemQX", "SimdMultOp",
1654                       ("uint16_t", "uint32_t"), 4, mulCode, byElem=True)
1655    # MUL (vector)
1656    threeEqualRegInstX("mul", "MulDX", "SimdMultOp", smallUnsignedTypes, 2,
1657                       mulCode)
1658    threeEqualRegInstX("mul", "MulQX", "SimdMultOp", smallUnsignedTypes, 4,
1659                       mulCode)
1660    # MVN
1661    mvnCode = "destElem = ~srcElem1;"
1662    twoEqualRegInstX("mvn", "MvnDX", "SimdAluOp", ("uint64_t",), 2, mvnCode)
1663    twoEqualRegInstX("mvn", "MvnQX", "SimdAluOp", ("uint64_t",), 4, mvnCode)
1664    # MVNI
1665    mvniCode = "destElem = ~imm;"
1666    oneRegImmInstX("mvni", "MvniDX", "SimdAluOp", ("uint64_t",), 2, mvniCode)
1667    oneRegImmInstX("mvni", "MvniQX", "SimdAluOp", ("uint64_t",), 4, mvniCode)
1668    # NEG
1669    negCode = "destElem = -srcElem1;"
1670    twoEqualRegInstX("neg", "NegDX", "SimdAluOp", signedTypes, 2, negCode)
1671    twoEqualRegInstX("neg", "NegQX", "SimdAluOp", signedTypes, 4, negCode)
1672    # NOT -> alias to MVN
1673    # ORN
1674    ornCode = "destElem = srcElem1 | ~srcElem2;"
1675    threeEqualRegInstX("orn", "OrnDX", "SimdAluOp", ("uint64_t",), 2, ornCode)
1676    threeEqualRegInstX("orn", "OrnQX", "SimdAluOp", ("uint64_t",), 4, ornCode)
1677    # ORR (immediate)
1678    orrImmCode = "destElem |= imm;"
1679    oneRegImmInstX("orr", "OrrImmDX", "SimdAluOp", ("uint64_t",), 2,
1680                   orrImmCode, True)
1681    oneRegImmInstX("orr", "OrrImmQX", "SimdAluOp", ("uint64_t",), 4,
1682                   orrImmCode, True)
1683    # ORR (register)
1684    orrCode = "destElem = srcElem1 | srcElem2;"
1685    threeEqualRegInstX("orr", "OrrDX", "SimdAluOp", ("uint64_t",), 2, orrCode)
1686    threeEqualRegInstX("orr", "OrrQX", "SimdAluOp", ("uint64_t",), 4, orrCode)
1687    # PMUL
1688    pmulCode = '''
1689            destElem = 0;
1690            for (unsigned j = 0; j < sizeof(Element) * 8; j++) {
1691                if (bits(srcElem2, j))
1692                    destElem ^= srcElem1 << j;
1693            }
1694    '''
1695    threeEqualRegInstX("pmul", "PmulDX", "SimdMultOp", ("uint8_t",), 2,
1696                       pmulCode)
1697    threeEqualRegInstX("pmul", "PmulQX", "SimdMultOp", ("uint8_t",), 4,
1698                       pmulCode)
1699    # PMULL, PMULL2
1700    # Note: 64-bit PMULL is not available (Crypto. Extension)
1701    pmullCode = '''
1702            destElem = 0;
1703            for (unsigned j = 0; j < sizeof(Element) * 8; j++) {
1704                if (bits(srcElem2, j))
1705                    destElem ^= (BigElement)srcElem1 << j;
1706            }
1707    '''
1708    threeRegLongInstX("pmull", "PmullX", "SimdMultOp", ("uint8_t",), pmullCode)
1709    threeRegLongInstX("pmull", "Pmull2X", "SimdMultOp", ("uint8_t",),
1710                      pmullCode, hi=True)
1711    # RADDHN, RADDHN2
1712    raddhnCode = '''
1713            destElem = ((BigElement)srcElem1 + (BigElement)srcElem2 +
1714                        ((BigElement)1 << (sizeof(Element) * 8 - 1))) >>
1715                       (sizeof(Element) * 8);
1716    '''
1717    threeRegNarrowInstX("raddhn", "RaddhnX", "SimdAddOp", smallUnsignedTypes,
1718                        raddhnCode)
1719    threeRegNarrowInstX("raddhn2", "Raddhn2X", "SimdAddOp", smallUnsignedTypes,
1720                        raddhnCode, hi=True)
1721    # RBIT
1722    rbitCode = '''
1723            destElem = 0;
1724            Element temp = srcElem1;
1725            for (int i = 0; i < 8 * sizeof(Element); i++) {
1726                destElem = destElem  | ((temp & 0x1) <<
1727                                        (8 * sizeof(Element) - 1 - i));
1728                temp >>= 1;
1729            }
1730    '''
1731    twoEqualRegInstX("rbit", "RbitDX", "SimdAluOp", ("uint8_t",), 2, rbitCode)
1732    twoEqualRegInstX("rbit", "RbitQX", "SimdAluOp", ("uint8_t",), 4, rbitCode)
1733    # REV16
1734    rev16Code = '''
1735            destElem = srcElem1;
1736            unsigned groupSize = ((1 << 1) / sizeof(Element));
1737            unsigned reverseMask = (groupSize - 1);
1738            j = i ^ reverseMask;
1739    '''
1740    twoEqualRegInstX("rev16", "Rev16DX", "SimdAluOp", ("uint8_t",), 2,
1741                     rev16Code)
1742    twoEqualRegInstX("rev16", "Rev16QX", "SimdAluOp", ("uint8_t",), 4,
1743                     rev16Code)
1744    # REV32
1745    rev32Code = '''
1746            destElem = srcElem1;
1747            unsigned groupSize = ((1 << 2) / sizeof(Element));
1748            unsigned reverseMask = (groupSize - 1);
1749            j = i ^ reverseMask;
1750    '''
1751    twoEqualRegInstX("rev32", "Rev32DX", "SimdAluOp", ("uint8_t", "uint16_t"),
1752                     2, rev32Code)
1753    twoEqualRegInstX("rev32", "Rev32QX", "SimdAluOp", ("uint8_t", "uint16_t"),
1754                     4, rev32Code)
1755    # REV64
1756    rev64Code = '''
1757            destElem = srcElem1;
1758            unsigned groupSize = ((1 << 3) / sizeof(Element));
1759            unsigned reverseMask = (groupSize - 1);
1760            j = i ^ reverseMask;
1761    '''
1762    twoEqualRegInstX("rev64", "Rev64DX", "SimdAluOp", smallUnsignedTypes, 2,
1763                     rev64Code)
1764    twoEqualRegInstX("rev64", "Rev64QX", "SimdAluOp", smallUnsignedTypes, 4,
1765                     rev64Code)
1766    # RSHRN, RSHRN2
1767    rshrnCode = '''
1768            if (imm > sizeof(srcElem1) * 8) {
1769                destElem = 0;
1770            } else if (imm) {
1771                Element rBit = bits(srcElem1, imm - 1);
1772                destElem = ((srcElem1 >> (imm - 1)) >> 1) + rBit;
1773            } else {
1774                destElem = srcElem1;
1775            }
1776    '''
1777    twoRegNarrowInstX("rshrn", "RshrnX", "SimdShiftOp", smallUnsignedTypes,
1778                      rshrnCode, hasImm=True)
1779    twoRegNarrowInstX("rshrn2", "Rshrn2X", "SimdShiftOp", smallUnsignedTypes,
1780                      rshrnCode, hasImm=True, hi=True)
1781    # RSUBHN, RSUBHN2
1782    rsubhnCode = '''
1783            destElem = ((BigElement)srcElem1 - (BigElement)srcElem2 +
1784                        ((BigElement)1 << (sizeof(Element) * 8 - 1))) >>
1785                       (sizeof(Element) * 8);
1786    '''
1787    threeRegNarrowInstX("rsubhn", "RsubhnX", "SimdAddOp", smallTypes,
1788                        rsubhnCode)
1789    threeRegNarrowInstX("rsubhn2", "Rsubhn2X", "SimdAddOp", smallTypes,
1790                        rsubhnCode, hi=True)
1791    # SABA
1792    abaCode = '''
1793            destElem += (srcElem1 > srcElem2) ? (srcElem1 - srcElem2) :
1794                                                (srcElem2 - srcElem1);
1795    '''
1796    threeEqualRegInstX("saba", "SabaDX", "SimdAddAccOp", smallSignedTypes, 2,
1797                       abaCode, True)
1798    threeEqualRegInstX("saba", "SabaQX", "SimdAddAccOp", smallSignedTypes, 4,
1799                       abaCode, True)
1800    # SABAL, SABAL2
1801    abalCode = '''
1802            destElem += (srcElem1 > srcElem2) ?
1803                ((BigElement)srcElem1 - (BigElement)srcElem2) :
1804                ((BigElement)srcElem2 - (BigElement)srcElem1);
1805    '''
1806    threeRegLongInstX("sabal", "SabalX", "SimdAddAccOp", smallSignedTypes,
1807                      abalCode, True)
1808    threeRegLongInstX("sabal2", "Sabal2X", "SimdAddAccOp", smallSignedTypes,
1809                      abalCode, True, hi=True)
1810    # SABD
1811    abdCode = '''
1812            destElem = (srcElem1 > srcElem2) ? (srcElem1 - srcElem2) :
1813                                               (srcElem2 - srcElem1);
1814    '''
1815    threeEqualRegInstX("sabd", "SabdDX", "SimdAddOp", smallSignedTypes, 2,
1816                       abdCode)
1817    threeEqualRegInstX("sabd", "SabdQX", "SimdAddOp", smallSignedTypes, 4,
1818                       abdCode)
1819    # SABDL, SABDL2
1820    abdlCode = '''
1821            destElem = (srcElem1 > srcElem2) ?
1822                ((BigElement)srcElem1 - (BigElement)srcElem2) :
1823                ((BigElement)srcElem2 - (BigElement)srcElem1);
1824    '''
1825    threeRegLongInstX("sabdl", "SabdlX", "SimdAddAccOp", smallSignedTypes,
1826                      abdlCode, True)
1827    threeRegLongInstX("sabdl2", "Sabdl2X", "SimdAddAccOp", smallSignedTypes,
1828                      abdlCode, True, hi=True)
1829    # SADALP
1830    adalpCode = "destElem += (BigElement)srcElem1 + (BigElement)srcElem2;"
1831    twoRegCondenseInstX("sadalp", "SadalpDX", "SimdAddOp", smallSignedTypes, 2,
1832                        adalpCode, True)
1833    twoRegCondenseInstX("sadalp", "SadalpQX", "SimdAddOp", smallSignedTypes, 4,
1834                        adalpCode, True)
1835    # SADDL, SADDL2
1836    addlwCode = "destElem = (BigElement)srcElem1 + (BigElement)srcElem2;"
1837    threeRegLongInstX("saddl", "SaddlX", "SimdAddAccOp", smallSignedTypes,
1838                      addlwCode)
1839    threeRegLongInstX("saddl2", "Saddl2X", "SimdAddAccOp", smallSignedTypes,
1840                      addlwCode, hi=True)
1841    # SADDLP
1842    twoRegCondenseInstX("saddlp", "SaddlpDX", "SimdAddOp", smallSignedTypes, 2,
1843                        addlwCode)
1844    twoRegCondenseInstX("saddlp", "SaddlpQX", "SimdAddOp", smallSignedTypes, 4,
1845                        addlwCode)
1846    # SADDLV
1847    # Note: SimdAddOp can be a bit optimistic here
1848    addAcrossLongCode = "destElem += (BigElement)srcElem1;"
1849    twoRegAcrossInstX("saddlv", "SaddlvDX", "SimdAddOp", ("int8_t", "int16_t"),
1850                      2, addAcrossLongCode, long=True)
1851    twoRegAcrossInstX("saddlv", "SaddlvQX", "SimdAddOp", ("int8_t", "int16_t"),
1852                      4, addAcrossLongCode, long=True)
1853    twoRegAcrossInstX("saddlv", "SaddlvBQX", "SimdAddOp", ("int32_t",), 4,
1854                      addAcrossLongCode, doubleDest=True, long=True)
1855    # SADDW, SADDW2
1856    threeRegWideInstX("saddw", "SaddwX", "SimdAddAccOp", smallSignedTypes,
1857                      addlwCode)
1858    threeRegWideInstX("saddw2", "Saddw2X", "SimdAddAccOp", smallSignedTypes,
1859                      addlwCode, hi=True)
1860    # SCVTF (fixed-point)
1861    scvtfFixedCode = fpOp % ("fplibFixedToFP<Element>((int%d_t) srcElem1, imm,"
1862                             " false, FPCRRounding(fpscr), fpscr)")
1863    twoEqualRegInstX("scvtf", "ScvtfFixedDX", "SimdCvtOp", smallFloatTypes, 2,
1864                     scvtfFixedCode % 32, hasImm=True)
1865    twoEqualRegInstX("scvtf", "ScvtfFixedSQX", "SimdCvtOp", smallFloatTypes, 4,
1866                     scvtfFixedCode % 32, hasImm=True)
1867    twoEqualRegInstX("scvtf", "ScvtfFixedDQX", "SimdCvtOp", ("uint64_t",), 4,
1868                     scvtfFixedCode % 64, hasImm=True)
1869    twoEqualRegInstX("scvtf", "ScvtfFixedScSX", "SimdCvtOp", smallFloatTypes,
1870                     4, scvtfFixedCode % 32, hasImm=True, scalar=True)
1871    twoEqualRegInstX("scvtf", "ScvtfFixedScDX", "SimdCvtOp", ("uint64_t",), 4,
1872                     scvtfFixedCode % 64, hasImm=True, scalar=True)
1873    # SCVTF (integer)
1874    scvtfIntCode = fpOp % ("fplibFixedToFP<Element>((int%d_t) srcElem1, 0,"
1875                           " false, FPCRRounding(fpscr), fpscr)")
1876    twoEqualRegInstX("scvtf", "ScvtfIntDX", "SimdCvtOp", smallFloatTypes, 2,
1877                     scvtfIntCode % 32)
1878    twoEqualRegInstX("scvtf", "ScvtfIntSQX", "SimdCvtOp", smallFloatTypes, 4,
1879                     scvtfIntCode % 32)
1880    twoEqualRegInstX("scvtf", "ScvtfIntDQX", "SimdCvtOp", ("uint64_t",), 4,
1881                     scvtfIntCode % 64)
1882    twoEqualRegInstX("scvtf", "ScvtfIntScSX", "SimdCvtOp", smallFloatTypes, 4,
1883                     scvtfIntCode % 32, scalar=True)
1884    twoEqualRegInstX("scvtf", "ScvtfIntScDX", "SimdCvtOp", ("uint64_t",), 4,
1885                     scvtfIntCode % 64, scalar=True)
1886    # SHADD
1887    haddCode = '''
1888            Element carryBit =
1889                (((unsigned)srcElem1 & 0x1) +
1890                 ((unsigned)srcElem2 & 0x1)) >> 1;
1891            // Use division instead of a shift to ensure the sign extension works
1892            // right. The compiler will figure out if it can be a shift. Mask the
1893            // inputs so they get truncated correctly.
1894            destElem = (((srcElem1 & ~(Element)1) / 2) +
1895                        ((srcElem2 & ~(Element)1) / 2)) + carryBit;
1896    '''
1897    threeEqualRegInstX("shadd", "ShaddDX", "SimdAddOp", smallSignedTypes, 2,
1898                       haddCode)
1899    threeEqualRegInstX("shadd", "ShaddQX", "SimdAddOp", smallSignedTypes, 4,
1900                       haddCode)
1901    # SHL
1902    shlCode = '''
1903            if (imm >= sizeof(Element) * 8)
1904                destElem = (srcElem1 << (sizeof(Element) * 8 - 1)) << 1;
1905            else
1906                destElem = srcElem1 << imm;
1907    '''
1908    twoEqualRegInstX("shl", "ShlDX", "SimdShiftOp", unsignedTypes, 2, shlCode,
1909                     hasImm=True)
1910    twoEqualRegInstX("shl", "ShlQX", "SimdShiftOp", unsignedTypes, 4, shlCode,
1911                     hasImm=True)
1912    # SHLL, SHLL2
1913    shllCode = "destElem = ((BigElement)srcElem1) << (sizeof(Element) * 8);"
1914    twoRegLongInstX("shll", "ShllX", "SimdShiftOp", smallTypes, shllCode)
1915    twoRegLongInstX("shll", "Shll2X", "SimdShiftOp", smallTypes, shllCode,
1916                    hi=True)
1917    # SHRN, SHRN2
1918    shrnCode = '''
1919            if (imm >= sizeof(srcElem1) * 8) {
1920                destElem = 0;
1921            } else {
1922                destElem = srcElem1 >> imm;
1923            }
1924    '''
1925    twoRegNarrowInstX("shrn", "ShrnX", "SimdShiftOp", smallUnsignedTypes,
1926                      shrnCode, hasImm=True)
1927    twoRegNarrowInstX("shrn2", "Shrn2X", "SimdShiftOp", smallUnsignedTypes,
1928                      shrnCode, hasImm=True, hi=True)
1929    # SHSUB
1930    hsubCode = '''
1931            Element borrowBit =
1932                (((srcElem1 & 0x1) - (srcElem2 & 0x1)) >> 1) & 0x1;
1933            // Use division instead of a shift to ensure the sign extension works
1934            // right. The compiler will figure out if it can be a shift. Mask the
1935            // inputs so they get truncated correctly.
1936            destElem = (((srcElem1 & ~(Element)1) / 2) -
1937                        ((srcElem2 & ~(Element)1) / 2)) - borrowBit;
1938    '''
1939    threeEqualRegInstX("shsub", "ShsubDX", "SimdAddOp", smallSignedTypes, 2,
1940                       hsubCode)
1941    threeEqualRegInstX("shsub", "ShsubQX", "SimdAddOp", smallSignedTypes, 4,
1942                       hsubCode)
1943    # SLI
1944    sliCode = '''
1945            if (imm >= sizeof(Element) * 8)
1946                destElem = destElem;
1947            else
1948                destElem = (srcElem1 << imm) | (destElem & mask(imm));
1949    '''
1950    twoEqualRegInstX("sli", "SliDX", "SimdShiftOp", unsignedTypes, 2, sliCode,
1951                     True, hasImm=True)
1952    twoEqualRegInstX("sli", "SliQX", "SimdShiftOp", unsignedTypes, 4, sliCode,
1953                     True, hasImm=True)
1954    # SMAX
1955    maxCode = "destElem = (srcElem1 > srcElem2) ? srcElem1 : srcElem2;"
1956    threeEqualRegInstX("smax", "SmaxDX", "SimdCmpOp", smallSignedTypes, 2,
1957                       maxCode)
1958    threeEqualRegInstX("smax", "SmaxQX", "SimdCmpOp", smallSignedTypes, 4,
1959                       maxCode)
1960    # SMAXP
1961    threeEqualRegInstX("smaxp", "SmaxpDX", "SimdCmpOp", smallSignedTypes, 2,
1962                       maxCode, pairwise=True)
1963    threeEqualRegInstX("smaxp", "SmaxpQX", "SimdCmpOp", smallSignedTypes, 4,
1964                       maxCode, pairwise=True)
1965    # SMAXV
1966    maxAcrossCode = '''
1967            if (i == 0 || srcElem1 > destElem)
1968                destElem = srcElem1;
1969    '''
1970    twoRegAcrossInstX("smaxv", "SmaxvDX", "SimdCmpOp", ("int8_t", "int16_t"),
1971                      2, maxAcrossCode)
1972    twoRegAcrossInstX("smaxv", "SmaxvQX", "SimdCmpOp", smallSignedTypes, 4,
1973                      maxAcrossCode)
1974    # SMIN
    # Element-wise minimum: the snippet stores the smaller of srcElem1 and
    # srcElem2 in destElem.  Registered as SminDX (fifth argument 2) and
    # SminQX (fifth argument 4) over smallSignedTypes.
1975    minCode = "destElem = (srcElem1 < srcElem2) ? srcElem1 : srcElem2;"
1976    threeEqualRegInstX("smin", "SminDX", "SimdCmpOp", smallSignedTypes, 2,
1977                       minCode)
1978    threeEqualRegInstX("smin", "SminQX", "SimdCmpOp", smallSignedTypes, 4,
1979                       minCode)
1980    # SMINP
    # Same minCode snippet, registered with pairwise=True.
1981    threeEqualRegInstX("sminp", "SminpDX", "SimdCmpOp", smallSignedTypes, 2,
1982                       minCode, pairwise=True)
1983    threeEqualRegInstX("sminp", "SminpQX", "SimdCmpOp", smallSignedTypes, 4,
1984                       minCode, pairwise=True)
1985    # SMINV
    # Across-lanes minimum: destElem takes srcElem1 when i == 0 and afterwards
    # whenever srcElem1 is smaller than the running destElem.
    # NOTE(review): assumes the template produced by twoRegAcrossInstX visits
    # the source elements with an index named i -- confirm there.
1986    minAcrossCode = '''
1987            if (i == 0 || srcElem1 < destElem)
1988                destElem = srcElem1;
1989    '''
    # The DX form is registered for int8_t/int16_t only; the QX form for all
    # of smallSignedTypes.
1990    twoRegAcrossInstX("sminv", "SminvDX", "SimdCmpOp", ("int8_t", "int16_t"),
1991                      2, minAcrossCode)
1992    twoRegAcrossInstX("sminv", "SminvQX", "SimdCmpOp", smallSignedTypes, 4,
1993                      minAcrossCode)
1994
1995    split('exec')  # NOTE(review): split() is defined outside this view; presumably starts a new piece of the generated 'exec' output -- confirm
1996
1997    # SMLAL, SMLAL2 (by element)
    # Widening multiply-accumulate: both sources are cast to BigElement before
    # the multiply and the product is added to destElem.  By-element forms are
    # registered for int16_t/int32_t; the "2" class passes hi=True.
    # NOTE(review): the positional True after the code presumably marks the
    # destination as an input as well -- confirm against threeRegLongInstX.
1998    mlalCode = "destElem += (BigElement)srcElem1 * (BigElement)srcElem2;"
1999    threeRegLongInstX("smlal", "SmlalElemX", "SimdMultAccOp",
2000                      ("int16_t", "int32_t"), mlalCode, True, byElem=True)
2001    threeRegLongInstX("smlal", "SmlalElem2X", "SimdMultAccOp",
2002                      ("int16_t", "int32_t"), mlalCode, True, byElem=True,
2003                      hi=True)
2004    # SMLAL, SMLAL2 (vector)
    # Vector forms reuse mlalCode over all of smallSignedTypes.
2005    threeRegLongInstX("smlal", "SmlalX", "SimdMultAccOp", smallSignedTypes,
2006                      mlalCode, True)
2007    threeRegLongInstX("smlal", "Smlal2X", "SimdMultAccOp", smallSignedTypes,
2008                      mlalCode, True, hi=True)
2009    # SMLSL, SMLSL2 (by element)
    # Widening multiply-subtract: both sources are cast to BigElement before
    # the multiply and the product is subtracted from destElem.
    # NOTE(review): the by-element forms here are registered over
    # smallSignedTypes, whereas the SMLAL by-element forms list only
    # int16_t/int32_t -- confirm whether the extra type is intended.
2010    mlslCode = "destElem -= (BigElement)srcElem1 * (BigElement)srcElem2;"
2011    threeRegLongInstX("smlsl", "SmlslElemX", "SimdMultAccOp", smallSignedTypes,
2012                      mlslCode, True, byElem=True)
2013    threeRegLongInstX("smlsl", "SmlslElem2X", "SimdMultAccOp",
2014                      smallSignedTypes, mlslCode, True, byElem=True, hi=True)
2015    # SMLSL, SMLSL2 (vector)
    # Vector forms reuse mlslCode; the "2" class passes hi=True.
2016    threeRegLongInstX("smlsl", "SmlslX", "SimdMultAccOp", smallSignedTypes,
2017                      mlslCode, True)
2018    threeRegLongInstX("smlsl", "Smlsl2X", "SimdMultAccOp", smallSignedTypes,
2019                      mlslCode, True, hi=True)
2020    # SMOV
    # Registered through insToGprInstX in two forms: SmovWX for
    # int8_t/int16_t elements with 'W', and SmovXX for all of smallSignedTypes
    # with 'X'.  No code snippet is passed here; the helper supplies the body.
    # NOTE(review): 'W'/'X' presumably select the 32-/64-bit general-purpose
    # destination and the trailing True a sign-extending move -- confirm.
2021    insToGprInstX("smov", "SmovWX", "SimdMiscOp", ("int8_t", "int16_t"), 4,
2022                  'W', True)
2023    insToGprInstX("smov", "SmovXX", "SimdMiscOp", smallSignedTypes, 4, 'X',
2024                  True)
2025    # SMULL, SMULL2 (by element)
    # Widening multiply: both sources are cast to BigElement before the
    # multiply and the product overwrites destElem.  Unlike the accumulating
    # forms above, no positional True follows the code argument.
2026    mullCode = "destElem = (BigElement)srcElem1 * (BigElement)srcElem2;"
2027    threeRegLongInstX("smull", "SmullElemX", "SimdMultOp", smallSignedTypes,
2028                      mullCode, byElem=True)
2029    threeRegLongInstX("smull", "SmullElem2X", "SimdMultOp", smallSignedTypes,
2030                      mullCode, byElem=True, hi=True)
2031    # SMULL, SMULL2 (vector)
    # Vector forms reuse mullCode; the "2" class passes hi=True.
2032    threeRegLongInstX("smull", "SmullX", "SimdMultOp", smallSignedTypes,
2033                      mullCode)
2034    threeRegLongInstX("smull", "Smull2X", "SimdMultOp", smallSignedTypes,
2035                      mullCode, hi=True)
2036    # SQABS
    # Saturating absolute value.  The snippet copies FpscrQc into a local
    # FPSCR, then:
    #   - source equal to the most negative Element value: sets qc and stores
    #     the bitwise complement of the source (the most positive value);
    #   - other negative source: stores the negated source;
    #   - otherwise: stores the source unchanged.
    # The local FPSCR is written back to FpscrQc at the end.
2037    sqabsCode = '''
2038        FPSCR fpscr = (FPSCR) FpscrQc;
2039        if (srcElem1 == (Element)(std::numeric_limits<Element>::min())) {
2040            fpscr.qc = 1;
2041            destElem = ~srcElem1;
2042        } else if (srcElem1 < 0) {
2043            destElem = -srcElem1;
2044        } else {
2045            destElem = srcElem1;
2046        }
2047        FpscrQc = fpscr;
2048    '''
    # DX uses smallSignedTypes; QX and the scalar (scalar=True) form use
    # signedTypes.
2049    twoEqualRegInstX("sqabs", "SqabsDX", "SimdAluOp", smallSignedTypes, 2,
2050                     sqabsCode)
2051    twoEqualRegInstX("sqabs", "SqabsQX", "SimdAluOp", signedTypes, 4,
2052                     sqabsCode)
2053    twoEqualRegInstX("sqabs", "SqabsScX", "SimdAluOp", signedTypes, 4,
2054                     sqabsCode, scalar=True)
2055    # SQADD
    # Saturating add.  The snippet first stores the plain sum, then treats it
    # as overflowed when both sources have the same sign and the sum has the
    # other sign.  On overflow destElem is set to the minimum Element value
    # and, if the wrapped sum was negative (i.e. the sources were
    # non-negative), 1 is subtracted from it -- intended to wrap round to the
    # maximum value -- and qc is set.  FpscrQc is read into a local FPSCR and
    # written back at the end.
2056    sqaddCode = '''
2057            destElem = srcElem1 + srcElem2;
2058            FPSCR fpscr = (FPSCR) FpscrQc;
2059            bool negDest = (destElem < 0);
2060            bool negSrc1 = (srcElem1 < 0);
2061            bool negSrc2 = (srcElem2 < 0);
2062            if ((negDest != negSrc1) && (negSrc1 == negSrc2)) {
2063                destElem = std::numeric_limits<Element>::min();
2064                if (negDest)
2065                    destElem -= 1;
2066                fpscr.qc = 1;
2067            }
2068            FpscrQc = fpscr;
2069    '''
    # DX uses smallSignedTypes; QX and the scalar (scalar=True) form use
    # signedTypes.
2070    threeEqualRegInstX("sqadd", "SqaddDX", "SimdAddOp", smallSignedTypes, 2,
2071                       sqaddCode)
2072    threeEqualRegInstX("sqadd", "SqaddQX", "SimdAddOp", signedTypes, 4,
2073                       sqaddCode)
2074    threeEqualRegInstX("sqadd", "SqaddScX", "SimdAddOp", signedTypes, 4,
2075                       sqaddCode, scalar=True)
2076    # SQDMLAL, SQDMLAL2 (by element)
2077    qdmlalCode = '''
2078        FPSCR fpscr = (FPSCR) FpscrQc;
2079        BigElement midElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
2080        Element maxNeg = std::numeric_limits<Element>::min();
2081        Element halfNeg = maxNeg / 2;
2082        if ((srcElem1 == maxNeg && srcElem2 == maxNeg) ||
2083            (srcElem1 == halfNeg && srcElem2 == maxNeg) ||
2084            (srcElem1 == maxNeg && srcElem2 == halfNeg)) {
2085            midElem = ~((BigElement)maxNeg << (sizeof(Element) * 8));
2086            fpscr.qc = 1;
2087        }
2088        bool negPreDest = ltz(destElem);
2089        destElem += midElem;
2090        bool negDest = ltz(destElem);
2091        bool negMid = ltz(midElem);
2092        if (negPreDest == negMid && negMid != negDest) {
2093            destElem = mask(sizeof(BigElement) * 8 - 1);
2094            if (negPreDest)
2095                destElem = ~destElem;
2096            fpscr.qc = 1;
2097        }
2098        FpscrQc = fpscr;
2099    '''
2100    threeRegLongInstX("sqdmlal", "SqdmlalElemX", "SimdMultAccOp",
2101                      ("int16_t", "int32_t"), qdmlalCode, True, byElem=True)
2102    threeRegLongInstX("sqdmlal", "SqdmlalElem2X", "SimdMultAccOp",
2103                      ("int16_t", "int32_t"), qdmlalCode, True, byElem=True,
2104                      hi=True)
2105    threeRegLongInstX("sqdmlal", "SqdmlalElemScX", "SimdMultAccOp",
2106                      ("int16_t", "int32_t"), qdmlalCode, True, byElem=True,
2107                      scalar=True)
2108    # SQDMLAL, SQDMLAL2 (vector)
2109    threeRegLongInstX("sqdmlal", "SqdmlalX", "SimdMultAccOp",
2110                      ("int16_t", "int32_t"), qdmlalCode, True)
2111    threeRegLongInstX("sqdmlal", "Sqdmlal2X", "SimdMultAccOp",
2112                      ("int16_t", "int32_t"), qdmlalCode, True, hi=True)
2113    threeRegLongInstX("sqdmlal", "SqdmlalScX", "SimdMultAccOp",
2114                      ("int16_t", "int32_t"), qdmlalCode, True, scalar=True)
2115    # SQDMLSL, SQDMLSL2 (by element)
2116    qdmlslCode = '''
2117        FPSCR fpscr = (FPSCR) FpscrQc;
2118        BigElement midElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
2119        Element maxNeg = std::numeric_limits<Element>::min();
2120        Element halfNeg = maxNeg / 2;
2121        if ((srcElem1 == maxNeg && srcElem2 == maxNeg) ||
2122            (srcElem1 == halfNeg && srcElem2 == maxNeg) ||
2123            (srcElem1 == maxNeg && srcElem2 == halfNeg)) {
2124            midElem = ~((BigElement)maxNeg << (sizeof(Element) * 8));
2125            fpscr.qc = 1;
2126        }
2127        bool negPreDest = ltz(destElem);
2128        destElem -= midElem;
2129        bool negDest = ltz(destElem);
2130        bool posMid = ltz((BigElement)-midElem);
2131        if (negPreDest == posMid && posMid != negDest) {
2132            destElem = mask(sizeof(BigElement) * 8 - 1);
2133            if (negPreDest)
2134                destElem = ~destElem;
2135            fpscr.qc = 1;
2136        }
2137        FpscrQc = fpscr;
2138    '''
2139    threeRegLongInstX("sqdmlsl", "SqdmlslElemX", "SimdMultAccOp",
2140                      ("int16_t", "int32_t"), qdmlslCode, True, byElem=True)
2141    threeRegLongInstX("sqdmlsl", "SqdmlslElem2X", "SimdMultAccOp",
2142                      ("int16_t", "int32_t"), qdmlslCode, True, byElem=True,
2143                      hi=True)
2144    threeRegLongInstX("sqdmlsl", "SqdmlslElemScX", "SimdMultAccOp",
2145                      ("int16_t", "int32_t"), qdmlslCode, True, byElem=True,
2146                      scalar=True)
2147    # SQDMLSL, SQDMLSL2 (vector)
2148    threeRegLongInstX("sqdmlsl", "SqdmlslX", "SimdMultAccOp",
2149                      ("int16_t", "int32_t"), qdmlslCode, True)
2150    threeRegLongInstX("sqdmlsl", "Sqdmlsl2X", "SimdMultAccOp",
2151                      ("int16_t", "int32_t"), qdmlslCode, True, hi=True)
2152    threeRegLongInstX("sqdmlsl", "SqdmlslScX", "SimdMultAccOp",
2153                      ("int16_t", "int32_t"), qdmlslCode, True, scalar=True)
2154    # SQDMULH (by element)
    # Saturating doubling multiply returning the high half: destElem receives
    # 2 * srcElem1 * srcElem2 (evaluated at 64 bits) shifted right by the
    # element width.  When both sources are equal and equal to the value with
    # only the top bit set (the most negative Element), destElem is replaced
    # by the complement of the source and qc is set.  FpscrQc is read into a
    # local FPSCR and written back at the end.
2155    sqdmulhCode = '''
2156            FPSCR fpscr = (FPSCR) FpscrQc;
2157            destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2) >>
2158                       (sizeof(Element) * 8);
2159            if (srcElem1 == srcElem2 &&
2160                    srcElem1 == (Element)((Element)1 <<
2161                        (sizeof(Element) * 8 - 1))) {
2162                destElem = ~srcElem1;
2163                fpscr.qc = 1;
2164            }
2165            FpscrQc = fpscr;
2166    '''
    # By-element forms (D, Q, scalar) for int16_t/int32_t.
2167    threeEqualRegInstX("sqdmulh", "SqdmulhElemDX", "SimdMultOp",
2168                       ("int16_t", "int32_t"), 2, sqdmulhCode, byElem=True)
2169    threeEqualRegInstX("sqdmulh", "SqdmulhElemQX", "SimdMultOp",
2170                       ("int16_t", "int32_t"), 4, sqdmulhCode, byElem=True)
2171    threeEqualRegInstX("sqdmulh", "SqdmulhElemScX", "SimdMultOp",
2172                       ("int16_t", "int32_t"), 4, sqdmulhCode, byElem=True,
2173                       scalar=True)
2174    # SQDMULH (vector)
2175    threeEqualRegInstX("sqdmulh", "SqdmulhDX", "SimdMultOp",
2176                       ("int16_t", "int32_t"), 2, sqdmulhCode)
2177    threeEqualRegInstX("sqdmulh", "SqdmulhQX", "SimdMultOp",
2178                       ("int16_t", "int32_t"), 4, sqdmulhCode)
2179    threeEqualRegInstX("sqdmulh", "SqdmulhScX", "SimdMultOp",
2180                       ("int16_t", "int32_t"), 4, sqdmulhCode, scalar=True)
2181    # SQDMULL, SQDMULL2 (by element)
    # Saturating doubling multiply long: destElem receives
    # 2 * srcElem1 * srcElem2 evaluated at 64 bits.  When both sources are
    # equal and equal to the value with only the top bit set (the most
    # negative Element), destElem is replaced by the complement of the source
    # shifted left by the element width and qc is set.
    # NOTE(review): every registration passes a positional True after the
    # code even though the snippet overwrites destElem rather than
    # accumulating (SMULL passes none) -- confirm whether it is needed.
2182    qdmullCode = '''
2183        FPSCR fpscr = (FPSCR) FpscrQc;
2184        destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
2185        if (srcElem1 == srcElem2 &&
2186                srcElem1 == (Element)((Element)1 <<
2187                    (Element)(sizeof(Element) * 8 - 1))) {
2188            destElem = ~((BigElement)srcElem1 << (sizeof(Element) * 8));
2189            fpscr.qc = 1;
2190        }
2191        FpscrQc = fpscr;
2192    '''
    # By-element forms: plain, "2" (hi=True) and scalar (scalar=True), all for
    # int16_t/int32_t sources.
2193    threeRegLongInstX("sqdmull", "SqdmullElemX", "SimdMultOp",
2194                      ("int16_t", "int32_t"), qdmullCode, True, byElem=True)
2195    threeRegLongInstX("sqdmull", "SqdmullElem2X", "SimdMultOp",
2196                      ("int16_t", "int32_t"), qdmullCode, True, byElem=True,
2197                      hi=True)
2198    threeRegLongInstX("sqdmull", "SqdmullElemScX", "SimdMultOp",
2199                      ("int16_t", "int32_t"), qdmullCode, True, byElem=True,
2200                      scalar=True)
2201    # SQDMULL, SQDMULL2 (vector)
2202    threeRegLongInstX("sqdmull", "SqdmullX", "SimdMultOp",
2203                      ("int16_t", "int32_t"), qdmullCode, True)
2204    threeRegLongInstX("sqdmull", "Sqdmull2X", "SimdMultOp",
2205                      ("int16_t", "int32_t"), qdmullCode, True, hi=True)
2206    threeRegLongInstX("sqdmull", "SqdmullScX", "SimdMultOp",
2207                      ("int16_t", "int32_t"), qdmullCode, True, scalar=True)
2208    # SQNEG
    # Saturating negate.  A source equal to the most negative Element value
    # sets qc and yields the bitwise complement of the source (the most
    # positive value); any other source is simply negated.  FpscrQc is read
    # into a local FPSCR and written back at the end.
2209    sqnegCode = '''
2210        FPSCR fpscr = (FPSCR) FpscrQc;
2211        if (srcElem1 == (Element)(std::numeric_limits<Element>::min())) {
2212            fpscr.qc = 1;
2213            destElem = ~srcElem1;
2214        } else {
2215            destElem = -srcElem1;
2216        }
2217        FpscrQc = fpscr;
2218    '''
    # DX uses smallSignedTypes; QX and the scalar (scalar=True) form use
    # signedTypes.
2219    twoEqualRegInstX("sqneg", "SqnegDX", "SimdAluOp", smallSignedTypes, 2,
2220                     sqnegCode)
2221    twoEqualRegInstX("sqneg", "SqnegQX", "SimdAluOp", signedTypes, 4,
2222                     sqnegCode)
2223    twoEqualRegInstX("sqneg", "SqnegScX", "SimdAluOp", signedTypes, 4,
2224                     sqnegCode, scalar=True)
2225    # SQRDMULH (by element)
2226    sqrdmulhCode = '''
2227            FPSCR fpscr = (FPSCR) FpscrQc;
2228            destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2 +
2229                        ((int64_t)1 << (sizeof(Element) * 8 - 1))) >>
2230                       (sizeof(Element) * 8);
2231            Element maxNeg = std::numeric_limits<Element>::min();
2232            Element halfNeg = maxNeg / 2;
2233            if ((srcElem1 == maxNeg && srcElem2 == maxNeg) ||
2234                (srcElem1 == halfNeg && srcElem2 == maxNeg) ||
2235                (srcElem1 == maxNeg && srcElem2 == halfNeg)) {
2236                if (destElem < 0) {
2237                    destElem = mask(sizeof(Element) * 8 - 1);
2238                } else {
2239                    destElem = std::numeric_limits<Element>::min();
2240                }
2241                fpscr.qc = 1;
2242            }
2243            FpscrQc = fpscr;
2244    '''
2245    threeEqualRegInstX("sqrdmulh", "SqrdmulhElemDX", "SimdMultOp",
2246                       ("int16_t", "int32_t"), 2, sqrdmulhCode, byElem=True)
2247    threeEqualRegInstX("sqrdmulh", "SqrdmulhElemQX", "SimdMultOp",
2248                       ("int16_t", "int32_t"), 4, sqrdmulhCode, byElem=True)
2249    threeEqualRegInstX("sqrdmulh", "SqrdmulhElemScX", "SimdMultOp",
2250                       ("int16_t", "int32_t"), 4, sqrdmulhCode, byElem=True,
2251                       scalar=True)
2252    # SQRDMULH (vector)
2253    threeEqualRegInstX("sqrdmulh", "SqrdmulhDX", "SimdMultOp",
2254                       ("int16_t", "int32_t"), 2, sqrdmulhCode)
2255    threeEqualRegInstX("sqrdmulh", "SqrdmulhQX", "SimdMultOp",
2256                       ("int16_t", "int32_t"), 4, sqrdmulhCode)
2257    threeEqualRegInstX("sqrdmulh", "SqrdmulhScX", "SimdMultOp",
2258                       ("int16_t", "int32_t"), 4, sqrdmulhCode, scalar=True)
2259    # SQRSHL
    # Saturating rounding shift by a per-element signed amount.  shiftAmt is
    # srcElem2 truncated to int8_t.
    #   shiftAmt < 0: rounding right shift by -shiftAmt.  rBit is bit
    #     (shift - 1) of the source when the shift is at most the element
    #     width, and is forced to 1 for a negative source shifted by more than
    #     the width.  Shifts of the full width or more start from 0 with the
    #     shift clamped to width - 1; the fix-up then ORs in the sign bits when
    #     the source is negative but the shifted value is not.  rBit is added
    #     last.
    #   shiftAmt > 0: left shift.  It saturates when the shift is at least the
    #     element width and the source is non-zero, or when the source bits
    #     from the top down to (width - 1 - shift) are not all copies of the
    #     sign.  Saturation sets qc and stores mask(width - 1), complemented
    #     for a negative source.
    #   shiftAmt == 0: the source is copied.
    # NOTE(review): bits() and mask() are defined outside this view;
    # presumably bits(v, hi, lo) extracts an inclusive bit range and
    # bits(v, n) a single bit -- confirm.
2260    sqrshlCode = '''
2261            int16_t shiftAmt = (int8_t)srcElem2;
2262            FPSCR fpscr = (FPSCR) FpscrQc;
2263            if (shiftAmt < 0) {
2264                shiftAmt = -shiftAmt;
2265                Element rBit = 0;
2266                if (shiftAmt <= sizeof(Element) * 8)
2267                    rBit = bits(srcElem1, shiftAmt - 1);
2268                if (shiftAmt > sizeof(Element) * 8 && srcElem1 < 0)
2269                    rBit = 1;
2270                if (shiftAmt >= sizeof(Element) * 8) {
2271                    shiftAmt = sizeof(Element) * 8 - 1;
2272                    destElem = 0;
2273                } else {
2274                    destElem = (srcElem1 >> shiftAmt);
2275                }
2276                // Make sure the right shift sign extended when it should.
2277                if (srcElem1 < 0 && destElem >= 0) {
2278                    destElem |= -((Element)1 << (sizeof(Element) * 8 -
2279                                                 1 - shiftAmt));
2280                }
2281                destElem += rBit;
2282            } else if (shiftAmt > 0) {
2283                bool sat = false;
2284                if (shiftAmt >= sizeof(Element) * 8) {
2285                    if (srcElem1 != 0)
2286                        sat = true;
2287                    else
2288                        destElem = 0;
2289                } else {
2290                    if (bits((uint64_t) srcElem1, sizeof(Element) * 8 - 1,
2291                                sizeof(Element) * 8 - 1 - shiftAmt) !=
2292                            ((srcElem1 < 0) ? mask(shiftAmt + 1) : 0)) {
2293                        sat = true;
2294                    } else {
2295                        destElem = srcElem1 << shiftAmt;
2296                    }
2297                }
2298                if (sat) {
2299                    fpscr.qc = 1;
2300                    destElem = mask(sizeof(Element) * 8 - 1);
2301                    if (srcElem1 < 0)
2302                        destElem = ~destElem;
2303                }
2304            } else {
2305                destElem = srcElem1;
2306            }
2307            FpscrQc = fpscr;
2308    '''
    # DX uses smallSignedTypes; QX and the scalar (scalar=True) form use
    # signedTypes.  NOTE(review): the op class here is "SimdCmpOp" while the
    # non-rounding SQSHL registrations use "SimdAluOp" -- confirm intended.
2309    threeEqualRegInstX("sqrshl", "SqrshlDX", "SimdCmpOp", smallSignedTypes, 2,
2310                       sqrshlCode)
2311    threeEqualRegInstX("sqrshl", "SqrshlQX", "SimdCmpOp", signedTypes, 4,
2312                       sqrshlCode)
2313    threeEqualRegInstX("sqrshl", "SqrshlScX", "SimdCmpOp", signedTypes, 4,
2314                       sqrshlCode, scalar=True)
2315    # SQRSHRN, SQRSHRN2
    # Saturating rounding shift right by immediate, narrowing to Element.
    #   imm > bit width of srcElem1: result 0; qc is set unless the source is
    #     0 or -1.
    #   imm != 0: the source is shifted by imm - 1, the low bit is kept as the
    #     rounding bit, one more shift completes the imm-bit shift, the sign
    #     fix-up ORs in the high bits when bit (BigElement width - 1 - imm) is
    #     set, and the rounding bit is added.  If the value does not survive a
    #     cast to Element, destElem saturates to mask(width - 1) (complemented
    #     for a negative source) and qc is set.
    #   imm == 0: the source is narrowed directly with the same saturation.
    # NOTE(review): srcElem1 is presumably of the wide BigElement type in the
    # template generated by twoRegNarrowInstX -- confirm.
2316    sqrshrnCode = '''
2317            FPSCR fpscr = (FPSCR) FpscrQc;
2318            if (imm > sizeof(srcElem1) * 8) {
2319                if (srcElem1 != 0 && srcElem1 != -1)
2320                    fpscr.qc = 1;
2321                destElem = 0;
2322            } else if (imm) {
2323                BigElement mid = (srcElem1 >> (imm - 1));
2324                uint64_t rBit = mid & 0x1;
2325                mid >>= 1;
2326                mid |= -(mid & ((BigElement)1 <<
2327                            (sizeof(BigElement) * 8 - 1 - imm)));
2328                mid += rBit;
2329                if (mid != (Element)mid) {
2330                    destElem = mask(sizeof(Element) * 8 - 1);
2331                    if (srcElem1 < 0)
2332                        destElem = ~destElem;
2333                    fpscr.qc = 1;
2334                } else {
2335                    destElem = mid;
2336                }
2337            } else {
2338                if (srcElem1 != (Element)srcElem1) {
2339                    destElem = mask(sizeof(Element) * 8 - 1);
2340                    if (srcElem1 < 0)
2341                        destElem = ~destElem;
2342                    fpscr.qc = 1;
2343                } else {
2344                    destElem = srcElem1;
2345                }
2346            }
2347            FpscrQc = fpscr;
2348    '''
    # Plain, "2" (hi=True, registered under the mnemonic "sqrshrn2") and
    # scalar forms, all with an immediate operand.
2349    twoRegNarrowInstX("sqrshrn", "SqrshrnX", "SimdShiftOp", smallSignedTypes,
2350                      sqrshrnCode, hasImm=True)
2351    twoRegNarrowInstX("sqrshrn2", "Sqrshrn2X", "SimdShiftOp", smallSignedTypes,
2352                      sqrshrnCode, hasImm=True, hi=True)
2353    twoRegNarrowInstX("sqrshrn", "SqrshrnScX", "SimdShiftOp", smallSignedTypes,
2354                      sqrshrnCode, hasImm=True, scalar=True)
2355    # SQRSHRUN, SQRSHRUN2
    # Saturating rounding shift right by immediate, narrowing a signed source
    # to an unsigned range.
    #   imm > bit width of srcElem1: result 0; qc is set for a non-zero source.
    #   imm != 0: same rounding shift as SQRSHRN (shift by imm - 1, keep the
    #     low bit as rounding bit, shift once more, sign fix-up, add rounding
    #     bit).  If any bit of the result above the Element width is set, the
    #     result saturates to 0 for a negative source or to
    #     mask(Element width) otherwise, and qc is set.
    #   imm == 0: a negative source gives 0 with qc set; otherwise the source
    #     is copied.
2356    sqrshrunCode = '''
2357            FPSCR fpscr = (FPSCR) FpscrQc;
2358            if (imm > sizeof(srcElem1) * 8) {
2359                if (srcElem1 != 0)
2360                    fpscr.qc = 1;
2361                destElem = 0;
2362            } else if (imm) {
2363                BigElement mid = (srcElem1 >> (imm - 1));
2364                uint64_t rBit = mid & 0x1;
2365                mid >>= 1;
2366                mid |= -(mid & ((BigElement)1 <<
2367                                (sizeof(BigElement) * 8 - 1 - imm)));
2368                mid += rBit;
2369                if (bits(mid, sizeof(BigElement) * 8 - 1,
2370                              sizeof(Element) * 8) != 0) {
2371                    if (srcElem1 < 0) {
2372                        destElem = 0;
2373                    } else {
2374                        destElem = mask(sizeof(Element) * 8);
2375                    }
2376                    fpscr.qc = 1;
2377                } else {
2378                    destElem = mid;
2379                }
2380            } else {
2381                if (srcElem1 < 0) {
2382                    fpscr.qc = 1;
2383                    destElem = 0;
2384                } else {
2385                    destElem = srcElem1;
2386                }
2387            }
2388            FpscrQc = fpscr;
2389    '''
    # Plain, "2" (hi=True) and scalar forms, all with an immediate operand and
    # all registered under the mnemonic "sqrshrun".
2390    twoRegNarrowInstX("sqrshrun", "SqrshrunX", "SimdShiftOp", smallSignedTypes,
2391                      sqrshrunCode, hasImm=True)
2392    twoRegNarrowInstX("sqrshrun", "Sqrshrun2X", "SimdShiftOp",
2393                      smallSignedTypes, sqrshrunCode, hasImm=True, hi=True)
2394    twoRegNarrowInstX("sqrshrun", "SqrshrunScX", "SimdShiftOp",
2395                      smallSignedTypes, sqrshrunCode, hasImm=True, scalar=True)
2396    # SQSHL (immediate)
    # Saturating shift left by immediate.
    #   imm >= element width: a non-zero source saturates (minimum value,
    #     complemented when the source is positive) and sets qc; a zero source
    #     gives 0.
    #   0 < imm < width: destElem is the shifted source.  topBits holds the
    #     source bits from the top down to (width - 1 - imm); unless they are
    #     all zero or all one (mask(imm + 1)) the result saturates as above
    #     and qc is set.
    #   imm == 0: the source is copied.
2397    sqshlImmCode = '''
2398            FPSCR fpscr = (FPSCR) FpscrQc;
2399            if (imm >= sizeof(Element) * 8) {
2400                if (srcElem1 != 0) {
2401                    destElem = std::numeric_limits<Element>::min();
2402                    if (srcElem1 > 0)
2403                        destElem = ~destElem;
2404                    fpscr.qc = 1;
2405                } else {
2406                    destElem = 0;
2407                }
2408            } else if (imm) {
2409                destElem = (srcElem1 << imm);
2410                uint64_t topBits = bits((uint64_t)srcElem1,
2411                                        sizeof(Element) * 8 - 1,
2412                                        sizeof(Element) * 8 - 1 - imm);
2413                if (topBits != 0 && topBits != mask(imm + 1)) {
2414                    destElem = std::numeric_limits<Element>::min();
2415                    if (srcElem1 > 0)
2416                        destElem = ~destElem;
2417                    fpscr.qc = 1;
2418                }
2419            } else {
2420                destElem = srcElem1;
2421            }
2422            FpscrQc = fpscr;
2423    '''
    # DX uses smallSignedTypes; QX and the scalar (scalar=True) form use
    # signedTypes.  All take an immediate (hasImm=True).
2424    twoEqualRegInstX("sqshl", "SqshlImmDX", "SimdAluOp", smallSignedTypes, 2,
2425                     sqshlImmCode, hasImm=True)
2426    twoEqualRegInstX("sqshl", "SqshlImmQX", "SimdAluOp", signedTypes, 4,
2427                     sqshlImmCode, hasImm=True)
2428    twoEqualRegInstX("sqshl", "SqshlImmScX", "SimdAluOp", signedTypes, 4,
2429                     sqshlImmCode, hasImm=True, scalar=True)
2430    # SQSHL (register)
    # Saturating shift by a per-element signed amount, without rounding.
    # shiftAmt is srcElem2 truncated to int8_t.
    #   shiftAmt < 0: right shift by -shiftAmt.  Shifts of the full width or
    #     more start from 0 with the shift clamped to width - 1; the fix-up
    #     then ORs in the sign bits when the source is negative but the
    #     shifted value is not.
    #   shiftAmt > 0: left shift.  It saturates when the shift is at least the
    #     element width and the source is non-zero, or when the source bits
    #     from the top down to (width - 1 - shift) are not all copies of the
    #     sign.  Saturation sets qc and stores mask(width - 1), complemented
    #     for a negative source.
    #   shiftAmt == 0: the source is copied.
2431    sqshlCode = '''
2432            int16_t shiftAmt = (int8_t)srcElem2;
2433            FPSCR fpscr = (FPSCR) FpscrQc;
2434            if (shiftAmt < 0) {
2435                shiftAmt = -shiftAmt;
2436                if (shiftAmt >= sizeof(Element) * 8) {
2437                    shiftAmt = sizeof(Element) * 8 - 1;
2438                    destElem = 0;
2439                } else {
2440                    destElem = (srcElem1 >> shiftAmt);
2441                }
2442                // Make sure the right shift sign extended when it should.
2443                if (srcElem1 < 0 && destElem >= 0) {
2444                    destElem |= -((Element)1 << (sizeof(Element) * 8 -
2445                                                 1 - shiftAmt));
2446                }
2447            } else if (shiftAmt > 0) {
2448                bool sat = false;
2449                if (shiftAmt >= sizeof(Element) * 8) {
2450                    if (srcElem1 != 0)
2451                        sat = true;
2452                    else
2453                        destElem = 0;
2454                } else {
2455                    if (bits((uint64_t) srcElem1, sizeof(Element) * 8 - 1,
2456                                sizeof(Element) * 8 - 1 - shiftAmt) !=
2457                            ((srcElem1 < 0) ? mask(shiftAmt + 1) : 0)) {
2458                        sat = true;
2459                    } else {
2460                        destElem = srcElem1 << shiftAmt;
2461                    }
2462                }
2463                if (sat) {
2464                    fpscr.qc = 1;
2465                    destElem = mask(sizeof(Element) * 8 - 1);
2466                    if (srcElem1 < 0)
2467                        destElem = ~destElem;
2468                }
2469            } else {
2470                destElem = srcElem1;
2471            }
2472            FpscrQc = fpscr;
2473    '''
    # DX uses smallSignedTypes; QX and the scalar (scalar=True) form use
    # signedTypes.
2474    threeEqualRegInstX("sqshl", "SqshlDX", "SimdAluOp", smallSignedTypes, 2,
2475                       sqshlCode)
2476    threeEqualRegInstX("sqshl", "SqshlQX", "SimdAluOp", signedTypes, 4,
2477                       sqshlCode)
2478    threeEqualRegInstX("sqshl", "SqshlScX", "SimdAluOp", signedTypes, 4,
2479                       sqshlCode, scalar=True)
2480    # SQSHLU
    # Saturating shift left by immediate of a signed source into an unsigned
    # range.
    #   imm >= element width: negative source -> 0 with qc; positive source ->
    #     mask(width) with qc; zero source -> 0.
    #   0 < imm < width: destElem is the shifted source; a negative source
    #     overrides it with 0 and sets qc; otherwise, if any of the top imm
    #     source bits (topBits) is set, it becomes mask(width) with qc.
    #   imm == 0: negative source -> 0 with qc; otherwise the source is copied.
2481    sqshluCode = '''
2482            FPSCR fpscr = (FPSCR) FpscrQc;
2483            if (imm >= sizeof(Element) * 8) {
2484                if (srcElem1 < 0) {
2485                    destElem = 0;
2486                    fpscr.qc = 1;
2487                } else if (srcElem1 > 0) {
2488                    destElem = mask(sizeof(Element) * 8);
2489                    fpscr.qc = 1;
2490                } else {
2491                    destElem = 0;
2492                }
2493            } else if (imm) {
2494                destElem = (srcElem1 << imm);
2495                uint64_t topBits = bits((uint64_t)srcElem1,
2496                                        sizeof(Element) * 8 - 1,
2497                                        sizeof(Element) * 8 - imm);
2498                if (srcElem1 < 0) {
2499                    destElem = 0;
2500                    fpscr.qc = 1;
2501                } else if (topBits != 0) {
2502                    destElem = mask(sizeof(Element) * 8);
2503                    fpscr.qc = 1;
2504                }
2505            } else {
2506                if (srcElem1 < 0) {
2507                    fpscr.qc = 1;
2508                    destElem = 0;
2509                } else {
2510                    destElem = srcElem1;
2511                }
2512            }
2513            FpscrQc = fpscr;
2514    '''
    # DX uses smallSignedTypes; QX and the scalar (scalar=True) form use
    # signedTypes.  All take an immediate (hasImm=True).
2515    twoEqualRegInstX("sqshlu", "SqshluDX", "SimdAluOp", smallSignedTypes, 2,
2516                     sqshluCode, hasImm=True)
2517    twoEqualRegInstX("sqshlu", "SqshluQX", "SimdAluOp", signedTypes, 4,
2518                     sqshluCode, hasImm=True)
2519    twoEqualRegInstX("sqshlu", "SqshluScX", "SimdAluOp", signedTypes, 4,
2520                     sqshluCode, hasImm=True, scalar=True)
2521    # SQSHRN, SQSHRN2
    # Saturating shift right by immediate, narrowing to Element (no rounding).
    #   imm > bit width of srcElem1: result 0; qc is set unless the source is
    #     0 or -1.
    #   imm != 0: the source is shifted right by imm in two steps (imm - 1,
    #     then 1), the sign fix-up ORs in the high bits when bit
    #     (BigElement width - 1 - imm) is set, and if the value does not
    #     survive a cast to Element it saturates to mask(width - 1)
    #     (complemented for a negative source) with qc set.
    #   imm == 0: the source is assigned to destElem without a saturation
    #     check.
2522    sqshrnCode = '''
2523        FPSCR fpscr = (FPSCR) FpscrQc;
2524        if (imm > sizeof(srcElem1) * 8) {
2525            if (srcElem1 != 0 && srcElem1 != -1)
2526                fpscr.qc = 1;
2527            destElem = 0;
2528        } else if (imm) {
2529            BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
2530            mid |= -(mid & ((BigElement)1 <<
2531                        (sizeof(BigElement) * 8 - 1 - imm)));
2532            if (mid != (Element)mid) {
2533                destElem = mask(sizeof(Element) * 8 - 1);
2534                if (srcElem1 < 0)
2535                    destElem = ~destElem;
2536                fpscr.qc = 1;
2537            } else {
2538                destElem = mid;
2539            }
2540        } else {
2541            destElem = srcElem1;
2542        }
2543        FpscrQc = fpscr;
2544    '''
    # Plain, "2" (hi=True, registered under the mnemonic "sqshrn2") and
    # scalar forms, all with an immediate operand.
2545    twoRegNarrowInstX("sqshrn", "SqshrnX", "SimdShiftOp", smallSignedTypes,
2546                      sqshrnCode, hasImm=True)
2547    twoRegNarrowInstX("sqshrn2", "Sqshrn2X", "SimdShiftOp", smallSignedTypes,
2548                      sqshrnCode, hasImm=True, hi=True)
2549    twoRegNarrowInstX("sqshrn", "SqshrnScX", "SimdShiftOp", smallSignedTypes,
2550                      sqshrnCode, hasImm=True, scalar=True)
2551    # SQSHRUN, SQSHRUN2
    # Saturating shift right by immediate, narrowing a signed source to an
    # unsigned range (no rounding).
    #   imm > bit width of srcElem1: result 0; qc is set for a non-zero source.
    #   imm != 0: the source is shifted right by imm in two steps; if any bit
    #     of the result above the Element width is set, the result saturates
    #     to 0 for a negative source or mask(Element width) otherwise, and qc
    #     is set.
    #   imm == 0: the source is assigned to destElem without a saturation
    #     check.
2552    sqshrunCode = '''
2553            FPSCR fpscr = (FPSCR) FpscrQc;
2554            if (imm > sizeof(srcElem1) * 8) {
2555                if (srcElem1 != 0)
2556                    fpscr.qc = 1;
2557                destElem = 0;
2558            } else if (imm) {
2559                BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
2560                if (bits(mid, sizeof(BigElement) * 8 - 1,
2561                              sizeof(Element) * 8) != 0) {
2562                    if (srcElem1 < 0) {
2563                        destElem = 0;
2564                    } else {
2565                        destElem = mask(sizeof(Element) * 8);
2566                    }
2567                    fpscr.qc = 1;
2568                } else {
2569                    destElem = mid;
2570                }
2571            } else {
2572                destElem = srcElem1;
2573            }
2574            FpscrQc = fpscr;
2575    '''
    # Plain, "2" (hi=True) and scalar forms, all with an immediate operand and
    # all registered under the mnemonic "sqshrun".
2576    twoRegNarrowInstX("sqshrun", "SqshrunX", "SimdShiftOp", smallSignedTypes,
2577                      sqshrunCode, hasImm=True)
2578    twoRegNarrowInstX("sqshrun", "Sqshrun2X", "SimdShiftOp", smallSignedTypes,
2579                      sqshrunCode, hasImm=True, hi=True)
2580    twoRegNarrowInstX("sqshrun", "SqshrunScX", "SimdShiftOp", smallSignedTypes,
2581                      sqshrunCode, hasImm=True, scalar=True)
2582    # SQSUB
    # Saturating subtract.  The snippet first stores the plain difference,
    # then treats it as overflowed when the result sign differs from
    # srcElem1's sign and srcElem1 is negative exactly when srcElem2 is
    # non-negative (i.e. the operands have opposite signs).  On overflow
    # destElem is set to the minimum Element value and, if the wrapped result
    # was negative, 1 is subtracted from it -- intended to wrap round to the
    # maximum value -- and qc is set.
2583    sqsubCode = '''
2584            destElem = srcElem1 - srcElem2;
2585            FPSCR fpscr = (FPSCR) FpscrQc;
2586            bool negDest = (destElem < 0);
2587            bool negSrc1 = (srcElem1 < 0);
2588            bool posSrc2 = (srcElem2 >= 0);
2589            if ((negDest != negSrc1) && (negSrc1 == posSrc2)) {
2590                destElem = std::numeric_limits<Element>::min();
2591                if (negDest)
2592                    destElem -= 1;
2593                fpscr.qc = 1;
2594            }
2595            FpscrQc = fpscr;
2596    '''
    # DX uses smallSignedTypes; QX and the scalar (scalar=True) form use
    # signedTypes.
2597    threeEqualRegInstX("sqsub", "SqsubDX", "SimdAddOp", smallSignedTypes, 2,
2598                       sqsubCode)
2599    threeEqualRegInstX("sqsub", "SqsubQX", "SimdAddOp", signedTypes, 4,
2600                       sqsubCode)
2601    threeEqualRegInstX("sqsub", "SqsubScX", "SimdAddOp", signedTypes, 4,
2602                       sqsubCode, scalar=True)
2603    # SQXTN, SQXTN2
    # Saturating narrow: the source is assigned to destElem; if widening
    # destElem back to BigElement does not reproduce the source, qc is set and
    # destElem becomes mask(width - 1), complemented for a negative source.
2604    sqxtnCode = '''
2605            FPSCR fpscr = (FPSCR) FpscrQc;
2606            destElem = srcElem1;
2607            if ((BigElement)destElem != srcElem1) {
2608                fpscr.qc = 1;
2609                destElem = mask(sizeof(Element) * 8 - 1);
2610                if (srcElem1 < 0)
2611                    destElem = ~destElem;
2612            }
2613            FpscrQc = fpscr;
2614    '''
    # Plain, "2" (hi=True) and scalar forms over smallSignedTypes.
2615    twoRegNarrowInstX("sqxtn", "SqxtnX", "SimdMiscOp", smallSignedTypes,
2616                      sqxtnCode)
2617    twoRegNarrowInstX("sqxtn", "Sqxtn2X", "SimdMiscOp", smallSignedTypes,
2618                      sqxtnCode, hi=True)
2619    twoRegNarrowInstX("sqxtn", "SqxtnScX", "SimdMiscOp", smallSignedTypes,
2620                      sqxtnCode, scalar=True)
2621    # SQXTUN, SQXTUN2
    # Saturating narrow of a signed source to an unsigned range: the source is
    # assigned to destElem; if the source is negative, or the low
    # Element-width bits of destElem do not reproduce the source, qc is set
    # and destElem becomes mask(width), complemented (all bits clear) for a
    # negative source.
2622    sqxtunCode = '''
2623            FPSCR fpscr = (FPSCR) FpscrQc;
2624            destElem = srcElem1;
2625            if (srcElem1 < 0 ||
2626                    ((BigElement)destElem & mask(sizeof(Element) * 8)) != srcElem1) {
2627                fpscr.qc = 1;
2628                destElem = mask(sizeof(Element) * 8);
2629                if (srcElem1 < 0)
2630                    destElem = ~destElem;
2631            }
2632            FpscrQc = fpscr;
2633    '''
    # Plain, "2" (hi=True) and scalar forms over smallSignedTypes.
2634    twoRegNarrowInstX("sqxtun", "SqxtunX", "SimdMiscOp", smallSignedTypes,
2635                      sqxtunCode)
2636    twoRegNarrowInstX("sqxtun", "Sqxtun2X", "SimdMiscOp", smallSignedTypes,
2637                      sqxtunCode, hi=True)
2638    twoRegNarrowInstX("sqxtun", "SqxtunScX", "SimdMiscOp", smallSignedTypes,
2639                      sqxtunCode, scalar=True)
2640    # SRHADD
    # Rounding halving add.  carryBit is (lsb(srcElem1) + lsb(srcElem2) + 1)
    # shifted right by one; each source then has its low bit cleared and is
    # divided by 2, the halves are summed and carryBit is added.  Halving the
    # operands before adding avoids forming the full-width sum.
2641    rhaddCode = '''
2642            Element carryBit =
2643                (((unsigned)srcElem1 & 0x1) +
2644                 ((unsigned)srcElem2 & 0x1) + 1) >> 1;
2645            // Use division instead of a shift to ensure the sign extension works
2646            // right. The compiler will figure out if it can be a shift. Mask the
2647            // inputs so they get truncated correctly.
2648            destElem = (((srcElem1 & ~(Element)1) / 2) +
2649                        ((srcElem2 & ~(Element)1) / 2)) + carryBit;
2650    '''
    # Registered as SrhaddDX and SrhaddQX over smallSignedTypes.
2651    threeEqualRegInstX("srhadd", "SrhaddDX", "SimdAddOp", smallSignedTypes, 2,
2652                       rhaddCode)
2653    threeEqualRegInstX("srhadd", "SrhaddQX", "SimdAddOp", smallSignedTypes, 4,
2654                       rhaddCode)
2655    # SRI
2656    sriCode = '''
2657            if (imm >= sizeof(Element) * 8)
2658                destElem = destElem;
2659            else
2660                destElem = (srcElem1 >> imm) |
2661                    (destElem & ~mask(sizeof(Element) * 8 - imm));
2662    '''
2663    twoEqualRegInstX("sri", "SriDX", "SimdShiftOp", unsignedTypes, 2, sriCode,
2664                     True, hasImm=True)
2665    twoEqualRegInstX("sri", "SriQX", "SimdShiftOp", unsignedTypes, 4, sriCode,
2666                     True, hasImm=True)
2667    # SRSHL
2668    rshlCode = '''
2669            int16_t shiftAmt = (int8_t)srcElem2;
2670            if (shiftAmt < 0) {
2671                shiftAmt = -shiftAmt;
2672                Element rBit = 0;
2673                if (shiftAmt <= sizeof(Element) * 8)
2674                    rBit = bits(srcElem1, shiftAmt - 1);
2675                if (shiftAmt > sizeof(Element) * 8 && ltz(srcElem1))
2676                    rBit = 1;
2677                if (shiftAmt >= sizeof(Element) * 8) {
2678                    shiftAmt = sizeof(Element) * 8 - 1;
2679                    destElem = 0;
2680                } else {
2681                    destElem = (srcElem1 >> shiftAmt);
2682                }
2683                // Make sure the right shift sign extended when it should.
2684                if (ltz(srcElem1) && !ltz(destElem)) {
2685                    destElem |= -((Element)1 << (sizeof(Element) * 8 -
2686                                                 1 - shiftAmt));
2687                }
2688                destElem += rBit;
2689            } else if (shiftAmt > 0) {
2690                if (shiftAmt >= sizeof(Element) * 8) {
2691                    destElem = 0;
2692                } else {
2693                    destElem = srcElem1 << shiftAmt;
2694                }
2695            } else {
2696                destElem = srcElem1;
2697            }
2698    '''
2699    threeEqualRegInstX("srshl", "SrshlDX", "SimdShiftOp", signedTypes, 2,
2700                       rshlCode)
2701    threeEqualRegInstX("srshl", "SrshlQX", "SimdShiftOp", signedTypes, 4,
2702                       rshlCode)
2703    # SRSHR
2704    rshrCode = '''
2705            if (imm > sizeof(srcElem1) * 8) {
2706                destElem = 0;
2707            } else if (imm) {
2708                Element rBit = bits(srcElem1, imm - 1);
2709                destElem = ((srcElem1 >> (imm - 1)) >> 1) + rBit;
2710            } else {
2711                destElem = srcElem1;
2712            }
2713    '''
2714    twoEqualRegInstX("srshr", "SrshrDX", "SimdShiftOp", signedTypes, 2,
2715                     rshrCode, hasImm=True)
2716    twoEqualRegInstX("srshr", "SrshrQX", "SimdShiftOp", signedTypes, 4,
2717                     rshrCode, hasImm=True)
2718    # SRSRA
2719    rsraCode = '''
2720            if (imm > sizeof(srcElem1) * 8) {
2721                destElem += 0;
2722            } else if (imm) {
2723                Element rBit = bits(srcElem1, imm - 1);
2724                destElem += ((srcElem1 >> (imm - 1)) >> 1) + rBit;
2725            } else {
2726                destElem += srcElem1;
2727            }
2728    '''
2729    twoEqualRegInstX("srsra", "SrsraDX", "SimdShiftOp", signedTypes, 2,
2730                     rsraCode, True, hasImm=True)
2731    twoEqualRegInstX("srsra", "SrsraQX", "SimdShiftOp", signedTypes, 4,
2732                     rsraCode, True, hasImm=True)
2733    # SSHL
2734    shlCode = '''
2735            int16_t shiftAmt = (int8_t)srcElem2;
2736            if (shiftAmt < 0) {
2737                shiftAmt = -shiftAmt;
2738                if (shiftAmt >= sizeof(Element) * 8) {
2739                    shiftAmt = sizeof(Element) * 8 - 1;
2740                    destElem = 0;
2741                } else {
2742                    destElem = (srcElem1 >> shiftAmt);
2743                }
2744                // Make sure the right shift sign extended when it should.
2745                if (ltz(srcElem1) && !ltz(destElem)) {
2746                    destElem |= -((Element)1 << (sizeof(Element) * 8 -
2747                                                 1 - shiftAmt));
2748                }
2749            } else {
2750                if (shiftAmt >= sizeof(Element) * 8) {
2751                    destElem = 0;
2752                } else {
2753                    destElem = srcElem1 << shiftAmt;
2754                }
2755            }
2756    '''
2757    threeEqualRegInstX("sshl", "SshlDX", "SimdShiftOp", signedTypes, 2,
2758                       shlCode)
2759    threeEqualRegInstX("sshl", "SshlQX", "SimdShiftOp", signedTypes, 4,
2760                       shlCode)
2761    # SSHLL, SSHLL2
2762    shllCode = '''
2763            if (imm >= sizeof(destElem) * 8) {
2764                destElem = 0;
2765            } else {
2766                destElem = (BigElement)srcElem1 << imm;
2767            }
2768    '''
2769    twoRegLongInstX("sshll", "SshllX", "SimdShiftOp", smallSignedTypes,
2770                    shllCode, hasImm=True)
2771    twoRegLongInstX("sshll", "Sshll2X", "SimdShiftOp", smallSignedTypes,
2772                    shllCode, hasImm=True, hi=True)
2773    # SSHR
2774    shrCode = '''
2775            if (imm >= sizeof(srcElem1) * 8) {
2776                if (ltz(srcElem1))
2777                    destElem = -1;
2778                else
2779                    destElem = 0;
2780            } else {
2781                destElem = srcElem1 >> imm;
2782            }
2783    '''
2784    twoEqualRegInstX("sshr", "SshrDX", "SimdShiftOp", signedTypes, 2, shrCode,
2785                     hasImm=True)
2786    twoEqualRegInstX("sshr", "SshrQX", "SimdShiftOp", signedTypes, 4, shrCode,
2787                     hasImm=True)
2788    # SSRA
2789    sraCode = '''
2790            Element mid;;
2791            if (imm >= sizeof(srcElem1) * 8) {
2792                mid = ltz(srcElem1) ? -1 : 0;
2793            } else {
2794                mid = srcElem1 >> imm;
2795                if (ltz(srcElem1) && !ltz(mid)) {
2796                    mid |= -(mid & ((Element)1 <<
2797                                    (sizeof(Element) * 8 - 1 - imm)));
2798                }
2799            }
2800            destElem += mid;
2801    '''
2802    twoEqualRegInstX("ssra", "SsraDX", "SimdShiftOp", signedTypes, 2, sraCode,
2803                     True, hasImm=True)
2804    twoEqualRegInstX("ssra", "SsraQX", "SimdShiftOp", signedTypes, 4, sraCode,
2805                     True, hasImm=True)
2806    # SSUBL
2807    sublwCode = "destElem = (BigElement)srcElem1 - (BigElement)srcElem2;"
2808    threeRegLongInstX("ssubl", "SsublX", "SimdAddOp", smallSignedTypes,
2809                      sublwCode)
2810    threeRegLongInstX("ssubl2", "Ssubl2X", "SimdAddOp", smallSignedTypes,
2811                      sublwCode, hi=True)
2812    # SSUBW
2813    threeRegWideInstX("ssubw", "SsubwX", "SimdAddOp", smallSignedTypes,
2814                      sublwCode)
2815    threeRegWideInstX("ssubw2", "Ssubw2X", "SimdAddOp", smallSignedTypes,
2816                      sublwCode, hi=True)
2817    # SUB
2818    subCode = "destElem = srcElem1 - srcElem2;"
2819    threeEqualRegInstX("sub", "SubDX", "SimdAddOp", unsignedTypes, 2, subCode)
2820    threeEqualRegInstX("sub", "SubQX", "SimdAddOp", unsignedTypes, 4, subCode)
2821    # SUBHN, SUBHN2
2822    subhnCode = '''
2823            destElem = ((BigElement)srcElem1 - (BigElement)srcElem2) >>
2824                        (sizeof(Element) * 8);
2825    '''
2826    threeRegNarrowInstX("subhn", "SubhnX", "SimdAddOp", smallUnsignedTypes,
2827                        subhnCode)
2828    threeRegNarrowInstX("subhn2", "Subhn2X", "SimdAddOp", smallUnsignedTypes,
2829                        subhnCode, hi=True)
2830    # SUQADD
2831    suqaddCode = '''
2832            FPSCR fpscr = (FPSCR) FpscrQc;
2833            Element tmp = destElem + srcElem1;
2834            if (bits(destElem, sizeof(Element) * 8 - 1) == 0) {
2835                if (bits(tmp, sizeof(Element) * 8 - 1) == 1 ||
2836                        tmp < srcElem1 || tmp < destElem) {
2837                    destElem = (((Element) 1) << (sizeof(Element) * 8 - 1)) - 1;
2838                    fpscr.qc = 1;
2839                } else {
2840                    destElem = tmp;
2841                }
2842            } else {
2843                Element absDestElem = (~destElem) + 1;
2844                if (absDestElem < srcElem1) {
2845                    // Still check for positive sat., no need to check for negative sat.
2846                    if (bits(tmp, sizeof(Element) * 8 - 1) == 1) {
2847                        destElem = (((Element) 1) << (sizeof(Element) * 8 - 1)) - 1;
2848                        fpscr.qc = 1;
2849                    } else {
2850                        destElem = tmp;
2851                    }
2852                } else {
2853                    destElem = tmp;
2854                }
2855            }
2856            FpscrQc = fpscr;
2857    '''
2858    twoEqualRegInstX("suqadd", "SuqaddDX", "SimdAddOp", smallUnsignedTypes, 2,
2859                     suqaddCode, True)
2860    twoEqualRegInstX("suqadd", "SuqaddQX", "SimdAddOp", unsignedTypes, 4,
2861                     suqaddCode, True)
2862    twoEqualRegInstX("suqadd", "SuqaddScX", "SimdAddOp", unsignedTypes, 4,
2863                     suqaddCode, True, scalar=True)
2864    # SXTL -> alias to SSHLL
2865    # TBL
2866    tbxTblInstX("tbl", "Tbl1DX", "SimdMiscOp", ("uint8_t",), 1, "true", 2)
2867    tbxTblInstX("tbl", "Tbl1QX", "SimdMiscOp", ("uint8_t",), 1, "true", 4)
2868    tbxTblInstX("tbl", "Tbl2DX", "SimdMiscOp", ("uint8_t",), 2, "true", 2)
2869    tbxTblInstX("tbl", "Tbl2QX", "SimdMiscOp", ("uint8_t",), 2, "true", 4)
2870    tbxTblInstX("tbl", "Tbl3DX", "SimdMiscOp", ("uint8_t",), 3, "true", 2)
2871    tbxTblInstX("tbl", "Tbl3QX", "SimdMiscOp", ("uint8_t",), 3, "true", 4)
2872    tbxTblInstX("tbl", "Tbl4DX", "SimdMiscOp", ("uint8_t",), 4, "true", 2)
2873    tbxTblInstX("tbl", "Tbl4QX", "SimdMiscOp", ("uint8_t",), 4, "true", 4)
2874    # TBX
2875    tbxTblInstX("tbx", "Tbx1DX", "SimdMiscOp", ("uint8_t",), 1, "false", 2)
2876    tbxTblInstX("tbx", "Tbx1QX", "SimdMiscOp", ("uint8_t",), 1, "false", 4)
2877    tbxTblInstX("tbx", "Tbx2DX", "SimdMiscOp", ("uint8_t",), 2, "false", 2)
2878    tbxTblInstX("tbx", "Tbx2QX", "SimdMiscOp", ("uint8_t",), 2, "false", 4)
2879    tbxTblInstX("tbx", "Tbx3DX", "SimdMiscOp", ("uint8_t",), 3, "false", 2)
2880    tbxTblInstX("tbx", "Tbx3QX", "SimdMiscOp", ("uint8_t",), 3, "false", 4)
2881    tbxTblInstX("tbx", "Tbx4DX", "SimdMiscOp", ("uint8_t",), 4, "false", 2)
2882    tbxTblInstX("tbx", "Tbx4QX", "SimdMiscOp", ("uint8_t",), 4, "false", 4)
2883    # TRN1
2884    trnCode = '''
2885        unsigned part = %s;
2886        for (unsigned i = 0; i < eCount / 2; i++) {
2887            destReg.elements[2 * i] = srcReg1.elements[2 * i + part];
2888            destReg.elements[2 * i + 1] = srcReg2.elements[2 * i + part];
2889        }
2890    '''
2891    threeRegScrambleInstX("trn1", "Trn1DX", "SimdAluOp", smallUnsignedTypes, 2,
2892                          trnCode % "0")
2893    threeRegScrambleInstX("trn1", "Trn1QX", "SimdAluOp", unsignedTypes, 4,
2894                          trnCode % "0")
2895    # TRN2
2896    threeRegScrambleInstX("trn2", "Trn2DX", "SimdAluOp", smallUnsignedTypes, 2,
2897                          trnCode % "1")
2898    threeRegScrambleInstX("trn2", "Trn2QX", "SimdAluOp", unsignedTypes, 4,
2899                          trnCode % "1")
2900    # UABA
2901    threeEqualRegInstX("uaba", "UabaDX", "SimdAddAccOp", smallUnsignedTypes, 2,
2902                       abaCode, True)
2903    threeEqualRegInstX("uaba", "UabaQX", "SimdAddAccOp", smallUnsignedTypes, 4,
2904                       abaCode, True)
2905    # UABAL, UABAL2
2906    threeRegLongInstX("uabal", "UabalX", "SimdAddAccOp", smallUnsignedTypes,
2907                      abalCode, True)
2908    threeRegLongInstX("uabal2", "Uabal2X", "SimdAddAccOp", smallUnsignedTypes,
2909                      abalCode, True, hi=True)
2910    # UABD
2911    threeEqualRegInstX("uabd", "UabdDX", "SimdAddOp", smallUnsignedTypes, 2,
2912                       abdCode)
2913    threeEqualRegInstX("uabd", "UabdQX", "SimdAddOp", smallUnsignedTypes, 4,
2914                       abdCode)
2915    # UABDL, UABDL2
2916    threeRegLongInstX("uabdl", "UabdlX", "SimdAddAccOp", smallUnsignedTypes,
2917                      abdlCode, True)
2918    threeRegLongInstX("uabdl2", "Uabdl2X", "SimdAddAccOp", smallUnsignedTypes,
2919                      abdlCode, True, hi=True)
2920    # UADALP
2921    twoRegCondenseInstX("uadalp", "UadalpDX", "SimdAddOp", smallUnsignedTypes,
2922                        2, adalpCode, True)
2923    twoRegCondenseInstX("uadalp", "UadalpQX", "SimdAddOp", smallUnsignedTypes,
2924                        4, adalpCode, True)
2925    # UADDL, UADDL2
2926    threeRegLongInstX("uaddl", "UaddlX", "SimdAddAccOp", smallUnsignedTypes,
2927                      addlwCode)
2928    threeRegLongInstX("uaddl2", "Uaddl2X", "SimdAddAccOp", smallUnsignedTypes,
2929                      addlwCode, hi=True)
2930    # UADDLP
2931    twoRegCondenseInstX("uaddlp", "UaddlpDX", "SimdAddOp", smallUnsignedTypes,
2932                        2, addlwCode)
2933    twoRegCondenseInstX("uaddlp", "UaddlpQX", "SimdAddOp", smallUnsignedTypes,
2934                        4, addlwCode)
2935    # UADDLV
2936    twoRegAcrossInstX("uaddlv", "UaddlvDX", "SimdAddOp",
2937                      ("uint8_t", "uint16_t"), 2, addAcrossLongCode, long=True)
2938    twoRegAcrossInstX("uaddlv", "UaddlvQX", "SimdAddOp",
2939                      ("uint8_t", "uint16_t"), 4, addAcrossLongCode, long=True)
2940    twoRegAcrossInstX("uaddlv", "UaddlvBQX", "SimdAddOp", ("uint32_t",), 4,
2941                      addAcrossLongCode, doubleDest=True, long=True)
2942    # UADDW
2943    threeRegWideInstX("uaddw", "UaddwX", "SimdAddAccOp", smallUnsignedTypes,
2944                      addlwCode)
2945    threeRegWideInstX("uaddw2", "Uaddw2X", "SimdAddAccOp", smallUnsignedTypes,
2946                      addlwCode, hi=True)
2947    # UCVTF (fixed-point)
2948    ucvtfFixedCode = fpOp % ("fplibFixedToFP<Element>(srcElem1, imm, true,"
2949                             " FPCRRounding(fpscr), fpscr)")
2950    twoEqualRegInstX("ucvtf", "UcvtfFixedDX", "SimdCvtOp", smallFloatTypes, 2,
2951                     ucvtfFixedCode, hasImm=True)
2952    twoEqualRegInstX("ucvtf", "UcvtfFixedQX", "SimdCvtOp", floatTypes, 4,
2953                     ucvtfFixedCode, hasImm=True)
2954    twoEqualRegInstX("ucvtf", "UcvtfFixedScX", "SimdCvtOp", floatTypes, 4,
2955                     ucvtfFixedCode, hasImm=True, scalar=True)
2956    # UCVTF (integer)
2957    ucvtfIntCode = fpOp % ("fplibFixedToFP<Element>(srcElem1, 0, true,"
2958                           " FPCRRounding(fpscr), fpscr)")
2959    twoEqualRegInstX("ucvtf", "UcvtfIntDX", "SimdCvtOp", smallFloatTypes, 2,
2960                     ucvtfIntCode)
2961    twoEqualRegInstX("ucvtf", "UcvtfIntQX", "SimdCvtOp", floatTypes, 4,
2962                     ucvtfIntCode)
2963    twoEqualRegInstX("ucvtf", "UcvtfIntScX", "SimdCvtOp", floatTypes, 4,
2964                     ucvtfIntCode, scalar=True)
2965    # UHADD
2966    threeEqualRegInstX("uhadd", "UhaddDX", "SimdAddOp", smallUnsignedTypes, 2,
2967                       haddCode)
2968    threeEqualRegInstX("uhadd", "UhaddQX", "SimdAddOp", smallUnsignedTypes, 4,
2969                       haddCode)
2970    # UHSUB
2971    threeEqualRegInstX("uhsub", "UhsubDX", "SimdAddOp", smallUnsignedTypes, 2,
2972                       hsubCode)
2973    threeEqualRegInstX("uhsub", "UhsubQX", "SimdAddOp", smallUnsignedTypes, 4,
2974                       hsubCode)
2975    # UMAX
2976    threeEqualRegInstX("umax", "UmaxDX", "SimdCmpOp", smallUnsignedTypes, 2,
2977                       maxCode)
2978    threeEqualRegInstX("umax", "UmaxQX", "SimdCmpOp", smallUnsignedTypes, 4,
2979                       maxCode)
2980    # UMAXP
2981    threeEqualRegInstX("umaxp", "UmaxpDX", "SimdCmpOp", smallUnsignedTypes, 2,
2982                       maxCode, pairwise=True)
2983    threeEqualRegInstX("umaxp", "UmaxpQX", "SimdCmpOp", smallUnsignedTypes, 4,
2984                       maxCode, pairwise=True)
2985    # UMAXV
2986    twoRegAcrossInstX("umaxv", "UmaxvDX", "SimdCmpOp", ("uint8_t", "uint16_t"),
2987                      2, maxAcrossCode)
2988    twoRegAcrossInstX("umaxv", "UmaxvQX", "SimdCmpOp", smallUnsignedTypes, 4,
2989                      maxAcrossCode)
2990    # UMIN
2991    threeEqualRegInstX("umin", "UminDX", "SimdCmpOp", smallUnsignedTypes, 2,
2992                       minCode)
2993    threeEqualRegInstX("umin", "UminQX", "SimdCmpOp", smallUnsignedTypes, 4,
2994                       minCode)
2995    # UMINP
2996    threeEqualRegInstX("uminp", "UminpDX", "SimdCmpOp", smallUnsignedTypes, 2,
2997                       minCode, pairwise=True)
2998    threeEqualRegInstX("uminp", "UminpQX", "SimdCmpOp", smallUnsignedTypes, 4,
2999                       minCode, pairwise=True)
3000    # UMINV
3001    twoRegAcrossInstX("uminv", "UminvDX", "SimdCmpOp", ("uint8_t", "uint16_t"),
3002                      2, minAcrossCode)
3003    twoRegAcrossInstX("uminv", "UminvQX", "SimdCmpOp", smallUnsignedTypes, 4,
3004                      minAcrossCode)
3005    # UMLAL (by element)
3006    threeRegLongInstX("umlal", "UmlalElemX", "SimdMultAccOp",
3007                      smallUnsignedTypes, mlalCode, True, byElem=True)
3008    threeRegLongInstX("umlal", "UmlalElem2X", "SimdMultAccOp",
3009                      smallUnsignedTypes, mlalCode, True, byElem=True, hi=True)
3010    # UMLAL (vector)
3011    threeRegLongInstX("umlal", "UmlalX", "SimdMultAccOp", smallUnsignedTypes,
3012                      mlalCode, True)
3013    threeRegLongInstX("umlal", "Umlal2X", "SimdMultAccOp", smallUnsignedTypes,
3014                      mlalCode, True, hi=True)
3015    # UMLSL (by element)
3016    threeRegLongInstX("umlsl", "UmlslElemX", "SimdMultAccOp",
3017                      smallUnsignedTypes, mlslCode, True, byElem=True)
3018    threeRegLongInstX("umlsl", "UmlslElem2X", "SimdMultAccOp",
3019                      smallUnsignedTypes, mlslCode, True, byElem=True, hi=True)
3020    # UMLSL (vector)
3021    threeRegLongInstX("umlsl", "UmlslX", "SimdMultAccOp", smallUnsignedTypes,
3022                      mlslCode, True)
3023    threeRegLongInstX("umlsl", "Umlsl2X", "SimdMultAccOp", smallUnsignedTypes,
3024                      mlslCode, True, hi=True)
3025    # UMOV
3026    insToGprInstX("umov", "UmovWX", "SimdMiscOp", smallUnsignedTypes, 4, 'W')
3027    insToGprInstX("umov", "UmovXX", "SimdMiscOp", ("uint64_t",), 4, 'X')
3028    # UMULL, UMULL2 (by element)
3029    threeRegLongInstX("umull", "UmullElemX", "SimdMultOp", smallUnsignedTypes,
3030                      mullCode, byElem=True)
3031    threeRegLongInstX("umull", "UmullElem2X", "SimdMultOp", smallUnsignedTypes,
3032                      mullCode, byElem=True, hi=True)
3033    # UMULL, UMULL2 (vector)
3034    threeRegLongInstX("umull", "UmullX", "SimdMultOp", smallUnsignedTypes,
3035                      mullCode)
3036    threeRegLongInstX("umull", "Umull2X", "SimdMultOp", smallUnsignedTypes,
3037                      mullCode, hi=True)
3038    # UQADD
3039    uqaddCode = '''
3040            destElem = srcElem1 + srcElem2;
3041            FPSCR fpscr = (FPSCR) FpscrQc;
3042            if (destElem < srcElem1 || destElem < srcElem2) {
3043                destElem = (Element)(-1);
3044                fpscr.qc = 1;
3045            }
3046            FpscrQc = fpscr;
3047    '''
3048    threeEqualRegInstX("uqadd", "UqaddDX", "SimdAddOp", smallUnsignedTypes, 2,
3049                       uqaddCode)
3050    threeEqualRegInstX("uqadd", "UqaddQX", "SimdAddOp", unsignedTypes, 4,
3051                       uqaddCode)
3052    threeEqualRegInstX("uqadd", "UqaddScX", "SimdAddOp", unsignedTypes, 4,
3053                       uqaddCode, scalar=True)
3054    # UQRSHL
3055    uqrshlCode = '''
3056            int16_t shiftAmt = (int8_t)srcElem2;
3057            FPSCR fpscr = (FPSCR) FpscrQc;
3058            if (shiftAmt < 0) {
3059                shiftAmt = -shiftAmt;
3060                Element rBit = 0;
3061                if (shiftAmt <= sizeof(Element) * 8)
3062                    rBit = bits(srcElem1, shiftAmt - 1);
3063                if (shiftAmt >= sizeof(Element) * 8) {
3064                    shiftAmt = sizeof(Element) * 8 - 1;
3065                    destElem = 0;
3066                } else {
3067                    destElem = (srcElem1 >> shiftAmt);
3068                }
3069                destElem += rBit;
3070            } else {
3071                if (shiftAmt >= sizeof(Element) * 8) {
3072                    if (srcElem1 != 0) {
3073                        destElem = mask(sizeof(Element) * 8);
3074                        fpscr.qc = 1;
3075                    } else {
3076                        destElem = 0;
3077                    }
3078                } else {
3079                    if (bits(srcElem1, sizeof(Element) * 8 - 1,
3080                                sizeof(Element) * 8 - shiftAmt)) {
3081                        destElem = mask(sizeof(Element) * 8);
3082                        fpscr.qc = 1;
3083                    } else {
3084                        destElem = srcElem1 << shiftAmt;
3085                    }
3086                }
3087            }
3088            FpscrQc = fpscr;
3089    '''
3090    threeEqualRegInstX("uqrshl", "UqrshlDX", "SimdCmpOp", smallUnsignedTypes,
3091                       2, uqrshlCode)
3092    threeEqualRegInstX("uqrshl", "UqrshlQX", "SimdCmpOp", unsignedTypes, 4,
3093                       uqrshlCode)
3094    threeEqualRegInstX("uqrshl", "UqrshlScX", "SimdCmpOp", unsignedTypes, 4,
3095                       uqrshlCode, scalar=True)
3096    # UQRSHRN
3097    uqrshrnCode = '''
3098            FPSCR fpscr = (FPSCR) FpscrQc;
3099            if (imm > sizeof(srcElem1) * 8) {
3100                if (srcElem1 != 0)
3101                    fpscr.qc = 1;
3102                destElem = 0;
3103            } else if (imm) {
3104                BigElement mid = (srcElem1 >> (imm - 1));
3105                uint64_t rBit = mid & 0x1;
3106                mid >>= 1;
3107                mid += rBit;
3108                if (mid != (Element)mid) {
3109                    destElem = mask(sizeof(Element) * 8);
3110                    fpscr.qc = 1;
3111                } else {
3112                    destElem = mid;
3113                }
3114            } else {
3115                if (srcElem1 != (Element)srcElem1) {
3116                    destElem = mask(sizeof(Element) * 8 - 1);
3117                    fpscr.qc = 1;
3118                } else {
3119                    destElem = srcElem1;
3120                }
3121            }
3122            FpscrQc = fpscr;
3123    '''
3124    twoRegNarrowInstX("uqrshrn", "UqrshrnX", "SimdShiftOp", smallUnsignedTypes,
3125                      uqrshrnCode, hasImm=True)
3126    twoRegNarrowInstX("uqrshrn2", "Uqrshrn2X", "SimdShiftOp",
3127                      smallUnsignedTypes, uqrshrnCode, hasImm=True, hi=True)
3128    twoRegNarrowInstX("uqrshrn", "UqrshrnScX", "SimdShiftOp",
3129                      smallUnsignedTypes, uqrshrnCode, hasImm=True,
3130                      scalar=True)
3131    # UQSHL (immediate)
3132    uqshlImmCode = '''
3133            FPSCR fpscr = (FPSCR) FpscrQc;
3134            if (imm >= sizeof(Element) * 8) {
3135                if (srcElem1 != 0) {
3136                    destElem = mask(sizeof(Element) * 8);
3137                    fpscr.qc = 1;
3138                } else {
3139                    destElem = 0;
3140                }
3141            } else if (imm) {
3142                destElem = (srcElem1 << imm);
3143                uint64_t topBits = bits((uint64_t)srcElem1,
3144                                        sizeof(Element) * 8 - 1,
3145                                        sizeof(Element) * 8 - imm);
3146                if (topBits != 0) {
3147                    destElem = mask(sizeof(Element) * 8);
3148                    fpscr.qc = 1;
3149                }
3150            } else {
3151                destElem = srcElem1;
3152            }
3153            FpscrQc = fpscr;
3154    '''
3155    twoEqualRegInstX("uqshl", "UqshlImmDX", "SimdAluOp", smallUnsignedTypes, 2,
3156                     uqshlImmCode, hasImm=True)
3157    twoEqualRegInstX("uqshl", "UqshlImmQX", "SimdAluOp", unsignedTypes, 4,
3158                     uqshlImmCode, hasImm=True)
3159    twoEqualRegInstX("uqshl", "UqshlImmScX", "SimdAluOp", unsignedTypes, 4,
3160                     uqshlImmCode, hasImm=True, scalar=True)
3161    # UQSHL (register)
3162    uqshlCode = '''
3163            int16_t shiftAmt = (int8_t)srcElem2;
3164            FPSCR fpscr = (FPSCR) FpscrQc;
3165            if (shiftAmt < 0) {
3166                shiftAmt = -shiftAmt;
3167                if (shiftAmt >= sizeof(Element) * 8) {
3168                    shiftAmt = sizeof(Element) * 8 - 1;
3169                    destElem = 0;
3170                } else {
3171                    destElem = (srcElem1 >> shiftAmt);
3172                }
3173            } else if (shiftAmt > 0) {
3174                if (shiftAmt >= sizeof(Element) * 8) {
3175                    if (srcElem1 != 0) {
3176                        destElem = mask(sizeof(Element) * 8);
3177                        fpscr.qc = 1;
3178                    } else {
3179                        destElem = 0;
3180                    }
3181                } else {
3182                    if (bits(srcElem1, sizeof(Element) * 8 - 1,
3183                                sizeof(Element) * 8 - shiftAmt)) {
3184                        destElem = mask(sizeof(Element) * 8);
3185                        fpscr.qc = 1;
3186                    } else {
3187                        destElem = srcElem1 << shiftAmt;
3188                    }
3189                }
3190            } else {
3191                destElem = srcElem1;
3192            }
3193            FpscrQc = fpscr;
3194    '''
3195    threeEqualRegInstX("uqshl", "UqshlDX", "SimdAluOp", smallUnsignedTypes, 2,
3196                       uqshlCode)
3197    threeEqualRegInstX("uqshl", "UqshlQX", "SimdAluOp", unsignedTypes, 4,
3198                       uqshlCode)
3199    threeEqualRegInstX("uqshl", "UqshlScX", "SimdAluOp", unsignedTypes, 4,
3200                       uqshlCode, scalar=True)
3201    # UQSHRN, UQSHRN2
3202    uqshrnCode = '''
3203            FPSCR fpscr = (FPSCR) FpscrQc;
3204            if (imm > sizeof(srcElem1) * 8) {
3205                if (srcElem1 != 0)
3206                    fpscr.qc = 1;
3207                destElem = 0;
3208            } else if (imm) {
3209                BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
3210                if (mid != (Element)mid) {
3211                    destElem = mask(sizeof(Element) * 8);
3212                    fpscr.qc = 1;
3213                } else {
3214                    destElem = mid;
3215                }
3216            } else {
3217                destElem = srcElem1;
3218            }
3219            FpscrQc = fpscr;
3220    '''
3221    twoRegNarrowInstX("uqshrn", "UqshrnX", "SimdShiftOp", smallUnsignedTypes,
3222                      uqshrnCode, hasImm=True)
3223    twoRegNarrowInstX("uqshrn2", "Uqshrn2X", "SimdShiftOp", smallUnsignedTypes,
3224                      uqshrnCode, hasImm=True, hi=True)
3225    twoRegNarrowInstX("uqshrn", "UqshrnScX", "SimdShiftOp", smallUnsignedTypes,
3226                      uqshrnCode, hasImm=True, scalar=True)
3227    # UQSUB
3228    uqsubCode = '''
3229            destElem = srcElem1 - srcElem2;
3230            FPSCR fpscr = (FPSCR) FpscrQc;
3231            if (destElem > srcElem1) {
3232                destElem = 0;
3233                fpscr.qc = 1;
3234            }
3235            FpscrQc = fpscr;
3236    '''
3237    threeEqualRegInstX("uqsub", "UqsubDX", "SimdAddOp", smallUnsignedTypes, 2,
3238                       uqsubCode)
3239    threeEqualRegInstX("uqsub", "UqsubQX", "SimdAddOp", unsignedTypes, 4,
3240                       uqsubCode)
3241    threeEqualRegInstX("uqsub", "UqsubScX", "SimdAddOp", unsignedTypes, 4,
3242                       uqsubCode, scalar=True)
3243    # UQXTN
3244    uqxtnCode = '''
3245            FPSCR fpscr = (FPSCR) FpscrQc;
3246            destElem = srcElem1;
3247            if ((BigElement)destElem != srcElem1) {
3248                fpscr.qc = 1;
3249                destElem = mask(sizeof(Element) * 8);
3250            }
3251            FpscrQc = fpscr;
3252    '''
3253    twoRegNarrowInstX("uqxtn", "UqxtnX", "SimdMiscOp", smallUnsignedTypes,
3254                      uqxtnCode)
3255    twoRegNarrowInstX("uqxtn", "Uqxtn2X", "SimdMiscOp", smallUnsignedTypes,
3256                      uqxtnCode, hi=True)
3257    twoRegNarrowInstX("uqxtn", "UqxtnScX", "SimdMiscOp", smallUnsignedTypes,
3258                      uqxtnCode, scalar=True)
3259    # URECPE
3260    urecpeCode = "destElem = unsignedRecipEstimate(srcElem1);"
3261    twoEqualRegInstX("urecpe", "UrecpeDX", "SimdMultAccOp", ("uint32_t",), 2,
3262                     urecpeCode)
3263    twoEqualRegInstX("urecpe", "UrecpeQX", "SimdMultAccOp", ("uint32_t",), 4,
3264                     urecpeCode)
3265    # URHADD
3266    threeEqualRegInstX("urhadd", "UrhaddDX", "SimdAddOp", smallUnsignedTypes,
3267                       2, rhaddCode)
3268    threeEqualRegInstX("urhadd", "UrhaddQX", "SimdAddOp", smallUnsignedTypes,
3269                       4, rhaddCode)
3270    # URSHL
3271    threeEqualRegInstX("urshl", "UrshlDX", "SimdShiftOp", unsignedTypes, 2,
3272                       rshlCode)
3273    threeEqualRegInstX("urshl", "UrshlQX", "SimdShiftOp", unsignedTypes, 4,
3274                       rshlCode)
3275    # URSHR
3276    twoEqualRegInstX("urshr", "UrshrDX", "SimdShiftOp", unsignedTypes, 2,
3277                     rshrCode, hasImm=True)
3278    twoEqualRegInstX("urshr", "UrshrQX", "SimdShiftOp", unsignedTypes, 4,
3279                     rshrCode, hasImm=True)
3280    # URSQRTE
3281    ursqrteCode = "destElem = unsignedRSqrtEstimate(srcElem1);"
3282    twoEqualRegInstX("ursqrte", "UrsqrteDX", "SimdSqrtOp", ("uint32_t",), 2,
3283                     ursqrteCode)
3284    twoEqualRegInstX("ursqrte", "UrsqrteQX", "SimdSqrtOp", ("uint32_t",), 4,
3285                     ursqrteCode)
3286    # URSRA
3287    twoEqualRegInstX("ursra", "UrsraDX", "SimdShiftOp", unsignedTypes, 2,
3288                     rsraCode, True, hasImm=True)
3289    twoEqualRegInstX("ursra", "UrsraQX", "SimdShiftOp", unsignedTypes, 4,
3290                     rsraCode, True, hasImm=True)
3291    # USHL
3292    threeEqualRegInstX("ushl", "UshlDX", "SimdShiftOp", unsignedTypes, 2,
3293                       shlCode)
3294    threeEqualRegInstX("ushl", "UshlQX", "SimdShiftOp", unsignedTypes, 4,
3295                       shlCode)
3296    # USHLL, USHLL2
3297    twoRegLongInstX("ushll", "UshllX", "SimdShiftOp", smallUnsignedTypes,
3298                    shllCode, hasImm=True)
3299    twoRegLongInstX("ushll", "Ushll2X", "SimdShiftOp", smallUnsignedTypes,
3300                    shllCode, hi=True, hasImm=True)
3301    # USHR
3302    twoEqualRegInstX("ushr", "UshrDX", "SimdShiftOp", unsignedTypes, 2,
3303                     shrCode, hasImm=True)
3304    twoEqualRegInstX("ushr", "UshrQX", "SimdShiftOp", unsignedTypes, 4,
3305                     shrCode, hasImm=True)
3306    # USQADD
3307    usqaddCode = '''
3308            FPSCR fpscr = (FPSCR) FpscrQc;
3309            Element tmp = destElem + srcElem1;
3310            if (bits(srcElem1, sizeof(Element) * 8 - 1) == 0) {
3311                if (tmp < srcElem1 || tmp < destElem) {
3312                    destElem = (Element)(-1);
3313                    fpscr.qc = 1;
3314                } else {
3315                    destElem = tmp;
3316                }
3317            } else {
3318                Element absSrcElem1 = (~srcElem1) + 1;
3319                if (absSrcElem1 > destElem) {
3320                    destElem = 0;
3321                    fpscr.qc = 1;
3322                } else {
3323                    destElem = tmp;
3324                }
3325            }
3326            FpscrQc = fpscr;
3327    '''
3328    twoEqualRegInstX("usqadd", "UsqaddDX", "SimdAddOp", smallUnsignedTypes, 2,
3329                     usqaddCode, True)
3330    twoEqualRegInstX("usqadd", "UsqaddQX", "SimdAddOp", unsignedTypes, 4,
3331                     usqaddCode, True)
3332    twoEqualRegInstX("usqadd", "UsqaddScX", "SimdAddOp", unsignedTypes, 4,
3333                     usqaddCode, True, scalar=True)
3334    # USRA
3335    twoEqualRegInstX("usra", "UsraDX", "SimdShiftOp", unsignedTypes, 2,
3336                     sraCode, True, hasImm=True)
3337    twoEqualRegInstX("usra", "UsraQX", "SimdShiftOp", unsignedTypes, 4,
3338                     sraCode, True, hasImm=True)
3339    # USUBL
3340    threeRegLongInstX("usubl", "UsublX", "SimdAddOp", smallUnsignedTypes,
3341                      sublwCode)
3342    threeRegLongInstX("usubl2", "Usubl2X", "SimdAddOp", smallUnsignedTypes,
3343                      sublwCode, hi=True)
3344    # USUBW
3345    threeRegWideInstX("usubw", "UsubwX", "SimdAddOp", smallUnsignedTypes,
3346                      sublwCode)
3347    threeRegWideInstX("usubw2", "Usubw2X", "SimdAddOp", smallUnsignedTypes,
3348                      sublwCode, hi=True)
3349    # UXTL -> alias to USHLL
3350    # UZP1
3351    uzpCode = '''
3352        unsigned part = %s;
3353        for (unsigned i = 0; i < eCount / 2; i++) {
3354            destReg.elements[i] = srcReg1.elements[2 * i + part];
3355            destReg.elements[eCount / 2 + i] = srcReg2.elements[2 * i + part];
3356        }
3357    '''
3358    threeRegScrambleInstX("Uzp1", "Uzp1DX", "SimdAluOp", smallUnsignedTypes, 2,
3359                          uzpCode % "0")
3360    threeRegScrambleInstX("Uzp1", "Uzp1QX", "SimdAluOp", unsignedTypes, 4,
3361                          uzpCode % "0")
3362    # UZP2
3363    threeRegScrambleInstX("Uzp2", "Uzp2DX", "SimdAluOp", smallUnsignedTypes, 2,
3364                          uzpCode % "1")
3365    threeRegScrambleInstX("Uzp2", "Uzp2QX", "SimdAluOp", unsignedTypes, 4,
3366                          uzpCode % "1")
3367    # XTN, XTN2
3368    xtnCode = "destElem = srcElem1;"
3369    twoRegNarrowInstX("Xtn", "XtnX", "SimdMiscOp", smallUnsignedTypes, xtnCode)
3370    twoRegNarrowInstX("Xtn", "Xtn2X", "SimdMiscOp", smallUnsignedTypes,
3371                      xtnCode, hi=True)
3372    # ZIP1
3373    zipCode = '''
3374        unsigned base = %s;
3375        for (unsigned i = 0; i < eCount / 2; i++) {
3376            destReg.elements[2 * i] = srcReg1.elements[base + i];
3377            destReg.elements[2 * i + 1] = srcReg2.elements[base + i];
3378        }
3379    '''
3380    threeRegScrambleInstX("zip1", "Zip1DX", "SimdAluOp", smallUnsignedTypes, 2,
3381                          zipCode % "0")
3382    threeRegScrambleInstX("zip1", "Zip1QX", "SimdAluOp", unsignedTypes, 4,
3383                          zipCode % "0")
3384    # ZIP2
3385    threeRegScrambleInstX("zip2", "Zip2DX", "SimdAluOp", smallUnsignedTypes, 2,
3386                          zipCode % "eCount / 2")
3387    threeRegScrambleInstX("zip2", "Zip2QX", "SimdAluOp", unsignedTypes, 4,
3388                          zipCode % "eCount / 2")
3389
3390    for decoderFlavour, type_dict in decoders.iteritems():
3391        header_output += '''
3392        class %(decoder_flavour)sDecoder {
3393        public:
3394        ''' % { "decoder_flavour" : decoderFlavour }
3395        for type,name in type_dict.iteritems():
3396            header_output += '''
3397            template<typename Elem> using %(type)s = %(new_name)s<Elem>;''' % {
3398               "type" : type, "new_name" : name
3399            }
3400        header_output += '''
3401        };'''
3402}};
3403