neon64.isa (13120:690a0db8e58b) neon64.isa (13544:0b4e5446167c)
1// -*- mode: c++ -*-
2
3// Copyright (c) 2012-2013, 2015-2018 ARM Limited
4// All rights reserved
5//
6// The license below extends only to copyright in the software and shall
7// not be construed as granting a license to any other intellectual
8// property including but not limited to intellectual property relating
9// to a hardware implementation of the functionality of the software
10// licensed hereunder. You may use the software subject to the license
11// terms below provided that you ensure that this notice is replicated
12// unmodified and in its entirety in all distributions of the software,
13// modified or unmodified, in source code or in binary form.
14//
15// Redistribution and use in source and binary forms, with or without
16// modification, are permitted provided that the following conditions are
17// met: redistributions of source code must retain the above copyright
18// notice, this list of conditions and the following disclaimer;
19// redistributions in binary form must reproduce the above copyright
20// notice, this list of conditions and the following disclaimer in the
21// documentation and/or other materials provided with the distribution;
22// neither the name of the copyright holders nor the names of its
23// contributors may be used to endorse or promote products derived from
24// this software without specific prior written permission.
25//
26// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
30// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
31// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
32// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
33// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
34// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
35// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
36// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37//
38// Authors: Giacomo Gabrielli
39// Mbou Eyole
40
41let {{
42
# Text accumulated by the generator functions defined in this block.
header_output = ''
exec_output = ''
decoders = {'Generic': {}}

# FP types (FP operations always work with unsigned representations)
floatTypes = ('uint16_t', 'uint32_t', 'uint64_t')
smallFloatTypes = ('uint32_t',)
50
def threeEqualRegInstX(name, Name, opClass, types, rCount, op,
                       readDest=False, pairwise=False, scalar=False,
                       byElem=False, decoder='Generic'):
    """Generate a two-source SIMD instruction whose operands share a width.

    The C++ element-walk code is assembled around ``op``, wrapped in an
    InstObjParams, and the substituted declaration / execute / per-type
    templates are appended to header_output and exec_output.

    name, Name, opClass -- forwarded to InstObjParams; Name is also the
        "class_name" of each per-type NeonXExecDeclare substitution.
    types    -- element type names; one NeonXExecDeclare per entry.
    rCount   -- number of register pieces P0..P<rCount-1> transferred;
        destination pieces rCount..3 are written with zero.
    op       -- C++ snippet expected to set destElem from srcElem1/srcElem2.
    readDest -- also load the destination and seed destElem from it.
    pairwise -- operate on adjacent element pairs of srcReg1, then srcReg2.
    scalar   -- compute element 0 only and zero every other element.
    byElem   -- index the second source with ``imm``; that source is then
        read in full (4 pieces) and the *ImmOp base/template is used.
    decoder  -- accepted but not used here.
    """
    global header_output, exec_output, decoders
    # pairwise cannot be combined with byElem or scalar
    assert not (pairwise and (byElem or scalar))

    pieces = [simd64EnabledCheckCode, '''
    RegVect srcReg1, destReg;
    ''']
    if byElem:
        # 2nd register operand has to be read fully
        pieces.append('''
    FullRegVect srcReg2;
    ''')
    else:
        pieces.append('''
    RegVect srcReg2;
    ''')

    for idx in range(rCount):
        pieces.append('''
    srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
    srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
    ''' % {"reg": idx})
        if readDest:
            pieces.append('''
    destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
    ''' % {"reg": idx})
    if byElem:
        # 2nd operand has to be read fully
        for idx in range(rCount, 4):
            pieces.append('''
    srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
    ''' % {"reg": idx})

    seedDest = 'destElem = gtoh(destReg.elements[i]);' if readDest else ''
    if pairwise:
        pieces.append('''
    for (unsigned i = 0; i < eCount; i++) {
        Element srcElem1 = gtoh(2 * i < eCount ?
                                srcReg1.elements[2 * i] :
                                srcReg2.elements[2 * i - eCount]);
        Element srcElem2 = gtoh(2 * i < eCount ?
                                srcReg1.elements[2 * i + 1] :
                                srcReg2.elements[2 * i + 1 - eCount]);
        Element destElem;
        %(readDest)s
        %(op)s
        destReg.elements[i] = htog(destElem);
    }
    ''' % {"op": op, "readDest": seedDest})
    else:
        if scalar:
            onlyFirstElem = '''
        if (i != 0) {
            destReg.elements[i] = 0;
            continue;
        }
        '''
        else:
            onlyFirstElem = ""
        pieces.append('''
    for (unsigned i = 0; i < eCount; i++) {
        %(scalarCheck)s
        Element srcElem1 = gtoh(srcReg1.elements[i]);
        Element srcElem2 = gtoh(srcReg2.elements[%(src2Index)s]);
        Element destElem;
        %(readDest)s
        %(op)s
        destReg.elements[i] = htog(destElem);
    }
    ''' % {"op": op, "readDest": seedDest,
           "scalarCheck": onlyFirstElem,
           "src2Index": "imm" if byElem else "i"})

    for idx in range(rCount):
        pieces.append('''
    AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
    ''' % {"reg": idx})
    # zero upper half (empty range when rCount is already 4)
    for idx in range(rCount, 4):
        pieces.append('''
    AA64FpDestP%(reg)d_uw = 0;
    ''' % {"reg": idx})

    baseClass = "DataX2RegImmOp" if byElem else "DataX2RegOp"
    iop = InstObjParams(name, Name, baseClass,
                        {"code": "".join(pieces),
                         "r_count": rCount,
                         "op_class": opClass}, [])
    declTemplate = NeonX2RegImmOpDeclare if byElem else NeonX2RegOpDeclare
    header_output += declTemplate.subst(iop)
    exec_output += NeonXEqualRegOpExecute.subst(iop)
    for elemType in types:
        exec_output += NeonXExecDeclare.subst({"targs": elemType,
                                               "class_name": Name})
144
def threeUnequalRegInstX(name, Name, opClass, types, op,
                         bigSrc1, bigSrc2, bigDest, readDest, scalar=False,
                         byElem=False, hi=False):
    """Generate a two-source SIMD instruction with mixed operand widths.

    Each of the two sources and the destination is either "Big" (4
    register pieces, BigRegVect/BigElement) or normal (2 pieces).  The
    resulting InstObjParams always uses "r_count" 2 and the
    NeonXUnequalRegOpExecute template.

    bigSrc1, bigSrc2, bigDest -- select the Big variant for that operand.
    readDest -- load the destination and seed destElem from it.
    scalar   -- compute element 0 only and zero the others (not with hi).
    byElem   -- second source is a FullRegVect indexed by ``imm`` and is
        read from piece 0 in full; the *ImmOp base/template is used.
    hi       -- non-Big sources are read from pieces 2..3 and a non-Big
        destination is written to pieces 2..3, with pieces 0..1
        self-assigned instead of the upper half being zeroed.
    Remaining parameters behave as in the other *InstX generators: they
    feed InstObjParams and the per-type NeonXExecDeclare substitutions.
    """
    global header_output, exec_output
    assert not (scalar and hi)

    # Big operands span four register pieces, the others two.
    src1Cnt, src1Prefix = (4, 'Big') if bigSrc1 else (2, '')
    src2Cnt, src2Prefix = (4, 'Big') if bigSrc2 else (2, '')
    destCnt, destPrefix = (4, 'Big') if bigDest else (2, '')
    if byElem:
        src2Prefix = 'Full'

    code = simd64EnabledCheckCode + '''
    %sRegVect srcReg1;
    %sRegVect srcReg2;
    %sRegVect destReg;
    ''' % (src1Prefix, src2Prefix, destPrefix)

    # long/widening operations take narrow sources from pieces 2..3
    src1Base = 2 if (hi and not bigSrc1) else 0
    for reg in range(src1Cnt):
        code += '''
    srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(srcReg1)d_uw);
    ''' % {"reg": reg, "srcReg1": src1Base + reg}

    src2Base = 2 if (hi and not bigSrc2 and not byElem) else 0
    for reg in range(src2Cnt):
        code += '''
    srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(srcReg2)d_uw);
    ''' % {"reg": reg, "srcReg2": src2Base + reg}
    if byElem:
        # 2nd operand has to be read fully
        for reg in range(src2Cnt, 4):
            code += '''
    srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
    ''' % {"reg": reg}

    seedDest = ''
    if readDest:
        for reg in range(destCnt):
            code += '''
    destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
    ''' % {"reg": reg}
        seedDest = 'destElem = gtoh(destReg.elements[i]);'

    if scalar:
        onlyFirstElem = '''
        if (i != 0) {
            destReg.elements[i] = 0;
            continue;
        }
        '''
    else:
        onlyFirstElem = ""

    # NOTE(review): srcElem2 is declared with src1Prefix, not src2Prefix;
    # kept as in the original -- confirm this is intended.
    code += '''
    for (unsigned i = 0; i < eCount; i++) {
        %(scalarCheck)s
        %(src1Prefix)sElement srcElem1 = gtoh(srcReg1.elements[i]);
        %(src1Prefix)sElement srcElem2 = gtoh(srcReg2.elements[%(src2Index)s]);
        %(destPrefix)sElement destElem;
        %(readDest)s
        %(op)s
        destReg.elements[i] = htog(destElem);
    }
    ''' % {"op": op, "readDest": seedDest,
           "src1Prefix": src1Prefix,
           "destPrefix": destPrefix,
           "scalarCheck": onlyFirstElem,
           "src2Index": "imm" if byElem else "i"}

    # narrowing operations write a narrow "hi" result to pieces 2..3
    destBase = 2 if (hi and not bigDest) else 0
    for reg in range(destCnt):
        code += '''
    AA64FpDestP%(destReg)d_uw = gtoh(destReg.regs[%(reg)d]);
    ''' % {"reg": reg, "destReg": destBase + reg}
    if destCnt < 4:
        if hi:
            # Explicitly merge with lower half
            for reg in range(destCnt):
                code += '''
    AA64FpDestP%(reg)d_uw = AA64FpDestP%(reg)d_uw;''' % {"reg": reg}
        else:
            # zero upper half
            for reg in range(destCnt, 4):
                code += '''
    AA64FpDestP%(reg)d_uw = 0;''' % {"reg": reg}

    baseClass = "DataX2RegImmOp" if byElem else "DataX2RegOp"
    iop = InstObjParams(name, Name, baseClass,
                        {"code": code,
                         "r_count": 2,
                         "op_class": opClass}, [])
    declTemplate = NeonX2RegImmOpDeclare if byElem else NeonX2RegOpDeclare
    header_output += declTemplate.subst(iop)
    exec_output += NeonXUnequalRegOpExecute.subst(iop)
    for elemType in types:
        exec_output += NeonXExecDeclare.subst({"targs": elemType,
                                               "class_name": Name})
252
def threeRegNarrowInstX(name, Name, opClass, types, op, readDest=False,
                        scalar=False, byElem=False, hi=False):
    """Narrowing form: both sources Big, destination normal width.

    Thin wrapper over threeUnequalRegInstX; byElem must be false.
    """
    assert not byElem
    threeUnequalRegInstX(name, Name, opClass, types, op,
                         bigSrc1=True, bigSrc2=True, bigDest=False,
                         readDest=readDest, scalar=scalar,
                         byElem=byElem, hi=hi)
258
def threeRegLongInstX(name, Name, opClass, types, op, readDest=False,
                      scalar=False, byElem=False, hi=False):
    """Long form: both sources normal width, destination Big.

    Thin wrapper over threeUnequalRegInstX.
    """
    threeUnequalRegInstX(name, Name, opClass, types, op,
                         bigSrc1=False, bigSrc2=False, bigDest=True,
                         readDest=readDest, scalar=scalar,
                         byElem=byElem, hi=hi)
263
def threeRegWideInstX(name, Name, opClass, types, op, readDest=False,
                      scalar=False, byElem=False, hi=False):
    """Wide form: first source and destination Big, second source normal.

    Thin wrapper over threeUnequalRegInstX; byElem must be false.
    """
    assert not byElem
    threeUnequalRegInstX(name, Name, opClass, types, op,
                         bigSrc1=True, bigSrc2=False, bigDest=True,
                         readDest=readDest, scalar=scalar,
                         byElem=byElem, hi=hi)
269
def twoEqualRegInstX(name, Name, opClass, types, rCount, op,
                     readDest=False, scalar=False, byElem=False,
                     hasImm=False, isDup=False):
    """Generate a one-source SIMD instruction with equal-width operands.

    rCount   -- register pieces transferred; destination pieces
        rCount..3 are written with zero.
    op       -- C++ snippet expected to set destElem from srcElem1 (the
        generated loop also provides ``j``, initialised to ``i``, as the
        destination element index).
    readDest -- load the destination and seed destElem from it.
    scalar   -- compute element 0 only and zero the other elements.
    byElem   -- index the source with ``imm``; implies hasImm.
    hasImm   -- use the DataX1RegImmOp base and its declare template.
    isDup    -- source is a FullRegVect read in full (4 pieces);
        requires byElem.
    name, Name, opClass, types feed InstObjParams and the per-type
    NeonXExecDeclare substitutions.
    """
    global header_output, exec_output
    assert byElem or not isDup
    withImm = True if byElem else hasImm

    if isDup:
        vectDecls = '''
    FullRegVect srcReg1;
    RegVect destReg;
    '''
        srcPieces = 4
    else:
        vectDecls = '''
    RegVect srcReg1, destReg;
    '''
        srcPieces = rCount
    code = simd64EnabledCheckCode + vectDecls

    for reg in range(srcPieces):
        code += '''
    srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
    ''' % {"reg": reg}
        if readDest:
            code += '''
    destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
    ''' % {"reg": reg}

    seedDest = 'destElem = gtoh(destReg.elements[i]);' if readDest else ''
    if scalar:
        onlyFirstElem = '''
        if (i != 0) {
            destReg.elements[i] = 0;
            continue;
        }
        '''
    else:
        onlyFirstElem = ""
    code += '''
    for (unsigned i = 0; i < eCount; i++) {
        %(scalarCheck)s
        unsigned j = i;
        Element srcElem1 = gtoh(srcReg1.elements[%(src1Index)s]);
        Element destElem;
        %(readDest)s
        %(op)s
        destReg.elements[j] = htog(destElem);
    }
    ''' % {"op": op, "readDest": seedDest,
           "scalarCheck": onlyFirstElem,
           "src1Index": "imm" if byElem else "i"}

    for reg in range(rCount):
        code += '''
    AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
    ''' % {"reg": reg}
    # zero upper half (empty range when rCount is already 4)
    for reg in range(rCount, 4):
        code += '''
    AA64FpDestP%(reg)d_uw = 0;
    ''' % {"reg": reg}

    baseClass = "DataX1RegImmOp" if withImm else "DataX1RegOp"
    iop = InstObjParams(name, Name, baseClass,
                        {"code": code,
                         "r_count": rCount,
                         "op_class": opClass}, [])
    declTemplate = NeonX1RegImmOpDeclare if withImm else NeonX1RegOpDeclare
    header_output += declTemplate.subst(iop)
    exec_output += NeonXEqualRegOpExecute.subst(iop)
    for elemType in types:
        exec_output += NeonXExecDeclare.subst({"targs": elemType,
                                               "class_name": Name})
339
def twoRegLongInstX(name, Name, opClass, types, op, readDest=False,
                    hi=False, hasImm=False):
    """Generate a one-source SIMD instruction with a Big destination.

    The source is a RegVect filled from two register pieces and the
    destination is a BigRegVect written back to all four pieces.

    op       -- C++ snippet expected to set destElem (a BigElement) from
        srcElem1.
    readDest -- load all four destination pieces and seed destElem from
        the matching destination element before ``op`` runs.
    hi       -- read the source from pieces 2..3 instead of 0..1.
    hasImm   -- use the DataX1RegImmOp base and its declare template.
    name, Name, opClass, types feed InstObjParams ("r_count" is always 2)
    and the per-type NeonXExecDeclare substitutions.
    """
    global header_output, exec_output
    code = simd64EnabledCheckCode + '''
    RegVect srcReg1;
    BigRegVect destReg;
    '''
    # First source piece to read; the two pieces are consecutive.
    srcBase = 2 if hi else 0
    for reg in range(2):
        code += '''
    srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(srcPiece)d_uw);
    ''' % {"reg": reg, "srcPiece": srcBase + reg}

    readDestCode = ''
    if readDest:
        for reg in range(4):
            code += '''
    destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
    ''' % {"reg": reg}
        # Seed the per-element accumulator.  The previous text assigned
        # the element to ``destReg`` (the whole vector), leaving destElem
        # unseeded; every other generator here seeds destElem.
        readDestCode = 'destElem = gtoh(destReg.elements[i]);'

    code += '''
    for (unsigned i = 0; i < eCount; i++) {
        Element srcElem1 = gtoh(srcReg1.elements[i]);
        BigElement destElem;
        %(readDest)s
        %(op)s
        destReg.elements[i] = htog(destElem);
    }
    ''' % {"op": op, "readDest": readDestCode}

    for reg in range(4):
        code += '''
    AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
    ''' % {"reg": reg}

    baseClass = "DataX1RegImmOp" if hasImm else "DataX1RegOp"
    iop = InstObjParams(name, Name, baseClass,
                        {"code": code,
                         "r_count": 2,
                         "op_class": opClass}, [])
    if hasImm:
        header_output += NeonX1RegImmOpDeclare.subst(iop)
    else:
        header_output += NeonX1RegOpDeclare.subst(iop)
    exec_output += NeonXUnequalRegOpExecute.subst(iop)
    for elemType in types:
        exec_output += NeonXExecDeclare.subst({"targs": elemType,
                                               "class_name": Name})
390
def twoRegNarrowInstX(name, Name, opClass, types, op, readDest=False,
                      scalar=False, hi=False, hasImm=False):
    """Generate a one-source SIMD instruction with a Big source.

    The source is a BigRegVect read from all four pieces; the destination
    is a RegVect of two pieces.

    op       -- C++ snippet expected to set destElem from srcElem1 (a
        BigElement).
    readDest -- load destination pieces 0..1 and seed destElem from
        them; otherwise destReg.elements[0] is cleared up front.
    scalar   -- compute element 0 only and zero the other elements.
    hi       -- write the result to destination pieces 2..3 and
        self-assign pieces 0..1; otherwise write pieces 0..1 and zero
        pieces 2..3.
    hasImm   -- use the DataX1RegImmOp base and its declare template.
    name, Name, opClass, types feed InstObjParams ("r_count" is always 2)
    and the per-type NeonXExecDeclare substitutions.
    """
    global header_output, exec_output
    chunks = [simd64EnabledCheckCode, '''
    BigRegVect srcReg1;
    RegVect destReg;
    ''']
    for reg in range(4):
        chunks.append('''
    srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
    ''' % {"reg": reg})

    if readDest:
        for reg in range(2):
            chunks.append('''
    destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
    ''' % {"reg": reg})
        seedDest = 'destElem = gtoh(destReg.elements[i]);'
    else:
        chunks.append('''
    destReg.elements[0] = 0;
    ''')
        seedDest = ''

    if scalar:
        onlyFirstElem = '''
        if (i != 0) {
            destReg.elements[i] = 0;
            continue;
        }
        '''
    else:
        onlyFirstElem = ""
    chunks.append('''
    for (unsigned i = 0; i < eCount; i++) {
        %(scalarCheck)s
        BigElement srcElem1 = gtoh(srcReg1.elements[i]);
        Element destElem;
        %(readDest)s
        %(op)s
        destReg.elements[i] = htog(destElem);
    }
    ''' % {"op": op, "readDest": seedDest,
           "scalarCheck": onlyFirstElem})

    destBase = 2 if hi else 0
    for reg in range(2):
        chunks.append('''
    AA64FpDestP%(destReg)d_uw = gtoh(destReg.regs[%(reg)d]);
    ''' % {"reg": reg, "destReg": destBase + reg})
    if hi:
        # Explicitly merge with the lower half
        for reg in range(2):
            chunks.append('''
    AA64FpDestP%(reg)d_uw = AA64FpDestP%(reg)d_uw;''' % {"reg": reg})
    else:
        # zero upper half
        for reg in range(2, 4):
            chunks.append('''
    AA64FpDestP%(reg)d_uw = 0;
    ''' % {"reg": reg})

    baseClass = "DataX1RegImmOp" if hasImm else "DataX1RegOp"
    iop = InstObjParams(name, Name, baseClass,
                        {"code": "".join(chunks),
                         "r_count": 2,
                         "op_class": opClass}, [])
    declTemplate = NeonX1RegImmOpDeclare if hasImm else NeonX1RegOpDeclare
    header_output += declTemplate.subst(iop)
    exec_output += NeonXUnequalRegOpExecute.subst(iop)
    for elemType in types:
        exec_output += NeonXExecDeclare.subst({"targs": elemType,
                                               "class_name": Name})
461
def threeRegScrambleInstX(name, Name, opClass, types, rCount, op):
    """Generate a two-source SIMD instruction whose body is ``op`` itself.

    Unlike the element-walk generators, ``op`` is inserted verbatim
    between the source loads and the destination write-back, so it is
    expected to fill destReg from srcReg1 / srcReg2 on its own.

    rCount -- register pieces loaded per source and written back;
        destination pieces rCount..3 are written with zero.
    name, Name, opClass, types feed InstObjParams (base "DataX2RegOp")
    and the per-type NeonXExecDeclare substitutions.
    """
    global header_output, exec_output
    loads = "".join('''
    srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
    srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
    ''' % {"reg": reg} for reg in range(rCount))
    stores = "".join('''
    AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
    ''' % {"reg": reg} for reg in range(rCount))
    # Empty when rCount is already 4.
    zeroes = "".join('''
    AA64FpDestP%(reg)d_uw = 0;
    ''' % {"reg": reg} for reg in range(rCount, 4))

    code = simd64EnabledCheckCode + '''
    RegVect srcReg1, srcReg2, destReg;
    ''' + loads + op + stores + zeroes

    iop = InstObjParams(name, Name,
                        "DataX2RegOp",
                        {"code": code,
                         "r_count": rCount,
                         "op_class": opClass}, [])
    header_output += NeonX2RegOpDeclare.subst(iop)
    exec_output += NeonXEqualRegOpExecute.subst(iop)
    for elemType in types:
        exec_output += NeonXExecDeclare.subst({"targs": elemType,
                                               "class_name": Name})
493
def insFromVecElemInstX(name, Name, opClass, types, rCount):
    """Generate an instruction copying one vector element into another.

    The source is read in full (4 pieces) as a FullRegVect; the
    destination is loaded, its element ``imm1`` is replaced with source
    element ``imm2``, and the first rCount pieces are written back.
    Pieces at rCount and above are not touched by the generated code.

    name, Name, opClass, types feed InstObjParams (base
    "DataX1Reg2ImmOp") and the per-type NeonXExecDeclare substitutions.
    """
    global header_output, exec_output
    chunks = [simd64EnabledCheckCode, '''
    FullRegVect srcReg1;
    RegVect destReg;
    ''']
    chunks.extend('''
    srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
    ''' % {"reg": reg} for reg in range(4))
    chunks.extend('''
    destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
    ''' % {"reg": reg} for reg in range(rCount))
    chunks.append('''
    Element srcElem1 = gtoh(srcReg1.elements[imm2]);
    Element destElem = srcElem1;
    destReg.elements[imm1] = htog(destElem);
    ''')
    chunks.extend('''
    AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
    ''' % {"reg": reg} for reg in range(rCount))

    iop = InstObjParams(name, Name,
                        "DataX1Reg2ImmOp",
                        {"code": "".join(chunks),
                         "r_count": rCount,
                         "op_class": opClass}, [])
    header_output += NeonX1Reg2ImmOpDeclare.subst(iop)
    exec_output += NeonXEqualRegOpExecute.subst(iop)
    for elemType in types:
        exec_output += NeonXExecDeclare.subst({"targs": elemType,
                                               "class_name": Name})
528
def twoRegPairwiseScInstX(name, Name, opClass, types, rCount, op):
    """Generate a scalar pairwise instruction over one source register.

    The generated code loads rCount source pieces, combines source
    elements 0 and 1 through ``op`` into destElem, stores that in
    destination element 0, writes back the first rCount // 2 destination
    pieces and zeroes the remaining pieces up to 3.

    op -- C++ snippet expected to set destElem from srcElem1 / srcElem2.
    name, Name, opClass, types feed InstObjParams (base "DataX1RegOp")
    and the per-type NeonXExecDeclare substitutions.
    """
    global header_output, exec_output
    eWalkCode = simd64EnabledCheckCode + '''
    RegVect srcReg1, destReg;
    '''
    for reg in range(rCount):
        eWalkCode += '''
    srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
    ''' % {"reg": reg}
    eWalkCode += '''
    Element srcElem1 = gtoh(srcReg1.elements[0]);
    Element srcElem2 = gtoh(srcReg1.elements[1]);
    Element destElem;
    %(op)s
    destReg.elements[0] = htog(destElem);
    ''' % {"op": op}
    # Floor division keeps destCnt an int: with "/" Python 3 yields a
    # float and range() below raises TypeError.
    destCnt = rCount // 2
    for reg in range(destCnt):
        eWalkCode += '''
    AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
    ''' % {"reg": reg}
    for reg in range(destCnt, 4):  # zero upper half
        eWalkCode += '''
    AA64FpDestP%(reg)d_uw = 0;
    ''' % {"reg": reg}
    iop = InstObjParams(name, Name,
                        "DataX1RegOp",
                        {"code": eWalkCode,
                         "r_count": rCount,
                         "op_class": opClass}, [])
    header_output += NeonX1RegOpDeclare.subst(iop)
    exec_output += NeonXEqualRegOpExecute.subst(iop)
    for elemType in types:
        exec_output += NeonXExecDeclare.subst({"targs": elemType,
                                               "class_name": Name})
565
def twoRegAcrossInstX(name, Name, opClass, types, rCount, op,
                      doubleDest=False, long=False):
    """Generate an across-vector reduction into destination element 0.

    The generated loop starts destElem from source element 0 and applies
    ``op`` for every later element; the result lands in
    destReg.elements[0].

    rCount     -- source register pieces loaded.
    op         -- C++ snippet run for elements 1..eCount-1; expected to
        update destElem from srcElem1.
    doubleDest -- write back two destination pieces instead of one; the
        remaining pieces up to 3 are zeroed.
    long       -- destination is a BigRegVect/BigElement and the
        NeonXUnequalRegOpExecute template is used instead of the
        equal-width one.
    name, Name, opClass, types feed InstObjParams (base "DataX1RegOp")
    and the per-type NeonXExecDeclare substitutions.
    """
    global header_output, exec_output
    if long:
        destPrefix = "Big"
        execTemplate = NeonXUnequalRegOpExecute
    else:
        destPrefix = ""
        execTemplate = NeonXEqualRegOpExecute

    chunks = [simd64EnabledCheckCode, '''
    RegVect srcReg1;
    %sRegVect destReg;
    ''' % destPrefix]
    for reg in range(rCount):
        chunks.append('''
    srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
    ''' % {"reg": reg})
    chunks.append('''
    destReg.regs[0] = 0;
    %(destPrefix)sElement destElem = 0;
    for (unsigned i = 0; i < eCount; i++) {
        Element srcElem1 = gtoh(srcReg1.elements[i]);
        if (i == 0) {
            destElem = srcElem1;
        } else {
            %(op)s
        }
    }
    destReg.elements[0] = htog(destElem);
    ''' % {"op": op, "destPrefix": destPrefix})

    destCnt = 2 if doubleDest else 1
    for reg in range(4):
        if reg < destCnt:
            chunks.append('''
    AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
    ''' % {"reg": reg})
        else:
            # zero upper half
            chunks.append('''
    AA64FpDestP%(reg)d_uw = 0;
    ''' % {"reg": reg})

    iop = InstObjParams(name, Name,
                        "DataX1RegOp",
                        {"code": "".join(chunks),
                         "r_count": rCount,
                         "op_class": opClass}, [])
    header_output += NeonX1RegOpDeclare.subst(iop)
    exec_output += execTemplate.subst(iop)
    for elemType in types:
        exec_output += NeonXExecDeclare.subst({"targs": elemType,
                                               "class_name": Name})
614
def twoRegCondenseInstX(name, Name, opClass, types, rCount, op,
                        readDest=False):
    """Generate an instruction folding adjacent source pairs into Big elements.

    For i in 0..eCount/2-1 the generated loop feeds source elements 2*i
    and 2*i+1 to ``op`` and stores destElem (a BigElement) in destination
    element i.

    rCount   -- register pieces loaded and written back; destination
        pieces rCount..3 are written with zero.
    op       -- C++ snippet expected to set destElem from
        srcElem1 / srcElem2.
    readDest -- load the destination and seed destElem from it.
    name, Name, opClass, types feed InstObjParams (base "DataX1RegOp")
    and the per-type NeonXExecDeclare substitutions.
    """
    global header_output, exec_output
    code = simd64EnabledCheckCode + '''
    RegVect srcRegs;
    BigRegVect destReg;
    '''
    for reg in range(rCount):
        code += '''
    srcRegs.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
    ''' % {"reg": reg}
        if readDest:
            code += '''
    destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
    ''' % {"reg": reg}

    seedDest = 'destElem = gtoh(destReg.elements[i]);' if readDest else ''
    code += '''
    for (unsigned i = 0; i < eCount / 2; i++) {
        Element srcElem1 = gtoh(srcRegs.elements[2 * i]);
        Element srcElem2 = gtoh(srcRegs.elements[2 * i + 1]);
        BigElement destElem;
        %(readDest)s
        %(op)s
        destReg.elements[i] = htog(destElem);
    }
    ''' % {"op": op, "readDest": seedDest}

    code += "".join('''
    AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
    ''' % {"reg": reg} for reg in range(rCount))
    # zero upper half (empty when rCount is already 4)
    code += "".join('''
    AA64FpDestP%(reg)d_uw = 0;
    ''' % {"reg": reg} for reg in range(rCount, 4))

    iop = InstObjParams(name, Name,
                        "DataX1RegOp",
                        {"code": code,
                         "r_count": rCount,
                         "op_class": opClass}, [])
    header_output += NeonX1RegOpDeclare.subst(iop)
    exec_output += NeonXUnequalRegOpExecute.subst(iop)
    for elemType in types:
        exec_output += NeonXExecDeclare.subst({"targs": elemType,
                                               "class_name": Name})
663
def oneRegImmInstX(name, Name, opClass, types, rCount, op, readDest=False):
    """Generate an instruction with a destination register and no sources.

    The generated loop runs ``op`` once per element to produce destElem.

    rCount   -- destination pieces written back (and loaded when
        readDest); pieces rCount..3 are written with zero.
    op       -- C++ snippet expected to set destElem.
    readDest -- load the destination and seed destElem from it.
    name, Name, opClass, types feed InstObjParams (base
    "DataXImmOnlyOp") and the per-type NeonXExecDeclare substitutions.
    """
    global header_output, exec_output
    chunks = [simd64EnabledCheckCode, '''
    RegVect destReg;
    ''']
    seedDest = ''
    if readDest:
        for reg in range(rCount):
            chunks.append('''
    destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
    ''' % {"reg": reg})
        seedDest = 'destElem = gtoh(destReg.elements[i]);'
    chunks.append('''
    for (unsigned i = 0; i < eCount; i++) {
        Element destElem;
        %(readDest)s
        %(op)s
        destReg.elements[i] = htog(destElem);
    }
    ''' % {"op": op, "readDest": seedDest})
    for reg in range(rCount):
        chunks.append('''
    AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
    ''' % {"reg": reg})
    # zero upper half (empty when rCount is already 4)
    for reg in range(rCount, 4):
        chunks.append('''
    AA64FpDestP%(reg)d_uw = 0;
    ''' % {"reg": reg})

    iop = InstObjParams(name, Name,
                        "DataXImmOnlyOp",
                        {"code": "".join(chunks),
                         "r_count": rCount,
                         "op_class": opClass}, [])
    header_output += NeonX1RegImmOnlyOpDeclare.subst(iop)
    exec_output += NeonXEqualRegOpExecute.subst(iop)
    for elemType in types:
        exec_output += NeonXExecDeclare.subst({"targs": elemType,
                                               "class_name": Name})
705
def dupGprInstX(name, Name, opClass, types, rCount, gprSpec):
    """Generate an instruction replicating ``<gprSpec>Op1`` into every element.

    rCount  -- destination pieces written back; pieces rCount..3 are
        written with zero.
    gprSpec -- text placed in front of ``Op1`` to name the operand that
        is cast to Element and copied.
    name, Name, opClass, types feed InstObjParams (base "DataX1RegOp")
    and the per-type NeonXExecDeclare substitutions.
    """
    global header_output, exec_output
    fillLoop = '''
    RegVect destReg;
    for (unsigned i = 0; i < eCount; i++) {
        destReg.elements[i] = htog((Element) %sOp1);
    }
    ''' % gprSpec
    stores = "".join('''
    AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
    ''' % {"reg": reg} for reg in range(rCount))
    # zero upper half (empty when rCount is already 4)
    zeroes = "".join('''
    AA64FpDestP%(reg)d_uw = 0;
    ''' % {"reg": reg} for reg in range(rCount, 4))

    iop = InstObjParams(name, Name,
                        "DataX1RegOp",
                        {"code": simd64EnabledCheckCode + fillLoop
                                 + stores + zeroes,
                         "r_count": rCount,
                         "op_class": opClass}, [])
    header_output += NeonX1RegOpDeclare.subst(iop)
    exec_output += NeonXEqualRegOpExecute.subst(iop)
    for elemType in types:
        exec_output += NeonXExecDeclare.subst({"targs": elemType,
                                               "class_name": Name})
734
def extInstX(name, Name, opClass, types, rCount, op):
    """Generate a two-source instruction with an immediate and a verbatim body.

    ``op`` is inserted as-is between the source loads and the destination
    write-back, so it is expected to fill destReg itself.

    rCount -- register pieces loaded per source and written back;
        destination pieces rCount..3 are written with zero.
    name, Name, opClass, types feed InstObjParams (base
    "DataX2RegImmOp") and the per-type NeonXExecDeclare substitutions.
    """
    global header_output, exec_output
    chunks = [simd64EnabledCheckCode, '''
    RegVect srcReg1, srcReg2, destReg;
    ''']
    for reg in range(rCount):
        chunks.append('''
    srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
    srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
    ''' % {"reg": reg})
    chunks.append(op)
    for reg in range(rCount):
        chunks.append('''
    AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
    ''' % {"reg": reg})
    # zero upper half (empty when rCount is already 4)
    for reg in range(rCount, 4):
        chunks.append('''
    AA64FpDestP%(reg)d_uw = 0;
    ''' % {"reg": reg})

    iop = InstObjParams(name, Name,
                        "DataX2RegImmOp",
                        {"code": "".join(chunks),
                         "r_count": rCount,
                         "op_class": opClass}, [])
    header_output += NeonX2RegImmOpDeclare.subst(iop)
    exec_output += NeonXEqualRegOpExecute.subst(iop)
    for elemType in types:
        exec_output += NeonXExecDeclare.subst({"targs": elemType,
                                               "class_name": Name})
766
def insFromGprInstX(name, Name, opClass, types, rCount, gprSpec):
    """Generate an instruction storing ``<gprSpec>Op1`` in one vector element.

    The destination is loaded, its element ``imm`` is replaced with the
    operand cast to Element, and the first rCount pieces are written
    back.  Pieces at rCount and above are not touched by the generated
    code.

    gprSpec -- text placed in front of ``Op1`` to name the operand.
    name, Name, opClass, types feed InstObjParams (base
    "DataX1RegImmOp") and the per-type NeonXExecDeclare substitutions.
    """
    global header_output, exec_output
    loads = "".join('''
    destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
    ''' % {"reg": reg} for reg in range(rCount))
    insert = '''
    destReg.elements[imm] = htog((Element) %sOp1);
    ''' % gprSpec
    stores = "".join('''
    AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
    ''' % {"reg": reg} for reg in range(rCount))

    code = simd64EnabledCheckCode + '''
    RegVect destReg;
    ''' + loads + insert + stores

    iop = InstObjParams(name, Name,
                        "DataX1RegImmOp",
                        {"code": code,
                         "r_count": rCount,
                         "op_class": opClass}, [])
    header_output += NeonX1RegImmOpDeclare.subst(iop)
    exec_output += NeonXEqualRegOpExecute.subst(iop)
    for elemType in types:
        exec_output += NeonXExecDeclare.subst({"targs": elemType,
                                               "class_name": Name})
794
def insToGprInstX(name, Name, opClass, types, rCount, gprSpec,
                  signExt=False):
    """Generate an instruction copying one vector element to ``<gprSpec>Dest``.

    The source is read in full (4 pieces) as a FullRegVect and element
    ``imm`` is assigned to the destination operand.

    rCount  -- only forwarded as "r_count" to InstObjParams.
    gprSpec -- text placed in front of ``Dest`` to name the destination.
    signExt -- wrap the element in sext<sizeof(Element) * 8>(...).
    name, Name, opClass, types feed InstObjParams (base
    "DataX1RegImmOp") and the per-type NeonXExecDeclare substitutions.
    """
    global header_output, exec_output
    code = simd64EnabledCheckCode + '''
    FullRegVect srcReg;
    '''
    code += "".join('''
    srcReg.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
    ''' % {"reg": reg} for reg in range(4))

    # NOTE(review): unlike the other generators the element is read
    # without gtoh(); kept exactly as in the original -- confirm.
    if signExt:
        extract = '''
    %sDest = sext<sizeof(Element) * 8>(srcReg.elements[imm]);
    '''
    else:
        extract = '''
    %sDest = srcReg.elements[imm];
    '''
    code += extract % gprSpec

    iop = InstObjParams(name, Name,
                        "DataX1RegImmOp",
                        {"code": code,
                         "r_count": rCount,
                         "op_class": opClass}, [])
    header_output += NeonX1RegImmOpDeclare.subst(iop)
    exec_output += NeonXEqualRegOpExecute.subst(iop)
    for elemType in types:
        exec_output += NeonXExecDeclare.subst({"targs": elemType,
                                               "class_name": Name})
824
825 def tbxTblInstX(name, Name, opClass, types, length, isTbl, rCount):
826 global header_output, decoder_output, exec_output
827 code = simd64EnabledCheckCode + '''
828 union
829 {
830 uint8_t bytes[64];
1// -*- mode: c++ -*-
2
3// Copyright (c) 2012-2013, 2015-2018 ARM Limited
4// All rights reserved
5//
6// The license below extends only to copyright in the software and shall
7// not be construed as granting a license to any other intellectual
8// property including but not limited to intellectual property relating
9// to a hardware implementation of the functionality of the software
10// licensed hereunder. You may use the software subject to the license
11// terms below provided that you ensure that this notice is replicated
12// unmodified and in its entirety in all distributions of the software,
13// modified or unmodified, in source code or in binary form.
14//
15// Redistribution and use in source and binary forms, with or without
16// modification, are permitted provided that the following conditions are
17// met: redistributions of source code must retain the above copyright
18// notice, this list of conditions and the following disclaimer;
19// redistributions in binary form must reproduce the above copyright
20// notice, this list of conditions and the following disclaimer in the
21// documentation and/or other materials provided with the distribution;
22// neither the name of the copyright holders nor the names of its
23// contributors may be used to endorse or promote products derived from
24// this software without specific prior written permission.
25//
26// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
30// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
31// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
32// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
33// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
34// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
35// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
36// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37//
38// Authors: Giacomo Gabrielli
39// Mbou Eyole
40
41let {{
42
# Text accumulated by the generator functions defined in this block.
header_output = ''
exec_output = ''
decoders = {'Generic': {}}

# FP types (FP operations always work with unsigned representations)
floatTypes = ('uint16_t', 'uint32_t', 'uint64_t')
smallFloatTypes = ('uint32_t',)
50
def threeEqualRegInstX(name, Name, opClass, types, rCount, op,
                       readDest=False, pairwise=False, scalar=False,
                       byElem=False, decoder='Generic'):
    """Generate a two-source SIMD instruction whose operands share a width.

    The C++ element-walk code is assembled around ``op``, wrapped in an
    InstObjParams, and the substituted declaration / execute / per-type
    templates are appended to header_output and exec_output.

    name, Name, opClass -- forwarded to InstObjParams; Name is also the
        "class_name" of each per-type NeonXExecDeclare substitution.
    types    -- element type names; one NeonXExecDeclare per entry.
    rCount   -- number of register pieces P0..P<rCount-1> transferred;
        destination pieces rCount..3 are written with zero.
    op       -- C++ snippet expected to set destElem from srcElem1/srcElem2.
    readDest -- also load the destination and seed destElem from it.
    pairwise -- operate on adjacent element pairs of srcReg1, then srcReg2.
    scalar   -- compute element 0 only and zero every other element.
    byElem   -- index the second source with ``imm``; that source is then
        read in full (4 pieces) and the *ImmOp base/template is used.
    decoder  -- accepted but not used here.
    """
    global header_output, exec_output, decoders
    # pairwise cannot be combined with byElem or scalar
    assert not (pairwise and (byElem or scalar))

    pieces = [simd64EnabledCheckCode, '''
    RegVect srcReg1, destReg;
    ''']
    if byElem:
        # 2nd register operand has to be read fully
        pieces.append('''
    FullRegVect srcReg2;
    ''')
    else:
        pieces.append('''
    RegVect srcReg2;
    ''')

    for idx in range(rCount):
        pieces.append('''
    srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
    srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
    ''' % {"reg": idx})
        if readDest:
            pieces.append('''
    destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
    ''' % {"reg": idx})
    if byElem:
        # 2nd operand has to be read fully
        for idx in range(rCount, 4):
            pieces.append('''
    srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
    ''' % {"reg": idx})

    seedDest = 'destElem = gtoh(destReg.elements[i]);' if readDest else ''
    if pairwise:
        pieces.append('''
    for (unsigned i = 0; i < eCount; i++) {
        Element srcElem1 = gtoh(2 * i < eCount ?
                                srcReg1.elements[2 * i] :
                                srcReg2.elements[2 * i - eCount]);
        Element srcElem2 = gtoh(2 * i < eCount ?
                                srcReg1.elements[2 * i + 1] :
                                srcReg2.elements[2 * i + 1 - eCount]);
        Element destElem;
        %(readDest)s
        %(op)s
        destReg.elements[i] = htog(destElem);
    }
    ''' % {"op": op, "readDest": seedDest})
    else:
        if scalar:
            onlyFirstElem = '''
        if (i != 0) {
            destReg.elements[i] = 0;
            continue;
        }
        '''
        else:
            onlyFirstElem = ""
        pieces.append('''
    for (unsigned i = 0; i < eCount; i++) {
        %(scalarCheck)s
        Element srcElem1 = gtoh(srcReg1.elements[i]);
        Element srcElem2 = gtoh(srcReg2.elements[%(src2Index)s]);
        Element destElem;
        %(readDest)s
        %(op)s
        destReg.elements[i] = htog(destElem);
    }
    ''' % {"op": op, "readDest": seedDest,
           "scalarCheck": onlyFirstElem,
           "src2Index": "imm" if byElem else "i"})

    for idx in range(rCount):
        pieces.append('''
    AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
    ''' % {"reg": idx})
    # zero upper half (empty range when rCount is already 4)
    for idx in range(rCount, 4):
        pieces.append('''
    AA64FpDestP%(reg)d_uw = 0;
    ''' % {"reg": idx})

    baseClass = "DataX2RegImmOp" if byElem else "DataX2RegOp"
    iop = InstObjParams(name, Name, baseClass,
                        {"code": "".join(pieces),
                         "r_count": rCount,
                         "op_class": opClass}, [])
    declTemplate = NeonX2RegImmOpDeclare if byElem else NeonX2RegOpDeclare
    header_output += declTemplate.subst(iop)
    exec_output += NeonXEqualRegOpExecute.subst(iop)
    for elemType in types:
        exec_output += NeonXExecDeclare.subst({"targs": elemType,
                                               "class_name": Name})
144
def threeUnequalRegInstX(name, Name, opClass, types, op,
                         bigSrc1, bigSrc2, bigDest, readDest, scalar=False,
                         byElem=False, hi=False):
    """Generate a three-register instruction with unequal operand widths.

    Builds the C++ element-walk body, wraps it in an InstObjParams and
    appends the class declaration to header_output and the execute code
    (plus one NeonXExecDeclare per entry of `types`) to exec_output.

    name, Name -- mnemonic and class name handed to InstObjParams.
    opClass    -- value stored under "op_class".
    types      -- iterable of element types, one NeonXExecDeclare each.
    op         -- C++ snippet computing destElem from srcElem1/srcElem2.
    bigSrc1, bigSrc2, bigDest -- when true the operand is declared as a
                  BigRegVect and covers four register chunks instead of two.
    readDest   -- preload destReg (and destElem) from the destination.
    scalar     -- only element 0 is computed; other elements are zeroed.
    byElem     -- index the second source with `imm`; it is then declared
                  as a FullRegVect, read in full, and the immediate-form
                  base class and declare template are selected.
    hi         -- narrow (non-big) operands use chunks 2-3 instead of 0-1;
                  may not be combined with `scalar`.
    """
    assert not (scalar and hi)
    global header_output, exec_output

    src1Cnt, src1Prefix = (4, 'Big') if bigSrc1 else (2, '')
    src2Cnt, src2Prefix = (4, 'Big') if bigSrc2 else (2, '')
    destCnt, destPrefix = (4, 'Big') if bigDest else (2, '')
    if byElem:
        src2Prefix = 'Full'

    eWalkCode = simd64EnabledCheckCode + '''
        %sRegVect srcReg1;
        %sRegVect srcReg2;
        %sRegVect destReg;
    ''' % (src1Prefix, src2Prefix, destPrefix)

    # First source: long/widening "hi" forms start at chunk 2.
    src1Base = 2 if (hi and not bigSrc1) else 0
    for idx in range(src1Cnt):
        eWalkCode += '''
        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(srcReg1)d_uw);
    ''' % { "reg" : idx, "srcReg1" : src1Base + idx }

    # Second source: same rule, except that by-element forms always start
    # at chunk 0 and additionally read the remaining chunks below.
    src2Base = 2 if ((not byElem) and (hi and not bigSrc2)) else 0
    for idx in range(src2Cnt):
        eWalkCode += '''
        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(srcReg2)d_uw);
    ''' % { "reg" : idx, "srcReg2" : src2Base + idx }
    if byElem:
        # 2nd operand has to be read fully
        for idx in range(src2Cnt, 4):
            eWalkCode += '''
        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
    ''' % { "reg" : idx }

    readDestCode = ''
    if readDest:
        for idx in range(destCnt):
            eWalkCode += '''
        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
    ''' % { "reg" : idx }
        readDestCode = 'destElem = gtoh(destReg.elements[i]);'

    scalarCheck = '''
            if (i != 0) {
                destReg.elements[i] = 0;
                continue;
            }
    '''
    eWalkCode += '''
        for (unsigned i = 0; i < eCount; i++) {
            %(scalarCheck)s
            %(src1Prefix)sElement srcElem1 = gtoh(srcReg1.elements[i]);
            %(src1Prefix)sElement srcElem2 = gtoh(srcReg2.elements[%(src2Index)s]);
            %(destPrefix)sElement destElem;
            %(readDest)s
            %(op)s
            destReg.elements[i] = htog(destElem);
        }
    ''' % { "op" : op, "readDest" : readDestCode,
            "src1Prefix" : src1Prefix, "src2Prefix" : src2Prefix,
            "destPrefix" : destPrefix,
            "scalarCheck" : scalarCheck if scalar else "",
            "src2Index" : "imm" if byElem else "i" }

    # Write-back: narrowing "hi" forms target chunks 2-3.
    destBase = 2 if (hi and not bigDest) else 0
    for idx in range(destCnt):
        eWalkCode += '''
        AA64FpDestP%(destReg)d_uw = gtoh(destReg.regs[%(reg)d]);
    ''' % { "reg" : idx, "destReg" : destBase + idx }
    if destCnt < 4:
        if hi:
            # Explicitly merge with lower half
            for idx in range(0, destCnt):
                eWalkCode += '''
        AA64FpDestP%(reg)d_uw = AA64FpDestP%(reg)d_uw;''' % { "reg" : idx }
        else:
            # zero upper half
            for idx in range(destCnt, 4):
                eWalkCode += '''
        AA64FpDestP%(reg)d_uw = 0;''' % { "reg" : idx }

    iop = InstObjParams(name, Name,
                        "DataX2RegImmOp" if byElem else "DataX2RegOp",
                        { "code": eWalkCode,
                          "r_count": 2,
                          "op_class": opClass }, [])
    declareTemplate = NeonX2RegImmOpDeclare if byElem else NeonX2RegOpDeclare
    header_output += declareTemplate.subst(iop)
    exec_output += NeonXUnequalRegOpExecute.subst(iop)
    for elemType in types:
        exec_output += NeonXExecDeclare.subst({ "targs" : elemType,
                                                "class_name" : Name })
252
def threeRegNarrowInstX(name, Name, opClass, types, op, readDest=False,
                        scalar=False, byElem=False, hi=False):
    """Generate a narrowing three-register instruction.

    Forwards to threeUnequalRegInstX with both sources big and a normal
    width destination. By-element forms are rejected by the assertion.
    """
    assert not byElem
    threeUnequalRegInstX(name, Name, opClass, types, op,
                         bigSrc1=True, bigSrc2=True, bigDest=False,
                         readDest=readDest, scalar=scalar, byElem=byElem,
                         hi=hi)
258
def threeRegLongInstX(name, Name, opClass, types, op, readDest=False,
                      scalar=False, byElem=False, hi=False):
    """Generate a long three-register instruction.

    Forwards to threeUnequalRegInstX with both sources at normal width and
    a big destination.
    """
    threeUnequalRegInstX(name, Name, opClass, types, op,
                         bigSrc1=False, bigSrc2=False, bigDest=True,
                         readDest=readDest, scalar=scalar, byElem=byElem,
                         hi=hi)
263
def threeRegWideInstX(name, Name, opClass, types, op, readDest=False,
                      scalar=False, byElem=False, hi=False):
    """Generate a wide three-register instruction.

    Forwards to threeUnequalRegInstX with a big first source, a normal
    width second source and a big destination. By-element forms are
    rejected by the assertion.
    """
    assert not byElem
    threeUnequalRegInstX(name, Name, opClass, types, op,
                         bigSrc1=True, bigSrc2=False, bigDest=True,
                         readDest=readDest, scalar=scalar, byElem=byElem,
                         hi=hi)
269
def twoEqualRegInstX(name, Name, opClass, types, rCount, op,
                     readDest=False, scalar=False, byElem=False,
                     hasImm=False, isDup=False):
    """Generate a two-register instruction with equal operand widths.

    name, Name -- mnemonic and class name handed to InstObjParams.
    opClass    -- value stored under "op_class".
    types      -- iterable of element types, one NeonXExecDeclare each.
    rCount     -- number of register chunks read and written; the
                  remaining destination chunks (up to 4) are zeroed.
    op         -- C++ snippet computing destElem from srcElem1.
    readDest   -- preload destReg (and destElem) from the destination.
    scalar     -- only element 0 is computed; other elements are zeroed.
    byElem     -- index the source with `imm`; implies hasImm.
    hasImm     -- select the immediate-form base class and template.
    isDup      -- read the source in full as a FullRegVect; requires
                  byElem.
    """
    global header_output, exec_output
    assert (not isDup) or byElem
    hasImm = True if byElem else hasImm

    eWalkCode = simd64EnabledCheckCode + ('''
        FullRegVect srcReg1;
        RegVect destReg;
    ''' if isDup else '''
        RegVect srcReg1, destReg;
    ''')

    srcCnt = 4 if isDup else rCount
    for idx in range(srcCnt):
        eWalkCode += '''
        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
    ''' % { "reg" : idx }
        if readDest:
            eWalkCode += '''
        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
    ''' % { "reg" : idx }

    readDestCode = 'destElem = gtoh(destReg.elements[i]);' if readDest else ''
    scalarCheck = '''
            if (i != 0) {
                destReg.elements[i] = 0;
                continue;
            }
    '''
    eWalkCode += '''
        for (unsigned i = 0; i < eCount; i++) {
            %(scalarCheck)s
            unsigned j = i;
            Element srcElem1 = gtoh(srcReg1.elements[%(src1Index)s]);
            Element destElem;
            %(readDest)s
            %(op)s
            destReg.elements[j] = htog(destElem);
        }
    ''' % { "op" : op, "readDest" : readDestCode,
            "scalarCheck" : scalarCheck if scalar else "",
            "src1Index" : "imm" if byElem else "i" }

    for idx in range(rCount):
        eWalkCode += '''
        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
    ''' % { "reg" : idx }
    # zero upper half (empty range when rCount is already 4)
    for idx in range(rCount, 4):
        eWalkCode += '''
        AA64FpDestP%(reg)d_uw = 0;
    ''' % { "reg" : idx }

    iop = InstObjParams(name, Name,
                        "DataX1RegImmOp" if hasImm else "DataX1RegOp",
                        { "code": eWalkCode,
                          "r_count": rCount,
                          "op_class": opClass }, [])
    declareTemplate = NeonX1RegImmOpDeclare if hasImm else NeonX1RegOpDeclare
    header_output += declareTemplate.subst(iop)
    exec_output += NeonXEqualRegOpExecute.subst(iop)
    for elemType in types:
        exec_output += NeonXExecDeclare.subst({ "targs" : elemType,
                                                "class_name" : Name })
339
def twoRegLongInstX(name, Name, opClass, types, op, readDest=False,
                    hi=False, hasImm=False):
    """Generate a long (widening) two-register instruction.

    The source is a RegVect filled from two register chunks and the
    destination is a BigRegVect written back to all four chunks.

    name, Name -- mnemonic and class name handed to InstObjParams.
    opClass    -- value stored under "op_class".
    types      -- iterable of element types, one NeonXExecDeclare each.
    op         -- C++ snippet computing destElem from srcElem1.
    readDest   -- preload destReg from the destination and initialise
                  destElem from it before `op` runs.
    hi         -- read the source from chunks 2-3 instead of 0-1.
    hasImm     -- select the immediate-form base class and template.
    """
    global header_output, exec_output
    eWalkCode = simd64EnabledCheckCode + '''
        RegVect srcReg1;
        BigRegVect destReg;
    '''
    srcBase = 2 if hi else 0
    for reg in range(2):
        eWalkCode += '''
        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(srcChunk)d_uw);
    ''' % { "reg" : reg, "srcChunk" : srcBase + reg }

    readDestCode = ''
    if readDest:
        for reg in range(4):
            eWalkCode += '''
        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
    ''' % { "reg" : reg }
        # The previous value must land in destElem: assigning it to
        # destReg (as before) overwrote the vector and left destElem
        # uninitialised for `op`.
        readDestCode = 'destElem = gtoh(destReg.elements[i]);'

    eWalkCode += '''
        for (unsigned i = 0; i < eCount; i++) {
            Element srcElem1 = gtoh(srcReg1.elements[i]);
            BigElement destElem;
            %(readDest)s
            %(op)s
            destReg.elements[i] = htog(destElem);
        }
    ''' % { "op" : op, "readDest" : readDestCode }
    for reg in range(4):
        eWalkCode += '''
        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
    ''' % { "reg" : reg }

    iop = InstObjParams(name, Name,
                        "DataX1RegImmOp" if hasImm else "DataX1RegOp",
                        { "code": eWalkCode,
                          "r_count": 2,
                          "op_class": opClass }, [])
    if hasImm:
        header_output += NeonX1RegImmOpDeclare.subst(iop)
    else:
        header_output += NeonX1RegOpDeclare.subst(iop)
    exec_output += NeonXUnequalRegOpExecute.subst(iop)
    for elemType in types:
        substDict = { "targs" : elemType,
                      "class_name" : Name }
        exec_output += NeonXExecDeclare.subst(substDict)
390
def twoRegNarrowInstX(name, Name, opClass, types, op, readDest=False,
                      scalar=False, hi=False, hasImm=False):
    """Generate a narrowing two-register instruction.

    The source is a BigRegVect read from all four register chunks and the
    destination is a RegVect written to two chunks.

    name, Name -- mnemonic and class name handed to InstObjParams.
    opClass    -- value stored under "op_class".
    types      -- iterable of element types, one NeonXExecDeclare each.
    op         -- C++ snippet computing destElem from srcElem1.
    readDest   -- preload destReg (and destElem) from the destination;
                  otherwise destReg.elements[0] is cleared first.
    scalar     -- only element 0 is computed; other elements are zeroed.
    hi         -- write the result to chunks 2-3 and keep chunks 0-1;
                  otherwise write chunks 0-1 and zero chunks 2-3.
    hasImm     -- select the immediate-form base class and template.
    """
    global header_output, exec_output
    eWalkCode = simd64EnabledCheckCode + '''
        BigRegVect srcReg1;
        RegVect destReg;
    '''
    for idx in range(4):
        eWalkCode += '''
        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
    ''' % { "reg" : idx }

    readDestCode = ''
    if readDest:
        for idx in range(2):
            eWalkCode += '''
        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
    ''' % { "reg" : idx }
        readDestCode = 'destElem = gtoh(destReg.elements[i]);'
    else:
        eWalkCode += '''
        destReg.elements[0] = 0;
    '''

    scalarCheck = '''
            if (i != 0) {
                destReg.elements[i] = 0;
                continue;
            }
    '''
    eWalkCode += '''
        for (unsigned i = 0; i < eCount; i++) {
            %(scalarCheck)s
            BigElement srcElem1 = gtoh(srcReg1.elements[i]);
            Element destElem;
            %(readDest)s
            %(op)s
            destReg.elements[i] = htog(destElem);
        }
    ''' % { "op" : op, "readDest" : readDestCode,
            "scalarCheck" : scalarCheck if scalar else "" }

    destBase = 2 if hi else 0
    for idx in range(2):
        eWalkCode += '''
        AA64FpDestP%(destReg)d_uw = gtoh(destReg.regs[%(reg)d]);
    ''' % { "reg" : idx, "destReg" : destBase + idx }
    if hi:
        # Explicitly merge with the lower half
        for idx in range(0, 2):
            eWalkCode += '''
        AA64FpDestP%(reg)d_uw = AA64FpDestP%(reg)d_uw;''' % { "reg" : idx }
    else:
        # zero upper half
        for idx in range(2, 4):
            eWalkCode += '''
        AA64FpDestP%(reg)d_uw = 0;
    ''' % { "reg" : idx }

    iop = InstObjParams(name, Name,
                        "DataX1RegImmOp" if hasImm else "DataX1RegOp",
                        { "code": eWalkCode,
                          "r_count": 2,
                          "op_class": opClass }, [])
    declareTemplate = NeonX1RegImmOpDeclare if hasImm else NeonX1RegOpDeclare
    header_output += declareTemplate.subst(iop)
    exec_output += NeonXUnequalRegOpExecute.subst(iop)
    for elemType in types:
        exec_output += NeonXExecDeclare.subst({ "targs" : elemType,
                                                "class_name" : Name })
461
def threeRegScrambleInstX(name, Name, opClass, types, rCount, op):
    """Generate a three-register instruction with a free-form body.

    Both sources are loaded into RegVects, `op` is inserted verbatim (it
    is expected to fill destReg itself), then destReg is written back to
    `rCount` chunks and the remaining chunks up to 4 are zeroed.
    """
    global header_output, exec_output
    eWalkCode = simd64EnabledCheckCode + '''
        RegVect srcReg1, srcReg2, destReg;
    '''
    for idx in range(rCount):
        eWalkCode += '''
        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
    ''' % { "reg" : idx }
    eWalkCode += op
    for idx in range(rCount):
        eWalkCode += '''
        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
    ''' % { "reg" : idx }
    # Clear whatever is left of the destination (nothing when rCount is 4).
    for idx in range(rCount, 4):
        eWalkCode += '''
        AA64FpDestP%(reg)d_uw = 0;
    ''' % { "reg" : idx }

    iop = InstObjParams(name, Name,
                        "DataX2RegOp",
                        { "code": eWalkCode,
                          "r_count": rCount,
                          "op_class": opClass }, [])
    header_output += NeonX2RegOpDeclare.subst(iop)
    exec_output += NeonXEqualRegOpExecute.subst(iop)
    for elemType in types:
        exec_output += NeonXExecDeclare.subst({ "targs" : elemType,
                                                "class_name" : Name })
493
def insFromVecElemInstX(name, Name, opClass, types, rCount):
    """Generate an element-to-element insert instruction.

    The source is read in full (four chunks) as a FullRegVect, the
    destination's `rCount` chunks are preloaded, element `imm2` of the
    source is copied into element `imm1` of the destination, and the same
    `rCount` destination chunks are written back.
    """
    global header_output, exec_output
    eWalkCode = simd64EnabledCheckCode + '''
        FullRegVect srcReg1;
        RegVect destReg;
    '''
    for idx in range(4):
        eWalkCode += '''
        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
    ''' % { "reg" : idx }
    for idx in range(rCount):
        eWalkCode += '''
        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
    ''' % { "reg" : idx }
    eWalkCode += '''
        Element srcElem1 = gtoh(srcReg1.elements[imm2]);
        Element destElem = srcElem1;
        destReg.elements[imm1] = htog(destElem);
    '''
    for idx in range(rCount):
        eWalkCode += '''
        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
    ''' % { "reg" : idx }

    iop = InstObjParams(name, Name,
                        "DataX1Reg2ImmOp",
                        { "code": eWalkCode,
                          "r_count": rCount,
                          "op_class": opClass }, [])
    header_output += NeonX1Reg2ImmOpDeclare.subst(iop)
    exec_output += NeonXEqualRegOpExecute.subst(iop)
    for elemType in types:
        exec_output += NeonXExecDeclare.subst({ "targs" : elemType,
                                                "class_name" : Name })
528
def twoRegPairwiseScInstX(name, Name, opClass, types, rCount, op):
    """Generate a scalar pairwise two-register instruction.

    Elements 0 and 1 of the source are combined by `op` into
    destReg.elements[0]. Half as many chunks as were read (`rCount // 2`)
    are written back and the remaining chunks up to 4 are zeroed.

    name, Name -- mnemonic and class name handed to InstObjParams.
    opClass    -- value stored under "op_class".
    types      -- iterable of element types, one NeonXExecDeclare each.
    rCount     -- number of source register chunks to read.
    op         -- C++ snippet computing destElem from srcElem1/srcElem2.
    """
    global header_output, exec_output
    eWalkCode = simd64EnabledCheckCode + '''
        RegVect srcReg1, destReg;
    '''
    for reg in range(rCount):
        eWalkCode += '''
        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
    ''' % { "reg" : reg }
    eWalkCode += '''
        Element srcElem1 = gtoh(srcReg1.elements[0]);
        Element srcElem2 = gtoh(srcReg1.elements[1]);
        Element destElem;
        %(op)s
        destReg.elements[0] = htog(destElem);
    ''' % { "op" : op }
    # Floor division keeps destCnt an int so range() accepts it under
    # Python 3 as well; for ints the value is the same as under Python 2.
    destCnt = rCount // 2
    for reg in range(destCnt):
        eWalkCode += '''
        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
    ''' % { "reg" : reg }
    for reg in range(destCnt, 4): # zero upper half
        eWalkCode += '''
        AA64FpDestP%(reg)d_uw = 0;
    ''' % { "reg" : reg }
    iop = InstObjParams(name, Name,
                        "DataX1RegOp",
                        { "code": eWalkCode,
                          "r_count": rCount,
                          "op_class": opClass }, [])
    header_output += NeonX1RegOpDeclare.subst(iop)
    exec_output += NeonXEqualRegOpExecute.subst(iop)
    for elemType in types:
        substDict = { "targs" : elemType,
                      "class_name" : Name }
        exec_output += NeonXExecDeclare.subst(substDict)
565
def twoRegAcrossInstX(name, Name, opClass, types, rCount, op,
                      doubleDest=False, long=False):
    """Generate an across-lanes reduction instruction.

    destElem starts as source element 0 and `op` is applied for every
    further element; the result is stored in destReg.elements[0].

    name, Name -- mnemonic and class name handed to InstObjParams.
    opClass    -- value stored under "op_class".
    types      -- iterable of element types, one NeonXExecDeclare each.
    rCount     -- number of source register chunks to read.
    op         -- C++ snippet folding srcElem1 into destElem.
    doubleDest -- write back two destination chunks instead of one.
    long       -- use BigRegVect/BigElement for the destination and the
                  unequal-width execute template.
    """
    global header_output, exec_output
    destPrefix = "Big" if long else ""
    eWalkCode = simd64EnabledCheckCode + '''
        RegVect srcReg1;
        %sRegVect destReg;
    ''' % destPrefix
    for idx in range(rCount):
        eWalkCode += '''
        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
    ''' % { "reg" : idx }
    eWalkCode += '''
        destReg.regs[0] = 0;
        %(destPrefix)sElement destElem = 0;
        for (unsigned i = 0; i < eCount; i++) {
            Element srcElem1 = gtoh(srcReg1.elements[i]);
            if (i == 0) {
                destElem = srcElem1;
            } else {
                %(op)s
            }
        }
        destReg.elements[0] = htog(destElem);
    ''' % { "op" : op, "destPrefix" : destPrefix }

    destCnt = 2 if doubleDest else 1
    for idx in range(destCnt):
        eWalkCode += '''
        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
    ''' % { "reg" : idx }
    # zero upper half
    for idx in range(destCnt, 4):
        eWalkCode += '''
        AA64FpDestP%(reg)d_uw = 0;
    ''' % { "reg" : idx }

    iop = InstObjParams(name, Name,
                        "DataX1RegOp",
                        { "code": eWalkCode,
                          "r_count": rCount,
                          "op_class": opClass }, [])
    header_output += NeonX1RegOpDeclare.subst(iop)
    executeTemplate = (NeonXUnequalRegOpExecute if long
                       else NeonXEqualRegOpExecute)
    exec_output += executeTemplate.subst(iop)
    for elemType in types:
        exec_output += NeonXExecDeclare.subst({ "targs" : elemType,
                                                "class_name" : Name })
614
def twoRegCondenseInstX(name, Name, opClass, types, rCount, op,
                        readDest=False):
    """Generate a two-register instruction that condenses element pairs.

    Adjacent source elements (2*i, 2*i + 1) are combined by `op` into one
    BigElement at destination index i, for eCount / 2 results.

    name, Name -- mnemonic and class name handed to InstObjParams.
    opClass    -- value stored under "op_class".
    types      -- iterable of element types, one NeonXExecDeclare each.
    rCount     -- number of chunks read and written back; the remaining
                  destination chunks up to 4 are zeroed.
    op         -- C++ snippet computing destElem from srcElem1/srcElem2.
    readDest   -- preload destReg (and destElem) from the destination.
    """
    global header_output, exec_output
    eWalkCode = simd64EnabledCheckCode + '''
        RegVect srcRegs;
        BigRegVect destReg;
    '''
    for idx in range(rCount):
        eWalkCode += '''
        srcRegs.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
    ''' % { "reg" : idx }
        if readDest:
            eWalkCode += '''
        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
    ''' % { "reg" : idx }

    readDestCode = 'destElem = gtoh(destReg.elements[i]);' if readDest else ''
    eWalkCode += '''
        for (unsigned i = 0; i < eCount / 2; i++) {
            Element srcElem1 = gtoh(srcRegs.elements[2 * i]);
            Element srcElem2 = gtoh(srcRegs.elements[2 * i + 1]);
            BigElement destElem;
            %(readDest)s
            %(op)s
            destReg.elements[i] = htog(destElem);
        }
    ''' % { "op" : op, "readDest" : readDestCode }

    for idx in range(rCount):
        eWalkCode += '''
        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
    ''' % { "reg" : idx }
    # zero upper half (empty range when rCount is already 4)
    for idx in range(rCount, 4):
        eWalkCode += '''
        AA64FpDestP%(reg)d_uw = 0;
    ''' % { "reg" : idx }

    iop = InstObjParams(name, Name,
                        "DataX1RegOp",
                        { "code": eWalkCode,
                          "r_count": rCount,
                          "op_class": opClass }, [])
    header_output += NeonX1RegOpDeclare.subst(iop)
    exec_output += NeonXUnequalRegOpExecute.subst(iop)
    for elemType in types:
        exec_output += NeonXExecDeclare.subst({ "targs" : elemType,
                                                "class_name" : Name })
663
def oneRegImmInstX(name, Name, opClass, types, rCount, op, readDest=False):
    """Generate a one-register instruction driven by an immediate.

    `op` is run once per destination element; no source register is read.

    name, Name -- mnemonic and class name handed to InstObjParams.
    opClass    -- value stored under "op_class".
    types      -- iterable of element types, one NeonXExecDeclare each.
    rCount     -- number of destination chunks written (and preloaded when
                  readDest is set); remaining chunks up to 4 are zeroed.
    op         -- C++ snippet computing destElem.
    readDest   -- preload destReg (and destElem) from the destination.
    """
    global header_output, exec_output
    eWalkCode = simd64EnabledCheckCode + '''
        RegVect destReg;
    '''
    readDestCode = ''
    if readDest:
        for idx in range(rCount):
            eWalkCode += '''
        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
    ''' % { "reg" : idx }
        readDestCode = 'destElem = gtoh(destReg.elements[i]);'

    eWalkCode += '''
        for (unsigned i = 0; i < eCount; i++) {
            Element destElem;
            %(readDest)s
            %(op)s
            destReg.elements[i] = htog(destElem);
        }
    ''' % { "op" : op, "readDest" : readDestCode }

    for idx in range(rCount):
        eWalkCode += '''
        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
    ''' % { "reg" : idx }
    # zero upper half (empty range when rCount is already 4)
    for idx in range(rCount, 4):
        eWalkCode += '''
        AA64FpDestP%(reg)d_uw = 0;
    ''' % { "reg" : idx }

    iop = InstObjParams(name, Name,
                        "DataXImmOnlyOp",
                        { "code": eWalkCode,
                          "r_count": rCount,
                          "op_class": opClass }, [])
    header_output += NeonX1RegImmOnlyOpDeclare.subst(iop)
    exec_output += NeonXEqualRegOpExecute.subst(iop)
    for elemType in types:
        exec_output += NeonXExecDeclare.subst({ "targs" : elemType,
                                                "class_name" : Name })
705
def dupGprInstX(name, Name, opClass, types, rCount, gprSpec):
    """Generate a duplicate-from-general-register instruction.

    Every destination element is set to the operand `<gprSpec>Op1` cast to
    Element. `rCount` chunks are written back and the remaining chunks up
    to 4 are zeroed.
    """
    global header_output, exec_output
    eWalkCode = simd64EnabledCheckCode + '''
        RegVect destReg;
        for (unsigned i = 0; i < eCount; i++) {
            destReg.elements[i] = htog((Element) %sOp1);
        }
    ''' % gprSpec
    for idx in range(rCount):
        eWalkCode += '''
        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
    ''' % { "reg" : idx }
    # zero upper half (empty range when rCount is already 4)
    for idx in range(rCount, 4):
        eWalkCode += '''
        AA64FpDestP%(reg)d_uw = 0;
    ''' % { "reg" : idx }

    iop = InstObjParams(name, Name,
                        "DataX1RegOp",
                        { "code": eWalkCode,
                          "r_count": rCount,
                          "op_class": opClass }, [])
    header_output += NeonX1RegOpDeclare.subst(iop)
    exec_output += NeonXEqualRegOpExecute.subst(iop)
    for elemType in types:
        exec_output += NeonXExecDeclare.subst({ "targs" : elemType,
                                                "class_name" : Name })
734
def extInstX(name, Name, opClass, types, rCount, op):
    """Generate an extract-style instruction with two sources and an imm.

    Both sources are loaded into RegVects, `op` is inserted verbatim (it
    is expected to fill destReg itself), then `rCount` destination chunks
    are written back and the remaining chunks up to 4 are zeroed. The
    immediate-form two-register base class and template are used.
    """
    global header_output, exec_output
    eWalkCode = simd64EnabledCheckCode + '''
        RegVect srcReg1, srcReg2, destReg;
    '''
    for idx in range(rCount):
        eWalkCode += '''
        srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
    ''' % { "reg" : idx }
    eWalkCode += op
    for idx in range(rCount):
        eWalkCode += '''
        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
    ''' % { "reg" : idx }
    # zero upper half (empty range when rCount is already 4)
    for idx in range(rCount, 4):
        eWalkCode += '''
        AA64FpDestP%(reg)d_uw = 0;
    ''' % { "reg" : idx }

    iop = InstObjParams(name, Name,
                        "DataX2RegImmOp",
                        { "code": eWalkCode,
                          "r_count": rCount,
                          "op_class": opClass }, [])
    header_output += NeonX2RegImmOpDeclare.subst(iop)
    exec_output += NeonXEqualRegOpExecute.subst(iop)
    for elemType in types:
        exec_output += NeonXExecDeclare.subst({ "targs" : elemType,
                                                "class_name" : Name })
766
def insFromGprInstX(name, Name, opClass, types, rCount, gprSpec):
    """Generate an insert-from-general-register instruction.

    The destination's `rCount` chunks are preloaded, element `imm` is
    replaced by the operand `<gprSpec>Op1` cast to Element, and the same
    chunks are written back. No chunks are zeroed.
    """
    global header_output, exec_output
    eWalkCode = simd64EnabledCheckCode + '''
        RegVect destReg;
    '''
    for idx in range(rCount):
        eWalkCode += '''
        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
    ''' % { "reg" : idx }
    eWalkCode += '''
        destReg.elements[imm] = htog((Element) %sOp1);
    ''' % gprSpec
    for idx in range(rCount):
        eWalkCode += '''
        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
    ''' % { "reg" : idx }

    iop = InstObjParams(name, Name,
                        "DataX1RegImmOp",
                        { "code": eWalkCode,
                          "r_count": rCount,
                          "op_class": opClass }, [])
    header_output += NeonX1RegImmOpDeclare.subst(iop)
    exec_output += NeonXEqualRegOpExecute.subst(iop)
    for elemType in types:
        exec_output += NeonXExecDeclare.subst({ "targs" : elemType,
                                                "class_name" : Name })
794
def insToGprInstX(name, Name, opClass, types, rCount, gprSpec,
                  signExt=False):
    """Generate a move-element-to-general-register instruction.

    The source is read in full (four chunks) as a FullRegVect and element
    `imm` is assigned to `<gprSpec>Dest`, passed through
    sext<sizeof(Element) * 8>() first when signExt is set.
    """
    global header_output, exec_output
    eWalkCode = simd64EnabledCheckCode + '''
        FullRegVect srcReg;
    '''
    for idx in range(4):
        eWalkCode += '''
        srcReg.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
    ''' % { "reg" : idx }

    if signExt:
        elemExpr = "sext<sizeof(Element) * 8>(srcReg.elements[imm])"
    else:
        elemExpr = "srcReg.elements[imm]"
    eWalkCode += '''
        %sDest = %s;
    ''' % (gprSpec, elemExpr)

    iop = InstObjParams(name, Name,
                        "DataX1RegImmOp",
                        { "code": eWalkCode,
                          "r_count": rCount,
                          "op_class": opClass }, [])
    header_output += NeonX1RegImmOpDeclare.subst(iop)
    exec_output += NeonXEqualRegOpExecute.subst(iop)
    for elemType in types:
        exec_output += NeonXExecDeclare.subst({ "targs" : elemType,
                                                "class_name" : Name })
824
def tbxTblInstX(name, Name, opClass, types, length, isTbl, rCount):
    """Generate a table lookup (TBL/TBX style) instruction.

    The generated code fills a 64-byte table from `length` table registers
    (four chunks each, unused entries zeroed), then for every byte of the
    destination uses the matching byte of the second source as an index:
    indices below 16 * length select a table byte; other indices produce 0
    when isTbl is true and leave the destination byte unchanged otherwise.

    name, Name -- mnemonic and class name handed to InstObjParams.
    opClass    -- value stored under "op_class".
    types      -- iterable of element types, one NeonXExecDeclare each.
    length     -- number of table registers to load.
    isTbl      -- substituted with %s as the initialiser of a C++ `bool`;
                  presumably the string "true" or "false" (confirm against
                  the callers).
    rCount     -- number of index/destination chunks; the remaining
                  destination chunks up to 4 are zeroed.
    """
    global header_output, decoder_output, exec_output
    # Each union is declared exactly once with a single `regs` member;
    # duplicate members or a second `table` would not compile.
    code = simd64EnabledCheckCode + '''
        union
        {
            uint8_t bytes[64];
            uint32_t regs[16];
        } table;

        union
        {
            uint8_t bytes[%(rCount)d * 4];
            uint32_t regs[%(rCount)d];
        } destReg, srcReg2;

        const unsigned length = %(length)d;
        const bool isTbl = %(isTbl)s;
    ''' % { "rCount" : rCount, "length" : length, "isTbl" : isTbl }
    for reg in range(rCount):
        code += '''
        srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
        destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
    ''' % { "reg" : reg }
    for reg in range(16):
        if reg < length * 4:
            # Chunk p of table register v; floor division keeps v an
            # integer regardless of the Python version.
            code += '''
        table.regs[%(reg)d] = htog(AA64FpOp1P%(p)dV%(v)dS_uw);
    ''' % { "reg" : reg, "p" : reg % 4, "v" : reg // 4 }
        else:
            code += '''
        table.regs[%(reg)d] = 0;
    ''' % { "reg" : reg }
    code += '''
        for (unsigned i = 0; i < sizeof(destReg); i++) {
            uint8_t index = srcReg2.bytes[i];
            if (index < 16 * length) {
                destReg.bytes[i] = table.bytes[index];
            } else {
                if (isTbl)
                    destReg.bytes[i] = 0;
                // else destReg.bytes[i] unchanged
            }
        }
    '''
    for reg in range(rCount):
        code += '''
        AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
    ''' % { "reg" : reg }
    if rCount < 4: # zero upper half
        for reg in range(rCount, 4):
            code += '''
        AA64FpDestP%(reg)d_uw = 0;
    ''' % { "reg" : reg }
    iop = InstObjParams(name, Name,
                        "DataX2RegOp",
                        { "code": code,
                          "r_count": rCount,
                          "op_class": opClass }, [])
    header_output += NeonX2RegOpDeclare.subst(iop)
    exec_output += NeonXEqualRegOpExecute.subst(iop)
    for elemType in types:
        substDict = { "targs" : elemType,
                      "class_name" : Name }
        exec_output += NeonXExecDeclare.subst(substDict)
889
# ABS
absCode = '''
    if (srcElem1 < 0) {
        destElem = -srcElem1;
    } else {
        destElem = srcElem1;
    }
'''
twoEqualRegInstX("abs", "AbsDX", "SimdAluOp", signedTypes,
                 rCount=2, op=absCode)
twoEqualRegInstX("abs", "AbsQX", "SimdAluOp", signedTypes,
                 rCount=4, op=absCode)

# ADD
addCode = "destElem = srcElem1 + srcElem2;"
threeEqualRegInstX("add", "AddDX", "SimdAddOp", unsignedTypes, 2, addCode)
threeEqualRegInstX("add", "AddQX", "SimdAddOp", unsignedTypes, 4, addCode)

# ADDHN, ADDHN2: the sum is formed at BigElement width and shifted right
# by the bit width of Element.
addhnCode = '''
    destElem = ((BigElement)srcElem1 + (BigElement)srcElem2) >>
               (sizeof(Element) * 8);
'''
threeRegNarrowInstX("addhn", "AddhnX", "SimdAddOp", smallUnsignedTypes,
                    op=addhnCode)
threeRegNarrowInstX("addhn2", "Addhn2X", "SimdAddOp", smallUnsignedTypes,
                    op=addhnCode, hi=True)

# ADDP (scalar)
twoRegPairwiseScInstX("addp", "AddpScQX", "SimdAddOp", ("uint64_t",),
                      rCount=4, op=addCode)

# ADDP (vector)
threeEqualRegInstX("addp", "AddpDX", "SimdAddOp", smallUnsignedTypes, 2,
                   addCode, pairwise=True)
threeEqualRegInstX("addp", "AddpQX", "SimdAddOp", unsignedTypes, 4,
                   addCode, pairwise=True)

# ADDV
# Note: SimdAddOp can be a bit optimistic here
addAcrossCode = "destElem += srcElem1;"
twoRegAcrossInstX("addv", "AddvDX", "SimdAddOp", ("uint8_t", "uint16_t"),
                  rCount=2, op=addAcrossCode)
twoRegAcrossInstX("addv", "AddvQX", "SimdAddOp", smallUnsignedTypes,
                  rCount=4, op=addAcrossCode)

# AND
andCode = "destElem = srcElem1 & srcElem2;"
threeEqualRegInstX("and", "AndDX", "SimdAluOp", ("uint64_t",), 2, andCode)
threeEqualRegInstX("and", "AndQX", "SimdAluOp", ("uint64_t",), 4, andCode)

# BIC (immediate): reads the destination before modifying it
bicImmCode = "destElem &= ~imm;"
oneRegImmInstX("bic", "BicImmDX", "SimdAluOp", ("uint64_t",),
               rCount=2, op=bicImmCode, readDest=True)
oneRegImmInstX("bic", "BicImmQX", "SimdAluOp", ("uint64_t",),
               rCount=4, op=bicImmCode, readDest=True)

# BIC (register)
bicCode = "destElem = srcElem1 & ~srcElem2;"
threeEqualRegInstX("bic", "BicDX", "SimdAluOp", ("uint64_t",), 2, bicCode)
threeEqualRegInstX("bic", "BicQX", "SimdAluOp", ("uint64_t",), 4, bicCode)

# BIF
bifCode = "destElem = (destElem & srcElem2) | (srcElem1 & ~srcElem2);"
threeEqualRegInstX("bif", "BifDX", "SimdAluOp", ("uint64_t",), 2, bifCode,
                   True)
threeEqualRegInstX("bif", "BifQX", "SimdAluOp", ("uint64_t",), 4, bifCode,
                   True)

# BIT
bitCode = "destElem = (srcElem1 & srcElem2) | (destElem & ~srcElem2);"
threeEqualRegInstX("bit", "BitDX", "SimdAluOp", ("uint64_t",), 2, bitCode,
                   True)
threeEqualRegInstX("bit", "BitQX", "SimdAluOp", ("uint64_t",), 4, bitCode,
                   True)

# BSL
bslCode = "destElem = (srcElem1 & destElem) | (srcElem2 & ~destElem);"
threeEqualRegInstX("bsl", "BslDX", "SimdAluOp", ("uint64_t",), 2, bslCode,
                   True)
threeEqualRegInstX("bsl", "BslQX", "SimdAluOp", ("uint64_t",), 4, bslCode,
                   True)
# CLS: after the initial shift, count how many further shifts keep the
# sign of srcElem1 unchanged, capped at the element bit width minus one.
clsCode = '''
    unsigned count = 0;
    if (srcElem1 < 0) {
        srcElem1 <<= 1;
        while (srcElem1 < 0 && count < sizeof(Element) * 8 - 1) {
            count++;
            srcElem1 <<= 1;
        }
    } else {
        srcElem1 <<= 1;
        while (srcElem1 >= 0 && count < sizeof(Element) * 8 - 1) {
            count++;
            srcElem1 <<= 1;
        }
    }
    destElem = count;
'''
twoEqualRegInstX("cls", "ClsDX", "SimdAluOp", smallSignedTypes,
                 rCount=2, op=clsCode)
twoEqualRegInstX("cls", "ClsQX", "SimdAluOp", smallSignedTypes,
                 rCount=4, op=clsCode)

# CLZ: shift left until the value turns negative, capped at the element
# bit width.
clzCode = '''
    unsigned count = 0;
    while (srcElem1 >= 0 && count < sizeof(Element) * 8) {
        count++;
        srcElem1 <<= 1;
    }
    destElem = count;
'''
twoEqualRegInstX("clz", "ClzDX", "SimdAluOp", smallSignedTypes,
                 rCount=2, op=clzCode)
twoEqualRegInstX("clz", "ClzQX", "SimdAluOp", smallSignedTypes,
                 rCount=4, op=clzCode)
# All compares below yield (Element)(-1) when the condition holds and 0
# otherwise.

# CMEQ (register)
cmeqCode = "destElem = (srcElem1 == srcElem2) ? (Element)(-1) : 0;"
threeEqualRegInstX("cmeq", "CmeqDX", "SimdCmpOp", unsignedTypes, 2,
                   cmeqCode)
threeEqualRegInstX("cmeq", "CmeqQX", "SimdCmpOp", unsignedTypes, 4,
                   cmeqCode)

# CMEQ (zero)
cmeqZeroCode = "destElem = (srcElem1 == 0) ? (Element)(-1) : 0;"
twoEqualRegInstX("cmeq", "CmeqZeroDX", "SimdCmpOp", signedTypes,
                 rCount=2, op=cmeqZeroCode)
twoEqualRegInstX("cmeq", "CmeqZeroQX", "SimdCmpOp", signedTypes,
                 rCount=4, op=cmeqZeroCode)

# CMGE (register)
cmgeCode = "destElem = (srcElem1 >= srcElem2) ? (Element)(-1) : 0;"
threeEqualRegInstX("cmge", "CmgeDX", "SimdCmpOp", signedTypes, 2, cmgeCode)
threeEqualRegInstX("cmge", "CmgeQX", "SimdCmpOp", signedTypes, 4, cmgeCode)

# CMGE (zero)
cmgeZeroCode = "destElem = (srcElem1 >= 0) ? (Element)(-1) : 0;"
twoEqualRegInstX("cmge", "CmgeZeroDX", "SimdCmpOp", signedTypes,
                 rCount=2, op=cmgeZeroCode)
twoEqualRegInstX("cmge", "CmgeZeroQX", "SimdCmpOp", signedTypes,
                 rCount=4, op=cmgeZeroCode)

# CMGT (register)
cmgtCode = "destElem = (srcElem1 > srcElem2) ? (Element)(-1) : 0;"
threeEqualRegInstX("cmgt", "CmgtDX", "SimdCmpOp", signedTypes, 2, cmgtCode)
threeEqualRegInstX("cmgt", "CmgtQX", "SimdCmpOp", signedTypes, 4, cmgtCode)

# CMGT (zero)
cmgtZeroCode = "destElem = (srcElem1 > 0) ? (Element)(-1) : 0;"
twoEqualRegInstX("cmgt", "CmgtZeroDX", "SimdCmpOp", signedTypes,
                 rCount=2, op=cmgtZeroCode)
twoEqualRegInstX("cmgt", "CmgtZeroQX", "SimdCmpOp", signedTypes,
                 rCount=4, op=cmgtZeroCode)

# CMHI (register): same snippet as CMGT, instantiated for unsigned types
threeEqualRegInstX("cmhi", "CmhiDX", "SimdCmpOp", unsignedTypes, 2,
                   cmgtCode)
threeEqualRegInstX("cmhi", "CmhiQX", "SimdCmpOp", unsignedTypes, 4,
                   cmgtCode)

# CMHS (register): same snippet as CMGE, instantiated for unsigned types
threeEqualRegInstX("cmhs", "CmhsDX", "SimdCmpOp", unsignedTypes, 2,
                   cmgeCode)
threeEqualRegInstX("cmhs", "CmhsQX", "SimdCmpOp", unsignedTypes, 4,
                   cmgeCode)

# CMLE (zero)
cmleZeroCode = "destElem = (srcElem1 <= 0) ? (Element)(-1) : 0;"
twoEqualRegInstX("cmle", "CmleZeroDX", "SimdCmpOp", signedTypes,
                 rCount=2, op=cmleZeroCode)
twoEqualRegInstX("cmle", "CmleZeroQX", "SimdCmpOp", signedTypes,
                 rCount=4, op=cmleZeroCode)

# CMLT (zero)
cmltZeroCode = "destElem = (srcElem1 < 0) ? (Element)(-1) : 0;"
twoEqualRegInstX("cmlt", "CmltZeroDX", "SimdCmpOp", signedTypes,
                 rCount=2, op=cmltZeroCode)
twoEqualRegInstX("cmlt", "CmltZeroQX", "SimdCmpOp", signedTypes,
                 rCount=4, op=cmltZeroCode)

# CMTST (register)
tstCode = "destElem = (srcElem1 & srcElem2) ? (Element)(-1) : 0;"
threeEqualRegInstX("cmtst", "CmtstDX", "SimdAluOp", unsignedTypes, 2,
                   tstCode)
threeEqualRegInstX("cmtst", "CmtstQX", "SimdAluOp", unsignedTypes, 4,
                   tstCode)
1051 # CNT
1052 cntCode = '''
1053 unsigned count = 0;
1054 while (srcElem1 && count < sizeof(Element) * 8) {
1055 count += srcElem1 & 0x1;
1056 srcElem1 >>= 1;
1057 }
1058 destElem = count;
1059 '''
1060 twoEqualRegInstX("cnt", "CntDX", "SimdAluOp", ("uint8_t",), 2, cntCode)
1061 twoEqualRegInstX("cnt", "CntQX", "SimdAluOp", ("uint8_t",), 4, cntCode)
1062 # DUP (element)
1063 dupCode = "destElem = srcElem1;"
1064 twoEqualRegInstX("dup", "DupElemDX", "SimdMiscOp", smallUnsignedTypes, 2,
1065 dupCode, isDup=True, byElem=True)
1066 twoEqualRegInstX("dup", "DupElemQX", "SimdMiscOp", unsignedTypes, 4,
1067 dupCode, isDup=True, byElem=True)
1068 twoEqualRegInstX("dup", "DupElemScX", "SimdMiscOp", unsignedTypes, 4,
1069 dupCode, isDup=True, byElem=True, scalar=True)
1070 # DUP (general register)
1071 dupGprInstX("dup", "DupGprWDX", "SimdMiscOp", smallUnsignedTypes, 2, 'W')
1072 dupGprInstX("dup", "DupGprWQX", "SimdMiscOp", smallUnsignedTypes, 4, 'W')
1073 dupGprInstX("dup", "DupGprXQX", "SimdMiscOp", ("uint64_t",), 4, 'X')
1074 # EOR
# Bitwise exclusive-OR of the two sources. Registered for uint64_t elements
# only (the operation is bitwise, so one element type suffices).
1075 eorCode = "destElem = srcElem1 ^ srcElem2;"
1076 threeEqualRegInstX("eor", "EorDX", "SimdAluOp", ("uint64_t",), 2, eorCode)
1077 threeEqualRegInstX("eor", "EorQX", "SimdAluOp", ("uint64_t",), 4, eorCode)
1078 # EXT
1079 extCode = '''
1080 for (unsigned i = 0; i < eCount; i++) {
1081 unsigned index = i + imm;
1082 if (index < eCount) {
1083 destReg.elements[i] = srcReg1.elements[index];
1084 } else {
1085 index -= eCount;
1086 if (index >= eCount) {
1087 fault = std::make_shared<UndefinedInstruction>(
1088 machInst, false, mnemonic);
1089 } else {
1090 destReg.elements[i] = srcReg2.elements[index];
1091 }
1092 }
1093 }
1094 '''
1095 extInstX("Ext", "ExtDX", "SimdMiscOp", ("uint8_t",), 2, extCode)
1096 extInstX("Ext", "ExtQX", "SimdMiscOp", ("uint8_t",), 4, extCode)
1097 # FABD
# fpOp is a one-slot %-format template reused by the floating-point
# sections that follow: the generated snippet copies FpscrExc into a local
# FPSCR named fpscr, assigns the substituted expression to destElem, and
# writes fpscr back to FpscrExc.
1098 fpOp = '''
1099 FPSCR fpscr = (FPSCR) FpscrExc;
1100 destElem = %s;
1101 FpscrExc = fpscr;
1102 '''
# FABD expression: absolute value (fplibAbs) of srcElem1 - srcElem2
# (fplibSub).
1103 fabdCode = fpOp % "fplibAbs<Element>(fplibSub(srcElem1, srcElem2, fpscr))"
1104 threeEqualRegInstX("fabd", "FabdDX", "SimdFloatAddOp", smallFloatTypes, 2,
1105 fabdCode)
1106 threeEqualRegInstX("fabd", "FabdQX", "SimdFloatAddOp", floatTypes, 4,
1107 fabdCode)
1108 threeEqualRegInstX("fabd", "FabdScX", "SimdFloatAddOp", floatTypes, 4,
1109 fabdCode, scalar=True)
1110 # FABS
1111 fabsCode = fpOp % "fplibAbs<Element>(srcElem1)"
1112 twoEqualRegInstX("Abs", "FabsDX", "SimdFloatAluOp", smallFloatTypes, 2,
1113 fabsCode)
1114 twoEqualRegInstX("Abs", "FabsQX", "SimdFloatAluOp", floatTypes, 4,
1115 fabsCode)
1116 # FACGE
# The expression substituted into fpOp itself contains a "%s", so
# fpCmpAbsOp is again a one-slot template. The slot takes the fplibCompare
# suffix ("GE" / "GT"). The comparison is applied to the absolute values of
# both sources and yields -1 when true, 0 otherwise.
1117 fpCmpAbsOp = fpOp % ("fplibCompare%s<Element>(fplibAbs<Element>(srcElem1),"
1118 " fplibAbs<Element>(srcElem2), fpscr) ? -1 : 0")
1119 facgeCode = fpCmpAbsOp % "GE"
1120 threeEqualRegInstX("facge", "FacgeDX", "SimdFloatCmpOp", smallFloatTypes,
1121 2, facgeCode)
1122 threeEqualRegInstX("facge", "FacgeQX", "SimdFloatCmpOp", floatTypes, 4,
1123 facgeCode)
1124 threeEqualRegInstX("facge", "FacgeScX", "SimdFloatCmpOp", floatTypes, 4,
1125 facgeCode, scalar=True)
1126 # FACGT
# Same template, strict greater-than.
1127 facgtCode = fpCmpAbsOp % "GT"
1128 threeEqualRegInstX("facgt", "FacgtDX", "SimdFloatCmpOp", smallFloatTypes,
1129 2, facgtCode)
1130 threeEqualRegInstX("facgt", "FacgtQX", "SimdFloatCmpOp", floatTypes, 4,
1131 facgtCode)
1132 threeEqualRegInstX("facgt", "FacgtScX", "SimdFloatCmpOp", floatTypes, 4,
1133 facgtCode, scalar=True)
1134 # FADD
# fpBinOp is a one-slot template for two-operand fplib calls: the slot is
# the fplib function suffix, giving fplib<Suffix><Element>(srcElem1,
# srcElem2, fpscr). It is reused by later sections (Div, Max, Min, Mul, ...).
1135 fpBinOp = fpOp % "fplib%s<Element>(srcElem1, srcElem2, fpscr)"
1136 faddCode = fpBinOp % "Add"
1137 threeEqualRegInstX("fadd", "FaddDX", "SimdFloatAddOp", smallFloatTypes, 2,
1138 faddCode)
1139 threeEqualRegInstX("fadd", "FaddQX", "SimdFloatAddOp", floatTypes, 4,
1140 faddCode)
1141 # FADDP (scalar)
# Reuses faddCode through the pairwise-scalar helper; the D form is
# registered for uint32_t only and the Q form for uint64_t only.
1142 twoRegPairwiseScInstX("faddp", "FaddpScDX", "SimdFloatAddOp",
1143 ("uint32_t",), 2, faddCode)
1144 twoRegPairwiseScInstX("faddp", "FaddpScQX", "SimdFloatAddOp",
1145 ("uint64_t",), 4, faddCode)
1146 # FADDP (vector)
# Same snippet again, registered with pairwise=True.
1147 threeEqualRegInstX("faddp", "FaddpDX", "SimdFloatAddOp", smallFloatTypes,
1148 2, faddCode, pairwise=True)
1149 threeEqualRegInstX("faddp", "FaddpQX", "SimdFloatAddOp", floatTypes, 4,
1150 faddCode, pairwise=True)
1151 # FCMEQ (register)
# fpCmpOp: one-slot template (slot = fplibCompare suffix) comparing srcElem1
# with srcElem2; the result is -1 when the comparison holds, else 0.
# Reused by FCMGE / FCMGT (register).
1152 fpCmpOp = fpOp % ("fplibCompare%s<Element>(srcElem1, srcElem2, fpscr) ?"
1153 " -1 : 0")
1154 fcmeqCode = fpCmpOp % "EQ"
1155 threeEqualRegInstX("fcmeq", "FcmeqDX", "SimdFloatCmpOp", smallFloatTypes,
1156 2, fcmeqCode)
1157 threeEqualRegInstX("fcmeq", "FcmeqQX", "SimdFloatCmpOp", floatTypes, 4,
1158 fcmeqCode)
1159 threeEqualRegInstX("fcmeq", "FcmeqScX", "SimdFloatCmpOp", floatTypes, 4,
1160 fcmeqCode, scalar=True)
1161 # FCMEQ (zero)
# fpCmpZeroOp: same idea, but the second comparison operand is the literal
# 0. Reused by FCMGE / FCMGT (zero).
1162 fpCmpZeroOp = fpOp % "fplibCompare%s<Element>(srcElem1, 0, fpscr) ? -1 : 0"
1163 fcmeqZeroCode = fpCmpZeroOp % "EQ"
1164 twoEqualRegInstX("fcmeq", "FcmeqZeroDX", "SimdFloatCmpOp", smallFloatTypes,
1165 2, fcmeqZeroCode)
1166 twoEqualRegInstX("fcmeq", "FcmeqZeroQX", "SimdFloatCmpOp", floatTypes, 4,
1167 fcmeqZeroCode)
1168 twoEqualRegInstX("fcmeq", "FcmeqZeroScX", "SimdFloatCmpOp", floatTypes, 4,
1169 fcmeqZeroCode, scalar=True)
1170 # FCMGE (register)
# The register forms below fill the fpCmpOp template and the zero forms
# fill fpCmpZeroOp; both templates are defined in the FCMEQ sections.
1171 fcmgeCode = fpCmpOp % "GE"
1172 threeEqualRegInstX("fcmge", "FcmgeDX", "SimdFloatCmpOp", smallFloatTypes,
1173 2, fcmgeCode)
1174 threeEqualRegInstX("fcmge", "FcmgeQX", "SimdFloatCmpOp", floatTypes, 4,
1175 fcmgeCode)
1176 threeEqualRegInstX("fcmge", "FcmgeScX", "SimdFloatCmpOp", floatTypes, 4,
1177 fcmgeCode, scalar=True)
1178 # FCMGE (zero)
1179 fcmgeZeroCode = fpCmpZeroOp % "GE"
1180 twoEqualRegInstX("fcmge", "FcmgeZeroDX", "SimdFloatCmpOp", smallFloatTypes,
1181 2, fcmgeZeroCode)
1182 twoEqualRegInstX("fcmge", "FcmgeZeroQX", "SimdFloatCmpOp", floatTypes, 4,
1183 fcmgeZeroCode)
1184 twoEqualRegInstX("fcmge", "FcmgeZeroScX", "SimdFloatCmpOp", floatTypes, 4,
1185 fcmgeZeroCode, scalar=True)
1186 # FCMGT (register)
1187 fcmgtCode = fpCmpOp % "GT"
1188 threeEqualRegInstX("fcmgt", "FcmgtDX", "SimdFloatCmpOp", smallFloatTypes,
1189 2, fcmgtCode)
1190 threeEqualRegInstX("fcmgt", "FcmgtQX", "SimdFloatCmpOp", floatTypes, 4,
1191 fcmgtCode)
1192 threeEqualRegInstX("fcmgt", "FcmgtScX", "SimdFloatCmpOp", floatTypes, 4,
1193 fcmgtCode, scalar=True)
1194 # FCMGT (zero)
1195 fcmgtZeroCode = fpCmpZeroOp % "GT"
1196 twoEqualRegInstX("fcmgt", "FcmgtZeroDX", "SimdFloatCmpOp", smallFloatTypes,
1197 2, fcmgtZeroCode)
1198 twoEqualRegInstX("fcmgt", "FcmgtZeroQX", "SimdFloatCmpOp", floatTypes, 4,
1199 fcmgtZeroCode)
1200 twoEqualRegInstX("fcmgt", "FcmgtZeroScX", "SimdFloatCmpOp", floatTypes, 4,
1201 fcmgtZeroCode, scalar=True)
1202 # FCMLE (zero)
# fpCmpRevZeroOp swaps the operand order relative to the other zero
# comparisons: it compares the literal 0 against srcElem1. "Less-or-equal
# zero" is therefore expressed as GE(0, srcElem1), and "less-than zero"
# below as GT(0, srcElem1).
1203 fpCmpRevZeroOp = fpOp % ("fplibCompare%s<Element>(0, srcElem1, fpscr) ?"
1204 " -1 : 0")
1205 fcmleZeroCode = fpCmpRevZeroOp % "GE"
1206 twoEqualRegInstX("fcmle", "FcmleZeroDX", "SimdFloatCmpOp", smallFloatTypes,
1207 2, fcmleZeroCode)
1208 twoEqualRegInstX("fcmle", "FcmleZeroQX", "SimdFloatCmpOp", floatTypes, 4,
1209 fcmleZeroCode)
1210 twoEqualRegInstX("fcmle", "FcmleZeroScX", "SimdFloatCmpOp", floatTypes, 4,
1211 fcmleZeroCode, scalar=True)
1212 # FCMLT (zero)
1213 fcmltZeroCode = fpCmpRevZeroOp % "GT"
1214 twoEqualRegInstX("fcmlt", "FcmltZeroDX", "SimdFloatCmpOp", smallFloatTypes,
1215 2, fcmltZeroCode)
1216 twoEqualRegInstX("fcmlt", "FcmltZeroQX", "SimdFloatCmpOp", floatTypes, 4,
1217 fcmltZeroCode)
1218 twoEqualRegInstX("fcmlt", "FcmltZeroScX", "SimdFloatCmpOp", floatTypes, 4,
1219 fcmltZeroCode, scalar=True)
1220 # FCVTAS
# fcvtCode is a three-slot template around fplibFPToFixed<Element, Element>,
# shared by all FCVT?S / FCVT?U sections. In this file the slots are filled
# with:
#   1) "0" for the integer forms or "imm" for the fixed-point forms,
#   2) "false" for the S (signed) mnemonics and "true" for the U (unsigned)
#      ones,
#   3) the FPRounding_* constant.
# NOTE(review): the parameter names on the fplibFPToFixed side are not
# visible here - confirm slot 1 is the fraction-bit count and slot 2 the
# unsigned flag.
1221 fcvtCode = fpOp % ("fplibFPToFixed<Element, Element>("
1222 "srcElem1, %s, %s, %s, fpscr)")
1223 fcvtasCode = fcvtCode % ("0", "false", "FPRounding_TIEAWAY")
1224 twoEqualRegInstX("fcvtas", "FcvtasDX", "SimdCvtOp", smallFloatTypes, 2,
1225 fcvtasCode)
1226 twoEqualRegInstX("fcvtas", "FcvtasQX", "SimdCvtOp", floatTypes, 4,
1227 fcvtasCode)
1228 twoEqualRegInstX("fcvtas", "FcvtasScX", "SimdCvtOp", floatTypes, 4,
1229 fcvtasCode, scalar=True)
1230 # FCVTAU
1231 fcvtauCode = fcvtCode % ("0", "true", "FPRounding_TIEAWAY")
1232 twoEqualRegInstX("fcvtau", "FcvtauDX", "SimdCvtOp", smallFloatTypes, 2,
1233 fcvtauCode)
1234 twoEqualRegInstX("fcvtau", "FcvtauQX", "SimdCvtOp", floatTypes, 4,
1235 fcvtauCode)
1236 twoEqualRegInstX("fcvtau", "FcvtauScX", "SimdCvtOp", floatTypes, 4,
1237 fcvtauCode, scalar=True)
1238 # FCVTL, FCVTL2
# Widening conversion: fplibConvert<Element, BigElement> using the rounding
# mode obtained from FPCRRounding(fpscr). Both classes use the mnemonic
# "fcvtl"; the second-half variant is selected with hi=True.
1239 fcvtlCode = fpOp % ("fplibConvert<Element, BigElement>("
1240 "srcElem1, FPCRRounding(fpscr), fpscr)")
1241 twoRegLongInstX("fcvtl", "FcvtlX", "SimdCvtOp", ("uint16_t", "uint32_t"),
1242 fcvtlCode)
1243 twoRegLongInstX("fcvtl", "Fcvtl2X", "SimdCvtOp", ("uint16_t", "uint32_t"),
1244 fcvtlCode, hi=True)
1245 # FCVTMS
# Integer conversions rounding with FPRounding_NEGINF; they fill the
# fcvtCode template defined in the FCVTAS section ("false" for the signed
# form, "true" for the unsigned one).
1246 fcvtmsCode = fcvtCode % ("0", "false", "FPRounding_NEGINF")
1247 twoEqualRegInstX("fcvtms", "FcvtmsDX", "SimdCvtOp", smallFloatTypes, 2,
1248 fcvtmsCode)
1249 twoEqualRegInstX("fcvtms", "FcvtmsQX", "SimdCvtOp", floatTypes, 4,
1250 fcvtmsCode)
1251 twoEqualRegInstX("fcvtms", "FcvtmsScX", "SimdCvtOp", floatTypes, 4,
1252 fcvtmsCode, scalar=True)
1253 # FCVTMU
1254 fcvtmuCode = fcvtCode % ("0", "true", "FPRounding_NEGINF")
1255 twoEqualRegInstX("fcvtmu", "FcvtmuDX", "SimdCvtOp", smallFloatTypes, 2,
1256 fcvtmuCode)
1257 twoEqualRegInstX("fcvtmu", "FcvtmuQX", "SimdCvtOp", floatTypes, 4,
1258 fcvtmuCode)
1259 twoEqualRegInstX("fcvtmu", "FcvtmuScX", "SimdCvtOp", floatTypes, 4,
1260 fcvtmuCode, scalar=True)
1261 # FCVTN, FCVTN2
# Narrowing conversion: fplibConvert<BigElement, Element> using the
# rounding mode obtained from FPCRRounding(fpscr). hi=True selects the
# second-half ("2") variant; both classes use the mnemonic "fcvtn".
1262 fcvtnCode = fpOp % ("fplibConvert<BigElement, Element>("
1263 "srcElem1, FPCRRounding(fpscr), fpscr)")
1264 twoRegNarrowInstX("fcvtn", "FcvtnX", "SimdCvtOp",
1265 ("uint16_t", "uint32_t"), fcvtnCode)
1266 twoRegNarrowInstX("fcvtn", "Fcvtn2X", "SimdCvtOp",
1267 ("uint16_t", "uint32_t"), fcvtnCode, hi=True)
1268 # FCVTNS
# The four sections below fill the fcvtCode template (defined in the FCVTAS
# section) with "0" as first slot, "false"/"true" for the S/U mnemonic, and
# FPRounding_TIEEVEN (FCVTN?) or FPRounding_POSINF (FCVTP?).
1269 fcvtnsCode = fcvtCode % ("0", "false", "FPRounding_TIEEVEN")
1270 twoEqualRegInstX("fcvtns", "FcvtnsDX", "SimdCvtOp", smallFloatTypes, 2,
1271 fcvtnsCode)
1272 twoEqualRegInstX("fcvtns", "FcvtnsQX", "SimdCvtOp", floatTypes, 4,
1273 fcvtnsCode)
1274 twoEqualRegInstX("fcvtns", "FcvtnsScX", "SimdCvtOp", floatTypes, 4,
1275 fcvtnsCode, scalar=True)
1276 # FCVTNU
1277 fcvtnuCode = fcvtCode % ("0", "true", "FPRounding_TIEEVEN")
1278 twoEqualRegInstX("fcvtnu", "FcvtnuDX", "SimdCvtOp", smallFloatTypes, 2,
1279 fcvtnuCode)
1280 twoEqualRegInstX("fcvtnu", "FcvtnuQX", "SimdCvtOp", floatTypes, 4,
1281 fcvtnuCode)
1282 twoEqualRegInstX("fcvtnu", "FcvtnuScX", "SimdCvtOp", floatTypes, 4,
1283 fcvtnuCode, scalar=True)
1284 # FCVTPS
1285 fcvtpsCode = fcvtCode % ("0", "false", "FPRounding_POSINF")
1286 twoEqualRegInstX("fcvtps", "FcvtpsDX", "SimdCvtOp", smallFloatTypes, 2,
1287 fcvtpsCode)
1288 twoEqualRegInstX("fcvtps", "FcvtpsQX", "SimdCvtOp", floatTypes, 4,
1289 fcvtpsCode)
1290 twoEqualRegInstX("fcvtps", "FcvtpsScX", "SimdCvtOp", floatTypes, 4,
1291 fcvtpsCode, scalar=True)
1292 # FCVTPU
1293 fcvtpuCode = fcvtCode % ("0", "true", "FPRounding_POSINF")
1294 twoEqualRegInstX("fcvtpu", "FcvtpuDX", "SimdCvtOp", smallFloatTypes, 2,
1295 fcvtpuCode)
1296 twoEqualRegInstX("fcvtpu", "FcvtpuQX", "SimdCvtOp", floatTypes, 4,
1297 fcvtpuCode)
1298 twoEqualRegInstX("fcvtpu", "FcvtpuScX", "SimdCvtOp", floatTypes, 4,
1299 fcvtpuCode, scalar=True)
1300 # FCVTXN, FCVTXN2
# Narrowing conversion like FCVTN, but with the rounding mode fixed to
# FPRounding_ODD instead of being taken from fpscr. Registered as a plain,
# a hi=True and a scalar=True variant, all over smallFloatTypes.
1301 fcvtxnCode = fpOp % ("fplibConvert<BigElement, Element>("
1302 "srcElem1, FPRounding_ODD, fpscr)")
1303 twoRegNarrowInstX("fcvtxn", "FcvtxnX", "SimdCvtOp", smallFloatTypes,
1304 fcvtxnCode)
1305 twoRegNarrowInstX("fcvtxn", "Fcvtxn2X", "SimdCvtOp", smallFloatTypes,
1306 fcvtxnCode, hi=True)
1307 twoRegNarrowInstX("fcvtxn", "FcvtxnScX", "SimdCvtOp", smallFloatTypes,
1308 fcvtxnCode, scalar=True)
1309 # FCVTZS (fixed-point)
# All four sections below round with FPRounding_ZERO and fill the fcvtCode
# template defined in the FCVTAS section. The fixed-point forms pass "imm"
# as the first slot and register with hasImm=True; the integer forms pass
# "0" and take no immediate.
1310 fcvtzsCode = fcvtCode % ("imm", "false", "FPRounding_ZERO")
1311 twoEqualRegInstX("fcvtzs", "FcvtzsFixedDX", "SimdCvtOp", smallFloatTypes,
1312 2, fcvtzsCode, hasImm=True)
1313 twoEqualRegInstX("fcvtzs", "FcvtzsFixedQX", "SimdCvtOp", floatTypes, 4,
1314 fcvtzsCode, hasImm=True)
1315 twoEqualRegInstX("fcvtzs", "FcvtzsFixedScX", "SimdCvtOp", floatTypes, 4,
1316 fcvtzsCode, hasImm=True, scalar=True)
1317 # FCVTZS (integer)
1318 fcvtzsIntCode = fcvtCode % ("0", "false", "FPRounding_ZERO")
1319 twoEqualRegInstX("fcvtzs", "FcvtzsIntDX", "SimdCvtOp", smallFloatTypes,
1320 2, fcvtzsIntCode)
1321 twoEqualRegInstX("fcvtzs", "FcvtzsIntQX", "SimdCvtOp", floatTypes, 4,
1322 fcvtzsIntCode)
1323 twoEqualRegInstX("fcvtzs", "FcvtzsIntScX", "SimdCvtOp", floatTypes, 4,
1324 fcvtzsIntCode, scalar=True)
1325 # FCVTZU (fixed-point)
1326 fcvtzuCode = fcvtCode % ("imm", "true", "FPRounding_ZERO")
1327 twoEqualRegInstX("fcvtzu", "FcvtzuFixedDX", "SimdCvtOp", smallFloatTypes,
1328 2, fcvtzuCode, hasImm=True)
1329 twoEqualRegInstX("fcvtzu", "FcvtzuFixedQX", "SimdCvtOp", floatTypes, 4,
1330 fcvtzuCode, hasImm=True)
1331 twoEqualRegInstX("fcvtzu", "FcvtzuFixedScX", "SimdCvtOp", floatTypes, 4,
1332 fcvtzuCode, hasImm=True, scalar=True)
1333 # FCVTZU (integer)
1334 fcvtzuIntCode = fcvtCode % ("0", "true", "FPRounding_ZERO")
1335 twoEqualRegInstX("fcvtzu", "FcvtzuIntDX", "SimdCvtOp", smallFloatTypes, 2,
1336 fcvtzuIntCode)
1337 twoEqualRegInstX("fcvtzu", "FcvtzuIntQX", "SimdCvtOp", floatTypes, 4,
1338 fcvtzuIntCode)
1339 twoEqualRegInstX("fcvtzu", "FcvtzuIntScX", "SimdCvtOp", floatTypes, 4,
1340 fcvtzuIntCode, scalar=True)
1341 # FDIV
# Fills the fpBinOp template (defined in the FADD section) with "Div",
# i.e. fplibDiv<Element>(srcElem1, srcElem2, fpscr).
1342 fdivCode = fpBinOp % "Div"
1343 threeEqualRegInstX("fdiv", "FdivDX", "SimdFloatDivOp", smallFloatTypes, 2,
1344 fdivCode)
1345 threeEqualRegInstX("fdiv", "FdivQX", "SimdFloatDivOp", floatTypes, 4,
1346 fdivCode)
1347 # FMAX
# fmaxCode / fmaxnmCode fill the fpBinOp template (defined in the FADD
# section) with "Max" / "MaxNum". The same snippets are reused by the
# pairwise scalar and pairwise vector registrations further down.
1348 fmaxCode = fpBinOp % "Max"
1349 threeEqualRegInstX("fmax", "FmaxDX", "SimdFloatCmpOp", smallFloatTypes, 2,
1350 fmaxCode)
1351 threeEqualRegInstX("fmax", "FmaxQX", "SimdFloatCmpOp", floatTypes, 4,
1352 fmaxCode)
1353 # FMAXNM
1354 fmaxnmCode = fpBinOp % "MaxNum"
1355 threeEqualRegInstX("fmaxnm", "FmaxnmDX", "SimdFloatCmpOp", smallFloatTypes,
1356 2, fmaxnmCode)
1357 threeEqualRegInstX("fmaxnm", "FmaxnmQX", "SimdFloatCmpOp", floatTypes, 4,
1358 fmaxnmCode)
1359 # FMAXNMP (scalar)
1360 twoRegPairwiseScInstX("fmaxnmp", "FmaxnmpScDX", "SimdFloatCmpOp",
1361 ("uint32_t",), 2, fmaxnmCode)
1362 twoRegPairwiseScInstX("fmaxnmp", "FmaxnmpScQX", "SimdFloatCmpOp",
1363 ("uint64_t",), 4, fmaxnmCode)
1364 # FMAXNMP (vector)
1365 threeEqualRegInstX("fmaxnmp", "FmaxnmpDX", "SimdFloatCmpOp",
1366 smallFloatTypes, 2, fmaxnmCode, pairwise=True)
1367 threeEqualRegInstX("fmaxnmp", "FmaxnmpQX", "SimdFloatCmpOp", floatTypes, 4,
1368 fmaxnmCode, pairwise=True)
1369 # FMAXNMV
1370 # Note: SimdFloatCmpOp can be a bit optimistic here
# fpAcrossOp: one-slot template for the across-vector reductions. It
# combines the running destElem with srcElem1 through
# fplib<Suffix><Element>. Reused by FMAXV and by the FMIN* across-vector
# sections. The across forms are registered for uint32_t, Q form only.
1371 fpAcrossOp = fpOp % "fplib%s<Element>(destElem, srcElem1, fpscr)"
1372 fmaxnmAcrossCode = fpAcrossOp % "MaxNum"
1373 twoRegAcrossInstX("fmaxnmv", "FmaxnmvQX", "SimdFloatCmpOp", ("uint32_t",),
1374 4, fmaxnmAcrossCode)
1375 # FMAXP (scalar)
1376 twoRegPairwiseScInstX("fmaxp", "FmaxpScDX", "SimdFloatCmpOp",
1377 ("uint32_t",), 2, fmaxCode)
1378 twoRegPairwiseScInstX("fmaxp", "FmaxpScQX", "SimdFloatCmpOp",
1379 ("uint64_t",), 4, fmaxCode)
1380 # FMAXP (vector)
1381 threeEqualRegInstX("fmaxp", "FmaxpDX", "SimdFloatCmpOp", smallFloatTypes,
1382 2, fmaxCode, pairwise=True)
1383 threeEqualRegInstX("fmaxp", "FmaxpQX", "SimdFloatCmpOp", floatTypes, 4,
1384 fmaxCode, pairwise=True)
1385 # FMAXV
1386 # Note: SimdFloatCmpOp can be a bit optimistic here
1387 fmaxAcrossCode = fpAcrossOp % "Max"
1388 twoRegAcrossInstX("fmaxv", "FmaxvQX", "SimdFloatCmpOp", ("uint32_t",), 4,
1389 fmaxAcrossCode)
1390 # FMIN
# Mirror image of the FMAX family: fminCode / fminnmCode fill fpBinOp with
# "Min" / "MinNum", and the across-vector forms fill fpAcrossOp (defined in
# the FMAXNMV section) with "MinNum" / "Min".
1391 fminCode = fpBinOp % "Min"
1392 threeEqualRegInstX("fmin", "FminDX", "SimdFloatCmpOp", smallFloatTypes, 2,
1393 fminCode)
1394 threeEqualRegInstX("fmin", "FminQX", "SimdFloatCmpOp", floatTypes, 4,
1395 fminCode)
1396 # FMINNM
1397 fminnmCode = fpBinOp % "MinNum"
1398 threeEqualRegInstX("fminnm", "FminnmDX", "SimdFloatCmpOp", smallFloatTypes,
1399 2, fminnmCode)
1400 threeEqualRegInstX("fminnm", "FminnmQX", "SimdFloatCmpOp", floatTypes, 4,
1401 fminnmCode)
1402 # FMINNMP (scalar)
1403 twoRegPairwiseScInstX("fminnmp", "FminnmpScDX", "SimdFloatCmpOp",
1404 ("uint32_t",), 2, fminnmCode)
1405 twoRegPairwiseScInstX("fminnmp", "FminnmpScQX", "SimdFloatCmpOp",
1406 ("uint64_t",), 4, fminnmCode)
1407 # FMINNMP (vector)
1408 threeEqualRegInstX("fminnmp", "FminnmpDX", "SimdFloatCmpOp",
1409 smallFloatTypes, 2, fminnmCode, pairwise=True)
1410 threeEqualRegInstX("fminnmp", "FminnmpQX", "SimdFloatCmpOp", floatTypes, 4,
1411 fminnmCode, pairwise=True)
1412 # FMINNMV
1413 # Note: SimdFloatCmpOp can be a bit optimistic here
1414 fminnmAcrossCode = fpAcrossOp % "MinNum"
1415 twoRegAcrossInstX("fminnmv", "FminnmvQX", "SimdFloatCmpOp", ("uint32_t",),
1416 4, fminnmAcrossCode)
1417 # FMINP (scalar)
1418 twoRegPairwiseScInstX("fminp", "FminpScDX", "SimdFloatCmpOp",
1419 ("uint32_t",), 2, fminCode)
1420 twoRegPairwiseScInstX("fminp", "FminpScQX", "SimdFloatCmpOp",
1421 ("uint64_t",), 4, fminCode)
1422 # FMINP (vector)
1423 threeEqualRegInstX("fminp", "FminpDX", "SimdFloatCmpOp", smallFloatTypes,
1424 2, fminCode, pairwise=True)
1425 threeEqualRegInstX("fminp", "FminpQX", "SimdFloatCmpOp", floatTypes, 4,
1426 fminCode, pairwise=True)
1427 # FMINV
1428 # Note: SimdFloatCmpOp can be a bit optimistic here
1429 fminAcrossCode = fpAcrossOp % "Min"
1430 twoRegAcrossInstX("fminv", "FminvQX", "SimdFloatCmpOp", ("uint32_t",), 4,
1431 fminAcrossCode)
1432 # FMLA (by element)
# Multiply-accumulate: fplibMulAdd<Element>(destElem, srcElem1, srcElem2,
# fpscr). The snippet reads destElem as the addend.
# NOTE(review): the positional True passed after the code argument
# presumably tells the helper to read the destination register first
# (readDest) - confirm against threeEqualRegInstX.
1433 fmlaCode = fpOp % ("fplibMulAdd<Element>("
1434 "destElem, srcElem1, srcElem2, fpscr)")
1435 threeEqualRegInstX("fmla", "FmlaElemDX", "SimdFloatMultAccOp",
1436 smallFloatTypes, 2, fmlaCode, True, byElem=True)
1437 threeEqualRegInstX("fmla", "FmlaElemQX", "SimdFloatMultAccOp", floatTypes,
1438 4, fmlaCode, True, byElem=True)
1439 threeEqualRegInstX("fmla", "FmlaElemScX", "SimdFloatMultAccOp", floatTypes,
1440 4, fmlaCode, True, byElem=True, scalar=True)
1441 # FMLA (vector)
1442 threeEqualRegInstX("fmla", "FmlaDX", "SimdFloatMultAccOp", smallFloatTypes,
1443 2, fmlaCode, True)
1444 threeEqualRegInstX("fmla", "FmlaQX", "SimdFloatMultAccOp", floatTypes, 4,
1445 fmlaCode, True)
1446 # FMLS (by element)
# Multiply-subtract is expressed as a multiply-add with srcElem1 negated
# through fplibNeg before the fused operation.
1447 fmlsCode = fpOp % ("fplibMulAdd<Element>(destElem,"
1448 " fplibNeg<Element>(srcElem1), srcElem2, fpscr)")
1449 threeEqualRegInstX("fmls", "FmlsElemDX", "SimdFloatMultAccOp",
1450 smallFloatTypes, 2, fmlsCode, True, byElem=True)
1451 threeEqualRegInstX("fmls", "FmlsElemQX", "SimdFloatMultAccOp", floatTypes,
1452 4, fmlsCode, True, byElem=True)
1453 threeEqualRegInstX("fmls", "FmlsElemScX", "SimdFloatMultAccOp", floatTypes,
1454 4, fmlsCode, True, byElem=True, scalar=True)
1455 # FMLS (vector)
1456 threeEqualRegInstX("fmls", "FmlsDX", "SimdFloatMultAccOp", smallFloatTypes,
1457 2, fmlsCode, True)
1458 threeEqualRegInstX("fmls", "FmlsQX", "SimdFloatMultAccOp", floatTypes, 4,
1459 fmlsCode, True)
1460 # FMOV
# Immediate move: every destination element is set to imm. No fpOp wrapper
# is used, so the snippet does not touch FpscrExc.
1461 fmovCode = 'destElem = imm;'
1462 oneRegImmInstX("fmov", "FmovDX", "SimdMiscOp", smallFloatTypes, 2,
1463 fmovCode)
1464 oneRegImmInstX("fmov", "FmovQX", "SimdMiscOp", floatTypes, 4, fmovCode)
1465 # FMUL (by element)
# fmulCode / fmulxCode fill the fpBinOp template (defined in the FADD
# section) with "Mul" / "MulX". Each snippet is shared by the by-element
# and the vector registrations.
1466 fmulCode = fpBinOp % "Mul"
1467 threeEqualRegInstX("fmul", "FmulElemDX", "SimdFloatMultOp",
1468 smallFloatTypes, 2, fmulCode, byElem=True)
1469 threeEqualRegInstX("fmul", "FmulElemQX", "SimdFloatMultOp", floatTypes, 4,
1470 fmulCode, byElem=True)
1471 threeEqualRegInstX("fmul", "FmulElemScX", "SimdFloatMultOp", floatTypes, 4,
1472 fmulCode, byElem=True, scalar=True)
1473 # FMUL (vector)
1474 threeEqualRegInstX("fmul", "FmulDX", "SimdFloatMultOp", smallFloatTypes, 2,
1475 fmulCode)
1476 threeEqualRegInstX("fmul", "FmulQX", "SimdFloatMultOp", floatTypes, 4,
1477 fmulCode)
1478 # FMULX
1479 fmulxCode = fpBinOp % "MulX"
1480 threeEqualRegInstX("fmulx", "FmulxDX", "SimdFloatMultOp", smallFloatTypes,
1481 2, fmulxCode)
1482 threeEqualRegInstX("fmulx", "FmulxQX", "SimdFloatMultOp", floatTypes, 4,
1483 fmulxCode)
1484 threeEqualRegInstX("fmulx", "FmulxScX", "SimdFloatMultOp", floatTypes, 4,
1485 fmulxCode, scalar=True)
1486 # FMULX (by element)
1487 threeEqualRegInstX("fmulx", "FmulxElemDX", "SimdFloatMultOp",
1488 smallFloatTypes, 2, fmulxCode, byElem=True)
1489 threeEqualRegInstX("fmulx", "FmulxElemQX", "SimdFloatMultOp", floatTypes,
1490 4, fmulxCode, byElem=True)
1491 threeEqualRegInstX("fmulx", "FmulxElemScX", "SimdFloatMultOp", floatTypes,
1492 4, fmulxCode, byElem=True, scalar=True)
1493 # FNEG
1494 fnegCode = fpOp % "fplibNeg<Element>(srcElem1)"
1495 twoEqualRegInstX("Neg", "FnegDX", "SimdFloatAluOp", smallFloatTypes, 2,
1496 fnegCode)
1497 twoEqualRegInstX("Neg", "FnegQX", "SimdFloatAluOp", floatTypes, 4,
1498 fnegCode)
1499 # FRECPE
# Reciprocal estimate: fplibRecipEstimate<Element>(srcElem1, fpscr).
1500 frecpeCode = fpOp % "fplibRecipEstimate<Element>(srcElem1, fpscr)"
1501 twoEqualRegInstX("frecpe", "FrecpeDX", "SimdFloatMultAccOp",
1502 smallFloatTypes, 2, frecpeCode)
1503 twoEqualRegInstX("frecpe", "FrecpeQX", "SimdFloatMultAccOp", floatTypes, 4,
1504 frecpeCode)
1505 twoEqualRegInstX("frecpe", "FrecpeScX", "SimdFloatMultAccOp", floatTypes,
1506 4, frecpeCode, scalar=True)
1507 # FRECPS
# Reciprocal step: fills fpBinOp (defined in the FADD section) with
# "RecipStepFused".
1508 frecpsCode = fpBinOp % "RecipStepFused"
1509 threeEqualRegInstX("frecps", "FrecpsDX", "SimdFloatMultAccOp",
1510 smallFloatTypes, 2, frecpsCode)
1511 threeEqualRegInstX("frecps", "FrecpsQX", "SimdFloatMultAccOp", floatTypes,
1512 4, frecpsCode)
1513 threeEqualRegInstX("frecps", "FrecpsScX", "SimdFloatMultAccOp", floatTypes,
1514 4, frecpsCode, scalar=True)
1515 # FRECPX
# Only a scalar variant is registered for this instruction.
1516 frecpxCode = fpOp % "fplibRecpX<Element>(srcElem1, fpscr)"
1517 twoEqualRegInstX("frecpx", "FrecpxX", "SimdFloatMultAccOp", floatTypes, 4,
1518 frecpxCode, scalar=True)
1519 # FRINTA
# frintCode is a two-slot template around fplibRoundInt<Element>, shared by
# all FRINT* sections: slot 1 is the rounding mode, slot 2 a boolean that
# is "true" only for FRINTX below.
# NOTE(review): slot 2 is presumably the "exact" flag of fplibRoundInt -
# confirm against its declaration.
# FRINTI and FRINTX take the rounding mode from FPCRRounding(fpscr); the
# others use a fixed FPRounding_* constant.
1520 frintCode = fpOp % "fplibRoundInt<Element>(srcElem1, %s, %s, fpscr)"
1521 frintaCode = frintCode % ("FPRounding_TIEAWAY", "false")
1522 twoEqualRegInstX("frinta", "FrintaDX", "SimdCvtOp", smallFloatTypes, 2,
1523 frintaCode)
1524 twoEqualRegInstX("frinta", "FrintaQX", "SimdCvtOp", floatTypes, 4,
1525 frintaCode)
1526 # FRINTI
1527 frintiCode = frintCode % ("FPCRRounding(fpscr)", "false")
1528 twoEqualRegInstX("frinti", "FrintiDX", "SimdCvtOp", smallFloatTypes, 2,
1529 frintiCode)
1530 twoEqualRegInstX("frinti", "FrintiQX", "SimdCvtOp", floatTypes, 4,
1531 frintiCode)
1532 # FRINTM
1533 frintmCode = frintCode % ("FPRounding_NEGINF", "false")
1534 twoEqualRegInstX("frintm", "FrintmDX", "SimdCvtOp", smallFloatTypes, 2,
1535 frintmCode)
1536 twoEqualRegInstX("frintm", "FrintmQX", "SimdCvtOp", floatTypes, 4,
1537 frintmCode)
1538 # FRINTN
1539 frintnCode = frintCode % ("FPRounding_TIEEVEN", "false")
1540 twoEqualRegInstX("frintn", "FrintnDX", "SimdCvtOp", smallFloatTypes, 2,
1541 frintnCode)
1542 twoEqualRegInstX("frintn", "FrintnQX", "SimdCvtOp", floatTypes, 4,
1543 frintnCode)
1544 # FRINTP
1545 frintpCode = frintCode % ("FPRounding_POSINF", "false")
1546 twoEqualRegInstX("frintp", "FrintpDX", "SimdCvtOp", smallFloatTypes, 2,
1547 frintpCode)
1548 twoEqualRegInstX("frintp", "FrintpQX", "SimdCvtOp", floatTypes, 4,
1549 frintpCode)
1550 # FRINTX
1551 frintxCode = frintCode % ("FPCRRounding(fpscr)", "true")
1552 twoEqualRegInstX("frintx", "FrintxDX", "SimdCvtOp", smallFloatTypes, 2,
1553 frintxCode)
1554 twoEqualRegInstX("frintx", "FrintxQX", "SimdCvtOp", floatTypes, 4,
1555 frintxCode)
1556 # FRINTZ
1557 frintzCode = frintCode % ("FPRounding_ZERO", "false")
1558 twoEqualRegInstX("frintz", "FrintzDX", "SimdCvtOp", smallFloatTypes, 2,
1559 frintzCode)
1560 twoEqualRegInstX("frintz", "FrintzQX", "SimdCvtOp", floatTypes, 4,
1561 frintzCode)
1562 # FRSQRTE
# Reciprocal square-root estimate: fplibRSqrtEstimate<Element>(srcElem1,
# fpscr).
1563 frsqrteCode = fpOp % "fplibRSqrtEstimate<Element>(srcElem1, fpscr)"
1564 twoEqualRegInstX("frsqrte", "FrsqrteDX", "SimdFloatSqrtOp",
1565 smallFloatTypes, 2, frsqrteCode)
1566 twoEqualRegInstX("frsqrte", "FrsqrteQX", "SimdFloatSqrtOp", floatTypes, 4,
1567 frsqrteCode)
1568 twoEqualRegInstX("frsqrte", "FrsqrteScX", "SimdFloatSqrtOp", floatTypes, 4,
1569 frsqrteCode, scalar=True)
1570 # FRSQRTS
# Reciprocal square-root step: fills fpBinOp (defined in the FADD section)
# with "RSqrtStepFused". Registered under op class "SimdFloatMiscOp".
1571 frsqrtsCode = fpBinOp % "RSqrtStepFused"
1572 threeEqualRegInstX("frsqrts", "FrsqrtsDX", "SimdFloatMiscOp",
1573 smallFloatTypes, 2, frsqrtsCode)
1574 threeEqualRegInstX("frsqrts", "FrsqrtsQX", "SimdFloatMiscOp", floatTypes,
1575 4, frsqrtsCode)
1576 threeEqualRegInstX("frsqrts", "FrsqrtsScX", "SimdFloatMiscOp", floatTypes,
1577 4, frsqrtsCode, scalar=True)
1578 # FSQRT
1579 fsqrtCode = fpOp % "fplibSqrt<Element>(srcElem1, fpscr)"
1580 twoEqualRegInstX("fsqrt", "FsqrtDX", "SimdFloatSqrtOp", smallFloatTypes, 2,
1581 fsqrtCode)
1582 twoEqualRegInstX("fsqrt", "FsqrtQX", "SimdFloatSqrtOp", floatTypes, 4,
1583 fsqrtCode)
1584 # FSUB
# Fills the fpBinOp template (defined in the FADD section) with "Sub",
# i.e. fplibSub<Element>(srcElem1, srcElem2, fpscr).
1585 fsubCode = fpBinOp % "Sub"
1586 threeEqualRegInstX("fsub", "FsubDX", "SimdFloatAddOp", smallFloatTypes, 2,
1587 fsubCode)
1588 threeEqualRegInstX("fsub", "FsubQX", "SimdFloatAddOp", floatTypes, 4,
1589 fsubCode)
1590 # INS (element)
# These helpers take no code snippet; the behaviour comes entirely from
# insFromVecElemInstX / insFromGprInstX (defined outside this excerpt).
1591 insFromVecElemInstX("ins", "InsElemX", "SimdMiscOp", unsignedTypes, 4)
1592 # INS (general register)
# The 'W' form is registered over smallUnsignedTypes and the 'X' form over
# unsignedTypes.
# NOTE(review): 'W' / 'X' presumably name the general-purpose register
# width - confirm against insFromGprInstX.
1593 insFromGprInstX("ins", "InsGprWX", "SimdMiscOp", smallUnsignedTypes, 4,
1594 'W')
1595 insFromGprInstX("ins", "InsGprXX", "SimdMiscOp", unsignedTypes, 4, 'X')
1596 # MLA (by element)
# Integer multiply-accumulate: the product of the two sources is added to
# the existing destElem. The by-element forms are registered for uint16_t
# and uint32_t only; the vector forms use smallUnsignedTypes.
# NOTE(review): the positional True after the code argument presumably
# makes the helper read the destination first (readDest) - confirm against
# threeEqualRegInstX.
1597 mlaCode = "destElem += srcElem1 * srcElem2;"
1598 threeEqualRegInstX("mla", "MlaElemDX", "SimdMultAccOp",
1599 ("uint16_t", "uint32_t"), 2, mlaCode, True, byElem=True)
1600 threeEqualRegInstX("mla", "MlaElemQX", "SimdMultAccOp",
1601 ("uint16_t", "uint32_t"), 4, mlaCode, True, byElem=True)
1602 # MLA (vector)
1603 threeEqualRegInstX("mla", "MlaDX", "SimdMultAccOp", smallUnsignedTypes, 2,
1604 mlaCode, True)
1605 threeEqualRegInstX("mla", "MlaQX", "SimdMultAccOp", smallUnsignedTypes, 4,
1606 mlaCode, True)
1607 # MLS (by element)
# Same as MLA but the product is subtracted from destElem.
1608 mlsCode = "destElem -= srcElem1 * srcElem2;"
1609 threeEqualRegInstX("mls", "MlsElemDX", "SimdMultAccOp",
1610 ("uint16_t", "uint32_t"), 2, mlsCode, True, byElem=True)
1611 threeEqualRegInstX("mls", "MlsElemQX", "SimdMultAccOp",
1612 ("uint16_t", "uint32_t"), 4, mlsCode, True, byElem=True)
1613 # MLS (vector)
1614 threeEqualRegInstX("mls", "MlsDX", "SimdMultAccOp", smallUnsignedTypes, 2,
1615 mlsCode, True)
1616 threeEqualRegInstX("mls", "MlsQX", "SimdMultAccOp", smallUnsignedTypes, 4,
1617 mlsCode, True)
1618 # MOV (element) -> alias to INS (element)
1619 # MOV (from general) -> alias to INS (general register)
1620 # MOV (scalar) -> alias to DUP (element)
1621 # MOV (to general) -> alias to UMOV
1622 # MOV (vector) -> alias to ORR (register)
# The MOV forms above are aliases, so nothing is registered for them here.
1623 # MOVI
# Immediate move: each destination element is set to imm. Registered for
# uint64_t elements only.
1624 movImmCode = "destElem = imm;"
1625 oneRegImmInstX("movi", "MoviDX", "SimdMiscOp", ("uint64_t",), 2,
1626 movImmCode)
1627 oneRegImmInstX("movi", "MoviQX", "SimdMiscOp", ("uint64_t",), 4,
1628 movImmCode)
1629 # MUL (by element)
# Integer multiply. The by-element forms are registered for uint16_t and
# uint32_t only; the vector forms use smallUnsignedTypes.
1630 mulCode = "destElem = srcElem1 * srcElem2;"
1631 threeEqualRegInstX("mul", "MulElemDX", "SimdMultOp",
1632 ("uint16_t", "uint32_t"), 2, mulCode, byElem=True)
1633 threeEqualRegInstX("mul", "MulElemQX", "SimdMultOp",
1634 ("uint16_t", "uint32_t"), 4, mulCode, byElem=True)
1635 # MUL (vector)
1636 threeEqualRegInstX("mul", "MulDX", "SimdMultOp", smallUnsignedTypes, 2,
1637 mulCode)
1638 threeEqualRegInstX("mul", "MulQX", "SimdMultOp", smallUnsignedTypes, 4,
1639 mulCode)
1640 # MVN
# Bitwise NOT of the source. Registered for uint64_t elements only.
1641 mvnCode = "destElem = ~srcElem1;"
1642 twoEqualRegInstX("mvn", "MvnDX", "SimdAluOp", ("uint64_t",), 2, mvnCode)
1643 twoEqualRegInstX("mvn", "MvnQX", "SimdAluOp", ("uint64_t",), 4, mvnCode)
1644 # MVNI
# Bitwise NOT of the immediate.
1645 mvniCode = "destElem = ~imm;"
1646 oneRegImmInstX("mvni", "MvniDX", "SimdAluOp", ("uint64_t",), 2, mvniCode)
1647 oneRegImmInstX("mvni", "MvniQX", "SimdAluOp", ("uint64_t",), 4, mvniCode)
1648 # NEG
# Integer negation, registered over signedTypes.
1649 negCode = "destElem = -srcElem1;"
1650 twoEqualRegInstX("neg", "NegDX", "SimdAluOp", signedTypes, 2, negCode)
1651 twoEqualRegInstX("neg", "NegQX", "SimdAluOp", signedTypes, 4, negCode)
1652 # NOT -> alias to MVN
1653 # ORN
# Bitwise OR of srcElem1 with the complement of srcElem2.
1654 ornCode = "destElem = srcElem1 | ~srcElem2;"
1655 threeEqualRegInstX("orn", "OrnDX", "SimdAluOp", ("uint64_t",), 2, ornCode)
1656 threeEqualRegInstX("orn", "OrnQX", "SimdAluOp", ("uint64_t",), 4, ornCode)
1657 # ORR (immediate)
# ORs the immediate into the existing destElem, so the snippet reads the
# destination.
# NOTE(review): the trailing positional True is presumably the readDest
# flag - confirm against oneRegImmInstX.
1658 orrImmCode = "destElem |= imm;"
1659 oneRegImmInstX("orr", "OrrImmDX", "SimdAluOp", ("uint64_t",), 2,
1660 orrImmCode, True)
1661 oneRegImmInstX("orr", "OrrImmQX", "SimdAluOp", ("uint64_t",), 4,
1662 orrImmCode, True)
1663 # ORR (register)
1664 orrCode = "destElem = srcElem1 | srcElem2;"
1665 threeEqualRegInstX("orr", "OrrDX", "SimdAluOp", ("uint64_t",), 2, orrCode)
1666 threeEqualRegInstX("orr", "OrrQX", "SimdAluOp", ("uint64_t",), 4, orrCode)
1667 # PMUL
# Carry-less (XOR-accumulate) multiply: for every set bit j of srcElem2,
# srcElem1 shifted left by j is XORed into destElem. The result is kept at
# Element width. Registered for uint8_t only.
1668 pmulCode = '''
1669 destElem = 0;
1670 for (unsigned j = 0; j < sizeof(Element) * 8; j++) {
1671 if (bits(srcElem2, j))
1672 destElem ^= srcElem1 << j;
1673 }
1674 '''
1675 threeEqualRegInstX("pmul", "PmulDX", "SimdMultOp", ("uint8_t",), 2,
1676 pmulCode)
1677 threeEqualRegInstX("pmul", "PmulQX", "SimdMultOp", ("uint8_t",), 4,
1678 pmulCode)
1679 # PMULL, PMULL2
1680 # Note: 64-bit PMULL is not available (Crypto. Extension)
# Same loop as PMUL, but srcElem1 is cast to BigElement before the shift so
# the shifted term is computed at the wider type. Both classes use the
# mnemonic "pmull"; hi=True selects the "2" variant.
1681 pmullCode = '''
1682 destElem = 0;
1683 for (unsigned j = 0; j < sizeof(Element) * 8; j++) {
1684 if (bits(srcElem2, j))
1685 destElem ^= (BigElement)srcElem1 << j;
1686 }
1687 '''
1688 threeRegLongInstX("pmull", "PmullX", "SimdMultOp", ("uint8_t",), pmullCode)
1689 threeRegLongInstX("pmull", "Pmull2X", "SimdMultOp", ("uint8_t",),
1690 pmullCode, hi=True)
1691 # RADDHN, RADDHN2
# Rounding add, keeping the high half: both sources are cast to BigElement
# and summed together with the constant 1 << (bits(Element) - 1); the sum
# is then shifted right by bits(Element).
1692 raddhnCode = '''
1693 destElem = ((BigElement)srcElem1 + (BigElement)srcElem2 +
1694 ((BigElement)1 << (sizeof(Element) * 8 - 1))) >>
1695 (sizeof(Element) * 8);
1696 '''
1697 threeRegNarrowInstX("raddhn", "RaddhnX", "SimdAddOp", smallUnsignedTypes,
1698 raddhnCode)
1699 threeRegNarrowInstX("raddhn2", "Raddhn2X", "SimdAddOp", smallUnsignedTypes,
1700 raddhnCode, hi=True)
1701 # RBIT
# Bit reversal within each element: on iteration i the current low bit of
# temp is placed at position (bits(Element) - 1 - i) of destElem, then temp
# is shifted right by one. Registered for uint8_t only.
1702 rbitCode = '''
1703 destElem = 0;
1704 Element temp = srcElem1;
1705 for (int i = 0; i < 8 * sizeof(Element); i++) {
1706 destElem = destElem | ((temp & 0x1) <<
1707 (8 * sizeof(Element) - 1 - i));
1708 temp >>= 1;
1709 }
1710 '''
1711 twoEqualRegInstX("rbit", "RbitDX", "SimdAluOp", ("uint8_t",), 2, rbitCode)
1712 twoEqualRegInstX("rbit", "RbitQX", "SimdAluOp", ("uint8_t",), 4, rbitCode)
1713 # REV16
# The three REV snippets differ only in the group size in bytes: 1 << 1,
# 1 << 2 and 1 << 3. groupSize is the number of elements per group, and
# j = i ^ (groupSize - 1) mirrors an index within its group.
# NOTE(review): i and j are not declared in the snippet; presumably the
# enclosing template supplies i as the source index and uses j as the
# destination index - confirm against twoEqualRegInstX.
1714 rev16Code = '''
1715 destElem = srcElem1;
1716 unsigned groupSize = ((1 << 1) / sizeof(Element));
1717 unsigned reverseMask = (groupSize - 1);
1718 j = i ^ reverseMask;
1719 '''
1720 twoEqualRegInstX("rev16", "Rev16DX", "SimdAluOp", ("uint8_t",), 2,
1721 rev16Code)
1722 twoEqualRegInstX("rev16", "Rev16QX", "SimdAluOp", ("uint8_t",), 4,
1723 rev16Code)
1724 # REV32
1725 rev32Code = '''
1726 destElem = srcElem1;
1727 unsigned groupSize = ((1 << 2) / sizeof(Element));
1728 unsigned reverseMask = (groupSize - 1);
1729 j = i ^ reverseMask;
1730 '''
1731 twoEqualRegInstX("rev32", "Rev32DX", "SimdAluOp", ("uint8_t", "uint16_t"),
1732 2, rev32Code)
1733 twoEqualRegInstX("rev32", "Rev32QX", "SimdAluOp", ("uint8_t", "uint16_t"),
1734 4, rev32Code)
1735 # REV64
1736 rev64Code = '''
1737 destElem = srcElem1;
1738 unsigned groupSize = ((1 << 3) / sizeof(Element));
1739 unsigned reverseMask = (groupSize - 1);
1740 j = i ^ reverseMask;
1741 '''
1742 twoEqualRegInstX("rev64", "Rev64DX", "SimdAluOp", smallUnsignedTypes, 2,
1743 rev64Code)
1744 twoEqualRegInstX("rev64", "Rev64QX", "SimdAluOp", smallUnsignedTypes, 4,
1745 rev64Code)
1746 # RSHRN, RSHRN2
# Rounding shift right by imm:
#   - imm larger than the bit width of srcElem1: result is 0;
#   - imm non-zero: rBit is bit (imm - 1) of srcElem1, and the result is
#     srcElem1 shifted right by imm plus rBit. The shift is done in two
#     steps, (imm - 1) and then 1.
#     NOTE(review): presumably this avoids a single shift by the full
#     width when imm == sizeof(srcElem1) * 8, which the first branch lets
#     through.
#   - imm == 0: plain copy.
1747 rshrnCode = '''
1748 if (imm > sizeof(srcElem1) * 8) {
1749 destElem = 0;
1750 } else if (imm) {
1751 Element rBit = bits(srcElem1, imm - 1);
1752 destElem = ((srcElem1 >> (imm - 1)) >> 1) + rBit;
1753 } else {
1754 destElem = srcElem1;
1755 }
1756 '''
1757 twoRegNarrowInstX("rshrn", "RshrnX", "SimdShiftOp", smallUnsignedTypes,
1758 rshrnCode, hasImm=True)
1759 twoRegNarrowInstX("rshrn2", "Rshrn2X", "SimdShiftOp", smallUnsignedTypes,
1760 rshrnCode, hasImm=True, hi=True)
1761 # RSUBHN, RSUBHN2
# Rounding subtract, keeping the high half: same formula as RADDHN with the
# addition of the sources replaced by a subtraction.
# NOTE(review): these registrations use smallTypes, whereas RADDHN uses
# smallUnsignedTypes - confirm the difference is intended.
1762 rsubhnCode = '''
1763 destElem = ((BigElement)srcElem1 - (BigElement)srcElem2 +
1764 ((BigElement)1 << (sizeof(Element) * 8 - 1))) >>
1765 (sizeof(Element) * 8);
1766 '''
1767 threeRegNarrowInstX("rsubhn", "RsubhnX", "SimdAddOp", smallTypes,
1768 rsubhnCode)
1769 threeRegNarrowInstX("rsubhn2", "Rsubhn2X", "SimdAddOp", smallTypes,
1770 rsubhnCode, hi=True)
1771 # SABA
# Absolute difference, accumulated: |srcElem1 - srcElem2| is computed by
# subtracting the smaller operand from the larger one and added to
# destElem.
# NOTE(review): the positional True after the code argument presumably
# requests a read of the destination (readDest) - confirm against
# threeEqualRegInstX / threeRegLongInstX.
1772 abaCode = '''
1773 destElem += (srcElem1 > srcElem2) ? (srcElem1 - srcElem2) :
1774 (srcElem2 - srcElem1);
1775 '''
1776 threeEqualRegInstX("saba", "SabaDX", "SimdAddAccOp", smallSignedTypes, 2,
1777 abaCode, True)
1778 threeEqualRegInstX("saba", "SabaQX", "SimdAddAccOp", smallSignedTypes, 4,
1779 abaCode, True)
1780 # SABAL, SABAL2
# Long form: operands are cast to BigElement before the subtraction.
1781 abalCode = '''
1782 destElem += (srcElem1 > srcElem2) ?
1783 ((BigElement)srcElem1 - (BigElement)srcElem2) :
1784 ((BigElement)srcElem2 - (BigElement)srcElem1);
1785 '''
1786 threeRegLongInstX("sabal", "SabalX", "SimdAddAccOp", smallSignedTypes,
1787 abalCode, True)
1788 threeRegLongInstX("sabal2", "Sabal2X", "SimdAddAccOp", smallSignedTypes,
1789 abalCode, True, hi=True)
1790 # SABD
# Non-accumulating absolute difference (plain assignment to destElem).
1791 abdCode = '''
1792 destElem = (srcElem1 > srcElem2) ? (srcElem1 - srcElem2) :
1793 (srcElem2 - srcElem1);
1794 '''
1795 threeEqualRegInstX("sabd", "SabdDX", "SimdAddOp", smallSignedTypes, 2,
1796 abdCode)
1797 threeEqualRegInstX("sabd", "SabdQX", "SimdAddOp", smallSignedTypes, 4,
1798 abdCode)
1799 # SABDL, SABDL2
# NOTE(review): abdlCode overwrites destElem rather than accumulating, yet
# these registrations pass the same positional True and "SimdAddAccOp" op
# class as SABAL, unlike SABD above - confirm whether that is intended.
1800 abdlCode = '''
1801 destElem = (srcElem1 > srcElem2) ?
1802 ((BigElement)srcElem1 - (BigElement)srcElem2) :
1803 ((BigElement)srcElem2 - (BigElement)srcElem1);
1804 '''
1805 threeRegLongInstX("sabdl", "SabdlX", "SimdAddAccOp", smallSignedTypes,
1806 abdlCode, True)
1807 threeRegLongInstX("sabdl2", "Sabdl2X", "SimdAddAccOp", smallSignedTypes,
1808 abdlCode, True, hi=True)
1809 # SADALP
# Adds the widened sum of srcElem1 and srcElem2 to the existing destElem.
1810 adalpCode = "destElem += (BigElement)srcElem1 + (BigElement)srcElem2;"
# Instantiated with size arguments 2 and 4; the trailing True accompanies the
# accumulating (+=) snippet.
1811 twoRegCondenseInstX("sadalp", "SadalpDX", "SimdAddOp", smallSignedTypes, 2,
1812 adalpCode, True)
1813 twoRegCondenseInstX("sadalp", "SadalpQX", "SimdAddOp", smallSignedTypes, 4,
1814 adalpCode, True)
1815 # SADDL, SADDL2
# Widening add: both operands are cast to BigElement before being summed.
# The same string is reused further down by the SADDLP and SADDW sections.
1816 addlwCode = "destElem = (BigElement)srcElem1 + (BigElement)srcElem2;"
1817 threeRegLongInstX("saddl", "SaddlX", "SimdAddAccOp", smallSignedTypes,
1818 addlwCode)
1819 threeRegLongInstX("saddl2", "Saddl2X", "SimdAddAccOp", smallSignedTypes,
1820 addlwCode, hi=True)
1821 # SADDLP
# Reuses addlwCode (the widening add defined in the SADDL section) with the
# "condense" template; instantiated with size arguments 2 and 4.
1822 twoRegCondenseInstX("saddlp", "SaddlpDX", "SimdAddOp", smallSignedTypes, 2,
1823 addlwCode)
1824 twoRegCondenseInstX("saddlp", "SaddlpQX", "SimdAddOp", smallSignedTypes, 4,
1825 addlwCode)
1826 # SADDLV
1827 # Note: SimdAddOp can be a bit optimistic here
# Each source element is widened to BigElement and accumulated into destElem.
1828 addAcrossLongCode = "destElem += (BigElement)srcElem1;"
# 8- and 16-bit element types are instantiated for both size arguments (2 and
# 4); the 32-bit element type gets its own class, which additionally passes
# doubleDest=True.
1829 twoRegAcrossInstX("saddlv", "SaddlvDX", "SimdAddOp", ("int8_t", "int16_t"),
1830 2, addAcrossLongCode, long=True)
1831 twoRegAcrossInstX("saddlv", "SaddlvQX", "SimdAddOp", ("int8_t", "int16_t"),
1832 4, addAcrossLongCode, long=True)
1833 twoRegAcrossInstX("saddlv", "SaddlvBQX", "SimdAddOp", ("int32_t",), 4,
1834 addAcrossLongCode, doubleDest=True, long=True)
1835 # SADDW, SADDW2
# Reuses addlwCode (the widening add defined in the SADDL section) with the
# "wide" template; the "2" form adds hi=True.
1836 threeRegWideInstX("saddw", "SaddwX", "SimdAddAccOp", smallSignedTypes,
1837 addlwCode)
1838 threeRegWideInstX("saddw2", "Saddw2X", "SimdAddAccOp", smallSignedTypes,
1839 addlwCode, hi=True)
1840 # SCVTF (fixed-point)
# fpOp (defined elsewhere in this file) is filled with a call to
# fplibFixedToFP that passes imm as its second argument. The %d left in the
# string is the width of the signed integer type the source element is cast
# to; each instantiation below fills it in with "% 32" or "% 64".
1841 scvtfFixedCode = fpOp % ("fplibFixedToFP<Element>((int%d_t) srcElem1, imm,"
1842 " false, FPCRRounding(fpscr), fpscr)")
# The 32-bit forms use smallFloatTypes and the 64-bit forms use ("uint64_t",);
# the last two instantiations are the scalar=True variants.
1843 twoEqualRegInstX("scvtf", "ScvtfFixedDX", "SimdCvtOp", smallFloatTypes, 2,
1844 scvtfFixedCode % 32, hasImm=True)
1845 twoEqualRegInstX("scvtf", "ScvtfFixedSQX", "SimdCvtOp", smallFloatTypes, 4,
1846 scvtfFixedCode % 32, hasImm=True)
1847 twoEqualRegInstX("scvtf", "ScvtfFixedDQX", "SimdCvtOp", ("uint64_t",), 4,
1848 scvtfFixedCode % 64, hasImm=True)
1849 twoEqualRegInstX("scvtf", "ScvtfFixedScSX", "SimdCvtOp", smallFloatTypes,
1850 4, scvtfFixedCode % 32, hasImm=True, scalar=True)
1851 twoEqualRegInstX("scvtf", "ScvtfFixedScDX", "SimdCvtOp", ("uint64_t",), 4,
1852 scvtfFixedCode % 64, hasImm=True, scalar=True)
1853 # SCVTF (integer)
# Same fplibFixedToFP call as the fixed-point form, but with a literal 0 in
# place of the immediate. The %d is the signed integer width used for the
# source cast and is filled in per instantiation with "% 32" or "% 64".
1854 scvtfIntCode = fpOp % ("fplibFixedToFP<Element>((int%d_t) srcElem1, 0,"
1855 " false, FPCRRounding(fpscr), fpscr)")
# The 32-bit forms use smallFloatTypes and the 64-bit forms use ("uint64_t",);
# the last two instantiations are the scalar=True variants.
1856 twoEqualRegInstX("scvtf", "ScvtfIntDX", "SimdCvtOp", smallFloatTypes, 2,
1857 scvtfIntCode % 32)
1858 twoEqualRegInstX("scvtf", "ScvtfIntSQX", "SimdCvtOp", smallFloatTypes, 4,
1859 scvtfIntCode % 32)
1860 twoEqualRegInstX("scvtf", "ScvtfIntDQX", "SimdCvtOp", ("uint64_t",), 4,
1861 scvtfIntCode % 64)
1862 twoEqualRegInstX("scvtf", "ScvtfIntScSX", "SimdCvtOp", smallFloatTypes, 4,
1863 scvtfIntCode % 32, scalar=True)
1864 twoEqualRegInstX("scvtf", "ScvtfIntScDX", "SimdCvtOp", ("uint64_t",), 4,
1865 scvtfIntCode % 64, scalar=True)
1866 # SHADD
# Halving add computed without widening: each operand has its low bit masked
# off and is halved separately, and carryBit restores the carry produced by
# adding the two discarded low bits.
1867 haddCode = '''
1868 Element carryBit =
1869 (((unsigned)srcElem1 & 0x1) +
1870 ((unsigned)srcElem2 & 0x1)) >> 1;
1871 // Use division instead of a shift to ensure the sign extension works
1872 // right. The compiler will figure out if it can be a shift. Mask the
1873 // inputs so they get truncated correctly.
1874 destElem = (((srcElem1 & ~(Element)1) / 2) +
1875 ((srcElem2 & ~(Element)1) / 2)) + carryBit;
1876 '''
# Instantiated with size arguments 2 and 4 (the DX and QX classes).
1877 threeEqualRegInstX("shadd", "ShaddDX", "SimdAddOp", smallSignedTypes, 2,
1878 haddCode)
1879 threeEqualRegInstX("shadd", "ShaddQX", "SimdAddOp", smallSignedTypes, 4,
1880 haddCode)
1881 # SHL
# Left shift by immediate. When imm is at least the element width, the shift
# is split into (width - 1) followed by 1 so that no single C++ shift is by
# the full width of the type.
# Note: the name shlCode is rebound later in this file by the SSHL section.
1882 shlCode = '''
1883 if (imm >= sizeof(Element) * 8)
1884 destElem = (srcElem1 << (sizeof(Element) * 8 - 1)) << 1;
1885 else
1886 destElem = srcElem1 << imm;
1887 '''
1888 twoEqualRegInstX("shl", "ShlDX", "SimdShiftOp", unsignedTypes, 2, shlCode,
1889 hasImm=True)
1890 twoEqualRegInstX("shl", "ShlQX", "SimdShiftOp", unsignedTypes, 4, shlCode,
1891 hasImm=True)
1892 # SHLL, SHLL2
1893 shllCode = "destElem = ((BigElement)srcElem1) << (sizeof(Element) * 8);"
1894 twoRegLongInstX("shll", "ShllX", "SimdShiftOp", smallTypes, shllCode)
1895 twoRegLongInstX("shll", "Shll2X", "SimdShiftOp", smallTypes, shllCode,
1896 hi=True)
1897 # SHRN, SHRN2
# Right shift by immediate for the narrowing template: a shift of at least
# the bit width of srcElem1 yields 0, otherwise srcElem1 >> imm is assigned
# to destElem.
1898 shrnCode = '''
1899 if (imm >= sizeof(srcElem1) * 8) {
1900 destElem = 0;
1901 } else {
1902 destElem = srcElem1 >> imm;
1903 }
1904 '''
# The "2" form reuses the snippet and adds hi=True.
1905 twoRegNarrowInstX("shrn", "ShrnX", "SimdShiftOp", smallUnsignedTypes,
1906 shrnCode, hasImm=True)
1907 twoRegNarrowInstX("shrn2", "Shrn2X", "SimdShiftOp", smallUnsignedTypes,
1908 shrnCode, hasImm=True, hi=True)
1909 # SHSUB
# Halving subtract computed without widening: each operand has its low bit
# masked off and is halved separately, and borrowBit accounts for the borrow
# produced by subtracting the two discarded low bits.
1910 hsubCode = '''
1911 Element borrowBit =
1912 (((srcElem1 & 0x1) - (srcElem2 & 0x1)) >> 1) & 0x1;
1913 // Use division instead of a shift to ensure the sign extension works
1914 // right. The compiler will figure out if it can be a shift. Mask the
1915 // inputs so they get truncated correctly.
1916 destElem = (((srcElem1 & ~(Element)1) / 2) -
1917 ((srcElem2 & ~(Element)1) / 2)) - borrowBit;
1918 '''
# Instantiated with size arguments 2 and 4 (the DX and QX classes).
1919 threeEqualRegInstX("shsub", "ShsubDX", "SimdAddOp", smallSignedTypes, 2,
1920 hsubCode)
1921 threeEqualRegInstX("shsub", "ShsubQX", "SimdAddOp", smallSignedTypes, 4,
1922 hsubCode)
1923 # SLI
# Shift-left-and-insert: the shifted source replaces the upper bits of
# destElem while the low imm bits of the old destElem (destElem & mask(imm))
# are kept. A shift of at least the element width leaves destElem unchanged.
1924 sliCode = '''
1925 if (imm >= sizeof(Element) * 8)
1926 destElem = destElem;
1927 else
1928 destElem = (srcElem1 << imm) | (destElem & mask(imm));
1929 '''
# The positional True accompanies a snippet that reads the old destElem.
1930 twoEqualRegInstX("sli", "SliDX", "SimdShiftOp", unsignedTypes, 2, sliCode,
1931 True, hasImm=True)
1932 twoEqualRegInstX("sli", "SliQX", "SimdShiftOp", unsignedTypes, 4, sliCode,
1933 True, hasImm=True)
1934 # SMAX
# Element-wise maximum; srcElem2 is chosen when the operands compare equal.
# The same string is reused by the SMAXP section that follows.
1935 maxCode = "destElem = (srcElem1 > srcElem2) ? srcElem1 : srcElem2;"
1936 threeEqualRegInstX("smax", "SmaxDX", "SimdCmpOp", smallSignedTypes, 2,
1937 maxCode)
1938 threeEqualRegInstX("smax", "SmaxQX", "SimdCmpOp", smallSignedTypes, 4,
1939 maxCode)
1940 # SMAXP
# Pairwise form: reuses maxCode (defined in the SMAX section) and passes
# pairwise=True to the template.
1941 threeEqualRegInstX("smaxp", "SmaxpDX", "SimdCmpOp", smallSignedTypes, 2,
1942 maxCode, pairwise=True)
1943 threeEqualRegInstX("smaxp", "SmaxpQX", "SimdCmpOp", smallSignedTypes, 4,
1944 maxCode, pairwise=True)
1945 # SMAXV
# Running maximum: destElem is seeded when i == 0 and afterwards replaced
# whenever the current source element is larger.
# NOTE(review): i is presumably the element loop index supplied by the
# twoRegAcrossInstX template -- confirm.
1946 maxAcrossCode = '''
1947 if (i == 0 || srcElem1 > destElem)
1948 destElem = srcElem1;
1949 '''
# The size-2 form is limited to 8- and 16-bit elements; the size-4 form uses
# all of smallSignedTypes.
1950 twoRegAcrossInstX("smaxv", "SmaxvDX", "SimdCmpOp", ("int8_t", "int16_t"),
1951 2, maxAcrossCode)
1952 twoRegAcrossInstX("smaxv", "SmaxvQX", "SimdCmpOp", smallSignedTypes, 4,
1953 maxAcrossCode)
1954 # SMIN
# Element-wise minimum; srcElem2 is chosen when the operands compare equal.
# The same string is reused by the SMINP section that follows.
1955 minCode = "destElem = (srcElem1 < srcElem2) ? srcElem1 : srcElem2;"
1956 threeEqualRegInstX("smin", "SminDX", "SimdCmpOp", smallSignedTypes, 2,
1957 minCode)
1958 threeEqualRegInstX("smin", "SminQX", "SimdCmpOp", smallSignedTypes, 4,
1959 minCode)
1960 # SMINP
# Pairwise form: reuses minCode (defined in the SMIN section) and passes
# pairwise=True to the template.
1961 threeEqualRegInstX("sminp", "SminpDX", "SimdCmpOp", smallSignedTypes, 2,
1962 minCode, pairwise=True)
1963 threeEqualRegInstX("sminp", "SminpQX", "SimdCmpOp", smallSignedTypes, 4,
1964 minCode, pairwise=True)
1965 # SMINV
# Running minimum: destElem is seeded when i == 0 and afterwards replaced
# whenever the current source element is smaller.
# NOTE(review): i is presumably the element loop index supplied by the
# twoRegAcrossInstX template -- confirm.
1966 minAcrossCode = '''
1967 if (i == 0 || srcElem1 < destElem)
1968 destElem = srcElem1;
1969 '''
# The size-2 form is limited to 8- and 16-bit elements; the size-4 form uses
# all of smallSignedTypes.
1970 twoRegAcrossInstX("sminv", "SminvDX", "SimdCmpOp", ("int8_t", "int16_t"),
1971 2, minAcrossCode)
1972 twoRegAcrossInstX("sminv", "SminvQX", "SimdCmpOp", smallSignedTypes, 4,
1973 minAcrossCode)
1974
# NOTE(review): presumably asks the ISA parser to start a new 'exec' output
# section at this point so the generated code is spread over several files --
# confirm against the ISA parser's split() helper.
1975 split('exec')
1976
1977 # SMLAL, SMLAL2 (by element)
1978 mlalCode = "destElem += (BigElement)srcElem1 * (BigElement)srcElem2;"
1979 threeRegLongInstX("smlal", "SmlalElemX", "SimdMultAccOp",
1980 ("int16_t", "int32_t"), mlalCode, True, byElem=True)
1981 threeRegLongInstX("smlal", "SmlalElem2X", "SimdMultAccOp",
1982 ("int16_t", "int32_t"), mlalCode, True, byElem=True,
1983 hi=True)
1984 # SMLAL, SMLAL2 (vector)
1985 threeRegLongInstX("smlal", "SmlalX", "SimdMultAccOp", smallSignedTypes,
1986 mlalCode, True)
1987 threeRegLongInstX("smlal", "Smlal2X", "SimdMultAccOp", smallSignedTypes,
1988 mlalCode, True, hi=True)
1989 # SMLSL, SMLSL2 (by element)
1990 mlslCode = "destElem -= (BigElement)srcElem1 * (BigElement)srcElem2;"
1991 threeRegLongInstX("smlsl", "SmlslElemX", "SimdMultAccOp", smallSignedTypes,
1992 mlslCode, True, byElem=True)
1993 threeRegLongInstX("smlsl", "SmlslElem2X", "SimdMultAccOp",
1994 smallSignedTypes, mlslCode, True, byElem=True, hi=True)
1995 # SMLSL, SMLSL2 (vector)
1996 threeRegLongInstX("smlsl", "SmlslX", "SimdMultAccOp", smallSignedTypes,
1997 mlslCode, True)
1998 threeRegLongInstX("smlsl", "Smlsl2X", "SimdMultAccOp", smallSignedTypes,
1999 mlslCode, True, hi=True)
2000 # SMOV
# Two instantiations of the vector-to-general-register move: the 'W' form
# covers 8- and 16-bit elements, the 'X' form covers all of smallSignedTypes.
# NOTE(review): the trailing True presumably requests sign extension --
# confirm against insToGprInstX.
2001 insToGprInstX("smov", "SmovWX", "SimdMiscOp", ("int8_t", "int16_t"), 4,
2002 'W', True)
2003 insToGprInstX("smov", "SmovXX", "SimdMiscOp", smallSignedTypes, 4, 'X',
2004 True)
2005 # SMULL, SMULL2 (by element)
2006 mullCode = "destElem = (BigElement)srcElem1 * (BigElement)srcElem2;"
2007 threeRegLongInstX("smull", "SmullElemX", "SimdMultOp", smallSignedTypes,
2008 mullCode, byElem=True)
2009 threeRegLongInstX("smull", "SmullElem2X", "SimdMultOp", smallSignedTypes,
2010 mullCode, byElem=True, hi=True)
2011 # SMULL, SMULL2 (vector)
2012 threeRegLongInstX("smull", "SmullX", "SimdMultOp", smallSignedTypes,
2013 mullCode)
2014 threeRegLongInstX("smull", "Smull2X", "SimdMultOp", smallSignedTypes,
2015 mullCode, hi=True)
2016 # SQABS
# Saturating absolute value. The most negative Element cannot be negated, so
# it yields ~srcElem1 (the most positive value) and sets fpscr.qc; other
# negative inputs are negated and non-negative inputs pass through. The
# FpscrQc operand is read at the start and written back at the end.
2017 sqabsCode = '''
2018 FPSCR fpscr = (FPSCR) FpscrQc;
2019 if (srcElem1 == (Element)(std::numeric_limits<Element>::min())) {
2020 fpscr.qc = 1;
2021 destElem = ~srcElem1;
2022 } else if (srcElem1 < 0) {
2023 destElem = -srcElem1;
2024 } else {
2025 destElem = srcElem1;
2026 }
2027 FpscrQc = fpscr;
2028 '''
# The size-2 form uses smallSignedTypes; the size-4 and scalar forms use
# signedTypes.
2029 twoEqualRegInstX("sqabs", "SqabsDX", "SimdAluOp", smallSignedTypes, 2,
2030 sqabsCode)
2031 twoEqualRegInstX("sqabs", "SqabsQX", "SimdAluOp", signedTypes, 4,
2032 sqabsCode)
2033 twoEqualRegInstX("sqabs", "SqabsScX", "SimdAluOp", signedTypes, 4,
2034 sqabsCode, scalar=True)
2035 # SQADD
# Saturating add. Overflow is detected from the signs: both sources share a
# sign and the wrapped sum has the opposite one. On overflow destElem is set
# to the Element minimum, or to minimum - 1 (which wraps to the maximum) when
# the wrapped sum was negative, i.e. when both sources were non-negative;
# fpscr.qc is set in either case.
2036 sqaddCode = '''
2037 destElem = srcElem1 + srcElem2;
2038 FPSCR fpscr = (FPSCR) FpscrQc;
2039 bool negDest = (destElem < 0);
2040 bool negSrc1 = (srcElem1 < 0);
2041 bool negSrc2 = (srcElem2 < 0);
2042 if ((negDest != negSrc1) && (negSrc1 == negSrc2)) {
2043 destElem = std::numeric_limits<Element>::min();
2044 if (negDest)
2045 destElem -= 1;
2046 fpscr.qc = 1;
2047 }
2048 FpscrQc = fpscr;
2049 '''
# The size-2 form uses smallSignedTypes; the size-4 and scalar forms use
# signedTypes.
2050 threeEqualRegInstX("sqadd", "SqaddDX", "SimdAddOp", smallSignedTypes, 2,
2051 sqaddCode)
2052 threeEqualRegInstX("sqadd", "SqaddQX", "SimdAddOp", signedTypes, 4,
2053 sqaddCode)
2054 threeEqualRegInstX("sqadd", "SqaddScX", "SimdAddOp", signedTypes, 4,
2055 sqaddCode, scalar=True)
2056 # SQDMLAL, SQDMLAL2 (by element)
2057 qdmlalCode = '''
2058 FPSCR fpscr = (FPSCR) FpscrQc;
2059 BigElement midElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
2060 Element maxNeg = std::numeric_limits<Element>::min();
2061 Element halfNeg = maxNeg / 2;
2062 if ((srcElem1 == maxNeg && srcElem2 == maxNeg) ||
2063 (srcElem1 == halfNeg && srcElem2 == maxNeg) ||
2064 (srcElem1 == maxNeg && srcElem2 == halfNeg)) {
2065 midElem = ~((BigElement)maxNeg << (sizeof(Element) * 8));
2066 fpscr.qc = 1;
2067 }
2068 bool negPreDest = ltz(destElem);
2069 destElem += midElem;
2070 bool negDest = ltz(destElem);
2071 bool negMid = ltz(midElem);
2072 if (negPreDest == negMid && negMid != negDest) {
2073 destElem = mask(sizeof(BigElement) * 8 - 1);
2074 if (negPreDest)
2075 destElem = ~destElem;
2076 fpscr.qc = 1;
2077 }
2078 FpscrQc = fpscr;
2079 '''
2080 threeRegLongInstX("sqdmlal", "SqdmlalElemX", "SimdMultAccOp",
2081 ("int16_t", "int32_t"), qdmlalCode, True, byElem=True)
2082 threeRegLongInstX("sqdmlal", "SqdmlalElem2X", "SimdMultAccOp",
2083 ("int16_t", "int32_t"), qdmlalCode, True, byElem=True,
2084 hi=True)
2085 threeRegLongInstX("sqdmlal", "SqdmlalElemScX", "SimdMultAccOp",
2086 ("int16_t", "int32_t"), qdmlalCode, True, byElem=True,
2087 scalar=True)
2088 # SQDMLAL, SQDMLAL2 (vector)
2089 threeRegLongInstX("sqdmlal", "SqdmlalX", "SimdMultAccOp",
2090 ("int16_t", "int32_t"), qdmlalCode, True)
2091 threeRegLongInstX("sqdmlal", "Sqdmlal2X", "SimdMultAccOp",
2092 ("int16_t", "int32_t"), qdmlalCode, True, hi=True)
2093 threeRegLongInstX("sqdmlal", "SqdmlalScX", "SimdMultAccOp",
2094 ("int16_t", "int32_t"), qdmlalCode, True, scalar=True)
2095 # SQDMLSL, SQDMLSL2 (by element)
2096 qdmlslCode = '''
2097 FPSCR fpscr = (FPSCR) FpscrQc;
2098 BigElement midElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
2099 Element maxNeg = std::numeric_limits<Element>::min();
2100 Element halfNeg = maxNeg / 2;
2101 if ((srcElem1 == maxNeg && srcElem2 == maxNeg) ||
2102 (srcElem1 == halfNeg && srcElem2 == maxNeg) ||
2103 (srcElem1 == maxNeg && srcElem2 == halfNeg)) {
2104 midElem = ~((BigElement)maxNeg << (sizeof(Element) * 8));
2105 fpscr.qc = 1;
2106 }
2107 bool negPreDest = ltz(destElem);
2108 destElem -= midElem;
2109 bool negDest = ltz(destElem);
2110 bool posMid = ltz((BigElement)-midElem);
2111 if (negPreDest == posMid && posMid != negDest) {
2112 destElem = mask(sizeof(BigElement) * 8 - 1);
2113 if (negPreDest)
2114 destElem = ~destElem;
2115 fpscr.qc = 1;
2116 }
2117 FpscrQc = fpscr;
2118 '''
2119 threeRegLongInstX("sqdmlsl", "SqdmlslElemX", "SimdMultAccOp",
2120 ("int16_t", "int32_t"), qdmlslCode, True, byElem=True)
2121 threeRegLongInstX("sqdmlsl", "SqdmlslElem2X", "SimdMultAccOp",
2122 ("int16_t", "int32_t"), qdmlslCode, True, byElem=True,
2123 hi=True)
2124 threeRegLongInstX("sqdmlsl", "SqdmlslElemScX", "SimdMultAccOp",
2125 ("int16_t", "int32_t"), qdmlslCode, True, byElem=True,
2126 scalar=True)
2127 # SQDMLSL, SQDMLSL2 (vector)
2128 threeRegLongInstX("sqdmlsl", "SqdmlslX", "SimdMultAccOp",
2129 ("int16_t", "int32_t"), qdmlslCode, True)
2130 threeRegLongInstX("sqdmlsl", "Sqdmlsl2X", "SimdMultAccOp",
2131 ("int16_t", "int32_t"), qdmlslCode, True, hi=True)
2132 threeRegLongInstX("sqdmlsl", "SqdmlslScX", "SimdMultAccOp",
2133 ("int16_t", "int32_t"), qdmlslCode, True, scalar=True)
2134 # SQDMULH (by element)
# Saturating doubling multiply returning the high half: the doubled product
# is computed in 64 bits and shifted right by the element width. The single
# overflowing input pair -- both sources equal to the most negative Element,
# written here as 1 << (bits - 1) -- yields ~srcElem1 (the most positive
# value) and sets fpscr.qc.
2135 sqdmulhCode = '''
2136 FPSCR fpscr = (FPSCR) FpscrQc;
2137 destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2) >>
2138 (sizeof(Element) * 8);
2139 if (srcElem1 == srcElem2 &&
2140 srcElem1 == (Element)((Element)1 <<
2141 (sizeof(Element) * 8 - 1))) {
2142 destElem = ~srcElem1;
2143 fpscr.qc = 1;
2144 }
2145 FpscrQc = fpscr;
2146 '''
# Only 16- and 32-bit elements are instantiated, for size arguments 2 and 4
# plus a scalar=True variant, in both by-element and vector forms.
2147 threeEqualRegInstX("sqdmulh", "SqdmulhElemDX", "SimdMultOp",
2148 ("int16_t", "int32_t"), 2, sqdmulhCode, byElem=True)
2149 threeEqualRegInstX("sqdmulh", "SqdmulhElemQX", "SimdMultOp",
2150 ("int16_t", "int32_t"), 4, sqdmulhCode, byElem=True)
2151 threeEqualRegInstX("sqdmulh", "SqdmulhElemScX", "SimdMultOp",
2152 ("int16_t", "int32_t"), 4, sqdmulhCode, byElem=True,
2153 scalar=True)
2154 # SQDMULH (vector)
2155 threeEqualRegInstX("sqdmulh", "SqdmulhDX", "SimdMultOp",
2156 ("int16_t", "int32_t"), 2, sqdmulhCode)
2157 threeEqualRegInstX("sqdmulh", "SqdmulhQX", "SimdMultOp",
2158 ("int16_t", "int32_t"), 4, sqdmulhCode)
2159 threeEqualRegInstX("sqdmulh", "SqdmulhScX", "SimdMultOp",
2160 ("int16_t", "int32_t"), 4, sqdmulhCode, scalar=True)
2161 # SQDMULL, SQDMULL2 (by element)
2162 qdmullCode = '''
2163 FPSCR fpscr = (FPSCR) FpscrQc;
2164 destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
2165 if (srcElem1 == srcElem2 &&
2166 srcElem1 == (Element)((Element)1 <<
2167 (Element)(sizeof(Element) * 8 - 1))) {
2168 destElem = ~((BigElement)srcElem1 << (sizeof(Element) * 8));
2169 fpscr.qc = 1;
2170 }
2171 FpscrQc = fpscr;
2172 '''
2173 threeRegLongInstX("sqdmull", "SqdmullElemX", "SimdMultOp",
2174 ("int16_t", "int32_t"), qdmullCode, True, byElem=True)
2175 threeRegLongInstX("sqdmull", "SqdmullElem2X", "SimdMultOp",
2176 ("int16_t", "int32_t"), qdmullCode, True, byElem=True,
2177 hi=True)
2178 threeRegLongInstX("sqdmull", "SqdmullElemScX", "SimdMultOp",
2179 ("int16_t", "int32_t"), qdmullCode, True, byElem=True,
2180 scalar=True)
2181 # SQDMULL, SQDMULL2 (vector)
2182 threeRegLongInstX("sqdmull", "SqdmullX", "SimdMultOp",
2183 ("int16_t", "int32_t"), qdmullCode, True)
2184 threeRegLongInstX("sqdmull", "Sqdmull2X", "SimdMultOp",
2185 ("int16_t", "int32_t"), qdmullCode, True, hi=True)
2186 threeRegLongInstX("sqdmull", "SqdmullScX", "SimdMultOp",
2187 ("int16_t", "int32_t"), qdmullCode, True, scalar=True)
2188 # SQNEG
# Saturating negate. The most negative Element cannot be negated, so it
# yields ~srcElem1 (the most positive value) and sets fpscr.qc; every other
# input is simply negated.
2189 sqnegCode = '''
2190 FPSCR fpscr = (FPSCR) FpscrQc;
2191 if (srcElem1 == (Element)(std::numeric_limits<Element>::min())) {
2192 fpscr.qc = 1;
2193 destElem = ~srcElem1;
2194 } else {
2195 destElem = -srcElem1;
2196 }
2197 FpscrQc = fpscr;
2198 '''
# The size-2 form uses smallSignedTypes; the size-4 and scalar forms use
# signedTypes.
2199 twoEqualRegInstX("sqneg", "SqnegDX", "SimdAluOp", smallSignedTypes, 2,
2200 sqnegCode)
2201 twoEqualRegInstX("sqneg", "SqnegQX", "SimdAluOp", signedTypes, 4,
2202 sqnegCode)
2203 twoEqualRegInstX("sqneg", "SqnegScX", "SimdAluOp", signedTypes, 4,
2204 sqnegCode, scalar=True)
2205 # SQRDMULH (by element)
2206 sqrdmulhCode = '''
2207 FPSCR fpscr = (FPSCR) FpscrQc;
2208 destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2 +
2209 ((int64_t)1 << (sizeof(Element) * 8 - 1))) >>
2210 (sizeof(Element) * 8);
2211 Element maxNeg = std::numeric_limits<Element>::min();
2212 Element halfNeg = maxNeg / 2;
2213 if ((srcElem1 == maxNeg && srcElem2 == maxNeg) ||
2214 (srcElem1 == halfNeg && srcElem2 == maxNeg) ||
2215 (srcElem1 == maxNeg && srcElem2 == halfNeg)) {
2216 if (destElem < 0) {
2217 destElem = mask(sizeof(Element) * 8 - 1);
2218 } else {
2219 destElem = std::numeric_limits<Element>::min();
2220 }
2221 fpscr.qc = 1;
2222 }
2223 FpscrQc = fpscr;
2224 '''
2225 threeEqualRegInstX("sqrdmulh", "SqrdmulhElemDX", "SimdMultOp",
2226 ("int16_t", "int32_t"), 2, sqrdmulhCode, byElem=True)
2227 threeEqualRegInstX("sqrdmulh", "SqrdmulhElemQX", "SimdMultOp",
2228 ("int16_t", "int32_t"), 4, sqrdmulhCode, byElem=True)
2229 threeEqualRegInstX("sqrdmulh", "SqrdmulhElemScX", "SimdMultOp",
2230 ("int16_t", "int32_t"), 4, sqrdmulhCode, byElem=True,
2231 scalar=True)
2232 # SQRDMULH (vector)
2233 threeEqualRegInstX("sqrdmulh", "SqrdmulhDX", "SimdMultOp",
2234 ("int16_t", "int32_t"), 2, sqrdmulhCode)
2235 threeEqualRegInstX("sqrdmulh", "SqrdmulhQX", "SimdMultOp",
2236 ("int16_t", "int32_t"), 4, sqrdmulhCode)
2237 threeEqualRegInstX("sqrdmulh", "SqrdmulhScX", "SimdMultOp",
2238 ("int16_t", "int32_t"), 4, sqrdmulhCode, scalar=True)
2239 # SQRSHL
# Saturating rounding shift by a register amount. The shift count is the low
# byte of srcElem2, interpreted as signed.
#  * Negative count: rounding right shift by the magnitude. rBit is the last
#    bit shifted out, or 1 when the count exceeds the element width and the
#    source is negative. Counts of at least the element width start from 0,
#    and the sign bits are then OR-ed back in by hand before rBit is added.
#  * Positive count: left shift. It saturates (QC set) when the count is at
#    least the element width and the source is non-zero, or when the bits
#    shifted out together with the new sign bit are not all copies of the
#    source sign. The saturated value is the maximum, or its complement for
#    negative sources.
#  * Zero count: the source is copied unchanged.
2240 sqrshlCode = '''
2241 int16_t shiftAmt = (int8_t)srcElem2;
2242 FPSCR fpscr = (FPSCR) FpscrQc;
2243 if (shiftAmt < 0) {
2244 shiftAmt = -shiftAmt;
2245 Element rBit = 0;
2246 if (shiftAmt <= sizeof(Element) * 8)
2247 rBit = bits(srcElem1, shiftAmt - 1);
2248 if (shiftAmt > sizeof(Element) * 8 && srcElem1 < 0)
2249 rBit = 1;
2250 if (shiftAmt >= sizeof(Element) * 8) {
2251 shiftAmt = sizeof(Element) * 8 - 1;
2252 destElem = 0;
2253 } else {
2254 destElem = (srcElem1 >> shiftAmt);
2255 }
2256 // Make sure the right shift sign extended when it should.
2257 if (srcElem1 < 0 && destElem >= 0) {
2258 destElem |= -((Element)1 << (sizeof(Element) * 8 -
2259 1 - shiftAmt));
2260 }
2261 destElem += rBit;
2262 } else if (shiftAmt > 0) {
2263 bool sat = false;
2264 if (shiftAmt >= sizeof(Element) * 8) {
2265 if (srcElem1 != 0)
2266 sat = true;
2267 else
2268 destElem = 0;
2269 } else {
2270 if (bits((uint64_t) srcElem1, sizeof(Element) * 8 - 1,
2271 sizeof(Element) * 8 - 1 - shiftAmt) !=
2272 ((srcElem1 < 0) ? mask(shiftAmt + 1) : 0)) {
2273 sat = true;
2274 } else {
2275 destElem = srcElem1 << shiftAmt;
2276 }
2277 }
2278 if (sat) {
2279 fpscr.qc = 1;
2280 destElem = mask(sizeof(Element) * 8 - 1);
2281 if (srcElem1 < 0)
2282 destElem = ~destElem;
2283 }
2284 } else {
2285 destElem = srcElem1;
2286 }
2287 FpscrQc = fpscr;
2288 '''
# The size-2 form uses smallSignedTypes; the size-4 and scalar forms use
# signedTypes.
# NOTE(review): these are registered with the SimdCmpOp class although the
# operation is a shift -- confirm this is intended.
2289 threeEqualRegInstX("sqrshl", "SqrshlDX", "SimdCmpOp", smallSignedTypes, 2,
2290 sqrshlCode)
2291 threeEqualRegInstX("sqrshl", "SqrshlQX", "SimdCmpOp", signedTypes, 4,
2292 sqrshlCode)
2293 threeEqualRegInstX("sqrshl", "SqrshlScX", "SimdCmpOp", signedTypes, 4,
2294 sqrshlCode, scalar=True)
2295 # SQRSHRN, SQRSHRN2
# Signed saturating rounding shift right with narrowing.
#  * imm larger than the source width: the result is 0, and QC is set unless
#    the source is 0 or -1.
#  * Non-zero imm: the source is shifted by imm - 1, the low bit is saved as
#    the rounding bit, one more shift completes the operation, the sign bits
#    are re-established by hand and the rounding bit is added. If the value
#    does not survive a round trip through Element, it is clamped to the
#    Element maximum (or its complement for negative sources) with QC set.
#  * imm == 0: the source is range-checked and clamped in the same way.
2296 sqrshrnCode = '''
2297 FPSCR fpscr = (FPSCR) FpscrQc;
2298 if (imm > sizeof(srcElem1) * 8) {
2299 if (srcElem1 != 0 && srcElem1 != -1)
2300 fpscr.qc = 1;
2301 destElem = 0;
2302 } else if (imm) {
2303 BigElement mid = (srcElem1 >> (imm - 1));
2304 uint64_t rBit = mid & 0x1;
2305 mid >>= 1;
2306 mid |= -(mid & ((BigElement)1 <<
2307 (sizeof(BigElement) * 8 - 1 - imm)));
2308 mid += rBit;
2309 if (mid != (Element)mid) {
2310 destElem = mask(sizeof(Element) * 8 - 1);
2311 if (srcElem1 < 0)
2312 destElem = ~destElem;
2313 fpscr.qc = 1;
2314 } else {
2315 destElem = mid;
2316 }
2317 } else {
2318 if (srcElem1 != (Element)srcElem1) {
2319 destElem = mask(sizeof(Element) * 8 - 1);
2320 if (srcElem1 < 0)
2321 destElem = ~destElem;
2322 fpscr.qc = 1;
2323 } else {
2324 destElem = srcElem1;
2325 }
2326 }
2327 FpscrQc = fpscr;
2328 '''
# Three instantiations: the base form, the "2" form (hi=True) and a
# scalar=True form.
2329 twoRegNarrowInstX("sqrshrn", "SqrshrnX", "SimdShiftOp", smallSignedTypes,
2330 sqrshrnCode, hasImm=True)
2331 twoRegNarrowInstX("sqrshrn2", "Sqrshrn2X", "SimdShiftOp", smallSignedTypes,
2332 sqrshrnCode, hasImm=True, hi=True)
2333 twoRegNarrowInstX("sqrshrn", "SqrshrnScX", "SimdShiftOp", smallSignedTypes,
2334 sqrshrnCode, hasImm=True, scalar=True)
2335 # SQRSHRUN, SQRSHRUN2
2336 sqrshrunCode = '''
2337 FPSCR fpscr = (FPSCR) FpscrQc;
2338 if (imm > sizeof(srcElem1) * 8) {
2339 if (srcElem1 != 0)
2340 fpscr.qc = 1;
2341 destElem = 0;
2342 } else if (imm) {
2343 BigElement mid = (srcElem1 >> (imm - 1));
2344 uint64_t rBit = mid & 0x1;
2345 mid >>= 1;
2346 mid |= -(mid & ((BigElement)1 <<
2347 (sizeof(BigElement) * 8 - 1 - imm)));
2348 mid += rBit;
2349 if (bits(mid, sizeof(BigElement) * 8 - 1,
2350 sizeof(Element) * 8) != 0) {
2351 if (srcElem1 < 0) {
2352 destElem = 0;
2353 } else {
2354 destElem = mask(sizeof(Element) * 8);
2355 }
2356 fpscr.qc = 1;
2357 } else {
2358 destElem = mid;
2359 }
2360 } else {
2361 if (srcElem1 < 0) {
2362 fpscr.qc = 1;
2363 destElem = 0;
2364 } else {
2365 destElem = srcElem1;
2366 }
2367 }
2368 FpscrQc = fpscr;
2369 '''
2370 twoRegNarrowInstX("sqrshrun", "SqrshrunX", "SimdShiftOp", smallSignedTypes,
2371 sqrshrunCode, hasImm=True)
2372 twoRegNarrowInstX("sqrshrun", "Sqrshrun2X", "SimdShiftOp",
2373 smallSignedTypes, sqrshrunCode, hasImm=True, hi=True)
2374 twoRegNarrowInstX("sqrshrun", "SqrshrunScX", "SimdShiftOp",
2375 smallSignedTypes, sqrshrunCode, hasImm=True, scalar=True)
2376 # SQSHL (immediate)
# Signed saturating left shift by immediate.
#  * imm of at least the element width: any non-zero source saturates (QC
#    set) to the Element minimum, or to its complement for positive sources;
#    a zero source gives 0.
#  * Non-zero imm: the shift is performed, then the top imm + 1 bits of the
#    source are inspected. Unless they are all zeros or all ones, the result
#    saturates in the same way.
#  * imm == 0: the source is copied unchanged.
2377 sqshlImmCode = '''
2378 FPSCR fpscr = (FPSCR) FpscrQc;
2379 if (imm >= sizeof(Element) * 8) {
2380 if (srcElem1 != 0) {
2381 destElem = std::numeric_limits<Element>::min();
2382 if (srcElem1 > 0)
2383 destElem = ~destElem;
2384 fpscr.qc = 1;
2385 } else {
2386 destElem = 0;
2387 }
2388 } else if (imm) {
2389 destElem = (srcElem1 << imm);
2390 uint64_t topBits = bits((uint64_t)srcElem1,
2391 sizeof(Element) * 8 - 1,
2392 sizeof(Element) * 8 - 1 - imm);
2393 if (topBits != 0 && topBits != mask(imm + 1)) {
2394 destElem = std::numeric_limits<Element>::min();
2395 if (srcElem1 > 0)
2396 destElem = ~destElem;
2397 fpscr.qc = 1;
2398 }
2399 } else {
2400 destElem = srcElem1;
2401 }
2402 FpscrQc = fpscr;
2403 '''
# The size-2 form uses smallSignedTypes; the size-4 and scalar forms use
# signedTypes.
2404 twoEqualRegInstX("sqshl", "SqshlImmDX", "SimdAluOp", smallSignedTypes, 2,
2405 sqshlImmCode, hasImm=True)
2406 twoEqualRegInstX("sqshl", "SqshlImmQX", "SimdAluOp", signedTypes, 4,
2407 sqshlImmCode, hasImm=True)
2408 twoEqualRegInstX("sqshl", "SqshlImmScX", "SimdAluOp", signedTypes, 4,
2409 sqshlImmCode, hasImm=True, scalar=True)
2410 # SQSHL (register)
# Signed saturating shift by a register amount. The shift count is the low
# byte of srcElem2, interpreted as signed.
#  * Negative count: right shift by the magnitude, with no rounding. Counts
#    of at least the element width start from 0, and the sign bits are then
#    OR-ed back in by hand.
#  * Positive count: left shift. It saturates (QC set) when the count is at
#    least the element width and the source is non-zero, or when the bits
#    shifted out together with the new sign bit are not all copies of the
#    source sign. The saturated value is the maximum, or its complement for
#    negative sources.
#  * Zero count: the source is copied unchanged.
2411 sqshlCode = '''
2412 int16_t shiftAmt = (int8_t)srcElem2;
2413 FPSCR fpscr = (FPSCR) FpscrQc;
2414 if (shiftAmt < 0) {
2415 shiftAmt = -shiftAmt;
2416 if (shiftAmt >= sizeof(Element) * 8) {
2417 shiftAmt = sizeof(Element) * 8 - 1;
2418 destElem = 0;
2419 } else {
2420 destElem = (srcElem1 >> shiftAmt);
2421 }
2422 // Make sure the right shift sign extended when it should.
2423 if (srcElem1 < 0 && destElem >= 0) {
2424 destElem |= -((Element)1 << (sizeof(Element) * 8 -
2425 1 - shiftAmt));
2426 }
2427 } else if (shiftAmt > 0) {
2428 bool sat = false;
2429 if (shiftAmt >= sizeof(Element) * 8) {
2430 if (srcElem1 != 0)
2431 sat = true;
2432 else
2433 destElem = 0;
2434 } else {
2435 if (bits((uint64_t) srcElem1, sizeof(Element) * 8 - 1,
2436 sizeof(Element) * 8 - 1 - shiftAmt) !=
2437 ((srcElem1 < 0) ? mask(shiftAmt + 1) : 0)) {
2438 sat = true;
2439 } else {
2440 destElem = srcElem1 << shiftAmt;
2441 }
2442 }
2443 if (sat) {
2444 fpscr.qc = 1;
2445 destElem = mask(sizeof(Element) * 8 - 1);
2446 if (srcElem1 < 0)
2447 destElem = ~destElem;
2448 }
2449 } else {
2450 destElem = srcElem1;
2451 }
2452 FpscrQc = fpscr;
2453 '''
# The size-2 form uses smallSignedTypes; the size-4 and scalar forms use
# signedTypes.
2454 threeEqualRegInstX("sqshl", "SqshlDX", "SimdAluOp", smallSignedTypes, 2,
2455 sqshlCode)
2456 threeEqualRegInstX("sqshl", "SqshlQX", "SimdAluOp", signedTypes, 4,
2457 sqshlCode)
2458 threeEqualRegInstX("sqshl", "SqshlScX", "SimdAluOp", signedTypes, 4,
2459 sqshlCode, scalar=True)
2460 # SQSHLU
# Signed-to-unsigned saturating left shift by immediate. In every branch a
# negative source clamps to 0 with QC set.
#  * imm of at least the element width: positive sources clamp to the
#    all-ones Element with QC set; a zero source gives 0.
#  * Non-zero imm: the shift is performed; a non-negative source whose top
#    imm bits are not all zero clamps to the all-ones Element with QC set.
#  * imm == 0: non-negative sources are copied unchanged.
2461 sqshluCode = '''
2462 FPSCR fpscr = (FPSCR) FpscrQc;
2463 if (imm >= sizeof(Element) * 8) {
2464 if (srcElem1 < 0) {
2465 destElem = 0;
2466 fpscr.qc = 1;
2467 } else if (srcElem1 > 0) {
2468 destElem = mask(sizeof(Element) * 8);
2469 fpscr.qc = 1;
2470 } else {
2471 destElem = 0;
2472 }
2473 } else if (imm) {
2474 destElem = (srcElem1 << imm);
2475 uint64_t topBits = bits((uint64_t)srcElem1,
2476 sizeof(Element) * 8 - 1,
2477 sizeof(Element) * 8 - imm);
2478 if (srcElem1 < 0) {
2479 destElem = 0;
2480 fpscr.qc = 1;
2481 } else if (topBits != 0) {
2482 destElem = mask(sizeof(Element) * 8);
2483 fpscr.qc = 1;
2484 }
2485 } else {
2486 if (srcElem1 < 0) {
2487 fpscr.qc = 1;
2488 destElem = 0;
2489 } else {
2490 destElem = srcElem1;
2491 }
2492 }
2493 FpscrQc = fpscr;
2494 '''
# The size-2 form uses smallSignedTypes; the size-4 and scalar forms use
# signedTypes.
2495 twoEqualRegInstX("sqshlu", "SqshluDX", "SimdAluOp", smallSignedTypes, 2,
2496 sqshluCode, hasImm=True)
2497 twoEqualRegInstX("sqshlu", "SqshluQX", "SimdAluOp", signedTypes, 4,
2498 sqshluCode, hasImm=True)
2499 twoEqualRegInstX("sqshlu", "SqshluScX", "SimdAluOp", signedTypes, 4,
2500 sqshluCode, hasImm=True, scalar=True)
2501 # SQSHRN, SQSHRN2
# Signed saturating shift right with narrowing (no rounding).
#  * imm larger than the source width: the result is 0, and QC is set unless
#    the source is 0 or -1.
#  * Non-zero imm: the shift is split into imm - 1 followed by 1, and the
#    sign bits are re-established by hand. If the value does not survive a
#    round trip through Element, it is clamped to the Element maximum (or its
#    complement for negative sources) with QC set.
#  * imm == 0: the source is assigned without a range check.
2502 sqshrnCode = '''
2503 FPSCR fpscr = (FPSCR) FpscrQc;
2504 if (imm > sizeof(srcElem1) * 8) {
2505 if (srcElem1 != 0 && srcElem1 != -1)
2506 fpscr.qc = 1;
2507 destElem = 0;
2508 } else if (imm) {
2509 BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
2510 mid |= -(mid & ((BigElement)1 <<
2511 (sizeof(BigElement) * 8 - 1 - imm)));
2512 if (mid != (Element)mid) {
2513 destElem = mask(sizeof(Element) * 8 - 1);
2514 if (srcElem1 < 0)
2515 destElem = ~destElem;
2516 fpscr.qc = 1;
2517 } else {
2518 destElem = mid;
2519 }
2520 } else {
2521 destElem = srcElem1;
2522 }
2523 FpscrQc = fpscr;
2524 '''
# Three instantiations: the base form, the "2" form (hi=True) and a
# scalar=True form.
2525 twoRegNarrowInstX("sqshrn", "SqshrnX", "SimdShiftOp", smallSignedTypes,
2526 sqshrnCode, hasImm=True)
2527 twoRegNarrowInstX("sqshrn2", "Sqshrn2X", "SimdShiftOp", smallSignedTypes,
2528 sqshrnCode, hasImm=True, hi=True)
2529 twoRegNarrowInstX("sqshrn", "SqshrnScX", "SimdShiftOp", smallSignedTypes,
2530 sqshrnCode, hasImm=True, scalar=True)
2531 # SQSHRUN, SQSHRUN2
2532 sqshrunCode = '''
2533 FPSCR fpscr = (FPSCR) FpscrQc;
2534 if (imm > sizeof(srcElem1) * 8) {
2535 if (srcElem1 != 0)
2536 fpscr.qc = 1;
2537 destElem = 0;
2538 } else if (imm) {
2539 BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
2540 if (bits(mid, sizeof(BigElement) * 8 - 1,
2541 sizeof(Element) * 8) != 0) {
2542 if (srcElem1 < 0) {
2543 destElem = 0;
2544 } else {
2545 destElem = mask(sizeof(Element) * 8);
2546 }
2547 fpscr.qc = 1;
2548 } else {
2549 destElem = mid;
2550 }
2551 } else {
2552 destElem = srcElem1;
2553 }
2554 FpscrQc = fpscr;
2555 '''
2556 twoRegNarrowInstX("sqshrun", "SqshrunX", "SimdShiftOp", smallSignedTypes,
2557 sqshrunCode, hasImm=True)
2558 twoRegNarrowInstX("sqshrun", "Sqshrun2X", "SimdShiftOp", smallSignedTypes,
2559 sqshrunCode, hasImm=True, hi=True)
2560 twoRegNarrowInstX("sqshrun", "SqshrunScX", "SimdShiftOp", smallSignedTypes,
2561 sqshrunCode, hasImm=True, scalar=True)
2562 # SQSUB
# Saturating subtract. Overflow is detected from the signs: the sources have
# different signs (negSrc1 == posSrc2) and the wrapped difference has the
# opposite sign to srcElem1. On overflow destElem is set to the Element
# minimum, or to minimum - 1 (which wraps to the maximum) when the wrapped
# difference was negative; fpscr.qc is set in either case.
2563 sqsubCode = '''
2564 destElem = srcElem1 - srcElem2;
2565 FPSCR fpscr = (FPSCR) FpscrQc;
2566 bool negDest = (destElem < 0);
2567 bool negSrc1 = (srcElem1 < 0);
2568 bool posSrc2 = (srcElem2 >= 0);
2569 if ((negDest != negSrc1) && (negSrc1 == posSrc2)) {
2570 destElem = std::numeric_limits<Element>::min();
2571 if (negDest)
2572 destElem -= 1;
2573 fpscr.qc = 1;
2574 }
2575 FpscrQc = fpscr;
2576 '''
# The size-2 form uses smallSignedTypes; the size-4 and scalar forms use
# signedTypes.
2577 threeEqualRegInstX("sqsub", "SqsubDX", "SimdAddOp", smallSignedTypes, 2,
2578 sqsubCode)
2579 threeEqualRegInstX("sqsub", "SqsubQX", "SimdAddOp", signedTypes, 4,
2580 sqsubCode)
2581 threeEqualRegInstX("sqsub", "SqsubScX", "SimdAddOp", signedTypes, 4,
2582 sqsubCode, scalar=True)
2583 # SQXTN, SQXTN2
2584 sqxtnCode = '''
2585 FPSCR fpscr = (FPSCR) FpscrQc;
2586 destElem = srcElem1;
2587 if ((BigElement)destElem != srcElem1) {
2588 fpscr.qc = 1;
2589 destElem = mask(sizeof(Element) * 8 - 1);
2590 if (srcElem1 < 0)
2591 destElem = ~destElem;
2592 }
2593 FpscrQc = fpscr;
2594 '''
2595 twoRegNarrowInstX("sqxtn", "SqxtnX", "SimdMiscOp", smallSignedTypes,
2596 sqxtnCode)
2597 twoRegNarrowInstX("sqxtn", "Sqxtn2X", "SimdMiscOp", smallSignedTypes,
2598 sqxtnCode, hi=True)
2599 twoRegNarrowInstX("sqxtn", "SqxtnScX", "SimdMiscOp", smallSignedTypes,
2600 sqxtnCode, scalar=True)
2601 # SQXTUN, SQXTUN2
2602 sqxtunCode = '''
2603 FPSCR fpscr = (FPSCR) FpscrQc;
2604 destElem = srcElem1;
2605 if (srcElem1 < 0 ||
2606 ((BigElement)destElem & mask(sizeof(Element) * 8)) != srcElem1) {
2607 fpscr.qc = 1;
2608 destElem = mask(sizeof(Element) * 8);
2609 if (srcElem1 < 0)
2610 destElem = ~destElem;
2611 }
2612 FpscrQc = fpscr;
2613 '''
2614 twoRegNarrowInstX("sqxtun", "SqxtunX", "SimdMiscOp", smallSignedTypes,
2615 sqxtunCode)
2616 twoRegNarrowInstX("sqxtun", "Sqxtun2X", "SimdMiscOp", smallSignedTypes,
2617 sqxtunCode, hi=True)
2618 twoRegNarrowInstX("sqxtun", "SqxtunScX", "SimdMiscOp", smallSignedTypes,
2619 sqxtunCode, scalar=True)
2620 # SRHADD
# Rounding halving add computed without widening: each operand has its low
# bit masked off and is halved separately; carryBit is the carry out of
# (low bit of src1 + low bit of src2 + 1), which supplies the rounding.
2621 rhaddCode = '''
2622 Element carryBit =
2623 (((unsigned)srcElem1 & 0x1) +
2624 ((unsigned)srcElem2 & 0x1) + 1) >> 1;
2625 // Use division instead of a shift to ensure the sign extension works
2626 // right. The compiler will figure out if it can be a shift. Mask the
2627 // inputs so they get truncated correctly.
2628 destElem = (((srcElem1 & ~(Element)1) / 2) +
2629 ((srcElem2 & ~(Element)1) / 2)) + carryBit;
2630 '''
# Instantiated with size arguments 2 and 4 (the DX and QX classes).
2631 threeEqualRegInstX("srhadd", "SrhaddDX", "SimdAddOp", smallSignedTypes, 2,
2632 rhaddCode)
2633 threeEqualRegInstX("srhadd", "SrhaddQX", "SimdAddOp", smallSignedTypes, 4,
2634 rhaddCode)
2635 # SRI
# Shift-right-and-insert: the shifted source replaces the low bits of
# destElem while the top imm bits of the old destElem (those outside
# mask(width - imm)) are kept. A shift of at least the element width leaves
# destElem unchanged.
2636 sriCode = '''
2637 if (imm >= sizeof(Element) * 8)
2638 destElem = destElem;
2639 else
2640 destElem = (srcElem1 >> imm) |
2641 (destElem & ~mask(sizeof(Element) * 8 - imm));
2642 '''
# The positional True accompanies a snippet that reads the old destElem.
2643 twoEqualRegInstX("sri", "SriDX", "SimdShiftOp", unsignedTypes, 2, sriCode,
2644 True, hasImm=True)
2645 twoEqualRegInstX("sri", "SriQX", "SimdShiftOp", unsignedTypes, 4, sriCode,
2646 True, hasImm=True)
2647 # SRSHL
# Rounding shift by a register amount, without saturation. The shift count is
# the low byte of srcElem2, interpreted as signed.
#  * Negative count: rounding right shift by the magnitude. rBit is the last
#    bit shifted out, or 1 when the count exceeds the element width and the
#    source is negative. Counts of at least the element width start from 0,
#    and the sign bits are then OR-ed back in by hand before rBit is added.
#  * Positive count: plain left shift, giving 0 once the count reaches the
#    element width.
#  * Zero count: the source is copied unchanged.
2648 rshlCode = '''
2649 int16_t shiftAmt = (int8_t)srcElem2;
2650 if (shiftAmt < 0) {
2651 shiftAmt = -shiftAmt;
2652 Element rBit = 0;
2653 if (shiftAmt <= sizeof(Element) * 8)
2654 rBit = bits(srcElem1, shiftAmt - 1);
2655 if (shiftAmt > sizeof(Element) * 8 && ltz(srcElem1))
2656 rBit = 1;
2657 if (shiftAmt >= sizeof(Element) * 8) {
2658 shiftAmt = sizeof(Element) * 8 - 1;
2659 destElem = 0;
2660 } else {
2661 destElem = (srcElem1 >> shiftAmt);
2662 }
2663 // Make sure the right shift sign extended when it should.
2664 if (ltz(srcElem1) && !ltz(destElem)) {
2665 destElem |= -((Element)1 << (sizeof(Element) * 8 -
2666 1 - shiftAmt));
2667 }
2668 destElem += rBit;
2669 } else if (shiftAmt > 0) {
2670 if (shiftAmt >= sizeof(Element) * 8) {
2671 destElem = 0;
2672 } else {
2673 destElem = srcElem1 << shiftAmt;
2674 }
2675 } else {
2676 destElem = srcElem1;
2677 }
2678 '''
# Instantiated with size arguments 2 and 4, both over signedTypes.
2679 threeEqualRegInstX("srshl", "SrshlDX", "SimdShiftOp", signedTypes, 2,
2680 rshlCode)
2681 threeEqualRegInstX("srshl", "SrshlQX", "SimdShiftOp", signedTypes, 4,
2682 rshlCode)
# SRSHR
# Right shift by an immediate with rounding: bit (imm - 1) of the source is
# added to the shifted value.  imm == 0 copies the source; imm larger than
# the element width yields zero.  Shared with URSHR below.
rshrCode = '''
    if (imm > sizeof(srcElem1) * 8) {
        destElem = 0;
    } else if (imm) {
        Element rBit = bits(srcElem1, imm - 1);
        destElem = ((srcElem1 >> (imm - 1)) >> 1) + rBit;
    } else {
        destElem = srcElem1;
    }
'''
for _Name, _count in (("SrshrDX", 2), ("SrshrQX", 4)):
    twoEqualRegInstX("srshr", _Name, "SimdShiftOp", signedTypes, _count,
                     rshrCode, hasImm=True)
# SRSRA
# Same rounding right shift as rshrCode, but the result is accumulated into
# destElem instead of overwriting it.  Shared with URSRA below.
rsraCode = '''
    if (imm > sizeof(srcElem1) * 8) {
        destElem += 0;
    } else if (imm) {
        Element rBit = bits(srcElem1, imm - 1);
        destElem += ((srcElem1 >> (imm - 1)) >> 1) + rBit;
    } else {
        destElem += srcElem1;
    }
'''
for _Name, _count in (("SrsraDX", 2), ("SrsraQX", 4)):
    twoEqualRegInstX("srsra", _Name, "SimdShiftOp", signedTypes, _count,
                     rsraCode, True, hasImm=True)
# SSHL
# The shift amount is the signed low byte of srcElem2: negative amounts
# shift right (with an explicit sign-extension fix-up), non-negative amounts
# shift left.  Shifts of the element width or more produce zero before the
# fix-up.  Shared with USHL below.
shlCode = '''
    int16_t shiftAmt = (int8_t)srcElem2;
    if (shiftAmt < 0) {
        shiftAmt = -shiftAmt;
        if (shiftAmt >= sizeof(Element) * 8) {
            shiftAmt = sizeof(Element) * 8 - 1;
            destElem = 0;
        } else {
            destElem = (srcElem1 >> shiftAmt);
        }
        // Make sure the right shift sign extended when it should.
        if (ltz(srcElem1) && !ltz(destElem)) {
            destElem |= -((Element)1 << (sizeof(Element) * 8 -
                                         1 - shiftAmt));
        }
    } else {
        if (shiftAmt >= sizeof(Element) * 8) {
            destElem = 0;
        } else {
            destElem = srcElem1 << shiftAmt;
        }
    }
'''
for _Name, _count in (("SshlDX", 2), ("SshlQX", 4)):
    threeEqualRegInstX("sshl", _Name, "SimdShiftOp", signedTypes, _count,
                       shlCode)
# SSHLL, SSHLL2
# Widen the source to BigElement and shift it left by imm; an imm of at
# least the destination width gives zero.  Shared with USHLL below.
shllCode = '''
    if (imm >= sizeof(destElem) * 8) {
        destElem = 0;
    } else {
        destElem = (BigElement)srcElem1 << imm;
    }
'''
for _Name, _kw in (("SshllX", {"hasImm": True}),
                   ("Sshll2X", {"hasImm": True, "hi": True})):
    twoRegLongInstX("sshll", _Name, "SimdShiftOp", smallSignedTypes,
                    shllCode, **_kw)
# SSHR
# Right shift by an immediate.  When imm covers the whole element the result
# is -1 if ltz(srcElem1) holds and 0 otherwise.  Shared with USHR below.
shrCode = '''
    if (imm >= sizeof(srcElem1) * 8) {
        if (ltz(srcElem1))
            destElem = -1;
        else
            destElem = 0;
    } else {
        destElem = srcElem1 >> imm;
    }
'''
for _Name, _count in (("SshrDX", 2), ("SshrQX", 4)):
    twoEqualRegInstX("sshr", _Name, "SimdShiftOp", signedTypes, _count,
                     shrCode, hasImm=True)
# SSRA
# Right shift by an immediate, accumulated into destElem.  "mid" holds the
# shifted value; when imm covers the whole element it is -1 or 0 depending
# on ltz(srcElem1), otherwise the shifted value is sign-extended by hand if
# the plain shift did not do so.  Shared with USRA below.
# (The declaration of "mid" previously ended in a stray second semicolon.)
sraCode = '''
    Element mid;
    if (imm >= sizeof(srcElem1) * 8) {
        mid = ltz(srcElem1) ? -1 : 0;
    } else {
        mid = srcElem1 >> imm;
        if (ltz(srcElem1) && !ltz(mid)) {
            mid |= -(mid & ((Element)1 <<
                            (sizeof(Element) * 8 - 1 - imm)));
        }
    }
    destElem += mid;
'''
twoEqualRegInstX("ssra", "SsraDX", "SimdShiftOp", signedTypes, 2, sraCode,
                 True, hasImm=True)
twoEqualRegInstX("ssra", "SsraQX", "SimdShiftOp", signedTypes, 4, sraCode,
                 True, hasImm=True)
# SSUBL, SSUBW
# Both operands are widened to BigElement before subtracting.  The same
# body serves the long (SSUBL) and wide (SSUBW) generators, and the unsigned
# USUBL/USUBW instantiations further down.
sublwCode = "destElem = (BigElement)srcElem1 - (BigElement)srcElem2;"
for _mnem, _Name, _kw in (("ssubl", "SsublX", {}),
                          ("ssubl2", "Ssubl2X", {"hi": True})):
    threeRegLongInstX(_mnem, _Name, "SimdAddOp", smallSignedTypes,
                      sublwCode, **_kw)
for _mnem, _Name, _kw in (("ssubw", "SsubwX", {}),
                          ("ssubw2", "Ssubw2X", {"hi": True})):
    threeRegWideInstX(_mnem, _Name, "SimdAddOp", smallSignedTypes,
                      sublwCode, **_kw)
# SUB: plain element-wise subtraction.
subCode = "destElem = srcElem1 - srcElem2;"
for _Name, _count in (("SubDX", 2), ("SubQX", 4)):
    threeEqualRegInstX("sub", _Name, "SimdAddOp", unsignedTypes, _count,
                       subCode)
# SUBHN, SUBHN2: subtract in BigElement precision, then keep the bits above
# the narrow element width.
subhnCode = '''
    destElem = ((BigElement)srcElem1 - (BigElement)srcElem2) >>
               (sizeof(Element) * 8);
'''
for _mnem, _Name, _kw in (("subhn", "SubhnX", {}),
                          ("subhn2", "Subhn2X", {"hi": True})):
    threeRegNarrowInstX(_mnem, _Name, "SimdAddOp", smallUnsignedTypes,
                        subhnCode, **_kw)
# SUQADD
# Accumulates srcElem1 into destElem.  The top bit of destElem is treated
# as its sign; when the sum does not fit below that bit the result is
# clamped to the largest value with the top bit clear and fpscr.qc is set.
suqaddCode = '''
    FPSCR fpscr = (FPSCR) FpscrQc;
    Element tmp = destElem + srcElem1;
    if (bits(destElem, sizeof(Element) * 8 - 1) == 0) {
        if (bits(tmp, sizeof(Element) * 8 - 1) == 1 ||
                tmp < srcElem1 || tmp < destElem) {
            destElem = (((Element) 1) << (sizeof(Element) * 8 - 1)) - 1;
            fpscr.qc = 1;
        } else {
            destElem = tmp;
        }
    } else {
        Element absDestElem = (~destElem) + 1;
        if (absDestElem < srcElem1) {
            // Still check for positive sat., no need to check for negative sat.
            if (bits(tmp, sizeof(Element) * 8 - 1) == 1) {
                destElem = (((Element) 1) << (sizeof(Element) * 8 - 1)) - 1;
                fpscr.qc = 1;
            } else {
                destElem = tmp;
            }
        } else {
            destElem = tmp;
        }
    }
    FpscrQc = fpscr;
'''
for _Name, _types, _count, _kw in (
        ("SuqaddDX", smallUnsignedTypes, 2, {}),
        ("SuqaddQX", unsignedTypes, 4, {}),
        ("SuqaddScX", unsignedTypes, 4, {"scalar": True})):
    twoEqualRegInstX("suqadd", _Name, "SimdAddOp", _types, _count,
                     suqaddCode, True, **_kw)
# SXTL -> alias to SSHLL
# TBL / TBX
# One instantiation per table length (1..4) and per DX (2) / QX (4) variant.
# TBL passes "true" and TBX passes "false" as the sixth argument.
for _mnem, _stem, _flag in (("tbl", "Tbl", "true"), ("tbx", "Tbx", "false")):
    for _length in (1, 2, 3, 4):
        for _sfx, _count in (("DX", 2), ("QX", 4)):
            tbxTblInstX(_mnem, "%s%d%s" % (_stem, _length, _sfx),
                        "SimdMiscOp", ("uint8_t",), _length, _flag, _count)
# TRN1, TRN2
# Interleaves elements taken from the same position of each source: even
# destination slots come from srcReg1, odd slots from srcReg2.  "part" (0
# for TRN1, 1 for TRN2) selects the even or odd source elements.
trnCode = '''
    unsigned part = %s;
    for (unsigned i = 0; i < eCount / 2; i++) {
        destReg.elements[2 * i] = srcReg1.elements[2 * i + part];
        destReg.elements[2 * i + 1] = srcReg2.elements[2 * i + part];
    }
'''
for _mnem, _stem, _part in (("trn1", "Trn1", "0"), ("trn2", "Trn2", "1")):
    threeRegScrambleInstX(_mnem, _stem + "DX", "SimdAluOp",
                          smallUnsignedTypes, 2, trnCode % _part)
    threeRegScrambleInstX(_mnem, _stem + "QX", "SimdAluOp",
                          unsignedTypes, 4, trnCode % _part)
# Unsigned absolute-difference family.  The C++ bodies (abaCode, abalCode,
# abdCode, abdlCode) are defined earlier in the file.
# UABA
for _Name, _count in (("UabaDX", 2), ("UabaQX", 4)):
    threeEqualRegInstX("uaba", _Name, "SimdAddAccOp", smallUnsignedTypes,
                       _count, abaCode, True)
# UABAL, UABAL2
for _mnem, _Name, _kw in (("uabal", "UabalX", {}),
                          ("uabal2", "Uabal2X", {"hi": True})):
    threeRegLongInstX(_mnem, _Name, "SimdAddAccOp", smallUnsignedTypes,
                      abalCode, True, **_kw)
# UABD
for _Name, _count in (("UabdDX", 2), ("UabdQX", 4)):
    threeEqualRegInstX("uabd", _Name, "SimdAddOp", smallUnsignedTypes,
                       _count, abdCode)
# UABDL, UABDL2
for _mnem, _Name, _kw in (("uabdl", "UabdlX", {}),
                          ("uabdl2", "Uabdl2X", {"hi": True})):
    threeRegLongInstX(_mnem, _Name, "SimdAddAccOp", smallUnsignedTypes,
                      abdlCode, True, **_kw)
# Unsigned widening-add family.  adalpCode, addlwCode and addAcrossLongCode
# are defined earlier in the file.
# UADALP
for _Name, _count in (("UadalpDX", 2), ("UadalpQX", 4)):
    twoRegCondenseInstX("uadalp", _Name, "SimdAddOp", smallUnsignedTypes,
                        _count, adalpCode, True)
# UADDL, UADDL2
for _mnem, _Name, _kw in (("uaddl", "UaddlX", {}),
                          ("uaddl2", "Uaddl2X", {"hi": True})):
    threeRegLongInstX(_mnem, _Name, "SimdAddAccOp", smallUnsignedTypes,
                      addlwCode, **_kw)
# UADDLP
for _Name, _count in (("UaddlpDX", 2), ("UaddlpQX", 4)):
    twoRegCondenseInstX("uaddlp", _Name, "SimdAddOp", smallUnsignedTypes,
                        _count, addlwCode)
# UADDLV
for _Name, _types, _count, _kw in (
        ("UaddlvDX", ("uint8_t", "uint16_t"), 2, {"long": True}),
        ("UaddlvQX", ("uint8_t", "uint16_t"), 4, {"long": True}),
        ("UaddlvBQX", ("uint32_t",), 4,
         {"doubleDest": True, "long": True})):
    twoRegAcrossInstX("uaddlv", _Name, "SimdAddOp", _types, _count,
                      addAcrossLongCode, **_kw)
# UADDW
for _mnem, _Name, _kw in (("uaddw", "UaddwX", {}),
                          ("uaddw2", "Uaddw2X", {"hi": True})):
    threeRegWideInstX(_mnem, _Name, "SimdAddAccOp", smallUnsignedTypes,
                      addlwCode, **_kw)
# UCVTF (fixed-point) and UCVTF (integer)
# Both forms call fplibFixedToFP with the third argument set to true; the
# fixed-point form passes imm as the second argument and the integer form
# passes 0.  The call text is wrapped by the fpOp template defined earlier.
_ucvtfCall = ("fplibFixedToFP<Element>(srcElem1, %s, true,"
              " FPCRRounding(fpscr), fpscr)")
ucvtfFixedCode = fpOp % (_ucvtfCall % "imm")
for _Name, _types, _count, _kw in (
        ("UcvtfFixedDX", smallFloatTypes, 2, {"hasImm": True}),
        ("UcvtfFixedQX", floatTypes, 4, {"hasImm": True}),
        ("UcvtfFixedScX", floatTypes, 4,
         {"hasImm": True, "scalar": True})):
    twoEqualRegInstX("ucvtf", _Name, "SimdCvtOp", _types, _count,
                     ucvtfFixedCode, **_kw)
ucvtfIntCode = fpOp % (_ucvtfCall % "0")
for _Name, _types, _count, _kw in (
        ("UcvtfIntDX", smallFloatTypes, 2, {}),
        ("UcvtfIntQX", floatTypes, 4, {}),
        ("UcvtfIntScX", floatTypes, 4, {"scalar": True})):
    twoEqualRegInstX("ucvtf", _Name, "SimdCvtOp", _types, _count,
                     ucvtfIntCode, **_kw)
# UHADD, UHSUB: unsigned instantiations of haddCode / hsubCode, both of
# which are defined earlier in the file.
for _mnem, _stem, _code in (("uhadd", "Uhadd", haddCode),
                            ("uhsub", "Uhsub", hsubCode)):
    for _sfx, _count in (("DX", 2), ("QX", 4)):
        threeEqualRegInstX(_mnem, _stem + _sfx, "SimdAddOp",
                           smallUnsignedTypes, _count, _code)
# UMAX / UMAXP / UMAXV and UMIN / UMINP / UMINV
# For each of max and min: the element-wise form, the pairwise form
# (pairwise=True) and the across-vector form.  The across-vector DX variant
# is only instantiated for 8- and 16-bit elements.  The code bodies are
# defined earlier in the file.
for _mnem, _stem, _code, _acrossCode in (
        ("umax", "Umax", maxCode, maxAcrossCode),
        ("umin", "Umin", minCode, minAcrossCode)):
    for _sfx, _count in (("DX", 2), ("QX", 4)):
        threeEqualRegInstX(_mnem, _stem + _sfx, "SimdCmpOp",
                           smallUnsignedTypes, _count, _code)
    for _sfx, _count in (("DX", 2), ("QX", 4)):
        threeEqualRegInstX(_mnem + "p", _stem + "p" + _sfx, "SimdCmpOp",
                           smallUnsignedTypes, _count, _code, pairwise=True)
    twoRegAcrossInstX(_mnem + "v", _stem + "vDX", "SimdCmpOp",
                      ("uint8_t", "uint16_t"), 2, _acrossCode)
    twoRegAcrossInstX(_mnem + "v", _stem + "vQX", "SimdCmpOp",
                      smallUnsignedTypes, 4, _acrossCode)
# UMLAL and UMLSL, by-element and vector forms
# Each gets four instantiations: by element, by element (hi), vector and
# vector (hi).  All of them pass True as the sixth positional argument and
# use the mnemonic without a "2" suffix.  mlalCode and mlslCode are defined
# earlier in the file.
for _mnem, _stem, _code in (("umlal", "Umlal", mlalCode),
                            ("umlsl", "Umlsl", mlslCode)):
    for _Name, _kw in ((_stem + "ElemX", {"byElem": True}),
                       (_stem + "Elem2X", {"byElem": True, "hi": True}),
                       (_stem + "X", {}),
                       (_stem + "2X", {"hi": True})):
        threeRegLongInstX(_mnem, _Name, "SimdMultAccOp",
                          smallUnsignedTypes, _code, True, **_kw)
# UMOV: 'W' form for the small unsigned element types, 'X' form for
# 64-bit elements only.
for _Name, _types, _reg in (("UmovWX", smallUnsignedTypes, 'W'),
                            ("UmovXX", ("uint64_t",), 'X')):
    insToGprInstX("umov", _Name, "SimdMiscOp", _types, 4, _reg)
# UMULL, UMULL2 (by element), then UMULL, UMULL2 (vector).  mullCode is
# defined earlier in the file.
for _Name, _kw in (("UmullElemX", {"byElem": True}),
                   ("UmullElem2X", {"byElem": True, "hi": True}),
                   ("UmullX", {}),
                   ("Umull2X", {"hi": True})):
    threeRegLongInstX("umull", _Name, "SimdMultOp", smallUnsignedTypes,
                      mullCode, **_kw)
# UQADD
# Wrapping add followed by an overflow test: if the sum is smaller than
# either operand the result is clamped to all ones and fpscr.qc is set.
uqaddCode = '''
    destElem = srcElem1 + srcElem2;
    FPSCR fpscr = (FPSCR) FpscrQc;
    if (destElem < srcElem1 || destElem < srcElem2) {
        destElem = (Element)(-1);
        fpscr.qc = 1;
    }
    FpscrQc = fpscr;
'''
for _Name, _types, _count, _kw in (
        ("UqaddDX", smallUnsignedTypes, 2, {}),
        ("UqaddQX", unsignedTypes, 4, {}),
        ("UqaddScX", unsignedTypes, 4, {"scalar": True})):
    threeEqualRegInstX("uqadd", _Name, "SimdAddOp", _types, _count,
                       uqaddCode, **_kw)
# UQRSHL
# The shift amount is the signed low byte of srcElem2.
#   * negative: shift right and add the rounding bit (bit shiftAmt - 1);
#   * positive: shift left, clamping to all ones and setting fpscr.qc when
#     any bit would be shifted out;
#   * zero: copy the source unchanged.
# The zero case has its own branch (as in uqshlCode and rshlCode) so that
# bits() is never called with first < last, which for 32- and 64-bit
# elements would also shift by the full operand width.  The result for a
# zero shift is unchanged: the source value, with qc left alone.
uqrshlCode = '''
    int16_t shiftAmt = (int8_t)srcElem2;
    FPSCR fpscr = (FPSCR) FpscrQc;
    if (shiftAmt < 0) {
        shiftAmt = -shiftAmt;
        Element rBit = 0;
        if (shiftAmt <= sizeof(Element) * 8)
            rBit = bits(srcElem1, shiftAmt - 1);
        if (shiftAmt >= sizeof(Element) * 8) {
            shiftAmt = sizeof(Element) * 8 - 1;
            destElem = 0;
        } else {
            destElem = (srcElem1 >> shiftAmt);
        }
        destElem += rBit;
    } else if (shiftAmt > 0) {
        if (shiftAmt >= sizeof(Element) * 8) {
            if (srcElem1 != 0) {
                destElem = mask(sizeof(Element) * 8);
                fpscr.qc = 1;
            } else {
                destElem = 0;
            }
        } else {
            if (bits(srcElem1, sizeof(Element) * 8 - 1,
                        sizeof(Element) * 8 - shiftAmt)) {
                destElem = mask(sizeof(Element) * 8);
                fpscr.qc = 1;
            } else {
                destElem = srcElem1 << shiftAmt;
            }
        }
    } else {
        destElem = srcElem1;
    }
    FpscrQc = fpscr;
'''
threeEqualRegInstX("uqrshl", "UqrshlDX", "SimdCmpOp", smallUnsignedTypes,
                   2, uqrshlCode)
threeEqualRegInstX("uqrshl", "UqrshlQX", "SimdCmpOp", unsignedTypes, 4,
                   uqrshlCode)
threeEqualRegInstX("uqrshl", "UqrshlScX", "SimdCmpOp", unsignedTypes, 4,
                   uqrshlCode, scalar=True)
# UQRSHRN, UQRSHRN2
# Rounding right shift of the wide source by imm, then narrowing to Element.
# If the narrowed value differs from the wide one, the result is clamped to
# all ones (mask(sizeof(Element) * 8)) and fpscr.qc is set.
# The imm == 0 branch used to clamp to mask(sizeof(Element) * 8 - 1), i.e.
# the largest value with the top bit clear; it now uses the same all-ones
# value as the shifting branch, since the elements here are unsigned.
uqrshrnCode = '''
    FPSCR fpscr = (FPSCR) FpscrQc;
    if (imm > sizeof(srcElem1) * 8) {
        if (srcElem1 != 0)
            fpscr.qc = 1;
        destElem = 0;
    } else if (imm) {
        BigElement mid = (srcElem1 >> (imm - 1));
        uint64_t rBit = mid & 0x1;
        mid >>= 1;
        mid += rBit;
        if (mid != (Element)mid) {
            destElem = mask(sizeof(Element) * 8);
            fpscr.qc = 1;
        } else {
            destElem = mid;
        }
    } else {
        if (srcElem1 != (Element)srcElem1) {
            destElem = mask(sizeof(Element) * 8);
            fpscr.qc = 1;
        } else {
            destElem = srcElem1;
        }
    }
    FpscrQc = fpscr;
'''
twoRegNarrowInstX("uqrshrn", "UqrshrnX", "SimdShiftOp", smallUnsignedTypes,
                  uqrshrnCode, hasImm=True)
twoRegNarrowInstX("uqrshrn2", "Uqrshrn2X", "SimdShiftOp",
                  smallUnsignedTypes, uqrshrnCode, hasImm=True, hi=True)
twoRegNarrowInstX("uqrshrn", "UqrshrnScX", "SimdShiftOp",
                  smallUnsignedTypes, uqrshrnCode, hasImm=True,
                  scalar=True)
# UQSHL (immediate)
# Left shift by imm.  If any set bit would be shifted out (or imm covers the
# whole element and the source is non-zero) the result is clamped to all
# ones and fpscr.qc is set.  imm == 0 copies the source.
uqshlImmCode = '''
    FPSCR fpscr = (FPSCR) FpscrQc;
    if (imm >= sizeof(Element) * 8) {
        if (srcElem1 != 0) {
            destElem = mask(sizeof(Element) * 8);
            fpscr.qc = 1;
        } else {
            destElem = 0;
        }
    } else if (imm) {
        destElem = (srcElem1 << imm);
        uint64_t topBits = bits((uint64_t)srcElem1,
                                sizeof(Element) * 8 - 1,
                                sizeof(Element) * 8 - imm);
        if (topBits != 0) {
            destElem = mask(sizeof(Element) * 8);
            fpscr.qc = 1;
        }
    } else {
        destElem = srcElem1;
    }
    FpscrQc = fpscr;
'''
for _Name, _types, _count, _kw in (
        ("UqshlImmDX", smallUnsignedTypes, 2, {"hasImm": True}),
        ("UqshlImmQX", unsignedTypes, 4, {"hasImm": True}),
        ("UqshlImmScX", unsignedTypes, 4,
         {"hasImm": True, "scalar": True})):
    twoEqualRegInstX("uqshl", _Name, "SimdAluOp", _types, _count,
                     uqshlImmCode, **_kw)
# UQSHL (register)
# The shift amount is the signed low byte of srcElem2.  Negative amounts
# shift right (no rounding); positive amounts shift left and clamp to all
# ones with fpscr.qc set when bits would be lost; zero copies the source.
uqshlCode = '''
    int16_t shiftAmt = (int8_t)srcElem2;
    FPSCR fpscr = (FPSCR) FpscrQc;
    if (shiftAmt < 0) {
        shiftAmt = -shiftAmt;
        if (shiftAmt >= sizeof(Element) * 8) {
            shiftAmt = sizeof(Element) * 8 - 1;
            destElem = 0;
        } else {
            destElem = (srcElem1 >> shiftAmt);
        }
    } else if (shiftAmt > 0) {
        if (shiftAmt >= sizeof(Element) * 8) {
            if (srcElem1 != 0) {
                destElem = mask(sizeof(Element) * 8);
                fpscr.qc = 1;
            } else {
                destElem = 0;
            }
        } else {
            if (bits(srcElem1, sizeof(Element) * 8 - 1,
                        sizeof(Element) * 8 - shiftAmt)) {
                destElem = mask(sizeof(Element) * 8);
                fpscr.qc = 1;
            } else {
                destElem = srcElem1 << shiftAmt;
            }
        }
    } else {
        destElem = srcElem1;
    }
    FpscrQc = fpscr;
'''
for _Name, _types, _count, _kw in (
        ("UqshlDX", smallUnsignedTypes, 2, {}),
        ("UqshlQX", unsignedTypes, 4, {}),
        ("UqshlScX", unsignedTypes, 4, {"scalar": True})):
    threeEqualRegInstX("uqshl", _Name, "SimdAluOp", _types, _count,
                       uqshlCode, **_kw)
# UQSHRN, UQSHRN2
# Right shift of the wide source by imm (no rounding), then narrowing.  If
# the narrowed value differs from the wide one, the result is clamped to all
# ones and fpscr.qc is set.  imm == 0 assigns the source directly.
uqshrnCode = '''
    FPSCR fpscr = (FPSCR) FpscrQc;
    if (imm > sizeof(srcElem1) * 8) {
        if (srcElem1 != 0)
            fpscr.qc = 1;
        destElem = 0;
    } else if (imm) {
        BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
        if (mid != (Element)mid) {
            destElem = mask(sizeof(Element) * 8);
            fpscr.qc = 1;
        } else {
            destElem = mid;
        }
    } else {
        destElem = srcElem1;
    }
    FpscrQc = fpscr;
'''
for _mnem, _Name, _kw in (
        ("uqshrn", "UqshrnX", {"hasImm": True}),
        ("uqshrn2", "Uqshrn2X", {"hasImm": True, "hi": True}),
        ("uqshrn", "UqshrnScX", {"hasImm": True, "scalar": True})):
    twoRegNarrowInstX(_mnem, _Name, "SimdShiftOp", smallUnsignedTypes,
                      uqshrnCode, **_kw)
# UQSUB
# Wrapping subtract followed by an underflow test: if the difference is
# larger than the minuend the result is clamped to zero and fpscr.qc is set.
uqsubCode = '''
    destElem = srcElem1 - srcElem2;
    FPSCR fpscr = (FPSCR) FpscrQc;
    if (destElem > srcElem1) {
        destElem = 0;
        fpscr.qc = 1;
    }
    FpscrQc = fpscr;
'''
for _Name, _types, _count, _kw in (
        ("UqsubDX", smallUnsignedTypes, 2, {}),
        ("UqsubQX", unsignedTypes, 4, {}),
        ("UqsubScX", unsignedTypes, 4, {"scalar": True})):
    threeEqualRegInstX("uqsub", _Name, "SimdAddOp", _types, _count,
                       uqsubCode, **_kw)
# UQXTN
# Narrowing move: if widening the narrowed value back to BigElement does not
# reproduce the source, the result is clamped to all ones and fpscr.qc is
# set.  All three instantiations use the mnemonic "uqxtn".
uqxtnCode = '''
    FPSCR fpscr = (FPSCR) FpscrQc;
    destElem = srcElem1;
    if ((BigElement)destElem != srcElem1) {
        fpscr.qc = 1;
        destElem = mask(sizeof(Element) * 8);
    }
    FpscrQc = fpscr;
'''
for _Name, _kw in (("UqxtnX", {}),
                   ("Uqxtn2X", {"hi": True}),
                   ("UqxtnScX", {"scalar": True})):
    twoRegNarrowInstX("uqxtn", _Name, "SimdMiscOp", smallUnsignedTypes,
                      uqxtnCode, **_kw)
# URECPE: delegates to unsignedRecipEstimate(); 32-bit elements only.
urecpeCode = "destElem = unsignedRecipEstimate(srcElem1);"
for _Name, _count in (("UrecpeDX", 2), ("UrecpeQX", 4)):
    twoEqualRegInstX("urecpe", _Name, "SimdMultAccOp", ("uint32_t",),
                     _count, urecpeCode)
# URHADD: unsigned instantiation of rhaddCode.
for _Name, _count in (("UrhaddDX", 2), ("UrhaddQX", 4)):
    threeEqualRegInstX("urhadd", _Name, "SimdAddOp", smallUnsignedTypes,
                       _count, rhaddCode)
# URSHL: unsigned instantiation of rshlCode.
for _Name, _count in (("UrshlDX", 2), ("UrshlQX", 4)):
    threeEqualRegInstX("urshl", _Name, "SimdShiftOp", unsignedTypes,
                       _count, rshlCode)
# URSHR: unsigned instantiation of rshrCode.
for _Name, _count in (("UrshrDX", 2), ("UrshrQX", 4)):
    twoEqualRegInstX("urshr", _Name, "SimdShiftOp", unsignedTypes, _count,
                     rshrCode, hasImm=True)
# URSQRTE: delegates to unsignedRSqrtEstimate(); 32-bit elements only.
ursqrteCode = "destElem = unsignedRSqrtEstimate(srcElem1);"
for _Name, _count in (("UrsqrteDX", 2), ("UrsqrteQX", 4)):
    twoEqualRegInstX("ursqrte", _Name, "SimdSqrtOp", ("uint32_t",),
                     _count, ursqrteCode)
# URSRA: unsigned instantiation of rsraCode.
for _Name, _count in (("UrsraDX", 2), ("UrsraQX", 4)):
    twoEqualRegInstX("ursra", _Name, "SimdShiftOp", unsignedTypes, _count,
                     rsraCode, True, hasImm=True)
# USHL: unsigned instantiation of shlCode.
for _Name, _count in (("UshlDX", 2), ("UshlQX", 4)):
    threeEqualRegInstX("ushl", _Name, "SimdShiftOp", unsignedTypes, _count,
                       shlCode)
# USHLL, USHLL2: unsigned instantiation of shllCode.
for _Name, _kw in (("UshllX", {"hasImm": True}),
                   ("Ushll2X", {"hi": True, "hasImm": True})):
    twoRegLongInstX("ushll", _Name, "SimdShiftOp", smallUnsignedTypes,
                    shllCode, **_kw)
# USHR: unsigned instantiation of shrCode.
for _Name, _count in (("UshrDX", 2), ("UshrQX", 4)):
    twoEqualRegInstX("ushr", _Name, "SimdShiftOp", unsignedTypes, _count,
                     shrCode, hasImm=True)
# USQADD
# Accumulates srcElem1 into destElem, treating the top bit of srcElem1 as
# its sign.  A source with the top bit clear can overflow (clamp to all
# ones); a source with the top bit set can underflow (clamp to zero).
# fpscr.qc is set whenever clamping happens.
usqaddCode = '''
    FPSCR fpscr = (FPSCR) FpscrQc;
    Element tmp = destElem + srcElem1;
    if (bits(srcElem1, sizeof(Element) * 8 - 1) == 0) {
        if (tmp < srcElem1 || tmp < destElem) {
            destElem = (Element)(-1);
            fpscr.qc = 1;
        } else {
            destElem = tmp;
        }
    } else {
        Element absSrcElem1 = (~srcElem1) + 1;
        if (absSrcElem1 > destElem) {
            destElem = 0;
            fpscr.qc = 1;
        } else {
            destElem = tmp;
        }
    }
    FpscrQc = fpscr;
'''
for _Name, _types, _count, _kw in (
        ("UsqaddDX", smallUnsignedTypes, 2, {}),
        ("UsqaddQX", unsignedTypes, 4, {}),
        ("UsqaddScX", unsignedTypes, 4, {"scalar": True})):
    twoEqualRegInstX("usqadd", _Name, "SimdAddOp", _types, _count,
                     usqaddCode, True, **_kw)
# USRA: unsigned instantiation of sraCode.
for _Name, _count in (("UsraDX", 2), ("UsraQX", 4)):
    twoEqualRegInstX("usra", _Name, "SimdShiftOp", unsignedTypes, _count,
                     sraCode, True, hasImm=True)
# USUBL, USUBL2: unsigned instantiation of sublwCode (long form).
for _mnem, _Name, _kw in (("usubl", "UsublX", {}),
                          ("usubl2", "Usubl2X", {"hi": True})):
    threeRegLongInstX(_mnem, _Name, "SimdAddOp", smallUnsignedTypes,
                      sublwCode, **_kw)
# USUBW, USUBW2: unsigned instantiation of sublwCode (wide form).
for _mnem, _Name, _kw in (("usubw", "UsubwX", {}),
                          ("usubw2", "Usubw2X", {"hi": True})):
    threeRegWideInstX(_mnem, _Name, "SimdAddOp", smallUnsignedTypes,
                      sublwCode, **_kw)
# UXTL -> alias to USHLL
# UZP1, UZP2
# De-interleave: the lower half of the destination takes every second
# element of srcReg1 and the upper half takes every second element of
# srcReg2.  "part" (0 for UZP1, 1 for UZP2) selects even or odd elements.
# The mnemonics are written in lower case, like every other mnemonic string
# in this section (they were previously "Uzp1" / "Uzp2").
uzpCode = '''
    unsigned part = %s;
    for (unsigned i = 0; i < eCount / 2; i++) {
        destReg.elements[i] = srcReg1.elements[2 * i + part];
        destReg.elements[eCount / 2 + i] = srcReg2.elements[2 * i + part];
    }
'''
threeRegScrambleInstX("uzp1", "Uzp1DX", "SimdAluOp", smallUnsignedTypes, 2,
                      uzpCode % "0")
threeRegScrambleInstX("uzp1", "Uzp1QX", "SimdAluOp", unsignedTypes, 4,
                      uzpCode % "0")
# UZP2
threeRegScrambleInstX("uzp2", "Uzp2DX", "SimdAluOp", smallUnsignedTypes, 2,
                      uzpCode % "1")
threeRegScrambleInstX("uzp2", "Uzp2QX", "SimdAluOp", unsignedTypes, 4,
                      uzpCode % "1")
# XTN, XTN2
# Plain narrowing move: the wide source element is assigned to the narrow
# destination element.  The mnemonic is written in lower case, like every
# other mnemonic string in this section (it was previously "Xtn").
xtnCode = "destElem = srcElem1;"
twoRegNarrowInstX("xtn", "XtnX", "SimdMiscOp", smallUnsignedTypes, xtnCode)
twoRegNarrowInstX("xtn", "Xtn2X", "SimdMiscOp", smallUnsignedTypes,
                  xtnCode, hi=True)
# ZIP1, ZIP2
# Interleave consecutive elements of the two sources, starting at element
# "base": 0 for ZIP1 and eCount / 2 for ZIP2.
zipCode = '''
    unsigned base = %s;
    for (unsigned i = 0; i < eCount / 2; i++) {
        destReg.elements[2 * i] = srcReg1.elements[base + i];
        destReg.elements[2 * i + 1] = srcReg2.elements[base + i];
    }
'''
for _mnem, _stem, _base in (("zip1", "Zip1", "0"),
                            ("zip2", "Zip2", "eCount / 2")):
    threeRegScrambleInstX(_mnem, _stem + "DX", "SimdAluOp",
                          smallUnsignedTypes, 2, zipCode % _base)
    threeRegScrambleInstX(_mnem, _stem + "QX", "SimdAluOp",
                          unsignedTypes, 4, zipCode % _base)
3369
# Append one C++ "<flavour>Decoder" class to header_output for every entry
# of `decoders`.  Each class contains one alias template per (key, value)
# pair of that entry's dictionary, mapping the key to the value's class
# template.
#
# dict.items() is used instead of the Python-2-only iteritems() so the
# block runs under both Python 2 and Python 3, and the inner loop variables
# no longer shadow the builtin `type` in the shared namespace.
for decoderFlavour, type_dict in decoders.items():
    header_output += '''
    class %(decoder_flavour)sDecoder {
      public:
    ''' % { "decoder_flavour" : decoderFlavour }
    for alias_name, class_name in type_dict.items():
        header_output += '''
        template<typename Elem> using %(type)s = %(new_name)s<Elem>;''' % {
            "type" : alias_name, "new_name" : class_name
        }
    header_output += '''
    };'''
3382}};
838 } destReg, srcReg2;
839
840 const unsigned length = %(length)d;
841 const bool isTbl = %(isTbl)s;
842 ''' % { "rCount" : rCount, "length" : length, "isTbl" : isTbl }
843 for reg in range(rCount):
844 code += '''
845 srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
846 destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
847 ''' % { "reg" : reg }
848 for reg in range(16):
849 if reg < length * 4:
850 code += '''
851 table.regs[%(reg)d] = htog(AA64FpOp1P%(p)dV%(v)dS_uw);
852 ''' % { "reg" : reg, "p" : reg % 4, "v" : reg / 4 }
853 else:
854 code += '''
855 table.regs[%(reg)d] = 0;
856 ''' % { "reg" : reg }
857 code += '''
858 for (unsigned i = 0; i < sizeof(destReg); i++) {
859 uint8_t index = srcReg2.bytes[i];
860 if (index < 16 * length) {
861 destReg.bytes[i] = table.bytes[index];
862 } else {
863 if (isTbl)
864 destReg.bytes[i] = 0;
865 // else destReg.bytes[i] unchanged
866 }
867 }
868 '''
869 for reg in range(rCount):
870 code += '''
871 AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
872 ''' % { "reg" : reg }
873 if rCount < 4: # zero upper half
874 for reg in range(rCount, 4):
875 code += '''
876 AA64FpDestP%(reg)d_uw = 0;
877 ''' % { "reg" : reg }
878 iop = InstObjParams(name, Name,
879 "DataX2RegOp",
880 { "code": code,
881 "r_count": rCount,
882 "op_class": opClass }, [])
883 header_output += NeonX2RegOpDeclare.subst(iop)
884 exec_output += NeonXEqualRegOpExecute.subst(iop)
885 for type in types:
886 substDict = { "targs" : type,
887 "class_name" : Name }
888 exec_output += NeonXExecDeclare.subst(substDict)
889
# ABS: negate negative elements, copy the rest.
absCode = '''
    if (srcElem1 < 0) {
        destElem = -srcElem1;
    } else {
        destElem = srcElem1;
    }
'''
for _Name, _count in (("AbsDX", 2), ("AbsQX", 4)):
    twoEqualRegInstX("abs", _Name, "SimdAluOp", signedTypes, _count, absCode)
# ADD: plain element-wise addition.  addCode is reused by both ADDP forms.
addCode = "destElem = srcElem1 + srcElem2;"
for _Name, _count in (("AddDX", 2), ("AddQX", 4)):
    threeEqualRegInstX("add", _Name, "SimdAddOp", unsignedTypes, _count,
                       addCode)
# ADDHN, ADDHN2: add in BigElement precision, then keep the bits above the
# narrow element width.
addhnCode = '''
    destElem = ((BigElement)srcElem1 + (BigElement)srcElem2) >>
               (sizeof(Element) * 8);
'''
for _mnem, _Name, _kw in (("addhn", "AddhnX", {}),
                          ("addhn2", "Addhn2X", {"hi": True})):
    threeRegNarrowInstX(_mnem, _Name, "SimdAddOp", smallUnsignedTypes,
                        addhnCode, **_kw)
# ADDP (scalar): 64-bit elements only.
twoRegPairwiseScInstX("addp", "AddpScQX", "SimdAddOp", ("uint64_t",), 4,
                      addCode)
# ADDP (vector)
for _Name, _types, _count in (("AddpDX", smallUnsignedTypes, 2),
                              ("AddpQX", unsignedTypes, 4)):
    threeEqualRegInstX("addp", _Name, "SimdAddOp", _types, _count,
                       addCode, pairwise=True)
# ADDV
# Note: SimdAddOp can be a bit optimistic here
# The body accumulates each source element into destElem.  The DX variant
# is only instantiated for 8- and 16-bit elements.
addAcrossCode = "destElem += srcElem1;"
for _Name, _types, _count in (("AddvDX", ("uint8_t", "uint16_t"), 2),
                              ("AddvQX", smallUnsignedTypes, 4)):
    twoRegAcrossInstX("addv", _Name, "SimdAddOp", _types, _count,
                      addAcrossCode)
# Bitwise operations.  All of them are instantiated for uint64_t only.
# AND
andCode = "destElem = srcElem1 & srcElem2;"
for _Name, _count in (("AndDX", 2), ("AndQX", 4)):
    threeEqualRegInstX("and", _Name, "SimdAluOp", ("uint64_t",), _count,
                       andCode)
# BIC (immediate): clear the destination bits that are set in imm.
bicImmCode = "destElem &= ~imm;"
for _Name, _count in (("BicImmDX", 2), ("BicImmQX", 4)):
    oneRegImmInstX("bic", _Name, "SimdAluOp", ("uint64_t",), _count,
                   bicImmCode, True)
# BIC (register)
bicCode = "destElem = srcElem1 & ~srcElem2;"
for _Name, _count in (("BicDX", 2), ("BicQX", 4)):
    threeEqualRegInstX("bic", _Name, "SimdAluOp", ("uint64_t",), _count,
                       bicCode)
# BIF: take srcElem1 bits where srcElem2 is clear, keep destElem elsewhere.
bifCode = "destElem = (destElem & srcElem2) | (srcElem1 & ~srcElem2);"
for _Name, _count in (("BifDX", 2), ("BifQX", 4)):
    threeEqualRegInstX("bif", _Name, "SimdAluOp", ("uint64_t",), _count,
                       bifCode, True)
# BIT: take srcElem1 bits where srcElem2 is set, keep destElem elsewhere.
bitCode = "destElem = (srcElem1 & srcElem2) | (destElem & ~srcElem2);"
for _Name, _count in (("BitDX", 2), ("BitQX", 4)):
    threeEqualRegInstX("bit", _Name, "SimdAluOp", ("uint64_t",), _count,
                       bitCode, True)
# BSL: destElem selects per bit between srcElem1 (set) and srcElem2 (clear).
bslCode = "destElem = (srcElem1 & destElem) | (srcElem2 & ~destElem);"
for _Name, _count in (("BslDX", 2), ("BslQX", 4)):
    threeEqualRegInstX("bsl", _Name, "SimdAluOp", ("uint64_t",), _count,
                       bslCode, True)
# CLS
# Counts how many bits directly below the top bit have the same sign as
# the source: the source is shifted left once, then repeatedly while its
# sign still matches the original, up to (element width - 1) times.
clsCode = '''
    unsigned count = 0;
    if (srcElem1 < 0) {
        srcElem1 <<= 1;
        while (srcElem1 < 0 && count < sizeof(Element) * 8 - 1) {
            count++;
            srcElem1 <<= 1;
        }
    } else {
        srcElem1 <<= 1;
        while (srcElem1 >= 0 && count < sizeof(Element) * 8 - 1) {
            count++;
            srcElem1 <<= 1;
        }
    }
    destElem = count;
'''
for _Name, _count in (("ClsDX", 2), ("ClsQX", 4)):
    twoEqualRegInstX("cls", _Name, "SimdAluOp", smallSignedTypes, _count,
                     clsCode)
# CLZ
# Counts leading zero bits by shifting the (signed) source left until it
# becomes negative, up to the element width.
clzCode = '''
    unsigned count = 0;
    while (srcElem1 >= 0 && count < sizeof(Element) * 8) {
        count++;
        srcElem1 <<= 1;
    }
    destElem = count;
'''
for _Name, _count in (("ClzDX", 2), ("ClzQX", 4)):
    twoEqualRegInstX("clz", _Name, "SimdAluOp", smallSignedTypes, _count,
                     clzCode)
991 # CMEQ (register)
992 cmeqCode = "destElem = (srcElem1 == srcElem2) ? (Element)(-1) : 0;"
993 threeEqualRegInstX("cmeq", "CmeqDX", "SimdCmpOp", unsignedTypes, 2,
994 cmeqCode)
995 threeEqualRegInstX("cmeq", "CmeqQX", "SimdCmpOp", unsignedTypes, 4,
996 cmeqCode)
997 # CMEQ (zero)
998 cmeqZeroCode = "destElem = (srcElem1 == 0) ? (Element)(-1) : 0;"
999 twoEqualRegInstX("cmeq", "CmeqZeroDX", "SimdCmpOp", signedTypes, 2,
1000 cmeqZeroCode)
1001 twoEqualRegInstX("cmeq", "CmeqZeroQX", "SimdCmpOp", signedTypes, 4,
1002 cmeqZeroCode)
# CMGE / CMGT, register and compare-against-zero forms.
# A lane becomes (Element)(-1) when the comparison holds and 0 otherwise.
# CMGE (register)
cmgeCode = "destElem = (srcElem1 >= srcElem2) ? (Element)(-1) : 0;"
# CMGE (zero)
cmgeZeroCode = "destElem = (srcElem1 >= 0) ? (Element)(-1) : 0;"
# CMGT (register)
cmgtCode = "destElem = (srcElem1 > srcElem2) ? (Element)(-1) : 0;"
# CMGT (zero)
cmgtZeroCode = "destElem = (srcElem1 > 0) ? (Element)(-1) : 0;"
for _name, _regClasses, _regCode, _zeroClasses, _zeroCode in (
        ("cmge", ("CmgeDX", "CmgeQX"), cmgeCode,
         ("CmgeZeroDX", "CmgeZeroQX"), cmgeZeroCode),
        ("cmgt", ("CmgtDX", "CmgtQX"), cmgtCode,
         ("CmgtZeroDX", "CmgtZeroQX"), cmgtZeroCode)):
    # Register form first, then the zero form, each as D (2) then Q (4).
    for _cls, _regs in zip(_regClasses, (2, 4)):
        threeEqualRegInstX(_name, _cls, "SimdCmpOp", signedTypes, _regs,
                           _regCode)
    for _cls, _regs in zip(_zeroClasses, (2, 4)):
        twoEqualRegInstX(_name, _cls, "SimdCmpOp", signedTypes, _regs,
                         _zeroCode)
# CMHI (register) / CMHS (register)
# These reuse the ">" and ">=" snippets (cmgtCode, cmgeCode); the only
# difference from CMGT/CMGE is that they are instantiated for unsignedTypes.
for _name, _dCls, _qCls, _code in (
        ("cmhi", "CmhiDX", "CmhiQX", cmgtCode),
        ("cmhs", "CmhsDX", "CmhsQX", cmgeCode)):
    threeEqualRegInstX(_name, _dCls, "SimdCmpOp", unsignedTypes, 2, _code)
    threeEqualRegInstX(_name, _qCls, "SimdCmpOp", unsignedTypes, 4, _code)
# CMLE (zero) / CMLT (zero)
# Signed compare of each lane against zero; true gives (Element)(-1),
# false gives 0.
# CMLE (zero)
cmleZeroCode = "destElem = (srcElem1 <= 0) ? (Element)(-1) : 0;"
# CMLT (zero)
cmltZeroCode = "destElem = (srcElem1 < 0) ? (Element)(-1) : 0;"
for _name, _dCls, _qCls, _code in (
        ("cmle", "CmleZeroDX", "CmleZeroQX", cmleZeroCode),
        ("cmlt", "CmltZeroDX", "CmltZeroQX", cmltZeroCode)):
    twoEqualRegInstX(_name, _dCls, "SimdCmpOp", signedTypes, 2, _code)
    twoEqualRegInstX(_name, _qCls, "SimdCmpOp", signedTypes, 4, _code)
# CMTST (register)
# Bit test: the lane is (Element)(-1) when the two sources share at least
# one set bit, 0 otherwise.
tstCode = "destElem = (srcElem1 & srcElem2) ? (Element)(-1) : 0;"
for _cls, _regs in (("CmtstDX", 2), ("CmtstQX", 4)):
    threeEqualRegInstX("cmtst", _cls, "SimdAluOp", unsignedTypes, _regs,
                       tstCode)
# CNT
# Population count: add up the low bit while shifting the element right,
# until it is zero (the count bound is the element width in bits).
cntCode = '''
unsigned count = 0;
while (srcElem1 && count < sizeof(Element) * 8) {
count += srcElem1 & 0x1;
srcElem1 >>= 1;
}
destElem = count;
'''
for _cls, _regs in (("CntDX", 2), ("CntQX", 4)):
    twoEqualRegInstX("cnt", _cls, "SimdAluOp", ("uint8_t",), _regs, cntCode)
# DUP (element)
# The per-element operation is a plain copy; the isDup / byElem / scalar
# keywords are interpreted by twoEqualRegInstX (not visible here).
dupCode = "destElem = srcElem1;"
for _cls, _types, _regs, _extra in (
        ("DupElemDX", smallUnsignedTypes, 2, {}),
        ("DupElemQX", unsignedTypes, 4, {}),
        ("DupElemScX", unsignedTypes, 4, {"scalar": True})):
    twoEqualRegInstX("dup", _cls, "SimdMiscOp", _types, _regs, dupCode,
                     isDup=True, byElem=True, **_extra)
# DUP (general register)
# The trailing 'W' / 'X' tag is handed to dupGprInstX unchanged.
for _cls, _types, _regs, _gprTag in (
        ("DupGprWDX", smallUnsignedTypes, 2, 'W'),
        ("DupGprWQX", smallUnsignedTypes, 4, 'W'),
        ("DupGprXQX", ("uint64_t",), 4, 'X')):
    dupGprInstX("dup", _cls, "SimdMiscOp", _types, _regs, _gprTag)
# EOR
# Lane-wise bitwise exclusive OR of the two sources.
eorCode = "destElem = srcElem1 ^ srcElem2;"
for _cls, _regs in (("EorDX", 2), ("EorQX", 4)):
    threeEqualRegInstX("eor", _cls, "SimdAluOp", ("uint64_t",), _regs,
                       eorCode)
# EXT
# Byte extract: destination element i is taken from position (i + imm) of
# the sequence srcReg1 followed by srcReg2.  An index that falls beyond both
# registers sets an UndefinedInstruction fault instead of writing the element.
extCode = '''
for (unsigned i = 0; i < eCount; i++) {
unsigned index = i + imm;
if (index < eCount) {
destReg.elements[i] = srcReg1.elements[index];
} else {
index -= eCount;
if (index >= eCount) {
fault = std::make_shared<UndefinedInstruction>(
machInst, false, mnemonic);
} else {
destReg.elements[i] = srcReg2.elements[index];
}
}
}
'''
# Pass the lowercase mnemonic "ext" (previously "Ext") so this matches every
# other definition in this section, which all use lowercase mnemonics.
# NOTE(review): assumes the first argument is only used as the instruction's
# printed mnemonic -- confirm against extInstX.
extInstX("ext", "ExtDX", "SimdMiscOp", ("uint8_t",), 2, extCode)
extInstX("ext", "ExtQX", "SimdMiscOp", ("uint8_t",), 4, extCode)
# FABD
# fpOp is the wrapper shared by the FP snippets below: it copies FpscrExc
# into a local fpscr, assigns the substituted expression to destElem, and
# writes fpscr back to FpscrExc afterwards.
fpOp = '''
FPSCR fpscr = (FPSCR) FpscrExc;
destElem = %s;
FpscrExc = fpscr;
'''
# Absolute difference: fplibAbs applied to the fplibSub result.
fabdCode = fpOp % "fplibAbs<Element>(fplibSub(srcElem1, srcElem2, fpscr))"
for _cls, _types, _regs, _extra in (
        ("FabdDX", smallFloatTypes, 2, {}),
        ("FabdQX", floatTypes, 4, {}),
        ("FabdScX", floatTypes, 4, {"scalar": True})):
    threeEqualRegInstX("fabd", _cls, "SimdFloatAddOp", _types, _regs,
                       fabdCode, **_extra)
# FABS
# Floating-point absolute value of each element via fplibAbs.
fabsCode = fpOp % "fplibAbs<Element>(srcElem1)"
# Pass the lowercase mnemonic "fabs" (previously "Abs"): every other
# definition in this section uses the lowercase assembly mnemonic as its
# first argument.
# NOTE(review): assumes the first argument is only used as the instruction's
# printed mnemonic -- confirm against twoEqualRegInstX.
for _cls, _types, _regs in (("FabsDX", smallFloatTypes, 2),
                            ("FabsQX", floatTypes, 4)):
    twoEqualRegInstX("fabs", _cls, "SimdFloatAluOp", _types, _regs,
                     fabsCode)
# FACGE / FACGT
# Absolute compares: both operands pass through fplibAbs before the fplib
# comparison; a true result gives -1 and a false result gives 0.
fpCmpAbsOp = fpOp % ("fplibCompare%s<Element>(fplibAbs<Element>(srcElem1),"
                     " fplibAbs<Element>(srcElem2), fpscr) ? -1 : 0")
# FACGE
facgeCode = fpCmpAbsOp % "GE"
# FACGT
facgtCode = fpCmpAbsOp % "GT"
for _name, _dCls, _qCls, _scCls, _code in (
        ("facge", "FacgeDX", "FacgeQX", "FacgeScX", facgeCode),
        ("facgt", "FacgtDX", "FacgtQX", "FacgtScX", facgtCode)):
    threeEqualRegInstX(_name, _dCls, "SimdFloatCmpOp", smallFloatTypes, 2,
                       _code)
    threeEqualRegInstX(_name, _qCls, "SimdFloatCmpOp", floatTypes, 4, _code)
    threeEqualRegInstX(_name, _scCls, "SimdFloatCmpOp", floatTypes, 4, _code,
                       scalar=True)
# FADD
# fpBinOp is the template for two-operand fplib calls; the remaining %s is
# filled with the fplib function suffix ("Add" here).
fpBinOp = fpOp % "fplib%s<Element>(srcElem1, srcElem2, fpscr)"
faddCode = fpBinOp % "Add"
for _cls, _types, _regs in (("FaddDX", smallFloatTypes, 2),
                            ("FaddQX", floatTypes, 4)):
    threeEqualRegInstX("fadd", _cls, "SimdFloatAddOp", _types, _regs,
                       faddCode)
# FADDP (scalar)
for _cls, _types, _regs in (("FaddpScDX", ("uint32_t",), 2),
                            ("FaddpScQX", ("uint64_t",), 4)):
    twoRegPairwiseScInstX("faddp", _cls, "SimdFloatAddOp", _types, _regs,
                          faddCode)
# FADDP (vector)
for _cls, _types, _regs in (("FaddpDX", smallFloatTypes, 2),
                            ("FaddpQX", floatTypes, 4)):
    threeEqualRegInstX("faddp", _cls, "SimdFloatAddOp", _types, _regs,
                       faddCode, pairwise=True)
# FCMEQ (register)
# fpCmpOp compares the two source elements; the remaining %s selects the
# fplibCompare variant.  True gives -1, false gives 0.
fpCmpOp = fpOp % ("fplibCompare%s<Element>(srcElem1, srcElem2, fpscr) ?"
                  " -1 : 0")
fcmeqCode = fpCmpOp % "EQ"
# FCMEQ (zero)
# fpCmpZeroOp is the same comparison with a literal 0 as second operand.
fpCmpZeroOp = fpOp % "fplibCompare%s<Element>(srcElem1, 0, fpscr) ? -1 : 0"
fcmeqZeroCode = fpCmpZeroOp % "EQ"
# Register form (three-register generator) first, then the zero form
# (two-register generator); each as D, Q and scalar variants.
for _make, _dCls, _qCls, _scCls, _code in (
        (threeEqualRegInstX, "FcmeqDX", "FcmeqQX", "FcmeqScX", fcmeqCode),
        (twoEqualRegInstX, "FcmeqZeroDX", "FcmeqZeroQX", "FcmeqZeroScX",
         fcmeqZeroCode)):
    _make("fcmeq", _dCls, "SimdFloatCmpOp", smallFloatTypes, 2, _code)
    _make("fcmeq", _qCls, "SimdFloatCmpOp", floatTypes, 4, _code)
    _make("fcmeq", _scCls, "SimdFloatCmpOp", floatTypes, 4, _code,
          scalar=True)
# FCMGE (register)
fcmgeCode = fpCmpOp % "GE"
# FCMGE (zero)
fcmgeZeroCode = fpCmpZeroOp % "GE"
# Register form (three-register generator) first, then the zero form
# (two-register generator); each as D, Q and scalar variants.
for _make, _dCls, _qCls, _scCls, _code in (
        (threeEqualRegInstX, "FcmgeDX", "FcmgeQX", "FcmgeScX", fcmgeCode),
        (twoEqualRegInstX, "FcmgeZeroDX", "FcmgeZeroQX", "FcmgeZeroScX",
         fcmgeZeroCode)):
    _make("fcmge", _dCls, "SimdFloatCmpOp", smallFloatTypes, 2, _code)
    _make("fcmge", _qCls, "SimdFloatCmpOp", floatTypes, 4, _code)
    _make("fcmge", _scCls, "SimdFloatCmpOp", floatTypes, 4, _code,
          scalar=True)
# FCMGT (register)
fcmgtCode = fpCmpOp % "GT"
# FCMGT (zero)
fcmgtZeroCode = fpCmpZeroOp % "GT"
# Register form (three-register generator) first, then the zero form
# (two-register generator); each as D, Q and scalar variants.
for _make, _dCls, _qCls, _scCls, _code in (
        (threeEqualRegInstX, "FcmgtDX", "FcmgtQX", "FcmgtScX", fcmgtCode),
        (twoEqualRegInstX, "FcmgtZeroDX", "FcmgtZeroQX", "FcmgtZeroScX",
         fcmgtZeroCode)):
    _make("fcmgt", _dCls, "SimdFloatCmpOp", smallFloatTypes, 2, _code)
    _make("fcmgt", _qCls, "SimdFloatCmpOp", floatTypes, 4, _code)
    _make("fcmgt", _scCls, "SimdFloatCmpOp", floatTypes, 4, _code,
          scalar=True)
# FCMLE (zero) / FCMLT (zero)
# The operands are swapped (0 on the left), so "0 GE x" implements x <= 0
# and "0 GT x" implements x < 0.
fpCmpRevZeroOp = fpOp % ("fplibCompare%s<Element>(0, srcElem1, fpscr) ?"
                         " -1 : 0")
# FCMLE (zero)
fcmleZeroCode = fpCmpRevZeroOp % "GE"
# FCMLT (zero)
fcmltZeroCode = fpCmpRevZeroOp % "GT"
for _name, _dCls, _qCls, _scCls, _code in (
        ("fcmle", "FcmleZeroDX", "FcmleZeroQX", "FcmleZeroScX",
         fcmleZeroCode),
        ("fcmlt", "FcmltZeroDX", "FcmltZeroQX", "FcmltZeroScX",
         fcmltZeroCode)):
    twoEqualRegInstX(_name, _dCls, "SimdFloatCmpOp", smallFloatTypes, 2,
                     _code)
    twoEqualRegInstX(_name, _qCls, "SimdFloatCmpOp", floatTypes, 4, _code)
    twoEqualRegInstX(_name, _scCls, "SimdFloatCmpOp", floatTypes, 4, _code,
                     scalar=True)
# FCVTAS / FCVTAU
# fcvtCode is the shared fplibFPToFixed template with three open slots.
# Within this section the first slot is "0" or "imm", the second is "false"
# for the signed mnemonics and "true" for the unsigned ones, and the third
# is the rounding mode.
# NOTE(review): presumably fraction bits / unsigned flag / rounding --
# confirm against the fplibFPToFixed declaration.
fcvtCode = fpOp % ("fplibFPToFixed<Element, Element>("
                   "srcElem1, %s, %s, %s, fpscr)")
# FCVTAS
fcvtasCode = fcvtCode % ("0", "false", "FPRounding_TIEAWAY")
# FCVTAU
fcvtauCode = fcvtCode % ("0", "true", "FPRounding_TIEAWAY")
for _name, _dCls, _qCls, _scCls, _code in (
        ("fcvtas", "FcvtasDX", "FcvtasQX", "FcvtasScX", fcvtasCode),
        ("fcvtau", "FcvtauDX", "FcvtauQX", "FcvtauScX", fcvtauCode)):
    twoEqualRegInstX(_name, _dCls, "SimdCvtOp", smallFloatTypes, 2, _code)
    twoEqualRegInstX(_name, _qCls, "SimdCvtOp", floatTypes, 4, _code)
    twoEqualRegInstX(_name, _scCls, "SimdCvtOp", floatTypes, 4, _code,
                     scalar=True)
# FCVTL, FCVTL2
# Widening conversion from Element to BigElement using the rounding mode
# derived from fpscr (FPCRRounding).
fcvtlCode = fpOp % ("fplibConvert<Element, BigElement>("
                    "srcElem1, FPCRRounding(fpscr), fpscr)")
for _cls, _extra in (("FcvtlX", {}), ("Fcvtl2X", {"hi": True})):
    twoRegLongInstX("fcvtl", _cls, "SimdCvtOp", ("uint16_t", "uint32_t"),
                    fcvtlCode, **_extra)
# FCVTMS / FCVTMU
# Same fcvtCode template with FPRounding_NEGINF; the second slot is "false"
# for FCVTMS and "true" for FCVTMU.
# FCVTMS
fcvtmsCode = fcvtCode % ("0", "false", "FPRounding_NEGINF")
# FCVTMU
fcvtmuCode = fcvtCode % ("0", "true", "FPRounding_NEGINF")
for _name, _dCls, _qCls, _scCls, _code in (
        ("fcvtms", "FcvtmsDX", "FcvtmsQX", "FcvtmsScX", fcvtmsCode),
        ("fcvtmu", "FcvtmuDX", "FcvtmuQX", "FcvtmuScX", fcvtmuCode)):
    twoEqualRegInstX(_name, _dCls, "SimdCvtOp", smallFloatTypes, 2, _code)
    twoEqualRegInstX(_name, _qCls, "SimdCvtOp", floatTypes, 4, _code)
    twoEqualRegInstX(_name, _scCls, "SimdCvtOp", floatTypes, 4, _code,
                     scalar=True)
# FCVTN, FCVTN2
# Narrowing conversion from BigElement to Element using the rounding mode
# derived from fpscr (FPCRRounding).
fcvtnCode = fpOp % ("fplibConvert<BigElement, Element>("
                    "srcElem1, FPCRRounding(fpscr), fpscr)")
for _cls, _extra in (("FcvtnX", {}), ("Fcvtn2X", {"hi": True})):
    twoRegNarrowInstX("fcvtn", _cls, "SimdCvtOp", ("uint16_t", "uint32_t"),
                      fcvtnCode, **_extra)
# FCVTNS / FCVTNU
# Same fcvtCode template with FPRounding_TIEEVEN; the second slot is "false"
# for FCVTNS and "true" for FCVTNU.
# FCVTNS
fcvtnsCode = fcvtCode % ("0", "false", "FPRounding_TIEEVEN")
# FCVTNU
fcvtnuCode = fcvtCode % ("0", "true", "FPRounding_TIEEVEN")
for _name, _dCls, _qCls, _scCls, _code in (
        ("fcvtns", "FcvtnsDX", "FcvtnsQX", "FcvtnsScX", fcvtnsCode),
        ("fcvtnu", "FcvtnuDX", "FcvtnuQX", "FcvtnuScX", fcvtnuCode)):
    twoEqualRegInstX(_name, _dCls, "SimdCvtOp", smallFloatTypes, 2, _code)
    twoEqualRegInstX(_name, _qCls, "SimdCvtOp", floatTypes, 4, _code)
    twoEqualRegInstX(_name, _scCls, "SimdCvtOp", floatTypes, 4, _code,
                     scalar=True)
# FCVTPS / FCVTPU
# Same fcvtCode template with FPRounding_POSINF; the second slot is "false"
# for FCVTPS and "true" for FCVTPU.
# FCVTPS
fcvtpsCode = fcvtCode % ("0", "false", "FPRounding_POSINF")
# FCVTPU
fcvtpuCode = fcvtCode % ("0", "true", "FPRounding_POSINF")
for _name, _dCls, _qCls, _scCls, _code in (
        ("fcvtps", "FcvtpsDX", "FcvtpsQX", "FcvtpsScX", fcvtpsCode),
        ("fcvtpu", "FcvtpuDX", "FcvtpuQX", "FcvtpuScX", fcvtpuCode)):
    twoEqualRegInstX(_name, _dCls, "SimdCvtOp", smallFloatTypes, 2, _code)
    twoEqualRegInstX(_name, _qCls, "SimdCvtOp", floatTypes, 4, _code)
    twoEqualRegInstX(_name, _scCls, "SimdCvtOp", floatTypes, 4, _code,
                     scalar=True)
# FCVTXN, FCVTXN2
# Narrowing conversion from BigElement to Element with the fixed rounding
# mode FPRounding_ODD.
fcvtxnCode = fpOp % ("fplibConvert<BigElement, Element>("
                     "srcElem1, FPRounding_ODD, fpscr)")
for _cls, _extra in (("FcvtxnX", {}),
                     ("Fcvtxn2X", {"hi": True}),
                     ("FcvtxnScX", {"scalar": True})):
    twoRegNarrowInstX("fcvtxn", _cls, "SimdCvtOp", smallFloatTypes,
                      fcvtxnCode, **_extra)
# FCVTZS (fixed-point)
# First template slot is the immediate; rounding is FPRounding_ZERO.
fcvtzsCode = fcvtCode % ("imm", "false", "FPRounding_ZERO")
# FCVTZS (integer)
# Same conversion with the first slot fixed at "0" and no immediate.
fcvtzsIntCode = fcvtCode % ("0", "false", "FPRounding_ZERO")
# Fixed-point variants first (they carry hasImm=True), then the integer ones.
for _dCls, _qCls, _scCls, _code, _extra in (
        ("FcvtzsFixedDX", "FcvtzsFixedQX", "FcvtzsFixedScX", fcvtzsCode,
         {"hasImm": True}),
        ("FcvtzsIntDX", "FcvtzsIntQX", "FcvtzsIntScX", fcvtzsIntCode, {})):
    twoEqualRegInstX("fcvtzs", _dCls, "SimdCvtOp", smallFloatTypes, 2,
                     _code, **_extra)
    twoEqualRegInstX("fcvtzs", _qCls, "SimdCvtOp", floatTypes, 4, _code,
                     **_extra)
    twoEqualRegInstX("fcvtzs", _scCls, "SimdCvtOp", floatTypes, 4, _code,
                     scalar=True, **_extra)
# FCVTZU (fixed-point)
# First template slot is the immediate; rounding is FPRounding_ZERO and the
# second slot is "true".
fcvtzuCode = fcvtCode % ("imm", "true", "FPRounding_ZERO")
# FCVTZU (integer)
# Same conversion with the first slot fixed at "0" and no immediate.
fcvtzuIntCode = fcvtCode % ("0", "true", "FPRounding_ZERO")
# Fixed-point variants first (they carry hasImm=True), then the integer ones.
for _dCls, _qCls, _scCls, _code, _extra in (
        ("FcvtzuFixedDX", "FcvtzuFixedQX", "FcvtzuFixedScX", fcvtzuCode,
         {"hasImm": True}),
        ("FcvtzuIntDX", "FcvtzuIntQX", "FcvtzuIntScX", fcvtzuIntCode, {})):
    twoEqualRegInstX("fcvtzu", _dCls, "SimdCvtOp", smallFloatTypes, 2,
                     _code, **_extra)
    twoEqualRegInstX("fcvtzu", _qCls, "SimdCvtOp", floatTypes, 4, _code,
                     **_extra)
    twoEqualRegInstX("fcvtzu", _scCls, "SimdCvtOp", floatTypes, 4, _code,
                     scalar=True, **_extra)
# FDIV
# Element-wise fplibDiv through the two-operand template.
fdivCode = fpBinOp % "Div"
for _cls, _types, _regs in (("FdivDX", smallFloatTypes, 2),
                            ("FdivQX", floatTypes, 4)):
    threeEqualRegInstX("fdiv", _cls, "SimdFloatDivOp", _types, _regs,
                       fdivCode)
# FMAX family: FMAX, FMAXNM, FMAXNMP, FMAXNMV, FMAXP, FMAXV.
# The element-wise and pairwise forms use the two-operand template
# (fplibMax / fplibMaxNum on srcElem1, srcElem2); the across-vector forms
# use fpAcrossOp, which combines destElem with srcElem1.
fmaxCode = fpBinOp % "Max"
fmaxnmCode = fpBinOp % "MaxNum"
fpAcrossOp = fpOp % "fplib%s<Element>(destElem, srcElem1, fpscr)"
fmaxnmAcrossCode = fpAcrossOp % "MaxNum"
fmaxAcrossCode = fpAcrossOp % "Max"
# FMAX
for _cls, _types, _regs in (("FmaxDX", smallFloatTypes, 2),
                            ("FmaxQX", floatTypes, 4)):
    threeEqualRegInstX("fmax", _cls, "SimdFloatCmpOp", _types, _regs,
                       fmaxCode)
# FMAXNM
for _cls, _types, _regs in (("FmaxnmDX", smallFloatTypes, 2),
                            ("FmaxnmQX", floatTypes, 4)):
    threeEqualRegInstX("fmaxnm", _cls, "SimdFloatCmpOp", _types, _regs,
                       fmaxnmCode)
# FMAXNMP (scalar)
for _cls, _types, _regs in (("FmaxnmpScDX", ("uint32_t",), 2),
                            ("FmaxnmpScQX", ("uint64_t",), 4)):
    twoRegPairwiseScInstX("fmaxnmp", _cls, "SimdFloatCmpOp", _types, _regs,
                          fmaxnmCode)
# FMAXNMP (vector)
for _cls, _types, _regs in (("FmaxnmpDX", smallFloatTypes, 2),
                            ("FmaxnmpQX", floatTypes, 4)):
    threeEqualRegInstX("fmaxnmp", _cls, "SimdFloatCmpOp", _types, _regs,
                       fmaxnmCode, pairwise=True)
# FMAXNMV
# Note: SimdFloatCmpOp can be a bit optimistic here
twoRegAcrossInstX("fmaxnmv", "FmaxnmvQX", "SimdFloatCmpOp", ("uint32_t",),
                  4, fmaxnmAcrossCode)
# FMAXP (scalar)
for _cls, _types, _regs in (("FmaxpScDX", ("uint32_t",), 2),
                            ("FmaxpScQX", ("uint64_t",), 4)):
    twoRegPairwiseScInstX("fmaxp", _cls, "SimdFloatCmpOp", _types, _regs,
                          fmaxCode)
# FMAXP (vector)
for _cls, _types, _regs in (("FmaxpDX", smallFloatTypes, 2),
                            ("FmaxpQX", floatTypes, 4)):
    threeEqualRegInstX("fmaxp", _cls, "SimdFloatCmpOp", _types, _regs,
                       fmaxCode, pairwise=True)
# FMAXV
# Note: SimdFloatCmpOp can be a bit optimistic here
twoRegAcrossInstX("fmaxv", "FmaxvQX", "SimdFloatCmpOp", ("uint32_t",), 4,
                  fmaxAcrossCode)
# FMIN family: FMIN, FMINNM, FMINNMP, FMINNMV, FMINP, FMINV.
# The element-wise and pairwise forms use the two-operand template
# (fplibMin / fplibMinNum); the across-vector forms use fpAcrossOp.
fminCode = fpBinOp % "Min"
fminnmCode = fpBinOp % "MinNum"
fminnmAcrossCode = fpAcrossOp % "MinNum"
fminAcrossCode = fpAcrossOp % "Min"
# FMIN
for _cls, _types, _regs in (("FminDX", smallFloatTypes, 2),
                            ("FminQX", floatTypes, 4)):
    threeEqualRegInstX("fmin", _cls, "SimdFloatCmpOp", _types, _regs,
                       fminCode)
# FMINNM
for _cls, _types, _regs in (("FminnmDX", smallFloatTypes, 2),
                            ("FminnmQX", floatTypes, 4)):
    threeEqualRegInstX("fminnm", _cls, "SimdFloatCmpOp", _types, _regs,
                       fminnmCode)
# FMINNMP (scalar)
for _cls, _types, _regs in (("FminnmpScDX", ("uint32_t",), 2),
                            ("FminnmpScQX", ("uint64_t",), 4)):
    twoRegPairwiseScInstX("fminnmp", _cls, "SimdFloatCmpOp", _types, _regs,
                          fminnmCode)
# FMINNMP (vector)
for _cls, _types, _regs in (("FminnmpDX", smallFloatTypes, 2),
                            ("FminnmpQX", floatTypes, 4)):
    threeEqualRegInstX("fminnmp", _cls, "SimdFloatCmpOp", _types, _regs,
                       fminnmCode, pairwise=True)
# FMINNMV
# Note: SimdFloatCmpOp can be a bit optimistic here
twoRegAcrossInstX("fminnmv", "FminnmvQX", "SimdFloatCmpOp", ("uint32_t",),
                  4, fminnmAcrossCode)
# FMINP (scalar)
for _cls, _types, _regs in (("FminpScDX", ("uint32_t",), 2),
                            ("FminpScQX", ("uint64_t",), 4)):
    twoRegPairwiseScInstX("fminp", _cls, "SimdFloatCmpOp", _types, _regs,
                          fminCode)
# FMINP (vector)
for _cls, _types, _regs in (("FminpDX", smallFloatTypes, 2),
                            ("FminpQX", floatTypes, 4)):
    threeEqualRegInstX("fminp", _cls, "SimdFloatCmpOp", _types, _regs,
                       fminCode, pairwise=True)
# FMINV
# Note: SimdFloatCmpOp can be a bit optimistic here
twoRegAcrossInstX("fminv", "FminvQX", "SimdFloatCmpOp", ("uint32_t",), 4,
                  fminAcrossCode)
# FMLA (by element) and FMLA (vector)
# Multiply-accumulate: fplibMulAdd with destElem as the addend and the two
# sources as the multiplicands.  The positional True presumably marks
# destElem as an input -- confirm against threeEqualRegInstX.
fmlaCode = fpOp % ("fplibMulAdd<Element>("
                   "destElem, srcElem1, srcElem2, fpscr)")
# By-element variants come first, then the plain vector variants.
for _cls, _types, _regs, _extra in (
        ("FmlaElemDX", smallFloatTypes, 2, {"byElem": True}),
        ("FmlaElemQX", floatTypes, 4, {"byElem": True}),
        ("FmlaElemScX", floatTypes, 4, {"byElem": True, "scalar": True}),
        ("FmlaDX", smallFloatTypes, 2, {}),
        ("FmlaQX", floatTypes, 4, {})):
    threeEqualRegInstX("fmla", _cls, "SimdFloatMultAccOp", _types, _regs,
                       fmlaCode, True, **_extra)
# FMLS (by element) and FMLS (vector)
# Multiply-subtract: same fplibMulAdd call as FMLA, but srcElem1 is negated
# with fplibNeg first.  The positional True presumably marks destElem as an
# input -- confirm against threeEqualRegInstX.
fmlsCode = fpOp % ("fplibMulAdd<Element>(destElem,"
                   " fplibNeg<Element>(srcElem1), srcElem2, fpscr)")
# By-element variants come first, then the plain vector variants.
for _cls, _types, _regs, _extra in (
        ("FmlsElemDX", smallFloatTypes, 2, {"byElem": True}),
        ("FmlsElemQX", floatTypes, 4, {"byElem": True}),
        ("FmlsElemScX", floatTypes, 4, {"byElem": True, "scalar": True}),
        ("FmlsDX", smallFloatTypes, 2, {}),
        ("FmlsQX", floatTypes, 4, {})):
    threeEqualRegInstX("fmls", _cls, "SimdFloatMultAccOp", _types, _regs,
                       fmlsCode, True, **_extra)
# FMOV
# Writes the immediate into each destination element.
fmovCode = 'destElem = imm;'
for _cls, _types, _regs in (("FmovDX", smallFloatTypes, 2),
                            ("FmovQX", floatTypes, 4)):
    oneRegImmInstX("fmov", _cls, "SimdMiscOp", _types, _regs, fmovCode)
# FMUL (by element) and FMUL (vector)
# Element-wise fplibMul through the two-operand template.
fmulCode = fpBinOp % "Mul"
# By-element variants come first, then the plain vector variants.
for _cls, _types, _regs, _extra in (
        ("FmulElemDX", smallFloatTypes, 2, {"byElem": True}),
        ("FmulElemQX", floatTypes, 4, {"byElem": True}),
        ("FmulElemScX", floatTypes, 4, {"byElem": True, "scalar": True}),
        ("FmulDX", smallFloatTypes, 2, {}),
        ("FmulQX", floatTypes, 4, {})):
    threeEqualRegInstX("fmul", _cls, "SimdFloatMultOp", _types, _regs,
                       fmulCode, **_extra)
# FMULX and FMULX (by element)
# Element-wise fplibMulX through the two-operand template.
fmulxCode = fpBinOp % "MulX"
# Vector/scalar variants come first, then the by-element variants.
for _cls, _types, _regs, _extra in (
        ("FmulxDX", smallFloatTypes, 2, {}),
        ("FmulxQX", floatTypes, 4, {}),
        ("FmulxScX", floatTypes, 4, {"scalar": True}),
        ("FmulxElemDX", smallFloatTypes, 2, {"byElem": True}),
        ("FmulxElemQX", floatTypes, 4, {"byElem": True}),
        ("FmulxElemScX", floatTypes, 4, {"byElem": True, "scalar": True})):
    threeEqualRegInstX("fmulx", _cls, "SimdFloatMultOp", _types, _regs,
                       fmulxCode, **_extra)
# FNEG
# Floating-point negation of each element via fplibNeg.
fnegCode = fpOp % "fplibNeg<Element>(srcElem1)"
# Pass the lowercase mnemonic "fneg" (previously "Neg"): every other
# definition in this section uses the lowercase assembly mnemonic as its
# first argument.
# NOTE(review): assumes the first argument is only used as the instruction's
# printed mnemonic -- confirm against twoEqualRegInstX.
for _cls, _types, _regs in (("FnegDX", smallFloatTypes, 2),
                            ("FnegQX", floatTypes, 4)):
    twoEqualRegInstX("fneg", _cls, "SimdFloatAluOp", _types, _regs,
                     fnegCode)
# FRECPE
# One-operand reciprocal estimate (fplibRecipEstimate).
frecpeCode = fpOp % "fplibRecipEstimate<Element>(srcElem1, fpscr)"
# FRECPS
# Two-operand reciprocal step (fplibRecipStepFused).
frecpsCode = fpBinOp % "RecipStepFused"
# FRECPX
# One-operand fplibRecpX; only a scalar class is generated for it.
frecpxCode = fpOp % "fplibRecpX<Element>(srcElem1, fpscr)"
for _make, _name, _dCls, _qCls, _scCls, _code in (
        (twoEqualRegInstX, "frecpe", "FrecpeDX", "FrecpeQX", "FrecpeScX",
         frecpeCode),
        (threeEqualRegInstX, "frecps", "FrecpsDX", "FrecpsQX", "FrecpsScX",
         frecpsCode)):
    _make(_name, _dCls, "SimdFloatMultAccOp", smallFloatTypes, 2, _code)
    _make(_name, _qCls, "SimdFloatMultAccOp", floatTypes, 4, _code)
    _make(_name, _scCls, "SimdFloatMultAccOp", floatTypes, 4, _code,
          scalar=True)
twoEqualRegInstX("frecpx", "FrecpxX", "SimdFloatMultAccOp", floatTypes, 4,
                 frecpxCode, scalar=True)
# FRINTA, FRINTI, FRINTM, FRINTN, FRINTP, FRINTX, FRINTZ
# All seven share one fplibRoundInt template.  The two substituted arguments
# are the rounding mode and a boolean that is "true" only for FRINTX.
frintCode = fpOp % "fplibRoundInt<Element>(srcElem1, %s, %s, fpscr)"
# FRINTA
frintaCode = frintCode % ("FPRounding_TIEAWAY", "false")
# FRINTI
frintiCode = frintCode % ("FPCRRounding(fpscr)", "false")
# FRINTM
frintmCode = frintCode % ("FPRounding_NEGINF", "false")
# FRINTN
frintnCode = frintCode % ("FPRounding_TIEEVEN", "false")
# FRINTP
frintpCode = frintCode % ("FPRounding_POSINF", "false")
# FRINTX
frintxCode = frintCode % ("FPCRRounding(fpscr)", "true")
# FRINTZ
frintzCode = frintCode % ("FPRounding_ZERO", "false")
for _name, _dCls, _qCls, _code in (
        ("frinta", "FrintaDX", "FrintaQX", frintaCode),
        ("frinti", "FrintiDX", "FrintiQX", frintiCode),
        ("frintm", "FrintmDX", "FrintmQX", frintmCode),
        ("frintn", "FrintnDX", "FrintnQX", frintnCode),
        ("frintp", "FrintpDX", "FrintpQX", frintpCode),
        ("frintx", "FrintxDX", "FrintxQX", frintxCode),
        ("frintz", "FrintzDX", "FrintzQX", frintzCode)):
    twoEqualRegInstX(_name, _dCls, "SimdCvtOp", smallFloatTypes, 2, _code)
    twoEqualRegInstX(_name, _qCls, "SimdCvtOp", floatTypes, 4, _code)
# FRSQRTE
# One-operand reciprocal square-root estimate (fplibRSqrtEstimate).
frsqrteCode = fpOp % "fplibRSqrtEstimate<Element>(srcElem1, fpscr)"
# FRSQRTS
# Two-operand reciprocal square-root step (fplibRSqrtStepFused).
frsqrtsCode = fpBinOp % "RSqrtStepFused"
for _make, _name, _opClass, _dCls, _qCls, _scCls, _code in (
        (twoEqualRegInstX, "frsqrte", "SimdFloatSqrtOp",
         "FrsqrteDX", "FrsqrteQX", "FrsqrteScX", frsqrteCode),
        (threeEqualRegInstX, "frsqrts", "SimdFloatMiscOp",
         "FrsqrtsDX", "FrsqrtsQX", "FrsqrtsScX", frsqrtsCode)):
    _make(_name, _dCls, _opClass, smallFloatTypes, 2, _code)
    _make(_name, _qCls, _opClass, floatTypes, 4, _code)
    _make(_name, _scCls, _opClass, floatTypes, 4, _code, scalar=True)
# FSQRT
# One-operand fplibSqrt.
fsqrtCode = fpOp % "fplibSqrt<Element>(srcElem1, fpscr)"
for _cls, _types, _regs in (("FsqrtDX", smallFloatTypes, 2),
                            ("FsqrtQX", floatTypes, 4)):
    twoEqualRegInstX("fsqrt", _cls, "SimdFloatSqrtOp", _types, _regs,
                     fsqrtCode)
# FSUB
# Element-wise fplibSub through the two-operand template.
fsubCode = fpBinOp % "Sub"
for _cls, _types, _regs in (("FsubDX", smallFloatTypes, 2),
                            ("FsubQX", floatTypes, 4)):
    threeEqualRegInstX("fsub", _cls, "SimdFloatAddOp", _types, _regs,
                       fsubCode)
# INS (element)
insFromVecElemInstX("ins", "InsElemX", "SimdMiscOp", unsignedTypes, 4)
# INS (general register)
# The trailing 'W' / 'X' tag is handed to insFromGprInstX unchanged.
for _cls, _types, _gprTag in (("InsGprWX", smallUnsignedTypes, 'W'),
                              ("InsGprXX", unsignedTypes, 'X')):
    insFromGprInstX("ins", _cls, "SimdMiscOp", _types, 4, _gprTag)
# MLA (by element) and MLA (vector)
# Integer multiply-accumulate into destElem.  The positional True presumably
# marks destElem as an input -- confirm against threeEqualRegInstX.
mlaCode = "destElem += srcElem1 * srcElem2;"
# By-element variants come first, then the plain vector variants.
for _cls, _types, _regs, _extra in (
        ("MlaElemDX", ("uint16_t", "uint32_t"), 2, {"byElem": True}),
        ("MlaElemQX", ("uint16_t", "uint32_t"), 4, {"byElem": True}),
        ("MlaDX", smallUnsignedTypes, 2, {}),
        ("MlaQX", smallUnsignedTypes, 4, {})):
    threeEqualRegInstX("mla", _cls, "SimdMultAccOp", _types, _regs,
                       mlaCode, True, **_extra)
# MLS (by element) and MLS (vector)
# Integer multiply-subtract from destElem.  The positional True presumably
# marks destElem as an input -- confirm against threeEqualRegInstX.
mlsCode = "destElem -= srcElem1 * srcElem2;"
# By-element variants come first, then the plain vector variants.
for _cls, _types, _regs, _extra in (
        ("MlsElemDX", ("uint16_t", "uint32_t"), 2, {"byElem": True}),
        ("MlsElemQX", ("uint16_t", "uint32_t"), 4, {"byElem": True}),
        ("MlsDX", smallUnsignedTypes, 2, {}),
        ("MlsQX", smallUnsignedTypes, 4, {})):
    threeEqualRegInstX("mls", _cls, "SimdMultAccOp", _types, _regs,
                       mlsCode, True, **_extra)
# MOV (element) -> alias to INS (element)
# MOV (from general) -> alias to INS (general register)
# MOV (scalar) -> alias to DUP (element)
# MOV (to general) -> alias to UMOV
# MOV (vector) -> alias to ORR (register)
# MOVI
# Writes the immediate into each (64-bit) destination element.
movImmCode = "destElem = imm;"
for _cls, _regs in (("MoviDX", 2), ("MoviQX", 4)):
    oneRegImmInstX("movi", _cls, "SimdMiscOp", ("uint64_t",), _regs,
                   movImmCode)
# MUL (by element) and MUL (vector)
# Lane-wise integer product of the two sources.
mulCode = "destElem = srcElem1 * srcElem2;"
# By-element variants come first, then the plain vector variants.
for _cls, _types, _regs, _extra in (
        ("MulElemDX", ("uint16_t", "uint32_t"), 2, {"byElem": True}),
        ("MulElemQX", ("uint16_t", "uint32_t"), 4, {"byElem": True}),
        ("MulDX", smallUnsignedTypes, 2, {}),
        ("MulQX", smallUnsignedTypes, 4, {})):
    threeEqualRegInstX("mul", _cls, "SimdMultOp", _types, _regs, mulCode,
                       **_extra)
# MVN
# Bitwise complement of the source element.
mvnCode = "destElem = ~srcElem1;"
for _cls, _regs in (("MvnDX", 2), ("MvnQX", 4)):
    twoEqualRegInstX("mvn", _cls, "SimdAluOp", ("uint64_t",), _regs, mvnCode)
# MVNI
# Bitwise complement of the immediate.
mvniCode = "destElem = ~imm;"
for _cls, _regs in (("MvniDX", 2), ("MvniQX", 4)):
    oneRegImmInstX("mvni", _cls, "SimdAluOp", ("uint64_t",), _regs, mvniCode)
# NEG
# Arithmetic negation of each signed element.
negCode = "destElem = -srcElem1;"
for _cls, _regs in (("NegDX", 2), ("NegQX", 4)):
    twoEqualRegInstX("neg", _cls, "SimdAluOp", signedTypes, _regs, negCode)
# NOT -> alias to MVN
# ORN
# OR of the first source with the complement of the second.
ornCode = "destElem = srcElem1 | ~srcElem2;"
for _cls, _regs in (("OrnDX", 2), ("OrnQX", 4)):
    threeEqualRegInstX("orn", _cls, "SimdAluOp", ("uint64_t",), _regs,
                       ornCode)
# ORR (immediate)
# ORs the immediate into the existing destElem value; the positional True
# presumably marks destElem as an input -- confirm against oneRegImmInstX.
orrImmCode = "destElem |= imm;"
for _cls, _regs in (("OrrImmDX", 2), ("OrrImmQX", 4)):
    oneRegImmInstX("orr", _cls, "SimdAluOp", ("uint64_t",), _regs,
                   orrImmCode, True)
# ORR (register)
orrCode = "destElem = srcElem1 | srcElem2;"
for _cls, _regs in (("OrrDX", 2), ("OrrQX", 4)):
    threeEqualRegInstX("orr", _cls, "SimdAluOp", ("uint64_t",), _regs,
                       orrCode)
# PMUL
# Polynomial (carry-less) multiply: for every set bit j of srcElem2, XOR
# srcElem1 shifted left by j into the result, which is kept at element width.
pmulCode = '''
destElem = 0;
for (unsigned j = 0; j < sizeof(Element) * 8; j++) {
if (bits(srcElem2, j))
destElem ^= srcElem1 << j;
}
'''
for _cls, _regs in (("PmulDX", 2), ("PmulQX", 4)):
    threeEqualRegInstX("pmul", _cls, "SimdMultOp", ("uint8_t",), _regs,
                       pmulCode)
# PMULL, PMULL2
# Note: 64-bit PMULL is not available (Crypto. Extension)
# Widening polynomial multiply: as PMUL, but srcElem1 is cast to BigElement
# before shifting so the product keeps its upper bits.
pmullCode = '''
destElem = 0;
for (unsigned j = 0; j < sizeof(Element) * 8; j++) {
if (bits(srcElem2, j))
destElem ^= (BigElement)srcElem1 << j;
}
'''
# NOTE(review): the hi variant reuses the "pmull" mnemonic, whereas some
# other hi variants in this file (e.g. "raddhn2", "rshrn2") carry a "2"
# suffix -- confirm which convention is intended.
for _cls, _extra in (("PmullX", {}), ("Pmull2X", {"hi": True})):
    threeRegLongInstX("pmull", _cls, "SimdMultOp", ("uint8_t",), pmullCode,
                      **_extra)
# RADDHN, RADDHN2
# Rounding add, keep high half: the sum is formed in BigElement, a rounding
# constant of 1 << (Element width - 1) is added, and the result is shifted
# right by the Element width.
raddhnCode = '''
destElem = ((BigElement)srcElem1 + (BigElement)srcElem2 +
((BigElement)1 << (sizeof(Element) * 8 - 1))) >>
(sizeof(Element) * 8);
'''
for _name, _cls, _extra in (("raddhn", "RaddhnX", {}),
                            ("raddhn2", "Raddhn2X", {"hi": True})):
    threeRegNarrowInstX(_name, _cls, "SimdAddOp", smallUnsignedTypes,
                        raddhnCode, **_extra)
# RBIT
# Bit reversal within each element: bit i of the source (taken from the low
# end of a shrinking copy) is placed at position (width - 1 - i).
rbitCode = '''
destElem = 0;
Element temp = srcElem1;
for (int i = 0; i < 8 * sizeof(Element); i++) {
destElem = destElem | ((temp & 0x1) <<
(8 * sizeof(Element) - 1 - i));
temp >>= 1;
}
'''
for _cls, _regs in (("RbitDX", 2), ("RbitQX", 4)):
    twoEqualRegInstX("rbit", _cls, "SimdAluOp", ("uint8_t",), _regs,
                     rbitCode)
# REV16, REV32, REV64
# The three snippets differ only in the shift that gives the group size in
# bytes (1 << 1, 1 << 2, 1 << 3).  groupSize is the number of elements per
# group, and XOR-ing the index with (groupSize - 1) mirrors it in its group.
# NOTE(review): presumably i is the source element index and j the
# destination index used by twoEqualRegInstX -- confirm there.
_revTemplate = '''
destElem = srcElem1;
unsigned groupSize = ((1 << %d) / sizeof(Element));
unsigned reverseMask = (groupSize - 1);
j = i ^ reverseMask;
'''
# REV16
rev16Code = _revTemplate % 1
# REV32
rev32Code = _revTemplate % 2
# REV64
rev64Code = _revTemplate % 3
for _name, _dCls, _qCls, _types, _code in (
        ("rev16", "Rev16DX", "Rev16QX", ("uint8_t",), rev16Code),
        ("rev32", "Rev32DX", "Rev32QX", ("uint8_t", "uint16_t"), rev32Code),
        ("rev64", "Rev64DX", "Rev64QX", smallUnsignedTypes, rev64Code)):
    twoEqualRegInstX(_name, _dCls, "SimdAluOp", _types, 2, _code)
    twoEqualRegInstX(_name, _qCls, "SimdAluOp", _types, 4, _code)
1746 # RSHRN, RSHRN2
# Rounding shift right by immediate, narrowing:
#  - imm greater than the source bit width: result is 0.
#  - imm non-zero: the rounding bit is bit (imm - 1) of the source; the
#    shift is done as (imm - 1) followed by 1, so a shift by the full source
#    width is never a single shift; the rounding bit is then added.
#  - imm == 0: the source is copied.
1747 rshrnCode = '''
1748 if (imm > sizeof(srcElem1) * 8) {
1749 destElem = 0;
1750 } else if (imm) {
1751 Element rBit = bits(srcElem1, imm - 1);
1752 destElem = ((srcElem1 >> (imm - 1)) >> 1) + rBit;
1753 } else {
1754 destElem = srcElem1;
1755 }
1756 '''
1757 twoRegNarrowInstX("rshrn", "RshrnX", "SimdShiftOp", smallUnsignedTypes,
1758 rshrnCode, hasImm=True)
1759 twoRegNarrowInstX("rshrn2", "Rshrn2X", "SimdShiftOp", smallUnsignedTypes,
1760 rshrnCode, hasImm=True, hi=True)
1761 # RSUBHN, RSUBHN2
# Rounding subtract, keeping the high half: srcElem1 - srcElem2 is formed at
# BigElement width, the rounding constant 1 << (Element bits - 1) is added,
# and the total is shifted right by the Element bit width.
# NOTE(review): registered with smallTypes, while the otherwise parallel
# RADDHN registration uses smallUnsignedTypes -- confirm this is intended.
1762 rsubhnCode = '''
1763 destElem = ((BigElement)srcElem1 - (BigElement)srcElem2 +
1764 ((BigElement)1 << (sizeof(Element) * 8 - 1))) >>
1765 (sizeof(Element) * 8);
1766 '''
1767 threeRegNarrowInstX("rsubhn", "RsubhnX", "SimdAddOp", smallTypes,
1768 rsubhnCode)
1769 threeRegNarrowInstX("rsubhn2", "Rsubhn2X", "SimdAddOp", smallTypes,
1770 rsubhnCode, hi=True)
1771 # SABA
# Absolute difference and accumulate: the larger source minus the smaller
# one is added to the existing destElem. The trailing positional True is
# presumably the helper's read-destination flag (the snippet uses +=) --
# NOTE(review): confirm against threeEqualRegInstX.
1772 abaCode = '''
1773 destElem += (srcElem1 > srcElem2) ? (srcElem1 - srcElem2) :
1774 (srcElem2 - srcElem1);
1775 '''
1776 threeEqualRegInstX("saba", "SabaDX", "SimdAddAccOp", smallSignedTypes, 2,
1777 abaCode, True)
1778 threeEqualRegInstX("saba", "SabaQX", "SimdAddAccOp", smallSignedTypes, 4,
1779 abaCode, True)
1780 # SABAL, SABAL2
# Long form of the above: both operands are cast to BigElement before the
# subtraction, and the difference is accumulated into destElem.
1781 abalCode = '''
1782 destElem += (srcElem1 > srcElem2) ?
1783 ((BigElement)srcElem1 - (BigElement)srcElem2) :
1784 ((BigElement)srcElem2 - (BigElement)srcElem1);
1785 '''
1786 threeRegLongInstX("sabal", "SabalX", "SimdAddAccOp", smallSignedTypes,
1787 abalCode, True)
1788 threeRegLongInstX("sabal2", "Sabal2X", "SimdAddAccOp", smallSignedTypes,
1789 abalCode, True, hi=True)
1790 # SABD
# Absolute difference: the larger source minus the smaller one is assigned
# to destElem (no accumulation).
1791 abdCode = '''
1792 destElem = (srcElem1 > srcElem2) ? (srcElem1 - srcElem2) :
1793 (srcElem2 - srcElem1);
1794 '''
1795 threeEqualRegInstX("sabd", "SabdDX", "SimdAddOp", smallSignedTypes, 2,
1796 abdCode)
1797 threeEqualRegInstX("sabd", "SabdQX", "SimdAddOp", smallSignedTypes, 4,
1798 abdCode)
1799 # SABDL, SABDL2
# Long form: operands are cast to BigElement before subtracting.
# NOTE(review): the snippet overwrites destElem, yet these registrations use
# the "SimdAddAccOp" class and pass the same positional True as the
# accumulating SABAL forms -- confirm whether that is intended.
1800 abdlCode = '''
1801 destElem = (srcElem1 > srcElem2) ?
1802 ((BigElement)srcElem1 - (BigElement)srcElem2) :
1803 ((BigElement)srcElem2 - (BigElement)srcElem1);
1804 '''
1805 threeRegLongInstX("sabdl", "SabdlX", "SimdAddAccOp", smallSignedTypes,
1806 abdlCode, True)
1807 threeRegLongInstX("sabdl2", "Sabdl2X", "SimdAddAccOp", smallSignedTypes,
1808 abdlCode, True, hi=True)
1809 # SADALP
# Add-and-accumulate long: both sources are widened to BigElement, summed,
# and added to the existing destElem. How srcElem1 / srcElem2 are drawn from
# the single source register is decided by twoRegCondenseInstX (not shown).
1810 adalpCode = "destElem += (BigElement)srcElem1 + (BigElement)srcElem2;"
1811 twoRegCondenseInstX("sadalp", "SadalpDX", "SimdAddOp", smallSignedTypes, 2,
1812 adalpCode, True)
1813 twoRegCondenseInstX("sadalp", "SadalpQX", "SimdAddOp", smallSignedTypes, 4,
1814 adalpCode, True)
1815 # SADDL, SADDL2
# addlwCode widens both sources to BigElement and stores their sum. The same
# snippet is reused below by the SADDLP and SADDW registrations; only the
# generator helper differs.
1816 addlwCode = "destElem = (BigElement)srcElem1 + (BigElement)srcElem2;"
1817 threeRegLongInstX("saddl", "SaddlX", "SimdAddAccOp", smallSignedTypes,
1818 addlwCode)
1819 threeRegLongInstX("saddl2", "Saddl2X", "SimdAddAccOp", smallSignedTypes,
1820 addlwCode, hi=True)
1821 # SADDLP
1822 twoRegCondenseInstX("saddlp", "SaddlpDX", "SimdAddOp", smallSignedTypes, 2,
1823 addlwCode)
1824 twoRegCondenseInstX("saddlp", "SaddlpQX", "SimdAddOp", smallSignedTypes, 4,
1825 addlwCode)
1826 # SADDLV
1827 # Note: SimdAddOp can be a bit optimistic here
# Accumulates each widened source element into destElem. The third
# registration covers int32_t elements and additionally passes
# doubleDest=True.
1828 addAcrossLongCode = "destElem += (BigElement)srcElem1;"
1829 twoRegAcrossInstX("saddlv", "SaddlvDX", "SimdAddOp", ("int8_t", "int16_t"),
1830 2, addAcrossLongCode, long=True)
1831 twoRegAcrossInstX("saddlv", "SaddlvQX", "SimdAddOp", ("int8_t", "int16_t"),
1832 4, addAcrossLongCode, long=True)
1833 twoRegAcrossInstX("saddlv", "SaddlvBQX", "SimdAddOp", ("int32_t",), 4,
1834 addAcrossLongCode, doubleDest=True, long=True)
1835 # SADDW, SADDW2
1836 threeRegWideInstX("saddw", "SaddwX", "SimdAddAccOp", smallSignedTypes,
1837 addlwCode)
1838 threeRegWideInstX("saddw2", "Saddw2X", "SimdAddAccOp", smallSignedTypes,
1839 addlwCode, hi=True)
1840 # SCVTF (fixed-point)
# The call text is substituted into fpOp (a template defined outside this
# chunk). The resulting string still contains a %d placeholder for the width
# of the signed-integer cast applied to srcElem1; each registration fills it
# with 32 or 64. The fixed-point form passes imm as the second argument of
# fplibFixedToFP.
1841 scvtfFixedCode = fpOp % ("fplibFixedToFP<Element>((int%d_t) srcElem1, imm,"
1842 " false, FPCRRounding(fpscr), fpscr)")
1843 twoEqualRegInstX("scvtf", "ScvtfFixedDX", "SimdCvtOp", smallFloatTypes, 2,
1844 scvtfFixedCode % 32, hasImm=True)
1845 twoEqualRegInstX("scvtf", "ScvtfFixedSQX", "SimdCvtOp", smallFloatTypes, 4,
1846 scvtfFixedCode % 32, hasImm=True)
1847 twoEqualRegInstX("scvtf", "ScvtfFixedDQX", "SimdCvtOp", ("uint64_t",), 4,
1848 scvtfFixedCode % 64, hasImm=True)
1849 twoEqualRegInstX("scvtf", "ScvtfFixedScSX", "SimdCvtOp", smallFloatTypes,
1850 4, scvtfFixedCode % 32, hasImm=True, scalar=True)
1851 twoEqualRegInstX("scvtf", "ScvtfFixedScDX", "SimdCvtOp", ("uint64_t",), 4,
1852 scvtfFixedCode % 64, hasImm=True, scalar=True)
1853 # SCVTF (integer)
# Identical call, except the second argument is the literal 0 instead of
# imm, and the registrations do not pass hasImm.
1854 scvtfIntCode = fpOp % ("fplibFixedToFP<Element>((int%d_t) srcElem1, 0,"
1855 " false, FPCRRounding(fpscr), fpscr)")
1856 twoEqualRegInstX("scvtf", "ScvtfIntDX", "SimdCvtOp", smallFloatTypes, 2,
1857 scvtfIntCode % 32)
1858 twoEqualRegInstX("scvtf", "ScvtfIntSQX", "SimdCvtOp", smallFloatTypes, 4,
1859 scvtfIntCode % 32)
1860 twoEqualRegInstX("scvtf", "ScvtfIntDQX", "SimdCvtOp", ("uint64_t",), 4,
1861 scvtfIntCode % 64)
1862 twoEqualRegInstX("scvtf", "ScvtfIntScSX", "SimdCvtOp", smallFloatTypes, 4,
1863 scvtfIntCode % 32, scalar=True)
1864 twoEqualRegInstX("scvtf", "ScvtfIntScDX", "SimdCvtOp", ("uint64_t",), 4,
1865 scvtfIntCode % 64, scalar=True)
1866 # SHADD
# Halving add without forming the full-width sum: each operand has its low
# bit cleared and is divided by 2, the halves are added, and carryBit (the
# carry out of adding the two low bits) is added on top.
1867 haddCode = '''
1868 Element carryBit =
1869 (((unsigned)srcElem1 & 0x1) +
1870 ((unsigned)srcElem2 & 0x1)) >> 1;
1871 // Use division instead of a shift to ensure the sign extension works
1872 // right. The compiler will figure out if it can be a shift. Mask the
1873 // inputs so they get truncated correctly.
1874 destElem = (((srcElem1 & ~(Element)1) / 2) +
1875 ((srcElem2 & ~(Element)1) / 2)) + carryBit;
1876 '''
1877 threeEqualRegInstX("shadd", "ShaddDX", "SimdAddOp", smallSignedTypes, 2,
1878 haddCode)
1879 threeEqualRegInstX("shadd", "ShaddQX", "SimdAddOp", smallSignedTypes, 4,
1880 haddCode)
1881 # SHL
# Shift left by immediate. When imm is at least the element bit width the
# shift is performed as (width - 1) followed by 1, so no single shift
# reaches the full width; otherwise it is a plain shift by imm.
1882 shlCode = '''
1883 if (imm >= sizeof(Element) * 8)
1884 destElem = (srcElem1 << (sizeof(Element) * 8 - 1)) << 1;
1885 else
1886 destElem = srcElem1 << imm;
1887 '''
1888 twoEqualRegInstX("shl", "ShlDX", "SimdShiftOp", unsignedTypes, 2, shlCode,
1889 hasImm=True)
1890 twoEqualRegInstX("shl", "ShlQX", "SimdShiftOp", unsignedTypes, 4, shlCode,
1891 hasImm=True)
1892 # SHLL, SHLL2
# Widens the source to BigElement and shifts it left by the Element bit
# width. No immediate is passed to the helper.
# NOTE(review): Shll2X is registered with the mnemonic string "shll" --
# confirm whether "shll2" was intended.
1893 shllCode = "destElem = ((BigElement)srcElem1) << (sizeof(Element) * 8);"
1894 twoRegLongInstX("shll", "ShllX", "SimdShiftOp", smallTypes, shllCode)
1895 twoRegLongInstX("shll", "Shll2X", "SimdShiftOp", smallTypes, shllCode,
1896 hi=True)
1897 # SHRN, SHRN2
# Shift right by immediate, narrowing: imm at or above the source bit width
# yields 0, otherwise the source is shifted right by imm and stored to the
# destination element.
1898 shrnCode = '''
1899 if (imm >= sizeof(srcElem1) * 8) {
1900 destElem = 0;
1901 } else {
1902 destElem = srcElem1 >> imm;
1903 }
1904 '''
1905 twoRegNarrowInstX("shrn", "ShrnX", "SimdShiftOp", smallUnsignedTypes,
1906 shrnCode, hasImm=True)
1907 twoRegNarrowInstX("shrn2", "Shrn2X", "SimdShiftOp", smallUnsignedTypes,
1908 shrnCode, hasImm=True, hi=True)
1909 # SHSUB
# Halving subtract without forming the full-width difference: each operand
# has its low bit cleared and is divided by 2, the halves are subtracted,
# and borrowBit (derived from the difference of the two low bits) is
# subtracted on top.
1910 hsubCode = '''
1911 Element borrowBit =
1912 (((srcElem1 & 0x1) - (srcElem2 & 0x1)) >> 1) & 0x1;
1913 // Use division instead of a shift to ensure the sign extension works
1914 // right. The compiler will figure out if it can be a shift. Mask the
1915 // inputs so they get truncated correctly.
1916 destElem = (((srcElem1 & ~(Element)1) / 2) -
1917 ((srcElem2 & ~(Element)1) / 2)) - borrowBit;
1918 '''
1919 threeEqualRegInstX("shsub", "ShsubDX", "SimdAddOp", smallSignedTypes, 2,
1920 hsubCode)
1921 threeEqualRegInstX("shsub", "ShsubQX", "SimdAddOp", smallSignedTypes, 4,
1922 hsubCode)
1923 # SLI
# Shift left and insert: the low imm bits of the existing destElem are kept
# (destElem & mask(imm)) and srcElem1 << imm is OR-ed in above them. When
# imm is at least the element bit width, destElem is left as it was.
# The positional True is presumably the helper's read-destination flag,
# since the snippet reads destElem -- NOTE(review): confirm.
1924 sliCode = '''
1925 if (imm >= sizeof(Element) * 8)
1926 destElem = destElem;
1927 else
1928 destElem = (srcElem1 << imm) | (destElem & mask(imm));
1929 '''
1930 twoEqualRegInstX("sli", "SliDX", "SimdShiftOp", unsignedTypes, 2, sliCode,
1931 True, hasImm=True)
1932 twoEqualRegInstX("sli", "SliQX", "SimdShiftOp", unsignedTypes, 4, sliCode,
1933 True, hasImm=True)
1934 # SMAX
# maxCode selects the larger of the two source elements. It is shared by the
# element-wise (SMAX) and pairwise=True (SMAXP) registrations.
1935 maxCode = "destElem = (srcElem1 > srcElem2) ? srcElem1 : srcElem2;"
1936 threeEqualRegInstX("smax", "SmaxDX", "SimdCmpOp", smallSignedTypes, 2,
1937 maxCode)
1938 threeEqualRegInstX("smax", "SmaxQX", "SimdCmpOp", smallSignedTypes, 4,
1939 maxCode)
1940 # SMAXP
1941 threeEqualRegInstX("smaxp", "SmaxpDX", "SimdCmpOp", smallSignedTypes, 2,
1942 maxCode, pairwise=True)
1943 threeEqualRegInstX("smaxp", "SmaxpQX", "SimdCmpOp", smallSignedTypes, 4,
1944 maxCode, pairwise=True)
1945 # SMAXV
# Running maximum: destElem is replaced when i == 0 or when the current
# source element is larger. The D-form registration lists only int8_t and
# int16_t; the Q form uses smallSignedTypes.
1946 maxAcrossCode = '''
1947 if (i == 0 || srcElem1 > destElem)
1948 destElem = srcElem1;
1949 '''
1950 twoRegAcrossInstX("smaxv", "SmaxvDX", "SimdCmpOp", ("int8_t", "int16_t"),
1951 2, maxAcrossCode)
1952 twoRegAcrossInstX("smaxv", "SmaxvQX", "SimdCmpOp", smallSignedTypes, 4,
1953 maxAcrossCode)
1954 # SMIN
# minCode selects the smaller of the two source elements. It is shared by
# the element-wise (SMIN) and pairwise=True (SMINP) registrations.
1955 minCode = "destElem = (srcElem1 < srcElem2) ? srcElem1 : srcElem2;"
1956 threeEqualRegInstX("smin", "SminDX", "SimdCmpOp", smallSignedTypes, 2,
1957 minCode)
1958 threeEqualRegInstX("smin", "SminQX", "SimdCmpOp", smallSignedTypes, 4,
1959 minCode)
1960 # SMINP
1961 threeEqualRegInstX("sminp", "SminpDX", "SimdCmpOp", smallSignedTypes, 2,
1962 minCode, pairwise=True)
1963 threeEqualRegInstX("sminp", "SminpQX", "SimdCmpOp", smallSignedTypes, 4,
1964 minCode, pairwise=True)
1965 # SMINV
# Running minimum, mirroring the SMAXV snippet with the comparison reversed.
1966 minAcrossCode = '''
1967 if (i == 0 || srcElem1 < destElem)
1968 destElem = srcElem1;
1969 '''
1970 twoRegAcrossInstX("sminv", "SminvDX", "SimdCmpOp", ("int8_t", "int16_t"),
1971 2, minAcrossCode)
1972 twoRegAcrossInstX("sminv", "SminvQX", "SimdCmpOp", smallSignedTypes, 4,
1973 minAcrossCode)
1974
# NOTE(review): split() is defined outside this chunk; presumably it asks the
# ISA parser to start a new generated 'exec' output section at this point so
# the generated C++ is spread over several files -- confirm.
1975 split('exec')
1976
1977 # SMLAL, SMLAL2 (by element)
# Multiply-accumulate long: both sources are widened to BigElement,
# multiplied, and the product is added to the existing destElem. The
# positional True is presumably the helper's read-destination flag --
# NOTE(review): confirm against threeRegLongInstX.
# NOTE(review): the "2" variants in this group are registered with the same
# mnemonic string as the base form (no "2" suffix) -- confirm intended.
1978 mlalCode = "destElem += (BigElement)srcElem1 * (BigElement)srcElem2;"
1979 threeRegLongInstX("smlal", "SmlalElemX", "SimdMultAccOp",
1980 ("int16_t", "int32_t"), mlalCode, True, byElem=True)
1981 threeRegLongInstX("smlal", "SmlalElem2X", "SimdMultAccOp",
1982 ("int16_t", "int32_t"), mlalCode, True, byElem=True,
1983 hi=True)
1984 # SMLAL, SMLAL2 (vector)
1985 threeRegLongInstX("smlal", "SmlalX", "SimdMultAccOp", smallSignedTypes,
1986 mlalCode, True)
1987 threeRegLongInstX("smlal", "Smlal2X", "SimdMultAccOp", smallSignedTypes,
1988 mlalCode, True, hi=True)
1989 # SMLSL, SMLSL2 (by element)
# Multiply-subtract long: same as above but the product is subtracted from
# destElem.
# NOTE(review): the by-element SMLSL registrations use smallSignedTypes,
# whereas the by-element SMLAL ones list only int16_t and int32_t -- confirm
# whether the extra element type is needed.
1990 mlslCode = "destElem -= (BigElement)srcElem1 * (BigElement)srcElem2;"
1991 threeRegLongInstX("smlsl", "SmlslElemX", "SimdMultAccOp", smallSignedTypes,
1992 mlslCode, True, byElem=True)
1993 threeRegLongInstX("smlsl", "SmlslElem2X", "SimdMultAccOp",
1994 smallSignedTypes, mlslCode, True, byElem=True, hi=True)
1995 # SMLSL, SMLSL2 (vector)
1996 threeRegLongInstX("smlsl", "SmlslX", "SimdMultAccOp", smallSignedTypes,
1997 mlslCode, True)
1998 threeRegLongInstX("smlsl", "Smlsl2X", "SimdMultAccOp", smallSignedTypes,
1999 mlslCode, True, hi=True)
2000 # SMOV
# Two registrations through insToGprInstX, which takes no code snippet here:
# SmovWX for int8_t / int16_t elements with 'W', and SmovXX for
# smallSignedTypes with 'X'.
# NOTE(review): 'W' / 'X' presumably name the destination general-purpose
# register width and the trailing True presumably requests a signed move --
# confirm against insToGprInstX.
2001 insToGprInstX("smov", "SmovWX", "SimdMiscOp", ("int8_t", "int16_t"), 4,
2002 'W', True)
2003 insToGprInstX("smov", "SmovXX", "SimdMiscOp", smallSignedTypes, 4, 'X',
2004 True)
2005 # SMULL, SMULL2 (by element)
# Multiply long: both sources are widened to BigElement and their product is
# assigned to destElem. The same snippet serves the by-element and vector
# registrations.
# NOTE(review): the "2" variants are registered with the mnemonic string
# "smull" -- confirm whether "smull2" was intended.
2006 mullCode = "destElem = (BigElement)srcElem1 * (BigElement)srcElem2;"
2007 threeRegLongInstX("smull", "SmullElemX", "SimdMultOp", smallSignedTypes,
2008 mullCode, byElem=True)
2009 threeRegLongInstX("smull", "SmullElem2X", "SimdMultOp", smallSignedTypes,
2010 mullCode, byElem=True, hi=True)
2011 # SMULL, SMULL2 (vector)
2012 threeRegLongInstX("smull", "SmullX", "SimdMultOp", smallSignedTypes,
2013 mullCode)
2014 threeRegLongInstX("smull", "Smull2X", "SimdMultOp", smallSignedTypes,
2015 mullCode, hi=True)
2016 # SQABS
# Saturating absolute value. If the source equals numeric_limits::min() the
# result is its bitwise complement and fpscr.qc is set; otherwise negative
# sources are negated and non-negative ones copied. FpscrQc is read into a
# local FPSCR at the start and written back at the end.
# The D form is registered for smallSignedTypes; the Q and scalar forms use
# signedTypes.
2017 sqabsCode = '''
2018 FPSCR fpscr = (FPSCR) FpscrQc;
2019 if (srcElem1 == (Element)(std::numeric_limits<Element>::min())) {
2020 fpscr.qc = 1;
2021 destElem = ~srcElem1;
2022 } else if (srcElem1 < 0) {
2023 destElem = -srcElem1;
2024 } else {
2025 destElem = srcElem1;
2026 }
2027 FpscrQc = fpscr;
2028 '''
2029 twoEqualRegInstX("sqabs", "SqabsDX", "SimdAluOp", smallSignedTypes, 2,
2030 sqabsCode)
2031 twoEqualRegInstX("sqabs", "SqabsQX", "SimdAluOp", signedTypes, 4,
2032 sqabsCode)
2033 twoEqualRegInstX("sqabs", "SqabsScX", "SimdAluOp", signedTypes, 4,
2034 sqabsCode, scalar=True)
2035 # SQADD
# Saturating add. The sum is computed at element width first; overflow is
# then detected from signs: both sources share a sign and the result's sign
# differs. On overflow destElem is set to numeric_limits::min(), and when
# the wrapped result was negative 1 is subtracted from that, and fpscr.qc is
# set.
# NOTE(review): both the initial add and the "min() - 1" step rely on
# wrap-around of signed arithmetic -- confirm this is safe for the widest
# element type with the compilers in use.
2036 sqaddCode = '''
2037 destElem = srcElem1 + srcElem2;
2038 FPSCR fpscr = (FPSCR) FpscrQc;
2039 bool negDest = (destElem < 0);
2040 bool negSrc1 = (srcElem1 < 0);
2041 bool negSrc2 = (srcElem2 < 0);
2042 if ((negDest != negSrc1) && (negSrc1 == negSrc2)) {
2043 destElem = std::numeric_limits<Element>::min();
2044 if (negDest)
2045 destElem -= 1;
2046 fpscr.qc = 1;
2047 }
2048 FpscrQc = fpscr;
2049 '''
2050 threeEqualRegInstX("sqadd", "SqaddDX", "SimdAddOp", smallSignedTypes, 2,
2051 sqaddCode)
2052 threeEqualRegInstX("sqadd", "SqaddQX", "SimdAddOp", signedTypes, 4,
2053 sqaddCode)
2054 threeEqualRegInstX("sqadd", "SqaddScX", "SimdAddOp", signedTypes, 4,
2055 sqaddCode, scalar=True)
2056 # SQDMLAL, SQDMLAL2 (by element)
2057 qdmlalCode = '''
2058 FPSCR fpscr = (FPSCR) FpscrQc;
2059 BigElement midElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
2060 Element maxNeg = std::numeric_limits<Element>::min();
2061 Element halfNeg = maxNeg / 2;
2062 if ((srcElem1 == maxNeg && srcElem2 == maxNeg) ||
2063 (srcElem1 == halfNeg && srcElem2 == maxNeg) ||
2064 (srcElem1 == maxNeg && srcElem2 == halfNeg)) {
2065 midElem = ~((BigElement)maxNeg << (sizeof(Element) * 8));
2066 fpscr.qc = 1;
2067 }
2068 bool negPreDest = ltz(destElem);
2069 destElem += midElem;
2070 bool negDest = ltz(destElem);
2071 bool negMid = ltz(midElem);
2072 if (negPreDest == negMid && negMid != negDest) {
2073 destElem = mask(sizeof(BigElement) * 8 - 1);
2074 if (negPreDest)
2075 destElem = ~destElem;
2076 fpscr.qc = 1;
2077 }
2078 FpscrQc = fpscr;
2079 '''
2080 threeRegLongInstX("sqdmlal", "SqdmlalElemX", "SimdMultAccOp",
2081 ("int16_t", "int32_t"), qdmlalCode, True, byElem=True)
2082 threeRegLongInstX("sqdmlal", "SqdmlalElem2X", "SimdMultAccOp",
2083 ("int16_t", "int32_t"), qdmlalCode, True, byElem=True,
2084 hi=True)
2085 threeRegLongInstX("sqdmlal", "SqdmlalElemScX", "SimdMultAccOp",
2086 ("int16_t", "int32_t"), qdmlalCode, True, byElem=True,
2087 scalar=True)
2088 # SQDMLAL, SQDMLAL2 (vector)
2089 threeRegLongInstX("sqdmlal", "SqdmlalX", "SimdMultAccOp",
2090 ("int16_t", "int32_t"), qdmlalCode, True)
2091 threeRegLongInstX("sqdmlal", "Sqdmlal2X", "SimdMultAccOp",
2092 ("int16_t", "int32_t"), qdmlalCode, True, hi=True)
2093 threeRegLongInstX("sqdmlal", "SqdmlalScX", "SimdMultAccOp",
2094 ("int16_t", "int32_t"), qdmlalCode, True, scalar=True)
2095 # SQDMLSL, SQDMLSL2 (by element)
2096 qdmlslCode = '''
2097 FPSCR fpscr = (FPSCR) FpscrQc;
2098 BigElement midElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
2099 Element maxNeg = std::numeric_limits<Element>::min();
2100 Element halfNeg = maxNeg / 2;
2101 if ((srcElem1 == maxNeg && srcElem2 == maxNeg) ||
2102 (srcElem1 == halfNeg && srcElem2 == maxNeg) ||
2103 (srcElem1 == maxNeg && srcElem2 == halfNeg)) {
2104 midElem = ~((BigElement)maxNeg << (sizeof(Element) * 8));
2105 fpscr.qc = 1;
2106 }
2107 bool negPreDest = ltz(destElem);
2108 destElem -= midElem;
2109 bool negDest = ltz(destElem);
2110 bool posMid = ltz((BigElement)-midElem);
2111 if (negPreDest == posMid && posMid != negDest) {
2112 destElem = mask(sizeof(BigElement) * 8 - 1);
2113 if (negPreDest)
2114 destElem = ~destElem;
2115 fpscr.qc = 1;
2116 }
2117 FpscrQc = fpscr;
2118 '''
2119 threeRegLongInstX("sqdmlsl", "SqdmlslElemX", "SimdMultAccOp",
2120 ("int16_t", "int32_t"), qdmlslCode, True, byElem=True)
2121 threeRegLongInstX("sqdmlsl", "SqdmlslElem2X", "SimdMultAccOp",
2122 ("int16_t", "int32_t"), qdmlslCode, True, byElem=True,
2123 hi=True)
2124 threeRegLongInstX("sqdmlsl", "SqdmlslElemScX", "SimdMultAccOp",
2125 ("int16_t", "int32_t"), qdmlslCode, True, byElem=True,
2126 scalar=True)
2127 # SQDMLSL, SQDMLSL2 (vector)
2128 threeRegLongInstX("sqdmlsl", "SqdmlslX", "SimdMultAccOp",
2129 ("int16_t", "int32_t"), qdmlslCode, True)
2130 threeRegLongInstX("sqdmlsl", "Sqdmlsl2X", "SimdMultAccOp",
2131 ("int16_t", "int32_t"), qdmlslCode, True, hi=True)
2132 threeRegLongInstX("sqdmlsl", "SqdmlslScX", "SimdMultAccOp",
2133 ("int16_t", "int32_t"), qdmlslCode, True, scalar=True)
2134 # SQDMULH (by element)
# Saturating doubling multiply, high half: the sources are multiplied as
# int64_t, doubled, and shifted right by the Element bit width. When both
# sources equal the value with only the top bit set (the most negative
# Element), destElem is replaced by ~srcElem1 and fpscr.qc is set.
2135 sqdmulhCode = '''
2136 FPSCR fpscr = (FPSCR) FpscrQc;
2137 destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2) >>
2138 (sizeof(Element) * 8);
2139 if (srcElem1 == srcElem2 &&
2140 srcElem1 == (Element)((Element)1 <<
2141 (sizeof(Element) * 8 - 1))) {
2142 destElem = ~srcElem1;
2143 fpscr.qc = 1;
2144 }
2145 FpscrQc = fpscr;
2146 '''
2147 threeEqualRegInstX("sqdmulh", "SqdmulhElemDX", "SimdMultOp",
2148 ("int16_t", "int32_t"), 2, sqdmulhCode, byElem=True)
2149 threeEqualRegInstX("sqdmulh", "SqdmulhElemQX", "SimdMultOp",
2150 ("int16_t", "int32_t"), 4, sqdmulhCode, byElem=True)
2151 threeEqualRegInstX("sqdmulh", "SqdmulhElemScX", "SimdMultOp",
2152 ("int16_t", "int32_t"), 4, sqdmulhCode, byElem=True,
2153 scalar=True)
2154 # SQDMULH (vector)
2155 threeEqualRegInstX("sqdmulh", "SqdmulhDX", "SimdMultOp",
2156 ("int16_t", "int32_t"), 2, sqdmulhCode)
2157 threeEqualRegInstX("sqdmulh", "SqdmulhQX", "SimdMultOp",
2158 ("int16_t", "int32_t"), 4, sqdmulhCode)
2159 threeEqualRegInstX("sqdmulh", "SqdmulhScX", "SimdMultOp",
2160 ("int16_t", "int32_t"), 4, sqdmulhCode, scalar=True)
2161 # SQDMULL, SQDMULL2 (by element)
# Saturating doubling multiply long: destElem receives twice the int64_t
# product of the sources. When both sources equal the value with only the
# top bit set (the most negative Element), destElem is instead set to the
# complement of that value shifted into the upper half of BigElement, and
# fpscr.qc is set.
# NOTE(review): the snippet overwrites destElem, yet every registration
# passes the positional True that the accumulating SQDMLAL forms also pass
# -- confirm whether it is needed here.
2162 qdmullCode = '''
2163 FPSCR fpscr = (FPSCR) FpscrQc;
2164 destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
2165 if (srcElem1 == srcElem2 &&
2166 srcElem1 == (Element)((Element)1 <<
2167 (Element)(sizeof(Element) * 8 - 1))) {
2168 destElem = ~((BigElement)srcElem1 << (sizeof(Element) * 8));
2169 fpscr.qc = 1;
2170 }
2171 FpscrQc = fpscr;
2172 '''
2173 threeRegLongInstX("sqdmull", "SqdmullElemX", "SimdMultOp",
2174 ("int16_t", "int32_t"), qdmullCode, True, byElem=True)
2175 threeRegLongInstX("sqdmull", "SqdmullElem2X", "SimdMultOp",
2176 ("int16_t", "int32_t"), qdmullCode, True, byElem=True,
2177 hi=True)
2178 threeRegLongInstX("sqdmull", "SqdmullElemScX", "SimdMultOp",
2179 ("int16_t", "int32_t"), qdmullCode, True, byElem=True,
2180 scalar=True)
2181 # SQDMULL, SQDMULL2 (vector)
2182 threeRegLongInstX("sqdmull", "SqdmullX", "SimdMultOp",
2183 ("int16_t", "int32_t"), qdmullCode, True)
2184 threeRegLongInstX("sqdmull", "Sqdmull2X", "SimdMultOp",
2185 ("int16_t", "int32_t"), qdmullCode, True, hi=True)
2186 threeRegLongInstX("sqdmull", "SqdmullScX", "SimdMultOp",
2187 ("int16_t", "int32_t"), qdmullCode, True, scalar=True)
2188 # SQNEG
# Saturating negate. If the source equals numeric_limits::min() the result
# is its bitwise complement and fpscr.qc is set; every other value is simply
# negated. FpscrQc is read at the start and written back at the end.
2189 sqnegCode = '''
2190 FPSCR fpscr = (FPSCR) FpscrQc;
2191 if (srcElem1 == (Element)(std::numeric_limits<Element>::min())) {
2192 fpscr.qc = 1;
2193 destElem = ~srcElem1;
2194 } else {
2195 destElem = -srcElem1;
2196 }
2197 FpscrQc = fpscr;
2198 '''
2199 twoEqualRegInstX("sqneg", "SqnegDX", "SimdAluOp", smallSignedTypes, 2,
2200 sqnegCode)
2201 twoEqualRegInstX("sqneg", "SqnegQX", "SimdAluOp", signedTypes, 4,
2202 sqnegCode)
2203 twoEqualRegInstX("sqneg", "SqnegScX", "SimdAluOp", signedTypes, 4,
2204 sqnegCode, scalar=True)
2205 # SQRDMULH (by element)
2206 sqrdmulhCode = '''
2207 FPSCR fpscr = (FPSCR) FpscrQc;
2208 destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2 +
2209 ((int64_t)1 << (sizeof(Element) * 8 - 1))) >>
2210 (sizeof(Element) * 8);
2211 Element maxNeg = std::numeric_limits<Element>::min();
2212 Element halfNeg = maxNeg / 2;
2213 if ((srcElem1 == maxNeg && srcElem2 == maxNeg) ||
2214 (srcElem1 == halfNeg && srcElem2 == maxNeg) ||
2215 (srcElem1 == maxNeg && srcElem2 == halfNeg)) {
2216 if (destElem < 0) {
2217 destElem = mask(sizeof(Element) * 8 - 1);
2218 } else {
2219 destElem = std::numeric_limits<Element>::min();
2220 }
2221 fpscr.qc = 1;
2222 }
2223 FpscrQc = fpscr;
2224 '''
2225 threeEqualRegInstX("sqrdmulh", "SqrdmulhElemDX", "SimdMultOp",
2226 ("int16_t", "int32_t"), 2, sqrdmulhCode, byElem=True)
2227 threeEqualRegInstX("sqrdmulh", "SqrdmulhElemQX", "SimdMultOp",
2228 ("int16_t", "int32_t"), 4, sqrdmulhCode, byElem=True)
2229 threeEqualRegInstX("sqrdmulh", "SqrdmulhElemScX", "SimdMultOp",
2230 ("int16_t", "int32_t"), 4, sqrdmulhCode, byElem=True,
2231 scalar=True)
2232 # SQRDMULH (vector)
2233 threeEqualRegInstX("sqrdmulh", "SqrdmulhDX", "SimdMultOp",
2234 ("int16_t", "int32_t"), 2, sqrdmulhCode)
2235 threeEqualRegInstX("sqrdmulh", "SqrdmulhQX", "SimdMultOp",
2236 ("int16_t", "int32_t"), 4, sqrdmulhCode)
2237 threeEqualRegInstX("sqrdmulh", "SqrdmulhScX", "SimdMultOp",
2238 ("int16_t", "int32_t"), 4, sqrdmulhCode, scalar=True)
2239 # SQRSHL
# Saturating rounding shift left by a per-element signed amount.
#  - shiftAmt is the low byte of srcElem2 interpreted as int8_t.
#  - shiftAmt < 0: rounding right shift by -shiftAmt. The rounding bit is
#    bit (shift - 1) of the source when shift <= element width, and is
#    forced to 1 for a negative source when shift > width. Shifts of the
#    full width or more start from 0; sign bits are then OR-ed back in by
#    hand before the rounding bit is added.
#  - shiftAmt > 0: left shift. A shift of the full width or more saturates
#    for any non-zero source. Otherwise the top (shift + 1) bits of the
#    source must be all ones (negative) or all zeros (non-negative), else
#    the result saturates. Saturation stores mask(width - 1), inverted for
#    negative sources, and sets fpscr.qc.
#  - shiftAmt == 0: the source is copied.
# NOTE(review): registered under "SimdCmpOp", while the non-rounding SQSHL
# (register) registrations use "SimdAluOp" -- confirm the intended class.
2240 sqrshlCode = '''
2241 int16_t shiftAmt = (int8_t)srcElem2;
2242 FPSCR fpscr = (FPSCR) FpscrQc;
2243 if (shiftAmt < 0) {
2244 shiftAmt = -shiftAmt;
2245 Element rBit = 0;
2246 if (shiftAmt <= sizeof(Element) * 8)
2247 rBit = bits(srcElem1, shiftAmt - 1);
2248 if (shiftAmt > sizeof(Element) * 8 && srcElem1 < 0)
2249 rBit = 1;
2250 if (shiftAmt >= sizeof(Element) * 8) {
2251 shiftAmt = sizeof(Element) * 8 - 1;
2252 destElem = 0;
2253 } else {
2254 destElem = (srcElem1 >> shiftAmt);
2255 }
2256 // Make sure the right shift sign extended when it should.
2257 if (srcElem1 < 0 && destElem >= 0) {
2258 destElem |= -((Element)1 << (sizeof(Element) * 8 -
2259 1 - shiftAmt));
2260 }
2261 destElem += rBit;
2262 } else if (shiftAmt > 0) {
2263 bool sat = false;
2264 if (shiftAmt >= sizeof(Element) * 8) {
2265 if (srcElem1 != 0)
2266 sat = true;
2267 else
2268 destElem = 0;
2269 } else {
2270 if (bits((uint64_t) srcElem1, sizeof(Element) * 8 - 1,
2271 sizeof(Element) * 8 - 1 - shiftAmt) !=
2272 ((srcElem1 < 0) ? mask(shiftAmt + 1) : 0)) {
2273 sat = true;
2274 } else {
2275 destElem = srcElem1 << shiftAmt;
2276 }
2277 }
2278 if (sat) {
2279 fpscr.qc = 1;
2280 destElem = mask(sizeof(Element) * 8 - 1);
2281 if (srcElem1 < 0)
2282 destElem = ~destElem;
2283 }
2284 } else {
2285 destElem = srcElem1;
2286 }
2287 FpscrQc = fpscr;
2288 '''
2289 threeEqualRegInstX("sqrshl", "SqrshlDX", "SimdCmpOp", smallSignedTypes, 2,
2290 sqrshlCode)
2291 threeEqualRegInstX("sqrshl", "SqrshlQX", "SimdCmpOp", signedTypes, 4,
2292 sqrshlCode)
2293 threeEqualRegInstX("sqrshl", "SqrshlScX", "SimdCmpOp", signedTypes, 4,
2294 sqrshlCode, scalar=True)
2295 # SQRSHRN, SQRSHRN2
# Saturating rounding shift right by immediate, narrowing (signed result).
#  - imm greater than the source bit width: result 0; qc is set unless the
#    source is 0 or -1.
#  - imm non-zero: the source is shifted right by (imm - 1), the low bit of
#    that value is kept as the rounding increment, one more shift follows,
#    sign bits are OR-ed back in from bit (BigElement bits - 1 - imm), and
#    the rounding bit is added. If the value changes when cast to Element
#    the result saturates to mask(Element bits - 1), inverted for negative
#    sources, and qc is set.
#  - imm == 0: the unshifted source is saturated the same way.
2296 sqrshrnCode = '''
2297 FPSCR fpscr = (FPSCR) FpscrQc;
2298 if (imm > sizeof(srcElem1) * 8) {
2299 if (srcElem1 != 0 && srcElem1 != -1)
2300 fpscr.qc = 1;
2301 destElem = 0;
2302 } else if (imm) {
2303 BigElement mid = (srcElem1 >> (imm - 1));
2304 uint64_t rBit = mid & 0x1;
2305 mid >>= 1;
2306 mid |= -(mid & ((BigElement)1 <<
2307 (sizeof(BigElement) * 8 - 1 - imm)));
2308 mid += rBit;
2309 if (mid != (Element)mid) {
2310 destElem = mask(sizeof(Element) * 8 - 1);
2311 if (srcElem1 < 0)
2312 destElem = ~destElem;
2313 fpscr.qc = 1;
2314 } else {
2315 destElem = mid;
2316 }
2317 } else {
2318 if (srcElem1 != (Element)srcElem1) {
2319 destElem = mask(sizeof(Element) * 8 - 1);
2320 if (srcElem1 < 0)
2321 destElem = ~destElem;
2322 fpscr.qc = 1;
2323 } else {
2324 destElem = srcElem1;
2325 }
2326 }
2327 FpscrQc = fpscr;
2328 '''
2329 twoRegNarrowInstX("sqrshrn", "SqrshrnX", "SimdShiftOp", smallSignedTypes,
2330 sqrshrnCode, hasImm=True)
2331 twoRegNarrowInstX("sqrshrn2", "Sqrshrn2X", "SimdShiftOp", smallSignedTypes,
2332 sqrshrnCode, hasImm=True, hi=True)
2333 twoRegNarrowInstX("sqrshrn", "SqrshrnScX", "SimdShiftOp", smallSignedTypes,
2334 sqrshrnCode, hasImm=True, scalar=True)
2335 # SQRSHRUN, SQRSHRUN2
# Saturating rounding shift right by immediate, narrowing, with an unsigned
# result range.
#  - imm greater than the source bit width: result 0; qc is set for any
#    non-zero source.
#  - imm non-zero: same rounding shift sequence as SQRSHRN. If any bit of
#    the result at or above the Element bit width is set, the result
#    saturates to 0 for negative sources or mask(Element bits) otherwise,
#    and qc is set.
#  - imm == 0: negative sources give 0 with qc set; others are copied.
# NOTE(review): Sqrshrun2X is registered with the mnemonic string
# "sqrshrun", whereas Sqrshrn2X uses "sqrshrn2" -- confirm intended.
2336 sqrshrunCode = '''
2337 FPSCR fpscr = (FPSCR) FpscrQc;
2338 if (imm > sizeof(srcElem1) * 8) {
2339 if (srcElem1 != 0)
2340 fpscr.qc = 1;
2341 destElem = 0;
2342 } else if (imm) {
2343 BigElement mid = (srcElem1 >> (imm - 1));
2344 uint64_t rBit = mid & 0x1;
2345 mid >>= 1;
2346 mid |= -(mid & ((BigElement)1 <<
2347 (sizeof(BigElement) * 8 - 1 - imm)));
2348 mid += rBit;
2349 if (bits(mid, sizeof(BigElement) * 8 - 1,
2350 sizeof(Element) * 8) != 0) {
2351 if (srcElem1 < 0) {
2352 destElem = 0;
2353 } else {
2354 destElem = mask(sizeof(Element) * 8);
2355 }
2356 fpscr.qc = 1;
2357 } else {
2358 destElem = mid;
2359 }
2360 } else {
2361 if (srcElem1 < 0) {
2362 fpscr.qc = 1;
2363 destElem = 0;
2364 } else {
2365 destElem = srcElem1;
2366 }
2367 }
2368 FpscrQc = fpscr;
2369 '''
2370 twoRegNarrowInstX("sqrshrun", "SqrshrunX", "SimdShiftOp", smallSignedTypes,
2371 sqrshrunCode, hasImm=True)
2372 twoRegNarrowInstX("sqrshrun", "Sqrshrun2X", "SimdShiftOp",
2373 smallSignedTypes, sqrshrunCode, hasImm=True, hi=True)
2374 twoRegNarrowInstX("sqrshrun", "SqrshrunScX", "SimdShiftOp",
2375 smallSignedTypes, sqrshrunCode, hasImm=True, scalar=True)
2376 # SQSHL (immediate)
# Saturating shift left by immediate (signed).
#  - imm at or above the element bit width: a non-zero source saturates to
#    numeric_limits::min(), complemented for positive sources, and sets qc;
#    a zero source gives 0.
#  - imm non-zero: the source is shifted left by imm. The top (imm + 1) bits
#    of the original source are then examined; unless they are all zeros or
#    all ones, the result is replaced by the same saturated value and qc is
#    set.
#  - imm == 0: the source is copied.
2377 sqshlImmCode = '''
2378 FPSCR fpscr = (FPSCR) FpscrQc;
2379 if (imm >= sizeof(Element) * 8) {
2380 if (srcElem1 != 0) {
2381 destElem = std::numeric_limits<Element>::min();
2382 if (srcElem1 > 0)
2383 destElem = ~destElem;
2384 fpscr.qc = 1;
2385 } else {
2386 destElem = 0;
2387 }
2388 } else if (imm) {
2389 destElem = (srcElem1 << imm);
2390 uint64_t topBits = bits((uint64_t)srcElem1,
2391 sizeof(Element) * 8 - 1,
2392 sizeof(Element) * 8 - 1 - imm);
2393 if (topBits != 0 && topBits != mask(imm + 1)) {
2394 destElem = std::numeric_limits<Element>::min();
2395 if (srcElem1 > 0)
2396 destElem = ~destElem;
2397 fpscr.qc = 1;
2398 }
2399 } else {
2400 destElem = srcElem1;
2401 }
2402 FpscrQc = fpscr;
2403 '''
2404 twoEqualRegInstX("sqshl", "SqshlImmDX", "SimdAluOp", smallSignedTypes, 2,
2405 sqshlImmCode, hasImm=True)
2406 twoEqualRegInstX("sqshl", "SqshlImmQX", "SimdAluOp", signedTypes, 4,
2407 sqshlImmCode, hasImm=True)
2408 twoEqualRegInstX("sqshl", "SqshlImmScX", "SimdAluOp", signedTypes, 4,
2409 sqshlImmCode, hasImm=True, scalar=True)
2410 # SQSHL (register)
# Saturating shift left by a per-element signed amount (no rounding).
#  - shiftAmt is the low byte of srcElem2 interpreted as int8_t.
#  - shiftAmt < 0: right shift by -shiftAmt. Shifts of the full width or
#    more start from 0; sign bits are OR-ed back in by hand for negative
#    sources.
#  - shiftAmt > 0: left shift. A shift of the full width or more saturates
#    for any non-zero source. Otherwise the top (shift + 1) bits of the
#    source must be all ones (negative) or all zeros (non-negative), else
#    the result saturates. Saturation stores mask(width - 1), inverted for
#    negative sources, and sets fpscr.qc.
#  - shiftAmt == 0: the source is copied.
2411 sqshlCode = '''
2412 int16_t shiftAmt = (int8_t)srcElem2;
2413 FPSCR fpscr = (FPSCR) FpscrQc;
2414 if (shiftAmt < 0) {
2415 shiftAmt = -shiftAmt;
2416 if (shiftAmt >= sizeof(Element) * 8) {
2417 shiftAmt = sizeof(Element) * 8 - 1;
2418 destElem = 0;
2419 } else {
2420 destElem = (srcElem1 >> shiftAmt);
2421 }
2422 // Make sure the right shift sign extended when it should.
2423 if (srcElem1 < 0 && destElem >= 0) {
2424 destElem |= -((Element)1 << (sizeof(Element) * 8 -
2425 1 - shiftAmt));
2426 }
2427 } else if (shiftAmt > 0) {
2428 bool sat = false;
2429 if (shiftAmt >= sizeof(Element) * 8) {
2430 if (srcElem1 != 0)
2431 sat = true;
2432 else
2433 destElem = 0;
2434 } else {
2435 if (bits((uint64_t) srcElem1, sizeof(Element) * 8 - 1,
2436 sizeof(Element) * 8 - 1 - shiftAmt) !=
2437 ((srcElem1 < 0) ? mask(shiftAmt + 1) : 0)) {
2438 sat = true;
2439 } else {
2440 destElem = srcElem1 << shiftAmt;
2441 }
2442 }
2443 if (sat) {
2444 fpscr.qc = 1;
2445 destElem = mask(sizeof(Element) * 8 - 1);
2446 if (srcElem1 < 0)
2447 destElem = ~destElem;
2448 }
2449 } else {
2450 destElem = srcElem1;
2451 }
2452 FpscrQc = fpscr;
2453 '''
2454 threeEqualRegInstX("sqshl", "SqshlDX", "SimdAluOp", smallSignedTypes, 2,
2455 sqshlCode)
2456 threeEqualRegInstX("sqshl", "SqshlQX", "SimdAluOp", signedTypes, 4,
2457 sqshlCode)
2458 threeEqualRegInstX("sqshl", "SqshlScX", "SimdAluOp", signedTypes, 4,
2459 sqshlCode, scalar=True)
2460 # SQSHLU
# Saturating shift left by immediate with an unsigned result range. In every
# branch a negative source yields 0 with qc set.
#  - imm at or above the element bit width: positive sources saturate to
#    mask(width) with qc set; zero gives 0.
#  - imm non-zero: the source is shifted left by imm; if any of the top imm
#    bits of the original (non-negative) source are set, the result
#    saturates to mask(width) with qc set.
#  - imm == 0: non-negative sources are copied.
2461 sqshluCode = '''
2462 FPSCR fpscr = (FPSCR) FpscrQc;
2463 if (imm >= sizeof(Element) * 8) {
2464 if (srcElem1 < 0) {
2465 destElem = 0;
2466 fpscr.qc = 1;
2467 } else if (srcElem1 > 0) {
2468 destElem = mask(sizeof(Element) * 8);
2469 fpscr.qc = 1;
2470 } else {
2471 destElem = 0;
2472 }
2473 } else if (imm) {
2474 destElem = (srcElem1 << imm);
2475 uint64_t topBits = bits((uint64_t)srcElem1,
2476 sizeof(Element) * 8 - 1,
2477 sizeof(Element) * 8 - imm);
2478 if (srcElem1 < 0) {
2479 destElem = 0;
2480 fpscr.qc = 1;
2481 } else if (topBits != 0) {
2482 destElem = mask(sizeof(Element) * 8);
2483 fpscr.qc = 1;
2484 }
2485 } else {
2486 if (srcElem1 < 0) {
2487 fpscr.qc = 1;
2488 destElem = 0;
2489 } else {
2490 destElem = srcElem1;
2491 }
2492 }
2493 FpscrQc = fpscr;
2494 '''
2495 twoEqualRegInstX("sqshlu", "SqshluDX", "SimdAluOp", smallSignedTypes, 2,
2496 sqshluCode, hasImm=True)
2497 twoEqualRegInstX("sqshlu", "SqshluQX", "SimdAluOp", signedTypes, 4,
2498 sqshluCode, hasImm=True)
2499 twoEqualRegInstX("sqshlu", "SqshluScX", "SimdAluOp", signedTypes, 4,
2500 sqshluCode, hasImm=True, scalar=True)
2501 # SQSHRN, SQSHRN2
# Saturating shift right by immediate, narrowing (signed result, no
# rounding).
#  - imm greater than the source bit width: result 0; qc is set unless the
#    source is 0 or -1.
#  - imm non-zero: the source is shifted right in two steps ((imm - 1), then
#    1), sign bits are OR-ed back in from bit (BigElement bits - 1 - imm),
#    and if the value changes when cast to Element the result saturates to
#    mask(Element bits - 1), inverted for negative sources, with qc set.
#  - imm == 0: the source is stored without a saturation check.
2502 sqshrnCode = '''
2503 FPSCR fpscr = (FPSCR) FpscrQc;
2504 if (imm > sizeof(srcElem1) * 8) {
2505 if (srcElem1 != 0 && srcElem1 != -1)
2506 fpscr.qc = 1;
2507 destElem = 0;
2508 } else if (imm) {
2509 BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
2510 mid |= -(mid & ((BigElement)1 <<
2511 (sizeof(BigElement) * 8 - 1 - imm)));
2512 if (mid != (Element)mid) {
2513 destElem = mask(sizeof(Element) * 8 - 1);
2514 if (srcElem1 < 0)
2515 destElem = ~destElem;
2516 fpscr.qc = 1;
2517 } else {
2518 destElem = mid;
2519 }
2520 } else {
2521 destElem = srcElem1;
2522 }
2523 FpscrQc = fpscr;
2524 '''
2525 twoRegNarrowInstX("sqshrn", "SqshrnX", "SimdShiftOp", smallSignedTypes,
2526 sqshrnCode, hasImm=True)
2527 twoRegNarrowInstX("sqshrn2", "Sqshrn2X", "SimdShiftOp", smallSignedTypes,
2528 sqshrnCode, hasImm=True, hi=True)
2529 twoRegNarrowInstX("sqshrn", "SqshrnScX", "SimdShiftOp", smallSignedTypes,
2530 sqshrnCode, hasImm=True, scalar=True)
2531 # SQSHRUN, SQSHRUN2
# Saturating shift right by immediate, narrowing, with an unsigned result
# range (no rounding).
#  - imm greater than the source bit width: result 0; qc is set for any
#    non-zero source.
#  - imm non-zero: the source is shifted right in two steps ((imm - 1), then
#    1). If any bit at or above the Element bit width is set, the result
#    saturates to 0 for negative sources or mask(Element bits) otherwise,
#    and qc is set.
#  - imm == 0: the source is stored without a saturation check.
# NOTE(review): Sqshrun2X is registered with the mnemonic string "sqshrun",
# whereas Sqshrn2X uses "sqshrn2" -- confirm intended.
2532 sqshrunCode = '''
2533 FPSCR fpscr = (FPSCR) FpscrQc;
2534 if (imm > sizeof(srcElem1) * 8) {
2535 if (srcElem1 != 0)
2536 fpscr.qc = 1;
2537 destElem = 0;
2538 } else if (imm) {
2539 BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
2540 if (bits(mid, sizeof(BigElement) * 8 - 1,
2541 sizeof(Element) * 8) != 0) {
2542 if (srcElem1 < 0) {
2543 destElem = 0;
2544 } else {
2545 destElem = mask(sizeof(Element) * 8);
2546 }
2547 fpscr.qc = 1;
2548 } else {
2549 destElem = mid;
2550 }
2551 } else {
2552 destElem = srcElem1;
2553 }
2554 FpscrQc = fpscr;
2555 '''
2556 twoRegNarrowInstX("sqshrun", "SqshrunX", "SimdShiftOp", smallSignedTypes,
2557 sqshrunCode, hasImm=True)
2558 twoRegNarrowInstX("sqshrun", "Sqshrun2X", "SimdShiftOp", smallSignedTypes,
2559 sqshrunCode, hasImm=True, hi=True)
2560 twoRegNarrowInstX("sqshrun", "SqshrunScX", "SimdShiftOp", smallSignedTypes,
2561 sqshrunCode, hasImm=True, scalar=True)
2562 # SQSUB
# Saturating subtract. The difference is computed at element width first;
# overflow is then detected from signs: srcElem1 is negative exactly when
# srcElem2 is non-negative, and the result's sign differs from srcElem1's.
# On overflow destElem is set to numeric_limits::min(), and when the wrapped
# result was negative 1 is subtracted from that, and fpscr.qc is set.
# NOTE(review): both the initial subtract and the "min() - 1" step rely on
# wrap-around of signed arithmetic -- confirm this is safe for the widest
# element type with the compilers in use.
2563 sqsubCode = '''
2564 destElem = srcElem1 - srcElem2;
2565 FPSCR fpscr = (FPSCR) FpscrQc;
2566 bool negDest = (destElem < 0);
2567 bool negSrc1 = (srcElem1 < 0);
2568 bool posSrc2 = (srcElem2 >= 0);
2569 if ((negDest != negSrc1) && (negSrc1 == posSrc2)) {
2570 destElem = std::numeric_limits<Element>::min();
2571 if (negDest)
2572 destElem -= 1;
2573 fpscr.qc = 1;
2574 }
2575 FpscrQc = fpscr;
2576 '''
2577 threeEqualRegInstX("sqsub", "SqsubDX", "SimdAddOp", smallSignedTypes, 2,
2578 sqsubCode)
2579 threeEqualRegInstX("sqsub", "SqsubQX", "SimdAddOp", signedTypes, 4,
2580 sqsubCode)
2581 threeEqualRegInstX("sqsub", "SqsubScX", "SimdAddOp", signedTypes, 4,
2582 sqsubCode, scalar=True)
2583 # SQXTN, SQXTN2
# Saturating extract narrow (signed). The source is assigned to the
# destination element; if widening that value back to BigElement no longer
# equals the source, the result saturates to mask(Element bits - 1),
# inverted for negative sources, and fpscr.qc is set.
# NOTE(review): Sqxtn2X is registered with the mnemonic string "sqxtn" --
# confirm whether "sqxtn2" was intended.
2584 sqxtnCode = '''
2585 FPSCR fpscr = (FPSCR) FpscrQc;
2586 destElem = srcElem1;
2587 if ((BigElement)destElem != srcElem1) {
2588 fpscr.qc = 1;
2589 destElem = mask(sizeof(Element) * 8 - 1);
2590 if (srcElem1 < 0)
2591 destElem = ~destElem;
2592 }
2593 FpscrQc = fpscr;
2594 '''
2595 twoRegNarrowInstX("sqxtn", "SqxtnX", "SimdMiscOp", smallSignedTypes,
2596 sqxtnCode)
2597 twoRegNarrowInstX("sqxtn", "Sqxtn2X", "SimdMiscOp", smallSignedTypes,
2598 sqxtnCode, hi=True)
2599 twoRegNarrowInstX("sqxtn", "SqxtnScX", "SimdMiscOp", smallSignedTypes,
2600 sqxtnCode, scalar=True)
# SQXTUN, SQXTUN2
# C++ per-element body: copy srcElem1 into the narrower destElem.  The
# value saturates (fpscr.qc set) when the source is negative or when the
# low N bits of destElem (N = bits in Element) differ from the source.
# The saturated value is mask(N), complemented when the source is
# negative.
sqxtunCode = '''
    FPSCR fpscr = (FPSCR) FpscrQc;
    destElem = srcElem1;
    if (srcElem1 < 0 ||
            ((BigElement)destElem & mask(sizeof(Element) * 8)) != srcElem1) {
        fpscr.qc = 1;
        destElem = mask(sizeof(Element) * 8);
        if (srcElem1 < 0)
            destElem = ~destElem;
    }
    FpscrQc = fpscr;
'''
# Instantiated as default, hi=True and scalar=True variants.
twoRegNarrowInstX("sqxtun", "SqxtunX", "SimdMiscOp", smallSignedTypes,
                  sqxtunCode)
twoRegNarrowInstX("sqxtun", "Sqxtun2X", "SimdMiscOp", smallSignedTypes,
                  sqxtunCode, hi=True)
twoRegNarrowInstX("sqxtun", "SqxtunScX", "SimdMiscOp", smallSignedTypes,
                  sqxtunCode, scalar=True)
# SRHADD
# C++ per-element body: each operand is halved with its low bit cleared,
# and a carry derived from the two low bits plus one is added back.
# rhaddCode is reused by the URHADD instantiations later in this file.
rhaddCode = '''
    Element carryBit =
        (((unsigned)srcElem1 & 0x1) +
         ((unsigned)srcElem2 & 0x1) + 1) >> 1;
    // Use division instead of a shift to ensure the sign extension works
    // right. The compiler will figure out if it can be a shift. Mask the
    // inputs so they get truncated correctly.
    destElem = (((srcElem1 & ~(Element)1) / 2) +
                ((srcElem2 & ~(Element)1) / 2)) + carryBit;
'''
threeEqualRegInstX("srhadd", "SrhaddDX", "SimdAddOp", smallSignedTypes, 2,
                   rhaddCode)
threeEqualRegInstX("srhadd", "SrhaddQX", "SimdAddOp", smallSignedTypes, 4,
                   rhaddCode)
# SRI
# C++ per-element body: destElem keeps the bits selected by
# ~mask(N - imm) (N = bits in Element) and receives srcElem1 >> imm in the
# remaining low bits.  When imm >= N, destElem is left as it was.
sriCode = '''
    if (imm >= sizeof(Element) * 8)
        destElem = destElem;
    else
        destElem = (srcElem1 >> imm) |
            (destElem & ~mask(sizeof(Element) * 8 - imm));
'''
# NOTE(review): the positional True after sriCode presumably tells the
# helper that destElem is read as well as written -- confirm against
# twoEqualRegInstX.
twoEqualRegInstX("sri", "SriDX", "SimdShiftOp", unsignedTypes, 2, sriCode,
                 True, hasImm=True)
twoEqualRegInstX("sri", "SriQX", "SimdShiftOp", unsignedTypes, 4, sriCode,
                 True, hasImm=True)
# SRSHL
# C++ per-element body: the shift amount is the signed low byte of
# srcElem2.  A negative amount performs a right shift by its magnitude,
# adding a rounding bit taken from the last bit shifted out, and repairs
# the sign extension for negative sources.  A positive amount shifts left
# (0 once the amount reaches the element width), and zero copies the
# source.  rshlCode is reused by the URSHL instantiations later in this
# file.
rshlCode = '''
    int16_t shiftAmt = (int8_t)srcElem2;
    if (shiftAmt < 0) {
        shiftAmt = -shiftAmt;
        Element rBit = 0;
        if (shiftAmt <= sizeof(Element) * 8)
            rBit = bits(srcElem1, shiftAmt - 1);
        if (shiftAmt > sizeof(Element) * 8 && ltz(srcElem1))
            rBit = 1;
        if (shiftAmt >= sizeof(Element) * 8) {
            shiftAmt = sizeof(Element) * 8 - 1;
            destElem = 0;
        } else {
            destElem = (srcElem1 >> shiftAmt);
        }
        // Make sure the right shift sign extended when it should.
        if (ltz(srcElem1) && !ltz(destElem)) {
            destElem |= -((Element)1 << (sizeof(Element) * 8 -
                                         1 - shiftAmt));
        }
        destElem += rBit;
    } else if (shiftAmt > 0) {
        if (shiftAmt >= sizeof(Element) * 8) {
            destElem = 0;
        } else {
            destElem = srcElem1 << shiftAmt;
        }
    } else {
        destElem = srcElem1;
    }
'''
threeEqualRegInstX("srshl", "SrshlDX", "SimdShiftOp", signedTypes, 2,
                   rshlCode)
threeEqualRegInstX("srshl", "SrshlQX", "SimdShiftOp", signedTypes, 4,
                   rshlCode)
# SRSHR
# C++ per-element body: right shift by imm with rounding.  Bit (imm - 1)
# of the source is added to the shifted value.  An imm larger than the
# source width gives 0 and an imm of 0 copies the source.  rshrCode is
# reused by the URSHR instantiations later in this file.
rshrCode = '''
    if (imm > sizeof(srcElem1) * 8) {
        destElem = 0;
    } else if (imm) {
        Element rBit = bits(srcElem1, imm - 1);
        destElem = ((srcElem1 >> (imm - 1)) >> 1) + rBit;
    } else {
        destElem = srcElem1;
    }
'''
twoEqualRegInstX("srshr", "SrshrDX", "SimdShiftOp", signedTypes, 2,
                 rshrCode, hasImm=True)
twoEqualRegInstX("srshr", "SrshrQX", "SimdShiftOp", signedTypes, 4,
                 rshrCode, hasImm=True)
# SRSRA
# C++ per-element body: same rounding right shift as rshrCode, but the
# result is accumulated into destElem (+=) instead of replacing it.
# rsraCode is reused by the URSRA instantiations later in this file.
rsraCode = '''
    if (imm > sizeof(srcElem1) * 8) {
        destElem += 0;
    } else if (imm) {
        Element rBit = bits(srcElem1, imm - 1);
        destElem += ((srcElem1 >> (imm - 1)) >> 1) + rBit;
    } else {
        destElem += srcElem1;
    }
'''
# NOTE(review): the positional True presumably marks destElem as an input
# (the body accumulates into it) -- confirm against twoEqualRegInstX.
twoEqualRegInstX("srsra", "SrsraDX", "SimdShiftOp", signedTypes, 2,
                 rsraCode, True, hasImm=True)
twoEqualRegInstX("srsra", "SrsraQX", "SimdShiftOp", signedTypes, 4,
                 rsraCode, True, hasImm=True)
# SSHL
# C++ per-element body: the shift amount is the signed low byte of
# srcElem2.  A negative amount shifts right by its magnitude (without
# rounding) and repairs the sign extension for negative sources.
# Otherwise the source is shifted left, yielding 0 once the amount reaches
# the element width.  shlCode is reused by the USHL instantiations later
# in this file.
shlCode = '''
    int16_t shiftAmt = (int8_t)srcElem2;
    if (shiftAmt < 0) {
        shiftAmt = -shiftAmt;
        if (shiftAmt >= sizeof(Element) * 8) {
            shiftAmt = sizeof(Element) * 8 - 1;
            destElem = 0;
        } else {
            destElem = (srcElem1 >> shiftAmt);
        }
        // Make sure the right shift sign extended when it should.
        if (ltz(srcElem1) && !ltz(destElem)) {
            destElem |= -((Element)1 << (sizeof(Element) * 8 -
                                         1 - shiftAmt));
        }
    } else {
        if (shiftAmt >= sizeof(Element) * 8) {
            destElem = 0;
        } else {
            destElem = srcElem1 << shiftAmt;
        }
    }
'''
threeEqualRegInstX("sshl", "SshlDX", "SimdShiftOp", signedTypes, 2,
                   shlCode)
threeEqualRegInstX("sshl", "SshlQX", "SimdShiftOp", signedTypes, 4,
                   shlCode)
# SSHLL, SSHLL2
# C++ per-element body: cast the source to BigElement and shift it left by
# imm.  An imm at or beyond the width of destElem yields 0.  shllCode is
# reused by the USHLL instantiations later in this file.
shllCode = '''
    if (imm >= sizeof(destElem) * 8) {
        destElem = 0;
    } else {
        destElem = (BigElement)srcElem1 << imm;
    }
'''
twoRegLongInstX("sshll", "SshllX", "SimdShiftOp", smallSignedTypes,
                shllCode, hasImm=True)
twoRegLongInstX("sshll", "Sshll2X", "SimdShiftOp", smallSignedTypes,
                shllCode, hasImm=True, hi=True)
# SSHR
# C++ per-element body: right shift by imm.  When imm reaches the source
# width the result is -1 if ltz(srcElem1) holds and 0 otherwise.
# shrCode is reused by the USHR instantiations later in this file.
shrCode = '''
    if (imm >= sizeof(srcElem1) * 8) {
        if (ltz(srcElem1))
            destElem = -1;
        else
            destElem = 0;
    } else {
        destElem = srcElem1 >> imm;
    }
'''
twoEqualRegInstX("sshr", "SshrDX", "SimdShiftOp", signedTypes, 2, shrCode,
                 hasImm=True)
twoEqualRegInstX("sshr", "SshrQX", "SimdShiftOp", signedTypes, 4, shrCode,
                 hasImm=True)
# SSRA
# C++ per-element body: shift srcElem1 right by imm and accumulate the
# result into destElem.  When imm reaches the source width the shifted
# value is -1 if ltz(srcElem1) holds and 0 otherwise.  For smaller shifts
# the sign extension is repaired by hand in case the >> did not propagate
# the sign.  sraCode is reused by the USRA instantiations later in this
# file.
#
# Fix: the declaration used to read "Element mid;;", which put a stray
# empty statement into every generated instruction body.
sraCode = '''
    Element mid;
    if (imm >= sizeof(srcElem1) * 8) {
        mid = ltz(srcElem1) ? -1 : 0;
    } else {
        mid = srcElem1 >> imm;
        if (ltz(srcElem1) && !ltz(mid)) {
            mid |= -(mid & ((Element)1 <<
                            (sizeof(Element) * 8 - 1 - imm)));
        }
    }
    destElem += mid;
'''
# NOTE(review): the positional True presumably marks destElem as an input
# (the body accumulates into it) -- confirm against twoEqualRegInstX.
twoEqualRegInstX("ssra", "SsraDX", "SimdShiftOp", signedTypes, 2, sraCode,
                 True, hasImm=True)
twoEqualRegInstX("ssra", "SsraQX", "SimdShiftOp", signedTypes, 4, sraCode,
                 True, hasImm=True)
# SSUBL, SSUBL2, SSUBW, SSUBW2
# Both operands are cast to BigElement before the subtraction.  The same
# snippet drives the long and the wide forms below and is reused by the
# unsigned USUBL/USUBW instantiations later in this file.
sublwCode = "destElem = (BigElement)srcElem1 - (BigElement)srcElem2;"
for _emit, _stem in ((threeRegLongInstX, "ssubl"),
                     (threeRegWideInstX, "ssubw")):
    _cls = _stem.capitalize()
    # Base form first, then the "2" form, which additionally passes hi=True.
    _emit(_stem, _cls + "X", "SimdAddOp", smallSignedTypes, sublwCode)
    _emit(_stem + "2", _cls + "2X", "SimdAddOp", smallSignedTypes,
          sublwCode, hi=True)
del _emit, _stem, _cls
# SUB
# Plain element-wise subtraction, emitted once per (class name, count)
# pair.
subCode = "destElem = srcElem1 - srcElem2;"
for _cls, _count in (("SubDX", 2), ("SubQX", 4)):
    threeEqualRegInstX("sub", _cls, "SimdAddOp", unsignedTypes, _count,
                       subCode)
del _cls, _count
# SUBHN, SUBHN2
# C++ per-element body: subtract the two operands as BigElement values and
# shift the difference right by the bit width of Element, so destElem
# receives the bits above the narrow width.
subhnCode = '''
    destElem = ((BigElement)srcElem1 - (BigElement)srcElem2) >>
               (sizeof(Element) * 8);
'''
threeRegNarrowInstX("subhn", "SubhnX", "SimdAddOp", smallUnsignedTypes,
                    subhnCode)
threeRegNarrowInstX("subhn2", "Subhn2X", "SimdAddOp", smallUnsignedTypes,
                    subhnCode, hi=True)
# SUQADD
# C++ per-element body: accumulate srcElem1 into destElem, clamping to the
# largest value with a clear top bit, (1 << (N - 1)) - 1 where N is the
# bit width of Element, and setting fpscr.qc when it clamps.  The top bit
# of destElem selects the path.  If it is clear, the sum saturates when
# its top bit is set or when it wrapped below either operand.  If it is
# set, the two's-complement magnitude of destElem is compared with
# srcElem1, and only a sum with its top bit set saturates.
suqaddCode = '''
    FPSCR fpscr = (FPSCR) FpscrQc;
    Element tmp = destElem + srcElem1;
    if (bits(destElem, sizeof(Element) * 8 - 1) == 0) {
        if (bits(tmp, sizeof(Element) * 8 - 1) == 1 ||
                tmp < srcElem1 || tmp < destElem) {
            destElem = (((Element) 1) << (sizeof(Element) * 8 - 1)) - 1;
            fpscr.qc = 1;
        } else {
            destElem = tmp;
        }
    } else {
        Element absDestElem = (~destElem) + 1;
        if (absDestElem < srcElem1) {
            // Still check for positive sat., no need to check for negative sat.
            if (bits(tmp, sizeof(Element) * 8 - 1) == 1) {
                destElem = (((Element) 1) << (sizeof(Element) * 8 - 1)) - 1;
                fpscr.qc = 1;
            } else {
                destElem = tmp;
            }
        } else {
            destElem = tmp;
        }
    }
    FpscrQc = fpscr;
'''
# NOTE(review): the positional True presumably marks destElem as an input
# (the body reads it) -- confirm against twoEqualRegInstX.
twoEqualRegInstX("suqadd", "SuqaddDX", "SimdAddOp", smallUnsignedTypes, 2,
                 suqaddCode, True)
twoEqualRegInstX("suqadd", "SuqaddQX", "SimdAddOp", unsignedTypes, 4,
                 suqaddCode, True)
twoEqualRegInstX("suqadd", "SuqaddScX", "SimdAddOp", unsignedTypes, 4,
                 suqaddCode, True, scalar=True)
2844 # SXTL -> alias to SSHLL
# TBL
# Emit Tbl<k>DX / Tbl<k>QX for k = 1..4.  k is also forwarded as the fifth
# argument, "true" is passed through verbatim, and the last argument is 2
# for the D class and 4 for the Q class.
for _tables in (1, 2, 3, 4):
    for _suffix, _count in (("D", 2), ("Q", 4)):
        tbxTblInstX("tbl", "Tbl%d%sX" % (_tables, _suffix), "SimdMiscOp",
                    ("uint8_t",), _tables, "true", _count)
del _tables, _suffix, _count
# TBX
# Emit Tbx<k>DX / Tbx<k>QX for k = 1..4.  Identical to the TBL
# registrations except for the mnemonic and the "false" argument.
for _tables in (1, 2, 3, 4):
    for _suffix, _count in (("D", 2), ("Q", 4)):
        tbxTblInstX("tbx", "Tbx%d%sX" % (_tables, _suffix), "SimdMiscOp",
                    ("uint8_t",), _tables, "false", _count)
del _tables, _suffix, _count
# TRN1
# C++ template with one %s placeholder for `part`.  For every pair index
# i, destReg element 2i is taken from srcReg1 and element 2i + 1 from
# srcReg2, both read at source index 2i + part.  TRN1 substitutes "0" and
# TRN2 substitutes "1".
trnCode = '''
    unsigned part = %s;
    for (unsigned i = 0; i < eCount / 2; i++) {
        destReg.elements[2 * i] = srcReg1.elements[2 * i + part];
        destReg.elements[2 * i + 1] = srcReg2.elements[2 * i + part];
    }
'''
threeRegScrambleInstX("trn1", "Trn1DX", "SimdAluOp", smallUnsignedTypes, 2,
                      trnCode % "0")
threeRegScrambleInstX("trn1", "Trn1QX", "SimdAluOp", unsignedTypes, 4,
                      trnCode % "0")
# TRN2
threeRegScrambleInstX("trn2", "Trn2DX", "SimdAluOp", smallUnsignedTypes, 2,
                      trnCode % "1")
threeRegScrambleInstX("trn2", "Trn2QX", "SimdAluOp", unsignedTypes, 4,
                      trnCode % "1")
# UABA, UABAL/UABAL2, UABD, UABDL/UABDL2
# Table of (helper, positional args, keyword args), replayed in order.
# The code snippets (abaCode, abalCode, abdCode, abdlCode) are defined
# earlier in this file.
for _emit, _args, _kwargs in (
    (threeEqualRegInstX,
     ("uaba", "UabaDX", "SimdAddAccOp", smallUnsignedTypes, 2, abaCode,
      True), {}),
    (threeEqualRegInstX,
     ("uaba", "UabaQX", "SimdAddAccOp", smallUnsignedTypes, 4, abaCode,
      True), {}),
    (threeRegLongInstX,
     ("uabal", "UabalX", "SimdAddAccOp", smallUnsignedTypes, abalCode,
      True), {}),
    (threeRegLongInstX,
     ("uabal2", "Uabal2X", "SimdAddAccOp", smallUnsignedTypes, abalCode,
      True), {"hi": True}),
    (threeEqualRegInstX,
     ("uabd", "UabdDX", "SimdAddOp", smallUnsignedTypes, 2, abdCode), {}),
    (threeEqualRegInstX,
     ("uabd", "UabdQX", "SimdAddOp", smallUnsignedTypes, 4, abdCode), {}),
    (threeRegLongInstX,
     ("uabdl", "UabdlX", "SimdAddAccOp", smallUnsignedTypes, abdlCode,
      True), {}),
    (threeRegLongInstX,
     ("uabdl2", "Uabdl2X", "SimdAddAccOp", smallUnsignedTypes, abdlCode,
      True), {"hi": True}),
):
    _emit(*_args, **_kwargs)
del _emit, _args, _kwargs
# UADALP, UADDL/UADDL2, UADDLP, UADDLV, UADDW/UADDW2
# Table of (helper, positional args, keyword args), replayed in order.
# adalpCode, addlwCode and addAcrossLongCode are defined earlier in this
# file.
for _emit, _args, _kwargs in (
    (twoRegCondenseInstX,
     ("uadalp", "UadalpDX", "SimdAddOp", smallUnsignedTypes, 2, adalpCode,
      True), {}),
    (twoRegCondenseInstX,
     ("uadalp", "UadalpQX", "SimdAddOp", smallUnsignedTypes, 4, adalpCode,
      True), {}),
    (threeRegLongInstX,
     ("uaddl", "UaddlX", "SimdAddAccOp", smallUnsignedTypes, addlwCode),
     {}),
    (threeRegLongInstX,
     ("uaddl2", "Uaddl2X", "SimdAddAccOp", smallUnsignedTypes, addlwCode),
     {"hi": True}),
    (twoRegCondenseInstX,
     ("uaddlp", "UaddlpDX", "SimdAddOp", smallUnsignedTypes, 2, addlwCode),
     {}),
    (twoRegCondenseInstX,
     ("uaddlp", "UaddlpQX", "SimdAddOp", smallUnsignedTypes, 4, addlwCode),
     {}),
    (twoRegAcrossInstX,
     ("uaddlv", "UaddlvDX", "SimdAddOp", ("uint8_t", "uint16_t"), 2,
      addAcrossLongCode), {"long": True}),
    (twoRegAcrossInstX,
     ("uaddlv", "UaddlvQX", "SimdAddOp", ("uint8_t", "uint16_t"), 4,
      addAcrossLongCode), {"long": True}),
    (twoRegAcrossInstX,
     ("uaddlv", "UaddlvBQX", "SimdAddOp", ("uint32_t",), 4,
      addAcrossLongCode), {"doubleDest": True, "long": True}),
    (threeRegWideInstX,
     ("uaddw", "UaddwX", "SimdAddAccOp", smallUnsignedTypes, addlwCode),
     {}),
    (threeRegWideInstX,
     ("uaddw2", "Uaddw2X", "SimdAddAccOp", smallUnsignedTypes, addlwCode),
     {"hi": True}),
):
    _emit(*_args, **_kwargs)
del _emit, _args, _kwargs
# UCVTF (fixed-point)
# fpOp is a format template defined earlier in this file.  The expression
# substituted into it calls fplibFixedToFP<Element> with imm as its second
# argument.
# NOTE(review): the literal `true` is presumably the "unsigned" flag of
# fplibFixedToFP -- confirm against fplib.
ucvtfFixedCode = fpOp % ("fplibFixedToFP<Element>(srcElem1, imm, true,"
                         " FPCRRounding(fpscr), fpscr)")
twoEqualRegInstX("ucvtf", "UcvtfFixedDX", "SimdCvtOp", smallFloatTypes, 2,
                 ucvtfFixedCode, hasImm=True)
twoEqualRegInstX("ucvtf", "UcvtfFixedQX", "SimdCvtOp", floatTypes, 4,
                 ucvtfFixedCode, hasImm=True)
twoEqualRegInstX("ucvtf", "UcvtfFixedScX", "SimdCvtOp", floatTypes, 4,
                 ucvtfFixedCode, hasImm=True, scalar=True)
# UCVTF (integer)
# Same call as the fixed-point form, with the second argument fixed at 0
# and no immediate operand.
ucvtfIntCode = fpOp % ("fplibFixedToFP<Element>(srcElem1, 0, true,"
                       " FPCRRounding(fpscr), fpscr)")
twoEqualRegInstX("ucvtf", "UcvtfIntDX", "SimdCvtOp", smallFloatTypes, 2,
                 ucvtfIntCode)
twoEqualRegInstX("ucvtf", "UcvtfIntQX", "SimdCvtOp", floatTypes, 4,
                 ucvtfIntCode)
twoEqualRegInstX("ucvtf", "UcvtfIntScX", "SimdCvtOp", floatTypes, 4,
                 ucvtfIntCode, scalar=True)
# UHADD, UHSUB
# Unsigned instantiations of haddCode / hsubCode (defined earlier in this
# file), each in a D (2) and a Q (4) class.
for _stem, _body in (("uhadd", haddCode), ("uhsub", hsubCode)):
    for _suffix, _count in (("D", 2), ("Q", 4)):
        threeEqualRegInstX(_stem, "%s%sX" % (_stem.capitalize(), _suffix),
                           "SimdAddOp", smallUnsignedTypes, _count, _body)
del _stem, _body, _suffix, _count
# UMAX, UMAXP, UMAXV, UMIN, UMINP, UMINV
# The max and min families are registered with the same shape: the
# element-wise form, the pairwise form (pairwise=True) and the
# across-vector form, each as a D and a Q class.  The code snippets are
# defined earlier in this file.
for _op, _elemCode, _acrossCode in (("max", maxCode, maxAcrossCode),
                                    ("min", minCode, minAcrossCode)):
    _mnem = "u" + _op
    _cls = "U" + _op
    threeEqualRegInstX(_mnem, _cls + "DX", "SimdCmpOp", smallUnsignedTypes,
                       2, _elemCode)
    threeEqualRegInstX(_mnem, _cls + "QX", "SimdCmpOp", smallUnsignedTypes,
                       4, _elemCode)
    threeEqualRegInstX(_mnem + "p", _cls + "pDX", "SimdCmpOp",
                       smallUnsignedTypes, 2, _elemCode, pairwise=True)
    threeEqualRegInstX(_mnem + "p", _cls + "pQX", "SimdCmpOp",
                       smallUnsignedTypes, 4, _elemCode, pairwise=True)
    # The D across-vector form is limited to 8- and 16-bit elements.
    twoRegAcrossInstX(_mnem + "v", _cls + "vDX", "SimdCmpOp",
                      ("uint8_t", "uint16_t"), 2, _acrossCode)
    twoRegAcrossInstX(_mnem + "v", _cls + "vQX", "SimdCmpOp",
                      smallUnsignedTypes, 4, _acrossCode)
del _op, _elemCode, _acrossCode, _mnem, _cls
# UMLAL / UMLSL, by-element forms first and then the vector forms
# Each instruction gets four classes: ElemX, Elem2X, X and 2X.  The "2"
# classes add hi=True and the Elem classes add byElem=True.  mlalCode and
# mlslCode are defined earlier in this file.
for _stem, _body in (("umlal", mlalCode), ("umlsl", mlslCode)):
    for _tag, _kwargs in (("ElemX", {"byElem": True}),
                          ("Elem2X", {"byElem": True, "hi": True}),
                          ("X", {}),
                          ("2X", {"hi": True})):
        threeRegLongInstX(_stem, _stem.capitalize() + _tag,
                          "SimdMultAccOp", smallUnsignedTypes, _body, True,
                          **_kwargs)
del _stem, _body, _tag, _kwargs
# UMOV
# One class per destination register letter: 'W' with smallUnsignedTypes
# and 'X' with uint64_t only.
for _cls, _types, _gpr in (("UmovWX", smallUnsignedTypes, 'W'),
                           ("UmovXX", ("uint64_t",), 'X')):
    insToGprInstX("umov", _cls, "SimdMiscOp", _types, 4, _gpr)
del _cls, _types, _gpr
# UMULL, UMULL2, by-element forms first and then the vector forms
# Four classes share mullCode (defined earlier in this file).  The "2"
# classes add hi=True and the Elem classes add byElem=True.
for _tag, _kwargs in (("ElemX", {"byElem": True}),
                      ("Elem2X", {"byElem": True, "hi": True}),
                      ("X", {}),
                      ("2X", {"hi": True})):
    threeRegLongInstX("umull", "Umull" + _tag, "SimdMultOp",
                      smallUnsignedTypes, mullCode, **_kwargs)
del _tag, _kwargs
# UQADD
# C++ per-element body: destElem = srcElem1 + srcElem2.  If the sum is
# smaller than either operand (it wrapped), destElem is set to
# (Element)(-1) and fpscr.qc is set.
uqaddCode = '''
    destElem = srcElem1 + srcElem2;
    FPSCR fpscr = (FPSCR) FpscrQc;
    if (destElem < srcElem1 || destElem < srcElem2) {
        destElem = (Element)(-1);
        fpscr.qc = 1;
    }
    FpscrQc = fpscr;
'''
threeEqualRegInstX("uqadd", "UqaddDX", "SimdAddOp", smallUnsignedTypes, 2,
                   uqaddCode)
threeEqualRegInstX("uqadd", "UqaddQX", "SimdAddOp", unsignedTypes, 4,
                   uqaddCode)
threeEqualRegInstX("uqadd", "UqaddScX", "SimdAddOp", unsignedTypes, 4,
                   uqaddCode, scalar=True)
# UQRSHL
# C++ per-element body: the shift amount is the signed low byte of
# srcElem2.
#   * negative: right shift by the magnitude, adding as a rounding bit the
#     last bit shifted out.  A shift past the element width gives 0.
#   * positive: left shift with saturation.  If any bit would be shifted
#     out (or the amount reaches the element width and the source is
#     non-zero), destElem becomes mask(N) and fpscr.qc is set.
#   * zero: the source is copied unchanged.
#
# Fix: a zero shift used to fall into the left-shift branch and evaluate
# bits(srcElem1, N - 1, N), a range whose first index is below its last
# (and, for 64-bit elements, presumably a shift by the full operand width
# inside bits()).  It is now handled explicitly, the same way uqshlCode in
# this file handles it.  The result for a zero shift is still srcElem1.
uqrshlCode = '''
    int16_t shiftAmt = (int8_t)srcElem2;
    FPSCR fpscr = (FPSCR) FpscrQc;
    if (shiftAmt < 0) {
        shiftAmt = -shiftAmt;
        Element rBit = 0;
        if (shiftAmt <= sizeof(Element) * 8)
            rBit = bits(srcElem1, shiftAmt - 1);
        if (shiftAmt >= sizeof(Element) * 8) {
            shiftAmt = sizeof(Element) * 8 - 1;
            destElem = 0;
        } else {
            destElem = (srcElem1 >> shiftAmt);
        }
        destElem += rBit;
    } else if (shiftAmt > 0) {
        if (shiftAmt >= sizeof(Element) * 8) {
            if (srcElem1 != 0) {
                destElem = mask(sizeof(Element) * 8);
                fpscr.qc = 1;
            } else {
                destElem = 0;
            }
        } else {
            if (bits(srcElem1, sizeof(Element) * 8 - 1,
                        sizeof(Element) * 8 - shiftAmt)) {
                destElem = mask(sizeof(Element) * 8);
                fpscr.qc = 1;
            } else {
                destElem = srcElem1 << shiftAmt;
            }
        }
    } else {
        // A zero shift neither rounds nor saturates.
        destElem = srcElem1;
    }
    FpscrQc = fpscr;
'''
threeEqualRegInstX("uqrshl", "UqrshlDX", "SimdCmpOp", smallUnsignedTypes,
                   2, uqrshlCode)
threeEqualRegInstX("uqrshl", "UqrshlQX", "SimdCmpOp", unsignedTypes, 4,
                   uqrshlCode)
threeEqualRegInstX("uqrshl", "UqrshlScX", "SimdCmpOp", unsignedTypes, 4,
                   uqrshlCode, scalar=True)
# UQRSHRN
# C++ per-element body: rounding right shift of srcElem1 by imm, narrowed
# into destElem with unsigned saturation.  The rounding bit is the last
# bit shifted out.  If the rounded value does not survive a round trip
# through Element, destElem becomes mask(N) (N = bits in Element) and
# fpscr.qc is set.  A shift larger than the source width yields 0, with qc
# set when the source was non-zero.
#
# Fix: the imm == 0 path used to saturate to mask(N - 1), which is the
# signed maximum.  This is an unsigned narrowing, so it now clamps to
# mask(N), matching the shifted path above and uqxtnCode.
# NOTE(review): imm == 0 is presumably not reachable from the architected
# encodings of this instruction -- confirm against the decoder.
uqrshrnCode = '''
    FPSCR fpscr = (FPSCR) FpscrQc;
    if (imm > sizeof(srcElem1) * 8) {
        if (srcElem1 != 0)
            fpscr.qc = 1;
        destElem = 0;
    } else if (imm) {
        BigElement mid = (srcElem1 >> (imm - 1));
        uint64_t rBit = mid & 0x1;
        mid >>= 1;
        mid += rBit;
        if (mid != (Element)mid) {
            destElem = mask(sizeof(Element) * 8);
            fpscr.qc = 1;
        } else {
            destElem = mid;
        }
    } else {
        if (srcElem1 != (Element)srcElem1) {
            destElem = mask(sizeof(Element) * 8);
            fpscr.qc = 1;
        } else {
            destElem = srcElem1;
        }
    }
    FpscrQc = fpscr;
'''
twoRegNarrowInstX("uqrshrn", "UqrshrnX", "SimdShiftOp", smallUnsignedTypes,
                  uqrshrnCode, hasImm=True)
twoRegNarrowInstX("uqrshrn2", "Uqrshrn2X", "SimdShiftOp",
                  smallUnsignedTypes, uqrshrnCode, hasImm=True, hi=True)
twoRegNarrowInstX("uqrshrn", "UqrshrnScX", "SimdShiftOp",
                  smallUnsignedTypes, uqrshrnCode, hasImm=True,
                  scalar=True)
# UQSHL (immediate)
# C++ per-element body: left shift by imm with unsigned saturation.  If
# any of the top imm bits of the source are set (or imm reaches the
# element width and the source is non-zero), destElem becomes mask(N)
# (N = bits in Element) and fpscr.qc is set.  An imm of 0 copies the
# source.
uqshlImmCode = '''
    FPSCR fpscr = (FPSCR) FpscrQc;
    if (imm >= sizeof(Element) * 8) {
        if (srcElem1 != 0) {
            destElem = mask(sizeof(Element) * 8);
            fpscr.qc = 1;
        } else {
            destElem = 0;
        }
    } else if (imm) {
        destElem = (srcElem1 << imm);
        uint64_t topBits = bits((uint64_t)srcElem1,
                                sizeof(Element) * 8 - 1,
                                sizeof(Element) * 8 - imm);
        if (topBits != 0) {
            destElem = mask(sizeof(Element) * 8);
            fpscr.qc = 1;
        }
    } else {
        destElem = srcElem1;
    }
    FpscrQc = fpscr;
'''
twoEqualRegInstX("uqshl", "UqshlImmDX", "SimdAluOp", smallUnsignedTypes, 2,
                 uqshlImmCode, hasImm=True)
twoEqualRegInstX("uqshl", "UqshlImmQX", "SimdAluOp", unsignedTypes, 4,
                 uqshlImmCode, hasImm=True)
twoEqualRegInstX("uqshl", "UqshlImmScX", "SimdAluOp", unsignedTypes, 4,
                 uqshlImmCode, hasImm=True, scalar=True)
# UQSHL (register)
# C++ per-element body: the shift amount is the signed low byte of
# srcElem2.  Negative amounts shift right without rounding (0 once the
# magnitude reaches the element width).  Positive amounts shift left and
# saturate to mask(N) with fpscr.qc set when bits would be lost.  Zero
# copies the source.
uqshlCode = '''
    int16_t shiftAmt = (int8_t)srcElem2;
    FPSCR fpscr = (FPSCR) FpscrQc;
    if (shiftAmt < 0) {
        shiftAmt = -shiftAmt;
        if (shiftAmt >= sizeof(Element) * 8) {
            shiftAmt = sizeof(Element) * 8 - 1;
            destElem = 0;
        } else {
            destElem = (srcElem1 >> shiftAmt);
        }
    } else if (shiftAmt > 0) {
        if (shiftAmt >= sizeof(Element) * 8) {
            if (srcElem1 != 0) {
                destElem = mask(sizeof(Element) * 8);
                fpscr.qc = 1;
            } else {
                destElem = 0;
            }
        } else {
            if (bits(srcElem1, sizeof(Element) * 8 - 1,
                        sizeof(Element) * 8 - shiftAmt)) {
                destElem = mask(sizeof(Element) * 8);
                fpscr.qc = 1;
            } else {
                destElem = srcElem1 << shiftAmt;
            }
        }
    } else {
        destElem = srcElem1;
    }
    FpscrQc = fpscr;
'''
threeEqualRegInstX("uqshl", "UqshlDX", "SimdAluOp", smallUnsignedTypes, 2,
                   uqshlCode)
threeEqualRegInstX("uqshl", "UqshlQX", "SimdAluOp", unsignedTypes, 4,
                   uqshlCode)
threeEqualRegInstX("uqshl", "UqshlScX", "SimdAluOp", unsignedTypes, 4,
                   uqshlCode, scalar=True)
# UQSHRN, UQSHRN2
# C++ per-element body: right shift of srcElem1 by imm (no rounding),
# narrowed into destElem.  If the shifted value does not survive a round
# trip through Element, destElem becomes mask(N) (N = bits in Element)
# and fpscr.qc is set.  A shift larger than the source width yields 0,
# with qc set when the source was non-zero.  An imm of 0 assigns the
# source directly with no saturation check.
uqshrnCode = '''
    FPSCR fpscr = (FPSCR) FpscrQc;
    if (imm > sizeof(srcElem1) * 8) {
        if (srcElem1 != 0)
            fpscr.qc = 1;
        destElem = 0;
    } else if (imm) {
        BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
        if (mid != (Element)mid) {
            destElem = mask(sizeof(Element) * 8);
            fpscr.qc = 1;
        } else {
            destElem = mid;
        }
    } else {
        destElem = srcElem1;
    }
    FpscrQc = fpscr;
'''
twoRegNarrowInstX("uqshrn", "UqshrnX", "SimdShiftOp", smallUnsignedTypes,
                  uqshrnCode, hasImm=True)
twoRegNarrowInstX("uqshrn2", "Uqshrn2X", "SimdShiftOp", smallUnsignedTypes,
                  uqshrnCode, hasImm=True, hi=True)
twoRegNarrowInstX("uqshrn", "UqshrnScX", "SimdShiftOp", smallUnsignedTypes,
                  uqshrnCode, hasImm=True, scalar=True)
# UQSUB
# C++ per-element body: destElem = srcElem1 - srcElem2.  If the difference
# is larger than srcElem1 (it wrapped), destElem is set to 0 and fpscr.qc
# is set.
uqsubCode = '''
    destElem = srcElem1 - srcElem2;
    FPSCR fpscr = (FPSCR) FpscrQc;
    if (destElem > srcElem1) {
        destElem = 0;
        fpscr.qc = 1;
    }
    FpscrQc = fpscr;
'''
threeEqualRegInstX("uqsub", "UqsubDX", "SimdAddOp", smallUnsignedTypes, 2,
                   uqsubCode)
threeEqualRegInstX("uqsub", "UqsubQX", "SimdAddOp", unsignedTypes, 4,
                   uqsubCode)
threeEqualRegInstX("uqsub", "UqsubScX", "SimdAddOp", unsignedTypes, 4,
                   uqsubCode, scalar=True)
# UQXTN
# C++ per-element body: copy srcElem1 into the narrower destElem.  If
# casting destElem back to BigElement does not reproduce the source, set
# fpscr.qc and clamp destElem to mask(N) (N = bits in Element).
uqxtnCode = '''
    FPSCR fpscr = (FPSCR) FpscrQc;
    destElem = srcElem1;
    if ((BigElement)destElem != srcElem1) {
        fpscr.qc = 1;
        destElem = mask(sizeof(Element) * 8);
    }
    FpscrQc = fpscr;
'''
twoRegNarrowInstX("uqxtn", "UqxtnX", "SimdMiscOp", smallUnsignedTypes,
                  uqxtnCode)
twoRegNarrowInstX("uqxtn", "Uqxtn2X", "SimdMiscOp", smallUnsignedTypes,
                  uqxtnCode, hi=True)
twoRegNarrowInstX("uqxtn", "UqxtnScX", "SimdMiscOp", smallUnsignedTypes,
                  uqxtnCode, scalar=True)
# URECPE
# Each 32-bit element is passed through unsignedRecipEstimate().
urecpeCode = "destElem = unsignedRecipEstimate(srcElem1);"
for _cls, _count in (("UrecpeDX", 2), ("UrecpeQX", 4)):
    twoEqualRegInstX("urecpe", _cls, "SimdMultAccOp", ("uint32_t",),
                     _count, urecpeCode)
del _cls, _count
# URHADD, URSHL, URSHR
# Unsigned instantiations of rhaddCode, rshlCode and rshrCode (defined
# earlier in this file), each as a D (2) and a Q (4) class.
_dq = (("D", 2), ("Q", 4))
for _suffix, _count in _dq:
    threeEqualRegInstX("urhadd", "Urhadd%sX" % _suffix, "SimdAddOp",
                       smallUnsignedTypes, _count, rhaddCode)
for _suffix, _count in _dq:
    threeEqualRegInstX("urshl", "Urshl%sX" % _suffix, "SimdShiftOp",
                       unsignedTypes, _count, rshlCode)
for _suffix, _count in _dq:
    twoEqualRegInstX("urshr", "Urshr%sX" % _suffix, "SimdShiftOp",
                     unsignedTypes, _count, rshrCode, hasImm=True)
del _dq, _suffix, _count
# URSQRTE
# Each 32-bit element is passed through unsignedRSqrtEstimate().
ursqrteCode = "destElem = unsignedRSqrtEstimate(srcElem1);"
for _cls, _count in (("UrsqrteDX", 2), ("UrsqrteQX", 4)):
    twoEqualRegInstX("ursqrte", _cls, "SimdSqrtOp", ("uint32_t",), _count,
                     ursqrteCode)
del _cls, _count
# URSRA, USHL, USHLL/USHLL2, USHR
# Unsigned instantiations of rsraCode, shlCode, shllCode and shrCode
# (defined earlier in this file).  The D/Q pairs are generated from _dq.
# The long form has a base class and a "2" class that adds hi=True.
_dq = (("D", 2), ("Q", 4))
for _suffix, _count in _dq:
    twoEqualRegInstX("ursra", "Ursra%sX" % _suffix, "SimdShiftOp",
                     unsignedTypes, _count, rsraCode, True, hasImm=True)
for _suffix, _count in _dq:
    threeEqualRegInstX("ushl", "Ushl%sX" % _suffix, "SimdShiftOp",
                       unsignedTypes, _count, shlCode)
for _cls, _kwargs in (("UshllX", {"hasImm": True}),
                      ("Ushll2X", {"hi": True, "hasImm": True})):
    twoRegLongInstX("ushll", _cls, "SimdShiftOp", smallUnsignedTypes,
                    shllCode, **_kwargs)
for _suffix, _count in _dq:
    twoEqualRegInstX("ushr", "Ushr%sX" % _suffix, "SimdShiftOp",
                     unsignedTypes, _count, shrCode, hasImm=True)
del _dq, _suffix, _count, _cls, _kwargs
# USQADD
# C++ per-element body: accumulate srcElem1 into destElem with
# saturation.  The top bit of srcElem1 selects the path.  If it is clear,
# a sum that wrapped below either operand clamps to (Element)(-1).  If it
# is set, the two's-complement magnitude of srcElem1 is compared with
# destElem, and a larger magnitude clamps the result to 0.  fpscr.qc is
# set whenever the result clamps.
usqaddCode = '''
    FPSCR fpscr = (FPSCR) FpscrQc;
    Element tmp = destElem + srcElem1;
    if (bits(srcElem1, sizeof(Element) * 8 - 1) == 0) {
        if (tmp < srcElem1 || tmp < destElem) {
            destElem = (Element)(-1);
            fpscr.qc = 1;
        } else {
            destElem = tmp;
        }
    } else {
        Element absSrcElem1 = (~srcElem1) + 1;
        if (absSrcElem1 > destElem) {
            destElem = 0;
            fpscr.qc = 1;
        } else {
            destElem = tmp;
        }
    }
    FpscrQc = fpscr;
'''
# NOTE(review): the positional True presumably marks destElem as an input
# (the body reads it) -- confirm against twoEqualRegInstX.
twoEqualRegInstX("usqadd", "UsqaddDX", "SimdAddOp", smallUnsignedTypes, 2,
                 usqaddCode, True)
twoEqualRegInstX("usqadd", "UsqaddQX", "SimdAddOp", unsignedTypes, 4,
                 usqaddCode, True)
twoEqualRegInstX("usqadd", "UsqaddScX", "SimdAddOp", unsignedTypes, 4,
                 usqaddCode, True, scalar=True)
# USRA, USUBL/USUBL2, USUBW/USUBW2
# Unsigned instantiations of sraCode and sublwCode (defined earlier in
# this file).
for _suffix, _count in (("D", 2), ("Q", 4)):
    twoEqualRegInstX("usra", "Usra%sX" % _suffix, "SimdShiftOp",
                     unsignedTypes, _count, sraCode, True, hasImm=True)
for _emit, _stem in ((threeRegLongInstX, "usubl"),
                     (threeRegWideInstX, "usubw")):
    _cls = _stem.capitalize()
    # Base form first, then the "2" form, which additionally passes hi=True.
    _emit(_stem, _cls + "X", "SimdAddOp", smallUnsignedTypes, sublwCode)
    _emit(_stem + "2", _cls + "2X", "SimdAddOp", smallUnsignedTypes,
          sublwCode, hi=True)
del _suffix, _count, _emit, _stem, _cls
3329 # UXTL -> alias to USHLL
# UZP1
# C++ template with one %s placeholder for `part`.  The lower half of
# destReg is filled from srcReg1 and the upper half from srcReg2, in both
# cases reading source index 2i + part.  UZP1 substitutes "0" and UZP2
# substitutes "1".
#
# Fix: the first argument used to be "Uzp1"/"Uzp2".  Every other
# instantiation in this file passes a lowercase instruction name there, so
# these are lowercased to match.
# NOTE(review): the first argument is presumably the mnemonic used for
# disassembly -- confirm no consumer relies on the capitalised spelling.
uzpCode = '''
    unsigned part = %s;
    for (unsigned i = 0; i < eCount / 2; i++) {
        destReg.elements[i] = srcReg1.elements[2 * i + part];
        destReg.elements[eCount / 2 + i] = srcReg2.elements[2 * i + part];
    }
'''
threeRegScrambleInstX("uzp1", "Uzp1DX", "SimdAluOp", smallUnsignedTypes, 2,
                      uzpCode % "0")
threeRegScrambleInstX("uzp1", "Uzp1QX", "SimdAluOp", unsignedTypes, 4,
                      uzpCode % "0")
# UZP2
threeRegScrambleInstX("uzp2", "Uzp2DX", "SimdAluOp", smallUnsignedTypes, 2,
                      uzpCode % "1")
threeRegScrambleInstX("uzp2", "Uzp2QX", "SimdAluOp", unsignedTypes, 4,
                      uzpCode % "1")
# XTN, XTN2
# Plain narrowing: the source element is assigned straight to destElem.
#
# Fix: the first argument used to be "Xtn".  Every other instantiation in
# this file passes a lowercase instruction name there, so it is lowercased
# to match.
# NOTE(review): the first argument is presumably the mnemonic used for
# disassembly -- confirm no consumer relies on the capitalised spelling.
xtnCode = "destElem = srcElem1;"
twoRegNarrowInstX("xtn", "XtnX", "SimdMiscOp", smallUnsignedTypes, xtnCode)
twoRegNarrowInstX("xtn", "Xtn2X", "SimdMiscOp", smallUnsignedTypes,
                  xtnCode, hi=True)
# ZIP1
# C++ template with one %s placeholder for `base`.  For every pair index
# i, destReg element 2i is taken from srcReg1 and element 2i + 1 from
# srcReg2, both read at source index base + i.  ZIP1 substitutes "0" and
# ZIP2 substitutes "eCount / 2".
zipCode = '''
    unsigned base = %s;
    for (unsigned i = 0; i < eCount / 2; i++) {
        destReg.elements[2 * i] = srcReg1.elements[base + i];
        destReg.elements[2 * i + 1] = srcReg2.elements[base + i];
    }
'''
threeRegScrambleInstX("zip1", "Zip1DX", "SimdAluOp", smallUnsignedTypes, 2,
                      zipCode % "0")
threeRegScrambleInstX("zip1", "Zip1QX", "SimdAluOp", unsignedTypes, 4,
                      zipCode % "0")
# ZIP2
threeRegScrambleInstX("zip2", "Zip2DX", "SimdAluOp", smallUnsignedTypes, 2,
                      zipCode % "eCount / 2")
threeRegScrambleInstX("zip2", "Zip2QX", "SimdAluOp", unsignedTypes, 4,
                      zipCode % "eCount / 2")
3369
# For every decoder flavour in `decoders`, append to header_output a C++
# class named <flavour>Decoder.  Its public section holds one alias
# template per (alias, target) pair in that flavour's dictionary:
#     template<typename Elem> using <alias> = <target><Elem>;
#
# dict.items() is used instead of the Python-2-only dict.iteritems() so
# the loop runs under both Python 2 and Python 3.  The inner loop variable
# is no longer called `type`, which shadowed the builtin in this
# namespace.
for decoderFlavour, type_dict in decoders.items():
    header_output += '''
    class %(decoder_flavour)sDecoder {
      public:
    ''' % { "decoder_flavour" : decoderFlavour }
    for alias_name, target_name in type_dict.items():
        header_output += '''
        template<typename Elem> using %(type)s = %(new_name)s<Elem>;''' % {
            "type" : alias_name, "new_name" : target_name
        }
    header_output += '''
    };'''
3382}};