neon64_mem.isa (10474:799c8ee4ecba) neon64_mem.isa (10537:47fe87b0cf97)
1// -*- mode: c++ -*-
2
3// Copyright (c) 2012-2014 ARM Limited
4// All rights reserved
5//
6// The license below extends only to copyright in the software and shall
7// not be construed as granting a license to any other intellectual
8// property including but not limited to intellectual property relating
9// to a hardware implementation of the functionality of the software
10// licensed hereunder. You may use the software subject to the license
11// terms below provided that you ensure that this notice is replicated
12// unmodified and in its entirety in all distributions of the software,
13// modified or unmodified, in source code or in binary form.
14//
15// Redistribution and use in source and binary forms, with or without
16// modification, are permitted provided that the following conditions are
17// met: redistributions of source code must retain the above copyright
18// notice, this list of conditions and the following disclaimer;
19// redistributions in binary form must reproduce the above copyright
20// notice, this list of conditions and the following disclaimer in the
21// documentation and/or other materials provided with the distribution;
22// neither the name of the copyright holders nor the names of its
23// contributors may be used to endorse or promote products derived from
24// this software without specific prior written permission.
25//
26// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
30// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
31// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
32// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
33// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
34// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
35// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
36// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37//
38// Authors: Mbou Eyole
39// Giacomo Gabrielli
40
41let {{
42
43 header_output = ''
44 decoder_output = ''
45 exec_output = ''
46
# mkMemAccMicroOp(name): emit the paired NEON memory-access micro-ops
# '<name>ld' (class MicroNeonLoad64) and '<name>st' (class MicroNeonStore64).
# The C++ snippets assembled below are handed to InstObjParams and then
# substituted into the Neon*64 templates; the results are appended to the
# let-block globals exec_output and header_output.
# The templates and simd64EnabledCheckCode are not defined in this block;
# they are expected to be provided elsewhere in the ISA description.
47 def mkMemAccMicroOp(name):
48 global header_output, decoder_output, exec_output
# Generated check: return an SPAlignmentFault when the base register is SP,
# any of bits [3:0] of XURa is set, and SP alignment checking is enabled.
49 SPAlignmentCheckCodeNeon = '''
50 if (baseIsSP && bits(XURa, 3, 0) &&
51 SPAlignmentCheckEnabled(xc->tcBase())) {
52 return std::make_shared<SPAlignmentFault>();
53 }
54 '''
# Effective address: the alignment check runs first, then EA = base + imm.
55 eaCode = SPAlignmentCheckCodeNeon + '''
56 EA = XURa + imm;
57 '''
# 16-byte staging buffer, viewable as raw bytes or as four 32-bit words.
58 memDecl = '''
59 const int MaxNumBytes = 16;
60 union MemUnion {
61 uint8_t bytes[MaxNumBytes];
62 uint32_t floatRegBits[MaxNumBytes / 4];
63 };
64 '''
65
66 # Do endian conversion for all the elements
# The generated code packs the four 32-bit words into a VReg (lo/hi),
# passes each of the 16 / (1 << eSize) elements through gtobe() when
# isBigEndian64() holds and through gtole() otherwise (8-bit elements are
# left alone), and finally unpacks the VReg back into the four words.
67 convCode = '''
68 VReg x = {0, 0};
69
70 x.lo = (((XReg) memUnion.floatRegBits[1]) << 32) |
71 (XReg) memUnion.floatRegBits[0];
72 x.hi = (((XReg) memUnion.floatRegBits[3]) << 32) |
73 (XReg) memUnion.floatRegBits[2];
74
75 const unsigned eCount = 16 / (1 << eSize);
76
77 if (isBigEndian64(xc->tcBase())) {
78 for (unsigned i = 0; i < eCount; i++) {
79 switch (eSize) {
80 case 0x3: // 64-bit
81 writeVecElem(&x, (XReg) gtobe(
82 (uint64_t) readVecElem(x, i, eSize)), i, eSize);
83 break;
84 case 0x2: // 32-bit
85 writeVecElem(&x, (XReg) gtobe(
86 (uint32_t) readVecElem(x, i, eSize)), i, eSize);
87 break;
88 case 0x1: // 16-bit
89 writeVecElem(&x, (XReg) gtobe(
90 (uint16_t) readVecElem(x, i, eSize)), i, eSize);
91 break;
92 default: // 8-bit
93 break; // Nothing to do here
94 }
95 }
96 } else {
97 for (unsigned i = 0; i < eCount; i++) {
98 switch (eSize) {
99 case 0x3: // 64-bit
100 writeVecElem(&x, (XReg) gtole(
101 (uint64_t) readVecElem(x, i, eSize)), i, eSize);
102 break;
103 case 0x2: // 32-bit
104 writeVecElem(&x, (XReg) gtole(
105 (uint32_t) readVecElem(x, i, eSize)), i, eSize);
106 break;
107 case 0x1: // 16-bit
108 writeVecElem(&x, (XReg) gtole(
109 (uint16_t) readVecElem(x, i, eSize)), i, eSize);
110 break;
111 default: // 8-bit
112 break; // Nothing to do here
113 }
114 }
115 }
116
117 memUnion.floatRegBits[0] = (uint32_t) x.lo;
118 memUnion.floatRegBits[1] = (uint32_t) (x.lo >> 32);
119 memUnion.floatRegBits[2] = (uint32_t) x.hi;
120 memUnion.floatRegBits[3] = (uint32_t) (x.hi >> 32);
121 '''
122
123 # Offload everything into registers
# One assignment per 32-bit word: AA64FpDestP<n>_uw = gtoh(word n), n = 0..3.
124 regSetCode = ''
125 for reg in range(4):
126 regSetCode += '''
127 AA64FpDestP%(reg)d_uw = gtoh(memUnion.floatRegBits[%(reg)d]);
128 ''' % { 'reg' : reg }
129
130 # Pull everything in from registers
# Mirror of the above: word n = htog(AA64FpDestP<n>_uw), n = 0..3.
131 regGetCode = ''
132 for reg in range(4):
133 regGetCode += '''
134 memUnion.floatRegBits[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
135 ''' % { 'reg' : reg }
136
# Loads convert the buffer first and then write the registers; stores read
# the registers first and then convert the buffer.
137 loadMemAccCode = convCode + regSetCode
138 storeMemAccCode = regGetCode + convCode
139
# Both micro-ops use the same buffer declaration and EA code (prefixed with
# simd64EnabledCheckCode); they differ in the memacc snippet and in the
# IsLoad / IsStore flag.
140 loadIop = InstObjParams(name + 'ld',
141 'MicroNeonLoad64',
142 'MicroNeonMemOp',
143 { 'mem_decl' : memDecl,
144 'memacc_code' : loadMemAccCode,
145 'ea_code' : simd64EnabledCheckCode + eaCode,
146 },
147 [ 'IsMicroop', 'IsMemRef', 'IsLoad' ])
148 storeIop = InstObjParams(name + 'st',
149 'MicroNeonStore64',
150 'MicroNeonMemOp',
151 { 'mem_decl' : memDecl,
152 'memacc_code' : storeMemAccCode,
153 'ea_code' : simd64EnabledCheckCode + eaCode,
154 },
155 [ 'IsMicroop', 'IsMemRef', 'IsStore' ])
156
# Append execute / initiateAcc / completeAcc bodies for both micro-ops, then
# one class declaration for each.
157 exec_output += NeonLoadExecute64.subst(loadIop) + \
158 NeonLoadInitiateAcc64.subst(loadIop) + \
159 NeonLoadCompleteAcc64.subst(loadIop) + \
160 NeonStoreExecute64.subst(storeIop) + \
161 NeonStoreInitiateAcc64.subst(storeIop) + \
162 NeonStoreCompleteAcc64.subst(storeIop)
163 header_output += MicroNeonMemDeclare64.subst(loadIop) + \
164 MicroNeonMemDeclare64.subst(storeIop)
165
166 def mkMarshalMicroOp(name, Name, numRegs=4):
167 global header_output, decoder_output, exec_output
168
169 getInputCodeOp1L = ''
170 for v in range(numRegs):
171 for p in range(4):
172 getInputCodeOp1L += '''
173 writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)d_uw,
174 %(p)d, 0x2);
175 ''' % { 'v' : v, 'p' : p }
176
177 getInputCodeOp1S = ''
178 for v in range(numRegs):
179 for p in range(4):
180 getInputCodeOp1S += '''
181 writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)dS_uw,
182 %(p)d, 0x2);
183 ''' % { 'v' : v, 'p' : p }
184
185 if name == 'deint_neon_uop':
186
187 eCode = '''
1// -*- mode: c++ -*-
2
3// Copyright (c) 2012-2014 ARM Limited
4// All rights reserved
5//
6// The license below extends only to copyright in the software and shall
7// not be construed as granting a license to any other intellectual
8// property including but not limited to intellectual property relating
9// to a hardware implementation of the functionality of the software
10// licensed hereunder. You may use the software subject to the license
11// terms below provided that you ensure that this notice is replicated
12// unmodified and in its entirety in all distributions of the software,
13// modified or unmodified, in source code or in binary form.
14//
15// Redistribution and use in source and binary forms, with or without
16// modification, are permitted provided that the following conditions are
17// met: redistributions of source code must retain the above copyright
18// notice, this list of conditions and the following disclaimer;
19// redistributions in binary form must reproduce the above copyright
20// notice, this list of conditions and the following disclaimer in the
21// documentation and/or other materials provided with the distribution;
22// neither the name of the copyright holders nor the names of its
23// contributors may be used to endorse or promote products derived from
24// this software without specific prior written permission.
25//
26// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
30// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
31// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
32// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
33// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
34// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
35// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
36// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37//
38// Authors: Mbou Eyole
39// Giacomo Gabrielli
40
41let {{
42
43 header_output = ''
44 decoder_output = ''
45 exec_output = ''
46
# mkMemAccMicroOp(name): emit the paired NEON memory-access micro-ops
# '<name>ld' (class MicroNeonLoad64) and '<name>st' (class MicroNeonStore64).
# The C++ snippets assembled below are handed to InstObjParams and then
# substituted into the Neon*64 templates; the results are appended to the
# let-block globals exec_output and header_output.
# The templates and simd64EnabledCheckCode are not defined in this block;
# they are expected to be provided elsewhere in the ISA description.
47 def mkMemAccMicroOp(name):
48 global header_output, decoder_output, exec_output
# Generated check: return an SPAlignmentFault when the base register is SP,
# any of bits [3:0] of XURa is set, and SP alignment checking is enabled.
49 SPAlignmentCheckCodeNeon = '''
50 if (baseIsSP && bits(XURa, 3, 0) &&
51 SPAlignmentCheckEnabled(xc->tcBase())) {
52 return std::make_shared<SPAlignmentFault>();
53 }
54 '''
# Effective address: the alignment check runs first, then EA = base + imm.
55 eaCode = SPAlignmentCheckCodeNeon + '''
56 EA = XURa + imm;
57 '''
# 16-byte staging buffer, viewable as raw bytes or as four 32-bit words.
58 memDecl = '''
59 const int MaxNumBytes = 16;
60 union MemUnion {
61 uint8_t bytes[MaxNumBytes];
62 uint32_t floatRegBits[MaxNumBytes / 4];
63 };
64 '''
65
66 # Do endian conversion for all the elements
# The generated code packs the four 32-bit words into a VReg (lo/hi),
# passes each of the 16 / (1 << eSize) elements through gtobe() when
# isBigEndian64() holds and through gtole() otherwise (8-bit elements are
# left alone), and finally unpacks the VReg back into the four words.
67 convCode = '''
68 VReg x = {0, 0};
69
70 x.lo = (((XReg) memUnion.floatRegBits[1]) << 32) |
71 (XReg) memUnion.floatRegBits[0];
72 x.hi = (((XReg) memUnion.floatRegBits[3]) << 32) |
73 (XReg) memUnion.floatRegBits[2];
74
75 const unsigned eCount = 16 / (1 << eSize);
76
77 if (isBigEndian64(xc->tcBase())) {
78 for (unsigned i = 0; i < eCount; i++) {
79 switch (eSize) {
80 case 0x3: // 64-bit
81 writeVecElem(&x, (XReg) gtobe(
82 (uint64_t) readVecElem(x, i, eSize)), i, eSize);
83 break;
84 case 0x2: // 32-bit
85 writeVecElem(&x, (XReg) gtobe(
86 (uint32_t) readVecElem(x, i, eSize)), i, eSize);
87 break;
88 case 0x1: // 16-bit
89 writeVecElem(&x, (XReg) gtobe(
90 (uint16_t) readVecElem(x, i, eSize)), i, eSize);
91 break;
92 default: // 8-bit
93 break; // Nothing to do here
94 }
95 }
96 } else {
97 for (unsigned i = 0; i < eCount; i++) {
98 switch (eSize) {
99 case 0x3: // 64-bit
100 writeVecElem(&x, (XReg) gtole(
101 (uint64_t) readVecElem(x, i, eSize)), i, eSize);
102 break;
103 case 0x2: // 32-bit
104 writeVecElem(&x, (XReg) gtole(
105 (uint32_t) readVecElem(x, i, eSize)), i, eSize);
106 break;
107 case 0x1: // 16-bit
108 writeVecElem(&x, (XReg) gtole(
109 (uint16_t) readVecElem(x, i, eSize)), i, eSize);
110 break;
111 default: // 8-bit
112 break; // Nothing to do here
113 }
114 }
115 }
116
117 memUnion.floatRegBits[0] = (uint32_t) x.lo;
118 memUnion.floatRegBits[1] = (uint32_t) (x.lo >> 32);
119 memUnion.floatRegBits[2] = (uint32_t) x.hi;
120 memUnion.floatRegBits[3] = (uint32_t) (x.hi >> 32);
121 '''
122
123 # Offload everything into registers
# One assignment per 32-bit word: AA64FpDestP<n>_uw = gtoh(word n), n = 0..3.
124 regSetCode = ''
125 for reg in range(4):
126 regSetCode += '''
127 AA64FpDestP%(reg)d_uw = gtoh(memUnion.floatRegBits[%(reg)d]);
128 ''' % { 'reg' : reg }
129
130 # Pull everything in from registers
# Mirror of the above: word n = htog(AA64FpDestP<n>_uw), n = 0..3.
131 regGetCode = ''
132 for reg in range(4):
133 regGetCode += '''
134 memUnion.floatRegBits[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
135 ''' % { 'reg' : reg }
136
# Loads convert the buffer first and then write the registers; stores read
# the registers first and then convert the buffer.
137 loadMemAccCode = convCode + regSetCode
138 storeMemAccCode = regGetCode + convCode
139
# Both micro-ops use the same buffer declaration and EA code (prefixed with
# simd64EnabledCheckCode); they differ in the memacc snippet and in the
# IsLoad / IsStore flag.
140 loadIop = InstObjParams(name + 'ld',
141 'MicroNeonLoad64',
142 'MicroNeonMemOp',
143 { 'mem_decl' : memDecl,
144 'memacc_code' : loadMemAccCode,
145 'ea_code' : simd64EnabledCheckCode + eaCode,
146 },
147 [ 'IsMicroop', 'IsMemRef', 'IsLoad' ])
148 storeIop = InstObjParams(name + 'st',
149 'MicroNeonStore64',
150 'MicroNeonMemOp',
151 { 'mem_decl' : memDecl,
152 'memacc_code' : storeMemAccCode,
153 'ea_code' : simd64EnabledCheckCode + eaCode,
154 },
155 [ 'IsMicroop', 'IsMemRef', 'IsStore' ])
156
# Append execute / initiateAcc / completeAcc bodies for both micro-ops, then
# one class declaration for each.
157 exec_output += NeonLoadExecute64.subst(loadIop) + \
158 NeonLoadInitiateAcc64.subst(loadIop) + \
159 NeonLoadCompleteAcc64.subst(loadIop) + \
160 NeonStoreExecute64.subst(storeIop) + \
161 NeonStoreInitiateAcc64.subst(storeIop) + \
162 NeonStoreCompleteAcc64.subst(storeIop)
163 header_output += MicroNeonMemDeclare64.subst(loadIop) + \
164 MicroNeonMemDeclare64.subst(storeIop)
165
166 def mkMarshalMicroOp(name, Name, numRegs=4):
167 global header_output, decoder_output, exec_output
168
169 getInputCodeOp1L = ''
170 for v in range(numRegs):
171 for p in range(4):
172 getInputCodeOp1L += '''
173 writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)d_uw,
174 %(p)d, 0x2);
175 ''' % { 'v' : v, 'p' : p }
176
177 getInputCodeOp1S = ''
178 for v in range(numRegs):
179 for p in range(4):
180 getInputCodeOp1S += '''
181 writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)dS_uw,
182 %(p)d, 0x2);
183 ''' % { 'v' : v, 'p' : p }
184
185 if name == 'deint_neon_uop':
186
187 eCode = '''
188 VReg input[4]; // input data from scratch area
188 // input data from scratch area
189 VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
189 VReg output[2]; // output data to arch. SIMD regs
190 VReg temp;
191 temp.lo = 0;
192 temp.hi = 0;
193 '''
194 for p in range(4):
195 eCode += '''
196 writeVecElem(&temp, (XReg) AA64FpDestP%(p)dV1L_uw, %(p)d, 0x2);
197 ''' % { 'p' : p }
198 eCode += getInputCodeOp1L
199
200 # Note that numRegs is not always the same as numStructElems; in
201 # particular, for LD1/ST1, numStructElems is 1 but numRegs can be
202 # 1, 2, 3 or 4
203
204 eCode += '''
205 output[0].lo = 0;
206 output[0].hi = 0;
207 output[1].lo = 0;
208 output[1].hi = 0;
209
210 int eCount = dataSize / (8 << eSize);
211 int eSizeBytes = 1 << eSize; // element size in bytes
212 int numBytes = step * dataSize / 4;
213 int totNumBytes = numRegs * dataSize / 8;
214
215 int structElemNo, pos, a, b;
216 XReg data;
217
218 for (int r = 0; r < 2; ++r) {
219 for (int i = 0; i < eCount; ++i) {
220 if (numBytes < totNumBytes) {
221 structElemNo = r + (step * 2);
222 if (numStructElems == 1) {
223 pos = (eSizeBytes * i) +
224 (eCount * structElemNo * eSizeBytes);
225 } else {
226 pos = (numStructElems * eSizeBytes * i) +
227 (structElemNo * eSizeBytes);
228 }
229 a = pos / 16;
230 b = (pos % 16) / eSizeBytes;
231 data = (XReg) readVecElem(input[a], (XReg) b,
232 eSize);
233 writeVecElem(&output[r], data, i, eSize);
234 numBytes += eSizeBytes;
235 }
236 }
237 }
238 '''
239 for p in range(4):
240 eCode += '''
241 AA64FpDestP%(p)dV0L_uw = (uint32_t) readVecElem(output[0],
242 %(p)d, 0x2);
243 ''' % { 'p' : p }
244 eCode += '''
245 if ((numRegs % 2 == 0) || (numRegs == 3 && step == 0)) {
246 '''
247 for p in range(4):
248 eCode += '''
249 AA64FpDestP%(p)dV1L_uw = (uint32_t) readVecElem(
250 output[1], %(p)d, 0x2);
251 ''' % { 'p' : p }
252 eCode += '''
253 } else {
254 '''
255 for p in range(4):
256 eCode += '''
257 AA64FpDestP%(p)dV1L_uw = (uint32_t) readVecElem(temp,
258 %(p)d, 0x2);
259 ''' % { 'p' : p }
260 eCode += '''
261 }
262 '''
263
264 iop = InstObjParams(name, Name, 'MicroNeonMixOp64',
265 { 'code' : eCode, 'op_class' : 'No_OpClass' },
266 ['IsMicroop'])
267 header_output += MicroNeonMixDeclare64.subst(iop)
268 exec_output += MicroNeonMixExecute64.subst(iop)
269
270 elif name == 'int_neon_uop':
271
272 eCode = '''
190 VReg output[2]; // output data to arch. SIMD regs
191 VReg temp;
192 temp.lo = 0;
193 temp.hi = 0;
194 '''
195 for p in range(4):
196 eCode += '''
197 writeVecElem(&temp, (XReg) AA64FpDestP%(p)dV1L_uw, %(p)d, 0x2);
198 ''' % { 'p' : p }
199 eCode += getInputCodeOp1L
200
201 # Note that numRegs is not always the same as numStructElems; in
202 # particular, for LD1/ST1, numStructElems is 1 but numRegs can be
203 # 1, 2, 3 or 4
204
205 eCode += '''
206 output[0].lo = 0;
207 output[0].hi = 0;
208 output[1].lo = 0;
209 output[1].hi = 0;
210
211 int eCount = dataSize / (8 << eSize);
212 int eSizeBytes = 1 << eSize; // element size in bytes
213 int numBytes = step * dataSize / 4;
214 int totNumBytes = numRegs * dataSize / 8;
215
216 int structElemNo, pos, a, b;
217 XReg data;
218
219 for (int r = 0; r < 2; ++r) {
220 for (int i = 0; i < eCount; ++i) {
221 if (numBytes < totNumBytes) {
222 structElemNo = r + (step * 2);
223 if (numStructElems == 1) {
224 pos = (eSizeBytes * i) +
225 (eCount * structElemNo * eSizeBytes);
226 } else {
227 pos = (numStructElems * eSizeBytes * i) +
228 (structElemNo * eSizeBytes);
229 }
230 a = pos / 16;
231 b = (pos % 16) / eSizeBytes;
232 data = (XReg) readVecElem(input[a], (XReg) b,
233 eSize);
234 writeVecElem(&output[r], data, i, eSize);
235 numBytes += eSizeBytes;
236 }
237 }
238 }
239 '''
240 for p in range(4):
241 eCode += '''
242 AA64FpDestP%(p)dV0L_uw = (uint32_t) readVecElem(output[0],
243 %(p)d, 0x2);
244 ''' % { 'p' : p }
245 eCode += '''
246 if ((numRegs % 2 == 0) || (numRegs == 3 && step == 0)) {
247 '''
248 for p in range(4):
249 eCode += '''
250 AA64FpDestP%(p)dV1L_uw = (uint32_t) readVecElem(
251 output[1], %(p)d, 0x2);
252 ''' % { 'p' : p }
253 eCode += '''
254 } else {
255 '''
256 for p in range(4):
257 eCode += '''
258 AA64FpDestP%(p)dV1L_uw = (uint32_t) readVecElem(temp,
259 %(p)d, 0x2);
260 ''' % { 'p' : p }
261 eCode += '''
262 }
263 '''
264
265 iop = InstObjParams(name, Name, 'MicroNeonMixOp64',
266 { 'code' : eCode, 'op_class' : 'No_OpClass' },
267 ['IsMicroop'])
268 header_output += MicroNeonMixDeclare64.subst(iop)
269 exec_output += MicroNeonMixExecute64.subst(iop)
270
271 elif name == 'int_neon_uop':
272
273 eCode = '''
273 VReg input[4]; // input data from arch. SIMD regs
274 // input data from arch. SIMD regs
275 VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
274 VReg output[2]; // output data to scratch area
275 '''
276
277 eCode += getInputCodeOp1S
278
279 # Note that numRegs is not always the same as numStructElems; in
280 # particular, for LD1/ST1, numStructElems is 1 but numRegs can be
281 # 1, 2, 3 or 4
282
283 eCode += '''
284 int eCount = dataSize / (8 << eSize);
285 int eSizeBytes = 1 << eSize;
286 int totNumBytes = numRegs * dataSize / 8;
287 int numOutputElems = 128 / (8 << eSize);
288 int stepOffset = step * 32;
289
290 for (int i = 0; i < 2; ++i) {
291 output[i].lo = 0;
292 output[i].hi = 0;
293 }
294
295 int r = 0, k = 0, i, j;
296 XReg data;
297
298 for (int pos = stepOffset; pos < 32 + stepOffset;
299 pos += eSizeBytes) {
300 if (pos < totNumBytes) {
301 if (numStructElems == 1) {
302 i = (pos / eSizeBytes) % eCount;
303 j = pos / (eCount * eSizeBytes);
304 } else {
305 i = pos / (numStructElems * eSizeBytes);
306 j = (pos % (numStructElems * eSizeBytes)) /
307 eSizeBytes;
308 }
309 data = (XReg) readVecElem(input[j], (XReg) i, eSize);
310 writeVecElem(&output[r], data, k, eSize);
311 k++;
312 if (k == numOutputElems){
313 k = 0;
314 ++r;
315 }
316 }
317 }
318 '''
319 for v in range(2):
320 for p in range(4):
321 eCode += '''
322 AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem(
323 output[%(v)d], %(p)d, 0x2);
324 ''' % { 'v': v, 'p': p}
325
326 iop = InstObjParams(name, Name, 'MicroNeonMixOp64',
327 { 'code' : eCode, 'op_class' : 'No_OpClass' },
328 ['IsMicroop'])
329 header_output += MicroNeonMixDeclare64.subst(iop)
330 exec_output += MicroNeonMixExecute64.subst(iop)
331
332 elif name == 'unpack_neon_uop':
333
334 eCode = '''
276 VReg output[2]; // output data to scratch area
277 '''
278
279 eCode += getInputCodeOp1S
280
281 # Note that numRegs is not always the same as numStructElems; in
282 # particular, for LD1/ST1, numStructElems is 1 but numRegs can be
283 # 1, 2, 3 or 4
284
285 eCode += '''
286 int eCount = dataSize / (8 << eSize);
287 int eSizeBytes = 1 << eSize;
288 int totNumBytes = numRegs * dataSize / 8;
289 int numOutputElems = 128 / (8 << eSize);
290 int stepOffset = step * 32;
291
292 for (int i = 0; i < 2; ++i) {
293 output[i].lo = 0;
294 output[i].hi = 0;
295 }
296
297 int r = 0, k = 0, i, j;
298 XReg data;
299
300 for (int pos = stepOffset; pos < 32 + stepOffset;
301 pos += eSizeBytes) {
302 if (pos < totNumBytes) {
303 if (numStructElems == 1) {
304 i = (pos / eSizeBytes) % eCount;
305 j = pos / (eCount * eSizeBytes);
306 } else {
307 i = pos / (numStructElems * eSizeBytes);
308 j = (pos % (numStructElems * eSizeBytes)) /
309 eSizeBytes;
310 }
311 data = (XReg) readVecElem(input[j], (XReg) i, eSize);
312 writeVecElem(&output[r], data, k, eSize);
313 k++;
314 if (k == numOutputElems){
315 k = 0;
316 ++r;
317 }
318 }
319 }
320 '''
321 for v in range(2):
322 for p in range(4):
323 eCode += '''
324 AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem(
325 output[%(v)d], %(p)d, 0x2);
326 ''' % { 'v': v, 'p': p}
327
328 iop = InstObjParams(name, Name, 'MicroNeonMixOp64',
329 { 'code' : eCode, 'op_class' : 'No_OpClass' },
330 ['IsMicroop'])
331 header_output += MicroNeonMixDeclare64.subst(iop)
332 exec_output += MicroNeonMixExecute64.subst(iop)
333
334 elif name == 'unpack_neon_uop':
335
336 eCode = '''
335 VReg input[4]; //input data from scratch area
337 //input data from scratch area
338 VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
336 VReg output[2]; //output data to arch. SIMD regs
337 '''
338
339 eCode += getInputCodeOp1L
340
341 # Fill output regs with register data initially. Note that
342 # elements in output register outside indexed lanes are left
343 # untouched
344 for v in range(2):
345 for p in range(4):
346 eCode += '''
347 writeVecElem(&output[%(v)d], (XReg) AA64FpDestP%(p)dV%(v)dL_uw,
348 %(p)d, 0x2);
349 ''' % { 'v': v, 'p': p}
350 eCode += '''
351 int eCount = dataSize / (8 << eSize);
352 int eCount128 = 128 / (8 << eSize);
353 int eSizeBytes = 1 << eSize;
354 int totNumBytes = numStructElems * eSizeBytes;
355 int numInputElems = eCount128;
356 int stepOffset = step * 2 * eSizeBytes;
357 int stepLimit = 2 * eSizeBytes;
358
359 int r = 0, i, j;
360 XReg data;
361
362 for (int pos = stepOffset; pos < stepLimit + stepOffset;
363 pos += eSizeBytes) {
364 if (pos < totNumBytes) {
365 r = pos / eSizeBytes;
366 j = r / numInputElems;
367 i = r % numInputElems;
368 data = (XReg) readVecElem(input[j], (XReg) i, eSize);
369
370 if (replicate) {
371 for (int i = 0; i < eCount128; ++i) {
372 if (i < eCount) {
373 writeVecElem(&output[r % 2], data, i,
374 eSize);
375 } else { // zero extend if necessary
376 writeVecElem(&output[r % 2], (XReg) 0, i,
377 eSize);
378 }
379 }
380 } else {
381 writeVecElem(&output[r % 2], data, lane, eSize);
382 }
383 }
384 }
385 '''
386 for v in range(2):
387 for p in range(4):
388 eCode += '''
389 AA64FpDestP%(p)dV%(v)dL_uw = (uint32_t) readVecElem(
390 output[%(v)d], %(p)d, 0x2);
391 ''' % { 'v' : v, 'p' : p }
392
393 iop = InstObjParams(name, Name, 'MicroNeonMixLaneOp64',
394 { 'code' : eCode }, ['IsMicroop'])
395 header_output += MicroNeonMixLaneDeclare64.subst(iop)
396 exec_output += MicroNeonMixExecute64.subst(iop)
397
398 elif name == 'pack_neon_uop':
399
400 eCode = '''
339 VReg output[2]; //output data to arch. SIMD regs
340 '''
341
342 eCode += getInputCodeOp1L
343
344 # Fill output regs with register data initially. Note that
345 # elements in output register outside indexed lanes are left
346 # untouched
347 for v in range(2):
348 for p in range(4):
349 eCode += '''
350 writeVecElem(&output[%(v)d], (XReg) AA64FpDestP%(p)dV%(v)dL_uw,
351 %(p)d, 0x2);
352 ''' % { 'v': v, 'p': p}
353 eCode += '''
354 int eCount = dataSize / (8 << eSize);
355 int eCount128 = 128 / (8 << eSize);
356 int eSizeBytes = 1 << eSize;
357 int totNumBytes = numStructElems * eSizeBytes;
358 int numInputElems = eCount128;
359 int stepOffset = step * 2 * eSizeBytes;
360 int stepLimit = 2 * eSizeBytes;
361
362 int r = 0, i, j;
363 XReg data;
364
365 for (int pos = stepOffset; pos < stepLimit + stepOffset;
366 pos += eSizeBytes) {
367 if (pos < totNumBytes) {
368 r = pos / eSizeBytes;
369 j = r / numInputElems;
370 i = r % numInputElems;
371 data = (XReg) readVecElem(input[j], (XReg) i, eSize);
372
373 if (replicate) {
374 for (int i = 0; i < eCount128; ++i) {
375 if (i < eCount) {
376 writeVecElem(&output[r % 2], data, i,
377 eSize);
378 } else { // zero extend if necessary
379 writeVecElem(&output[r % 2], (XReg) 0, i,
380 eSize);
381 }
382 }
383 } else {
384 writeVecElem(&output[r % 2], data, lane, eSize);
385 }
386 }
387 }
388 '''
389 for v in range(2):
390 for p in range(4):
391 eCode += '''
392 AA64FpDestP%(p)dV%(v)dL_uw = (uint32_t) readVecElem(
393 output[%(v)d], %(p)d, 0x2);
394 ''' % { 'v' : v, 'p' : p }
395
396 iop = InstObjParams(name, Name, 'MicroNeonMixLaneOp64',
397 { 'code' : eCode }, ['IsMicroop'])
398 header_output += MicroNeonMixLaneDeclare64.subst(iop)
399 exec_output += MicroNeonMixExecute64.subst(iop)
400
401 elif name == 'pack_neon_uop':
402
403 eCode = '''
401 VReg input[4]; // input data from arch. SIMD regs
404 // input data from arch. SIMD regs
405 VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
402 VReg output[2]; // output data to scratch area
403 '''
404
405 eCode += getInputCodeOp1S
406
407 eCode += '''
408 int eSizeBytes = 1 << eSize;
409 int numOutputElems = 128 / (8 << eSize);
410 int totNumBytes = numStructElems * eSizeBytes;
411 int stepOffset = step * 32;
412 int stepLimit = 32;
413
414 int r = 0, i, j;
415 XReg data;
416
417 for (int i = 0; i < 2; ++i) {
418 output[i].lo = 0;
419 output[i].hi = 0;
420 }
421
422 for (int pos = stepOffset; pos < stepLimit + stepOffset;
423 pos += eSizeBytes) {
424 if (pos < totNumBytes) {
425 r = pos / 16;
426 j = pos / eSizeBytes;
427 i = (pos / eSizeBytes) % numOutputElems;
428 data = (XReg) readVecElem(input[j], lane, eSize);
429 writeVecElem(&output[r % 2], data, i, eSize);
430 }
431 }
432 '''
433
434 for v in range(2):
435 for p in range(4):
436 eCode += '''
437 AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem(
438 output[%(v)d], %(p)d, 0x2);
439 ''' % { 'v' : v, 'p' : p }
440
441 iop = InstObjParams(name, Name, 'MicroNeonMixLaneOp64',
442 { 'code' : eCode }, ['IsMicroop'])
443 header_output += MicroNeonMixLaneDeclare64.subst(iop)
444 exec_output += MicroNeonMixExecute64.subst(iop)
445
446 # Generate instructions
# Instantiate the micro-op generators: one load/store pair named
# 'mem_neon_uop' + 'ld'/'st', then a de-interleave and an interleave class
# for each register count from 1 to 4.
447 mkMemAccMicroOp('mem_neon_uop')
448 mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_1Reg', numRegs=1)
449 mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_2Reg', numRegs=2)
450 mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_3Reg', numRegs=3)
451 mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_4Reg', numRegs=4)
452 mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_1Reg', numRegs=1)
453 mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_2Reg', numRegs=2)
454 mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_3Reg', numRegs=3)
455 mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_4Reg', numRegs=4)
# The unpack/pack variants omit numRegs and so use mkMarshalMicroOp's
# default value of 4.
456 mkMarshalMicroOp('unpack_neon_uop', 'MicroUnpackNeon64')
457 mkMarshalMicroOp('pack_neon_uop', 'MicroPackNeon64')
458
459}};
460
461let {{
462
# Declare the four 64-bit vector load/store instruction classes. Each
# InstObjParams is built with an empty code string and no flags; only a
# class declaration (header_output) and a constructor (decoder_output) are
# emitted here. The VMem*64 templates are defined outside this block.
# Multi-structure load.
463 iop = InstObjParams('vldmult64', 'VldMult64', 'VldMultOp64', '', [])
464 header_output += VMemMultDeclare64.subst(iop)
465 decoder_output += VMemMultConstructor64.subst(iop)
466
# Multi-structure store.
467 iop = InstObjParams('vstmult64', 'VstMult64', 'VstMultOp64', '', [])
468 header_output += VMemMultDeclare64.subst(iop)
469 decoder_output += VMemMultConstructor64.subst(iop)
470
# Single-structure load.
471 iop = InstObjParams('vldsingle64', 'VldSingle64', 'VldSingleOp64', '', [])
472 header_output += VMemSingleDeclare64.subst(iop)
473 decoder_output += VMemSingleConstructor64.subst(iop)
474
# Single-structure store.
475 iop = InstObjParams('vstsingle64', 'VstSingle64', 'VstSingleOp64', '', [])
476 header_output += VMemSingleDeclare64.subst(iop)
477 decoder_output += VMemSingleConstructor64.subst(iop)
478
479}};
406 VReg output[2]; // output data to scratch area
407 '''
408
409 eCode += getInputCodeOp1S
410
411 eCode += '''
412 int eSizeBytes = 1 << eSize;
413 int numOutputElems = 128 / (8 << eSize);
414 int totNumBytes = numStructElems * eSizeBytes;
415 int stepOffset = step * 32;
416 int stepLimit = 32;
417
418 int r = 0, i, j;
419 XReg data;
420
421 for (int i = 0; i < 2; ++i) {
422 output[i].lo = 0;
423 output[i].hi = 0;
424 }
425
426 for (int pos = stepOffset; pos < stepLimit + stepOffset;
427 pos += eSizeBytes) {
428 if (pos < totNumBytes) {
429 r = pos / 16;
430 j = pos / eSizeBytes;
431 i = (pos / eSizeBytes) % numOutputElems;
432 data = (XReg) readVecElem(input[j], lane, eSize);
433 writeVecElem(&output[r % 2], data, i, eSize);
434 }
435 }
436 '''
437
438 for v in range(2):
439 for p in range(4):
440 eCode += '''
441 AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem(
442 output[%(v)d], %(p)d, 0x2);
443 ''' % { 'v' : v, 'p' : p }
444
445 iop = InstObjParams(name, Name, 'MicroNeonMixLaneOp64',
446 { 'code' : eCode }, ['IsMicroop'])
447 header_output += MicroNeonMixLaneDeclare64.subst(iop)
448 exec_output += MicroNeonMixExecute64.subst(iop)
449
450 # Generate instructions
# Instantiate the micro-op generators: one load/store pair named
# 'mem_neon_uop' + 'ld'/'st', then a de-interleave and an interleave class
# for each register count from 1 to 4.
451 mkMemAccMicroOp('mem_neon_uop')
452 mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_1Reg', numRegs=1)
453 mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_2Reg', numRegs=2)
454 mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_3Reg', numRegs=3)
455 mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_4Reg', numRegs=4)
456 mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_1Reg', numRegs=1)
457 mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_2Reg', numRegs=2)
458 mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_3Reg', numRegs=3)
459 mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_4Reg', numRegs=4)
# The unpack/pack variants omit numRegs and so use mkMarshalMicroOp's
# default value of 4.
460 mkMarshalMicroOp('unpack_neon_uop', 'MicroUnpackNeon64')
461 mkMarshalMicroOp('pack_neon_uop', 'MicroPackNeon64')
462
463}};
464
465let {{
466
# Declare the four 64-bit vector load/store instruction classes. Each
# InstObjParams is built with an empty code string and no flags; only a
# class declaration (header_output) and a constructor (decoder_output) are
# emitted here. The VMem*64 templates are defined outside this block.
# Multi-structure load.
467 iop = InstObjParams('vldmult64', 'VldMult64', 'VldMultOp64', '', [])
468 header_output += VMemMultDeclare64.subst(iop)
469 decoder_output += VMemMultConstructor64.subst(iop)
470
# Multi-structure store.
471 iop = InstObjParams('vstmult64', 'VstMult64', 'VstMultOp64', '', [])
472 header_output += VMemMultDeclare64.subst(iop)
473 decoder_output += VMemMultConstructor64.subst(iop)
474
# Single-structure load.
475 iop = InstObjParams('vldsingle64', 'VldSingle64', 'VldSingleOp64', '', [])
476 header_output += VMemSingleDeclare64.subst(iop)
477 decoder_output += VMemSingleConstructor64.subst(iop)
478
# Single-structure store.
479 iop = InstObjParams('vstsingle64', 'VstSingle64', 'VstSingleOp64', '', [])
480 header_output += VMemSingleDeclare64.subst(iop)
481 decoder_output += VMemSingleConstructor64.subst(iop)
482
483}};