neon64_mem.isa revision 10474:799c8ee4ecba
1// -*- mode: c++ -*-
2
3// Copyright (c) 2012-2014 ARM Limited
4// All rights reserved
5//
6// The license below extends only to copyright in the software and shall
7// not be construed as granting a license to any other intellectual
8// property including but not limited to intellectual property relating
9// to a hardware implementation of the functionality of the software
10// licensed hereunder.  You may use the software subject to the license
11// terms below provided that you ensure that this notice is replicated
12// unmodified and in its entirety in all distributions of the software,
13// modified or unmodified, in source code or in binary form.
14//
15// Redistribution and use in source and binary forms, with or without
16// modification, are permitted provided that the following conditions are
17// met: redistributions of source code must retain the above copyright
18// notice, this list of conditions and the following disclaimer;
19// redistributions in binary form must reproduce the above copyright
20// notice, this list of conditions and the following disclaimer in the
21// documentation and/or other materials provided with the distribution;
22// neither the name of the copyright holders nor the names of its
23// contributors may be used to endorse or promote products derived from
24// this software without specific prior written permission.
25//
26// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
30// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
31// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
32// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
33// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
34// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
35// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
36// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37//
38// Authors: Mbou Eyole
39//          Giacomo Gabrielli
40
41let {{
42
43    header_output = ''
44    decoder_output = ''
45    exec_output = ''
46
47    def mkMemAccMicroOp(name):
        # Build the load/store pair of memory-access micro-ops.
        #
        # name: mnemonic stem; the load is registered as name + 'ld' (class
        #       'MicroNeonLoad64') and the store as name + 'st' (class
        #       'MicroNeonStore64'), both with base class 'MicroNeonMemOp'.
        #
        # Returns nothing.  The generated text is appended to the shared
        # exec_output and header_output strings.  decoder_output is named in
        # the global statement below but is not assigned in this function.
        #
        # NOTE(review): the C++ snippets refer to identifiers that are not
        # declared in this file (memUnion, eSize, baseIsSP, imm, XURa, ...);
        # presumably they come from the templates / operand definitions the
        # snippets are substituted into -- confirm there.
48        global header_output, decoder_output, exec_output
        # Stack-pointer alignment check: the generated code returns an
        # SPAlignmentFault when baseIsSP is set, bits(XURa, 3, 0) is non-zero
        # and SPAlignmentCheckEnabled() reports true.
49        SPAlignmentCheckCodeNeon = '''
50            if (baseIsSP && bits(XURa, 3, 0) &&
51                SPAlignmentCheckEnabled(xc->tcBase())) {
52                return std::make_shared<SPAlignmentFault>();
53            }
54        '''
        # Effective-address code: the alignment check runs first, then the
        # address is computed as base register plus immediate.
55        eaCode = SPAlignmentCheckCodeNeon + '''
56            EA = XURa + imm;
57        '''
        # Declaration of a 16-byte buffer type that can be viewed either as
        # raw bytes or as four 32-bit words.
58        memDecl = '''
59            const int MaxNumBytes = 16;
60            union MemUnion {
61                uint8_t bytes[MaxNumBytes];
62                uint32_t floatRegBits[MaxNumBytes / 4];
63            };
64        '''
65
66        # Do endian conversion for all the elements
        # The snippet packs the four 32-bit words of memUnion into a VReg
        # (lo = words 0-1, hi = words 2-3), rewrites each of the
        # 16 / (1 << eSize) elements through gtobe() when
        # isBigEndian64() is true and through gtole() otherwise (8-bit
        # elements are left alone), then unpacks the VReg back into memUnion.
        # The same snippet is shared by the load and the store.
67        convCode = '''
68            VReg x = {0, 0};
69
70            x.lo = (((XReg) memUnion.floatRegBits[1]) << 32) |
71                (XReg) memUnion.floatRegBits[0];
72            x.hi = (((XReg) memUnion.floatRegBits[3]) << 32) |
73                (XReg) memUnion.floatRegBits[2];
74
75            const unsigned eCount = 16 / (1 << eSize);
76
77            if (isBigEndian64(xc->tcBase())) {
78                for (unsigned i = 0; i < eCount; i++) {
79                    switch (eSize) {
80                      case 0x3:  // 64-bit
81                        writeVecElem(&x, (XReg) gtobe(
82                            (uint64_t) readVecElem(x, i, eSize)), i, eSize);
83                        break;
84                      case 0x2:  // 32-bit
85                        writeVecElem(&x, (XReg) gtobe(
86                            (uint32_t) readVecElem(x, i, eSize)), i, eSize);
87                        break;
88                      case 0x1:  // 16-bit
89                        writeVecElem(&x, (XReg) gtobe(
90                            (uint16_t) readVecElem(x, i, eSize)), i, eSize);
91                        break;
92                      default:  // 8-bit
93                        break;  // Nothing to do here
94                    }
95                }
96            } else {
97                for (unsigned i = 0; i < eCount; i++) {
98                    switch (eSize) {
99                      case 0x3:  // 64-bit
100                        writeVecElem(&x, (XReg) gtole(
101                            (uint64_t) readVecElem(x, i, eSize)), i, eSize);
102                        break;
103                      case 0x2:  // 32-bit
104                        writeVecElem(&x, (XReg) gtole(
105                            (uint32_t) readVecElem(x, i, eSize)), i, eSize);
106                        break;
107                      case 0x1:  // 16-bit
108                        writeVecElem(&x, (XReg) gtole(
109                            (uint16_t) readVecElem(x, i, eSize)), i, eSize);
110                        break;
111                      default:  // 8-bit
112                        break;  // Nothing to do here
113                    }
114                }
115            }
116
117            memUnion.floatRegBits[0] = (uint32_t) x.lo;
118            memUnion.floatRegBits[1] = (uint32_t) (x.lo >> 32);
119            memUnion.floatRegBits[2] = (uint32_t) x.hi;
120            memUnion.floatRegBits[3] = (uint32_t) (x.hi >> 32);
121        '''
122
123        # Offload everything into registers
        # One assignment per 32-bit word: AA64FpDestP0..P3_uw receive
        # gtoh() of the matching memUnion word.
124        regSetCode = ''
125        for reg in range(4):
126            regSetCode += '''
127            AA64FpDestP%(reg)d_uw = gtoh(memUnion.floatRegBits[%(reg)d]);
128            ''' % { 'reg' : reg }
129
130        # Pull everything in from registers
        # Mirror image of the above: each memUnion word receives htog() of
        # AA64FpDestP0..P3_uw.
131        regGetCode = ''
132        for reg in range(4):
133            regGetCode += '''
134            memUnion.floatRegBits[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
135            ''' % { 'reg' : reg }
136
        # Order matters: a load converts the buffer and then writes the
        # registers; a store fills the buffer from the registers and then
        # converts it.
137        loadMemAccCode = convCode + regSetCode
138        storeMemAccCode = regGetCode + convCode
139
        # Both micro-ops share memDecl and the effective-address code, which
        # is prefixed with simd64EnabledCheckCode (not defined in this file;
        # presumably provided by another ISA description file -- confirm).
140        loadIop = InstObjParams(name + 'ld',
141                'MicroNeonLoad64',
142                'MicroNeonMemOp',
143            {   'mem_decl' : memDecl,
144                'memacc_code' : loadMemAccCode,
145                'ea_code' : simd64EnabledCheckCode + eaCode,
146            },
147            [ 'IsMicroop', 'IsMemRef', 'IsLoad' ])
148        storeIop = InstObjParams(name + 'st',
149                'MicroNeonStore64',
150                'MicroNeonMemOp',
151            {   'mem_decl' : memDecl,
152                'memacc_code' : storeMemAccCode,
153                'ea_code' : simd64EnabledCheckCode + eaCode,
154            },
155            [ 'IsMicroop', 'IsMemRef', 'IsStore' ])
156
        # Instantiate the execute / initiateAcc / completeAcc templates for
        # the load and the store, and one class declaration for each.
157        exec_output += NeonLoadExecute64.subst(loadIop) + \
158            NeonLoadInitiateAcc64.subst(loadIop) + \
159            NeonLoadCompleteAcc64.subst(loadIop) + \
160            NeonStoreExecute64.subst(storeIop) + \
161            NeonStoreInitiateAcc64.subst(storeIop) + \
162            NeonStoreCompleteAcc64.subst(storeIop)
163        header_output += MicroNeonMemDeclare64.subst(loadIop) + \
164            MicroNeonMemDeclare64.subst(storeIop)
165
166    def mkMarshalMicroOp(name, Name, numRegs=4):
        # Build one data-marshalling micro-op and append its declaration to
        # header_output and its execute method to exec_output.
        #
        # name:    selects the generator branch; recognised values are
        #          'deint_neon_uop', 'int_neon_uop', 'unpack_neon_uop' and
        #          'pack_neon_uop'.  Any other value falls through every
        #          branch and nothing is generated (there is no else clause).
        # Name:    class name handed to InstObjParams.
        # numRegs: number of input vectors for which element-gathering code
        #          is emitted (see the two loops directly below).
        #
        # decoder_output is named in the global statement but is not assigned
        # in this function.
        #
        # NOTE(review): inside the C++ snippets, numRegs, dataSize, eSize,
        # step, numStructElems, lane and replicate are C++ identifiers, not
        # Python substitutions; presumably they are members of the micro-op
        # classes declared by the templates -- confirm there.
167        global header_output, decoder_output, exec_output
168
        # Code that fills input[v] from the four 32-bit parts P0..P3 of the
        # operand named AA64FpOp1P<p>V<v>_uw, for v in 0..numRegs-1.
169        getInputCodeOp1L = ''
170        for v in range(numRegs):
171            for p in range(4):
172                getInputCodeOp1L += '''
173            writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)d_uw,
174                         %(p)d, 0x2);
175            ''' % { 'v' : v, 'p' : p }
176
        # Same as above but reading the operands whose names carry the 'S'
        # suffix (AA64FpOp1P<p>V<v>S_uw).
177        getInputCodeOp1S = ''
178        for v in range(numRegs):
179            for p in range(4):
180                getInputCodeOp1S += '''
181            writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)dS_uw,
182                         %(p)d, 0x2);
183            ''' % { 'v' : v, 'p' : p }
184
185        if name == 'deint_neon_uop':
186
            # Locals for the generated code; temp starts out zeroed and is
            # then loaded with the current contents of the second destination
            # register (AA64FpDestP<p>V1L_uw) so it can be written back
            # unchanged further down.
187            eCode = '''
188                VReg input[4];  // input data from scratch area
189                VReg output[2];  // output data to arch. SIMD regs
190                VReg temp;
191                temp.lo = 0;
192                temp.hi = 0;
193            '''
194            for p in range(4):
195                eCode += '''
196                writeVecElem(&temp, (XReg) AA64FpDestP%(p)dV1L_uw, %(p)d, 0x2);
197                ''' % { 'p' : p }
198            eCode += getInputCodeOp1L
199
200            # Note that numRegs is not always the same as numStructElems; in
201            # particular, for LD1/ST1, numStructElems is 1 but numRegs can be
202            # 1, 2, 3 or 4
203
            # Main loop: for each of the two output registers and each
            # element slot, compute a byte position in the input area, read
            # the element found there and store it in the output register.
            # Work stops once numBytes reaches totNumBytes.
204            eCode += '''
205                output[0].lo = 0;
206                output[0].hi = 0;
207                output[1].lo = 0;
208                output[1].hi = 0;
209
210                int eCount = dataSize / (8 << eSize);
211                int eSizeBytes = 1 << eSize;  // element size in bytes
212                int numBytes = step * dataSize / 4;
213                int totNumBytes = numRegs * dataSize / 8;
214
215                int structElemNo, pos, a, b;
216                XReg data;
217
218                for (int r = 0; r < 2; ++r) {
219                    for (int i = 0; i < eCount; ++i) {
220                        if (numBytes < totNumBytes) {
221                            structElemNo = r + (step * 2);
222                            if (numStructElems == 1) {
223                                pos = (eSizeBytes * i) +
224                                    (eCount * structElemNo * eSizeBytes);
225                            } else {
226                                pos = (numStructElems * eSizeBytes * i) +
227                                    (structElemNo * eSizeBytes);
228                            }
229                            a = pos / 16;
230                            b = (pos % 16) / eSizeBytes;
231                            data = (XReg) readVecElem(input[a], (XReg) b,
232                                                      eSize);
233                            writeVecElem(&output[r], data, i, eSize);
234                            numBytes += eSizeBytes;
235                        }
236                    }
237                }
238            '''
            # The first destination register is always written from
            # output[0].
239            for p in range(4):
240                eCode += '''
241                AA64FpDestP%(p)dV0L_uw = (uint32_t) readVecElem(output[0],
242                    %(p)d, 0x2);
243                ''' % { 'p' : p }
            # The second destination register receives output[1] only when
            # the generated condition holds; otherwise the value saved in
            # temp is written back.
244            eCode += '''
245                if ((numRegs % 2 == 0) || (numRegs == 3 && step == 0)) {
246            '''
247            for p in range(4):
248                eCode += '''
249                    AA64FpDestP%(p)dV1L_uw = (uint32_t) readVecElem(
250                        output[1], %(p)d, 0x2);
251                ''' % { 'p' : p }
252            eCode += '''
253                } else {
254            '''
255            for p in range(4):
256                eCode += '''
257                    AA64FpDestP%(p)dV1L_uw = (uint32_t) readVecElem(temp,
258                        %(p)d, 0x2);
259                ''' % { 'p' : p }
260            eCode += '''
261                }
262            '''
263
264            iop = InstObjParams(name, Name, 'MicroNeonMixOp64',
265                                { 'code' : eCode, 'op_class' : 'No_OpClass' },
266                                ['IsMicroop'])
267            header_output += MicroNeonMixDeclare64.subst(iop)
268            exec_output += MicroNeonMixExecute64.subst(iop)
269
270        elif name == 'int_neon_uop':
271
272            eCode = '''
273                VReg input[4];  // input data from arch. SIMD regs
274                VReg output[2];  // output data to scratch area
275            '''
276
277            eCode += getInputCodeOp1S
278
279            # Note that numRegs is not always the same as numStructElems; in
280            # particular, for LD1/ST1, numStructElems is 1 but numRegs can be
281            # 1, 2, 3 or 4
282
            # Main loop: walks a 32-byte window starting at step * 32 one
            # element at a time; for each position below totNumBytes it
            # derives an element index i and an input register index j,
            # copies that element into output[r] slot k, and moves on to the
            # next output register once numOutputElems slots are filled.
283            eCode += '''
284                int eCount = dataSize / (8 << eSize);
285                int eSizeBytes = 1 << eSize;
286                int totNumBytes = numRegs * dataSize / 8;
287                int numOutputElems = 128 / (8 << eSize);
288                int stepOffset = step * 32;
289
290                for (int i = 0; i < 2; ++i) {
291                    output[i].lo = 0;
292                    output[i].hi = 0;
293                }
294
295                int r = 0, k = 0, i, j;
296                XReg data;
297
298                for (int pos = stepOffset; pos < 32 + stepOffset;
299                        pos += eSizeBytes) {
300                    if (pos < totNumBytes) {
301                        if (numStructElems == 1) {
302                            i = (pos / eSizeBytes) % eCount;
303                            j = pos / (eCount * eSizeBytes);
304                        } else {
305                            i = pos / (numStructElems * eSizeBytes);
306                            j = (pos % (numStructElems * eSizeBytes)) /
307                                eSizeBytes;
308                        }
309                        data = (XReg) readVecElem(input[j], (XReg) i, eSize);
310                        writeVecElem(&output[r], data, k, eSize);
311                        k++;
312                        if (k == numOutputElems){
313                            k = 0;
314                            ++r;
315                        }
316                    }
317                }
318                '''
            # Both output registers are written out unconditionally, one
            # 32-bit part at a time.
319            for v in range(2):
320                for p in range(4):
321                    eCode += '''
322                AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem(
323                    output[%(v)d], %(p)d, 0x2);
324                ''' % { 'v': v, 'p': p}
325
326            iop = InstObjParams(name, Name, 'MicroNeonMixOp64',
327                                { 'code' : eCode, 'op_class' : 'No_OpClass' },
328                                ['IsMicroop'])
329            header_output += MicroNeonMixDeclare64.subst(iop)
330            exec_output += MicroNeonMixExecute64.subst(iop)
331
332        elif name == 'unpack_neon_uop':
333
334            eCode = '''
335                VReg input[4];  //input data from scratch area
336                VReg output[2];  //output data to arch. SIMD regs
337            '''
338
339            eCode += getInputCodeOp1L
340
341            # Fill output regs with register data initially.  Note that
342            # elements in output register outside indexed lanes are left
343            # untouched
344            for v in range(2):
345                for p in range(4):
346                    eCode += '''
347                writeVecElem(&output[%(v)d], (XReg) AA64FpDestP%(p)dV%(v)dL_uw,
348                             %(p)d, 0x2);
349                ''' % { 'v': v, 'p': p}
            # Main loop: handles up to two elements per step.  Each element
            # read from the input area is either replicated across the first
            # eCount slots of the output register (remaining slots up to
            # eCount128 are zeroed) or written to the single slot 'lane'.
            # Note the inner replicate loop declares its own 'i', shadowing
            # the outer one.
350            eCode += '''
351                int eCount = dataSize / (8 << eSize);
352                int eCount128 = 128 / (8 << eSize);
353                int eSizeBytes = 1 << eSize;
354                int totNumBytes = numStructElems * eSizeBytes;
355                int numInputElems = eCount128;
356                int stepOffset = step * 2 * eSizeBytes;
357                int stepLimit = 2 * eSizeBytes;
358
359                int r = 0, i, j;
360                XReg data;
361
362                for (int pos = stepOffset; pos < stepLimit + stepOffset;
363                        pos += eSizeBytes) {
364                    if (pos < totNumBytes) {
365                        r = pos / eSizeBytes;
366                        j = r / numInputElems;
367                        i = r % numInputElems;
368                        data = (XReg) readVecElem(input[j], (XReg) i, eSize);
369
370                        if (replicate) {
371                            for (int i = 0; i < eCount128; ++i) {
372                                if (i < eCount) {
373                                    writeVecElem(&output[r % 2], data, i,
374                                                 eSize);
375                                } else {  // zero extend if necessary
376                                    writeVecElem(&output[r % 2], (XReg) 0, i,
377                                                 eSize);
378                                }
379                            }
380                        } else {
381                            writeVecElem(&output[r % 2], data, lane, eSize);
382                        }
383                    }
384                }
385            '''
            # Write both output registers back to the destination operands.
386            for v in range(2):
387                for p in range(4):
388                    eCode += '''
389                AA64FpDestP%(p)dV%(v)dL_uw = (uint32_t) readVecElem(
390                    output[%(v)d], %(p)d, 0x2);
391                ''' % { 'v' : v, 'p' : p }
392
            # Unlike the two branches above, this one uses the
            # 'MicroNeonMixLaneOp64' base class and its own declaration
            # template, and passes no 'op_class' entry.
393            iop = InstObjParams(name, Name, 'MicroNeonMixLaneOp64',
394                                { 'code' : eCode }, ['IsMicroop'])
395            header_output += MicroNeonMixLaneDeclare64.subst(iop)
396            exec_output += MicroNeonMixExecute64.subst(iop)
397
398        elif name == 'pack_neon_uop':
399
400            eCode = '''
401                VReg input[4];  // input data from arch. SIMD regs
402                VReg output[2];  // output data to scratch area
403            '''
404
405            eCode += getInputCodeOp1S
406
            # Main loop: walks a 32-byte window starting at step * 32; for
            # each position below totNumBytes it reads element 'lane' of
            # input[j] and stores it in slot i of output[r % 2].
407            eCode += '''
408                int eSizeBytes = 1 << eSize;
409                int numOutputElems = 128 / (8 << eSize);
410                int totNumBytes = numStructElems * eSizeBytes;
411                int stepOffset = step * 32;
412                int stepLimit = 32;
413
414                int r = 0, i, j;
415                XReg data;
416
417                for (int i = 0; i < 2; ++i) {
418                    output[i].lo = 0;
419                    output[i].hi = 0;
420                }
421
422                for (int pos = stepOffset; pos < stepLimit + stepOffset;
423                        pos += eSizeBytes) {
424                    if (pos < totNumBytes) {
425                        r = pos / 16;
426                        j = pos / eSizeBytes;
427                        i = (pos / eSizeBytes) %  numOutputElems;
428                        data = (XReg) readVecElem(input[j], lane, eSize);
429                        writeVecElem(&output[r % 2], data, i, eSize);
430                    }
431                }
432            '''
433
            # Write both output registers back to the destination operands.
434            for v in range(2):
435                for p in range(4):
436                    eCode += '''
437                AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem(
438                    output[%(v)d], %(p)d, 0x2);
439                ''' % { 'v' : v, 'p' : p }
440
441            iop = InstObjParams(name, Name, 'MicroNeonMixLaneOp64',
442                                { 'code' : eCode }, ['IsMicroop'])
443            header_output += MicroNeonMixLaneDeclare64.subst(iop)
444            exec_output += MicroNeonMixExecute64.subst(iop)
445
446    # Generate instructions
    # One memory-access load/store pair, then the marshalling micro-ops.
    # The 'deint' and 'int' generators are instantiated once per register
    # count (1 to 4), each under its own class name; 'unpack' and 'pack'
    # are instantiated once and rely on the default numRegs=4.
447    mkMemAccMicroOp('mem_neon_uop')
448    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_1Reg', numRegs=1)
449    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_2Reg', numRegs=2)
450    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_3Reg', numRegs=3)
451    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_4Reg', numRegs=4)
452    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_1Reg', numRegs=1)
453    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_2Reg', numRegs=2)
454    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_3Reg', numRegs=3)
455    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_4Reg', numRegs=4)
456    mkMarshalMicroOp('unpack_neon_uop', 'MicroUnpackNeon64')
457    mkMarshalMicroOp('pack_neon_uop', 'MicroPackNeon64')
458
459}};
460
461let {{
462
    # Emit a class declaration (header_output) and a constructor
    # (decoder_output) for four instruction classes: multi-structure load
    # and store, then single-structure load and store.  Each InstObjParams
    # is created with an empty code string and an empty flag list.
    #
    # NOTE(review): header_output and decoder_output are only appended to
    # here, so they must already exist when this block runs -- presumably
    # they are the strings initialised in the preceding let block; confirm
    # how the ISA parser shares names between let blocks.
463    iop = InstObjParams('vldmult64', 'VldMult64', 'VldMultOp64', '', [])
464    header_output += VMemMultDeclare64.subst(iop)
465    decoder_output += VMemMultConstructor64.subst(iop)
466
467    iop = InstObjParams('vstmult64', 'VstMult64', 'VstMultOp64', '', [])
468    header_output += VMemMultDeclare64.subst(iop)
469    decoder_output += VMemMultConstructor64.subst(iop)
470
    # The single-structure variants use the VMemSingle* templates instead
    # of the VMemMult* ones.
471    iop = InstObjParams('vldsingle64', 'VldSingle64', 'VldSingleOp64', '', [])
472    header_output += VMemSingleDeclare64.subst(iop)
473    decoder_output += VMemSingleConstructor64.subst(iop)
474
475    iop = InstObjParams('vstsingle64', 'VstSingle64', 'VstSingleOp64', '', [])
476    header_output += VMemSingleDeclare64.subst(iop)
477    decoder_output += VMemSingleConstructor64.subst(iop)
478
479}};
480