neon64_mem.isa revision 10537:47fe87b0cf97
1// -*- mode: c++ -*-
2
3// Copyright (c) 2012-2014 ARM Limited
4// All rights reserved
5//
6// The license below extends only to copyright in the software and shall
7// not be construed as granting a license to any other intellectual
8// property including but not limited to intellectual property relating
9// to a hardware implementation of the functionality of the software
10// licensed hereunder.  You may use the software subject to the license
11// terms below provided that you ensure that this notice is replicated
12// unmodified and in its entirety in all distributions of the software,
13// modified or unmodified, in source code or in binary form.
14//
15// Redistribution and use in source and binary forms, with or without
16// modification, are permitted provided that the following conditions are
17// met: redistributions of source code must retain the above copyright
18// notice, this list of conditions and the following disclaimer;
19// redistributions in binary form must reproduce the above copyright
20// notice, this list of conditions and the following disclaimer in the
21// documentation and/or other materials provided with the distribution;
22// neither the name of the copyright holders nor the names of its
23// contributors may be used to endorse or promote products derived from
24// this software without specific prior written permission.
25//
26// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
30// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
31// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
32// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
33// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
34// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
35// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
36// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37//
38// Authors: Mbou Eyole
39//          Giacomo Gabrielli
40
41let {{
42
    # Output accumulators for this let block.  The helpers below append
    # template expansions to header_output and exec_output; nothing in
    # this let block appends to decoder_output.
43    header_output = ''
44    decoder_output = ''
45    exec_output = ''
46
47    def mkMemAccMicroOp(name):
        # Builds the NEON memory-access micro-op pair: a load named
        # name + 'ld' (class MicroNeonLoad64) and a store named name + 'st'
        # (class MicroNeonStore64), both derived from MicroNeonMemOp.
        # Their template expansions are appended to header_output and
        # exec_output; decoder_output is declared global but not modified.
48        global header_output, decoder_output, exec_output
        # C++ guard: returns an SPAlignmentFault when baseIsSP is set,
        # bits(XURa, 3, 0) is non-zero and SPAlignmentCheckEnabled() holds.
49        SPAlignmentCheckCodeNeon = '''
50            if (baseIsSP && bits(XURa, 3, 0) &&
51                SPAlignmentCheckEnabled(xc->tcBase())) {
52                return std::make_shared<SPAlignmentFault>();
53            }
54        '''
        # Effective address is XURa + imm, computed after the guard above.
55        eaCode = SPAlignmentCheckCodeNeon + '''
56            EA = XURa + imm;
57        '''
        # 16-byte transfer buffer, viewable as bytes or as four 32-bit words.
58        memDecl = '''
59            const int MaxNumBytes = 16;
60            union MemUnion {
61                uint8_t bytes[MaxNumBytes];
62                uint32_t floatRegBits[MaxNumBytes / 4];
63            };
64        '''
65
66        # Do endian conversion for all the elements
        # The snippet assembles a VReg (lo/hi 64-bit halves) from the four
        # words of memUnion, passes each of the 16 / (1 << eSize) elements
        # through gtobe() when isBigEndian64() is true and gtole() otherwise
        # (8-bit elements are left as they are), then splits the VReg back
        # into the four words.
67        convCode = '''
68            VReg x = {0, 0};
69
70            x.lo = (((XReg) memUnion.floatRegBits[1]) << 32) |
71                (XReg) memUnion.floatRegBits[0];
72            x.hi = (((XReg) memUnion.floatRegBits[3]) << 32) |
73                (XReg) memUnion.floatRegBits[2];
74
75            const unsigned eCount = 16 / (1 << eSize);
76
77            if (isBigEndian64(xc->tcBase())) {
78                for (unsigned i = 0; i < eCount; i++) {
79                    switch (eSize) {
80                      case 0x3:  // 64-bit
81                        writeVecElem(&x, (XReg) gtobe(
82                            (uint64_t) readVecElem(x, i, eSize)), i, eSize);
83                        break;
84                      case 0x2:  // 32-bit
85                        writeVecElem(&x, (XReg) gtobe(
86                            (uint32_t) readVecElem(x, i, eSize)), i, eSize);
87                        break;
88                      case 0x1:  // 16-bit
89                        writeVecElem(&x, (XReg) gtobe(
90                            (uint16_t) readVecElem(x, i, eSize)), i, eSize);
91                        break;
92                      default:  // 8-bit
93                        break;  // Nothing to do here
94                    }
95                }
96            } else {
97                for (unsigned i = 0; i < eCount; i++) {
98                    switch (eSize) {
99                      case 0x3:  // 64-bit
100                        writeVecElem(&x, (XReg) gtole(
101                            (uint64_t) readVecElem(x, i, eSize)), i, eSize);
102                        break;
103                      case 0x2:  // 32-bit
104                        writeVecElem(&x, (XReg) gtole(
105                            (uint32_t) readVecElem(x, i, eSize)), i, eSize);
106                        break;
107                      case 0x1:  // 16-bit
108                        writeVecElem(&x, (XReg) gtole(
109                            (uint16_t) readVecElem(x, i, eSize)), i, eSize);
110                        break;
111                      default:  // 8-bit
112                        break;  // Nothing to do here
113                    }
114                }
115            }
116
117            memUnion.floatRegBits[0] = (uint32_t) x.lo;
118            memUnion.floatRegBits[1] = (uint32_t) (x.lo >> 32);
119            memUnion.floatRegBits[2] = (uint32_t) x.hi;
120            memUnion.floatRegBits[3] = (uint32_t) (x.hi >> 32);
121        '''
122
123        # Offload everything into registers
        # Emits one statement per 32-bit word n in 0..3:
        #   AA64FpDestP<n>_uw = gtoh(memUnion.floatRegBits[n]);
124        regSetCode = ''
125        for reg in range(4):
126            regSetCode += '''
127            AA64FpDestP%(reg)d_uw = gtoh(memUnion.floatRegBits[%(reg)d]);
128            ''' % { 'reg' : reg }
129
130        # Pull everything in from registers
        # Mirror of the above for stores:
        #   memUnion.floatRegBits[n] = htog(AA64FpDestP<n>_uw);
131        regGetCode = ''
132        for reg in range(4):
133            regGetCode += '''
134            memUnion.floatRegBits[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
135            ''' % { 'reg' : reg }
136
        # Loads convert the buffer first and then write the registers;
        # stores read the registers first and then convert the buffer.
137        loadMemAccCode = convCode + regSetCode
138        storeMemAccCode = regGetCode + convCode
139
        # Both micro-ops share memDecl and the EA code.  The EA code is
        # prefixed with simd64EnabledCheckCode, which is not defined in
        # this block and must be provided elsewhere.
140        loadIop = InstObjParams(name + 'ld',
141                'MicroNeonLoad64',
142                'MicroNeonMemOp',
143            {   'mem_decl' : memDecl,
144                'memacc_code' : loadMemAccCode,
145                'ea_code' : simd64EnabledCheckCode + eaCode,
146            },
147            [ 'IsMicroop', 'IsMemRef', 'IsLoad' ])
148        storeIop = InstObjParams(name + 'st',
149                'MicroNeonStore64',
150                'MicroNeonMemOp',
151            {   'mem_decl' : memDecl,
152                'memacc_code' : storeMemAccCode,
153                'ea_code' : simd64EnabledCheckCode + eaCode,
154            },
155            [ 'IsMicroop', 'IsMemRef', 'IsStore' ])
156
        # Expand the Execute / InitiateAcc / CompleteAcc templates for the
        # load and then the store, followed by the shared declaration
        # template for each of them.
157        exec_output += NeonLoadExecute64.subst(loadIop) + \
158            NeonLoadInitiateAcc64.subst(loadIop) + \
159            NeonLoadCompleteAcc64.subst(loadIop) + \
160            NeonStoreExecute64.subst(storeIop) + \
161            NeonStoreInitiateAcc64.subst(storeIop) + \
162            NeonStoreCompleteAcc64.subst(storeIop)
163        header_output += MicroNeonMemDeclare64.subst(loadIop) + \
164            MicroNeonMemDeclare64.subst(storeIop)
165
166    def mkMarshalMicroOp(name, Name, numRegs=4):
        # Generates one data-marshalling micro-op and appends its
        # declaration to header_output and its execute code to exec_output.
        #   name:    micro-op mnemonic; it also selects which variant is
        #            generated ('deint_neon_uop', 'int_neon_uop',
        #            'unpack_neon_uop' or 'pack_neon_uop').  Any other value
        #            falls through every branch and generates nothing.
        #   Name:    second argument to InstObjParams
        #            (e.g. 'MicroDeintNeon64_1Reg').
        #   numRegs: number of input[] entries the generated code fills from
        #            the Op1 operands; the generated arrays hold four entries.
        # NOTE(review): the C++ text also refers to numRegs, step, dataSize,
        # eSize, numStructElems, lane and replicate.  None of these is
        # defined in this block, so they presumably come from the micro-op
        # base class -- confirm.
167        global header_output, decoder_output, exec_output
168
        # C++ that fills input[v], one 32-bit element (size code 0x2) at a
        # time, from the operands AA64FpOp1P<p>V<v>_uw.
169        getInputCodeOp1L = ''
170        for v in range(numRegs):
171            for p in range(4):
172                getInputCodeOp1L += '''
173            writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)d_uw,
174                         %(p)d, 0x2);
175            ''' % { 'v' : v, 'p' : p }
176
        # Same as above but reading the operands AA64FpOp1P<p>V<v>S_uw.
177        getInputCodeOp1S = ''
178        for v in range(numRegs):
179            for p in range(4):
180                getInputCodeOp1S += '''
181            writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)dS_uw,
182                         %(p)d, 0x2);
183            ''' % { 'v' : v, 'p' : p }
184
        # --- De-interleave: input[] elements are gathered into two output
        # registers that are written to AA64FpDestP<p>V0L/V1L_uw.
185        if name == 'deint_neon_uop':
186
187            eCode = '''
188                // input data from scratch area
189                VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
190                VReg output[2];  // output data to arch. SIMD regs
191                VReg temp;
192                temp.lo = 0;
193                temp.hi = 0;
194            '''
            # Save the current AA64FpDestP<p>V1L_uw words in temp; they are
            # written back unchanged when output[1] is not used (see the
            # else branch emitted further down).
195            for p in range(4):
196                eCode += '''
197                writeVecElem(&temp, (XReg) AA64FpDestP%(p)dV1L_uw, %(p)d, 0x2);
198                ''' % { 'p' : p }
199            eCode += getInputCodeOp1L
200
201            # Note that numRegs is not always the same as numStructElems; in
202            # particular, for LD1/ST1, numStructElems is 1 but numRegs can be
203            # 1, 2, 3 or 4
204
            # For each of the two output registers r and each element i, the
            # source byte offset pos is computed (contiguous layout when
            # numStructElems == 1, strided by numStructElems otherwise) and
            # element (pos % 16) / eSizeBytes of input[pos / 16] is copied to
            # output[r].  numBytes starts at step * dataSize / 4, grows by
            # eSizeBytes per copied element, and copying stops once it
            # reaches totNumBytes.
205            eCode += '''
206                output[0].lo = 0;
207                output[0].hi = 0;
208                output[1].lo = 0;
209                output[1].hi = 0;
210
211                int eCount = dataSize / (8 << eSize);
212                int eSizeBytes = 1 << eSize;  // element size in bytes
213                int numBytes = step * dataSize / 4;
214                int totNumBytes = numRegs * dataSize / 8;
215
216                int structElemNo, pos, a, b;
217                XReg data;
218
219                for (int r = 0; r < 2; ++r) {
220                    for (int i = 0; i < eCount; ++i) {
221                        if (numBytes < totNumBytes) {
222                            structElemNo = r + (step * 2);
223                            if (numStructElems == 1) {
224                                pos = (eSizeBytes * i) +
225                                    (eCount * structElemNo * eSizeBytes);
226                            } else {
227                                pos = (numStructElems * eSizeBytes * i) +
228                                    (structElemNo * eSizeBytes);
229                            }
230                            a = pos / 16;
231                            b = (pos % 16) / eSizeBytes;
232                            data = (XReg) readVecElem(input[a], (XReg) b,
233                                                      eSize);
234                            writeVecElem(&output[r], data, i, eSize);
235                            numBytes += eSizeBytes;
236                        }
237                    }
238                }
239            '''
            # output[0] is always written to AA64FpDestP<p>V0L_uw.
240            for p in range(4):
241                eCode += '''
242                AA64FpDestP%(p)dV0L_uw = (uint32_t) readVecElem(output[0],
243                    %(p)d, 0x2);
244                ''' % { 'p' : p }
            # AA64FpDestP<p>V1L_uw receives output[1] when numRegs is even or
            # when numRegs == 3 and step == 0; otherwise it gets the value
            # saved in temp.
245            eCode += '''
246                if ((numRegs % 2 == 0) || (numRegs == 3 && step == 0)) {
247            '''
248            for p in range(4):
249                eCode += '''
250                    AA64FpDestP%(p)dV1L_uw = (uint32_t) readVecElem(
251                        output[1], %(p)d, 0x2);
252                ''' % { 'p' : p }
253            eCode += '''
254                } else {
255            '''
256            for p in range(4):
257                eCode += '''
258                    AA64FpDestP%(p)dV1L_uw = (uint32_t) readVecElem(temp,
259                        %(p)d, 0x2);
260                ''' % { 'p' : p }
261            eCode += '''
262                }
263            '''
264
265            iop = InstObjParams(name, Name, 'MicroNeonMixOp64',
266                                { 'code' : eCode, 'op_class' : 'No_OpClass' },
267                                ['IsMicroop'])
268            header_output += MicroNeonMixDeclare64.subst(iop)
269            exec_output += MicroNeonMixExecute64.subst(iop)
270
        # --- Interleave: the inverse gather, from the S-suffixed Op1
        # operands into AA64FpDestP<p>V0/V1_uw.
271        elif name == 'int_neon_uop':
272
273            eCode = '''
274                // input data from arch. SIMD regs
275                VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
276                VReg output[2];  // output data to scratch area
277            '''
278
279            eCode += getInputCodeOp1S
280
281            # Note that numRegs is not always the same as numStructElems; in
282            # particular, for LD1/ST1, numStructElems is 1 but numRegs can be
283            # 1, 2, 3 or 4
284
            # Walks the byte positions [step * 32, step * 32 + 32) in
            # eSizeBytes steps.  For every position below totNumBytes it
            # reads element i of input[j] (index formulas differ for
            # numStructElems == 1) and stores it as element k of output[r];
            # r advances once numOutputElems elements have been written.
285            eCode += '''
286                int eCount = dataSize / (8 << eSize);
287                int eSizeBytes = 1 << eSize;
288                int totNumBytes = numRegs * dataSize / 8;
289                int numOutputElems = 128 / (8 << eSize);
290                int stepOffset = step * 32;
291
292                for (int i = 0; i < 2; ++i) {
293                    output[i].lo = 0;
294                    output[i].hi = 0;
295                }
296
297                int r = 0, k = 0, i, j;
298                XReg data;
299
300                for (int pos = stepOffset; pos < 32 + stepOffset;
301                        pos += eSizeBytes) {
302                    if (pos < totNumBytes) {
303                        if (numStructElems == 1) {
304                            i = (pos / eSizeBytes) % eCount;
305                            j = pos / (eCount * eSizeBytes);
306                        } else {
307                            i = pos / (numStructElems * eSizeBytes);
308                            j = (pos % (numStructElems * eSizeBytes)) /
309                                eSizeBytes;
310                        }
311                        data = (XReg) readVecElem(input[j], (XReg) i, eSize);
312                        writeVecElem(&output[r], data, k, eSize);
313                        k++;
314                        if (k == numOutputElems){
315                            k = 0;
316                            ++r;
317                        }
318                    }
319                }
320                '''
            # Write both output registers back, word by word, to
            # AA64FpDestP<p>V<v>_uw.
321            for v in range(2):
322                for p in range(4):
323                    eCode += '''
324                AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem(
325                    output[%(v)d], %(p)d, 0x2);
326                ''' % { 'v': v, 'p': p}
327
328            iop = InstObjParams(name, Name, 'MicroNeonMixOp64',
329                                { 'code' : eCode, 'op_class' : 'No_OpClass' },
330                                ['IsMicroop'])
331            header_output += MicroNeonMixDeclare64.subst(iop)
332            exec_output += MicroNeonMixExecute64.subst(iop)
333
        # --- Unpack: single elements from input[] are placed into one lane
        # of (or replicated across) the destination registers.
334        elif name == 'unpack_neon_uop':
335
336            eCode = '''
337                //input data from scratch area
338                VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
339                VReg output[2];  //output data to arch. SIMD regs
340            '''
341
342            eCode += getInputCodeOp1L
343
344            # Fill output regs with register data initially.  Note that
345            # elements in output register outside indexed lanes are left
346            # untouched
347            for v in range(2):
348                for p in range(4):
349                    eCode += '''
350                writeVecElem(&output[%(v)d], (XReg) AA64FpDestP%(p)dV%(v)dL_uw,
351                             %(p)d, 0x2);
352                ''' % { 'v': v, 'p': p}
            # Handles up to two elements per step (byte positions
            # [step * 2 * eSizeBytes, +2 * eSizeBytes) that lie below
            # totNumBytes).  Each element is read from input[] and then
            # either replicated into the first eCount elements of
            # output[r % 2] with the remaining elements up to eCount128
            # zeroed, or written to element `lane` only.
353            eCode += '''
354                int eCount = dataSize / (8 << eSize);
355                int eCount128 = 128 / (8 << eSize);
356                int eSizeBytes = 1 << eSize;
357                int totNumBytes = numStructElems * eSizeBytes;
358                int numInputElems = eCount128;
359                int stepOffset = step * 2 * eSizeBytes;
360                int stepLimit = 2 * eSizeBytes;
361
362                int r = 0, i, j;
363                XReg data;
364
365                for (int pos = stepOffset; pos < stepLimit + stepOffset;
366                        pos += eSizeBytes) {
367                    if (pos < totNumBytes) {
368                        r = pos / eSizeBytes;
369                        j = r / numInputElems;
370                        i = r % numInputElems;
371                        data = (XReg) readVecElem(input[j], (XReg) i, eSize);
372
373                        if (replicate) {
374                            for (int i = 0; i < eCount128; ++i) {
375                                if (i < eCount) {
376                                    writeVecElem(&output[r % 2], data, i,
377                                                 eSize);
378                                } else {  // zero extend if necessary
379                                    writeVecElem(&output[r % 2], (XReg) 0, i,
380                                                 eSize);
381                                }
382                            }
383                        } else {
384                            writeVecElem(&output[r % 2], data, lane, eSize);
385                        }
386                    }
387                }
388            '''
            # Write both output registers back to AA64FpDestP<p>V<v>L_uw.
389            for v in range(2):
390                for p in range(4):
391                    eCode += '''
392                AA64FpDestP%(p)dV%(v)dL_uw = (uint32_t) readVecElem(
393                    output[%(v)d], %(p)d, 0x2);
394                ''' % { 'v' : v, 'p' : p }
395
            # Unlike the deint/int variants, this uses the MixLane base class
            # and declaration template and passes no 'op_class' entry; the
            # execute template is the same MicroNeonMixExecute64.
396            iop = InstObjParams(name, Name, 'MicroNeonMixLaneOp64',
397                                { 'code' : eCode }, ['IsMicroop'])
398            header_output += MicroNeonMixLaneDeclare64.subst(iop)
399            exec_output += MicroNeonMixExecute64.subst(iop)
400
        # --- Pack: element `lane` of each input register is collected into
        # consecutive elements of the output registers.
401        elif name == 'pack_neon_uop':
402
403            eCode = '''
404                // input data from arch. SIMD regs
405                VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
406                VReg output[2];  // output data to scratch area
407            '''
408
409            eCode += getInputCodeOp1S
410
            # Walks the byte positions [step * 32, step * 32 + 32) in
            # eSizeBytes steps; for every position below totNumBytes it reads
            # element `lane` of input[pos / eSizeBytes] and writes it to
            # element (pos / eSizeBytes) % numOutputElems of
            # output[(pos / 16) % 2].
411            eCode += '''
412                int eSizeBytes = 1 << eSize;
413                int numOutputElems = 128 / (8 << eSize);
414                int totNumBytes = numStructElems * eSizeBytes;
415                int stepOffset = step * 32;
416                int stepLimit = 32;
417
418                int r = 0, i, j;
419                XReg data;
420
421                for (int i = 0; i < 2; ++i) {
422                    output[i].lo = 0;
423                    output[i].hi = 0;
424                }
425
426                for (int pos = stepOffset; pos < stepLimit + stepOffset;
427                        pos += eSizeBytes) {
428                    if (pos < totNumBytes) {
429                        r = pos / 16;
430                        j = pos / eSizeBytes;
431                        i = (pos / eSizeBytes) %  numOutputElems;
432                        data = (XReg) readVecElem(input[j], lane, eSize);
433                        writeVecElem(&output[r % 2], data, i, eSize);
434                    }
435                }
436            '''
437
            # Write both output registers back to AA64FpDestP<p>V<v>_uw.
438            for v in range(2):
439                for p in range(4):
440                    eCode += '''
441                AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem(
442                    output[%(v)d], %(p)d, 0x2);
443                ''' % { 'v' : v, 'p' : p }
444
            # As for unpack: MixLane base class and declaration template, no
            # 'op_class' entry, shared MicroNeonMixExecute64 template.
445            iop = InstObjParams(name, Name, 'MicroNeonMixLaneOp64',
446                                { 'code' : eCode }, ['IsMicroop'])
447            header_output += MicroNeonMixLaneDeclare64.subst(iop)
448            exec_output += MicroNeonMixExecute64.subst(iop)
449
450    # Generate instructions
    # One load/store memory micro-op pair, then the de-interleave and
    # interleave micro-ops in four variants each (numRegs = 1..4), and a
    # single unpack and pack micro-op using the default numRegs of 4.
451    mkMemAccMicroOp('mem_neon_uop')
452    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_1Reg', numRegs=1)
453    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_2Reg', numRegs=2)
454    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_3Reg', numRegs=3)
455    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_4Reg', numRegs=4)
456    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_1Reg', numRegs=1)
457    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_2Reg', numRegs=2)
458    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_3Reg', numRegs=3)
459    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_4Reg', numRegs=4)
460    mkMarshalMicroOp('unpack_neon_uop', 'MicroUnpackNeon64')
461    mkMarshalMicroOp('pack_neon_uop', 'MicroPackNeon64')
462
463}};
464
465let {{
466
    # Declarations and constructors for VldMult64, VstMult64, VldSingle64
    # and VstSingle64.  Each is expanded with a declaration template into
    # header_output and a constructor template into decoder_output.  An
    # empty string is passed where the micro-ops earlier in this file pass
    # a dict of code snippets, and the final list argument is empty.
    # NOTE(review): presumably these are the macro-ops that sequence the
    # micro-ops generated above -- confirm against the base classes.
    # header_output and decoder_output are not assigned in this let block
    # before being appended to, so they must already exist in the context
    # this code runs in.

    # Multiple-structure load.
467    iop = InstObjParams('vldmult64', 'VldMult64', 'VldMultOp64', '', [])
468    header_output += VMemMultDeclare64.subst(iop)
469    decoder_output += VMemMultConstructor64.subst(iop)
470
    # Multiple-structure store; same templates as the load.
471    iop = InstObjParams('vstmult64', 'VstMult64', 'VstMultOp64', '', [])
472    header_output += VMemMultDeclare64.subst(iop)
473    decoder_output += VMemMultConstructor64.subst(iop)
474
    # Single-structure load, using the VMemSingle* templates.
475    iop = InstObjParams('vldsingle64', 'VldSingle64', 'VldSingleOp64', '', [])
476    header_output += VMemSingleDeclare64.subst(iop)
477    decoder_output += VMemSingleConstructor64.subst(iop)
478
    # Single-structure store; same templates as the single load.
479    iop = InstObjParams('vstsingle64', 'VstSingle64', 'VstSingleOp64', '', [])
480    header_output += VMemSingleDeclare64.subst(iop)
481    decoder_output += VMemSingleConstructor64.subst(iop)
482
483}};
484