neon64_mem.isa revision 12583:0c047fc2b3e0
1// -*- mode: c++ -*-
2
3// Copyright (c) 2012-2014 ARM Limited
4// All rights reserved
5//
6// The license below extends only to copyright in the software and shall
7// not be construed as granting a license to any other intellectual
8// property including but not limited to intellectual property relating
9// to a hardware implementation of the functionality of the software
10// licensed hereunder.  You may use the software subject to the license
11// terms below provided that you ensure that this notice is replicated
12// unmodified and in its entirety in all distributions of the software,
13// modified or unmodified, in source code or in binary form.
14//
15// Redistribution and use in source and binary forms, with or without
16// modification, are permitted provided that the following conditions are
17// met: redistributions of source code must retain the above copyright
18// notice, this list of conditions and the following disclaimer;
19// redistributions in binary form must reproduce the above copyright
20// notice, this list of conditions and the following disclaimer in the
21// documentation and/or other materials provided with the distribution;
22// neither the name of the copyright holders nor the names of its
23// contributors may be used to endorse or promote products derived from
24// this software without specific prior written permission.
25//
26// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
30// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
31// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
32// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
33// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
34// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
35// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
36// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37//
38// Authors: Mbou Eyole
39//          Giacomo Gabrielli
40
41let {{
42
43    header_output = ''
44    decoder_output = ''
45    exec_output = ''
46
47    def mkMemAccMicroOp(name):
48        global header_output, decoder_output, exec_output
49        SPAlignmentCheckCodeNeon = '''
50            if (baseIsSP && bits(XURa, 3, 0) &&
51                SPAlignmentCheckEnabled(xc->tcBase())) {
52                return std::make_shared<SPAlignmentFault>();
53            }
54        '''
55        eaCode = SPAlignmentCheckCodeNeon + '''
56            EA = XURa + imm;
57        '''
58        memDecl = '''
59            const int MaxNumBytes = 16;
60            union MemUnion {
61                uint8_t bytes[MaxNumBytes];
62                uint32_t floatRegBits[MaxNumBytes / 4];
63            };
64        '''
65
66        # Do endian conversion for all the elements
67        convCode = '''
68            VReg x = {0, 0};
69
70            x.lo = (((XReg) memUnion.floatRegBits[1]) << 32) |
71                (XReg) memUnion.floatRegBits[0];
72            x.hi = (((XReg) memUnion.floatRegBits[3]) << 32) |
73                (XReg) memUnion.floatRegBits[2];
74
75            const unsigned eCount = 16 / (1 << eSize);
76
77            if (isBigEndian64(xc->tcBase())) {
78                for (unsigned i = 0; i < eCount; i++) {
79                    switch (eSize) {
80                      case 0x3:  // 64-bit
81                        writeVecElem(&x, (XReg) gtobe(
82                            (uint64_t) readVecElem(x, i, eSize)), i, eSize);
83                        break;
84                      case 0x2:  // 32-bit
85                        writeVecElem(&x, (XReg) gtobe(
86                            (uint32_t) readVecElem(x, i, eSize)), i, eSize);
87                        break;
88                      case 0x1:  // 16-bit
89                        writeVecElem(&x, (XReg) gtobe(
90                            (uint16_t) readVecElem(x, i, eSize)), i, eSize);
91                        break;
92                      default:  // 8-bit
93                        break;  // Nothing to do here
94                    }
95                }
96            } else {
97                for (unsigned i = 0; i < eCount; i++) {
98                    switch (eSize) {
99                      case 0x3:  // 64-bit
100                        writeVecElem(&x, (XReg) gtole(
101                            (uint64_t) readVecElem(x, i, eSize)), i, eSize);
102                        break;
103                      case 0x2:  // 32-bit
104                        writeVecElem(&x, (XReg) gtole(
105                            (uint32_t) readVecElem(x, i, eSize)), i, eSize);
106                        break;
107                      case 0x1:  // 16-bit
108                        writeVecElem(&x, (XReg) gtole(
109                            (uint16_t) readVecElem(x, i, eSize)), i, eSize);
110                        break;
111                      default:  // 8-bit
112                        break;  // Nothing to do here
113                    }
114                }
115            }
116
117            memUnion.floatRegBits[0] = (uint32_t) x.lo;
118            memUnion.floatRegBits[1] = (uint32_t) (x.lo >> 32);
119            memUnion.floatRegBits[2] = (uint32_t) x.hi;
120            memUnion.floatRegBits[3] = (uint32_t) (x.hi >> 32);
121        '''
122
123        # Offload everything into registers
124        regSetCode = ''
125        for reg in range(4):
126            regSetCode += '''
127            AA64FpDestP%(reg)d_uw = gtoh(memUnion.floatRegBits[%(reg)d]);
128            ''' % { 'reg' : reg }
129
130        # Pull everything in from registers
131        regGetCode = ''
132        for reg in range(4):
133            regGetCode += '''
134            memUnion.floatRegBits[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
135            ''' % { 'reg' : reg }
136
137        loadMemAccCode = convCode + regSetCode
138        storeMemAccCode = regGetCode + convCode
139
140        loadIop = InstObjParams(name + 'ld',
141                'MicroNeonLoad64',
142                'MicroNeonMemOp',
143            {   'mem_decl' : memDecl,
144                'memacc_code' : loadMemAccCode,
145                'ea_code' : simd64EnabledCheckCode + eaCode,
146            },
147            [ 'IsMicroop', 'IsMemRef', 'IsLoad' ])
148        storeIop = InstObjParams(name + 'st',
149                'MicroNeonStore64',
150                'MicroNeonMemOp',
151            {   'mem_decl' : memDecl,
152                'memacc_code' : storeMemAccCode,
153                'ea_code' : simd64EnabledCheckCode + eaCode,
154            },
155            [ 'IsMicroop', 'IsMemRef', 'IsStore' ])
156
157        exec_output += NeonLoadExecute64.subst(loadIop) + \
158            NeonLoadInitiateAcc64.subst(loadIop) + \
159            NeonLoadCompleteAcc64.subst(loadIop) + \
160            NeonStoreExecute64.subst(storeIop) + \
161            NeonStoreInitiateAcc64.subst(storeIop) + \
162            NeonStoreCompleteAcc64.subst(storeIop)
163        header_output += MicroNeonMemDeclare64.subst(loadIop) + \
164            MicroNeonMemDeclare64.subst(storeIop)
165
166    def mkMarshalMicroOp(name, Name, numRegs=4):
167        global header_output, decoder_output, exec_output
168
169        getInputCodeOp1L = ''
170        for v in range(numRegs):
171            for p in range(4):
172                getInputCodeOp1L += '''
173            writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)d_uw,
174                         %(p)d, 0x2);
175            ''' % { 'v' : v, 'p' : p }
176
177        getInputCodeOp1S = ''
178        for v in range(numRegs):
179            for p in range(4):
180                getInputCodeOp1S += '''
181            writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)dS_uw,
182                         %(p)d, 0x2);
183            ''' % { 'v' : v, 'p' : p }
184
185        if name == 'deint_neon_uop':
186
187            eCode = '''
188                // input data from scratch area
189                VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
190                VReg output[2];  // output data to arch. SIMD regs
191                VReg temp;
192                temp.lo = 0;
193                temp.hi = 0;
194            '''
195            for p in range(4):
196                eCode += '''
197                writeVecElem(&temp, (XReg) AA64FpDestP%(p)dV1L_uw, %(p)d, 0x2);
198                ''' % { 'p' : p }
199            eCode += getInputCodeOp1L
200
201            # Note that numRegs is not always the same as numStructElems; in
202            # particular, for LD1/ST1, numStructElems is 1 but numRegs can be
203            # 1, 2, 3 or 4
204
205            eCode += '''
206                output[0].lo = 0;
207                output[0].hi = 0;
208                output[1].lo = 0;
209                output[1].hi = 0;
210
211                int eCount = dataSize / (8 << eSize);
212                int eSizeBytes = 1 << eSize;  // element size in bytes
213                int numBytes = step * dataSize / 4;
214                int totNumBytes = numRegs * dataSize / 8;
215
216                int structElemNo, pos, a, b;
217                XReg data;
218
219                for (int r = 0; r < 2; ++r) {
220                    for (int i = 0; i < eCount; ++i) {
221                        if (numBytes < totNumBytes) {
222                            structElemNo = r + (step * 2);
223                            if (numStructElems == 1) {
224                                pos = (eSizeBytes * i) +
225                                    (eCount * structElemNo * eSizeBytes);
226                            } else {
227                                pos = (numStructElems * eSizeBytes * i) +
228                                    (structElemNo * eSizeBytes);
229                            }
230                            a = pos / 16;
231                            b = (pos % 16) / eSizeBytes;
232                            data = (XReg) readVecElem(input[a], (XReg) b,
233                                                      eSize);
234                            writeVecElem(&output[r], data, i, eSize);
235                            numBytes += eSizeBytes;
236                        }
237                    }
238                }
239            '''
240            for p in range(4):
241                eCode += '''
242                AA64FpDestP%(p)dV0L_uw = (uint32_t) readVecElem(output[0],
243                    %(p)d, 0x2);
244                ''' % { 'p' : p }
245            eCode += '''
246                if ((numRegs % 2 == 0) || (numRegs == 3 && step == 0)) {
247            '''
248            for p in range(4):
249                eCode += '''
250                    AA64FpDestP%(p)dV1L_uw = (uint32_t) readVecElem(
251                        output[1], %(p)d, 0x2);
252                ''' % { 'p' : p }
253            eCode += '''
254                } else {
255            '''
256            for p in range(4):
257                eCode += '''
258                    AA64FpDestP%(p)dV1L_uw = (uint32_t) readVecElem(temp,
259                        %(p)d, 0x2);
260                ''' % { 'p' : p }
261            eCode += '''
262                }
263            '''
264
265            iop = InstObjParams(name, Name, 'MicroNeonMixOp64',
266                                { 'code' : eCode, 'op_class' : 'No_OpClass' },
267                                ['IsMicroop'])
268            header_output += MicroNeonMixDeclare64.subst(iop)
269            exec_output += MicroNeonMixExecute64.subst(iop)
270
271        elif name == 'int_neon_uop':
272
273            eCode = '''
274                // input data from arch. SIMD regs
275                VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
276                VReg output[2];  // output data to scratch area
277            '''
278
279            eCode += getInputCodeOp1S
280
281            # Note that numRegs is not always the same as numStructElems; in
282            # particular, for LD1/ST1, numStructElems is 1 but numRegs can be
283            # 1, 2, 3 or 4
284
285            eCode += '''
286                int eCount = dataSize / (8 << eSize);
287                int eSizeBytes = 1 << eSize;
288                int totNumBytes = numRegs * dataSize / 8;
289                int numOutputElems = 128 / (8 << eSize);
290                int stepOffset = step * 32;
291
292                for (int i = 0; i < 2; ++i) {
293                    output[i].lo = 0;
294                    output[i].hi = 0;
295                }
296
297                int r = 0, k = 0, i, j;
298                XReg data;
299
300                for (int pos = stepOffset; pos < 32 + stepOffset;
301                        pos += eSizeBytes) {
302                    if (pos < totNumBytes) {
303                        if (numStructElems == 1) {
304                            i = (pos / eSizeBytes) % eCount;
305                            j = pos / (eCount * eSizeBytes);
306                        } else {
307                            i = pos / (numStructElems * eSizeBytes);
308                            j = (pos % (numStructElems * eSizeBytes)) /
309                                eSizeBytes;
310                        }
311                        data = (XReg) readVecElem(input[j], (XReg) i, eSize);
312                        writeVecElem(&output[r], data, k, eSize);
313                        k++;
314                        if (k == numOutputElems){
315                            k = 0;
316                            ++r;
317                        }
318                    }
319                }
320                '''
321            for v in range(2):
322                for p in range(4):
323                    eCode += '''
324                AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem(
325                    output[%(v)d], %(p)d, 0x2);
326                ''' % { 'v': v, 'p': p}
327
328            iop = InstObjParams(name, Name, 'MicroNeonMixOp64',
329                                { 'code' : eCode, 'op_class' : 'No_OpClass' },
330                                ['IsMicroop'])
331            header_output += MicroNeonMixDeclare64.subst(iop)
332            exec_output += MicroNeonMixExecute64.subst(iop)
333
334        elif name == 'unpack_neon_uop':
335
336            eCode = '''
337                //input data from scratch area
338                VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
339                //output data to arch. SIMD regs
340                VReg output[2] = { {0, 0}, {0, 0} };
341            '''
342
343            eCode += getInputCodeOp1L
344
345            # Fill output regs with register data initially.  Note that
346            # elements in output register outside indexed lanes are left
347            # untouched
348            for v in range(2):
349                for p in range(4):
350                    eCode += '''
351                writeVecElem(&output[%(v)d], (XReg) AA64FpDestP%(p)dV%(v)dL_uw,
352                             %(p)d, 0x2);
353                ''' % { 'v': v, 'p': p}
354            eCode += '''
355                int eCount = dataSize / (8 << eSize);
356                int eCount128 = 128 / (8 << eSize);
357                int eSizeBytes = 1 << eSize;
358                int totNumBytes = numStructElems * eSizeBytes;
359                int numInputElems = eCount128;
360                int stepOffset = step * 2 * eSizeBytes;
361                int stepLimit = 2 * eSizeBytes;
362
363                int r = 0, i, j;
364                XReg data;
365
366                for (int pos = stepOffset; pos < stepLimit + stepOffset;
367                        pos += eSizeBytes) {
368                    if (pos < totNumBytes) {
369                        r = pos / eSizeBytes;
370                        j = r / numInputElems;
371                        i = r % numInputElems;
372                        data = (XReg) readVecElem(input[j], (XReg) i, eSize);
373
374                        if (replicate) {
375                            for (int i = 0; i < eCount128; ++i) {
376                                if (i < eCount) {
377                                    writeVecElem(&output[r % 2], data, i,
378                                                 eSize);
379                                } else {  // zero extend if necessary
380                                    writeVecElem(&output[r % 2], (XReg) 0, i,
381                                                 eSize);
382                                }
383                            }
384                        } else {
385                            writeVecElem(&output[r % 2], data, lane, eSize);
386                        }
387                    }
388                }
389            '''
390            for v in range(2):
391                for p in range(4):
392                    eCode += '''
393                AA64FpDestP%(p)dV%(v)dL_uw = (uint32_t) readVecElem(
394                    output[%(v)d], %(p)d, 0x2);
395                ''' % { 'v' : v, 'p' : p }
396
397            iop = InstObjParams(name, Name, 'MicroNeonMixLaneOp64',
398                                { 'code' : eCode }, ['IsMicroop'])
399            header_output += MicroNeonMixLaneDeclare64.subst(iop)
400            exec_output += MicroNeonMixExecute64.subst(iop)
401
402        elif name == 'pack_neon_uop':
403
404            eCode = '''
405                // input data from arch. SIMD regs
406                VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
407                VReg output[2];  // output data to scratch area
408            '''
409
410            eCode += getInputCodeOp1S
411
412            eCode += '''
413                int eSizeBytes = 1 << eSize;
414                int numOutputElems = 128 / (8 << eSize);
415                int totNumBytes = numStructElems * eSizeBytes;
416                int stepOffset = step * 32;
417                int stepLimit = 32;
418
419                int r = 0, i, j;
420                XReg data;
421
422                for (int i = 0; i < 2; ++i) {
423                    output[i].lo = 0;
424                    output[i].hi = 0;
425                }
426
427                for (int pos = stepOffset; pos < stepLimit + stepOffset;
428                        pos += eSizeBytes) {
429                    if (pos < totNumBytes) {
430                        r = pos / 16;
431                        j = pos / eSizeBytes;
432                        i = (pos / eSizeBytes) %  numOutputElems;
433                        data = (XReg) readVecElem(input[j], lane, eSize);
434                        writeVecElem(&output[r % 2], data, i, eSize);
435                    }
436                }
437            '''
438
439            for v in range(2):
440                for p in range(4):
441                    eCode += '''
442                AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem(
443                    output[%(v)d], %(p)d, 0x2);
444                ''' % { 'v' : v, 'p' : p }
445
446            iop = InstObjParams(name, Name, 'MicroNeonMixLaneOp64',
447                                { 'code' : eCode }, ['IsMicroop'])
448            header_output += MicroNeonMixLaneDeclare64.subst(iop)
449            exec_output += MicroNeonMixExecute64.subst(iop)
450
451    # Generate instructions
452    mkMemAccMicroOp('mem_neon_uop')
453    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_1Reg', numRegs=1)
454    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_2Reg', numRegs=2)
455    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_3Reg', numRegs=3)
456    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_4Reg', numRegs=4)
457    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_1Reg', numRegs=1)
458    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_2Reg', numRegs=2)
459    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_3Reg', numRegs=3)
460    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_4Reg', numRegs=4)
461    mkMarshalMicroOp('unpack_neon_uop', 'MicroUnpackNeon64')
462    mkMarshalMicroOp('pack_neon_uop', 'MicroPackNeon64')
463
464}};
465
466let {{
467
468    iop = InstObjParams('vldmult64', 'VldMult64', 'VldMultOp64', '', [])
469    header_output += VMemMultDeclare64.subst(iop)
470    decoder_output += VMemMultConstructor64.subst(iop)
471
472    iop = InstObjParams('vstmult64', 'VstMult64', 'VstMultOp64', '', [])
473    header_output += VMemMultDeclare64.subst(iop)
474    decoder_output += VMemMultConstructor64.subst(iop)
475
476    iop = InstObjParams('vldsingle64', 'VldSingle64', 'VldSingleOp64', '', [])
477    header_output += VMemSingleDeclare64.subst(iop)
478    decoder_output += VMemSingleConstructor64.subst(iop)
479
480    iop = InstObjParams('vstsingle64', 'VstSingle64', 'VstSingleOp64', '', [])
481    header_output += VMemSingleDeclare64.subst(iop)
482    decoder_output += VMemSingleConstructor64.subst(iop)
483
484}};
485