1// -*- mode: c++ -*-
2
3// Copyright (c) 2012-2014 ARM Limited
4// All rights reserved
5//
6// The license below extends only to copyright in the software and shall
7// not be construed as granting a license to any other intellectual
8// property including but not limited to intellectual property relating
9// to a hardware implementation of the functionality of the software
10// licensed hereunder.  You may use the software subject to the license
11// terms below provided that you ensure that this notice is replicated
12// unmodified and in its entirety in all distributions of the software,
13// modified or unmodified, in source code or in binary form.
14//
15// Redistribution and use in source and binary forms, with or without
16// modification, are permitted provided that the following conditions are
17// met: redistributions of source code must retain the above copyright
18// notice, this list of conditions and the following disclaimer;
19// redistributions in binary form must reproduce the above copyright
20// notice, this list of conditions and the following disclaimer in the
21// documentation and/or other materials provided with the distribution;
22// neither the name of the copyright holders nor the names of its
23// contributors may be used to endorse or promote products derived from
24// this software without specific prior written permission.
25//
26// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
30// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
31// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
32// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
33// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
34// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
35// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
36// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37//
38// Authors: Mbou Eyole
39//          Giacomo Gabrielli
40
41let {{
42
43    header_output = ''
44    decoder_output = ''
45    exec_output = ''
46
47    zeroSveVecRegUpperPartCode = '''
48        ArmISA::ISA::zeroSveVecRegUpperPart(%s,
49            ArmStaticInst::getCurSveVecLen<uint64_t>(xc->tcBase()));
50    '''
51
52    def mkMemAccMicroOp(name):
53        global header_output, decoder_output, exec_output
54        SPAlignmentCheckCodeNeon = '''
55            if (baseIsSP && bits(XURa, 3, 0) &&
56                SPAlignmentCheckEnabled(xc->tcBase())) {
57                return std::make_shared<SPAlignmentFault>();
58            }
59        '''
60        eaCode = SPAlignmentCheckCodeNeon + '''
61            EA = XURa + imm;
62        '''
63        memDecl = '''
64            const int MaxNumBytes = 16;
65            union MemUnion {
66                uint8_t bytes[MaxNumBytes];
67                uint32_t floatRegBits[MaxNumBytes / 4];
68            };
69        '''
70
71        # Do endian conversion for all the elements
72        convCode = '''
73            VReg x = {0, 0};
74
75            x.lo = (((XReg) memUnion.floatRegBits[1]) << 32) |
76                (XReg) memUnion.floatRegBits[0];
77            x.hi = (((XReg) memUnion.floatRegBits[3]) << 32) |
78                (XReg) memUnion.floatRegBits[2];
79
80            const unsigned eCount = 16 / (1 << eSize);
81
82            if (isBigEndian64(xc->tcBase())) {
83                for (unsigned i = 0; i < eCount; i++) {
84                    switch (eSize) {
85                      case 0x3:  // 64-bit
86                        writeVecElem(&x, (XReg) gtobe(
87                            (uint64_t) readVecElem(x, i, eSize)), i, eSize);
88                        break;
89                      case 0x2:  // 32-bit
90                        writeVecElem(&x, (XReg) gtobe(
91                            (uint32_t) readVecElem(x, i, eSize)), i, eSize);
92                        break;
93                      case 0x1:  // 16-bit
94                        writeVecElem(&x, (XReg) gtobe(
95                            (uint16_t) readVecElem(x, i, eSize)), i, eSize);
96                        break;
97                      default:  // 8-bit
98                        break;  // Nothing to do here
99                    }
100                }
101            } else {
102                for (unsigned i = 0; i < eCount; i++) {
103                    switch (eSize) {
104                      case 0x3:  // 64-bit
105                        writeVecElem(&x, (XReg) gtole(
106                            (uint64_t) readVecElem(x, i, eSize)), i, eSize);
107                        break;
108                      case 0x2:  // 32-bit
109                        writeVecElem(&x, (XReg) gtole(
110                            (uint32_t) readVecElem(x, i, eSize)), i, eSize);
111                        break;
112                      case 0x1:  // 16-bit
113                        writeVecElem(&x, (XReg) gtole(
114                            (uint16_t) readVecElem(x, i, eSize)), i, eSize);
115                        break;
116                      default:  // 8-bit
117                        break;  // Nothing to do here
118                    }
119                }
120            }
121
122            memUnion.floatRegBits[0] = (uint32_t) x.lo;
123            memUnion.floatRegBits[1] = (uint32_t) (x.lo >> 32);
124            memUnion.floatRegBits[2] = (uint32_t) x.hi;
125            memUnion.floatRegBits[3] = (uint32_t) (x.hi >> 32);
126        '''
127
128        # Offload everything into registers
129        regSetCode = ''
130        for reg in range(4):
131            regSetCode += '''
132            AA64FpDestP%(reg)d_uw = gtoh(memUnion.floatRegBits[%(reg)d]);
133            ''' % { 'reg' : reg }
134
135        # Pull everything in from registers
136        regGetCode = ''
137        for reg in range(4):
138            regGetCode += '''
139            memUnion.floatRegBits[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
140            ''' % { 'reg' : reg }
141
142        loadMemAccCode = convCode + regSetCode
143        storeMemAccCode = regGetCode + convCode
144
145        loadIop = InstObjParams(name + 'ld',
146                'MicroNeonLoad64',
147                'MicroNeonMemOp',
148            {   'mem_decl' : memDecl,
149                'memacc_code' : loadMemAccCode,
150                'ea_code' : simd64EnabledCheckCode + eaCode,
151            },
152            [ 'IsMicroop', 'IsMemRef', 'IsLoad' ])
153        loadIop.snippets["memacc_code"] += zeroSveVecRegUpperPartCode % \
154            "AA64FpDest"
155        storeIop = InstObjParams(name + 'st',
156                'MicroNeonStore64',
157                'MicroNeonMemOp',
158            {   'mem_decl' : memDecl,
159                'memacc_code' : storeMemAccCode,
160                'ea_code' : simd64EnabledCheckCode + eaCode,
161            },
162            [ 'IsMicroop', 'IsMemRef', 'IsStore' ])
163
164        exec_output += NeonLoadExecute64.subst(loadIop) + \
165            NeonLoadInitiateAcc64.subst(loadIop) + \
166            NeonLoadCompleteAcc64.subst(loadIop) + \
167            NeonStoreExecute64.subst(storeIop) + \
168            NeonStoreInitiateAcc64.subst(storeIop) + \
169            NeonStoreCompleteAcc64.subst(storeIop)
170        header_output += MicroNeonMemDeclare64.subst(loadIop) + \
171            MicroNeonMemDeclare64.subst(storeIop)
172
173    def mkMarshalMicroOp(name, Name, numRegs=4):
174        global header_output, decoder_output, exec_output
175
176        getInputCodeOp1L = ''
177        for v in range(numRegs):
178            for p in range(4):
179                getInputCodeOp1L += '''
180            writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)d_uw,
181                         %(p)d, 0x2);
182            ''' % { 'v' : v, 'p' : p }
183
184        getInputCodeOp1S = ''
185        for v in range(numRegs):
186            for p in range(4):
187                getInputCodeOp1S += '''
188            writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)dS_uw,
189                         %(p)d, 0x2);
190            ''' % { 'v' : v, 'p' : p }
191
192        if name == 'deint_neon_uop':
193
194            eCode = '''
195                // input data from scratch area
196                VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
197                VReg output[2];  // output data to arch. SIMD regs
198                VReg temp;
199                temp.lo = 0;
200                temp.hi = 0;
201            '''
202            for p in range(4):
203                eCode += '''
204                writeVecElem(&temp, (XReg) AA64FpDestP%(p)dV1L_uw, %(p)d, 0x2);
205                ''' % { 'p' : p }
206            eCode += getInputCodeOp1L
207
208            # Note that numRegs is not always the same as numStructElems; in
209            # particular, for LD1/ST1, numStructElems is 1 but numRegs can be
210            # 1, 2, 3 or 4
211
212            eCode += '''
213                output[0].lo = 0;
214                output[0].hi = 0;
215                output[1].lo = 0;
216                output[1].hi = 0;
217
218                int eCount = dataSize / (8 << eSize);
219                int eSizeBytes = 1 << eSize;  // element size in bytes
220                int numBytes = step * dataSize / 4;
221                int totNumBytes = numRegs * dataSize / 8;
222
223                int structElemNo, pos, a, b;
224                XReg data;
225
226                for (int r = 0; r < 2; ++r) {
227                    for (int i = 0; i < eCount; ++i) {
228                        if (numBytes < totNumBytes) {
229                            structElemNo = r + (step * 2);
230                            if (numStructElems == 1) {
231                                pos = (eSizeBytes * i) +
232                                    (eCount * structElemNo * eSizeBytes);
233                            } else {
234                                pos = (numStructElems * eSizeBytes * i) +
235                                    (structElemNo * eSizeBytes);
236                            }
237                            a = pos / 16;
238                            b = (pos % 16) / eSizeBytes;
239                            data = (XReg) readVecElem(input[a], (XReg) b,
240                                                      eSize);
241                            writeVecElem(&output[r], data, i, eSize);
242                            numBytes += eSizeBytes;
243                        }
244                    }
245                }
246            '''
247            for p in range(4):
248                eCode += '''
249                AA64FpDestP%(p)dV0L_uw = (uint32_t) readVecElem(output[0],
250                    %(p)d, 0x2);
251                ''' % { 'p' : p }
252            eCode += '''
253                if ((numRegs % 2 == 0) || (numRegs == 3 && step == 0)) {
254            '''
255            for p in range(4):
256                eCode += '''
257                    AA64FpDestP%(p)dV1L_uw = (uint32_t) readVecElem(
258                        output[1], %(p)d, 0x2);
259                ''' % { 'p' : p }
260            eCode += '''
261                } else {
262            '''
263            for p in range(4):
264                eCode += '''
265                    AA64FpDestP%(p)dV1L_uw = (uint32_t) readVecElem(temp,
266                        %(p)d, 0x2);
267                ''' % { 'p' : p }
268            eCode += '''
269                }
270            '''
271
272            iop = InstObjParams(name, Name, 'MicroNeonMixOp64',
273                                { 'code' : eCode, 'op_class' : 'No_OpClass' },
274                                ['IsMicroop'])
275            header_output += MicroNeonMixDeclare64.subst(iop)
276            exec_output += MicroNeonMixExecute64.subst(iop)
277
278        elif name == 'int_neon_uop':
279
280            eCode = '''
281                // input data from arch. SIMD regs
282                VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
283                VReg output[2];  // output data to scratch area
284            '''
285
286            eCode += getInputCodeOp1S
287
288            # Note that numRegs is not always the same as numStructElems; in
289            # particular, for LD1/ST1, numStructElems is 1 but numRegs can be
290            # 1, 2, 3 or 4
291
292            eCode += '''
293                int eCount = dataSize / (8 << eSize);
294                int eSizeBytes = 1 << eSize;
295                int totNumBytes = numRegs * dataSize / 8;
296                int numOutputElems = 128 / (8 << eSize);
297                int stepOffset = step * 32;
298
299                for (int i = 0; i < 2; ++i) {
300                    output[i].lo = 0;
301                    output[i].hi = 0;
302                }
303
304                int r = 0, k = 0, i, j;
305                XReg data;
306
307                for (int pos = stepOffset; pos < 32 + stepOffset;
308                        pos += eSizeBytes) {
309                    if (pos < totNumBytes) {
310                        if (numStructElems == 1) {
311                            i = (pos / eSizeBytes) % eCount;
312                            j = pos / (eCount * eSizeBytes);
313                        } else {
314                            i = pos / (numStructElems * eSizeBytes);
315                            j = (pos % (numStructElems * eSizeBytes)) /
316                                eSizeBytes;
317                        }
318                        data = (XReg) readVecElem(input[j], (XReg) i, eSize);
319                        writeVecElem(&output[r], data, k, eSize);
320                        k++;
321                        if (k == numOutputElems){
322                            k = 0;
323                            ++r;
324                        }
325                    }
326                }
327                '''
328            for v in range(2):
329                for p in range(4):
330                    eCode += '''
331                AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem(
332                    output[%(v)d], %(p)d, 0x2);
333                ''' % { 'v': v, 'p': p}
334
335            iop = InstObjParams(name, Name, 'MicroNeonMixOp64',
336                                { 'code' : eCode, 'op_class' : 'No_OpClass' },
337                                ['IsMicroop'])
338            header_output += MicroNeonMixDeclare64.subst(iop)
339            exec_output += MicroNeonMixExecute64.subst(iop)
340
341        elif name == 'unpack_neon_uop':
342
343            eCode = '''
344                //input data from scratch area
345                VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
346                //output data to arch. SIMD regs
347                VReg output[2] = { {0, 0}, {0, 0} };
348            '''
349
350            eCode += getInputCodeOp1L
351
352            # Fill output regs with register data initially.  Note that
353            # elements in output register outside indexed lanes are left
354            # untouched
355            for v in range(2):
356                for p in range(4):
357                    eCode += '''
358                writeVecElem(&output[%(v)d], (XReg) AA64FpDestP%(p)dV%(v)dL_uw,
359                             %(p)d, 0x2);
360                ''' % { 'v': v, 'p': p}
361            eCode += '''
362                int eCount = dataSize / (8 << eSize);
363                int eCount128 = 128 / (8 << eSize);
364                int eSizeBytes = 1 << eSize;
365                int totNumBytes = numStructElems * eSizeBytes;
366                int numInputElems = eCount128;
367                int stepOffset = step * 2 * eSizeBytes;
368                int stepLimit = 2 * eSizeBytes;
369
370                int r = 0, i, j;
371                XReg data;
372
373                for (int pos = stepOffset; pos < stepLimit + stepOffset;
374                        pos += eSizeBytes) {
375                    if (pos < totNumBytes) {
376                        r = pos / eSizeBytes;
377                        j = r / numInputElems;
378                        i = r % numInputElems;
379                        data = (XReg) readVecElem(input[j], (XReg) i, eSize);
380
381                        if (replicate) {
382                            for (int i = 0; i < eCount128; ++i) {
383                                if (i < eCount) {
384                                    writeVecElem(&output[r % 2], data, i,
385                                                 eSize);
386                                } else {  // zero extend if necessary
387                                    writeVecElem(&output[r % 2], (XReg) 0, i,
388                                                 eSize);
389                                }
390                            }
391                        } else {
392                            writeVecElem(&output[r % 2], data, lane, eSize);
393                        }
394                    }
395                }
396            '''
397            for v in range(2):
398                for p in range(4):
399                    eCode += '''
400                AA64FpDestP%(p)dV%(v)dL_uw = (uint32_t) readVecElem(
401                    output[%(v)d], %(p)d, 0x2);
402                ''' % { 'v' : v, 'p' : p }
403
404            iop = InstObjParams(name, Name, 'MicroNeonMixLaneOp64',
405                                { 'code' : eCode }, ['IsMicroop'])
406            header_output += MicroNeonMixLaneDeclare64.subst(iop)
407            exec_output += MicroNeonMixExecute64.subst(iop)
408
409        elif name == 'pack_neon_uop':
410
411            eCode = '''
412                // input data from arch. SIMD regs
413                VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
414                VReg output[2];  // output data to scratch area
415            '''
416
417            eCode += getInputCodeOp1S
418
419            eCode += '''
420                int eSizeBytes = 1 << eSize;
421                int numOutputElems = 128 / (8 << eSize);
422                int totNumBytes = numStructElems * eSizeBytes;
423                int stepOffset = step * 32;
424                int stepLimit = 32;
425
426                int r = 0, i, j;
427                XReg data;
428
429                for (int i = 0; i < 2; ++i) {
430                    output[i].lo = 0;
431                    output[i].hi = 0;
432                }
433
434                for (int pos = stepOffset; pos < stepLimit + stepOffset;
435                        pos += eSizeBytes) {
436                    if (pos < totNumBytes) {
437                        r = pos / 16;
438                        j = pos / eSizeBytes;
439                        i = (pos / eSizeBytes) %  numOutputElems;
440                        data = (XReg) readVecElem(input[j], lane, eSize);
441                        writeVecElem(&output[r % 2], data, i, eSize);
442                    }
443                }
444            '''
445
446            for v in range(2):
447                for p in range(4):
448                    eCode += '''
449                AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem(
450                    output[%(v)d], %(p)d, 0x2);
451                ''' % { 'v' : v, 'p' : p }
452
453            iop = InstObjParams(name, Name, 'MicroNeonMixLaneOp64',
454                                { 'code' : eCode }, ['IsMicroop'])
455            header_output += MicroNeonMixLaneDeclare64.subst(iop)
456            exec_output += MicroNeonMixExecute64.subst(iop)
457
458    # Generate instructions
459    mkMemAccMicroOp('mem_neon_uop')
460    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_1Reg', numRegs=1)
461    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_2Reg', numRegs=2)
462    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_3Reg', numRegs=3)
463    mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_4Reg', numRegs=4)
464    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_1Reg', numRegs=1)
465    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_2Reg', numRegs=2)
466    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_3Reg', numRegs=3)
467    mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_4Reg', numRegs=4)
468    mkMarshalMicroOp('unpack_neon_uop', 'MicroUnpackNeon64')
469    mkMarshalMicroOp('pack_neon_uop', 'MicroPackNeon64')
470
471}};
472
473let {{
474
475    iop = InstObjParams('vldmult64', 'VldMult64', 'VldMultOp64', '', [])
476    header_output += VMemMultDeclare64.subst(iop)
477    decoder_output += VMemMultConstructor64.subst(iop)
478
479    iop = InstObjParams('vstmult64', 'VstMult64', 'VstMultOp64', '', [])
480    header_output += VMemMultDeclare64.subst(iop)
481    decoder_output += VMemMultConstructor64.subst(iop)
482
483    iop = InstObjParams('vldsingle64', 'VldSingle64', 'VldSingleOp64', '', [])
484    header_output += VMemSingleDeclare64.subst(iop)
485    decoder_output += VMemSingleConstructor64.subst(iop)
486
487    iop = InstObjParams('vstsingle64', 'VstSingle64', 'VstSingleOp64', '', [])
488    header_output += VMemSingleDeclare64.subst(iop)
489    decoder_output += VMemSingleConstructor64.subst(iop)
490
491}};
492