GPU.py revision 11308
1#
2#  Copyright (c) 2015 Advanced Micro Devices, Inc.
3#  All rights reserved.
4#
5#  For use for simulation and test purposes only
6#
7#  Redistribution and use in source and binary forms, with or without
8#  modification, are permitted provided that the following conditions are met:
9#
10#  1. Redistributions of source code must retain the above copyright notice,
11#  this list of conditions and the following disclaimer.
12#
13#  2. Redistributions in binary form must reproduce the above copyright notice,
14#  this list of conditions and the following disclaimer in the documentation
15#  and/or other materials provided with the distribution.
16#
17#  3. Neither the name of the copyright holder nor the names of its contributors
18#  may be used to endorse or promote products derived from this software
19#  without specific prior written permission.
20#
21#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22#  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24#  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25#  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26#  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27#  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28#  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29#  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30#  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31#  POSSIBILITY OF SUCH DAMAGE.
32#
33#  Author: Steve Reinhardt
34#
35
36from ClockedObject import ClockedObject
37from Device import DmaDevice
38from m5.defines import buildEnv
39from m5.params import *
40from m5.proxy import *
41from m5.SimObject import SimObject
42from MemObject import MemObject
43from Process import EmulatedDriver
44from Bridge import Bridge
45from LdsState import LdsState
46
# Enum of the PF_* prefetch-type names. ComputeUnit.prefetch_prev_type is
# declared with this type. Member order is significant, so keep it stable.
class PrefetchType(Enum):
    vals = [
        "PF_CU",
        "PF_PHASE",
        "PF_WF",
        "PF_STRIDE",
        "PF_END",
    ]
54
# Python-side parameter declaration for the VectorRegisterFile SimObject.
# The matching C++ class lives in gpu-compute/vector_register_file.hh.
class VectorRegisterFile(SimObject):
    type = "VectorRegisterFile"
    cxx_class = "VectorRegisterFile"
    cxx_header = "gpu-compute/vector_register_file.hh"

    # SIMD unit this register file is associated with (defaults to 0).
    simd_id = Param.Int(0, "SIMD ID associated with this VRF")

    # Register file capacity and the minimum per-wavefront allocation.
    num_regs_per_simd = Param.Int(
        2048, "number of vector registers per SIMD")
    min_alloc = Param.Int(4, "min number of VGPRs allocated per WF")
63
# Python-side parameter declaration for the Wavefront SimObject.
# The matching C++ class lives in gpu-compute/wavefront.hh.
class Wavefront(SimObject):
    type = "Wavefront"
    cxx_class = "Wavefront"
    cxx_header = "gpu-compute/wavefront.hh"

    # Neither parameter has a default, so both must be set by the
    # configuration that instantiates the wavefront.
    simdId = Param.Int("SIMD id (0-ComputeUnit.num_SIMDs)")
    wf_slot_id = Param.Int("wavefront id (0-ComputeUnit.max_wfs)")
71
# Python-side parameter and port declaration for the ComputeUnit SimObject.
# The matching C++ class lives in gpu-compute/compute_unit.hh.
#
# Parameters without a default (wavefronts, cu_id, localDataStore,
# vector_register_file) must be supplied by the configuration script.
class ComputeUnit(MemObject):
    type = 'ComputeUnit'
    cxx_class = 'ComputeUnit'
    cxx_header = 'gpu-compute/compute_unit.hh'

    # --- Wavefront / SIMD organisation ---
    wavefronts = VectorParam.Wavefront('Number of wavefronts')
    wfSize = Param.Int(64, 'Wavefront size (in work items)')
    num_SIMDs = Param.Int(4, 'number of SIMD units per CU')

    # --- Vector ALU bypass latencies ---
    spbypass_pipe_length = Param.Int(4, 'vector ALU Single Precision bypass '
                                        'latency')

    dpbypass_pipe_length = Param.Int(8, 'vector ALU Double Precision bypass '
                                        'latency')

    # --- Issue and memory pipeline configuration ---
    issue_period = Param.Int(4, 'number of cycles per issue period')
    num_global_mem_pipes = Param.Int(1, 'number of global memory pipes per CU')
    num_shared_mem_pipes = Param.Int(1, 'number of shared memory pipes per CU')
    n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')
    mem_req_latency = Param.Int(9, "Latency for request from the cu to ruby. "
                                "Represents the pipeline to reach the TCP and "
                                "specified in GPU clock cycles")
    mem_resp_latency = Param.Int(9, "Latency for responses from ruby to the "
                                 "cu. Represents the pipeline between the TCP "
                                 "and cu as well as TCP data array access. "
                                 "Specified in GPU clock cycles")

    # Resolved through the parent proxy rather than set explicitly.
    system = Param.System(Parent.any, "system object")
    cu_id = Param.Int('CU id')

    # --- VRF <-> coalescer data bus widths ---
    vrf_to_coalescer_bus_width = Param.Int(32, "VRF->Coalescer data bus width "
                                           "in bytes")
    coalescer_to_vrf_bus_width = Param.Int(32, "Coalescer->VRF data bus width "
                                           "in bytes")

    # --- Ports to the memory system, TLB hierarchy and SQC ---
    memory_port = VectorMasterPort("Port to the memory system")
    translation_port = VectorMasterPort('Port to the TLB hierarchy')
    sqc_port = MasterPort("Port to the SQC (I-cache)")
    sqc_tlb_port = MasterPort("Port to the TLB for the SQC (I-cache)")

    # --- TLB and prefetching ---
    perLaneTLB = Param.Bool(False, "enable per-lane TLB")
    prefetch_depth = Param.Int(0, "Number of prefetches triggered at a time "
                               "(0 turns off prefetching)")
    prefetch_stride = Param.Int(1, "Fixed Prefetch Stride (1 means next-page)")
    prefetch_prev_type = Param.PrefetchType('PF_PHASE', "Prefetch the stride "
                                            "from last mem req in lane of "
                                            "CU|Phase|Wavefront")

    # --- Execution policy and debugging / modelling switches ---
    execPolicy = Param.String("OLDEST-FIRST", "WF execution selection policy")
    xactCasMode = Param.Bool(False, "Behavior of xact_cas_load magic instr.")
    debugSegFault = Param.Bool(False, "enable debugging GPU seg faults")
    functionalTLB = Param.Bool(False, "Assume TLB causes no delay")

    localMemBarrier = Param.Bool(False, "Assume Barriers do not wait on "
                                        "kernel end")

    countPages = Param.Bool(False, "Generate per-CU file of all pages touched "
                                   "and how many times")

    # --- Memory pipeline queue sizes ---
    global_mem_queue_size = Param.Int(256, "Number of entries in the global "
                                      "memory pipeline's queues")
    local_mem_queue_size = Param.Int(256, "Number of entries in the local "
                                     "memory pipeline's queues")

    # --- Local data store (LDS) attachment ---
    ldsBus = Bridge() # the bridge between the CU and its LDS
    ldsPort = MasterPort("The port that goes to the LDS")
    localDataStore = Param.LdsState("the LDS for this CU")

    vector_register_file = VectorParam.VectorRegisterFile("Vector register "
                                                          "file")
136
# Python-side parameter declaration for the Shader SimObject.
# The matching C++ class lives in gpu-compute/shader.hh.
class Shader(ClockedObject):
    type = "Shader"
    cxx_class = "Shader"
    cxx_header = "gpu-compute/shader.hh"

    # Compute units owned by this shader and the wavefront slots per SIMD.
    CUs = VectorParam.ComputeUnit("Number of compute units")
    n_wf = Param.Int(1, "Number of wavefront slots per SIMD")

    # NOTE: the two descriptions below are triple-quoted and span lines, so
    # the line break and continuation indentation are part of the string.
    # Leave their layout alone unless the description text is meant to change.
    impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into
                                                  ruby at kernel boundaries""")
    separate_acquire_release = Param.Bool(False,
        """Do ld_acquire/st_release generate separate requests for the
        acquire and release?""")

    # Memory size and access-mode switches.
    globalmem = Param.MemorySize("64kB", "Memory size")
    timing = Param.Bool(False, "timing memory accesses")

    # CPU handle (NULL by default) and the address translation switch.
    cpu_pointer = Param.BaseCPU(NULL, "pointer to base CPU")
    translation = Param.Bool(False, "address translation")
154
# Python-side parameter declaration for the ClDriver emulated driver.
# The C++ header is gpu-compute/cl_driver.hh.
class ClDriver(EmulatedDriver):
    type = "ClDriver"
    cxx_header = "gpu-compute/cl_driver.hh"

    # One or more code file names; no default, so it must be supplied.
    codefile = VectorParam.String("code file name(s)")
159
# Python-side parameter and port declaration for the GpuDispatcher DMA
# device. The C++ header is gpu-compute/dispatcher.hh.
class GpuDispatcher(DmaDevice):
    type = "GpuDispatcher"
    cxx_header = "gpu-compute/dispatcher.hh"

    # Programmed-IO window: placed at the 8GB line for now.
    pio_addr = Param.Addr(0x200000000, "Device Address")
    pio_latency = Param.Latency("1ns", "Programmed IO latency")

    # Objects the dispatcher is wired to; none of these have defaults.
    shader_pointer = Param.Shader("pointer to shader")
    translation_port = MasterPort("Port to the dispatcher TLB")
    cpu = Param.BaseCPU("CPU to wake up on kernel completion")

    cl_driver = Param.ClDriver("pointer to driver")
171
# Enum of the OT_* operation-type names. Member order is significant, so new
# entries must not be inserted in the middle without checking the users.
class OpType(Enum):
    vals = [
        "OT_NULL",
        "OT_ALU",
        "OT_SPECIAL",

        # OT_GLOBAL_*
        "OT_GLOBAL_READ",
        "OT_GLOBAL_WRITE",
        "OT_GLOBAL_ATOMIC",
        "OT_GLOBAL_HIST",
        "OT_GLOBAL_LDAS",

        # OT_SHARED_*
        "OT_SHARED_READ",
        "OT_SHARED_WRITE",
        "OT_SHARED_ATOMIC",
        "OT_SHARED_HIST",
        "OT_SHARED_LDAS",

        # OT_PRIVATE_*
        "OT_PRIVATE_READ",
        "OT_PRIVATE_WRITE",
        "OT_PRIVATE_ATOMIC",
        "OT_PRIVATE_HIST",
        "OT_PRIVATE_LDAS",

        # OT_SPILL_*
        "OT_SPILL_READ",
        "OT_SPILL_WRITE",
        "OT_SPILL_ATOMIC",
        "OT_SPILL_HIST",
        "OT_SPILL_LDAS",

        # OT_READONLY_*
        "OT_READONLY_READ",
        "OT_READONLY_WRITE",
        "OT_READONLY_ATOMIC",
        "OT_READONLY_HIST",
        "OT_READONLY_LDAS",

        # OT_FLAT_*
        "OT_FLAT_READ",
        "OT_FLAT_WRITE",
        "OT_FLAT_ATOMIC",
        "OT_FLAT_HIST",
        "OT_FLAT_LDAS",

        "OT_KERN_READ",
        "OT_BRANCH",

        # Note: only OT_BOTH_MEMFENCE seems to be supported in the 1.0F
        # version of the compiler.
        "OT_SHARED_MEMFENCE",
        "OT_GLOBAL_MEMFENCE",
        "OT_BOTH_MEMFENCE",

        "OT_BARRIER",
        "OT_PRINT",
        "OT_RET",
        "OT_NOP",
        "OT_ARG",
    ]
221
# Enum of the M_* memory data-type names. The names pair a U/S/F prefix with
# a bit width (presumably unsigned / signed / floating point -- confirm
# against the C++ users).
class MemType(Enum):
    vals = [
        "M_U8",
        "M_U16",
        "M_U32",
        "M_U64",
        "M_S8",
        "M_S16",
        "M_S32",
        "M_S64",
        "M_F16",
        "M_F32",
        "M_F64",
    ]
235
# Enum of the MO_* memory-operation names. Entries are grouped below by name
# prefix only; member order is significant and must be preserved.
class MemOpType(Enum):
    vals = [
        "MO_LD",
        "MO_ST",
        "MO_LDAS",
        "MO_LDA",

        # MO_A* family
        "MO_AAND",
        "MO_AOR",
        "MO_AXOR",
        "MO_ACAS",
        "MO_AEXCH",
        "MO_AADD",
        "MO_ASUB",
        "MO_AINC",
        "MO_ADEC",
        "MO_AMAX",
        "MO_AMIN",

        # MO_ANR* family
        "MO_ANRAND",
        "MO_ANROR",
        "MO_ANRXOR",
        "MO_ANRCAS",
        "MO_ANREXCH",
        "MO_ANRADD",
        "MO_ANRSUB",
        "MO_ANRINC",
        "MO_ANRDEC",
        "MO_ANRMAX",
        "MO_ANRMIN",

        # MO_H* family
        "MO_HAND",
        "MO_HOR",
        "MO_HXOR",
        "MO_HCAS",
        "MO_HEXCH",
        "MO_HADD",
        "MO_HSUB",
        "MO_HINC",
        "MO_HDEC",
        "MO_HMAX",
        "MO_HMIN",

        "MO_UNDEF",
    ]
276
# Enum of the SC_* storage-class names. Member order is significant.
class StorageClassType(Enum):
    vals = [
        "SC_SPILL",
        "SC_GLOBAL",
        "SC_SHARED",
        "SC_PRIVATE",
        "SC_READONLY",
        "SC_KERNARG",
        "SC_NONE",
    ]
286
# Enum of the RT_* register-type names. Member order is significant.
class RegisterType(Enum):
    vals = [
        "RT_VECTOR",
        "RT_SCALAR",
        "RT_CONDITION",
        "RT_HARDWARE",
        "RT_NONE",
    ]
294
# Enum of the MEMORY_ORDER_* names. Member order is significant.
class GenericMemoryOrder(Enum):
    vals = [
        "MEMORY_ORDER_NONE",
        "MEMORY_ORDER_RELAXED",
        "MEMORY_ORDER_SC_ACQUIRE",
        "MEMORY_ORDER_SC_RELEASE",
        "MEMORY_ORDER_SC_ACQUIRE_RELEASE",
    ]
302
# Enum of the MEMORY_SCOPE_* names. Member order is significant.
class GenericMemoryScope(Enum):
    vals = [
        "MEMORY_SCOPE_NONE",
        "MEMORY_SCOPE_WORKITEM",
        "MEMORY_SCOPE_WAVEFRONT",
        "MEMORY_SCOPE_WORKGROUP",
        "MEMORY_SCOPE_DEVICE",
        "MEMORY_SCOPE_SYSTEM",
    ]
311