Cross Reference: /gem5/src/gpu-compute/GPU.py

Deleted Added

sdiff udiff text old ( 13665:9c7fe3811b88 ) new ( 13892:0182a0601f66 )

full compact

GPU.py (13665:9c7fe3811b88)	GPU.py (13892:0182a0601f66)
1# 2# Copyright (c) 2015 Advanced Micro Devices, Inc. 3# All rights reserved. 4# 5# For use for simulation and test purposes only 6# 7# Redistribution and use in source and binary forms, with or without 8# modification, are permitted provided that the following conditions are met: 9# 10# 1. Redistributions of source code must retain the above copyright notice, 11# this list of conditions and the following disclaimer. 12# 13# 2. Redistributions in binary form must reproduce the above copyright notice, 14# this list of conditions and the following disclaimer in the documentation 15# and/or other materials provided with the distribution. 16# 17# 3. Neither the name of the copyright holder nor the names of its contributors 18# may be used to endorse or promote products derived from this software 19# without specific prior written permission. 20# 21# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31# POSSIBILITY OF SUCH DAMAGE. 32# 33# Author: Steve Reinhardt 34# 35 36from m5.defines import buildEnv 37from m5.params import * 38from m5.proxy import * 39from m5.SimObject import SimObject 40 41from m5.objects.ClockedObject import ClockedObject 42from m5.objects.Device import DmaDevice	1# 2# Copyright (c) 2015 Advanced Micro Devices, Inc. 3# All rights reserved. 4# 5# For use for simulation and test purposes only 6# 7# Redistribution and use in source and binary forms, with or without 8# modification, are permitted provided that the following conditions are met: 9# 10# 1. Redistributions of source code must retain the above copyright notice, 11# this list of conditions and the following disclaimer. 12# 13# 2. Redistributions in binary form must reproduce the above copyright notice, 14# this list of conditions and the following disclaimer in the documentation 15# and/or other materials provided with the distribution. 16# 17# 3. Neither the name of the copyright holder nor the names of its contributors 18# may be used to endorse or promote products derived from this software 19# without specific prior written permission. 20# 21# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31# POSSIBILITY OF SUCH DAMAGE. 32# 33# Author: Steve Reinhardt 34# 35 36from m5.defines import buildEnv 37from m5.params import * 38from m5.proxy import * 39from m5.SimObject import SimObject 40 41from m5.objects.ClockedObject import ClockedObject 42from m5.objects.Device import DmaDevice
43from m5.objects.MemObject import MemObject
44from m5.objects.Process import EmulatedDriver 45from m5.objects.Bridge import Bridge 46from m5.objects.LdsState import LdsState 47 48class PrefetchType(Enum): vals = [ 49 'PF_CU', 50 'PF_PHASE', 51 'PF_WF', 52 'PF_STRIDE', 53 'PF_END', 54 ] 55 56class VectorRegisterFile(SimObject): 57 type = 'VectorRegisterFile' 58 cxx_class = 'VectorRegisterFile' 59 cxx_header = 'gpu-compute/vector_register_file.hh' 60 61 simd_id = Param.Int(0, 'SIMD ID associated with this VRF') 62 num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD') 63 wfSize = Param.Int(64, 'Wavefront size (in work items)') 64 min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF') 65 66class Wavefront(SimObject): 67 type = 'Wavefront' 68 cxx_class = 'Wavefront' 69 cxx_header = 'gpu-compute/wavefront.hh' 70 71 simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)') 72 wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)') 73 wfSize = Param.Int(64, 'Wavefront size (in work items)') 74	43from m5.objects.Process import EmulatedDriver 44from m5.objects.Bridge import Bridge 45from m5.objects.LdsState import LdsState 46 47class PrefetchType(Enum): vals = [ 48 'PF_CU', 49 'PF_PHASE', 50 'PF_WF', 51 'PF_STRIDE', 52 'PF_END', 53 ] 54 55class VectorRegisterFile(SimObject): 56 type = 'VectorRegisterFile' 57 cxx_class = 'VectorRegisterFile' 58 cxx_header = 'gpu-compute/vector_register_file.hh' 59 60 simd_id = Param.Int(0, 'SIMD ID associated with this VRF') 61 num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD') 62 wfSize = Param.Int(64, 'Wavefront size (in work items)') 63 min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF') 64 65class Wavefront(SimObject): 66 type = 'Wavefront' 67 cxx_class = 'Wavefront' 68 cxx_header = 'gpu-compute/wavefront.hh' 69 70 simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)') 71 wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)') 72 wfSize = Param.Int(64, 'Wavefront size (in work items)') 73
75class ComputeUnit(MemObject):	74class ComputeUnit(ClockedObject):
76 type = 'ComputeUnit' 77 cxx_class = 'ComputeUnit' 78 cxx_header = 'gpu-compute/compute_unit.hh' 79 80 wavefronts = VectorParam.Wavefront('Number of wavefronts') 81 wfSize = Param.Int(64, 'Wavefront size (in work items)') 82 num_SIMDs = Param.Int(4, 'number of SIMD units per CU') 83 84 spbypass_pipe_length = Param.Int(4, 'vector ALU Single Precision bypass '\ 85 'latency') 86 87 dpbypass_pipe_length = Param.Int(8, 'vector ALU Double Precision bypass '\ 88 'latency') 89 90 issue_period = Param.Int(4, 'number of cycles per issue period') 91 num_global_mem_pipes = Param.Int(1,'number of global memory pipes per CU') 92 num_shared_mem_pipes = Param.Int(1,'number of shared memory pipes per CU') 93 n_wf = Param.Int(1, 'Number of wavefront slots per SIMD') 94 mem_req_latency = Param.Int(9, "Latency for request from the cu to ruby. "\ 95 "Represents the pipeline to reach the TCP and "\ 96 "specified in GPU clock cycles") 97 mem_resp_latency = Param.Int(9, "Latency for responses from ruby to the "\ 98 "cu. Represents the pipeline between the TCP "\ 99 "and cu as well as TCP data array access. "\ 100 "Specified in GPU clock cycles") 101 system = Param.System(Parent.any, "system object") 102 cu_id = Param.Int('CU id') 103 vrf_to_coalescer_bus_width = Param.Int(32, "VRF->Coalescer data bus width "\ 104 "in bytes") 105 coalescer_to_vrf_bus_width = Param.Int(32, "Coalescer->VRF data bus width "\ 106 "in bytes") 107 108 memory_port = VectorMasterPort("Port to the memory system") 109 translation_port = VectorMasterPort('Port to the TLB hierarchy') 110 sqc_port = MasterPort("Port to the SQC (I-cache") 111 sqc_tlb_port = MasterPort("Port to the TLB for the SQC (I-cache)") 112 perLaneTLB = Param.Bool(False, "enable per-lane TLB") 113 prefetch_depth = Param.Int(0, "Number of prefetches triggered at a time"\ 114 "(0 turns off prefetching)") 115 prefetch_stride = Param.Int(1, "Fixed Prefetch Stride (1 means next-page)") 116 prefetch_prev_type = Param.PrefetchType('PF_PHASE', "Prefetch the stride "\ 117 "from last mem req in lane of "\ 118 "CU\|Phase\|Wavefront") 119 execPolicy = Param.String("OLDEST-FIRST", "WF execution selection policy"); 120 xactCasMode = Param.Bool(False, "Behavior of xact_cas_load magic instr."); 121 debugSegFault = Param.Bool(False, "enable debugging GPU seg faults") 122 functionalTLB = Param.Bool(False, "Assume TLB causes no delay") 123 124 localMemBarrier = Param.Bool(False, "Assume Barriers do not wait on "\ 125 "kernel end") 126 127 countPages = Param.Bool(False, "Generate per-CU file of all pages touched "\ 128 "and how many times") 129 global_mem_queue_size = Param.Int(256, "Number of entries in the global " 130 "memory pipeline's queues") 131 local_mem_queue_size = Param.Int(256, "Number of entries in the local " 132 "memory pipeline's queues") 133 ldsBus = Bridge() # the bridge between the CU and its LDS 134 ldsPort = MasterPort("The port that goes to the LDS") 135 localDataStore = Param.LdsState("the LDS for this CU") 136 137 vector_register_file = VectorParam.VectorRegisterFile("Vector register "\ 138 "file") 139 out_of_order_data_delivery = Param.Bool(False, "enable OoO data delivery" 140 " in the GM pipeline") 141 142class Shader(ClockedObject): 143 type = 'Shader' 144 cxx_class = 'Shader' 145 cxx_header = 'gpu-compute/shader.hh' 146 147 CUs = VectorParam.ComputeUnit('Number of compute units') 148 n_wf = Param.Int(1, 'Number of wavefront slots per SIMD') 149 impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into 150 ruby at kernel boundaries""") 151 separate_acquire_release = Param.Bool(False, 152 """Do ld_acquire/st_release generate separate requests for the 153 acquire and release?""") 154 globalmem = Param.MemorySize('64kB', 'Memory size') 155 timing = Param.Bool(False, 'timing memory accesses') 156 157 cpu_pointer = Param.BaseCPU(NULL, "pointer to base CPU") 158 translation = Param.Bool(False, "address translation"); 159 160class ClDriver(EmulatedDriver): 161 type = 'ClDriver' 162 cxx_header = 'gpu-compute/cl_driver.hh' 163 codefile = VectorParam.String('code file name(s)') 164 165class GpuDispatcher(DmaDevice): 166 type = 'GpuDispatcher' 167 cxx_header = 'gpu-compute/dispatcher.hh' 168 # put at 8GB line for now 169 pio_addr = Param.Addr(0x200000000, "Device Address") 170 pio_latency = Param.Latency('1ns', "Programmed IO latency") 171 shader_pointer = Param.Shader('pointer to shader') 172 translation_port = MasterPort('Port to the dispatcher TLB') 173 cpu = Param.BaseCPU("CPU to wake up on kernel completion") 174 175 cl_driver = Param.ClDriver('pointer to driver') 176 177class MemType(Enum): vals = [ 178 'M_U8', 179 'M_U16', 180 'M_U32', 181 'M_U64', 182 'M_S8', 183 'M_S16', 184 'M_S32', 185 'M_S64', 186 'M_F16', 187 'M_F32', 188 'M_F64', 189 ] 190 191class StorageClassType(Enum): vals = [ 192 'SC_SPILL', 193 'SC_GLOBAL', 194 'SC_SHARED', 195 'SC_PRIVATE', 196 'SC_READONLY', 197 'SC_KERNARG', 198 'SC_NONE', 199 ] 200 201class RegisterType(Enum): vals = [ 202 'RT_VECTOR', 203 'RT_SCALAR', 204 'RT_CONDITION', 205 'RT_HARDWARE', 206 'RT_NONE', 207 ]	75 type = 'ComputeUnit' 76 cxx_class = 'ComputeUnit' 77 cxx_header = 'gpu-compute/compute_unit.hh' 78 79 wavefronts = VectorParam.Wavefront('Number of wavefronts') 80 wfSize = Param.Int(64, 'Wavefront size (in work items)') 81 num_SIMDs = Param.Int(4, 'number of SIMD units per CU') 82 83 spbypass_pipe_length = Param.Int(4, 'vector ALU Single Precision bypass '\ 84 'latency') 85 86 dpbypass_pipe_length = Param.Int(8, 'vector ALU Double Precision bypass '\ 87 'latency') 88 89 issue_period = Param.Int(4, 'number of cycles per issue period') 90 num_global_mem_pipes = Param.Int(1,'number of global memory pipes per CU') 91 num_shared_mem_pipes = Param.Int(1,'number of shared memory pipes per CU') 92 n_wf = Param.Int(1, 'Number of wavefront slots per SIMD') 93 mem_req_latency = Param.Int(9, "Latency for request from the cu to ruby. "\ 94 "Represents the pipeline to reach the TCP and "\ 95 "specified in GPU clock cycles") 96 mem_resp_latency = Param.Int(9, "Latency for responses from ruby to the "\ 97 "cu. Represents the pipeline between the TCP "\ 98 "and cu as well as TCP data array access. "\ 99 "Specified in GPU clock cycles") 100 system = Param.System(Parent.any, "system object") 101 cu_id = Param.Int('CU id') 102 vrf_to_coalescer_bus_width = Param.Int(32, "VRF->Coalescer data bus width "\ 103 "in bytes") 104 coalescer_to_vrf_bus_width = Param.Int(32, "Coalescer->VRF data bus width "\ 105 "in bytes") 106 107 memory_port = VectorMasterPort("Port to the memory system") 108 translation_port = VectorMasterPort('Port to the TLB hierarchy') 109 sqc_port = MasterPort("Port to the SQC (I-cache") 110 sqc_tlb_port = MasterPort("Port to the TLB for the SQC (I-cache)") 111 perLaneTLB = Param.Bool(False, "enable per-lane TLB") 112 prefetch_depth = Param.Int(0, "Number of prefetches triggered at a time"\ 113 "(0 turns off prefetching)") 114 prefetch_stride = Param.Int(1, "Fixed Prefetch Stride (1 means next-page)") 115 prefetch_prev_type = Param.PrefetchType('PF_PHASE', "Prefetch the stride "\ 116 "from last mem req in lane of "\ 117 "CU\|Phase\|Wavefront") 118 execPolicy = Param.String("OLDEST-FIRST", "WF execution selection policy"); 119 xactCasMode = Param.Bool(False, "Behavior of xact_cas_load magic instr."); 120 debugSegFault = Param.Bool(False, "enable debugging GPU seg faults") 121 functionalTLB = Param.Bool(False, "Assume TLB causes no delay") 122 123 localMemBarrier = Param.Bool(False, "Assume Barriers do not wait on "\ 124 "kernel end") 125 126 countPages = Param.Bool(False, "Generate per-CU file of all pages touched "\ 127 "and how many times") 128 global_mem_queue_size = Param.Int(256, "Number of entries in the global " 129 "memory pipeline's queues") 130 local_mem_queue_size = Param.Int(256, "Number of entries in the local " 131 "memory pipeline's queues") 132 ldsBus = Bridge() # the bridge between the CU and its LDS 133 ldsPort = MasterPort("The port that goes to the LDS") 134 localDataStore = Param.LdsState("the LDS for this CU") 135 136 vector_register_file = VectorParam.VectorRegisterFile("Vector register "\ 137 "file") 138 out_of_order_data_delivery = Param.Bool(False, "enable OoO data delivery" 139 " in the GM pipeline") 140 141class Shader(ClockedObject): 142 type = 'Shader' 143 cxx_class = 'Shader' 144 cxx_header = 'gpu-compute/shader.hh' 145 146 CUs = VectorParam.ComputeUnit('Number of compute units') 147 n_wf = Param.Int(1, 'Number of wavefront slots per SIMD') 148 impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into 149 ruby at kernel boundaries""") 150 separate_acquire_release = Param.Bool(False, 151 """Do ld_acquire/st_release generate separate requests for the 152 acquire and release?""") 153 globalmem = Param.MemorySize('64kB', 'Memory size') 154 timing = Param.Bool(False, 'timing memory accesses') 155 156 cpu_pointer = Param.BaseCPU(NULL, "pointer to base CPU") 157 translation = Param.Bool(False, "address translation"); 158 159class ClDriver(EmulatedDriver): 160 type = 'ClDriver' 161 cxx_header = 'gpu-compute/cl_driver.hh' 162 codefile = VectorParam.String('code file name(s)') 163 164class GpuDispatcher(DmaDevice): 165 type = 'GpuDispatcher' 166 cxx_header = 'gpu-compute/dispatcher.hh' 167 # put at 8GB line for now 168 pio_addr = Param.Addr(0x200000000, "Device Address") 169 pio_latency = Param.Latency('1ns', "Programmed IO latency") 170 shader_pointer = Param.Shader('pointer to shader') 171 translation_port = MasterPort('Port to the dispatcher TLB') 172 cpu = Param.BaseCPU("CPU to wake up on kernel completion") 173 174 cl_driver = Param.ClDriver('pointer to driver') 175 176class MemType(Enum): vals = [ 177 'M_U8', 178 'M_U16', 179 'M_U32', 180 'M_U64', 181 'M_S8', 182 'M_S16', 183 'M_S32', 184 'M_S64', 185 'M_F16', 186 'M_F32', 187 'M_F64', 188 ] 189 190class StorageClassType(Enum): vals = [ 191 'SC_SPILL', 192 'SC_GLOBAL', 193 'SC_SHARED', 194 'SC_PRIVATE', 195 'SC_READONLY', 196 'SC_KERNARG', 197 'SC_NONE', 198 ] 199 200class RegisterType(Enum): vals = [ 201 'RT_VECTOR', 202 'RT_SCALAR', 203 'RT_CONDITION', 204 'RT_HARDWARE', 205 'RT_NONE', 206 ]