# GPU.py revision 11308
#
# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Steve Reinhardt
#

from ClockedObject import ClockedObject
from Device import DmaDevice
from m5.defines import buildEnv
from m5.params import *
from m5.proxy import *
from m5.SimObject import SimObject
from MemObject import MemObject
from Process import EmulatedDriver
from Bridge import Bridge
from LdsState import LdsState

# Values accepted by ComputeUnit.prefetch_prev_type.
class PrefetchType(Enum): vals = [
    'PF_CU',
    'PF_PHASE',
    'PF_WF',
    'PF_STRIDE',
    'PF_END',
    ]

# Python-side declaration of the C++ VectorRegisterFile class
# (gpu-compute/vector_register_file.hh).
class VectorRegisterFile(SimObject):
    type = 'VectorRegisterFile'
    cxx_class = 'VectorRegisterFile'
    cxx_header = 'gpu-compute/vector_register_file.hh'

    simd_id = Param.Int(0, 'SIMD ID associated with this VRF')
    num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD')
    min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF')

# Python-side declaration of the C++ Wavefront class
# (gpu-compute/wavefront.hh). Neither parameter has a default, so both
# must be set by the configuration script.
class Wavefront(SimObject):
    type = 'Wavefront'
    cxx_class = 'Wavefront'
    cxx_header = 'gpu-compute/wavefront.hh'

    simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)')
    wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)')

# Python-side declaration of the C++ ComputeUnit class
# (gpu-compute/compute_unit.hh): its parameters, ports and child objects.
class ComputeUnit(MemObject):
    type = 'ComputeUnit'
    cxx_class = 'ComputeUnit'
    cxx_header = 'gpu-compute/compute_unit.hh'

    wavefronts = VectorParam.Wavefront('Number of wavefronts')
    wfSize = Param.Int(64, 'Wavefront size (in work items)')
    num_SIMDs = Param.Int(4, 'number of SIMD units per CU')

    spbypass_pipe_length = Param.Int(4, 'vector ALU Single Precision bypass '
                                        'latency')

    dpbypass_pipe_length = Param.Int(8, 'vector ALU Double Precision bypass '
                                        'latency')

    issue_period = Param.Int(4, 'number of cycles per issue period')
    num_global_mem_pipes = Param.Int(1,'number of global memory pipes per CU')
    num_shared_mem_pipes = Param.Int(1,'number of shared memory pipes per CU')
    n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')
    mem_req_latency = Param.Int(9, "Latency for request from the cu to ruby. "
                                "Represents the pipeline to reach the TCP and "
                                "specified in GPU clock cycles")
    mem_resp_latency = Param.Int(9, "Latency for responses from ruby to the "
                                 "cu. Represents the pipeline between the TCP "
                                 "and cu as well as TCP data array access. "
                                 "Specified in GPU clock cycles")
    system = Param.System(Parent.any, "system object")
    cu_id = Param.Int('CU id')
    vrf_to_coalescer_bus_width = Param.Int(
        32, "VRF->Coalescer data bus width in bytes")
    coalescer_to_vrf_bus_width = Param.Int(
        32, "Coalescer->VRF data bus width in bytes")

    # Ports
    memory_port = VectorMasterPort("Port to the memory system")
    translation_port = VectorMasterPort('Port to the TLB hierarchy')
    # Description previously lacked the closing parenthesis.
    sqc_port = MasterPort("Port to the SQC (I-cache)")
    sqc_tlb_port = MasterPort("Port to the TLB for the SQC (I-cache)")
    perLaneTLB = Param.Bool(False, "enable per-lane TLB")
    # The two halves of this description used to be joined without a
    # separating space ("...at a time(0 turns off...").
    prefetch_depth = Param.Int(0, "Number of prefetches triggered at a time "
                               "(0 turns off prefetching)")
    prefetch_stride = Param.Int(1, "Fixed Prefetch Stride (1 means next-page)")
    prefetch_prev_type = Param.PrefetchType('PF_PHASE', "Prefetch the stride "
                                            "from last mem req in lane of "
                                            "CU|Phase|Wavefront")
    execPolicy = Param.String("OLDEST-FIRST", "WF execution selection policy")
    xactCasMode = Param.Bool(False, "Behavior of xact_cas_load magic instr.")
    debugSegFault = Param.Bool(False, "enable debugging GPU seg faults")
    functionalTLB = Param.Bool(False, "Assume TLB causes no delay")

    localMemBarrier = Param.Bool(False, "Assume Barriers do not wait on "
                                 "kernel end")

    countPages = Param.Bool(False, "Generate per-CU file of all pages touched "
                            "and how many times")
    global_mem_queue_size = Param.Int(256, "Number of entries in the global "
                                      "memory pipeline's queues")
    local_mem_queue_size = Param.Int(256, "Number of entries in the local "
                                     "memory pipeline's queues")
    ldsBus = Bridge() # the bridge between the CU and its LDS
    ldsPort = MasterPort("The port that goes to the LDS")
    localDataStore = Param.LdsState("the LDS for this CU")

    vector_register_file = VectorParam.VectorRegisterFile("Vector register "
                                                          "file")

# Python-side declaration of the C++ Shader class (gpu-compute/shader.hh).
class Shader(ClockedObject):
    type = 'Shader'
    cxx_class = 'Shader'
    cxx_header = 'gpu-compute/shader.hh'

    CUs = VectorParam.ComputeUnit('Number of compute units')
    n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')
    impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into
                                                  ruby at kernel boundaries""")
    separate_acquire_release = Param.Bool(False,
        """Do ld_acquire/st_release generate separate requests for the
        acquire and release?""")
    globalmem = Param.MemorySize('64kB', 'Memory size')
    timing = Param.Bool(False, 'timing memory accesses')

    cpu_pointer = Param.BaseCPU(NULL, "pointer to base CPU")
    translation = Param.Bool(False, "address translation")

# Python-side declaration of the ClDriver emulated driver
# (gpu-compute/cl_driver.hh).
class ClDriver(EmulatedDriver):
    type = 'ClDriver'
    cxx_header = 'gpu-compute/cl_driver.hh'
    codefile = VectorParam.String('code file name(s)')

# Python-side declaration of the GpuDispatcher DMA device
# (gpu-compute/dispatcher.hh).
class GpuDispatcher(DmaDevice):
    type = 'GpuDispatcher'
    cxx_header = 'gpu-compute/dispatcher.hh'
    # put at 8GB line for now
    pio_addr = Param.Addr(0x200000000, "Device Address")
    pio_latency = Param.Latency('1ns', "Programmed IO latency")
    shader_pointer = Param.Shader('pointer to shader')
    translation_port = MasterPort('Port to the dispatcher TLB')
    cpu = Param.BaseCPU("CPU to wake up on kernel completion")

    cl_driver = Param.ClDriver('pointer to driver')

# Enumeration of operation types; the order of the entries is preserved
# exactly as declared.
class OpType(Enum): vals = [
    'OT_NULL',
    'OT_ALU',
    'OT_SPECIAL',
    'OT_GLOBAL_READ',
    'OT_GLOBAL_WRITE',
    'OT_GLOBAL_ATOMIC',
    'OT_GLOBAL_HIST',
    'OT_GLOBAL_LDAS',
    'OT_SHARED_READ',
    'OT_SHARED_WRITE',
    'OT_SHARED_ATOMIC',
    'OT_SHARED_HIST',
    'OT_SHARED_LDAS',
    'OT_PRIVATE_READ',
    'OT_PRIVATE_WRITE',
    'OT_PRIVATE_ATOMIC',
    'OT_PRIVATE_HIST',
    'OT_PRIVATE_LDAS',
    'OT_SPILL_READ',
    'OT_SPILL_WRITE',
    'OT_SPILL_ATOMIC',
    'OT_SPILL_HIST',
    'OT_SPILL_LDAS',
    'OT_READONLY_READ',
    'OT_READONLY_WRITE',
    'OT_READONLY_ATOMIC',
    'OT_READONLY_HIST',
    'OT_READONLY_LDAS',
    'OT_FLAT_READ',
    'OT_FLAT_WRITE',
    'OT_FLAT_ATOMIC',
    'OT_FLAT_HIST',
    'OT_FLAT_LDAS',
    'OT_KERN_READ',
    'OT_BRANCH',

    # note: Only the OT_BOTH_MEMFENCE seems to be supported in the 1.0F version
    #       of the compiler.
    'OT_SHARED_MEMFENCE',
    'OT_GLOBAL_MEMFENCE',
    'OT_BOTH_MEMFENCE',

    'OT_BARRIER',
    'OT_PRINT',
    'OT_RET',
    'OT_NOP',
    'OT_ARG'
    ]

# Enumeration of memory operand types (names suggest unsigned/signed/float
# with a bit width -- TODO confirm against the C++ users of this enum).
class MemType(Enum): vals = [
    'M_U8',
    'M_U16',
    'M_U32',
    'M_U64',
    'M_S8',
    'M_S16',
    'M_S32',
    'M_S64',
    'M_F16',
    'M_F32',
    'M_F64',
    ]

# Enumeration of memory operation types; the order of the entries is
# preserved exactly as declared.
class MemOpType(Enum): vals = [
    'MO_LD',
    'MO_ST',
    'MO_LDAS',
    'MO_LDA',
    'MO_AAND',
    'MO_AOR',
    'MO_AXOR',
    'MO_ACAS',
    'MO_AEXCH',
    'MO_AADD',
    'MO_ASUB',
    'MO_AINC',
    'MO_ADEC',
    'MO_AMAX',
    'MO_AMIN',
    'MO_ANRAND',
    'MO_ANROR',
    'MO_ANRXOR',
    'MO_ANRCAS',
    'MO_ANREXCH',
    'MO_ANRADD',
    'MO_ANRSUB',
    'MO_ANRINC',
    'MO_ANRDEC',
    'MO_ANRMAX',
    'MO_ANRMIN',
    'MO_HAND',
    'MO_HOR',
    'MO_HXOR',
    'MO_HCAS',
    'MO_HEXCH',
    'MO_HADD',
    'MO_HSUB',
    'MO_HINC',
    'MO_HDEC',
    'MO_HMAX',
    'MO_HMIN',
    'MO_UNDEF'
    ]

# Enumeration of storage classes.
class StorageClassType(Enum): vals = [
    'SC_SPILL',
    'SC_GLOBAL',
    'SC_SHARED',
    'SC_PRIVATE',
    'SC_READONLY',
    'SC_KERNARG',
    'SC_NONE',
    ]

# Enumeration of register types.
class RegisterType(Enum): vals = [
    'RT_VECTOR',
    'RT_SCALAR',
    'RT_CONDITION',
    'RT_HARDWARE',
    'RT_NONE',
    ]

# Enumeration of memory orderings.
class GenericMemoryOrder(Enum): vals = [
    'MEMORY_ORDER_NONE',
    'MEMORY_ORDER_RELAXED',
    'MEMORY_ORDER_SC_ACQUIRE',
    'MEMORY_ORDER_SC_RELEASE',
    'MEMORY_ORDER_SC_ACQUIRE_RELEASE',
    ]

# Enumeration of memory scopes.
class GenericMemoryScope(Enum): vals = [
    'MEMORY_SCOPE_NONE',
    'MEMORY_SCOPE_WORKITEM',
    'MEMORY_SCOPE_WAVEFRONT',
    'MEMORY_SCOPE_WORKGROUP',
    'MEMORY_SCOPE_DEVICE',
    'MEMORY_SCOPE_SYSTEM',
    ]