# apu_se.py (gem5 revision 11616)
#
# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.
# IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Sooraj Puthoor
#

import optparse, os, re
import math
import glob
import inspect

import m5
from m5.objects import *
from m5.util import addToPath

addToPath('../ruby')
addToPath('../common')
addToPath('../topologies')

import Options
import Ruby
import Simulation
import GPUTLBOptions, GPUTLBConfig

########################## Script Options ########################

def setOption(parser, opt_str, value = 1):
    """Force an already-registered parser option to the given value.

    Raises an Exception if opt_str was never added to the parser.
    """
    # check to make sure the option actually exists
    if not parser.has_option(opt_str):
        raise Exception("cannot find %s in list of possible options" % opt_str)

    opt = parser.get_option(opt_str)
    # set the value; setattr replaces the previous exec() string hack,
    # which silently mangled any value that does not round-trip through str()
    setattr(parser.values, opt.dest, value)

def getOption(parser, opt_str):
    """Return the current value of an already-registered parser option.

    Raises an Exception if opt_str was never added to the parser.
    """
    # check to make sure the option actually exists
    if not parser.has_option(opt_str):
        raise Exception("cannot find %s in list of possible options" % opt_str)

    opt = parser.get_option(opt_str)
    # get the value; getattr replaces the previous exec() hack, which cannot
    # reliably create a local binding on all interpreters
    return getattr(parser.values, opt.dest)

# Adding script options
parser = optparse.OptionParser()
Options.addCommonOptions(parser)
Options.addSEOptions(parser)

parser.add_option("--cpu-only-mode", action="store_true", default=False,
                  help="APU mode. Used to take care of problems in "
                       "Ruby.py while running APU protocols")
parser.add_option("-k", "--kernel-files",
                  help="file(s) containing GPU kernel code (colon separated)")
parser.add_option("-u", "--num-compute-units", type="int", default=1,
                  help="number of GPU compute units")
parser.add_option("--num-cp", type="int", default=0,
                  help="Number of GPU Command Processors (CP)")
parser.add_option("--benchmark-root", help="Root of benchmark directory tree")

# not super important now, but to avoid putting the number 4 everywhere, make
# it an option/knob
parser.add_option("--cu-per-sqc", type="int", default=4,
                  help="number of CUs "
                       "sharing an SQC (icache, and thus icache TLB)")
parser.add_option("--simds-per-cu", type="int", default=4,
                  help="SIMD units per CU")
parser.add_option("--wf-size", type="int", default=64,
                  help="Wavefront size(in workitems)")
parser.add_option("--sp-bypass-path-length", type="int", default=4,
                  help="Number of stages of bypass path in vector ALU for "
                       "Single Precision ops")
parser.add_option("--dp-bypass-path-length", type="int", default=4,
                  help="Number of stages of bypass path in vector ALU for "
                       "Double Precision ops")
# issue period per SIMD unit: number of cycles before issuing another vector
parser.add_option("--issue-period", type="int", default=4,
                  help="Number of cycles per vector instruction issue period")
parser.add_option("--glbmem-wr-bus-width", type="int", default=32,
                  help="VGPR to Coalescer (Global Memory) data bus width "
                       "in bytes")
parser.add_option("--glbmem-rd-bus-width", type="int", default=32,
                  help="Coalescer to VGPR (Global Memory) data bus width "
                       "in bytes")
# Currently we only support 1 local memory pipe
parser.add_option("--shr-mem-pipes-per-cu", type="int", default=1,
                  help="Number of Shared Memory pipelines per CU")
# Currently we only support 1 global memory pipe
parser.add_option("--glb-mem-pipes-per-cu", type="int", default=1,
                  help="Number of Global Memory pipelines per CU")
parser.add_option("--wfs-per-simd", type="int", default=10,
                  help="Number of WF slots per SIMD")

parser.add_option("--vreg-file-size", type="int", default=2048,
                  help="number of physical vector registers per SIMD")
parser.add_option("--bw-scalor", type="int", default=0,
                  help="bandwidth scalor for scalability analysis")
parser.add_option("--CPUClock", type="string", default="2GHz",
                  help="CPU clock")
parser.add_option("--GPUClock", type="string", default="1GHz",
                  help="GPU clock")
parser.add_option("--cpu-voltage", action="store", type="string",
                  default='1.0V',
                  help = """CPU voltage domain""")
parser.add_option("--gpu-voltage", action="store", type="string",
                  default='1.0V',
                  help = """GPU voltage domain""")
parser.add_option("--CUExecPolicy", type="string", default="OLDEST-FIRST",
                  help="WF exec policy (OLDEST-FIRST, ROUND-ROBIN)")
parser.add_option("--xact-cas-mode", action="store_true",
                  help="enable load_compare mode (transactional CAS)")
parser.add_option("--SegFaultDebug", action="store_true",
                  help="checks for GPU seg fault before TLB access")
parser.add_option("--FunctionalTLB", action="store_true",
                  help="Assumes TLB has no latency")
parser.add_option("--LocalMemBarrier", action="store_true",
                  help="Barrier does not wait for writethroughs to complete")
parser.add_option("--countPages", action="store_true",
                  help="Count Page Accesses and output in per-CU output files")
parser.add_option("--TLB-prefetch", type="int",
                  help="prefetch depth for TLBs")
parser.add_option("--pf-type", type="string",
                  help="type of prefetch: PF_CU, PF_WF, PF_PHASE, PF_STRIDE")
parser.add_option("--pf-stride", type="int", help="set prefetch stride")
parser.add_option("--numLdsBanks", type="int", default=32,
                  help="number of physical banks per LDS module")
parser.add_option("--ldsBankConflictPenalty", type="int", default=1,
                  help="number of cycles per LDS bank conflict")
parser.add_option('--fast-forward-pseudo-op', action='store_true',
                  help = 'fast forward using kvm until the m5_switchcpu'
                  ' pseudo-op is encountered, then switch cpus. subsequent'
                  ' m5_switchcpu pseudo-ops will toggle back and forth')


Ruby.define_options(parser)

# add TLB options to the parser
GPUTLBOptions.tlb_options(parser)

(options, args) = parser.parse_args()

# The GPU cache coherence protocols only work with the backing store
setOption(parser, "--access-backing-store")

# if benchmark root is specified explicitly, that overrides the search path
if options.benchmark_root:
    benchmark_path = [options.benchmark_root]
else:
    # Set default benchmark search path to current dir
    benchmark_path = ['.']

########################## Sanity Check ########################
# Currently the gpu model requires ruby
if buildEnv['PROTOCOL'] == 'None':
    fatal("GPU model requires ruby")

# Currently the gpu model requires only timing or detailed CPU
if not (options.cpu_type == "timing" or
        options.cpu_type == "detailed"):
    fatal("GPU model requires timing or detailed CPU")

# This file can support multiple compute units.  Validate with fatal()
# rather than assert so the check survives python -O, matching the other
# sanity checks above.
if options.num_compute_units < 1:
    fatal("Need at least one compute unit")

# Currently, the sqc (I-Cache of GPU) is shared by
# multiple compute units(CUs). The protocol works just fine
# even if sqc is not shared. Overriding this option here
# so that the user need not explicitly set this (assuming
# sharing sqc is the common usage)
n_cu = options.num_compute_units
num_sqc = int(math.ceil(float(n_cu) / options.cu_per_sqc))
options.num_sqc = num_sqc # pass this to Ruby

########################## Creating the GPU system ########################
# shader is the GPU
shader = Shader(n_wf = options.wfs_per_simd,
                clk_domain = SrcClockDomain(
                    clock = options.GPUClock,
                    voltage_domain = VoltageDomain(
                        voltage = options.gpu_voltage)))

# GPU_RfO(Read For Ownership) implements SC/TSO memory model.
# Other GPU protocols implement release consistency at GPU side.
# So, all GPU protocols other than GPU_RfO should make their writes
# visible to the global memory and should read from global memory
# during kernel boundary. The pipeline initiates (or does not initiate)
# the acquire/release operation depending on this impl_kern_boundary_sync
# flag. This flag=true means pipeline initiates a acquire/release operation
# at kernel boundary.
if buildEnv['PROTOCOL'] == 'GPU_RfO':
    shader.impl_kern_boundary_sync = False
else:
    shader.impl_kern_boundary_sync = True

# Switching off per-lane TLB by default
per_lane = False
if options.TLB_config == "perLane":
    per_lane = True

# List of compute units; one GPU can have multiple compute units
compute_units = []
for i in xrange(n_cu):
    compute_units.append(ComputeUnit(cu_id = i, perLaneTLB = per_lane,
                                     num_SIMDs = options.simds_per_cu,
                                     wfSize = options.wf_size,
                                     spbypass_pipe_length = \
                                     options.sp_bypass_path_length,
                                     dpbypass_pipe_length = \
                                     options.dp_bypass_path_length,
                                     issue_period = options.issue_period,
                                     coalescer_to_vrf_bus_width = \
                                     options.glbmem_rd_bus_width,
                                     vrf_to_coalescer_bus_width = \
                                     options.glbmem_wr_bus_width,
                                     num_global_mem_pipes = \
                                     options.glb_mem_pipes_per_cu,
                                     num_shared_mem_pipes = \
                                     options.shr_mem_pipes_per_cu,
                                     n_wf = options.wfs_per_simd,
                                     execPolicy = options.CUExecPolicy,
                                     xactCasMode = options.xact_cas_mode,
                                     debugSegFault = options.SegFaultDebug,
                                     functionalTLB = options.FunctionalTLB,
                                     localMemBarrier = \
                                     options.LocalMemBarrier,
                                     countPages = options.countPages,
                                     localDataStore = \
                                     LdsState(banks = options.numLdsBanks,
                                              bankConflictPenalty = \
                                              options.ldsBankConflictPenalty)))
    # Each CU gets one wavefront slot per SIMD per WF slot, and one vector
    # register file per SIMD.
    wavefronts = []
    vrfs = []
    for j in xrange(options.simds_per_cu):
        for k in xrange(shader.n_wf):
            wavefronts.append(Wavefront(simdId = j, wf_slot_id = k,
                                        wfSize = options.wf_size))
        vrfs.append(VectorRegisterFile(simd_id=j,
                              num_regs_per_simd=options.vreg_file_size))
    compute_units[-1].wavefronts = wavefronts
    compute_units[-1].vector_register_file = vrfs
    if options.TLB_prefetch:
        compute_units[-1].prefetch_depth = options.TLB_prefetch
        compute_units[-1].prefetch_prev_type = options.pf_type

    # attach the LDS and the CU to the bus (actually a Bridge)
    compute_units[-1].ldsPort = compute_units[-1].ldsBus.slave
    compute_units[-1].ldsBus.master = compute_units[-1].localDataStore.cuPort

# Attach compute units to GPU
shader.CUs = compute_units

########################## Creating the CPU system ########################
# (the redundant self-assignment "options.num_cpus = options.num_cpus" that
# used to live here was a no-op and has been removed)

# The shader core will be whatever is after the CPU cores are accounted for
shader_idx = options.num_cpus

# The command processor will be whatever is after the shader is accounted for
cp_idx = shader_idx + 1
cp_list = []

# List of CPUs
cpu_list = []

CpuClass, mem_mode = Simulation.getCPUClass(options.cpu_type)
if CpuClass == AtomicSimpleCPU:
    fatal("AtomicSimpleCPU is not supported")
if mem_mode != 'timing':
    fatal("Only the timing memory mode is supported")
shader.timing = True

# Fast-forwarding can be requested either by instruction count
# (--fast-forward) or by pseudo-op (--fast-forward-pseudo-op), but not both.
if options.fast_forward and options.fast_forward_pseudo_op:
    fatal("Cannot fast-forward based both on the number of instructions and"
          " on pseudo-ops")
fast_forward = options.fast_forward or options.fast_forward_pseudo_op

if fast_forward:
    # Remember the requested CPU model for after the switch; fast-forward
    # itself runs on KVM in atomic-noncaching mode.
    FutureCpuClass, future_mem_mode = CpuClass, mem_mode

    CpuClass = X86KvmCPU
    mem_mode = 'atomic_noncaching'
    # Leave shader.timing untouched, because its value only matters at the
    # start of the simulation and because we require switching cpus
    # *before* the first kernel launch.

    future_cpu_list = []

    # Initial CPUs to be used during fast-forwarding.
    for i in xrange(options.num_cpus):
        cpu = CpuClass(cpu_id = i,
                       clk_domain = SrcClockDomain(
                           clock = options.CPUClock,
                           voltage_domain = VoltageDomain(
                               voltage = options.cpu_voltage)))
        cpu_list.append(cpu)

        if options.fast_forward:
            cpu.max_insts_any_thread = int(options.fast_forward)

# MainCpuClass is the model used for CPs and for the post-switch CPUs.
if fast_forward:
    MainCpuClass = FutureCpuClass
else:
    MainCpuClass = CpuClass

# CPs to be used throughout the simulation.
32811616Sdavid.j.hashe@gmail.comfor i in xrange(options.num_cp): 32911616Sdavid.j.hashe@gmail.com cp = MainCpuClass(cpu_id = options.num_cpus + i, 33011616Sdavid.j.hashe@gmail.com clk_domain = SrcClockDomain( 33111616Sdavid.j.hashe@gmail.com clock = options.CPUClock, 33211616Sdavid.j.hashe@gmail.com voltage_domain = VoltageDomain( 33311616Sdavid.j.hashe@gmail.com voltage = options.cpu_voltage))) 33411616Sdavid.j.hashe@gmail.com cp_list.append(cp) 33511616Sdavid.j.hashe@gmail.com 33611616Sdavid.j.hashe@gmail.com# Main CPUs (to be used after fast-forwarding if fast-forwarding is specified). 33711616Sdavid.j.hashe@gmail.comfor i in xrange(options.num_cpus): 33811616Sdavid.j.hashe@gmail.com cpu = MainCpuClass(cpu_id = i, 33911616Sdavid.j.hashe@gmail.com clk_domain = SrcClockDomain( 34011616Sdavid.j.hashe@gmail.com clock = options.CPUClock, 34111616Sdavid.j.hashe@gmail.com voltage_domain = VoltageDomain( 34211616Sdavid.j.hashe@gmail.com voltage = options.cpu_voltage))) 34311616Sdavid.j.hashe@gmail.com if fast_forward: 34411616Sdavid.j.hashe@gmail.com cpu.switched_out = True 34511616Sdavid.j.hashe@gmail.com future_cpu_list.append(cpu) 34611308Santhony.gutierrez@amd.com else: 34711616Sdavid.j.hashe@gmail.com cpu_list.append(cpu) 34811308Santhony.gutierrez@amd.com 34911308Santhony.gutierrez@amd.com########################## Creating the GPU dispatcher ######################## 35011308Santhony.gutierrez@amd.com# Dispatcher dispatches work from host CPU to GPU 35111308Santhony.gutierrez@amd.comhost_cpu = cpu_list[0] 35211308Santhony.gutierrez@amd.comdispatcher = GpuDispatcher() 35311308Santhony.gutierrez@amd.com 35411308Santhony.gutierrez@amd.com########################## Create and assign the workload ######################## 35511308Santhony.gutierrez@amd.com# Check for rel_path in elements of base_list using test, returning 35611308Santhony.gutierrez@amd.com# the first full path that satisfies test 35711308Santhony.gutierrez@amd.comdef find_path(base_list, rel_path, 
test): 35811308Santhony.gutierrez@amd.com for base in base_list: 35911308Santhony.gutierrez@amd.com if not base: 36011308Santhony.gutierrez@amd.com # base could be None if environment var not set 36111308Santhony.gutierrez@amd.com continue 36211308Santhony.gutierrez@amd.com full_path = os.path.join(base, rel_path) 36311308Santhony.gutierrez@amd.com if test(full_path): 36411308Santhony.gutierrez@amd.com return full_path 36511308Santhony.gutierrez@amd.com fatal("%s not found in %s" % (rel_path, base_list)) 36611308Santhony.gutierrez@amd.com 36711308Santhony.gutierrez@amd.comdef find_file(base_list, rel_path): 36811308Santhony.gutierrez@amd.com return find_path(base_list, rel_path, os.path.isfile) 36911308Santhony.gutierrez@amd.com 37011308Santhony.gutierrez@amd.comexecutable = find_path(benchmark_path, options.cmd, os.path.exists) 37111308Santhony.gutierrez@amd.com# it's common for a benchmark to be in a directory with the same 37211308Santhony.gutierrez@amd.com# name as the executable, so we handle that automatically 37311308Santhony.gutierrez@amd.comif os.path.isdir(executable): 37411308Santhony.gutierrez@amd.com benchmark_path = [executable] 37511308Santhony.gutierrez@amd.com executable = find_file(benchmark_path, options.cmd) 37611308Santhony.gutierrez@amd.comif options.kernel_files: 37711308Santhony.gutierrez@amd.com kernel_files = [find_file(benchmark_path, f) 37811308Santhony.gutierrez@amd.com for f in options.kernel_files.split(':')] 37911308Santhony.gutierrez@amd.comelse: 38011308Santhony.gutierrez@amd.com # if kernel_files is not set, see if there's a unique .asm file 38111308Santhony.gutierrez@amd.com # in the same directory as the executable 38211308Santhony.gutierrez@amd.com kernel_path = os.path.dirname(executable) 38311308Santhony.gutierrez@amd.com kernel_files = glob.glob(os.path.join(kernel_path, '*.asm')) 38411308Santhony.gutierrez@amd.com if kernel_files: 38511308Santhony.gutierrez@amd.com print "Using GPU kernel code file(s)", 
",".join(kernel_files) 38611308Santhony.gutierrez@amd.com else: 38711308Santhony.gutierrez@amd.com fatal("Can't locate kernel code (.asm) in " + kernel_path) 38811308Santhony.gutierrez@amd.com 38911308Santhony.gutierrez@amd.com# OpenCL driver 39011308Santhony.gutierrez@amd.comdriver = ClDriver(filename="hsa", codefile=kernel_files) 39111308Santhony.gutierrez@amd.comfor cpu in cpu_list: 39211308Santhony.gutierrez@amd.com cpu.workload = LiveProcess(executable = executable, 39311308Santhony.gutierrez@amd.com cmd = [options.cmd] + options.options.split(), 39411308Santhony.gutierrez@amd.com drivers = [driver]) 39511308Santhony.gutierrez@amd.comfor cp in cp_list: 39611308Santhony.gutierrez@amd.com cp.workload = host_cpu.workload 39711308Santhony.gutierrez@amd.com 39811616Sdavid.j.hashe@gmail.comif fast_forward: 39911616Sdavid.j.hashe@gmail.com for i in xrange(len(future_cpu_list)): 40011616Sdavid.j.hashe@gmail.com future_cpu_list[i].workload = cpu_list[i].workload 40111616Sdavid.j.hashe@gmail.com 40211308Santhony.gutierrez@amd.com########################## Create the overall system ######################## 40311616Sdavid.j.hashe@gmail.com# List of CPUs that must be switched when moving between KVM and simulation 40411616Sdavid.j.hashe@gmail.comif fast_forward: 40511616Sdavid.j.hashe@gmail.com switch_cpu_list = \ 40611616Sdavid.j.hashe@gmail.com [(cpu_list[i], future_cpu_list[i]) for i in xrange(options.num_cpus)] 40711616Sdavid.j.hashe@gmail.com 40811308Santhony.gutierrez@amd.com# Full list of processing cores in the system. 
Note that 40911308Santhony.gutierrez@amd.com# dispatcher is also added to cpu_list although it is 41011308Santhony.gutierrez@amd.com# not a processing element 41111308Santhony.gutierrez@amd.comcpu_list = cpu_list + [shader] + cp_list + [dispatcher] 41211308Santhony.gutierrez@amd.com 41311308Santhony.gutierrez@amd.com# creating the overall system 41411308Santhony.gutierrez@amd.com# notice the cpu list is explicitly added as a parameter to System 41511308Santhony.gutierrez@amd.comsystem = System(cpu = cpu_list, 41611308Santhony.gutierrez@amd.com mem_ranges = [AddrRange(options.mem_size)], 41711308Santhony.gutierrez@amd.com cache_line_size = options.cacheline_size, 41811308Santhony.gutierrez@amd.com mem_mode = mem_mode) 41911616Sdavid.j.hashe@gmail.comif fast_forward: 42011616Sdavid.j.hashe@gmail.com system.future_cpu = future_cpu_list 42111308Santhony.gutierrez@amd.comsystem.voltage_domain = VoltageDomain(voltage = options.sys_voltage) 42211308Santhony.gutierrez@amd.comsystem.clk_domain = SrcClockDomain(clock = options.sys_clock, 42311308Santhony.gutierrez@amd.com voltage_domain = system.voltage_domain) 42411308Santhony.gutierrez@amd.com 42511616Sdavid.j.hashe@gmail.comif fast_forward: 42611616Sdavid.j.hashe@gmail.com have_kvm_support = 'BaseKvmCPU' in globals() 42711616Sdavid.j.hashe@gmail.com if have_kvm_support and buildEnv['TARGET_ISA'] == "x86": 42811616Sdavid.j.hashe@gmail.com system.vm = KvmVM() 42911616Sdavid.j.hashe@gmail.com for i in xrange(len(host_cpu.workload)): 43011616Sdavid.j.hashe@gmail.com host_cpu.workload[i].useArchPT = True 43111616Sdavid.j.hashe@gmail.com host_cpu.workload[i].kvmInSE = True 43211616Sdavid.j.hashe@gmail.com else: 43311616Sdavid.j.hashe@gmail.com fatal("KvmCPU can only be used in SE mode with x86") 43411616Sdavid.j.hashe@gmail.com 43511308Santhony.gutierrez@amd.com# configure the TLB hierarchy 43611308Santhony.gutierrez@amd.comGPUTLBConfig.config_tlb_hierarchy(options, system, shader_idx) 43711308Santhony.gutierrez@amd.com 
# create Ruby system
# The piobus carries uncacheable/IO traffic (interrupts, dispatcher MMIO);
# all latencies are zeroed so it does not perturb timing results.
system.piobus = IOXBar(width=32, response_latency=0,
                       frontend_latency=0, forward_latency=0)
Ruby.create_system(options, None, system)
system.ruby.clk_domain = SrcClockDomain(clock = options.ruby_clock,
                                    voltage_domain = system.voltage_domain)

# attach the CPU ports to Ruby
# NOTE: gem5 vector ports are filled in connection order, so the statement
# order below determines which sequencer sub-port each component gets.
# Do not reorder these assignments.
for i in range(options.num_cpus):
    ruby_port = system.ruby._cpu_ports[i]

    # Create interrupt controller
    system.cpu[i].createInterruptController()

    # Connect cache port's to ruby
    system.cpu[i].icache_port = ruby_port.slave
    system.cpu[i].dcache_port = ruby_port.slave

    # Uncacheable accesses from the sequencer go out over the piobus.
    ruby_port.mem_master_port = system.piobus.slave
    if buildEnv['TARGET_ISA'] == "x86":
        # x86 local APIC: pio for register access, int_* for interrupt
        # message request/response.
        system.cpu[i].interrupts[0].pio = system.piobus.master
        system.cpu[i].interrupts[0].int_master = system.piobus.slave
        system.cpu[i].interrupts[0].int_slave = system.piobus.master
        # When fast-forwarding, the timing CPUs that get switched in need
        # their page-table walkers routed through Ruby as well.
        # NOTE(review): nesting of this under the x86 branch matches
        # upstream gem5; indentation was ambiguous in this copy — confirm.
        if fast_forward:
            system.cpu[i].itb.walker.port = ruby_port.slave
            system.cpu[i].dtb.walker.port = ruby_port.slave

# attach CU ports to Ruby
# Because of the peculiarities of the CP core, you may have 1 CPU but 2
# sequencers and thus 2 _cpu_ports created. Your GPUs shouldn't be
# hooked up until after the CP. To make this script generic, figure out
# the index as below, but note that this assumes there is one sequencer
# per compute unit and one sequencer per SQC for the math to work out
# correctly.
# Layout of system.ruby._cpu_ports (by construction in Ruby.create_system):
#   [CPU ports ...][CU data ports][SQC ports][CP dcache/icache pairs]
gpu_port_idx = len(system.ruby._cpu_ports) \
               - options.num_compute_units - options.num_sqc
gpu_port_idx = gpu_port_idx - options.num_cp * 2

wavefront_size = options.wf_size
for i in xrange(n_cu):
    # The pipeline issues wavefront_size number of uncoalesced requests
    # in one GPU issue cycle. Hence wavefront_size mem ports.
    for j in xrange(wavefront_size):
        system.cpu[shader_idx].CUs[i].memory_port[j] = \
                  system.ruby._cpu_ports[gpu_port_idx].slave[j]
    # One sequencer (one _cpu_ports entry) per compute unit.
    gpu_port_idx += 1

# Hook up instruction fetch: options.cu_per_sqc compute units share each
# SQC (GPU instruction cache) sequencer.
for i in xrange(n_cu):
    if i > 0 and not i % options.cu_per_sqc:
        print "incrementing idx on ", i
        gpu_port_idx += 1
    system.cpu[shader_idx].CUs[i].sqc_port = \
            system.ruby._cpu_ports[gpu_port_idx].slave
# Step past the last SQC port so gpu_port_idx now points at the first
# CP (command processor) port pair.
gpu_port_idx = gpu_port_idx + 1

# attach CP ports to Ruby
# Each CP core consumes two consecutive sequencer ports: dcache then icache.
for i in xrange(options.num_cp):
    system.cpu[cp_idx].createInterruptController()
    system.cpu[cp_idx].dcache_port = \
                system.ruby._cpu_ports[gpu_port_idx + i * 2].slave
    system.cpu[cp_idx].icache_port = \
                system.ruby._cpu_ports[gpu_port_idx + i * 2 + 1].slave
    system.cpu[cp_idx].interrupts[0].pio = system.piobus.master
    system.cpu[cp_idx].interrupts[0].int_master = system.piobus.slave
    system.cpu[cp_idx].interrupts[0].int_slave = system.piobus.master
    cp_idx = cp_idx + 1

# connect dispatcher to the system.piobus
# pio: CPU writes the doorbell; dma: dispatcher fetches kernel descriptors.
dispatcher.pio = system.piobus.master
dispatcher.dma = system.piobus.slave
50911308Santhony.gutierrez@amd.com################# Connect the CPU and GPU via GPU Dispatcher ################### 51011308Santhony.gutierrez@amd.com# CPU rings the GPU doorbell to notify a pending task 51111308Santhony.gutierrez@amd.com# using this interface. 51211308Santhony.gutierrez@amd.com# And GPU uses this interface to notify the CPU of task completion 51311308Santhony.gutierrez@amd.com# The communcation happens through emulated driver. 51411308Santhony.gutierrez@amd.com 51511308Santhony.gutierrez@amd.com# Note this implicit setting of the cpu_pointer, shader_pointer and tlb array 51611308Santhony.gutierrez@amd.com# parameters must be after the explicit setting of the System cpu list 51711616Sdavid.j.hashe@gmail.comif fast_forward: 51811616Sdavid.j.hashe@gmail.com shader.cpu_pointer = future_cpu_list[0] 51911616Sdavid.j.hashe@gmail.com dispatcher.cpu = future_cpu_list[0] 52011616Sdavid.j.hashe@gmail.comelse: 52111616Sdavid.j.hashe@gmail.com shader.cpu_pointer = host_cpu 52211616Sdavid.j.hashe@gmail.com dispatcher.cpu = host_cpu 52311308Santhony.gutierrez@amd.comdispatcher.shader_pointer = shader 52411308Santhony.gutierrez@amd.comdispatcher.cl_driver = driver 52511308Santhony.gutierrez@amd.com 52611308Santhony.gutierrez@amd.com########################## Start simulation ######################## 52711308Santhony.gutierrez@amd.com 52811308Santhony.gutierrez@amd.comroot = Root(system=system, full_system=False) 52911308Santhony.gutierrez@amd.comm5.ticks.setGlobalFrequency('1THz') 53011308Santhony.gutierrez@amd.comif options.abs_max_tick: 53111308Santhony.gutierrez@amd.com maxtick = options.abs_max_tick 53211308Santhony.gutierrez@amd.comelse: 53311308Santhony.gutierrez@amd.com maxtick = m5.MaxTick 53411308Santhony.gutierrez@amd.com 53511308Santhony.gutierrez@amd.com# Benchmarks support work item annotations 53611308Santhony.gutierrez@amd.comSimulation.setWorkCountOptions(system, options) 53711308Santhony.gutierrez@amd.com 53811308Santhony.gutierrez@amd.com# 
Checkpointing is not supported by APU model 53911308Santhony.gutierrez@amd.comif (options.checkpoint_dir != None or 54011308Santhony.gutierrez@amd.com options.checkpoint_restore != None): 54111308Santhony.gutierrez@amd.com fatal("Checkpointing not supported by apu model") 54211308Santhony.gutierrez@amd.com 54311308Santhony.gutierrez@amd.comcheckpoint_dir = None 54411308Santhony.gutierrez@amd.comm5.instantiate(checkpoint_dir) 54511308Santhony.gutierrez@amd.com 54611308Santhony.gutierrez@amd.com# Map workload to this address space 54711308Santhony.gutierrez@amd.comhost_cpu.workload[0].map(0x10000000, 0x200000000, 4096) 54811308Santhony.gutierrez@amd.com 54911616Sdavid.j.hashe@gmail.comif options.fast_forward: 55011616Sdavid.j.hashe@gmail.com print "Switch at instruction count: %d" % \ 55111616Sdavid.j.hashe@gmail.com cpu_list[0].max_insts_any_thread 55211616Sdavid.j.hashe@gmail.com 55311308Santhony.gutierrez@amd.comexit_event = m5.simulate(maxtick) 55411616Sdavid.j.hashe@gmail.com 55511616Sdavid.j.hashe@gmail.comif options.fast_forward: 55611616Sdavid.j.hashe@gmail.com if exit_event.getCause() == "a thread reached the max instruction count": 55711616Sdavid.j.hashe@gmail.com m5.switchCpus(system, switch_cpu_list) 55811616Sdavid.j.hashe@gmail.com print "Switched CPUS @ tick %s" % (m5.curTick()) 55911616Sdavid.j.hashe@gmail.com m5.stats.reset() 56011616Sdavid.j.hashe@gmail.com exit_event = m5.simulate(maxtick - m5.curTick()) 56111616Sdavid.j.hashe@gmail.comelif options.fast_forward_pseudo_op: 56211616Sdavid.j.hashe@gmail.com while exit_event.getCause() == "switchcpu": 56311616Sdavid.j.hashe@gmail.com # If we are switching *to* kvm, then the current stats are meaningful 56411616Sdavid.j.hashe@gmail.com # Note that we don't do any warmup by default 56511616Sdavid.j.hashe@gmail.com if type(switch_cpu_list[0][0]) == FutureCpuClass: 56611616Sdavid.j.hashe@gmail.com print "Dumping stats..." 
56711616Sdavid.j.hashe@gmail.com m5.stats.dump() 56811616Sdavid.j.hashe@gmail.com m5.switchCpus(system, switch_cpu_list) 56911616Sdavid.j.hashe@gmail.com print "Switched CPUS @ tick %s" % (m5.curTick()) 57011616Sdavid.j.hashe@gmail.com m5.stats.reset() 57111616Sdavid.j.hashe@gmail.com # This lets us switch back and forth without keeping a counter 57211616Sdavid.j.hashe@gmail.com switch_cpu_list = [(x[1], x[0]) for x in switch_cpu_list] 57311616Sdavid.j.hashe@gmail.com exit_event = m5.simulate(maxtick - m5.curTick()) 57411616Sdavid.j.hashe@gmail.com 57511308Santhony.gutierrez@amd.comprint "Ticks:", m5.curTick() 57611308Santhony.gutierrez@amd.comprint 'Exiting because ', exit_event.getCause() 57711308Santhony.gutierrez@amd.comsys.exit(exit_event.getCode()) 578