# apu_se.py revision 12564
111308Santhony.gutierrez@amd.com# 211308Santhony.gutierrez@amd.com# Copyright (c) 2015 Advanced Micro Devices, Inc. 311308Santhony.gutierrez@amd.com# All rights reserved. 411308Santhony.gutierrez@amd.com# 511308Santhony.gutierrez@amd.com# For use for simulation and test purposes only 611308Santhony.gutierrez@amd.com# 711308Santhony.gutierrez@amd.com# Redistribution and use in source and binary forms, with or without 811308Santhony.gutierrez@amd.com# modification, are permitted provided that the following conditions are met: 911308Santhony.gutierrez@amd.com# 1011308Santhony.gutierrez@amd.com# 1. Redistributions of source code must retain the above copyright notice, 1111308Santhony.gutierrez@amd.com# this list of conditions and the following disclaimer. 1211308Santhony.gutierrez@amd.com# 1311308Santhony.gutierrez@amd.com# 2. Redistributions in binary form must reproduce the above copyright notice, 1411308Santhony.gutierrez@amd.com# this list of conditions and the following disclaimer in the documentation 1511308Santhony.gutierrez@amd.com# and/or other materials provided with the distribution. 1611308Santhony.gutierrez@amd.com# 1711308Santhony.gutierrez@amd.com# 3. Neither the name of the copyright holder nor the names of its contributors 1811308Santhony.gutierrez@amd.com# may be used to endorse or promote products derived from this software 1911308Santhony.gutierrez@amd.com# without specific prior written permission. 2011308Santhony.gutierrez@amd.com# 2111308Santhony.gutierrez@amd.com# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 2211308Santhony.gutierrez@amd.com# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 2311308Santhony.gutierrez@amd.com# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 2411308Santhony.gutierrez@amd.com# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 2511308Santhony.gutierrez@amd.com# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 2611308Santhony.gutierrez@amd.com# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 2711308Santhony.gutierrez@amd.com# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 2811308Santhony.gutierrez@amd.com# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 2911308Santhony.gutierrez@amd.com# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 3011308Santhony.gutierrez@amd.com# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 3111308Santhony.gutierrez@amd.com# POSSIBILITY OF SUCH DAMAGE. 3211308Santhony.gutierrez@amd.com# 3311308Santhony.gutierrez@amd.com# Author: Sooraj Puthoor 3411308Santhony.gutierrez@amd.com# 3511308Santhony.gutierrez@amd.com 3612564Sgabeblack@google.comfrom __future__ import print_function 3712564Sgabeblack@google.com 3811308Santhony.gutierrez@amd.comimport optparse, os, re 3911308Santhony.gutierrez@amd.comimport math 4011308Santhony.gutierrez@amd.comimport glob 4111308Santhony.gutierrez@amd.comimport inspect 4211308Santhony.gutierrez@amd.com 4311308Santhony.gutierrez@amd.comimport m5 4411308Santhony.gutierrez@amd.comfrom m5.objects import * 4511308Santhony.gutierrez@amd.comfrom m5.util import addToPath 4611308Santhony.gutierrez@amd.com 4711670Sandreas.hansson@arm.comaddToPath('../') 4811670Sandreas.hansson@arm.com 4911670Sandreas.hansson@arm.comfrom ruby import Ruby 5011308Santhony.gutierrez@amd.com 5111682Sandreas.hansson@arm.comfrom common import Options 5211682Sandreas.hansson@arm.comfrom common import Simulation 5311682Sandreas.hansson@arm.comfrom common import GPUTLBOptions, GPUTLBConfig 5411308Santhony.gutierrez@amd.com 5511308Santhony.gutierrez@amd.com########################## Script Options ######################## 5611308Santhony.gutierrez@amd.comdef setOption(parser, 
opt_str, value = 1): 5711308Santhony.gutierrez@amd.com # check to make sure the option actually exists 5811308Santhony.gutierrez@amd.com if not parser.has_option(opt_str): 5911308Santhony.gutierrez@amd.com raise Exception("cannot find %s in list of possible options" % opt_str) 6011308Santhony.gutierrez@amd.com 6111308Santhony.gutierrez@amd.com opt = parser.get_option(opt_str) 6211308Santhony.gutierrez@amd.com # set the value 6311308Santhony.gutierrez@amd.com exec("parser.values.%s = %s" % (opt.dest, value)) 6411308Santhony.gutierrez@amd.com 6511308Santhony.gutierrez@amd.comdef getOption(parser, opt_str): 6611308Santhony.gutierrez@amd.com # check to make sure the option actually exists 6711308Santhony.gutierrez@amd.com if not parser.has_option(opt_str): 6811308Santhony.gutierrez@amd.com raise Exception("cannot find %s in list of possible options" % opt_str) 6911308Santhony.gutierrez@amd.com 7011308Santhony.gutierrez@amd.com opt = parser.get_option(opt_str) 7111308Santhony.gutierrez@amd.com # get the value 7211308Santhony.gutierrez@amd.com exec("return_value = parser.values.%s" % opt.dest) 7311308Santhony.gutierrez@amd.com return return_value 7411308Santhony.gutierrez@amd.com 7511308Santhony.gutierrez@amd.com# Adding script options 7611308Santhony.gutierrez@amd.comparser = optparse.OptionParser() 7711308Santhony.gutierrez@amd.comOptions.addCommonOptions(parser) 7811308Santhony.gutierrez@amd.comOptions.addSEOptions(parser) 7911308Santhony.gutierrez@amd.com 8011308Santhony.gutierrez@amd.comparser.add_option("--cpu-only-mode", action="store_true", default=False, 8111308Santhony.gutierrez@amd.com help="APU mode. 
Used to take care of problems in "\ 8211308Santhony.gutierrez@amd.com "Ruby.py while running APU protocols") 8311308Santhony.gutierrez@amd.comparser.add_option("-k", "--kernel-files", 8411308Santhony.gutierrez@amd.com help="file(s) containing GPU kernel code (colon separated)") 8511308Santhony.gutierrez@amd.comparser.add_option("-u", "--num-compute-units", type="int", default=1, 8611308Santhony.gutierrez@amd.com help="number of GPU compute units"), 8711308Santhony.gutierrez@amd.comparser.add_option("--num-cp", type="int", default=0, 8811308Santhony.gutierrez@amd.com help="Number of GPU Command Processors (CP)") 8911308Santhony.gutierrez@amd.comparser.add_option("--benchmark-root", help="Root of benchmark directory tree") 9011308Santhony.gutierrez@amd.com 9111308Santhony.gutierrez@amd.com# not super important now, but to avoid putting the number 4 everywhere, make 9211308Santhony.gutierrez@amd.com# it an option/knob 9311308Santhony.gutierrez@amd.comparser.add_option("--cu-per-sqc", type="int", default=4, help="number of CUs" \ 9411308Santhony.gutierrez@amd.com "sharing an SQC (icache, and thus icache TLB)") 9511308Santhony.gutierrez@amd.comparser.add_option("--simds-per-cu", type="int", default=4, help="SIMD units" \ 9611308Santhony.gutierrez@amd.com "per CU") 9711308Santhony.gutierrez@amd.comparser.add_option("--wf-size", type="int", default=64, 9811308Santhony.gutierrez@amd.com help="Wavefront size(in workitems)") 9911308Santhony.gutierrez@amd.comparser.add_option("--sp-bypass-path-length", type="int", default=4, \ 10011308Santhony.gutierrez@amd.com help="Number of stages of bypass path in vector ALU for Single Precision ops") 10111308Santhony.gutierrez@amd.comparser.add_option("--dp-bypass-path-length", type="int", default=4, \ 10211308Santhony.gutierrez@amd.com help="Number of stages of bypass path in vector ALU for Double Precision ops") 10311308Santhony.gutierrez@amd.com# issue period per SIMD unit: number of cycles before issuing another vector 
parser.add_option("--issue-period", type="int", default=4,
                  help="Number of cycles per vector instruction issue period")
parser.add_option("--glbmem-wr-bus-width", type="int", default=32,
                  help="VGPR to Coalescer (Global Memory) data bus width "
                       "in bytes")
parser.add_option("--glbmem-rd-bus-width", type="int", default=32,
                  help="Coalescer to VGPR (Global Memory) data bus width "
                       "in bytes")
# Currently we only support 1 local memory pipe
parser.add_option("--shr-mem-pipes-per-cu", type="int", default=1,
                  help="Number of Shared Memory pipelines per CU")
# Currently we only support 1 global memory pipe
parser.add_option("--glb-mem-pipes-per-cu", type="int", default=1,
                  help="Number of Global Memory pipelines per CU")
parser.add_option("--wfs-per-simd", type="int", default=10,
                  help="Number of "
                       "WF slots per SIMD")

parser.add_option("--vreg-file-size", type="int", default=2048,
                  help="number of physical vector registers per SIMD")
parser.add_option("--bw-scalor", type="int", default=0,
                  help="bandwidth scalor for scalability analysis")
parser.add_option("--CPUClock", type="string", default="2GHz",
                  help="CPU clock")
parser.add_option("--GPUClock", type="string", default="1GHz",
                  help="GPU clock")
parser.add_option("--cpu-voltage", action="store", type="string",
                  default='1.0V',
                  help="CPU voltage domain")
# The help text for this option used to read "CPU voltage domain"; the value
# is consumed as the GPU shader's voltage.
parser.add_option("--gpu-voltage", action="store", type="string",
                  default='1.0V',
                  help="GPU voltage domain")
parser.add_option("--CUExecPolicy", type="string", default="OLDEST-FIRST",
                  help="WF exec policy (OLDEST-FIRST, ROUND-ROBIN)")
parser.add_option("--xact-cas-mode", action="store_true",
                  help="enable load_compare mode (transactional CAS)")
parser.add_option("--SegFaultDebug", action="store_true",
                  help="checks for GPU seg fault before TLB access")
parser.add_option("--FunctionalTLB", action="store_true",
                  help="Assumes TLB has no latency")
parser.add_option("--LocalMemBarrier", action="store_true",
                  help="Barrier does not wait for writethroughs to complete")
parser.add_option("--countPages", action="store_true",
                  help="Count Page Accesses and output in per-CU output files")
parser.add_option("--TLB-prefetch", type="int",
                  help="prefetch depth for "
                       "TLBs")
parser.add_option("--pf-type", type="string",
                  help="type of prefetch: "
                       "PF_CU, PF_WF, PF_PHASE, PF_STRIDE")
parser.add_option("--pf-stride", type="int", help="set prefetch stride")
parser.add_option("--numLdsBanks", type="int", default=32,
                  help="number of physical banks per LDS module")
parser.add_option("--ldsBankConflictPenalty", type="int", default=1,
                  help="number of cycles per LDS bank conflict")
parser.add_option('--fast-forward-pseudo-op', action='store_true',
                  help = 'fast forward using kvm until the m5_switchcpu'
                  ' pseudo-op is encountered, then switch cpus. subsequent'
                  ' m5_switchcpu pseudo-ops will toggle back and forth')
parser.add_option('--outOfOrderDataDelivery', action='store_true',
                  default=False, help='enable OoO data delivery in the GM'
                  ' pipeline')

Ruby.define_options(parser)

# add TLB options to the parser
GPUTLBOptions.tlb_options(parser)

(options, args) = parser.parse_args()

# The GPU cache coherence protocols only work with the backing store
setOption(parser, "--access-backing-store")

# if benchmark root is specified explicitly, that overrides the search path
if options.benchmark_root:
    benchmark_path = [options.benchmark_root]
else:
    # Set default benchmark search path to current dir
    benchmark_path = ['.']

########################## Sanity Check ########################

# Currently the gpu model requires ruby
if buildEnv['PROTOCOL'] == 'None':
    fatal("GPU model requires ruby")

# Currently the gpu model requires only timing or detailed CPU
if options.cpu_type not in ("TimingSimpleCPU", "DerivO3CPU"):
    fatal("GPU model requires TimingSimpleCPU or DerivO3CPU")

# This file can support multiple compute units, but needs at least one.
# Reported through fatal() like the checks above, because an assert would be
# stripped when Python runs with -O.
if options.num_compute_units < 1:
    fatal("--num-compute-units must be at least 1")

# cu_per_sqc is used as a divisor below, so reject zero or negative values
if options.cu_per_sqc < 1:
    fatal("--cu-per-sqc must be at least 1")

# Currently, the sqc (I-Cache of GPU) is shared by
# multiple compute units(CUs). The protocol works just fine
# even if sqc is not shared. Overriding this option here
# so that the user need not explicitly set this (assuming
# sharing sqc is the common usage)
n_cu = options.num_compute_units
num_sqc = int(math.ceil(float(n_cu) / options.cu_per_sqc))
options.num_sqc = num_sqc # pass this to Ruby

########################## Creating the GPU system ########################
# shader is the GPU
shader = Shader(n_wf = options.wfs_per_simd,
                clk_domain = SrcClockDomain(
                    clock = options.GPUClock,
                    voltage_domain = VoltageDomain(
                        voltage = options.gpu_voltage)))

# GPU_RfO(Read For Ownership) implements SC/TSO memory model.
# Other GPU protocols implement release consistency at GPU side.
# So, all GPU protocols other than GPU_RfO should make their writes
# visible to the global memory and should read from global memory
# during kernel boundary. The pipeline initiates (or does not initiate)
# the acquire/release operation depending on this impl_kern_boundary_sync
# flag. This flag=true means pipeline initiates an acquire/release operation
# at kernel boundary.
if buildEnv['PROTOCOL'] == 'GPU_RfO':
    shader.impl_kern_boundary_sync = False
else:
    shader.impl_kern_boundary_sync = True

# Per-lane TLB is off unless the TLB config explicitly asks for it
per_lane = options.TLB_config == "perLane"

# List of compute units; one GPU can have multiple compute units
compute_units = []
for i in range(n_cu):
    cu = ComputeUnit(cu_id = i, perLaneTLB = per_lane,
                     num_SIMDs = options.simds_per_cu,
                     wfSize = options.wf_size,
                     spbypass_pipe_length = options.sp_bypass_path_length,
                     dpbypass_pipe_length = options.dp_bypass_path_length,
                     issue_period = options.issue_period,
                     coalescer_to_vrf_bus_width = options.glbmem_rd_bus_width,
                     vrf_to_coalescer_bus_width = options.glbmem_wr_bus_width,
                     num_global_mem_pipes = options.glb_mem_pipes_per_cu,
                     num_shared_mem_pipes = options.shr_mem_pipes_per_cu,
                     n_wf = options.wfs_per_simd,
                     execPolicy = options.CUExecPolicy,
                     xactCasMode = options.xact_cas_mode,
                     debugSegFault = options.SegFaultDebug,
                     functionalTLB = options.FunctionalTLB,
                     localMemBarrier = options.LocalMemBarrier,
                     countPages = options.countPages,
                     localDataStore = LdsState(
                         banks = options.numLdsBanks,
                         bankConflictPenalty = options.ldsBankConflictPenalty),
                     out_of_order_data_delivery =
                         options.outOfOrderDataDelivery)
    compute_units.append(cu)

    # One wavefront per (SIMD, WF slot) pair and one vector register file
    # per SIMD.
    wavefronts = []
    vrfs = []
    for j in range(options.simds_per_cu):
        for k in range(shader.n_wf):
            wavefronts.append(Wavefront(simdId = j, wf_slot_id = k,
                                        wfSize = options.wf_size))
        vrfs.append(VectorRegisterFile(simd_id=j,
                              num_regs_per_simd=options.vreg_file_size))
    cu.wavefronts = wavefronts
    cu.vector_register_file = vrfs

    # Prefetch parameters are only set when a prefetch depth was requested
    if options.TLB_prefetch:
        cu.prefetch_depth = options.TLB_prefetch
        cu.prefetch_prev_type = options.pf_type

    # attach the LDS and the CU to the bus (actually a Bridge)
    cu.ldsPort = cu.ldsBus.slave
    cu.ldsBus.master = cu.localDataStore.cuPort

# Attach compute units to GPU
shader.CUs = compute_units

########################## Creating the CPU system ########################
# The shader core will be whatever is after the CPU cores are accounted for
shader_idx = options.num_cpus

# The command processor will be whatever is after the shader is accounted for
cp_idx = shader_idx + 1
cp_list = []

# List of CPUs
cpu_list = []

CpuClass, mem_mode = Simulation.getCPUClass(options.cpu_type)
if CpuClass == AtomicSimpleCPU:
    fatal("AtomicSimpleCPU is not supported")
if mem_mode != 'timing':
    fatal("Only the timing memory mode is supported")
shader.timing = True

# The two fast-forward mechanisms are mutually exclusive
if options.fast_forward and options.fast_forward_pseudo_op:
    fatal("Cannot fast-forward based both on the number of instructions and"
          " on pseudo-ops")
fast_forward = options.fast_forward or options.fast_forward_pseudo_op

if fast_forward:
    # Remember the requested CPU model; it takes over after the switch
    FutureCpuClass, future_mem_mode = CpuClass, mem_mode

    CpuClass = X86KvmCPU
    mem_mode = 'atomic_noncaching'
    # Leave shader.timing untouched, because its value only matters at the
    # start of the simulation and because we require switching cpus
    # *before* the first kernel launch.

    future_cpu_list = []

    # Initial CPUs to be used during fast-forwarding.
    for i in range(options.num_cpus):
        cpu = CpuClass(cpu_id = i,
                       clk_domain = SrcClockDomain(
                           clock = options.CPUClock,
                           voltage_domain = VoltageDomain(
                               voltage = options.cpu_voltage)))
        cpu_list.append(cpu)

        # Instruction-count based fast-forward: stop this CPU after the
        # requested number of instructions
        if options.fast_forward:
            cpu.max_insts_any_thread = int(options.fast_forward)

# Model used for the CPs and for the CPUs that run the detailed simulation
MainCpuClass = FutureCpuClass if fast_forward else CpuClass

# CPs to be used throughout the simulation.
for i in range(options.num_cp):
    cp = MainCpuClass(cpu_id = options.num_cpus + i,
                      clk_domain = SrcClockDomain(
                          clock = options.CPUClock,
                          voltage_domain = VoltageDomain(
                              voltage = options.cpu_voltage)))
    cp_list.append(cp)

# Main CPUs (to be used after fast-forwarding if fast-forwarding is specified).
for i in range(options.num_cpus):
    cpu = MainCpuClass(cpu_id = i,
                       clk_domain = SrcClockDomain(
                           clock = options.CPUClock,
                           voltage_domain = VoltageDomain(
                               voltage = options.cpu_voltage)))
    if fast_forward:
        cpu.switched_out = True
        future_cpu_list.append(cpu)
    else:
        cpu_list.append(cpu)

########################## Creating the GPU dispatcher ########################
# Dispatcher dispatches work from host CPU to GPU
host_cpu = cpu_list[0]
dispatcher = GpuDispatcher()

########################## Create and assign the workload ########################
def find_path(base_list, rel_path, test):
    """Return the first base/rel_path for which test(full_path) is true.

    base_list -- iterable of base directories; falsy entries are skipped
    rel_path  -- path joined onto each base directory
    test      -- predicate applied to each joined path

    Calls fatal() when no base directory yields a matching path.
    """
    for base in base_list:
        if not base:
            # base could be None if environment var not set
            continue
        full_path = os.path.join(base, rel_path)
        if test(full_path):
            return full_path
    fatal("%s not found in %s" % (rel_path, base_list))

def find_file(base_list, rel_path):
    """Like find_path(), but only accepts regular files."""
    return find_path(base_list, rel_path, os.path.isfile)

executable = find_path(benchmark_path, options.cmd, os.path.exists)
# it's common for a benchmark to be in a directory with the same
# name as the executable, so we handle that automatically
if os.path.isdir(executable):
    benchmark_path = [executable]
    executable = find_file(benchmark_path, options.cmd)
if options.kernel_files:
    kernel_files = [find_file(benchmark_path, f)
                    for f in options.kernel_files.split(':')]
else:
    # if kernel_files is not set, fall back to the .asm file(s) found
    # in the same directory as the executable
    kernel_path = os.path.dirname(executable)
    kernel_files = glob.glob(os.path.join(kernel_path, '*.asm'))
    if kernel_files:
        print("Using GPU kernel code file(s)", ",".join(kernel_files))
    else:
        fatal("Can't locate kernel code (.asm) in " + kernel_path)

# OpenCL driver
driver = ClDriver(filename="hsa", codefile=kernel_files)
for cpu in cpu_list:
    cpu.createThreads()
    cpu.workload = Process(executable = executable,
                           cmd = [options.cmd] + options.options.split(),
                           drivers = [driver])
for cp in cp_list:
    cp.workload = host_cpu.workload

if fast_forward:
    # Each future CPU shares the workload of the initial CPU it replaces
    for future_cpu, initial_cpu in zip(future_cpu_list, cpu_list):
        future_cpu.workload = initial_cpu.workload
        future_cpu.createThreads()

########################## Create the overall system ########################
# List of CPUs that must be switched when moving between KVM and simulation
if fast_forward:
    # Built before cpu_list is extended below, so at this point cpu_list
    # holds only the initial CPUs.
    switch_cpu_list = list(zip(cpu_list, future_cpu_list))

# Full list of processing cores in the system. Note that
# dispatcher is also added to cpu_list although it is
# not a processing element
cpu_list = cpu_list + [shader] + cp_list + [dispatcher]

# creating the overall system
# notice the cpu list is explicitly added as a parameter to System
system = System(cpu = cpu_list,
                mem_ranges = [AddrRange(options.mem_size)],
                cache_line_size = options.cacheline_size,
                mem_mode = mem_mode)
if fast_forward:
    system.future_cpu = future_cpu_list
system.voltage_domain = VoltageDomain(voltage = options.sys_voltage)
system.clk_domain = SrcClockDomain(clock = options.sys_clock,
                                   voltage_domain = system.voltage_domain)

if fast_forward:
    have_kvm_support = 'BaseKvmCPU' in globals()
    if have_kvm_support and buildEnv['TARGET_ISA'] == "x86":
        system.vm = KvmVM()
        for i in range(len(host_cpu.workload)):
            host_cpu.workload[i].useArchPT = True
            host_cpu.workload[i].kvmInSE = True
    else:
        fatal("KvmCPU can only be used in SE mode with x86")

# configure the TLB hierarchy
GPUTLBConfig.config_tlb_hierarchy(options, system, shader_idx)

# create Ruby system
system.piobus = IOXBar(width=32, response_latency=0,
                       frontend_latency=0, forward_latency=0)
Ruby.create_system(options, None, system)
system.ruby.clk_domain = SrcClockDomain(clock = options.ruby_clock,
                                    voltage_domain = system.voltage_domain)

# attach the CPU ports to Ruby
for i in range(options.num_cpus):
    ruby_port = system.ruby._cpu_ports[i]
    host = system.cpu[i]

    # Create interrupt controller
    host.createInterruptController()

    # Connect cache ports to ruby
    host.icache_port = ruby_port.slave
    host.dcache_port = ruby_port.slave

    ruby_port.mem_master_port = system.piobus.slave
    if buildEnv['TARGET_ISA'] == "x86":
        host.interrupts[0].pio = system.piobus.master
        host.interrupts[0].int_master = system.piobus.slave
        host.interrupts[0].int_slave = system.piobus.master
        # NOTE(review): nesting reconstructed from a flattened source.
        # Fast-forwarding is only permitted on x86 by this script, so the
        # walker ports are wired inside the x86 branch.
        if fast_forward:
            host.itb.walker.port = ruby_port.slave
            host.dtb.walker.port = ruby_port.slave

# attach CU ports to Ruby
# Because of the peculiarities of the CP core, you may have 1 CPU but 2
# sequencers and thus 2 _cpu_ports created. Your GPUs shouldn't be
# hooked up until after the CP. To make this script generic, figure out
# the index as below, but note that this assumes there is one sequencer
# per compute unit and one sequencer per SQC for the math to work out
# correctly.
gpu_port_idx = len(system.ruby._cpu_ports) \
               - options.num_compute_units - options.num_sqc
gpu_port_idx = gpu_port_idx - options.num_cp * 2

wavefront_size = options.wf_size
for i in range(n_cu):
    # The pipeline issues wavefront_size number of uncoalesced requests
    # in one GPU issue cycle. Hence wavefront_size mem ports.
    for j in range(wavefront_size):
        system.cpu[shader_idx].CUs[i].memory_port[j] = \
                  system.ruby._cpu_ports[gpu_port_idx].slave[j]
    # one Ruby port per compute unit
    gpu_port_idx += 1

for i in range(n_cu):
    # Move on to the next SQC port every cu_per_sqc compute units
    if i > 0 and not i % options.cu_per_sqc:
        print("incrementing idx on ", i)
        gpu_port_idx += 1
    system.cpu[shader_idx].CUs[i].sqc_port = \
            system.ruby._cpu_ports[gpu_port_idx].slave
# step past the last SQC port
gpu_port_idx = gpu_port_idx + 1

# attach CP ports to Ruby
for i in range(options.num_cp):
    cp_core = system.cpu[cp_idx]
    cp_core.createInterruptController()
    # each CP takes two consecutive Ruby ports: data first, then instruction
    cp_core.dcache_port = \
                system.ruby._cpu_ports[gpu_port_idx + i * 2].slave
    cp_core.icache_port = \
                system.ruby._cpu_ports[gpu_port_idx + i * 2 + 1].slave
    cp_core.interrupts[0].pio = system.piobus.master
    cp_core.interrupts[0].int_master = system.piobus.slave
    cp_core.interrupts[0].int_slave = system.piobus.master
    cp_idx = cp_idx + 1

# connect dispatcher to the system.piobus
dispatcher.pio = system.piobus.master
dispatcher.dma = system.piobus.slave
################# Connect the CPU and GPU via GPU Dispatcher ###################
# CPU rings the GPU doorbell to notify a pending task
# using this interface.
# And GPU uses this interface to notify the CPU of task completion
# The communication happens through emulated driver.

# Note this implicit setting of the cpu_pointer, shader_pointer and tlb array
# parameters must be after the explicit setting of the System cpu list
if fast_forward:
    shader.cpu_pointer = future_cpu_list[0]
    dispatcher.cpu = future_cpu_list[0]
else:
    shader.cpu_pointer = host_cpu
    dispatcher.cpu = host_cpu
dispatcher.shader_pointer = shader
dispatcher.cl_driver = driver

########################## Start simulation ########################

root = Root(system=system, full_system=False)
m5.ticks.setGlobalFrequency('1THz')
# Run until the user-supplied absolute tick limit, if any; otherwise run
# until the simulator's maximum tick.
if options.abs_max_tick:
    maxtick = options.abs_max_tick
else:
    maxtick = m5.MaxTick

# Benchmarks support work item annotations
Simulation.setWorkCountOptions(system, options)

# Checkpointing is not supported by APU model
if (options.checkpoint_dir is not None or
        options.checkpoint_restore is not None):
    fatal("Checkpointing not supported by apu model")

checkpoint_dir = None
m5.instantiate(checkpoint_dir)

# Map workload to this address space
host_cpu.workload[0].map(0x10000000, 0x200000000, 4096)

if options.fast_forward:
    print("Switch at instruction count: %d" %
          cpu_list[0].max_insts_any_thread)

exit_event = m5.simulate(maxtick)

if options.fast_forward:
    # Single switch: once the fast-forward instruction budget is reached,
    # swap in the other CPUs and simulate the remaining ticks.
    if exit_event.getCause() == "a thread reached the max instruction count":
        m5.switchCpus(system, switch_cpu_list)
        print("Switched CPUS @ tick %s" % (m5.curTick()))
        m5.stats.reset()
        exit_event = m5.simulate(maxtick - m5.curTick())
elif options.fast_forward_pseudo_op:
    # Repeated switching: every "switchcpu" exit toggles between the two
    # CPU sets until the simulation exits for some other reason.
    while exit_event.getCause() == "switchcpu":
        # If we are switching *to* kvm, then the current stats are meaningful
        # Note that we don't do any warmup by default
        if type(switch_cpu_list[0][0]) == FutureCpuClass:
            print("Dumping stats...")
            m5.stats.dump()
        m5.switchCpus(system, switch_cpu_list)
        print("Switched CPUS @ tick %s" % (m5.curTick()))
        m5.stats.reset()
        # This lets us switch back and forth without keeping a counter
        switch_cpu_list = [(x[1], x[0]) for x in switch_cpu_list]
        exit_event = m5.simulate(maxtick - m5.curTick())

print("Ticks:", m5.curTick())
print('Exiting because ', exit_event.getCause())

# `sys` is not among the explicit imports at the top of this script; import
# it here so the exit does not depend on a wildcard import leaking it.
import sys
sys.exit(exit_event.getCode())