Cross Reference: /gem5/configs/example/apu

Deleted Added

sdiff udiff text old ( 12697:cd71b966be1e ) new ( 13731:67cd980cb20f )

full compact

apu_se.py (12697:cd71b966be1e)	apu_se.py (13731:67cd980cb20f)
1# Copyright (c) 2015 Advanced Micro Devices, Inc. 2# All rights reserved. 3# 4# For use for simulation and test purposes only 5# 6# Redistribution and use in source and binary forms, with or without 7# modification, are permitted provided that the following conditions are met: 8# --- 211 unchanged lines hidden (view full) --- 220 221# Switching off per-lane TLB by default 222per_lane = False 223if options.TLB_config == "perLane": 224 per_lane = True 225 226# List of compute units; one GPU can have multiple compute units 227compute_units = []	1# Copyright (c) 2015 Advanced Micro Devices, Inc. 2# All rights reserved. 3# 4# For use for simulation and test purposes only 5# 6# Redistribution and use in source and binary forms, with or without 7# modification, are permitted provided that the following conditions are met: 8# --- 211 unchanged lines hidden (view full) --- 220 221# Switching off per-lane TLB by default 222per_lane = False 223if options.TLB_config == "perLane": 224 per_lane = True 225 226# List of compute units; one GPU can have multiple compute units 227compute_units = []
228for i in xrange(n_cu):	228for i in range(n_cu):
229 compute_units.append(ComputeUnit(cu_id = i, perLaneTLB = per_lane, 230 num_SIMDs = options.simds_per_cu, 231 wfSize = options.wf_size, 232 spbypass_pipe_length = options.sp_bypass_path_length, 233 dpbypass_pipe_length = options.dp_bypass_path_length, 234 issue_period = options.issue_period, 235 coalescer_to_vrf_bus_width = \ 236 options.glbmem_rd_bus_width, --- 13 unchanged lines hidden (view full) --- 250 localDataStore = \ 251 LdsState(banks = options.numLdsBanks, 252 bankConflictPenalty = \ 253 options.ldsBankConflictPenalty), 254 out_of_order_data_delivery = 255 options.outOfOrderDataDelivery)) 256 wavefronts = [] 257 vrfs = []	229 compute_units.append(ComputeUnit(cu_id = i, perLaneTLB = per_lane, 230 num_SIMDs = options.simds_per_cu, 231 wfSize = options.wf_size, 232 spbypass_pipe_length = options.sp_bypass_path_length, 233 dpbypass_pipe_length = options.dp_bypass_path_length, 234 issue_period = options.issue_period, 235 coalescer_to_vrf_bus_width = \ 236 options.glbmem_rd_bus_width, --- 13 unchanged lines hidden (view full) --- 250 localDataStore = \ 251 LdsState(banks = options.numLdsBanks, 252 bankConflictPenalty = \ 253 options.ldsBankConflictPenalty), 254 out_of_order_data_delivery = 255 options.outOfOrderDataDelivery)) 256 wavefronts = [] 257 vrfs = []
258 for j in xrange(options.simds_per_cu): 259 for k in xrange(shader.n_wf):	258 for j in range(options.simds_per_cu): 259 for k in range(shader.n_wf):
260 wavefronts.append(Wavefront(simdId = j, wf_slot_id = k, 261 wfSize = options.wf_size)) 262 vrfs.append(VectorRegisterFile(simd_id=j, 263 num_regs_per_simd=options.vreg_file_size)) 264 compute_units[-1].wavefronts = wavefronts 265 compute_units[-1].vector_register_file = vrfs 266 if options.TLB_prefetch: 267 compute_units[-1].prefetch_depth = options.TLB_prefetch --- 38 unchanged lines hidden (view full) --- 306 mem_mode = 'atomic_noncaching' 307 # Leave shader.timing untouched, because its value only matters at the 308 # start of the simulation and because we require switching cpus 309 # before the first kernel launch. 310 311 future_cpu_list = [] 312 313 # Initial CPUs to be used during fast-forwarding.	260 wavefronts.append(Wavefront(simdId = j, wf_slot_id = k, 261 wfSize = options.wf_size)) 262 vrfs.append(VectorRegisterFile(simd_id=j, 263 num_regs_per_simd=options.vreg_file_size)) 264 compute_units[-1].wavefronts = wavefronts 265 compute_units[-1].vector_register_file = vrfs 266 if options.TLB_prefetch: 267 compute_units[-1].prefetch_depth = options.TLB_prefetch --- 38 unchanged lines hidden (view full) --- 306 mem_mode = 'atomic_noncaching' 307 # Leave shader.timing untouched, because its value only matters at the 308 # start of the simulation and because we require switching cpus 309 # before the first kernel launch. 310 311 future_cpu_list = [] 312 313 # Initial CPUs to be used during fast-forwarding.
314 for i in xrange(options.num_cpus):	314 for i in range(options.num_cpus):
315 cpu = CpuClass(cpu_id = i, 316 clk_domain = SrcClockDomain( 317 clock = options.CPUClock, 318 voltage_domain = VoltageDomain( 319 voltage = options.cpu_voltage))) 320 cpu_list.append(cpu) 321 322 if options.fast_forward: 323 cpu.max_insts_any_thread = int(options.fast_forward) 324 325if fast_forward: 326 MainCpuClass = FutureCpuClass 327else: 328 MainCpuClass = CpuClass 329 330# CPs to be used throughout the simulation.	315 cpu = CpuClass(cpu_id = i, 316 clk_domain = SrcClockDomain( 317 clock = options.CPUClock, 318 voltage_domain = VoltageDomain( 319 voltage = options.cpu_voltage))) 320 cpu_list.append(cpu) 321 322 if options.fast_forward: 323 cpu.max_insts_any_thread = int(options.fast_forward) 324 325if fast_forward: 326 MainCpuClass = FutureCpuClass 327else: 328 MainCpuClass = CpuClass 329 330# CPs to be used throughout the simulation.
331for i in xrange(options.num_cp):	331for i in range(options.num_cp):
332 cp = MainCpuClass(cpu_id = options.num_cpus + i, 333 clk_domain = SrcClockDomain( 334 clock = options.CPUClock, 335 voltage_domain = VoltageDomain( 336 voltage = options.cpu_voltage))) 337 cp_list.append(cp) 338 339# Main CPUs (to be used after fast-forwarding if fast-forwarding is specified).	332 cp = MainCpuClass(cpu_id = options.num_cpus + i, 333 clk_domain = SrcClockDomain( 334 clock = options.CPUClock, 335 voltage_domain = VoltageDomain( 336 voltage = options.cpu_voltage))) 337 cp_list.append(cp) 338 339# Main CPUs (to be used after fast-forwarding if fast-forwarding is specified).
340for i in xrange(options.num_cpus):	340for i in range(options.num_cpus):
341 cpu = MainCpuClass(cpu_id = i, 342 clk_domain = SrcClockDomain( 343 clock = options.CPUClock, 344 voltage_domain = VoltageDomain( 345 voltage = options.cpu_voltage))) 346 if fast_forward: 347 cpu.switched_out = True 348 future_cpu_list.append(cpu) --- 46 unchanged lines hidden (view full) --- 395 cpu.createThreads() 396 cpu.workload = Process(executable = executable, 397 cmd = [options.cmd] + options.options.split(), 398 drivers = [driver]) 399for cp in cp_list: 400 cp.workload = host_cpu.workload 401 402if fast_forward:	341 cpu = MainCpuClass(cpu_id = i, 342 clk_domain = SrcClockDomain( 343 clock = options.CPUClock, 344 voltage_domain = VoltageDomain( 345 voltage = options.cpu_voltage))) 346 if fast_forward: 347 cpu.switched_out = True 348 future_cpu_list.append(cpu) --- 46 unchanged lines hidden (view full) --- 395 cpu.createThreads() 396 cpu.workload = Process(executable = executable, 397 cmd = [options.cmd] + options.options.split(), 398 drivers = [driver]) 399for cp in cp_list: 400 cp.workload = host_cpu.workload 401 402if fast_forward:
403 for i in xrange(len(future_cpu_list)):	403 for i in range(len(future_cpu_list)):
404 future_cpu_list[i].workload = cpu_list[i].workload 405 future_cpu_list[i].createThreads() 406 407########################## Create the overall system ######################## 408# List of CPUs that must be switched when moving between KVM and simulation 409if fast_forward: 410 switch_cpu_list = \	404 future_cpu_list[i].workload = cpu_list[i].workload 405 future_cpu_list[i].createThreads() 406 407########################## Create the overall system ######################## 408# List of CPUs that must be switched when moving between KVM and simulation 409if fast_forward: 410 switch_cpu_list = \
411 [(cpu_list[i], future_cpu_list[i]) for i in xrange(options.num_cpus)]	411 [(cpu_list[i], future_cpu_list[i]) for i in range(options.num_cpus)]
412 413# Full list of processing cores in the system. Note that 414# dispatcher is also added to cpu_list although it is 415# not a processing element 416cpu_list = cpu_list + [shader] + cp_list + [dispatcher] 417 418# creating the overall system 419# notice the cpu list is explicitly added as a parameter to System --- 6 unchanged lines hidden (view full) --- 426system.voltage_domain = VoltageDomain(voltage = options.sys_voltage) 427system.clk_domain = SrcClockDomain(clock = options.sys_clock, 428 voltage_domain = system.voltage_domain) 429 430if fast_forward: 431 have_kvm_support = 'BaseKvmCPU' in globals() 432 if have_kvm_support and buildEnv['TARGET_ISA'] == "x86": 433 system.vm = KvmVM()	412 413# Full list of processing cores in the system. Note that 414# dispatcher is also added to cpu_list although it is 415# not a processing element 416cpu_list = cpu_list + [shader] + cp_list + [dispatcher] 417 418# creating the overall system 419# notice the cpu list is explicitly added as a parameter to System --- 6 unchanged lines hidden (view full) --- 426system.voltage_domain = VoltageDomain(voltage = options.sys_voltage) 427system.clk_domain = SrcClockDomain(clock = options.sys_clock, 428 voltage_domain = system.voltage_domain) 429 430if fast_forward: 431 have_kvm_support = 'BaseKvmCPU' in globals() 432 if have_kvm_support and buildEnv['TARGET_ISA'] == "x86": 433 system.vm = KvmVM()
434 for i in xrange(len(host_cpu.workload)):	434 for i in range(len(host_cpu.workload)):
435 host_cpu.workload[i].useArchPT = True 436 host_cpu.workload[i].kvmInSE = True 437 else: 438 fatal("KvmCPU can only be used in SE mode with x86") 439 440# configure the TLB hierarchy 441GPUTLBConfig.config_tlb_hierarchy(options, system, shader_idx) 442 --- 31 unchanged lines hidden (view full) --- 474# the index as below, but note that this assumes there is one sequencer 475# per compute unit and one sequencer per SQC for the math to work out 476# correctly. 477gpu_port_idx = len(system.ruby._cpu_ports) \ 478 - options.num_compute_units - options.num_sqc 479gpu_port_idx = gpu_port_idx - options.num_cp * 2 480 481wavefront_size = options.wf_size	435 host_cpu.workload[i].useArchPT = True 436 host_cpu.workload[i].kvmInSE = True 437 else: 438 fatal("KvmCPU can only be used in SE mode with x86") 439 440# configure the TLB hierarchy 441GPUTLBConfig.config_tlb_hierarchy(options, system, shader_idx) 442 --- 31 unchanged lines hidden (view full) --- 474# the index as below, but note that this assumes there is one sequencer 475# per compute unit and one sequencer per SQC for the math to work out 476# correctly. 477gpu_port_idx = len(system.ruby._cpu_ports) \ 478 - options.num_compute_units - options.num_sqc 479gpu_port_idx = gpu_port_idx - options.num_cp * 2 480 481wavefront_size = options.wf_size
482for i in xrange(n_cu):	482for i in range(n_cu):
483 # The pipeline issues wavefront_size number of uncoalesced requests 484 # in one GPU issue cycle. Hence wavefront_size mem ports.	483 # The pipeline issues wavefront_size number of uncoalesced requests 484 # in one GPU issue cycle. Hence wavefront_size mem ports.
485 for j in xrange(wavefront_size):	485 for j in range(wavefront_size):
486 system.cpu[shader_idx].CUs[i].memory_port[j] = \ 487 system.ruby._cpu_ports[gpu_port_idx].slave[j] 488 gpu_port_idx += 1 489	486 system.cpu[shader_idx].CUs[i].memory_port[j] = \ 487 system.ruby._cpu_ports[gpu_port_idx].slave[j] 488 gpu_port_idx += 1 489
490for i in xrange(n_cu):	490for i in range(n_cu):
491 if i > 0 and not i % options.cu_per_sqc: 492 print("incrementing idx on ", i) 493 gpu_port_idx += 1 494 system.cpu[shader_idx].CUs[i].sqc_port = \ 495 system.ruby._cpu_ports[gpu_port_idx].slave 496gpu_port_idx = gpu_port_idx + 1 497 498# attach CP ports to Ruby	491 if i > 0 and not i % options.cu_per_sqc: 492 print("incrementing idx on ", i) 493 gpu_port_idx += 1 494 system.cpu[shader_idx].CUs[i].sqc_port = \ 495 system.ruby._cpu_ports[gpu_port_idx].slave 496gpu_port_idx = gpu_port_idx + 1 497 498# attach CP ports to Ruby
499for i in xrange(options.num_cp):	499for i in range(options.num_cp):
500 system.cpu[cp_idx].createInterruptController() 501 system.cpu[cp_idx].dcache_port = \ 502 system.ruby._cpu_ports[gpu_port_idx + i * 2].slave 503 system.cpu[cp_idx].icache_port = \ 504 system.ruby._cpu_ports[gpu_port_idx + i * 2 + 1].slave 505 system.cpu[cp_idx].interrupts[0].pio = system.piobus.master 506 system.cpu[cp_idx].interrupts[0].int_master = system.piobus.slave 507 system.cpu[cp_idx].interrupts[0].int_slave = system.piobus.master --- 74 unchanged lines hidden ---	500 system.cpu[cp_idx].createInterruptController() 501 system.cpu[cp_idx].dcache_port = \ 502 system.ruby._cpu_ports[gpu_port_idx + i * 2].slave 503 system.cpu[cp_idx].icache_port = \ 504 system.ruby._cpu_ports[gpu_port_idx + i * 2 + 1].slave 505 system.cpu[cp_idx].interrupts[0].pio = system.piobus.master 506 system.cpu[cp_idx].interrupts[0].int_master = system.piobus.slave 507 system.cpu[cp_idx].interrupts[0].int_slave = system.piobus.master --- 74 unchanged lines hidden ---