apu_se.py (12697:cd71b966be1e) apu_se.py (13731:67cd980cb20f)
1# Copyright (c) 2015 Advanced Micro Devices, Inc.
2# All rights reserved.
3#
4# For use for simulation and test purposes only
5#
6# Redistribution and use in source and binary forms, with or without
7# modification, are permitted provided that the following conditions are met:
8#

--- 211 unchanged lines hidden (view full) ---

220
221# Switching off per-lane TLB by default
222per_lane = False
223if options.TLB_config == "perLane":
224 per_lane = True
225
226# List of compute units; one GPU can have multiple compute units
227compute_units = []
1# Copyright (c) 2015 Advanced Micro Devices, Inc.
2# All rights reserved.
3#
4# For use for simulation and test purposes only
5#
6# Redistribution and use in source and binary forms, with or without
7# modification, are permitted provided that the following conditions are met:
8#

--- 211 unchanged lines hidden (view full) ---

220
221# Switching off per-lane TLB by default
222per_lane = False
223if options.TLB_config == "perLane":
224 per_lane = True
225
226# List of compute units; one GPU can have multiple compute units
227compute_units = []
228for i in xrange(n_cu):
228for i in range(n_cu):
229 compute_units.append(ComputeUnit(cu_id = i, perLaneTLB = per_lane,
230 num_SIMDs = options.simds_per_cu,
231 wfSize = options.wf_size,
232 spbypass_pipe_length = options.sp_bypass_path_length,
233 dpbypass_pipe_length = options.dp_bypass_path_length,
234 issue_period = options.issue_period,
235 coalescer_to_vrf_bus_width = \
236 options.glbmem_rd_bus_width,

--- 13 unchanged lines hidden (view full) ---

250 localDataStore = \
251 LdsState(banks = options.numLdsBanks,
252 bankConflictPenalty = \
253 options.ldsBankConflictPenalty),
254 out_of_order_data_delivery =
255 options.outOfOrderDataDelivery))
256 wavefronts = []
257 vrfs = []
229 compute_units.append(ComputeUnit(cu_id = i, perLaneTLB = per_lane,
230 num_SIMDs = options.simds_per_cu,
231 wfSize = options.wf_size,
232 spbypass_pipe_length = options.sp_bypass_path_length,
233 dpbypass_pipe_length = options.dp_bypass_path_length,
234 issue_period = options.issue_period,
235 coalescer_to_vrf_bus_width = \
236 options.glbmem_rd_bus_width,

--- 13 unchanged lines hidden (view full) ---

250 localDataStore = \
251 LdsState(banks = options.numLdsBanks,
252 bankConflictPenalty = \
253 options.ldsBankConflictPenalty),
254 out_of_order_data_delivery =
255 options.outOfOrderDataDelivery))
256 wavefronts = []
257 vrfs = []
258 for j in xrange(options.simds_per_cu):
259 for k in xrange(shader.n_wf):
258 for j in range(options.simds_per_cu):
259 for k in range(shader.n_wf):
260 wavefronts.append(Wavefront(simdId = j, wf_slot_id = k,
261 wfSize = options.wf_size))
262 vrfs.append(VectorRegisterFile(simd_id=j,
263 num_regs_per_simd=options.vreg_file_size))
264 compute_units[-1].wavefronts = wavefronts
265 compute_units[-1].vector_register_file = vrfs
266 if options.TLB_prefetch:
267 compute_units[-1].prefetch_depth = options.TLB_prefetch

--- 38 unchanged lines hidden (view full) ---

306 mem_mode = 'atomic_noncaching'
307 # Leave shader.timing untouched, because its value only matters at the
308 # start of the simulation and because we require switching cpus
309 # *before* the first kernel launch.
310
311 future_cpu_list = []
312
313 # Initial CPUs to be used during fast-forwarding.
260 wavefronts.append(Wavefront(simdId = j, wf_slot_id = k,
261 wfSize = options.wf_size))
262 vrfs.append(VectorRegisterFile(simd_id=j,
263 num_regs_per_simd=options.vreg_file_size))
264 compute_units[-1].wavefronts = wavefronts
265 compute_units[-1].vector_register_file = vrfs
266 if options.TLB_prefetch:
267 compute_units[-1].prefetch_depth = options.TLB_prefetch

--- 38 unchanged lines hidden (view full) ---

306 mem_mode = 'atomic_noncaching'
307 # Leave shader.timing untouched, because its value only matters at the
308 # start of the simulation and because we require switching cpus
309 # *before* the first kernel launch.
310
311 future_cpu_list = []
312
313 # Initial CPUs to be used during fast-forwarding.
314 for i in xrange(options.num_cpus):
314 for i in range(options.num_cpus):
315 cpu = CpuClass(cpu_id = i,
316 clk_domain = SrcClockDomain(
317 clock = options.CPUClock,
318 voltage_domain = VoltageDomain(
319 voltage = options.cpu_voltage)))
320 cpu_list.append(cpu)
321
322 if options.fast_forward:
323 cpu.max_insts_any_thread = int(options.fast_forward)
324
325if fast_forward:
326 MainCpuClass = FutureCpuClass
327else:
328 MainCpuClass = CpuClass
329
330# CPs to be used throughout the simulation.
315 cpu = CpuClass(cpu_id = i,
316 clk_domain = SrcClockDomain(
317 clock = options.CPUClock,
318 voltage_domain = VoltageDomain(
319 voltage = options.cpu_voltage)))
320 cpu_list.append(cpu)
321
322 if options.fast_forward:
323 cpu.max_insts_any_thread = int(options.fast_forward)
324
325if fast_forward:
326 MainCpuClass = FutureCpuClass
327else:
328 MainCpuClass = CpuClass
329
330# CPs to be used throughout the simulation.
331for i in xrange(options.num_cp):
331for i in range(options.num_cp):
332 cp = MainCpuClass(cpu_id = options.num_cpus + i,
333 clk_domain = SrcClockDomain(
334 clock = options.CPUClock,
335 voltage_domain = VoltageDomain(
336 voltage = options.cpu_voltage)))
337 cp_list.append(cp)
338
339# Main CPUs (to be used after fast-forwarding if fast-forwarding is specified).
332 cp = MainCpuClass(cpu_id = options.num_cpus + i,
333 clk_domain = SrcClockDomain(
334 clock = options.CPUClock,
335 voltage_domain = VoltageDomain(
336 voltage = options.cpu_voltage)))
337 cp_list.append(cp)
338
339# Main CPUs (to be used after fast-forwarding if fast-forwarding is specified).
340for i in xrange(options.num_cpus):
340for i in range(options.num_cpus):
341 cpu = MainCpuClass(cpu_id = i,
342 clk_domain = SrcClockDomain(
343 clock = options.CPUClock,
344 voltage_domain = VoltageDomain(
345 voltage = options.cpu_voltage)))
346 if fast_forward:
347 cpu.switched_out = True
348 future_cpu_list.append(cpu)

--- 46 unchanged lines hidden (view full) ---

395 cpu.createThreads()
396 cpu.workload = Process(executable = executable,
397 cmd = [options.cmd] + options.options.split(),
398 drivers = [driver])
399for cp in cp_list:
400 cp.workload = host_cpu.workload
401
402if fast_forward:
341 cpu = MainCpuClass(cpu_id = i,
342 clk_domain = SrcClockDomain(
343 clock = options.CPUClock,
344 voltage_domain = VoltageDomain(
345 voltage = options.cpu_voltage)))
346 if fast_forward:
347 cpu.switched_out = True
348 future_cpu_list.append(cpu)

--- 46 unchanged lines hidden (view full) ---

395 cpu.createThreads()
396 cpu.workload = Process(executable = executable,
397 cmd = [options.cmd] + options.options.split(),
398 drivers = [driver])
399for cp in cp_list:
400 cp.workload = host_cpu.workload
401
402if fast_forward:
403 for i in xrange(len(future_cpu_list)):
403 for i in range(len(future_cpu_list)):
404 future_cpu_list[i].workload = cpu_list[i].workload
405 future_cpu_list[i].createThreads()
406
407########################## Create the overall system ########################
408# List of CPUs that must be switched when moving between KVM and simulation
409if fast_forward:
410 switch_cpu_list = \
404 future_cpu_list[i].workload = cpu_list[i].workload
405 future_cpu_list[i].createThreads()
406
407########################## Create the overall system ########################
408# List of CPUs that must be switched when moving between KVM and simulation
409if fast_forward:
410 switch_cpu_list = \
411 [(cpu_list[i], future_cpu_list[i]) for i in xrange(options.num_cpus)]
411 [(cpu_list[i], future_cpu_list[i]) for i in range(options.num_cpus)]
412
413# Full list of processing cores in the system. Note that
414# dispatcher is also added to cpu_list although it is
415# not a processing element
416cpu_list = cpu_list + [shader] + cp_list + [dispatcher]
417
418# creating the overall system
419# notice the cpu list is explicitly added as a parameter to System

--- 6 unchanged lines hidden (view full) ---

426system.voltage_domain = VoltageDomain(voltage = options.sys_voltage)
427system.clk_domain = SrcClockDomain(clock = options.sys_clock,
428 voltage_domain = system.voltage_domain)
429
430if fast_forward:
431 have_kvm_support = 'BaseKvmCPU' in globals()
432 if have_kvm_support and buildEnv['TARGET_ISA'] == "x86":
433 system.vm = KvmVM()
412
413# Full list of processing cores in the system. Note that
414# dispatcher is also added to cpu_list although it is
415# not a processing element
416cpu_list = cpu_list + [shader] + cp_list + [dispatcher]
417
418# creating the overall system
419# notice the cpu list is explicitly added as a parameter to System

--- 6 unchanged lines hidden (view full) ---

426system.voltage_domain = VoltageDomain(voltage = options.sys_voltage)
427system.clk_domain = SrcClockDomain(clock = options.sys_clock,
428 voltage_domain = system.voltage_domain)
429
430if fast_forward:
431 have_kvm_support = 'BaseKvmCPU' in globals()
432 if have_kvm_support and buildEnv['TARGET_ISA'] == "x86":
433 system.vm = KvmVM()
434 for i in xrange(len(host_cpu.workload)):
434 for i in range(len(host_cpu.workload)):
435 host_cpu.workload[i].useArchPT = True
436 host_cpu.workload[i].kvmInSE = True
437 else:
438 fatal("KvmCPU can only be used in SE mode with x86")
439
440# configure the TLB hierarchy
441GPUTLBConfig.config_tlb_hierarchy(options, system, shader_idx)
442

--- 31 unchanged lines hidden (view full) ---

474# the index as below, but note that this assumes there is one sequencer
475# per compute unit and one sequencer per SQC for the math to work out
476# correctly.
477gpu_port_idx = len(system.ruby._cpu_ports) \
478 - options.num_compute_units - options.num_sqc
479gpu_port_idx = gpu_port_idx - options.num_cp * 2
480
481wavefront_size = options.wf_size
435 host_cpu.workload[i].useArchPT = True
436 host_cpu.workload[i].kvmInSE = True
437 else:
438 fatal("KvmCPU can only be used in SE mode with x86")
439
440# configure the TLB hierarchy
441GPUTLBConfig.config_tlb_hierarchy(options, system, shader_idx)
442

--- 31 unchanged lines hidden (view full) ---

474# the index as below, but note that this assumes there is one sequencer
475# per compute unit and one sequencer per SQC for the math to work out
476# correctly.
477gpu_port_idx = len(system.ruby._cpu_ports) \
478 - options.num_compute_units - options.num_sqc
479gpu_port_idx = gpu_port_idx - options.num_cp * 2
480
481wavefront_size = options.wf_size
482for i in xrange(n_cu):
482for i in range(n_cu):
483 # The pipeline issues wavefront_size number of uncoalesced requests
484 # in one GPU issue cycle. Hence wavefront_size mem ports.
483 # The pipeline issues wavefront_size number of uncoalesced requests
484 # in one GPU issue cycle. Hence wavefront_size mem ports.
485 for j in xrange(wavefront_size):
485 for j in range(wavefront_size):
486 system.cpu[shader_idx].CUs[i].memory_port[j] = \
487 system.ruby._cpu_ports[gpu_port_idx].slave[j]
488 gpu_port_idx += 1
489
486 system.cpu[shader_idx].CUs[i].memory_port[j] = \
487 system.ruby._cpu_ports[gpu_port_idx].slave[j]
488 gpu_port_idx += 1
489
490for i in xrange(n_cu):
490for i in range(n_cu):
491 if i > 0 and not i % options.cu_per_sqc:
492 print("incrementing idx on ", i)
493 gpu_port_idx += 1
494 system.cpu[shader_idx].CUs[i].sqc_port = \
495 system.ruby._cpu_ports[gpu_port_idx].slave
496gpu_port_idx = gpu_port_idx + 1
497
498# attach CP ports to Ruby
491 if i > 0 and not i % options.cu_per_sqc:
492 print("incrementing idx on ", i)
493 gpu_port_idx += 1
494 system.cpu[shader_idx].CUs[i].sqc_port = \
495 system.ruby._cpu_ports[gpu_port_idx].slave
496gpu_port_idx = gpu_port_idx + 1
497
498# attach CP ports to Ruby
499for i in xrange(options.num_cp):
499for i in range(options.num_cp):
500 system.cpu[cp_idx].createInterruptController()
501 system.cpu[cp_idx].dcache_port = \
502 system.ruby._cpu_ports[gpu_port_idx + i * 2].slave
503 system.cpu[cp_idx].icache_port = \
504 system.ruby._cpu_ports[gpu_port_idx + i * 2 + 1].slave
505 system.cpu[cp_idx].interrupts[0].pio = system.piobus.master
506 system.cpu[cp_idx].interrupts[0].int_master = system.piobus.slave
507 system.cpu[cp_idx].interrupts[0].int_slave = system.piobus.master

--- 74 unchanged lines hidden ---
500 system.cpu[cp_idx].createInterruptController()
501 system.cpu[cp_idx].dcache_port = \
502 system.ruby._cpu_ports[gpu_port_idx + i * 2].slave
503 system.cpu[cp_idx].icache_port = \
504 system.ruby._cpu_ports[gpu_port_idx + i * 2 + 1].slave
505 system.cpu[cp_idx].interrupts[0].pio = system.piobus.master
506 system.cpu[cp_idx].interrupts[0].int_master = system.piobus.slave
507 system.cpu[cp_idx].interrupts[0].int_slave = system.piobus.master

--- 74 unchanged lines hidden ---