Cross Reference: /gem5/ext/mcpat/array.cc

Deleted Added

sdiff udiff text old ( 10152:52c552138ba1 ) new ( 10234:5cb711fa6176 )

full compact

4a5
> * Copyright (c) 2010-2013 Advanced Micro Devices, Inc.
28c29
< * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
---
> * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32,34d32
< #define GLOBALVAR
< #include <cassert>
< #include <cmath>
35a34
> #include <math.h>
38a38
> #include "common.h"
40d39
< #include "globalvar.h"
45,57c44,62
< ArrayST::ArrayST(const InputParameter *configure_interface,
< string _name,
< enum Device_ty device_ty_,
< bool opt_local_,
< enum Core_type core_ty_,
< bool _is_default)
< :l_ip(*configure_interface),
< name(_name),
< device_ty(device_ty_),
< opt_local(opt_local_),
< core_ty(core_ty_),
< is_default(_is_default)
< {
---
> double ArrayST::area_efficiency_threshold = 20.0;
> int ArrayST::ed = 0;
> //Fixed number, make sure timing can be satisfied.
> int ArrayST::delay_wt = 100;
> int ArrayST::cycle_time_wt = 1000;
> //Fixed number, This is used to exhaustive search for individual components.
> int ArrayST::area_wt = 10;
> //Fixed number, This is used to exhaustive search for individual components.
> int ArrayST::dynamic_power_wt = 10;
> int ArrayST::leakage_power_wt = 10;
> //Fixed number, make sure timing can be satisfied.
> int ArrayST::delay_dev = 1000000;
> int ArrayST::cycle_time_dev = 100;
> //Fixed number, This is used to exhaustive search for individual components.
> int ArrayST::area_dev = 1000000;
> //Fixed number, This is used to exhaustive search for individual components.
> int ArrayST::dynamic_power_dev = 1000000;
> int ArrayST::leakage_power_dev = 1000000;
> int ArrayST::cycle_time_dev_threshold = 10;
59,61d63
< if (l_ip.cache_sz<64) l_ip.cache_sz=64;
< l_ip.error_checking();//not only do the error checking but also fill some missing parameters
< optimize_array();
63c65,75
< }
---
> ArrayST::ArrayST(XMLNode* _xml_data,
> const InputParameter *configure_interface, string _name,
> enum Device_ty device_ty_, double _clockRate,
> bool opt_local_, enum Core_type core_ty_, bool _is_default)
> : McPATComponent(_xml_data), l_ip(*configure_interface),
> device_ty(device_ty_), opt_local(opt_local_), core_ty(core_ty_),
> is_default(_is_default) {
> name = _name;
> clockRate = _clockRate;
> if (l_ip.cache_sz < MIN_BUFFER_SIZE)
> l_ip.cache_sz = MIN_BUFFER_SIZE;
65,70c77,78
<
< void ArrayST::compute_base_power()
< {
< //l_ip.out_w =l_ip.line_sz*8;
< local_result=cacti_interface(&l_ip);
<
---
> if (!l_ip.error_checking(name)) {
> exit(1);
73,76c81
< void ArrayST::optimize_array()
< {
< list<uca_org_t > candidate_solutions(0);
< list<uca_org_t >::iterator candidate_iter, min_dynamic_energy_iter;
---
> output_data.reset();
78,79c83,85
< uca_org_t * temp_res = 0;
< local_result.valid=false;
---
> computeEnergy();
> computeArea();
> }
81,84c87,89
< double throughput=l_ip.throughput, latency=l_ip.latency;
< double area_efficiency_threshold = 20.0;
< bool throughput_overflow=true, latency_overflow=true;
< compute_base_power();
---
> void ArrayST::compute_base_power() {
> local_result = cacti_interface(&l_ip);
> }
86,89c91,94
< if ((local_result.cycle_time - throughput) <= 1e-10 )
< throughput_overflow=false;
< if ((local_result.access_time - latency)<= 1e-10)
< latency_overflow=false;
---
> void ArrayST::computeArea() {
> area.set_area(local_result.area);
> output_data.area = local_result.area / 1e6;
> }
91,95c96,98
< if (opt_for_clk && opt_local)
< {
< if (throughput_overflow || latency_overflow)
< {
< l_ip.ed=0;
---
> void ArrayST::computeEnergy() {
> list<uca_org_t > candidate_solutions(0);
> list<uca_org_t >::iterator candidate_iter, min_dynamic_energy_iter;
97,98c100,101
< l_ip.delay_wt = 100;//Fixed number, make sure timing can be satisfied.
< l_ip.cycle_time_wt = 1000;
---
> uca_org_t* temp_res = NULL;
> local_result.valid = false;
100,102c103,107
< l_ip.area_wt = 10;//Fixed number, This is used to exhaustive search for individual components.
< l_ip.dynamic_power_wt = 10;//Fixed number, This is used to exhaustive search for individual components.
< l_ip.leakage_power_wt = 10;
---
> double throughput = l_ip.throughput;
> double latency = l_ip.latency;
> bool throughput_overflow = true;
> bool latency_overflow = true;
> compute_base_power();
104,105c109,112
< l_ip.delay_dev = 1000000;//Fixed number, make sure timing can be satisfied.
< l_ip.cycle_time_dev = 100;
---
> if ((local_result.cycle_time - throughput) <= 1e-10 )
> throughput_overflow = false;
> if ((local_result.access_time - latency) <= 1e-10)
> latency_overflow = false;
107,109c114,116
< l_ip.area_dev = 1000000;//Fixed number, This is used to exhaustive search for individual components.
< l_ip.dynamic_power_dev = 1000000;//Fixed number, This is used to exhaustive search for individual components.
< l_ip.leakage_power_dev = 1000000;
---
> if (opt_for_clk && opt_local) {
> if (throughput_overflow || latency_overflow) {
> l_ip.ed = ed;
111,112c118,119
< throughput_overflow=true; //Reset overflow flag before start optimization iterations
< latency_overflow=true;
---
> l_ip.delay_wt = delay_wt;
> l_ip.cycle_time_wt = cycle_time_wt;
114,116c121,123
< temp_res = &local_result; //Clean up the result for optimized for ED^2P
< temp_res->cleanup();
< }
---
> l_ip.area_wt = area_wt;
> l_ip.dynamic_power_wt = dynamic_power_wt;
> l_ip.leakage_power_wt = leakage_power_wt;
117a125,126
> l_ip.delay_dev = delay_dev;
> l_ip.cycle_time_dev = cycle_time_dev;
119,121c128,130
< while ((throughput_overflow || latency_overflow)&&l_ip.cycle_time_dev > 10)// && l_ip.delay_dev > 10
< {
< compute_base_power();
---
> l_ip.area_dev = area_dev;
> l_ip.dynamic_power_dev = dynamic_power_dev;
> l_ip.leakage_power_dev = leakage_power_dev;
123c132,134
< l_ip.cycle_time_dev-=10;//This is the time_dev to be used for next iteration
---
> //Reset overflow flag before start optimization iterations
> throughput_overflow = true;
> latency_overflow = true;
125,136c136,139
< // from best area to worst area -->worst timing to best timing
< if ((((local_result.cycle_time - throughput) <= 1e-10 ) && (local_result.access_time - latency)<= 1e-10)||
< (local_result.data_array2->area_efficiency < area_efficiency_threshold && l_ip.assoc == 0))
< { //if no satisfiable solution is found,the most aggressive one is left
< candidate_solutions.push_back(local_result);
< //output_data_csv(candidate_solutions.back());
< if (((local_result.cycle_time - throughput) <= 1e-10) && ((local_result.access_time - latency)<= 1e-10))
< //ensure stop opt not because of cam
< {
< throughput_overflow=false;
< latency_overflow=false;
< }
---
> //Clean up the result for optimized for ED^2P
> temp_res = &local_result;
> temp_res->cleanup();
> }
138,145d140
< }
< else
< {
< //TODO: whether checking the partial satisfied results too, or just change the mark???
< if ((local_result.cycle_time - throughput) <= 1e-10)
< throughput_overflow=false;
< if ((local_result.access_time - latency)<= 1e-10)
< latency_overflow=false;
147,154c142,144
< if (l_ip.cycle_time_dev > 10)
< { //if not >10 local_result is the last result, it cannot be cleaned up
< temp_res = &local_result; //Only solutions not saved in the list need to be cleaned up
< temp_res->cleanup();
< }
< }
< // l_ip.cycle_time_dev-=10;
< // l_ip.delay_dev-=10;
---
> while ((throughput_overflow || latency_overflow) &&
> l_ip.cycle_time_dev > cycle_time_dev_threshold) {
> compute_base_power();
155a146,161
> //This is the time_dev to be used for next iteration
> l_ip.cycle_time_dev -= cycle_time_dev_threshold;
>
> // from best area to worst area -->worst timing to best timing
> if ((((local_result.cycle_time - throughput) <= 1e-10 ) &&
> (local_result.access_time - latency) <= 1e-10) ||
> (local_result.data_array2->area_efficiency <
> area_efficiency_threshold && l_ip.assoc == 0)) {
> //if no satisfiable solution is found,the most aggressive one
> //is left
> candidate_solutions.push_back(local_result);
> if (((local_result.cycle_time - throughput) <= 1e-10) &&
> ((local_result.access_time - latency) <= 1e-10)) {
> //ensure stop opt not because of cam
> throughput_overflow = false;
> latency_overflow = false;
157a164,168
> } else {
> if ((local_result.cycle_time - throughput) <= 1e-10)
> throughput_overflow = false;
> if ((local_result.access_time - latency) <= 1e-10)
> latency_overflow = false;
159,165c170,178
< if (l_ip.assoc > 0)
< {
< //For array structures except CAM and FA, Give warning but still provide a result with best timing found
< if (throughput_overflow==true)
< cout<< "Warning: " << name<<" array structure cannot satisfy throughput constraint." << endl;
< if (latency_overflow==true)
< cout<< "Warning: " << name<<" array structure cannot satisfy latency constraint." << endl;
---
> //if not >10 local_result is the last result, it cannot be
> //cleaned up
> if (l_ip.cycle_time_dev > cycle_time_dev_threshold) {
> //Only solutions not saved in the list need to be
> //cleaned up
> temp_res = &local_result;
> temp_res->cleanup();
> }
> }
168,184d180
< // else
< // {
< // /*According to "Content-Addressable Memory (CAM) Circuits and
< // Architectures": A Tutorial and Survey
< // by Kostas Pagiamtzis et al.
< // CAM structures can be heavily pipelined and use look-ahead techniques,
< // therefore timing can be relaxed. But McPAT does not model the advanced
< // techniques. If continue optimizing, the area efficiency will be too low
< // */
< // //For CAM and FA, stop opt if area efficiency is too low
< // if (throughput_overflow==true)
< // cout<< "Warning: " <<" McPAT stopped optimization on throughput for "<< name
< // <<" array structure because its area efficiency is below "<<area_efficiency_threshold<<"% " << endl;
< // if (latency_overflow==true)
< // cout<< "Warning: " <<" McPAT stopped optimization on latency for "<< name
< // <<" array structure because its area efficiency is below "<<area_efficiency_threshold<<"% " << endl;
< // }
186,191c182,193
< //double min_dynamic_energy, min_dynamic_power, min_leakage_power, min_cycle_time;
< double min_dynamic_energy=BIGNUM;
< if (candidate_solutions.empty()==false)
< {
< local_result.valid=true;
< for (candidate_iter = candidate_solutions.begin(); candidate_iter != candidate_solutions.end(); ++candidate_iter)
---
> if (l_ip.assoc > 0) {
> //For array structures except CAM and FA, Give warning but still
> //provide a result with best timing found
> if (throughput_overflow == true)
> cout << "Warning: " << name
> << " array structure cannot satisfy throughput constraint."
> << endl;
> if (latency_overflow == true)
> cout << "Warning: " << name
> << " array structure cannot satisfy latency constraint."
> << endl;
> }
193,199c195,209
< {
< if (min_dynamic_energy > (candidate_iter)->power.readOp.dynamic)
< {
< min_dynamic_energy = (candidate_iter)->power.readOp.dynamic;
< min_dynamic_energy_iter = candidate_iter;
< local_result = *(min_dynamic_energy_iter);
< //TODO: since results are reordered results and l_ip may miss match. Therefore, the final output spread sheets may show the miss match.
---
> double min_dynamic_energy = BIGNUM;
> if (candidate_solutions.empty() == false) {
> local_result.valid = true;
> for (candidate_iter = candidate_solutions.begin();
> candidate_iter != candidate_solutions.end();
> ++candidate_iter) {
> if (min_dynamic_energy >
> (candidate_iter)->power.readOp.dynamic) {
> min_dynamic_energy =
> (candidate_iter)->power.readOp.dynamic;
> min_dynamic_energy_iter = candidate_iter;
> local_result = *(min_dynamic_energy_iter);
> } else {
> candidate_iter->cleanup() ;
> }
201,205c211
< }
< else
< {
< candidate_iter->cleanup() ;
< }
---
> }
207d212
< }
209,211d213
<
< }
< candidate_solutions.clear();
212a215,216
> candidate_solutions.clear();
> }
214c218,219
< double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
---
> double long_channel_device_reduction =
> longer_channel_device_reduction(device_ty, core_ty);
216,219c221,224
< double macro_layout_overhead = g_tp.macro_layout_overhead;
< double chip_PR_overhead = g_tp.chip_layout_overhead;
< double total_overhead = macro_layout_overhead*chip_PR_overhead;
< local_result.area *= total_overhead;
---
> double macro_layout_overhead = g_tp.macro_layout_overhead;
> double chip_PR_overhead = g_tp.chip_layout_overhead;
> double total_overhead = macro_layout_overhead * chip_PR_overhead;
> local_result.area *= total_overhead;
221,222c226,227
< //maintain constant power density
< double pppm_t[4] = {total_overhead,1,1,total_overhead};
---
> //maintain constant power density
> double pppm_t[4] = {total_overhead, 1, 1, total_overhead};
224,231c229,236
< double sckRation = g_tp.sckt_co_eff;
< local_result.power.readOp.dynamic *= sckRation;
< local_result.power.writeOp.dynamic *= sckRation;
< local_result.power.searchOp.dynamic *= sckRation;
< local_result.power.readOp.leakage *= l_ip.nbanks;
< local_result.power.readOp.longer_channel_leakage =
< local_result.power.readOp.leakage*long_channel_device_reduction;
< local_result.power = local_result.power* pppm_t;
---
> double sckRation = g_tp.sckt_co_eff;
> local_result.power.readOp.dynamic *= sckRation;
> local_result.power.writeOp.dynamic *= sckRation;
> local_result.power.searchOp.dynamic *= sckRation;
> local_result.power.readOp.leakage *= l_ip.nbanks;
> local_result.power.readOp.longer_channel_leakage =
> local_result.power.readOp.leakage * long_channel_device_reduction;
> local_result.power = local_result.power * pppm_t;
233,239c238,245
< local_result.data_array2->power.readOp.dynamic *= sckRation;
< local_result.data_array2->power.writeOp.dynamic *= sckRation;
< local_result.data_array2->power.searchOp.dynamic *= sckRation;
< local_result.data_array2->power.readOp.leakage *= l_ip.nbanks;
< local_result.data_array2->power.readOp.longer_channel_leakage =
< local_result.data_array2->power.readOp.leakage*long_channel_device_reduction;
< local_result.data_array2->power = local_result.data_array2->power* pppm_t;
---
> local_result.data_array2->power.readOp.dynamic *= sckRation;
> local_result.data_array2->power.writeOp.dynamic *= sckRation;
> local_result.data_array2->power.searchOp.dynamic *= sckRation;
> local_result.data_array2->power.readOp.leakage *= l_ip.nbanks;
> local_result.data_array2->power.readOp.longer_channel_leakage =
> local_result.data_array2->power.readOp.leakage *
> long_channel_device_reduction;
> local_result.data_array2->power = local_result.data_array2->power * pppm_t;
242,251c248,258
< if (!(l_ip.pure_cam || l_ip.pure_ram || l_ip.fully_assoc) && l_ip.is_cache)
< {
< local_result.tag_array2->power.readOp.dynamic *= sckRation;
< local_result.tag_array2->power.writeOp.dynamic *= sckRation;
< local_result.tag_array2->power.searchOp.dynamic *= sckRation;
< local_result.tag_array2->power.readOp.leakage *= l_ip.nbanks;
< local_result.tag_array2->power.readOp.longer_channel_leakage =
< local_result.tag_array2->power.readOp.leakage*long_channel_device_reduction;
< local_result.tag_array2->power = local_result.tag_array2->power* pppm_t;
< }
---
> if (!(l_ip.pure_cam || l_ip.pure_ram || l_ip.fully_assoc) && l_ip.is_cache) {
> local_result.tag_array2->power.readOp.dynamic *= sckRation;
> local_result.tag_array2->power.writeOp.dynamic *= sckRation;
> local_result.tag_array2->power.searchOp.dynamic *= sckRation;
> local_result.tag_array2->power.readOp.leakage *= l_ip.nbanks;
> local_result.tag_array2->power.readOp.longer_channel_leakage =
> local_result.tag_array2->power.readOp.leakage *
> long_channel_device_reduction;
> local_result.tag_array2->power =
> local_result.tag_array2->power * pppm_t;
> }
252a260
> power = local_result.power;
253a262,264
> output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
> output_data.subthreshold_leakage_power = power.readOp.leakage;
> output_data.gate_leakage_power = power.readOp.gate_leakage;
299,301c310,311
< ArrayST:: ~ArrayST()
< {
< local_result.cleanup();
---
> ArrayST::~ArrayST() {
> local_result.cleanup();