Cross Reference: /gem5/ext/mcpat/array.cc

Deleted Added

sdiff udiff text old ( 10152:52c552138ba1 ) new ( 10234:5cb711fa6176 )

full compact

1/*****************************************************************************
2 * McPAT
3 * SOFTWARE LICENSE AGREEMENT
4 * Copyright 2012 Hewlett-Packard Development Company, L.P.

6 * All Rights Reserved
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions are
10 * met: redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer;
12 * redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution;
15 * neither the name of the copyright holders nor the names of its
16 * contributors may be used to endorse or promote products derived from
17 * this software without specific prior written permission.
18
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”

29 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

30 *
31 ***************************************************************************/
32

32#define GLOBALVAR
33#include <cassert>
34#include <cmath>

33#include <iostream>

34#include <math.h>

35
36#include "area.h"
37#include "array.h"

38#include "common.h"

39#include "decoder.h"

40#include "globalvar.h"

40#include "parameter.h"
41
42using namespace std;
43

45ArrayST::ArrayST(const InputParameter *configure_interface,
46 string _name,
47 enum Device_ty device_ty_,
48 bool opt_local_,
49 enum Core_type core_ty_,
50 bool _is_default)
51:l_ip(*configure_interface),
52 name(_name),
53 device_ty(device_ty_),
54 opt_local(opt_local_),
55 core_ty(core_ty_),
56 is_default(_is_default)
57 {

// Class-wide tuning values for ArrayST. computeEnergy() copies ed and the
// *_wt / *_dev values into l_ip before re-running the search when the first
// result misses the throughput or latency target; area_efficiency_threshold
// and cycle_time_dev_threshold are used inside that search loop.
44double ArrayST::area_efficiency_threshold = 20.0;
45int ArrayST::ed = 0;
46// Fixed value, so that the timing constraint can be satisfied.
47int ArrayST::delay_wt = 100;
48int ArrayST::cycle_time_wt = 1000;
49// Fixed value, used for the exhaustive search over individual components.
50int ArrayST::area_wt = 10;
51// Fixed value, used for the exhaustive search over individual components.
52int ArrayST::dynamic_power_wt = 10;
53int ArrayST::leakage_power_wt = 10;
54// Fixed value, so that the timing constraint can be satisfied.
55int ArrayST::delay_dev = 1000000;
56int ArrayST::cycle_time_dev = 100;
57// Fixed value, used for the exhaustive search over individual components.
58int ArrayST::area_dev = 1000000;
59// Fixed value, used for the exhaustive search over individual components.
60int ArrayST::dynamic_power_dev = 1000000;
61int ArrayST::leakage_power_dev = 1000000;
62int ArrayST::cycle_time_dev_threshold = 10;

59 if (l_ip.cache_sz<64) l_ip.cache_sz=64;
60 l_ip.error_checking();//not only do the error checking but also fill some missing parameters
61 optimize_array();

63}

65ArrayST::ArrayST(XMLNode* _xml_data,
66 const InputParameter *configure_interface, string _name,
67 enum Device_ty device_ty_, double _clockRate,
68 bool opt_local_, enum Core_type core_ty_, bool _is_default)
69 : McPATComponent(_xml_data), l_ip(*configure_interface),
70 device_ty(device_ty_), opt_local(opt_local_), core_ty(core_ty_),
71 is_default(_is_default) {
72 name = _name;
73 clockRate = _clockRate;
74 if (l_ip.cache_sz < MIN_BUFFER_SIZE)
75 l_ip.cache_sz = MIN_BUFFER_SIZE;

65
66void ArrayST::compute_base_power()
67 {
68 //l_ip.out_w =l_ip.line_sz*8;
69 local_result=cacti_interface(&l_ip);
70

77 if (!l_ip.error_checking(name)) {
78 exit(1);

79 }
80

73void ArrayST::optimize_array()
74{
75 list<uca_org_t > candidate_solutions(0);
76 list<uca_org_t >::iterator candidate_iter, min_dynamic_energy_iter;

81 output_data.reset();

78 uca_org_t * temp_res = 0;
79 local_result.valid=false;

83 computeEnergy();
84 computeArea();
85}

81 double throughput=l_ip.throughput, latency=l_ip.latency;
82 double area_efficiency_threshold = 20.0;
83 bool throughput_overflow=true, latency_overflow=true;
84 compute_base_power();

// Calls cacti_interface() with the current l_ip settings and stores the
// returned result in local_result.
87void ArrayST::compute_base_power() {
88 local_result = cacti_interface(&l_ip);
89}

86 if ((local_result.cycle_time - throughput) <= 1e-10 )
87 throughput_overflow=false;
88 if ((local_result.access_time - latency)<= 1e-10)
89 latency_overflow=false;

// Publishes the area held in local_result: passes it unchanged to
// area.set_area() and stores it divided by 1e6 in output_data.area.
// NOTE(review): the 1e6 divisor looks like a um^2 -> mm^2 conversion - confirm units.
91void ArrayST::computeArea() {
92 area.set_area(local_result.area);
93 output_data.area = local_result.area / 1e6;
94}

91 if (opt_for_clk && opt_local)
92 {
93 if (throughput_overflow || latency_overflow)
94 {
95 l_ip.ed=0;

96void ArrayST::computeEnergy() {
97 list<uca_org_t > candidate_solutions(0);
98 list<uca_org_t >::iterator candidate_iter, min_dynamic_energy_iter;

97 l_ip.delay_wt = 100;//Fixed number, make sure timing can be satisfied.
98 l_ip.cycle_time_wt = 1000;

100 uca_org_t* temp_res = NULL;
101 local_result.valid = false;

102

~~100~~ l_ip.area_wt = 10;//Fixed number, This is used to exhaustive search for individual components.
~~101~~ l_ip.dynamic_power_wt = 10;//Fixed number, This is used to exhaustive search for individual components.
~~102~~ l_ip.leakage_power_wt = 10;

103 double throughput = l_ip.throughput;
104 double latency = l_ip.latency;
105 bool throughput_overflow = true;
106 bool latency_overflow = true;
107 compute_base_power();

108

~~104~~ l_ip.delay_dev = 1000000;//Fixed number, make sure timing can be satisfied.
~~105~~ l_ip.cycle_time_dev = 100;

109 if ((local_result.cycle_time - throughput) <= 1e-10 )
110 throughput_overflow = false;
111 if ((local_result.access_time - latency) <= 1e-10)
112 latency_overflow = false;

113

~~107~~ l_ip.area_dev = 1000000;//Fixed number, This is used to exhaustive search for individual components.
~~108~~ l_ip.dynamic_power_dev = 1000000;//Fixed number, This is used to exhaustive search for individual components.
~~109~~ l_ip.leakage_power_dev = 1000000;

114 if (opt_for_clk && opt_local) {
115 if (throughput_overflow || latency_overflow) {
116 l_ip.ed = ed;

117

~~111~~ throughput_overflow=true; //Reset overflow flag before start optimization iterations
~~112~~ latency_overflow=true;

118 l_ip.delay_wt = delay_wt;
119 l_ip.cycle_time_wt = cycle_time_wt;

120

~~114~~ temp_res = &local_result; //Clean up the result for optimized for ED^2P
~~115~~ temp_res->cleanup();
~~116~~ }

121 l_ip.area_wt = area_wt;
122 l_ip.dynamic_power_wt = dynamic_power_wt;
123 l_ip.leakage_power_wt = leakage_power_wt;

124

125 l_ip.delay_dev = delay_dev;
126 l_ip.cycle_time_dev = cycle_time_dev;

127

~~119~~ while ((throughput_overflow || latency_overflow)&&l_ip.cycle_time_dev > 10)// && l_ip.delay_dev > 10
~~120~~ {
~~121~~ compute_base_power();

128 l_ip.area_dev = area_dev;
129 l_ip.dynamic_power_dev = dynamic_power_dev;
130 l_ip.leakage_power_dev = leakage_power_dev;

131

~~123~~ l_ip.cycle_time_dev-=10;//This is the time_dev to be used for next iteration

132 //Reset overflow flag before start optimization iterations
133 throughput_overflow = true;
134 latency_overflow = true;

135

~~125~~ // from best area to worst area -->worst timing to best timing
~~126~~ if ((((local_result.cycle_time - throughput) <= 1e-10 ) && (local_result.access_time - latency)<= 1e-10)||
~~127~~ (local_result.data_array2->area_efficiency < area_efficiency_threshold && l_ip.assoc == 0))
~~128~~ { //if no satisfiable solution is found,the most aggressive one is left
~~129~~ candidate_solutions.push_back(local_result);
~~130~~ //output_data_csv(candidate_solutions.back());
~~131~~ if (((local_result.cycle_time - throughput) <= 1e-10) && ((local_result.access_time - latency)<= 1e-10))
~~132~~ //ensure stop opt not because of cam
~~133~~ {
~~134~~ throughput_overflow=false;
~~135~~ latency_overflow=false;
~~136~~ }

136 //Clean up the result for optimized for ED^2P
137 temp_res = &local_result;
138 temp_res->cleanup();
139 }

140

~~138~~ }
~~139~~ else
~~140~~ {
~~141~~ //TODO: whether checking the partial satisfied results too, or just change the mark???
~~142~~ if ((local_result.cycle_time - throughput) <= 1e-10)
~~143~~ throughput_overflow=false;
~~144~~ if ((local_result.access_time - latency)<= 1e-10)
~~145~~ latency_overflow=false;

141

~~147~~ if (l_ip.cycle_time_dev > 10)
~~148~~ { //if not >10 local_result is the last result, it cannot be cleaned up
~~149~~ temp_res = &local_result; //Only solutions not saved in the list need to be cleaned up
~~150~~ temp_res->cleanup();
~~151~~ }
~~152~~ }
~~153~~// l_ip.cycle_time_dev-=10;
~~154~~// l_ip.delay_dev-=10;

142 while ((throughput_overflow || latency_overflow) &&
143 l_ip.cycle_time_dev > cycle_time_dev_threshold) {
144 compute_base_power();

145

146 //This is the time_dev to be used for next iteration
147 l_ip.cycle_time_dev -= cycle_time_dev_threshold;
148
149 // from best area to worst area -->worst timing to best timing
150 if ((((local_result.cycle_time - throughput) <= 1e-10 ) &&
151 (local_result.access_time - latency) <= 1e-10) ||
152 (local_result.data_array2->area_efficiency <
153 area_efficiency_threshold && l_ip.assoc == 0)) {
154 //if no satisfiable solution is found,the most aggressive one
155 //is left
156 candidate_solutions.push_back(local_result);
157 if (((local_result.cycle_time - throughput) <= 1e-10) &&
158 ((local_result.access_time - latency) <= 1e-10)) {
159 //ensure stop opt not because of cam
160 throughput_overflow = false;
161 latency_overflow = false;

162 }
163

164 } else {
165 if ((local_result.cycle_time - throughput) <= 1e-10)
166 throughput_overflow = false;
167 if ((local_result.access_time - latency) <= 1e-10)
168 latency_overflow = false;

169

~~159~~ if (l_ip.assoc > 0)
~~160~~ {
~~161~~ //For array structures except CAM and FA, Give warning but still provide a result with best timing found
~~162~~ if (throughput_overflow==true)
~~163~~ cout<< "Warning: " << name<<" array structure cannot satisfy throughput constraint." << endl;
~~164~~ if (latency_overflow==true)
~~165~~ cout<< "Warning: " << name<<" array structure cannot satisfy latency constraint." << endl;

170 //if not >10 local_result is the last result, it cannot be
171 //cleaned up
172 if (l_ip.cycle_time_dev > cycle_time_dev_threshold) {
173 //Only solutions not saved in the list need to be
174 //cleaned up
175 temp_res = &local_result;
176 temp_res->cleanup();
177 }
178 }

179 }
180

~~168~~// else
~~169~~// {
~~170~~// /*According to "Content-Addressable Memory (CAM) Circuits and
~~171~~// Architectures": A Tutorial and Survey
~~172~~// by Kostas Pagiamtzis et al.
~~173~~// CAM structures can be heavily pipelined and use look-ahead techniques,
~~174~~// therefore timing can be relaxed. But McPAT does not model the advanced
~~175~~// techniques. If continue optimizing, the area efficiency will be too low
~~176~~// */
~~177~~// //For CAM and FA, stop opt if area efficiency is too low
~~178~~// if (throughput_overflow==true)
~~179~~// cout<< "Warning: " <<" McPAT stopped optimization on throughput for "<< name
~~180~~// <<" array structure because its area efficiency is below "<<area_efficiency_threshold<<"% " << endl;
~~181~~// if (latency_overflow==true)
~~182~~// cout<< "Warning: " <<" McPAT stopped optimization on latency for "<< name
~~183~~// <<" array structure because its area efficiency is below "<<area_efficiency_threshold<<"% " << endl;
~~184~~// }

181

~~186~~ //double min_dynamic_energy, min_dynamic_power, min_leakage_power, min_cycle_time;
~~187~~ double min_dynamic_energy=BIGNUM;
~~188~~ if (candidate_solutions.empty()==false)
~~189~~ {
~~190~~ local_result.valid=true;
~~191~~ for (candidate_iter = candidate_solutions.begin(); candidate_iter != candidate_solutions.end(); ++candidate_iter)

182 if (l_ip.assoc > 0) {
183 //For array structures except CAM and FA, Give warning but still
184 //provide a result with best timing found
185 if (throughput_overflow == true)
186 cout << "Warning: " << name
187 << " array structure cannot satisfy throughput constraint."
188 << endl;
189 if (latency_overflow == true)
190 cout << "Warning: " << name
191 << " array structure cannot satisfy latency constraint."
192 << endl;
193 }

194

~~193~~ {
~~194~~ if (min_dynamic_energy > (candidate_iter)->power.readOp.dynamic)
~~195~~ {
~~196~~ min_dynamic_energy = (candidate_iter)->power.readOp.dynamic;
~~197~~ min_dynamic_energy_iter = candidate_iter;
~~198~~ local_result = *(min_dynamic_energy_iter);
~~199~~ //TODO: since results are reordered results and l_ip may miss match. Therefore, the final output spread sheets may show the miss match.

195 double min_dynamic_energy = BIGNUM;
196 if (candidate_solutions.empty() == false) {
197 local_result.valid = true;
198 for (candidate_iter = candidate_solutions.begin();
199 candidate_iter != candidate_solutions.end();
200 ++candidate_iter) {
201 if (min_dynamic_energy >
202 (candidate_iter)->power.readOp.dynamic) {
203 min_dynamic_energy =
204 (candidate_iter)->power.readOp.dynamic;
205 min_dynamic_energy_iter = candidate_iter;
206 local_result = *(min_dynamic_energy_iter);
207 } else {
208 candidate_iter->cleanup() ;
209 }

210

~~201~~ }
~~202~~ else
~~203~~ {
~~204~~ candidate_iter->cleanup() ;
~~205~~ }

211 }

212

~~207~~ }

213

~~209~~
~~210~~ }
~~211~~ candidate_solutions.clear();

214 }

215 candidate_solutions.clear();
216 }

217

~~214~~ double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);

218 double long_channel_device_reduction =
219 longer_channel_device_reduction(device_ty, core_ty);

220

~~216~~ double macro_layout_overhead = g_tp.macro_layout_overhead;
~~217~~ double chip_PR_overhead = g_tp.chip_layout_overhead;
~~218~~ double total_overhead = macro_layout_overhead*chip_PR_overhead;
~~219~~ local_result.area *= total_overhead;

221 double macro_layout_overhead = g_tp.macro_layout_overhead;
222 double chip_PR_overhead = g_tp.chip_layout_overhead;
223 double total_overhead = macro_layout_overhead * chip_PR_overhead;
224 local_result.area *= total_overhead;

225

~~221~~ //maintain constant power density
~~222~~ double pppm_t[4] = {total_overhead,1,1,total_overhead};

226 //maintain constant power density
227 double pppm_t[4] = {total_overhead, 1, 1, total_overhead};

228

~~224~~ double sckRation = g_tp.sckt_co_eff;
~~225~~ local_result.power.readOp.dynamic *= sckRation;
~~226~~ local_result.power.writeOp.dynamic *= sckRation;
~~227~~ local_result.power.searchOp.dynamic *= sckRation;
~~228~~ local_result.power.readOp.leakage *= l_ip.nbanks;
~~229~~ local_result.power.readOp.longer_channel_leakage =
~~230~~ local_result.power.readOp.leakage*long_channel_device_reduction;
~~231~~ local_result.power = local_result.power* pppm_t;

229 double sckRation = g_tp.sckt_co_eff;
230 local_result.power.readOp.dynamic *= sckRation;
231 local_result.power.writeOp.dynamic *= sckRation;
232 local_result.power.searchOp.dynamic *= sckRation;
233 local_result.power.readOp.leakage *= l_ip.nbanks;
234 local_result.power.readOp.longer_channel_leakage =
235 local_result.power.readOp.leakage * long_channel_device_reduction;
236 local_result.power = local_result.power * pppm_t;

237

~~233~~ local_result.data_array2->power.readOp.dynamic *= sckRation;
~~234~~ local_result.data_array2->power.writeOp.dynamic *= sckRation;
~~235~~ local_result.data_array2->power.searchOp.dynamic *= sckRation;
~~236~~ local_result.data_array2->power.readOp.leakage *= l_ip.nbanks;
~~237~~ local_result.data_array2->power.readOp.longer_channel_leakage =
~~238~~ local_result.data_array2->power.readOp.leakage*long_channel_device_reduction;
~~239~~ local_result.data_array2->power = local_result.data_array2->power* pppm_t;

238 local_result.data_array2->power.readOp.dynamic *= sckRation;
239 local_result.data_array2->power.writeOp.dynamic *= sckRation;
240 local_result.data_array2->power.searchOp.dynamic *= sckRation;
241 local_result.data_array2->power.readOp.leakage *= l_ip.nbanks;
242 local_result.data_array2->power.readOp.longer_channel_leakage =
243 local_result.data_array2->power.readOp.leakage *
244 long_channel_device_reduction;
245 local_result.data_array2->power = local_result.data_array2->power * pppm_t;

246
247

~~242~~ if (!(l_ip.pure_cam || l_ip.pure_ram || l_ip.fully_assoc) && l_ip.is_cache)
~~243~~ {
~~244~~ local_result.tag_array2->power.readOp.dynamic *= sckRation;
~~245~~ local_result.tag_array2->power.writeOp.dynamic *= sckRation;
~~246~~ local_result.tag_array2->power.searchOp.dynamic *= sckRation;
~~247~~ local_result.tag_array2->power.readOp.leakage *= l_ip.nbanks;
~~248~~ local_result.tag_array2->power.readOp.longer_channel_leakage =
~~249~~ local_result.tag_array2->power.readOp.leakage*long_channel_device_reduction;
~~250~~ local_result.tag_array2->power = local_result.tag_array2->power* pppm_t;
~~251~~ }

248 if (!(l_ip.pure_cam || l_ip.pure_ram || l_ip.fully_assoc) && l_ip.is_cache) {
249 local_result.tag_array2->power.readOp.dynamic *= sckRation;
250 local_result.tag_array2->power.writeOp.dynamic *= sckRation;
251 local_result.tag_array2->power.searchOp.dynamic *= sckRation;
252 local_result.tag_array2->power.readOp.leakage *= l_ip.nbanks;
253 local_result.tag_array2->power.readOp.longer_channel_leakage =
254 local_result.tag_array2->power.readOp.leakage *
255 long_channel_device_reduction;
256 local_result.tag_array2->power =
257 local_result.tag_array2->power * pppm_t;
258 }

259

260 power = local_result.power;

261

262 output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
263 output_data.subthreshold_leakage_power = power.readOp.leakage;
264 output_data.gate_leakage_power = power.readOp.gate_leakage;

265}
266
// Applies a new temperature: stores the rounded value in l_ip.temp, calls
// reconfigure() with l_ip and local_result, then rescales the power fields of
// local_result.
// @param temperature  new temperature; rounded to the nearest multiple of 10
//                     before being stored in l_ip.temp.
// NOTE(review): unlike computeEnergy(), this function does not copy the result
// into `power` or `output_data` - confirm callers read local_result directly.
267void ArrayST::leakage_feedback(double temperature)
268{
269 // Update the temperature (rounded to a multiple of 10). l_ip was already filled in and error-checked by the constructor.
270 l_ip.temp = (unsigned int)round(temperature/10.0)*10;
271
272 // Counterpart of the cacti_interface() call made during initialization; per the original author's note, leakage power is updated here.
273 reconfigure(&l_ip,&local_result);
274
275 // Scale the power values. The same scaling is applied in ArrayST::computeEnergy().
276 double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
277
278 double macro_layout_overhead = g_tp.macro_layout_overhead;
279 double chip_PR_overhead = g_tp.chip_layout_overhead;
280 double total_overhead = macro_layout_overhead*chip_PR_overhead;
281 // Multipliers handed to the power operator* below; computeEnergy() labels this "maintain constant power density".
282 double pppm_t[4] = {total_overhead,1,1,total_overhead};
283 // Dynamic values are scaled by g_tp.sckt_co_eff; read leakage is multiplied by l_ip.nbanks.
284 double sckRation = g_tp.sckt_co_eff;
285 local_result.power.readOp.dynamic *= sckRation;
286 local_result.power.writeOp.dynamic *= sckRation;
287 local_result.power.searchOp.dynamic *= sckRation;
288 local_result.power.readOp.leakage *= l_ip.nbanks;
289 local_result.power.readOp.longer_channel_leakage = local_result.power.readOp.leakage*long_channel_device_reduction;
290 local_result.power = local_result.power* pppm_t;
291 // Same scaling for the data array.
292 local_result.data_array2->power.readOp.dynamic *= sckRation;
293 local_result.data_array2->power.writeOp.dynamic *= sckRation;
294 local_result.data_array2->power.searchOp.dynamic *= sckRation;
295 local_result.data_array2->power.readOp.leakage *= l_ip.nbanks;
296 local_result.data_array2->power.readOp.longer_channel_leakage = local_result.data_array2->power.readOp.leakage*long_channel_device_reduction;
297 local_result.data_array2->power = local_result.data_array2->power* pppm_t;
298 // The tag array is scaled only for caches that are not pure CAM, pure RAM, or fully associative.
299 if (!(l_ip.pure_cam || l_ip.pure_ram || l_ip.fully_assoc) && l_ip.is_cache)
300 {
301 local_result.tag_array2->power.readOp.dynamic *= sckRation;
302 local_result.tag_array2->power.writeOp.dynamic *= sckRation;
303 local_result.tag_array2->power.searchOp.dynamic *= sckRation;
304 local_result.tag_array2->power.readOp.leakage *= l_ip.nbanks;
305 local_result.tag_array2->power.readOp.longer_channel_leakage = local_result.tag_array2->power.readOp.leakage*long_channel_device_reduction;
306 local_result.tag_array2->power = local_result.tag_array2->power* pppm_t;
307 }
308}
309

~~299~~ArrayST:: ~ArrayST()
~~300~~{
~~301~~ local_result.cleanup();

// Destructor: calls cleanup() on local_result.
310ArrayST::~ArrayST() {
311 local_result.cleanup();

312}