1/***************************************************************************** 2 * McPAT 3 * SOFTWARE LICENSE AGREEMENT 4 * Copyright 2012 Hewlett-Packard Development Company, L.P.
|
5 * Copyright (c) 2010-2013 Advanced Micro Devices, Inc. |
6 * All Rights Reserved 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions are 10 * met: redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer; 12 * redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution; 15 * neither the name of the copyright holders nor the names of its 16 * contributors may be used to endorse or promote products derived from 17 * this software without specific prior written permission. 18 19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
29 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
30 * 31 ***************************************************************************/ 32
|
32#define GLOBALVAR
33#include <cassert>
34#include <cmath>
|
33#include <iostream>
|
34#include <math.h> |
35 36#include "area.h" 37#include "array.h"
|
38#include "common.h" |
39#include "decoder.h"
|
40#include "globalvar.h"
|
40#include "parameter.h" 41 42using namespace std; 43
|
45ArrayST::ArrayST(const InputParameter *configure_interface,
46 string _name,
47 enum Device_ty device_ty_,
48 bool opt_local_,
49 enum Core_type core_ty_,
50 bool _is_default)
51:l_ip(*configure_interface),
52 name(_name),
53 device_ty(device_ty_),
54 opt_local(opt_local_),
55 core_ty(core_ty_),
56 is_default(_is_default)
57 {
|
44double ArrayST::area_efficiency_threshold = 20.0; 45int ArrayST::ed = 0; 46//Fixed number, make sure timing can be satisfied. 47int ArrayST::delay_wt = 100; 48int ArrayST::cycle_time_wt = 1000; 49//Fixed number, This is used to exhaustive search for individual components. 50int ArrayST::area_wt = 10; 51//Fixed number, This is used to exhaustive search for individual components. 52int ArrayST::dynamic_power_wt = 10; 53int ArrayST::leakage_power_wt = 10; 54//Fixed number, make sure timing can be satisfied. 55int ArrayST::delay_dev = 1000000; 56int ArrayST::cycle_time_dev = 100; 57//Fixed number, This is used to exhaustive search for individual components. 58int ArrayST::area_dev = 1000000; 59//Fixed number, This is used to exhaustive search for individual components. 60int ArrayST::dynamic_power_dev = 1000000; 61int ArrayST::leakage_power_dev = 1000000; 62int ArrayST::cycle_time_dev_threshold = 10; |
63
|
59 if (l_ip.cache_sz<64) l_ip.cache_sz=64;
60 l_ip.error_checking();//not only do the error checking but also fill some missing parameters
61 optimize_array();
|
64
|
63}
|
65ArrayST::ArrayST(XMLNode* _xml_data, 66 const InputParameter *configure_interface, string _name, 67 enum Device_ty device_ty_, double _clockRate, 68 bool opt_local_, enum Core_type core_ty_, bool _is_default) 69 : McPATComponent(_xml_data), l_ip(*configure_interface), 70 device_ty(device_ty_), opt_local(opt_local_), core_ty(core_ty_), 71 is_default(_is_default) { 72 name = _name; 73 clockRate = _clockRate; 74 if (l_ip.cache_sz < MIN_BUFFER_SIZE) 75 l_ip.cache_sz = MIN_BUFFER_SIZE; |
76
|
65
66void ArrayST::compute_base_power()
67 {
68 //l_ip.out_w =l_ip.line_sz*8;
69 local_result=cacti_interface(&l_ip);
70
|
77 if (!l_ip.error_checking(name)) { 78 exit(1); |
79 } 80
|
73void ArrayST::optimize_array()
74{
75 list<uca_org_t > candidate_solutions(0);
76 list<uca_org_t >::iterator candidate_iter, min_dynamic_energy_iter;
|
81 output_data.reset(); |
82
|
78 uca_org_t * temp_res = 0;
79 local_result.valid=false;
|
83 computeEnergy(); 84 computeArea(); 85} |
86
|
81 double throughput=l_ip.throughput, latency=l_ip.latency;
82 double area_efficiency_threshold = 20.0;
83 bool throughput_overflow=true, latency_overflow=true;
84 compute_base_power();
|
87void ArrayST::compute_base_power() { 88 local_result = cacti_interface(&l_ip); 89} |
90
|
86 if ((local_result.cycle_time - throughput) <= 1e-10 )
87 throughput_overflow=false;
88 if ((local_result.access_time - latency)<= 1e-10)
89 latency_overflow=false;
|
91void ArrayST::computeArea() { 92 area.set_area(local_result.area); 93 output_data.area = local_result.area / 1e6; 94} |
95
|
91 if (opt_for_clk && opt_local)
92 {
93 if (throughput_overflow || latency_overflow)
94 {
95 l_ip.ed=0;
|
96void ArrayST::computeEnergy() { 97 list<uca_org_t > candidate_solutions(0); 98 list<uca_org_t >::iterator candidate_iter, min_dynamic_energy_iter; |
99
|
97 l_ip.delay_wt = 100;//Fixed number, make sure timing can be satisfied.
98 l_ip.cycle_time_wt = 1000;
|
100 uca_org_t* temp_res = NULL; 101 local_result.valid = false; |
102
|
100 l_ip.area_wt = 10;//Fixed number, This is used to exhaustive search for individual components.
101 l_ip.dynamic_power_wt = 10;//Fixed number, This is used to exhaustive search for individual components.
102 l_ip.leakage_power_wt = 10;
|
103 double throughput = l_ip.throughput; 104 double latency = l_ip.latency; 105 bool throughput_overflow = true; 106 bool latency_overflow = true; 107 compute_base_power(); |
108
|
104 l_ip.delay_dev = 1000000;//Fixed number, make sure timing can be satisfied.
105 l_ip.cycle_time_dev = 100;
|
109 if ((local_result.cycle_time - throughput) <= 1e-10 ) 110 throughput_overflow = false; 111 if ((local_result.access_time - latency) <= 1e-10) 112 latency_overflow = false; |
113
|
107 l_ip.area_dev = 1000000;//Fixed number, This is used to exhaustive search for individual components.
108 l_ip.dynamic_power_dev = 1000000;//Fixed number, This is used to exhaustive search for individual components.
109 l_ip.leakage_power_dev = 1000000;
|
114 if (opt_for_clk && opt_local) { 115 if (throughput_overflow || latency_overflow) { 116 l_ip.ed = ed; |
117
|
111 throughput_overflow=true; //Reset overflow flag before start optimization iterations
112 latency_overflow=true;
|
118 l_ip.delay_wt = delay_wt; 119 l_ip.cycle_time_wt = cycle_time_wt; |
120
|
114 temp_res = &local_result; //Clean up the result for optimized for ED^2P
115 temp_res->cleanup();
116 }
|
121 l_ip.area_wt = area_wt; 122 l_ip.dynamic_power_wt = dynamic_power_wt; 123 l_ip.leakage_power_wt = leakage_power_wt; |
124
|
125 l_ip.delay_dev = delay_dev; 126 l_ip.cycle_time_dev = cycle_time_dev; |
127
|
119 while ((throughput_overflow || latency_overflow)&&l_ip.cycle_time_dev > 10)// && l_ip.delay_dev > 10
120 {
121 compute_base_power();
|
128 l_ip.area_dev = area_dev; 129 l_ip.dynamic_power_dev = dynamic_power_dev; 130 l_ip.leakage_power_dev = leakage_power_dev; |
131
|
123 l_ip.cycle_time_dev-=10;//This is the time_dev to be used for next iteration
|
132 //Reset overflow flag before start optimization iterations 133 throughput_overflow = true; 134 latency_overflow = true; |
135
|
125 // from best area to worst area -->worst timing to best timing
126 if ((((local_result.cycle_time - throughput) <= 1e-10 ) && (local_result.access_time - latency)<= 1e-10)||
127 (local_result.data_array2->area_efficiency < area_efficiency_threshold && l_ip.assoc == 0))
128 { //if no satisfiable solution is found,the most aggressive one is left
129 candidate_solutions.push_back(local_result);
130 //output_data_csv(candidate_solutions.back());
131 if (((local_result.cycle_time - throughput) <= 1e-10) && ((local_result.access_time - latency)<= 1e-10))
132 //ensure stop opt not because of cam
133 {
134 throughput_overflow=false;
135 latency_overflow=false;
136 }
|
136 //Clean up the result for optimized for ED^2P 137 temp_res = &local_result; 138 temp_res->cleanup(); 139 } |
140
|
138 }
139 else
140 {
141 //TODO: whether checking the partial satisfied results too, or just change the mark???
142 if ((local_result.cycle_time - throughput) <= 1e-10)
143 throughput_overflow=false;
144 if ((local_result.access_time - latency)<= 1e-10)
145 latency_overflow=false;
|
141
|
147 if (l_ip.cycle_time_dev > 10)
148 { //if not >10 local_result is the last result, it cannot be cleaned up
149 temp_res = &local_result; //Only solutions not saved in the list need to be cleaned up
150 temp_res->cleanup();
151 }
152 }
153// l_ip.cycle_time_dev-=10;
154// l_ip.delay_dev-=10;
|
142 while ((throughput_overflow || latency_overflow) && 143 l_ip.cycle_time_dev > cycle_time_dev_threshold) { 144 compute_base_power(); |
145
|
146 //This is the time_dev to be used for next iteration 147 l_ip.cycle_time_dev -= cycle_time_dev_threshold; 148 149 // from best area to worst area -->worst timing to best timing 150 if ((((local_result.cycle_time - throughput) <= 1e-10 ) && 151 (local_result.access_time - latency) <= 1e-10) || 152 (local_result.data_array2->area_efficiency < 153 area_efficiency_threshold && l_ip.assoc == 0)) { 154 //if no satisfiable solution is found,the most aggressive one 155 //is left 156 candidate_solutions.push_back(local_result); 157 if (((local_result.cycle_time - throughput) <= 1e-10) && 158 ((local_result.access_time - latency) <= 1e-10)) { 159 //ensure stop opt not because of cam 160 throughput_overflow = false; 161 latency_overflow = false; |
162 } 163
|
164 } else { 165 if ((local_result.cycle_time - throughput) <= 1e-10) 166 throughput_overflow = false; 167 if ((local_result.access_time - latency) <= 1e-10) 168 latency_overflow = false; |
169
|
159 if (l_ip.assoc > 0)
160 {
161 //For array structures except CAM and FA, Give warning but still provide a result with best timing found
162 if (throughput_overflow==true)
163 cout<< "Warning: " << name<<" array structure cannot satisfy throughput constraint." << endl;
164 if (latency_overflow==true)
165 cout<< "Warning: " << name<<" array structure cannot satisfy latency constraint." << endl;
|
170 //if not >10 local_result is the last result, it cannot be 171 //cleaned up 172 if (l_ip.cycle_time_dev > cycle_time_dev_threshold) { 173 //Only solutions not saved in the list need to be 174 //cleaned up 175 temp_res = &local_result; 176 temp_res->cleanup(); 177 } 178 } |
179 } 180
|
168// else
169// {
170// /*According to "Content-Addressable Memory (CAM) Circuits and
171// Architectures": A Tutorial and Survey
172// by Kostas Pagiamtzis et al.
173// CAM structures can be heavily pipelined and use look-ahead techniques,
174// therefore timing can be relaxed. But McPAT does not model the advanced
175// techniques. If continue optimizing, the area efficiency will be too low
176// */
177// //For CAM and FA, stop opt if area efficiency is too low
178// if (throughput_overflow==true)
179// cout<< "Warning: " <<" McPAT stopped optimization on throughput for "<< name
180// <<" array structure because its area efficiency is below "<<area_efficiency_threshold<<"% " << endl;
181// if (latency_overflow==true)
182// cout<< "Warning: " <<" McPAT stopped optimization on latency for "<< name
183// <<" array structure because its area efficiency is below "<<area_efficiency_threshold<<"% " << endl;
184// }
|
181
|
186 //double min_dynamic_energy, min_dynamic_power, min_leakage_power, min_cycle_time;
187 double min_dynamic_energy=BIGNUM;
188 if (candidate_solutions.empty()==false)
189 {
190 local_result.valid=true;
191 for (candidate_iter = candidate_solutions.begin(); candidate_iter != candidate_solutions.end(); ++candidate_iter)
|
182 if (l_ip.assoc > 0) { 183 //For array structures except CAM and FA, Give warning but still 184 //provide a result with best timing found 185 if (throughput_overflow == true) 186 cout << "Warning: " << name 187 << " array structure cannot satisfy throughput constraint." 188 << endl; 189 if (latency_overflow == true) 190 cout << "Warning: " << name 191 << " array structure cannot satisfy latency constraint." 192 << endl; 193 } |
194
|
193 {
194 if (min_dynamic_energy > (candidate_iter)->power.readOp.dynamic)
195 {
196 min_dynamic_energy = (candidate_iter)->power.readOp.dynamic;
197 min_dynamic_energy_iter = candidate_iter;
198 local_result = *(min_dynamic_energy_iter);
199 //TODO: since results are reordered results and l_ip may miss match. Therefore, the final output spread sheets may show the miss match.
|
195 double min_dynamic_energy = BIGNUM; 196 if (candidate_solutions.empty() == false) { 197 local_result.valid = true; 198 for (candidate_iter = candidate_solutions.begin(); 199 candidate_iter != candidate_solutions.end(); 200 ++candidate_iter) { 201 if (min_dynamic_energy > 202 (candidate_iter)->power.readOp.dynamic) { 203 min_dynamic_energy = 204 (candidate_iter)->power.readOp.dynamic; 205 min_dynamic_energy_iter = candidate_iter; 206 local_result = *(min_dynamic_energy_iter); 207 } else { 208 candidate_iter->cleanup() ; 209 } |
210
|
201 }
202 else
203 {
204 candidate_iter->cleanup() ;
205 }
|
211 } |
212
|
207 }
|
213
|
209
210 }
211 candidate_solutions.clear();
|
214 }
|
215 candidate_solutions.clear(); 216 } |
217
|
214 double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
|
218 double long_channel_device_reduction = 219 longer_channel_device_reduction(device_ty, core_ty); |
220
|
216 double macro_layout_overhead = g_tp.macro_layout_overhead;
217 double chip_PR_overhead = g_tp.chip_layout_overhead;
218 double total_overhead = macro_layout_overhead*chip_PR_overhead;
219 local_result.area *= total_overhead;
|
221 double macro_layout_overhead = g_tp.macro_layout_overhead; 222 double chip_PR_overhead = g_tp.chip_layout_overhead; 223 double total_overhead = macro_layout_overhead * chip_PR_overhead; 224 local_result.area *= total_overhead; |
225
|
221 //maintain constant power density
222 double pppm_t[4] = {total_overhead,1,1,total_overhead};
|
226 //maintain constant power density 227 double pppm_t[4] = {total_overhead, 1, 1, total_overhead}; |
228
|
224 double sckRation = g_tp.sckt_co_eff;
225 local_result.power.readOp.dynamic *= sckRation;
226 local_result.power.writeOp.dynamic *= sckRation;
227 local_result.power.searchOp.dynamic *= sckRation;
228 local_result.power.readOp.leakage *= l_ip.nbanks;
229 local_result.power.readOp.longer_channel_leakage =
230 local_result.power.readOp.leakage*long_channel_device_reduction;
231 local_result.power = local_result.power* pppm_t;
|
229 double sckRation = g_tp.sckt_co_eff; 230 local_result.power.readOp.dynamic *= sckRation; 231 local_result.power.writeOp.dynamic *= sckRation; 232 local_result.power.searchOp.dynamic *= sckRation; 233 local_result.power.readOp.leakage *= l_ip.nbanks; 234 local_result.power.readOp.longer_channel_leakage = 235 local_result.power.readOp.leakage * long_channel_device_reduction; 236 local_result.power = local_result.power * pppm_t; |
237
|
233 local_result.data_array2->power.readOp.dynamic *= sckRation;
234 local_result.data_array2->power.writeOp.dynamic *= sckRation;
235 local_result.data_array2->power.searchOp.dynamic *= sckRation;
236 local_result.data_array2->power.readOp.leakage *= l_ip.nbanks;
237 local_result.data_array2->power.readOp.longer_channel_leakage =
238 local_result.data_array2->power.readOp.leakage*long_channel_device_reduction;
239 local_result.data_array2->power = local_result.data_array2->power* pppm_t;
|
238 local_result.data_array2->power.readOp.dynamic *= sckRation; 239 local_result.data_array2->power.writeOp.dynamic *= sckRation; 240 local_result.data_array2->power.searchOp.dynamic *= sckRation; 241 local_result.data_array2->power.readOp.leakage *= l_ip.nbanks; 242 local_result.data_array2->power.readOp.longer_channel_leakage = 243 local_result.data_array2->power.readOp.leakage * 244 long_channel_device_reduction; 245 local_result.data_array2->power = local_result.data_array2->power * pppm_t; |
246 247
|
242 if (!(l_ip.pure_cam || l_ip.pure_ram || l_ip.fully_assoc) && l_ip.is_cache)
243 {
244 local_result.tag_array2->power.readOp.dynamic *= sckRation;
245 local_result.tag_array2->power.writeOp.dynamic *= sckRation;
246 local_result.tag_array2->power.searchOp.dynamic *= sckRation;
247 local_result.tag_array2->power.readOp.leakage *= l_ip.nbanks;
248 local_result.tag_array2->power.readOp.longer_channel_leakage =
249 local_result.tag_array2->power.readOp.leakage*long_channel_device_reduction;
250 local_result.tag_array2->power = local_result.tag_array2->power* pppm_t;
251 }
|
248 if (!(l_ip.pure_cam || l_ip.pure_ram || l_ip.fully_assoc) && l_ip.is_cache) { 249 local_result.tag_array2->power.readOp.dynamic *= sckRation; 250 local_result.tag_array2->power.writeOp.dynamic *= sckRation; 251 local_result.tag_array2->power.searchOp.dynamic *= sckRation; 252 local_result.tag_array2->power.readOp.leakage *= l_ip.nbanks; 253 local_result.tag_array2->power.readOp.longer_channel_leakage = 254 local_result.tag_array2->power.readOp.leakage * 255 long_channel_device_reduction; 256 local_result.tag_array2->power = 257 local_result.tag_array2->power * pppm_t; 258 } |
259
|
260 power = local_result.power; |
261
|
262 output_data.peak_dynamic_power = power.readOp.dynamic * clockRate; 263 output_data.subthreshold_leakage_power = power.readOp.leakage; 264 output_data.gate_leakage_power = power.readOp.gate_leakage; |
265} 266 267void ArrayST::leakage_feedback(double temperature) 268{ 269 // Update the temperature. l_ip is already set and error-checked in the creator function. 270 l_ip.temp = (unsigned int)round(temperature/10.0)*10; 271 272 // This corresponds to cacti_interface() in the initialization process. Leakage power is updated here. 273 reconfigure(&l_ip,&local_result); 274 275 // Scale the power values. This is part of ArrayST::optimize_array(). 276 double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty); 277 278 double macro_layout_overhead = g_tp.macro_layout_overhead; 279 double chip_PR_overhead = g_tp.chip_layout_overhead; 280 double total_overhead = macro_layout_overhead*chip_PR_overhead; 281 282 double pppm_t[4] = {total_overhead,1,1,total_overhead}; 283 284 double sckRation = g_tp.sckt_co_eff; 285 local_result.power.readOp.dynamic *= sckRation; 286 local_result.power.writeOp.dynamic *= sckRation; 287 local_result.power.searchOp.dynamic *= sckRation; 288 local_result.power.readOp.leakage *= l_ip.nbanks; 289 local_result.power.readOp.longer_channel_leakage = local_result.power.readOp.leakage*long_channel_device_reduction; 290 local_result.power = local_result.power* pppm_t; 291 292 local_result.data_array2->power.readOp.dynamic *= sckRation; 293 local_result.data_array2->power.writeOp.dynamic *= sckRation; 294 local_result.data_array2->power.searchOp.dynamic *= sckRation; 295 local_result.data_array2->power.readOp.leakage *= l_ip.nbanks; 296 local_result.data_array2->power.readOp.longer_channel_leakage = local_result.data_array2->power.readOp.leakage*long_channel_device_reduction; 297 local_result.data_array2->power = local_result.data_array2->power* pppm_t; 298 299 if (!(l_ip.pure_cam || l_ip.pure_ram || l_ip.fully_assoc) && l_ip.is_cache) 300 { 301 local_result.tag_array2->power.readOp.dynamic *= sckRation; 302 local_result.tag_array2->power.writeOp.dynamic *= sckRation; 303 local_result.tag_array2->power.searchOp.dynamic *= 
sckRation; 304 local_result.tag_array2->power.readOp.leakage *= l_ip.nbanks; 305 local_result.tag_array2->power.readOp.longer_channel_leakage = local_result.tag_array2->power.readOp.leakage*long_channel_device_reduction; 306 local_result.tag_array2->power = local_result.tag_array2->power* pppm_t; 307 } 308} 309
|
299ArrayST:: ~ArrayST()
300{
301 local_result.cleanup();
|
310ArrayST::~ArrayST() { 311 local_result.cleanup(); |
312}
|