1/***************************************************************************** 2 * McPAT 3 * SOFTWARE LICENSE AGREEMENT 4 * Copyright (c) 2010-2013 Advanced Micro Devices, Inc. 5 * All Rights Reserved 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are 9 * met: redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer; 11 * redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution; 14 * neither the name of the copyright holders nor the names of its 15 * contributors may be used to endorse or promote products derived from 16 * this software without specific prior written permission. 17 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 * 30 * Authors: Joel Hestness 31 * Yasuko Eckert 32 * 33 ***************************************************************************/ 34 35#include <cmath> 36#include <iostream> 37 38#include "area.h" 39#include "cachearray.h" 40#include "common.h" 41#include "decoder.h" 42#include "parameter.h" 43 44using namespace std; 45 46double CacheArray::area_efficiency_threshold = 20.0; 47int CacheArray::ed = 0; 48//Fixed number, make sure timing can be satisfied. 49int CacheArray::delay_wt = 100; 50int CacheArray::cycle_time_wt = 1000; 51//Fixed number, This is used to exhaustive search for individual components. 52int CacheArray::area_wt = 10; 53//Fixed number, This is used to exhaustive search for individual components. 54int CacheArray::dynamic_power_wt = 10; 55int CacheArray::leakage_power_wt = 10; 56//Fixed number, make sure timing can be satisfied. 57int CacheArray::delay_dev = 1000000; 58int CacheArray::cycle_time_dev = 100; 59//Fixed number, This is used to exhaustive search for individual components. 60int CacheArray::area_dev = 1000000; 61//Fixed number, This is used to exhaustive search for individual components. 62int CacheArray::dynamic_power_dev = 1000000; 63int CacheArray::leakage_power_dev = 1000000; 64int CacheArray::cycle_time_dev_threshold = 10; 65 66CacheArray::CacheArray(XMLNode* _xml_data, 67 const InputParameter *configure_interface, string _name, 68 enum Device_ty device_ty_, double _clockRate, 69 bool opt_local_, enum Core_type core_ty_, bool _is_default) 70 : McPATComponent(_xml_data), l_ip(*configure_interface), 71 device_ty(device_ty_), opt_local(opt_local_), core_ty(core_ty_), 72 is_default(_is_default), sbt_dir_overhead(0) { 73 name = _name; 74 clockRate = _clockRate; 75 if (l_ip.cache_sz < MIN_BUFFER_SIZE) { 76 l_ip.cache_sz = MIN_BUFFER_SIZE; 77 } 78 79 if (!l_ip.error_checking(name)) { 80 exit(1); 81 } 82 83 sbt_tdp_stats.reset(); 84 sbt_rtp_stats.reset(); 85 86 // Compute initial search point 87 local_result.valid = false; 88 compute_base_power(); 89 90 // Set up the cache by searching design space with cacti 91 list<uca_org_t > candidate_solutions(0); 92 list<uca_org_t >::iterator candidate_iter, min_dynamic_energy_iter; 93 uca_org_t* temp_res = NULL; 94 double throughput = l_ip.throughput; 95 double latency = l_ip.latency; 96 bool throughput_overflow = true; 97 bool latency_overflow = true; 98 99 if ((local_result.cycle_time - throughput) <= 1e-10 ) 100 throughput_overflow = false; 101 if ((local_result.access_time - latency) <= 1e-10) 102 latency_overflow = false; 103 104 if (opt_for_clk && opt_local) { 105 if (throughput_overflow || latency_overflow) { 106 l_ip.ed = ed; 107 108 l_ip.delay_wt = delay_wt; 109 l_ip.cycle_time_wt = cycle_time_wt; 110 111 l_ip.area_wt = area_wt; 112 l_ip.dynamic_power_wt = dynamic_power_wt; 113 l_ip.leakage_power_wt = leakage_power_wt; 114 115 l_ip.delay_dev = delay_dev; 116 l_ip.cycle_time_dev = cycle_time_dev; 117 118 l_ip.area_dev = area_dev; 119 l_ip.dynamic_power_dev = dynamic_power_dev; 120 l_ip.leakage_power_dev = leakage_power_dev; 121 122 //Reset overflow flag before start optimization iterations 123 throughput_overflow = true; 124 latency_overflow = true; 125 126 //Clean up the result for optimized for ED^2P 127 temp_res = &local_result; 128 temp_res->cleanup(); 129 } 130 131 132 while ((throughput_overflow || latency_overflow) && 133 l_ip.cycle_time_dev > cycle_time_dev_threshold) { 134 compute_base_power(); 135 136 //This is the time_dev to be used for next iteration 137 l_ip.cycle_time_dev -= cycle_time_dev_threshold; 138 139 // from best area to worst area -->worst timing to best timing 140 if ((((local_result.cycle_time - throughput) <= 1e-10 ) && 141 (local_result.access_time - latency) <= 1e-10) || 142 (local_result.data_array2->area_efficiency < 143 area_efficiency_threshold && l_ip.assoc == 0)) { 144 //if no satisfiable solution is found,the most aggressive one 145 //is left 146 candidate_solutions.push_back(local_result); 147 if (((local_result.cycle_time - throughput) <= 1e-10) && 148 ((local_result.access_time - latency) <= 1e-10)) { 149 //ensure stop opt not because of cam 150 throughput_overflow = false; 151 latency_overflow = false; 152 } 153 154 } else { 155 if ((local_result.cycle_time - throughput) <= 1e-10) 156 throughput_overflow = false; 157 if ((local_result.access_time - latency) <= 1e-10) 158 latency_overflow = false; 159 160 //if not >10 local_result is the last result, it cannot be 161 //cleaned up 162 if (l_ip.cycle_time_dev > cycle_time_dev_threshold) { 163 //Only solutions not saved in the list need to be 164 //cleaned up 165 temp_res = &local_result; 166 temp_res->cleanup(); 167 } 168 } 169 } 170 171 172 if (l_ip.assoc > 0) { 173 //For array structures except CAM and FA, Give warning but still 174 //provide a result with best timing found 175 if (throughput_overflow == true) 176 cout << "Warning: " << name 177 << " array structure cannot satisfy throughput constraint." 178 << endl; 179 if (latency_overflow == true) 180 cout << "Warning: " << name 181 << " array structure cannot satisfy latency constraint." 182 << endl; 183 } 184 185 double min_dynamic_energy = BIGNUM; 186 if (candidate_solutions.empty() == false) { 187 local_result.valid = true; 188 for (candidate_iter = candidate_solutions.begin(); 189 candidate_iter != candidate_solutions.end(); 190 ++candidate_iter) { 191 if (min_dynamic_energy > 192 (candidate_iter)->power.readOp.dynamic) { 193 min_dynamic_energy = 194 (candidate_iter)->power.readOp.dynamic; 195 min_dynamic_energy_iter = candidate_iter; 196 local_result = *(min_dynamic_energy_iter); 197 198 } else { 199 candidate_iter->cleanup() ; 200 } 201 202 } 203 204 205 } 206 candidate_solutions.clear(); 207 } 208 209 double long_channel_device_reduction = 210 longer_channel_device_reduction(device_ty, core_ty); 211 212 double macro_layout_overhead = g_tp.macro_layout_overhead; 213 double chip_PR_overhead = g_tp.chip_layout_overhead; 214 double total_overhead = macro_layout_overhead * chip_PR_overhead; 215 local_result.area *= total_overhead; 216 217 //maintain constant power density 218 double pppm_t[4] = {total_overhead, 1, 1, total_overhead}; 219 220 double sckRation = g_tp.sckt_co_eff; 221 local_result.power.readOp.dynamic *= sckRation; 222 local_result.power.writeOp.dynamic *= sckRation; 223 local_result.power.searchOp.dynamic *= sckRation; 224 local_result.power.readOp.leakage *= l_ip.nbanks; 225 local_result.power.readOp.longer_channel_leakage = 226 local_result.power.readOp.leakage * long_channel_device_reduction; 227 local_result.power = local_result.power * pppm_t; 228 229 local_result.data_array2->power.readOp.dynamic *= sckRation; 230 local_result.data_array2->power.writeOp.dynamic *= sckRation; 231 local_result.data_array2->power.searchOp.dynamic *= sckRation; 232 local_result.data_array2->power.readOp.leakage *= l_ip.nbanks; 233 local_result.data_array2->power.readOp.longer_channel_leakage = 234 local_result.data_array2->power.readOp.leakage * 235 long_channel_device_reduction; 236 local_result.data_array2->power = local_result.data_array2->power * pppm_t; 237 238 239 if (!(l_ip.pure_cam || l_ip.pure_ram || l_ip.fully_assoc) && l_ip.is_cache) { 240 local_result.tag_array2->power.readOp.dynamic *= sckRation; 241 local_result.tag_array2->power.writeOp.dynamic *= sckRation; 242 local_result.tag_array2->power.searchOp.dynamic *= sckRation; 243 local_result.tag_array2->power.readOp.leakage *= l_ip.nbanks; 244 local_result.tag_array2->power.readOp.longer_channel_leakage = 245 local_result.tag_array2->power.readOp.leakage * 246 long_channel_device_reduction; 247 local_result.tag_array2->power = 248 local_result.tag_array2->power * pppm_t; 249 } 250} 251 252void CacheArray::compute_base_power() { 253 local_result = cacti_interface(&l_ip); 254} 255 256void CacheArray::computeArea() { 257 area.set_area(local_result.area); 258 output_data.area = local_result.area / 1e6; 259} 260 261void CacheArray::computeEnergy() { 262 // Set the leakage power numbers 263 output_data.subthreshold_leakage_power = local_result.power.readOp.leakage; 264 output_data.gate_leakage_power = local_result.power.readOp.gate_leakage; 265 266 if (l_ip.assoc && l_ip.is_cache) { 267 // This is a standard cache array with data and tags 268 // Calculate peak dynamic power 269 output_data.peak_dynamic_power = 270 (local_result.tag_array2->power.readOp.dynamic + 271 local_result.data_array2->power.readOp.dynamic) * 272 tdp_stats.readAc.hit + 273 (local_result.tag_array2->power.readOp.dynamic) * 274 tdp_stats.readAc.miss + 275 (local_result.tag_array2->power.readOp.dynamic + 276 local_result.data_array2->power.writeOp.dynamic) * 277 tdp_stats.writeAc.hit + 278 (local_result.tag_array2->power.readOp.dynamic) * 279 tdp_stats.writeAc.miss; 280 output_data.peak_dynamic_power *= clockRate; 281 282 // Calculate the runtime dynamic power 283 output_data.runtime_dynamic_energy = 284 local_result.data_array2->power.readOp.dynamic * 285 rtp_stats.dataReadAc.access + 286 local_result.data_array2->power.writeOp.dynamic * 287 rtp_stats.dataWriteAc.access + 288 (local_result.tag_array2->power.readOp.dynamic * 289 rtp_stats.tagReadAc.access + 290 local_result.tag_array2->power.writeOp.dynamic * 291 rtp_stats.tagWriteAc.access) * l_ip.assoc; 292 } else { 293 // Calculate peak dynamic power 294 output_data.peak_dynamic_power = 295 local_result.power.readOp.dynamic * tdp_stats.readAc.access + 296 local_result.power.writeOp.dynamic * tdp_stats.writeAc.access + 297 local_result.power.searchOp.dynamic * tdp_stats.searchAc.access; 298 output_data.peak_dynamic_power *= clockRate; 299 300 // Calculate the runtime dynamic power 301 output_data.runtime_dynamic_energy = 302 local_result.power.readOp.dynamic * rtp_stats.readAc.access + 303 local_result.power.writeOp.dynamic * rtp_stats.writeAc.access + 304 local_result.power.searchOp.dynamic * rtp_stats.searchAc.access; 305 } 306 307 // An SBT directory has more dynamic power 308 if (sbt_dir_overhead > 0) { 309 // Calculate peak dynamic power 310 output_data.peak_dynamic_power += 311 (computeSBTDynEnergy(&sbt_tdp_stats) * clockRate); 312 313 // Calculate the runtime dynamic power 314 output_data.runtime_dynamic_energy += 315 computeSBTDynEnergy(&sbt_rtp_stats); 316 } 317} 318 319CacheArray::~CacheArray() { 320 local_result.cleanup(); 321} 322