/*****************************************************************************
 * McPAT/CACTI
 * SOFTWARE LICENSE AGREEMENT
 * Copyright 2012 Hewlett-Packard Development Company, L.P.
 * Copyright (c) 2010-2013 Advanced Micro Devices, Inc.
 * All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ***************************************************************************/

#include <pthread.h>

#include <algorithm>
#include <cmath>
#include <ctime>
#include <iostream>
#include <list>

#include "Ucache.h"
#include "area.h"
#include "bank.h"
#include "basic_circuit.h"
#include "component.h"
#include "const.h"
#include "decoder.h"
#include "parameter.h"
#include "subarray.h"
#include "uca.h"

using namespace std;

const uint32_t nthreads = NTHREADS;


void min_values_t::update_min_values(const min_values_t * val) {
    min_delay = (min_delay > val->min_delay) ? val->min_delay : min_delay;
    min_dyn = (min_dyn > val->min_dyn) ? val->min_dyn : min_dyn;
    min_leakage = (min_leakage > val->min_leakage) ?
        val->min_leakage : min_leakage;
    min_area = (min_area > val->min_area) ? val->min_area : min_area;
    min_cyc = (min_cyc > val->min_cyc) ? val->min_cyc : min_cyc;
}


void min_values_t::update_min_values(const uca_org_t & res) {
    min_delay = (min_delay > res.access_time) ? res.access_time : min_delay;
    min_dyn = (min_dyn > res.power.readOp.dynamic) ?
        res.power.readOp.dynamic : min_dyn;
    min_leakage = (min_leakage > res.power.readOp.leakage) ?
        res.power.readOp.leakage : min_leakage;
    min_area = (min_area > res.area) ? res.area : min_area;
    min_cyc = (min_cyc > res.cycle_time) ? res.cycle_time : min_cyc;
}

void min_values_t::update_min_values(const nuca_org_t * res) {
    min_delay = (min_delay > res->nuca_pda.delay) ?
        res->nuca_pda.delay : min_delay;
    min_dyn = (min_dyn > res->nuca_pda.power.readOp.dynamic) ?
        res->nuca_pda.power.readOp.dynamic : min_dyn;
    min_leakage = (min_leakage > res->nuca_pda.power.readOp.leakage) ?
        res->nuca_pda.power.readOp.leakage : min_leakage;
    min_area = (min_area > res->nuca_pda.area.get_area()) ?
        res->nuca_pda.area.get_area() : min_area;
    min_cyc = (min_cyc > res->nuca_pda.cycle_time) ?
        res->nuca_pda.cycle_time : min_cyc;
}

void min_values_t::update_min_values(const mem_array * res) {
    min_delay = (min_delay > res->access_time) ? res->access_time : min_delay;
    min_dyn = (min_dyn > res->power.readOp.dynamic) ?
        res->power.readOp.dynamic : min_dyn;
    min_leakage = (min_leakage > res->power.readOp.leakage) ?
        res->power.readOp.leakage : min_leakage;
    min_area = (min_area > res->area) ? res->area : min_area;
    min_cyc = (min_cyc > res->cycle_time) ? res->cycle_time : min_cyc;
}


void * calc_time_mt_wrapper(void * void_obj) {
    calc_time_mt_wrapper_struct * calc_obj =
        (calc_time_mt_wrapper_struct *) void_obj;
    uint32_t tid = calc_obj->tid;
    list<mem_array *> & data_arr = calc_obj->data_arr;
    list<mem_array *> & tag_arr = calc_obj->tag_arr;
    bool is_tag = calc_obj->is_tag;
    bool pure_ram = calc_obj->pure_ram;
    bool pure_cam = calc_obj->pure_cam;
    bool is_main_mem = calc_obj->is_main_mem;
    double Nspd_min = calc_obj->Nspd_min;
    min_values_t * data_res = calc_obj->data_res;
    min_values_t * tag_res = calc_obj->tag_res;

    data_arr.clear();
    data_arr.push_back(new mem_array);
    tag_arr.clear();
    tag_arr.push_back(new mem_array);

    uint32_t Ndwl_niter = _log2(MAXDATAN) + 1;
    uint32_t Ndbl_niter = _log2(MAXDATAN) + 1;
    uint32_t Ndcm_niter = _log2(MAX_COL_MUX) + 1;
    uint32_t niter = Ndwl_niter * Ndbl_niter * Ndcm_niter;

    bool is_valid_partition;
    int wt_min, wt_max;

    if (g_ip->force_wiretype) {
        if (g_ip->wt == 0) {
            wt_min = Low_swing;
            wt_max = Low_swing;
        } else {
            wt_min = Global;
            wt_max = Low_swing - 1;
        }
    } else {
        wt_min = Global;
        wt_max = Low_swing;
    }

    for (double Nspd = Nspd_min; Nspd <= MAXDATASPD; Nspd *= 2) {
        for (int wr = wt_min; wr <= wt_max; wr++) {
            for (uint32_t iter = tid; iter < niter; iter += nthreads) {
                // Reconstruct Ndwl, Ndbl, Ndcm from the flat iteration index:
                // iter = log2(Ndwl) * Ndbl_niter * Ndcm_niter
                //        + log2(Ndbl) * Ndcm_niter + log2(Ndcm)
                unsigned int Ndwl = 1 << (iter / (Ndbl_niter * Ndcm_niter));
                unsigned int Ndbl = 1 << ((iter / (Ndcm_niter)) % Ndbl_niter);
                unsigned int Ndcm = 1 << (iter % Ndcm_niter);
                for (unsigned int Ndsam_lev_1 = 1; Ndsam_lev_1 <= MAX_COL_MUX;
                     Ndsam_lev_1 *= 2) {
                    for (unsigned int Ndsam_lev_2 = 1;
                         Ndsam_lev_2 <= MAX_COL_MUX; Ndsam_lev_2 *= 2) {
                        // for debugging
                        if (g_ip->force_cache_config && is_tag == false) {
                            wr = g_ip->wt;
                            Ndwl = g_ip->ndwl;
                            Ndbl = g_ip->ndbl;
                            Ndcm = g_ip->ndcm;
                            if (g_ip->nspd != 0) {
                                Nspd = g_ip->nspd;
                            }
                            if (g_ip->ndsam1 != 0) {
                                Ndsam_lev_1 = g_ip->ndsam1;
                                Ndsam_lev_2 = g_ip->ndsam2;
                            }
                        }

                        if (is_tag == true) {
                            is_valid_partition =
                                calculate_time(is_tag, pure_ram, pure_cam,
                                               Nspd, Ndwl, Ndbl, Ndcm,
                                               Ndsam_lev_1, Ndsam_lev_2,
                                               tag_arr.back(), 0, NULL, NULL,
                                               is_main_mem);
                        }
                        // If it's a fully-associative cache, the data array
                        // partition parameters are identical to those of the
                        // tag array, so compute the data array partition
                        // properties here as well.
                        if (is_tag == false || g_ip->fully_assoc) {
                            is_valid_partition =
                                calculate_time(is_tag/*false*/, pure_ram,
                                               pure_cam, Nspd, Ndwl, Ndbl,
                                               Ndcm, Ndsam_lev_1, Ndsam_lev_2,
                                               data_arr.back(), 0, NULL, NULL,
                                               is_main_mem);
                        }

                        if (is_valid_partition) {
                            if (is_tag == true) {
                                tag_arr.back()->wt = (enum Wire_type) wr;
                                tag_res->update_min_values(tag_arr.back());
                                tag_arr.push_back(new mem_array);
                            }
                            if (is_tag == false || g_ip->fully_assoc) {
                                data_arr.back()->wt = (enum Wire_type) wr;
                                data_res->update_min_values(data_arr.back());
                                data_arr.push_back(new mem_array);
                            }
                        }

                        if (g_ip->force_cache_config && is_tag == false) {
                            wr = wt_max;
                            iter = niter;
                            if (g_ip->nspd != 0) {
                                Nspd = MAXDATASPD;
                            }
                            if (g_ip->ndsam1 != 0) {
                                Ndsam_lev_1 = MAX_COL_MUX + 1;
                                Ndsam_lev_2 = MAX_COL_MUX + 1;
                            }
                        }
                    }
                }
            }
        }
    }

    delete data_arr.back();
    delete tag_arr.back();
    data_arr.pop_back();
    tag_arr.pop_back();

#ifndef DEBUG
    pthread_exit(NULL);
#else
    return NULL;
#endif
}


bool calculate_time(
    bool is_tag,
    int pure_ram,
    bool pure_cam,
    double Nspd,
    unsigned int Ndwl,
    unsigned int Ndbl,
    unsigned int Ndcm,
    unsigned int Ndsam_lev_1,
    unsigned int Ndsam_lev_2,
    mem_array *ptr_array,
    int flag_results_populate,
    results_mem_array *ptr_results,
    uca_org_t *ptr_fin_res,
    bool is_main_mem) {
    DynamicParameter dyn_p(is_tag, pure_ram, pure_cam, Nspd, Ndwl, Ndbl, Ndcm,
                           Ndsam_lev_1, Ndsam_lev_2, is_main_mem);

    if (dyn_p.is_valid == false) {
        return false;
    }

    UCA * uca = new UCA(dyn_p);

    // For the final solution, populate the ptr_results data structure
    // -- TODO: copy only necessary variables
    if (flag_results_populate) {
    } else {
        int num_act_mats_hor_dir = uca->bank.dp.num_act_mats_hor_dir;
        int num_mats = uca->bank.dp.num_mats;
        bool is_fa = uca->bank.dp.fully_assoc;
        bool pure_cam = uca->bank.dp.pure_cam;
        ptr_array->Ndwl = Ndwl;
        ptr_array->Ndbl = Ndbl;
        ptr_array->Nspd = Nspd;
        ptr_array->deg_bl_muxing = dyn_p.deg_bl_muxing;
        ptr_array->Ndsam_lev_1 = Ndsam_lev_1;
        ptr_array->Ndsam_lev_2 = Ndsam_lev_2;
        ptr_array->access_time = uca->access_time;
        ptr_array->cycle_time = uca->cycle_time;
        ptr_array->multisubbank_interleave_cycle_time =
            uca->multisubbank_interleave_cycle_time;
        ptr_array->area_ram_cells = uca->area_all_dataramcells;
        ptr_array->area = uca->area.get_area();
        ptr_array->height = uca->area.h;
        ptr_array->width = uca->area.w;
        ptr_array->mat_height = uca->bank.mat.area.h;
        ptr_array->mat_length = uca->bank.mat.area.w;
        ptr_array->subarray_height = uca->bank.mat.subarray.area.h;
        ptr_array->subarray_length = uca->bank.mat.subarray.area.w;
        ptr_array->power = uca->power;
        ptr_array->delay_senseamp_mux_decoder =
            MAX(uca->delay_array_to_sa_mux_lev_1_decoder,
                uca->delay_array_to_sa_mux_lev_2_decoder);
        ptr_array->delay_before_subarray_output_driver =
            uca->delay_before_subarray_output_driver;
        ptr_array->delay_from_subarray_output_driver_to_output =
            uca->delay_from_subarray_out_drv_to_out;

        ptr_array->delay_route_to_bank = uca->htree_in_add->delay;
        ptr_array->delay_input_htree = uca->bank.htree_in_add->delay;
        ptr_array->delay_row_predecode_driver_and_block =
            uca->bank.mat.r_predec->delay;
        ptr_array->delay_row_decoder = uca->bank.mat.row_dec->delay;
        ptr_array->delay_bitlines = uca->bank.mat.delay_bitline;
        ptr_array->delay_matchlines = uca->bank.mat.delay_matchchline;
        ptr_array->delay_sense_amp = uca->bank.mat.delay_sa;
        ptr_array->delay_subarray_output_driver =
            uca->bank.mat.delay_subarray_out_drv_htree;
        ptr_array->delay_dout_htree = uca->bank.htree_out_data->delay;
        ptr_array->delay_comparator = uca->bank.mat.delay_comparator;

        ptr_array->all_banks_height = uca->area.h;
        ptr_array->all_banks_width = uca->area.w;
        ptr_array->area_efficiency = uca->area_all_dataramcells * 100 /
            (uca->area.get_area());

        ptr_array->power_routing_to_bank = uca->power_routing_to_bank;
        ptr_array->power_addr_input_htree = uca->bank.htree_in_add->power;
        ptr_array->power_data_input_htree = uca->bank.htree_in_data->power;
        ptr_array->power_data_output_htree = uca->bank.htree_out_data->power;

        ptr_array->power_row_predecoder_drivers =
            uca->bank.mat.r_predec->driver_power;
        ptr_array->power_row_predecoder_drivers.readOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_row_predecoder_drivers.writeOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_row_predecoder_drivers.searchOp.dynamic *=
            num_act_mats_hor_dir;

        ptr_array->power_row_predecoder_blocks =
            uca->bank.mat.r_predec->block_power;
        ptr_array->power_row_predecoder_blocks.readOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_row_predecoder_blocks.writeOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_row_predecoder_blocks.searchOp.dynamic *=
            num_act_mats_hor_dir;

        ptr_array->power_row_decoders = uca->bank.mat.power_row_decoders;
        ptr_array->power_row_decoders.readOp.dynamic *= num_act_mats_hor_dir;
        ptr_array->power_row_decoders.writeOp.dynamic *= num_act_mats_hor_dir;
        ptr_array->power_row_decoders.searchOp.dynamic *= num_act_mats_hor_dir;

        ptr_array->power_bit_mux_predecoder_drivers =
            uca->bank.mat.b_mux_predec->driver_power;
        ptr_array->power_bit_mux_predecoder_drivers.readOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_bit_mux_predecoder_drivers.writeOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_bit_mux_predecoder_drivers.searchOp.dynamic *=
            num_act_mats_hor_dir;

        ptr_array->power_bit_mux_predecoder_blocks =
            uca->bank.mat.b_mux_predec->block_power;
        ptr_array->power_bit_mux_predecoder_blocks.readOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_bit_mux_predecoder_blocks.writeOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_bit_mux_predecoder_blocks.searchOp.dynamic *=
            num_act_mats_hor_dir;

        ptr_array->power_bit_mux_decoders =
            uca->bank.mat.power_bit_mux_decoders;
        ptr_array->power_bit_mux_decoders.readOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_bit_mux_decoders.writeOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_bit_mux_decoders.searchOp.dynamic *=
            num_act_mats_hor_dir;

        ptr_array->power_senseamp_mux_lev_1_predecoder_drivers =
            uca->bank.mat.sa_mux_lev_1_predec->driver_power;
        ptr_array->power_senseamp_mux_lev_1_predecoder_drivers.readOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_senseamp_mux_lev_1_predecoder_drivers.writeOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_senseamp_mux_lev_1_predecoder_drivers.searchOp.dynamic *=
            num_act_mats_hor_dir;

        ptr_array->power_senseamp_mux_lev_1_predecoder_blocks =
            uca->bank.mat.sa_mux_lev_1_predec->block_power;
        ptr_array->power_senseamp_mux_lev_1_predecoder_blocks.readOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_senseamp_mux_lev_1_predecoder_blocks.writeOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_senseamp_mux_lev_1_predecoder_blocks.searchOp.dynamic *=
            num_act_mats_hor_dir;

        ptr_array->power_senseamp_mux_lev_1_decoders =
            uca->bank.mat.power_sa_mux_lev_1_decoders;
        ptr_array->power_senseamp_mux_lev_1_decoders.readOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_senseamp_mux_lev_1_decoders.writeOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_senseamp_mux_lev_1_decoders.searchOp.dynamic *=
            num_act_mats_hor_dir;

        ptr_array->power_senseamp_mux_lev_2_predecoder_drivers =
            uca->bank.mat.sa_mux_lev_2_predec->driver_power;
        ptr_array->power_senseamp_mux_lev_2_predecoder_drivers.readOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_senseamp_mux_lev_2_predecoder_drivers.writeOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_senseamp_mux_lev_2_predecoder_drivers.searchOp.dynamic *=
            num_act_mats_hor_dir;

        ptr_array->power_senseamp_mux_lev_2_predecoder_blocks =
            uca->bank.mat.sa_mux_lev_2_predec->block_power;
        ptr_array->power_senseamp_mux_lev_2_predecoder_blocks.readOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_senseamp_mux_lev_2_predecoder_blocks.writeOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_senseamp_mux_lev_2_predecoder_blocks.searchOp.dynamic *=
            num_act_mats_hor_dir;

        ptr_array->power_senseamp_mux_lev_2_decoders =
            uca->bank.mat.power_sa_mux_lev_2_decoders;
        ptr_array->power_senseamp_mux_lev_2_decoders.readOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_senseamp_mux_lev_2_decoders.writeOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_senseamp_mux_lev_2_decoders.searchOp.dynamic *=
            num_act_mats_hor_dir;

        ptr_array->power_bitlines = uca->bank.mat.power_bitline;
        ptr_array->power_bitlines.readOp.dynamic *= num_act_mats_hor_dir;
        ptr_array->power_bitlines.writeOp.dynamic *= num_act_mats_hor_dir;
        ptr_array->power_bitlines.searchOp.dynamic *= num_act_mats_hor_dir;

        ptr_array->power_sense_amps = uca->bank.mat.power_sa;
        ptr_array->power_sense_amps.readOp.dynamic *= num_act_mats_hor_dir;
        ptr_array->power_sense_amps.writeOp.dynamic *= num_act_mats_hor_dir;
        ptr_array->power_sense_amps.searchOp.dynamic *= num_act_mats_hor_dir;

        ptr_array->power_prechg_eq_drivers =
            uca->bank.mat.power_bl_precharge_eq_drv;
        ptr_array->power_prechg_eq_drivers.readOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_prechg_eq_drivers.writeOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_prechg_eq_drivers.searchOp.dynamic *=
            num_act_mats_hor_dir;

        ptr_array->power_output_drivers_at_subarray =
            uca->bank.mat.power_subarray_out_drv;
        ptr_array->power_output_drivers_at_subarray.readOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_output_drivers_at_subarray.writeOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_output_drivers_at_subarray.searchOp.dynamic *=
            num_act_mats_hor_dir;

        ptr_array->power_comparators = uca->bank.mat.power_comparator;
        ptr_array->power_comparators.readOp.dynamic *= num_act_mats_hor_dir;
        ptr_array->power_comparators.writeOp.dynamic *= num_act_mats_hor_dir;
        ptr_array->power_comparators.searchOp.dynamic *= num_act_mats_hor_dir;
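
        // Note: the mat-level power components copied above are reported per
        // mat by the Mat model; their dynamic-energy terms are scaled by
        // num_act_mats_hor_dir (the number of mats activated along the
        // horizontal direction) so they reflect one complete access rather
        // than a single mat.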

        if (is_fa || pure_cam) {
            ptr_array->power_htree_in_search =
                uca->bank.htree_in_search->power;
            ptr_array->power_htree_out_search =
                uca->bank.htree_out_search->power;
            ptr_array->power_searchline = uca->bank.mat.power_searchline;
            ptr_array->power_searchline.searchOp.dynamic *= num_mats;
            ptr_array->power_searchline_precharge =
                uca->bank.mat.power_searchline_precharge;
            ptr_array->power_searchline_precharge.searchOp.dynamic *= num_mats;
            ptr_array->power_matchlines = uca->bank.mat.power_matchline;
            ptr_array->power_matchlines.searchOp.dynamic *= num_mats;
            ptr_array->power_matchline_precharge =
                uca->bank.mat.power_matchline_precharge;
            ptr_array->power_matchline_precharge.searchOp.dynamic *= num_mats;
            ptr_array->power_matchline_to_wordline_drv =
                uca->bank.mat.power_ml_to_ram_wl_drv;
        }

        ptr_array->activate_energy = uca->activate_energy;
        ptr_array->read_energy = uca->read_energy;
        ptr_array->write_energy = uca->write_energy;
        ptr_array->precharge_energy = uca->precharge_energy;
        ptr_array->refresh_power = uca->refresh_power;
        ptr_array->leak_power_subbank_closed_page =
            uca->leak_power_subbank_closed_page;
        ptr_array->leak_power_subbank_open_page =
            uca->leak_power_subbank_open_page;
        ptr_array->leak_power_request_and_reply_networks =
            uca->leak_power_request_and_reply_networks;

        ptr_array->precharge_delay = uca->precharge_delay;
    }

    delete uca;
    return true;
}


bool check_uca_org(uca_org_t & u, min_values_t *minval) {
    if (((u.access_time - minval->min_delay) * 100 / minval->min_delay) >
        g_ip->delay_dev) {
        return false;
    }
    if (((u.power.readOp.dynamic - minval->min_dyn) / minval->min_dyn) * 100 >
        g_ip->dynamic_power_dev) {
        return false;
    }
    if (((u.power.readOp.leakage - minval->min_leakage) /
         minval->min_leakage) * 100 >
        g_ip->leakage_power_dev) {
        return false;
    }
    if (((u.cycle_time - minval->min_cyc) / minval->min_cyc) * 100 >
        g_ip->cycle_time_dev) {
        return false;
    }
    if (((u.area - minval->min_area) / minval->min_area) * 100 >
        g_ip->area_dev) {
        return false;
    }
    return true;
}

bool check_mem_org(mem_array & u, const min_values_t *minval) {
    if (((u.access_time - minval->min_delay) * 100 / minval->min_delay) >
        g_ip->delay_dev) {
        return false;
    }
    if (((u.power.readOp.dynamic - minval->min_dyn) / minval->min_dyn) * 100 >
        g_ip->dynamic_power_dev) {
        return false;
    }
    if (((u.power.readOp.leakage - minval->min_leakage) /
         minval->min_leakage) * 100 >
        g_ip->leakage_power_dev) {
        return false;
    }
    if (((u.cycle_time - minval->min_cyc) / minval->min_cyc) * 100 >
        g_ip->cycle_time_dev) {
        return false;
    }
    if (((u.area - minval->min_area) / minval->min_area) * 100 >
        g_ip->area_dev) {
        return false;
    }
    return true;
}


void find_optimal_uca(uca_org_t *res, min_values_t * minval,
                      list<uca_org_t> & ulist) {
    double cost = 0;
    double min_cost = BIGNUM;
    float d, a, dp, lp, c;

    dp = g_ip->dynamic_power_wt;
    lp = g_ip->leakage_power_wt;
    a = g_ip->area_wt;
    d = g_ip->delay_wt;
    c = g_ip->cycle_time_wt;

    if (ulist.empty() == true) {
        cout << "ERROR: no valid cache organizations found" << endl;
        exit(0);
    }

    for (list<uca_org_t>::iterator niter = ulist.begin(); niter != ulist.end();
         niter++) {
        if (g_ip->ed == 1) {
            cost = ((niter)->access_time / minval->min_delay) *
                   ((niter)->power.readOp.dynamic / minval->min_dyn);
            if (min_cost > cost) {
                min_cost = cost;
                *res = (*(niter));
            }
        } else if (g_ip->ed == 2) {
            cost = ((niter)->access_time / minval->min_delay) *
                   ((niter)->access_time / minval->min_delay) *
                   ((niter)->power.readOp.dynamic / minval->min_dyn);
            if (min_cost > cost) {
                min_cost = cost;
                *res = (*(niter));
            }
        } else {
            /*
             * check whether the current organization
             * meets the input deviation constraints
             */
            bool v = check_uca_org(*niter, minval);

            if (v) {
                cost = (d * ((niter)->access_time / minval->min_delay) +
                        c * ((niter)->cycle_time / minval->min_cyc) +
                        dp * ((niter)->power.readOp.dynamic / minval->min_dyn) +
                        lp *
                        ((niter)->power.readOp.leakage / minval->min_leakage) +
                        a * ((niter)->area / minval->min_area));

                if (min_cost > cost) {
                    min_cost = cost;
                    *res = (*(niter));
                    niter = ulist.erase(niter);
                    if (niter != ulist.begin())
                        niter--;
                }
            } else {
                niter = ulist.erase(niter);
                if (niter != ulist.begin())
                    niter--;
            }
        }
    }

    if (min_cost == BIGNUM) {
        cout << "ERROR: no cache organizations met optimization criteria"
             << endl;
        exit(0);
    }
}


void filter_tag_arr(const min_values_t * min, list<mem_array *> & list) {
    double cost = BIGNUM;
    double cur_cost;
    double wt_delay = g_ip->delay_wt;
    double wt_dyn = g_ip->dynamic_power_wt;
    double wt_leakage = g_ip->leakage_power_wt;
    double wt_cyc = g_ip->cycle_time_wt;
    double wt_area = g_ip->area_wt;
    mem_array * res = NULL;

    if (list.empty() == true) {
        cout << "ERROR: no valid tag organizations found" << endl;
        exit(1);
    }

    while (list.empty() != true) {
        bool v = check_mem_org(*list.back(), min);
        if (v) {
            cur_cost = wt_delay * (list.back()->access_time / min->min_delay) +
                       wt_dyn * (list.back()->power.readOp.dynamic /
                                 min->min_dyn) +
                       wt_leakage * (list.back()->power.readOp.leakage /
                                     min->min_leakage) +
                       wt_area * (list.back()->area / min->min_area) +
                       wt_cyc * (list.back()->cycle_time / min->min_cyc);
        } else {
            cur_cost = BIGNUM;
        }
        if (cur_cost < cost) {
            if (res != NULL) {
                delete res;
            }
            cost = cur_cost;
            res = list.back();
        } else {
            delete list.back();
        }
        list.pop_back();
    }
    if (!res) {
        cout << "ERROR: no valid tag organizations found" << endl;
        exit(0);
    }

    list.push_back(res);
}


void filter_data_arr(list<mem_array *> & curr_list) {
    if (curr_list.empty() == true) {
        cout << "ERROR: no valid data array organizations found" << endl;
        exit(1);
    }

    list<mem_array *>::iterator iter;

    for (iter = curr_list.begin(); iter != curr_list.end(); ++iter) {
        mem_array * m = *iter;

        if (m == NULL) exit(1);

        if (((m->access_time - m->arr_min->min_delay) / m->arr_min->min_delay >
             0.5) &&
            ((m->power.readOp.dynamic - m->arr_min->min_dyn) /
             m->arr_min->min_dyn > 0.5)) {
            delete m;
            iter = curr_list.erase(iter);
            iter--;
        }
    }
}


/*
 * Performs exhaustive search across different sub-array sizes,
 * wire types and aspect ratios to find an optimal UCA organization
 * 1. First, different valid tag array organizations are calculated
 *    and stored in the tag_arr list
 * 2. The exhaustive search is repeated to find valid data array
 *    organizations, which are stored in the data_arr list
 * 3. Cache area, delay, power, and cycle time for different
 *    cache organizations are calculated based on the
 *    above results
 * 4. The cache model with the least cost is picked from sol_list
 */
void solve(uca_org_t *fin_res) {
    bool is_dram = false;
    int pure_ram = g_ip->pure_ram;
    bool pure_cam = g_ip->pure_cam;

    init_tech_params(g_ip->F_sz_um, false);

    list<mem_array *> tag_arr(0);
    list<mem_array *> data_arr(0);
    list<mem_array *>::iterator miter;
    list<uca_org_t> sol_list(1, uca_org_t());

    fin_res->tag_array.access_time = 0;
    fin_res->tag_array.Ndwl = 0;
    fin_res->tag_array.Ndbl = 0;
    fin_res->tag_array.Nspd = 0;
    fin_res->tag_array.deg_bl_muxing = 0;
    fin_res->tag_array.Ndsam_lev_1 = 0;
    fin_res->tag_array.Ndsam_lev_2 = 0;

    // distribute calculate_time() execution to multiple threads
    calc_time_mt_wrapper_struct * calc_array =
        new calc_time_mt_wrapper_struct[nthreads];
    pthread_t threads[nthreads];

    for (uint32_t t = 0; t < nthreads; t++) {
        calc_array[t].tid = t;
        calc_array[t].pure_ram = pure_ram;
        calc_array[t].pure_cam = pure_cam;
        calc_array[t].data_res = new min_values_t();
        calc_array[t].tag_res = new min_values_t();
    }

    bool is_tag;
    uint32_t ram_cell_tech_type;

    // If it's a cache, first calculate the area, delay and power for all
    // tag array partitions.
    if (!(pure_ram || pure_cam || g_ip->fully_assoc)) { // cache
        is_tag = true;
        ram_cell_tech_type = g_ip->tag_arr_ram_cell_tech_type;
        is_dram = ((ram_cell_tech_type == lp_dram) ||
                   (ram_cell_tech_type == comm_dram));
        init_tech_params(g_ip->F_sz_um, is_tag);

        for (uint32_t t = 0; t < nthreads; t++) {
            calc_array[t].is_tag = is_tag;
            calc_array[t].is_main_mem = false;
            calc_array[t].Nspd_min = 0.125;
#ifndef DEBUG
            pthread_create(&threads[t], NULL, calc_time_mt_wrapper,
                           (void *)(&(calc_array[t])));
#else
            calc_time_mt_wrapper((void *)(&(calc_array[t])));
#endif
        }

#ifndef DEBUG
        for (uint32_t t = 0; t < nthreads; t++) {
            pthread_join(threads[t], NULL);
        }
#endif

        for (uint32_t t = 0; t < nthreads; t++) {
            calc_array[t].data_arr.sort(mem_array::lt);
            data_arr.merge(calc_array[t].data_arr, mem_array::lt);
            calc_array[t].tag_arr.sort(mem_array::lt);
            tag_arr.merge(calc_array[t].tag_arr, mem_array::lt);
        }
    }

    // Calculate the area, delay and power for all data array partitions
    // (for a cache or a plain RAM). In the new CACTI, CAMs and
    // fully-associative caches are processed as a single array in the data
    // portion.
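    // The same worker threads are reused for the data-array sweep: each
    // thread t strides through the flattened (Ndwl, Ndbl, Ndcm) design space
    // starting at index t with a stride of nthreads (see
    // calc_time_mt_wrapper above).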
    is_tag = false;
    ram_cell_tech_type = g_ip->data_arr_ram_cell_tech_type;
    is_dram = ((ram_cell_tech_type == lp_dram) ||
               (ram_cell_tech_type == comm_dram));
    init_tech_params(g_ip->F_sz_um, is_tag);

    for (uint32_t t = 0; t < nthreads; t++) {
        calc_array[t].is_tag = is_tag;
        calc_array[t].is_main_mem = g_ip->is_main_mem;
        if (!(pure_cam || g_ip->fully_assoc)) {
            calc_array[t].Nspd_min = (double)(g_ip->out_w) /
                (double)(g_ip->block_sz * 8);
        } else {
            calc_array[t].Nspd_min = 1;
        }

#ifndef DEBUG
        pthread_create(&threads[t], NULL, calc_time_mt_wrapper,
                       (void *)(&(calc_array[t])));
#else
        calc_time_mt_wrapper((void *)(&(calc_array[t])));
#endif
    }

#ifndef DEBUG
    for (uint32_t t = 0; t < nthreads; t++) {
        pthread_join(threads[t], NULL);
    }
#endif

    data_arr.clear();
    for (uint32_t t = 0; t < nthreads; t++) {
        calc_array[t].data_arr.sort(mem_array::lt);
        data_arr.merge(calc_array[t].data_arr, mem_array::lt);
    }

    min_values_t * d_min = new min_values_t();
    min_values_t * t_min = new min_values_t();
    min_values_t * cache_min = new min_values_t();

    for (uint32_t t = 0; t < nthreads; t++) {
        d_min->update_min_values(calc_array[t].data_res);
        t_min->update_min_values(calc_array[t].tag_res);
    }

    for (miter = data_arr.begin(); miter != data_arr.end(); miter++) {
        (*miter)->arr_min = d_min;
    }

    filter_data_arr(data_arr);
    if (!(pure_ram || pure_cam || g_ip->fully_assoc)) {
        filter_tag_arr(t_min, tag_arr);
    }

    if (pure_ram || pure_cam || g_ip->fully_assoc) {
        for (miter = data_arr.begin(); miter != data_arr.end(); miter++) {
            uca_org_t & curr_org = sol_list.back();
            curr_org.tag_array2 = NULL;
            curr_org.data_array2 = (*miter);

            curr_org.find_delay();
            curr_org.find_energy();
            curr_org.find_area();
            curr_org.find_cyc();

            // update min values for the entire cache
            cache_min->update_min_values(curr_org);

            sol_list.push_back(uca_org_t());
        }
    } else {
        while (tag_arr.empty() != true) {
            mem_array * arr_temp = (tag_arr.back());
            tag_arr.pop_back();

            for (miter = data_arr.begin(); miter != data_arr.end(); miter++) {
                uca_org_t & curr_org = sol_list.back();
                curr_org.tag_array2 = arr_temp;
                curr_org.data_array2 = (*miter);

                curr_org.find_delay();
                curr_org.find_energy();
                curr_org.find_area();
                curr_org.find_cyc();

                // update min values for the entire cache
                cache_min->update_min_values(curr_org);

                sol_list.push_back(uca_org_t());
            }
        }
    }

    sol_list.pop_back();

    find_optimal_uca(fin_res, cache_min, sol_list);

    sol_list.clear();

    for (miter = data_arr.begin(); miter != data_arr.end(); ++miter) {
        if (*miter != fin_res->data_array2) {
            delete *miter;
        }
    }
    data_arr.clear();

    for (uint32_t t = 0; t < nthreads; t++) {
        delete calc_array[t].data_res;
        delete calc_array[t].tag_res;
    }

    delete [] calc_array;
    delete cache_min;
    delete d_min;
    delete t_min;
}

void update(uca_org_t *fin_res)
{
    if (fin_res->tag_array2)
    {
        init_tech_params(g_ip->F_sz_um, true);
        DynamicParameter tag_arr_dyn_p(true, g_ip->pure_ram, g_ip->pure_cam,
                                       fin_res->tag_array2->Nspd,
                                       fin_res->tag_array2->Ndwl,
                                       fin_res->tag_array2->Ndbl,
                                       fin_res->tag_array2->Ndcm,
                                       fin_res->tag_array2->Ndsam_lev_1,
                                       fin_res->tag_array2->Ndsam_lev_2,
                                       g_ip->is_main_mem);
        if (tag_arr_dyn_p.is_valid)
        {
            UCA * tag_arr = new UCA(tag_arr_dyn_p);
            fin_res->tag_array2->power = tag_arr->power;
        }
        else
        {
            cout << "ERROR: Cannot retrieve array structure for leakage feedback"
                 << endl;
            exit(1);
        }
    }
    init_tech_params(g_ip->F_sz_um, false);
    DynamicParameter data_arr_dyn_p(false, g_ip->pure_ram, g_ip->pure_cam,
                                    fin_res->data_array2->Nspd,
                                    fin_res->data_array2->Ndwl,
                                    fin_res->data_array2->Ndbl,
                                    fin_res->data_array2->Ndcm,
                                    fin_res->data_array2->Ndsam_lev_1,
                                    fin_res->data_array2->Ndsam_lev_2,
                                    g_ip->is_main_mem);
    if (data_arr_dyn_p.is_valid)
    {
        UCA * data_arr = new UCA(data_arr_dyn_p);
        fin_res->data_array2->power = data_arr->power;
    }
    else
    {
        cout << "ERROR: Cannot retrieve array structure for leakage feedback"
             << endl;
        exit(1);
    }

    fin_res->find_energy();
}