nuca.cc revision 10234
1/***************************************************************************** 2 * McPAT/CACTI 3 * SOFTWARE LICENSE AGREEMENT 4 * Copyright 2012 Hewlett-Packard Development Company, L.P. 5 * Copyright (c) 2010-2013 Advanced Micro Devices, Inc. 6 * All Rights Reserved 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions are 10 * met: redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer; 12 * redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution; 15 * neither the name of the copyright holders nor the names of its 16 * contributors may be used to endorse or promote products derived from 17 * this software without specific prior written permission. 18 19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 * 31 ***************************************************************************/ 32 33 34 35#include <cassert> 36 37#include "Ucache.h" 38#include "nuca.h" 39 40unsigned int MIN_BANKSIZE = 65536; 41#define FIXED_OVERHEAD 55e-12 /* clock skew and jitter in s. Ref: Hrishikesh et al ISCA 01 */ 42#define LATCH_DELAY 28e-12 /* latch delay in s (later should use FO4 TODO) */ 43#define CONTR_2_BANK_LAT 0 44 45int cont_stats[2 /*l2 or l3*/][5/* cores */][ROUTER_TYPES][7 /*banks*/][8 /* cycle time */]; 46 47Nuca::Nuca( 48 TechnologyParameter::DeviceType *dt = &(g_tp.peri_global) 49): deviceType(dt) { 50 init_cont(); 51} 52 53void 54Nuca::init_cont() { 55 FILE *cont; 56 char line[5000]; 57 char jk[5000]; 58 cont = fopen("contention.dat", "r"); 59 if (!cont) { 60 cout << "contention.dat file is missing!\n"; 61 exit(0); 62 } 63 64 for (int i = 0; i < 2; i++) { 65 for (int j = 2; j < 5; j++) { 66 for (int k = 0; k < ROUTER_TYPES; k++) { 67 for (int l = 0; l < 7; l++) { 68 int *temp = cont_stats[i/*l2 or l3*/][j/*core*/][k/*64 or 128 or 256 link bw*/][l /* no banks*/]; 69 assert(fscanf(cont, "%[^\n]\n", line) != EOF); 70 sscanf(line, "%[^:]: %d %d %d %d %d %d %d %d", jk, 71 &temp[0], &temp[1], &temp[2], &temp[3], 72 &temp[4], &temp[5], &temp[6], &temp[7]); 73 } 74 } 75 } 76 } 77 fclose(cont); 78} 79 80void 81Nuca::print_cont_stats() { 82 for (int i = 0; i < 2; i++) { 83 for (int j = 2; j < 5; j++) { 84 for (int k = 0; k < ROUTER_TYPES; k++) { 85 for (int l = 0; l < 7; l++) { 86 for (int m = 0; l < 7; l++) { 87 cout << cont_stats[i][j][k][l][m] << " "; 88 } 89 cout << endl; 90 } 91 } 92 } 93 } 94 cout << endl; 95} 96 97Nuca::~Nuca() { 98 for (int i = wt_min; i <= wt_max; i++) { 99 delete wire_vertical[i]; 100 delete wire_horizontal[i]; 101 } 102} 103 104/* converts latency (in s) to cycles depending upon the FREQUENCY (in GHz) */ 105int 106Nuca::calc_cycles(double lat, double oper_freq) { 107 //TODO: convert latch delay to FO4 */ 108 double cycle_time = (1.0 / (oper_freq * 1e9)); /*s*/ 109 cycle_time -= LATCH_DELAY; 110 cycle_time -= FIXED_OVERHEAD; 111 112 return (int)ceil(lat / cycle_time); 113} 114 115 116nuca_org_t::~nuca_org_t() { 117 // if(h_wire) delete h_wire; 118 // if(v_wire) delete v_wire; 119 // if(router) delete router; 120} 121 122/* 123 * Version - 6.0 124 * 125 * Perform exhaustive search across different bank organizatons, 126 * router configurations, grid organizations, and wire models and 127 * find an optimal NUCA organization 128 * For different bank count values 129 * 1. Optimal bank organization is calculated 130 * 2. For each bank organization, find different NUCA organizations 131 * using various router configurations, grid organizations, 132 * and wire models. 133 * 3. NUCA model with the least cost is picked for 134 * this particular bank count 135 * Finally include contention statistics and find the optimal 136 * NUCA configuration 137 */ 138void 139Nuca::sim_nuca() { 140 /* temp variables */ 141 int it, ro, wr; 142 int num_cyc; 143 unsigned int i, j, k; 144 unsigned int r, c; 145 int l2_c; 146 int bank_count = 0; 147 uca_org_t ures; 148 nuca_org_t *opt_n; 149 mem_array tag, data; 150 list<nuca_org_t *> nuca_list; 151 Router *router_s[ROUTER_TYPES]; 152 router_s[0] = new Router(64.0, 8, 4, &(g_tp.peri_global)); 153 router_s[0]->print_router(); 154 router_s[1] = new Router(128.0, 8, 4, &(g_tp.peri_global)); 155 router_s[1]->print_router(); 156 router_s[2] = new Router(256.0, 8, 4, &(g_tp.peri_global)); 157 router_s[2]->print_router(); 158 159 int core_in; // to store no. of cores 160 161 /* to search diff grid organizations */ 162 double curr_hop, totno_hops, totno_hhops, totno_vhops, tot_lat, 163 curr_acclat; 164 double avg_lat, avg_hop, avg_hhop, avg_vhop, avg_dyn_power, 165 avg_leakage_power; 166 167 double opt_acclat = INF, opt_avg_lat = INF, opt_tot_lat = INF; 168 int opt_rows = 0; 169 int opt_columns = 0; 170 double opt_totno_hops = 0; 171 double opt_avg_hop = 0; 172 double opt_dyn_power = 0, opt_leakage_power = 0; 173 min_values_t minval; 174 175 int bank_start = 0; 176 177 int flit_width = 0; 178 179 /* vertical and horizontal hop latency values */ 180 int ver_hop_lat, hor_hop_lat; /* in cycles */ 181 182 183 /* no. of different bank sizes to consider */ 184 int iterations; 185 186 187 g_ip->nuca_cache_sz = g_ip->cache_sz; 188 nuca_list.push_back(new nuca_org_t()); 189 190 if (g_ip->cache_level == 0) l2_c = 1; 191 else l2_c = 0; 192 193 if (g_ip->cores <= 4) core_in = 2; 194 else if (g_ip->cores <= 8) core_in = 3; 195 else if (g_ip->cores <= 16) core_in = 4; 196 else { 197 cout << "Number of cores should be <= 16!\n"; 198 exit(0); 199 } 200 201 202 // set the lower bound to an appropriate value. this depends on cache associativity 203 if (g_ip->assoc > 2) { 204 i = 2; 205 while (i != g_ip->assoc) { 206 MIN_BANKSIZE *= 2; 207 i *= 2; 208 } 209 } 210 211 iterations = (int)logtwo((int)g_ip->cache_sz / MIN_BANKSIZE); 212 213 if (g_ip->force_wiretype) { 214 if (g_ip->wt == Low_swing) { 215 wt_min = Low_swing; 216 wt_max = Low_swing; 217 } else { 218 wt_min = Global; 219 wt_max = Low_swing - 1; 220 } 221 } else { 222 wt_min = Global; 223 wt_max = Low_swing; 224 } 225 if (g_ip->nuca_bank_count != 0) { // simulate just one bank 226 if (g_ip->nuca_bank_count != 2 && g_ip->nuca_bank_count != 4 && 227 g_ip->nuca_bank_count != 8 && g_ip->nuca_bank_count != 16 && 228 g_ip->nuca_bank_count != 32 && g_ip->nuca_bank_count != 64) { 229 fprintf(stderr, "Incorrect bank count value! Please fix the ", 230 "value in cache.cfg\n"); 231 } 232 bank_start = (int)logtwo((double)g_ip->nuca_bank_count); 233 iterations = bank_start + 1; 234 g_ip->cache_sz = g_ip->cache_sz / g_ip->nuca_bank_count; 235 } 236 cout << "Simulating various NUCA configurations\n"; 237 for (it = bank_start; it < iterations; it++) { 238 /* different bank count values */ 239 ures.tag_array2 = &tag; 240 ures.data_array2 = &data; 241 /* 242 * find the optimal bank organization 243 */ 244 solve(&ures); 245// output_UCA(&ures); 246 bank_count = g_ip->nuca_cache_sz / g_ip->cache_sz; 247 cout << "====" << g_ip->cache_sz << "\n"; 248 249 for (wr = wt_min; wr <= wt_max; wr++) { 250 251 for (ro = 0; ro < ROUTER_TYPES; ro++) { 252 flit_width = (int) router_s[ro]->flit_size; //initialize router 253 nuca_list.back()->nuca_pda.cycle_time = router_s[ro]->cycle_time; 254 255 /* calculate router and wire parameters */ 256 257 double vlength = ures.cache_ht; /* length of the wire (u)*/ 258 double hlength = ures.cache_len; // u 259 260 /* find delay, area, and power for wires */ 261 wire_vertical[wr] = new Wire((enum Wire_type) wr, vlength); 262 wire_horizontal[wr] = new Wire((enum Wire_type) wr, hlength); 263 264 265 hor_hop_lat = 266 calc_cycles(wire_horizontal[wr]->delay, 267 1 /(nuca_list.back()->nuca_pda.cycle_time * 268 .001)); 269 ver_hop_lat = 270 calc_cycles(wire_vertical[wr]->delay, 271 1 / (nuca_list.back()->nuca_pda.cycle_time * 272 .001)); 273 274 /* 275 * assume a grid like topology and explore for optimal network 276 * configuration using different row and column count values. 277 */ 278 for (c = 1; c <= (unsigned int)bank_count; c++) { 279 while (bank_count % c != 0) c++; 280 r = bank_count / c; 281 282 /* 283 * to find the avg access latency of a NUCA cache, uncontended 284 * access time to each bank from the 285 * cache controller is calculated. 286 * avg latency = 287 * sum of the access latencies to individual banks)/bank 288 * count value. 289 */ 290 totno_hops = totno_hhops = totno_vhops = tot_lat = 0; 291 k = 1; 292 for (i = 0; i < r; i++) { 293 for (j = 0; j < c; j++) { 294 /* 295 * vertical hops including the 296 * first hop from the cache controller 297 */ 298 curr_hop = i + 1; 299 curr_hop += j; /* horizontal hops */ 300 totno_hhops += j; 301 totno_vhops += (i + 1); 302 curr_acclat = (i * ver_hop_lat + CONTR_2_BANK_LAT + 303 j * hor_hop_lat); 304 305 tot_lat += curr_acclat; 306 totno_hops += curr_hop; 307 } 308 } 309 avg_lat = tot_lat / bank_count; 310 avg_hop = totno_hops / bank_count; 311 avg_hhop = totno_hhops / bank_count; 312 avg_vhop = totno_vhops / bank_count; 313 314 /* net access latency */ 315 curr_acclat = 2 * avg_lat + 2 * (router_s[ro]->delay * 316 avg_hop) + 317 calc_cycles(ures.access_time, 318 1 / 319 (nuca_list.back()->nuca_pda.cycle_time * 320 .001)); 321 322 /* avg access lat of nuca */ 323 avg_dyn_power = 324 avg_hop * 325 (router_s[ro]->power.readOp.dynamic) + avg_hhop * 326 (wire_horizontal[wr]->power.readOp.dynamic) * 327 (g_ip->block_sz * 8 + 64) + avg_vhop * 328 (wire_vertical[wr]->power.readOp.dynamic) * 329 (g_ip->block_sz * 8 + 64) + ures.power.readOp.dynamic; 330 331 avg_leakage_power = 332 bank_count * router_s[ro]->power.readOp.leakage + 333 avg_hhop * (wire_horizontal[wr]->power.readOp.leakage * 334 wire_horizontal[wr]->delay) * flit_width + 335 avg_vhop * (wire_vertical[wr]->power.readOp.leakage * 336 wire_horizontal[wr]->delay); 337 338 if (curr_acclat < opt_acclat) { 339 opt_acclat = curr_acclat; 340 opt_tot_lat = tot_lat; 341 opt_avg_lat = avg_lat; 342 opt_totno_hops = totno_hops; 343 opt_avg_hop = avg_hop; 344 opt_rows = r; 345 opt_columns = c; 346 opt_dyn_power = avg_dyn_power; 347 opt_leakage_power = avg_leakage_power; 348 } 349 totno_hops = 0; 350 tot_lat = 0; 351 totno_hhops = 0; 352 totno_vhops = 0; 353 } 354 nuca_list.back()->wire_pda.power.readOp.dynamic = 355 opt_avg_hop * flit_width * 356 (wire_horizontal[wr]->power.readOp.dynamic + 357 wire_vertical[wr]->power.readOp.dynamic); 358 nuca_list.back()->avg_hops = opt_avg_hop; 359 /* network delay/power */ 360 nuca_list.back()->h_wire = wire_horizontal[wr]; 361 nuca_list.back()->v_wire = wire_vertical[wr]; 362 nuca_list.back()->router = router_s[ro]; 363 /* bank delay/power */ 364 365 nuca_list.back()->bank_pda.delay = ures.access_time; 366 nuca_list.back()->bank_pda.power = ures.power; 367 nuca_list.back()->bank_pda.area.h = ures.cache_ht; 368 nuca_list.back()->bank_pda.area.w = ures.cache_len; 369 nuca_list.back()->bank_pda.cycle_time = ures.cycle_time; 370 371 num_cyc = calc_cycles(nuca_list.back()->bank_pda.delay /*s*/, 372 1 / 373 (nuca_list.back()->nuca_pda.cycle_time * 374 .001/*GHz*/)); 375 if (num_cyc % 2 != 0) num_cyc++; 376 if (num_cyc > 16) num_cyc = 16; // we have data only up to 16 cycles 377 378 if (it < 7) { 379 nuca_list.back()->nuca_pda.delay = opt_acclat + 380 cont_stats[l2_c][core_in][ro][it][num_cyc/2-1]; 381 nuca_list.back()->contention = 382 cont_stats[l2_c][core_in][ro][it][num_cyc/2-1]; 383 } else { 384 nuca_list.back()->nuca_pda.delay = opt_acclat + 385 cont_stats[l2_c][core_in][ro][7][num_cyc/2-1]; 386 nuca_list.back()->contention = 387 cont_stats[l2_c][core_in][ro][7][num_cyc/2-1]; 388 } 389 nuca_list.back()->nuca_pda.power.readOp.dynamic = opt_dyn_power; 390 nuca_list.back()->nuca_pda.power.readOp.leakage = opt_leakage_power; 391 392 /* array organization */ 393 nuca_list.back()->bank_count = bank_count; 394 nuca_list.back()->rows = opt_rows; 395 nuca_list.back()->columns = opt_columns; 396 calculate_nuca_area (nuca_list.back()); 397 398 minval.update_min_values(nuca_list.back()); 399 nuca_list.push_back(new nuca_org_t()); 400 opt_acclat = BIGNUM; 401 402 } 403 } 404 g_ip->cache_sz /= 2; 405 } 406 407 delete(nuca_list.back()); 408 nuca_list.pop_back(); 409 opt_n = find_optimal_nuca(&nuca_list, &minval); 410 print_nuca(opt_n); 411 g_ip->cache_sz = g_ip->nuca_cache_sz / opt_n->bank_count; 412 413 list<nuca_org_t *>::iterator niter; 414 for (niter = nuca_list.begin(); niter != nuca_list.end(); ++niter) { 415 delete *niter; 416 } 417 nuca_list.clear(); 418 419 for (int i = 0; i < ROUTER_TYPES; i++) { 420 delete router_s[i]; 421 } 422 g_ip->display_ip(); 423 // g_ip->force_cache_config = true; 424 // g_ip->ndwl = 8; 425 // g_ip->ndbl = 16; 426 // g_ip->nspd = 4; 427 // g_ip->ndcm = 1; 428 // g_ip->ndsam1 = 8; 429 // g_ip->ndsam2 = 32; 430 431} 432 433 434void 435Nuca::print_nuca (nuca_org_t *fr) { 436 printf("\n---------- CACTI version 6.5, Non-uniform Cache Access " 437 "----------\n\n"); 438 printf("Optimal number of banks - %d\n", fr->bank_count); 439 printf("Grid organization rows x columns - %d x %d\n", 440 fr->rows, fr->columns); 441 printf("Network frequency - %g GHz\n", 442 (1 / fr->nuca_pda.cycle_time)*1e3); 443 printf("Cache dimension (mm x mm) - %g x %g\n", 444 fr->nuca_pda.area.h, 445 fr->nuca_pda.area.w); 446 447 fr->router->print_router(); 448 449 printf("\n\nWire stats:\n"); 450 if (fr->h_wire->wt == Global) { 451 printf("\tWire type - Full swing global wires with least " 452 "possible delay\n"); 453 } else if (fr->h_wire->wt == Global_5) { 454 printf("\tWire type - Full swing global wires with " 455 "5%% delay penalty\n"); 456 } else if (fr->h_wire->wt == Global_10) { 457 printf("\tWire type - Full swing global wires with " 458 "10%% delay penalty\n"); 459 } else if (fr->h_wire->wt == Global_20) { 460 printf("\tWire type - Full swing global wires with " 461 "20%% delay penalty\n"); 462 } else if (fr->h_wire->wt == Global_30) { 463 printf("\tWire type - Full swing global wires with " 464 "30%% delay penalty\n"); 465 } else if (fr->h_wire->wt == Low_swing) { 466 printf("\tWire type - Low swing wires\n"); 467 } 468 469 printf("\tHorizontal link delay - %g (ns)\n", 470 fr->h_wire->delay*1e9); 471 printf("\tVertical link delay - %g (ns)\n", 472 fr->v_wire->delay*1e9); 473 printf("\tDelay/length - %g (ns/mm)\n", 474 fr->h_wire->delay*1e9 / fr->bank_pda.area.w); 475 printf("\tHorizontal link energy -dynamic/access %g (nJ)\n" 476 "\t -leakage %g (nW)\n\n", 477 fr->h_wire->power.readOp.dynamic*1e9, 478 fr->h_wire->power.readOp.leakage*1e9); 479 printf("\tVertical link energy -dynamic/access %g (nJ)\n" 480 "\t -leakage %g (nW)\n\n", 481 fr->v_wire->power.readOp.dynamic*1e9, 482 fr->v_wire->power.readOp.leakage*1e9); 483 printf("\n\n"); 484 fr->v_wire->print_wire(); 485 printf("\n\nBank stats:\n"); 486} 487 488 489nuca_org_t * 490Nuca::find_optimal_nuca (list<nuca_org_t *> *n, min_values_t *minval) { 491 double cost = 0; 492 double min_cost = BIGNUM; 493 nuca_org_t *res = NULL; 494 float d, a, dp, lp, c; 495 int v; 496 dp = g_ip->dynamic_power_wt_nuca; 497 lp = g_ip->leakage_power_wt_nuca; 498 a = g_ip->area_wt_nuca; 499 d = g_ip->delay_wt_nuca; 500 c = g_ip->cycle_time_wt_nuca; 501 502 list<nuca_org_t *>::iterator niter; 503 504 505 for (niter = n->begin(); niter != n->end(); niter++) { 506 fprintf(stderr, "\n-----------------------------" 507 "---------------\n"); 508 509 510 printf("NUCA___stats %d \tbankcount: lat = %g \tdynP = %g \twt = %d\t " 511 "bank_dpower = %g \tleak = %g \tcycle = %g\n", 512 (*niter)->bank_count, 513 (*niter)->nuca_pda.delay, 514 (*niter)->nuca_pda.power.readOp.dynamic, 515 (*niter)->h_wire->wt, 516 (*niter)->bank_pda.power.readOp.dynamic, 517 (*niter)->nuca_pda.power.readOp.leakage, 518 (*niter)->nuca_pda.cycle_time); 519 520 521 if (g_ip->ed == 1) { 522 cost = ((*niter)->nuca_pda.delay / minval->min_delay) * 523 ((*niter)->nuca_pda.power.readOp.dynamic / minval->min_dyn); 524 if (min_cost > cost) { 525 min_cost = cost; 526 res = ((*niter)); 527 } 528 } else if (g_ip->ed == 2) { 529 cost = ((*niter)->nuca_pda.delay / minval->min_delay) * 530 ((*niter)->nuca_pda.delay / minval->min_delay) * 531 ((*niter)->nuca_pda.power.readOp.dynamic / minval->min_dyn); 532 if (min_cost > cost) { 533 min_cost = cost; 534 res = ((*niter)); 535 } 536 } else { 537 /* 538 * check whether the current organization 539 * meets the input deviation constraints 540 */ 541 v = check_nuca_org((*niter), minval); 542 if (minval->min_leakage == 0) minval->min_leakage = 0.1; //FIXME remove this after leakage modeling 543 544 if (v) { 545 cost = (d * ((*niter)->nuca_pda.delay / minval->min_delay) + 546 c * ((*niter)->nuca_pda.cycle_time / minval->min_cyc) + 547 dp * ((*niter)->nuca_pda.power.readOp.dynamic / 548 minval->min_dyn) + 549 lp * ((*niter)->nuca_pda.power.readOp.leakage / 550 minval->min_leakage) + 551 a * ((*niter)->nuca_pda.area.get_area() / 552 minval->min_area)); 553 fprintf(stderr, "cost = %g\n", cost); 554 555 if (min_cost > cost) { 556 min_cost = cost; 557 res = ((*niter)); 558 } 559 } else { 560 niter = n->erase(niter); 561 if (niter != n->begin()) 562 niter --; 563 } 564 } 565 } 566 return res; 567} 568 569int 570Nuca::check_nuca_org (nuca_org_t *n, min_values_t *minval) { 571 if (((n->nuca_pda.delay - minval->min_delay)*100 / minval->min_delay) > 572 g_ip->delay_dev_nuca) { 573 return 0; 574 } 575 if (((n->nuca_pda.power.readOp.dynamic - minval->min_dyn) / 576 minval->min_dyn)*100 > 577 g_ip->dynamic_power_dev_nuca) { 578 return 0; 579 } 580 if (((n->nuca_pda.power.readOp.leakage - minval->min_leakage) / 581 minval->min_leakage)*100 > 582 g_ip->leakage_power_dev_nuca) { 583 return 0; 584 } 585 if (((n->nuca_pda.cycle_time - minval->min_cyc) / minval->min_cyc)*100 > 586 g_ip->cycle_time_dev_nuca) { 587 return 0; 588 } 589 if (((n->nuca_pda.area.get_area() - minval->min_area) / minval->min_area) * 590 100 > 591 g_ip->area_dev_nuca) { 592 return 0; 593 } 594 return 1; 595} 596 597void 598Nuca::calculate_nuca_area (nuca_org_t *nuca) { 599 nuca->nuca_pda.area.h = 600 nuca->rows * ((nuca->h_wire->wire_width + 601 nuca->h_wire->wire_spacing) 602 * nuca->router->flit_size + 603 nuca->bank_pda.area.h); 604 605 nuca->nuca_pda.area.w = 606 nuca->columns * ((nuca->v_wire->wire_width + 607 nuca->v_wire->wire_spacing) 608 * nuca->router->flit_size + 609 nuca->bank_pda.area.w); 610} 611 612