// nuca.cc revision 10152
1/***************************************************************************** 2 * McPAT/CACTI 3 * SOFTWARE LICENSE AGREEMENT 4 * Copyright 2012 Hewlett-Packard Development Company, L.P. 5 * All Rights Reserved 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are 9 * met: redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer; 11 * redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution; 14 * neither the name of the copyright holders nor the names of its 15 * contributors may be used to endorse or promote products derived from 16 * this software without specific prior written permission. 17 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” 29 * 30 ***************************************************************************/ 31 32 33 34#include <cassert> 35 36#include "Ucache.h" 37#include "nuca.h" 38 39unsigned int MIN_BANKSIZE=65536; 40#define FIXED_OVERHEAD 55e-12 /* clock skew and jitter in s. 
Ref: Hrishikesh et al ISCA 01 */ 41#define LATCH_DELAY 28e-12 /* latch delay in s (later should use FO4 TODO) */ 42#define CONTR_2_BANK_LAT 0 43 44int cont_stats[2 /*l2 or l3*/][5/* cores */][ROUTER_TYPES][7 /*banks*/][8 /* cycle time */]; 45 46 Nuca::Nuca( 47 TechnologyParameter::DeviceType *dt = &(g_tp.peri_global) 48 ):deviceType(dt) 49{ 50 init_cont(); 51} 52 53void 54Nuca::init_cont() 55{ 56 FILE *cont; 57 char line[5000]; 58 char jk[5000]; 59 cont = fopen("contention.dat", "r"); 60 if (!cont) { 61 cout << "contention.dat file is missing!\n"; 62 exit(0); 63 } 64 65 for(int i=0; i<2; i++) { 66 for(int j=2; j<5; j++) { 67 for(int k=0; k<ROUTER_TYPES; k++) { 68 for(int l=0;l<7; l++) { 69 int *temp = cont_stats[i/*l2 or l3*/][j/*core*/][k/*64 or 128 or 256 link bw*/][l /* no banks*/]; 70 assert(fscanf(cont, "%[^\n]\n", line) != EOF); 71 sscanf(line, "%[^:]: %d %d %d %d %d %d %d %d",jk, &temp[0], &temp[1], &temp[2], &temp[3], 72 &temp[4], &temp[5], &temp[6], &temp[7]); 73 } 74 } 75 } 76 } 77 fclose(cont); 78} 79 80 void 81Nuca::print_cont_stats() 82{ 83 for(int i=0; i<2; i++) { 84 for(int j=2; j<5; j++) { 85 for(int k=0; k<ROUTER_TYPES; k++) { 86 for(int l=0;l<7; l++) { 87 for(int m=0;l<7; l++) { 88 cout << cont_stats[i][j][k][l][m] << " "; 89 } 90 cout << endl; 91 } 92 } 93 } 94 } 95 cout << endl; 96} 97 98Nuca::~Nuca(){ 99 for (int i = wt_min; i <= wt_max; i++) { 100 delete wire_vertical[i]; 101 delete wire_horizontal[i]; 102 } 103} 104 105/* converts latency (in s) to cycles depending upon the FREQUENCY (in GHz) */ 106 int 107Nuca::calc_cycles(double lat, double oper_freq) 108{ 109 //TODO: convert latch delay to FO4 */ 110 double cycle_time = (1.0/(oper_freq*1e9)); /*s*/ 111 cycle_time -= LATCH_DELAY; 112 cycle_time -= FIXED_OVERHEAD; 113 114 return (int)ceil(lat/cycle_time); 115} 116 117 118nuca_org_t::~nuca_org_t() { 119 // if(h_wire) delete h_wire; 120 // if(v_wire) delete v_wire; 121 // if(router) delete router; 122} 123 124/* 125 * Version - 6.0 126 * 127 * 
Perform exhaustive search across different bank organizatons, 128 * router configurations, grid organizations, and wire models and 129 * find an optimal NUCA organization 130 * For different bank count values 131 * 1. Optimal bank organization is calculated 132 * 2. For each bank organization, find different NUCA organizations 133 * using various router configurations, grid organizations, 134 * and wire models. 135 * 3. NUCA model with the least cost is picked for 136 * this particular bank count 137 * Finally include contention statistics and find the optimal 138 * NUCA configuration 139 */ 140 void 141Nuca::sim_nuca() 142{ 143 /* temp variables */ 144 int it, ro, wr; 145 int num_cyc; 146 unsigned int i, j, k; 147 unsigned int r, c; 148 int l2_c; 149 int bank_count = 0; 150 uca_org_t ures; 151 nuca_org_t *opt_n; 152 mem_array tag, data; 153 list<nuca_org_t *> nuca_list; 154 Router *router_s[ROUTER_TYPES]; 155 router_s[0] = new Router(64.0, 8, 4, &(g_tp.peri_global)); 156 router_s[0]->print_router(); 157 router_s[1] = new Router(128.0, 8, 4, &(g_tp.peri_global)); 158 router_s[1]->print_router(); 159 router_s[2] = new Router(256.0, 8, 4, &(g_tp.peri_global)); 160 router_s[2]->print_router(); 161 162 int core_in; // to store no. of cores 163 164 /* to search diff grid organizations */ 165 double curr_hop, totno_hops, totno_hhops, totno_vhops, tot_lat, 166 curr_acclat; 167 double avg_lat, avg_hop, avg_hhop, avg_vhop, avg_dyn_power, 168 avg_leakage_power; 169 170 double opt_acclat = INF, opt_avg_lat = INF, opt_tot_lat = INF; 171 int opt_rows = 0; 172 int opt_columns = 0; 173 double opt_totno_hops = 0; 174 double opt_avg_hop = 0; 175 double opt_dyn_power = 0, opt_leakage_power = 0; 176 min_values_t minval; 177 178 int bank_start = 0; 179 180 int flit_width = 0; 181 182 /* vertical and horizontal hop latency values */ 183 int ver_hop_lat, hor_hop_lat; /* in cycles */ 184 185 186 /* no. 
of different bank sizes to consider */ 187 int iterations; 188 189 190 g_ip->nuca_cache_sz = g_ip->cache_sz; 191 nuca_list.push_back(new nuca_org_t()); 192 193 if (g_ip->cache_level == 0) l2_c = 1; 194 else l2_c = 0; 195 196 if (g_ip->cores <= 4) core_in = 2; 197 else if (g_ip->cores <= 8) core_in = 3; 198 else if (g_ip->cores <= 16) core_in = 4; 199 else {cout << "Number of cores should be <= 16!\n"; exit(0);} 200 201 202 // set the lower bound to an appropriate value. this depends on cache associativity 203 if (g_ip->assoc > 2) { 204 i = 2; 205 while (i != g_ip->assoc) { 206 MIN_BANKSIZE *= 2; 207 i *= 2; 208 } 209 } 210 211 iterations = (int)logtwo((int)g_ip->cache_sz/MIN_BANKSIZE); 212 213 if (g_ip->force_wiretype) 214 { 215 if (g_ip->wt == Low_swing) { 216 wt_min = Low_swing; 217 wt_max = Low_swing; 218 } 219 else { 220 wt_min = Global; 221 wt_max = Low_swing-1; 222 } 223 } 224 else { 225 wt_min = Global; 226 wt_max = Low_swing; 227 } 228 if (g_ip->nuca_bank_count != 0) { // simulate just one bank 229 if (g_ip->nuca_bank_count != 2 && g_ip->nuca_bank_count != 4 && 230 g_ip->nuca_bank_count != 8 && g_ip->nuca_bank_count != 16 && 231 g_ip->nuca_bank_count != 32 && g_ip->nuca_bank_count != 64) { 232 fprintf(stderr,"Incorrect bank count value! 
Please fix the value in cache.cfg\n"); 233 } 234 bank_start = (int)logtwo((double)g_ip->nuca_bank_count); 235 iterations = bank_start+1; 236 g_ip->cache_sz = g_ip->cache_sz/g_ip->nuca_bank_count; 237 } 238 cout << "Simulating various NUCA configurations\n"; 239 for (it=bank_start; it<iterations; it++) { /* different bank count values */ 240 ures.tag_array2 = &tag; 241 ures.data_array2 = &data; 242 /* 243 * find the optimal bank organization 244 */ 245 solve(&ures); 246// output_UCA(&ures); 247 bank_count = g_ip->nuca_cache_sz/g_ip->cache_sz; 248 cout << "====" << g_ip->cache_sz << "\n"; 249 250 for (wr=wt_min; wr<=wt_max; wr++) { 251 252 for (ro=0; ro<ROUTER_TYPES; ro++) 253 { 254 flit_width = (int) router_s[ro]->flit_size; //initialize router 255 nuca_list.back()->nuca_pda.cycle_time = router_s[ro]->cycle_time; 256 257 /* calculate router and wire parameters */ 258 259 double vlength = ures.cache_ht; /* length of the wire (u)*/ 260 double hlength = ures.cache_len; // u 261 262 /* find delay, area, and power for wires */ 263 wire_vertical[wr] = new Wire((enum Wire_type) wr, vlength); 264 wire_horizontal[wr] = new Wire((enum Wire_type) wr, hlength); 265 266 267 hor_hop_lat = calc_cycles(wire_horizontal[wr]->delay, 268 1/(nuca_list.back()->nuca_pda.cycle_time*.001)); 269 ver_hop_lat = calc_cycles(wire_vertical[wr]->delay, 270 1/(nuca_list.back()->nuca_pda.cycle_time*.001)); 271 272 /* 273 * assume a grid like topology and explore for optimal network 274 * configuration using different row and column count values. 275 */ 276 for (c=1; c<=(unsigned int)bank_count; c++) { 277 while (bank_count%c != 0) c++; 278 r = bank_count/c; 279 280 /* 281 * to find the avg access latency of a NUCA cache, uncontended 282 * access time to each bank from the 283 * cache controller is calculated. 284 * avg latency = 285 * sum of the access latencies to individual banks)/bank 286 * count value. 
287 */ 288 totno_hops = totno_hhops = totno_vhops = tot_lat = 0; 289 k = 1; 290 for (i=0; i<r; i++) { 291 for (j=0; j<c; j++) { 292 /* 293 * vertical hops including the 294 * first hop from the cache controller 295 */ 296 curr_hop = i + 1; 297 curr_hop += j; /* horizontal hops */ 298 totno_hhops += j; 299 totno_vhops += (i+1); 300 curr_acclat = (i * ver_hop_lat + CONTR_2_BANK_LAT + 301 j * hor_hop_lat); 302 303 tot_lat += curr_acclat; 304 totno_hops += curr_hop; 305 } 306 } 307 avg_lat = tot_lat/bank_count; 308 avg_hop = totno_hops/bank_count; 309 avg_hhop = totno_hhops/bank_count; 310 avg_vhop = totno_vhops/bank_count; 311 312 /* net access latency */ 313 curr_acclat = 2*avg_lat + 2*(router_s[ro]->delay*avg_hop) + 314 calc_cycles(ures.access_time, 315 1/(nuca_list.back()->nuca_pda.cycle_time*.001)); 316 317 /* avg access lat of nuca */ 318 avg_dyn_power = 319 avg_hop * 320 (router_s[ro]->power.readOp.dynamic) + avg_hhop * 321 (wire_horizontal[wr]->power.readOp.dynamic) * 322 (g_ip->block_sz*8 + 64) + avg_vhop * 323 (wire_vertical[wr]->power.readOp.dynamic) * 324 (g_ip->block_sz*8 + 64) + ures.power.readOp.dynamic; 325 326 avg_leakage_power = 327 bank_count * router_s[ro]->power.readOp.leakage + 328 avg_hhop * (wire_horizontal[wr]->power.readOp.leakage* 329 wire_horizontal[wr]->delay) * flit_width + 330 avg_vhop * (wire_vertical[wr]->power.readOp.leakage * 331 wire_horizontal[wr]->delay); 332 333 if (curr_acclat < opt_acclat) { 334 opt_acclat = curr_acclat; 335 opt_tot_lat = tot_lat; 336 opt_avg_lat = avg_lat; 337 opt_totno_hops = totno_hops; 338 opt_avg_hop = avg_hop; 339 opt_rows = r; 340 opt_columns = c; 341 opt_dyn_power = avg_dyn_power; 342 opt_leakage_power = avg_leakage_power; 343 } 344 totno_hops = 0; 345 tot_lat = 0; 346 totno_hhops = 0; 347 totno_vhops = 0; 348 } 349 nuca_list.back()->wire_pda.power.readOp.dynamic = 350 opt_avg_hop * flit_width * 351 (wire_horizontal[wr]->power.readOp.dynamic + 352 wire_vertical[wr]->power.readOp.dynamic); 353 
nuca_list.back()->avg_hops = opt_avg_hop; 354 /* network delay/power */ 355 nuca_list.back()->h_wire = wire_horizontal[wr]; 356 nuca_list.back()->v_wire = wire_vertical[wr]; 357 nuca_list.back()->router = router_s[ro]; 358 /* bank delay/power */ 359 360 nuca_list.back()->bank_pda.delay = ures.access_time; 361 nuca_list.back()->bank_pda.power = ures.power; 362 nuca_list.back()->bank_pda.area.h = ures.cache_ht; 363 nuca_list.back()->bank_pda.area.w = ures.cache_len; 364 nuca_list.back()->bank_pda.cycle_time = ures.cycle_time; 365 366 num_cyc = calc_cycles(nuca_list.back()->bank_pda.delay /*s*/, 367 1/(nuca_list.back()->nuca_pda.cycle_time*.001/*GHz*/)); 368 if(num_cyc%2 != 0) num_cyc++; 369 if (num_cyc > 16) num_cyc = 16; // we have data only up to 16 cycles 370 371 if (it < 7) { 372 nuca_list.back()->nuca_pda.delay = opt_acclat + 373 cont_stats[l2_c][core_in][ro][it][num_cyc/2-1]; 374 nuca_list.back()->contention = 375 cont_stats[l2_c][core_in][ro][it][num_cyc/2-1]; 376 } 377 else { 378 nuca_list.back()->nuca_pda.delay = opt_acclat + 379 cont_stats[l2_c][core_in][ro][7][num_cyc/2-1]; 380 nuca_list.back()->contention = 381 cont_stats[l2_c][core_in][ro][7][num_cyc/2-1]; 382 } 383 nuca_list.back()->nuca_pda.power.readOp.dynamic = opt_dyn_power; 384 nuca_list.back()->nuca_pda.power.readOp.leakage = opt_leakage_power; 385 386 /* array organization */ 387 nuca_list.back()->bank_count = bank_count; 388 nuca_list.back()->rows = opt_rows; 389 nuca_list.back()->columns = opt_columns; 390 calculate_nuca_area (nuca_list.back()); 391 392 minval.update_min_values(nuca_list.back()); 393 nuca_list.push_back(new nuca_org_t()); 394 opt_acclat = BIGNUM; 395 396 } 397 } 398 g_ip->cache_sz /= 2; 399 } 400 401 delete(nuca_list.back()); 402 nuca_list.pop_back(); 403 opt_n = find_optimal_nuca(&nuca_list, &minval); 404 print_nuca(opt_n); 405 g_ip->cache_sz = g_ip->nuca_cache_sz/opt_n->bank_count; 406 407 list<nuca_org_t *>::iterator niter; 408 for (niter = nuca_list.begin(); niter != 
nuca_list.end(); ++niter) 409 { 410 delete *niter; 411 } 412 nuca_list.clear(); 413 414 for(int i=0; i < ROUTER_TYPES; i++) 415 { 416 delete router_s[i]; 417 } 418 g_ip->display_ip(); 419 // g_ip->force_cache_config = true; 420 // g_ip->ndwl = 8; 421 // g_ip->ndbl = 16; 422 // g_ip->nspd = 4; 423 // g_ip->ndcm = 1; 424 // g_ip->ndsam1 = 8; 425 // g_ip->ndsam2 = 32; 426 427} 428 429 430 void 431Nuca::print_nuca (nuca_org_t *fr) 432{ 433 printf("\n---------- CACTI version 6.5, Non-uniform Cache Access " 434 "----------\n\n"); 435 printf("Optimal number of banks - %d\n", fr->bank_count); 436 printf("Grid organization rows x columns - %d x %d\n", 437 fr->rows, fr->columns); 438 printf("Network frequency - %g GHz\n", 439 (1/fr->nuca_pda.cycle_time)*1e3); 440 printf("Cache dimension (mm x mm) - %g x %g\n", 441 fr->nuca_pda.area.h, 442 fr->nuca_pda.area.w); 443 444 fr->router->print_router(); 445 446 printf("\n\nWire stats:\n"); 447 if (fr->h_wire->wt == Global) { 448 printf("\tWire type - Full swing global wires with least " 449 "possible delay\n"); 450 } 451 else if (fr->h_wire->wt == Global_5) { 452 printf("\tWire type - Full swing global wires with " 453 "5%% delay penalty\n"); 454 } 455 else if (fr->h_wire->wt == Global_10) { 456 printf("\tWire type - Full swing global wires with " 457 "10%% delay penalty\n"); 458 } 459 else if (fr->h_wire->wt == Global_20) { 460 printf("\tWire type - Full swing global wires with " 461 "20%% delay penalty\n"); 462 } 463 else if (fr->h_wire->wt == Global_30) { 464 printf("\tWire type - Full swing global wires with " 465 "30%% delay penalty\n"); 466 } 467 else if(fr->h_wire->wt == Low_swing) { 468 printf("\tWire type - Low swing wires\n"); 469 } 470 471 printf("\tHorizontal link delay - %g (ns)\n", 472 fr->h_wire->delay*1e9); 473 printf("\tVertical link delay - %g (ns)\n", 474 fr->v_wire->delay*1e9); 475 printf("\tDelay/length - %g (ns/mm)\n", 476 fr->h_wire->delay*1e9/fr->bank_pda.area.w); 477 printf("\tHorizontal link energy 
-dynamic/access %g (nJ)\n" 478 "\t -leakage %g (nW)\n\n", 479 fr->h_wire->power.readOp.dynamic*1e9, 480 fr->h_wire->power.readOp.leakage*1e9); 481 printf("\tVertical link energy -dynamic/access %g (nJ)\n" 482 "\t -leakage %g (nW)\n\n", 483 fr->v_wire->power.readOp.dynamic*1e9, 484 fr->v_wire->power.readOp.leakage*1e9); 485 printf("\n\n"); 486 fr->v_wire->print_wire(); 487 printf("\n\nBank stats:\n"); 488} 489 490 491 nuca_org_t * 492Nuca::find_optimal_nuca (list<nuca_org_t *> *n, min_values_t *minval) 493{ 494 double cost = 0; 495 double min_cost = BIGNUM; 496 nuca_org_t *res = NULL; 497 float d, a, dp, lp, c; 498 int v; 499 dp = g_ip->dynamic_power_wt_nuca; 500 lp = g_ip->leakage_power_wt_nuca; 501 a = g_ip->area_wt_nuca; 502 d = g_ip->delay_wt_nuca; 503 c = g_ip->cycle_time_wt_nuca; 504 505 list<nuca_org_t *>::iterator niter; 506 507 508 for (niter = n->begin(); niter != n->end(); niter++) { 509 fprintf(stderr, "\n-----------------------------" 510 "---------------\n"); 511 512 513 printf("NUCA___stats %d \tbankcount: lat = %g \tdynP = %g \twt = %d\t " 514 "bank_dpower = %g \tleak = %g \tcycle = %g\n", 515 (*niter)->bank_count, 516 (*niter)->nuca_pda.delay, 517 (*niter)->nuca_pda.power.readOp.dynamic, 518 (*niter)->h_wire->wt, 519 (*niter)->bank_pda.power.readOp.dynamic, 520 (*niter)->nuca_pda.power.readOp.leakage, 521 (*niter)->nuca_pda.cycle_time); 522 523 524 if (g_ip->ed == 1) { 525 cost = ((*niter)->nuca_pda.delay/minval->min_delay)* 526 ((*niter)->nuca_pda.power.readOp.dynamic/minval->min_dyn); 527 if (min_cost > cost) { 528 min_cost = cost; 529 res = ((*niter)); 530 } 531 } 532 else if (g_ip->ed == 2) { 533 cost = ((*niter)->nuca_pda.delay/minval->min_delay)* 534 ((*niter)->nuca_pda.delay/minval->min_delay)* 535 ((*niter)->nuca_pda.power.readOp.dynamic/minval->min_dyn); 536 if (min_cost > cost) { 537 min_cost = cost; 538 res = ((*niter)); 539 } 540 } 541 else { 542 /* 543 * check whether the current organization 544 * meets the input deviation constraints 
545 */ 546 v = check_nuca_org((*niter), minval); 547 if (minval->min_leakage == 0) minval->min_leakage = 0.1; //FIXME remove this after leakage modeling 548 549 if (v) { 550 cost = (d * ((*niter)->nuca_pda.delay/minval->min_delay) + 551 c * ((*niter)->nuca_pda.cycle_time/minval->min_cyc) + 552 dp * ((*niter)->nuca_pda.power.readOp.dynamic/minval->min_dyn) + 553 lp * ((*niter)->nuca_pda.power.readOp.leakage/minval->min_leakage) + 554 a * ((*niter)->nuca_pda.area.get_area()/minval->min_area)); 555 fprintf(stderr, "cost = %g\n", cost); 556 557 if (min_cost > cost) { 558 min_cost = cost; 559 res = ((*niter)); 560 } 561 } 562 else { 563 niter = n->erase(niter); 564 if (niter !=n->begin()) 565 niter --; 566 } 567 } 568 } 569 return res; 570} 571 572 int 573Nuca::check_nuca_org (nuca_org_t *n, min_values_t *minval) 574{ 575 if (((n->nuca_pda.delay - minval->min_delay)*100/minval->min_delay) > g_ip->delay_dev_nuca) { 576 return 0; 577 } 578 if (((n->nuca_pda.power.readOp.dynamic - minval->min_dyn)/minval->min_dyn)*100 > 579 g_ip->dynamic_power_dev_nuca) { 580 return 0; 581 } 582 if (((n->nuca_pda.power.readOp.leakage - minval->min_leakage)/minval->min_leakage)*100 > 583 g_ip->leakage_power_dev_nuca) { 584 return 0; 585 } 586 if (((n->nuca_pda.cycle_time - minval->min_cyc)/minval->min_cyc)*100 > 587 g_ip->cycle_time_dev_nuca) { 588 return 0; 589 } 590 if (((n->nuca_pda.area.get_area() - minval->min_area)/minval->min_area)*100 > 591 g_ip->area_dev_nuca) { 592 return 0; 593 } 594 return 1; 595} 596 597 void 598Nuca::calculate_nuca_area (nuca_org_t *nuca) 599{ 600 nuca->nuca_pda.area.h= 601 nuca->rows * ((nuca->h_wire->wire_width + 602 nuca->h_wire->wire_spacing) 603 * nuca->router->flit_size + 604 nuca->bank_pda.area.h); 605 606 nuca->nuca_pda.area.w = 607 nuca->columns * ((nuca->v_wire->wire_width + 608 nuca->v_wire->wire_spacing) 609 * nuca->router->flit_size + 610 nuca->bank_pda.area.w); 611} 612 613