Deleted Added
sdiff udiff text old ( 10152:52c552138ba1 ) new ( 10234:5cb711fa6176 )
full compact
1/*****************************************************************************
2 * McPAT/CACTI
3 * SOFTWARE LICENSE AGREEMENT
4 * Copyright 2012 Hewlett-Packard Development Company, L.P.
5 * Copyright (c) 2010-2013 Advanced Micro Devices, Inc.
6 * All Rights Reserved
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions are
10 * met: redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer;
12 * redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the

--- 7 unchanged lines hidden (view full) ---

21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 *
31 ***************************************************************************/
32
33
34
35#include <cassert>
36
37#include "Ucache.h"
38#include "nuca.h"
39
40unsigned int MIN_BANKSIZE = 65536;
41#define FIXED_OVERHEAD 55e-12 /* clock skew and jitter in s. Ref: Hrishikesh et al ISCA 01 */
42#define LATCH_DELAY 28e-12 /* latch delay in s (later should use FO4 TODO) */
43#define CONTR_2_BANK_LAT 0
44
45int cont_stats[2 /*l2 or l3*/][5/* cores */][ROUTER_TYPES][7 /*banks*/][8 /* cycle time */];
46
47Nuca::Nuca(
48 TechnologyParameter::DeviceType *dt = &(g_tp.peri_global)
49): deviceType(dt) {
50 init_cont();
51}
52
53void
54Nuca::init_cont() {
55 FILE *cont;
56 char line[5000];
57 char jk[5000];
58 cont = fopen("contention.dat", "r");
59 if (!cont) {
60 cout << "contention.dat file is missing!\n";
61 exit(0);
62 }
63
64 for (int i = 0; i < 2; i++) {
65 for (int j = 2; j < 5; j++) {
66 for (int k = 0; k < ROUTER_TYPES; k++) {
67 for (int l = 0; l < 7; l++) {
68 int *temp = cont_stats[i/*l2 or l3*/][j/*core*/][k/*64 or 128 or 256 link bw*/][l /* no banks*/];
69 assert(fscanf(cont, "%[^\n]\n", line) != EOF);
70 sscanf(line, "%[^:]: %d %d %d %d %d %d %d %d", jk,
71 &temp[0], &temp[1], &temp[2], &temp[3],
72 &temp[4], &temp[5], &temp[6], &temp[7]);
73 }
74 }
75 }
76 }
77 fclose(cont);
78}
79
80void
81Nuca::print_cont_stats() {
82 for (int i = 0; i < 2; i++) {
83 for (int j = 2; j < 5; j++) {
84 for (int k = 0; k < ROUTER_TYPES; k++) {
85 for (int l = 0; l < 7; l++) {
86 for (int m = 0; l < 7; l++) {
87 cout << cont_stats[i][j][k][l][m] << " ";
88 }
89 cout << endl;
90 }
91 }
92 }
93 }
94 cout << endl;
95}
96
97Nuca::~Nuca() {
98 for (int i = wt_min; i <= wt_max; i++) {
99 delete wire_vertical[i];
100 delete wire_horizontal[i];
101 }
102}
103
104/* converts latency (in s) to cycles depending upon the FREQUENCY (in GHz) */
105int
106Nuca::calc_cycles(double lat, double oper_freq) {
107 //TODO: convert latch delay to FO4 */
108 double cycle_time = (1.0 / (oper_freq * 1e9)); /*s*/
109 cycle_time -= LATCH_DELAY;
110 cycle_time -= FIXED_OVERHEAD;
111
112 return (int)ceil(lat / cycle_time);
113}
114
115
116nuca_org_t::~nuca_org_t() {
117 // if(h_wire) delete h_wire;
118 // if(v_wire) delete v_wire;
119 // if(router) delete router;
120}
121
122/*
123 * Version - 6.0
124 *
125 * Perform exhaustive search across different bank organizatons,
126 * router configurations, grid organizations, and wire models and
127 * find an optimal NUCA organization
128 * For different bank count values
129 * 1. Optimal bank organization is calculated
130 * 2. For each bank organization, find different NUCA organizations
131 * using various router configurations, grid organizations,
132 * and wire models.
133 * 3. NUCA model with the least cost is picked for
134 * this particular bank count
135 * Finally include contention statistics and find the optimal
136 * NUCA configuration
137 */
138void
139Nuca::sim_nuca() {
140 /* temp variables */
141 int it, ro, wr;
142 int num_cyc;
143 unsigned int i, j, k;
144 unsigned int r, c;
145 int l2_c;
146 int bank_count = 0;
147 uca_org_t ures;
148 nuca_org_t *opt_n;
149 mem_array tag, data;
150 list<nuca_org_t *> nuca_list;
151 Router *router_s[ROUTER_TYPES];
152 router_s[0] = new Router(64.0, 8, 4, &(g_tp.peri_global));
153 router_s[0]->print_router();
154 router_s[1] = new Router(128.0, 8, 4, &(g_tp.peri_global));
155 router_s[1]->print_router();
156 router_s[2] = new Router(256.0, 8, 4, &(g_tp.peri_global));
157 router_s[2]->print_router();
158
159 int core_in; // to store no. of cores
160
161 /* to search diff grid organizations */
162 double curr_hop, totno_hops, totno_hhops, totno_vhops, tot_lat,
163 curr_acclat;
164 double avg_lat, avg_hop, avg_hhop, avg_vhop, avg_dyn_power,
165 avg_leakage_power;
166
167 double opt_acclat = INF, opt_avg_lat = INF, opt_tot_lat = INF;
168 int opt_rows = 0;
169 int opt_columns = 0;
170 double opt_totno_hops = 0;
171 double opt_avg_hop = 0;
172 double opt_dyn_power = 0, opt_leakage_power = 0;
173 min_values_t minval;
174
175 int bank_start = 0;
176
177 int flit_width = 0;
178
179 /* vertical and horizontal hop latency values */
180 int ver_hop_lat, hor_hop_lat; /* in cycles */
181
182
183 /* no. of different bank sizes to consider */
184 int iterations;
185
186
187 g_ip->nuca_cache_sz = g_ip->cache_sz;
188 nuca_list.push_back(new nuca_org_t());
189
190 if (g_ip->cache_level == 0) l2_c = 1;
191 else l2_c = 0;
192
193 if (g_ip->cores <= 4) core_in = 2;
194 else if (g_ip->cores <= 8) core_in = 3;
195 else if (g_ip->cores <= 16) core_in = 4;
196 else {
197 cout << "Number of cores should be <= 16!\n";
198 exit(0);
199 }
200
201
202 // set the lower bound to an appropriate value. this depends on cache associativity
203 if (g_ip->assoc > 2) {
204 i = 2;
205 while (i != g_ip->assoc) {
206 MIN_BANKSIZE *= 2;
207 i *= 2;
208 }
209 }
210
211 iterations = (int)logtwo((int)g_ip->cache_sz / MIN_BANKSIZE);
212
213 if (g_ip->force_wiretype) {
214 if (g_ip->wt == Low_swing) {
215 wt_min = Low_swing;
216 wt_max = Low_swing;
217 } else {
218 wt_min = Global;
219 wt_max = Low_swing - 1;
220 }
221 } else {
222 wt_min = Global;
223 wt_max = Low_swing;
224 }
225 if (g_ip->nuca_bank_count != 0) { // simulate just one bank
226 if (g_ip->nuca_bank_count != 2 && g_ip->nuca_bank_count != 4 &&
227 g_ip->nuca_bank_count != 8 && g_ip->nuca_bank_count != 16 &&
228 g_ip->nuca_bank_count != 32 && g_ip->nuca_bank_count != 64) {
229 fprintf(stderr, "Incorrect bank count value! Please fix the ",
230 "value in cache.cfg\n");
231 }
232 bank_start = (int)logtwo((double)g_ip->nuca_bank_count);
233 iterations = bank_start + 1;
234 g_ip->cache_sz = g_ip->cache_sz / g_ip->nuca_bank_count;
235 }
236 cout << "Simulating various NUCA configurations\n";
237 for (it = bank_start; it < iterations; it++) {
238 /* different bank count values */
239 ures.tag_array2 = &tag;
240 ures.data_array2 = &data;
241 /*
242 * find the optimal bank organization
243 */
244 solve(&ures);
245// output_UCA(&ures);
246 bank_count = g_ip->nuca_cache_sz / g_ip->cache_sz;
247 cout << "====" << g_ip->cache_sz << "\n";
248
249 for (wr = wt_min; wr <= wt_max; wr++) {
250
251 for (ro = 0; ro < ROUTER_TYPES; ro++) {
252 flit_width = (int) router_s[ro]->flit_size; //initialize router
253 nuca_list.back()->nuca_pda.cycle_time = router_s[ro]->cycle_time;
254
255 /* calculate router and wire parameters */
256
257 double vlength = ures.cache_ht; /* length of the wire (u)*/
258 double hlength = ures.cache_len; // u
259
260 /* find delay, area, and power for wires */
261 wire_vertical[wr] = new Wire((enum Wire_type) wr, vlength);
262 wire_horizontal[wr] = new Wire((enum Wire_type) wr, hlength);
263
264
265 hor_hop_lat =
266 calc_cycles(wire_horizontal[wr]->delay,
267 1 /(nuca_list.back()->nuca_pda.cycle_time *
268 .001));
269 ver_hop_lat =
270 calc_cycles(wire_vertical[wr]->delay,
271 1 / (nuca_list.back()->nuca_pda.cycle_time *
272 .001));
273
274 /*
275 * assume a grid like topology and explore for optimal network
276 * configuration using different row and column count values.
277 */
278 for (c = 1; c <= (unsigned int)bank_count; c++) {
279 while (bank_count % c != 0) c++;
280 r = bank_count / c;
281
282 /*
283 * to find the avg access latency of a NUCA cache, uncontended
284 * access time to each bank from the
285 * cache controller is calculated.
286 * avg latency =
287 * sum of the access latencies to individual banks)/bank
288 * count value.
289 */
290 totno_hops = totno_hhops = totno_vhops = tot_lat = 0;
291 k = 1;
292 for (i = 0; i < r; i++) {
293 for (j = 0; j < c; j++) {
294 /*
295 * vertical hops including the
296 * first hop from the cache controller
297 */
298 curr_hop = i + 1;
299 curr_hop += j; /* horizontal hops */
300 totno_hhops += j;
301 totno_vhops += (i + 1);
302 curr_acclat = (i * ver_hop_lat + CONTR_2_BANK_LAT +
303 j * hor_hop_lat);
304
305 tot_lat += curr_acclat;
306 totno_hops += curr_hop;
307 }
308 }
309 avg_lat = tot_lat / bank_count;
310 avg_hop = totno_hops / bank_count;
311 avg_hhop = totno_hhops / bank_count;
312 avg_vhop = totno_vhops / bank_count;
313
314 /* net access latency */
315 curr_acclat = 2 * avg_lat + 2 * (router_s[ro]->delay *
316 avg_hop) +
317 calc_cycles(ures.access_time,
318 1 /
319 (nuca_list.back()->nuca_pda.cycle_time *
320 .001));
321
322 /* avg access lat of nuca */
323 avg_dyn_power =
324 avg_hop *
325 (router_s[ro]->power.readOp.dynamic) + avg_hhop *
326 (wire_horizontal[wr]->power.readOp.dynamic) *
327 (g_ip->block_sz * 8 + 64) + avg_vhop *
328 (wire_vertical[wr]->power.readOp.dynamic) *
329 (g_ip->block_sz * 8 + 64) + ures.power.readOp.dynamic;
330
331 avg_leakage_power =
332 bank_count * router_s[ro]->power.readOp.leakage +
333 avg_hhop * (wire_horizontal[wr]->power.readOp.leakage *
334 wire_horizontal[wr]->delay) * flit_width +
335 avg_vhop * (wire_vertical[wr]->power.readOp.leakage *
336 wire_horizontal[wr]->delay);
337
338 if (curr_acclat < opt_acclat) {
339 opt_acclat = curr_acclat;
340 opt_tot_lat = tot_lat;
341 opt_avg_lat = avg_lat;
342 opt_totno_hops = totno_hops;
343 opt_avg_hop = avg_hop;
344 opt_rows = r;
345 opt_columns = c;
346 opt_dyn_power = avg_dyn_power;
347 opt_leakage_power = avg_leakage_power;
348 }
349 totno_hops = 0;
350 tot_lat = 0;
351 totno_hhops = 0;
352 totno_vhops = 0;
353 }
354 nuca_list.back()->wire_pda.power.readOp.dynamic =
355 opt_avg_hop * flit_width *
356 (wire_horizontal[wr]->power.readOp.dynamic +
357 wire_vertical[wr]->power.readOp.dynamic);
358 nuca_list.back()->avg_hops = opt_avg_hop;
359 /* network delay/power */
360 nuca_list.back()->h_wire = wire_horizontal[wr];
361 nuca_list.back()->v_wire = wire_vertical[wr];
362 nuca_list.back()->router = router_s[ro];
363 /* bank delay/power */
364
365 nuca_list.back()->bank_pda.delay = ures.access_time;
366 nuca_list.back()->bank_pda.power = ures.power;
367 nuca_list.back()->bank_pda.area.h = ures.cache_ht;
368 nuca_list.back()->bank_pda.area.w = ures.cache_len;
369 nuca_list.back()->bank_pda.cycle_time = ures.cycle_time;
370
371 num_cyc = calc_cycles(nuca_list.back()->bank_pda.delay /*s*/,
372 1 /
373 (nuca_list.back()->nuca_pda.cycle_time *
374 .001/*GHz*/));
375 if (num_cyc % 2 != 0) num_cyc++;
376 if (num_cyc > 16) num_cyc = 16; // we have data only up to 16 cycles
377
378 if (it < 7) {
379 nuca_list.back()->nuca_pda.delay = opt_acclat +
380 cont_stats[l2_c][core_in][ro][it][num_cyc/2-1];
381 nuca_list.back()->contention =
382 cont_stats[l2_c][core_in][ro][it][num_cyc/2-1];
383 } else {
384 nuca_list.back()->nuca_pda.delay = opt_acclat +
385 cont_stats[l2_c][core_in][ro][7][num_cyc/2-1];
386 nuca_list.back()->contention =
387 cont_stats[l2_c][core_in][ro][7][num_cyc/2-1];
388 }
389 nuca_list.back()->nuca_pda.power.readOp.dynamic = opt_dyn_power;
390 nuca_list.back()->nuca_pda.power.readOp.leakage = opt_leakage_power;
391
392 /* array organization */
393 nuca_list.back()->bank_count = bank_count;
394 nuca_list.back()->rows = opt_rows;
395 nuca_list.back()->columns = opt_columns;
396 calculate_nuca_area (nuca_list.back());
397
398 minval.update_min_values(nuca_list.back());
399 nuca_list.push_back(new nuca_org_t());
400 opt_acclat = BIGNUM;
401
402 }
403 }
404 g_ip->cache_sz /= 2;
405 }
406
407 delete(nuca_list.back());
408 nuca_list.pop_back();
409 opt_n = find_optimal_nuca(&nuca_list, &minval);
410 print_nuca(opt_n);
411 g_ip->cache_sz = g_ip->nuca_cache_sz / opt_n->bank_count;
412
413 list<nuca_org_t *>::iterator niter;
414 for (niter = nuca_list.begin(); niter != nuca_list.end(); ++niter) {
415 delete *niter;
416 }
417 nuca_list.clear();
418
419 for (int i = 0; i < ROUTER_TYPES; i++) {
420 delete router_s[i];
421 }
422 g_ip->display_ip();
423 // g_ip->force_cache_config = true;
424 // g_ip->ndwl = 8;
425 // g_ip->ndbl = 16;
426 // g_ip->nspd = 4;
427 // g_ip->ndcm = 1;
428 // g_ip->ndsam1 = 8;
429 // g_ip->ndsam2 = 32;
430
431}
432
433
434void
435Nuca::print_nuca (nuca_org_t *fr) {
436 printf("\n---------- CACTI version 6.5, Non-uniform Cache Access "
437 "----------\n\n");
438 printf("Optimal number of banks - %d\n", fr->bank_count);
439 printf("Grid organization rows x columns - %d x %d\n",
440 fr->rows, fr->columns);
441 printf("Network frequency - %g GHz\n",
442 (1 / fr->nuca_pda.cycle_time)*1e3);
443 printf("Cache dimension (mm x mm) - %g x %g\n",
444 fr->nuca_pda.area.h,
445 fr->nuca_pda.area.w);
446
447 fr->router->print_router();
448
449 printf("\n\nWire stats:\n");
450 if (fr->h_wire->wt == Global) {
451 printf("\tWire type - Full swing global wires with least "
452 "possible delay\n");
453 } else if (fr->h_wire->wt == Global_5) {
454 printf("\tWire type - Full swing global wires with "
455 "5%% delay penalty\n");
456 } else if (fr->h_wire->wt == Global_10) {
457 printf("\tWire type - Full swing global wires with "
458 "10%% delay penalty\n");
459 } else if (fr->h_wire->wt == Global_20) {
460 printf("\tWire type - Full swing global wires with "
461 "20%% delay penalty\n");
462 } else if (fr->h_wire->wt == Global_30) {
463 printf("\tWire type - Full swing global wires with "
464 "30%% delay penalty\n");
465 } else if (fr->h_wire->wt == Low_swing) {
466 printf("\tWire type - Low swing wires\n");
467 }
468
469 printf("\tHorizontal link delay - %g (ns)\n",
470 fr->h_wire->delay*1e9);
471 printf("\tVertical link delay - %g (ns)\n",
472 fr->v_wire->delay*1e9);
473 printf("\tDelay/length - %g (ns/mm)\n",
474 fr->h_wire->delay*1e9 / fr->bank_pda.area.w);
475 printf("\tHorizontal link energy -dynamic/access %g (nJ)\n"
476 "\t -leakage %g (nW)\n\n",
477 fr->h_wire->power.readOp.dynamic*1e9,
478 fr->h_wire->power.readOp.leakage*1e9);
479 printf("\tVertical link energy -dynamic/access %g (nJ)\n"
480 "\t -leakage %g (nW)\n\n",
481 fr->v_wire->power.readOp.dynamic*1e9,
482 fr->v_wire->power.readOp.leakage*1e9);
483 printf("\n\n");
484 fr->v_wire->print_wire();
485 printf("\n\nBank stats:\n");
486}
487
488
489nuca_org_t *
490Nuca::find_optimal_nuca (list<nuca_org_t *> *n, min_values_t *minval) {
491 double cost = 0;
492 double min_cost = BIGNUM;
493 nuca_org_t *res = NULL;
494 float d, a, dp, lp, c;
495 int v;
496 dp = g_ip->dynamic_power_wt_nuca;
497 lp = g_ip->leakage_power_wt_nuca;
498 a = g_ip->area_wt_nuca;
499 d = g_ip->delay_wt_nuca;
500 c = g_ip->cycle_time_wt_nuca;
501
502 list<nuca_org_t *>::iterator niter;
503
504
505 for (niter = n->begin(); niter != n->end(); niter++) {
506 fprintf(stderr, "\n-----------------------------"
507 "---------------\n");
508
509
510 printf("NUCA___stats %d \tbankcount: lat = %g \tdynP = %g \twt = %d\t "
511 "bank_dpower = %g \tleak = %g \tcycle = %g\n",
512 (*niter)->bank_count,
513 (*niter)->nuca_pda.delay,
514 (*niter)->nuca_pda.power.readOp.dynamic,
515 (*niter)->h_wire->wt,
516 (*niter)->bank_pda.power.readOp.dynamic,
517 (*niter)->nuca_pda.power.readOp.leakage,
518 (*niter)->nuca_pda.cycle_time);
519
520
521 if (g_ip->ed == 1) {
522 cost = ((*niter)->nuca_pda.delay / minval->min_delay) *
523 ((*niter)->nuca_pda.power.readOp.dynamic / minval->min_dyn);
524 if (min_cost > cost) {
525 min_cost = cost;
526 res = ((*niter));
527 }
528 } else if (g_ip->ed == 2) {
529 cost = ((*niter)->nuca_pda.delay / minval->min_delay) *
530 ((*niter)->nuca_pda.delay / minval->min_delay) *
531 ((*niter)->nuca_pda.power.readOp.dynamic / minval->min_dyn);
532 if (min_cost > cost) {
533 min_cost = cost;
534 res = ((*niter));
535 }
536 } else {
537 /*
538 * check whether the current organization
539 * meets the input deviation constraints
540 */
541 v = check_nuca_org((*niter), minval);
542 if (minval->min_leakage == 0) minval->min_leakage = 0.1; //FIXME remove this after leakage modeling
543
544 if (v) {
545 cost = (d * ((*niter)->nuca_pda.delay / minval->min_delay) +
546 c * ((*niter)->nuca_pda.cycle_time / minval->min_cyc) +
547 dp * ((*niter)->nuca_pda.power.readOp.dynamic /
548 minval->min_dyn) +
549 lp * ((*niter)->nuca_pda.power.readOp.leakage /
550 minval->min_leakage) +
551 a * ((*niter)->nuca_pda.area.get_area() /
552 minval->min_area));
553 fprintf(stderr, "cost = %g\n", cost);
554
555 if (min_cost > cost) {
556 min_cost = cost;
557 res = ((*niter));
558 }
559 } else {
560 niter = n->erase(niter);
561 if (niter != n->begin())
562 niter --;
563 }
564 }
565 }
566 return res;
567}
568
569int
570Nuca::check_nuca_org (nuca_org_t *n, min_values_t *minval) {
571 if (((n->nuca_pda.delay - minval->min_delay)*100 / minval->min_delay) >
572 g_ip->delay_dev_nuca) {
573 return 0;
574 }
575 if (((n->nuca_pda.power.readOp.dynamic - minval->min_dyn) /
576 minval->min_dyn)*100 >
577 g_ip->dynamic_power_dev_nuca) {
578 return 0;
579 }
580 if (((n->nuca_pda.power.readOp.leakage - minval->min_leakage) /
581 minval->min_leakage)*100 >
582 g_ip->leakage_power_dev_nuca) {
583 return 0;
584 }
585 if (((n->nuca_pda.cycle_time - minval->min_cyc) / minval->min_cyc)*100 >
586 g_ip->cycle_time_dev_nuca) {
587 return 0;
588 }
589 if (((n->nuca_pda.area.get_area() - minval->min_area) / minval->min_area) *
590 100 >
591 g_ip->area_dev_nuca) {
592 return 0;
593 }
594 return 1;
595}
596
597void
598Nuca::calculate_nuca_area (nuca_org_t *nuca) {
599 nuca->nuca_pda.area.h =
600 nuca->rows * ((nuca->h_wire->wire_width +
601 nuca->h_wire->wire_spacing)
602 * nuca->router->flit_size +
603 nuca->bank_pda.area.h);
604
605 nuca->nuca_pda.area.w =
606 nuca->columns * ((nuca->v_wire->wire_width +
607 nuca->v_wire->wire_spacing)
608 * nuca->router->flit_size +
609 nuca->bank_pda.area.w);
610}
611