nuca.cc revision 10152
1/*****************************************************************************
2 *                                McPAT/CACTI
3 *                      SOFTWARE LICENSE AGREEMENT
4 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
5 *                          All Rights Reserved
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are
9 * met: redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer;
11 * redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution;
14 * neither the name of the copyright holders nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
29 *
30 ***************************************************************************/
31
32
33
34#include <cassert>
35
36#include "Ucache.h"
37#include "nuca.h"
38
39unsigned int MIN_BANKSIZE=65536;
40#define FIXED_OVERHEAD 55e-12 /* clock skew and jitter in s. Ref: Hrishikesh et al ISCA 01 */
41#define LATCH_DELAY 28e-12 /* latch delay in s (later should use FO4 TODO) */
42#define CONTR_2_BANK_LAT 0
43
44int cont_stats[2 /*l2 or l3*/][5/* cores */][ROUTER_TYPES][7 /*banks*/][8 /* cycle time */];
45
46  Nuca::Nuca(
47      TechnologyParameter::DeviceType *dt = &(g_tp.peri_global)
48      ):deviceType(dt)
49{
50  init_cont();
51}
52
53void
54Nuca::init_cont()
55{
56  FILE *cont;
57  char line[5000];
58  char jk[5000];
59  cont = fopen("contention.dat", "r");
60  if (!cont) {
61    cout << "contention.dat file is missing!\n";
62    exit(0);
63  }
64
65  for(int i=0; i<2; i++) {
66    for(int j=2; j<5; j++) {
67      for(int k=0; k<ROUTER_TYPES; k++) {
68        for(int l=0;l<7; l++) {
69          int *temp = cont_stats[i/*l2 or l3*/][j/*core*/][k/*64 or 128 or 256 link bw*/][l /* no banks*/];
70          assert(fscanf(cont, "%[^\n]\n", line) != EOF);
71          sscanf(line, "%[^:]: %d %d %d %d %d %d %d %d",jk, &temp[0], &temp[1], &temp[2], &temp[3],
72              &temp[4], &temp[5], &temp[6], &temp[7]);
73        }
74      }
75    }
76  }
77  fclose(cont);
78}
79
80  void
81Nuca::print_cont_stats()
82{
83  for(int i=0; i<2; i++) {
84    for(int j=2; j<5; j++) {
85      for(int k=0; k<ROUTER_TYPES; k++) {
86        for(int l=0;l<7; l++) {
87          for(int m=0;l<7; l++) {
88            cout << cont_stats[i][j][k][l][m] << " ";
89          }
90          cout << endl;
91        }
92      }
93    }
94  }
95  cout << endl;
96}
97
98Nuca::~Nuca(){
99  for (int i = wt_min; i <= wt_max; i++) {
100    delete wire_vertical[i];
101    delete wire_horizontal[i];
102  }
103}
104
105/* converts latency (in s) to cycles depending upon the FREQUENCY (in GHz) */
106  int
107Nuca::calc_cycles(double lat, double oper_freq)
108{
109  //TODO: convert latch delay to FO4 */
110  double cycle_time = (1.0/(oper_freq*1e9)); /*s*/
111  cycle_time -= LATCH_DELAY;
112  cycle_time -= FIXED_OVERHEAD;
113
114  return (int)ceil(lat/cycle_time);
115}
116
117
118nuca_org_t::~nuca_org_t() {
119  // if(h_wire) delete h_wire;
120  // if(v_wire) delete v_wire;
121  // if(router) delete router;
122}
123
124/*
125 * Version - 6.0
126 *
127 * Perform exhaustive search across different bank organizatons,
128 * router configurations, grid organizations, and wire models and
129 * find an optimal NUCA organization
130 * For different bank count values
131 * 1. Optimal bank organization is calculated
132 * 2. For each bank organization, find different NUCA organizations
133 *    using various router configurations, grid organizations,
134 *    and wire models.
135 * 3. NUCA model with the least cost is picked for
136 *    this particular bank count
137 * Finally include contention statistics and find the optimal
138 *    NUCA configuration
139 */
140  void
141Nuca::sim_nuca()
142{
143  /* temp variables */
144  int it, ro, wr;
145  int num_cyc;
146  unsigned int i, j, k;
147  unsigned int r, c;
148  int l2_c;
149  int bank_count = 0;
150  uca_org_t ures;
151  nuca_org_t *opt_n;
152  mem_array tag, data;
153  list<nuca_org_t *> nuca_list;
154  Router *router_s[ROUTER_TYPES];
155  router_s[0] = new Router(64.0, 8, 4, &(g_tp.peri_global));
156  router_s[0]->print_router();
157  router_s[1] = new Router(128.0, 8, 4, &(g_tp.peri_global));
158  router_s[1]->print_router();
159  router_s[2] = new Router(256.0, 8, 4, &(g_tp.peri_global));
160  router_s[2]->print_router();
161
162  int core_in; // to store no. of cores
163
164  /* to search diff grid organizations */
165  double curr_hop, totno_hops, totno_hhops, totno_vhops, tot_lat,
166         curr_acclat;
167  double avg_lat, avg_hop, avg_hhop, avg_vhop, avg_dyn_power,
168         avg_leakage_power;
169
170  double opt_acclat = INF, opt_avg_lat = INF, opt_tot_lat = INF;
171  int opt_rows = 0;
172  int opt_columns = 0;
173  double opt_totno_hops = 0;
174  double opt_avg_hop = 0;
175  double opt_dyn_power = 0, opt_leakage_power = 0;
176  min_values_t minval;
177
178  int bank_start = 0;
179
180  int flit_width = 0;
181
182  /* vertical and horizontal hop latency values */
183  int ver_hop_lat, hor_hop_lat; /* in cycles */
184
185
186  /* no. of different bank sizes to consider */
187  int iterations;
188
189
190  g_ip->nuca_cache_sz = g_ip->cache_sz;
191  nuca_list.push_back(new nuca_org_t());
192
193  if (g_ip->cache_level == 0) l2_c = 1;
194  else l2_c = 0;
195
196  if (g_ip->cores <= 4) core_in = 2;
197  else if (g_ip->cores <= 8) core_in = 3;
198  else if (g_ip->cores <= 16) core_in = 4;
199  else {cout << "Number of cores should be <= 16!\n"; exit(0);}
200
201
202  // set the lower bound to an appropriate value. this depends on cache associativity
203  if (g_ip->assoc > 2) {
204    i = 2;
205    while (i != g_ip->assoc) {
206      MIN_BANKSIZE *= 2;
207      i *= 2;
208    }
209  }
210
211  iterations = (int)logtwo((int)g_ip->cache_sz/MIN_BANKSIZE);
212
213  if (g_ip->force_wiretype)
214  {
215    if (g_ip->wt == Low_swing) {
216      wt_min = Low_swing;
217      wt_max = Low_swing;
218    }
219    else {
220      wt_min = Global;
221      wt_max = Low_swing-1;
222    }
223  }
224  else {
225    wt_min = Global;
226    wt_max = Low_swing;
227  }
228  if (g_ip->nuca_bank_count != 0) { // simulate just one bank
229    if (g_ip->nuca_bank_count != 2 && g_ip->nuca_bank_count != 4 &&
230        g_ip->nuca_bank_count != 8 && g_ip->nuca_bank_count != 16 &&
231        g_ip->nuca_bank_count != 32 && g_ip->nuca_bank_count != 64) {
232      fprintf(stderr,"Incorrect bank count value! Please fix the value in cache.cfg\n");
233    }
234    bank_start = (int)logtwo((double)g_ip->nuca_bank_count);
235    iterations = bank_start+1;
236    g_ip->cache_sz = g_ip->cache_sz/g_ip->nuca_bank_count;
237  }
238  cout << "Simulating various NUCA configurations\n";
239  for (it=bank_start; it<iterations; it++) { /* different bank count values */
240    ures.tag_array2 = &tag;
241    ures.data_array2 = &data;
242    /*
243     * find the optimal bank organization
244     */
245    solve(&ures);
246//    output_UCA(&ures);
247    bank_count = g_ip->nuca_cache_sz/g_ip->cache_sz;
248    cout << "====" <<  g_ip->cache_sz << "\n";
249
250    for (wr=wt_min; wr<=wt_max; wr++) {
251
252      for (ro=0; ro<ROUTER_TYPES; ro++)
253      {
254        flit_width = (int) router_s[ro]->flit_size; //initialize router
255        nuca_list.back()->nuca_pda.cycle_time = router_s[ro]->cycle_time;
256
257        /* calculate router and wire parameters */
258
259        double vlength = ures.cache_ht; /* length of the wire (u)*/
260        double hlength = ures.cache_len; // u
261
262        /* find delay, area, and power for wires */
263        wire_vertical[wr] = new Wire((enum Wire_type) wr, vlength);
264        wire_horizontal[wr] = new Wire((enum Wire_type) wr, hlength);
265
266
267        hor_hop_lat = calc_cycles(wire_horizontal[wr]->delay,
268            1/(nuca_list.back()->nuca_pda.cycle_time*.001));
269        ver_hop_lat = calc_cycles(wire_vertical[wr]->delay,
270            1/(nuca_list.back()->nuca_pda.cycle_time*.001));
271
272        /*
273         * assume a grid like topology and explore for optimal network
274         * configuration using different row and column count values.
275         */
276        for (c=1; c<=(unsigned int)bank_count; c++) {
277          while (bank_count%c != 0) c++;
278          r = bank_count/c;
279
280          /*
281           * to find the avg access latency of a NUCA cache, uncontended
282           * access time to each bank from the
283           * cache controller is calculated.
284           * avg latency =
285           * sum of the access latencies to individual banks)/bank
286           * count value.
287           */
288          totno_hops = totno_hhops = totno_vhops = tot_lat = 0;
289          k = 1;
290          for (i=0; i<r; i++) {
291            for (j=0; j<c; j++) {
292              /*
293               * vertical hops including the
294               * first hop from the cache controller
295               */
296              curr_hop = i + 1;
297              curr_hop += j; /* horizontal hops */
298              totno_hhops += j;
299              totno_vhops += (i+1);
300              curr_acclat = (i * ver_hop_lat + CONTR_2_BANK_LAT +
301                  j * hor_hop_lat);
302
303              tot_lat += curr_acclat;
304              totno_hops += curr_hop;
305            }
306          }
307          avg_lat = tot_lat/bank_count;
308          avg_hop = totno_hops/bank_count;
309          avg_hhop = totno_hhops/bank_count;
310          avg_vhop = totno_vhops/bank_count;
311
312          /* net access latency */
313          curr_acclat = 2*avg_lat + 2*(router_s[ro]->delay*avg_hop) +
314            calc_cycles(ures.access_time,
315                1/(nuca_list.back()->nuca_pda.cycle_time*.001));
316
317          /* avg access lat of nuca */
318          avg_dyn_power =
319            avg_hop *
320            (router_s[ro]->power.readOp.dynamic) + avg_hhop *
321            (wire_horizontal[wr]->power.readOp.dynamic) *
322            (g_ip->block_sz*8 + 64) + avg_vhop *
323            (wire_vertical[wr]->power.readOp.dynamic) *
324            (g_ip->block_sz*8 + 64) + ures.power.readOp.dynamic;
325
326          avg_leakage_power =
327            bank_count * router_s[ro]->power.readOp.leakage +
328            avg_hhop * (wire_horizontal[wr]->power.readOp.leakage*
329                wire_horizontal[wr]->delay) * flit_width +
330            avg_vhop * (wire_vertical[wr]->power.readOp.leakage *
331                wire_horizontal[wr]->delay);
332
333          if (curr_acclat < opt_acclat) {
334            opt_acclat = curr_acclat;
335            opt_tot_lat = tot_lat;
336            opt_avg_lat = avg_lat;
337            opt_totno_hops = totno_hops;
338            opt_avg_hop = avg_hop;
339            opt_rows = r;
340            opt_columns = c;
341            opt_dyn_power = avg_dyn_power;
342            opt_leakage_power = avg_leakage_power;
343          }
344          totno_hops = 0;
345          tot_lat = 0;
346          totno_hhops = 0;
347          totno_vhops = 0;
348        }
349        nuca_list.back()->wire_pda.power.readOp.dynamic =
350          opt_avg_hop * flit_width *
351          (wire_horizontal[wr]->power.readOp.dynamic +
352           wire_vertical[wr]->power.readOp.dynamic);
353        nuca_list.back()->avg_hops = opt_avg_hop;
354        /* network delay/power */
355        nuca_list.back()->h_wire = wire_horizontal[wr];
356        nuca_list.back()->v_wire = wire_vertical[wr];
357        nuca_list.back()->router = router_s[ro];
358        /* bank delay/power */
359
360        nuca_list.back()->bank_pda.delay = ures.access_time;
361        nuca_list.back()->bank_pda.power = ures.power;
362        nuca_list.back()->bank_pda.area.h = ures.cache_ht;
363        nuca_list.back()->bank_pda.area.w = ures.cache_len;
364        nuca_list.back()->bank_pda.cycle_time = ures.cycle_time;
365
366        num_cyc = calc_cycles(nuca_list.back()->bank_pda.delay /*s*/,
367            1/(nuca_list.back()->nuca_pda.cycle_time*.001/*GHz*/));
368        if(num_cyc%2 != 0) num_cyc++;
369        if (num_cyc > 16) num_cyc = 16; // we have data only up to 16 cycles
370
371        if (it < 7) {
372          nuca_list.back()->nuca_pda.delay = opt_acclat +
373            cont_stats[l2_c][core_in][ro][it][num_cyc/2-1];
374          nuca_list.back()->contention =
375            cont_stats[l2_c][core_in][ro][it][num_cyc/2-1];
376        }
377        else {
378          nuca_list.back()->nuca_pda.delay = opt_acclat +
379            cont_stats[l2_c][core_in][ro][7][num_cyc/2-1];
380          nuca_list.back()->contention =
381            cont_stats[l2_c][core_in][ro][7][num_cyc/2-1];
382        }
383        nuca_list.back()->nuca_pda.power.readOp.dynamic = opt_dyn_power;
384        nuca_list.back()->nuca_pda.power.readOp.leakage = opt_leakage_power;
385
386        /* array organization */
387        nuca_list.back()->bank_count = bank_count;
388        nuca_list.back()->rows = opt_rows;
389        nuca_list.back()->columns = opt_columns;
390        calculate_nuca_area (nuca_list.back());
391
392        minval.update_min_values(nuca_list.back());
393        nuca_list.push_back(new nuca_org_t());
394        opt_acclat = BIGNUM;
395
396      }
397    }
398    g_ip->cache_sz /= 2;
399  }
400
401  delete(nuca_list.back());
402  nuca_list.pop_back();
403  opt_n = find_optimal_nuca(&nuca_list, &minval);
404  print_nuca(opt_n);
405  g_ip->cache_sz = g_ip->nuca_cache_sz/opt_n->bank_count;
406
407  list<nuca_org_t *>::iterator niter;
408  for (niter = nuca_list.begin(); niter != nuca_list.end(); ++niter)
409  {
410    delete *niter;
411  }
412  nuca_list.clear();
413
414  for(int i=0; i < ROUTER_TYPES; i++)
415  {
416    delete router_s[i];
417  }
418  g_ip->display_ip();
419  //  g_ip->force_cache_config = true;
420  //  g_ip->ndwl = 8;
421  //  g_ip->ndbl = 16;
422  //  g_ip->nspd = 4;
423  //  g_ip->ndcm = 1;
424  //  g_ip->ndsam1 = 8;
425  //  g_ip->ndsam2 = 32;
426
427}
428
429
430  void
431Nuca::print_nuca (nuca_org_t *fr)
432{
433  printf("\n---------- CACTI version 6.5, Non-uniform Cache Access "
434      "----------\n\n");
435  printf("Optimal number of banks - %d\n", fr->bank_count);
436  printf("Grid organization rows x columns - %d x %d\n",
437      fr->rows, fr->columns);
438  printf("Network frequency - %g GHz\n",
439      (1/fr->nuca_pda.cycle_time)*1e3);
440  printf("Cache dimension (mm x mm) - %g x %g\n",
441      fr->nuca_pda.area.h,
442      fr->nuca_pda.area.w);
443
444  fr->router->print_router();
445
446  printf("\n\nWire stats:\n");
447  if (fr->h_wire->wt == Global) {
448    printf("\tWire type - Full swing global wires with least "
449        "possible delay\n");
450  }
451  else if (fr->h_wire->wt == Global_5) {
452    printf("\tWire type - Full swing global wires with "
453        "5%% delay penalty\n");
454  }
455  else if (fr->h_wire->wt == Global_10) {
456    printf("\tWire type - Full swing global wires with "
457        "10%% delay penalty\n");
458  }
459  else if (fr->h_wire->wt == Global_20) {
460    printf("\tWire type - Full swing global wires with "
461        "20%% delay penalty\n");
462  }
463  else if (fr->h_wire->wt == Global_30) {
464    printf("\tWire type - Full swing global wires with "
465        "30%% delay penalty\n");
466  }
467  else if(fr->h_wire->wt == Low_swing) {
468    printf("\tWire type - Low swing wires\n");
469  }
470
471  printf("\tHorizontal link delay - %g (ns)\n",
472      fr->h_wire->delay*1e9);
473  printf("\tVertical link delay - %g (ns)\n",
474      fr->v_wire->delay*1e9);
475  printf("\tDelay/length - %g (ns/mm)\n",
476      fr->h_wire->delay*1e9/fr->bank_pda.area.w);
477  printf("\tHorizontal link energy -dynamic/access %g (nJ)\n"
478      "\t                       -leakage %g (nW)\n\n",
479      fr->h_wire->power.readOp.dynamic*1e9,
480      fr->h_wire->power.readOp.leakage*1e9);
481  printf("\tVertical link energy -dynamic/access %g (nJ)\n"
482      "\t                     -leakage %g (nW)\n\n",
483      fr->v_wire->power.readOp.dynamic*1e9,
484      fr->v_wire->power.readOp.leakage*1e9);
485  printf("\n\n");
486  fr->v_wire->print_wire();
487  printf("\n\nBank stats:\n");
488}
489
490
491  nuca_org_t *
492Nuca::find_optimal_nuca (list<nuca_org_t *> *n, min_values_t *minval)
493{
494  double cost = 0;
495  double min_cost = BIGNUM;
496  nuca_org_t *res = NULL;
497  float d, a, dp, lp, c;
498  int v;
499  dp = g_ip->dynamic_power_wt_nuca;
500  lp = g_ip->leakage_power_wt_nuca;
501  a = g_ip->area_wt_nuca;
502  d = g_ip->delay_wt_nuca;
503  c = g_ip->cycle_time_wt_nuca;
504
505  list<nuca_org_t *>::iterator niter;
506
507
508  for (niter = n->begin(); niter != n->end(); niter++) {
509    fprintf(stderr, "\n-----------------------------"
510        "---------------\n");
511
512
513    printf("NUCA___stats %d \tbankcount: lat = %g \tdynP = %g \twt = %d\t "
514        "bank_dpower = %g \tleak = %g \tcycle = %g\n",
515        (*niter)->bank_count,
516        (*niter)->nuca_pda.delay,
517        (*niter)->nuca_pda.power.readOp.dynamic,
518        (*niter)->h_wire->wt,
519        (*niter)->bank_pda.power.readOp.dynamic,
520        (*niter)->nuca_pda.power.readOp.leakage,
521        (*niter)->nuca_pda.cycle_time);
522
523
524    if (g_ip->ed == 1) {
525      cost = ((*niter)->nuca_pda.delay/minval->min_delay)*
526        ((*niter)->nuca_pda.power.readOp.dynamic/minval->min_dyn);
527      if (min_cost > cost) {
528        min_cost = cost;
529        res = ((*niter));
530      }
531    }
532    else if (g_ip->ed == 2) {
533      cost = ((*niter)->nuca_pda.delay/minval->min_delay)*
534        ((*niter)->nuca_pda.delay/minval->min_delay)*
535        ((*niter)->nuca_pda.power.readOp.dynamic/minval->min_dyn);
536      if (min_cost > cost) {
537        min_cost = cost;
538        res = ((*niter));
539      }
540    }
541    else {
542      /*
543       * check whether the current organization
544       * meets the input deviation constraints
545       */
546      v = check_nuca_org((*niter), minval);
547      if (minval->min_leakage == 0) minval->min_leakage = 0.1; //FIXME remove this after leakage modeling
548
549      if (v) {
550        cost = (d  * ((*niter)->nuca_pda.delay/minval->min_delay) +
551            c  * ((*niter)->nuca_pda.cycle_time/minval->min_cyc) +
552            dp * ((*niter)->nuca_pda.power.readOp.dynamic/minval->min_dyn) +
553            lp * ((*niter)->nuca_pda.power.readOp.leakage/minval->min_leakage) +
554            a  * ((*niter)->nuca_pda.area.get_area()/minval->min_area));
555        fprintf(stderr, "cost = %g\n", cost);
556
557        if (min_cost > cost) {
558          min_cost = cost;
559          res = ((*niter));
560        }
561      }
562      else {
563        niter = n->erase(niter);
564        if (niter !=n->begin())
565                niter --;
566      }
567    }
568  }
569  return res;
570}
571
572  int
573Nuca::check_nuca_org (nuca_org_t *n, min_values_t *minval)
574{
575  if (((n->nuca_pda.delay - minval->min_delay)*100/minval->min_delay) > g_ip->delay_dev_nuca) {
576    return 0;
577  }
578  if (((n->nuca_pda.power.readOp.dynamic - minval->min_dyn)/minval->min_dyn)*100 >
579      g_ip->dynamic_power_dev_nuca) {
580    return 0;
581  }
582  if (((n->nuca_pda.power.readOp.leakage - minval->min_leakage)/minval->min_leakage)*100 >
583      g_ip->leakage_power_dev_nuca) {
584    return 0;
585  }
586  if (((n->nuca_pda.cycle_time - minval->min_cyc)/minval->min_cyc)*100 >
587      g_ip->cycle_time_dev_nuca) {
588    return 0;
589  }
590  if (((n->nuca_pda.area.get_area() - minval->min_area)/minval->min_area)*100 >
591      g_ip->area_dev_nuca) {
592    return 0;
593  }
594  return 1;
595}
596
597  void
598Nuca::calculate_nuca_area (nuca_org_t *nuca)
599{
600  nuca->nuca_pda.area.h=
601    nuca->rows * ((nuca->h_wire->wire_width +
602          nuca->h_wire->wire_spacing)
603        * nuca->router->flit_size +
604        nuca->bank_pda.area.h);
605
606  nuca->nuca_pda.area.w =
607    nuca->columns * ((nuca->v_wire->wire_width +
608          nuca->v_wire->wire_spacing)
609        * nuca->router->flit_size +
610        nuca->bank_pda.area.w);
611}
612
613