/*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *            Copyright (c) 2010-2013 Advanced Micro Devices, Inc.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ***************************************************************************/


#include <pthread.h>

#include <algorithm>
#include <cmath>
#include <ctime>
#include <iostream>
#include <list>

#include "Ucache.h"
#include "area.h"
#include "bank.h"
#include "basic_circuit.h"
#include "component.h"
#include "const.h"
#include "decoder.h"
#include "parameter.h"
#include "subarray.h"
#include "uca.h"

using namespace std;

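// The design-space sweep in calc_time_mt_wrapper() is statically partitioned
// across NTHREADS worker threads: thread `tid` evaluates every nthreads-th
// point of the flattened (Ndwl, Ndbl, Ndcm) iteration space.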
const uint32_t nthreads = NTHREADS;


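// min_values_t tracks the running minimum of each metric (access time,
// dynamic read power, leakage, area, cycle time) seen so far; these minima
// are later used to normalize the deviation checks and the weighted cost
// function below.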
void min_values_t::update_min_values(const min_values_t * val) {
    min_delay   = (min_delay > val->min_delay) ? val->min_delay : min_delay;
    min_dyn     = (min_dyn > val->min_dyn) ? val->min_dyn : min_dyn;
    min_leakage = (min_leakage > val->min_leakage) ? val->min_leakage : min_leakage;
    min_area    = (min_area > val->min_area) ? val->min_area : min_area;
    min_cyc     = (min_cyc > val->min_cyc) ? val->min_cyc : min_cyc;
}



void min_values_t::update_min_values(const uca_org_t & res) {
    min_delay   = (min_delay > res.access_time) ? res.access_time : min_delay;
    min_dyn     = (min_dyn > res.power.readOp.dynamic) ? res.power.readOp.dynamic : min_dyn;
    min_leakage = (min_leakage > res.power.readOp.leakage) ? res.power.readOp.leakage : min_leakage;
    min_area    = (min_area > res.area) ? res.area : min_area;
    min_cyc     = (min_cyc > res.cycle_time) ? res.cycle_time : min_cyc;
}

void min_values_t::update_min_values(const nuca_org_t * res) {
    min_delay   = (min_delay > res->nuca_pda.delay) ? res->nuca_pda.delay : min_delay;
    min_dyn     = (min_dyn > res->nuca_pda.power.readOp.dynamic) ? res->nuca_pda.power.readOp.dynamic : min_dyn;
    min_leakage = (min_leakage > res->nuca_pda.power.readOp.leakage) ? res->nuca_pda.power.readOp.leakage : min_leakage;
    min_area    = (min_area > res->nuca_pda.area.get_area()) ? res->nuca_pda.area.get_area() : min_area;
    min_cyc     = (min_cyc > res->nuca_pda.cycle_time) ? res->nuca_pda.cycle_time : min_cyc;
}

void min_values_t::update_min_values(const mem_array * res) {
    min_delay   = (min_delay > res->access_time) ? res->access_time : min_delay;
    min_dyn     = (min_dyn > res->power.readOp.dynamic) ? res->power.readOp.dynamic : min_dyn;
    min_leakage = (min_leakage > res->power.readOp.leakage) ? res->power.readOp.leakage : min_leakage;
    min_area    = (min_area > res->area) ? res->area : min_area;
    min_cyc     = (min_cyc > res->cycle_time) ? res->cycle_time : min_cyc;
}


void * calc_time_mt_wrapper(void * void_obj) {
    calc_time_mt_wrapper_struct * calc_obj = (calc_time_mt_wrapper_struct *) void_obj;
    uint32_t tid                   = calc_obj->tid;
    list<mem_array *> & data_arr   = calc_obj->data_arr;
    list<mem_array *> & tag_arr    = calc_obj->tag_arr;
    bool is_tag                    = calc_obj->is_tag;
    bool pure_ram                  = calc_obj->pure_ram;
    bool pure_cam                  = calc_obj->pure_cam;
    bool is_main_mem               = calc_obj->is_main_mem;
    double Nspd_min                = calc_obj->Nspd_min;
    min_values_t * data_res        = calc_obj->data_res;
    min_values_t * tag_res         = calc_obj->tag_res;

    data_arr.clear();
    data_arr.push_back(new mem_array);
    tag_arr.clear();
    tag_arr.push_back(new mem_array);

    uint32_t Ndwl_niter = _log2(MAXDATAN) + 1;
    uint32_t Ndbl_niter = _log2(MAXDATAN) + 1;
    uint32_t Ndcm_niter = _log2(MAX_COL_MUX) + 1;
    uint32_t niter      = Ndwl_niter * Ndbl_niter * Ndcm_niter;
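    // The three nested sweeps over Ndwl, Ndbl and Ndcm are flattened into a
    // single index so that the threads can stride through it (iter += nthreads).
    // The index is decoded in mixed radix below; e.g. iter == 0 maps to
    // Ndwl = Ndbl = Ndcm = 1, and incrementing iter by one doubles Ndcm until
    // its digit wraps, then Ndbl, then Ndwl.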


    bool is_valid_partition;
    int wt_min, wt_max;

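    // Pick the range of wire types to sweep for the array interconnect. When
    // the user forces a wire type, wt == 0 selects only Low_swing wires and
    // any other value restricts the sweep to the global wire variants;
    // otherwise every wire type from Global through Low_swing is explored.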
    if (g_ip->force_wiretype) {
        if (g_ip->wt == 0) {
            wt_min = Low_swing;
            wt_max = Low_swing;
        } else {
            wt_min = Global;
            wt_max = Low_swing - 1;
        }
    } else {
        wt_min = Global;
        wt_max = Low_swing;
    }

    for (double Nspd = Nspd_min; Nspd <= MAXDATASPD; Nspd *= 2) {
        for (int wr = wt_min; wr <= wt_max; wr++) {
            for (uint32_t iter = tid; iter < niter; iter += nthreads) {
                // reconstruct Ndwl, Ndbl, Ndcm
                unsigned int Ndwl = 1 << (iter / (Ndbl_niter * Ndcm_niter));
                unsigned int Ndbl = 1 << ((iter / (Ndcm_niter)) % Ndbl_niter);
                unsigned int Ndcm = 1 << (iter % Ndcm_niter);
                for (unsigned int Ndsam_lev_1 = 1; Ndsam_lev_1 <= MAX_COL_MUX;
                     Ndsam_lev_1 *= 2) {
                    for (unsigned int Ndsam_lev_2 = 1;
                         Ndsam_lev_2 <= MAX_COL_MUX; Ndsam_lev_2 *= 2) {
                        // For debugging: force a specific cache configuration.
                        if (g_ip->force_cache_config && is_tag == false) {
                            wr   = g_ip->wt;
                            Ndwl = g_ip->ndwl;
                            Ndbl = g_ip->ndbl;
                            Ndcm = g_ip->ndcm;
                            if (g_ip->nspd != 0) {
                                Nspd = g_ip->nspd;
                            }
                            if (g_ip->ndsam1 != 0) {
                                Ndsam_lev_1 = g_ip->ndsam1;
                                Ndsam_lev_2 = g_ip->ndsam2;
                            }
                        }

                        if (is_tag == true) {
                            is_valid_partition = calculate_time(is_tag, pure_ram, pure_cam, Nspd, Ndwl,
                                                                Ndbl, Ndcm, Ndsam_lev_1, Ndsam_lev_2,
                                                                tag_arr.back(), 0, NULL, NULL,
                                                                is_main_mem);
                        }
                        // For a fully-associative cache, the data array partition parameters are
                        // identical to those of the tag array, so compute the data array partition
                        // properties here as well.
                        if (is_tag == false || g_ip->fully_assoc) {
                            is_valid_partition = calculate_time(is_tag/*false*/, pure_ram, pure_cam, Nspd, Ndwl,
                                                                Ndbl, Ndcm, Ndsam_lev_1, Ndsam_lev_2,
                                                                data_arr.back(), 0, NULL, NULL,
                                                                is_main_mem);
                        }

                        if (is_valid_partition) {
                            if (is_tag == true) {
                                tag_arr.back()->wt = (enum Wire_type) wr;
                                tag_res->update_min_values(tag_arr.back());
                                tag_arr.push_back(new mem_array);
                            }
                            if (is_tag == false || g_ip->fully_assoc) {
                                data_arr.back()->wt = (enum Wire_type) wr;
                                data_res->update_min_values(data_arr.back());
                                data_arr.push_back(new mem_array);
                            }
                        }

                        if (g_ip->force_cache_config && is_tag == false) {
                            wr   = wt_max;
                            iter = niter;
                            if (g_ip->nspd != 0) {
                                Nspd = MAXDATASPD;
                            }
                            if (g_ip->ndsam1 != 0) {
                                Ndsam_lev_1 = MAX_COL_MUX + 1;
                                Ndsam_lev_2 = MAX_COL_MUX + 1;
                            }
                        }
                    }
                }
            }
        }
    }

    delete data_arr.back();
    delete tag_arr.back();
    data_arr.pop_back();
    tag_arr.pop_back();

#ifndef DEBUG
    pthread_exit(NULL);
#else
    return NULL;
#endif
}



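/*
 * Builds the DynamicParameter for one candidate array partition
 * (Nspd, Ndwl, Ndbl, Ndcm and the two sense-amp mux levels). Returns false
 * if the partition is invalid; otherwise instantiates a UCA model for it and
 * copies the timing, power and area results of interest into ptr_array.
 */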
bool calculate_time(
    bool is_tag,
    int pure_ram,
    bool pure_cam,
    double Nspd,
    unsigned int Ndwl,
    unsigned int Ndbl,
    unsigned int Ndcm,
    unsigned int Ndsam_lev_1,
    unsigned int Ndsam_lev_2,
    mem_array *ptr_array,
    int flag_results_populate,
    results_mem_array *ptr_results,
    uca_org_t *ptr_fin_res,
    bool is_main_mem) {
    DynamicParameter dyn_p(is_tag, pure_ram, pure_cam, Nspd, Ndwl, Ndbl, Ndcm,
                           Ndsam_lev_1, Ndsam_lev_2, is_main_mem);

    if (dyn_p.is_valid == false) {
        return false;
    }

    UCA * uca = new UCA(dyn_p);


    // For the final solution, populate the ptr_results data structure.
    // -- TODO: copy only necessary variables
    if (flag_results_populate) {
        // Not implemented; this path is currently a no-op.
    } else {
        int num_act_mats_hor_dir = uca->bank.dp.num_act_mats_hor_dir;
        int num_mats = uca->bank.dp.num_mats;
        bool is_fa = uca->bank.dp.fully_assoc;
        bool pure_cam = uca->bank.dp.pure_cam;
        ptr_array->Ndwl = Ndwl;
        ptr_array->Ndbl = Ndbl;
        ptr_array->Nspd = Nspd;
        ptr_array->deg_bl_muxing = dyn_p.deg_bl_muxing;
        ptr_array->Ndsam_lev_1 = Ndsam_lev_1;
        ptr_array->Ndsam_lev_2 = Ndsam_lev_2;
        ptr_array->access_time = uca->access_time;
        ptr_array->cycle_time = uca->cycle_time;
        ptr_array->multisubbank_interleave_cycle_time =
            uca->multisubbank_interleave_cycle_time;
        ptr_array->area_ram_cells = uca->area_all_dataramcells;
        ptr_array->area   = uca->area.get_area();
        ptr_array->height = uca->area.h;
        ptr_array->width  = uca->area.w;
        ptr_array->mat_height = uca->bank.mat.area.h;
        ptr_array->mat_length = uca->bank.mat.area.w;
        ptr_array->subarray_height = uca->bank.mat.subarray.area.h;
        ptr_array->subarray_length = uca->bank.mat.subarray.area.w;
        ptr_array->power  = uca->power;
        ptr_array->delay_senseamp_mux_decoder =
            MAX(uca->delay_array_to_sa_mux_lev_1_decoder,
                uca->delay_array_to_sa_mux_lev_2_decoder);
        ptr_array->delay_before_subarray_output_driver =
            uca->delay_before_subarray_output_driver;
        ptr_array->delay_from_subarray_output_driver_to_output =
            uca->delay_from_subarray_out_drv_to_out;

        ptr_array->delay_route_to_bank = uca->htree_in_add->delay;
        ptr_array->delay_input_htree = uca->bank.htree_in_add->delay;
        ptr_array->delay_row_predecode_driver_and_block =
            uca->bank.mat.r_predec->delay;
        ptr_array->delay_row_decoder = uca->bank.mat.row_dec->delay;
        ptr_array->delay_bitlines = uca->bank.mat.delay_bitline;
        ptr_array->delay_matchlines = uca->bank.mat.delay_matchchline;
        ptr_array->delay_sense_amp = uca->bank.mat.delay_sa;
        ptr_array->delay_subarray_output_driver =
            uca->bank.mat.delay_subarray_out_drv_htree;
        ptr_array->delay_dout_htree = uca->bank.htree_out_data->delay;
        ptr_array->delay_comparator = uca->bank.mat.delay_comparator;

        ptr_array->all_banks_height = uca->area.h;
        ptr_array->all_banks_width = uca->area.w;
        ptr_array->area_efficiency = uca->area_all_dataramcells * 100 /
            (uca->area.get_area());

        ptr_array->power_routing_to_bank = uca->power_routing_to_bank;
        ptr_array->power_addr_input_htree = uca->bank.htree_in_add->power;
        ptr_array->power_data_input_htree = uca->bank.htree_in_data->power;
        ptr_array->power_data_output_htree = uca->bank.htree_out_data->power;

        ptr_array->power_row_predecoder_drivers =
            uca->bank.mat.r_predec->driver_power;
        ptr_array->power_row_predecoder_drivers.readOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_row_predecoder_drivers.writeOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_row_predecoder_drivers.searchOp.dynamic *=
            num_act_mats_hor_dir;

        ptr_array->power_row_predecoder_blocks =
            uca->bank.mat.r_predec->block_power;
        ptr_array->power_row_predecoder_blocks.readOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_row_predecoder_blocks.writeOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_row_predecoder_blocks.searchOp.dynamic *=
            num_act_mats_hor_dir;

        ptr_array->power_row_decoders = uca->bank.mat.power_row_decoders;
        ptr_array->power_row_decoders.readOp.dynamic *= num_act_mats_hor_dir;
        ptr_array->power_row_decoders.writeOp.dynamic *= num_act_mats_hor_dir;
        ptr_array->power_row_decoders.searchOp.dynamic *= num_act_mats_hor_dir;

        ptr_array->power_bit_mux_predecoder_drivers =
            uca->bank.mat.b_mux_predec->driver_power;
        ptr_array->power_bit_mux_predecoder_drivers.readOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_bit_mux_predecoder_drivers.writeOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_bit_mux_predecoder_drivers.searchOp.dynamic *=
            num_act_mats_hor_dir;

        ptr_array->power_bit_mux_predecoder_blocks =
            uca->bank.mat.b_mux_predec->block_power;
        ptr_array->power_bit_mux_predecoder_blocks.readOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_bit_mux_predecoder_blocks.writeOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_bit_mux_predecoder_blocks.searchOp.dynamic *=
            num_act_mats_hor_dir;

        ptr_array->power_bit_mux_decoders = uca->bank.mat.power_bit_mux_decoders;
        ptr_array->power_bit_mux_decoders.readOp.dynamic *= num_act_mats_hor_dir;
        ptr_array->power_bit_mux_decoders.writeOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_bit_mux_decoders.searchOp.dynamic *=
            num_act_mats_hor_dir;

        ptr_array->power_senseamp_mux_lev_1_predecoder_drivers =
            uca->bank.mat.sa_mux_lev_1_predec->driver_power;
        ptr_array->power_senseamp_mux_lev_1_predecoder_drivers.readOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_senseamp_mux_lev_1_predecoder_drivers.writeOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_senseamp_mux_lev_1_predecoder_drivers.searchOp.dynamic *=
            num_act_mats_hor_dir;

        ptr_array->power_senseamp_mux_lev_1_predecoder_blocks =
            uca->bank.mat.sa_mux_lev_1_predec->block_power;
        ptr_array->power_senseamp_mux_lev_1_predecoder_blocks.readOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_senseamp_mux_lev_1_predecoder_blocks.writeOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_senseamp_mux_lev_1_predecoder_blocks.searchOp.dynamic *=
            num_act_mats_hor_dir;

        ptr_array->power_senseamp_mux_lev_1_decoders =
            uca->bank.mat.power_sa_mux_lev_1_decoders;
        ptr_array->power_senseamp_mux_lev_1_decoders.readOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_senseamp_mux_lev_1_decoders.writeOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_senseamp_mux_lev_1_decoders.searchOp.dynamic *=
            num_act_mats_hor_dir;

        ptr_array->power_senseamp_mux_lev_2_predecoder_drivers =
            uca->bank.mat.sa_mux_lev_2_predec->driver_power;
        ptr_array->power_senseamp_mux_lev_2_predecoder_drivers.readOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_senseamp_mux_lev_2_predecoder_drivers.writeOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_senseamp_mux_lev_2_predecoder_drivers.searchOp.dynamic *=
            num_act_mats_hor_dir;

        ptr_array->power_senseamp_mux_lev_2_predecoder_blocks =
            uca->bank.mat.sa_mux_lev_2_predec->block_power;
        ptr_array->power_senseamp_mux_lev_2_predecoder_blocks.readOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_senseamp_mux_lev_2_predecoder_blocks.writeOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_senseamp_mux_lev_2_predecoder_blocks.searchOp.dynamic *=
            num_act_mats_hor_dir;

        ptr_array->power_senseamp_mux_lev_2_decoders =
            uca->bank.mat.power_sa_mux_lev_2_decoders;
        ptr_array->power_senseamp_mux_lev_2_decoders.readOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_senseamp_mux_lev_2_decoders.writeOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_senseamp_mux_lev_2_decoders.searchOp.dynamic *=
            num_act_mats_hor_dir;

        ptr_array->power_bitlines = uca->bank.mat.power_bitline;
        ptr_array->power_bitlines.readOp.dynamic *= num_act_mats_hor_dir;
        ptr_array->power_bitlines.writeOp.dynamic *= num_act_mats_hor_dir;
        ptr_array->power_bitlines.searchOp.dynamic *= num_act_mats_hor_dir;

        ptr_array->power_sense_amps = uca->bank.mat.power_sa;
        ptr_array->power_sense_amps.readOp.dynamic *= num_act_mats_hor_dir;
        ptr_array->power_sense_amps.writeOp.dynamic *= num_act_mats_hor_dir;
        ptr_array->power_sense_amps.searchOp.dynamic *= num_act_mats_hor_dir;

        ptr_array->power_prechg_eq_drivers =
            uca->bank.mat.power_bl_precharge_eq_drv;
        ptr_array->power_prechg_eq_drivers.readOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_prechg_eq_drivers.writeOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_prechg_eq_drivers.searchOp.dynamic *=
            num_act_mats_hor_dir;

        ptr_array->power_output_drivers_at_subarray =
            uca->bank.mat.power_subarray_out_drv;
        ptr_array->power_output_drivers_at_subarray.readOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_output_drivers_at_subarray.writeOp.dynamic *=
            num_act_mats_hor_dir;
        ptr_array->power_output_drivers_at_subarray.searchOp.dynamic *=
            num_act_mats_hor_dir;

        ptr_array->power_comparators = uca->bank.mat.power_comparator;
        ptr_array->power_comparators.readOp.dynamic *= num_act_mats_hor_dir;
        ptr_array->power_comparators.writeOp.dynamic *= num_act_mats_hor_dir;
        ptr_array->power_comparators.searchOp.dynamic *= num_act_mats_hor_dir;

        if (is_fa || pure_cam) {
            ptr_array->power_htree_in_search =
                uca->bank.htree_in_search->power;
            ptr_array->power_htree_out_search =
                uca->bank.htree_out_search->power;
            ptr_array->power_searchline = uca->bank.mat.power_searchline;
            ptr_array->power_searchline.searchOp.dynamic *= num_mats;
            ptr_array->power_searchline_precharge =
                uca->bank.mat.power_searchline_precharge;
            ptr_array->power_searchline_precharge.searchOp.dynamic *= num_mats;
            ptr_array->power_matchlines = uca->bank.mat.power_matchline;
            ptr_array->power_matchlines.searchOp.dynamic *= num_mats;
            ptr_array->power_matchline_precharge =
                uca->bank.mat.power_matchline_precharge;
            ptr_array->power_matchline_precharge.searchOp.dynamic *= num_mats;
            ptr_array->power_matchline_to_wordline_drv =
                uca->bank.mat.power_ml_to_ram_wl_drv;
        }

        ptr_array->activate_energy = uca->activate_energy;
        ptr_array->read_energy = uca->read_energy;
        ptr_array->write_energy = uca->write_energy;
        ptr_array->precharge_energy = uca->precharge_energy;
        ptr_array->refresh_power = uca->refresh_power;
        ptr_array->leak_power_subbank_closed_page =
            uca->leak_power_subbank_closed_page;
        ptr_array->leak_power_subbank_open_page =
            uca->leak_power_subbank_open_page;
        ptr_array->leak_power_request_and_reply_networks =
            uca->leak_power_request_and_reply_networks;

        ptr_array->precharge_delay = uca->precharge_delay;
    }


    delete uca;
    return true;
}


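/*
 * An organization is kept only if every metric lies within the user-specified
 * percentage deviation from the running minimum, i.e.
 *     ((metric - min_metric) / min_metric) * 100 <= allowed_dev
 * for access time, dynamic read power, leakage, cycle time and area.
 */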
bool check_uca_org(uca_org_t & u, min_values_t *minval) {
    if (((u.access_time - minval->min_delay) * 100 / minval->min_delay) >
        g_ip->delay_dev) {
        return false;
    }
    if (((u.power.readOp.dynamic - minval->min_dyn) / minval->min_dyn) * 100 >
        g_ip->dynamic_power_dev) {
        return false;
    }
    if (((u.power.readOp.leakage - minval->min_leakage) /
         minval->min_leakage) * 100 >
        g_ip->leakage_power_dev) {
        return false;
    }
    if (((u.cycle_time - minval->min_cyc) / minval->min_cyc) * 100 >
        g_ip->cycle_time_dev) {
        return false;
    }
    if (((u.area - minval->min_area) / minval->min_area) * 100 >
        g_ip->area_dev) {
        return false;
    }
    return true;
}

bool check_mem_org(mem_array & u, const min_values_t *minval) {
    if (((u.access_time - minval->min_delay) * 100 / minval->min_delay) >
        g_ip->delay_dev) {
        return false;
    }
    if (((u.power.readOp.dynamic - minval->min_dyn) / minval->min_dyn) * 100 >
        g_ip->dynamic_power_dev) {
        return false;
    }
    if (((u.power.readOp.leakage - minval->min_leakage) /
         minval->min_leakage) * 100 >
        g_ip->leakage_power_dev) {
        return false;
    }
    if (((u.cycle_time - minval->min_cyc) / minval->min_cyc) * 100 >
        g_ip->cycle_time_dev) {
        return false;
    }
    if (((u.area - minval->min_area) / minval->min_area) * 100 >
        g_ip->area_dev) {
        return false;
    }
    return true;
}



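/*
 * Objective used to rank the surviving organizations:
 *   g_ip->ed == 1: energy-delay product,    (T/Tmin) * (Edyn/Emin)
 *   g_ip->ed == 2: energy-delay^2 product,  (T/Tmin)^2 * (Edyn/Emin)
 *   otherwise: a weighted sum of normalized metrics,
 *     cost = d*(T/Tmin) + c*(cyc/cycmin) + dp*(Edyn/Emin)
 *          + lp*(Pleak/Pleakmin) + a*(A/Amin)
 *   using the weights supplied in g_ip.
 */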
void find_optimal_uca(uca_org_t *res, min_values_t * minval,
                      list<uca_org_t> & ulist) {
    double cost = 0;
    double min_cost = BIGNUM;
    float d, a, dp, lp, c;

    dp = g_ip->dynamic_power_wt;
    lp = g_ip->leakage_power_wt;
    a  = g_ip->area_wt;
    d  = g_ip->delay_wt;
    c  = g_ip->cycle_time_wt;

    if (ulist.empty() == true) {
        cout << "ERROR: no valid cache organizations found" << endl;
        exit(1);
    }

    for (list<uca_org_t>::iterator niter = ulist.begin(); niter != ulist.end();
         niter++) {
        if (g_ip->ed == 1) {
            cost = ((niter)->access_time / minval->min_delay) *
                ((niter)->power.readOp.dynamic / minval->min_dyn);
            if (min_cost > cost) {
                min_cost = cost;
                *res = (*(niter));
            }
        } else if (g_ip->ed == 2) {
            cost = ((niter)->access_time / minval->min_delay) *
                   ((niter)->access_time / minval->min_delay) *
                   ((niter)->power.readOp.dynamic / minval->min_dyn);
            if (min_cost > cost) {
                min_cost = cost;
                *res = (*(niter));
            }
        } else {
            /*
             * check whether the current organization
             * meets the input deviation constraints
             */
            bool v = check_uca_org(*niter, minval);

            if (v) {
                cost = (d  * ((niter)->access_time / minval->min_delay) +
                        c  * ((niter)->cycle_time / minval->min_cyc) +
                        dp * ((niter)->power.readOp.dynamic / minval->min_dyn) +
                        lp *
                        ((niter)->power.readOp.leakage / minval->min_leakage) +
                        a  * ((niter)->area / minval->min_area));

                if (min_cost > cost) {
                    min_cost = cost;
                    *res = (*(niter));
                    niter = ulist.erase(niter);
                    if (niter != ulist.begin())
                        niter--;
                }
            } else {
                niter = ulist.erase(niter);
                if (niter != ulist.begin())
                    niter--;
            }
        }
    }

    if (min_cost == BIGNUM) {
        cout << "ERROR: no cache organizations met optimization criteria"
             << endl;
        exit(1);
    }
}



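// Reduce the list of candidate tag-array organizations to the single one with
// the lowest weighted cost (same normalized weights as the UCA objective);
// every other candidate is deleted.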
void filter_tag_arr(const min_values_t * min, list<mem_array *> & list) {
    double cost = BIGNUM;
    double cur_cost;
    double wt_delay = g_ip->delay_wt;
    double wt_dyn = g_ip->dynamic_power_wt;
    double wt_leakage = g_ip->leakage_power_wt;
    double wt_cyc = g_ip->cycle_time_wt;
    double wt_area = g_ip->area_wt;
    mem_array * res = NULL;

    if (list.empty() == true) {
        cout << "ERROR: no valid tag organizations found" << endl;
        exit(1);
    }


    while (list.empty() != true) {
        bool v = check_mem_org(*list.back(), min);
        if (v) {
            cur_cost = wt_delay * (list.back()->access_time / min->min_delay) +
                       wt_dyn * (list.back()->power.readOp.dynamic /
                                 min->min_dyn) +
                       wt_leakage * (list.back()->power.readOp.leakage /
                                     min->min_leakage) +
                       wt_area * (list.back()->area / min->min_area) +
                       wt_cyc * (list.back()->cycle_time / min->min_cyc);
        } else {
            cur_cost = BIGNUM;
        }
        if (cur_cost < cost) {
            if (res != NULL) {
                delete res;
            }
            cost = cur_cost;
            res  = list.back();
        } else {
            delete list.back();
        }
        list.pop_back();
    }
    if (!res) {
        cout << "ERROR: no valid tag organizations found" << endl;
        exit(1);
    }

    list.push_back(res);
}



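// Prune data-array candidates that are more than 50% worse than the running
// minima in both access time and dynamic read energy.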
void filter_data_arr(list<mem_array *> & curr_list) {
    if (curr_list.empty() == true) {
        cout << "ERROR: no valid data array organizations found" << endl;
        exit(1);
    }

    list<mem_array *>::iterator iter;

    for (iter = curr_list.begin(); iter != curr_list.end(); ) {
        mem_array * m = *iter;

        if (m == NULL) exit(1);

        if (((m->access_time - m->arr_min->min_delay) / m->arr_min->min_delay >
             0.5) &&
            ((m->power.readOp.dynamic - m->arr_min->min_dyn) /
             m->arr_min->min_dyn > 0.5)) {
            delete m;
            // erase() returns the next valid iterator; this avoids
            // decrementing begin() when the first element is removed.
            iter = curr_list.erase(iter);
        } else {
            ++iter;
        }
    }
}




/*
 * Performs an exhaustive search across different sub-array sizes,
 * wire types and aspect ratios to find an optimal UCA organization:
 * 1. First, the valid tag array organizations are calculated
 *    and stored in the tag_arr list.
 * 2. The exhaustive search is repeated to find valid data array
 *    organizations, which are stored in the data_arr list.
 * 3. Cache area, delay, power, and cycle time for the different
 *    cache organizations are calculated based on the
 *    above results.
 * 4. The cache model with the least cost is picked from sol_list.
 */
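// A minimal sketch of how this solver is typically driven, assuming the
// global input structure g_ip has already been populated by the caller:
//
//     uca_org_t fin_res;
//     solve(&fin_res);    // exhaustive search, fills fin_res
//     // ... later, after technology parameters change:
//     update(&fin_res);   // refresh power numbers for the chosen arrays
//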
void solve(uca_org_t *fin_res) {
    bool   is_dram  = false;
    int    pure_ram = g_ip->pure_ram;
    bool   pure_cam = g_ip->pure_cam;

    init_tech_params(g_ip->F_sz_um, false);


    list<mem_array *> tag_arr(0);
    list<mem_array *> data_arr(0);
    list<mem_array *>::iterator miter;
    list<uca_org_t> sol_list(1, uca_org_t());

    fin_res->tag_array.access_time = 0;
    fin_res->tag_array.Ndwl = 0;
    fin_res->tag_array.Ndbl = 0;
    fin_res->tag_array.Nspd = 0;
    fin_res->tag_array.deg_bl_muxing = 0;
    fin_res->tag_array.Ndsam_lev_1 = 0;
    fin_res->tag_array.Ndsam_lev_2 = 0;


    // distribute calculate_time() execution to multiple threads
    calc_time_mt_wrapper_struct * calc_array =
        new calc_time_mt_wrapper_struct[nthreads];
    pthread_t threads[nthreads];

    for (uint32_t t = 0; t < nthreads; t++) {
        calc_array[t].tid         = t;
        calc_array[t].pure_ram    = pure_ram;
        calc_array[t].pure_cam    = pure_cam;
        calc_array[t].data_res    = new min_values_t();
        calc_array[t].tag_res     = new min_values_t();
    }

    bool     is_tag;
    uint32_t ram_cell_tech_type;

    // If it's a cache, first calculate the area, delay and power for all
    // tag array partitions.
    if (!(pure_ram || pure_cam || g_ip->fully_assoc)) { // cache
        is_tag = true;
        ram_cell_tech_type = g_ip->tag_arr_ram_cell_tech_type;
        is_dram = ((ram_cell_tech_type == lp_dram) ||
                   (ram_cell_tech_type == comm_dram));
        init_tech_params(g_ip->F_sz_um, is_tag);

        for (uint32_t t = 0; t < nthreads; t++) {
            calc_array[t].is_tag      = is_tag;
            calc_array[t].is_main_mem = false;
            calc_array[t].Nspd_min    = 0.125;
#ifndef DEBUG
            pthread_create(&threads[t], NULL, calc_time_mt_wrapper,
                           (void *)(&(calc_array[t])));
#else
            calc_time_mt_wrapper((void *)(&(calc_array[t])));
#endif
        }

#ifndef DEBUG
        for (uint32_t t = 0; t < nthreads; t++) {
            pthread_join(threads[t], NULL);
        }
#endif

        for (uint32_t t = 0; t < nthreads; t++) {
            calc_array[t].data_arr.sort(mem_array::lt);
            data_arr.merge(calc_array[t].data_arr, mem_array::lt);
            calc_array[t].tag_arr.sort(mem_array::lt);
            tag_arr.merge(calc_array[t].tag_arr, mem_array::lt);
        }
    }


    // Calculate the area, delay, and power for all data array partitions
    // (for a cache or plain RAM). In the new CACTI, CAMs and fully
    // associative caches are processed as a single array in the data portion.
    is_tag              = false;
    ram_cell_tech_type  = g_ip->data_arr_ram_cell_tech_type;
    is_dram             = ((ram_cell_tech_type == lp_dram) || (ram_cell_tech_type == comm_dram));
    init_tech_params(g_ip->F_sz_um, is_tag);

    for (uint32_t t = 0; t < nthreads; t++) {
        calc_array[t].is_tag      = is_tag;
        calc_array[t].is_main_mem = g_ip->is_main_mem;
        if (!(pure_cam || g_ip->fully_assoc)) {
            calc_array[t].Nspd_min = (double)(g_ip->out_w) /
                (double)(g_ip->block_sz * 8);
        } else {
            calc_array[t].Nspd_min = 1;
        }

#ifndef DEBUG
        pthread_create(&threads[t], NULL, calc_time_mt_wrapper,
                       (void *)(&(calc_array[t])));
#else
        calc_time_mt_wrapper((void *)(&(calc_array[t])));
#endif
    }

#ifndef DEBUG
    for (uint32_t t = 0; t < nthreads; t++) {
        pthread_join(threads[t], NULL);
    }
#endif

    data_arr.clear();
    for (uint32_t t = 0; t < nthreads; t++) {
        calc_array[t].data_arr.sort(mem_array::lt);
        data_arr.merge(calc_array[t].data_arr, mem_array::lt);
    }

    min_values_t * d_min = new min_values_t();
    min_values_t * t_min = new min_values_t();
    min_values_t * cache_min = new min_values_t();

    for (uint32_t t = 0; t < nthreads; t++) {
        d_min->update_min_values(calc_array[t].data_res);
        t_min->update_min_values(calc_array[t].tag_res);
    }

    for (miter = data_arr.begin(); miter != data_arr.end(); miter++) {
        (*miter)->arr_min = d_min;
    }

    filter_data_arr(data_arr);
    if (!(pure_ram || pure_cam || g_ip->fully_assoc)) {
        filter_tag_arr(t_min, tag_arr);
    }

    if (pure_ram || pure_cam || g_ip->fully_assoc) {
        for (miter = data_arr.begin(); miter != data_arr.end(); miter++) {
            uca_org_t & curr_org  = sol_list.back();
            curr_org.tag_array2  = NULL;
            curr_org.data_array2 = (*miter);

            curr_org.find_delay();
            curr_org.find_energy();
            curr_org.find_area();
            curr_org.find_cyc();

            // update min values for the entire cache
            cache_min->update_min_values(curr_org);

            sol_list.push_back(uca_org_t());
        }
    } else {
        while (tag_arr.empty() != true) {
            mem_array * arr_temp = (tag_arr.back());
            tag_arr.pop_back();

            for (miter = data_arr.begin(); miter != data_arr.end(); miter++) {
                uca_org_t & curr_org  = sol_list.back();
                curr_org.tag_array2  = arr_temp;
                curr_org.data_array2 = (*miter);

                curr_org.find_delay();
                curr_org.find_energy();
                curr_org.find_area();
                curr_org.find_cyc();

                // update min values for the entire cache
                cache_min->update_min_values(curr_org);

                sol_list.push_back(uca_org_t());
            }
        }
    }

    sol_list.pop_back();

    find_optimal_uca(fin_res, cache_min, sol_list);

    sol_list.clear();

    for (miter = data_arr.begin(); miter != data_arr.end(); ++miter) {
        if (*miter != fin_res->data_array2) {
            delete *miter;
        }
    }
    data_arr.clear();

    for (uint32_t t = 0; t < nthreads; t++) {
        delete calc_array[t].data_res;
        delete calc_array[t].tag_res;
    }

    delete [] calc_array;
    delete cache_min;
    delete d_min;
    delete t_min;
}

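// Recompute the power of the already-chosen tag and data array organizations
// with the current technology parameters (used by the leakage-feedback path);
// the array partitioning itself is left unchanged.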
void update(uca_org_t *fin_res) {
    if (fin_res->tag_array2) {
        init_tech_params(g_ip->F_sz_um, true);
        DynamicParameter tag_arr_dyn_p(true, g_ip->pure_ram, g_ip->pure_cam,
                                       fin_res->tag_array2->Nspd,
                                       fin_res->tag_array2->Ndwl,
                                       fin_res->tag_array2->Ndbl,
                                       fin_res->tag_array2->Ndcm,
                                       fin_res->tag_array2->Ndsam_lev_1,
                                       fin_res->tag_array2->Ndsam_lev_2,
                                       g_ip->is_main_mem);
        if (tag_arr_dyn_p.is_valid) {
            UCA * tag_arr = new UCA(tag_arr_dyn_p);
            fin_res->tag_array2->power = tag_arr->power;
        } else {
            cout << "ERROR: Cannot retrieve array structure for leakage feedback"
                 << endl;
            exit(1);
        }
    }
    init_tech_params(g_ip->F_sz_um, false);
    DynamicParameter data_arr_dyn_p(false, g_ip->pure_ram, g_ip->pure_cam,
                                    fin_res->data_array2->Nspd,
                                    fin_res->data_array2->Ndwl,
                                    fin_res->data_array2->Ndbl,
                                    fin_res->data_array2->Ndcm,
                                    fin_res->data_array2->Ndsam_lev_1,
                                    fin_res->data_array2->Ndsam_lev_2,
                                    g_ip->is_main_mem);
    if (data_arr_dyn_p.is_valid) {
        UCA * data_arr = new UCA(data_arr_dyn_p);
        fin_res->data_array2->power = data_arr->power;
    } else {
        cout << "ERROR: Cannot retrieve array structure for leakage feedback"
             << endl;
        exit(1);
    }

    fin_res->find_energy();
}