1/*****************************************************************************
2 *                                McPAT
3 *                      SOFTWARE LICENSE AGREEMENT
4 *            Copyright (c) 2010-2013 Advanced Micro Devices, Inc.
5 *                          All Rights Reserved
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are
9 * met: redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer;
11 * redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution;
14 * neither the name of the copyright holders nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 *
30 * Authors: Joel Hestness
31 *          Yasuko Eckert
32 *
33 ***************************************************************************/
34
35#include <cmath>
36#include <iostream>
37
38#include "area.h"
39#include "cachearray.h"
40#include "common.h"
41#include "decoder.h"
42#include "parameter.h"
43
44using namespace std;
45
46double CacheArray::area_efficiency_threshold = 20.0;
47int CacheArray::ed = 0;
48//Fixed number, make sure timing can be satisfied.
49int CacheArray::delay_wt = 100;
50int CacheArray::cycle_time_wt = 1000;
51//Fixed number, This is used to exhaustive search for individual components.
52int CacheArray::area_wt = 10;
53//Fixed number, This is used to exhaustive search for individual components.
54int CacheArray::dynamic_power_wt = 10;
55int CacheArray::leakage_power_wt = 10;
56//Fixed number, make sure timing can be satisfied.
57int CacheArray::delay_dev = 1000000;
58int CacheArray::cycle_time_dev = 100;
59//Fixed number, This is used to exhaustive search for individual components.
60int CacheArray::area_dev = 1000000;
61//Fixed number, This is used to exhaustive search for individual components.
62int CacheArray::dynamic_power_dev = 1000000;
63int CacheArray::leakage_power_dev = 1000000;
64int CacheArray::cycle_time_dev_threshold = 10;
65
66CacheArray::CacheArray(XMLNode* _xml_data,
67                 const InputParameter *configure_interface, string _name,
68                 enum Device_ty device_ty_, double _clockRate,
69                 bool opt_local_, enum Core_type core_ty_, bool _is_default)
70        : McPATComponent(_xml_data), l_ip(*configure_interface),
71        device_ty(device_ty_), opt_local(opt_local_), core_ty(core_ty_),
72        is_default(_is_default), sbt_dir_overhead(0) {
73    name = _name;
74    clockRate = _clockRate;
75    if (l_ip.cache_sz < MIN_BUFFER_SIZE) {
76        l_ip.cache_sz = MIN_BUFFER_SIZE;
77    }
78
79    if (!l_ip.error_checking(name)) {
80        exit(1);
81    }
82
83    sbt_tdp_stats.reset();
84    sbt_rtp_stats.reset();
85
86    // Compute initial search point
87    local_result.valid = false;
88    compute_base_power();
89
90    // Set up the cache by searching design space with cacti
91    list<uca_org_t > candidate_solutions(0);
92    list<uca_org_t >::iterator candidate_iter, min_dynamic_energy_iter;
93    uca_org_t* temp_res = NULL;
94    double throughput = l_ip.throughput;
95    double latency = l_ip.latency;
96    bool throughput_overflow = true;
97    bool latency_overflow = true;
98
99    if ((local_result.cycle_time - throughput) <= 1e-10 )
100        throughput_overflow = false;
101    if ((local_result.access_time - latency) <= 1e-10)
102        latency_overflow = false;
103
104    if (opt_for_clk && opt_local) {
105        if (throughput_overflow || latency_overflow) {
106            l_ip.ed = ed;
107
108            l_ip.delay_wt = delay_wt;
109            l_ip.cycle_time_wt = cycle_time_wt;
110
111            l_ip.area_wt = area_wt;
112            l_ip.dynamic_power_wt = dynamic_power_wt;
113            l_ip.leakage_power_wt = leakage_power_wt;
114
115            l_ip.delay_dev = delay_dev;
116            l_ip.cycle_time_dev = cycle_time_dev;
117
118            l_ip.area_dev = area_dev;
119            l_ip.dynamic_power_dev = dynamic_power_dev;
120            l_ip.leakage_power_dev = leakage_power_dev;
121
122            //Reset overflow flag before start optimization iterations
123            throughput_overflow = true;
124            latency_overflow = true;
125
126            //Clean up the result for optimized for ED^2P
127            temp_res = &local_result;
128            temp_res->cleanup();
129        }
130
131
132        while ((throughput_overflow || latency_overflow) &&
133               l_ip.cycle_time_dev > cycle_time_dev_threshold) {
134            compute_base_power();
135
136            //This is the time_dev to be used for next iteration
137            l_ip.cycle_time_dev -= cycle_time_dev_threshold;
138
139            //      from best area to worst area -->worst timing to best timing
140            if ((((local_result.cycle_time - throughput) <= 1e-10 ) &&
141                 (local_result.access_time - latency) <= 1e-10) ||
142                (local_result.data_array2->area_efficiency <
143                 area_efficiency_threshold && l_ip.assoc == 0)) {
144                //if no satisfiable solution is found,the most aggressive one
145                //is left
146                candidate_solutions.push_back(local_result);
147                if (((local_result.cycle_time - throughput) <= 1e-10) &&
148                    ((local_result.access_time - latency) <= 1e-10)) {
149                    //ensure stop opt not because of cam
150                    throughput_overflow = false;
151                    latency_overflow = false;
152                }
153
154            } else {
155                if ((local_result.cycle_time - throughput) <= 1e-10)
156                    throughput_overflow = false;
157                if ((local_result.access_time - latency) <= 1e-10)
158                    latency_overflow = false;
159
160                //if not >10 local_result is the last result, it cannot be
161                //cleaned up
162                if (l_ip.cycle_time_dev > cycle_time_dev_threshold) {
163                    //Only solutions not saved in the list need to be
164                    //cleaned up
165                    temp_res = &local_result;
166                    temp_res->cleanup();
167                }
168            }
169        }
170
171
172        if (l_ip.assoc > 0) {
173            //For array structures except CAM and FA, Give warning but still
174            //provide a result with best timing found
175            if (throughput_overflow == true)
176                cout << "Warning: " << name
177                     << " array structure cannot satisfy throughput constraint."
178                     << endl;
179            if (latency_overflow == true)
180                cout << "Warning: " << name
181                     << " array structure cannot satisfy latency constraint."
182                     << endl;
183        }
184
185        double min_dynamic_energy = BIGNUM;
186        if (candidate_solutions.empty() == false) {
187            local_result.valid = true;
188            for (candidate_iter = candidate_solutions.begin();
189                 candidate_iter != candidate_solutions.end();
190                 ++candidate_iter) {
191                if (min_dynamic_energy >
192                    (candidate_iter)->power.readOp.dynamic) {
193                    min_dynamic_energy =
194                        (candidate_iter)->power.readOp.dynamic;
195                    min_dynamic_energy_iter = candidate_iter;
196                    local_result = *(min_dynamic_energy_iter);
197
198                } else {
199                    candidate_iter->cleanup() ;
200                }
201
202            }
203
204
205        }
206        candidate_solutions.clear();
207    }
208
209    double long_channel_device_reduction =
210        longer_channel_device_reduction(device_ty, core_ty);
211
212    double macro_layout_overhead = g_tp.macro_layout_overhead;
213    double chip_PR_overhead = g_tp.chip_layout_overhead;
214    double total_overhead = macro_layout_overhead * chip_PR_overhead;
215    local_result.area *= total_overhead;
216
217    //maintain constant power density
218    double pppm_t[4]    = {total_overhead, 1, 1, total_overhead};
219
220    double sckRation = g_tp.sckt_co_eff;
221    local_result.power.readOp.dynamic *= sckRation;
222    local_result.power.writeOp.dynamic *= sckRation;
223    local_result.power.searchOp.dynamic *= sckRation;
224    local_result.power.readOp.leakage *= l_ip.nbanks;
225    local_result.power.readOp.longer_channel_leakage =
226        local_result.power.readOp.leakage * long_channel_device_reduction;
227    local_result.power = local_result.power * pppm_t;
228
229    local_result.data_array2->power.readOp.dynamic *= sckRation;
230    local_result.data_array2->power.writeOp.dynamic *= sckRation;
231    local_result.data_array2->power.searchOp.dynamic *= sckRation;
232    local_result.data_array2->power.readOp.leakage *= l_ip.nbanks;
233    local_result.data_array2->power.readOp.longer_channel_leakage =
234        local_result.data_array2->power.readOp.leakage *
235        long_channel_device_reduction;
236    local_result.data_array2->power = local_result.data_array2->power * pppm_t;
237
238
239    if (!(l_ip.pure_cam || l_ip.pure_ram || l_ip.fully_assoc) && l_ip.is_cache) {
240        local_result.tag_array2->power.readOp.dynamic *= sckRation;
241        local_result.tag_array2->power.writeOp.dynamic *= sckRation;
242        local_result.tag_array2->power.searchOp.dynamic *= sckRation;
243        local_result.tag_array2->power.readOp.leakage *= l_ip.nbanks;
244        local_result.tag_array2->power.readOp.longer_channel_leakage =
245            local_result.tag_array2->power.readOp.leakage *
246            long_channel_device_reduction;
247        local_result.tag_array2->power =
248            local_result.tag_array2->power * pppm_t;
249    }
250}
251
252void CacheArray::compute_base_power() {
253    local_result = cacti_interface(&l_ip);
254}
255
256void CacheArray::computeArea() {
257    area.set_area(local_result.area);
258    output_data.area = local_result.area / 1e6;
259}
260
261void CacheArray::computeEnergy() {
262    // Set the leakage power numbers
263    output_data.subthreshold_leakage_power = local_result.power.readOp.leakage;
264    output_data.gate_leakage_power = local_result.power.readOp.gate_leakage;
265
266    if (l_ip.assoc && l_ip.is_cache) {
267        // This is a standard cache array with data and tags
268        // Calculate peak dynamic power
269        output_data.peak_dynamic_power =
270            (local_result.tag_array2->power.readOp.dynamic +
271             local_result.data_array2->power.readOp.dynamic) *
272            tdp_stats.readAc.hit +
273            (local_result.tag_array2->power.readOp.dynamic) *
274            tdp_stats.readAc.miss +
275            (local_result.tag_array2->power.readOp.dynamic +
276             local_result.data_array2->power.writeOp.dynamic) *
277            tdp_stats.writeAc.hit +
278            (local_result.tag_array2->power.readOp.dynamic) *
279            tdp_stats.writeAc.miss;
280        output_data.peak_dynamic_power *= clockRate;
281
282        // Calculate the runtime dynamic power
283        output_data.runtime_dynamic_energy =
284            local_result.data_array2->power.readOp.dynamic *
285            rtp_stats.dataReadAc.access +
286            local_result.data_array2->power.writeOp.dynamic *
287            rtp_stats.dataWriteAc.access +
288            (local_result.tag_array2->power.readOp.dynamic *
289             rtp_stats.tagReadAc.access +
290             local_result.tag_array2->power.writeOp.dynamic *
291             rtp_stats.tagWriteAc.access) * l_ip.assoc;
292    } else {
293        // Calculate peak dynamic power
294        output_data.peak_dynamic_power =
295                local_result.power.readOp.dynamic * tdp_stats.readAc.access +
296                local_result.power.writeOp.dynamic * tdp_stats.writeAc.access +
297                local_result.power.searchOp.dynamic * tdp_stats.searchAc.access;
298        output_data.peak_dynamic_power *= clockRate;
299
300        // Calculate the runtime dynamic power
301        output_data.runtime_dynamic_energy =
302                local_result.power.readOp.dynamic * rtp_stats.readAc.access +
303                local_result.power.writeOp.dynamic * rtp_stats.writeAc.access +
304                local_result.power.searchOp.dynamic * rtp_stats.searchAc.access;
305    }
306
307    // An SBT directory has more dynamic power
308    if (sbt_dir_overhead > 0) {
309        // Calculate peak dynamic power
310        output_data.peak_dynamic_power +=
311            (computeSBTDynEnergy(&sbt_tdp_stats) * clockRate);
312
313        // Calculate the runtime dynamic power
314        output_data.runtime_dynamic_energy +=
315            computeSBTDynEnergy(&sbt_rtp_stats);
316    }
317}
318
319CacheArray::~CacheArray() {
320    local_result.cleanup();
321}
322