1/***************************************************************************** 2 * McPAT 3 * SOFTWARE LICENSE AGREEMENT 4 * Copyright 2012 Hewlett-Packard Development Company, L.P. 5 * Copyright (c) 2010-2013 Advanced Micro Devices, Inc. 6 * All Rights Reserved 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions are 10 * met: redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer; 12 * redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution; 15 * neither the name of the copyright holders nor the names of its 16 * contributors may be used to endorse or promote products derived from 17 * this software without specific prior written permission. 18 19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 * 31 ***************************************************************************/ 32 33#include "common.h" 34#include "logic.h" 35 36//selection_logic 37selection_logic::selection_logic(XMLNode* _xml_data, bool _is_default, 38 int _win_entries, int issue_width_, 39 const InputParameter *configure_interface, 40 string _name, double _accesses, 41 double clockRate_, enum Device_ty device_ty_, 42 enum Core_type core_ty_) 43 : McPATComponent(_xml_data), is_default(_is_default), 44 win_entries(_win_entries), 45 issue_width(issue_width_), 46 accesses(_accesses), 47 device_ty(device_ty_), 48 core_ty(core_ty_) { 49 clockRate = clockRate_; 50 name = _name; 51 l_ip = *configure_interface; 52 local_result = init_interface(&l_ip, name); 53} 54 55void selection_logic::computeArea() { 56 output_data.area = local_result.area; 57} 58 59void selection_logic::computeEnergy() { 60 //based on cost effective superscalar processor TR pp27-31 61 double Ctotal, Cor, Cpencode; 62 int num_arbiter; 63 double WSelORn, WSelORprequ, WSelPn, WSelPp, WSelEnn, WSelEnp; 64 65 //the 0.8um process data is used. 66 //this was 10 micron for the 0.8 micron process 67 WSelORn = 12.5 * l_ip.F_sz_um; 68 //this was 40 micron for the 0.8 micron process 69 WSelORprequ = 50 * l_ip.F_sz_um; 70 //this was 10mcron for the 0.8 micron process 71 WSelPn = 12.5 * l_ip.F_sz_um; 72 //this was 15 micron for the 0.8 micron process 73 WSelPp = 18.75 * l_ip.F_sz_um; 74 //this was 5 micron for the 0.8 micron process 75 WSelEnn = 6.25 * l_ip.F_sz_um; 76 //this was 10 micron for the 0.8 micron process 77 WSelEnp = 12.5 * l_ip.F_sz_um; 78 79 Ctotal = 0; 80 num_arbiter = 1; 81 while (win_entries > 4) { 82 win_entries = (int)ceil((double)win_entries / 4.0); 83 num_arbiter += win_entries; 84 } 85 //the 4-input OR logic to generate anyreq 86 Cor = 4 * drain_C_(WSelORn, NCH, 1, 1, g_tp.cell_h_def) + 87 drain_C_(WSelORprequ, PCH, 1, 1, g_tp.cell_h_def); 88 power.readOp.gate_leakage = 89 cmos_Ig_leakage(WSelORn, WSelORprequ, 4, nor) * g_tp.peri_global.Vdd; 90 91 //The total capacity of the 4-bit priority encoder 92 Cpencode = drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) + 93 drain_C_(WSelPp, PCH, 1, 1, g_tp.cell_h_def) + 94 2 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) + 95 drain_C_(WSelPp, PCH, 2, 1, g_tp.cell_h_def) + 96 3 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) + 97 drain_C_(WSelPp, PCH, 3, 1, g_tp.cell_h_def) + 98 4 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) + 99 drain_C_(WSelPp, PCH, 4, 1, g_tp.cell_h_def) +//precompute priority logic 100 2 * 4 * gate_C(WSelEnn + WSelEnp, 20.0) + 101 4 * drain_C_(WSelEnn, NCH, 1, 1, g_tp.cell_h_def) + 102 2 * 4 * drain_C_(WSelEnp, PCH, 1, 1, g_tp.cell_h_def) +//enable logic 103 (2 * 4 + 2 * 3 + 2 * 2 + 2) * 104 gate_C(WSelPn + WSelPp, 10.0);//requests signal 105 106 Ctotal += issue_width * num_arbiter * (Cor + Cpencode); 107 108 //2 means the abitration signal need to travel round trip 109 power.readOp.dynamic = 110 Ctotal * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd * 2; 111 power.readOp.leakage = issue_width * num_arbiter * 112 (cmos_Isub_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p 113 + cmos_Isub_leakage(WSelPn, WSelPp, 3, nor)//grant2p 114 + cmos_Isub_leakage(WSelPn, WSelPp, 4, nor)//grant3p 115 + cmos_Isub_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic 116 + cmos_Isub_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant sIsubnals 117 ) * g_tp.peri_global.Vdd; 118 power.readOp.gate_leakage = issue_width * num_arbiter * 119 (cmos_Ig_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p 120 + cmos_Ig_leakage(WSelPn, WSelPp, 3, nor)//grant2p 121 + cmos_Ig_leakage(WSelPn, WSelPp, 4, nor)//grant3p 122 + cmos_Ig_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic 123 + cmos_Ig_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant signals 124 ) * g_tp.peri_global.Vdd; 125 double sckRation = g_tp.sckt_co_eff; 126 power.readOp.dynamic *= sckRation; 127 power.writeOp.dynamic *= sckRation; 128 power.searchOp.dynamic *= sckRation; 129 130 double long_channel_device_reduction = 131 longer_channel_device_reduction(device_ty, core_ty); 132 power.readOp.longer_channel_leakage = 133 power.readOp.leakage * long_channel_device_reduction; 134 135 output_data.peak_dynamic_power = power.readOp.dynamic * clockRate; 136 output_data.subthreshold_leakage_power = power.readOp.leakage; 137 output_data.gate_leakage_power = power.readOp.gate_leakage; 138 output_data.runtime_dynamic_energy = power.readOp.dynamic * accesses; 139} 140 141dep_resource_conflict_check::dep_resource_conflict_check( 142 XMLNode* _xml_data, const string _name, 143 const InputParameter *configure_interface, 144 const CoreParameters & dyn_p_, int compare_bits_, 145 double clockRate_, bool _is_default) 146 : McPATComponent(_xml_data), l_ip(*configure_interface), 147 coredynp(dyn_p_), compare_bits(compare_bits_), is_default(_is_default) { 148 149 name = _name; 150 clockRate = clockRate_; 151 //this was 20.0 micron for the 0.8 micron process 152 Wcompn = 25 * l_ip.F_sz_um; 153 //this was 20.0 micron for the 0.8 micron process 154 Wevalinvp = 25 * l_ip.F_sz_um; 155 //this was 80.0 mcron for the 0.8 micron process 156 Wevalinvn = 100 * l_ip.F_sz_um; 157 //this was 40.0 micron for the 0.8 micron process 158 Wcomppreequ = 50 * l_ip.F_sz_um; 159 //this was 5.4 micron for the 0.8 micron process 160 WNORn = 6.75 * l_ip.F_sz_um; 161 //this was 30.5 micron for the 0.8 micron process 162 WNORp = 38.125 * l_ip.F_sz_um; 163 164 // To make CACTI happy. 165 l_ip.cache_sz = MIN_BUFFER_SIZE; 166 local_result = init_interface(&l_ip, name); 167 168 if (coredynp.core_ty == Inorder) 169 //TODO: opcode bits + log(shared resources) + REG TAG BITS --> 170 //opcode comparator 171 compare_bits += 16 + 8 + 8; 172 else 173 compare_bits += 16 + 8 + 8; 174 175 conflict_check_power(); 176 double sckRation = g_tp.sckt_co_eff; 177 power.readOp.dynamic *= sckRation; 178 power.writeOp.dynamic *= sckRation; 179 power.searchOp.dynamic *= sckRation; 180 181} 182 183void dep_resource_conflict_check::conflict_check_power() { 184 double Ctotal; 185 int num_comparators; 186 //2(N*N-N) is used for source to dest comparison, (N*N-N) is used for 187 //dest to dest comparision. 188 num_comparators = 3 * ((coredynp.decodeW) * (coredynp.decodeW) - 189 coredynp.decodeW); 190 191 Ctotal = num_comparators * compare_cap(); 192 193 power.readOp.dynamic = Ctotal * /*CLOCKRATE*/ g_tp.peri_global.Vdd * 194 g_tp.peri_global.Vdd /*AF*/; 195 power.readOp.leakage = num_comparators * compare_bits * 2 * 196 simplified_nmos_leakage(Wcompn, false); 197 198 double long_channel_device_reduction = 199 longer_channel_device_reduction(Core_device, coredynp.core_ty); 200 power.readOp.longer_channel_leakage = 201 power.readOp.leakage * long_channel_device_reduction; 202 power.readOp.gate_leakage = num_comparators * compare_bits * 2 * 203 cmos_Ig_leakage(Wcompn, 0, 2, nmos); 204 205} 206 207/* estimate comparator power consumption (this comparator is similar 208 to the tag-match structure in a CAM */ 209double dep_resource_conflict_check::compare_cap() { 210 double c1, c2; 211 212 //resize the big NOR gate at the DCL according to fan in. 213 WNORp = WNORp * compare_bits / 2.0; 214 /* bottom part of comparator */ 215 c2 = (compare_bits) * (drain_C_(Wcompn, NCH, 1, 1, g_tp.cell_h_def) + 216 drain_C_(Wcompn, NCH, 2, 1, g_tp.cell_h_def)) + 217 drain_C_(Wevalinvp, PCH, 1, 1, g_tp.cell_h_def) + 218 drain_C_(Wevalinvn, NCH, 1, 1, g_tp.cell_h_def); 219 220 /* top part of comparator */ 221 c1 = (compare_bits) * (drain_C_(Wcompn, NCH, 1, 1, g_tp.cell_h_def) + 222 drain_C_(Wcompn, NCH, 2, 1, g_tp.cell_h_def) + 223 drain_C_(Wcomppreequ, NCH, 1, 1, g_tp.cell_h_def)) + 224 gate_C(WNORn + WNORp, 10.0) + 225 drain_C_(WNORp, NCH, 2, 1, g_tp.cell_h_def) + compare_bits * 226 drain_C_(WNORn, NCH, 2, 1, g_tp.cell_h_def); 227 return(c1 + c2); 228 229} 230 231void dep_resource_conflict_check::leakage_feedback(double temperature) 232{ 233 l_ip.temp = (unsigned int)round(temperature/10.0)*10; 234 uca_org_t init_result = init_interface(&l_ip, name); // init_result is dummy 235 236 // This is part of conflict_check_power() 237 // 2(N*N-N) is used for source to dest comparison, (N*N-N) is used for dest 238 // to dest comparison. 239 int num_comparators = 3 * ((coredynp.decodeW) * (coredynp.decodeW) - 240 coredynp.decodeW); 241 power.readOp.leakage = num_comparators * compare_bits * 2 * 242 simplified_nmos_leakage(Wcompn, false); 243 244 double long_channel_device_reduction = 245 longer_channel_device_reduction(Core_device, coredynp.core_ty); 246 power.readOp.longer_channel_leakage = power.readOp.leakage * 247 long_channel_device_reduction; 248 power.readOp.gate_leakage = num_comparators * compare_bits * 2 * 249 cmos_Ig_leakage(Wcompn, 0, 2, nmos); 250} 251 252 253DFFCell::DFFCell( 254 bool _is_dram, 255 double _WdecNANDn, 256 double _WdecNANDp, 257 double _cell_load, 258 const InputParameter *configure_interface) 259 : is_dram(_is_dram), 260 cell_load(_cell_load), 261 WdecNANDn(_WdecNANDn), 262 WdecNANDp(_WdecNANDp) { //this model is based on the NAND2 based DFF. 263 l_ip = *configure_interface; 264 area.set_area(5 * compute_gate_area(NAND, 2,WdecNANDn,WdecNANDp, 265 g_tp.cell_h_def) 266 + compute_gate_area(NAND, 2,WdecNANDn,WdecNANDn, 267 g_tp.cell_h_def)); 268 269 270} 271 272 273double DFFCell::fpfp_node_cap(unsigned int fan_in, unsigned int fan_out) { 274 double Ctotal = 0; 275 276 /* part 1: drain cap of NAND gate */ 277 Ctotal += drain_C_(WdecNANDn, NCH, 2, 1, g_tp.cell_h_def, is_dram) + fan_in * drain_C_(WdecNANDp, PCH, 1, 1, g_tp.cell_h_def, is_dram); 278 279 /* part 2: gate cap of NAND gates */ 280 Ctotal += fan_out * gate_C(WdecNANDn + WdecNANDp, 0, is_dram); 281 282 return Ctotal; 283} 284 285 286void DFFCell::compute_DFF_cell() { 287 double c1, c2, c3, c4, c5, c6; 288 /* node 5 and node 6 are identical to node 1 in capacitance */ 289 c1 = c5 = c6 = fpfp_node_cap(2, 1); 290 c2 = fpfp_node_cap(2, 3); 291 c3 = fpfp_node_cap(3, 2); 292 c4 = fpfp_node_cap(2, 2); 293 294 //cap-load of the clock signal in each Dff, actually the clock signal only connected to one NAND2 295 clock_cap = 2 * gate_C(WdecNANDn + WdecNANDp, 0, is_dram); 296 e_switch.readOp.dynamic += (c4 + c1 + c2 + c3 + c5 + c6 + 2 * cell_load) * 297 0.5 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;; 298 299 /* no 1/2 for e_keep and e_clock because clock signal switches twice in one cycle */ 300 e_keep_1.readOp.dynamic += 301 c3 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ; 302 e_keep_0.readOp.dynamic += 303 c2 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ; 304 e_clock.readOp.dynamic += 305 clock_cap * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;; 306 307 /* static power */ 308 e_switch.readOp.leakage += 309 (cmos_Isub_leakage(WdecNANDn, WdecNANDp, 2, nand) * 310 5//5 NAND2 and 1 NAND3 in a DFF 311 + cmos_Isub_leakage(WdecNANDn, WdecNANDn, 3, nand)) * 312 g_tp.peri_global.Vdd; 313 e_switch.readOp.gate_leakage += 314 (cmos_Ig_leakage(WdecNANDn, WdecNANDp, 2, nand) * 315 5//5 NAND2 and 1 NAND3 in a DFF 316 + cmos_Ig_leakage(WdecNANDn, WdecNANDn, 3, nand)) * 317 g_tp.peri_global.Vdd; 318} 319 320Pipeline::Pipeline(XMLNode* _xml_data, 321 const InputParameter *configure_interface, 322 const CoreParameters & dyn_p_, 323 enum Device_ty device_ty_, 324 bool _is_core_pipeline, 325 bool _is_default) 326 : McPATComponent(_xml_data), l_ip(*configure_interface), 327 coredynp(dyn_p_), device_ty(device_ty_), 328 is_core_pipeline(_is_core_pipeline), is_default(_is_default), 329 num_piperegs(0.0) { 330 name = "Pipeline?"; 331 332 local_result = init_interface(&l_ip, name); 333 if (!coredynp.Embedded) { 334 process_ind = true; 335 } else { 336 process_ind = false; 337 } 338 //this was 20 micron for the 0.8 micron process 339 WNANDn = (process_ind) ? 25 * l_ip.F_sz_um : g_tp.min_w_nmos_ ; 340 //this was 30 micron for the 0.8 micron process 341 WNANDp = (process_ind) ? 37.5 * l_ip.F_sz_um : g_tp.min_w_nmos_ * 342 pmos_to_nmos_sz_ratio(); 343 load_per_pipeline_stage = 2 * gate_C(WNANDn + WNANDp, 0, false); 344 compute(); 345 346} 347 348void Pipeline::compute() { 349 compute_stage_vector(); 350 DFFCell pipe_reg(false, WNANDn, WNANDp, load_per_pipeline_stage, &l_ip); 351 pipe_reg.compute_DFF_cell(); 352 353 double clock_power_pipereg = num_piperegs * pipe_reg.e_clock.readOp.dynamic; 354 //******************pipeline power: currently, we average all the possibilities of the states of DFFs in the pipeline. A better way to do it is to consider 355 //the harming distance of two consecutive signals, However McPAT does not have plan to do this in near future as it focuses on worst case power. 356 double pipe_reg_power = num_piperegs * 357 (pipe_reg.e_switch.readOp.dynamic + pipe_reg.e_keep_0.readOp.dynamic + 358 pipe_reg.e_keep_1.readOp.dynamic) / 3 + clock_power_pipereg; 359 double pipe_reg_leakage = num_piperegs * pipe_reg.e_switch.readOp.leakage; 360 double pipe_reg_gate_leakage = num_piperegs * 361 pipe_reg.e_switch.readOp.gate_leakage; 362 power.readOp.dynamic += pipe_reg_power; 363 power.readOp.leakage += pipe_reg_leakage; 364 power.readOp.gate_leakage += pipe_reg_gate_leakage; 365 area.set_area(num_piperegs * pipe_reg.area.get_area()); 366 367 double long_channel_device_reduction = 368 longer_channel_device_reduction(device_ty, coredynp.core_ty); 369 power.readOp.longer_channel_leakage = power.readOp.leakage * 370 long_channel_device_reduction; 371 372 373 double sckRation = g_tp.sckt_co_eff; 374 power.readOp.dynamic *= sckRation; 375 power.writeOp.dynamic *= sckRation; 376 power.searchOp.dynamic *= sckRation; 377 double macro_layout_overhead = g_tp.macro_layout_overhead; 378 if (!coredynp.Embedded) 379 area.set_area(area.get_area() * macro_layout_overhead); 380 381 output_data.area = area.get_area() / 1e6; 382 output_data.peak_dynamic_power = power.readOp.dynamic * clockRate; 383 output_data.subthreshold_leakage_power = power.readOp.leakage; 384 output_data.gate_leakage_power = power.readOp.gate_leakage; 385 output_data.runtime_dynamic_energy = power.readOp.dynamic * total_cycles; 386} 387 388void Pipeline::compute_stage_vector() { 389 double num_stages, tot_stage_vector, per_stage_vector; 390 int opcode_length = coredynp.x86 ? 391 coredynp.micro_opcode_length : coredynp.opcode_width; 392 393 if (!is_core_pipeline) { 394 //The number of pipeline stages are calculated based on the achievable 395 //throughput and required throughput 396 num_piperegs = l_ip.pipeline_stages * l_ip.per_stage_vector; 397 } else { 398 if (coredynp.core_ty == Inorder) { 399 /* assume 6 pipe stages and try to estimate bits per pipe stage */ 400 /* pipe stage 0/IF */ 401 num_piperegs += coredynp.pc_width * 2 * coredynp.num_hthreads; 402 /* pipe stage IF/ID */ 403 num_piperegs += coredynp.fetchW * 404 (coredynp.instruction_length + coredynp.pc_width) * 405 coredynp.num_hthreads; 406 /* pipe stage IF/ThreadSEL */ 407 if (coredynp.multithreaded) { 408 num_piperegs += coredynp.num_hthreads * 409 coredynp.perThreadState; //8 bit thread states 410 } 411 /* pipe stage ID/EXE */ 412 num_piperegs += coredynp.decodeW * 413 (coredynp.instruction_length + coredynp.pc_width + 414 pow(2.0, opcode_length) + 2 * coredynp.int_data_width) * 415 coredynp.num_hthreads; 416 /* pipe stage EXE/MEM */ 417 num_piperegs += coredynp.issueW * 418 (3 * coredynp.arch_ireg_width + pow(2.0, opcode_length) + 8 * 419 2 * coredynp.int_data_width/*+2*powers (2,reg_length)*/); 420 /* pipe stage MEM/WB the 2^opcode_length means the total decoded signal for the opcode*/ 421 num_piperegs += coredynp.issueW * 422 (2 * coredynp.int_data_width + pow(2.0, opcode_length) + 8 * 423 2 * coredynp.int_data_width/*+2*powers (2,reg_length)*/); 424 num_stages = 6; 425 } else { 426 /* assume 12 stage pipe stages and try to estimate bits per pipe stage */ 427 /*OOO: Fetch, decode, rename, IssueQ, dispatch, regread, EXE, MEM, WB, CM */ 428 429 /* pipe stage 0/1F*/ 430 num_piperegs += 431 coredynp.pc_width * 2 * coredynp.num_hthreads ;//PC and Next PC 432 /* pipe stage IF/ID */ 433 num_piperegs += coredynp.fetchW * 434 (coredynp.instruction_length + coredynp.pc_width) * 435 coredynp.num_hthreads;//PC is used to feed branch predictor in ID 436 /* pipe stage 1D/Renaming*/ 437 num_piperegs += coredynp.decodeW * 438 (coredynp.instruction_length + coredynp.pc_width) * 439 coredynp.num_hthreads;//PC is for branch exe in later stage. 440 /* pipe stage Renaming/wire_drive */ 441 num_piperegs += coredynp.decodeW * 442 (coredynp.instruction_length + coredynp.pc_width); 443 /* pipe stage Renaming/IssueQ */ 444 //3*coredynp.phy_ireg_width means 2 sources and 1 dest 445 num_piperegs += coredynp.issueW * 446 (coredynp.instruction_length + coredynp.pc_width + 3 * 447 coredynp.phy_ireg_width) * coredynp.num_hthreads; 448 /* pipe stage IssueQ/Dispatch */ 449 num_piperegs += coredynp.issueW * 450 (coredynp.instruction_length + 3 * coredynp.phy_ireg_width); 451 /* pipe stage Dispatch/EXE */ 452 453 num_piperegs += coredynp.issueW * 454 (3 * coredynp.phy_ireg_width + coredynp.pc_width + 455 pow(2.0, opcode_length)/*+2*powers (2,reg_length)*/); 456 /* 2^opcode_length means the total decoded signal for the opcode*/ 457 num_piperegs += coredynp.issueW * 458 (2 * coredynp.int_data_width + pow(2.0, opcode_length) 459 /*+2*powers (2,reg_length)*/); 460 /*2 source operands in EXE; Assume 2EXE stages* since we do not really distinguish OP*/ 461 num_piperegs += coredynp.issueW * 462 (2 * coredynp.int_data_width + pow(2.0, opcode_length) 463 /*+2*powers (2,reg_length)*/); 464 /* pipe stage EXE/MEM, data need to be read/write, address*/ 465 //memory Opcode still need to be passed 466 num_piperegs += coredynp.issueW * 467 (coredynp.int_data_width + coredynp.v_address_width + 468 pow(2.0, opcode_length)/*+2*powers (2,reg_length)*/); 469 /* pipe stage MEM/WB; result data, writeback regs */ 470 num_piperegs += coredynp.issueW * 471 (coredynp.int_data_width + coredynp.phy_ireg_width 472 /* powers (2,opcode_length) + 473 (2,opcode_length)+2*powers (2,reg_length)*/); 474 /* pipe stage WB/CM ; result data, regs need to be updated, address for resolve memory ops in ROB's top*/ 475 num_piperegs += coredynp.commitW * 476 (coredynp.int_data_width + coredynp.v_address_width + 477 coredynp.phy_ireg_width 478 /*+ powers (2,opcode_length)*2*powers (2,reg_length)*/) * 479 coredynp.num_hthreads; 480 num_stages = 12; 481 482 } 483 484 /* assume 50% extra in control registers and interrupt registers (rule of thumb) */ 485 num_piperegs = num_piperegs * 1.5; 486 tot_stage_vector = num_piperegs; 487 per_stage_vector = tot_stage_vector / num_stages; 488 489 if (coredynp.core_ty == Inorder) { 490 if (coredynp.pipeline_stages > 6) 491 num_piperegs = per_stage_vector * coredynp.pipeline_stages; 492 } else { //OOO 493 if (coredynp.pipeline_stages > 12) 494 num_piperegs = per_stage_vector * coredynp.pipeline_stages; 495 } 496 } 497 498} 499 500FunctionalUnit::FunctionalUnit(XMLNode* _xml_data, 501 InputParameter* interface_ip_, 502 const CoreParameters & _core_params, 503 const CoreStatistics & _core_stats, 504 enum FU_type fu_type_) 505 : McPATComponent(_xml_data), 506 interface_ip(*interface_ip_), core_params(_core_params), 507 core_stats(_core_stats), fu_type(fu_type_) { 508 double area_t; 509 double leakage; 510 double gate_leakage; 511 double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio(); 512 clockRate = core_params.clockRate; 513 514 uca_org_t result2; 515 // Temp name for the following function call 516 name = "Functional Unit"; 517 518 result2 = init_interface(&interface_ip, name); 519 520 if (core_params.Embedded) { 521 if (fu_type == FPU) { 522 num_fu=core_params.num_fpus; 523 //area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 524 area_t = 4.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2 The base number 525 //4.47 contains both VFP and NEON processing unit, VFP is about 40% and NEON is about 60% 526 if (g_ip->F_sz_nm>90) 527 area_t = 4.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 528 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W 529 gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W 530 //energy = 0.3529/10*1e-9;//this is the energy(nJ) for a FP instruction in FPU usually it can have up to 20 cycles. 531// base_energy = coredynp.core_ty==Inorder? 0: 89e-3*3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch) 532// base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2); 533 base_energy = 0; 534 per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per Hz energy(nJ) 535 //FPU power from Sandia's processor sizing tech report 536 FU_height=(18667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data 537 } else if (fu_type == ALU) { 538 num_fu=core_params.num_alus; 539 area_t = 280*260*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl 540 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W 541 gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2; 542// base_energy = coredynp.core_ty==Inorder? 0:89e-3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch) 543// base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2); 544 base_energy = 0; 545 per_access_energy = 1.15/3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ) 546 FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU 547 548 } else if (fu_type == MUL) { 549 num_fu=core_params.num_muls; 550 area_t = 280*260*3*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl 551 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W 552 gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2; 553// base_energy = coredynp.core_ty==Inorder? 0:89e-3*2; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch) 554// base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2); 555 base_energy = 0; 556 per_access_energy = 1.15*2/3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ), coefficient based on Wattch 557 FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data 558 } else { 559 cout<<"Unknown Functional Unit Type"<<endl; 560 exit(0); 561 } 562 per_access_energy *=0.5;//According to ARM data embedded processor has much lower per acc energy 563 } else { 564 if (fu_type == FPU) { 565 name = "Floating Point Unit(s)"; 566 num_fu = core_params.num_fpus; 567 area_t = 8.47 * 1e6 * (g_ip->F_sz_nm * g_ip->F_sz_nm / 90.0 / 568 90.0);//this is um^2 569 if (g_ip->F_sz_nm > 90) 570 area_t = 8.47 * 1e6 * 571 g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 572 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W 573 gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W 574 //W The base energy of ALU average numbers from Intel 4G and 575 //773Mhz (Wattch) 576 base_energy = core_params.core_ty == Inorder ? 0 : 89e-3 * 3; 577 base_energy *= (g_tp.peri_global.Vdd * g_tp.peri_global.Vdd / 1.2 / 578 1.2); 579 per_access_energy = 1.15*3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per op energy(nJ) 580 FU_height=(38667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data 581 } else if (fu_type == ALU) { 582 name = "Integer ALU(s)"; 583 num_fu = core_params.num_alus; 584 //this is um^2 ALU + MUl 585 area_t = 280 * 260 * 2 * g_tp.scaling_factor.logic_scaling_co_eff; 586 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W 587 gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2; 588 //W The base energy of ALU average numbers from Intel 4G and 773Mhz 589 //(Wattch) 590 base_energy = core_params.core_ty == Inorder ? 0 : 89e-3; 591 base_energy *= (g_tp.peri_global.Vdd * g_tp.peri_global.Vdd / 1.2 / 592 1.2); 593 per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ) 594 FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU 595 } else if (fu_type == MUL) { 596 name = "Multiply/Divide Unit(s)"; 597 num_fu = core_params.num_muls; 598 //this is um^2 ALU + MUl 599 area_t = 280 * 260 * 2 * 3 * 600 g_tp.scaling_factor.logic_scaling_co_eff; 601 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W 602 gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2; 603 //W The base energy of ALU average numbers from Intel 4G and 773Mhz 604 //(Wattch) 605 base_energy = core_params.core_ty == Inorder ? 0 : 89e-3 * 2; 606 base_energy *= (g_tp.peri_global.Vdd * g_tp.peri_global.Vdd / 1.2 / 607 1.2); 608 per_access_energy = 1.15*2/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ), coefficient based on Wattch 609 FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data 610 } else { 611 cout << "Unknown Functional Unit Type" << endl; 612 exit(0); 613 } 614 } 615 616 area.set_area(area_t*num_fu); 617 power.readOp.leakage = leakage * num_fu; 618 power.readOp.gate_leakage = gate_leakage * num_fu; 619 620 double long_channel_device_reduction = 621 longer_channel_device_reduction(Core_device, core_params.core_ty); 622 power.readOp.longer_channel_leakage = 623 power.readOp.leakage * long_channel_device_reduction; 624 double macro_layout_overhead = g_tp.macro_layout_overhead; 625 area.set_area(area.get_area()*macro_layout_overhead); 626} 627 628void FunctionalUnit::computeEnergy() { 629 double pppm_t[4] = {1, 1, 1, 1}; 630 double FU_duty_cycle; 631 double sckRation = g_tp.sckt_co_eff; 632 633 // TDP power calculation 634 //2 means two source operands needs to be passed for each int instruction. 635 set_pppm(pppm_t, 2, 2, 2, 2); 636 tdp_stats.readAc.access = num_fu; 637 if (fu_type == FPU) { 638 FU_duty_cycle = core_stats.FPU_duty_cycle; 639 } else if (fu_type == ALU) { 640 FU_duty_cycle = core_stats.ALU_duty_cycle; 641 } else if (fu_type == MUL) { 642 FU_duty_cycle = core_stats.MUL_duty_cycle; 643 } 644 645 power.readOp.dynamic = 646 per_access_energy * tdp_stats.readAc.access + base_energy / clockRate; 647 power.readOp.dynamic *= sckRation * FU_duty_cycle; 648 649 // Runtime power calculation 650 if (fu_type == FPU) { 651 rtp_stats.readAc.access = core_stats.fpu_accesses; 652 } else if (fu_type == ALU) { 653 rtp_stats.readAc.access = core_stats.ialu_accesses; 654 } else if (fu_type == MUL) { 655 rtp_stats.readAc.access = core_stats.mul_accesses; 656 } 657 658 rt_power.readOp.dynamic = per_access_energy * rtp_stats.readAc.access + 659 base_energy * execution_time; 660 rt_power.readOp.dynamic *= sckRation; 661 662 output_data.area = area.get_area() / 1e6; 663 output_data.peak_dynamic_power = power.readOp.dynamic * clockRate; 664 output_data.subthreshold_leakage_power = 665 (longer_channel_device) ? power.readOp.longer_channel_leakage : 666 power.readOp.leakage; 667 output_data.gate_leakage_power = power.readOp.gate_leakage; 668 output_data.runtime_dynamic_energy = rt_power.readOp.dynamic; 669} 670 671void FunctionalUnit::leakage_feedback(double temperature) 672{ 673 // Update the temperature and initialize the global interfaces. 674 interface_ip.temp = (unsigned int)round(temperature/10.0)*10; 675 676 // init_result is dummy 677 uca_org_t init_result = init_interface(&interface_ip, name); 678 679 // This is part of FunctionalUnit() 680 double area_t, leakage, gate_leakage; 681 double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio(); 682 683 if (fu_type == FPU) 684 { 685 area_t = 4.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2 The base number 686 if (g_ip->F_sz_nm>90) 687 area_t = 4.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 688 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W 689 gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W 690 } 691 else if (fu_type == ALU) 692 { 693 area_t = 280*260*2*num_fu*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl 694 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W 695 gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2; 696 } 697 else if (fu_type == MUL) 698 { 699 area_t = 280*260*2*3*num_fu*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl 700 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W 701 gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2; 702 } 703 else 704 { 705 cout<<"Unknown Functional Unit Type"<<endl; 706 exit(1); 707 } 708 709 power.readOp.leakage = leakage*num_fu; 710 power.readOp.gate_leakage = gate_leakage*num_fu; 711 power.readOp.longer_channel_leakage = 712 longer_channel_device_reduction(Core_device, core_params.core_ty); 713} 714 715UndiffCore::UndiffCore(XMLNode* _xml_data, InputParameter* interface_ip_, 716 const CoreParameters & dyn_p_, 717 bool exist_) 718 : McPATComponent(_xml_data), 719 interface_ip(*interface_ip_), coredynp(dyn_p_), 720 core_ty(coredynp.core_ty), embedded(coredynp.Embedded), 721 pipeline_stage(coredynp.pipeline_stages), 722 num_hthreads(coredynp.num_hthreads), issue_width(coredynp.issueW), 723 exist(exist_) { 724 if (!exist) return; 725 726 name = "Undifferentiated Core"; 727 clockRate = coredynp.clockRate; 728 729 double undifferentiated_core = 0; 730 double core_tx_density = 0; 731 double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio(); 732 double undifferentiated_core_coe; 733 uca_org_t result2; 734 result2 = init_interface(&interface_ip, name); 735 736 //Compute undifferentiated core area at 90nm. 737 if (embedded == false) { 738 //Based on the results of polynomial/log curve fitting based on undifferentiated core of Niagara, Niagara2, Merom, Penyrn, Prescott, Opteron die measurements 739 if (core_ty == OOO) { 740 undifferentiated_core = (3.57 * log(pipeline_stage) - 1.2643) > 0 ? 741 (3.57 * log(pipeline_stage) - 1.2643) : 0; 742 } else if (core_ty == Inorder) { 743 undifferentiated_core = (-2.19 * log(pipeline_stage) + 6.55) > 0 ? 744 (-2.19 * log(pipeline_stage) + 6.55) : 0; 745 } else { 746 cout << "invalid core type" << endl; 747 exit(0); 748 } 749 undifferentiated_core *= (1 + logtwo(num_hthreads) * 0.0716); 750 } else { 751 //Based on the results in paper "parametrized processor models" Sandia Labs 752 if (opt_for_clk) 753 undifferentiated_core_coe = 0.05; 754 else 755 undifferentiated_core_coe = 0; 756 undifferentiated_core = (0.4109 * pipeline_stage - 0.776) * 757 undifferentiated_core_coe; 758 undifferentiated_core *= (1 + logtwo(num_hthreads) * 0.0426); 759 } 760 761 undifferentiated_core *= g_tp.scaling_factor.logic_scaling_co_eff * 762 1e6;//change from mm^2 to um^2 763 core_tx_density = g_tp.scaling_factor.core_tx_density; 764 power.readOp.leakage = undifferentiated_core*(core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W 765 power.readOp.gate_leakage = undifferentiated_core*(core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd; 766 767 double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty); 768 power.readOp.longer_channel_leakage = 769 power.readOp.leakage * long_channel_device_reduction; 770 area.set_area(undifferentiated_core); 771 772 scktRatio = g_tp.sckt_co_eff; 773 power.readOp.dynamic *= scktRatio; 774 power.writeOp.dynamic *= scktRatio; 775 power.searchOp.dynamic *= scktRatio; 776 macro_PR_overhead = g_tp.macro_layout_overhead; 777 area.set_area(area.get_area()*macro_PR_overhead); 778 779 output_data.area = area.get_area() / 1e6; 780 output_data.peak_dynamic_power = power.readOp.dynamic * clockRate; 781 output_data.subthreshold_leakage_power = 782 longer_channel_device ? power.readOp.longer_channel_leakage : 783 power.readOp.leakage; 784 output_data.gate_leakage_power = power.readOp.gate_leakage; 785} 786 787InstructionDecoder::InstructionDecoder(XMLNode* _xml_data, const string _name, 788 bool _is_default, 789 const InputParameter *configure_interface, 790 int opcode_length_, int num_decoders_, 791 bool x86_, 792 double clockRate_, 793 enum Device_ty device_ty_, 794 enum Core_type core_ty_) 795 : McPATComponent(_xml_data), is_default(_is_default), 796 opcode_length(opcode_length_), num_decoders(num_decoders_), x86(x86_), 797 device_ty(device_ty_), core_ty(core_ty_) { 798 /* 799 * Instruction decoder is different from n to 2^n decoders 800 * that are commonly used in row decoders in memory arrays. 801 * The RISC instruction decoder is typically a very simple device. 802 * We can decode an instruction by simply 803 * separating the machine word into small parts using wire slices 804 * The RISC instruction decoder can be approximate by the n to 2^n decoders, 805 * although this approximation usually underestimate power since each decoded 806 * instruction normally has more than 1 active signal. 807 * 808 * However, decoding a CISC instruction word is much more difficult 809 * than the RISC case. A CISC decoder is typically set up as a state machine. 810 * The machine reads the opcode field to determine 811 * what type of instruction it is, 812 * and where the other data values are. 813 * The instruction word is read in piece by piece, 814 * and decisions are made at each stage as to 815 * how the remainder of the instruction word will be read. 816 * (sequencer and ROM are usually needed) 817 * An x86 decoder can be even more complex since 818 * it involve both decoding instructions into u-ops and 819 * merge u-ops when doing micro-ops fusion. 820 */ 821 name = _name; 822 clockRate = clockRate_; 823 bool is_dram = false; 824 double pmos_to_nmos_sizing_r; 825 double load_nmos_width, load_pmos_width; 826 double C_driver_load, R_wire_load; 827 Area cell; 828 829 l_ip = *configure_interface; 830 local_result = init_interface(&l_ip, name); 831 cell.h = g_tp.cell_h_def; 832 cell.w = g_tp.cell_h_def; 833 834 num_decoder_segments = (int)ceil(opcode_length / 18.0); 835 if (opcode_length > 18) opcode_length = 18; 836 num_decoded_signals = (int)pow(2.0, opcode_length); 837 pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio(); 838 load_nmos_width = g_tp.max_w_nmos_ / 2; 839 load_pmos_width = g_tp.max_w_nmos_ * pmos_to_nmos_sizing_r; 840 C_driver_load = 1024 * gate_C(load_nmos_width + load_pmos_width, 0, is_dram); 841 R_wire_load = 3000 * l_ip.F_sz_um * g_tp.wire_outside_mat.R_per_um; 842 843 final_dec = new Decoder( 844 num_decoded_signals, 845 false, 846 C_driver_load, 847 R_wire_load, 848 false/*is_fa*/, 849 false/*is_dram*/, 850 false/*wl_tr*/, //to use peri device 851 cell); 852 853 PredecBlk * predec_blk1 = new PredecBlk( 854 num_decoded_signals, 855 final_dec, 856 0,//Assuming predec and dec are back to back 857 0, 858 1,//Each Predec only drives one final dec 859 false/*is_dram*/, 860 true); 861 PredecBlk * predec_blk2 = new PredecBlk( 862 num_decoded_signals, 863 final_dec, 864 0,//Assuming predec and dec are back to back 865 0, 866 1,//Each Predec only drives one final dec 867 false/*is_dram*/, 868 false); 869 870 PredecBlkDrv * predec_blk_drv1 = new PredecBlkDrv(0, predec_blk1, false); 871 PredecBlkDrv * predec_blk_drv2 = new PredecBlkDrv(0, predec_blk2, false); 872 873 pre_dec = new Predec(predec_blk_drv1, predec_blk_drv2); 874 875 double area_decoder = final_dec->area.get_area() * num_decoded_signals * 876 num_decoder_segments * num_decoders; 877 //double w_decoder = area_decoder / area.get_h(); 878 double area_pre_dec = (predec_blk_drv1->area.get_area() + 879 predec_blk_drv2->area.get_area() + 880 predec_blk1->area.get_area() + 881 predec_blk2->area.get_area()) * 882 num_decoder_segments * num_decoders; 883 area.set_area(area.get_area() + area_decoder + area_pre_dec); 884 double macro_layout_overhead = g_tp.macro_layout_overhead; 885 double chip_PR_overhead = g_tp.chip_layout_overhead; 886 area.set_area(area.get_area()*macro_layout_overhead*chip_PR_overhead); 887 888 inst_decoder_delay_power(); 889 890 double sckRation = g_tp.sckt_co_eff; 891 power.readOp.dynamic *= sckRation; 892 power.writeOp.dynamic *= sckRation; 893 power.searchOp.dynamic *= sckRation; 894 895 double long_channel_device_reduction = 896 longer_channel_device_reduction(device_ty, core_ty); 897 power.readOp.longer_channel_leakage = power.readOp.leakage * 898 long_channel_device_reduction; 899 900 output_data.area = area.get_area() / 1e6; 901 output_data.peak_dynamic_power = power.readOp.dynamic * clockRate; 902 output_data.subthreshold_leakage_power = power.readOp.leakage; 903 output_data.gate_leakage_power = power.readOp.gate_leakage; 904} 905 906void InstructionDecoder::inst_decoder_delay_power() { 907 908 double dec_outrisetime; 909 double inrisetime = 0, outrisetime; 910 double pppm_t[4] = {1, 1, 1, 1}; 911 double squencer_passes = x86 ? 2 : 1; 912 913 outrisetime = pre_dec->compute_delays(inrisetime); 914 dec_outrisetime = final_dec->compute_delays(outrisetime); 915 set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments, squencer_passes*num_decoder_segments, num_decoder_segments); 916 power = power + pre_dec->power * pppm_t; 917 set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments*num_decoded_signals, 918 num_decoder_segments*num_decoded_signals, squencer_passes*num_decoder_segments); 919 power = power + final_dec->power * pppm_t; 920} 921 922void InstructionDecoder::leakage_feedback(double temperature) { 923 l_ip.temp = (unsigned int)round(temperature/10.0)*10; 924 uca_org_t init_result = init_interface(&l_ip, name); // init_result is dummy 925 926 final_dec->leakage_feedback(temperature); 927 pre_dec->leakage_feedback(temperature); 928 929 double pppm_t[4] = {1,1,1,1}; 930 double squencer_passes = x86?2:1; 931 932 set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments, squencer_passes*num_decoder_segments, num_decoder_segments); 933 power = pre_dec->power*pppm_t; 934 935 set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments*num_decoded_signals,num_decoder_segments*num_decoded_signals, squencer_passes*num_decoder_segments); 936 power = power + final_dec->power*pppm_t; 937 938 double sckRation = g_tp.sckt_co_eff; 939 940 power.readOp.dynamic *= sckRation; 941 power.writeOp.dynamic *= sckRation; 942 power.searchOp.dynamic *= sckRation; 943 944 double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty); 945 power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction; 946} 947 948InstructionDecoder::~InstructionDecoder() { 949 local_result.cleanup(); 950 951 delete final_dec; 952 953 delete pre_dec->blk1; 954 delete pre_dec->blk2; 955 delete pre_dec->drv1; 956 delete pre_dec->drv2; 957 delete pre_dec; 958} 959