logic.cc revision 10152
1/***************************************************************************** 2 * McPAT 3 * SOFTWARE LICENSE AGREEMENT 4 * Copyright 2012 Hewlett-Packard Development Company, L.P. 5 * All Rights Reserved 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are 9 * met: redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer; 11 * redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution; 14 * neither the name of the copyright holders nor the names of its 15 * contributors may be used to endorse or promote products derived from 16 * this software without specific prior written permission. 17 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” 29 * 30 ***************************************************************************/ 31 32#include "logic.h" 33 34 35//selection_logic 36selection_logic::selection_logic( 37 bool _is_default, 38 int win_entries_, 39 int issue_width_, 40 const InputParameter *configure_interface, 41 enum Device_ty device_ty_, 42 enum Core_type core_ty_) 43 //const ParseXML *_XML_interface) 44 :is_default(_is_default), 45 win_entries(win_entries_), 46 issue_width(issue_width_), 47 device_ty(device_ty_), 48 core_ty(core_ty_) 49 { 50 //uca_org_t result2; 51 l_ip=*configure_interface; 52 local_result = init_interface(&l_ip); 53 //init_tech_params(l_ip.F_sz_um, false); 54 //win_entries=numIBEntries;//IQentries; 55 //issue_width=issueWidth; 56 selection_power(); 57 double sckRation = g_tp.sckt_co_eff; 58 power.readOp.dynamic *= sckRation; 59 power.writeOp.dynamic *= sckRation; 60 power.searchOp.dynamic *= sckRation; 61 62 double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty); 63 power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction; 64 } 65 66void selection_logic::selection_power() 67{//based on cost effective superscalar processor TR pp27-31 68 double Ctotal, Cor, Cpencode; 69 int num_arbiter; 70 double WSelORn, WSelORprequ, WSelPn, WSelPp, WSelEnn, WSelEnp; 71 72 //TODO: the 0.8um process data is used. 73 WSelORn = 12.5 * l_ip.F_sz_um;//this was 10 micron for the 0.8 micron process 74 WSelORprequ = 50 * l_ip.F_sz_um;//this was 40 micron for the 0.8 micron process 75 WSelPn = 12.5 * l_ip.F_sz_um;//this was 10mcron for the 0.8 micron process 76 WSelPp = 18.75 * l_ip.F_sz_um;//this was 15 micron for the 0.8 micron process 77 WSelEnn = 6.25 * l_ip.F_sz_um;//this was 5 micron for the 0.8 micron process 78 WSelEnp = 12.5 * l_ip.F_sz_um;//this was 10 micron for the 0.8 micron process 79 80 81 Ctotal=0; 82 num_arbiter=1; 83 while(win_entries > 4) 84 { 85 win_entries = (int)ceil((double)win_entries / 4.0); 86 num_arbiter += win_entries; 87 } 88 //the 4-input OR logic to generate anyreq 89 Cor = 4 * drain_C_(WSelORn,NCH,1,1, g_tp.cell_h_def) + drain_C_(WSelORprequ,PCH,1,1, g_tp.cell_h_def); 90 power.readOp.gate_leakage = cmos_Ig_leakage(WSelORn, WSelORprequ, 4, nor)*g_tp.peri_global.Vdd; 91 92 //The total capacity of the 4-bit priority encoder 93 Cpencode = drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,1, 1, g_tp.cell_h_def) + 94 2*drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,2, 1, g_tp.cell_h_def) + 95 3*drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,3, 1, g_tp.cell_h_def) + 96 4*drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,4, 1, g_tp.cell_h_def) +//precompute priority logic 97 2*4*gate_C(WSelEnn+WSelEnp,20.0)+ 98 4*drain_C_(WSelEnn,NCH,1, 1, g_tp.cell_h_def) + 2*4*drain_C_(WSelEnp,PCH,1, 1, g_tp.cell_h_def)+//enable logic 99 (2*4+2*3+2*2+2)*gate_C(WSelPn+WSelPp,10.0);//requests signal 100 101 Ctotal += issue_width * num_arbiter*(Cor+Cpencode); 102 103 power.readOp.dynamic = Ctotal*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*2;//2 means the abitration signal need to travel round trip 104 power.readOp.leakage = issue_width * num_arbiter * 105 (cmos_Isub_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p 106 + cmos_Isub_leakage(WSelPn, WSelPp, 3, nor)//grant2p 107 + cmos_Isub_leakage(WSelPn, WSelPp, 4, nor)//grant3p 108 + cmos_Isub_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic 109 + cmos_Isub_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant sIsubnals 110 )*g_tp.peri_global.Vdd; 111 power.readOp.gate_leakage = issue_width * num_arbiter * 112 (cmos_Ig_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p 113 + cmos_Ig_leakage(WSelPn, WSelPp, 3, nor)//grant2p 114 + cmos_Ig_leakage(WSelPn, WSelPp, 4, nor)//grant3p 115 + cmos_Ig_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic 116 + cmos_Ig_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant signals 117 )*g_tp.peri_global.Vdd; 118} 119 120 121dep_resource_conflict_check::dep_resource_conflict_check( 122 const InputParameter *configure_interface, 123 const CoreDynParam & dyn_p_, 124 int compare_bits_, 125 bool _is_default) 126 : l_ip(*configure_interface), 127 coredynp(dyn_p_), 128 compare_bits(compare_bits_), 129 is_default(_is_default) 130{ 131 Wcompn = 25 * l_ip.F_sz_um;//this was 20.0 micron for the 0.8 micron process 132 Wevalinvp = 25 * l_ip.F_sz_um;//this was 20.0 micron for the 0.8 micron process 133 Wevalinvn = 100 * l_ip.F_sz_um;//this was 80.0 mcron for the 0.8 micron process 134 Wcomppreequ = 50 * l_ip.F_sz_um;//this was 40.0 micron for the 0.8 micron process 135 WNORn = 6.75 * l_ip.F_sz_um;//this was 5.4 micron for the 0.8 micron process 136 WNORp = 38.125 * l_ip.F_sz_um;//this was 30.5 micron for the 0.8 micron process 137 138 local_result = init_interface(&l_ip); 139 140 if (coredynp.core_ty==Inorder) 141 compare_bits += 16 + 8 + 8;//TODO: opcode bits + log(shared resources) + REG TAG BITS-->opcode comparator 142 else 143 compare_bits += 16 + 8 + 8; 144 145 conflict_check_power(); 146 double sckRation = g_tp.sckt_co_eff; 147 power.readOp.dynamic *= sckRation; 148 power.writeOp.dynamic *= sckRation; 149 power.searchOp.dynamic *= sckRation; 150 151} 152 153void dep_resource_conflict_check::conflict_check_power() 154{ 155 double Ctotal; 156 int num_comparators; 157 num_comparators = 3*((coredynp.decodeW) * (coredynp.decodeW)-coredynp.decodeW);//2(N*N-N) is used for source to dest comparison, (N*N-N) is used for dest to dest comparision. 158 //When decode-width ==1, no dcl logic 159 160 Ctotal = num_comparators * compare_cap(); 161 //printf("%i,%s\n",XML_interface->sys.core[0].predictor.predictor_entries,XML_interface->sys.core[0].predictor.prediction_scheme); 162 163 power.readOp.dynamic=Ctotal*/*CLOCKRATE*/g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/*AF*/; 164 power.readOp.leakage=num_comparators*compare_bits*2*simplified_nmos_leakage(Wcompn, false); 165 166 double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty); 167 power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction; 168 power.readOp.gate_leakage=num_comparators*compare_bits*2*cmos_Ig_leakage(Wcompn, 0, 2, nmos); 169 170} 171 172/* estimate comparator power consumption (this comparator is similar 173 to the tag-match structure in a CAM */ 174double dep_resource_conflict_check::compare_cap() 175{ 176 double c1, c2; 177 178 WNORp = WNORp * compare_bits/2.0;//resize the big NOR gate at the DCL according to fan in. 179 /* bottom part of comparator */ 180 c2 = (compare_bits)*(drain_C_(Wcompn,NCH,1,1, g_tp.cell_h_def)+drain_C_(Wcompn,NCH,2,1, g_tp.cell_h_def))+ 181 drain_C_(Wevalinvp,PCH,1,1, g_tp.cell_h_def) + drain_C_(Wevalinvn,NCH,1,1, g_tp.cell_h_def); 182 183 /* top part of comparator */ 184 c1 = (compare_bits)*(drain_C_(Wcompn,NCH,1,1, g_tp.cell_h_def)+drain_C_(Wcompn,NCH,2,1, g_tp.cell_h_def)+ 185 drain_C_(Wcomppreequ,NCH,1,1, g_tp.cell_h_def)) + gate_C(WNORn + WNORp,10.0) + 186 drain_C_(WNORp,NCH,2,1, g_tp.cell_h_def) + compare_bits*drain_C_(WNORn,NCH,2,1, g_tp.cell_h_def); 187 return(c1 + c2); 188 189} 190 191void dep_resource_conflict_check::leakage_feedback(double temperature) 192{ 193 l_ip.temp = (unsigned int)round(temperature/10.0)*10; 194 uca_org_t init_result = init_interface(&l_ip); // init_result is dummy 195 196 // This is part of conflict_check_power() 197 int num_comparators = 3*((coredynp.decodeW) * (coredynp.decodeW)-coredynp.decodeW);//2(N*N-N) is used for source to dest comparison, (N*N-N) is used for dest to dest comparision. 198 power.readOp.leakage=num_comparators*compare_bits*2*simplified_nmos_leakage(Wcompn, false); 199 200 double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty); 201 power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction; 202 power.readOp.gate_leakage=num_comparators*compare_bits*2*cmos_Ig_leakage(Wcompn, 0, 2, nmos); 203} 204 205//TODO: add inverter and transmission gate base DFF. 206 207DFFCell::DFFCell( 208 bool _is_dram, 209 double _WdecNANDn, 210 double _WdecNANDp, 211 double _cell_load, 212 const InputParameter *configure_interface) 213:is_dram(_is_dram), 214cell_load(_cell_load), 215WdecNANDn(_WdecNANDn), 216WdecNANDp(_WdecNANDp) 217{//this model is based on the NAND2 based DFF. 218 l_ip=*configure_interface; 219// area.set_area(730*l_ip.F_sz_um*l_ip.F_sz_um); 220 area.set_area(5*compute_gate_area(NAND, 2,WdecNANDn,WdecNANDp, g_tp.cell_h_def) 221 + compute_gate_area(NAND, 2,WdecNANDn,WdecNANDn, g_tp.cell_h_def)); 222 223 224} 225 226 227double DFFCell::fpfp_node_cap(unsigned int fan_in, unsigned int fan_out) 228{ 229 double Ctotal = 0; 230 //printf("WdecNANDn = %E\n", WdecNANDn); 231 232 /* part 1: drain cap of NAND gate */ 233 Ctotal += drain_C_(WdecNANDn, NCH, 2, 1, g_tp.cell_h_def, is_dram) + fan_in * drain_C_(WdecNANDp, PCH, 1, 1, g_tp.cell_h_def, is_dram); 234 235 /* part 2: gate cap of NAND gates */ 236 Ctotal += fan_out * gate_C(WdecNANDn + WdecNANDp, 0, is_dram); 237 238 return Ctotal; 239} 240 241 242void DFFCell::compute_DFF_cell() 243{ 244 double c1, c2, c3, c4, c5, c6; 245 /* node 5 and node 6 are identical to node 1 in capacitance */ 246 c1 = c5 = c6 = fpfp_node_cap(2, 1); 247 c2 = fpfp_node_cap(2, 3); 248 c3 = fpfp_node_cap(3, 2); 249 c4 = fpfp_node_cap(2, 2); 250 251 //cap-load of the clock signal in each Dff, actually the clock signal only connected to one NAND2 252 clock_cap= 2 * gate_C(WdecNANDn + WdecNANDp, 0, is_dram); 253 e_switch.readOp.dynamic += (c4 + c1 + c2 + c3 + c5 + c6 + 2*cell_load)*0.5*g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;; 254 255 /* no 1/2 for e_keep and e_clock because clock signal switches twice in one cycle */ 256 e_keep_1.readOp.dynamic += c3 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ; 257 e_keep_0.readOp.dynamic += c2 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ; 258 e_clock.readOp.dynamic += clock_cap* g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;; 259 260 /* static power */ 261 e_switch.readOp.leakage += (cmos_Isub_leakage(WdecNANDn, WdecNANDp, 2, nand)*5//5 NAND2 and 1 NAND3 in a DFF 262 + cmos_Isub_leakage(WdecNANDn, WdecNANDn, 3, nand))*g_tp.peri_global.Vdd; 263 e_switch.readOp.gate_leakage += (cmos_Ig_leakage(WdecNANDn, WdecNANDp, 2, nand)*5//5 NAND2 and 1 NAND3 in a DFF 264 + cmos_Ig_leakage(WdecNANDn, WdecNANDn, 3, nand))*g_tp.peri_global.Vdd; 265 //printf("leakage =%E\n",cmos_Ileak(1, is_dram) ); 266} 267 268Pipeline::Pipeline( 269 const InputParameter *configure_interface, 270 const CoreDynParam & dyn_p_, 271 enum Device_ty device_ty_, 272 bool _is_core_pipeline, 273 bool _is_default) 274: l_ip(*configure_interface), 275 coredynp(dyn_p_), 276 device_ty(device_ty_), 277 is_core_pipeline(_is_core_pipeline), 278 is_default(_is_default), 279 num_piperegs(0.0) 280 281 { 282 local_result = init_interface(&l_ip); 283 if (!coredynp.Embedded) 284 process_ind = true; 285 else 286 process_ind = false; 287 WNANDn = (process_ind)? 25 * l_ip.F_sz_um : g_tp.min_w_nmos_ ;//this was 20 micron for the 0.8 micron process 288 WNANDp = (process_ind)? 37.5 * l_ip.F_sz_um : g_tp.min_w_nmos_*pmos_to_nmos_sz_ratio();//this was 30 micron for the 0.8 micron process 289 load_per_pipeline_stage = 2*gate_C(WNANDn + WNANDp, 0, false); 290 compute(); 291 292} 293 294void Pipeline::compute() 295{ 296 compute_stage_vector(); 297 DFFCell pipe_reg(false, WNANDn,WNANDp, load_per_pipeline_stage, &l_ip); 298 pipe_reg.compute_DFF_cell(); 299 300 double clock_power_pipereg = num_piperegs * pipe_reg.e_clock.readOp.dynamic; 301 //******************pipeline power: currently, we average all the possibilities of the states of DFFs in the pipeline. A better way to do it is to consider 302 //the harming distance of two consecutive signals, However McPAT does not have plan to do this in near future as it focuses on worst case power. 303 double pipe_reg_power = num_piperegs * (pipe_reg.e_switch.readOp.dynamic+pipe_reg.e_keep_0.readOp.dynamic+pipe_reg.e_keep_1.readOp.dynamic)/3+clock_power_pipereg; 304 double pipe_reg_leakage = num_piperegs * pipe_reg.e_switch.readOp.leakage; 305 double pipe_reg_gate_leakage = num_piperegs * pipe_reg.e_switch.readOp.gate_leakage; 306 power.readOp.dynamic +=pipe_reg_power; 307 power.readOp.leakage +=pipe_reg_leakage; 308 power.readOp.gate_leakage +=pipe_reg_gate_leakage; 309 area.set_area(num_piperegs * pipe_reg.area.get_area()); 310 311 double long_channel_device_reduction = longer_channel_device_reduction(device_ty, coredynp.core_ty); 312 power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction; 313 314 315 double sckRation = g_tp.sckt_co_eff; 316 power.readOp.dynamic *= sckRation; 317 power.writeOp.dynamic *= sckRation; 318 power.searchOp.dynamic *= sckRation; 319 double macro_layout_overhead = g_tp.macro_layout_overhead; 320 if (!coredynp.Embedded) 321 area.set_area(area.get_area()*macro_layout_overhead); 322} 323 324void Pipeline::compute_stage_vector() 325{ 326 double num_stages, tot_stage_vector, per_stage_vector; 327 int opcode_length = coredynp.x86? coredynp.micro_opcode_length:coredynp.opcode_length; 328 //Hthread = thread_clock_gated? 1:num_thread; 329 330 if (!is_core_pipeline) 331 { 332 num_piperegs=l_ip.pipeline_stages*l_ip.per_stage_vector;//The number of pipeline stages are calculated based on the achievable throughput and required throughput 333 } 334 else 335 { 336 if (coredynp.core_ty==Inorder) 337 { 338 /* assume 6 pipe stages and try to estimate bits per pipe stage */ 339 /* pipe stage 0/IF */ 340 num_piperegs += coredynp.pc_width*2*coredynp.num_hthreads; 341 /* pipe stage IF/ID */ 342 num_piperegs += coredynp.fetchW*(coredynp.instruction_length + coredynp.pc_width)*coredynp.num_hthreads; 343 /* pipe stage IF/ThreadSEL */ 344 if (coredynp.multithreaded) num_piperegs += coredynp.num_hthreads*coredynp.perThreadState; //8 bit thread states 345 /* pipe stage ID/EXE */ 346 num_piperegs += coredynp.decodeW*(coredynp.instruction_length + coredynp.pc_width + pow(2.0,opcode_length)+ 2*coredynp.int_data_width)*coredynp.num_hthreads; 347 /* pipe stage EXE/MEM */ 348 num_piperegs += coredynp.issueW*(3 * coredynp.arch_ireg_width + pow(2.0,opcode_length) + 8*2*coredynp.int_data_width/*+2*powers (2,reg_length)*/); 349 /* pipe stage MEM/WB the 2^opcode_length means the total decoded signal for the opcode*/ 350 num_piperegs += coredynp.issueW*(2*coredynp.int_data_width + pow(2.0,opcode_length) + 8*2*coredynp.int_data_width/*+2*powers (2,reg_length)*/); 351// /* pipe stage 5/6 */ 352// num_piperegs += issueWidth*(data_width + powers (2,opcode_length)/*+2*powers (2,reg_length)*/); 353// /* pipe stage 6/7 */ 354// num_piperegs += issueWidth*(data_width + powers (2,opcode_length)/*+2*powers (2,reg_length)*/); 355// /* pipe stage 7/8 */ 356// num_piperegs += issueWidth*(data_width + powers (2,opcode_length)/**2*powers (2,reg_length)*/); 357// /* assume 50% extra in control signals (rule of thumb) */ 358 num_stages=6; 359 360 } 361 else 362 { 363 /* assume 12 stage pipe stages and try to estimate bits per pipe stage */ 364 /*OOO: Fetch, decode, rename, IssueQ, dispatch, regread, EXE, MEM, WB, CM */ 365 366 /* pipe stage 0/1F*/ 367 num_piperegs += coredynp.pc_width*2*coredynp.num_hthreads ;//PC and Next PC 368 /* pipe stage IF/ID */ 369 num_piperegs += coredynp.fetchW*(coredynp.instruction_length + coredynp.pc_width)*coredynp.num_hthreads;//PC is used to feed branch predictor in ID 370 /* pipe stage 1D/Renaming*/ 371 num_piperegs += coredynp.decodeW*(coredynp.instruction_length + coredynp.pc_width)*coredynp.num_hthreads;//PC is for branch exe in later stage. 372 /* pipe stage Renaming/wire_drive */ 373 num_piperegs += coredynp.decodeW*(coredynp.instruction_length + coredynp.pc_width); 374 /* pipe stage Renaming/IssueQ */ 375 num_piperegs += coredynp.issueW*(coredynp.instruction_length + coredynp.pc_width + 3*coredynp.phy_ireg_width)*coredynp.num_hthreads;//3*coredynp.phy_ireg_width means 2 sources and 1 dest 376 /* pipe stage IssueQ/Dispatch */ 377 num_piperegs += coredynp.issueW*(coredynp.instruction_length + 3 * coredynp.phy_ireg_width); 378 /* pipe stage Dispatch/EXE */ 379 380 num_piperegs += coredynp.issueW*(3 * coredynp.phy_ireg_width + coredynp.pc_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/); 381 /* 2^opcode_length means the total decoded signal for the opcode*/ 382 num_piperegs += coredynp.issueW*(2*coredynp.int_data_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/); 383 /*2 source operands in EXE; Assume 2EXE stages* since we do not really distinguish OP*/ 384 num_piperegs += coredynp.issueW*(2*coredynp.int_data_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/); 385 /* pipe stage EXE/MEM, data need to be read/write, address*/ 386 num_piperegs += coredynp.issueW*(coredynp.int_data_width + coredynp.v_address_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/);//memory Opcode still need to be passed 387 /* pipe stage MEM/WB; result data, writeback regs */ 388 num_piperegs += coredynp.issueW*(coredynp.int_data_width + coredynp.phy_ireg_width /* powers (2,opcode_length) + (2,opcode_length)+2*powers (2,reg_length)*/); 389 /* pipe stage WB/CM ; result data, regs need to be updated, address for resolve memory ops in ROB's top*/ 390 num_piperegs += coredynp.commitW*(coredynp.int_data_width + coredynp.v_address_width + coredynp.phy_ireg_width/*+ powers (2,opcode_length)*2*powers (2,reg_length)*/)*coredynp.num_hthreads; 391// if (multithreaded) 392// { 393// 394// } 395 num_stages=12; 396 397 } 398 399 /* assume 50% extra in control registers and interrupt registers (rule of thumb) */ 400 num_piperegs = num_piperegs * 1.5; 401 tot_stage_vector=num_piperegs; 402 per_stage_vector=tot_stage_vector/num_stages; 403 404 if (coredynp.core_ty==Inorder) 405 { 406 if (coredynp.pipeline_stages>6) 407 num_piperegs= per_stage_vector*coredynp.pipeline_stages; 408 } 409 else//OOO 410 { 411 if (coredynp.pipeline_stages>12) 412 num_piperegs= per_stage_vector*coredynp.pipeline_stages; 413 } 414 } 415 416} 417 418FunctionalUnit::FunctionalUnit(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, enum FU_type fu_type_) 419:XML(XML_interface), 420 ithCore(ithCore_), 421 interface_ip(*interface_ip_), 422 coredynp(dyn_p_), 423 fu_type(fu_type_) 424{ 425 double area_t;//, leakage, gate_leakage; 426 double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio(); 427 clockRate = coredynp.clockRate; 428 executionTime = coredynp.executionTime; 429 430 //XML_interface=_XML_interface; 431 uca_org_t result2; 432 result2 = init_interface(&interface_ip); 433 if (XML->sys.Embedded) 434 { 435 if (fu_type == FPU) 436 { 437 num_fu=coredynp.num_fpus; 438 //area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 439 area_t = 4.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2 The base number 440 //4.47 contains both VFP and NEON processing unit, VFP is about 40% and NEON is about 60% 441 if (g_ip->F_sz_nm>90) 442 area_t = 4.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 443 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W 444 gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W 445 //energy = 0.3529/10*1e-9;//this is the energy(nJ) for a FP instruction in FPU usually it can have up to 20 cycles. 446// base_energy = coredynp.core_ty==Inorder? 0: 89e-3*3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch) 447// base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2); 448 base_energy = 0; 449 per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per Hz energy(nJ) 450 //FPU power from Sandia's processor sizing tech report 451 FU_height=(18667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data 452 } 453 else if (fu_type == ALU) 454 { 455 num_fu=coredynp.num_alus; 456 area_t = 280*260*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl 457 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W 458 gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2; 459// base_energy = coredynp.core_ty==Inorder? 0:89e-3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch) 460// base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2); 461 base_energy = 0; 462 per_access_energy = 1.15/3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ) 463 FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU 464 465 } 466 else if (fu_type == MUL) 467 { 468 num_fu=coredynp.num_muls; 469 area_t = 280*260*3*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl 470 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W 471 gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2; 472// base_energy = coredynp.core_ty==Inorder? 0:89e-3*2; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch) 473// base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2); 474 base_energy = 0; 475 per_access_energy = 1.15*2/3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ), coefficient based on Wattch 476 FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data 477 } 478 else 479 { 480 cout<<"Unknown Functional Unit Type"<<endl; 481 exit(0); 482 } 483 per_access_energy *=0.5;//According to ARM data embedded processor has much lower per acc energy 484 } 485 else 486 { 487 if (fu_type == FPU) 488 { 489 num_fu=coredynp.num_fpus; 490 //area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 491 area_t = 8.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2 492 if (g_ip->F_sz_nm>90) 493 area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 494 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W 495 gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W 496 //energy = 0.3529/10*1e-9;//this is the energy(nJ) for a FP instruction in FPU usually it can have up to 20 cycles. 497 base_energy = coredynp.core_ty==Inorder? 0: 89e-3*3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch) 498 base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2); 499 per_access_energy = 1.15*3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per op energy(nJ) 500 FU_height=(38667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data 501 } 502 else if (fu_type == ALU) 503 { 504 num_fu=coredynp.num_alus; 505 area_t = 280*260*2*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl 506 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W 507 gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2; 508 base_energy = coredynp.core_ty==Inorder? 0:89e-3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch) 509 base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2); 510 per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ) 511 FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU 512 513 } 514 else if (fu_type == MUL) 515 { 516 num_fu=coredynp.num_muls; 517 area_t = 280*260*2*3*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl 518 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W 519 gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2; 520 base_energy = coredynp.core_ty==Inorder? 0:89e-3*2; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch) 521 base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2); 522 per_access_energy = 1.15*2/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ), coefficient based on Wattch 523 FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data 524 } 525 else 526 { 527 cout<<"Unknown Functional Unit Type"<<endl; 528 exit(0); 529 } 530 } 531 //IEXEU, simple ALU and FPU 532 // double C_ALU, C_EXEU, C_FPU; //Lum Equivalent capacitance of IEXEU and FPU. Based on Intel and Sun 90nm process fabracation. 533 // 534 // C_ALU = 0.025e-9;//F 535 // C_EXEU = 0.05e-9; //F 536 // C_FPU = 0.35e-9;//F 537 area.set_area(area_t*num_fu); 538 leakage *= num_fu; 539 gate_leakage *=num_fu; 540 double macro_layout_overhead = g_tp.macro_layout_overhead; 541// if (!XML->sys.Embedded) 542 area.set_area(area.get_area()*macro_layout_overhead); 543} 544 545void FunctionalUnit::computeEnergy(bool is_tdp) 546{ 547 double pppm_t[4] = {1,1,1,1}; 548 double FU_duty_cycle; 549 if (is_tdp) 550 { 551 552 553 set_pppm(pppm_t, 2, 2, 2, 2);//2 means two source operands needs to be passed for each int instruction. 554 if (fu_type == FPU) 555 { 556 stats_t.readAc.access = num_fu; 557 tdp_stats = stats_t; 558 FU_duty_cycle = coredynp.FPU_duty_cycle; 559 } 560 else if (fu_type == ALU) 561 { 562 stats_t.readAc.access = 1*num_fu; 563 tdp_stats = stats_t; 564 FU_duty_cycle = coredynp.ALU_duty_cycle; 565 } 566 else if (fu_type == MUL) 567 { 568 stats_t.readAc.access = num_fu; 569 tdp_stats = stats_t; 570 FU_duty_cycle = coredynp.MUL_duty_cycle; 571 } 572 573 //power.readOp.dynamic = base_energy/clockRate + energy*stats_t.readAc.access; 574 power.readOp.dynamic = per_access_energy*stats_t.readAc.access + base_energy/clockRate; 575 double sckRation = g_tp.sckt_co_eff; 576 power.readOp.dynamic *= sckRation*FU_duty_cycle; 577 power.writeOp.dynamic *= sckRation; 578 power.searchOp.dynamic *= sckRation; 579 580 power.readOp.leakage = leakage; 581 power.readOp.gate_leakage = gate_leakage; 582 double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty); 583 power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction; 584 585 } 586 else 587 { 588 if (fu_type == FPU) 589 { 590 stats_t.readAc.access = XML->sys.core[ithCore].fpu_accesses; 591 rtp_stats = stats_t; 592 } 593 else if (fu_type == ALU) 594 { 595 stats_t.readAc.access = XML->sys.core[ithCore].ialu_accesses; 596 rtp_stats = stats_t; 597 } 598 else if (fu_type == MUL) 599 { 600 stats_t.readAc.access = XML->sys.core[ithCore].mul_accesses; 601 rtp_stats = stats_t; 602 } 603 604 //rt_power.readOp.dynamic = base_energy*executionTime + energy*stats_t.readAc.access; 605 rt_power.readOp.dynamic = per_access_energy*stats_t.readAc.access + base_energy*executionTime; 606 double sckRation = g_tp.sckt_co_eff; 607 rt_power.readOp.dynamic *= sckRation; 608 rt_power.writeOp.dynamic *= sckRation; 609 rt_power.searchOp.dynamic *= sckRation; 610 611 } 612 613 614} 615 616void FunctionalUnit::displayEnergy(uint32_t indent,int plevel,bool is_tdp) 617{ 618 string indent_str(indent, ' '); 619 string indent_str_next(indent+2, ' '); 620 bool long_channel = XML->sys.longer_channel_device; 621 622// cout << indent_str_next << "Results Broadcast Bus Area = " << bypass->area.get_area() *1e-6 << " mm^2" << endl; 623 if (is_tdp) 624 { 625 if (fu_type == FPU) 626 { 627 cout << indent_str << "Floating Point Units (FPUs) (Count: "<< coredynp.num_fpus <<" ):" << endl; 628 cout << indent_str_next << "Area = " << area.get_area()*1e-6 << " mm^2" << endl; 629 cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate << " W" << endl; 630// cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage << " W" << endl; 631 cout << indent_str_next<< "Subthreshold Leakage = " 632 << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl; 633 cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl; 634 cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl; 635 cout <<endl; 636 } 637 else if (fu_type == ALU) 638 { 639 cout << indent_str << "Integer ALUs (Count: "<< coredynp.num_alus <<" ):" << endl; 640 cout << indent_str_next << "Area = " << area.get_area()*1e-6 << " mm^2" << endl; 641 cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate << " W" << endl; 642// cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage << " W" << endl; 643 cout << indent_str_next<< "Subthreshold Leakage = " 644 << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl; 645 cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl; 646 cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl; 647 cout <<endl; 648 } 649 else if (fu_type == MUL) 650 { 651 cout << indent_str << "Complex ALUs (Mul/Div) (Count: "<< coredynp.num_muls <<" ):" << endl; 652 cout << indent_str_next << "Area = " << area.get_area()*1e-6 << " mm^2" << endl; 653 cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate << " W" << endl; 654// cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage << " W" << endl; 655 cout << indent_str_next<< "Subthreshold Leakage = " 656 << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl; 657 cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl; 658 cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl; 659 cout <<endl; 660 661 } 662 663 } 664 else 665 { 666 } 667 668} 669 670void FunctionalUnit::leakage_feedback(double temperature) 671{ 672 // Update the temperature and initialize the global interfaces. 673 interface_ip.temp = (unsigned int)round(temperature/10.0)*10; 674 675 uca_org_t init_result = init_interface(&interface_ip); // init_result is dummy 676 677 // This is part of FunctionalUnit() 678 double area_t, leakage, gate_leakage; 679 double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio(); 680 681 if (fu_type == FPU) 682 { 683 area_t = 4.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2 The base number 684 if (g_ip->F_sz_nm>90) 685 area_t = 4.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 686 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W 687 gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W 688 } 689 else if (fu_type == ALU) 690 { 691 area_t = 280*260*2*num_fu*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl 692 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W 693 gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2; 694 } 695 else if (fu_type == MUL) 696 { 697 area_t = 280*260*2*3*num_fu*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl 698 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W 699 gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2; 700 } 701 else 702 { 703 cout<<"Unknown Functional Unit Type"<<endl; 704 exit(1); 705 } 706 707 power.readOp.leakage = leakage*num_fu; 708 power.readOp.gate_leakage = gate_leakage*num_fu; 709 power.readOp.longer_channel_leakage = longer_channel_device_reduction(Core_device, coredynp.core_ty); 710} 711 712UndiffCore::UndiffCore(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam & dyn_p_, bool exist_, bool embedded_) 713:XML(XML_interface), 714 ithCore(ithCore_), 715 interface_ip(*interface_ip_), 716 coredynp(dyn_p_), 717 core_ty(coredynp.core_ty), 718 embedded(XML->sys.Embedded), 719 pipeline_stage(coredynp.pipeline_stages), 720 num_hthreads(coredynp.num_hthreads), 721 issue_width(coredynp.issueW), 722 exist(exist_) 723// is_default(_is_default) 724{ 725 if (!exist) return; 726 double undifferentiated_core=0; 727 double core_tx_density=0; 728 double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio(); 729 double undifferentiated_core_coe; 730 //XML_interface=_XML_interface; 731 uca_org_t result2; 732 result2 = init_interface(&interface_ip); 733 734 //Compute undifferentiated core area at 90nm. 735 if (embedded==false) 736 { 737 //Based on the results of polynomial/log curve fitting based on undifferentiated core of Niagara, Niagara2, Merom, Penyrn, Prescott, Opteron die measurements 738 if (core_ty==OOO) 739 { 740 //undifferentiated_core = (0.0764*pipeline_stage*pipeline_stage -2.3685*pipeline_stage + 10.405);//OOO 741 undifferentiated_core = (3.57*log(pipeline_stage)-1.2643)>0?(3.57*log(pipeline_stage)-1.2643):0; 742 } 743 else if (core_ty==Inorder) 744 { 745 //undifferentiated_core = (0.1238*pipeline_stage + 7.2572)*0.9;//inorder 746 undifferentiated_core = (-2.19*log(pipeline_stage)+6.55)>0?(-2.19*log(pipeline_stage)+6.55):0; 747 } 748 else 749 { 750 cout<<"invalid core type"<<endl; 751 exit(0); 752 } 753 undifferentiated_core *= (1+ logtwo(num_hthreads)* 0.0716); 754 } 755 else 756 { 757 //Based on the results in paper "parametrized processor models" Sandia Labs 758 if (XML->sys.opt_clockrate) 759 undifferentiated_core_coe = 0.05; 760 else 761 undifferentiated_core_coe = 0; 762 undifferentiated_core = (0.4109* pipeline_stage - 0.776)*undifferentiated_core_coe; 763 undifferentiated_core *= (1+ logtwo(num_hthreads)* 0.0426); 764 } 765 766 undifferentiated_core *= g_tp.scaling_factor.logic_scaling_co_eff*1e6;//change from mm^2 to um^2 767 core_tx_density = g_tp.scaling_factor.core_tx_density; 768 //undifferentiated_core = 3*1e6; 769 //undifferentiated_core *= g_tp.scaling_factor.logic_scaling_co_eff;//(g_ip->F_sz_um*g_ip->F_sz_um/0.09/0.09)*; 770 power.readOp.leakage = undifferentiated_core*(core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W 771 power.readOp.gate_leakage = undifferentiated_core*(core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd; 772 773 double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty); 774 power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction; 775 area.set_area(undifferentiated_core); 776 777 scktRatio = g_tp.sckt_co_eff; 778 power.readOp.dynamic *= scktRatio; 779 power.writeOp.dynamic *= scktRatio; 780 power.searchOp.dynamic *= scktRatio; 781 macro_PR_overhead = g_tp.macro_layout_overhead; 782 area.set_area(area.get_area()*macro_PR_overhead); 783 784 785 786// double vt=g_tp.peri_global.Vth; 787// double velocity_index=1.1; 788// double c_in=gate_C(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r , 0.0, false); 789// double c_out= drain_C_(g_tp.min_w_nmos_, NCH, 2, 1, g_tp.cell_h_def, false) + drain_C_(g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, PCH, 1, 1, g_tp.cell_h_def, false) + c_in; 790// double w_nmos=g_tp.min_w_nmos_; 791// double w_pmos=g_tp.min_w_nmos_*pmos_to_nmos_sizing_r; 792// double i_on_n=1.0; 793// double i_on_p=1.0; 794// double i_on_n_in=1.0; 795// double i_on_p_in=1; 796// double vdd=g_tp.peri_global.Vdd; 797 798// power.readOp.sc=shortcircuit_simple(vt, velocity_index, c_in, c_out, w_nmos,w_pmos, i_on_n, i_on_p,i_on_n_in, i_on_p_in, vdd); 799// power.readOp.dynamic=c_out*vdd*vdd/2; 800 801// cout<<power.readOp.dynamic << "dynamic" <<endl; 802// cout<<power.readOp.sc << "sc" << endl; 803 804// power.readOp.sc=shortcircuit(vt, velocity_index, c_in, c_out, w_nmos,w_pmos, i_on_n, i_on_p,i_on_n_in, i_on_p_in, vdd); 805// power.readOp.dynamic=c_out*vdd*vdd/2; 806// 807// cout<<power.readOp.dynamic << "dynamic" <<endl; 808// cout<<power.readOp.sc << "sc" << endl; 809 810 811 812} 813 814 815void UndiffCore::displayEnergy(uint32_t indent,int plevel,bool is_tdp) 816{ 817 string indent_str(indent, ' '); 818 string indent_str_next(indent+2, ' '); 819 bool long_channel = XML->sys.longer_channel_device; 820 821 if (is_tdp) 822 { 823 cout << indent_str << "UndiffCore:" << endl; 824 cout << indent_str_next << "Area = " << area.get_area()*1e-6<< " mm^2" << endl; 825 cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate << " W" << endl; 826 //cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage <<" W" << endl; 827 cout << indent_str_next<< "Subthreshold Leakage = " 828 << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl; 829 cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl; 830 //cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl; 831 cout <<endl; 832 } 833 else 834 { 835 cout << indent_str << "UndiffCore:" << endl; 836 cout << indent_str_next << "Area = " << area.get_area()*1e-6<< " mm^2" << endl; 837 cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate << " W" << endl; 838 cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage <<" W" << endl; 839 cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl; 840 //cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl; 841 cout <<endl; 842 } 843 844} 845 846inst_decoder::inst_decoder( 847 bool _is_default, 848 const InputParameter *configure_interface, 849 int opcode_length_, 850 int num_decoders_, 851 bool x86_, 852 enum Device_ty device_ty_, 853 enum Core_type core_ty_) 854:is_default(_is_default), 855 opcode_length(opcode_length_), 856 num_decoders(num_decoders_), 857 x86(x86_), 858 device_ty(device_ty_), 859 core_ty(core_ty_) 860 { 861 /* 862 * Instruction decoder is different from n to 2^n decoders 863 * that are commonly used in row decoders in memory arrays. 864 * The RISC instruction decoder is typically a very simple device. 865 * We can decode an instruction by simply 866 * separating the machine word into small parts using wire slices 867 * The RISC instruction decoder can be approximate by the n to 2^n decoders, 868 * although this approximation usually underestimate power since each decoded 869 * instruction normally has more than 1 active signal. 870 * 871 * However, decoding a CISC instruction word is much more difficult 872 * than the RISC case. A CISC decoder is typically set up as a state machine. 873 * The machine reads the opcode field to determine 874 * what type of instruction it is, 875 * and where the other data values are. 876 * The instruction word is read in piece by piece, 877 * and decisions are made at each stage as to 878 * how the remainder of the instruction word will be read. 879 * (sequencer and ROM are usually needed) 880 * An x86 decoder can be even more complex since 881 * it involve both decoding instructions into u-ops and 882 * merge u-ops when doing micro-ops fusion. 883 */ 884 bool is_dram=false; 885 double pmos_to_nmos_sizing_r; 886 double load_nmos_width, load_pmos_width; 887 double C_driver_load, R_wire_load; 888 Area cell; 889 890 l_ip=*configure_interface; 891 local_result = init_interface(&l_ip); 892 cell.h =g_tp.cell_h_def; 893 cell.w =g_tp.cell_h_def; 894 895 num_decoder_segments = (int)ceil(opcode_length/18.0); 896 if (opcode_length > 18) opcode_length = 18; 897 num_decoded_signals= (int)pow(2.0,opcode_length); 898 pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio(); 899 load_nmos_width=g_tp.max_w_nmos_ /2; 900 load_pmos_width= g_tp.max_w_nmos_ * pmos_to_nmos_sizing_r; 901 C_driver_load = 1024*gate_C(load_nmos_width + load_pmos_width, 0, is_dram); //TODO: this number 1024 needs to be revisited 902 R_wire_load = 3000*l_ip.F_sz_um * g_tp.wire_outside_mat.R_per_um; 903 904 final_dec = new Decoder( 905 num_decoded_signals, 906 false, 907 C_driver_load, 908 R_wire_load, 909 false/*is_fa*/, 910 false/*is_dram*/, 911 false/*wl_tr*/, //to use peri device 912 cell); 913 914 PredecBlk * predec_blk1 = new PredecBlk( 915 num_decoded_signals, 916 final_dec, 917 0,//Assuming predec and dec are back to back 918 0, 919 1,//Each Predec only drives one final dec 920 false/*is_dram*/, 921 true); 922 PredecBlk * predec_blk2 = new PredecBlk( 923 num_decoded_signals, 924 final_dec, 925 0,//Assuming predec and dec are back to back 926 0, 927 1,//Each Predec only drives one final dec 928 false/*is_dram*/, 929 false); 930 931 PredecBlkDrv * predec_blk_drv1 = new PredecBlkDrv(0, predec_blk1, false); 932 PredecBlkDrv * predec_blk_drv2 = new PredecBlkDrv(0, predec_blk2, false); 933 934 pre_dec = new Predec(predec_blk_drv1, predec_blk_drv2); 935 936 double area_decoder = final_dec->area.get_area() * num_decoded_signals * num_decoder_segments*num_decoders; 937 //double w_decoder = area_decoder / area.get_h(); 938 double area_pre_dec = (predec_blk_drv1->area.get_area() + 939 predec_blk_drv2->area.get_area() + 940 predec_blk1->area.get_area() + 941 predec_blk2->area.get_area())* 942 num_decoder_segments*num_decoders; 943 area.set_area(area.get_area()+ area_decoder + area_pre_dec); 944 double macro_layout_overhead = g_tp.macro_layout_overhead; 945 double chip_PR_overhead = g_tp.chip_layout_overhead; 946 area.set_area(area.get_area()*macro_layout_overhead*chip_PR_overhead); 947 948 inst_decoder_delay_power(); 949 950 double sckRation = g_tp.sckt_co_eff; 951 power.readOp.dynamic *= sckRation; 952 power.writeOp.dynamic *= sckRation; 953 power.searchOp.dynamic *= sckRation; 954 955 double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty); 956 power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction; 957 958} 959 960void inst_decoder::inst_decoder_delay_power() 961{ 962 963 double dec_outrisetime; 964 double inrisetime=0, outrisetime; 965 double pppm_t[4] = {1,1,1,1}; 966 double squencer_passes = x86?2:1; 967 968 outrisetime = pre_dec->compute_delays(inrisetime); 969 dec_outrisetime = final_dec->compute_delays(outrisetime); 970 set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments, squencer_passes*num_decoder_segments, num_decoder_segments); 971 power = power + pre_dec->power*pppm_t; 972 set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments*num_decoded_signals, 973 num_decoder_segments*num_decoded_signals, squencer_passes*num_decoder_segments); 974 power = power + final_dec->power*pppm_t; 975} 976void inst_decoder::leakage_feedback(double temperature) 977{ 978 l_ip.temp = (unsigned int)round(temperature/10.0)*10; 979 uca_org_t init_result = init_interface(&l_ip); // init_result is dummy 980 981 final_dec->leakage_feedback(temperature); 982 pre_dec->leakage_feedback(temperature); 983 984 double pppm_t[4] = {1,1,1,1}; 985 double squencer_passes = x86?2:1; 986 987 set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments, squencer_passes*num_decoder_segments, num_decoder_segments); 988 power = pre_dec->power*pppm_t; 989 990 set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments*num_decoded_signals,num_decoder_segments*num_decoded_signals, squencer_passes*num_decoder_segments); 991 power = power + final_dec->power*pppm_t; 992 993 double sckRation = g_tp.sckt_co_eff; 994 995 power.readOp.dynamic *= sckRation; 996 power.writeOp.dynamic *= sckRation; 997 power.searchOp.dynamic *= sckRation; 998 999 double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty); 1000 power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction; 1001} 1002 1003inst_decoder::~inst_decoder() 1004{ 1005 local_result.cleanup(); 1006 1007 delete final_dec; 1008 1009 delete pre_dec->blk1; 1010 delete pre_dec->blk2; 1011 delete pre_dec->drv1; 1012 delete pre_dec->drv2; 1013 delete pre_dec; 1014} 1015