Cross Reference: /gem5/ext/mcpat/logic.cc

logic.cc (10152:52c552138ba1)	logic.cc (10234:5cb711fa6176)
1/***************************************************************************** 2 * McPAT 3 * SOFTWARE LICENSE AGREEMENT 4 * Copyright 2012 Hewlett-Packard Development Company, L.P.	1/***************************************************************************** 2 * McPAT 3 * SOFTWARE LICENSE AGREEMENT 4 * Copyright 2012 Hewlett-Packard Development Company, L.P.
	5 * Copyright (c) 2010-2013 Advanced Micro Devices, Inc.
5 * All Rights Reserved 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are 9 * met: redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer; 11 * redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the --- 7 unchanged lines hidden (view full) --- 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE	6 * All Rights Reserved 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions are 10 * met: redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer; 12 * redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the --- 7 unchanged lines hidden (view full) --- 21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 23 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”	29 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 * 30 ***************************************************************************/ 31	30 * 31 ***************************************************************************/ 32
	33#include "common.h"
32#include "logic.h" 33	34#include "logic.h" 35
34
35//selection_logic	36//selection_logic
36selection_logic::selection_logic( 37 bool _is_default, 38 int win_entries_, 39 int issue_width_, 40 const InputParameter configure_interface, 41 enum Device_ty device_ty_, 42 enum Core_type core_ty_) 43 //const ParseXML _XML_interface) 44 :is_default(_is_default), 45 win_entries(win_entries_), 46 issue_width(issue_width_), 47 device_ty(device_ty_), 48 core_ty(core_ty_) 49 { 50 //uca_org_t result2; 51 l_ip=configure_interface; 52 local_result = init_interface(&l_ip); 53 //init_tech_params(l_ip.F_sz_um, false); 54 //win_entries=numIBEntries;//IQentries; 55 //issue_width=issueWidth; 56 selection_power(); 57 double sckRation = g_tp.sckt_co_eff; 58 power.readOp.dynamic = sckRation; 59 power.writeOp.dynamic = sckRation; 60 power.searchOp.dynamic = sckRation;	37selection_logic::selection_logic(XMLNode* _xml_data, bool _is_default, 38 int _win_entries, int issue_width_, 39 const InputParameter configure_interface, 40 string _name, double _accesses, 41 double clockRate_, enum Device_ty device_ty_, 42 enum Core_type core_ty_) 43 : McPATComponent(_xml_data), is_default(_is_default), 44 win_entries(_win_entries), 45 issue_width(issue_width_), 46 accesses(_accesses), 47 device_ty(device_ty_), 48 core_ty(core_ty_) { 49 clockRate = clockRate_; 50 name = _name; 51 l_ip = configure_interface; 52 local_result = init_interface(&l_ip, name); 53}
61	54
62 double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty); 63 power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction; 64 }	55void selection_logic::computeArea() { 56 output_data.area = local_result.area; 57}
65	58
66void selection_logic::selection_power() 67{//based on cost effective superscalar processor TR pp27-31 68 double Ctotal, Cor, Cpencode; 69 int num_arbiter; 70 double WSelORn, WSelORprequ, WSelPn, WSelPp, WSelEnn, WSelEnp;	59void selection_logic::computeEnergy() { 60 //based on cost effective superscalar processor TR pp27-31 61 double Ctotal, Cor, Cpencode; 62 int num_arbiter; 63 double WSelORn, WSelORprequ, WSelPn, WSelPp, WSelEnn, WSelEnp;
71	64
72 //TODO: the 0.8um process data is used. 73 WSelORn = 12.5 * l_ip.F_sz_um;//this was 10 micron for the 0.8 micron process 74 WSelORprequ = 50 * l_ip.F_sz_um;//this was 40 micron for the 0.8 micron process 75 WSelPn = 12.5 * l_ip.F_sz_um;//this was 10mcron for the 0.8 micron process 76 WSelPp = 18.75 * l_ip.F_sz_um;//this was 15 micron for the 0.8 micron process 77 WSelEnn = 6.25 * l_ip.F_sz_um;//this was 5 micron for the 0.8 micron process 78 WSelEnp = 12.5 * l_ip.F_sz_um;//this was 10 micron for the 0.8 micron process	65 //the 0.8um process data is used. 66 //this was 10 micron for the 0.8 micron process 67 WSelORn = 12.5 * l_ip.F_sz_um; 68 //this was 40 micron for the 0.8 micron process 69 WSelORprequ = 50 * l_ip.F_sz_um; 70 //this was 10mcron for the 0.8 micron process 71 WSelPn = 12.5 * l_ip.F_sz_um; 72 //this was 15 micron for the 0.8 micron process 73 WSelPp = 18.75 * l_ip.F_sz_um; 74 //this was 5 micron for the 0.8 micron process 75 WSelEnn = 6.25 * l_ip.F_sz_um; 76 //this was 10 micron for the 0.8 micron process 77 WSelEnp = 12.5 * l_ip.F_sz_um;
79	78
80 81 Ctotal=0; 82 num_arbiter=1; 83 while(win_entries > 4) 84 { 85 win_entries = (int)ceil((double)win_entries / 4.0); 86 num_arbiter += win_entries;	79 Ctotal = 0; 80 num_arbiter = 1; 81 while (win_entries > 4) { 82 win_entries = (int)ceil((double)win_entries / 4.0); 83 num_arbiter += win_entries;
87 }	84 }
88 //the 4-input OR logic to generate anyreq 89 Cor = 4 * drain_C_(WSelORn,NCH,1,1, g_tp.cell_h_def) + drain_C_(WSelORprequ,PCH,1,1, g_tp.cell_h_def); 90 power.readOp.gate_leakage = cmos_Ig_leakage(WSelORn, WSelORprequ, 4, nor)*g_tp.peri_global.Vdd;	85 //the 4-input OR logic to generate anyreq 86 Cor = 4 * drain_C_(WSelORn, NCH, 1, 1, g_tp.cell_h_def) + 87 drain_C_(WSelORprequ, PCH, 1, 1, g_tp.cell_h_def); 88 power.readOp.gate_leakage = 89 cmos_Ig_leakage(WSelORn, WSelORprequ, 4, nor) * g_tp.peri_global.Vdd;
91	90
92 //The total capacity of the 4-bit priority encoder 93 Cpencode = drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,1, 1, g_tp.cell_h_def) + 94 2drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,2, 1, g_tp.cell_h_def) + 95 3drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,3, 1, g_tp.cell_h_def) + 96 4drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,4, 1, g_tp.cell_h_def) +//precompute priority logic 97 24gate_C(WSelEnn+WSelEnp,20.0)+ 98 4drain_C_(WSelEnn,NCH,1, 1, g_tp.cell_h_def) + 24drain_C_(WSelEnp,PCH,1, 1, g_tp.cell_h_def)+//enable logic 99 (24+23+22+2)gate_C(WSelPn+WSelPp,10.0);//requests signal	91 //The total capacity of the 4-bit priority encoder 92 Cpencode = drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) + 93 drain_C_(WSelPp, PCH, 1, 1, g_tp.cell_h_def) + 94 2 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) + 95 drain_C_(WSelPp, PCH, 2, 1, g_tp.cell_h_def) + 96 3 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) + 97 drain_C_(WSelPp, PCH, 3, 1, g_tp.cell_h_def) + 98 4 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) + 99 drain_C_(WSelPp, PCH, 4, 1, g_tp.cell_h_def) +//precompute priority logic 100 2 * 4 * gate_C(WSelEnn + WSelEnp, 20.0) + 101 4 * drain_C_(WSelEnn, NCH, 1, 1, g_tp.cell_h_def) + 102 2 * 4 * drain_C_(WSelEnp, PCH, 1, 1, g_tp.cell_h_def) +//enable logic 103 (2 * 4 + 2 * 3 + 2 * 2 + 2) * 104 gate_C(WSelPn + WSelPp, 10.0);//requests signal
100	105
101 Ctotal += issue_width * num_arbiter*(Cor+Cpencode);	106 Ctotal += issue_width * num_arbiter * (Cor + Cpencode);
102	107
103 power.readOp.dynamic = Ctotalg_tp.peri_global.Vddg_tp.peri_global.Vdd2;//2 means the abitration signal need to travel round trip 104* power.readOp.leakage = issue_width * num_arbiter * 105 (cmos_Isub_leakage(WSelPn, WSelPp, 2, nor)/approximate precompute with a nor gate///grant1p 106 + cmos_Isub_leakage(WSelPn, WSelPp, 3, nor)//grant2p 107 + cmos_Isub_leakage(WSelPn, WSelPp, 4, nor)//grant3p 108 + cmos_Isub_leakage(WSelEnn, WSelEnp, 2, nor)4//enable logic 109* + cmos_Isub_leakage(WSelEnn, WSelEnp, 1, inv)23//for each grant there are two inverters, there are 3 grant sIsubnals 110 )g_tp.peri_global.Vdd; 111* power.readOp.gate_leakage = issue_width * num_arbiter * 112 (cmos_Ig_leakage(WSelPn, WSelPp, 2, nor)/approximate precompute with a nor gate///grant1p 113 + cmos_Ig_leakage(WSelPn, WSelPp, 3, nor)//grant2p 114 + cmos_Ig_leakage(WSelPn, WSelPp, 4, nor)//grant3p 115 + cmos_Ig_leakage(WSelEnn, WSelEnp, 2, nor)4//enable logic 116* + cmos_Ig_leakage(WSelEnn, WSelEnp, 1, inv)23//for each grant there are two inverters, there are 3 grant signals 117 )g_tp.peri_global.Vdd; 118*}	108 //2 means the abitration signal need to travel round trip 109 power.readOp.dynamic = 110 Ctotal * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd * 2; 111 power.readOp.leakage = issue_width * num_arbiter * 112 (cmos_Isub_leakage(WSelPn, WSelPp, 2, nor)/approximate precompute with a nor gate///grant1p 113 + cmos_Isub_leakage(WSelPn, WSelPp, 3, nor)//grant2p 114 + cmos_Isub_leakage(WSelPn, WSelPp, 4, nor)//grant3p 115 + cmos_Isub_leakage(WSelEnn, WSelEnp, 2, nor)4//enable logic 116* + cmos_Isub_leakage(WSelEnn, WSelEnp, 1, inv)23//for each grant there are two inverters, there are 3 grant sIsubnals 117 ) * g_tp.peri_global.Vdd; 118 power.readOp.gate_leakage = issue_width * num_arbiter * 119 (cmos_Ig_leakage(WSelPn, WSelPp, 2, nor)/approximate precompute with a nor gate///grant1p 120 + cmos_Ig_leakage(WSelPn, WSelPp, 3, nor)//grant2p 121 + cmos_Ig_leakage(WSelPn, WSelPp, 4, nor)//grant3p 122 + 
cmos_Ig_leakage(WSelEnn, WSelEnp, 2, nor)4//enable logic 123* + cmos_Ig_leakage(WSelEnn, WSelEnp, 1, inv)23//for each grant there are two inverters, there are 3 grant signals 124 ) * g_tp.peri_global.Vdd; 125 double sckRation = g_tp.sckt_co_eff; 126 power.readOp.dynamic = sckRation; 127* power.writeOp.dynamic = sckRation; 128* power.searchOp.dynamic *= sckRation;
119	129
	130 double long_channel_device_reduction = 131 longer_channel_device_reduction(device_ty, core_ty); 132 power.readOp.longer_channel_leakage = 133 power.readOp.leakage * long_channel_device_reduction;
120	134
	135 output_data.peak_dynamic_power = power.readOp.dynamic * clockRate; 136 output_data.subthreshold_leakage_power = power.readOp.leakage; 137 output_data.gate_leakage_power = power.readOp.gate_leakage; 138 output_data.runtime_dynamic_energy = power.readOp.dynamic * accesses; 139} 140
121dep_resource_conflict_check::dep_resource_conflict_check(	141dep_resource_conflict_check::dep_resource_conflict_check(
122 const InputParameter configure_interface, 123* const CoreDynParam & dyn_p_, 124 int compare_bits_, 125 bool _is_default) 126 : l_ip(configure_interface), 127* coredynp(dyn_p_), 128 compare_bits(compare_bits_), 129 is_default(_is_default) 130{ 131 Wcompn = 25 * l_ip.F_sz_um;//this was 20.0 micron for the 0.8 micron process 132 Wevalinvp = 25 * l_ip.F_sz_um;//this was 20.0 micron for the 0.8 micron process 133 Wevalinvn = 100 * l_ip.F_sz_um;//this was 80.0 mcron for the 0.8 micron process 134 Wcomppreequ = 50 * l_ip.F_sz_um;//this was 40.0 micron for the 0.8 micron process 135 WNORn = 6.75 * l_ip.F_sz_um;//this was 5.4 micron for the 0.8 micron process 136 WNORp = 38.125 * l_ip.F_sz_um;//this was 30.5 micron for the 0.8 micron process	142 XMLNode* _xml_data, const string _name, 143 const InputParameter configure_interface, 144* const CoreParameters & dyn_p_, int compare_bits_, 145 double clockRate_, bool _is_default) 146 : McPATComponent(_xml_data), l_ip(configure_interface), 147* coredynp(dyn_p_), compare_bits(compare_bits_), is_default(_is_default) {
137	148
138 local_result = init_interface(&l_ip);	149 name = _name; 150 clockRate = clockRate_; 151 //this was 20.0 micron for the 0.8 micron process 152 Wcompn = 25 * l_ip.F_sz_um; 153 //this was 20.0 micron for the 0.8 micron process 154 Wevalinvp = 25 * l_ip.F_sz_um; 155 //this was 80.0 mcron for the 0.8 micron process 156 Wevalinvn = 100 * l_ip.F_sz_um; 157 //this was 40.0 micron for the 0.8 micron process 158 Wcomppreequ = 50 * l_ip.F_sz_um; 159 //this was 5.4 micron for the 0.8 micron process 160 WNORn = 6.75 * l_ip.F_sz_um; 161 //this was 30.5 micron for the 0.8 micron process 162 WNORp = 38.125 * l_ip.F_sz_um;
139	163
140 if (coredynp.core_ty==Inorder) 141 compare_bits += 16 + 8 + 8;//TODO: opcode bits + log(shared resources) + REG TAG BITS-->opcode comparator 142 else 143 compare_bits += 16 + 8 + 8;	164 // To make CACTI happy. 165 l_ip.cache_sz = MIN_BUFFER_SIZE; 166 local_result = init_interface(&l_ip, name);
144	167
145 conflict_check_power(); 146 double sckRation = g_tp.sckt_co_eff; 147 power.readOp.dynamic = sckRation; 148* power.writeOp.dynamic = sckRation; 149* power.searchOp.dynamic *= sckRation;	168 if (coredynp.core_ty == Inorder) 169 //TODO: opcode bits + log(shared resources) + REG TAG BITS --> 170 //opcode comparator 171 compare_bits += 16 + 8 + 8; 172 else 173 compare_bits += 16 + 8 + 8;
150	174
	175 conflict_check_power(); 176 double sckRation = g_tp.sckt_co_eff; 177 power.readOp.dynamic = sckRation; 178* power.writeOp.dynamic = sckRation; 179* power.searchOp.dynamic = sckRation; 180*
151} 152	181} 182
153void dep_resource_conflict_check::conflict_check_power() 154{ 155 double Ctotal; 156 int num_comparators; 157 num_comparators = 3((coredynp.decodeW) (coredynp.decodeW)-coredynp.decodeW);//2(NN-N) is used for source to dest comparison, (NN-N) is used for dest to dest comparision. 158 //When decode-width ==1, no dcl logic	183void dep_resource_conflict_check::conflict_check_power() { 184 double Ctotal; 185 int num_comparators; 186 //2(NN-N) is used for source to dest comparison, (NN-N) is used for 187 //dest to dest comparision. 188 num_comparators = 3 * ((coredynp.decodeW) * (coredynp.decodeW) - 189 coredynp.decodeW);
159	190
160 Ctotal = num_comparators * compare_cap(); 161 //printf("%i,%s\n",XML_interface->sys.core[0].predictor.predictor_entries,XML_interface->sys.core[0].predictor.prediction_scheme);	191 Ctotal = num_comparators * compare_cap();
162	192
163 power.readOp.dynamic=Ctotal/CLOCKRATE/g_tp.peri_global.Vddg_tp.peri_global.Vdd/AF/; 164 power.readOp.leakage=num_comparatorscompare_bits2*simplified_nmos_leakage(Wcompn, false);	193 power.readOp.dynamic = Ctotal * /CLOCKRATE/ g_tp.peri_global.Vdd * 194 g_tp.peri_global.Vdd /AF/; 195 power.readOp.leakage = num_comparators * compare_bits * 2 * 196 simplified_nmos_leakage(Wcompn, false);
165	197
166 double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty); 167 power.readOp.longer_channel_leakage = power.readOp.leakagelong_channel_device_reduction; 168* power.readOp.gate_leakage=num_comparatorscompare_bits2*cmos_Ig_leakage(Wcompn, 0, 2, nmos);	198 double long_channel_device_reduction = 199 longer_channel_device_reduction(Core_device, coredynp.core_ty); 200 power.readOp.longer_channel_leakage = 201 power.readOp.leakage * long_channel_device_reduction; 202 power.readOp.gate_leakage = num_comparators * compare_bits * 2 * 203 cmos_Ig_leakage(Wcompn, 0, 2, nmos);
169 170} 171 172/* estimate comparator power consumption (this comparator is similar 173 to the tag-match structure in a CAM */	204 205} 206 207/* estimate comparator power consumption (this comparator is similar 208 to the tag-match structure in a CAM */
174double dep_resource_conflict_check::compare_cap() 175{ 176 double c1, c2;	209double dep_resource_conflict_check::compare_cap() { 210 double c1, c2;
177	211
178 WNORp = WNORp * compare_bits/2.0;//resize the big NOR gate at the DCL according to fan in. 179 /* bottom part of comparator / 180* c2 = (compare_bits)(drain_C_(Wcompn,NCH,1,1, g_tp.cell_h_def)+drain_C_(Wcompn,NCH,2,1, g_tp.cell_h_def))+ 181* drain_C_(Wevalinvp,PCH,1,1, g_tp.cell_h_def) + drain_C_(Wevalinvn,NCH,1,1, g_tp.cell_h_def);	212 //resize the big NOR gate at the DCL according to fan in. 213 WNORp = WNORp * compare_bits / 2.0; 214 /* bottom part of comparator / 215* c2 = (compare_bits) * (drain_C_(Wcompn, NCH, 1, 1, g_tp.cell_h_def) + 216 drain_C_(Wcompn, NCH, 2, 1, g_tp.cell_h_def)) + 217 drain_C_(Wevalinvp, PCH, 1, 1, g_tp.cell_h_def) + 218 drain_C_(Wevalinvn, NCH, 1, 1, g_tp.cell_h_def);
182	219
183 /* top part of comparator / 184* c1 = (compare_bits)(drain_C_(Wcompn,NCH,1,1, g_tp.cell_h_def)+drain_C_(Wcompn,NCH,2,1, g_tp.cell_h_def)+ 185* drain_C_(Wcomppreequ,NCH,1,1, g_tp.cell_h_def)) + gate_C(WNORn + WNORp,10.0) + 186 drain_C_(WNORp,NCH,2,1, g_tp.cell_h_def) + compare_bitsdrain_C_(WNORn,NCH,2,1, g_tp.cell_h_def); 187* return(c1 + c2);	220 /* top part of comparator / 221* c1 = (compare_bits) * (drain_C_(Wcompn, NCH, 1, 1, g_tp.cell_h_def) + 222 drain_C_(Wcompn, NCH, 2, 1, g_tp.cell_h_def) + 223 drain_C_(Wcomppreequ, NCH, 1, 1, g_tp.cell_h_def)) + 224 gate_C(WNORn + WNORp, 10.0) + 225 drain_C_(WNORp, NCH, 2, 1, g_tp.cell_h_def) + compare_bits * 226 drain_C_(WNORn, NCH, 2, 1, g_tp.cell_h_def); 227 return(c1 + c2);
188 189} 190 191void dep_resource_conflict_check::leakage_feedback(double temperature) 192{ 193 l_ip.temp = (unsigned int)round(temperature/10.0)*10;	228 229} 230 231void dep_resource_conflict_check::leakage_feedback(double temperature) 232{ 233 l_ip.temp = (unsigned int)round(temperature/10.0)*10;
194 uca_org_t init_result = init_interface(&l_ip); // init_result is dummy	234 uca_org_t init_result = init_interface(&l_ip, name); // init_result is dummy
195 196 // This is part of conflict_check_power()	235 236 // This is part of conflict_check_power()
197 int num_comparators = 3((coredynp.decodeW) (coredynp.decodeW)-coredynp.decodeW);//2(NN-N) is used for source to dest comparison, (NN-N) is used for dest to dest comparision. 198 power.readOp.leakage=num_comparatorscompare_bits2*simplified_nmos_leakage(Wcompn, false);	237 // 2(NN-N) is used for source to dest comparison, (NN-N) is used for dest 238 // to dest comparison. 239 int num_comparators = 3 * ((coredynp.decodeW) * (coredynp.decodeW) - 240 coredynp.decodeW); 241 power.readOp.leakage = num_comparators * compare_bits * 2 * 242 simplified_nmos_leakage(Wcompn, false);
199	243
200 double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty); 201 power.readOp.longer_channel_leakage = power.readOp.leakagelong_channel_device_reduction; 202* power.readOp.gate_leakage=num_comparatorscompare_bits2*cmos_Ig_leakage(Wcompn, 0, 2, nmos);	244 double long_channel_device_reduction = 245 longer_channel_device_reduction(Core_device, coredynp.core_ty); 246 power.readOp.longer_channel_leakage = power.readOp.leakage * 247 long_channel_device_reduction; 248 power.readOp.gate_leakage = num_comparators * compare_bits * 2 * 249 cmos_Ig_leakage(Wcompn, 0, 2, nmos);
203} 204	250} 251
205//TODO: add inverter and transmission gate base DFF.
206 207DFFCell::DFFCell(	252 253DFFCell::DFFCell(
208 bool _is_dram, 209 double _WdecNANDn, 210 double _WdecNANDp, 211 double _cell_load, 212 const InputParameter configure_interface) 213:is_dram(_is_dram), 214cell_load(_cell_load), 215WdecNANDn(_WdecNANDn), 216WdecNANDp(_WdecNANDp) 217{//this model is based on the NAND2 based DFF. 218* l_ip=configure_interface; 219// area.set_area(730l_ip.F_sz_uml_ip.F_sz_um); 220* area.set_area(5compute_gate_area(NAND, 2,WdecNANDn,WdecNANDp, g_tp.cell_h_def) 221* + compute_gate_area(NAND, 2,WdecNANDn,WdecNANDn, g_tp.cell_h_def));	254 bool _is_dram, 255 double _WdecNANDn, 256 double _WdecNANDp, 257 double _cell_load, 258 const InputParameter configure_interface) 259* : is_dram(_is_dram), 260 cell_load(_cell_load), 261 WdecNANDn(_WdecNANDn), 262 WdecNANDp(_WdecNANDp) { //this model is based on the NAND2 based DFF. 263 l_ip = configure_interface; 264* area.set_area(5 * compute_gate_area(NAND, 2,WdecNANDn,WdecNANDp, 265 g_tp.cell_h_def) 266 + compute_gate_area(NAND, 2,WdecNANDn,WdecNANDn, 267 g_tp.cell_h_def));
222 223 224} 225 226	268 269 270} 271 272
227double DFFCell::fpfp_node_cap(unsigned int fan_in, unsigned int fan_out) 228{ 229 double Ctotal = 0; 230 //printf("WdecNANDn = %E\n", WdecNANDn);	273double DFFCell::fpfp_node_cap(unsigned int fan_in, unsigned int fan_out) { 274 double Ctotal = 0;
231	275
232 /* part 1: drain cap of NAND gate / 233* Ctotal += drain_C_(WdecNANDn, NCH, 2, 1, g_tp.cell_h_def, is_dram) + fan_in * drain_C_(WdecNANDp, PCH, 1, 1, g_tp.cell_h_def, is_dram);	276 /* part 1: drain cap of NAND gate / 277* Ctotal += drain_C_(WdecNANDn, NCH, 2, 1, g_tp.cell_h_def, is_dram) + fan_in * drain_C_(WdecNANDp, PCH, 1, 1, g_tp.cell_h_def, is_dram);
234	278
235 /* part 2: gate cap of NAND gates / 236* Ctotal += fan_out * gate_C(WdecNANDn + WdecNANDp, 0, is_dram);	279 /* part 2: gate cap of NAND gates / 280* Ctotal += fan_out * gate_C(WdecNANDn + WdecNANDp, 0, is_dram);
237	281
238 return Ctotal;	282 return Ctotal;
239} 240 241	283} 284 285
242void DFFCell::compute_DFF_cell() 243{ 244 double c1, c2, c3, c4, c5, c6; 245 /* node 5 and node 6 are identical to node 1 in capacitance / 246* c1 = c5 = c6 = fpfp_node_cap(2, 1); 247 c2 = fpfp_node_cap(2, 3); 248 c3 = fpfp_node_cap(3, 2); 249 c4 = fpfp_node_cap(2, 2);	286void DFFCell::compute_DFF_cell() { 287 double c1, c2, c3, c4, c5, c6; 288 /* node 5 and node 6 are identical to node 1 in capacitance / 289* c1 = c5 = c6 = fpfp_node_cap(2, 1); 290 c2 = fpfp_node_cap(2, 3); 291 c3 = fpfp_node_cap(3, 2); 292 c4 = fpfp_node_cap(2, 2);
250	293
251 //cap-load of the clock signal in each Dff, actually the clock signal only connected to one NAND2 252 clock_cap= 2 * gate_C(WdecNANDn + WdecNANDp, 0, is_dram); 253 e_switch.readOp.dynamic += (c4 + c1 + c2 + c3 + c5 + c6 + 2cell_load)0.5g_tp.peri_global.Vdd g_tp.peri_global.Vdd;;	294 //cap-load of the clock signal in each Dff, actually the clock signal only connected to one NAND2 295 clock_cap = 2 * gate_C(WdecNANDn + WdecNANDp, 0, is_dram); 296 e_switch.readOp.dynamic += (c4 + c1 + c2 + c3 + c5 + c6 + 2 * cell_load) * 297 0.5 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;;
254	298
255 /* no 1/2 for e_keep and e_clock because clock signal switches twice in one cycle / 256* e_keep_1.readOp.dynamic += c3 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ; 257 e_keep_0.readOp.dynamic += c2 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ; 258 e_clock.readOp.dynamic += clock_cap* g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;;	299 /* no 1/2 for e_keep and e_clock because clock signal switches twice in one cycle / 300* e_keep_1.readOp.dynamic += 301 c3 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ; 302 e_keep_0.readOp.dynamic += 303 c2 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ; 304 e_clock.readOp.dynamic += 305 clock_cap * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;;
259	306
260 /* static power / 261* e_switch.readOp.leakage += (cmos_Isub_leakage(WdecNANDn, WdecNANDp, 2, nand)5//5 NAND2 and 1 NAND3 in a DFF 262* + cmos_Isub_leakage(WdecNANDn, WdecNANDn, 3, nand))g_tp.peri_global.Vdd; 263* e_switch.readOp.gate_leakage += (cmos_Ig_leakage(WdecNANDn, WdecNANDp, 2, nand)5//5 NAND2 and 1 NAND3 in a DFF 264* + cmos_Ig_leakage(WdecNANDn, WdecNANDn, 3, nand))g_tp.peri_global.Vdd; 265* //printf("leakage =%E\n",cmos_Ileak(1, is_dram) );	307 /* static power / 308* e_switch.readOp.leakage += 309 (cmos_Isub_leakage(WdecNANDn, WdecNANDp, 2, nand) * 310 5//5 NAND2 and 1 NAND3 in a DFF 311 + cmos_Isub_leakage(WdecNANDn, WdecNANDn, 3, nand)) * 312 g_tp.peri_global.Vdd; 313 e_switch.readOp.gate_leakage += 314 (cmos_Ig_leakage(WdecNANDn, WdecNANDp, 2, nand) * 315 5//5 NAND2 and 1 NAND3 in a DFF 316 + cmos_Ig_leakage(WdecNANDn, WdecNANDn, 3, nand)) * 317 g_tp.peri_global.Vdd;
266} 267	318} 319
268Pipeline::Pipeline( 269 const InputParameter configure_interface, 270* const CoreDynParam & dyn_p_, 271 enum Device_ty device_ty_, 272 bool _is_core_pipeline, 273 bool _is_default) 274: l_ip(configure_interface), 275* coredynp(dyn_p_), 276 device_ty(device_ty_), 277 is_core_pipeline(_is_core_pipeline), 278 is_default(_is_default), 279 num_piperegs(0.0)	320Pipeline::Pipeline(XMLNode* _xml_data, 321 const InputParameter configure_interface, 322* const CoreParameters & dyn_p_, 323 enum Device_ty device_ty_, 324 bool _is_core_pipeline, 325 bool _is_default) 326 : McPATComponent(_xml_data), l_ip(configure_interface), 327* coredynp(dyn_p_), device_ty(device_ty_), 328 is_core_pipeline(_is_core_pipeline), is_default(_is_default), 329 num_piperegs(0.0) { 330 name = "Pipeline?";
280	331
281 { 282 local_result = init_interface(&l_ip); 283 if (!coredynp.Embedded) 284 process_ind = true; 285 else 286 process_ind = false; 287 WNANDn = (process_ind)? 25 * l_ip.F_sz_um : g_tp.min_w_nmos_ ;//this was 20 micron for the 0.8 micron process 288 WNANDp = (process_ind)? 37.5 * l_ip.F_sz_um : g_tp.min_w_nmos_pmos_to_nmos_sz_ratio();//this was 30 micron for the 0.8 micron process 289* load_per_pipeline_stage = 2gate_C(WNANDn + WNANDp, 0, false); 290* compute();	332 local_result = init_interface(&l_ip, name); 333 if (!coredynp.Embedded) { 334 process_ind = true; 335 } else { 336 process_ind = false; 337 } 338 //this was 20 micron for the 0.8 micron process 339 WNANDn = (process_ind) ? 25 * l_ip.F_sz_um : g_tp.min_w_nmos_ ; 340 //this was 30 micron for the 0.8 micron process 341 WNANDp = (process_ind) ? 37.5 * l_ip.F_sz_um : g_tp.min_w_nmos_ * 342 pmos_to_nmos_sz_ratio(); 343 load_per_pipeline_stage = 2 * gate_C(WNANDn + WNANDp, 0, false); 344 compute();
291 292} 293	345 346} 347
294void Pipeline::compute() 295{ 296 compute_stage_vector(); 297 DFFCell pipe_reg(false, WNANDn,WNANDp, load_per_pipeline_stage, &l_ip); 298 pipe_reg.compute_DFF_cell();	348void Pipeline::compute() { 349 compute_stage_vector(); 350 DFFCell pipe_reg(false, WNANDn, WNANDp, load_per_pipeline_stage, &l_ip); 351 pipe_reg.compute_DFF_cell();
299	352
300 double clock_power_pipereg = num_piperegs * pipe_reg.e_clock.readOp.dynamic; 301 //*****************pipeline power: currently, we average all the possibilities of the states of DFFs in the pipeline. A better way to do it is to consider 302* //the harming distance of two consecutive signals, However McPAT does not have plan to do this in near future as it focuses on worst case power. 303 double pipe_reg_power = num_piperegs * (pipe_reg.e_switch.readOp.dynamic+pipe_reg.e_keep_0.readOp.dynamic+pipe_reg.e_keep_1.readOp.dynamic)/3+clock_power_pipereg; 304 double pipe_reg_leakage = num_piperegs * pipe_reg.e_switch.readOp.leakage; 305 double pipe_reg_gate_leakage = num_piperegs * pipe_reg.e_switch.readOp.gate_leakage; 306 power.readOp.dynamic +=pipe_reg_power; 307 power.readOp.leakage +=pipe_reg_leakage; 308 power.readOp.gate_leakage +=pipe_reg_gate_leakage; 309 area.set_area(num_piperegs * pipe_reg.area.get_area());	353 double clock_power_pipereg = num_piperegs * pipe_reg.e_clock.readOp.dynamic; 354 //*****************pipeline power: currently, we average all the possibilities of the states of DFFs in the pipeline. A better way to do it is to consider 355* //the harming distance of two consecutive signals, However McPAT does not have plan to do this in near future as it focuses on worst case power. 356 double pipe_reg_power = num_piperegs * 357 (pipe_reg.e_switch.readOp.dynamic + pipe_reg.e_keep_0.readOp.dynamic + 358 pipe_reg.e_keep_1.readOp.dynamic) / 3 + clock_power_pipereg; 359 double pipe_reg_leakage = num_piperegs * pipe_reg.e_switch.readOp.leakage; 360 double pipe_reg_gate_leakage = num_piperegs * 361 pipe_reg.e_switch.readOp.gate_leakage; 362 power.readOp.dynamic += pipe_reg_power; 363 power.readOp.leakage += pipe_reg_leakage; 364 power.readOp.gate_leakage += pipe_reg_gate_leakage; 365 area.set_area(num_piperegs * pipe_reg.area.get_area());
310	366
311 double long_channel_device_reduction = longer_channel_device_reduction(device_ty, coredynp.core_ty); 312 power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction;	367 double long_channel_device_reduction = 368 longer_channel_device_reduction(device_ty, coredynp.core_ty); 369 power.readOp.longer_channel_leakage = power.readOp.leakage * 370 long_channel_device_reduction;
313 314	371 372
315 double sckRation = g_tp.sckt_co_eff; 316 power.readOp.dynamic = sckRation; 317* power.writeOp.dynamic = sckRation; 318* power.searchOp.dynamic = sckRation; 319* double macro_layout_overhead = g_tp.macro_layout_overhead;	373 double sckRation = g_tp.sckt_co_eff; 374 power.readOp.dynamic = sckRation; 375* power.writeOp.dynamic = sckRation; 376* power.searchOp.dynamic = sckRation; 377* double macro_layout_overhead = g_tp.macro_layout_overhead;
320 if (!coredynp.Embedded)	378 if (!coredynp.Embedded)
321 area.set_area(area.get_area()*macro_layout_overhead);	379 area.set_area(area.get_area() * macro_layout_overhead); 380 381 output_data.area = area.get_area() / 1e6; 382 output_data.peak_dynamic_power = power.readOp.dynamic * clockRate; 383 output_data.subthreshold_leakage_power = power.readOp.leakage; 384 output_data.gate_leakage_power = power.readOp.gate_leakage; 385 output_data.runtime_dynamic_energy = power.readOp.dynamic * total_cycles;
322} 323	386} 387
324void Pipeline::compute_stage_vector() 325{ 326 double num_stages, tot_stage_vector, per_stage_vector; 327 int opcode_length = coredynp.x86? coredynp.micro_opcode_length:coredynp.opcode_length; 328 //Hthread = thread_clock_gated? 1:num_thread;	388void Pipeline::compute_stage_vector() { 389 double num_stages, tot_stage_vector, per_stage_vector; 390 int opcode_length = coredynp.x86 ? 391 coredynp.micro_opcode_length : coredynp.opcode_width;
329	392
330 if (!is_core_pipeline) 331 { 332 num_piperegs=l_ip.pipeline_stagesl_ip.per_stage_vector;//The number of pipeline stages are calculated based on the achievable throughput and required throughput 333* } 334 else 335 { 336 if (coredynp.core_ty==Inorder) 337 { 338 /* assume 6 pipe stages and try to estimate bits per pipe stage / 339* /* pipe stage 0/IF / 340* num_piperegs += coredynp.pc_width2coredynp.num_hthreads; 341 /* pipe stage IF/ID / 342* num_piperegs += coredynp.fetchW(coredynp.instruction_length + coredynp.pc_width)coredynp.num_hthreads; 343 /* pipe stage IF/ThreadSEL / 344* if (coredynp.multithreaded) num_piperegs += coredynp.num_hthreadscoredynp.perThreadState; //8 bit thread states 345* /* pipe stage ID/EXE / 346* num_piperegs += coredynp.decodeW(coredynp.instruction_length + coredynp.pc_width + pow(2.0,opcode_length)+ 2coredynp.int_data_width)coredynp.num_hthreads; 347* /* pipe stage EXE/MEM / 348* num_piperegs += coredynp.issueW(3 coredynp.arch_ireg_width + pow(2.0,opcode_length) + 82coredynp.int_data_width/+2powers (2,reg_length)/); 349* /* pipe stage MEM/WB the 2^opcode_length means the total decoded signal for the opcode/ 350* num_piperegs += coredynp.issueW(2coredynp.int_data_width + pow(2.0,opcode_length) + 82coredynp.int_data_width/+2powers (2,reg_length)/); 351// / pipe stage 5/6 / 352// num_piperegs += issueWidth(data_width + powers (2,opcode_length)/+2powers (2,reg_length)/); 353// / pipe stage 6/7 / 354// num_piperegs += issueWidth(data_width + powers (2,opcode_length)/+2powers (2,reg_length)/); 355// / pipe stage 7/8 / 356// num_piperegs += issueWidth(data_width + powers (2,opcode_length)/*2powers (2,reg_length)/); 357// / assume 50% extra in control signals (rule of thumb) / 358* num_stages=6;	393 if (!is_core_pipeline) { 394 //The number of pipeline stages are calculated based on the achievable 395 //throughput and required throughput 396 num_piperegs = l_ip.pipeline_stages * l_ip.per_stage_vector; 397 } else { 398 if (coredynp.core_ty == 
Inorder) { 399 /* assume 6 pipe stages and try to estimate bits per pipe stage / 400* /* pipe stage 0/IF / 401* num_piperegs += coredynp.pc_width * 2 * coredynp.num_hthreads; 402 /* pipe stage IF/ID / 403* num_piperegs += coredynp.fetchW * 404 (coredynp.instruction_length + coredynp.pc_width) * 405 coredynp.num_hthreads; 406 /* pipe stage IF/ThreadSEL / 407* if (coredynp.multithreaded) { 408 num_piperegs += coredynp.num_hthreads * 409 coredynp.perThreadState; //8 bit thread states 410 } 411 /* pipe stage ID/EXE / 412* num_piperegs += coredynp.decodeW * 413 (coredynp.instruction_length + coredynp.pc_width + 414 pow(2.0, opcode_length) + 2 * coredynp.int_data_width) * 415 coredynp.num_hthreads; 416 /* pipe stage EXE/MEM / 417* num_piperegs += coredynp.issueW * 418 (3 * coredynp.arch_ireg_width + pow(2.0, opcode_length) + 8 * 419 2 * coredynp.int_data_width/+2powers (2,reg_length)/); 420* /* pipe stage MEM/WB the 2^opcode_length means the total decoded signal for the opcode/ 421* num_piperegs += coredynp.issueW * 422 (2 * coredynp.int_data_width + pow(2.0, opcode_length) + 8 * 423 2 * coredynp.int_data_width/+2powers (2,reg_length)/); 424* num_stages = 6; 425 } else { 426 /* assume 12 stage pipe stages and try to estimate bits per pipe stage / 427* /OOO: Fetch, decode, rename, IssueQ, dispatch, regread, EXE, MEM, WB, CM /
359	428
360 } 361 else 362 { 363 /* assume 12 stage pipe stages and try to estimate bits per pipe stage / 364* /OOO: Fetch, decode, rename, IssueQ, dispatch, regread, EXE, MEM, WB, CM /	429 /* pipe stage 0/1F/ 430* num_piperegs += 431 coredynp.pc_width * 2 * coredynp.num_hthreads ;//PC and Next PC 432 /* pipe stage IF/ID / 433* num_piperegs += coredynp.fetchW * 434 (coredynp.instruction_length + coredynp.pc_width) * 435 coredynp.num_hthreads;//PC is used to feed branch predictor in ID 436 /* pipe stage 1D/Renaming/ 437* num_piperegs += coredynp.decodeW * 438 (coredynp.instruction_length + coredynp.pc_width) * 439 coredynp.num_hthreads;//PC is for branch exe in later stage. 440 /* pipe stage Renaming/wire_drive / 441* num_piperegs += coredynp.decodeW * 442 (coredynp.instruction_length + coredynp.pc_width); 443 /* pipe stage Renaming/IssueQ / 444* //3coredynp.phy_ireg_width means 2 sources and 1 dest 445* num_piperegs += coredynp.issueW * 446 (coredynp.instruction_length + coredynp.pc_width + 3 * 447 coredynp.phy_ireg_width) * coredynp.num_hthreads; 448 /* pipe stage IssueQ/Dispatch / 449* num_piperegs += coredynp.issueW * 450 (coredynp.instruction_length + 3 * coredynp.phy_ireg_width); 451 /* pipe stage Dispatch/EXE */
365	452
366 /* pipe stage 0/1F/ 367* num_piperegs += coredynp.pc_width2coredynp.num_hthreads ;//PC and Next PC 368 /* pipe stage IF/ID / 369* num_piperegs += coredynp.fetchW(coredynp.instruction_length + coredynp.pc_width)coredynp.num_hthreads;//PC is used to feed branch predictor in ID 370 /* pipe stage 1D/Renaming/ 371* num_piperegs += coredynp.decodeW(coredynp.instruction_length + coredynp.pc_width)coredynp.num_hthreads;//PC is for branch exe in later stage. 372 /* pipe stage Renaming/wire_drive / 373* num_piperegs += coredynp.decodeW(coredynp.instruction_length + coredynp.pc_width); 374* /* pipe stage Renaming/IssueQ / 375* num_piperegs += coredynp.issueW(coredynp.instruction_length + coredynp.pc_width + 3coredynp.phy_ireg_width)coredynp.num_hthreads;//3coredynp.phy_ireg_width means 2 sources and 1 dest 376 /* pipe stage IssueQ/Dispatch / 377* num_piperegs += coredynp.issueW(coredynp.instruction_length + 3 coredynp.phy_ireg_width); 378 /* pipe stage Dispatch/EXE */	453 num_piperegs += coredynp.issueW * 454 (3 * coredynp.phy_ireg_width + coredynp.pc_width + 455 pow(2.0, opcode_length)/+2powers (2,reg_length)/); 456* /* 2^opcode_length means the total decoded signal for the opcode/ 457* num_piperegs += coredynp.issueW * 458 (2 * coredynp.int_data_width + pow(2.0, opcode_length) 459 /+2powers (2,reg_length)/); 460* /2 source operands in EXE; Assume 2EXE stages since we do not really distinguish OP/ 461* num_piperegs += coredynp.issueW * 462 (2 * coredynp.int_data_width + pow(2.0, opcode_length) 463 /+2powers (2,reg_length)/); 464* /* pipe stage EXE/MEM, data need to be read/write, address/ 465* //memory Opcode still need to be passed 466 num_piperegs += coredynp.issueW * 467 (coredynp.int_data_width + coredynp.v_address_width + 468 pow(2.0, opcode_length)/+2powers (2,reg_length)/); 469* /* pipe stage MEM/WB; result data, writeback regs / 470* num_piperegs += coredynp.issueW * 471 (coredynp.int_data_width + coredynp.phy_ireg_width 472 /* powers (2,opcode_length) + 473 
(2,opcode_length)+2powers (2,reg_length)/); 474 /* pipe stage WB/CM ; result data, regs need to be updated, address for resolve memory ops in ROB's top/ 475* num_piperegs += coredynp.commitW * 476 (coredynp.int_data_width + coredynp.v_address_width + 477 coredynp.phy_ireg_width 478 /+ powers (2,opcode_length)2powers (2,reg_length)/) * 479 coredynp.num_hthreads; 480 num_stages = 12;
379	481
380 num_piperegs += coredynp.issueW(3 coredynp.phy_ireg_width + coredynp.pc_width + pow(2.0,opcode_length)/+2powers (2,reg_length)/); 381* /* 2^opcode_length means the total decoded signal for the opcode/ 382* num_piperegs += coredynp.issueW(2coredynp.int_data_width + pow(2.0,opcode_length)/+2powers (2,reg_length)/); 383* /2 source operands in EXE; Assume 2EXE stages since we do not really distinguish OP/ 384* num_piperegs += coredynp.issueW(2coredynp.int_data_width + pow(2.0,opcode_length)/+2powers (2,reg_length)/); 385* /* pipe stage EXE/MEM, data need to be read/write, address/ 386* num_piperegs += coredynp.issueW(coredynp.int_data_width + coredynp.v_address_width + pow(2.0,opcode_length)/+2powers (2,reg_length)/);//memory Opcode still need to be passed 387 /* pipe stage MEM/WB; result data, writeback regs / 388* num_piperegs += coredynp.issueW(coredynp.int_data_width + coredynp.phy_ireg_width / powers (2,opcode_length) + (2,opcode_length)+2powers (2,reg_length)/); 389 /* pipe stage WB/CM ; result data, regs need to be updated, address for resolve memory ops in ROB's top/ 390* num_piperegs += coredynp.commitW(coredynp.int_data_width + coredynp.v_address_width + coredynp.phy_ireg_width/+ powers (2,opcode_length)2powers (2,reg_length)/)coredynp.num_hthreads; 391// if (multithreaded) 392// { 393// 394// } 395 num_stages=12; 396
397 } 398 399 /* assume 50% extra in control registers and interrupt registers (rule of thumb) / 400* num_piperegs = num_piperegs * 1.5;	482 } 483 484 /* assume 50% extra in control registers and interrupt registers (rule of thumb) / 485* num_piperegs = num_piperegs * 1.5;
401 tot_stage_vector=num_piperegs; 402 per_stage_vector=tot_stage_vector/num_stages;	486 tot_stage_vector = num_piperegs; 487 per_stage_vector = tot_stage_vector / num_stages;
403	488
404 if (coredynp.core_ty==Inorder) 405 { 406 if (coredynp.pipeline_stages>6) 407 num_piperegs= per_stage_vector*coredynp.pipeline_stages;	489 if (coredynp.core_ty == Inorder) { 490 if (coredynp.pipeline_stages > 6) 491 num_piperegs = per_stage_vector * coredynp.pipeline_stages; 492 } else { //OOO 493 if (coredynp.pipeline_stages > 12) 494 num_piperegs = per_stage_vector * coredynp.pipeline_stages;
408 }	495 }
409 else//OOO 410 { 411 if (coredynp.pipeline_stages>12) 412 num_piperegs= per_stage_vectorcoredynp.pipeline_stages; 413* } 414 }	496 }
415 416} 417	497 498} 499
418FunctionalUnit::FunctionalUnit(ParseXML XML_interface, int ithCore_, InputParameter interface_ip_,const CoreDynParam & dyn_p_, enum FU_type fu_type_) 419:XML(XML_interface), 420 ithCore(ithCore_), 421 interface_ip(interface_ip_), 422* coredynp(dyn_p_), 423 fu_type(fu_type_) 424{ 425 double area_t;//, leakage, gate_leakage;	500FunctionalUnit::FunctionalUnit(XMLNode* _xml_data, 501 InputParameter* interface_ip_, 502 const CoreParameters & _core_params, 503 const CoreStatistics & _core_stats, 504 enum FU_type fu_type_) 505 : McPATComponent(_xml_data), 506 interface_ip(interface_ip_), core_params(_core_params), 507* core_stats(_core_stats), fu_type(fu_type_) { 508 double area_t; 509 double leakage; 510 double gate_leakage;
426 double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();	511 double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
427 clockRate = coredynp.clockRate; 428 executionTime = coredynp.executionTime;	512 clockRate = core_params.clockRate;
429	513
430 //XML_interface=_XML_interface; 431 uca_org_t result2; 432 result2 = init_interface(&interface_ip); 433 if (XML->sys.Embedded) 434 { 435 if (fu_type == FPU) 436 { 437 num_fu=coredynp.num_fpus;	514 uca_org_t result2; 515 // Temp name for the following function call 516 name = "Functional Unit"; 517 518 result2 = init_interface(&interface_ip, name); 519 520 if (core_params.Embedded) { 521 if (fu_type == FPU) { 522 num_fu=core_params.num_fpus;
438 //area_t = 8.471e6g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 439 area_t = 4.471e6(g_ip->F_sz_nmg_ip->F_sz_nm/90.0/90.0);//this is um^2 The base number 440* //4.47 contains both VFP and NEON processing unit, VFP is about 40% and NEON is about 60% 441 if (g_ip->F_sz_nm>90) 442 area_t = 4.471e6g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 443 leakage = area_t (g_tp.scaling_factor.core_tx_density)cmos_Isub_leakage(5g_tp.min_w_nmos_, 5g_tp.min_w_nmos_pmos_to_nmos_sizing_r, 1, inv)g_tp.peri_global.Vdd/2;//unit W 444 gate_leakage = area_t (g_tp.scaling_factor.core_tx_density)cmos_Ig_leakage(5g_tp.min_w_nmos_, 5g_tp.min_w_nmos_pmos_to_nmos_sizing_r, 1, inv)g_tp.peri_global.Vdd/2;//unit W 445 //energy = 0.3529/101e-9;//this is the energy(nJ) for a FP instruction in FPU usually it can have up to 20 cycles. 446// base_energy = coredynp.core_ty==Inorder? 0: 89e-33; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch) 447// base_energy =(g_tp.peri_global.Vddg_tp.peri_global.Vdd/1.2/1.2); 448 base_energy = 0; 449 per_access_energy = 1.15/1e9/4/1.3/1.3g_tp.peri_global.Vddg_tp.peri_global.Vdd(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vddg_tp.peri_global.Vdd/1.2/1.2);//0.006491e-9; //This is per Hz energy(nJ) 450* //FPU power from Sandia's processor sizing tech report 451 FU_height=(18667num_fu)interface_ip.F_sz_um;//FPU from Sun's data	523 //area_t = 8.471e6g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 524 area_t = 4.471e6(g_ip->F_sz_nmg_ip->F_sz_nm/90.0/90.0);//this is um^2 The base number 525* //4.47 contains both VFP and NEON processing unit, VFP is about 40% and NEON is about 60% 526 if (g_ip->F_sz_nm>90) 527 area_t = 4.471e6g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 528 leakage = area_t (g_tp.scaling_factor.core_tx_density)cmos_Isub_leakage(5g_tp.min_w_nmos_, 5g_tp.min_w_nmos_pmos_to_nmos_sizing_r, 1, inv)g_tp.peri_global.Vdd/2;//unit W 529 gate_leakage = area_t 
(g_tp.scaling_factor.core_tx_density)cmos_Ig_leakage(5g_tp.min_w_nmos_, 5g_tp.min_w_nmos_pmos_to_nmos_sizing_r, 1, inv)g_tp.peri_global.Vdd/2;//unit W 530 //energy = 0.3529/101e-9;//this is the energy(nJ) for a FP instruction in FPU usually it can have up to 20 cycles. 531// base_energy = coredynp.core_ty==Inorder? 0: 89e-33; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch) 532// base_energy =(g_tp.peri_global.Vddg_tp.peri_global.Vdd/1.2/1.2); 533 base_energy = 0; 534 per_access_energy = 1.15/1e9/4/1.3/1.3g_tp.peri_global.Vddg_tp.peri_global.Vdd(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vddg_tp.peri_global.Vdd/1.2/1.2);//0.006491e-9; //This is per Hz energy(nJ) 535* //FPU power from Sandia's processor sizing tech report 536 FU_height=(18667num_fu)interface_ip.F_sz_um;//FPU from Sun's data
452 } 453 else if (fu_type == ALU) 454 { 455 num_fu=coredynp.num_alus;	537 } else if (fu_type == ALU) { 538 num_fu=core_params.num_alus;
456 area_t = 280260g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl 457 leakage = area_t (g_tp.scaling_factor.core_tx_density)cmos_Isub_leakage(20g_tp.min_w_nmos_, 20g_tp.min_w_nmos_pmos_to_nmos_sizing_r, 1, inv)g_tp.peri_global.Vdd/2;//unit W 458 gate_leakage = area_t(g_tp.scaling_factor.core_tx_density)cmos_Ig_leakage(20g_tp.min_w_nmos_, 20g_tp.min_w_nmos_pmos_to_nmos_sizing_r, 1, inv)g_tp.peri_global.Vdd/2; 459// base_energy = coredynp.core_ty==Inorder? 0:89e-3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch) 460// base_energy =(g_tp.peri_global.Vddg_tp.peri_global.Vdd/1.2/1.2); 461 base_energy = 0; 462 per_access_energy = 1.15/3/1e9/4/1.3/1.3g_tp.peri_global.Vddg_tp.peri_global.Vdd(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vddg_tp.peri_global.Vdd/1.2/1.2);//0.006491e-9; //This is per cycle energy(nJ) 463* FU_height=(6222num_fu)interface_ip.F_sz_um;//integer ALU 464	539 area_t = 280260g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl 540 leakage = area_t (g_tp.scaling_factor.core_tx_density)cmos_Isub_leakage(20g_tp.min_w_nmos_, 20g_tp.min_w_nmos_pmos_to_nmos_sizing_r, 1, inv)g_tp.peri_global.Vdd/2;//unit W 541 gate_leakage = area_t(g_tp.scaling_factor.core_tx_density)cmos_Ig_leakage(20g_tp.min_w_nmos_, 20g_tp.min_w_nmos_pmos_to_nmos_sizing_r, 1, inv)g_tp.peri_global.Vdd/2; 542// base_energy = coredynp.core_ty==Inorder? 0:89e-3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch) 543// base_energy =(g_tp.peri_global.Vddg_tp.peri_global.Vdd/1.2/1.2); 544 base_energy = 0; 545 per_access_energy = 1.15/3/1e9/4/1.3/1.3g_tp.peri_global.Vddg_tp.peri_global.Vdd(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vddg_tp.peri_global.Vdd/1.2/1.2);//0.006491e-9; //This is per cycle energy(nJ) 546* FU_height=(6222num_fu)interface_ip.F_sz_um;//integer ALU 547
465 } 466 else if (fu_type == MUL) 467 { 468 num_fu=coredynp.num_muls;	548 } else if (fu_type == MUL) { 549 num_fu=core_params.num_muls;
469 area_t = 2802603g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl 470* leakage = area_t (g_tp.scaling_factor.core_tx_density)cmos_Isub_leakage(20g_tp.min_w_nmos_, 20g_tp.min_w_nmos_pmos_to_nmos_sizing_r, 1, inv)g_tp.peri_global.Vdd/2;//unit W 471 gate_leakage = area_t(g_tp.scaling_factor.core_tx_density)cmos_Ig_leakage(20g_tp.min_w_nmos_, 20g_tp.min_w_nmos_pmos_to_nmos_sizing_r, 1, inv)g_tp.peri_global.Vdd/2; 472// base_energy = coredynp.core_ty==Inorder? 0:89e-32; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch) 473// base_energy =(g_tp.peri_global.Vddg_tp.peri_global.Vdd/1.2/1.2); 474* base_energy = 0; 475 per_access_energy = 1.152/3/1e9/4/1.3/1.3g_tp.peri_global.Vddg_tp.peri_global.Vdd(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vddg_tp.peri_global.Vdd/1.2/1.2);//0.006491e-9; //This is per cycle energy(nJ), coefficient based on Wattch 476 FU_height=(9334num_fu )interface_ip.F_sz_um;//divider/mul from Sun's data	550 area_t = 2802603g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl 551* leakage = area_t (g_tp.scaling_factor.core_tx_density)cmos_Isub_leakage(20g_tp.min_w_nmos_, 20g_tp.min_w_nmos_pmos_to_nmos_sizing_r, 1, inv)g_tp.peri_global.Vdd/2;//unit W 552 gate_leakage = area_t(g_tp.scaling_factor.core_tx_density)cmos_Ig_leakage(20g_tp.min_w_nmos_, 20g_tp.min_w_nmos_pmos_to_nmos_sizing_r, 1, inv)g_tp.peri_global.Vdd/2; 553// base_energy = coredynp.core_ty==Inorder? 0:89e-32; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch) 554// base_energy =(g_tp.peri_global.Vddg_tp.peri_global.Vdd/1.2/1.2); 555* base_energy = 0; 556 per_access_energy = 1.152/3/1e9/4/1.3/1.3g_tp.peri_global.Vddg_tp.peri_global.Vdd(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vddg_tp.peri_global.Vdd/1.2/1.2);//0.006491e-9; //This is per cycle energy(nJ), coefficient based on Wattch 557 FU_height=(9334num_fu )interface_ip.F_sz_um;//divider/mul from Sun's data
477 } 478 else 479 {	558 } else {
480 cout<<"Unknown Functional Unit Type"<<endl; 481 exit(0); 482 } 483 per_access_energy *=0.5;//According to ARM data embedded processor has much lower per acc energy	559 cout<<"Unknown Functional Unit Type"<<endl; 560 exit(0); 561 } 562 per_access_energy *=0.5;//According to ARM data embedded processor has much lower per acc energy
	563 } else { 564 if (fu_type == FPU) { 565 name = "Floating Point Unit(s)"; 566 num_fu = core_params.num_fpus; 567 area_t = 8.47 * 1e6 * (g_ip->F_sz_nm * g_ip->F_sz_nm / 90.0 / 568 90.0);//this is um^2 569 if (g_ip->F_sz_nm > 90) 570 area_t = 8.47 * 1e6 * 571 g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 572 leakage = area_t (g_tp.scaling_factor.core_tx_density)cmos_Isub_leakage(5g_tp.min_w_nmos_, 5g_tp.min_w_nmos_pmos_to_nmos_sizing_r, 1, inv)g_tp.peri_global.Vdd/2;//unit W 573 gate_leakage = area_t (g_tp.scaling_factor.core_tx_density)cmos_Ig_leakage(5g_tp.min_w_nmos_, 5g_tp.min_w_nmos_pmos_to_nmos_sizing_r, 1, inv)g_tp.peri_global.Vdd/2;//unit W 574 //W The base energy of ALU average numbers from Intel 4G and 575 //773Mhz (Wattch) 576 base_energy = core_params.core_ty == Inorder ? 0 : 89e-3 * 3; 577 base_energy = (g_tp.peri_global.Vdd g_tp.peri_global.Vdd / 1.2 / 578 1.2); 579 per_access_energy = 1.153/1e9/4/1.3/1.3g_tp.peri_global.Vddg_tp.peri_global.Vdd(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vddg_tp.peri_global.Vdd/1.2/1.2);//0.006491e-9; //This is per op energy(nJ) 580 FU_height=(38667num_fu)interface_ip.F_sz_um;//FPU from Sun's data 581 } else if (fu_type == ALU) { 582 name = "Integer ALU(s)"; 583 num_fu = core_params.num_alus; 584 //this is um^2 ALU + MUl 585 area_t = 280 * 260 * 2 * g_tp.scaling_factor.logic_scaling_co_eff; 586 leakage = area_t (g_tp.scaling_factor.core_tx_density)cmos_Isub_leakage(20g_tp.min_w_nmos_, 20g_tp.min_w_nmos_pmos_to_nmos_sizing_r, 1, inv)g_tp.peri_global.Vdd/2;//unit W 587 gate_leakage = area_t(g_tp.scaling_factor.core_tx_density)cmos_Ig_leakage(20g_tp.min_w_nmos_, 20g_tp.min_w_nmos_pmos_to_nmos_sizing_r, 1, inv)g_tp.peri_global.Vdd/2; 588 //W The base energy of ALU average numbers from Intel 4G and 773Mhz 589 //(Wattch) 590 base_energy = core_params.core_ty == Inorder ? 
0 : 89e-3; 591 base_energy = (g_tp.peri_global.Vdd g_tp.peri_global.Vdd / 1.2 / 592 1.2); 593 per_access_energy = 1.15/1e9/4/1.3/1.3g_tp.peri_global.Vddg_tp.peri_global.Vdd(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vddg_tp.peri_global.Vdd/1.2/1.2);//0.006491e-9; //This is per cycle energy(nJ) 594* FU_height=(6222num_fu)interface_ip.F_sz_um;//integer ALU 595 } else if (fu_type == MUL) { 596 name = "Multiply/Divide Unit(s)"; 597 num_fu = core_params.num_muls; 598 //this is um^2 ALU + MUl 599 area_t = 280 * 260 * 2 * 3 * 600 g_tp.scaling_factor.logic_scaling_co_eff; 601 leakage = area_t (g_tp.scaling_factor.core_tx_density)cmos_Isub_leakage(20g_tp.min_w_nmos_, 20g_tp.min_w_nmos_pmos_to_nmos_sizing_r, 1, inv)g_tp.peri_global.Vdd/2;//unit W 602 gate_leakage = area_t(g_tp.scaling_factor.core_tx_density)cmos_Ig_leakage(20g_tp.min_w_nmos_, 20g_tp.min_w_nmos_pmos_to_nmos_sizing_r, 1, inv)g_tp.peri_global.Vdd/2; 603 //W The base energy of ALU average numbers from Intel 4G and 773Mhz 604 //(Wattch) 605 base_energy = core_params.core_ty == Inorder ? 0 : 89e-3 * 2; 606 base_energy = (g_tp.peri_global.Vdd g_tp.peri_global.Vdd / 1.2 / 607 1.2); 608 per_access_energy = 1.152/1e9/4/1.3/1.3g_tp.peri_global.Vddg_tp.peri_global.Vdd(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vddg_tp.peri_global.Vdd/1.2/1.2);//0.006491e-9; //This is per cycle energy(nJ), coefficient based on Wattch 609 FU_height=(9334num_fu )interface_ip.F_sz_um;//divider/mul from Sun's data 610 } else { 611 cout << "Unknown Functional Unit Type" << endl; 612 exit(0);
484 }	613 }
485 else 486 { 487 if (fu_type == FPU) 488 { 489 num_fu=coredynp.num_fpus; 490 //area_t = 8.471e6g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 491 area_t = 8.471e6(g_ip->F_sz_nmg_ip->F_sz_nm/90.0/90.0);//this is um^2 492* if (g_ip->F_sz_nm>90) 493 area_t = 8.471e6g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 494 leakage = area_t (g_tp.scaling_factor.core_tx_density)cmos_Isub_leakage(5g_tp.min_w_nmos_, 5g_tp.min_w_nmos_pmos_to_nmos_sizing_r, 1, inv)g_tp.peri_global.Vdd/2;//unit W 495 gate_leakage = area_t (g_tp.scaling_factor.core_tx_density)cmos_Ig_leakage(5g_tp.min_w_nmos_, 5g_tp.min_w_nmos_pmos_to_nmos_sizing_r, 1, inv)g_tp.peri_global.Vdd/2;//unit W 496 //energy = 0.3529/101e-9;//this is the energy(nJ) for a FP instruction in FPU usually it can have up to 20 cycles. 497* base_energy = coredynp.core_ty==Inorder? 0: 89e-33; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch) 498* base_energy =(g_tp.peri_global.Vddg_tp.peri_global.Vdd/1.2/1.2); 499 per_access_energy = 1.153/1e9/4/1.3/1.3g_tp.peri_global.Vddg_tp.peri_global.Vdd(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vddg_tp.peri_global.Vdd/1.2/1.2);//0.006491e-9; //This is per op energy(nJ) 500 FU_height=(38667num_fu)interface_ip.F_sz_um;//FPU from Sun's data 501 } 502 else if (fu_type == ALU) 503 { 504 num_fu=coredynp.num_alus; 505 area_t = 2802602g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl 506* leakage = area_t (g_tp.scaling_factor.core_tx_density)cmos_Isub_leakage(20g_tp.min_w_nmos_, 20g_tp.min_w_nmos_pmos_to_nmos_sizing_r, 1, inv)g_tp.peri_global.Vdd/2;//unit W 507 gate_leakage = area_t(g_tp.scaling_factor.core_tx_density)cmos_Ig_leakage(20g_tp.min_w_nmos_, 20g_tp.min_w_nmos_pmos_to_nmos_sizing_r, 1, inv)g_tp.peri_global.Vdd/2; 508 base_energy = coredynp.core_ty==Inorder? 
0:89e-3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch) 509 base_energy =(g_tp.peri_global.Vddg_tp.peri_global.Vdd/1.2/1.2); 510 per_access_energy = 1.15/1e9/4/1.3/1.3g_tp.peri_global.Vddg_tp.peri_global.Vdd(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vddg_tp.peri_global.Vdd/1.2/1.2);//0.006491e-9; //This is per cycle energy(nJ) 511* FU_height=(6222num_fu)interface_ip.F_sz_um;//integer ALU	614 }
512	615
513 } 514 else if (fu_type == MUL) 515 { 516 num_fu=coredynp.num_muls; 517 area_t = 28026023g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl 518 leakage = area_t (g_tp.scaling_factor.core_tx_density)cmos_Isub_leakage(20g_tp.min_w_nmos_, 20g_tp.min_w_nmos_pmos_to_nmos_sizing_r, 1, inv)g_tp.peri_global.Vdd/2;//unit W 519 gate_leakage = area_t(g_tp.scaling_factor.core_tx_density)cmos_Ig_leakage(20g_tp.min_w_nmos_, 20g_tp.min_w_nmos_pmos_to_nmos_sizing_r, 1, inv)g_tp.peri_global.Vdd/2; 520 base_energy = coredynp.core_ty==Inorder? 0:89e-32; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch) 521* base_energy =(g_tp.peri_global.Vddg_tp.peri_global.Vdd/1.2/1.2); 522 per_access_energy = 1.152/1e9/4/1.3/1.3g_tp.peri_global.Vddg_tp.peri_global.Vdd(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vddg_tp.peri_global.Vdd/1.2/1.2);//0.006491e-9; //This is per cycle energy(nJ), coefficient based on Wattch 523 FU_height=(9334num_fu )interface_ip.F_sz_um;//divider/mul from Sun's data 524 } 525 else 526 { 527 cout<<"Unknown Functional Unit Type"<<endl; 528 exit(0); 529 } 530 } 531 //IEXEU, simple ALU and FPU 532 // double C_ALU, C_EXEU, C_FPU; //Lum Equivalent capacitance of IEXEU and FPU. Based on Intel and Sun 90nm process fabracation. 533 // 534 // C_ALU = 0.025e-9;//F 535 // C_EXEU = 0.05e-9; //F 536 // C_FPU = 0.35e-9;//F
537 area.set_area(area_t*num_fu);	616 area.set_area(area_t*num_fu);
538 leakage = num_fu; 539* gate_leakage =num_fu; 540* double macro_layout_overhead = g_tp.macro_layout_overhead; 541// if (!XML->sys.Embedded) 542 area.set_area(area.get_area()*macro_layout_overhead);	617 power.readOp.leakage = leakage * num_fu; 618 power.readOp.gate_leakage = gate_leakage * num_fu; 619 620 double long_channel_device_reduction = 621 longer_channel_device_reduction(Core_device, core_params.core_ty); 622 power.readOp.longer_channel_leakage = 623 power.readOp.leakage * long_channel_device_reduction; 624 double macro_layout_overhead = g_tp.macro_layout_overhead; 625 area.set_area(area.get_area()*macro_layout_overhead);
543} 544	626} 627
545void FunctionalUnit::computeEnergy(bool is_tdp) 546{ 547 double pppm_t[4] = {1,1,1,1}; 548 double FU_duty_cycle; 549 if (is_tdp) 550 {	628void FunctionalUnit::computeEnergy() { 629 double pppm_t[4] = {1, 1, 1, 1}; 630 double FU_duty_cycle; 631 double sckRation = g_tp.sckt_co_eff;
551	632
	633 // TDP power calculation 634 //2 means two source operands needs to be passed for each int instruction. 635 set_pppm(pppm_t, 2, 2, 2, 2); 636 tdp_stats.readAc.access = num_fu; 637 if (fu_type == FPU) { 638 FU_duty_cycle = core_stats.FPU_duty_cycle; 639 } else if (fu_type == ALU) { 640 FU_duty_cycle = core_stats.ALU_duty_cycle; 641 } else if (fu_type == MUL) { 642 FU_duty_cycle = core_stats.MUL_duty_cycle; 643 }
552	644
553 set_pppm(pppm_t, 2, 2, 2, 2);//2 means two source operands needs to be passed for each int instruction. 554 if (fu_type == FPU) 555 { 556 stats_t.readAc.access = num_fu; 557 tdp_stats = stats_t; 558 FU_duty_cycle = coredynp.FPU_duty_cycle; 559 } 560 else if (fu_type == ALU) 561 { 562 stats_t.readAc.access = 1num_fu; 563* tdp_stats = stats_t; 564 FU_duty_cycle = coredynp.ALU_duty_cycle; 565 } 566 else if (fu_type == MUL) 567 { 568 stats_t.readAc.access = num_fu; 569 tdp_stats = stats_t; 570 FU_duty_cycle = coredynp.MUL_duty_cycle; 571 }	645 power.readOp.dynamic = 646 per_access_energy * tdp_stats.readAc.access + base_energy / clockRate; 647 power.readOp.dynamic = sckRation FU_duty_cycle;
572	648
573 //power.readOp.dynamic = base_energy/clockRate + energystats_t.readAc.access; 574* power.readOp.dynamic = per_access_energystats_t.readAc.access + base_energy/clockRate; 575* double sckRation = g_tp.sckt_co_eff; 576 power.readOp.dynamic = sckRationFU_duty_cycle; 577 power.writeOp.dynamic = sckRation; 578* power.searchOp.dynamic *= sckRation;	649 // Runtime power calculation 650 if (fu_type == FPU) { 651 rtp_stats.readAc.access = core_stats.fpu_accesses; 652 } else if (fu_type == ALU) { 653 rtp_stats.readAc.access = core_stats.ialu_accesses; 654 } else if (fu_type == MUL) { 655 rtp_stats.readAc.access = core_stats.mul_accesses; 656 }
579	657
580 power.readOp.leakage = leakage; 581 power.readOp.gate_leakage = gate_leakage; 582 double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty); 583 power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction;	658 rt_power.readOp.dynamic = per_access_energy * rtp_stats.readAc.access + 659 base_energy * execution_time; 660 rt_power.readOp.dynamic *= sckRation;
584	661
585 } 586 else 587 { 588 if (fu_type == FPU) 589 { 590 stats_t.readAc.access = XML->sys.core[ithCore].fpu_accesses; 591 rtp_stats = stats_t; 592 } 593 else if (fu_type == ALU) 594 { 595 stats_t.readAc.access = XML->sys.core[ithCore].ialu_accesses; 596 rtp_stats = stats_t; 597 } 598 else if (fu_type == MUL) 599 { 600 stats_t.readAc.access = XML->sys.core[ithCore].mul_accesses; 601 rtp_stats = stats_t; 602 } 603 604 //rt_power.readOp.dynamic = base_energyexecutionTime + energystats_t.readAc.access; 605 rt_power.readOp.dynamic = per_access_energystats_t.readAc.access + base_energyexecutionTime; 606 double sckRation = g_tp.sckt_co_eff; 607 rt_power.readOp.dynamic = sckRation; 608* rt_power.writeOp.dynamic = sckRation; 609* rt_power.searchOp.dynamic = sckRation; 610* 611 } 612 613	662 output_data.area = area.get_area() / 1e6; 663 output_data.peak_dynamic_power = power.readOp.dynamic * clockRate; 664 output_data.subthreshold_leakage_power = 665 (longer_channel_device) ? power.readOp.longer_channel_leakage : 666 power.readOp.leakage; 667 output_data.gate_leakage_power = power.readOp.gate_leakage; 668 output_data.runtime_dynamic_energy = rt_power.readOp.dynamic;
614} 615	669} 670
616void FunctionalUnit::displayEnergy(uint32_t indent,int plevel,bool is_tdp) 617{ 618 string indent_str(indent, ' '); 619 string indent_str_next(indent+2, ' '); 620 bool long_channel = XML->sys.longer_channel_device; 621 622// cout << indent_str_next << "Results Broadcast Bus Area = " << bypass->area.get_area() 1e-6 << " mm^2" << endl; 623* if (is_tdp) 624 { 625 if (fu_type == FPU) 626 { 627 cout << indent_str << "Floating Point Units (FPUs) (Count: "<< coredynp.num_fpus <<" ):" << endl; 628 cout << indent_str_next << "Area = " << area.get_area()1e-6 << " mm^2" << endl; 629* cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamicclockRate << " W" << endl; 630// cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage << " W" << endl; 631* cout << indent_str_next<< "Subthreshold Leakage = " 632 << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl; 633 cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl; 634 cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl; 635 cout <<endl; 636 } 637 else if (fu_type == ALU) 638 { 639 cout << indent_str << "Integer ALUs (Count: "<< coredynp.num_alus <<" ):" << endl; 640 cout << indent_str_next << "Area = " << area.get_area()1e-6 << " mm^2" << endl; 641* cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamicclockRate << " W" << endl; 642// cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage << " W" << endl; 643* cout << indent_str_next<< "Subthreshold Leakage = " 644 << (long_channel? 
power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl; 645 cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl; 646 cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl; 647 cout <<endl; 648 } 649 else if (fu_type == MUL) 650 { 651 cout << indent_str << "Complex ALUs (Mul/Div) (Count: "<< coredynp.num_muls <<" ):" << endl; 652 cout << indent_str_next << "Area = " << area.get_area()1e-6 << " mm^2" << endl; 653* cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamicclockRate << " W" << endl; 654// cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage << " W" << endl; 655* cout << indent_str_next<< "Subthreshold Leakage = " 656 << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl; 657 cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl; 658 cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl; 659 cout <<endl; 660 661 } 662 663 } 664 else 665 { 666 } 667 668} 669
670void FunctionalUnit::leakage_feedback(double temperature) 671{ 672 // Update the temperature and initialize the global interfaces. 673 interface_ip.temp = (unsigned int)round(temperature/10.0)10; 674*	671void FunctionalUnit::leakage_feedback(double temperature) 672{ 673 // Update the temperature and initialize the global interfaces. 674 interface_ip.temp = (unsigned int)round(temperature/10.0)10; 675*
675 uca_org_t init_result = init_interface(&interface_ip); // init_result is dummy	676 // init_result is dummy 677 uca_org_t init_result = init_interface(&interface_ip, name);
676 677 // This is part of FunctionalUnit() 678 double area_t, leakage, gate_leakage; 679 double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio(); 680 681 if (fu_type == FPU) 682 { 683 area_t = 4.471e6(g_ip->F_sz_nmg_ip->F_sz_nm/90.0/90.0);//this is um^2 The base number --- 17 unchanged lines hidden* (view full) --- 701 else 702 { 703 cout<<"Unknown Functional Unit Type"<<endl; 704 exit(1); 705 } 706 707 power.readOp.leakage = leakagenum_fu; 708* power.readOp.gate_leakage = gate_leakage*num_fu;	678 679 // This is part of FunctionalUnit() 680 double area_t, leakage, gate_leakage; 681 double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio(); 682 683 if (fu_type == FPU) 684 { 685 area_t = 4.471e6(g_ip->F_sz_nmg_ip->F_sz_nm/90.0/90.0);//this is um^2 The base number --- 17 unchanged lines hidden* (view full) --- 703 else 704 { 705 cout<<"Unknown Functional Unit Type"<<endl; 706 exit(1); 707 } 708 709 power.readOp.leakage = leakagenum_fu; 710* power.readOp.gate_leakage = gate_leakage*num_fu;
709 power.readOp.longer_channel_leakage = longer_channel_device_reduction(Core_device, coredynp.core_ty);	711 power.readOp.longer_channel_leakage = 712 longer_channel_device_reduction(Core_device, core_params.core_ty);
710} 711	713} 714
712UndiffCore::UndiffCore(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam & dyn_p_, bool exist_, bool embedded_) 713:XML(XML_interface), 714 ithCore(ithCore_), 715 interface_ip(interface_ip_), 716* coredynp(dyn_p_), 717 core_ty(coredynp.core_ty), 718 embedded(XML->sys.Embedded), 719 pipeline_stage(coredynp.pipeline_stages), 720 num_hthreads(coredynp.num_hthreads), 721 issue_width(coredynp.issueW), 722 exist(exist_) 723// is_default(_is_default) 724{ 725 if (!exist) return; 726 double undifferentiated_core=0; 727 double core_tx_density=0; 728 double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();	715UndiffCore::UndiffCore(XMLNode* _xml_data, InputParameter* interface_ip_, 716 const CoreParameters & dyn_p_, 717 bool exist_) 718 : McPATComponent(_xml_data), 719 interface_ip(interface_ip_), coredynp(dyn_p_), 720* core_ty(coredynp.core_ty), embedded(coredynp.Embedded), 721 pipeline_stage(coredynp.pipeline_stages), 722 num_hthreads(coredynp.num_hthreads), issue_width(coredynp.issueW), 723 exist(exist_) { 724 if (!exist) return; 725 726 name = "Undifferentiated Core"; 727 clockRate = coredynp.clockRate; 728 729 double undifferentiated_core = 0; 730 double core_tx_density = 0; 731 double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
729 double undifferentiated_core_coe;	732 double undifferentiated_core_coe;
730 //XML_interface=_XML_interface; 731 uca_org_t result2; 732 result2 = init_interface(&interface_ip);	733 uca_org_t result2; 734 result2 = init_interface(&interface_ip, name);
733	735
734 //Compute undifferentiated core area at 90nm. 735 if (embedded==false) 736 { 737 //Based on the results of polynomial/log curve fitting based on undifferentiated core of Niagara, Niagara2, Merom, Penyrn, Prescott, Opteron die measurements 738 if (core_ty==OOO) 739 { 740 //undifferentiated_core = (0.0764pipeline_stagepipeline_stage -2.3685pipeline_stage + 10.405);//OOO 741* undifferentiated_core = (3.57log(pipeline_stage)-1.2643)>0?(3.57log(pipeline_stage)-1.2643):0; 742 } 743 else if (core_ty==Inorder) 744 { 745 //undifferentiated_core = (0.1238pipeline_stage + 7.2572)0.9;//inorder 746 undifferentiated_core = (-2.19log(pipeline_stage)+6.55)>0?(-2.19log(pipeline_stage)+6.55):0; 747 } 748 else 749 { 750 cout<<"invalid core type"<<endl; 751 exit(0); 752 } 753 undifferentiated_core = (1+ logtwo(num_hthreads) 0.0716);	736 //Compute undifferentiated core area at 90nm. 737 if (embedded == false) { 738 //Based on the results of polynomial/log curve fitting based on undifferentiated core of Niagara, Niagara2, Merom, Penyrn, Prescott, Opteron die measurements 739 if (core_ty == OOO) { 740 undifferentiated_core = (3.57 * log(pipeline_stage) - 1.2643) > 0 ? 741 (3.57 * log(pipeline_stage) - 1.2643) : 0; 742 } else if (core_ty == Inorder) { 743 undifferentiated_core = (-2.19 * log(pipeline_stage) + 6.55) > 0 ? 744 (-2.19 * log(pipeline_stage) + 6.55) : 0; 745 } else { 746 cout << "invalid core type" << endl; 747 exit(0);
754 }	748 }
755 else 756 { 757 //Based on the results in paper "parametrized processor models" Sandia Labs 758 if (XML->sys.opt_clockrate)	749 undifferentiated_core = (1 + logtwo(num_hthreads) 0.0716); 750 } else { 751 //Based on the results in paper "parametrized processor models" Sandia Labs 752 if (opt_for_clk)
759 undifferentiated_core_coe = 0.05; 760 else 761 undifferentiated_core_coe = 0;	753 undifferentiated_core_coe = 0.05; 754 else 755 undifferentiated_core_coe = 0;
762 undifferentiated_core = (0.4109* pipeline_stage - 0.776)undifferentiated_core_coe; 763* undifferentiated_core = (1+ logtwo(num_hthreads) 0.0426); 764 }	756 undifferentiated_core = (0.4109 * pipeline_stage - 0.776) * 757 undifferentiated_core_coe; 758 undifferentiated_core = (1 + logtwo(num_hthreads) 0.0426); 759 }
765	760
766 undifferentiated_core = g_tp.scaling_factor.logic_scaling_co_eff1e6;//change from mm^2 to um^2 767 core_tx_density = g_tp.scaling_factor.core_tx_density; 768 //undifferentiated_core = 31e6; 769* //undifferentiated_core = g_tp.scaling_factor.logic_scaling_co_eff;//(g_ip->F_sz_umg_ip->F_sz_um/0.09/0.09); 770* power.readOp.leakage = undifferentiated_core(core_tx_density)cmos_Isub_leakage(5g_tp.min_w_nmos_, 5g_tp.min_w_nmos_pmos_to_nmos_sizing_r, 1, inv)g_tp.peri_global.Vdd;//unit W 771 power.readOp.gate_leakage = undifferentiated_core(core_tx_density)cmos_Ig_leakage(5g_tp.min_w_nmos_, 5g_tp.min_w_nmos_pmos_to_nmos_sizing_r, 1, inv)g_tp.peri_global.Vdd;	761 undifferentiated_core = g_tp.scaling_factor.logic_scaling_co_eff 762 1e6;//change from mm^2 to um^2 763 core_tx_density = g_tp.scaling_factor.core_tx_density; 764 power.readOp.leakage = undifferentiated_core(core_tx_density)cmos_Isub_leakage(5g_tp.min_w_nmos_, 5g_tp.min_w_nmos_pmos_to_nmos_sizing_r, 1, inv)g_tp.peri_global.Vdd;//unit W 765 power.readOp.gate_leakage = undifferentiated_core(core_tx_density)cmos_Ig_leakage(5g_tp.min_w_nmos_, 5g_tp.min_w_nmos_pmos_to_nmos_sizing_r, 1, inv)g_tp.peri_global.Vdd;
772	766
773 double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty); 774 power.readOp.longer_channel_leakage = power.readOp.leakagelong_channel_device_reduction; 775* area.set_area(undifferentiated_core);	767 double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty); 768 power.readOp.longer_channel_leakage = 769 power.readOp.leakage * long_channel_device_reduction; 770 area.set_area(undifferentiated_core);
776	771
777 scktRatio = g_tp.sckt_co_eff; 778 power.readOp.dynamic = scktRatio; 779* power.writeOp.dynamic = scktRatio; 780* power.searchOp.dynamic = scktRatio; 781* macro_PR_overhead = g_tp.macro_layout_overhead; 782 area.set_area(area.get_area()*macro_PR_overhead);	772 scktRatio = g_tp.sckt_co_eff; 773 power.readOp.dynamic = scktRatio; 774* power.writeOp.dynamic = scktRatio; 775* power.searchOp.dynamic = scktRatio; 776* macro_PR_overhead = g_tp.macro_layout_overhead; 777 area.set_area(area.get_area()*macro_PR_overhead);
783	778
784 785 786// double vt=g_tp.peri_global.Vth; 787// double velocity_index=1.1; 788// double c_in=gate_C(g_tp.min_w_nmos_, g_tp.min_w_nmos_pmos_to_nmos_sizing_r , 0.0, false); 789// double c_out= drain_C_(g_tp.min_w_nmos_, NCH, 2, 1, g_tp.cell_h_def, false) + drain_C_(g_tp.min_w_nmos_pmos_to_nmos_sizing_r, PCH, 1, 1, g_tp.cell_h_def, false) + c_in; 790// double w_nmos=g_tp.min_w_nmos_; 791// double w_pmos=g_tp.min_w_nmos_pmos_to_nmos_sizing_r; 792// double i_on_n=1.0; 793// double i_on_p=1.0; 794// double i_on_n_in=1.0; 795// double i_on_p_in=1; 796// double vdd=g_tp.peri_global.Vdd; 797* 798// power.readOp.sc=shortcircuit_simple(vt, velocity_index, c_in, c_out, w_nmos,w_pmos, i_on_n, i_on_p,i_on_n_in, i_on_p_in, vdd); 799// power.readOp.dynamic=c_outvddvdd/2; 800 801// cout<<power.readOp.dynamic << "dynamic" <<endl; 802// cout<<power.readOp.sc << "sc" << endl; 803 804// power.readOp.sc=shortcircuit(vt, velocity_index, c_in, c_out, w_nmos,w_pmos, i_on_n, i_on_p,i_on_n_in, i_on_p_in, vdd); 805// power.readOp.dynamic=c_outvddvdd/2; 806// 807// cout<<power.readOp.dynamic << "dynamic" <<endl; 808// cout<<power.readOp.sc << "sc" << endl; 809 810 811	779 output_data.area = area.get_area() / 1e6; 780 output_data.peak_dynamic_power = power.readOp.dynamic * clockRate; 781 output_data.subthreshold_leakage_power = 782 longer_channel_device ? power.readOp.longer_channel_leakage : 783 power.readOp.leakage; 784 output_data.gate_leakage_power = power.readOp.gate_leakage;
812} 813	785} 786
	787InstructionDecoder::InstructionDecoder(XMLNode* _xml_data, const string _name, 788 bool _is_default, 789 const InputParameter configure_interface, 790* int opcode_length_, int num_decoders_, 791 bool x86_, 792 double clockRate_, 793 enum Device_ty device_ty_, 794 enum Core_type core_ty_) 795 : McPATComponent(_xml_data), is_default(_is_default), 796 opcode_length(opcode_length_), num_decoders(num_decoders_), x86(x86_), 797 device_ty(device_ty_), core_ty(core_ty_) { 798 /* 799 * Instruction decoder is different from n to 2^n decoders 800 * that are commonly used in row decoders in memory arrays. 801 * The RISC instruction decoder is typically a very simple device. 802 * We can decode an instruction by simply 803 * separating the machine word into small parts using wire slices 804 * The RISC instruction decoder can be approximate by the n to 2^n decoders, 805 * although this approximation usually underestimate power since each decoded 806 * instruction normally has more than 1 active signal. 807 * 808 * However, decoding a CISC instruction word is much more difficult 809 * than the RISC case. A CISC decoder is typically set up as a state machine. 810 * The machine reads the opcode field to determine 811 * what type of instruction it is, 812 * and where the other data values are. 813 * The instruction word is read in piece by piece, 814 * and decisions are made at each stage as to 815 * how the remainder of the instruction word will be read. 816 * (sequencer and ROM are usually needed) 817 * An x86 decoder can be even more complex since 818 * it involve both decoding instructions into u-ops and 819 * merge u-ops when doing micro-ops fusion. 820 / 821* name = _name; 822 clockRate = clockRate_; 823 bool is_dram = false; 824 double pmos_to_nmos_sizing_r; 825 double load_nmos_width, load_pmos_width; 826 double C_driver_load, R_wire_load; 827 Area cell;
814	828
815void UndiffCore::displayEnergy(uint32_t indent,int plevel,bool is_tdp) 816{ 817 string indent_str(indent, ' '); 818 string indent_str_next(indent+2, ' '); 819 bool long_channel = XML->sys.longer_channel_device;	829 l_ip = configure_interface; 830* local_result = init_interface(&l_ip, name); 831 cell.h = g_tp.cell_h_def; 832 cell.w = g_tp.cell_h_def;
820	833
821 if (is_tdp) 822 { 823 cout << indent_str << "UndiffCore:" << endl; 824 cout << indent_str_next << "Area = " << area.get_area()1e-6<< " mm^2" << endl; 825* cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamicclockRate << " W" << endl; 826* //cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage <<" W" << endl; 827 cout << indent_str_next<< "Subthreshold Leakage = " 828 << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl; 829 cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl; 830 //cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl; 831 cout <<endl; 832 } 833 else 834 { 835 cout << indent_str << "UndiffCore:" << endl; 836 cout << indent_str_next << "Area = " << area.get_area()1e-6<< " mm^2" << endl; 837* cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamicclockRate << " W" << endl; 838* cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage <<" W" << endl; 839 cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl; 840 //cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl; 841 cout <<endl; 842 }	834 num_decoder_segments = (int)ceil(opcode_length / 18.0); 835 if (opcode_length > 18) opcode_length = 18; 836 num_decoded_signals = (int)pow(2.0, opcode_length); 837 pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio(); 838 load_nmos_width = g_tp.max_w_nmos_ / 2; 839 load_pmos_width = g_tp.max_w_nmos_ * pmos_to_nmos_sizing_r; 840 C_driver_load = 1024 * gate_C(load_nmos_width + load_pmos_width, 0, is_dram); 841 R_wire_load = 3000 * l_ip.F_sz_um * g_tp.wire_outside_mat.R_per_um;
843	842
844}	843 final_dec = new Decoder( 844 num_decoded_signals, 845 false, 846 C_driver_load, 847 R_wire_load, 848 false/is_fa/, 849 false/is_dram/, 850 false/wl_tr/, //to use peri device 851 cell);
845	852
846inst_decoder::inst_decoder( 847 bool _is_default, 848 const InputParameter configure_interface, 849* int opcode_length_, 850 int num_decoders_, 851 bool x86_, 852 enum Device_ty device_ty_, 853 enum Core_type core_ty_) 854:is_default(_is_default), 855 opcode_length(opcode_length_), 856 num_decoders(num_decoders_), 857 x86(x86_), 858 device_ty(device_ty_), 859 core_ty(core_ty_) 860 { 861 /* 862 * Instruction decoder is different from n to 2^n decoders 863 * that are commonly used in row decoders in memory arrays. 864 * The RISC instruction decoder is typically a very simple device. 865 * We can decode an instruction by simply 866 * separating the machine word into small parts using wire slices 867 * The RISC instruction decoder can be approximate by the n to 2^n decoders, 868 * although this approximation usually underestimate power since each decoded 869 * instruction normally has more than 1 active signal. 870 * 871 * However, decoding a CISC instruction word is much more difficult 872 * than the RISC case. A CISC decoder is typically set up as a state machine. 873 * The machine reads the opcode field to determine 874 * what type of instruction it is, 875 * and where the other data values are. 876 * The instruction word is read in piece by piece, 877 * and decisions are made at each stage as to 878 * how the remainder of the instruction word will be read. 879 * (sequencer and ROM are usually needed) 880 * An x86 decoder can be even more complex since 881 * it involve both decoding instructions into u-ops and 882 * merge u-ops when doing micro-ops fusion. 
883 / 884* bool is_dram=false; 885 double pmos_to_nmos_sizing_r; 886 double load_nmos_width, load_pmos_width; 887 double C_driver_load, R_wire_load; 888 Area cell;	853 PredecBlk * predec_blk1 = new PredecBlk( 854 num_decoded_signals, 855 final_dec, 856 0,//Assuming predec and dec are back to back 857 0, 858 1,//Each Predec only drives one final dec 859 false/is_dram/, 860 true); 861 PredecBlk * predec_blk2 = new PredecBlk( 862 num_decoded_signals, 863 final_dec, 864 0,//Assuming predec and dec are back to back 865 0, 866 1,//Each Predec only drives one final dec 867 false/is_dram/, 868 false);
889	869
890 l_ip=configure_interface; 891* local_result = init_interface(&l_ip); 892 cell.h =g_tp.cell_h_def; 893 cell.w =g_tp.cell_h_def;	870 PredecBlkDrv * predec_blk_drv1 = new PredecBlkDrv(0, predec_blk1, false); 871 PredecBlkDrv * predec_blk_drv2 = new PredecBlkDrv(0, predec_blk2, false);
894	872
895 num_decoder_segments = (int)ceil(opcode_length/18.0); 896 if (opcode_length > 18) opcode_length = 18; 897 num_decoded_signals= (int)pow(2.0,opcode_length); 898 pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio(); 899 load_nmos_width=g_tp.max_w_nmos_ /2; 900 load_pmos_width= g_tp.max_w_nmos_ * pmos_to_nmos_sizing_r; 901 C_driver_load = 1024gate_C(load_nmos_width + load_pmos_width, 0, is_dram); //TODO: this number 1024 needs to be revisited 902* R_wire_load = 3000l_ip.F_sz_um g_tp.wire_outside_mat.R_per_um;	873 pre_dec = new Predec(predec_blk_drv1, predec_blk_drv2);
903	874
904 final_dec = new Decoder( 905 num_decoded_signals, 906 false, 907 C_driver_load, 908 R_wire_load, 909 false/is_fa/, 910 false/is_dram/, 911 false/wl_tr/, //to use peri device 912 cell);	875 double area_decoder = final_dec->area.get_area() * num_decoded_signals * 876 num_decoder_segments * num_decoders; 877 //double w_decoder = area_decoder / area.get_h(); 878 double area_pre_dec = (predec_blk_drv1->area.get_area() + 879 predec_blk_drv2->area.get_area() + 880 predec_blk1->area.get_area() + 881 predec_blk2->area.get_area()) * 882 num_decoder_segments * num_decoders; 883 area.set_area(area.get_area() + area_decoder + area_pre_dec); 884 double macro_layout_overhead = g_tp.macro_layout_overhead; 885 double chip_PR_overhead = g_tp.chip_layout_overhead; 886 area.set_area(area.get_area()macro_layout_overheadchip_PR_overhead);
913	887
914 PredecBlk * predec_blk1 = new PredecBlk( 915 num_decoded_signals, 916 final_dec, 917 0,//Assuming predec and dec are back to back 918 0, 919 1,//Each Predec only drives one final dec 920 false/is_dram/, 921 true); 922 PredecBlk * predec_blk2 = new PredecBlk( 923 num_decoded_signals, 924 final_dec, 925 0,//Assuming predec and dec are back to back 926 0, 927 1,//Each Predec only drives one final dec 928 false/is_dram/, 929 false);	888 inst_decoder_delay_power();
930	889
931 PredecBlkDrv * predec_blk_drv1 = new PredecBlkDrv(0, predec_blk1, false); 932 PredecBlkDrv * predec_blk_drv2 = new PredecBlkDrv(0, predec_blk2, false);	890 double sckRation = g_tp.sckt_co_eff; 891 power.readOp.dynamic = sckRation; 892* power.writeOp.dynamic = sckRation; 893* power.searchOp.dynamic *= sckRation;
933	894
934 pre_dec = new Predec(predec_blk_drv1, predec_blk_drv2);	895 double long_channel_device_reduction = 896 longer_channel_device_reduction(device_ty, core_ty); 897 power.readOp.longer_channel_leakage = power.readOp.leakage * 898 long_channel_device_reduction;
935	899
936 double area_decoder = final_dec->area.get_area() * num_decoded_signals * num_decoder_segmentsnum_decoders; 937* //double w_decoder = area_decoder / area.get_h(); 938 double area_pre_dec = (predec_blk_drv1->area.get_area() + 939 predec_blk_drv2->area.get_area() + 940 predec_blk1->area.get_area() + 941 predec_blk2->area.get_area())* 942 num_decoder_segmentsnum_decoders; 943* area.set_area(area.get_area()+ area_decoder + area_pre_dec); 944 double macro_layout_overhead = g_tp.macro_layout_overhead; 945 double chip_PR_overhead = g_tp.chip_layout_overhead; 946 area.set_area(area.get_area()macro_layout_overheadchip_PR_overhead); 947 948 inst_decoder_delay_power(); 949 950 double sckRation = g_tp.sckt_co_eff; 951 power.readOp.dynamic = sckRation; 952* power.writeOp.dynamic = sckRation; 953* power.searchOp.dynamic = sckRation; 954* 955 double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty); 956 power.readOp.longer_channel_leakage = power.readOp.leakagelong_channel_device_reduction; 957*	900 output_data.area = area.get_area() / 1e6; 901 output_data.peak_dynamic_power = power.readOp.dynamic * clockRate; 902 output_data.subthreshold_leakage_power = power.readOp.leakage; 903 output_data.gate_leakage_power = power.readOp.gate_leakage;
958} 959	904} 905
960void inst_decoder::inst_decoder_delay_power() 961{	906void InstructionDecoder::inst_decoder_delay_power() {
962	907
963 double dec_outrisetime; 964 double inrisetime=0, outrisetime; 965 double pppm_t[4] = {1,1,1,1}; 966 double squencer_passes = x86?2:1;	908 double dec_outrisetime; 909 double inrisetime = 0, outrisetime; 910 double pppm_t[4] = {1, 1, 1, 1}; 911 double squencer_passes = x86 ? 2 : 1;
967	912
968 outrisetime = pre_dec->compute_delays(inrisetime); 969 dec_outrisetime = final_dec->compute_delays(outrisetime); 970 set_pppm(pppm_t, squencer_passesnum_decoder_segments, num_decoder_segments, squencer_passesnum_decoder_segments, num_decoder_segments); 971 power = power + pre_dec->power*pppm_t;	913 outrisetime = pre_dec->compute_delays(inrisetime); 914 dec_outrisetime = final_dec->compute_delays(outrisetime); 915 set_pppm(pppm_t, squencer_passesnum_decoder_segments, num_decoder_segments, squencer_passesnum_decoder_segments, num_decoder_segments); 916 power = power + pre_dec->power * pppm_t;
972 set_pppm(pppm_t, squencer_passesnum_decoder_segments, num_decoder_segmentsnum_decoded_signals,	917 set_pppm(pppm_t, squencer_passesnum_decoder_segments, num_decoder_segmentsnum_decoded_signals,
973 num_decoder_segmentsnum_decoded_signals, squencer_passesnum_decoder_segments); 974 power = power + final_dec->power*pppm_t;	918 num_decoder_segmentsnum_decoded_signals, squencer_passesnum_decoder_segments); 919 power = power + final_dec->power * pppm_t;
975}	920}
976void inst_decoder::leakage_feedback(double temperature) 977{	921 922void InstructionDecoder::leakage_feedback(double temperature) {
978 l_ip.temp = (unsigned int)round(temperature/10.0)*10;	923 l_ip.temp = (unsigned int)round(temperature/10.0)*10;
979 uca_org_t init_result = init_interface(&l_ip); // init_result is dummy	924 uca_org_t init_result = init_interface(&l_ip, name); // init_result is dummy
980 981 final_dec->leakage_feedback(temperature); 982 pre_dec->leakage_feedback(temperature); 983 984 double pppm_t[4] = {1,1,1,1}; 985 double squencer_passes = x86?2:1; 986 987 set_pppm(pppm_t, squencer_passesnum_decoder_segments, num_decoder_segments, squencer_passesnum_decoder_segments, num_decoder_segments); --- 7 unchanged lines hidden (view full) --- 995 power.readOp.dynamic = sckRation; 996* power.writeOp.dynamic = sckRation; 997* power.searchOp.dynamic = sckRation; 998* 999 double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty); 1000 power.readOp.longer_channel_leakage = power.readOp.leakagelong_channel_device_reduction; 1001} 1002*	925 926 final_dec->leakage_feedback(temperature); 927 pre_dec->leakage_feedback(temperature); 928 929 double pppm_t[4] = {1,1,1,1}; 930 double squencer_passes = x86?2:1; 931 932 set_pppm(pppm_t, squencer_passesnum_decoder_segments, num_decoder_segments, squencer_passesnum_decoder_segments, num_decoder_segments); --- 7 unchanged lines hidden (view full) --- 940 power.readOp.dynamic = sckRation; 941* power.writeOp.dynamic = sckRation; 942* power.searchOp.dynamic = sckRation; 943* 944 double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty); 945 power.readOp.longer_channel_leakage = power.readOp.leakagelong_channel_device_reduction; 946} 947*
1003inst_decoder::~inst_decoder() 1004{ 1005 local_result.cleanup();	948InstructionDecoder::~InstructionDecoder() { 949 local_result.cleanup();
1006	950
1007 delete final_dec;	951 delete final_dec;
1008	952
1009 delete pre_dec->blk1; 1010 delete pre_dec->blk2; 1011 delete pre_dec->drv1; 1012 delete pre_dec->drv2; 1013 delete pre_dec;	953 delete pre_dec->blk1; 954 delete pre_dec->blk2; 955 delete pre_dec->drv1; 956 delete pre_dec->drv2; 957 delete pre_dec;
1014}	958}

1/*****************************************************************************
2 * McPAT
3 * SOFTWARE LICENSE AGREEMENT
4 * Copyright 2012 Hewlett-Packard Development Company, L.P.

5 * All Rights Reserved
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are
9 * met: redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer;
11 * redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the

--- 7 unchanged lines hidden (view full) ---

20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

6 * All Rights Reserved
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions are
10 * met: redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer;
12 * redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the

--- 7 unchanged lines hidden (view full) ---

21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”

29 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

29 *
30 ***************************************************************************/
31

30 *
31 ***************************************************************************/
32

33#include "common.h"

32#include "logic.h"
33

34#include "logic.h"
35

34

35//selection_logic

36//selection_logic

36selection_logic::selection_logic(
37 bool _is_default,
38 int win_entries_,
39 int issue_width_,
40 const InputParameter *configure_interface,
41 enum Device_ty device_ty_,
42 enum Core_type core_ty_)
43 //const ParseXML *_XML_interface)
44 :is_default(_is_default),
45 win_entries(win_entries_),
46 issue_width(issue_width_),
47 device_ty(device_ty_),
48 core_ty(core_ty_)
49 {
50 //uca_org_t result2;
51 l_ip=*configure_interface;
52 local_result = init_interface(&l_ip);
53 //init_tech_params(l_ip.F_sz_um, false);
54 //win_entries=numIBEntries;//IQentries;
55 //issue_width=issueWidth;
56 selection_power();
57 double sckRation = g_tp.sckt_co_eff;
58 power.readOp.dynamic *= sckRation;
59 power.writeOp.dynamic *= sckRation;
60 power.searchOp.dynamic *= sckRation;

37selection_logic::selection_logic(XMLNode* _xml_data, bool _is_default,
38 int _win_entries, int issue_width_,
39 const InputParameter *configure_interface,
40 string _name, double _accesses,
41 double clockRate_, enum Device_ty device_ty_,
42 enum Core_type core_ty_)
43 : McPATComponent(_xml_data), is_default(_is_default),
44 win_entries(_win_entries),
45 issue_width(issue_width_),
46 accesses(_accesses),
47 device_ty(device_ty_),
48 core_ty(core_ty_) {
49 clockRate = clockRate_;
50 name = _name;
51 l_ip = *configure_interface;
52 local_result = init_interface(&l_ip, name);
53}

61

54

62 double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
63 power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction;
64 }

55void selection_logic::computeArea() {
56 output_data.area = local_result.area;
57}

65

58

66void selection_logic::selection_power()
67{//based on cost effective superscalar processor TR pp27-31
68 double Ctotal, Cor, Cpencode;
69 int num_arbiter;
70 double WSelORn, WSelORprequ, WSelPn, WSelPp, WSelEnn, WSelEnp;

59void selection_logic::computeEnergy() {
60 //based on cost effective superscalar processor TR pp27-31
61 double Ctotal, Cor, Cpencode;
62 int num_arbiter;
63 double WSelORn, WSelORprequ, WSelPn, WSelPp, WSelEnn, WSelEnp;

71

64

72 //TODO: the 0.8um process data is used.
73 WSelORn = 12.5 * l_ip.F_sz_um;//this was 10 micron for the 0.8 micron process
74 WSelORprequ = 50 * l_ip.F_sz_um;//this was 40 micron for the 0.8 micron process
75 WSelPn = 12.5 * l_ip.F_sz_um;//this was 10mcron for the 0.8 micron process
76 WSelPp = 18.75 * l_ip.F_sz_um;//this was 15 micron for the 0.8 micron process
77 WSelEnn = 6.25 * l_ip.F_sz_um;//this was 5 micron for the 0.8 micron process
78 WSelEnp = 12.5 * l_ip.F_sz_um;//this was 10 micron for the 0.8 micron process

65 //the 0.8um process data is used.
66 //this was 10 micron for the 0.8 micron process
67 WSelORn = 12.5 * l_ip.F_sz_um;
68 //this was 40 micron for the 0.8 micron process
69 WSelORprequ = 50 * l_ip.F_sz_um;
70 //this was 10mcron for the 0.8 micron process
71 WSelPn = 12.5 * l_ip.F_sz_um;
72 //this was 15 micron for the 0.8 micron process
73 WSelPp = 18.75 * l_ip.F_sz_um;
74 //this was 5 micron for the 0.8 micron process
75 WSelEnn = 6.25 * l_ip.F_sz_um;
76 //this was 10 micron for the 0.8 micron process
77 WSelEnp = 12.5 * l_ip.F_sz_um;

79

78

80
81 Ctotal=0;
82 num_arbiter=1;
83 while(win_entries > 4)
84 {
85 win_entries = (int)ceil((double)win_entries / 4.0);
86 num_arbiter += win_entries;

79 Ctotal = 0;
80 num_arbiter = 1;
81 while (win_entries > 4) {
82 win_entries = (int)ceil((double)win_entries / 4.0);
83 num_arbiter += win_entries;

87 }

84 }

88 //the 4-input OR logic to generate anyreq
89 Cor = 4 * drain_C_(WSelORn,NCH,1,1, g_tp.cell_h_def) + drain_C_(WSelORprequ,PCH,1,1, g_tp.cell_h_def);
90 power.readOp.gate_leakage = cmos_Ig_leakage(WSelORn, WSelORprequ, 4, nor)*g_tp.peri_global.Vdd;

85 //the 4-input OR logic to generate anyreq
86 Cor = 4 * drain_C_(WSelORn, NCH, 1, 1, g_tp.cell_h_def) +
87 drain_C_(WSelORprequ, PCH, 1, 1, g_tp.cell_h_def);
88 power.readOp.gate_leakage =
89 cmos_Ig_leakage(WSelORn, WSelORprequ, 4, nor) * g_tp.peri_global.Vdd;

91

90

92 //The total capacity of the 4-bit priority encoder
93 Cpencode = drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,1, 1, g_tp.cell_h_def) +
94 2*drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,2, 1, g_tp.cell_h_def) +
95 3*drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,3, 1, g_tp.cell_h_def) +
96 4*drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,4, 1, g_tp.cell_h_def) +//precompute priority logic
97 2*4*gate_C(WSelEnn+WSelEnp,20.0)+
98 4*drain_C_(WSelEnn,NCH,1, 1, g_tp.cell_h_def) + 2*4*drain_C_(WSelEnp,PCH,1, 1, g_tp.cell_h_def)+//enable logic
99 (2*4+2*3+2*2+2)*gate_C(WSelPn+WSelPp,10.0);//requests signal

91 //The total capacity of the 4-bit priority encoder
92 Cpencode = drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) +
93 drain_C_(WSelPp, PCH, 1, 1, g_tp.cell_h_def) +
94 2 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) +
95 drain_C_(WSelPp, PCH, 2, 1, g_tp.cell_h_def) +
96 3 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) +
97 drain_C_(WSelPp, PCH, 3, 1, g_tp.cell_h_def) +
98 4 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) +
99 drain_C_(WSelPp, PCH, 4, 1, g_tp.cell_h_def) +//precompute priority logic
100 2 * 4 * gate_C(WSelEnn + WSelEnp, 20.0) +
101 4 * drain_C_(WSelEnn, NCH, 1, 1, g_tp.cell_h_def) +
102 2 * 4 * drain_C_(WSelEnp, PCH, 1, 1, g_tp.cell_h_def) +//enable logic
103 (2 * 4 + 2 * 3 + 2 * 2 + 2) *
104 gate_C(WSelPn + WSelPp, 10.0);//requests signal

100

105

101 Ctotal += issue_width * num_arbiter*(Cor+Cpencode);

106 Ctotal += issue_width * num_arbiter * (Cor + Cpencode);

102

107

103 power.readOp.dynamic = Ctotal*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*2;//2 means the abitration signal need to travel round trip
104 power.readOp.leakage = issue_width * num_arbiter *
105 (cmos_Isub_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p
106 + cmos_Isub_leakage(WSelPn, WSelPp, 3, nor)//grant2p
107 + cmos_Isub_leakage(WSelPn, WSelPp, 4, nor)//grant3p
108 + cmos_Isub_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic
109 + cmos_Isub_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant sIsubnals
110 )*g_tp.peri_global.Vdd;
111 power.readOp.gate_leakage = issue_width * num_arbiter *
112 (cmos_Ig_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p
113 + cmos_Ig_leakage(WSelPn, WSelPp, 3, nor)//grant2p
114 + cmos_Ig_leakage(WSelPn, WSelPp, 4, nor)//grant3p
115 + cmos_Ig_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic
116 + cmos_Ig_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant signals
117 )*g_tp.peri_global.Vdd;
118}

108 //2 means the abitration signal need to travel round trip
109 power.readOp.dynamic =
110 Ctotal * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd * 2;
111 power.readOp.leakage = issue_width * num_arbiter *
112 (cmos_Isub_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p
113 + cmos_Isub_leakage(WSelPn, WSelPp, 3, nor)//grant2p
114 + cmos_Isub_leakage(WSelPn, WSelPp, 4, nor)//grant3p
115 + cmos_Isub_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic
116 + cmos_Isub_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant sIsubnals
117 ) * g_tp.peri_global.Vdd;
118 power.readOp.gate_leakage = issue_width * num_arbiter *
119 (cmos_Ig_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p
120 + cmos_Ig_leakage(WSelPn, WSelPp, 3, nor)//grant2p
121 + cmos_Ig_leakage(WSelPn, WSelPp, 4, nor)//grant3p
122 + cmos_Ig_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic
123 + cmos_Ig_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant signals
124 ) * g_tp.peri_global.Vdd;
125 double sckRation = g_tp.sckt_co_eff;
126 power.readOp.dynamic *= sckRation;
127 power.writeOp.dynamic *= sckRation;
128 power.searchOp.dynamic *= sckRation;

119

129

130 double long_channel_device_reduction =
131 longer_channel_device_reduction(device_ty, core_ty);
132 power.readOp.longer_channel_leakage =
133 power.readOp.leakage * long_channel_device_reduction;

120

134

135 output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
136 output_data.subthreshold_leakage_power = power.readOp.leakage;
137 output_data.gate_leakage_power = power.readOp.gate_leakage;
138 output_data.runtime_dynamic_energy = power.readOp.dynamic * accesses;
139}
140

121dep_resource_conflict_check::dep_resource_conflict_check(

141dep_resource_conflict_check::dep_resource_conflict_check(

122 const InputParameter *configure_interface,
123 const CoreDynParam & dyn_p_,
124 int compare_bits_,
125 bool _is_default)
126 : l_ip(*configure_interface),
127 coredynp(dyn_p_),
128 compare_bits(compare_bits_),
129 is_default(_is_default)
130{
131 Wcompn = 25 * l_ip.F_sz_um;//this was 20.0 micron for the 0.8 micron process
132 Wevalinvp = 25 * l_ip.F_sz_um;//this was 20.0 micron for the 0.8 micron process
133 Wevalinvn = 100 * l_ip.F_sz_um;//this was 80.0 mcron for the 0.8 micron process
134 Wcomppreequ = 50 * l_ip.F_sz_um;//this was 40.0 micron for the 0.8 micron process
135 WNORn = 6.75 * l_ip.F_sz_um;//this was 5.4 micron for the 0.8 micron process
136 WNORp = 38.125 * l_ip.F_sz_um;//this was 30.5 micron for the 0.8 micron process

142 XMLNode* _xml_data, const string _name,
143 const InputParameter *configure_interface,
144 const CoreParameters & dyn_p_, int compare_bits_,
145 double clockRate_, bool _is_default)
146 : McPATComponent(_xml_data), l_ip(*configure_interface),
147 coredynp(dyn_p_), compare_bits(compare_bits_), is_default(_is_default) {

137

148

138 local_result = init_interface(&l_ip);

149 name = _name;
150 clockRate = clockRate_;
151 //this was 20.0 micron for the 0.8 micron process
152 Wcompn = 25 * l_ip.F_sz_um;
153 //this was 20.0 micron for the 0.8 micron process
154 Wevalinvp = 25 * l_ip.F_sz_um;
155 //this was 80.0 micron for the 0.8 micron process
156 Wevalinvn = 100 * l_ip.F_sz_um;
157 //this was 40.0 micron for the 0.8 micron process
158 Wcomppreequ = 50 * l_ip.F_sz_um;
159 //this was 5.4 micron for the 0.8 micron process
160 WNORn = 6.75 * l_ip.F_sz_um;
161 //this was 30.5 micron for the 0.8 micron process
162 WNORp = 38.125 * l_ip.F_sz_um;

139

163

140 if (coredynp.core_ty==Inorder)
141 compare_bits += 16 + 8 + 8;//TODO: opcode bits + log(shared resources) + REG TAG BITS-->opcode comparator
142 else
143 compare_bits += 16 + 8 + 8;

164 // To make CACTI happy.
165 l_ip.cache_sz = MIN_BUFFER_SIZE;
166 local_result = init_interface(&l_ip, name);

144

167

145 conflict_check_power();
146 double sckRation = g_tp.sckt_co_eff;
147 power.readOp.dynamic *= sckRation;
148 power.writeOp.dynamic *= sckRation;
149 power.searchOp.dynamic *= sckRation;

168 if (coredynp.core_ty == Inorder)
169 //TODO: opcode bits + log(shared resources) + REG TAG BITS -->
170 //opcode comparator
171 compare_bits += 16 + 8 + 8;
172 else
173 compare_bits += 16 + 8 + 8;

150

174

175 conflict_check_power();
176 double sckRation = g_tp.sckt_co_eff;
177 power.readOp.dynamic *= sckRation;
178 power.writeOp.dynamic *= sckRation;
179 power.searchOp.dynamic *= sckRation;
180

151}
152

181}
182

153void dep_resource_conflict_check::conflict_check_power()
154{
155 double Ctotal;
156 int num_comparators;
157 num_comparators = 3*((coredynp.decodeW) * (coredynp.decodeW)-coredynp.decodeW);//2(N*N-N) is used for source to dest comparison, (N*N-N) is used for dest to dest comparision.
158 //When decode-width ==1, no dcl logic

183void dep_resource_conflict_check::conflict_check_power() {
184 double Ctotal;
185 int num_comparators;
186 //2(N*N-N) is used for source to dest comparison, (N*N-N) is used for
187 //dest to dest comparison.
188 num_comparators = 3 * ((coredynp.decodeW) * (coredynp.decodeW) -
189 coredynp.decodeW);

159

190

160 Ctotal = num_comparators * compare_cap();
161 //printf("%i,%s\n",XML_interface->sys.core[0].predictor.predictor_entries,XML_interface->sys.core[0].predictor.prediction_scheme);

191 Ctotal = num_comparators * compare_cap();

162

192

163 power.readOp.dynamic=Ctotal*/*CLOCKRATE*/g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/*AF*/;
164 power.readOp.leakage=num_comparators*compare_bits*2*simplified_nmos_leakage(Wcompn, false);

193 power.readOp.dynamic = Ctotal * /*CLOCKRATE*/ g_tp.peri_global.Vdd *
194 g_tp.peri_global.Vdd /*AF*/;
195 power.readOp.leakage = num_comparators * compare_bits * 2 *
196 simplified_nmos_leakage(Wcompn, false);

165

197

166 double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty);
167 power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction;
168 power.readOp.gate_leakage=num_comparators*compare_bits*2*cmos_Ig_leakage(Wcompn, 0, 2, nmos);

198 double long_channel_device_reduction =
199 longer_channel_device_reduction(Core_device, coredynp.core_ty);
200 power.readOp.longer_channel_leakage =
201 power.readOp.leakage * long_channel_device_reduction;
202 power.readOp.gate_leakage = num_comparators * compare_bits * 2 *
203 cmos_Ig_leakage(Wcompn, 0, 2, nmos);

169
170}
171
172/* estimate comparator power consumption (this comparator is similar
173 to the tag-match structure in a CAM) */

204
205}
206
207/* estimate comparator power consumption (this comparator is similar
208 to the tag-match structure in a CAM) */

174double dep_resource_conflict_check::compare_cap()
175{
176 double c1, c2;

209double dep_resource_conflict_check::compare_cap() {
210 double c1, c2;

177

211

178 WNORp = WNORp * compare_bits/2.0;//resize the big NOR gate at the DCL according to fan in.
179 /* bottom part of comparator */
180 c2 = (compare_bits)*(drain_C_(Wcompn,NCH,1,1, g_tp.cell_h_def)+drain_C_(Wcompn,NCH,2,1, g_tp.cell_h_def))+
181 drain_C_(Wevalinvp,PCH,1,1, g_tp.cell_h_def) + drain_C_(Wevalinvn,NCH,1,1, g_tp.cell_h_def);

212 //resize the big NOR gate at the DCL according to fan in.
213 WNORp = WNORp * compare_bits / 2.0;
214 /* bottom part of comparator */
215 c2 = (compare_bits) * (drain_C_(Wcompn, NCH, 1, 1, g_tp.cell_h_def) +
216 drain_C_(Wcompn, NCH, 2, 1, g_tp.cell_h_def)) +
217 drain_C_(Wevalinvp, PCH, 1, 1, g_tp.cell_h_def) +
218 drain_C_(Wevalinvn, NCH, 1, 1, g_tp.cell_h_def);

182

219

183 /* top part of comparator */
184 c1 = (compare_bits)*(drain_C_(Wcompn,NCH,1,1, g_tp.cell_h_def)+drain_C_(Wcompn,NCH,2,1, g_tp.cell_h_def)+
185 drain_C_(Wcomppreequ,NCH,1,1, g_tp.cell_h_def)) + gate_C(WNORn + WNORp,10.0) +
186 drain_C_(WNORp,NCH,2,1, g_tp.cell_h_def) + compare_bits*drain_C_(WNORn,NCH,2,1, g_tp.cell_h_def);
187 return(c1 + c2);

220 /* top part of comparator */
221 c1 = (compare_bits) * (drain_C_(Wcompn, NCH, 1, 1, g_tp.cell_h_def) +
222 drain_C_(Wcompn, NCH, 2, 1, g_tp.cell_h_def) +
223 drain_C_(Wcomppreequ, NCH, 1, 1, g_tp.cell_h_def)) +
224 gate_C(WNORn + WNORp, 10.0) +
225 drain_C_(WNORp, NCH, 2, 1, g_tp.cell_h_def) + compare_bits *
226 drain_C_(WNORn, NCH, 2, 1, g_tp.cell_h_def);
227 return(c1 + c2);

188
189}
190
191void dep_resource_conflict_check::leakage_feedback(double temperature)
192{
193 l_ip.temp = (unsigned int)round(temperature/10.0)*10;

228
229}
230
231void dep_resource_conflict_check::leakage_feedback(double temperature)
232{
233 l_ip.temp = (unsigned int)round(temperature/10.0)*10;

194 uca_org_t init_result = init_interface(&l_ip); // init_result is dummy

234 uca_org_t init_result = init_interface(&l_ip, name); // init_result is dummy

195
196 // This is part of conflict_check_power()

235
236 // This is part of conflict_check_power()

197 int num_comparators = 3*((coredynp.decodeW) * (coredynp.decodeW)-coredynp.decodeW);//2(N*N-N) is used for source to dest comparison, (N*N-N) is used for dest to dest comparision.
198 power.readOp.leakage=num_comparators*compare_bits*2*simplified_nmos_leakage(Wcompn, false);

237 // 2(N*N-N) is used for source to dest comparison, (N*N-N) is used for dest
238 // to dest comparison.
239 int num_comparators = 3 * ((coredynp.decodeW) * (coredynp.decodeW) -
240 coredynp.decodeW);
241 power.readOp.leakage = num_comparators * compare_bits * 2 *
242 simplified_nmos_leakage(Wcompn, false);

199

243

200 double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty);
201 power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction;
202 power.readOp.gate_leakage=num_comparators*compare_bits*2*cmos_Ig_leakage(Wcompn, 0, 2, nmos);

244 double long_channel_device_reduction =
245 longer_channel_device_reduction(Core_device, coredynp.core_ty);
246 power.readOp.longer_channel_leakage = power.readOp.leakage *
247 long_channel_device_reduction;
248 power.readOp.gate_leakage = num_comparators * compare_bits * 2 *
249 cmos_Ig_leakage(Wcompn, 0, 2, nmos);

203}
204

250}
251

205//TODO: add inverter- and transmission-gate-based DFF.

206
207DFFCell::DFFCell(

252
253DFFCell::DFFCell(

208 bool _is_dram,
209 double _WdecNANDn,
210 double _WdecNANDp,
211 double _cell_load,
212 const InputParameter *configure_interface)
213:is_dram(_is_dram),
214cell_load(_cell_load),
215WdecNANDn(_WdecNANDn),
216WdecNANDp(_WdecNANDp)
217{//this model is based on the NAND2 based DFF.
218 l_ip=*configure_interface;
219// area.set_area(730*l_ip.F_sz_um*l_ip.F_sz_um);
220 area.set_area(5*compute_gate_area(NAND, 2,WdecNANDn,WdecNANDp, g_tp.cell_h_def)
221 + compute_gate_area(NAND, 2,WdecNANDn,WdecNANDn, g_tp.cell_h_def));

254 bool _is_dram,
255 double _WdecNANDn,
256 double _WdecNANDp,
257 double _cell_load,
258 const InputParameter *configure_interface)
259 : is_dram(_is_dram),
260 cell_load(_cell_load),
261 WdecNANDn(_WdecNANDn),
262 WdecNANDp(_WdecNANDp) { //this model is based on the NAND2 based DFF.
263 l_ip = *configure_interface;
264 area.set_area(5 * compute_gate_area(NAND, 2,WdecNANDn,WdecNANDp,
265 g_tp.cell_h_def)
266 + compute_gate_area(NAND, 2,WdecNANDn,WdecNANDn,
267 g_tp.cell_h_def));

222
223
224}
225
226

268
269
270}
271
272

227double DFFCell::fpfp_node_cap(unsigned int fan_in, unsigned int fan_out)
228{
229 double Ctotal = 0;
230 //printf("WdecNANDn = %E\n", WdecNANDn);

273double DFFCell::fpfp_node_cap(unsigned int fan_in, unsigned int fan_out) {
274 double Ctotal = 0;

231

275

232 /* part 1: drain cap of NAND gate */
233 Ctotal += drain_C_(WdecNANDn, NCH, 2, 1, g_tp.cell_h_def, is_dram) + fan_in * drain_C_(WdecNANDp, PCH, 1, 1, g_tp.cell_h_def, is_dram);

276 /* part 1: drain cap of NAND gate */
277 Ctotal += drain_C_(WdecNANDn, NCH, 2, 1, g_tp.cell_h_def, is_dram) + fan_in * drain_C_(WdecNANDp, PCH, 1, 1, g_tp.cell_h_def, is_dram);

234

278

235 /* part 2: gate cap of NAND gates */
236 Ctotal += fan_out * gate_C(WdecNANDn + WdecNANDp, 0, is_dram);

279 /* part 2: gate cap of NAND gates */
280 Ctotal += fan_out * gate_C(WdecNANDn + WdecNANDp, 0, is_dram);

237

281

238 return Ctotal;

282 return Ctotal;

239}
240
241

283}
284
285

242void DFFCell::compute_DFF_cell()
243{
244 double c1, c2, c3, c4, c5, c6;
245 /* node 5 and node 6 are identical to node 1 in capacitance */
246 c1 = c5 = c6 = fpfp_node_cap(2, 1);
247 c2 = fpfp_node_cap(2, 3);
248 c3 = fpfp_node_cap(3, 2);
249 c4 = fpfp_node_cap(2, 2);

286void DFFCell::compute_DFF_cell() {
287 double c1, c2, c3, c4, c5, c6;
288 /* node 5 and node 6 are identical to node 1 in capacitance */
289 c1 = c5 = c6 = fpfp_node_cap(2, 1);
290 c2 = fpfp_node_cap(2, 3);
291 c3 = fpfp_node_cap(3, 2);
292 c4 = fpfp_node_cap(2, 2);

250

293

251 //cap-load of the clock signal in each DFF; actually the clock signal is only connected to one NAND2
252 clock_cap= 2 * gate_C(WdecNANDn + WdecNANDp, 0, is_dram);
253 e_switch.readOp.dynamic += (c4 + c1 + c2 + c3 + c5 + c6 + 2*cell_load)*0.5*g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;;

294 //cap-load of the clock signal in each DFF; actually the clock signal is only connected to one NAND2
295 clock_cap = 2 * gate_C(WdecNANDn + WdecNANDp, 0, is_dram);
296 e_switch.readOp.dynamic += (c4 + c1 + c2 + c3 + c5 + c6 + 2 * cell_load) *
297 0.5 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;;

254

298

255 /* no 1/2 for e_keep and e_clock because clock signal switches twice in one cycle */
256 e_keep_1.readOp.dynamic += c3 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ;
257 e_keep_0.readOp.dynamic += c2 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ;
258 e_clock.readOp.dynamic += clock_cap* g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;;

299 /* no 1/2 for e_keep and e_clock because clock signal switches twice in one cycle */
300 e_keep_1.readOp.dynamic +=
301 c3 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ;
302 e_keep_0.readOp.dynamic +=
303 c2 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ;
304 e_clock.readOp.dynamic +=
305 clock_cap * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;;

259

306

260 /* static power */
261 e_switch.readOp.leakage += (cmos_Isub_leakage(WdecNANDn, WdecNANDp, 2, nand)*5//5 NAND2 and 1 NAND3 in a DFF
262 + cmos_Isub_leakage(WdecNANDn, WdecNANDn, 3, nand))*g_tp.peri_global.Vdd;
263 e_switch.readOp.gate_leakage += (cmos_Ig_leakage(WdecNANDn, WdecNANDp, 2, nand)*5//5 NAND2 and 1 NAND3 in a DFF
264 + cmos_Ig_leakage(WdecNANDn, WdecNANDn, 3, nand))*g_tp.peri_global.Vdd;
265 //printf("leakage =%E\n",cmos_Ileak(1, is_dram) );

307 /* static power */
308 e_switch.readOp.leakage +=
309 (cmos_Isub_leakage(WdecNANDn, WdecNANDp, 2, nand) *
310 5//5 NAND2 and 1 NAND3 in a DFF
311 + cmos_Isub_leakage(WdecNANDn, WdecNANDn, 3, nand)) *
312 g_tp.peri_global.Vdd;
313 e_switch.readOp.gate_leakage +=
314 (cmos_Ig_leakage(WdecNANDn, WdecNANDp, 2, nand) *
315 5//5 NAND2 and 1 NAND3 in a DFF
316 + cmos_Ig_leakage(WdecNANDn, WdecNANDn, 3, nand)) *
317 g_tp.peri_global.Vdd;

266}
267

318}
319

268Pipeline::Pipeline(
269 const InputParameter *configure_interface,
270 const CoreDynParam & dyn_p_,
271 enum Device_ty device_ty_,
272 bool _is_core_pipeline,
273 bool _is_default)
274: l_ip(*configure_interface),
275 coredynp(dyn_p_),
276 device_ty(device_ty_),
277 is_core_pipeline(_is_core_pipeline),
278 is_default(_is_default),
279 num_piperegs(0.0)

320Pipeline::Pipeline(XMLNode* _xml_data,
321 const InputParameter *configure_interface,
322 const CoreParameters & dyn_p_,
323 enum Device_ty device_ty_,
324 bool _is_core_pipeline,
325 bool _is_default)
326 : McPATComponent(_xml_data), l_ip(*configure_interface),
327 coredynp(dyn_p_), device_ty(device_ty_),
328 is_core_pipeline(_is_core_pipeline), is_default(_is_default),
329 num_piperegs(0.0) {
330 name = "Pipeline?";

280

331

281 {
282 local_result = init_interface(&l_ip);
283 if (!coredynp.Embedded)
284 process_ind = true;
285 else
286 process_ind = false;
287 WNANDn = (process_ind)? 25 * l_ip.F_sz_um : g_tp.min_w_nmos_ ;//this was 20 micron for the 0.8 micron process
288 WNANDp = (process_ind)? 37.5 * l_ip.F_sz_um : g_tp.min_w_nmos_*pmos_to_nmos_sz_ratio();//this was 30 micron for the 0.8 micron process
289 load_per_pipeline_stage = 2*gate_C(WNANDn + WNANDp, 0, false);
290 compute();

332 local_result = init_interface(&l_ip, name);
333 if (!coredynp.Embedded) {
334 process_ind = true;
335 } else {
336 process_ind = false;
337 }
338 //this was 20 micron for the 0.8 micron process
339 WNANDn = (process_ind) ? 25 * l_ip.F_sz_um : g_tp.min_w_nmos_ ;
340 //this was 30 micron for the 0.8 micron process
341 WNANDp = (process_ind) ? 37.5 * l_ip.F_sz_um : g_tp.min_w_nmos_ *
342 pmos_to_nmos_sz_ratio();
343 load_per_pipeline_stage = 2 * gate_C(WNANDn + WNANDp, 0, false);
344 compute();

291
292}
293

345
346}
347

294void Pipeline::compute()
295{
296 compute_stage_vector();
297 DFFCell pipe_reg(false, WNANDn,WNANDp, load_per_pipeline_stage, &l_ip);
298 pipe_reg.compute_DFF_cell();

348void Pipeline::compute() {
349 compute_stage_vector();
350 DFFCell pipe_reg(false, WNANDn, WNANDp, load_per_pipeline_stage, &l_ip);
351 pipe_reg.compute_DFF_cell();

299

352

300 double clock_power_pipereg = num_piperegs * pipe_reg.e_clock.readOp.dynamic;
301 //******************pipeline power: currently, we average all the possibilities of the states of DFFs in the pipeline. A better way to do it is to consider
302 //the Hamming distance of two consecutive signals. However, McPAT does not plan to do this in the near future, as it focuses on worst-case power.
303 double pipe_reg_power = num_piperegs * (pipe_reg.e_switch.readOp.dynamic+pipe_reg.e_keep_0.readOp.dynamic+pipe_reg.e_keep_1.readOp.dynamic)/3+clock_power_pipereg;
304 double pipe_reg_leakage = num_piperegs * pipe_reg.e_switch.readOp.leakage;
305 double pipe_reg_gate_leakage = num_piperegs * pipe_reg.e_switch.readOp.gate_leakage;
306 power.readOp.dynamic +=pipe_reg_power;
307 power.readOp.leakage +=pipe_reg_leakage;
308 power.readOp.gate_leakage +=pipe_reg_gate_leakage;
309 area.set_area(num_piperegs * pipe_reg.area.get_area());

353 double clock_power_pipereg = num_piperegs * pipe_reg.e_clock.readOp.dynamic;
354 //******************pipeline power: currently, we average all the possibilities of the states of DFFs in the pipeline. A better way to do it is to consider
355 //the Hamming distance of two consecutive signals. However, McPAT does not plan to do this in the near future, as it focuses on worst-case power.
356 double pipe_reg_power = num_piperegs *
357 (pipe_reg.e_switch.readOp.dynamic + pipe_reg.e_keep_0.readOp.dynamic +
358 pipe_reg.e_keep_1.readOp.dynamic) / 3 + clock_power_pipereg;
359 double pipe_reg_leakage = num_piperegs * pipe_reg.e_switch.readOp.leakage;
360 double pipe_reg_gate_leakage = num_piperegs *
361 pipe_reg.e_switch.readOp.gate_leakage;
362 power.readOp.dynamic += pipe_reg_power;
363 power.readOp.leakage += pipe_reg_leakage;
364 power.readOp.gate_leakage += pipe_reg_gate_leakage;
365 area.set_area(num_piperegs * pipe_reg.area.get_area());

310

366

311 double long_channel_device_reduction = longer_channel_device_reduction(device_ty, coredynp.core_ty);
312 power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction;

367 double long_channel_device_reduction =
368 longer_channel_device_reduction(device_ty, coredynp.core_ty);
369 power.readOp.longer_channel_leakage = power.readOp.leakage *
370 long_channel_device_reduction;

313
314

371
372

315 double sckRation = g_tp.sckt_co_eff;
316 power.readOp.dynamic *= sckRation;
317 power.writeOp.dynamic *= sckRation;
318 power.searchOp.dynamic *= sckRation;
319 double macro_layout_overhead = g_tp.macro_layout_overhead;

373 double sckRation = g_tp.sckt_co_eff;
374 power.readOp.dynamic *= sckRation;
375 power.writeOp.dynamic *= sckRation;
376 power.searchOp.dynamic *= sckRation;
377 double macro_layout_overhead = g_tp.macro_layout_overhead;

320 if (!coredynp.Embedded)

378 if (!coredynp.Embedded)

321 area.set_area(area.get_area()*macro_layout_overhead);

379 area.set_area(area.get_area() * macro_layout_overhead);
380
381 output_data.area = area.get_area() / 1e6;
382 output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
383 output_data.subthreshold_leakage_power = power.readOp.leakage;
384 output_data.gate_leakage_power = power.readOp.gate_leakage;
385 output_data.runtime_dynamic_energy = power.readOp.dynamic * total_cycles;

322}
323

386}
387

324void Pipeline::compute_stage_vector()
325{
326 double num_stages, tot_stage_vector, per_stage_vector;
327 int opcode_length = coredynp.x86? coredynp.micro_opcode_length:coredynp.opcode_length;
328 //Hthread = thread_clock_gated? 1:num_thread;

388void Pipeline::compute_stage_vector() {
389 double num_stages, tot_stage_vector, per_stage_vector;
390 int opcode_length = coredynp.x86 ?
391 coredynp.micro_opcode_length : coredynp.opcode_width;

329

392

330 if (!is_core_pipeline)
331 {
332 num_piperegs=l_ip.pipeline_stages*l_ip.per_stage_vector;//The number of pipeline stages are calculated based on the achievable throughput and required throughput
333 }
334 else
335 {
336 if (coredynp.core_ty==Inorder)
337 {
338 /* assume 6 pipe stages and try to estimate bits per pipe stage */
339 /* pipe stage 0/IF */
340 num_piperegs += coredynp.pc_width*2*coredynp.num_hthreads;
341 /* pipe stage IF/ID */
342 num_piperegs += coredynp.fetchW*(coredynp.instruction_length + coredynp.pc_width)*coredynp.num_hthreads;
343 /* pipe stage IF/ThreadSEL */
344 if (coredynp.multithreaded) num_piperegs += coredynp.num_hthreads*coredynp.perThreadState; //8 bit thread states
345 /* pipe stage ID/EXE */
346 num_piperegs += coredynp.decodeW*(coredynp.instruction_length + coredynp.pc_width + pow(2.0,opcode_length)+ 2*coredynp.int_data_width)*coredynp.num_hthreads;
347 /* pipe stage EXE/MEM */
348 num_piperegs += coredynp.issueW*(3 * coredynp.arch_ireg_width + pow(2.0,opcode_length) + 8*2*coredynp.int_data_width/*+2*powers (2,reg_length)*/);
349 /* pipe stage MEM/WB the 2^opcode_length means the total decoded signal for the opcode*/
350 num_piperegs += coredynp.issueW*(2*coredynp.int_data_width + pow(2.0,opcode_length) + 8*2*coredynp.int_data_width/*+2*powers (2,reg_length)*/);
351// /* pipe stage 5/6 */
352// num_piperegs += issueWidth*(data_width + powers (2,opcode_length)/*+2*powers (2,reg_length)*/);
353// /* pipe stage 6/7 */
354// num_piperegs += issueWidth*(data_width + powers (2,opcode_length)/*+2*powers (2,reg_length)*/);
355// /* pipe stage 7/8 */
356// num_piperegs += issueWidth*(data_width + powers (2,opcode_length)/**2*powers (2,reg_length)*/);
357// /* assume 50% extra in control signals (rule of thumb) */
358 num_stages=6;

393 if (!is_core_pipeline) {
394 //The number of pipeline stages is calculated based on the achievable
395 //throughput and required throughput
396 num_piperegs = l_ip.pipeline_stages * l_ip.per_stage_vector;
397 } else {
398 if (coredynp.core_ty == Inorder) {
399 /* assume 6 pipe stages and try to estimate bits per pipe stage */
400 /* pipe stage 0/IF */
401 num_piperegs += coredynp.pc_width * 2 * coredynp.num_hthreads;
402 /* pipe stage IF/ID */
403 num_piperegs += coredynp.fetchW *
404 (coredynp.instruction_length + coredynp.pc_width) *
405 coredynp.num_hthreads;
406 /* pipe stage IF/ThreadSEL */
407 if (coredynp.multithreaded) {
408 num_piperegs += coredynp.num_hthreads *
409 coredynp.perThreadState; //8 bit thread states
410 }
411 /* pipe stage ID/EXE */
412 num_piperegs += coredynp.decodeW *
413 (coredynp.instruction_length + coredynp.pc_width +
414 pow(2.0, opcode_length) + 2 * coredynp.int_data_width) *
415 coredynp.num_hthreads;
416 /* pipe stage EXE/MEM */
417 num_piperegs += coredynp.issueW *
418 (3 * coredynp.arch_ireg_width + pow(2.0, opcode_length) + 8 *
419 2 * coredynp.int_data_width/*+2*powers (2,reg_length)*/);
420 /* pipe stage MEM/WB the 2^opcode_length means the total decoded signal for the opcode*/
421 num_piperegs += coredynp.issueW *
422 (2 * coredynp.int_data_width + pow(2.0, opcode_length) + 8 *
423 2 * coredynp.int_data_width/*+2*powers (2,reg_length)*/);
424 num_stages = 6;
425 } else {
426 /* assume 12 stage pipe stages and try to estimate bits per pipe stage */
427 /*OOO: Fetch, decode, rename, IssueQ, dispatch, regread, EXE, MEM, WB, CM */

359

428

360 }
361 else
362 {
363 /* assume 12 stage pipe stages and try to estimate bits per pipe stage */
364 /*OOO: Fetch, decode, rename, IssueQ, dispatch, regread, EXE, MEM, WB, CM */

429 /* pipe stage 0/IF*/
430 num_piperegs +=
431 coredynp.pc_width * 2 * coredynp.num_hthreads ;//PC and Next PC
432 /* pipe stage IF/ID */
433 num_piperegs += coredynp.fetchW *
434 (coredynp.instruction_length + coredynp.pc_width) *
435 coredynp.num_hthreads;//PC is used to feed branch predictor in ID
436 /* pipe stage ID/Renaming*/
437 num_piperegs += coredynp.decodeW *
438 (coredynp.instruction_length + coredynp.pc_width) *
439 coredynp.num_hthreads;//PC is for branch exe in later stage.
440 /* pipe stage Renaming/wire_drive */
441 num_piperegs += coredynp.decodeW *
442 (coredynp.instruction_length + coredynp.pc_width);
443 /* pipe stage Renaming/IssueQ */
444 //3*coredynp.phy_ireg_width means 2 sources and 1 dest
445 num_piperegs += coredynp.issueW *
446 (coredynp.instruction_length + coredynp.pc_width + 3 *
447 coredynp.phy_ireg_width) * coredynp.num_hthreads;
448 /* pipe stage IssueQ/Dispatch */
449 num_piperegs += coredynp.issueW *
450 (coredynp.instruction_length + 3 * coredynp.phy_ireg_width);
451 /* pipe stage Dispatch/EXE */

365

452

366 /* pipe stage 0/IF*/
367 num_piperegs += coredynp.pc_width*2*coredynp.num_hthreads ;//PC and Next PC
368 /* pipe stage IF/ID */
369 num_piperegs += coredynp.fetchW*(coredynp.instruction_length + coredynp.pc_width)*coredynp.num_hthreads;//PC is used to feed branch predictor in ID
370 /* pipe stage ID/Renaming*/
371 num_piperegs += coredynp.decodeW*(coredynp.instruction_length + coredynp.pc_width)*coredynp.num_hthreads;//PC is for branch exe in later stage.
372 /* pipe stage Renaming/wire_drive */
373 num_piperegs += coredynp.decodeW*(coredynp.instruction_length + coredynp.pc_width);
374 /* pipe stage Renaming/IssueQ */
375 num_piperegs += coredynp.issueW*(coredynp.instruction_length + coredynp.pc_width + 3*coredynp.phy_ireg_width)*coredynp.num_hthreads;//3*coredynp.phy_ireg_width means 2 sources and 1 dest
376 /* pipe stage IssueQ/Dispatch */
377 num_piperegs += coredynp.issueW*(coredynp.instruction_length + 3 * coredynp.phy_ireg_width);
378 /* pipe stage Dispatch/EXE */

453 num_piperegs += coredynp.issueW *
454 (3 * coredynp.phy_ireg_width + coredynp.pc_width +
455 pow(2.0, opcode_length)/*+2*powers (2,reg_length)*/);
456 /* 2^opcode_length means the total decoded signal for the opcode*/
457 num_piperegs += coredynp.issueW *
458 (2 * coredynp.int_data_width + pow(2.0, opcode_length)
459 /*+2*powers (2,reg_length)*/);
460 /*2 source operands in EXE; Assume 2EXE stages* since we do not really distinguish OP*/
461 num_piperegs += coredynp.issueW *
462 (2 * coredynp.int_data_width + pow(2.0, opcode_length)
463 /*+2*powers (2,reg_length)*/);
464 /* pipe stage EXE/MEM, data need to be read/write, address*/
465 //memory Opcode still need to be passed
466 num_piperegs += coredynp.issueW *
467 (coredynp.int_data_width + coredynp.v_address_width +
468 pow(2.0, opcode_length)/*+2*powers (2,reg_length)*/);
469 /* pipe stage MEM/WB; result data, writeback regs */
470 num_piperegs += coredynp.issueW *
471 (coredynp.int_data_width + coredynp.phy_ireg_width
472 /* powers (2,opcode_length) +
473 (2,opcode_length)+2*powers (2,reg_length)*/);
474 /* pipe stage WB/CM ; result data, regs need to be updated, address for resolve memory ops in ROB's top*/
475 num_piperegs += coredynp.commitW *
476 (coredynp.int_data_width + coredynp.v_address_width +
477 coredynp.phy_ireg_width
478 /*+ powers (2,opcode_length)*2*powers (2,reg_length)*/) *
479 coredynp.num_hthreads;
480 num_stages = 12;

379

481

380 num_piperegs += coredynp.issueW*(3 * coredynp.phy_ireg_width + coredynp.pc_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/);
381 /* 2^opcode_length means the total decoded signal for the opcode*/
382 num_piperegs += coredynp.issueW*(2*coredynp.int_data_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/);
383 /*2 source operands in EXE; Assume 2EXE stages* since we do not really distinguish OP*/
384 num_piperegs += coredynp.issueW*(2*coredynp.int_data_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/);
385 /* pipe stage EXE/MEM, data need to be read/write, address*/
386 num_piperegs += coredynp.issueW*(coredynp.int_data_width + coredynp.v_address_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/);//memory Opcode still need to be passed
387 /* pipe stage MEM/WB; result data, writeback regs */
388 num_piperegs += coredynp.issueW*(coredynp.int_data_width + coredynp.phy_ireg_width /* powers (2,opcode_length) + (2,opcode_length)+2*powers (2,reg_length)*/);
389 /* pipe stage WB/CM ; result data, regs need to be updated, address for resolve memory ops in ROB's top*/
390 num_piperegs += coredynp.commitW*(coredynp.int_data_width + coredynp.v_address_width + coredynp.phy_ireg_width/*+ powers (2,opcode_length)*2*powers (2,reg_length)*/)*coredynp.num_hthreads;
391// if (multithreaded)
392// {
393//
394// }
395 num_stages=12;
396

397 }
398
399 /* assume 50% extra in control registers and interrupt registers (rule of thumb) */
400 num_piperegs = num_piperegs * 1.5;

482 }
483
484 /* assume 50% extra in control registers and interrupt registers (rule of thumb) */
485 num_piperegs = num_piperegs * 1.5;

401 tot_stage_vector=num_piperegs;
402 per_stage_vector=tot_stage_vector/num_stages;

486 tot_stage_vector = num_piperegs;
487 per_stage_vector = tot_stage_vector / num_stages;

403

488

404 if (coredynp.core_ty==Inorder)
405 {
406 if (coredynp.pipeline_stages>6)
407 num_piperegs= per_stage_vector*coredynp.pipeline_stages;

489 if (coredynp.core_ty == Inorder) {
490 if (coredynp.pipeline_stages > 6)
491 num_piperegs = per_stage_vector * coredynp.pipeline_stages;
492 } else { //OOO
493 if (coredynp.pipeline_stages > 12)
494 num_piperegs = per_stage_vector * coredynp.pipeline_stages;

408 }

495 }

409 else//OOO
410 {
411 if (coredynp.pipeline_stages>12)
412 num_piperegs= per_stage_vector*coredynp.pipeline_stages;
413 }
414 }

496 }

415
416}
417

497
498}
499

418FunctionalUnit::FunctionalUnit(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, enum FU_type fu_type_)
419:XML(XML_interface),
420 ithCore(ithCore_),
421 interface_ip(*interface_ip_),
422 coredynp(dyn_p_),
423 fu_type(fu_type_)
424{
425 double area_t;//, leakage, gate_leakage;

500FunctionalUnit::FunctionalUnit(XMLNode* _xml_data,
501 InputParameter* interface_ip_,
502 const CoreParameters & _core_params,
503 const CoreStatistics & _core_stats,
504 enum FU_type fu_type_)
505 : McPATComponent(_xml_data),
506 interface_ip(*interface_ip_), core_params(_core_params),
507 core_stats(_core_stats), fu_type(fu_type_) {
508 double area_t;
509 double leakage;
510 double gate_leakage;

426 double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();

511 double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();

427 clockRate = coredynp.clockRate;
428 executionTime = coredynp.executionTime;

512 clockRate = core_params.clockRate;

429

513

430 //XML_interface=_XML_interface;
431 uca_org_t result2;
432 result2 = init_interface(&interface_ip);
433 if (XML->sys.Embedded)
434 {
435 if (fu_type == FPU)
436 {
437 num_fu=coredynp.num_fpus;

514 uca_org_t result2;
515 // Temp name for the following function call
516 name = "Functional Unit";
517
518 result2 = init_interface(&interface_ip, name);
519
520 if (core_params.Embedded) {
521 if (fu_type == FPU) {
522 num_fu=core_params.num_fpus;

438 //area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
439 area_t = 4.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2 The base number
440 //4.47 contains both VFP and NEON processing unit, VFP is about 40% and NEON is about 60%
441 if (g_ip->F_sz_nm>90)
442 area_t = 4.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
443 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
444 gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
445 //energy = 0.3529/10*1e-9;//this is the energy(nJ) for a FP instruction in FPU usually it can have up to 20 cycles.
446// base_energy = coredynp.core_ty==Inorder? 0: 89e-3*3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
447// base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
448 base_energy = 0;
449 per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per Hz energy(nJ)
450 //FPU power from Sandia's processor sizing tech report
451 FU_height=(18667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data

523 //area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
524 area_t = 4.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2 The base number
525 //4.47 contains both VFP and NEON processing unit, VFP is about 40% and NEON is about 60%
526 if (g_ip->F_sz_nm>90)
527 area_t = 4.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
528 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
529 gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
530 //energy = 0.3529/10*1e-9;//this is the energy(nJ) for a FP instruction in FPU usually it can have up to 20 cycles.
531// base_energy = coredynp.core_ty==Inorder? 0: 89e-3*3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
532// base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
533 base_energy = 0;
534 per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per Hz energy(nJ)
535 //FPU power from Sandia's processor sizing tech report
536 FU_height=(18667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data

452 }
453 else if (fu_type == ALU)
454 {
455 num_fu=coredynp.num_alus;

537 } else if (fu_type == ALU) {
538 num_fu=core_params.num_alus;

456 area_t = 280*260*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
457 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
458 gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
459// base_energy = coredynp.core_ty==Inorder? 0:89e-3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
460// base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
461 base_energy = 0;
462 per_access_energy = 1.15/3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ)
463 FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU
464

539 area_t = 280*260*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
540 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
541 gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
542// base_energy = coredynp.core_ty==Inorder? 0:89e-3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
543// base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
544 base_energy = 0;
545 per_access_energy = 1.15/3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ)
546 FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU
547

465 }
466 else if (fu_type == MUL)
467 {
468 num_fu=coredynp.num_muls;

548 } else if (fu_type == MUL) {
549 num_fu=core_params.num_muls;

469 area_t = 280*260*3*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
470 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
471 gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
472// base_energy = coredynp.core_ty==Inorder? 0:89e-3*2; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
473// base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
474 base_energy = 0;
475 per_access_energy = 1.15*2/3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ), coefficient based on Wattch
476 FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data

550 area_t = 280*260*3*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
551 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
552 gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
553// base_energy = coredynp.core_ty==Inorder? 0:89e-3*2; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
554// base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
555 base_energy = 0;
556 per_access_energy = 1.15*2/3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ), coefficient based on Wattch
557 FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data

477 }
478 else
479 {

558 } else {

480 cout<<"Unknown Functional Unit Type"<<endl;
481 exit(0);
482 }
483 per_access_energy *=0.5;//According to ARM data embedded processor has much lower per acc energy

559 cout<<"Unknown Functional Unit Type"<<endl;
560 exit(0);
561 }
562 per_access_energy *=0.5;//According to ARM data embedded processor has much lower per acc energy

563 } else {
564 if (fu_type == FPU) {
565 name = "Floating Point Unit(s)";
566 num_fu = core_params.num_fpus;
567 area_t = 8.47 * 1e6 * (g_ip->F_sz_nm * g_ip->F_sz_nm / 90.0 /
568 90.0);//this is um^2
569 if (g_ip->F_sz_nm > 90)
570 area_t = 8.47 * 1e6 *
571 g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
572 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
573 gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
574 //W The base energy of ALU average numbers from Intel 4G and
575 //773Mhz (Wattch)
576 base_energy = core_params.core_ty == Inorder ? 0 : 89e-3 * 3;
577 base_energy *= (g_tp.peri_global.Vdd * g_tp.peri_global.Vdd / 1.2 /
578 1.2);
579 per_access_energy = 1.15*3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per op energy(nJ)
580 FU_height=(38667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data
581 } else if (fu_type == ALU) {
582 name = "Integer ALU(s)";
583 num_fu = core_params.num_alus;
584 //this is um^2 ALU + MUl
585 area_t = 280 * 260 * 2 * g_tp.scaling_factor.logic_scaling_co_eff;
586 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
587 gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
588 //W The base energy of ALU average numbers from Intel 4G and 773Mhz
589 //(Wattch)
590 base_energy = core_params.core_ty == Inorder ? 0 : 89e-3;
591 base_energy *= (g_tp.peri_global.Vdd * g_tp.peri_global.Vdd / 1.2 /
592 1.2);
593 per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ)
594 FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU
595 } else if (fu_type == MUL) {
596 name = "Multiply/Divide Unit(s)";
597 num_fu = core_params.num_muls;
598 //this is um^2 ALU + MUl
599 area_t = 280 * 260 * 2 * 3 *
600 g_tp.scaling_factor.logic_scaling_co_eff;
601 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
602 gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
603 //W The base energy of ALU average numbers from Intel 4G and 773Mhz
604 //(Wattch)
605 base_energy = core_params.core_ty == Inorder ? 0 : 89e-3 * 2;
606 base_energy *= (g_tp.peri_global.Vdd * g_tp.peri_global.Vdd / 1.2 /
607 1.2);
608 per_access_energy = 1.15*2/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ), coefficient based on Wattch
609 FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data
610 } else {
611 cout << "Unknown Functional Unit Type" << endl;
612 exit(0);

484 }

613 }

485 else
486 {
487 if (fu_type == FPU)
488 {
489 num_fu=coredynp.num_fpus;
490 //area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
491 area_t = 8.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2
492 if (g_ip->F_sz_nm>90)
493 area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
494 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
495 gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
496 //energy = 0.3529/10*1e-9;//this is the energy(nJ) for a FP instruction in FPU usually it can have up to 20 cycles.
497 base_energy = coredynp.core_ty==Inorder? 0: 89e-3*3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
498 base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
499 per_access_energy = 1.15*3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per op energy(nJ)
500 FU_height=(38667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data
501 }
502 else if (fu_type == ALU)
503 {
504 num_fu=coredynp.num_alus;
505 area_t = 280*260*2*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
506 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
507 gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
508 base_energy = coredynp.core_ty==Inorder? 0:89e-3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
509 base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
510 per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ)
511 FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU

614 }

512

615

513 }
514 else if (fu_type == MUL)
515 {
516 num_fu=coredynp.num_muls;
517 area_t = 280*260*2*3*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
518 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
519 gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
520 base_energy = coredynp.core_ty==Inorder? 0:89e-3*2; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
521 base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
522 per_access_energy = 1.15*2/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ), coefficient based on Wattch
523 FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data
524 }
525 else
526 {
527 cout<<"Unknown Functional Unit Type"<<endl;
528 exit(0);
529 }
530 }
531 //IEXEU, simple ALU and FPU
532 // double C_ALU, C_EXEU, C_FPU; //Lum Equivalent capacitance of IEXEU and FPU. Based on Intel and Sun 90nm process fabrication.
533 //
534 // C_ALU = 0.025e-9;//F
535 // C_EXEU = 0.05e-9; //F
536 // C_FPU = 0.35e-9;//F

537 area.set_area(area_t*num_fu);

616 area.set_area(area_t*num_fu);

538 leakage *= num_fu;
539 gate_leakage *=num_fu;
540 double macro_layout_overhead = g_tp.macro_layout_overhead;
541// if (!XML->sys.Embedded)
542 area.set_area(area.get_area()*macro_layout_overhead);

617 power.readOp.leakage = leakage * num_fu;
618 power.readOp.gate_leakage = gate_leakage * num_fu;
619
620 double long_channel_device_reduction =
621 longer_channel_device_reduction(Core_device, core_params.core_ty);
622 power.readOp.longer_channel_leakage =
623 power.readOp.leakage * long_channel_device_reduction;
624 double macro_layout_overhead = g_tp.macro_layout_overhead;
625 area.set_area(area.get_area()*macro_layout_overhead);

543}
544

626}
627

545void FunctionalUnit::computeEnergy(bool is_tdp)
546{
547 double pppm_t[4] = {1,1,1,1};
548 double FU_duty_cycle;
549 if (is_tdp)
550 {

628void FunctionalUnit::computeEnergy() {
629 double pppm_t[4] = {1, 1, 1, 1};
630 double FU_duty_cycle;
631 double sckRation = g_tp.sckt_co_eff;

551

632

633 // TDP power calculation
634 //2 means two source operands need to be passed for each int instruction.
635 set_pppm(pppm_t, 2, 2, 2, 2);
636 tdp_stats.readAc.access = num_fu;
637 if (fu_type == FPU) {
638 FU_duty_cycle = core_stats.FPU_duty_cycle;
639 } else if (fu_type == ALU) {
640 FU_duty_cycle = core_stats.ALU_duty_cycle;
641 } else if (fu_type == MUL) {
642 FU_duty_cycle = core_stats.MUL_duty_cycle;
643 }

552

644

553 set_pppm(pppm_t, 2, 2, 2, 2);//2 means two source operands needs to be passed for each int instruction.
554 if (fu_type == FPU)
555 {
556 stats_t.readAc.access = num_fu;
557 tdp_stats = stats_t;
558 FU_duty_cycle = coredynp.FPU_duty_cycle;
559 }
560 else if (fu_type == ALU)
561 {
562 stats_t.readAc.access = 1*num_fu;
563 tdp_stats = stats_t;
564 FU_duty_cycle = coredynp.ALU_duty_cycle;
565 }
566 else if (fu_type == MUL)
567 {
568 stats_t.readAc.access = num_fu;
569 tdp_stats = stats_t;
570 FU_duty_cycle = coredynp.MUL_duty_cycle;
571 }

645 power.readOp.dynamic =
646 per_access_energy * tdp_stats.readAc.access + base_energy / clockRate;
647 power.readOp.dynamic *= sckRation * FU_duty_cycle;

572

648

573 //power.readOp.dynamic = base_energy/clockRate + energy*stats_t.readAc.access;
574 power.readOp.dynamic = per_access_energy*stats_t.readAc.access + base_energy/clockRate;
575 double sckRation = g_tp.sckt_co_eff;
576 power.readOp.dynamic *= sckRation*FU_duty_cycle;
577 power.writeOp.dynamic *= sckRation;
578 power.searchOp.dynamic *= sckRation;

649 // Runtime power calculation
650 if (fu_type == FPU) {
651 rtp_stats.readAc.access = core_stats.fpu_accesses;
652 } else if (fu_type == ALU) {
653 rtp_stats.readAc.access = core_stats.ialu_accesses;
654 } else if (fu_type == MUL) {
655 rtp_stats.readAc.access = core_stats.mul_accesses;
656 }

579

657

580 power.readOp.leakage = leakage;
581 power.readOp.gate_leakage = gate_leakage;
582 double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty);
583 power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction;

658 rt_power.readOp.dynamic = per_access_energy * rtp_stats.readAc.access +
659 base_energy * execution_time;
660 rt_power.readOp.dynamic *= sckRation;

584

661

585 }
586 else
587 {
588 if (fu_type == FPU)
589 {
590 stats_t.readAc.access = XML->sys.core[ithCore].fpu_accesses;
591 rtp_stats = stats_t;
592 }
593 else if (fu_type == ALU)
594 {
595 stats_t.readAc.access = XML->sys.core[ithCore].ialu_accesses;
596 rtp_stats = stats_t;
597 }
598 else if (fu_type == MUL)
599 {
600 stats_t.readAc.access = XML->sys.core[ithCore].mul_accesses;
601 rtp_stats = stats_t;
602 }
603
604 //rt_power.readOp.dynamic = base_energy*executionTime + energy*stats_t.readAc.access;
605 rt_power.readOp.dynamic = per_access_energy*stats_t.readAc.access + base_energy*executionTime;
606 double sckRation = g_tp.sckt_co_eff;
607 rt_power.readOp.dynamic *= sckRation;
608 rt_power.writeOp.dynamic *= sckRation;
609 rt_power.searchOp.dynamic *= sckRation;
610
611 }
612
613

662 output_data.area = area.get_area() / 1e6;
663 output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
664 output_data.subthreshold_leakage_power =
665 (longer_channel_device) ? power.readOp.longer_channel_leakage :
666 power.readOp.leakage;
667 output_data.gate_leakage_power = power.readOp.gate_leakage;
668 output_data.runtime_dynamic_energy = rt_power.readOp.dynamic;

614}
615

669}
670

616void FunctionalUnit::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
617{
618 string indent_str(indent, ' ');
619 string indent_str_next(indent+2, ' ');
620 bool long_channel = XML->sys.longer_channel_device;
621
622// cout << indent_str_next << "Results Broadcast Bus Area = " << bypass->area.get_area() *1e-6 << " mm^2" << endl;
623 if (is_tdp)
624 {
625 if (fu_type == FPU)
626 {
627 cout << indent_str << "Floating Point Units (FPUs) (Count: "<< coredynp.num_fpus <<" ):" << endl;
628 cout << indent_str_next << "Area = " << area.get_area()*1e-6 << " mm^2" << endl;
629 cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate << " W" << endl;
630// cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage << " W" << endl;
631 cout << indent_str_next<< "Subthreshold Leakage = "
632 << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
633 cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
634 cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl;
635 cout <<endl;
636 }
637 else if (fu_type == ALU)
638 {
639 cout << indent_str << "Integer ALUs (Count: "<< coredynp.num_alus <<" ):" << endl;
640 cout << indent_str_next << "Area = " << area.get_area()*1e-6 << " mm^2" << endl;
641 cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate << " W" << endl;
642// cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage << " W" << endl;
643 cout << indent_str_next<< "Subthreshold Leakage = "
644 << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
645 cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
646 cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl;
647 cout <<endl;
648 }
649 else if (fu_type == MUL)
650 {
651 cout << indent_str << "Complex ALUs (Mul/Div) (Count: "<< coredynp.num_muls <<" ):" << endl;
652 cout << indent_str_next << "Area = " << area.get_area()*1e-6 << " mm^2" << endl;
653 cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate << " W" << endl;
654// cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage << " W" << endl;
655 cout << indent_str_next<< "Subthreshold Leakage = "
656 << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
657 cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
658 cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl;
659 cout <<endl;
660
661 }
662
663 }
664 else
665 {
666 }
667
668}
669

670void FunctionalUnit::leakage_feedback(double temperature)
671{
672 // Update the temperature and initialize the global interfaces.
673 interface_ip.temp = (unsigned int)round(temperature/10.0)*10;
674

671void FunctionalUnit::leakage_feedback(double temperature)
672{
673 // Update the temperature and initialize the global interfaces.
674 interface_ip.temp = (unsigned int)round(temperature/10.0)*10;
675

675 uca_org_t init_result = init_interface(&interface_ip); // init_result is dummy

676 // init_result is dummy
677 uca_org_t init_result = init_interface(&interface_ip, name);

676
677 // This is part of FunctionalUnit()
678 double area_t, leakage, gate_leakage;
679 double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
680
681 if (fu_type == FPU)
682 {
683 area_t = 4.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2 The base number

--- 17 unchanged lines hidden (view full) ---

701 else
702 {
703 cout<<"Unknown Functional Unit Type"<<endl;
704 exit(1);
705 }
706
707 power.readOp.leakage = leakage*num_fu;
708 power.readOp.gate_leakage = gate_leakage*num_fu;

678
679 // This is part of FunctionalUnit()
680 double area_t, leakage, gate_leakage;
681 double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
682
683 if (fu_type == FPU)
684 {
685 area_t = 4.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2 The base number

--- 17 unchanged lines hidden (view full) ---

703 else
704 {
705 cout<<"Unknown Functional Unit Type"<<endl;
706 exit(1);
707 }
708
709 power.readOp.leakage = leakage*num_fu;
710 power.readOp.gate_leakage = gate_leakage*num_fu;

709 power.readOp.longer_channel_leakage = longer_channel_device_reduction(Core_device, coredynp.core_ty);

711 power.readOp.longer_channel_leakage =
712 longer_channel_device_reduction(Core_device, core_params.core_ty);

710}
711

713}
714

712UndiffCore::UndiffCore(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam & dyn_p_, bool exist_, bool embedded_)
713:XML(XML_interface),
714 ithCore(ithCore_),
715 interface_ip(*interface_ip_),
716 coredynp(dyn_p_),
717 core_ty(coredynp.core_ty),
718 embedded(XML->sys.Embedded),
719 pipeline_stage(coredynp.pipeline_stages),
720 num_hthreads(coredynp.num_hthreads),
721 issue_width(coredynp.issueW),
722 exist(exist_)
723// is_default(_is_default)
724{
725 if (!exist) return;
726 double undifferentiated_core=0;
727 double core_tx_density=0;
728 double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();

715UndiffCore::UndiffCore(XMLNode* _xml_data, InputParameter* interface_ip_,
716 const CoreParameters & dyn_p_,
717 bool exist_)
718 : McPATComponent(_xml_data),
719 interface_ip(*interface_ip_), coredynp(dyn_p_),
720 core_ty(coredynp.core_ty), embedded(coredynp.Embedded),
721 pipeline_stage(coredynp.pipeline_stages),
722 num_hthreads(coredynp.num_hthreads), issue_width(coredynp.issueW),
723 exist(exist_) {
724 if (!exist) return;
725
726 name = "Undifferentiated Core";
727 clockRate = coredynp.clockRate;
728
729 double undifferentiated_core = 0;
730 double core_tx_density = 0;
731 double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();

729 double undifferentiated_core_coe;

732 double undifferentiated_core_coe;

730 //XML_interface=_XML_interface;
731 uca_org_t result2;
732 result2 = init_interface(&interface_ip);

733 uca_org_t result2;
734 result2 = init_interface(&interface_ip, name);

733

735

734 //Compute undifferentiated core area at 90nm.
735 if (embedded==false)
736 {
737 //Based on the results of polynomial/log curve fitting based on undifferentiated core of Niagara, Niagara2, Merom, Penryn, Prescott, Opteron die measurements
738 if (core_ty==OOO)
739 {
740 //undifferentiated_core = (0.0764*pipeline_stage*pipeline_stage -2.3685*pipeline_stage + 10.405);//OOO
741 undifferentiated_core = (3.57*log(pipeline_stage)-1.2643)>0?(3.57*log(pipeline_stage)-1.2643):0;
742 }
743 else if (core_ty==Inorder)
744 {
745 //undifferentiated_core = (0.1238*pipeline_stage + 7.2572)*0.9;//inorder
746 undifferentiated_core = (-2.19*log(pipeline_stage)+6.55)>0?(-2.19*log(pipeline_stage)+6.55):0;
747 }
748 else
749 {
750 cout<<"invalid core type"<<endl;
751 exit(0);
752 }
753 undifferentiated_core *= (1+ logtwo(num_hthreads)* 0.0716);

736 //Compute undifferentiated core area at 90nm.
737 if (embedded == false) {
738 //Based on the results of polynomial/log curve fitting based on undifferentiated core of Niagara, Niagara2, Merom, Penryn, Prescott, Opteron die measurements
739 if (core_ty == OOO) {
740 undifferentiated_core = (3.57 * log(pipeline_stage) - 1.2643) > 0 ?
741 (3.57 * log(pipeline_stage) - 1.2643) : 0;
742 } else if (core_ty == Inorder) {
743 undifferentiated_core = (-2.19 * log(pipeline_stage) + 6.55) > 0 ?
744 (-2.19 * log(pipeline_stage) + 6.55) : 0;
745 } else {
746 cout << "invalid core type" << endl;
747 exit(0);

754 }

748 }

755 else
756 {
757 //Based on the results in paper "parametrized processor models" Sandia Labs
758 if (XML->sys.opt_clockrate)

749 undifferentiated_core *= (1 + logtwo(num_hthreads) * 0.0716);
750 } else {
751 //Based on the results in paper "parametrized processor models" Sandia Labs
752 if (opt_for_clk)

759 undifferentiated_core_coe = 0.05;
760 else
761 undifferentiated_core_coe = 0;

753 undifferentiated_core_coe = 0.05;
754 else
755 undifferentiated_core_coe = 0;

762 undifferentiated_core = (0.4109* pipeline_stage - 0.776)*undifferentiated_core_coe;
763 undifferentiated_core *= (1+ logtwo(num_hthreads)* 0.0426);
764 }

756 undifferentiated_core = (0.4109 * pipeline_stage - 0.776) *
757 undifferentiated_core_coe;
758 undifferentiated_core *= (1 + logtwo(num_hthreads) * 0.0426);
759 }

765

760

766 undifferentiated_core *= g_tp.scaling_factor.logic_scaling_co_eff*1e6;//change from mm^2 to um^2
767 core_tx_density = g_tp.scaling_factor.core_tx_density;
768 //undifferentiated_core = 3*1e6;
769 //undifferentiated_core *= g_tp.scaling_factor.logic_scaling_co_eff;//(g_ip->F_sz_um*g_ip->F_sz_um/0.09/0.09)*;
770 power.readOp.leakage = undifferentiated_core*(core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
771 power.readOp.gate_leakage = undifferentiated_core*(core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;

761 undifferentiated_core *= g_tp.scaling_factor.logic_scaling_co_eff *
762 1e6;//change from mm^2 to um^2
763 core_tx_density = g_tp.scaling_factor.core_tx_density;
764 power.readOp.leakage = undifferentiated_core*(core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
765 power.readOp.gate_leakage = undifferentiated_core*(core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;

772

766

773 double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty);
774 power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction;
775 area.set_area(undifferentiated_core);

767 double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty);
768 power.readOp.longer_channel_leakage =
769 power.readOp.leakage * long_channel_device_reduction;
770 area.set_area(undifferentiated_core);

776

771

777 scktRatio = g_tp.sckt_co_eff;
778 power.readOp.dynamic *= scktRatio;
779 power.writeOp.dynamic *= scktRatio;
780 power.searchOp.dynamic *= scktRatio;
781 macro_PR_overhead = g_tp.macro_layout_overhead;
782 area.set_area(area.get_area()*macro_PR_overhead);

772 scktRatio = g_tp.sckt_co_eff;
773 power.readOp.dynamic *= scktRatio;
774 power.writeOp.dynamic *= scktRatio;
775 power.searchOp.dynamic *= scktRatio;
776 macro_PR_overhead = g_tp.macro_layout_overhead;
777 area.set_area(area.get_area()*macro_PR_overhead);

783

778

784
785
786// double vt=g_tp.peri_global.Vth;
787// double velocity_index=1.1;
788// double c_in=gate_C(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r , 0.0, false);
789// double c_out= drain_C_(g_tp.min_w_nmos_, NCH, 2, 1, g_tp.cell_h_def, false) + drain_C_(g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, PCH, 1, 1, g_tp.cell_h_def, false) + c_in;
790// double w_nmos=g_tp.min_w_nmos_;
791// double w_pmos=g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
792// double i_on_n=1.0;
793// double i_on_p=1.0;
794// double i_on_n_in=1.0;
795// double i_on_p_in=1;
796// double vdd=g_tp.peri_global.Vdd;
797
798// power.readOp.sc=shortcircuit_simple(vt, velocity_index, c_in, c_out, w_nmos,w_pmos, i_on_n, i_on_p,i_on_n_in, i_on_p_in, vdd);
799// power.readOp.dynamic=c_out*vdd*vdd/2;
800
801// cout<<power.readOp.dynamic << "dynamic" <<endl;
802// cout<<power.readOp.sc << "sc" << endl;
803
804// power.readOp.sc=shortcircuit(vt, velocity_index, c_in, c_out, w_nmos,w_pmos, i_on_n, i_on_p,i_on_n_in, i_on_p_in, vdd);
805// power.readOp.dynamic=c_out*vdd*vdd/2;
806//
807// cout<<power.readOp.dynamic << "dynamic" <<endl;
808// cout<<power.readOp.sc << "sc" << endl;
809
810
811

779 output_data.area = area.get_area() / 1e6;
780 output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
781 output_data.subthreshold_leakage_power =
782 longer_channel_device ? power.readOp.longer_channel_leakage :
783 power.readOp.leakage;
784 output_data.gate_leakage_power = power.readOp.gate_leakage;

812}
813

785}
786

787InstructionDecoder::InstructionDecoder(XMLNode* _xml_data, const string _name,
788 bool _is_default,
789 const InputParameter *configure_interface,
790 int opcode_length_, int num_decoders_,
791 bool x86_,
792 double clockRate_,
793 enum Device_ty device_ty_,
794 enum Core_type core_ty_)
795 : McPATComponent(_xml_data), is_default(_is_default),
796 opcode_length(opcode_length_), num_decoders(num_decoders_), x86(x86_),
797 device_ty(device_ty_), core_ty(core_ty_) {
798 /*
799 * Instruction decoder is different from n to 2^n decoders
800 * that are commonly used in row decoders in memory arrays.
801 * The RISC instruction decoder is typically a very simple device.
802 * We can decode an instruction by simply
803 * separating the machine word into small parts using wire slices
804 * The RISC instruction decoder can be approximated by the n to 2^n decoders,
805 * although this approximation usually underestimates power since each decoded
806 * instruction normally has more than 1 active signal.
807 *
808 * However, decoding a CISC instruction word is much more difficult
809 * than the RISC case. A CISC decoder is typically set up as a state machine.
810 * The machine reads the opcode field to determine
811 * what type of instruction it is,
812 * and where the other data values are.
813 * The instruction word is read in piece by piece,
814 * and decisions are made at each stage as to
815 * how the remainder of the instruction word will be read.
816 * (sequencer and ROM are usually needed)
817 * An x86 decoder can be even more complex since
818 * it involves both decoding instructions into u-ops and
819 * merging u-ops when doing micro-op fusion.
820 */
821 name = _name;
822 clockRate = clockRate_;
823 bool is_dram = false;
824 double pmos_to_nmos_sizing_r;
825 double load_nmos_width, load_pmos_width;
826 double C_driver_load, R_wire_load;
827 Area cell;

814

828

815void UndiffCore::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
816{
817 string indent_str(indent, ' ');
818 string indent_str_next(indent+2, ' ');
819 bool long_channel = XML->sys.longer_channel_device;

829 l_ip = *configure_interface;
830 local_result = init_interface(&l_ip, name);
831 cell.h = g_tp.cell_h_def;
832 cell.w = g_tp.cell_h_def;

820

833

821 if (is_tdp)
822 {
823 cout << indent_str << "UndiffCore:" << endl;
824 cout << indent_str_next << "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
825 cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate << " W" << endl;
826 //cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage <<" W" << endl;
827 cout << indent_str_next<< "Subthreshold Leakage = "
828 << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
829 cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
830 //cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl;
831 cout <<endl;
832 }
833 else
834 {
835 cout << indent_str << "UndiffCore:" << endl;
836 cout << indent_str_next << "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
837 cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate << " W" << endl;
838 cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage <<" W" << endl;
839 cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
840 //cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl;
841 cout <<endl;
842 }

834 num_decoder_segments = (int)ceil(opcode_length / 18.0);
835 if (opcode_length > 18) opcode_length = 18;
836 num_decoded_signals = (int)pow(2.0, opcode_length);
837 pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
838 load_nmos_width = g_tp.max_w_nmos_ / 2;
839 load_pmos_width = g_tp.max_w_nmos_ * pmos_to_nmos_sizing_r;
840 C_driver_load = 1024 * gate_C(load_nmos_width + load_pmos_width, 0, is_dram);
841 R_wire_load = 3000 * l_ip.F_sz_um * g_tp.wire_outside_mat.R_per_um;

843

842

844}

843 final_dec = new Decoder(
844 num_decoded_signals,
845 false,
846 C_driver_load,
847 R_wire_load,
848 false/*is_fa*/,
849 false/*is_dram*/,
850 false/*wl_tr*/, //to use peri device
851 cell);

845

852

846inst_decoder::inst_decoder(
847 bool _is_default,
848 const InputParameter *configure_interface,
849 int opcode_length_,
850 int num_decoders_,
851 bool x86_,
852 enum Device_ty device_ty_,
853 enum Core_type core_ty_)
854:is_default(_is_default),
855 opcode_length(opcode_length_),
856 num_decoders(num_decoders_),
857 x86(x86_),
858 device_ty(device_ty_),
859 core_ty(core_ty_)
860 {
861 /*
862 * Instruction decoder is different from n to 2^n decoders
863 * that are commonly used in row decoders in memory arrays.
864 * The RISC instruction decoder is typically a very simple device.
865 * We can decode an instruction by simply
866 * separating the machine word into small parts using wire slices.
867 * The RISC instruction decoder can be approximated by the n to 2^n decoders,
868 * although this approximation usually underestimates power since each decoded
869 * instruction normally has more than 1 active signal.
870 *
871 * However, decoding a CISC instruction word is much more difficult
872 * than the RISC case. A CISC decoder is typically set up as a state machine.
873 * The machine reads the opcode field to determine
874 * what type of instruction it is,
875 * and where the other data values are.
876 * The instruction word is read in piece by piece,
877 * and decisions are made at each stage as to
878 * how the remainder of the instruction word will be read.
879 * (sequencer and ROM are usually needed)
880 * An x86 decoder can be even more complex since
881 * it involves both decoding instructions into u-ops and
882 * merging u-ops when doing micro-op fusion.
883 */
884 bool is_dram=false;
885 double pmos_to_nmos_sizing_r;
886 double load_nmos_width, load_pmos_width;
887 double C_driver_load, R_wire_load;
888 Area cell;

853 PredecBlk * predec_blk1 = new PredecBlk(
854 num_decoded_signals,
855 final_dec,
856 0,//Assuming predec and dec are back to back
857 0,
858 1,//Each Predec only drives one final dec
859 false/*is_dram*/,
860 true);
861 PredecBlk * predec_blk2 = new PredecBlk(
862 num_decoded_signals,
863 final_dec,
864 0,//Assuming predec and dec are back to back
865 0,
866 1,//Each Predec only drives one final dec
867 false/*is_dram*/,
868 false);

889

869

890 l_ip=*configure_interface;
891 local_result = init_interface(&l_ip);
892 cell.h =g_tp.cell_h_def;
893 cell.w =g_tp.cell_h_def;

870 PredecBlkDrv * predec_blk_drv1 = new PredecBlkDrv(0, predec_blk1, false);
871 PredecBlkDrv * predec_blk_drv2 = new PredecBlkDrv(0, predec_blk2, false);

894

872

895 num_decoder_segments = (int)ceil(opcode_length/18.0);
896 if (opcode_length > 18) opcode_length = 18;
897 num_decoded_signals= (int)pow(2.0,opcode_length);
898 pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
899 load_nmos_width=g_tp.max_w_nmos_ /2;
900 load_pmos_width= g_tp.max_w_nmos_ * pmos_to_nmos_sizing_r;
901 C_driver_load = 1024*gate_C(load_nmos_width + load_pmos_width, 0, is_dram); //TODO: this number 1024 needs to be revisited
902 R_wire_load = 3000*l_ip.F_sz_um * g_tp.wire_outside_mat.R_per_um;

873 pre_dec = new Predec(predec_blk_drv1, predec_blk_drv2);

903

874

904 final_dec = new Decoder(
905 num_decoded_signals,
906 false,
907 C_driver_load,
908 R_wire_load,
909 false/*is_fa*/,
910 false/*is_dram*/,
911 false/*wl_tr*/, //to use peri device
912 cell);

875 double area_decoder = final_dec->area.get_area() * num_decoded_signals *
876 num_decoder_segments * num_decoders;
877 //double w_decoder = area_decoder / area.get_h();
878 double area_pre_dec = (predec_blk_drv1->area.get_area() +
879 predec_blk_drv2->area.get_area() +
880 predec_blk1->area.get_area() +
881 predec_blk2->area.get_area()) *
882 num_decoder_segments * num_decoders;
883 area.set_area(area.get_area() + area_decoder + area_pre_dec);
884 double macro_layout_overhead = g_tp.macro_layout_overhead;
885 double chip_PR_overhead = g_tp.chip_layout_overhead;
886 area.set_area(area.get_area()*macro_layout_overhead*chip_PR_overhead);

913

887

914 PredecBlk * predec_blk1 = new PredecBlk(
915 num_decoded_signals,
916 final_dec,
917 0,//Assuming predec and dec are back to back
918 0,
919 1,//Each Predec only drives one final dec
920 false/*is_dram*/,
921 true);
922 PredecBlk * predec_blk2 = new PredecBlk(
923 num_decoded_signals,
924 final_dec,
925 0,//Assuming predec and dec are back to back
926 0,
927 1,//Each Predec only drives one final dec
928 false/*is_dram*/,
929 false);

888 inst_decoder_delay_power();

930

889

931 PredecBlkDrv * predec_blk_drv1 = new PredecBlkDrv(0, predec_blk1, false);
932 PredecBlkDrv * predec_blk_drv2 = new PredecBlkDrv(0, predec_blk2, false);

890 double sckRation = g_tp.sckt_co_eff;
891 power.readOp.dynamic *= sckRation;
892 power.writeOp.dynamic *= sckRation;
893 power.searchOp.dynamic *= sckRation;

933

894

934 pre_dec = new Predec(predec_blk_drv1, predec_blk_drv2);

895 double long_channel_device_reduction =
896 longer_channel_device_reduction(device_ty, core_ty);
897 power.readOp.longer_channel_leakage = power.readOp.leakage *
898 long_channel_device_reduction;

935

899

936 double area_decoder = final_dec->area.get_area() * num_decoded_signals * num_decoder_segments*num_decoders;
937 //double w_decoder = area_decoder / area.get_h();
938 double area_pre_dec = (predec_blk_drv1->area.get_area() +
939 predec_blk_drv2->area.get_area() +
940 predec_blk1->area.get_area() +
941 predec_blk2->area.get_area())*
942 num_decoder_segments*num_decoders;
943 area.set_area(area.get_area()+ area_decoder + area_pre_dec);
944 double macro_layout_overhead = g_tp.macro_layout_overhead;
945 double chip_PR_overhead = g_tp.chip_layout_overhead;
946 area.set_area(area.get_area()*macro_layout_overhead*chip_PR_overhead);
947
948 inst_decoder_delay_power();
949
950 double sckRation = g_tp.sckt_co_eff;
951 power.readOp.dynamic *= sckRation;
952 power.writeOp.dynamic *= sckRation;
953 power.searchOp.dynamic *= sckRation;
954
955 double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
956 power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction;
957

900 output_data.area = area.get_area() / 1e6;
901 output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
902 output_data.subthreshold_leakage_power = power.readOp.leakage;
903 output_data.gate_leakage_power = power.readOp.gate_leakage;

958}
959

904}
905

960void inst_decoder::inst_decoder_delay_power()
961{

906void InstructionDecoder::inst_decoder_delay_power() {

962

907

963 double dec_outrisetime;
964 double inrisetime=0, outrisetime;
965 double pppm_t[4] = {1,1,1,1};
966 double squencer_passes = x86?2:1;

908 double dec_outrisetime;
909 double inrisetime = 0, outrisetime;
910 double pppm_t[4] = {1, 1, 1, 1};
911 double squencer_passes = x86 ? 2 : 1;

967

912

968 outrisetime = pre_dec->compute_delays(inrisetime);
969 dec_outrisetime = final_dec->compute_delays(outrisetime);
970 set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments, squencer_passes*num_decoder_segments, num_decoder_segments);
971 power = power + pre_dec->power*pppm_t;

913 outrisetime = pre_dec->compute_delays(inrisetime);
914 dec_outrisetime = final_dec->compute_delays(outrisetime);
915 set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments, squencer_passes*num_decoder_segments, num_decoder_segments);
916 power = power + pre_dec->power * pppm_t;

972 set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments*num_decoded_signals,

917 set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments*num_decoded_signals,

973 num_decoder_segments*num_decoded_signals, squencer_passes*num_decoder_segments);
974 power = power + final_dec->power*pppm_t;

918 num_decoder_segments*num_decoded_signals, squencer_passes*num_decoder_segments);
919 power = power + final_dec->power * pppm_t;

975}

920}

976void inst_decoder::leakage_feedback(double temperature)
977{

921
922void InstructionDecoder::leakage_feedback(double temperature) {

978 l_ip.temp = (unsigned int)round(temperature/10.0)*10;

923 l_ip.temp = (unsigned int)round(temperature/10.0)*10;

979 uca_org_t init_result = init_interface(&l_ip); // init_result is dummy

924 uca_org_t init_result = init_interface(&l_ip, name); // init_result is dummy

980
981 final_dec->leakage_feedback(temperature);
982 pre_dec->leakage_feedback(temperature);
983
984 double pppm_t[4] = {1,1,1,1};
985 double squencer_passes = x86?2:1;
986
987 set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments, squencer_passes*num_decoder_segments, num_decoder_segments);

--- 7 unchanged lines hidden (view full) ---

995 power.readOp.dynamic *= sckRation;
996 power.writeOp.dynamic *= sckRation;
997 power.searchOp.dynamic *= sckRation;
998
999 double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
1000 power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction;
1001}
1002

925
926 final_dec->leakage_feedback(temperature);
927 pre_dec->leakage_feedback(temperature);
928
929 double pppm_t[4] = {1,1,1,1};
930 double squencer_passes = x86?2:1;
931
932 set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments, squencer_passes*num_decoder_segments, num_decoder_segments);

--- 7 unchanged lines hidden (view full) ---

940 power.readOp.dynamic *= sckRation;
941 power.writeOp.dynamic *= sckRation;
942 power.searchOp.dynamic *= sckRation;
943
944 double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
945 power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction;
946}
947

1003inst_decoder::~inst_decoder()
1004{
1005 local_result.cleanup();

948InstructionDecoder::~InstructionDecoder() {
949 local_result.cleanup();

1006

950

1007 delete final_dec;

951 delete final_dec;

1008

952

1009 delete pre_dec->blk1;
1010 delete pre_dec->blk2;
1011 delete pre_dec->drv1;
1012 delete pre_dec->drv2;
1013 delete pre_dec;

953 delete pre_dec->blk1;
954 delete pre_dec->blk2;
955 delete pre_dec->drv1;
956 delete pre_dec->drv2;
957 delete pre_dec;