logic.cc revision 10152:52c552138ba1
1/*****************************************************************************
2 *                                McPAT
3 *                      SOFTWARE LICENSE AGREEMENT
4 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
5 *                          All Rights Reserved
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are
9 * met: redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer;
11 * redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution;
14 * neither the name of the copyright holders nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 *
30 ***************************************************************************/
31
32#include "logic.h"
33
34
35//selection_logic
36selection_logic::selection_logic(
37    bool   _is_default,
38    int    win_entries_,
39    int    issue_width_,
40    const InputParameter *configure_interface,
41    enum Device_ty device_ty_,
42    enum Core_type core_ty_)
43    //const ParseXML *_XML_interface)
44 :is_default(_is_default),
45  win_entries(win_entries_),
46  issue_width(issue_width_),
47  device_ty(device_ty_),
48  core_ty(core_ty_)
49 {
50        //uca_org_t result2;
51        l_ip=*configure_interface;
52        local_result = init_interface(&l_ip);
53        //init_tech_params(l_ip.F_sz_um, false);
54        //win_entries=numIBEntries;//IQentries;
55                //issue_width=issueWidth;
56        selection_power();
57        double sckRation = g_tp.sckt_co_eff;
58        power.readOp.dynamic *= sckRation;
59        power.writeOp.dynamic *= sckRation;
60        power.searchOp.dynamic *= sckRation;
61
62        double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
63        power.readOp.longer_channel_leakage	= power.readOp.leakage*long_channel_device_reduction;
64         }
65
66void selection_logic::selection_power()
67{//based on cost effective superscalar processor TR pp27-31
68  double Ctotal, Cor, Cpencode;
69  int num_arbiter;
70  double WSelORn, WSelORprequ, WSelPn, WSelPp, WSelEnn, WSelEnp;
71
72  //TODO: the 0.8um process data is used.
73  WSelORn    	=  	12.5 * l_ip.F_sz_um;//this was 10 micron for the 0.8 micron process
74  WSelORprequ   = 	50 * l_ip.F_sz_um;//this was 40 micron for the 0.8 micron process
75  WSelPn     	= 	12.5 * l_ip.F_sz_um;//this was 10mcron for the 0.8 micron process
76  WSelPp     	=  	18.75 * l_ip.F_sz_um;//this was 15 micron for the 0.8 micron process
77  WSelEnn    	=  	6.25 * l_ip.F_sz_um;//this was 5 micron for the 0.8 micron process
78  WSelEnp    	=  	12.5 * l_ip.F_sz_um;//this was 10 micron for the 0.8 micron process
79
80
81  Ctotal=0;
82  num_arbiter=1;
83  while(win_entries > 4)
84    {
85      win_entries = (int)ceil((double)win_entries / 4.0);
86      num_arbiter += win_entries;
87    }
88  //the 4-input OR logic to generate anyreq
89  Cor = 4 * drain_C_(WSelORn,NCH,1,1, g_tp.cell_h_def) + drain_C_(WSelORprequ,PCH,1,1, g_tp.cell_h_def);
90  power.readOp.gate_leakage = cmos_Ig_leakage(WSelORn, WSelORprequ, 4, nor)*g_tp.peri_global.Vdd;
91
92  //The total capacity of the 4-bit priority encoder
93  Cpencode = drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,1, 1, g_tp.cell_h_def) +
94    2*drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,2, 1, g_tp.cell_h_def) +
95    3*drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,3, 1, g_tp.cell_h_def) +
96    4*drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,4, 1, g_tp.cell_h_def) +//precompute priority logic
97    2*4*gate_C(WSelEnn+WSelEnp,20.0)+
98    4*drain_C_(WSelEnn,NCH,1, 1, g_tp.cell_h_def) + 2*4*drain_C_(WSelEnp,PCH,1, 1, g_tp.cell_h_def)+//enable logic
99    (2*4+2*3+2*2+2)*gate_C(WSelPn+WSelPp,10.0);//requests signal
100
101  Ctotal += issue_width * num_arbiter*(Cor+Cpencode);
102
103  power.readOp.dynamic = Ctotal*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*2;//2 means the abitration signal need to travel round trip
104  power.readOp.leakage = issue_width * num_arbiter *
105      (cmos_Isub_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p
106       + cmos_Isub_leakage(WSelPn, WSelPp, 3, nor)//grant2p
107       + cmos_Isub_leakage(WSelPn, WSelPp, 4, nor)//grant3p
108       + cmos_Isub_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic
109       + cmos_Isub_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant sIsubnals
110                  )*g_tp.peri_global.Vdd;
111  power.readOp.gate_leakage = issue_width * num_arbiter *
112      (cmos_Ig_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p
113       + cmos_Ig_leakage(WSelPn, WSelPp, 3, nor)//grant2p
114       + cmos_Ig_leakage(WSelPn, WSelPp, 4, nor)//grant3p
115       + cmos_Ig_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic
116       + cmos_Ig_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant signals
117        )*g_tp.peri_global.Vdd;
118}
119
120
121dep_resource_conflict_check::dep_resource_conflict_check(
122        const InputParameter *configure_interface,
123        const CoreDynParam & dyn_p_,
124        int   compare_bits_,
125    bool   _is_default)
126 :  l_ip(*configure_interface),
127    coredynp(dyn_p_),
128    compare_bits(compare_bits_),
129        is_default(_is_default)
130{
131        Wcompn    	=  	25 * l_ip.F_sz_um;//this was 20.0 micron for the 0.8 micron process
132        Wevalinvp   = 	25 * l_ip.F_sz_um;//this was 20.0 micron for the 0.8 micron process
133        Wevalinvn   = 	100 * l_ip.F_sz_um;//this was 80.0 mcron for the 0.8 micron process
134        Wcomppreequ =  	50 * l_ip.F_sz_um;//this was 40.0  micron for the 0.8 micron process
135        WNORn    	=  	6.75 * l_ip.F_sz_um;//this was 5.4 micron for the 0.8 micron process
136        WNORp    	=  	38.125 * l_ip.F_sz_um;//this was 30.5 micron for the 0.8 micron process
137
138        local_result = init_interface(&l_ip);
139
140        if (coredynp.core_ty==Inorder)
141                    compare_bits += 16 + 8 + 8;//TODO: opcode bits + log(shared resources) + REG TAG BITS-->opcode comparator
142        else
143                compare_bits += 16 + 8 + 8;
144
145                conflict_check_power();
146        double sckRation = g_tp.sckt_co_eff;
147        power.readOp.dynamic *= sckRation;
148        power.writeOp.dynamic *= sckRation;
149        power.searchOp.dynamic *= sckRation;
150
151}
152
153void dep_resource_conflict_check::conflict_check_power()
154{
155        double Ctotal;
156        int num_comparators;
157        num_comparators = 3*((coredynp.decodeW) * (coredynp.decodeW)-coredynp.decodeW);//2(N*N-N) is used for source to dest comparison, (N*N-N) is used for dest to dest comparision.
158        //When decode-width ==1, no dcl logic
159
160        Ctotal = num_comparators * compare_cap();
161        //printf("%i,%s\n",XML_interface->sys.core[0].predictor.predictor_entries,XML_interface->sys.core[0].predictor.prediction_scheme);
162
163        power.readOp.dynamic=Ctotal*/*CLOCKRATE*/g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/*AF*/;
164        power.readOp.leakage=num_comparators*compare_bits*2*simplified_nmos_leakage(Wcompn,  false);
165
166        double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty);
167        power.readOp.longer_channel_leakage	= power.readOp.leakage*long_channel_device_reduction;
168        power.readOp.gate_leakage=num_comparators*compare_bits*2*cmos_Ig_leakage(Wcompn, 0, 2, nmos);
169
170}
171
172/* estimate comparator power consumption (this comparator is similar
173   to the tag-match structure in a CAM */
174double dep_resource_conflict_check::compare_cap()
175{
176  double c1, c2;
177
178  WNORp = WNORp * compare_bits/2.0;//resize the big NOR gate at the DCL according to fan in.
179  /* bottom part of comparator */
180  c2 = (compare_bits)*(drain_C_(Wcompn,NCH,1,1, g_tp.cell_h_def)+drain_C_(Wcompn,NCH,2,1, g_tp.cell_h_def))+
181  drain_C_(Wevalinvp,PCH,1,1, g_tp.cell_h_def) + drain_C_(Wevalinvn,NCH,1,1, g_tp.cell_h_def);
182
183  /* top part of comparator */
184  c1 = (compare_bits)*(drain_C_(Wcompn,NCH,1,1, g_tp.cell_h_def)+drain_C_(Wcompn,NCH,2,1, g_tp.cell_h_def)+
185                  drain_C_(Wcomppreequ,NCH,1,1, g_tp.cell_h_def)) +  gate_C(WNORn + WNORp,10.0) +
186                  drain_C_(WNORp,NCH,2,1, g_tp.cell_h_def) + compare_bits*drain_C_(WNORn,NCH,2,1, g_tp.cell_h_def);
187  return(c1 + c2);
188
189}
190
191void dep_resource_conflict_check::leakage_feedback(double temperature)
192{
193  l_ip.temp = (unsigned int)round(temperature/10.0)*10;
194  uca_org_t init_result = init_interface(&l_ip); // init_result is dummy
195
196  // This is part of conflict_check_power()
197  int num_comparators = 3*((coredynp.decodeW) * (coredynp.decodeW)-coredynp.decodeW);//2(N*N-N) is used for source to dest comparison, (N*N-N) is used for dest to dest comparision.
198  power.readOp.leakage=num_comparators*compare_bits*2*simplified_nmos_leakage(Wcompn,  false);
199
200  double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty);
201  power.readOp.longer_channel_leakage	= power.readOp.leakage*long_channel_device_reduction;
202  power.readOp.gate_leakage=num_comparators*compare_bits*2*cmos_Ig_leakage(Wcompn, 0, 2, nmos);
203}
204
205//TODO: add inverter and transmission gate base DFF.
206
207DFFCell::DFFCell(
208                bool _is_dram,
209                double _WdecNANDn,
210                double _WdecNANDp,
211                double _cell_load,
212                const InputParameter *configure_interface)
213:is_dram(_is_dram),
214cell_load(_cell_load),
215WdecNANDn(_WdecNANDn),
216WdecNANDp(_WdecNANDp)
217{//this model is based on the NAND2 based DFF.
218                        l_ip=*configure_interface;
219//			area.set_area(730*l_ip.F_sz_um*l_ip.F_sz_um);
220                        area.set_area(5*compute_gate_area(NAND, 2,WdecNANDn,WdecNANDp, g_tp.cell_h_def)
221                                + compute_gate_area(NAND, 2,WdecNANDn,WdecNANDn, g_tp.cell_h_def));
222
223
224}
225
226
227double DFFCell::fpfp_node_cap(unsigned int fan_in, unsigned int fan_out)
228{
229  double Ctotal = 0;
230  //printf("WdecNANDn = %E\n", WdecNANDn);
231
232  /* part 1: drain cap of NAND gate */
233  Ctotal += drain_C_(WdecNANDn, NCH, 2, 1, g_tp.cell_h_def, is_dram) + fan_in * drain_C_(WdecNANDp, PCH, 1, 1, g_tp.cell_h_def, is_dram);
234
235  /* part 2: gate cap of NAND gates */
236  Ctotal += fan_out * gate_C(WdecNANDn + WdecNANDp, 0, is_dram);
237
238  return Ctotal;
239}
240
241
242void DFFCell::compute_DFF_cell()
243{
244        double c1, c2, c3, c4, c5, c6;
245           /* node 5 and node 6 are identical to node 1 in capacitance */
246           c1 = c5 = c6 = fpfp_node_cap(2, 1);
247           c2 = fpfp_node_cap(2, 3);
248           c3 = fpfp_node_cap(3, 2);
249           c4 = fpfp_node_cap(2, 2);
250
251           //cap-load of the clock signal in each Dff, actually the clock signal only connected to one NAND2
252           clock_cap= 2 * gate_C(WdecNANDn + WdecNANDp, 0, is_dram);
253           e_switch.readOp.dynamic += (c4 + c1 + c2 + c3 + c5 + c6 + 2*cell_load)*0.5*g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;;
254
255           /* no 1/2 for e_keep and e_clock because clock signal switches twice in one cycle */
256           e_keep_1.readOp.dynamic += c3 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ;
257           e_keep_0.readOp.dynamic += c2 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ;
258           e_clock.readOp.dynamic += clock_cap* g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;;
259
260           /* static power */
261           e_switch.readOp.leakage +=  (cmos_Isub_leakage(WdecNANDn, WdecNANDp, 2, nand)*5//5 NAND2 and 1 NAND3 in a DFF
262                                           + cmos_Isub_leakage(WdecNANDn, WdecNANDn, 3, nand))*g_tp.peri_global.Vdd;
263           e_switch.readOp.gate_leakage += (cmos_Ig_leakage(WdecNANDn, WdecNANDp, 2, nand)*5//5 NAND2 and 1 NAND3 in a DFF
264                                           + cmos_Ig_leakage(WdecNANDn, WdecNANDn, 3, nand))*g_tp.peri_global.Vdd;
265           //printf("leakage =%E\n",cmos_Ileak(1, is_dram) );
266}
267
268Pipeline::Pipeline(
269                const InputParameter *configure_interface,
270                const CoreDynParam & dyn_p_,
271                enum Device_ty device_ty_,
272                bool _is_core_pipeline,
273                bool _is_default)
274: l_ip(*configure_interface),
275  coredynp(dyn_p_),
276  device_ty(device_ty_),
277  is_core_pipeline(_is_core_pipeline),
278  is_default(_is_default),
279  num_piperegs(0.0)
280
281  {
282        local_result = init_interface(&l_ip);
283        if (!coredynp.Embedded)
284                process_ind = true;
285        else
286                process_ind = false;
287        WNANDn = (process_ind)? 25 *   l_ip.F_sz_um : g_tp.min_w_nmos_ ;//this was  20 micron for the 0.8 micron process
288        WNANDp = (process_ind)? 37.5 * l_ip.F_sz_um : g_tp.min_w_nmos_*pmos_to_nmos_sz_ratio();//this was  30 micron for the 0.8 micron process
289        load_per_pipeline_stage = 2*gate_C(WNANDn + WNANDp, 0, false);
290        compute();
291
292}
293
294void Pipeline::compute()
295{
296        compute_stage_vector();
297        DFFCell pipe_reg(false, WNANDn,WNANDp, load_per_pipeline_stage, &l_ip);
298        pipe_reg.compute_DFF_cell();
299
300        double clock_power_pipereg = num_piperegs * pipe_reg.e_clock.readOp.dynamic;
301        //******************pipeline power: currently, we average all the possibilities of the states of DFFs in the pipeline. A better way to do it is to consider
302        //the harming distance of two consecutive signals, However McPAT does not have plan to do this in near future as it focuses on worst case power.
303        double pipe_reg_power = num_piperegs * (pipe_reg.e_switch.readOp.dynamic+pipe_reg.e_keep_0.readOp.dynamic+pipe_reg.e_keep_1.readOp.dynamic)/3+clock_power_pipereg;
304        double pipe_reg_leakage = num_piperegs * pipe_reg.e_switch.readOp.leakage;
305        double pipe_reg_gate_leakage = num_piperegs * pipe_reg.e_switch.readOp.gate_leakage;
306        power.readOp.dynamic	+=pipe_reg_power;
307        power.readOp.leakage	+=pipe_reg_leakage;
308        power.readOp.gate_leakage	+=pipe_reg_gate_leakage;
309        area.set_area(num_piperegs * pipe_reg.area.get_area());
310
311        double long_channel_device_reduction = longer_channel_device_reduction(device_ty, coredynp.core_ty);
312        power.readOp.longer_channel_leakage	= power.readOp.leakage*long_channel_device_reduction;
313
314
315        double sckRation = g_tp.sckt_co_eff;
316        power.readOp.dynamic *= sckRation;
317        power.writeOp.dynamic *= sckRation;
318        power.searchOp.dynamic *= sckRation;
319        double macro_layout_overhead = g_tp.macro_layout_overhead;
320        if (!coredynp.Embedded)
321                area.set_area(area.get_area()*macro_layout_overhead);
322}
323
324void Pipeline::compute_stage_vector()
325{
326        double num_stages, tot_stage_vector, per_stage_vector;
327        int opcode_length = coredynp.x86? coredynp.micro_opcode_length:coredynp.opcode_length;
328        //Hthread = thread_clock_gated? 1:num_thread;
329
330  if (!is_core_pipeline)
331  {
332        num_piperegs=l_ip.pipeline_stages*l_ip.per_stage_vector;//The number of pipeline stages are calculated based on the achievable throughput and required throughput
333  }
334  else
335  {
336        if (coredynp.core_ty==Inorder)
337        {
338                /* assume 6 pipe stages and try to estimate bits per pipe stage */
339                /* pipe stage 0/IF */
340                num_piperegs += coredynp.pc_width*2*coredynp.num_hthreads;
341                /* pipe stage IF/ID */
342                num_piperegs += coredynp.fetchW*(coredynp.instruction_length + coredynp.pc_width)*coredynp.num_hthreads;
343                /* pipe stage IF/ThreadSEL */
344                if (coredynp.multithreaded) num_piperegs += coredynp.num_hthreads*coredynp.perThreadState; //8 bit thread states
345                /* pipe stage ID/EXE */
346                num_piperegs += coredynp.decodeW*(coredynp.instruction_length + coredynp.pc_width + pow(2.0,opcode_length)+ 2*coredynp.int_data_width)*coredynp.num_hthreads;
347                /* pipe stage EXE/MEM */
348                num_piperegs += coredynp.issueW*(3 * coredynp.arch_ireg_width + pow(2.0,opcode_length) + 8*2*coredynp.int_data_width/*+2*powers (2,reg_length)*/);
349                /* pipe stage MEM/WB the 2^opcode_length means the total decoded signal for the opcode*/
350                num_piperegs += coredynp.issueW*(2*coredynp.int_data_width + pow(2.0,opcode_length) + 8*2*coredynp.int_data_width/*+2*powers (2,reg_length)*/);
351//		/* pipe stage 5/6 */
352//		num_piperegs += issueWidth*(data_width + powers (2,opcode_length)/*+2*powers (2,reg_length)*/);
353//		/* pipe stage 6/7 */
354//		num_piperegs += issueWidth*(data_width + powers (2,opcode_length)/*+2*powers (2,reg_length)*/);
355//		/* pipe stage 7/8 */
356//		num_piperegs += issueWidth*(data_width + powers (2,opcode_length)/**2*powers (2,reg_length)*/);
357//		/* assume 50% extra in control signals (rule of thumb) */
358                num_stages=6;
359
360        }
361        else
362        {
363                /* assume 12 stage pipe stages and try to estimate bits per pipe stage */
364                /*OOO: Fetch, decode, rename, IssueQ, dispatch, regread, EXE, MEM, WB, CM */
365
366                /* pipe stage 0/1F*/
367                num_piperegs += coredynp.pc_width*2*coredynp.num_hthreads ;//PC and Next PC
368                /* pipe stage IF/ID */
369                num_piperegs += coredynp.fetchW*(coredynp.instruction_length + coredynp.pc_width)*coredynp.num_hthreads;//PC is used to feed branch predictor in ID
370                /* pipe stage 1D/Renaming*/
371                num_piperegs += coredynp.decodeW*(coredynp.instruction_length + coredynp.pc_width)*coredynp.num_hthreads;//PC is for branch exe in later stage.
372                /* pipe stage Renaming/wire_drive */
373                num_piperegs += coredynp.decodeW*(coredynp.instruction_length + coredynp.pc_width);
374                /* pipe stage Renaming/IssueQ */
375                num_piperegs += coredynp.issueW*(coredynp.instruction_length  + coredynp.pc_width + 3*coredynp.phy_ireg_width)*coredynp.num_hthreads;//3*coredynp.phy_ireg_width means 2 sources and 1 dest
376                /* pipe stage IssueQ/Dispatch */
377                num_piperegs += coredynp.issueW*(coredynp.instruction_length + 3 * coredynp.phy_ireg_width);
378                /* pipe stage Dispatch/EXE */
379
380                num_piperegs += coredynp.issueW*(3 * coredynp.phy_ireg_width + coredynp.pc_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/);
381                /* 2^opcode_length means the total decoded signal for the opcode*/
382                num_piperegs += coredynp.issueW*(2*coredynp.int_data_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/);
383                /*2 source operands in EXE; Assume 2EXE stages* since we do not really distinguish OP*/
384                num_piperegs += coredynp.issueW*(2*coredynp.int_data_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/);
385                /* pipe stage EXE/MEM, data need to be read/write, address*/
386                num_piperegs += coredynp.issueW*(coredynp.int_data_width + coredynp.v_address_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/);//memory Opcode still need to be passed
387                /* pipe stage MEM/WB; result data, writeback regs */
388                num_piperegs += coredynp.issueW*(coredynp.int_data_width + coredynp.phy_ireg_width /* powers (2,opcode_length) + (2,opcode_length)+2*powers (2,reg_length)*/);
389                /* pipe stage WB/CM ; result data, regs need to be updated, address for resolve memory ops in ROB's top*/
390                num_piperegs += coredynp.commitW*(coredynp.int_data_width + coredynp.v_address_width + coredynp.phy_ireg_width/*+ powers (2,opcode_length)*2*powers (2,reg_length)*/)*coredynp.num_hthreads;
391//		if (multithreaded)
392//		{
393//
394//		}
395                num_stages=12;
396
397        }
398
399        /* assume 50% extra in control registers and interrupt registers (rule of thumb) */
400        num_piperegs = num_piperegs * 1.5;
401        tot_stage_vector=num_piperegs;
402        per_stage_vector=tot_stage_vector/num_stages;
403
404        if (coredynp.core_ty==Inorder)
405        {
406                if (coredynp.pipeline_stages>6)
407                        num_piperegs= per_stage_vector*coredynp.pipeline_stages;
408        }
409        else//OOO
410        {
411                if (coredynp.pipeline_stages>12)
412                        num_piperegs= per_stage_vector*coredynp.pipeline_stages;
413        }
414  }
415
416}
417
418FunctionalUnit::FunctionalUnit(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, enum FU_type fu_type_)
419:XML(XML_interface),
420 ithCore(ithCore_),
421 interface_ip(*interface_ip_),
422 coredynp(dyn_p_),
423 fu_type(fu_type_)
424{
425    double area_t;//, leakage, gate_leakage;
426    double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
427        clockRate = coredynp.clockRate;
428        executionTime = coredynp.executionTime;
429
430        //XML_interface=_XML_interface;
431        uca_org_t result2;
432        result2 = init_interface(&interface_ip);
433        if (XML->sys.Embedded)
434        {
435                if (fu_type == FPU)
436                {
437                        num_fu=coredynp.num_fpus;
438                        //area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
439                        area_t = 4.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2 The base number
440                        //4.47 contains both VFP and NEON processing unit, VFP is about 40% and NEON is about 60%
441                        if (g_ip->F_sz_nm>90)
442                                area_t = 4.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
443                        leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
444                        gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
445                        //energy = 0.3529/10*1e-9;//this is the energy(nJ) for a FP instruction in FPU usually it can have up to 20 cycles.
446//			base_energy = coredynp.core_ty==Inorder? 0: 89e-3*3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
447//			base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
448                        base_energy = 0;
449                        per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per Hz energy(nJ)
450                        //FPU power from Sandia's processor sizing tech report
451                        FU_height=(18667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data
452                }
453                else if (fu_type == ALU)
454                {
455                        num_fu=coredynp.num_alus;
456                        area_t = 280*260*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
457                        leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
458                        gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
459//			base_energy = coredynp.core_ty==Inorder? 0:89e-3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
460//			base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
461                        base_energy = 0;
462                        per_access_energy = 1.15/3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ)
463                        FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU
464
465                }
466                else if (fu_type == MUL)
467                {
468                        num_fu=coredynp.num_muls;
469                        area_t = 280*260*3*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
470                        leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
471                        gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
472//			base_energy = coredynp.core_ty==Inorder? 0:89e-3*2; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
473//			base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
474                        base_energy = 0;
475                        per_access_energy = 1.15*2/3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ), coefficient based on Wattch
476                        FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data
477                }
478                else
479                {
480                        cout<<"Unknown Functional Unit Type"<<endl;
481                        exit(0);
482                }
483                per_access_energy *=0.5;//According to ARM data embedded processor has much lower per acc energy
484        }
485        else
486        {
487                if (fu_type == FPU)
488                {
489                        num_fu=coredynp.num_fpus;
490                        //area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
491                        area_t = 8.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2
492                        if (g_ip->F_sz_nm>90)
493                                area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
494                        leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
495                        gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
496                        //energy = 0.3529/10*1e-9;//this is the energy(nJ) for a FP instruction in FPU usually it can have up to 20 cycles.
497                        base_energy = coredynp.core_ty==Inorder? 0: 89e-3*3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
498                        base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
499                        per_access_energy = 1.15*3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per op energy(nJ)
500                        FU_height=(38667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data
501                }
502                else if (fu_type == ALU)
503                {
504                        num_fu=coredynp.num_alus;
505                        area_t = 280*260*2*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
506                        leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
507                        gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
508                        base_energy = coredynp.core_ty==Inorder? 0:89e-3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
509                        base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
510                        per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ)
511                        FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU
512
513                }
514                else if (fu_type == MUL)
515                {
516                        num_fu=coredynp.num_muls;
517                        area_t = 280*260*2*3*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
518                        leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
519                        gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
520                        base_energy = coredynp.core_ty==Inorder? 0:89e-3*2; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
521                        base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
522                        per_access_energy = 1.15*2/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ), coefficient based on Wattch
523                        FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data
524                }
525                else
526                {
527                        cout<<"Unknown Functional Unit Type"<<endl;
528                        exit(0);
529                }
530        }
531        //IEXEU, simple ALU and FPU
532        //  double C_ALU, C_EXEU, C_FPU; //Lum Equivalent capacitance of IEXEU and FPU. Based on Intel and Sun 90nm process fabracation.
533        //
534        //  C_ALU	  = 0.025e-9;//F
535        //  C_EXEU  = 0.05e-9; //F
536        //  C_FPU	  = 0.35e-9;//F
537    area.set_area(area_t*num_fu);
538    leakage *= num_fu;
539    gate_leakage *=num_fu;
540        double macro_layout_overhead = g_tp.macro_layout_overhead;
541//	if (!XML->sys.Embedded)
542                area.set_area(area.get_area()*macro_layout_overhead);
543}
544
545void FunctionalUnit::computeEnergy(bool is_tdp)
546{
547        double pppm_t[4]    = {1,1,1,1};
548        double FU_duty_cycle;
549        if (is_tdp)
550        {
551
552
553                set_pppm(pppm_t, 2, 2, 2, 2);//2 means two source operands needs to be passed for each int instruction.
554                if (fu_type == FPU)
555                {
556                        stats_t.readAc.access = num_fu;
557                        tdp_stats = stats_t;
558                        FU_duty_cycle = coredynp.FPU_duty_cycle;
559                }
560                else if (fu_type == ALU)
561                {
562                        stats_t.readAc.access = 1*num_fu;
563                        tdp_stats = stats_t;
564                        FU_duty_cycle = coredynp.ALU_duty_cycle;
565                }
566                else if (fu_type == MUL)
567                {
568                        stats_t.readAc.access = num_fu;
569                        tdp_stats = stats_t;
570                        FU_duty_cycle = coredynp.MUL_duty_cycle;
571                }
572
573            //power.readOp.dynamic = base_energy/clockRate + energy*stats_t.readAc.access;
574            power.readOp.dynamic = per_access_energy*stats_t.readAc.access + base_energy/clockRate;
575                double sckRation = g_tp.sckt_co_eff;
576                power.readOp.dynamic *= sckRation*FU_duty_cycle;
577                power.writeOp.dynamic *= sckRation;
578                power.searchOp.dynamic *= sckRation;
579
580            power.readOp.leakage = leakage;
581            power.readOp.gate_leakage = gate_leakage;
582            double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty);
583            power.readOp.longer_channel_leakage	= power.readOp.leakage*long_channel_device_reduction;
584
585        }
586        else
587        {
588                if (fu_type == FPU)
589                {
590                        stats_t.readAc.access = XML->sys.core[ithCore].fpu_accesses;
591                        rtp_stats = stats_t;
592                }
593                else if (fu_type == ALU)
594                {
595                        stats_t.readAc.access = XML->sys.core[ithCore].ialu_accesses;
596                        rtp_stats = stats_t;
597                }
598                else if (fu_type == MUL)
599                {
600                        stats_t.readAc.access = XML->sys.core[ithCore].mul_accesses;
601                        rtp_stats = stats_t;
602                }
603
604            //rt_power.readOp.dynamic = base_energy*executionTime + energy*stats_t.readAc.access;
605            rt_power.readOp.dynamic = per_access_energy*stats_t.readAc.access + base_energy*executionTime;
606                double sckRation = g_tp.sckt_co_eff;
607                rt_power.readOp.dynamic *= sckRation;
608                rt_power.writeOp.dynamic *= sckRation;
609                rt_power.searchOp.dynamic *= sckRation;
610
611        }
612
613
614}
615
616void FunctionalUnit::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
617{
618        string indent_str(indent, ' ');
619        string indent_str_next(indent+2, ' ');
620        bool long_channel = XML->sys.longer_channel_device;
621
622//	cout << indent_str_next << "Results Broadcast Bus Area = " << bypass->area.get_area() *1e-6 << " mm^2" << endl;
623        if (is_tdp)
624        {
625                if (fu_type == FPU)
626                {
627                        cout << indent_str << "Floating Point Units (FPUs) (Count: "<< coredynp.num_fpus <<" ):" << endl;
628                        cout << indent_str_next << "Area = " << area.get_area()*1e-6  << " mm^2" << endl;
629                        cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate  << " W" << endl;
630//			cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage  << " W" << endl;
631                        cout << indent_str_next<< "Subthreshold Leakage = "
632                                                << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
633                        cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage  << " W" << endl;
634                        cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl;
635                        cout <<endl;
636                }
637                else if (fu_type == ALU)
638                {
639                        cout << indent_str << "Integer ALUs (Count: "<< coredynp.num_alus <<" ):" << endl;
640                        cout << indent_str_next << "Area = " << area.get_area()*1e-6  << " mm^2" << endl;
641                        cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate  << " W" << endl;
642//			cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage  << " W" << endl;
643                        cout << indent_str_next<< "Subthreshold Leakage = "
644                                                << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
645                        cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage  << " W" << endl;
646                        cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl;
647                        cout <<endl;
648                }
649                else if (fu_type == MUL)
650                {
651                        cout << indent_str << "Complex ALUs (Mul/Div) (Count: "<< coredynp.num_muls <<" ):" << endl;
652                        cout << indent_str_next << "Area = " << area.get_area()*1e-6  << " mm^2" << endl;
653                        cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate  << " W" << endl;
654//			cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage  << " W" << endl;
655                        cout << indent_str_next<< "Subthreshold Leakage = "
656                                                << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
657                        cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage  << " W" << endl;
658                        cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl;
659                        cout <<endl;
660
661                }
662
663        }
664        else
665        {
666        }
667
668}
669
670void FunctionalUnit::leakage_feedback(double temperature)
671{
672  // Update the temperature and initialize the global interfaces.
673  interface_ip.temp = (unsigned int)round(temperature/10.0)*10;
674
675  uca_org_t init_result = init_interface(&interface_ip); // init_result is dummy
676
677  // This is part of FunctionalUnit()
678  double area_t, leakage, gate_leakage;
679  double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
680
681  if (fu_type == FPU)
682  {
683        area_t = 4.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2 The base number
684        if (g_ip->F_sz_nm>90)
685                area_t = 4.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
686        leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
687        gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
688  }
689  else if (fu_type == ALU)
690  {
691    area_t = 280*260*2*num_fu*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
692    leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
693    gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
694  }
695  else if (fu_type == MUL)
696  {
697    area_t = 280*260*2*3*num_fu*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
698    leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
699    gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
700  }
701  else
702  {
703    cout<<"Unknown Functional Unit Type"<<endl;
704    exit(1);
705  }
706
707  power.readOp.leakage = leakage*num_fu;
708  power.readOp.gate_leakage = gate_leakage*num_fu;
709  power.readOp.longer_channel_leakage = longer_channel_device_reduction(Core_device, coredynp.core_ty);
710}
711
712UndiffCore::UndiffCore(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam & dyn_p_, bool exist_,  bool embedded_)
713:XML(XML_interface),
714 ithCore(ithCore_),
715 interface_ip(*interface_ip_),
716 coredynp(dyn_p_),
717 core_ty(coredynp.core_ty),
718 embedded(XML->sys.Embedded),
719 pipeline_stage(coredynp.pipeline_stages),
720 num_hthreads(coredynp.num_hthreads),
721 issue_width(coredynp.issueW),
722 exist(exist_)
723// is_default(_is_default)
724{
725        if (!exist) return;
726        double undifferentiated_core=0;
727        double core_tx_density=0;
728        double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
729        double undifferentiated_core_coe;
730        //XML_interface=_XML_interface;
731        uca_org_t result2;
732        result2 = init_interface(&interface_ip);
733
734        //Compute undifferentiated core area at 90nm.
735        if (embedded==false)
736        {
737                //Based on the results of polynomial/log curve fitting based on undifferentiated core of Niagara, Niagara2, Merom, Penyrn, Prescott, Opteron die measurements
738                if (core_ty==OOO)
739                {
740                        //undifferentiated_core = (0.0764*pipeline_stage*pipeline_stage -2.3685*pipeline_stage + 10.405);//OOO
741                        undifferentiated_core = (3.57*log(pipeline_stage)-1.2643)>0?(3.57*log(pipeline_stage)-1.2643):0;
742                }
743                else if (core_ty==Inorder)
744                {
745                        //undifferentiated_core = (0.1238*pipeline_stage + 7.2572)*0.9;//inorder
746                        undifferentiated_core = (-2.19*log(pipeline_stage)+6.55)>0?(-2.19*log(pipeline_stage)+6.55):0;
747                }
748                else
749                {
750                        cout<<"invalid core type"<<endl;
751                        exit(0);
752                }
753                undifferentiated_core *= (1+ logtwo(num_hthreads)* 0.0716);
754        }
755        else
756        {
757                //Based on the results in paper "parametrized processor models" Sandia Labs
758                if (XML->sys.opt_clockrate)
759                        undifferentiated_core_coe = 0.05;
760                else
761                        undifferentiated_core_coe = 0;
762                undifferentiated_core = (0.4109* pipeline_stage - 0.776)*undifferentiated_core_coe;
763                undifferentiated_core *= (1+ logtwo(num_hthreads)* 0.0426);
764        }
765
766        undifferentiated_core 		    *= g_tp.scaling_factor.logic_scaling_co_eff*1e6;//change from mm^2 to um^2
767        core_tx_density                 = g_tp.scaling_factor.core_tx_density;
768        //undifferentiated_core 		    = 3*1e6;
769        //undifferentiated_core			*= g_tp.scaling_factor.logic_scaling_co_eff;//(g_ip->F_sz_um*g_ip->F_sz_um/0.09/0.09)*;
770        power.readOp.leakage = undifferentiated_core*(core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
771        power.readOp.gate_leakage = undifferentiated_core*(core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;
772
773        double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty);
774        power.readOp.longer_channel_leakage	= power.readOp.leakage*long_channel_device_reduction;
775        area.set_area(undifferentiated_core);
776
777        scktRatio = g_tp.sckt_co_eff;
778        power.readOp.dynamic *= scktRatio;
779        power.writeOp.dynamic *= scktRatio;
780        power.searchOp.dynamic *= scktRatio;
781        macro_PR_overhead = g_tp.macro_layout_overhead;
782        area.set_area(area.get_area()*macro_PR_overhead);
783
784
785
786//		double vt=g_tp.peri_global.Vth;
787//		double velocity_index=1.1;
788//		double c_in=gate_C(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r , 0.0, false);
789//		double c_out= drain_C_(g_tp.min_w_nmos_, NCH, 2, 1, g_tp.cell_h_def, false) + drain_C_(g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, PCH, 1, 1, g_tp.cell_h_def, false) + c_in;
790//		double w_nmos=g_tp.min_w_nmos_;
791//		double w_pmos=g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
792//		double i_on_n=1.0;
793//		double i_on_p=1.0;
794//		double i_on_n_in=1.0;
795//		double i_on_p_in=1;
796//		double vdd=g_tp.peri_global.Vdd;
797
798//		power.readOp.sc=shortcircuit_simple(vt, velocity_index, c_in, c_out, w_nmos,w_pmos, i_on_n, i_on_p,i_on_n_in, i_on_p_in, vdd);
799//		power.readOp.dynamic=c_out*vdd*vdd/2;
800
801//		cout<<power.readOp.dynamic << "dynamic" <<endl;
802//		cout<<power.readOp.sc << "sc" << endl;
803
804//		power.readOp.sc=shortcircuit(vt, velocity_index, c_in, c_out, w_nmos,w_pmos, i_on_n, i_on_p,i_on_n_in, i_on_p_in, vdd);
805//		power.readOp.dynamic=c_out*vdd*vdd/2;
806//
807//		cout<<power.readOp.dynamic << "dynamic" <<endl;
808//		cout<<power.readOp.sc << "sc" << endl;
809
810
811
812}
813
814
815void UndiffCore::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
816{
817        string indent_str(indent, ' ');
818        string indent_str_next(indent+2, ' ');
819        bool long_channel = XML->sys.longer_channel_device;
820
821        if (is_tdp)
822        {
823                cout << indent_str << "UndiffCore:" << endl;
824                cout << indent_str_next << "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
825                cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate << " W" << endl;
826                //cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage <<" W" << endl;
827                cout << indent_str_next<< "Subthreshold Leakage = "
828                                        << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
829                cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
830                //cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl;
831                cout <<endl;
832        }
833        else
834        {
835                cout << indent_str << "UndiffCore:" << endl;
836                cout << indent_str_next << "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
837                cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate << " W" << endl;
838                cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage <<" W" << endl;
839                cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
840                //cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl;
841                cout <<endl;
842        }
843
844}
845
846inst_decoder::inst_decoder(
847                bool   _is_default,
848                const InputParameter *configure_interface,
849                int opcode_length_,
850                int num_decoders_,
851                bool x86_,
852            enum Device_ty device_ty_,
853            enum Core_type core_ty_)
854:is_default(_is_default),
855 opcode_length(opcode_length_),
856 num_decoders(num_decoders_),
857 x86(x86_),
858 device_ty(device_ty_),
859 core_ty(core_ty_)
860 {
861                        /*
862                         * Instruction decoder is different from n to 2^n decoders
863                         * that are commonly used in row decoders in memory arrays.
864                         * The RISC instruction decoder is typically a very simple device.
865                         * We can decode an instruction by simply
866                         * separating the machine word into small parts using wire slices
867                         * The RISC instruction decoder can be approximate by the n to 2^n decoders,
868                         * although this approximation usually underestimate power since each decoded
869                         * instruction normally has more than 1 active signal.
870                         *
871                         * However, decoding a CISC instruction word is much more difficult
872                         * than the RISC case. A CISC decoder is typically set up as a state machine.
873                         * The machine reads the opcode field to determine
874                         * what type of instruction it is,
875                         * and where the other data values are.
876                         * The instruction word is read in piece by piece,
877                         * and decisions are made at each stage as to
878                         * how the remainder of the instruction word will be read.
879                         * (sequencer and ROM are usually needed)
880                         * An x86 decoder can be even more complex since
881                         * it involve  both decoding instructions into u-ops and
882                         * merge u-ops when doing micro-ops fusion.
883                         */
884                        bool is_dram=false;
885                        double pmos_to_nmos_sizing_r;
886                        double load_nmos_width, load_pmos_width;
887                        double C_driver_load, R_wire_load;
888                        Area cell;
889
890                        l_ip=*configure_interface;
891                        local_result = init_interface(&l_ip);
892                        cell.h =g_tp.cell_h_def;
893                        cell.w =g_tp.cell_h_def;
894
895                        num_decoder_segments = (int)ceil(opcode_length/18.0);
896                        if (opcode_length > 18)	opcode_length = 18;
897                        num_decoded_signals= (int)pow(2.0,opcode_length);
898                        pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
899                        load_nmos_width=g_tp.max_w_nmos_ /2;
900                        load_pmos_width= g_tp.max_w_nmos_ * pmos_to_nmos_sizing_r;
901                        C_driver_load = 1024*gate_C(load_nmos_width + load_pmos_width, 0, is_dram); //TODO: this number 1024 needs to be revisited
902                        R_wire_load   = 3000*l_ip.F_sz_um * g_tp.wire_outside_mat.R_per_um;
903
904                        final_dec = new Decoder(
905                                        num_decoded_signals,
906                                        false,
907                                        C_driver_load,
908                                        R_wire_load,
909                                        false/*is_fa*/,
910                                        false/*is_dram*/,
911                                        false/*wl_tr*/, //to use peri device
912                                        cell);
913
914                        PredecBlk * predec_blk1 = new PredecBlk(
915                                        num_decoded_signals,
916                                        final_dec,
917                                        0,//Assuming predec and dec are back to back
918                                        0,
919                                        1,//Each Predec only drives one final dec
920                                        false/*is_dram*/,
921                                        true);
922                        PredecBlk * predec_blk2 = new PredecBlk(
923                                        num_decoded_signals,
924                                        final_dec,
925                                        0,//Assuming predec and dec are back to back
926                                        0,
927                                        1,//Each Predec only drives one final dec
928                                        false/*is_dram*/,
929                                        false);
930
931                        PredecBlkDrv * predec_blk_drv1 = new PredecBlkDrv(0, predec_blk1, false);
932                        PredecBlkDrv * predec_blk_drv2 = new PredecBlkDrv(0, predec_blk2, false);
933
934                        pre_dec            = new Predec(predec_blk_drv1, predec_blk_drv2);
935
936                        double area_decoder = final_dec->area.get_area() * num_decoded_signals * num_decoder_segments*num_decoders;
937                        //double w_decoder    = area_decoder / area.get_h();
938                        double area_pre_dec = (predec_blk_drv1->area.get_area() +
939                                        predec_blk_drv2->area.get_area() +
940                                        predec_blk1->area.get_area() +
941                                        predec_blk2->area.get_area())*
942                                        num_decoder_segments*num_decoders;
943                        area.set_area(area.get_area()+ area_decoder + area_pre_dec);
944                        double macro_layout_overhead   = g_tp.macro_layout_overhead;
945                        double chip_PR_overhead        = g_tp.chip_layout_overhead;
946                        area.set_area(area.get_area()*macro_layout_overhead*chip_PR_overhead);
947
948                        inst_decoder_delay_power();
949
950                        double sckRation = g_tp.sckt_co_eff;
951                        power.readOp.dynamic *= sckRation;
952                        power.writeOp.dynamic *= sckRation;
953                        power.searchOp.dynamic *= sckRation;
954
955                        double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
956                        power.readOp.longer_channel_leakage	= power.readOp.leakage*long_channel_device_reduction;
957
958}
959
960void inst_decoder::inst_decoder_delay_power()
961{
962
963        double dec_outrisetime;
964        double inrisetime=0, outrisetime;
965        double pppm_t[4]    = {1,1,1,1};
966        double squencer_passes = x86?2:1;
967
968        outrisetime = pre_dec->compute_delays(inrisetime);
969        dec_outrisetime = final_dec->compute_delays(outrisetime);
970        set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments, squencer_passes*num_decoder_segments, num_decoder_segments);
971    power = power + pre_dec->power*pppm_t;
972    set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments*num_decoded_signals,
973                num_decoder_segments*num_decoded_signals, squencer_passes*num_decoder_segments);
974    power = power + final_dec->power*pppm_t;
975}
976void inst_decoder::leakage_feedback(double temperature)
977{
978  l_ip.temp = (unsigned int)round(temperature/10.0)*10;
979  uca_org_t init_result = init_interface(&l_ip); // init_result is dummy
980
981  final_dec->leakage_feedback(temperature);
982  pre_dec->leakage_feedback(temperature);
983
984  double pppm_t[4]    = {1,1,1,1};
985  double squencer_passes = x86?2:1;
986
987  set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments, squencer_passes*num_decoder_segments, num_decoder_segments);
988  power = pre_dec->power*pppm_t;
989
990  set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments*num_decoded_signals,num_decoder_segments*num_decoded_signals, squencer_passes*num_decoder_segments);
991  power = power + final_dec->power*pppm_t;
992
993  double sckRation = g_tp.sckt_co_eff;
994
995  power.readOp.dynamic *= sckRation;
996  power.writeOp.dynamic *= sckRation;
997  power.searchOp.dynamic *= sckRation;
998
999  double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
1000  power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction;
1001}
1002
1003inst_decoder::~inst_decoder()
1004{
1005          local_result.cleanup();
1006
1007          delete final_dec;
1008
1009          delete pre_dec->blk1;
1010          delete pre_dec->blk2;
1011          delete pre_dec->drv1;
1012          delete pre_dec->drv2;
1013          delete pre_dec;
1014}
1015