4a5
> * Copyright (c) 2010-2013 Advanced Micro Devices, Inc.
28c29
< * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
---
> * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37d37
< #include "XML_Parse.h"
39c39
< #include "basic_components.h"
---
> #include "common.h"
44d43
< #include "parameter.h"
72,76c71,75
< NIUController::NIUController(ParseXML *XML_interface,InputParameter* interface_ip_)
< :XML(XML_interface),
< interface_ip(*interface_ip_)
< {
< local_result = init_interface(&interface_ip);
---
> NIUController::NIUController(XMLNode* _xml_data,InputParameter* interface_ip_)
> : McPATComponent(_xml_data, interface_ip_) {
> name = "NIU";
> set_niu_param();
> }
78,82c77,80
< double frontend_area, phy_area, mac_area, SerDer_area;
< double frontend_dyn, mac_dyn, SerDer_dyn;
< double frontend_gates, mac_gates, SerDer_gates;
< double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
< double NMOS_sizing, PMOS_sizing;
---
> void NIUController::computeArea() {
> double mac_area;
> double frontend_area;
> double SerDer_area;
84c82,114
< set_niu_param();
---
> if (niup.type == 0) { //high performance NIU
> //Area estimation based on average of die photo from Niagara 2 and
> //Cadence ChipEstimate using 65nm.
> mac_area = (1.53 + 0.3) / 2 * (interface_ip.F_sz_um / 0.065) *
> (interface_ip.F_sz_um / 0.065);
> //Area estimation based on average of die photo from Niagara 2, ISSCC
> //"An 800mW 10Gb Ethernet Transceiver in 0.13μm CMOS"
> //and "A 1.2-V-Only 900-mW 10 Gb Ethernet Transceiver and XAUI Interface
> //With Robust VCO Tuning Technique" Frontend is PCS
> frontend_area = (9.8 + (6 + 18) * 65 / 130 * 65 / 130) / 3 *
> (interface_ip.F_sz_um / 0.065) * (interface_ip.F_sz_um / 0.065);
> //Area estimation based on average of die photo from Niagara 2 and
> //Cadence ChipEstimate hard IP @65nm.
> //SerDer is very hard to scale
> SerDer_area = (1.39 + 0.36) * (interface_ip.F_sz_um /
> 0.065);//* (interface_ip.F_sz_um/0.065);
> } else {
> //Low power implementations are mostly from Cadence ChipEstimator;
> //Ignore the multiple IP effect
> // ---When there are multiple IP (same kind or not) selected, Cadence
> //ChipEstimator results are not a simple summation of all IPs.
> //Ignore this effect
> mac_area = 0.24 * (interface_ip.F_sz_um / 0.065) *
> (interface_ip.F_sz_um / 0.065);
> frontend_area = 0.1 * (interface_ip.F_sz_um / 0.065) *
> (interface_ip.F_sz_um / 0.065);//Frontend is the PCS layer
> SerDer_area = 0.35 * (interface_ip.F_sz_um / 0.065) *
> (interface_ip.F_sz_um/0.065);
> //Comparing the 130nm implementation in "A 1.2-V-Only 900-mW 10 Gb
> //Ethernet Transceiver and XAUI Interface With Robust VCO Tuning
> //Technique" with the ChipEstimator XAUI PHY hard IP confirms that even
> //the PHY can scale perfectly with the technology
> }
86,107c116,118
< if (niup.type == 0) //high performance NIU
< {
< //Area estimation based on average of die photo from Niagara 2 and Cadence ChipEstimate using 65nm.
< mac_area = (1.53 + 0.3)/2 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);
< //Area estimation based on average of die photo from Niagara 2, ISSCC "An 800mW 10Gb Ethernet Transceiver in 0.13μm CMOS"
< //and"A 1.2-V-Only 900-mW 10 Gb Ethernet Transceiver and XAUI Interface With Robust VCO Tuning Technique" Frontend is PCS
< frontend_area = (9.8 + (6 + 18)*65/130*65/130)/3 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);
< //Area estimation based on average of die photo from Niagara 2 and Cadence ChipEstimate hard IP @65nm.
< //SerDer is very hard to scale
< SerDer_area = (1.39 + 0.36) * (interface_ip.F_sz_um/0.065);//* (interface_ip.F_sz_um/0.065);
< phy_area = frontend_area + SerDer_area;
< //total area
< area.set_area((mac_area + frontend_area + SerDer_area)*1e6);
< //Power
< //Cadence ChipEstimate using 65nm (mac, front_end are all energy. E=P*T = P/F = 1.37/1Ghz = 1.37e-9);
< mac_dyn = 2.19e-9*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);//niup.clockRate; //2.19W@1GHz fully active according to Cadence ChipEstimate @65nm
< //Cadence ChipEstimate using 65nm soft IP;
< frontend_dyn = 0.27e-9*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);//niup.clockRate;
< //according to "A 100mW 9.6Gb/s Transceiver in 90nm CMOS..." ISSCC 2006
< //SerDer_dyn is power not energy, scaling from 10mw/Gb/s @90nm
< SerDer_dyn = 0.01*10*sqrt(interface_ip.F_sz_um/0.09)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2;
< SerDer_dyn /= niup.clockRate;//covert to energy per clock cycle of whole NIU
---
> //total area
> output_data.area = (mac_area + frontend_area + SerDer_area) * 1e6;
> }
109,114c120,129
< //Cadence ChipEstimate using 65nm
< mac_gates = 111700;
< frontend_gates = 320000;
< SerDer_gates = 200000;
< NMOS_sizing = 5*g_tp.min_w_nmos_;
< PMOS_sizing = 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
---
> void NIUController::computeEnergy() {
> double mac_dyn;
> double frontend_dyn;
> double SerDer_dyn;
> double frontend_gates;
> double mac_gates;
> double SerDer_gates;
> double NMOS_sizing;
> double PMOS_sizing;
> double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
115a131,144
> if (niup.type == 0) { //high performance NIU
> //Power
> //Cadence ChipEstimate using 65nm (mac, front_end are all energy.
> //E=P*T = P/F = 1.37/1Ghz = 1.37e-9);
> //2.19W@1GHz fully active according to Cadence ChipEstimate @65nm
> mac_dyn = 2.19e-9 * g_tp.peri_global.Vdd / 1.1 * g_tp.peri_global.Vdd /
> 1.1 * (interface_ip.F_sz_nm / 65.0);//niup.clockRate;
> //Cadence ChipEstimate using 65nm soft IP;
> frontend_dyn = 0.27e-9 * g_tp.peri_global.Vdd / 1.1 *
> g_tp.peri_global.Vdd / 1.1 * (interface_ip.F_sz_nm / 65.0);
> //according to "A 100mW 9.6Gb/s Transceiver in 90nm CMOS..." ISSCC 2006
> //SerDer_dyn is power not energy, scaling from 10mw/Gb/s @90nm
> SerDer_dyn = 0.01 * 10 * sqrt(interface_ip.F_sz_um / 0.09) *
> g_tp.peri_global.Vdd / 1.2 * g_tp.peri_global.Vdd / 1.2;
117,136c146,164
< }
< else
< {//Low power implementations are mostly from Cadence ChipEstimator; Ignore the multiple IP effect
< // ---When there are multiple IP (same kind or not) selected, Cadence ChipEstimator results are not
< // a simple summation of all IPs. Ignore this effect
< mac_area = 0.24 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);
< frontend_area = 0.1 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);//Frontend is the PCS layer
< SerDer_area = 0.35 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);
< //Compare 130um implementation in "A 1.2-V-Only 900-mW 10 Gb Ethernet Transceiver and XAUI Interface With Robust VCO Tuning Technique"
< //and the ChipEstimator XAUI PHY hard IP, confirm that even PHY can scale perfectly with the technology
< //total area
< area.set_area((mac_area + frontend_area + SerDer_area)*1e6);
< //Power
< //Cadence ChipEstimate using 65nm (mac, front_end are all energy. E=P*T = P/F = 1.37/1Ghz = 1.37e-9);
< mac_dyn = 1.257e-9*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);//niup.clockRate; //2.19W@1GHz fully active according to Cadence ChipEstimate @65nm
< //Cadence ChipEstimate using 65nm soft IP;
< frontend_dyn = 0.6e-9*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);//niup.clockRate;
< //SerDer_dyn is power not energy, scaling from 216mw/10Gb/s @130nm
< SerDer_dyn = 0.0216*10*(interface_ip.F_sz_um/0.13)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2;
< SerDer_dyn /= niup.clockRate;//covert to energy per clock cycle of whole NIU
---
> //Cadence ChipEstimate using 65nm
> mac_gates = 111700;
> frontend_gates = 320000;
> SerDer_gates = 200000;
> NMOS_sizing = 5 * g_tp.min_w_nmos_;
> PMOS_sizing = 5 * g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r;
> } else {
> //Power
> //Cadence ChipEstimate using 65nm (mac, front_end are all energy.
> //E=P*T = P/F = 1.37/1Ghz = 1.37e-9);
> //2.19W@1GHz fully active according to Cadence ChipEstimate @65nm
> mac_dyn = 1.257e-9 * g_tp.peri_global.Vdd / 1.1 * g_tp.peri_global.Vdd
> / 1.1 * (interface_ip.F_sz_nm / 65.0);//niup.clockRate;
> //Cadence ChipEstimate using 65nm soft IP;
> frontend_dyn = 0.6e-9 * g_tp.peri_global.Vdd / 1.1 *
> g_tp.peri_global.Vdd / 1.1 * (interface_ip.F_sz_nm / 65.0);
> //SerDer_dyn is power not energy, scaling from 216mw/10Gb/s @130nm
> SerDer_dyn = 0.0216 * 10 * (interface_ip.F_sz_um / 0.13) *
> g_tp.peri_global.Vdd / 1.2 * g_tp.peri_global.Vdd / 1.2;
138,140c166,171
< mac_gates = 111700;
< frontend_gates = 52000;
< SerDer_gates = 199260;
---
> mac_gates = 111700;
> frontend_gates = 52000;
> SerDer_gates = 199260;
> NMOS_sizing = g_tp.min_w_nmos_;
> PMOS_sizing = g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r;
> }
142,143c173,174
< NMOS_sizing = g_tp.min_w_nmos_;
< PMOS_sizing = g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
---
> //convert to energy per clock cycle of whole NIU
> SerDer_dyn /= niup.clockRate;
145c176,186
< }
---
> power.readOp.dynamic = mac_dyn + frontend_dyn + SerDer_dyn;
> power.readOp.leakage = (mac_gates + frontend_gates + frontend_gates) *
> cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand) *
> g_tp.peri_global.Vdd;//unit W
> double long_channel_device_reduction =
> longer_channel_device_reduction(Uncore_device);
> power.readOp.longer_channel_leakage =
> power.readOp.leakage * long_channel_device_reduction;
> power.readOp.gate_leakage = (mac_gates + frontend_gates + frontend_gates) *
> cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand) *
> g_tp.peri_global.Vdd;//unit W
147,152c188,195
< power_t.readOp.dynamic = mac_dyn + frontend_dyn + SerDer_dyn;
< power_t.readOp.leakage = (mac_gates + frontend_gates + frontend_gates)*cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
< double long_channel_device_reduction = longer_channel_device_reduction(Uncore_device);
< power_t.readOp.longer_channel_leakage = power_t.readOp.leakage * long_channel_device_reduction;
< power_t.readOp.gate_leakage = (mac_gates + frontend_gates + frontend_gates)*cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
< }
---
> // Output power
> output_data.subthreshold_leakage_power =
> longer_channel_device ? power.readOp.longer_channel_leakage :
> power.readOp.leakage;
> output_data.gate_leakage_power = power.readOp.gate_leakage;
> output_data.peak_dynamic_power = power.readOp.dynamic * nius.duty_cycle;
> output_data.runtime_dynamic_energy = power.readOp.dynamic * nius.perc_load;
> }
154,157c197,203
< void NIUController::computeEnergy(bool is_tdp)
< {
< if (is_tdp)
< {
---
> void NIUController::set_niu_param() {
> int num_children = xml_data->nChildNode("param");
> int i;
> for (i = 0; i < num_children; i++) {
> XMLNode* paramNode = xml_data->getChildNodePtr("param", &i);
> XMLCSTR node_name = paramNode->getAttribute("name");
> XMLCSTR value = paramNode->getAttribute("value");
158a205,206
> if (!node_name)
> warnMissingParamName(paramNode->getAttribute("id"));
160,161c208,210
< power = power_t;
< power.readOp.dynamic *= niup.duty_cycle;
---
> ASSIGN_FP_IF("niu_clockRate", niup.clockRate);
> ASSIGN_INT_IF("num_units", niup.num_units);
> ASSIGN_INT_IF("type", niup.type);
162a212,214
> else {
> warnUnrecognizedParam(node_name);
> }
164,169d215
< else
< {
< rt_power = power_t;
< rt_power.readOp.dynamic *= niup.perc_load;
< }
< }
171,175c217,218
< void NIUController::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
< {
< string indent_str(indent, ' ');
< string indent_str_next(indent+2, ' ');
< bool long_channel = XML->sys.longer_channel_device;
---
> // Change from MHz to Hz
> niup.clockRate *= 1e6;
177,190c220,224
< if (is_tdp)
< {
< cout << "NIU:" << endl;
< cout << indent_str<< "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
< cout << indent_str << "Peak Dynamic = " << power.readOp.dynamic*niup.clockRate << " W" << endl;
< cout << indent_str<< "Subthreshold Leakage = "
< << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
< //cout << indent_str<< "Subthreshold Leakage = " << power.readOp.longer_channel_leakage <<" W" << endl;
< cout << indent_str<< "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
< cout << indent_str << "Runtime Dynamic = " << rt_power.readOp.dynamic*niup.clockRate << " W" << endl;
< cout<<endl;
< }
< else
< {
---
> num_children = xml_data->nChildNode("stat");
> for (i = 0; i < num_children; i++) {
> XMLNode* statNode = xml_data->getChildNodePtr("stat", &i);
> XMLCSTR node_name = statNode->getAttribute("name");
> XMLCSTR value = statNode->getAttribute("value");
192c226,227
< }
---
> if (!node_name)
> warnMissingStatName(statNode->getAttribute("id"));
193a229,235
> ASSIGN_FP_IF("duty_cycle", nius.duty_cycle);
> ASSIGN_FP_IF("perc_load", nius.perc_load);
>
> else {
> warnUnrecognizedStat(node_name);
> }
> }
196,204c238,242
< void NIUController::set_niu_param()
< {
< niup.clockRate = XML->sys.niu.clockrate;
< niup.clockRate *= 1e6;
< niup.num_units = XML->sys.niu.number_units;
< niup.duty_cycle = XML->sys.niu.duty_cycle;
< niup.perc_load = XML->sys.niu.total_load_perc;
< niup.type = XML->sys.niu.type;
< // niup.executionTime = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6);
---
> PCIeController::PCIeController(XMLNode* _xml_data,
> InputParameter* interface_ip_)
> : McPATComponent(_xml_data, interface_ip_) {
> name = "PCIe";
> set_pcie_param();
207,216c245,247
< PCIeController::PCIeController(ParseXML *XML_interface,InputParameter* interface_ip_)
< :XML(XML_interface),
< interface_ip(*interface_ip_)
< {
< local_result = init_interface(&interface_ip);
< double frontend_area, phy_area, ctrl_area, SerDer_area;
< double ctrl_dyn, frontend_dyn, SerDer_dyn;
< double ctrl_gates,frontend_gates, SerDer_gates;
< double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
< double NMOS_sizing, PMOS_sizing;
---
> void PCIeController::computeArea() {
> double ctrl_area;
> double SerDer_area;
218,221c249,252
< /* Assuming PCIe is bit-slice based architecture
< * This is the reason for /8 in both area and power calculation
< * to get per lane numbers
< */
---
> /* Assuming PCIe is bit-slice based architecture
> * This is the reason for /8 in both area and power calculation
> * to get per lane numbers
> */
223,242c254,271
< set_pcie_param();
< if (pciep.type == 0) //high performance NIU
< {
< //Area estimation based on average of die photo from Niagara 2 and Cadence ChipEstimate @ 65nm.
< ctrl_area = (5.2 + 0.5)/2 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);
< //Area estimation based on average of die photo from Niagara 2, and Cadence ChipEstimate @ 65nm.
< frontend_area = (5.2 + 0.1)/2 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);
< //Area estimation based on average of die photo from Niagara 2 and Cadence ChipEstimate hard IP @65nm.
< //SerDer is very hard to scale
< SerDer_area = (3.03 + 0.36) * (interface_ip.F_sz_um/0.065);//* (interface_ip.F_sz_um/0.065);
< phy_area = frontend_area + SerDer_area;
< //total area
< //Power
< //Cadence ChipEstimate using 65nm the controller includes everything: the PHY, the data link and transaction layer
< ctrl_dyn = 3.75e-9/8*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);
< // //Cadence ChipEstimate using 65nm soft IP;
< // frontend_dyn = 0.27e-9/8*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);
< //SerDer_dyn is power not energy, scaling from 10mw/Gb/s @90nm
< SerDer_dyn = 0.01*4*(interface_ip.F_sz_um/0.09)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2;//PCIe 2.0 max per lane speed is 4Gb/s
< SerDer_dyn /= pciep.clockRate;//covert to energy per clock cycle
---
> if (pciep.type == 0) { //high performance PCIe
> //Area estimation based on average of die photo from Niagara 2 and
> //Cadence ChipEstimate @ 65nm.
> ctrl_area = (5.2 + 0.5) / 2 * (interface_ip.F_sz_um / 0.065) *
> (interface_ip.F_sz_um / 0.065);
> //Area estimation based on average of die photo from Niagara 2 and
> //Cadence ChipEstimate hard IP @65nm.
> //SerDer is very hard to scale
> SerDer_area = (3.03 + 0.36) * (interface_ip.F_sz_um /
> 0.065);//* (interface_ip.F_sz_um/0.065);
> } else {
> ctrl_area = 0.412 * (interface_ip.F_sz_um / 0.065) *
> (interface_ip.F_sz_um / 0.065);
> //Area estimation based on average of die photo from Niagara 2, and
> //Cadence ChipEstimate @ 65nm.
> SerDer_area = 0.36 * (interface_ip.F_sz_um / 0.065) *
> (interface_ip.F_sz_um / 0.065);
> }
244,265c273,276
< //power_t.readOp.dynamic = (ctrl_dyn)*pciep.num_channels;
< //Cadence ChipEstimate using 65nm
< ctrl_gates = 900000/8*pciep.num_channels;
< // frontend_gates = 120000/8;
< // SerDer_gates = 200000/8;
< NMOS_sizing = 5*g_tp.min_w_nmos_;
< PMOS_sizing = 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
< }
< else
< {
< ctrl_area = 0.412 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);
< //Area estimation based on average of die photo from Niagara 2, and Cadence ChipEstimate @ 65nm.
< SerDer_area = 0.36 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);
< //total area
< //Power
< //Cadence ChipEstimate using 65nm the controller includes everything: the PHY, the data link and transaction layer
< ctrl_dyn = 2.21e-9/8*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);
< // //Cadence ChipEstimate using 65nm soft IP;
< // frontend_dyn = 0.27e-9/8*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);
< //SerDer_dyn is power not energy, scaling from 10mw/Gb/s @90nm
< SerDer_dyn = 0.01*4*(interface_ip.F_sz_um/0.09)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2;//PCIe 2.0 max per lane speed is 4Gb/s
< SerDer_dyn /= pciep.clockRate;//covert to energy per clock cycle
---
> // Total area
> output_data.area = ((ctrl_area + (pciep.withPHY ? SerDer_area : 0)) / 8 *
> pciep.num_channels) * 1e6;
> }
267,272c278,285
< //Cadence ChipEstimate using 65nm
< ctrl_gates = 200000/8*pciep.num_channels;
< // frontend_gates = 120000/8;
< SerDer_gates = 200000/8*pciep.num_channels;
< NMOS_sizing = g_tp.min_w_nmos_;
< PMOS_sizing = g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
---
> void PCIeController::computeEnergy() {
> double ctrl_dyn;
> double SerDer_dyn;
> double ctrl_gates;
> double SerDer_gates = 0;
> double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
> double NMOS_sizing;
> double PMOS_sizing;
274,281c287,290
< }
< area.set_area(((ctrl_area + (pciep.withPHY? SerDer_area:0))/8*pciep.num_channels)*1e6);
< power_t.readOp.dynamic = (ctrl_dyn + (pciep.withPHY? SerDer_dyn:0))*pciep.num_channels;
< power_t.readOp.leakage = (ctrl_gates + (pciep.withPHY? SerDer_gates:0))*cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
< double long_channel_device_reduction = longer_channel_device_reduction(Uncore_device);
< power_t.readOp.longer_channel_leakage = power_t.readOp.leakage * long_channel_device_reduction;
< power_t.readOp.gate_leakage = (ctrl_gates + (pciep.withPHY? SerDer_gates:0))*cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
< }
---
> /* Assuming PCIe is bit-slice based architecture
> * This is the reason for /8 in both area and power calculation
> * to get per lane numbers
> */
283,286c292,302
< void PCIeController::computeEnergy(bool is_tdp)
< {
< if (is_tdp)
< {
---
> if (pciep.type == 0) { //high performance PCIe
> //Power
> //Cadence ChipEstimate using 65nm the controller includes everything: the PHY, the data link and transaction layer
> ctrl_dyn = 3.75e-9 / 8 * g_tp.peri_global.Vdd / 1.1 *
> g_tp.peri_global.Vdd / 1.1 * (interface_ip.F_sz_nm / 65.0);
> // //Cadence ChipEstimate using 65nm soft IP;
> // frontend_dyn = 0.27e-9/8*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);
> //SerDer_dyn is power not energy, scaling from 10mw/Gb/s @90nm
> //PCIe 2.0 max per lane speed is 4Gb/s
> SerDer_dyn = 0.01 * 4 * (interface_ip.F_sz_um /0.09) *
> g_tp.peri_global.Vdd / 1.2 * g_tp.peri_global.Vdd /1.2;
287a304,320
> //Cadence ChipEstimate using 65nm
> ctrl_gates = 900000 / 8 * pciep.num_channels;
> // frontend_gates = 120000/8;
> // SerDer_gates = 200000/8;
> NMOS_sizing = 5 * g_tp.min_w_nmos_;
> PMOS_sizing = 5 * g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r;
> } else {
> //Power
> //Cadence ChipEstimate using 65nm the controller includes everything: the PHY, the data link and transaction layer
> ctrl_dyn = 2.21e-9 / 8 * g_tp.peri_global.Vdd / 1.1 *
> g_tp.peri_global.Vdd / 1.1 * (interface_ip.F_sz_nm / 65.0);
> // //Cadence ChipEstimate using 65nm soft IP;
> // frontend_dyn = 0.27e-9/8*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);
> //SerDer_dyn is power not energy, scaling from 10mw/Gb/s @90nm
> //PCIe 2.0 max per lane speed is 4Gb/s
> SerDer_dyn = 0.01 * 4 * (interface_ip.F_sz_um / 0.09) *
> g_tp.peri_global.Vdd / 1.2 * g_tp.peri_global.Vdd /1.2;
289,290c322,327
< power = power_t;
< power.readOp.dynamic *= pciep.duty_cycle;
---
> //Cadence ChipEstimate using 65nm
> ctrl_gates = 200000 / 8 * pciep.num_channels;
> // frontend_gates = 120000/8;
> SerDer_gates = 200000 / 8 * pciep.num_channels;
> NMOS_sizing = g_tp.min_w_nmos_;
> PMOS_sizing = g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r;
293,297c330,355
< else
< {
< rt_power = power_t;
< rt_power.readOp.dynamic *= pciep.perc_load;
< }
---
>
> //convert to energy per clock cycle
> SerDer_dyn /= pciep.clockRate;
>
> power.readOp.dynamic = (ctrl_dyn + (pciep.withPHY ? SerDer_dyn : 0)) *
> pciep.num_channels;
> power.readOp.leakage = (ctrl_gates + (pciep.withPHY ? SerDer_gates : 0)) *
> cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand) *
> g_tp.peri_global.Vdd;//unit W
> double long_channel_device_reduction =
> longer_channel_device_reduction(Uncore_device);
> power.readOp.longer_channel_leakage =
> power.readOp.leakage * long_channel_device_reduction;
> power.readOp.gate_leakage = (ctrl_gates +
> (pciep.withPHY ? SerDer_gates : 0)) *
> cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand) *
> g_tp.peri_global.Vdd;//unit W
>
> // Output power
> output_data.subthreshold_leakage_power =
> longer_channel_device ? power.readOp.longer_channel_leakage :
> power.readOp.leakage;
> output_data.gate_leakage_power = power.readOp.gate_leakage;
> output_data.peak_dynamic_power = power.readOp.dynamic * pcies.duty_cycle;
> output_data.runtime_dynamic_energy =
> power.readOp.dynamic * pcies.perc_load;
300,304c358,364
< void PCIeController::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
< {
< string indent_str(indent, ' ');
< string indent_str_next(indent+2, ' ');
< bool long_channel = XML->sys.longer_channel_device;
---
> void PCIeController::set_pcie_param() {
> int num_children = xml_data->nChildNode("param");
> int i;
> for (i = 0; i < num_children; i++) {
> XMLNode* paramNode = xml_data->getChildNodePtr("param", &i);
> XMLCSTR node_name = paramNode->getAttribute("name");
> XMLCSTR value = paramNode->getAttribute("value");
306,319c366,367
< if (is_tdp)
< {
< cout << "PCIe:" << endl;
< cout << indent_str<< "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
< cout << indent_str << "Peak Dynamic = " << power.readOp.dynamic*pciep.clockRate << " W" << endl;
< cout << indent_str<< "Subthreshold Leakage = "
< << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
< //cout << indent_str<< "Subthreshold Leakage = " << power.readOp.longer_channel_leakage <<" W" << endl;
< cout << indent_str<< "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
< cout << indent_str << "Runtime Dynamic = " << rt_power.readOp.dynamic*pciep.clockRate << " W" << endl;
< cout<<endl;
< }
< else
< {
---
> if (!node_name)
> warnMissingParamName(paramNode->getAttribute("id"));
320a369,376
> ASSIGN_FP_IF("pcie_clockRate", pciep.clockRate);
> ASSIGN_INT_IF("num_units", pciep.num_units);
> ASSIGN_INT_IF("num_channels", pciep.num_channels);
> ASSIGN_INT_IF("type", pciep.type);
> ASSIGN_ENUM_IF("withPHY", pciep.withPHY, bool);
>
> else {
> warnUnrecognizedParam(node_name);
321a378
> }
323c380,381
< }
---
> // Change from MHz to Hz
> pciep.clockRate *= 1e6;
325,335c383,387
< void PCIeController::set_pcie_param()
< {
< pciep.clockRate = XML->sys.pcie.clockrate;
< pciep.clockRate *= 1e6;
< pciep.num_units = XML->sys.pcie.number_units;
< pciep.num_channels = XML->sys.pcie.num_channels;
< pciep.duty_cycle = XML->sys.pcie.duty_cycle;
< pciep.perc_load = XML->sys.pcie.total_load_perc;
< pciep.type = XML->sys.pcie.type;
< pciep.withPHY = XML->sys.pcie.withPHY;
< // pciep.executionTime = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6);
---
> num_children = xml_data->nChildNode("stat");
> for (i = 0; i < num_children; i++) {
> XMLNode* statNode = xml_data->getChildNodePtr("stat", &i);
> XMLCSTR node_name = statNode->getAttribute("name");
> XMLCSTR value = statNode->getAttribute("value");
336a389,398
> if (!node_name)
> warnMissingStatName(statNode->getAttribute("id"));
>
> ASSIGN_FP_IF("duty_cycle", pcies.duty_cycle);
> ASSIGN_FP_IF("perc_load", pcies.perc_load);
>
> else {
> warnUnrecognizedStat(node_name);
> }
> }
339,348c401,406
< FlashController::FlashController(ParseXML *XML_interface,InputParameter* interface_ip_)
< :XML(XML_interface),
< interface_ip(*interface_ip_)
< {
< local_result = init_interface(&interface_ip);
< double frontend_area, phy_area, ctrl_area, SerDer_area;
< double ctrl_dyn, frontend_dyn, SerDer_dyn;
< double ctrl_gates,frontend_gates, SerDer_gates;
< double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
< double NMOS_sizing, PMOS_sizing;
---
> FlashController::FlashController(XMLNode* _xml_data,
> InputParameter* interface_ip_)
> : McPATComponent(_xml_data, interface_ip_) {
> name = "Flash Controller";
> set_fc_param();
> }
350,353c408,410
< /* Assuming PCIe is bit-slice based architecture
< * This is the reason for /8 in both area and power calculation
< * to get per lane numbers
< */
---
> void FlashController::computeArea() {
> double ctrl_area;
> double SerDer_area;
355,373c412,415
< set_fc_param();
< if (fcp.type == 0) //high performance NIU
< {
< cout<<"Current McPAT does not support high performance flash contorller since even low power designs are enough for maintain throughput"<<endl;
< exit(0);
< NMOS_sizing = 5*g_tp.min_w_nmos_;
< PMOS_sizing = 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
< }
< else
< {
< ctrl_area = 0.243 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);
< //Area estimation based on Cadence ChipEstimate @ 65nm: NANDFLASH-CTRL from CAST
< SerDer_area = 0.36/8 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);
< //based On PCIe PHY TSMC65GP from Cadence ChipEstimate @ 65nm, it support 8x lanes with each lane
< //speed up to 250MB/s (PCIe1.1x) This is already saturate the 200MB/s of the flash controller core above.
< ctrl_gates = 129267;
< SerDer_gates = 200000/8;
< NMOS_sizing = g_tp.min_w_nmos_;
< PMOS_sizing = g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
---
> /* Assuming Flash is bit-slice based architecture
> * This is the reason for /8 in both area and power calculation
> * to get per lane numbers
> */
375,389c417,429
< //Power
< //Cadence ChipEstimate using 65nm the controller 125mW for every 200MB/s This is power not energy!
< ctrl_dyn = 0.125*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);
< //SerDer_dyn is power not energy, scaling from 10mw/Gb/s @90nm
< SerDer_dyn = 0.01*1.6*(interface_ip.F_sz_um/0.09)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2;
< //max Per controller speed is 1.6Gb/s (200MB/s)
< }
< double number_channel = 1+(fcp.num_channels-1)*0.2;
< area.set_area((ctrl_area + (fcp.withPHY? SerDer_area:0))*1e6*number_channel);
< power_t.readOp.dynamic = (ctrl_dyn + (fcp.withPHY? SerDer_dyn:0))*number_channel;
< power_t.readOp.leakage = ((ctrl_gates + (fcp.withPHY? SerDer_gates:0))*number_channel)*cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
< double long_channel_device_reduction = longer_channel_device_reduction(Uncore_device);
< power_t.readOp.longer_channel_leakage = power_t.readOp.leakage * long_channel_device_reduction;
< power_t.readOp.gate_leakage = ((ctrl_gates + (fcp.withPHY? SerDer_gates:0))*number_channel)*cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
< }
---
> if (fcp.type == 0) { //high performance flash controller
> cout << "Current McPAT does not support high performance flash "
> << "controller since even low power designs are enough for "
> << "maintain throughput" <<endl;
> exit(0);
> } else {
> ctrl_area = 0.243 * (interface_ip.F_sz_um / 0.065) *
> (interface_ip.F_sz_um / 0.065);
> //Area estimation based on Cadence ChipEstimate @ 65nm: NANDFLASH-CTRL
> //from CAST
> SerDer_area = 0.36 / 8 * (interface_ip.F_sz_um / 0.065) *
> (interface_ip.F_sz_um / 0.065);
> }
391,394c431,434
< void FlashController::computeEnergy(bool is_tdp)
< {
< if (is_tdp)
< {
---
> double number_channel = 1 + (fcp.num_channels - 1) * 0.2;
> output_data.area = (ctrl_area + (fcp.withPHY ? SerDer_area : 0)) *
> 1e6 * number_channel;
> }
395a436,443
> void FlashController::computeEnergy() {
> double ctrl_dyn;
> double SerDer_dyn;
> double ctrl_gates;
> double SerDer_gates;
> double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
> double NMOS_sizing;
> double PMOS_sizing;
397,398c445,448
< power = power_t;
< power.readOp.dynamic *= fcp.duty_cycle;
---
> /* Assuming Flash is bit-slice based architecture
> * This is the reason for /8 in both area and power calculation
> * to get per lane numbers
> */
399a450,475
> if (fcp.type == 0) { //high performance flash controller
> cout << "Current McPAT does not support high performance flash "
> << "controller since even low power designs are enough for "
> << "maintain throughput" <<endl;
> exit(0);
> NMOS_sizing = 5 * g_tp.min_w_nmos_;
> PMOS_sizing = 5 * g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r;
> } else {
> //Based on the PCIe PHY TSMC65GP from Cadence ChipEstimate @ 65nm, which
> //supports 8x lanes with each lane speed up to 250MB/s (PCIe1.1x).
> //This already saturates the 200MB/s of the flash controller core
> //above.
> ctrl_gates = 129267;
> SerDer_gates = 200000 / 8;
> NMOS_sizing = g_tp.min_w_nmos_;
> PMOS_sizing = g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r;
>
> //Power
> //Cadence ChipEstimate using 65nm the controller 125mW for every
> //200MB/s This is power not energy!
> ctrl_dyn = 0.125 * g_tp.peri_global.Vdd / 1.1 * g_tp.peri_global.Vdd /
> 1.1 * (interface_ip.F_sz_nm / 65.0);
> //SerDer_dyn is power not energy, scaling from 10mw/Gb/s @90nm
> SerDer_dyn = 0.01 * 1.6 * (interface_ip.F_sz_um / 0.09) *
> g_tp.peri_global.Vdd / 1.2 * g_tp.peri_global.Vdd / 1.2;
> //max Per controller speed is 1.6Gb/s (200MB/s)
401,405c477,500
< else
< {
< rt_power = power_t;
< rt_power.readOp.dynamic *= fcp.perc_load;
< }
---
>
> double number_channel = 1 + (fcp.num_channels - 1) * 0.2;
> power.readOp.dynamic = (ctrl_dyn + (fcp.withPHY ? SerDer_dyn : 0)) *
> number_channel;
> power.readOp.leakage = ((ctrl_gates + (fcp.withPHY ? SerDer_gates : 0)) *
> number_channel) *
> cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand) *
> g_tp.peri_global.Vdd;//unit W
> double long_channel_device_reduction =
> longer_channel_device_reduction(Uncore_device);
> power.readOp.longer_channel_leakage =
> power.readOp.leakage * long_channel_device_reduction;
> power.readOp.gate_leakage =
> ((ctrl_gates + (fcp.withPHY ? SerDer_gates : 0)) * number_channel) *
> cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand) *
> g_tp.peri_global.Vdd;//unit W
>
> // Output power
> output_data.subthreshold_leakage_power =
> longer_channel_device ? power.readOp.longer_channel_leakage :
> power.readOp.leakage;
> output_data.gate_leakage_power = power.readOp.gate_leakage;
> output_data.peak_dynamic_power = power.readOp.dynamic * fcs.duty_cycle;
> output_data.runtime_dynamic_energy = power.readOp.dynamic * fcs.perc_load;
408c503
< void FlashController::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
---
> void FlashController::set_fc_param()
410,412c505,510
< string indent_str(indent, ' ');
< string indent_str_next(indent+2, ' ');
< bool long_channel = XML->sys.longer_channel_device;
---
> int num_children = xml_data->nChildNode("param");
> int i;
> for (i = 0; i < num_children; i++) {
> XMLNode* paramNode = xml_data->getChildNodePtr("param", &i);
> XMLCSTR node_name = paramNode->getAttribute("name");
> XMLCSTR value = paramNode->getAttribute("value");
414,427c512,513
< if (is_tdp)
< {
< cout << "Flash Controller:" << endl;
< cout << indent_str<< "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
< cout << indent_str << "Peak Dynamic = " << power.readOp.dynamic << " W" << endl;//no multiply of clock since this is power already
< cout << indent_str<< "Subthreshold Leakage = "
< << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
< //cout << indent_str<< "Subthreshold Leakage = " << power.readOp.longer_channel_leakage <<" W" << endl;
< cout << indent_str<< "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
< cout << indent_str << "Runtime Dynamic = " << rt_power.readOp.dynamic << " W" << endl;
< cout<<endl;
< }
< else
< {
---
> if (!node_name)
> warnMissingParamName(paramNode->getAttribute("id"));
428a515,520
> ASSIGN_INT_IF("num_channels", fcp.num_channels);
> ASSIGN_INT_IF("type", fcp.type);
> ASSIGN_ENUM_IF("withPHY", fcp.withPHY, bool);
>
> else {
> warnUnrecognizedParam(node_name);
429a522
> }
431c524,528
< }
---
> num_children = xml_data->nChildNode("stat");
> for (i = 0; i < num_children; i++) {
> XMLNode* statNode = xml_data->getChildNodePtr("stat", &i);
> XMLCSTR node_name = statNode->getAttribute("name");
> XMLCSTR value = statNode->getAttribute("value");
433,444c530,531
< void FlashController::set_fc_param()
< {
< // fcp.clockRate = XML->sys.flashc.mc_clock;
< // fcp.clockRate *= 1e6;
< fcp.peakDataTransferRate = XML->sys.flashc.peak_transfer_rate;
< fcp.num_channels = ceil(fcp.peakDataTransferRate/200);
< fcp.num_mcs = XML->sys.flashc.number_mcs;
< fcp.duty_cycle = XML->sys.flashc.duty_cycle;
< fcp.perc_load = XML->sys.flashc.total_load_perc;
< fcp.type = XML->sys.flashc.type;
< fcp.withPHY = XML->sys.flashc.withPHY;
< // flashcp.executionTime = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6);
---
> if (!node_name)
> warnMissingStatName(statNode->getAttribute("id"));
445a533,539
> ASSIGN_FP_IF("duty_cycle", fcs.duty_cycle);
> ASSIGN_FP_IF("perc_load", fcs.perc_load);
>
> else {
> warnUnrecognizedStat(node_name);
> }
> }