ext/mcpat/logic.cc - arm/gem5 - Git at Google

 /*****************************************************************************
  *                                McPAT
  *                      SOFTWARE LICENSE AGREEMENT
  *            Copyright 2012 Hewlett-Packard Development Company, L.P.
  *                          All Rights Reserved
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met: redistributions of source code must retain the above copyright
  * notice, this list of conditions and the following disclaimer;
  * redistributions in binary form must reproduce the above copyright
  * notice, this list of conditions and the following disclaimer in the
  * documentation and/or other materials provided with the distribution;
  * neither the name of the copyright holders nor the names of its
  * contributors may be used to endorse or promote products derived from
  * this software without specific prior written permission.

  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
  *
  ***************************************************************************/

 #include "logic.h"


 //selection_logic
 /*
  * Builds the selection-logic model.
  *
  *   _is_default          stored in is_default
  *   win_entries_         stored in win_entries (consumed by selection_power())
  *   issue_width_         stored in issue_width
  *   configure_interface  copied into l_ip and handed to init_interface()
  *   device_ty_/core_ty_  forwarded to longer_channel_device_reduction()
  *
  * After construction, power.readOp holds the values produced by
  * selection_power(), with the dynamic terms scaled by g_tp.sckt_co_eff.
  */
 selection_logic::selection_logic(
     bool   _is_default,
     int    win_entries_,
     int    issue_width_,
     const InputParameter *configure_interface,
     enum Device_ty device_ty_,
     enum Core_type core_ty_)
  : is_default(_is_default),
    win_entries(win_entries_),
    issue_width(issue_width_),
    device_ty(device_ty_),
    core_ty(core_ty_)
 {
     // Work on a private copy of the caller's configuration.
     l_ip = *configure_interface;
     local_result = init_interface(&l_ip);

     // Populate power.readOp.{dynamic, leakage, gate_leakage}.
     selection_power();

     // Scale every dynamic component by the g_tp.sckt_co_eff factor.
     const double dynamic_scale = g_tp.sckt_co_eff;
     power.readOp.dynamic   *= dynamic_scale;
     power.writeOp.dynamic  *= dynamic_scale;
     power.searchOp.dynamic *= dynamic_scale;

     // Derive the longer-channel leakage from the plain leakage figure.
     const double leakage_reduction =
         longer_channel_device_reduction(device_ty, core_ty);
     power.readOp.longer_channel_leakage = power.readOp.leakage * leakage_reduction;
 }

 /*
  * Fills power.readOp.{dynamic, leakage, gate_leakage} for the selection
  * logic. The model follows the "cost effective superscalar processor" TR,
  * pp. 27-31: a tree of 4-input arbiters (anyreq OR gate + 4-bit priority
  * encoder), replicated issue_width times.
  *
  * The win_entries member is only read here. The tree walk runs on a local
  * copy so the configured window size is preserved and repeated calls
  * produce the same result.
  */
 void selection_logic::selection_power()
 {
   //TODO: the 0.8um process data is used.
   const double WSelORn     = 12.5  * l_ip.F_sz_um;//this was 10 micron for the 0.8 micron process
   const double WSelORprequ = 50    * l_ip.F_sz_um;//this was 40 micron for the 0.8 micron process
   const double WSelPn      = 12.5  * l_ip.F_sz_um;//this was 10 micron for the 0.8 micron process
   const double WSelPp      = 18.75 * l_ip.F_sz_um;//this was 15 micron for the 0.8 micron process
   const double WSelEnn     = 6.25  * l_ip.F_sz_um;//this was 5 micron for the 0.8 micron process
   const double WSelEnp     = 12.5  * l_ip.F_sz_um;//this was 10 micron for the 0.8 micron process

   // Count the arbiters: each level groups the remaining requests by four
   // until a single 4-input arbiter can cover them.
   int num_arbiter = 1;
   int remaining_entries = win_entries;
   while (remaining_entries > 4)
     {
       remaining_entries = static_cast<int>(ceil(static_cast<double>(remaining_entries) / 4.0));
       num_arbiter += remaining_entries;
     }

   //the 4-input OR logic to generate anyreq
   const double Cor = 4 * drain_C_(WSelORn,NCH,1,1, g_tp.cell_h_def) + drain_C_(WSelORprequ,PCH,1,1, g_tp.cell_h_def);
   // NOTE(review): this store is overwritten by the gate_leakage assignment at
   // the end of the function, so the OR-gate term never reaches the result.
   // Kept as-is; confirm whether it was meant to be accumulated.
   power.readOp.gate_leakage = cmos_Ig_leakage(WSelORn, WSelORprequ, 4, nor)*g_tp.peri_global.Vdd;

   //The total capacity of the 4-bit priority encoder
   const double Cpencode =
     drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,1, 1, g_tp.cell_h_def) +
     2*drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,2, 1, g_tp.cell_h_def) +
     3*drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,3, 1, g_tp.cell_h_def) +
     4*drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,4, 1, g_tp.cell_h_def) +//precompute priority logic
     2*4*gate_C(WSelEnn+WSelEnp,20.0)+
     4*drain_C_(WSelEnn,NCH,1, 1, g_tp.cell_h_def) + 2*4*drain_C_(WSelEnp,PCH,1, 1, g_tp.cell_h_def)+//enable logic
     (2*4+2*3+2*2+2)*gate_C(WSelPn+WSelPp,10.0);//requests signal

   const double Ctotal = issue_width * num_arbiter*(Cor+Cpencode);

   //2 means the arbitration signal needs to travel a round trip
   power.readOp.dynamic = Ctotal*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*2;

   power.readOp.leakage = issue_width * num_arbiter *
       (cmos_Isub_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p
        + cmos_Isub_leakage(WSelPn, WSelPp, 3, nor)//grant2p
        + cmos_Isub_leakage(WSelPn, WSelPp, 4, nor)//grant3p
        + cmos_Isub_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic
        + cmos_Isub_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant signals
       )*g_tp.peri_global.Vdd;

   power.readOp.gate_leakage = issue_width * num_arbiter *
       (cmos_Ig_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p
        + cmos_Ig_leakage(WSelPn, WSelPp, 3, nor)//grant2p
        + cmos_Ig_leakage(WSelPn, WSelPp, 4, nor)//grant3p
        + cmos_Ig_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic
        + cmos_Ig_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant signals
       )*g_tp.peri_global.Vdd;
 }


 /*
  * Builds the dependency / resource conflict check model.
  *
  *   configure_interface  copied into l_ip and handed to init_interface()
  *   dyn_p_               core parameters (core_ty, decodeW are read here)
  *   compare_bits_        base comparator width; 16 + 8 + 8 extra bits are
  *                        added below
  *   _is_default          stored in is_default
  *
  * After construction, power.readOp holds the values produced by
  * conflict_check_power(), with the dynamic terms scaled by g_tp.sckt_co_eff.
  */
 dep_resource_conflict_check::dep_resource_conflict_check(
         const InputParameter *configure_interface,
         const CoreDynParam & dyn_p_,
         int   compare_bits_,
     bool   _is_default)
  :  l_ip(*configure_interface),
     coredynp(dyn_p_),
     compare_bits(compare_bits_),
         is_default(_is_default)
 {
         // Transistor widths, scaled from the 0.8 micron process data.
         Wcompn      = 25 * l_ip.F_sz_um;//this was 20.0 micron for the 0.8 micron process
         Wevalinvp   = 25 * l_ip.F_sz_um;//this was 20.0 micron for the 0.8 micron process
         Wevalinvn   = 100 * l_ip.F_sz_um;//this was 80.0 micron for the 0.8 micron process
         Wcomppreequ = 50 * l_ip.F_sz_um;//this was 40.0 micron for the 0.8 micron process
         WNORn       = 6.75 * l_ip.F_sz_um;//this was 5.4 micron for the 0.8 micron process
         WNORp       = 38.125 * l_ip.F_sz_um;//this was 30.5 micron for the 0.8 micron process

         local_result = init_interface(&l_ip);

         // In-order and out-of-order cores currently widen the comparator by
         // the same amount, so no branch on coredynp.core_ty is needed.
         //TODO: opcode bits + log(shared resources) + REG TAG BITS-->opcode comparator
         compare_bits += 16 + 8 + 8;

         conflict_check_power();

         // Scale every dynamic component by the g_tp.sckt_co_eff factor.
         const double sckRation = g_tp.sckt_co_eff;
         power.readOp.dynamic *= sckRation;
         power.writeOp.dynamic *= sckRation;
         power.searchOp.dynamic *= sckRation;
 }

 void dep_resource_conflict_check::conflict_check_power()
 {
         double Ctotal;
         int num_comparators;
         num_comparators = 3*((coredynp.decodeW) * (coredynp.decodeW)-coredynp.decodeW);//2(N*N-N) is used for source to dest comparison, (N*N-N) is used for dest to dest comparision.
         //When decode-width ==1, no dcl logic

         Ctotal = num_comparators * compare_cap();
         //printf("%i,%s\n",XML_interface->sys.core[0].predictor.predictor_entries,XML_interface->sys.core[0].predictor.prediction_scheme);

         power.readOp.dynamic=Ctotal*/*CLOCKRATE*/g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/*AF*/;
         power.readOp.leakage=num_comparators*compare_bits*2*simplified_nmos_leakage(Wcompn,  false);

         double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty);
         power.readOp.longer_channel_leakage	= power.readOp.leakage*long_channel_device_reduction;
         power.readOp.gate_leakage=num_comparators*compare_bits*2*cmos_Ig_leakage(Wcompn, 0, 2, nmos);

 }

 /*
  * Estimate the switched capacitance of one comparator (this comparator is
  * similar to the tag-match structure in a CAM).
  *
  * Returns the sum of the "top" and "bottom" node capacitances for a
  * comparator of compare_bits bits.
  *
  * The big NOR gate at the DCL is resized according to fan-in. The resized
  * width is kept in a local so the WNORp member is not rescaled on every
  * call; repeated calls therefore return the same value.
  */
 double dep_resource_conflict_check::compare_cap()
 {
   // PMOS width of the NOR gate after fan-in resizing.
   const double resized_WNORp = WNORp * compare_bits/2.0;

   /* bottom part of comparator */
   const double c2 = (compare_bits)*(drain_C_(Wcompn,NCH,1,1, g_tp.cell_h_def)+drain_C_(Wcompn,NCH,2,1, g_tp.cell_h_def))+
     drain_C_(Wevalinvp,PCH,1,1, g_tp.cell_h_def) + drain_C_(Wevalinvn,NCH,1,1, g_tp.cell_h_def);

   /* top part of comparator */
   // NOTE(review): Wcomppreequ and the resized WNORp are passed with NCH
   // here, as in the original code -- confirm that this is intended.
   const double c1 = (compare_bits)*(drain_C_(Wcompn,NCH,1,1, g_tp.cell_h_def)+drain_C_(Wcompn,NCH,2,1, g_tp.cell_h_def)+
                   drain_C_(Wcomppreequ,NCH,1,1, g_tp.cell_h_def)) +  gate_C(WNORn + resized_WNORp,10.0) +
                   drain_C_(resized_WNORp,NCH,2,1, g_tp.cell_h_def) + compare_bits*drain_C_(WNORn,NCH,2,1, g_tp.cell_h_def);

   return(c1 + c2);
 }

 void dep_resource_conflict_check::leakage_feedback(double temperature)
 {
   l_ip.temp = (unsigned int)round(temperature/10.0)*10;
   uca_org_t init_result = init_interface(&l_ip); // init_result is dummy

   // This is part of conflict_check_power()
   int num_comparators = 3*((coredynp.decodeW) * (coredynp.decodeW)-coredynp.decodeW);//2(N*N-N) is used for source to dest comparison, (N*N-N) is used for dest to dest comparision.
   power.readOp.leakage=num_comparators*compare_bits*2*simplified_nmos_leakage(Wcompn,  false);

   double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty);
   power.readOp.longer_channel_leakage	= power.readOp.leakage*long_channel_device_reduction;
   power.readOp.gate_leakage=num_comparators*compare_bits*2*cmos_Ig_leakage(Wcompn, 0, 2, nmos);
 }

 //TODO: add an inverter and transmission-gate based DFF.

 /*
  * D flip-flop cell; this model is based on the NAND2 based DFF.
  *
  *   _is_dram             forwarded to the capacitance helpers
  *   _WdecNANDn/_WdecNANDp  NMOS / PMOS widths of the NAND gates
  *   _cell_load           load capacitance added to the switching energy
  *   configure_interface  copied into l_ip
  *
  * The constructor only sets the cell area; energies are filled in by
  * compute_DFF_cell().
  */
 DFFCell::DFFCell(
                 bool _is_dram,
                 double _WdecNANDn,
                 double _WdecNANDp,
                 double _cell_load,
                 const InputParameter *configure_interface)
 :is_dram(_is_dram),
 cell_load(_cell_load),
 WdecNANDn(_WdecNANDn),
 WdecNANDp(_WdecNANDp)
 {
     l_ip = *configure_interface;

     // Area of five NAND2 gates plus one more gate sized with WdecNANDn on
     // both inputs.
     // NOTE(review): compute_DFF_cell() models the sixth gate as a NAND3 for
     // leakage, while the area term here uses fan-in 2 -- confirm.
     const double nand2_area =
         compute_gate_area(NAND, 2,WdecNANDn,WdecNANDp, g_tp.cell_h_def);
     const double sixth_gate_area =
         compute_gate_area(NAND, 2,WdecNANDn,WdecNANDn, g_tp.cell_h_def);
     area.set_area(5*nand2_area + sixth_gate_area);
 }


 /*
  * Capacitance of one internal node of the flip-flop.
  *
  *   fan_in   multiplier applied to the PMOS drain capacitance
  *   fan_out  number of NAND gate inputs loading the node
  *
  * Returns the drain capacitance of the driving NAND gate plus the gate
  * capacitance of the driven NAND gates.
  */
 double DFFCell::fpfp_node_cap(unsigned int fan_in, unsigned int fan_out)
 {
   /* part 1: drain cap of NAND gate */
   const double driver_drain_cap =
       drain_C_(WdecNANDn, NCH, 2, 1, g_tp.cell_h_def, is_dram)
       + fan_in * drain_C_(WdecNANDp, PCH, 1, 1, g_tp.cell_h_def, is_dram);

   /* part 2: gate cap of NAND gates */
   const double load_gate_cap = fan_out * gate_C(WdecNANDn + WdecNANDp, 0, is_dram);

   return driver_drain_cap + load_gate_cap;
 }


 /*
  * Accumulates the per-cell energies of the flip-flop into e_switch,
  * e_keep_0, e_keep_1 and e_clock, and sets clock_cap.
  *
  * All results are added (+=) to the existing readOp fields.
  */
 void DFFCell::compute_DFF_cell()
 {
     const double vdd = g_tp.peri_global.Vdd;

     /* node 5 and node 6 are identical to node 1 in capacitance */
     const double node1_cap = fpfp_node_cap(2, 1);
     const double node5_cap = node1_cap;
     const double node6_cap = node1_cap;
     const double node2_cap = fpfp_node_cap(2, 3);
     const double node3_cap = fpfp_node_cap(3, 2);
     const double node4_cap = fpfp_node_cap(2, 2);

     //cap-load of the clock signal in each DFF, actually the clock signal is only connected to one NAND2
     clock_cap = 2 * gate_C(WdecNANDn + WdecNANDp, 0, is_dram);

     /* switching energy: 1/2 * C * Vdd^2 over all nodes plus twice the cell load */
     e_switch.readOp.dynamic +=
         (node4_cap + node1_cap + node2_cap + node3_cap + node5_cap + node6_cap + 2*cell_load)
         *0.5*vdd * vdd;

     /* no 1/2 for e_keep and e_clock because clock signal switches twice in one cycle */
     e_keep_1.readOp.dynamic += node3_cap * vdd * vdd;
     e_keep_0.readOp.dynamic += node2_cap * vdd * vdd;
     e_clock.readOp.dynamic  += clock_cap* vdd * vdd;

     /* static power: 5 NAND2 and 1 NAND3 in a DFF */
     e_switch.readOp.leakage +=
         (cmos_Isub_leakage(WdecNANDn, WdecNANDp, 2, nand)*5
          + cmos_Isub_leakage(WdecNANDn, WdecNANDn, 3, nand))*vdd;
     e_switch.readOp.gate_leakage +=
         (cmos_Ig_leakage(WdecNANDn, WdecNANDp, 2, nand)*5
          + cmos_Ig_leakage(WdecNANDn, WdecNANDn, 3, nand))*vdd;
 }

 /*
  * Builds the pipeline-register model and immediately runs compute().
  *
  *   configure_interface  copied into l_ip and handed to init_interface()
  *   dyn_p_               core parameters
  *   device_ty_           forwarded to longer_channel_device_reduction()
  *   _is_core_pipeline    selects how compute_stage_vector() counts registers
  *   _is_default          stored in is_default
  */
 Pipeline::Pipeline(
                 const InputParameter *configure_interface,
                 const CoreDynParam & dyn_p_,
                 enum Device_ty device_ty_,
                 bool _is_core_pipeline,
                 bool _is_default)
 : l_ip(*configure_interface),
   coredynp(dyn_p_),
   device_ty(device_ty_),
   is_core_pipeline(_is_core_pipeline),
   is_default(_is_default),
   num_piperegs(0.0)

   {
         local_result = init_interface(&l_ip);

         // Non-embedded cores use feature-size-scaled widths; embedded cores
         // use minimum-width devices.
         process_ind = !coredynp.Embedded;
         if (process_ind)
         {
                 WNANDn = 25 *   l_ip.F_sz_um;//this was  20 micron for the 0.8 micron process
                 WNANDp = 37.5 * l_ip.F_sz_um;//this was  30 micron for the 0.8 micron process
         }
         else
         {
                 WNANDn = g_tp.min_w_nmos_;
                 WNANDp = g_tp.min_w_nmos_*pmos_to_nmos_sz_ratio();
         }

         // Each register drives two NAND gate inputs.
         load_per_pipeline_stage = 2*gate_C(WNANDn + WNANDp, 0, false);
         compute();
 }

 void Pipeline::compute()
 {
         compute_stage_vector();
         DFFCell pipe_reg(false, WNANDn,WNANDp, load_per_pipeline_stage, &l_ip);
         pipe_reg.compute_DFF_cell();

         double clock_power_pipereg = num_piperegs * pipe_reg.e_clock.readOp.dynamic;
         //******************pipeline power: currently, we average all the possibilities of the states of DFFs in the pipeline. A better way to do it is to consider
         //the harming distance of two consecutive signals, However McPAT does not have plan to do this in near future as it focuses on worst case power.
         double pipe_reg_power = num_piperegs * (pipe_reg.e_switch.readOp.dynamic+pipe_reg.e_keep_0.readOp.dynamic+pipe_reg.e_keep_1.readOp.dynamic)/3+clock_power_pipereg;
         double pipe_reg_leakage = num_piperegs * pipe_reg.e_switch.readOp.leakage;
         double pipe_reg_gate_leakage = num_piperegs * pipe_reg.e_switch.readOp.gate_leakage;
         power.readOp.dynamic	+=pipe_reg_power;
         power.readOp.leakage	+=pipe_reg_leakage;
         power.readOp.gate_leakage	+=pipe_reg_gate_leakage;
         area.set_area(num_piperegs * pipe_reg.area.get_area());

         double long_channel_device_reduction = longer_channel_device_reduction(device_ty, coredynp.core_ty);
         power.readOp.longer_channel_leakage	= power.readOp.leakage*long_channel_device_reduction;


         double sckRation = g_tp.sckt_co_eff;
         power.readOp.dynamic *= sckRation;
         power.writeOp.dynamic *= sckRation;
         power.searchOp.dynamic *= sckRation;
         double macro_layout_overhead = g_tp.macro_layout_overhead;
         if (!coredynp.Embedded)
                 area.set_area(area.get_area()*macro_layout_overhead);
 }

 void Pipeline::compute_stage_vector()
 {
         double num_stages, tot_stage_vector, per_stage_vector;
         int opcode_length = coredynp.x86? coredynp.micro_opcode_length:coredynp.opcode_length;
         //Hthread = thread_clock_gated? 1:num_thread;

   if (!is_core_pipeline)
   {
         num_piperegs=l_ip.pipeline_stages*l_ip.per_stage_vector;//The number of pipeline stages are calculated based on the achievable throughput and required throughput
   }
   else
   {
         if (coredynp.core_ty==Inorder)
         {
                 /* assume 6 pipe stages and try to estimate bits per pipe stage */
                 /* pipe stage 0/IF */
                 num_piperegs += coredynp.pc_width*2*coredynp.num_hthreads;
                 /* pipe stage IF/ID */
                 num_piperegs += coredynp.fetchW*(coredynp.instruction_length + coredynp.pc_width)*coredynp.num_hthreads;
                 /* pipe stage IF/ThreadSEL */
                 if (coredynp.multithreaded) num_piperegs += coredynp.num_hthreads*coredynp.perThreadState; //8 bit thread states
                 /* pipe stage ID/EXE */
                 num_piperegs += coredynp.decodeW*(coredynp.instruction_length + coredynp.pc_width + pow(2.0,opcode_length)+ 2*coredynp.int_data_width)*coredynp.num_hthreads;
                 /* pipe stage EXE/MEM */
                 num_piperegs += coredynp.issueW*(3 * coredynp.arch_ireg_width + pow(2.0,opcode_length) + 8*2*coredynp.int_data_width/*+2*powers (2,reg_length)*/);
                 /* pipe stage MEM/WB the 2^opcode_length means the total decoded signal for the opcode*/
                 num_piperegs += coredynp.issueW*(2*coredynp.int_data_width + pow(2.0,opcode_length) + 8*2*coredynp.int_data_width/*+2*powers (2,reg_length)*/);
 //		/* pipe stage 5/6 */
 //		num_piperegs += issueWidth*(data_width + powers (2,opcode_length)/*+2*powers (2,reg_length)*/);
 //		/* pipe stage 6/7 */
 //		num_piperegs += issueWidth*(data_width + powers (2,opcode_length)/*+2*powers (2,reg_length)*/);
 //		/* pipe stage 7/8 */
 //		num_piperegs += issueWidth*(data_width + powers (2,opcode_length)/**2*powers (2,reg_length)*/);
 //		/* assume 50% extra in control signals (rule of thumb) */
                 num_stages=6;

         }
         else
         {
                 /* assume 12 stage pipe stages and try to estimate bits per pipe stage */
                 /*OOO: Fetch, decode, rename, IssueQ, dispatch, regread, EXE, MEM, WB, CM */

                 /* pipe stage 0/1F*/
                 num_piperegs += coredynp.pc_width*2*coredynp.num_hthreads ;//PC and Next PC
                 /* pipe stage IF/ID */
                 num_piperegs += coredynp.fetchW*(coredynp.instruction_length + coredynp.pc_width)*coredynp.num_hthreads;//PC is used to feed branch predictor in ID
                 /* pipe stage 1D/Renaming*/
                 num_piperegs += coredynp.decodeW*(coredynp.instruction_length + coredynp.pc_width)*coredynp.num_hthreads;//PC is for branch exe in later stage.
                 /* pipe stage Renaming/wire_drive */
                 num_piperegs += coredynp.decodeW*(coredynp.instruction_length + coredynp.pc_width);
                 /* pipe stage Renaming/IssueQ */
                 num_piperegs += coredynp.issueW*(coredynp.instruction_length  + coredynp.pc_width + 3*coredynp.phy_ireg_width)*coredynp.num_hthreads;//3*coredynp.phy_ireg_width means 2 sources and 1 dest
                 /* pipe stage IssueQ/Dispatch */
                 num_piperegs += coredynp.issueW*(coredynp.instruction_length + 3 * coredynp.phy_ireg_width);
                 /* pipe stage Dispatch/EXE */

                 num_piperegs += coredynp.issueW*(3 * coredynp.phy_ireg_width + coredynp.pc_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/);
                 /* 2^opcode_length means the total decoded signal for the opcode*/
                 num_piperegs += coredynp.issueW*(2*coredynp.int_data_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/);
                 /*2 source operands in EXE; Assume 2EXE stages* since we do not really distinguish OP*/
                 num_piperegs += coredynp.issueW*(2*coredynp.int_data_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/);
                 /* pipe stage EXE/MEM, data need to be read/write, address*/
                 num_piperegs += coredynp.issueW*(coredynp.int_data_width + coredynp.v_address_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/);//memory Opcode still need to be passed
                 /* pipe stage MEM/WB; result data, writeback regs */
                 num_piperegs += coredynp.issueW*(coredynp.int_data_width + coredynp.phy_ireg_width /* powers (2,opcode_length) + (2,opcode_length)+2*powers (2,reg_length)*/);
                 /* pipe stage WB/CM ; result data, regs need to be updated, address for resolve memory ops in ROB's top*/
                 num_piperegs += coredynp.commitW*(coredynp.int_data_width + coredynp.v_address_width + coredynp.phy_ireg_width/*+ powers (2,opcode_length)*2*powers (2,reg_length)*/)*coredynp.num_hthreads;
 //		if (multithreaded)
 //		{
 //
 //		}
                 num_stages=12;

         }

         /* assume 50% extra in control registers and interrupt registers (rule of thumb) */
         num_piperegs = num_piperegs * 1.5;
         tot_stage_vector=num_piperegs;
         per_stage_vector=tot_stage_vector/num_stages;

         if (coredynp.core_ty==Inorder)
         {
                 if (coredynp.pipeline_stages>6)
                         num_piperegs= per_stage_vector*coredynp.pipeline_stages;
         }
         else//OOO
         {
                 if (coredynp.pipeline_stages>12)
                         num_piperegs= per_stage_vector*coredynp.pipeline_stages;
         }
   }

 }

 /*
  * Builds the model of one class of functional unit (FPU, ALU or MUL).
  *
  *   XML_interface  parsed configuration; sys.Embedded selects the embedded
  *                  or the non-embedded coefficient set
  *   ithCore_       index of the owning core
  *   interface_ip_  copied into interface_ip and handed to init_interface()
  *   dyn_p_         core parameters (unit counts, clock rate, core type)
  *   fu_type_       which unit type to model
  *
  * Sets num_fu, leakage, gate_leakage, base_energy, per_access_energy,
  * FU_height and area. leakage, gate_leakage and area are totals over all
  * num_fu units; area additionally includes g_tp.macro_layout_overhead.
  *
  * An unknown fu_type prints a message and terminates the process with a
  * non-zero exit status.
  */
 FunctionalUnit::FunctionalUnit(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, enum FU_type fu_type_)
 :XML(XML_interface),
  ithCore(ithCore_),
  interface_ip(*interface_ip_),
  coredynp(dyn_p_),
  fu_type(fu_type_)
 {
     double area_t;//um^2, area of a single unit
     clockRate = coredynp.clockRate;
     executionTime = coredynp.executionTime;

     uca_org_t result2;
     result2 = init_interface(&interface_ip);
     // Read the sizing ratio only after init_interface(), matching the order
     // used by the Pipeline constructor (presumably init_interface refreshes
     // the global technology parameters -- TODO confirm).
     const double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();

     if (XML->sys.Embedded)
     {
         if (fu_type == FPU)
         {
             num_fu=coredynp.num_fpus;
             //The base number 4.47 contains both VFP and NEON processing unit, VFP is about 40% and NEON is about 60%
             area_t = 4.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2
             if (g_ip->F_sz_nm>90)
                 area_t = 4.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
             leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
             gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
             base_energy = 0;
             //FPU power from Sandia's processor sizing tech report
             per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);
             FU_height=(18667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data
         }
         else if (fu_type == ALU)
         {
             num_fu=coredynp.num_alus;
             area_t = 280*260*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
             leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
             gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
             base_energy = 0;
             per_access_energy = 1.15/3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);
             FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU
         }
         else if (fu_type == MUL)
         {
             num_fu=coredynp.num_muls;
             area_t = 280*260*3*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
             leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
             gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
             base_energy = 0;
             //coefficient based on Wattch
             per_access_energy = 1.15*2/3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);
             FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data
         }
         else
         {
             cout<<"Unknown Functional Unit Type"<<endl;
             // An unrecognised unit type is a fatal configuration error, so
             // do not report success to the caller's shell.
             exit(1);
         }
         per_access_energy *=0.5;//According to ARM data embedded processor has much lower per acc energy
     }
     else
     {
         if (fu_type == FPU)
         {
             num_fu=coredynp.num_fpus;
             area_t = 8.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2
             if (g_ip->F_sz_nm>90)
                 area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
             leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
             gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
             base_energy = coredynp.core_ty==Inorder? 0: 89e-3*3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
             base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
             per_access_energy = 1.15*3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);
             FU_height=(38667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data
         }
         else if (fu_type == ALU)
         {
             num_fu=coredynp.num_alus;
             area_t = 280*260*2*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
             leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
             gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
             base_energy = coredynp.core_ty==Inorder? 0:89e-3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
             base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
             per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);
             FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU
         }
         else if (fu_type == MUL)
         {
             num_fu=coredynp.num_muls;
             area_t = 280*260*2*3*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
             leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
             gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
             base_energy = coredynp.core_ty==Inorder? 0:89e-3*2; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
             base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
             //coefficient based on Wattch
             per_access_energy = 1.15*2/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);
             FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data
         }
         else
         {
             cout<<"Unknown Functional Unit Type"<<endl;
             // An unrecognised unit type is a fatal configuration error, so
             // do not report success to the caller's shell.
             exit(1);
         }
     }

     // Totals over all units of this type.
     area.set_area(area_t*num_fu);
     leakage *= num_fu;
     gate_leakage *=num_fu;

     // The macro layout overhead is applied to embedded and non-embedded
     // configurations alike.
     const double macro_layout_overhead = g_tp.macro_layout_overhead;
     area.set_area(area.get_area()*macro_layout_overhead);
 }

 void FunctionalUnit::computeEnergy(bool is_tdp)
 {
         double pppm_t[4]    = {1,1,1,1};
         double FU_duty_cycle;
         if (is_tdp)
         {


                 set_pppm(pppm_t, 2, 2, 2, 2);//2 means two source operands needs to be passed for each int instruction.
                 if (fu_type == FPU)
                 {
                         stats_t.readAc.access = num_fu;
                         tdp_stats = stats_t;
                         FU_duty_cycle = coredynp.FPU_duty_cycle;
                 }
                 else if (fu_type == ALU)
                 {
                         stats_t.readAc.access = 1*num_fu;
                         tdp_stats = stats_t;
                         FU_duty_cycle = coredynp.ALU_duty_cycle;
                 }
                 else if (fu_type == MUL)
                 {
                         stats_t.readAc.access = num_fu;
                         tdp_stats = stats_t;
                         FU_duty_cycle = coredynp.MUL_duty_cycle;
                 }

             //power.readOp.dynamic = base_energy/clockRate + energy*stats_t.readAc.access;
             power.readOp.dynamic = per_access_energy*stats_t.readAc.access + base_energy/clockRate;
                 double sckRation = g_tp.sckt_co_eff;
                 power.readOp.dynamic *= sckRation*FU_duty_cycle;
                 power.writeOp.dynamic *= sckRation;
                 power.searchOp.dynamic *= sckRation;

             power.readOp.leakage = leakage;
             power.readOp.gate_leakage = gate_leakage;
             double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty);
             power.readOp.longer_channel_leakage	= power.readOp.leakage*long_channel_device_reduction;

         }
         else
         {
                 if (fu_type == FPU)
                 {
                         stats_t.readAc.access = XML->sys.core[ithCore].fpu_accesses;
                         rtp_stats = stats_t;
                 }
                 else if (fu_type == ALU)
                 {
                         stats_t.readAc.access = XML->sys.core[ithCore].ialu_accesses;
                         rtp_stats = stats_t;
                 }
                 else if (fu_type == MUL)
                 {
                         stats_t.readAc.access = XML->sys.core[ithCore].mul_accesses;
                         rtp_stats = stats_t;
                 }

             //rt_power.readOp.dynamic = base_energy*executionTime + energy*stats_t.readAc.access;
             rt_power.readOp.dynamic = per_access_energy*stats_t.readAc.access + base_energy*executionTime;
                 double sckRation = g_tp.sckt_co_eff;
                 rt_power.readOp.dynamic *= sckRation;
                 rt_power.writeOp.dynamic *= sckRation;
                 rt_power.searchOp.dynamic *= sckRation;

         }


 }

 void FunctionalUnit::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
 {
         string indent_str(indent, ' ');
         string indent_str_next(indent+2, ' ');
         bool long_channel = XML->sys.longer_channel_device;

 //	cout << indent_str_next << "Results Broadcast Bus Area = " << bypass->area.get_area() *1e-6 << " mm^2" << endl;
         if (is_tdp)
         {
                 if (fu_type == FPU)
                 {
                         cout << indent_str << "Floating Point Units (FPUs) (Count: "<< coredynp.num_fpus <<" ):" << endl;
                         cout << indent_str_next << "Area = " << area.get_area()*1e-6  << " mm^2" << endl;
                         cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate  << " W" << endl;
 //			cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage  << " W" << endl;
                         cout << indent_str_next<< "Subthreshold Leakage = "
                                                 << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
                         cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage  << " W" << endl;
                         cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl;
                         cout <<endl;
                 }
                 else if (fu_type == ALU)
                 {
                         cout << indent_str << "Integer ALUs (Count: "<< coredynp.num_alus <<" ):" << endl;
                         cout << indent_str_next << "Area = " << area.get_area()*1e-6  << " mm^2" << endl;
                         cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate  << " W" << endl;
 //			cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage  << " W" << endl;
                         cout << indent_str_next<< "Subthreshold Leakage = "
                                                 << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
                         cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage  << " W" << endl;
                         cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl;
                         cout <<endl;
                 }
                 else if (fu_type == MUL)
                 {
                         cout << indent_str << "Complex ALUs (Mul/Div) (Count: "<< coredynp.num_muls <<" ):" << endl;
                         cout << indent_str_next << "Area = " << area.get_area()*1e-6  << " mm^2" << endl;
                         cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate  << " W" << endl;
 //			cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage  << " W" << endl;
                         cout << indent_str_next<< "Subthreshold Leakage = "
                                                 << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
                         cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage  << " W" << endl;
                         cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl;
                         cout <<endl;

                 }

         }
         else
         {
         }

 }

 void FunctionalUnit::leakage_feedback(double temperature)
 {
   // Update the temperature and initialize the global interfaces.
   interface_ip.temp = (unsigned int)round(temperature/10.0)*10;

   uca_org_t init_result = init_interface(&interface_ip); // init_result is dummy

   // This is part of FunctionalUnit()
   double area_t, leakage, gate_leakage;
   double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();

   if (fu_type == FPU)
   {
         area_t = 4.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2 The base number
         if (g_ip->F_sz_nm>90)
                 area_t = 4.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
         leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
         gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
   }
   else if (fu_type == ALU)
   {
     area_t = 280*260*2*num_fu*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
     leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
     gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
   }
   else if (fu_type == MUL)
   {
     area_t = 280*260*2*3*num_fu*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
     leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
     gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
   }
   else
   {
     cout<<"Unknown Functional Unit Type"<<endl;
     exit(1);
   }

   power.readOp.leakage = leakage*num_fu;
   power.readOp.gate_leakage = gate_leakage*num_fu;
   power.readOp.longer_channel_leakage = longer_channel_device_reduction(Core_device, coredynp.core_ty);
 }

 /*
  * Models the undifferentiated part of a core: derives its area from an
  * empirical fit on the pipeline depth and thread count, then computes the
  * subthreshold, gate and long-channel leakage from that area.
  *
  * XML_interface : parsed configuration; sys.Embedded and sys.opt_clockrate
  *                 are read from it.
  * ithCore_      : index of the core being modeled.
  * interface_ip_ : input parameters; copied and passed to init_interface().
  * dyn_p_        : per-core dynamic parameters (core type, pipeline stages,
  *                 hardware threads, issue width).
  * exist_        : when false the constructor returns right after the
  *                 member initializers and computes nothing.
  * embedded_     : NOTE(review): not used; `embedded` is initialized from
  *                 XML->sys.Embedded instead. Confirm that this is intended.
  *
  * Terminates the program with status 1 when the core type is neither OOO
  * nor Inorder in the non-embedded case.
  */
 UndiffCore::UndiffCore(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam & dyn_p_, bool exist_,  bool embedded_)
 :XML(XML_interface),
  ithCore(ithCore_),
  interface_ip(*interface_ip_),
  coredynp(dyn_p_),
  core_ty(coredynp.core_ty),
  embedded(XML->sys.Embedded),
  pipeline_stage(coredynp.pipeline_stages),
  num_hthreads(coredynp.num_hthreads),
  issue_width(coredynp.issueW),
  exist(exist_)
 {
     if (!exist) return;

     double undifferentiated_core=0;
     double core_tx_density=0;
     double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
     uca_org_t result2;
     result2 = init_interface(&interface_ip);

     //Compute undifferentiated core area at 90nm.
     if (embedded==false)
     {
         //Based on the results of polynomial/log curve fitting based on undifferentiated core of Niagara, Niagara2, Merom, Penyrn, Prescott, Opteron die measurements
         const double log_stages = log(pipeline_stage);
         double fitted_area;
         if (core_ty==OOO)
         {
             fitted_area = 3.57*log_stages-1.2643;
         }
         else if (core_ty==Inorder)
         {
             fitted_area = -2.19*log_stages+6.55;
         }
         else
         {
             cout<<"invalid core type"<<endl;
             // Fatal configuration error: report failure to the caller's shell.
             exit(1);
         }
         // The fit can dip below zero; a negative area is clamped to 0.
         undifferentiated_core = fitted_area>0 ? fitted_area : 0;
         undifferentiated_core *= (1+ logtwo(num_hthreads)* 0.0716);
     }
     else
     {
         //Based on the results in paper "parametrized processor models" Sandia Labs
         const double undifferentiated_core_coe = XML->sys.opt_clockrate ? 0.05 : 0;
         const double fitted_area = (0.4109* pipeline_stage - 0.776)*undifferentiated_core_coe;
         // The linear fit is negative for very short pipelines; clamp it at 0
         // like the non-embedded fits so area and leakage never go negative.
         undifferentiated_core = fitted_area>0 ? fitted_area : 0;
         undifferentiated_core *= (1+ logtwo(num_hthreads)* 0.0426);
     }

     undifferentiated_core *= g_tp.scaling_factor.logic_scaling_co_eff*1e6;//change from mm^2 to um^2
     core_tx_density        = g_tp.scaling_factor.core_tx_density;

     power.readOp.leakage = undifferentiated_core*(core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
     power.readOp.gate_leakage = undifferentiated_core*(core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;

     double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty);
     power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction;
     area.set_area(undifferentiated_core);

     // No dynamic energy is assigned in this constructor; whatever value the
     // members already hold is only scaled by the socket coefficient.
     scktRatio = g_tp.sckt_co_eff;
     power.readOp.dynamic *= scktRatio;
     power.writeOp.dynamic *= scktRatio;
     power.searchOp.dynamic *= scktRatio;

     macro_PR_overhead = g_tp.macro_layout_overhead;
     area.set_area(area.get_area()*macro_PR_overhead);
 }


 /*
  * Prints the area, peak dynamic power and leakage of the undifferentiated
  * core to cout.
  *
  * indent : number of spaces in front of the heading; detail lines get two more.
  * plevel : not used by this function.
  * is_tdp : when true, the subthreshold line shows the long-channel leakage
  *          if XML->sys.longer_channel_device is set; when false, the plain
  *          subthreshold leakage is always shown.
  */
 void UndiffCore::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
 {
     string indent_str(indent, ' ');
     string indent_str_next(indent+2, ' ');
     bool long_channel = XML->sys.longer_channel_device;

     // The two report variants differ only in which leakage value is shown.
     const bool show_long_channel = is_tdp && long_channel;

     cout << indent_str << "UndiffCore:" << endl;
     cout << indent_str_next << "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
     cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate << " W" << endl;
     cout << indent_str_next << "Subthreshold Leakage = "
          << (show_long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
     cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
     cout <<endl;
 }

 /*
  * Builds the instruction decoder model: one final decoder plus a
  * pre-decoder made of two pre-decode blocks and their drivers, then derives
  * area and power from them.
  *
  * _is_default         : stored in is_default.
  * configure_interface : input parameters; copied into l_ip and passed to
  *                       init_interface().
  * opcode_length_      : opcode width in bits; widths above 18 are handled
  *                       as several segments of at most 18 bits.
  * num_decoders_       : number of decoders; scales the area.
  * x86_                : stored in x86; read by inst_decoder_delay_power().
  * device_ty_, core_ty_: select the long-channel leakage reduction factor.
  */
 inst_decoder::inst_decoder(
                 bool   _is_default,
                 const InputParameter *configure_interface,
                 int opcode_length_,
                 int num_decoders_,
                 bool x86_,
             enum Device_ty device_ty_,
             enum Core_type core_ty_)
 :is_default(_is_default),
  opcode_length(opcode_length_),
  num_decoders(num_decoders_),
  x86(x86_),
  device_ty(device_ty_),
  core_ty(core_ty_)
  {
     /*
      * An instruction decoder differs from the n-to-2^n decoders commonly
      * used as row decoders in memory arrays.
      *
      * A RISC instruction decoder is typically a very simple device: an
      * instruction can be decoded by slicing the machine word into small
      * parts with wires. It can be approximated by an n-to-2^n decoder,
      * although this usually underestimates power because each decoded
      * instruction normally has more than one active signal.
      *
      * Decoding a CISC instruction word is much harder. A CISC decoder is
      * typically a state machine that reads the opcode field to determine
      * the instruction type and where the other data values are. The word
      * is read piece by piece, and each stage decides how the remainder is
      * read (a sequencer and a ROM are usually needed). An x86 decoder can
      * be even more complex, since it both decodes instructions into u-ops
      * and merges u-ops when doing micro-op fusion.
      */
     bool is_dram = false;
     Area cell;

     l_ip = *configure_interface;
     local_result = init_interface(&l_ip);
     cell.h = g_tp.cell_h_def;
     cell.w = g_tp.cell_h_def;

     // Split the opcode into segments of at most 18 bits; each segment is
     // decoded into 2^opcode_length signals.
     num_decoder_segments = static_cast<int>(ceil(opcode_length/18.0));
     if (opcode_length > 18)
         opcode_length = 18;
     num_decoded_signals = static_cast<int>(pow(2.0,opcode_length));

     // Load seen by the final decoder.
     double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
     double load_nmos_width = g_tp.max_w_nmos_ /2;
     double load_pmos_width = g_tp.max_w_nmos_ * pmos_to_nmos_sizing_r;
     double C_driver_load = 1024*gate_C(load_nmos_width + load_pmos_width, 0, is_dram); //TODO: this number 1024 needs to be revisited
     double R_wire_load = 3000*l_ip.F_sz_um * g_tp.wire_outside_mat.R_per_um;

     final_dec = new Decoder(
             num_decoded_signals,
             false,
             C_driver_load,
             R_wire_load,
             false/*is_fa*/,
             false/*is_dram*/,
             false/*wl_tr*/, //to use peri device
             cell);

     // Pre-decode stage: two blocks, each with its own driver. The blocks
     // differ only in their last constructor argument.
     PredecBlk * first_blk = new PredecBlk(
             num_decoded_signals,
             final_dec,
             0,//Assuming predec and dec are back to back
             0,
             1,//Each Predec only drives one final dec
             false/*is_dram*/,
             true);
     PredecBlk * second_blk = new PredecBlk(
             num_decoded_signals,
             final_dec,
             0,//Assuming predec and dec are back to back
             0,
             1,//Each Predec only drives one final dec
             false/*is_dram*/,
             false);

     PredecBlkDrv * first_drv = new PredecBlkDrv(0, first_blk, false);
     PredecBlkDrv * second_drv = new PredecBlkDrv(0, second_blk, false);

     pre_dec = new Predec(first_drv, second_drv);

     // Area: final decoder and pre-decoder, replicated per segment and per
     // decoder, then inflated by the layout overheads.
     double area_decoder = final_dec->area.get_area() * num_decoded_signals * num_decoder_segments*num_decoders;
     double area_pre_dec = (first_drv->area.get_area() +
             second_drv->area.get_area() +
             first_blk->area.get_area() +
             second_blk->area.get_area())*
             num_decoder_segments*num_decoders;
     area.set_area(area.get_area()+ area_decoder + area_pre_dec);

     double macro_layout_overhead = g_tp.macro_layout_overhead;
     double chip_PR_overhead = g_tp.chip_layout_overhead;
     area.set_area(area.get_area()*macro_layout_overhead*chip_PR_overhead);

     inst_decoder_delay_power();

     // Scale the dynamic figures by the socket coefficient.
     double socket_scale = g_tp.sckt_co_eff;
     power.readOp.dynamic *= socket_scale;
     power.writeOp.dynamic *= socket_scale;
     power.searchOp.dynamic *= socket_scale;

     double long_channel_scale = longer_channel_device_reduction(device_ty,core_ty);
     power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_scale;
 }

 /*
  * Runs the delay computation of the pre-decoder and the final decoder and
  * adds their scaled power to `power`.
  *
  * The scaling factors depend on the number of decoder segments, the number
  * of decoded signals and on whether the decoder is x86 (factor 2) or not
  * (factor 1).
  */
 void inst_decoder::inst_decoder_delay_power()
 {
     double input_rise_time = 0;
     double sequencer_passes = x86?2:1;

     // The pre-decoder's output rise time feeds the final decoder. The final
     // decoder's own result is not used afterwards.
     double predec_rise_time = pre_dec->compute_delays(input_rise_time);
     final_dec->compute_delays(predec_rise_time);

     double predec_scale[4] = {1,1,1,1};
     set_pppm(predec_scale, sequencer_passes*num_decoder_segments, num_decoder_segments, sequencer_passes*num_decoder_segments, num_decoder_segments);
     power = power + pre_dec->power*predec_scale;

     double dec_scale[4] = {1,1,1,1};
     set_pppm(dec_scale, sequencer_passes*num_decoder_segments, num_decoder_segments*num_decoded_signals,
             num_decoder_segments*num_decoded_signals, sequencer_passes*num_decoder_segments);
     power = power + final_dec->power*dec_scale;
 }
 void inst_decoder::leakage_feedback(double temperature)
 {
   l_ip.temp = (unsigned int)round(temperature/10.0)*10;
   uca_org_t init_result = init_interface(&l_ip); // init_result is dummy

   final_dec->leakage_feedback(temperature);
   pre_dec->leakage_feedback(temperature);

   double pppm_t[4]    = {1,1,1,1};
   double squencer_passes = x86?2:1;

   set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments, squencer_passes*num_decoder_segments, num_decoder_segments);
   power = pre_dec->power*pppm_t;

   set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments*num_decoded_signals,num_decoder_segments*num_decoded_signals, squencer_passes*num_decoder_segments);
   power = power + final_dec->power*pppm_t;

   double sckRation = g_tp.sckt_co_eff;

   power.readOp.dynamic *= sckRation;
   power.writeOp.dynamic *= sckRation;
   power.searchOp.dynamic *= sckRation;

   double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
   power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction;
 }

 /*
  * Releases everything the constructor allocated: the interface result, the
  * final decoder, and the pre-decoder together with its two blocks and two
  * drivers.
  */
 inst_decoder::~inst_decoder()
 {
     local_result.cleanup();

     delete final_dec;

     // The blocks and drivers reachable through pre_dec are deleted
     // explicitly before pre_dec itself.
     Predec * const predecoder = pre_dec;
     delete predecoder->blk1;
     delete predecoder->blk2;
     delete predecoder->drv1;
     delete predecoder->drv2;
     delete predecoder;
 }