#include "scheduler.h"
#include "stack.h"

/********************************************************************/
/*Writes a one-line identification of this Scheduler object to the
 *given stream: the text "Scheduler: " followed by the object's address.
 *NOTE: no per-node schedule or hardware allocation/assignment details
 *are printed by the current implementation.
 *
 * output - stream that receives the line (terminated with endl).
 */
void Scheduler::display(ostream &output)
{
  //print the object's address as an opaque pointer value
  const void *self = this;
  output << "Scheduler: " << self << endl;
}
/********************************************************************/
/*This is the main low-power scheduling routine.
 *
 *For a fixed supply voltage (currently hard-wired to 5.0; the Vdd sweep
 *is commented out) the routine tries every number of control steps from
 *compute_max_csteps() down to compute_min_csteps(). For each candidate
 *it restores the initial DFG, maps every operation to its fastest
 *module, ASAP-schedules it, optionally retimes/pipelines it, and - if
 *the schedule fits in the candidate number of control steps - builds an
 *initial datapath and runs iterative_improvement() on it. The cheapest
 *solution according to compute_sccost() is kept.
 *
 * flowgraph         - DFG to synthesize. On success it is overwritten
 *                     with the best solution found; if no candidate is
 *                     feasible it is restored to its initial copy.
 * lib               - module library; must be non-NULL.
 * sample_constraint - required sample period. Exactly one of the two
 * laxity              modes must be used: (sample_constraint > 0 and
 *                     laxity == 0), or (sample_constraint == 0 and
 *                     laxity >= 1), in which case the sample period is
 *                     laxity * compute_min_sample_period().
 * scms              - switched capacitance matrices; must be non-NULL.
 *                     Re-extracted when retiming succeeds.
 * use_retiming      - allow Retpipe::retpipe_flowgraph() when the ASAP
 *                     schedule is too long.
 * vec_filename      - forwarded to Scm::extract_scmatrices().
 * max_synthesis_iterations - if > 0, stop after this many synthesis
 *                     runs; 0 means no limit.
 *
 *Returns a Datapath allocated with new (it is not freed here), or NULL
 *if no candidate met the sample period constraint.
 *
 *Side effects: on success writes "<name>_gen.dfg" and "<name>_dfg.vcg"
 *where <name> is flowgraph.name, and prints progress to cout.
 */
Datapath *Scheduler::lpschedule(Dfg &flowgraph, library *lib,
				const float sample_constraint, const float laxity,
				Scm *scms, const Boolean use_retiming,
				char *vec_filename, const int max_synthesis_iterations)
{
  float current_vdd, min_vdd, max_vdd, cur_cost, best_cost;
  int num_csteps, max_csteps, min_csteps, lastcstep;
  float previous_clk, current_clk;
  float fastest_sample_period, sample_period;
  int num_explorations = 0;
  Datapath *cur_dp, *best_dp;
  Dfg *best_flowgraph = NULL;
  Dfg *initial_flowgraph = new Dfg;
  Retpipe rp;
  Schalloc_info best_info(flowgraph.numnodes(), flowgraph.numedges());
  char tmpfilename[MAXSTRLEN];

  assert_force(lib);
  assert_force(scms);
  assert_force((sample_constraint > 0.0 && laxity == 0.0) ||
	       (sample_constraint == 0.0 && laxity >= 1.0));
  //CHAINING IS CURRENTLY NOT IMPLEMENTED
  assert_force(MAX_CHAINING_FACTOR == 1);
  assert(max_synthesis_iterations >= 0);

  //only read after the first feasible solution has set it; initialized
  //here so that it never holds an indeterminate value
  best_cost = 0.0;

  max_vdd = 5.0;
  fastest_sample_period = compute_min_sample_period(flowgraph, lib);
  if(sample_constraint == 0.0) {
    sample_period = fastest_sample_period*laxity;
  } else {
    sample_period = sample_constraint;
  }

  //min_vdd = compute_vdd(5.0, fastest_sample_period, sample_period);
  min_vdd = 1.0;
  //NOTE: with min_vdd fixed at 1.0 this warning cannot currently fire;
  //it is kept for when the compute_vdd() call above is re-enabled
  if(min_vdd > 5.0) {
    cout << "WARNING: specified sample period cannot be met even at 5V" << endl;
    cout << "No scheduling will be performed" << endl;
  }

  cout << "*" << endl;
  cur_dp = best_dp = NULL;

  //keep a pristine copy: every candidate starts from this graph
  initial_flowgraph->copy(flowgraph);


//  for(current_vdd = min_vdd; current_vdd <= max_vdd;
//      current_vdd = current_vdd+0.1)
  {
  //current_vdd = 1.8;
   current_vdd = 5.0;
  //if vdd is skippable, skip it
    //if(vdd_can_be_pruned()) continue;

    max_csteps = compute_max_csteps(*initial_flowgraph, lib, sample_period, current_vdd);
    min_csteps = compute_min_csteps(*initial_flowgraph, lib, sample_period,
				    current_vdd, MAX_CHAINING_FACTOR);
    previous_clk = 0.0;
    for(num_csteps = max_csteps; num_csteps >= min_csteps; num_csteps--) {
      //the clock period is truncated to a whole number
      current_clk = (int)(sample_period/(float)num_csteps);

      //each time we start off with the initial_flowgraph
      flowgraph.copy(*initial_flowgraph);

      //if clock is skippable, skip it
      //if(previous_clk != 0.0 &&
      //clk_can_be_pruned(current_clk, previous_clk, current_vdd, flowgraph, lib)) {
      //cout << "*  Pruning clock, Vdd: " << current_vdd << " csteps: " << num_csteps
      //<< endl;
      //continue;
      //} else {
      //previous_clk = current_clk;
      //}

      //reset the Dfg & Datapath
      flowgraph.reset_scheduling_info();
      flowgraph.reset_allocation_info();
      if(cur_dp) delete cur_dp;
      cur_dp = new Datapath;
      cur_dp->set_vdd(current_vdd);
      cur_dp->set_sample_period(sample_period);
      cur_dp->set_csteps(num_csteps);
#ifdef PHYSICAL
      cur_dp->set_bitwidth(flowgraph.get_bitwidth());
#endif

      //map each operation to the fastest module that can perform it
      fastest_map(flowgraph, lib);

      //perform an asap schedule
      lastcstep = asap_schedule(flowgraph, lib, current_vdd, current_clk);

      //If sample period constraint is not satisfied, attempt pipelining to
      //meet the sample period constraint
      //NOTE: flowgraph is updated to reflect the new positions of delays
      if(use_retiming && lastcstep > num_csteps) {
	if(rp.retpipe_flowgraph(flowgraph, num_csteps)) {
	  //perform an ASAP schedule and update the value of num_csteps
	  lastcstep = asap_schedule(flowgraph, lib, current_vdd, current_clk);
	  //CDFG structure has changed ==> need to recompute SCMs
	  if(lastcstep <= num_csteps) {
	    scms->extract_scmatrices(flowgraph, lib, vec_filename);
	  }
	}
      }

      //perform iterative improvement based synthesis only if the initial
      //solution meets the sample_period
      if(lastcstep <= num_csteps) {
	num_explorations++;
	cout << "Iterative improvement synthesis No.: " << num_explorations
	  << ", Vdd: " << current_vdd << ", Contol Steps: " << num_csteps << endl;
	//perform an one-to-one-allocation to create the Datapath
	initial_allocation(flowgraph, cur_dp, lib);
#ifdef PHYSICAL
	//NOTE(review): 'scm' is not a local of this routine (the parameter
	//is 'scms'); presumably declared elsewhere - confirm
  	cout<<"Fully parallel result:"<<endl;
    	cur_dp->do_floorplan(lib, scm);
#endif

	//perform an iterative improvement to reduce the switched
	//capacitance
	iterative_improvement(flowgraph, lib, cur_dp,
			current_vdd, current_clk, POWER, scms);
#ifdef PHYSICAL
	iterative_reg_improvement(flowgraph, lib, cur_dp,
			current_vdd, current_clk, POWER, scms);
#endif
      } else {
	cout << "OOPS - cannot meet constraint with Vdd: "
		<< current_vdd << " control steps:"
	  << num_csteps << " best csteps possible: " << lastcstep << endl;
	continue;  /*WATCH OUT - A CONTINUE IS BEING USED*/
      }

      //cur_cost = current_vdd*current_vdd*compute_sccost(cur_dp, lib, scms);
      cur_cost = compute_sccost(cur_dp, lib, scms);
      cout << "Cost (Vdd = " << current_vdd << ", csteps = " << num_csteps
	   << ") = " << cur_cost << endl;
      if(!best_dp) {
	//first feasible solution: allocate the holders for the best one
	assert(!best_flowgraph);
	best_flowgraph = new Dfg;
	best_dp = new Datapath;
	copy_flowgraph_and_dp(flowgraph, *cur_dp, *best_flowgraph, *best_dp, lib);
	best_cost = cur_cost;
      } else if( cur_cost < best_cost) {
	//new best: copy the current graph and datapath into the holders
	copy_flowgraph_and_dp(flowgraph, *cur_dp, *best_flowgraph, *best_dp, lib);
	best_cost = cur_cost;
      }

      if(max_synthesis_iterations > 0) {
	if(num_explorations == max_synthesis_iterations) {
	  cout << "Reached limit on max. synthesis iterations" << endl;
	  break; /*BREAK FROM THE FOR EACH CLOCK LOOP*/
	}
      }
    } /*END FOR EACH CANDIDATE csteps*/
/*
    if(max_synthesis_iterations > 0) {
      if(num_explorations == max_synthesis_iterations) {
	break; //BREAK FROM THE FOR EACH VDD LOOP
      }
    }
*/
  } /*END FOR EACH CANDIDATE Vdd*/

  //No candidate met the constraint (or the csteps range was empty):
  //best_flowgraph/best_dp were never allocated, so they must not be
  //dereferenced. Release what we own and report failure to the caller.
  if(!best_dp) {
    assert(!best_flowgraph);
    cout << "ERROR: no schedule meeting the sample period constraint was found" << endl;
    //hand the caller's graph back as it was before the exploration
    flowgraph.copy(*initial_flowgraph);
    if(cur_dp) delete cur_dp;
    delete initial_flowgraph;
    return(NULL);
  }
  assert(best_flowgraph);
  assert(cur_dp);

  //the last candidate tried is not necessarily the best one: load the
  //best solution back into flowgraph and the datapath that is returned
  copy_flowgraph_and_dp(*best_flowgraph, *best_dp, flowgraph, *cur_dp, lib);

  //both suffixes below are 8 characters long; make sure name + suffix +
  //terminating NUL fits into tmpfilename before using strcpy/strcat
  assert_force(strlen(flowgraph.name) + strlen("_gen.dfg") < (size_t)MAXSTRLEN);

  //write out Genesis CDFG file
  strcpy(tmpfilename, flowgraph.name);
  strcat(tmpfilename, "_gen.dfg");
  write_genesis_dfg(flowgraph, *cur_dp, tmpfilename);
  strcpy(tmpfilename, flowgraph.name);
  strcat(tmpfilename, "_dfg.vcg");
  flowgraph.print_vcg(tmpfilename);

  delete best_dp;
  delete best_flowgraph;
  assert(initial_flowgraph);
  delete initial_flowgraph;

  return(cur_dp);
}
/********************************************************************/
/*This routine is used to compute the minimum sample period which is
 *possible for the given Dfg and library. This is multiplied by the
 *laxity factor to give the sampling period. The computation proceeds
 *as follows. Operations are mapped to the fastest functional units,
 *a clock period equal to the minimum of all the functional unit stage
 *delays (at 5.0 V, truncated to a whole number) is chosen, an ASAP
 *schedule is performed, and the sample period thus obtained
 *(number of control steps * clock period) is returned.
 *
 * flowgraph - DFG to analyse. Its scheduling and allocation info is
 *             reset and then overwritten by fastest_map() and
 *             asap_schedule().
 * lib       - module library; must be non-NULL.
 *
 *Returns the minimum sample period.
 */
float Scheduler::compute_min_sample_period(Dfg &flowgraph, library *lib)
{
  int i;
  NODEPTR node;
  libelement *libel;
  float tmp_clock, retval;
  int maxcstep;

  assert_force(lib);

  //clear up the schedule & module selection info from the DFG
  flowgraph.reset_scheduling_info();
  flowgraph.reset_allocation_info();

  fastest_map(flowgraph, lib);
  tmp_clock = HUGEFLOAT;
  FOR_EACH_NODEINDEX(flowgraph, i) {
    node = flowgraph.get_nthnode(i);
    assert(node);
    libel = lib->get_nthelement(node->get_moduletype());
    //every mapped node must refer to an existing library element
    assert(libel);
    //query the delay once; double holds a float or double result exactly
    const double stage_delay = libel->get_stage_delay(5.0);
    if(stage_delay < tmp_clock) {
      //the clock period is kept as a whole number
      tmp_clock = (int) stage_delay;
    }
  }
  //a stage delay below 1 truncates to a zero clock period, which cannot
  //be scheduled against
  assert(tmp_clock > 0.0);
  maxcstep = asap_schedule(flowgraph, lib, 5.0, tmp_clock);

  retval = maxcstep*tmp_clock;

  return(retval);
}
/********************************************************************/
/*Determines the supply voltage at which a circuit whose total delay is
 *orig_delay at reference_vdd would instead have a delay of (just under)
 *scaled_delay. Delay is modelled as k*vdd/((vdd-V_TH)^2), with k fitted
 *so that the model yields orig_delay at reference_vdd.
 *
 * - scaled_delay == orig_delay: reference_vdd is returned unchanged.
 * - scaled_delay >  orig_delay: vdd is lowered in steps of 0.1 until the
 *   modelled delay exceeds scaled_delay; the last step is then undone.
 * - scaled_delay <  orig_delay: vdd is raised in steps of 0.1 until the
 *   modelled delay is no longer greater than scaled_delay.
 *
 *All three arguments must be positive, and reference_vdd must be a
 *multiple of VDD_GRANULARITY (both asserted).
 *NOTE(review): the step is the literal 0.1 while the precondition uses
 *VDD_GRANULARITY - presumably the two are meant to agree; confirm.
 */
float Scheduler::compute_vdd(const float reference_vdd, const float orig_delay,
			     const float scaled_delay)
{
  assert(reference_vdd > 0.0 && orig_delay > 0.0 && scaled_delay > 0.0);
  assert(reference_vdd/VDD_GRANULARITY ==  (float)((int)(reference_vdd/VDD_GRANULARITY)));

  //nothing to scale
  if(scaled_delay == orig_delay) {
    return(reference_vdd);
  }

  //proportionality constant of the delay model
  const float k = (orig_delay*(reference_vdd-V_TH)*(reference_vdd-V_TH))/reference_vdd;

  float vdd = reference_vdd;
  float delay = orig_delay;

  if(scaled_delay > orig_delay) {
    //slack available: keep lowering the voltage while the delay still fits
    for( ; delay <= scaled_delay; delay = (k*vdd)/((vdd-V_TH)*(vdd-V_TH))) {
      vdd -= 0.1;
      assert(vdd > 2*V_TH);
    }
    //the last step overshot the target delay - take it back
    vdd += 0.1;
  } else {
    //too slow at the reference voltage: raise it until the delay fits
    for( ; delay > scaled_delay; delay = (k*vdd)/((vdd-V_TH)*(vdd-V_TH))) {
      vdd += 0.1;
      assert(vdd > 2*V_TH);
    }
  }

  return(vdd);
}
/********************************************************************/
/*Upper bound on the number of control steps worth exploring for the
 *given DFG. It corresponds to the smallest clock period considered,
 *which is taken to be the smallest stage delay (at the given vdd) of
 *any library element available at the DFG's bitwidth. Finer clocks are
 *possible in principle but lead to excessive power dissipation in the
 *clock network, Datapath registers, and controller. The result is
 *FLOOR(sample_period / smallest stage delay).
 *
 * flowgraph     - only its bitwidth is used, to select library elements.
 * lib           - module library; must be non-NULL.
 * sample_period - must be > 0.
 * vdd           - must lie in (0, 5.0].
 */
const int Scheduler::compute_max_csteps(Dfg &flowgraph, library *lib,
			      const float sample_period, const float vdd)
{
  List_iterator<libelement *> scan;

  assert_force(lib);
  assert_force(sample_period > 0.0);
  assert_force(vdd > 0.0 && vdd <= 5.0);

  //the list returned by get_all_elements() is released below
  List_ar<libelement *> *all_elements = lib->get_all_elements(flowgraph.bitwidth);

  //scan for the fastest stage delay among all candidates
  float fastest_delay = HUGEFLOAT;
  FOR_EACH_LISTNODE(*all_elements, scan) {
    libelement *candidate = scan.get_item();
    assert(candidate);
    fastest_delay = MIN(fastest_delay, candidate->get_stage_delay(vdd));
  }
  assert(fastest_delay > 0.0);
  delete all_elements;

  return(FLOOR(sample_period/fastest_delay));
}
/********************************************************************/
/*Lower bound on the number of control steps worth exploring for the
 *given DFG, i.e. the largest clock period considered. In the extreme a
 *whole DFG iteration could be one combinational control step, but that
 *rules out hardware sharing and causes excessive glitching power. Large
 *clock periods only pay off WITH CHAINING: with a chaining factor of k,
 *at most k operations share one control step.
 *CURRENTLY THE CHAINING FACTOR MUST BE 1 (asserted), i.e. NO CHAINING,
 *so the result is the number of levels in the levelized DFG
 *(flowgraph.maxlevel + 1).
 *
 * flowgraph           - must already be levelized (asserted).
 * lib                 - must be non-NULL (not otherwise used here).
 * sample_period       - must be > 0 (not otherwise used here).
 * vdd                 - must lie in (0, 5.0] (not otherwise used here).
 * max_chaining_factor - must be 1.
 */
const int Scheduler::compute_min_csteps(Dfg &flowgraph, library *lib, const float sample_period,
			      const float vdd, const int max_chaining_factor)
{
  assert_force(lib);
  assert_force(sample_period > 0.0);
  assert_force(max_chaining_factor == 1);
  assert(vdd > 0.0 && vdd <= 5.0);

  //the DFG must have been levelized: one level-list slot per node, and
  //the deepest level must index into that table
  assert(flowgraph.levellists.get_size() == flowgraph.numnodes());
  assert(flowgraph.maxlevel < flowgraph.levellists.get_size());

  //each operation occupies at least one control step, so one iteration
  //needs at least as many control steps as there are levels
  return((1 + flowgraph.maxlevel)/max_chaining_factor);
}
/********************************************************************/
/*Decides whether the clock period current_clk may be skipped, given
 *previous_clk, the last clock period for which a Datapath was actually
 *synthesized. For every DFG node, every library element that can
 *implement the node's function at the DFG bitwidth is checked: if its
 *number of control steps per stage differs between the two clock
 *periods for even one element, pruning is NOT allowed.
 *IMPORTANT: previous_clk must not exceed current_clk (asserted).
 *
 *Returns T if the clock can be pruned, F otherwise.
 */
Boolean Scheduler::clk_can_be_pruned(float current_clk, float previous_clk,
				     float vdd, Dfg &flowgraph, library *lib)
{
  int idx;
  List_iterator<libelement *> scan;

  assert(lib && current_clk > 0.0 && previous_clk > 0.0
	 && vdd > 0.0 && vdd <= 5.0);
  assert(current_clk >= previous_clk);

  FOR_EACH_NODEINDEX(flowgraph, idx) {
    NODEPTR op = flowgraph.get_nthnode(idx);
    assert(op && op->get_address() == idx);

    //all library elements able to implement this operation
    List_ar<libelement *> *candidates =
      lib->get_choices(op->get_func(), flowgraph.get_bitwidth());

    Boolean same_csteps = T;
    FOR_EACH_LISTNODE(*candidates, scan) {
      libelement *elem = scan.get_item();
      assert(elem);
      if(elem->get_stage_csteps(vdd, current_clk) !=
	 elem->get_stage_csteps(vdd, previous_clk)) {
	//this element behaves differently under the two clocks
	same_csteps = F;
	break;
      }
    }
    //the choice list is released before leaving, whatever the outcome
    delete candidates;

    if(same_csteps == F) {
      return(F);
    }
  }

  return(T);
}
/********************************************************************/
/*Iteratively improves a given initial Datapath.
 *
 *The routine runs up to MAX_NUM_PASSES passes and stops early after a
 *pass that brings no improvement. Each pass copies the best solution so
 *far (flowgraph, dp) into a scratch DFG/Datapath, applies moves through
 *generate_move() until it reports that none are left, and then compares
 *the cost before and after. If the scratch solution is cheaper it is
 *copied back into flowgraph and dp.
 *
 * flowgraph, dp - best solution; updated in place when a pass improves it.
 * lib           - module library; must be non-NULL.
 * current_vdd, current_clk - must be > 0 (only asserted here).
 * obj           - POWER: cost is compute_sccost(); otherwise the cost
 *                 is compute_areacost() (obj is asserted to be AREA).
 * scms          - forwarded to generate_move() and compute_sccost().
 */
void Scheduler::iterative_improvement(Dfg &flowgraph, library *lib, Datapath * &dp,
			     const float current_vdd, const float current_clk,
			     const objective &obj, Scm *scms)
{
  Schalloc_info trial_info(flowgraph.numnodes(),flowgraph.numedges());
  Array<Boolean> nodes_visited(flowgraph.numnodes());
  Array<Boolean> edges_visited(flowgraph.numedges());
  float cost_before, cost_after;
  Boolean improved = T;
  int pass = 0;

  assert(lib && dp && current_vdd > 0.0 && current_clk > 0.0);
  //check_dfg_and_datapath(flowgraph, *dp);

  //flowgraph/dp hold the best solution, the trial_* objects hold the
  //solution currently being modified
  Dfg *trial_flowgraph = new Dfg;
  Datapath *trial_dp = new Datapath;

  while(improved && pass < MAX_NUM_PASSES) {
    ++pass;
    improved = F;

    //every pass restarts from the best solution seen thus far
    copy_flowgraph_and_dp(flowgraph, *dp, *trial_flowgraph, *trial_dp, lib);

    nodes_visited.resize(trial_flowgraph->numnodes());
    nodes_visited.reset(F);
    edges_visited.resize(trial_flowgraph->numedges());
    edges_visited.reset(F);

    trial_info.extract_info(*trial_flowgraph);

    //cost of the starting point of this pass
    if(obj == POWER) {
      cost_before = compute_sccost(trial_dp, lib, scms);
    } else {
      cost_before = compute_areacost(*trial_flowgraph, trial_dp, lib);
    }

    //generate and apply moves on the trial copy until none are left
    //(this used to be bounded by MAX_NUM_MOVES_PER_PASS)
    while(generate_move(*trial_flowgraph, lib,
			trial_dp, trial_info, nodes_visited,
			edges_visited, scms, obj)) {
      /*nothing else to do per move*/
    }

    //cost after the whole series of moves; this evaluation used to sit
    //inside the per-move loop and was moved out of it
    if(obj == POWER) {
      cost_after = compute_sccost(trial_dp, lib, scms);
    } else {
      assert(obj == AREA);
      cost_after = compute_areacost(*trial_flowgraph, trial_dp,lib);
    }

    if(cost_after < cost_before) {
      //the trial beats the previous best: promote it
      trial_flowgraph->copy_schalloc_info(trial_info);
      copy_flowgraph_and_dp(*trial_flowgraph, *trial_dp,
			    flowgraph, *dp, lib);
      cost_before = cost_after;
      cout<<"A series of move implemented"<<endl;
      improved = T;
    } else {
      cout<<" No improvement after a series of move"<<endl;
      cout<<"Best cots is "<<cost_before<<" New cost is "
	  <<cost_after<<"pass # "<<pass<<endl;
    }
  } /*END WHILE*/

/*
  if(obj == AREA) {
    //Implement register sharing
    copy_flowgraph_and_dp(flowgraph, *dp, *cur_flowgraph, *cur_dp, lib);
    cur_dfg_info.extract_info(*cur_flowgraph);
    reg_share(*cur_flowgraph, cur_dp, cur_dfg_info, lib);
    cur_flowgraph->copy_schalloc_info(cur_dfg_info); 
    copy_flowgraph_and_dp(*cur_flowgraph, *cur_dp, flowgraph, *dp, lib);
  }
  */
  delete trial_flowgraph;
  delete trial_dp;
}
/********************************************************************/

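/*This routine performs iterative improvement on a given initial Datapath
 *using register moves (generate_reg_move), with at most
 *MAX_NUM_MOVES_PER_PASS moves per pass. Compiled only when PHYSICAL is
 *defined.
 */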
#ifdef PHYSICAL
void Scheduler::iterative_reg_improvement(Dfg &flowgraph, library *lib, Datapath * &dp,
                             const float current_vdd, const float current_clk,
                             const objective &obj, Scm *scms)
{
  //Runs up to MAX_NUM_PASSES passes, stopping after the first pass that
  //brings no improvement. Each pass copies the best solution
  //(flowgraph, dp) into scratch objects, applies at most
  //MAX_NUM_MOVES_PER_PASS moves from generate_reg_move(), and copies the
  //scratch solution back if its cost (compute_sccost() for POWER,
  //compute_areacost() otherwise) went down.
  Schalloc_info trial_info(flowgraph.numnodes(),flowgraph.numedges());
  Array<Boolean> nodes_visited(flowgraph.numnodes());
  Array<Boolean> edges_visited(flowgraph.numedges());
  float cost_before, cost_after;
  Boolean improved = T;
  int pass = 0;

  assert(lib && dp && current_vdd > 0.0 && current_clk > 0.0);
  //check_dfg_and_datapath(flowgraph, *dp);

  //flowgraph/dp hold the best solution, the trial_* objects hold the
  //solution currently being modified
  Dfg *trial_flowgraph = new Dfg;
  Datapath *trial_dp = new Datapath;

  while(improved && pass < MAX_NUM_PASSES) {
    ++pass;
    improved = F;

    //every pass restarts from the best solution seen thus far
    copy_flowgraph_and_dp(flowgraph, *dp, *trial_flowgraph, *trial_dp, lib);

    nodes_visited.resize(trial_flowgraph->numnodes());
    nodes_visited.reset(F);
    edges_visited.resize(trial_flowgraph->numedges());
    edges_visited.reset(F);

    trial_info.extract_info(*trial_flowgraph);

    //cost of the starting point of this pass
    if(obj == POWER) {
      cost_before = compute_sccost(trial_dp, lib, scms);
    } else {
      cost_before = compute_areacost(*trial_flowgraph, trial_dp, lib);
    }

    //apply register moves on the trial copy until the per-pass budget
    //is used up or no move is left
    int moves_done = 0;
    while(moves_done < MAX_NUM_MOVES_PER_PASS &&
          generate_reg_move(*trial_flowgraph, lib,
                            trial_dp, trial_info, nodes_visited,
                            edges_visited, scms, obj)) {
      moves_done++;
    }

    //cost after the whole series of moves
    if(obj == POWER) {
      cost_after = compute_sccost(trial_dp, lib, scms);
    } else {
      assert(obj == AREA);
      cost_after = compute_areacost(*trial_flowgraph, trial_dp,lib);
    }

    if(cost_after < cost_before) {
      //the trial beats the previous best: promote it
      trial_flowgraph->copy_schalloc_info(trial_info);
      copy_flowgraph_and_dp(*trial_flowgraph, *trial_dp,
                            flowgraph, *dp, lib);
      cost_before = cost_after;
      improved = T;
    }
  } /*END WHILE*/

  delete trial_flowgraph;
  delete trial_dp;
}
#endif

/********************************************************************/
/*This routine computes the value of the cost function for a Datapath
 *that implements the given Dfg. The cost estimation is based on the
 *switched capacitance matrices: the switched capacitance of every
 *functional unit and of every storage unit is summed. Multiplexers and
 *wiring are CURRENTLY IGNORED.
 *Currently assumes that (i) No CONDITIONALS are present, and (ii) The
 *list of operations mapped to each fu (variables mapped to each register)
 *is sorted in increasing order of lifetimes.
 *
 * dp   - datapath to evaluate; must be non-NULL.
 * lib  - module library; must be non-NULL (only used when PHYSICAL is
 *        defined, for the final scaling).
 * scms - switched capacitance matrices; must be non-NULL.
 *
 *Returns the accumulated cost.
 */
float Scheduler::compute_sccost(Datapath *dp, library *lib, Scm *scms)
{
  FUPTR fu;
  STORPTR reg;
  List_iterator<FUPTR> fuscan;
  List_iterator<Storage_unit *> regscan;
  float total = 0.0;

  assert(lib && dp && scms);

  //First estimate SC for functional units
  FOR_EACH_LISTNODE(dp->get_functional_units(), fuscan) {
    fu = fuscan.get_item();
    assert(fu);
#ifdef PHYSICAL
    //NOTE(review): 'scm' is not a local of this routine (the parameter
    //is 'scms'); presumably declared elsewhere - confirm
    total +=fu->compute_reduced_sc(scm);    
#ifdef _LEAKAGE_IN_
    total +=fu->get_libelement()->get_leakage();    
#endif    
#else    
    total += scms->get_total_sc(fu->get_libelement()->get_address(), fu->get_operations());
#endif    

  }

  //Next, estimate SC for storage units
  FOR_EACH_LISTNODE(dp->get_storage_units(), regscan) {
    //make sure the entry exists BEFORE it is dereferenced by the id check
    assert(regscan.get_item());
    //only registers are expected here; the cast below relies on it
    assert(!strcmp(regscan.get_item()->get_id(),"Register"));
    reg = (STORPTR) regscan.get_item();
    total += scms->get_total_sc(reg->get_variables());
  }
  //SC estimation for multiplexers - CURRENTLY IGNORED
  //SC estimation for wiring - CURRENTLY IGNORED
#ifdef PHYSICAL
   total = total*FROM_35_to_18 * lib->get_gate_scale_down();
#endif   
//   cout<<"TOTAL = "<<total<<endl;
  return(total);
}
/********************************************************************/
/*Expected gain of a class A move: the functional unit move.fu keeps
 *its operations but is re-mapped from move.old_libelement to
 *move.new_libelement. Since such a move causes NO CHANGE IN THE
 *FUNCTIONAL UNIT OR REGISTER ALLOCATION, the gain - the EXPECTED
 *REDUCTION IN THE COST FUNCTION - is the switched capacitance of the
 *unit's operations under the old element minus that under the new one.
 *
 * dp, scms - must be non-NULL (dp is not otherwise used here).
 * move     - the move to evaluate.
 */
float Scheduler::compute_scgain(Datapath *dp, Class_a_move &move, Scm *scms)
{
  assert(dp && scms);

  float gain = 0.0;

  //cost paid today, with the old library element ...
  gain = gain + scms->get_total_sc(move.old_libelement->get_address(), move.fu->get_operations());
  //... minus the cost that would be paid with the new one
  gain = gain - scms->get_total_sc(move.new_libelement->get_address(), move.fu->get_operations());

  return(gain);
}
/********************************************************************/
/*Expected switched capacitance gain of a Class_b_fu_move.
 *
 *For a merging move (move.splitting false) the gain is
 *   SC(fu1) + SC(fu2) - SC(merged operations on fu1's library element)
 *where the merged operation list interleaves the operations of fu1 and
 *fu2 by birth time (on a tie the fu1 operation goes first).
 *Splitting moves are not supported (assert_force(0)).
 *When _LEAKAGE_IN_ is defined the leakage of fu1's library element is
 *added to the result.
 *
 * dp, scms - must be non-NULL (dp is not otherwise used here).
 */
float Scheduler::compute_scgain(Datapath *dp, Class_b_fu_move &move, Scm *scms)
{
  float gain = 0.0;
  List_iterator<NODEPTR> scan1, scan2;
  List_ar<NODEPTR> merged;

  assert(dp && scms);

  if(!move.splitting) {
    //what the two units cost while they are separate
    gain += scms->get_total_sc(move.fu1->get_libelement()->get_address(),move.fu1->get_operations());
    gain += scms->get_total_sc(move.fu2->get_libelement()->get_address(),move.fu2->get_operations());

    //merge the two operation lists into one, ordered by current birth times
    scan1.start(move.fu1->get_operations());
    scan2.start(move.fu2->get_operations());

    //phase 1: both lists still have entries - take the earlier birth
    while(scan1.not_done() && scan2.not_done()) {
      NODEPTR op1 = scan1.get_item();
      NODEPTR op2 = scan2.get_item();

      if(op1->get_birth() < op2->get_birth()) {
	merged.append(op1);
	scan1.increment();
      } else if(op2->get_birth() < op1->get_birth()) {
	merged.append(op2);
	scan2.increment();
      } else {
	//equal births - arbitrarily take the fu1 operation first
	merged.append(op1);
	merged.append(op2);
	scan1.increment();
	scan2.increment();
      }
    }

    //phase 2: at most one of the lists has entries left - drain it
    while(scan2.not_done()) {
      NODEPTR op2 = scan2.get_item();
      assert(op2);
      merged.append(op2);
      scan2.increment();
    }
    while(scan1.not_done()) {
      NODEPTR op1 = scan1.get_item();
      assert(op1);
      merged.append(op1);
      scan1.increment();
    }

    //no operation may have been lost or duplicated by the merge
    assert_force(merged.get_size() == move.fu1->get_operations().get_size() + move.fu2->get_operations().get_size() );

    //what the single merged unit would cost
    gain -= scms->get_total_sc(move.fu1->get_libelement()->get_address(), merged);
  } else {
    //gain = cost of move.fu1 - cost of split_operations - cost of remaining operations on move.fu1
    assert_force(0);
  }
#ifdef _LEAKAGE_IN_
  gain +=move.fu1->get_libelement()->get_leakage();
#endif  
  return(gain);
}

#ifdef _OLD_BUG_ 
float Scheduler::compute_scgain(Datapath *dp, Class_b_reg_move &move, Scm *scms)
{
  //Expected switched capacitance gain of a Class_b_reg_move. For a
  //merging move the gain is SC(su1) + SC(su2) - SC(merged variables),
  //where the merged list interleaves the variables of su1 and su2 by
  //birth time (on a tie the su1 variable goes first). Splitting moves
  //are not supported (assert_force(0)).
  float gain = 0.0;
  List_iterator<EDGEPTR> scan1, scan2;
  List_ar<EDGEPTR> merged;

  assert(dp && scms);

  if(!move.splitting) {
    //what the two storage units cost while they are separate
    gain += scms->get_total_sc(move.su1->get_variables());
    gain += scms->get_total_sc(move.su2->get_variables());

    //merge the two variable lists into one, ordered by current birth
    //times (states)
    scan1.start(move.su1->get_variables());
    scan2.start(move.su2->get_variables());

    //phase 1: both lists still have entries - take the earlier birth
    while(scan1.not_done() && scan2.not_done()) {
      EDGEPTR var1 = scan1.get_item();
      EDGEPTR var2 = scan2.get_item();

      if(var1->get_birth() < var2->get_birth()) {
        merged.append(var1);
        scan1.increment();
      } else if(var2->get_birth() < var1->get_birth()) {
        merged.append(var2);
        scan2.increment();
      } else {
        //equal births - arbitrarily take the su1 variable first
        merged.append(var1);
        merged.append(var2);
        scan1.increment();
        scan2.increment();
      }
    }

    //phase 2: at most one of the lists has entries left - drain it
    while(scan2.not_done()) {
      EDGEPTR var2 = scan2.get_item();
      assert(var2);
      merged.append(var2);
      scan2.increment();
    }
    while(scan1.not_done()) {
      EDGEPTR var1 = scan1.get_item();
      assert(var1);
      merged.append(var1);
      scan1.increment();
    }

    //no variable may have been lost or duplicated by the merge
    assert_force(merged.get_size() == move.su1->get_variables().get_size() + move.su2->get_variables().get_size() );

    //what the single merged storage unit would cost
    gain -= scms->get_total_sc(merged);
  } else {
    assert_force(0);
  }
  return(gain);
}
#endif
/********************************************************************/
#ifdef _SCALP_
/*Builds the controller FSM for a datapath (compiled only when _SCALP_
 *is defined).
 *
 *The FSM is created with datapath->get_csteps() as its constructor
 *argument. Two groups of outputs are then added:
 * 1. For every storage unit whose library element has a load enable, an
 *    output "X<address>_le" is created, wired through a new Net to the
 *    unit's LE port, and set_active_state() is called for the birth
 *    of every variable held by the register.
 * 2. For every interconnect unit (treated as a Mux), select lines are
 *    created through connect_to_mux_select(), and for every operation
 *    (mux feeding a Functional_unit) or variable (mux feeding a
 *    Register) the select value for its birth state is programmed under
 *    the name "X<address>_select<position>".
 *
 * datapath - datapath to control; must be non-NULL. Nets created here
 *            are added to it.
 *
 *Returns the FSM allocated with new (it is not freed here).
 */
FSM* Scheduler::create_fsm(Datapath* datapath)
{
  FRITS_SET_MESSAGE("get_fsm");
  assert(datapath);
  FSM* fsm;
  mem_ok(fsm = new FSM(datapath->get_csteps()));

  // Add FSM outputs corresponding to load_enables of datapath registers
  List_iterator<Storage_unit *> su_iterator;
  List_iterator<EDGEPTR> var_iterator;
  STring output_name;	// NOTE: this is FRITS-specific
  // itoa() target for element addresses. Sized for any 32-bit int in
  // base 10 (sign + 10 digits + NUL); the former 5-byte buffer
  // overflowed for addresses >= 10000.
  char address[16];
  Datapath_element* dpelement;
  Net* net;
  FOR_EACH_LISTNODE(datapath->get_storage_units(), su_iterator) {
    // note: dpelement is assigned inside the condition
    if ( ((Storage_library_element*)(dpelement = su_iterator.get_item())->
	get_libelement())->has_load_enable() ) {
      output_name.set("X");
      output_name.cat(itoa(dpelement->get_address(), address, 10));
      output_name.cat("_le");
      fsm->add_output(output_name.get());
      // create a corresponding net and hook it up to dpelement and fsm
      mem_ok(net = new Net(output_name.get()));
      dpelement->connect_to_net(net, LE);
      fsm->connect_to_net(net, output_name.get()); 
      datapath->add_net(net);
      // Set control values for the register load enables
      FOR_EACH_LISTNODE( ((Register*)dpelement)->get_variables(), var_iterator) {
        fsm->set_active_state(var_iterator.get_item()->get_birth());
      }  /* FOR_EACH_.. */
    } /* if ( has_load_enable ) */
  } /* FOR_EACH_.. */

  // Add FSM outputs corresponding to select signals of datapath muxes
  // Connect selects to FSM's outputs and set values for FSM's outputs
  List_iterator<Interconnect_unit *> iu_iterator;
  Mux *mux;
  Datapath_element* mux_dpelement;  // what mux is feeding
  int port_index;  // port index of the port the output of mux is connected to
  int position;  // mux input line #, 0-based for convenience of numbering selects
  NODEPTR node;
  EDGEPTR edge;
  int i;	// universal counter
  STring select_name;
  int mux_select_bitwidth;
  // itoa() target for mux input positions. Sized like 'address'; the
  // former 3-byte buffer overflowed for positions >= 100.
  char index[16];
  FOR_EACH_LISTNODE(datapath->get_interconnect_units(), iu_iterator) {
    mux = (Mux*)iu_iterator.get_item(); 

    // Get the output net of the iu and determine what it is connected to
    Net* output = mux->get_net(MUXOUT);
    assert(output->get_port_connections().get_size() == 2);
    List_iterator<PORTCONNPTR> pc_iterator(output->get_port_connections());
    // NOTE(review): the first of the two connections is taken to be the
    // element fed by the mux - confirm the ordering guarantee
    Port_connection* pc = pc_iterator.get_item(); // equivalent to [0]
    mux_dpelement = pc->get_datapath_element();
    port_index = pc->get_port_number();

    // tristate drivers get one select line per input, a mux tree gets
    // ceil(log2(#inputs)) select lines
    if ( ((Interconnect_library_element*)(dpelement = iu_iterator.get_item())->
	get_libelement())->is_tristate() ) { 
      mux_select_bitwidth = mux->get_number_of_inputs();
    } else {
      mux_select_bitwidth = CEIL(log((double)(mux->get_number_of_inputs()))/log((double)2.0));
    } /* if ( ! ..is_tristate ) */

    // Create selects for muxes/FSM outputs  and connect them
    for ( i = 0; i < mux_select_bitwidth; i++ ) {
      datapath->add_net(fsm->connect_to_mux_select(i, (Interconnect_unit*&)mux));
    } /* for */

  // Set values for the select lines of the muxes

    if ( !strcmp(mux_dpelement->get_id(), "Functional_unit") ) {
      // Functional_unit's Mux
      List_iterator<NODEPTR> node_iterator;
      FOR_EACH_LISTNODE(((Functional_unit*)mux_dpelement)->get_operations(), node_iterator) {
        // node corresponding to an operation
	node = node_iterator.get_item();  
        // edge corresponding to the mux path
	edge = (*node->get_input_edges())[port_index];  
	Storage_unit* su = edge->get_storage_unit();
	net = mux->search_input_net(su);  // net corresponding to the edge
	assert(net);
	position = mux->find_input_position(net);
	assert(position >= 0);  // net must be found

	// select fsm's output based on the position of the net
        output_name.set("X");
        output_name.cat(itoa(mux->get_address(), address, 10));
        output_name.cat("_select");
        output_name.cat(itoa(position, index, 10));

#ifdef _OLD_LIBRARY_
        if ( ((Interconnect_library_element*)mux->get_libelement())->
	       is_tristate() ) 
#else
        if ( ((Interconnect_library_element*)mux->get_libelement()->
		get_frits_libelement())->is_tristate() ) 
#endif /* _OLD_LIBRARY_ */		
	 // two input bus driver
	{  fsm->set_active_state(node->get_birth(), output_name.get()); }
	else 	// mux tree
	{  fsm->set_vector_active_state(node->get_birth(), position, mux_select_bitwidth, output_name.get()); }

      } /* FOR_EACH operation */
    } else if ( !strcmp(mux_dpelement->get_id(), "Register") ) {
      // Register's Mux
      List_iterator<EDGEPTR> edge_iterator;
      FOR_EACH_LISTNODE(((Register*)mux_dpelement)->get_variables(), edge_iterator) {
	edge = edge_iterator.get_item();
	// the first source node of the variable identifies the producer
	node = (edge->get_source_nodes())[0];
	Functional_unit* fu = node->get_functional_unit();
	net = mux->search_input_net(fu);
	assert(net);
	position = mux->find_input_position(net);
	assert(position >= 0);  // net must be found

	// select fsm's output based on the position of the net
        output_name.set("X");
        output_name.cat(itoa(mux->get_address(), address, 10));
        output_name.cat("_select");
        output_name.cat(itoa(position, index, 10));

#ifdef _OLD_LIBRARY_
        if ( ((Interconnect_library_element*)mux->get_libelement())->
	       is_tristate() ) 
#else
        if ( ((Interconnect_library_element*)mux->get_libelement()->
		get_frits_libelement())->is_tristate() ) 
#endif /* _OLD_LIBRARY_ */		
	 // two input bus driver
	{  fsm->set_active_state(edge->get_birth(), output_name.get()); }
	else 	// mux tree
	{  fsm->set_vector_active_state(edge->get_birth(), position, mux_select_bitwidth, output_name.get()); }

      } /* FOR_EACH variable */
    } else {
	error("A Mux # %d is feeding something it's not supposed to", mux->get_address());
    } /* if Functional_unit/Register */
  } /* FOR_EACH_.. */
  return fsm;
}
#endif
/********************************************************************/
