/********************************************************************/
/*          FILE: sched_retime.C                                    */
/********************************************************************/
#include "scheduler.h"
#include "/u/cplex/cpxdefs.inc"
/********************************************************************/
//Computes the retiming W and D matrices for a Dfg.
//For every ordered pair of flowgraph nodes (u,v) connected by a path:
//  wvalues[u][v] = minimum number of delays over all paths u ---> v
//  dvalues[u][v] = maximum total node weight (u and v included) over the
//                  paths u ---> v that carry exactly wvalues[u][v] delays
//Pairs with no connecting path keep W = HUGEINT and D = 0.
//
//flowgraph: graph whose nodes have birth/death set; a node's weight is
//           get_death() - get_birth() and must be positive.
//wvalues, dvalues: output matrices; the caller must have sized them to
//           (numnodes+1) x (numnodes+1). Both are reset here; index
//           numnodes is never assigned a path value by this routine.
void Scheduler::compute_WD(Dfg &flowgraph, Matrix<int> &wvalues,
			   Matrix<int> &dvalues)
{
  int i, j, k;
  int delaycount;
  EDGEPTR edge, loopout_edge;
  NODEPTR node, innode;
  List_iterator<NODEPTR> nodescan;
  //Scratch storage kept across calls to avoid reallocating every time
  static Matrix<int> arcs;
  static Array<int> nodeweights;

  const int numnodes = flowgraph.numnodes();

  assert(numnodes > 0);
  assert(wvalues.get_xcount() == numnodes+1 &&
	 wvalues.get_ycount() == numnodes+1);
  assert(dvalues.get_xcount() == numnodes+1 &&
	 dvalues.get_ycount() == numnodes+1);
  nodeweights.resize(numnodes);

  if(arcs.get_xcount() != numnodes ||
     arcs.get_ycount() != numnodes) {
    arcs.resize(numnodes, numnodes);
  }

  wvalues.reset(HUGEINT);
  dvalues.reset(0);
  //arcs[u][v] != 0 means "some path u ---> v is known"; it guards every
  //read of wvalues so that HUGEINT entries are never added together.
  arcs.reset(0);

  //Compute node weights for the circuit graph.
  //Node weights are equal to the operation lifetimes
  FOR_EACH_NODEINDEX(flowgraph, i) {
    node = flowgraph.get_nthnode(i);
    assert(node);
    nodeweights[i] = node->get_death() - node->get_birth();
    assert(nodeweights[i] > 0);
    edge = node->get_output_edge();
    assert(edge);

    //ANAND 5/7/96: CONSERVATIVE ASSUMPTION (currently DISABLED)
    //The weight of a node that feeds a PO or LOOPOUT could be taken as
    //death-birth + 1, to account for the loopout-to-loopin register
    //transfer (for a LOOPOUT) or the observation cycle (for a PO).
    //
    //That extra cycle can be avoided by:
    //(i) Forcing each loopout to be assigned to the same register as its
    //    loopin, and
    //(ii) Overlapping the observation cycle for one iteration with the
    //     first cycle of the next iteration (this imposes some register
    //     sharing restrictions between some PIs and POs).
    //
    //Those restrictions are NOT imposed in SCALP, so adding the cycle
    //would be the conservative choice during retiming/pipelining: other
    //operations that need the output of the operation generating a
    //PO/LOOPOUT would also wait 1 extra cycle. The adjustment is left
    //commented out:
    //if(edge->is_po() || edge->is_loopout()) {
    //nodeweights[i]++;
    //}
  }

  //Seed the matrices with the direct arcs of the retiming graph
  FOR_EACH_EDGEINDEX(flowgraph, i) {
    edge = flowgraph.get_nthedge(i);
    assert(edge);
    if(edge->is_pi() || edge->is_constant()) {
      /*nothing done in this case*/
    } else if(edge->is_loopin()) {
      //Walk backward through the chain of delays until an edge driven by
      //a node is found (innode != NULL) or a PI is reached. delaycount is
      //the number of delays crossed.
      loopout_edge = edge->get_loopout_link();
      assert(loopout_edge);
      delaycount = 1;
      while(!(innode = loopout_edge->input_node())) {
	assert(loopout_edge->is_loopin() || loopout_edge->is_pi());
	if(loopout_edge->is_pi()) {
	  break;
	  //break from the while loop, and continue with the FOR loop;
	} else if(loopout_edge->is_loopin()) {
	  loopout_edge = loopout_edge->get_loopout_link();
	  delaycount++;
	}
      }
      //innode is NULL here when the chain ended at a PI: no arc is added
      if(innode) {
	const int src = innode->get_address();
	FOR_EACH_FANOUT_NODE(edge, nodescan) {
	  node = nodescan.get_item();
	  assert(node);
	  const int dst = node->get_address();
	  //W(u,v) is a minimum over paths. A parallel arc src->dst seen
	  //earlier (possibly with fewer delays) must not be overwritten by
	  //one with more delays. D of a direct arc is the same for all
	  //parallel arcs, so only W decides.
	  if(!arcs[src][dst] || delaycount < wvalues[src][dst]) {
	    wvalues[src][dst] = delaycount;
	    dvalues[src][dst] = nodeweights[src];
	    arcs[src][dst] = 1;
	  }
	}
      }
    } else {
      innode = edge->input_node();
      assert(innode);
      const int src = innode->get_address();
      FOR_EACH_FANOUT_NODE(edge, nodescan) {
	node = nodescan.get_item();
	assert(node);
	const int dst = node->get_address();
	//A delay-free arc always has the minimum possible W, so it may
	//unconditionally replace whatever was recorded before.
	wvalues[src][dst] = 0;
	dvalues[src][dst] = nodeweights[src];
	arcs[src][dst] = 1;
      }
    }
  }

  //Perform Floyd-Warshall all-pairs shortest paths computation, using
  //(W,-D) as composite weights: smaller W wins, ties go to the larger D
  for(k = 0; k < numnodes; k++) {
    for(i = 0; i < numnodes; i++) {
      for(j = 0; j < numnodes; j++) {
	if(arcs[i][k] && arcs[k][j] &&
	   ((wvalues[i][k] + wvalues[k][j] < wvalues[i][j]) ||
	   (wvalues[i][k] + wvalues[k][j] == wvalues[i][j] &&
	    -(dvalues[i][k] + dvalues[k][j]) < -dvalues[i][j])) ) {
	  wvalues[i][j] = wvalues[i][k] + wvalues[k][j];
	  dvalues[i][j] = dvalues[i][k] + dvalues[k][j];
	  arcs[i][j] = 1;
	}
      }
    }
  }

  //Account for the last vertex in each D(u,v) entry, since
  //D(u ---> v) as computed above does not include delay(v).
  for(i = 0; i < numnodes; i++) {
    for(j = 0; j < numnodes; j++) {
      if(arcs[i][j]) {
	dvalues[i][j] += nodeweights[j];
      }
    }
  }

  return;
}
/********************************************************************/
//Builds the matrix of performance-constraint edges from the W and D
//matrices and a required path length.
//constraints[u][v] is set to 1 for every pair of distinct flowgraph
//nodes (u,v) that is connected (W(u,v) < HUGEINT) and whose D(u,v) is
//greater than perf_constraint; all other entries are 0. These edges are
//later used to require W(u,v) >= 1 after retiming for such pairs.
//
//flowgraph:       graph the matrices were computed for
//wvalues/dvalues: (numnodes+1) x (numnodes+1) matrices from compute_WD
//perf_constraint: maximum allowed D value, must be > 0
//constraints:     output; resized to numnodes x numnodes if needed and
//                 reset to 0 before being filled
void Scheduler::create_constraint_edges(Dfg &flowgraph, Matrix<int> &wvalues,
			Matrix<int> &dvalues, int perf_constraint,
			Matrix<int> &constraints)
{
  int i, j;
  const int numnodes = flowgraph.numnodes();

  assert(perf_constraint > 0);
  assert(wvalues.get_xcount() == numnodes+1 &&
	 wvalues.get_ycount() == numnodes+1);
  assert(dvalues.get_xcount() == numnodes+1 &&
	 dvalues.get_ycount() == numnodes+1);

  if(constraints.get_xcount() != numnodes ||
     constraints.get_ycount() != numnodes) {
    constraints.resize(numnodes, numnodes);
  }

  constraints.reset(0);

  //Add constraint edges - dont need to add any to the host vertex.
  //The scan therefore stops at numnodes: wvalues/dvalues carry one extra
  //row/column (index numnodes) that constraints does not have, so
  //iterating over the full W dimension could index past the end of
  //constraints.
  for(i = 0; i < numnodes; i++) {
    for(j = 0; j < numnodes; j++) {
      if(i == j) continue;
      if(wvalues[i][j] < HUGEINT) {
	assert(dvalues[i][j] > 0);
	if(dvalues[i][j] > perf_constraint) {
	  constraints[i][j] = 1;
	}
      }
    }
  }

  return;
}
/********************************************************************/
//Formulates the retiming/pipelining problem for flowgraph as a linear
//program and returns it as a newly allocated Ilp (the caller owns it).
//
//Variables (columns):
//  0 .. numnodes-1   r-value of each flowgraph node
//  numnodes          r-value of the host vertex   (num_variables-2)
//  numnodes+1        number of pipeline stages    (num_variables-1)
//Constraints (rows), in this order:
//  - one per arc of the retiming graph: r(src) - r(dst) <= arc weight,
//    including one arc from each PO to the host vertex
//  - one per nonzero constraints[i][j]: r(i) - r(j) <= W(i,j) - 1
//  - one pinning r(host) to 0
//The objective minimizes the pipeline-stage variable.
//
//NOTE(review): no constraint row built here references the
//pipeline-stage column; it only gets an objective coefficient and
//bounds. call_cplex() couples PI arcs to that variable - confirm whether
//the same coupling is intended here.
Ilp *Scheduler::formulate_retpipe_ilp(Dfg &flowgraph, Matrix<int> &wvalues, Matrix<int> &constraints)
{
  int i, j, k;
  int num_variables, num_constraints, innode_index;
  EDGEPTR edge, loopout_edge;
  NODEPTR innode, outnode;
  List_iterator<NODEPTR> nodescan;
  double arc_weight;
  Ilp *newilp;

  //Variables are the r-value of each vertex, the r-value of the
  //host vertex, and the number of pipeline stages.
  num_variables = flowgraph.numnodes() + 2;
  const int host_index = num_variables - 2;
  const int stages_index = num_variables - 1;

  //Find out the number of constraints
  num_constraints = 0;
  //One constraint for each arc in the circuit graph
  FOR_EACH_EDGEINDEX(flowgraph, i) {
    edge = flowgraph.get_nthedge(i);
    assert(edge && edge->get_address() == i);
    //Constants are not counted as arcs in the retiming graph
    if(edge->is_constant()) continue;
    num_constraints += edge->number_sink_nodes();
    //Account for the arc from a PO to the host vertex
    if(edge->is_po()) {
      num_constraints++;
    }
  }
  //One constraint for each pair of vertices whose D-value
  //violates the sample period constraint
  for(i = 0; i < flowgraph.numnodes(); i++) {
    for(j = 0; j < flowgraph.numnodes(); j++) {
      if(constraints[i][j]) {
	num_constraints++;
      }
    }
  }
  //One constraint forcing the r-value of the host vertex to 0
  num_constraints++;

  newilp = new Ilp(num_constraints, num_variables);

  //All variables are continuous variables
  for(i = 0; i < num_variables; i++) {
    newilp->set_variable_type(i, CONT);
  }

  //Set the objective function
  //The objective is to minimize the number of pipeline stages added
  newilp->set_problem_type(MINIMIZE);
  for(i = 0; i < stages_index; i++) {
    newilp->set_objective_coeff(i, 0.0);
  }
  newilp->set_objective_coeff(stages_index, 1.0);

  //Set the constraint coefficients and RHS values
  //k is the index of the next constraint row to fill
  k = 0;
  //constraints that ensure a legal retiming/pipelining
  FOR_EACH_EDGEINDEX(flowgraph, i) {
    edge = flowgraph.get_nthedge(i);
    assert(edge && edge->get_address() == i);
    //Constants are not counted as arcs in the retiming graph
    if(edge->is_constant()) {
      continue;
    } else if(edge->is_pi()) {
      //A PI is modelled as an arc leaving the host vertex
      arc_weight = 0;
      innode_index = host_index;
    } else if(edge->is_loopin()) {
      //traverse backward thro delays till we hit a PI or innode is
      //not NULL. arc_weight is the number of delays traversed
      //If we hit a PI, innode_index is the host vertex. Else,
      //innode_index is the address of the source node
      loopout_edge = edge->get_loopout_link();
      assert(loopout_edge);
      arc_weight = 1.0;
      while(!(innode = loopout_edge->input_node())) {
	assert(loopout_edge->is_loopin() || loopout_edge->is_pi());
	if(loopout_edge->is_pi()) {
	  break;
	} else if(loopout_edge->is_loopin()) {
	  loopout_edge = loopout_edge->get_loopout_link();
	  arc_weight += 1.0;
	}
      }
      if(innode) {
	innode_index = innode->get_address();
      } else {
	innode_index = host_index;
      }
    } else {
      innode = edge->input_node();
      assert(innode);
      innode_index = innode->get_address();
      arc_weight = 0;
    }
    FOR_EACH_FANOUT_NODE(edge, nodescan) {
      outnode = nodescan.get_item();
      assert(outnode);
      //r(innode) - r(outnode) <= arc weight
      newilp->set_constraint_coeff(k, innode_index, 1.0);
      newilp->set_constraint_coeff(k, outnode->get_address(), -1.0);
      newilp->set_constraint_type(k, LE);
      newilp->set_constraint_rhs(k, arc_weight);
      k++;
    }
    //Account for the arc from a PO to the host vertex
    if(edge->is_po()) {
      //r(innode) - r(host) <= arc weight
      newilp->set_constraint_coeff(k, innode_index, 1.0);
      newilp->set_constraint_coeff(k, host_index, -1.0);
      newilp->set_constraint_type(k, LE);
      newilp->set_constraint_rhs(k, arc_weight);
      k++;
    }
  }

  //constraints to ensure that the sample period is met
  FOR_EACH_NODEINDEX(flowgraph, i) {
    FOR_EACH_NODEINDEX(flowgraph, j) {
      if(constraints[i][j]) {
	//r(i) - r(j) <= W(i,j)-1
	newilp->set_constraint_coeff(k, i, 1.0);
	newilp->set_constraint_coeff(k, j, -1.0);
	newilp->set_constraint_type(k, LE);
	arc_weight = wvalues[i][j]-1.0;
	newilp->set_constraint_rhs(k, arc_weight);
	k++;
      }
    }
  }

  //force r(host) to 0.0
  newilp->set_constraint_coeff(k, host_index, 1.0);
  newilp->set_constraint_type(k, EQ);
  newilp->set_constraint_rhs(k, 0.0);

  //Set the variable bounds
  //All variables are free except the number of  pipeline stages, which
  //must be nonnegative
  for(i = 0; i < stages_index; i++) {
    newilp->set_upper_bound(i, INFBOUND);
    newilp->set_lower_bound(i, -INFBOUND);
  }
  //The pipeline-stage variable is the LAST column (num_variables-1);
  //index num_variables would be one past the end.
  newilp->set_upper_bound(stages_index, INFBOUND);
  newilp->set_lower_bound(stages_index, 0);

  return(newilp);
}
/********************************************************************/
//Given the flowgraph, wvalues, and constraint arcs, formulate the
//retiming problem as an LP (linear program) in the column-wise form
//expected by the CPLEX callable library, and call CPLEX to solve it.
//
//Each column of the loaded problem is one arc of the retiming graph
//(plus two extra columns that pin the host vertex); each row is one
//vertex: rows 0..numnodes-1 are the flowgraph nodes, row mar-2 is the
//host vertex and row mar-1 is the last row (presumably the added number
//of pipeline stages - see the return value below).
//
//Returns: if CPLEX reports status 1, a new[]-allocated array of mar =
//numnodes+2 dual values (the r-value of each flowgraph node, the r-value
//of the host vertex, and the added number of pipeline stages); the
//caller must release it with delete[]. If the LP has no such solution,
//i.e., a retiming/pipelining is not possible for the given constraints,
//NULL is returned. Hard CPLEX failures terminate the program via
//exit(-1).
//
//NOTE(review): unlike compute_WD(), a loopin is assumed here to be
//driven directly by a node (no chained delays, no PI source) and always
//contributes an objective weight of 1; the assert on innode enforces
//the first part.
double *Scheduler::call_cplex(Dfg &flowgraph, Matrix<int> &wvalues,
			      Matrix<int> &constraints)
{
  int i, j, k, l;
  int no_vertices, matsize, no_arcs;
  EDGEPTR edge, loopout_edge;
  NODEPTR innode, outnode;
  List_iterator<NODEPTR> nodescan;
  List_iterator<EDGEPTR> edgescan;
  char cplex_debug_filename[MAXSTRLEN];
  static FILE *cplex_debug_file = (FILE *)NULL;

  //variables used by cplex package
  //(writable arrays rather than pointers to string literals, so they
  //can legally be handed to CPLEX routines taking a char *)
  char problem_filename[] = "_cplex.lp";
  char dual_filename[] = "_dual.lp";
  CPXLPptr lp;
  char probname[] = "retminl";
  char *senx;
  int mac = 0, mar = 0, *matbeg, *matcnt, *matind, status, lpstat;
  double *objx, *rhsx, *matval, *bdl, *bdu, *lpx, *lpdualx, lpobj;

  void print_solution(double, double *, int, double *, int, FILE *);

  //variables for CPLEX 3.0
  int pnodes, parcs, pitcnt;

  //Open the retiming debug file
  sprintf(cplex_debug_filename, "%s.cplex_debug", flowgraph.get_name());
  cplex_debug_file = anand_fopen(cplex_debug_filename, "a");

  /* NETWORK FLOW SPECIFICATION  */
  /* Specify number of rows and columns of the constraint matrix */
  no_vertices = flowgraph.numnodes() + 2;
  no_arcs = 0;
  FOR_EACH_EDGEINDEX(flowgraph, i) {
    edge = flowgraph.get_nthedge(i);
    assert(edge && edge->get_address() == i);
    //Constants are not counted as arcs in the retiming graph
    if(edge->is_constant()) continue;
    no_arcs += edge->number_sink_nodes();
    //Account for the arc from a PO to the host vertex
    if(edge->is_po()) {
      no_arcs++;
    }
  }
  for(i = 0; i < flowgraph.numnodes(); i++) {
    for(j = 0; j < flowgraph.numnodes(); j++) {
      if(constraints[i][j]) {
	no_arcs++;
      }
    }
  }
  //Two extra columns pin the r-value of the host vertex (see below)
  mac = no_arcs + 2;
  mar = no_vertices;

  /*Set up the obj function for the dual*/
  objx = new double[mac];
  j = 0;
  FOR_EACH_EDGEINDEX(flowgraph, i) {
    edge = flowgraph.get_nthedge(i);
    assert(edge && edge->get_address() == i);
    //Constants are not counted as arcs in the retiming graph
    if(edge->is_constant()) continue;

    FOR_EACH_FANOUT_NODE(edge, nodescan) {
      if(edge->is_loopin()) {
	objx[j] = 1;
      } else {
	objx[j] = 0;
      }
      j++;
    }
    /*Account for arc from PO to host vertex*/
    if(edge->is_po()) {
      objx[j] = 0;
      j++;
    }
  }
  /*handle the constraint edges*/
  for(i = 0; i < flowgraph.numnodes(); i++) {
    for(l = 0; l < flowgraph.numnodes(); l++) {
      if(constraints[i][l]) {
	objx[j] = wvalues[i][l]-1;
	j++;
      }
    }
  }
  objx[j] = 0;
  objx[j+1] = 0;
  j = j+2;
  assert_force(j == mac);

  /* Set up Rhs term for each constraint in the constraint matrix */
  rhsx = new double[mar];
  senx = new char[mar];
  for (i = 0; i < mar-1; i++) {
    rhsx[i] = 0;
    senx[i] = 'E';
  }
  rhsx[mar-1] = -1;
  senx[mar-1] = 'G';

  /* Set up bounds on each edge */
  bdl = new double[mac];
  bdu = new double[mac];
  for (j = 0; j < mac; j++) {
    bdl[j] = 0;
    bdu[j] = INFBOUND;
  }

  /* Set up constraint matrix */
  matbeg = new int[mac];
  matcnt = new int[mac];
  //Number of nonzeros: every ordinary arc column has 2 entries, every
  //PI arc column has 3, and each of the 2 host-pinning columns has 1.
  //Starting from 2 per column, that is one extra entry per PI fanout
  //and one entry less for each of the two host-pinning columns.
  //Constant edges are skipped when the columns are built, so their
  //fanouts must NOT be counted here; counting them would make the
  //k == matsize check below fail for any graph whose constants have
  //sinks.
  matsize = 2*mac;
  FOR_EACH_LISTNODE(flowgraph.inputs, edgescan) {
    edge = edgescan.get_item();
    assert(edge);
    matsize += edge->number_sink_nodes();
  }
  matsize -= 2;

  matval = new double[matsize];
  matind = new int[matsize];

  //j = next column to fill, k = next free slot in matval/matind
  j = 0;
  k = 0;
  FOR_EACH_EDGEINDEX(flowgraph, i) {
    edge = flowgraph.get_nthedge(i);
    assert(edge && edge->get_address() == i);
    //Constants are not counted as arcs in the retiming graph
    if(edge->is_constant()) continue;

    FOR_EACH_FANOUT_NODE(edge, nodescan) {
      outnode = nodescan.get_item();
      assert(outnode);
      if(edge->is_pi()) {
	//Arc host -> outnode, additionally tied to the last row (mar-1)
	matbeg[j] = k;
	matcnt[j] = 3;
	matval[k] = (double)1.0;
	matval[k+1] = (double)-1.0;
	matval[k+2] = (double)-1.0;
	matind[k] = mar - 2;
	matind[k+1] = outnode->get_address();
	matind[k+2] = mar - 1;
	k = k + 3;
      } else {
	if(edge->is_loopin()) {
	  loopout_edge = edge->get_loopout_link();
	  assert(loopout_edge);
	  innode = loopout_edge->input_node();
	} else {
	  innode = edge->input_node();
	}
	assert(innode);
	matbeg[j] = k;
	matcnt[j] = 2;
	matval[k] = (double)1.0;
	matval[k+1] = (double)-1.0;
	matind[k] = innode->get_address();
	matind[k+1] = outnode->get_address();
	k = k + 2;
      }
      j++;
    }
    /*Account for the arcs from POs to the Host vertex*/
    if(edge->is_po()) {
      innode = edge->input_node();
      assert(innode);
      matbeg[j] = k;
      matcnt[j] = 2;
      matval[k] = (double)1.0;
      matval[k+1] = (double)-1.0;
      matind[k] = innode->get_address();
      matind[k+1] = mar-2;  /*Host node index = mar-2*/
      k = k + 2;
      j++;
    }
  }

  /*handle constraint edges*/
  for(i = 0; i < flowgraph.numnodes(); i++) {
    for(l = 0; l < flowgraph.numnodes(); l++) {
      if(constraints[i][l]) {
	matbeg[j] = k;
	matcnt[j] = 2;
	matval[k] = (double)1.0;
	matval[k+1] = (double)-1.0;
	matind[k] = i;
	matind[k+1] = l;
	k = k + 2;
	j++;
      }
    }
  }

  /*r-value of host vertex <= 0*/
  matbeg[j] = k;
  matcnt[j] = 1;
  matval[k] = (double)1.0;
  matind[k] = mar-2;
  j++;
  k++;

  /*r-value of host vertex >= 0*/
  matbeg[j] = k;
  matcnt[j] = 1;
  matval[k] = (double)-1.0;
  matind[k] = mar-2;
  j++;
  k++;

  assert_force(j == mac);
  assert_force(k == matsize);

  /* Load the flow problem */
#ifdef CPLEX_DEBUG
  fprintf(stdout, "\tCPLEX (rows=%d,columns=%d)..", mar, mac); fflush(stdout);
#endif
  lp = loadprob(probname, mac, mar, 0, 1, objx, rhsx, senx, matbeg, matcnt,
                matind, matval, bdl, bdu, (double *)NULL, (int *)NULL,
                (int *)NULL, (int *)NULL, (int *)NULL, (int *)NULL,
                (double *)NULL, (char *)NULL, (char *)NULL, (char *)NULL,
                (char *)NULL, (char *)NULL, (char **)NULL, (char *)NULL,
                (char **)NULL, (char *)NULL, (char **)NULL, (char *)NULL, mac,
                mar, matsize, 0, 0, (unsigned)0, (unsigned)0, (unsigned)0);

  if (!lp) {
    fprintf(stderr, "ERROR: CPLEX is out of memory\n");
    exit(-1);
  }

#ifdef CPLEX_DEBUG
  if (lpwrite(lp, problem_filename)) {
    fprintf(stderr, "ERROR: Could not write the lp to file %s\n", problem_filename);
    exit(-1);
  }
  if (dualwrite(lp, dual_filename)) {
    fprintf(stderr, "ERROR: Could not write the dual lp to file %s\n", dual_filename);
    exit(-1);
  }

#endif

  /* Solve the optimization problem */
  status = netopt(lp, &lpstat, &pnodes, &parcs, &pitcnt);
  if (status) {
    fprintf(stdout, "Status = %d, NStatus = %d\n", status, lpstat);
    fprintf(stderr, "ERROR: CPLEX. NETOPT cannot find optimal solution\n");
    exit(-1);
  }
  status = optimize(lp);
  if (status) {
    fprintf(stderr, "CPLEX. Could not find a solution\n");
    exit(-1);
  }

  /* Obtain the solution of the problem */
  lpx = new double[mac];
  lpdualx = new double[mar];
  status = solution(lp, &lpstat, &lpobj, lpx, lpdualx,
                    (double *)NULL, (double *)NULL);

  if (status) {
    fprintf(stderr, "ERROR: CPLEX. Solution() returns non-zero value\n");
    exit(-1);
  }

  if (lpstat == 1) {
#ifdef CPLEX_DEBUG
    printf("Retiming/Pipelining SUCCEEDED (CPLEX status = %d)\n", lpstat);
    fprintf(cplex_debug_file, "Retiming/Pipelining SUCCEEDED (CPLEX status = %d)\n", lpstat);
    fprintf(cplex_debug_file, "----------------------\n");
    print_solution(lpobj, lpx, mac, lpdualx, mar, cplex_debug_file);
    fprintf(cplex_debug_file, "----------------------\n\n");
#endif
  } else {
#ifdef CPLEX_DEBUG
    printf("Retiming/Pipelining not possible for given constraint (CPLEX status = %d)\n", lpstat);
    fprintf(cplex_debug_file, "Retiming/Pipelining not possible for given constraint (CPLEX status = %d)\n", lpstat);
#endif
    //No usable solution: drop the dual values so that NULL is returned
    delete [] lpdualx; lpdualx = NULL;
  }

  //Clean up all memory used by cplex
  //(everything below came from new[], so it must go through delete[])
  freeprob(&lp);
  delete [] objx; objx = NULL;
  delete [] rhsx; rhsx = NULL;
  delete [] senx; senx = NULL;
  delete [] bdl; bdl = NULL;
  delete [] bdu; bdu = NULL;
  delete [] matbeg; matbeg = NULL;
  delete [] matcnt; matcnt = NULL;
  delete [] matval; matval = NULL;
  delete [] matind; matind = NULL;
  delete [] lpx; lpx = NULL;
  cplex_debug_file = anand_fclose(cplex_debug_file);

  return(lpdualx);
}
/********************************************************************/
//Writes a human-readable dump of an LP solution to fp.
//lpobj:   objective value, printed as the "Optimal Value"
//lpx:     mac primal values (one per arc); printed only when the file
//         is compiled with CPLEX_DEBUG
//lpdualx: mar dual values (one per vertex), printed one per line
//fp:      open output stream
//lpx, lpdualx and fp must all be non-NULL (checked with assert_force).
void print_solution(double lpobj, double *lpx, int mac, double *lpdualx,
		    int mar, FILE *fp)
{
  assert_force(lpx && lpdualx && fp);

  fprintf(fp, "Solution to the flow problem:\n");
  fprintf(fp, "Optimal Value: %8.2f\n", lpobj);

#ifdef CPLEX_DEBUG
  //All primal values go on a single line
  fprintf(fp, "Primal Values for the arcs:\n");
  {
    int arc = 0;
    while (arc < mac) {
      fprintf(fp, "%8.2f ", lpx[arc]);
      ++arc;
    }
  }
  fprintf(fp, "\n");
#endif

  fprintf(fp, "Dual Values for vertices:\n");
  int vertex = 0;
  while (vertex < mar) {
    fprintf(fp, "Vertex %d: %8.2f\n", vertex, lpdualx[vertex]);
    ++vertex;
  }
}
/********************************************************************/
//Computes the weight (number of delays) of every arc of the retiming
//graph after applying the retiming given by rvalues:
//  retimed weight(u -> v) = weight(u -> v) + r(v) - r(u)
//
//flowgraph: graph being retimed
//rvalues:   r-value per vertex; indices 0..numnodes-1 are the flowgraph
//           nodes and index numnodes is the host vertex
//retimed_arcweights: output, indexed [source][sink] with the same vertex
//           numbering; only entries for existing arcs are written. PIs
//           are arcs leaving the host vertex and POs are arcs entering it.
void Scheduler::compute_retimed_arcweights(Dfg &flowgraph,
	       Array<double> &rvalues, Matrix<int> &retimed_arcweights)
{
  int i;
  int arc_weight, innode_index;
  EDGEPTR edge, loopout_edge;
  NODEPTR innode, outnode;
  List_iterator<NODEPTR> nodescan;
  const int host_index = flowgraph.numnodes();

  FOR_EACH_EDGEINDEX(flowgraph, i) {
    edge = flowgraph.get_nthedge(i);
    assert(edge && edge->get_address() == i);
    //Constants are not counted as arcs in the retiming graph
    if(edge->is_constant()) {
      continue;
    } else if(edge->is_pi()) {
      arc_weight = 0;
      innode_index = host_index;
    } else if(edge->is_loopin()) {
      //traverse backward thro delays till we hit a PI or innode is
      //not NULL. arc_weight is the number of delays traversed
      //If we hit a PI, innode_index is the host vertex (numnodes). Else,
      //innode_index is the address of the source node
      loopout_edge = edge->get_loopout_link();
      assert(loopout_edge);
      arc_weight = 1;
      while(!(innode = loopout_edge->input_node())) {
	assert(loopout_edge->is_loopin() || loopout_edge->is_pi());
	if(loopout_edge->is_pi()) {
	  break;
	} else if(loopout_edge->is_loopin()) {
	  loopout_edge = loopout_edge->get_loopout_link();
	  arc_weight ++;
	}
      }
      if(innode) {
	innode_index = innode->get_address();
      } else {
	innode_index = host_index;
      }
    } else {
      innode = edge->input_node();
      assert(innode);
      innode_index = innode->get_address();
      arc_weight = 0;
    }
    FOR_EACH_FANOUT_NODE(edge, nodescan) {
      outnode = nodescan.get_item();
      assert(outnode);
      const int outnode_index = outnode->get_address();
      //The r-values are doubles produced by an LP solver, so a value that
      //should be integral may come back as e.g. 1.9999999. Round to the
      //nearest integer instead of letting the int conversion truncate.
      const double retimed =
	arc_weight + rvalues[outnode_index] - rvalues[innode_index];
      retimed_arcweights[innode_index][outnode_index] =
	(int)(retimed < 0.0 ? retimed - 0.5 : retimed + 0.5);
    }
    //Account for the arc from a PO to the host vertex
    if(edge->is_po()) {
      const double retimed =
	arc_weight + rvalues[host_index] - rvalues[innode_index];
      retimed_arcweights[innode_index][host_index] =
	(int)(retimed < 0.0 ? retimed - 0.5 : retimed + 0.5);
    }
  }
  return;
}
/********************************************************************/
//Given a flowgraph and the retimed arc weights produced by
//retiming/pipelining, this routine rebuilds the flowgraph with delays
//introduced at suitable edges in accordance with the
//retiming/pipelining solution.
//
//flowgraph:          graph to update; it is replaced by the modified
//                    copy at the end
//retimed_arcweights: [source][sink] delay count per arc, with index
//                    numnodes standing for the host vertex (as filled in
//                    by compute_retimed_arcweights)
//
//NOTE(review): deleting the delays that already exist in the flowgraph
//is only sketched (see the commented-out block below). As written, an
//edge that is already a loopout, or a loopin (which has no input node),
//trips an assert in the main loop.
void Scheduler::update_flowgraph_after_retpipe(Dfg &flowgraph,
			       Matrix<int> &retimed_arcweights)
{
  int i, numedges;
  int innode_index, maxweight, curweight;
  NODEPTR outnode, innode;
  EDGEPTR edge, edge1, cur_edge;
  List_ar<NODEPTR> fanout_list;
  //NOTE(review): assumes List_iterator<NODEPTR> can iterate both an
  //edge's fanout and a List_ar<NODEPTR> - confirm against the list API
  List_iterator<NODEPTR> nodescan;
  char tmpname[MAXSTRLEN];
  Dfg *newflowgraph = new Dfg;

  //Note that there is a node correspondence between flowgraph and
  //newflowgraph, but the edges do not correspond
  newflowgraph->copy(flowgraph);

  /*  TODO: remove the existing delays first. Sketch:
      FOR_EACH_LISTNODE(flowgraph.get_loopins(), edgescan) {
      edge = edgescan.get_item();
      assert(edge && edge->is_loopin());
      loopout_edge = edge->get_loopout_link();
      assert(loopout_edge && loopout_edge->is_loopout() &&
      loopout_edge->get_loopin_link() == edge);
      replace all fanouts of loopout_edge by loopin_edge;
      if(loopout_edge->is_loopin()) {
      tmpedge = loopout_edge->get_loopout_link();
      delete loopout_edge and connect (edge, tmpedge) as a (loopin, loopout) pair
      }
      }
      */

  //introduce delays in newflowgraph appropriately according to the
  //retimed arcweights.
  //The edge count is sampled once, so the delay edges appended inside
  //the loop are not themselves revisited.
  numedges = newflowgraph->numedges();
  for(i = 0; i < numedges; i++) {
    edge = newflowgraph->get_nthedge(i);
    assert(edge && edge->get_address() == i);

    if(edge->is_constant()) continue;

    //find out the source node of edge
    innode = edge->input_node();
    if(innode) {
      innode_index = innode->get_address();
    } else {
      //An edge without a driver must be a PI; its source is the host
      assert(edge->is_pi());
      innode_index = newflowgraph->numnodes();
    }

    //Find out the max retimed arc weight for all arcs represented by edge.
    //Also, create a list that contains all fanouts of edge that need to
    //be fed by delayed versions of edge
    fanout_list.clear();
    maxweight = 0;
    if(edge->is_po()) {
      maxweight = MAX(maxweight, retimed_arcweights[innode_index][newflowgraph->numnodes()]);
    }
    FOR_EACH_FANOUT_NODE(edge, nodescan) {
      outnode = nodescan.get_item();
      assert(outnode);
      if(retimed_arcweights[innode_index][outnode->get_address()] > 0) {
	fanout_list.append(outnode);
      }
      maxweight = MAX(maxweight, retimed_arcweights[innode_index][outnode->get_address()]);
    }

    //Add the required number of delays at edge as a cascade of unit delays.
    //At each stage, migrate the appropriate fanouts of edge to its delayed
    //versions
    cur_edge = edge;
    for(curweight = 1; curweight <= maxweight; curweight++) {
      //Turn cur_edge into the loopout side of a new unit delay
      assert(!cur_edge->is_loopout());
      cur_edge->loopout = T;
      assert(!newflowgraph->get_loopouts().find(cur_edge));
      newflowgraph->get_loopouts().append(cur_edge);

      //Create the loopin side; its name is derived from cur_edge's name.
      //(No trailing newline: the name is an identifier, not a log line.)
      edge1 = new EDGE;
      sprintf(tmpname, "%s_@1", cur_edge->get_name());
      edge1->set_name(tmpname);
      newflowgraph->add_edge(edge1);
      edge1->loopin = T;
      newflowgraph->get_loopins().append(edge1);

      cur_edge->set_loopin_link(edge1);
      edge1->set_loopout_link(cur_edge);

      //Fanouts whose arc needs exactly curweight delays are fed from
      //this stage of the cascade
      FOR_EACH_LISTNODE(fanout_list, nodescan) {
	outnode = nodescan.get_item();
	assert(outnode);
	if(retimed_arcweights[innode_index][outnode->get_address()] ==
	   curweight) {
	  edge1->add_sink_node(outnode);
	  outnode->replace_input_edge(edge, edge1);
	}
      }
      cur_edge = edge1;
    }
  }

  //copy newflowgraph back into flowgraph
  flowgraph.copy(*newflowgraph);

  delete newflowgraph;

  return;
}
/********************************************************************/
//Given a flowgraph and desired latency constraint, this routine attempts
//to retime/pipeline the flowgraph in order to satisfy the given constraint.
//Returns T if the LP solver finds a retiming/pipelining that meets the
//constraint, F otherwise. F is always returned when the file is built
//without _USE_CPLEX_, because no solver is invoked in that case.
//
//flowgraph:  graph to retime/pipeline
//num_csteps: latency constraint, used as the maximum allowed D value
//
//NOTE(review): on success only the retimed arc weights are computed and
//flowgraph.levelize() is called; the step that actually moves the delays
//is commented out, so the delays in flowgraph are not relocated here.
//
//ASSUMPTION: The nodes in the flowgraph have their birth and death fields
//properly set so that death-birth for a node reflects the number of clock
//cycles required for it to execute at the current Vdd and clock period.
Boolean Scheduler::retpipe_flowgraph(Dfg &flowgraph, const int num_csteps)
{
  //Work matrices are static so their storage is reused across calls
  static Matrix<int> wvalues;
  static Matrix<int> dvalues;
  static Matrix<int> constraints;
  static Matrix<int> retimed_arcweights;
  static Array<double> rvalues;
  Ilp *retpipe_ilp;
  Boolean retimed = F;

  //One extra row/column for the host vertex
  wvalues.resize(flowgraph.numnodes()+1, flowgraph.numnodes()+1);
  dvalues.resize(flowgraph.numnodes()+1, flowgraph.numnodes()+1);
  retimed_arcweights.resize(flowgraph.numnodes()+1, flowgraph.numnodes()+1);

  //compute W and D values for current Vdd
  compute_WD(flowgraph, wvalues, dvalues);
#ifdef CPLEX_DEBUG
  cout << "W values" << endl << wvalues;
  cout << "D values" << endl << dvalues;
#endif

  //compute the constraint edges matrix
  create_constraint_edges(flowgraph, wvalues, dvalues, num_csteps, constraints);
#ifdef CPLEX_DEBUG
  cout << "Constraint edges" << endl << constraints;
#endif

  //formulate the retiming/pipelining problem
  retpipe_ilp = formulate_retpipe_ilp(flowgraph, wvalues, constraints);

  assert(retpipe_ilp);

#ifdef _USE_CPLEX_
  double num_pipe_stages;
  Boolean solution_found =
    retpipe_ilp->solve_using_cplex(rvalues, num_pipe_stages);

  //call CPLEX to solve the LP and compute the new positions of the delays
  //solution = call_cplex(flowgraph, wvalues, constraints);

  //move the delays to reflect the retimed flowgraph
  if(solution_found) {
    //compute the retimed arc weights
    compute_retimed_arcweights(flowgraph, rvalues, retimed_arcweights);
    //update_flowgraph(flowgraph, rvalues, retimed_arcweights);
    flowgraph.levelize();
    retimed = T;
  }
#endif

  //The Ilp was allocated by formulate_retpipe_ilp() and is not handed to
  //anyone else, so it must be released here on every path.
  delete retpipe_ilp;

  return(retimed);
}
/********************************************************************/
