/*
  f-cpu/c/scheduler/scheduler.c
  c simulation of the F-CPU decoder unit
  Copyright (C) 2002 Jaap Stolk (JWS) jwstolk@yahoo.com
  version:
           19 July 2002 13:30
Sun Jul 21 12:56:13 CEST 2002 JWS: added write queue.
Sun Jul 21 15:25:43 CEST 2002 JWS: removed IMUL counter remarks (only IDIV)
Mon Jul 22 15:07:09 CEST 2002 JWS: complete rewrite, along with other units
Mon Jul 22 23:02:02 CEST 2002 JWS: finished major scheduling update
Tue Jul 23 12:46:35 CEST 2002 JWS: normal bypass now overwrites delayed bypass
Thu Jul 25 21:52:20 CEST 2002 JWS: added 0-cycle moves remarks but no code :) 

 ------------------------BEGIN-LICENSE------------------------------------
  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 ---------------------------END-LICENSE-----------------------------------

     sch_operand_1  (from fetcher)
     sch_operand_2
     sch_operand_dst
         | | |
         v v v
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %%                         scheduler                                %%
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
         | | |
         v v v
     sch_bypass_r0 (no bypass, or direct/delayed bypass from write bus w0/w1)
     sch_bypass_r1       
     sch_bypass_r2    (to the xbar)     

     sch_use_r0  (bool)  from the decoder
     sch_use_r1  (bool)
     sch_use_r2  (bool)
     sch_r0_reg_nr       from the decoder 
     sch_r1_reg_nr
     sch_r2_reg_nr
     sch_latency
     sch_w0_port_nr
     sch_w1_port_nr
     sch_w0_reg_nr
     sch_w1_reg_nr
     | | | | | | | | | | | |
     v v v v v v v v v v v v
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %%                         scheduler                                %%
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
     | | | | | | |
     v v v v v v v
     stalled (bool)   to everywhere !!!
     sch_xbar_w0_reg_nr   to x-bar
     sch_xbar_w0_port_nr
     sch_xbar_w1_reg_nr
     sch_xbar_w1_port_nr
     sch_reg_w0_reg_nr    to registers
     sch_reg_w1_reg_nr

  show the exact reason for the stall, for debugging and optimising ?
*/

/* defines the constants, inputs and outputs : */
#include <scheduler.h>

/*
 * scheduler_cycle -- simulate one clock cycle of the scheduler.
 *
 * Takes no arguments and returns nothing; all communication goes through
 * the global signals declared in scheduler.h.  In order, one call:
 *
 *   1. drives the register write-back outputs (sch_reg_w0/w1_reg_nr)
 *      from level 0 of the write queue;
 *   2. sets `stalled` if a source register is still marked busy in the
 *      scoreboard or if the needed write port(s) are already booked for
 *      the cycle in which the result would arrive;
 *   3. if not stalled, books the destination register(s) in the
 *      scoreboard and in the write queue;
 *   4. drives the x-bar write outputs (sch_xbar_w0/w1_*) from level 1 of
 *      the write queue;
 *   5. selects the bypass source for each of the three read buses
 *      (sch_bypass_r0/r1/r2);
 *   6. shifts the scoreboard and the write queues down by one level;
 *   7. latches the operands of an accepted instruction so that bypass
 *      detection can be repeated for it while the pipeline is stalled.
 *
 * Register number 0 means "no register" throughout (register 0 is
 * hardwired to zero, so it is never written and never bypassed).
 *
 * NOTE(review): the code indexes queue levels up to 8 and register
 * numbers up to 63, and uses level 1+sch_latency without a range check;
 * this presumably relies on sch_latency <= 7 and on the array sizes in
 * scheduler.h -- confirm against the header.
 */
static inline void scheduler_cycle (void) {

  int level_loop;
  int reg_loop;
  bool swapped = false; /* true: the single result goes to write port 1 */

/*%%%%%%%%%%%%%%%% do a register write if it's indicated by the queue: %%%%%%*/
  /* look at "level 0" in the queue */

  sch_reg_w0_reg_nr = 0; /* --> 0 = no write */
  sch_reg_w1_reg_nr = 0; /* --> 0 = no write */

  if ( sch_Q_write_to_reg_nr[0][0] ){
    sch_reg_w0_reg_nr = sch_Q_write_to_reg_nr[0][0];
  }
  if ( sch_Q_write_to_reg_nr[0][1] ){
    sch_reg_w1_reg_nr = sch_Q_write_to_reg_nr[0][1];
  }

/*%%%%%%%%%%%%%%%% detect a stall: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/

  stalled = false; /* assume there is no problem */

  /* every source register that is actually used must be free at level 1
     of the scoreboard: */
  if ( sch_use_r0 ) {
    if ( sch_register_in_use[1][sch_r0_reg_nr] ) { stalled = true; }
  }
  if ( sch_use_r1 ) {
    if ( sch_register_in_use[1][sch_r1_reg_nr] ) { stalled = true; }
  }
  if ( sch_use_r2 ) {
    if ( sch_register_in_use[1][sch_r2_reg_nr] ) { stalled = true; }
  }

  /* the write port(s) must be free at the queue level where the result
     will be entered (1+sch_latency).  An instruction without a
     destination (e.g. nop) needs no write port at all. */
  if ( sch_w0_reg_nr ){
    if ( sch_w1_reg_nr ){                      /* need two free write ports: */
      if ( (sch_Q_write_to_reg_nr[1+sch_latency][0]) ||
           (sch_Q_write_to_reg_nr[1+sch_latency][1]) ) {
        stalled = true;
      }
    }else{                               /* need at least 1 free write port: */
      if ( (sch_Q_write_to_reg_nr[1+sch_latency][0]) &&
           (sch_Q_write_to_reg_nr[1+sch_latency][1]) ) {
        stalled = true;
      }else{
        /* port 0 is taken but port 1 is free: use port 1 instead */
        if ( sch_Q_write_to_reg_nr[1+sch_latency][0] ) {
          swapped = true;
        }
      }
    }
  } /* else: don't need any write port */

/*%%%%%%%%%%%%%%%% add instruction to score board and write queue: %%%%%%%%%%*/

  /* clear the top level (8) of the queues; register 0 is skipped: */
  for (reg_loop=1; reg_loop<64; reg_loop++){
    sch_register_in_use[8][reg_loop] = false;
  }
  for (reg_loop=0; reg_loop<2; reg_loop++){
    sch_Q_write_to_reg_nr   [8][reg_loop] = 0; /* -> 0 = free */
    sch_Q_write_from_port_nr[8][reg_loop] = 0; /* -> clear not strictly needed */
  }

  /* TODO (from the original author): add IDIV counter to the top level
     when it reaches level 8 !? */

  if ( !stalled ){

    /* (no counters are set for IDIV yet!) */

    /* mark the output register(s) busy in scoreboard levels
       2..sch_latency+1.  For a register move sch_latency==0, so nothing
       is marked. */
    if (sch_w0_reg_nr){
      for (level_loop=1; level_loop<=sch_latency; level_loop++){
        sch_register_in_use[level_loop+1][sch_w0_reg_nr] = true;
      }
    }
    if (sch_w1_reg_nr){
      for (level_loop=1; level_loop<=sch_latency; level_loop++){
        sch_register_in_use[level_loop+1][sch_w1_reg_nr] = true;
      }
    }

    if ( !swapped ){
      /* enter the result register nr and the x-bar port nr in the write
         queue.  Even with sch_latency==0 the register nr is needed in
         the queue, to detect the next bypass and to do the register
         write-back. */
      if (sch_w0_reg_nr){
        sch_Q_write_to_reg_nr   [sch_latency+1][0] = sch_w0_reg_nr;
        sch_Q_write_from_port_nr[sch_latency+1][0] = sch_w0_port_nr;
      }
      if (sch_w1_reg_nr){
        sch_Q_write_to_reg_nr   [sch_latency+1][1] = sch_w1_reg_nr;
        sch_Q_write_from_port_nr[sch_latency+1][1] = sch_w1_port_nr;
      }
    }else{
      /* a (w0->w1) swap is only possible when a single result is
         written, so only the w0 result needs to be entered, on port 1 */
      sch_Q_write_to_reg_nr   [sch_latency+1][1] = sch_w0_reg_nr;
      sch_Q_write_from_port_nr[sch_latency+1][1] = sch_w0_port_nr;
    }
  }

/*%%%%%%%%%%%%%%%% do an x-bar write if it's indicated by the queue: %%%%%%%%*/
  /* done AFTER updating the queue, so that a 0-cycle register move
     (entered at level 1 just above) is seen here.
     look at "level 1" in the queue */

  /* instruct the xbar to do a write from the EU output to a register: */
  sch_xbar_w0_reg_nr  = 0; /* --> 0 = no write */
  sch_xbar_w0_port_nr = 0; /* --> not needed */
  sch_xbar_w1_reg_nr  = 0; /* --> 0 = no write */
  sch_xbar_w1_port_nr = 0; /* --> not needed */
  if ( sch_Q_write_to_reg_nr[1][0] ){
    sch_xbar_w0_reg_nr  = sch_Q_write_to_reg_nr   [1][0];
    sch_xbar_w0_port_nr = sch_Q_write_from_port_nr[1][0];
  }
  if ( sch_Q_write_to_reg_nr[1][1] ){
    sch_xbar_w1_reg_nr  = sch_Q_write_to_reg_nr   [1][1];
    sch_xbar_w1_port_nr = sch_Q_write_from_port_nr[1][1];
  }

/*%%%%%%%%%%%%%%%% detect register bypass: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/

/*Take the 3 operands from the fetcher and find out whether the
  instruction that is now being decoded could need a register bypass.
  There are 5 possible results:
    - PORT_READ_FROM_REGISTER      (no bypass)
    - PORT_READ_FROM_BYPASS_0      (bypass from write bus 0)
    - PORT_READ_FROM_BYPASS_0_BIS  (1-cycle delayed bypass from write bus 0)
    - PORT_READ_FROM_BYPASS_1      (bypass from write bus 1)
    - PORT_READ_FROM_BYPASS_1_BIS  (1-cycle delayed bypass from write bus 1)
  Watch out: at this point it is still unclear whether the operands were
  actually register numbers!  Doing a bypass when the operand was (part
  of) flags could destroy 8/18-bit immediate data on the xbar bus, so the
  consumer must check for that BEFORE doing the bypass.
  Watch out: "no bypass" does NOT mean that the register value is ok; a
  pipeline stall could still be needed.  After the stall cycle a bypass
  is usually needed, but the stall could also have a different cause,
  like waiting for a free write slot.
  Watch out: with a write-after-write both a normal and a delayed bypass
  can match.  The normal bypass carries the newer value and must win, so
  for every read bus both delayed (level 0) slots are tested first and
  both normal (level 1) slots last; a later match overwrites an earlier
  one. */

/*Level 1 is the bottom of the write queue in normal use; level 0 is only
  used for the delayed bypass.  Level 0 of the scoreboard is not used
  (and not shifted).
  operand_1 = R0,  operand_2 = R1,  operand_dst = R2
  Register 0 is hardwired to 0 and empty queue slots also hold 0, so each
  operand is tested for non-zero before it is compared. */

  sch_bypass_r0 = PORT_READ_FROM_REGISTER; /* if no reg nr match is found */
  sch_bypass_r1 = PORT_READ_FROM_REGISTER;
  sch_bypass_r2 = PORT_READ_FROM_REGISTER;

  if ( stalled ){  /* bypass detection for a stalled instruction */
    sch_operand_1  = tmp_sch_operand_1;   /* ignore what we get from the  */
    sch_operand_2  = tmp_sch_operand_2;   /* fetcher and use the latched  */
    sch_operand_dst= tmp_sch_operand_dst; /* operand values instead       */
  }

  /* check for bypass of read bus 0: */
  if ( sch_operand_1 ){
    if ( sch_operand_1  == sch_Q_write_to_reg_nr[0][0] )
                              { sch_bypass_r0 = PORT_READ_FROM_BYPASS_0_BIS; }
    if ( sch_operand_1  == sch_Q_write_to_reg_nr[0][1] )
                              { sch_bypass_r0 = PORT_READ_FROM_BYPASS_1_BIS; }
    if ( sch_operand_1  == sch_Q_write_to_reg_nr[1][0] )
                              { sch_bypass_r0 = PORT_READ_FROM_BYPASS_0; }
    if ( sch_operand_1  == sch_Q_write_to_reg_nr[1][1] )
                              { sch_bypass_r0 = PORT_READ_FROM_BYPASS_1; }
  }

  /* check for bypass of read bus 1: */
  if ( sch_operand_2 ){
    if ( sch_operand_2  == sch_Q_write_to_reg_nr[0][0] )
                              { sch_bypass_r1 = PORT_READ_FROM_BYPASS_0_BIS; }
    if ( sch_operand_2  == sch_Q_write_to_reg_nr[0][1] )
                              { sch_bypass_r1 = PORT_READ_FROM_BYPASS_1_BIS; }
    if ( sch_operand_2  == sch_Q_write_to_reg_nr[1][0] )
                              { sch_bypass_r1 = PORT_READ_FROM_BYPASS_0; }
    if ( sch_operand_2  == sch_Q_write_to_reg_nr[1][1] )
                              { sch_bypass_r1 = PORT_READ_FROM_BYPASS_1; }
  }

  /* check for bypass of read bus 2 (guarded by its own operand): */
  if ( sch_operand_dst ){
    if ( sch_operand_dst== sch_Q_write_to_reg_nr[0][0] )
                              { sch_bypass_r2 = PORT_READ_FROM_BYPASS_0_BIS; }
    if ( sch_operand_dst== sch_Q_write_to_reg_nr[0][1] )
                              { sch_bypass_r2 = PORT_READ_FROM_BYPASS_1_BIS; }
    if ( sch_operand_dst== sch_Q_write_to_reg_nr[1][0] )
                              { sch_bypass_r2 = PORT_READ_FROM_BYPASS_0; }
    if ( sch_operand_dst== sch_Q_write_to_reg_nr[1][1] )
                              { sch_bypass_r2 = PORT_READ_FROM_BYPASS_1; }
  }

/*%%%%%%%%%%%%%%%% clock scoreboard (and clock write queue): %%%%%%%%%%%%%%%%*/

/*(in hardware this is done by the stage clock after each cycle)
  rotating pointers to the levels could be used if this simulates too
  slowly.  Some FIFOs do not use register 0 and/or level 0. */

  /* clock the scoreboard FIFO (levels 1..8, registers 1..63) */
  for (level_loop=1; level_loop<8; level_loop++){
    for (reg_loop=1; reg_loop<64; reg_loop++){
      sch_register_in_use[level_loop  ][reg_loop] =
      sch_register_in_use[level_loop+1][reg_loop];
    }
  }

  /* clock the sch_Q_write_to_reg_nr FIFO; this one also shifts into
     level 0, which feeds the delayed bypass and the register write-back */
  for (level_loop=0; level_loop<8; level_loop++){
    for (reg_loop=0; reg_loop<2; reg_loop++){
      sch_Q_write_to_reg_nr   [level_loop  ][reg_loop] =
      sch_Q_write_to_reg_nr   [level_loop+1][reg_loop];
    }
  }
  /* clock the sch_Q_write_from_port_nr FIFO (level 0 is not used) */
  for (level_loop=1; level_loop<8; level_loop++){
    for (reg_loop=0; reg_loop<2; reg_loop++){
      sch_Q_write_from_port_nr[level_loop  ][reg_loop] =
      sch_Q_write_from_port_nr[level_loop+1][reg_loop];
    }
  }

/*%%%%%%%%%%%%%%%% delay operand data from fetcher: %%%%%%%%%%%%%%%%*/
  /* latch the operands of the accepted instruction; they are replayed
     above for bypass detection while the pipeline is stalled */

  if ( !stalled ){
    tmp_sch_operand_1  = sch_operand_1;
    tmp_sch_operand_2  = sch_operand_2;
    tmp_sch_operand_dst= sch_operand_dst;
  }

}



