-- inc.vhdl -- F-CPU 64-Bit Increment Unit
-- Copyright (C) 2000 Erik Hansen <erik.hansen@berlin.de>
--
-- This program is free software; you can redistribute it and/or modify
-- it under the terms of the GNU General Public License as published by
-- the Free Software Foundation; either version 2 of the License, or
-- (at your option) any later version.
--
-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with this program; if not, write to the Free Software
-- Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
--
-- Version 0.1 2000/11/20

library ieee;
use ieee.std_logic_1164.all;

entity inc is

  -- Port summary (declaration order is unchanged, so positional
  -- associations in instantiating designs keep working):
  --
  --   r1    input value 1; optionally inverted (iin) and/or bit-reversed
  --         (rev) before it enters the carry chain
  --   r2    input value 2; only read by the compare path (r1 xnor r2)
  --   r3    output value 1, produced by the output mux
  --   r4    output value 2 (dest reg + 1)
  --         NOTE(review): no statement in architecture struct_1 drives
  --         this port -- confirm how it is meant to be used
  --   iin   '1' inverts input value r1
  --   rev   '1' reverses all bits
  --   cmp   '1' makes this unit compare r1 and r2
  --   inc   carry-in of each byte's carry chain; default set to '1'
  --   scan  '1' selects scan mode
  --   oin   '1' inverts tmp_inc
  --   smin  select min   (NOTE(review): not read by architecture struct_1)
  --   smax  select max   (NOTE(review): not read by architecture struct_1)
  --   sabs  select absolute value
  --   U08, U16, U32  together encode the data size (see SIMD MODES)

  port (
    r1, r2                        : in  std_ulogic_vector(63 downto 0);
    r3, r4                        : out std_ulogic_vector(63 downto 0);
    iin, rev, cmp, inc, scan, oin : in  std_ulogic;
    smin, smax, sabs              : in  std_ulogic;
    U08, U16, U32                 : in  std_ulogic);

end inc;

-- KNOWN ISSUES
-- ============
--
-- 1. This Unit is not fully operational and might be very buggy!
-- 2. Entity Declaration is subject to change!
--    I'm not quite sure whether the coding scheme
--    for the operations is appropriate or not.
-- 3. Code is somewhat chaotic. (Sorry! I will clean up asap)
-- 4. Not faster than iadd
--    inc r1, r3 has no advantage over adding 1 with the add command,
--    which makes a specialized inc unit quite useless, but it
--    can do a lot of other things.
--    Normally inc could be done with a logic depth of d=4 (3 for carry, 1 for
--    increment); inc and dec together would need d=6 (1 input inv., 3 carry,
--    1 increment, 1 output inv.). Together with the muxes for input and output
--    and the compare logic, inc needs even more time :-(
-- 5. I probably have forgotten something...
--    Don't be surprised if inc does not work as expected.
-- 
--
-- STRUCTURE
-- =========
-- (cf file ../doc/inc.html, more is still to come)
--
--  d
--    ******************************
--  1 *    xnor     *     iin      *
--    ******************************
--  2 *           imux             *
--  3 *                            *
--    ******************************
--  4 *                            *
--  5 *           carry            *
--  6 *                            *
--    ******************************
--  7 *    scan     *     inc      *
--    ******************************
--  8 *             *     oin      *
--  9 *     cmp     ****************
-- 10 *             *
--    ******************************
-- 11 *                            *
-- 12 *           omux             *
-- 13 *                            *
--    ******************************
--
--
-- OPERATION MODES
-- ===============
--
-- command iin rev cmp inc scan oin smin smax sabs
-- -----------------------------------------------
-- inc      0   0   0   1    0   0    0    0    0 
-- dec      1   0   0   1    0   1    0    0    0
-- neg      1   0   0   1    0   0    0    0    0
-- lsb0     0   0   0   1    1   0    0    0    0  
-- lsb1     1   0   0   1    1   0    0    0    0
-- msb0     0   1   0   1    1   0    0    0    0
-- msb1     1   1   0   1    1   0    0    0    0
-- cmpl     0   0   1   1    0   0    0    0    0
-- cmpli    0   0   1   1    0   0    0    0    0  
-- cmple    *   *   *   *    *   *    *    *    *
-- cmplei   *   *   *   *    *   *    *    *    *
-- abs      1   0   0   1    0   0    0    0    1
-- max      0   0   1   1    0   0    0    1    0
-- maxi     0   0   1   1    0   0    0    1    0
-- min      0   0   1   1    0   0    1    0    0
-- mini     0   0   1   1    0   0    1    0    0
-- sort     0   0   1   1    0   0    1    0    0
--
-- comments
--
--   cmple(i)
--     these functions are not implemented
--
--   lsb0, lsb1, msb0, msb1
--     in this version of inc the results are not
--     encoded. They are written to the output
--     register in a 1-out-of-n (8,16,32,64) code.
--
--   i - operations
--     these operations are handled by the operation
--     decoder. Inc receives the immediate values
--     at its normal r1,r2 inputs.
--
--   max(i), min(i), sort
--     omux is not prepared for these operations.
--     Therefore they are not functional.
--
--   sort
--     As sort is almost the same as min(i) these
--     functions are handled the same way in this
--     unit. At writeback it is decided whether to
--     write back r3+1 i.e. r4 as well or not.
--
--
-- SIMD MODES (same as in iadd)
-- ============================
--
-- U08 = '0', U16 = '0', U32 = '0': 8-bit mode
-- U08 = '1', U16 = '0', U32 = '0': 16-bit mode
-- U08 = '1', U16 = '1', U32 = '0': 32-bit mode
-- U08 = '1', U16 = '1', U32 = '1': 64-bit mode
-- (other combinations are invalid)
--
-- comment
--   Inc generally works in simd mode using
--   the given data width.
--   Which bytes are relevant is supposed to be decided on
--   write-back
--
-- TO DO
-- =====
--
-- * make code more readable, make use of blocks,
--   consistent signal naming, include assertions
--   for testing.
-- * Add generic for data size
-- * Add omux support for min, max and sort functions
-- * encode scan results, i.e. 01000000 => 00000111
--   This would add a lot more gates, but it can be done
--   while the compare result is calculated.
--   Must be done in d=3 or even d=4, which
--   should not be too difficult
-- * reduce logic depth
-- * try to use smaller gates ( no 9 input and gates)
-- * OPTIMIZE!
-- * test,test,test
-- (* change to nor and nand for CMOS optimisation)
-- * I'm sure that there was still more to do..


-- Let's go. Good Luck

architecture struct_1 of inc is

  -- internal control: inverted copies of the mode inputs
  signal nrev           : std_ulogic;                      -- not(rev)
  signal ncmp           : std_ulogic;                      -- not(cmp)
  signal nscan          : std_ulogic;                      -- not(scan)
  signal nu08           : std_ulogic;                      -- not(u08)
  signal nu16           : std_ulogic;                      -- not(u16)
  signal nu32           : std_ulogic;                      -- not(u32)
  signal nsabs          : std_ulogic;                      -- not(sabs)
  signal nscan_and_ncmp : std_ulogic;                      -- not(scan) AND not(cmp)
                                                           -- AND not(sabs); the name
                                                           -- is historical
  -- signals for the preparation of the input
  signal tmp_iin        : std_ulogic_vector(63 downto 0);  -- r1 xor iin, i.e.
                                                           -- if iin = '1' then
                                                           -- not(r1) else r1
  signal tmp_xnor       : std_ulogic_vector(63 downto 0);  -- r1 xnor r2
  -- signals used by the input multiplexer
  signal tmp_in_rev     : std_ulogic_vector(63 downto 0);  -- tmp_iin with all 64
                                                           -- bits reversed, gated
                                                           -- by rev
  signal tmp_in_dir     : std_ulogic_vector(63 downto 0);  -- tmp_iin unreversed,
                                                           -- gated by nrev, ncmp
  signal tmp_in_cmp     : std_ulogic_vector(63 downto 0);  -- tmp_xnor with all 64
                                                           -- bits reversed, gated
                                                           -- by cmp
  signal tmp_inp        : std_ulogic_vector(63 downto 0);  -- selected input value:
                                                           -- tmp_in_dir or
                                                           -- tmp_in_rev or
                                                           -- tmp_in_cmp
  -- signals used in the compare part
  signal tmp_cmp2       : std_ulogic_vector(63 downto 0);  -- re-reversed scan
                                                           -- result AND r1
  signal tmp_cmp3       : std_ulogic_vector( 7 downto 0);  -- OR of tmp_cmp2 over
                                                           -- each byte
  signal tmp_cmp4       : std_ulogic_vector( 6 downto 0);  -- OR of tmp_cmp3 over
                                                           -- 16 bit words (0..3),
                                                           -- 32 bit words (4..5)
                                                           -- and all 64 bits (6)
  -- signals used to calculate carry
  signal tmp_cry1       : std_ulogic_vector(63 downto 0);  -- carry inside a byte
  signal tmp_cry2       : std_ulogic_vector(63 downto 0);  -- carry terms selected
  signal tmp_cry3       : std_ulogic_vector(63 downto 0);  --   by the data size;
  signal tmp_cry4       : std_ulogic_vector(63 downto 0);  --   only the bytes that
  signal tmp_cry5       : std_ulogic_vector(63 downto 0);  --   need a term drive it
  signal tmp_cryo       : std_ulogic_vector( 7 downto 0);  -- carry overflow at msb
                                                           -- of every byte
  signal tmp_ncryo      : std_ulogic_vector( 7 downto 0);  -- not(tmp_cryo)
  signal tmp_cry        : std_ulogic_vector(63 downto 0);  -- final carry value
  -- operation results
  signal tmp_inc        : std_ulogic_vector(63 downto 0);  -- tmp_inp xor tmp_cry
                                                           -- (tmp_inp + 1)
  signal tmp_scan       : std_ulogic_vector(63 downto 0);  -- position of lsb0 in
                                                           -- tmp_inp
  signal tmp_oin        : std_ulogic_vector(63 downto 0);  -- tmp_inc xor oin
  -- signals used by the abs function
  signal tmp_abs_sel1   : std_ulogic_vector(31 downto 0);  -- four AND terms per byte
  signal tmp_abs_sel2   : std_ulogic_vector( 7 downto 0);  -- OR of the four terms
  signal tmp_abs_sel3   : std_ulogic_vector( 7 downto 0);  -- not(tmp_abs_sel2)
  -- signals used by the output mux
  signal tmp_out_scan   : std_ulogic_vector(63 downto 0);  -- scan result for lsbx
  signal tmp_out_scan_r : std_ulogic_vector(63 downto 0);  -- scan result for msbx
  signal tmp_out_dir    : std_ulogic_vector(63 downto 0);  -- out of inc result
  signal tmp_out_cmp08  : std_ulogic_vector(63 downto 0);  -- compare res.  8 bit
  signal tmp_out_cmp16  : std_ulogic_vector(63 downto 0);  -- compare res. 16 bit
  signal tmp_out_cmp32  : std_ulogic_vector(63 downto 0);  -- compare res. 32 bit
  signal tmp_out_cmp64  : std_ulogic_vector(63 downto 0);  -- compare res. 64 bit
  signal tmp_out_r1     : std_ulogic_vector(63 downto 0);  -- r3 = r1
  signal tmp_sel_dir1   : std_ulogic_vector( 7 downto 0);  -- tmp_abs_sel2 AND sabs
  signal tmp_sel_dir2   : std_ulogic_vector( 7 downto 0);  -- per byte enable for
                                                           --   tmp_out_dir
  -----------------------------------------------------------------------------
  -- Gate primitives instantiated by this architecture.
  -- Only the interfaces are declared here; the cells themselves live
  -- outside this file and are expected to behave as their names indicate.
  -- Inputs are named A, B, C, ... in order and the single output is Y.
  -----------------------------------------------------------------------------

  -- NOT Gate

  component NOT1
    port (A : in std_ulogic; Y : out std_ulogic);
  end component;

  -- AND Gates (2 to 9 inputs)

  component AND2
    port (A, B : in std_ulogic; Y : out std_ulogic);
  end component;

  component AND3
    port (A, B, C : in std_ulogic; Y : out std_ulogic);
  end component;

  component AND4
    port (A, B, C, D : in std_ulogic; Y : out std_ulogic);
  end component;

  component AND5
    port (A, B, C, D, E : in std_ulogic; Y : out std_ulogic);
  end component;

  component AND6
    port (A, B, C, D, E, F : in std_ulogic; Y : out std_ulogic);
  end component;

  component AND7
    port (A, B, C, D, E, F, G : in std_ulogic; Y : out std_ulogic);
  end component;

  component AND8
    port (A, B, C, D, E, F, G, H : in std_ulogic; Y : out std_ulogic);
  end component;

  component AND9
    port (A, B, C, D, E, F, G, H, I : in std_ulogic; Y : out std_ulogic);
  end component;

  -- OR Gates (2, 3, 4 and 8 inputs)

  component OR2
    port (A, B : in std_ulogic; Y : out std_ulogic);
  end component;

  component OR3
    port (A, B, C : in std_ulogic; Y : out std_ulogic);
  end component;

  component OR4
    port (A, B, C, D : in std_ulogic; Y : out std_ulogic);
  end component;

  component OR8
    port (A, B, C, D, E, F, G, H : in std_ulogic; Y : out std_ulogic);
  end component;

  -- XOR Gate

  component XOR2
    port (A, B : in std_ulogic; Y : out std_ulogic);
  end component;

  -- XNOR Gate

  component XNOR2
    port (A, B : in std_ulogic; Y : out std_ulogic);
  end component;
  
begin

  -----------------------------------------------------------------------------
  -- Stage one (d=1)
  -- Input conditioning
  -----------------------------------------------------------------------------

  -- Conditional inversion of r1: every bit of r1 goes through an XOR2
  -- cell together with iin, so (given a plain XOR cell)
  --   iin = '1'  =>  tmp_iin = not(r1)
  --   iin = '0'  =>  tmp_iin = r1

  input_inverter: for i in 63 downto 0 generate

    inverter : xor2 port map (A => r1(i), B => iin, Y => tmp_iin(i));

  end generate input_inverter;

  -- Compare preparation: the operands are combined bit by bit,
  -- tmp_xnor = r1 xnor r2 (one XNOR2 cell per bit position).

  r1_xnor_r2: for i in 63 downto 0 generate

    r1_xnor_r2 : xnor2 port map (A => r1(i), B => r2(i), Y => tmp_xnor(i));

  end generate r1_xnor_r2;

  -- Inverted copies of the control inputs; the n-prefixed signals are
  -- used as enables by the multiplexers and the carry logic below.

  inv_cmp  : not1 port map (A => cmp,  Y => ncmp);
  inv_rev  : not1 port map (A => rev,  Y => nrev);
  inv_scan : not1 port map (A => scan, Y => nscan);

  -- data size encoding

  inv_u08  : not1 port map (A => u08,  Y => nu08);
  inv_u16  : not1 port map (A => u16,  Y => nu16);
  inv_u32  : not1 port map (A => u32,  Y => nu32);

  -- abs select

  inv_sabs : NOT1 port map (A => sabs, Y => nsabs);
  
  -----------------------------------------------------------------------------
  -- Stage two (d=2)
  -- Input multiplexer, AND plane.
  -- Each of the three candidate inputs (tmp_iin, tmp_iin reversed,
  -- tmp_xnor reversed) is gated by rev / cmp; stage three ORs them.
  -----------------------------------------------------------------------------

  -- IMUX PART ONE ( and gates )

  select_in: for i in 63 downto 0 generate

    -- direct input for the inc, dec, neg, abs and lsbx functions:
    -- passes tmp_iin(i) only while rev = '0' and cmp = '0'

    and_direct : AND3 port map (
      A => tmp_iin(i), B => nrev, C => ncmp,
      Y => tmp_in_dir(i));

    -- reversed input for the msbx functions:
    -- bit i receives tmp_iin(63-i) while rev = '1'

    and_rev : AND2 port map (
      A => tmp_iin(63-i), B => rev,
      Y => tmp_in_rev(i));

    -- input for cmpl(e)(i), min(i), max(i), sort:
    -- bit i receives tmp_xnor(63-i) while cmp = '1'

    and_cmp : AND2 port map (
      A => tmp_xnor(63-i), B => cmp,
      Y => tmp_in_cmp(i));

  end generate select_in;

  -- Precalculation for the abs function (AND plane of a mux).
  -- For every byte, four terms are formed; each one passes the top bit
  -- of the 8, 16, 32 or 64 bit element containing that byte, enabled
  -- by the matching data size code:
  --   term 0 : r1 top bit of the byte          and U08 = '0'
  --   term 1 : r1 top bit of the 16 bit word   and U08 = '1', U16 = '0'
  --   term 2 : r1 top bit of the 32 bit word   and U16 = '1', U32 = '0'
  --   term 3 : r1(63)                          and U32 = '1'

  abs_sel_and: for byte_no in 7 downto 0 generate

    gate_08 : AND2 port map (
      A => r1((byte_no*8)+7), B => nu08,
      Y => tmp_abs_sel1(byte_no*4));

    gate_16 : AND3 port map (
      A => r1(((byte_no/2)*16)+15), B => u08, C => nu16,
      Y => tmp_abs_sel1((byte_no*4)+1));

    gate_32 : AND3 port map (
      A => r1(((byte_no/4)*32)+31), B => u16, C => nu32,
      Y => tmp_abs_sel1((byte_no*4)+2));

    gate_64 : AND2 port map (
      A => r1(63), B => u32,
      Y => tmp_abs_sel1((byte_no*4)+3));

  end generate abs_sel_and;
  
  -----------------------------------------------------------------------------
  -- Stage three (d=3)
  -- Input multiplexer, OR plane
  -----------------------------------------------------------------------------

  -- Part Two of IMUX (OR GATES): merges the three gated candidates of
  -- stage two into the operand tmp_inp used by all later stages.

  in_mux_or: for i in 63 downto 0 generate

    imuxor : OR3 port map (
      A => tmp_in_dir(i), B => tmp_in_rev(i), C => tmp_in_cmp(i),
      Y => tmp_inp(i));

  end generate in_mux_or;

  -- Further calculation for abs (OR plane of the mux): the four size
  -- specific terms of each byte are merged into one bit per byte.

  abs_sel_or: for byte_no in 7 downto 0 generate

    gate : OR4 port map (
      A => tmp_abs_sel1(byte_no*4),
      B => tmp_abs_sel1((byte_no*4)+1),
      C => tmp_abs_sel1((byte_no*4)+2),
      D => tmp_abs_sel1((byte_no*4)+3),
      Y => tmp_abs_sel2(byte_no));

  end generate abs_sel_or;

  -----------------------------------------------------------------------------
  -- Stage four (d=4)
  -- calculation of carry part I
  -----------------------------------------------------------------------------

  -- Carry inside each byte.  For bit position p (0..7) of a byte,
  -- tmp_cry1 is the AND of inc with all lower tmp_inp bits of the same
  -- byte, so the gate width grows from a plain wire (p = 0) to AND8
  -- (p = 7).  Byte boundaries are not crossed in this stage.

  carry_step1: for k in 63 downto 0 generate

    -- p = 0: carry-in of the byte is inc itself
    bit_zero: if (k mod 8) = 0 generate
      tmp_cry1(k) <= inc;
    end generate bit_zero;

    -- p = 1
    bit_one: if (k mod 8) = 1 generate
      gate : AND2 port map (
        A => inc, B => tmp_inp(k-1),
        Y => tmp_cry1(k));
    end generate bit_one;

    -- p = 2
    bit_two: if (k mod 8) = 2 generate
      gate : AND3 port map (
        A => inc, B => tmp_inp(k-1), C => tmp_inp(k-2),
        Y => tmp_cry1(k));
    end generate bit_two;

    -- p = 3
    bit_three: if (k mod 8) = 3 generate
      gate : AND4 port map (
        A => inc, B => tmp_inp(k-1), C => tmp_inp(k-2), D => tmp_inp(k-3),
        Y => tmp_cry1(k));
    end generate bit_three;

    -- p = 4
    bit_four: if (k mod 8) = 4 generate
      gate : AND5 port map (
        A => inc, B => tmp_inp(k-1), C => tmp_inp(k-2), D => tmp_inp(k-3),
        E => tmp_inp(k-4),
        Y => tmp_cry1(k));
    end generate bit_four;

    -- p = 5
    bit_five: if (k mod 8) = 5 generate
      gate : AND6 port map (
        A => inc, B => tmp_inp(k-1), C => tmp_inp(k-2), D => tmp_inp(k-3),
        E => tmp_inp(k-4), F => tmp_inp(k-5),
        Y => tmp_cry1(k));
    end generate bit_five;

    -- p = 6
    bit_six: if (k mod 8) = 6 generate
      gate : AND7 port map (
        A => inc, B => tmp_inp(k-1), C => tmp_inp(k-2), D => tmp_inp(k-3),
        E => tmp_inp(k-4), F => tmp_inp(k-5), G => tmp_inp(k-6),
        Y => tmp_cry1(k));
    end generate bit_six;

    -- p = 7
    bit_seven: if (k mod 8) = 7 generate
      gate : AND8 port map (
        A => inc, B => tmp_inp(k-1), C => tmp_inp(k-2), D => tmp_inp(k-3),
        E => tmp_inp(k-4), F => tmp_inp(k-5), G => tmp_inp(k-6),
        H => tmp_inp(k-7),
        Y => tmp_cry1(k));
    end generate bit_seven;

  end generate carry_step1;

  -- Byte carries: tmp_cryo(b) is the AND of inc with all eight tmp_inp
  -- bits of byte b, i.e. the carry leaving that byte.

  cry_overflow: for b in 7 downto 0 generate

    constant lsb : natural := b*8;      -- lowest bit position of byte b

  begin

    gate : AND9 port map (
      A => inc,
      B => tmp_inp(lsb),   C => tmp_inp(lsb+1),
      D => tmp_inp(lsb+2), E => tmp_inp(lsb+3),
      F => tmp_inp(lsb+4), G => tmp_inp(lsb+5),
      H => tmp_inp(lsb+6), I => tmp_inp(lsb+7),
      Y => tmp_cryo(b));

  end generate cry_overflow;

  -- abs, continued: tmp_abs_sel3 is the per byte complement of
  -- tmp_abs_sel2.  The output mux uses tmp_abs_sel2 (via tmp_sel_dir1)
  -- to pass the incremented value and tmp_abs_sel3 to pass r1
  -- unchanged into r3.

  negate_abs_sel: for byte_no in 7 downto 0 generate

    gate : NOT1 port map (A => tmp_abs_sel2(byte_no), Y => tmp_abs_sel3(byte_no));

  end generate negate_abs_sel;

  -----------------------------------------------------------------------------
  -- Stage five (d=5)
  -- calculation of carry bit stage II
  -----------------------------------------------------------------------------

  -- Combination of the byte local carry (tmp_cry1) with the carries
  -- leaving the lower bytes (tmp_cryo), qualified by the data size.
  -- Each byte gets one AND term per data size in which it can receive
  -- a carry:
  --   tmp_cry2 : the byte is the lowest byte of its element
  --   tmp_cry3, tmp_cry4, tmp_cry5 : the carry has to ripple through
  --              1..7 lower bytes of a wider element
  -- Stage six ORs the terms of each byte.

  carry_partII: for k in 63 downto 0 generate

    -- byte 0 never receives a carry from another byte

    byte_zero: if (k / 8) = 0 generate
      tmp_cry2(k) <= tmp_cry1(k);
    end generate byte_zero;

    -- byte 1: own element when u08 = '0', else upper byte of word 0

    byte_one: if (k / 8) = 1 generate

      gate0 : AND2 port map (
        A => tmp_cry1(k), B => nu08,
        Y => tmp_cry2(k));

      gate1 : AND3 port map (
        A => tmp_cry1(k), B => tmp_cryo(0), C => u08,
        Y => tmp_cry3(k));

    end generate byte_one;

    -- byte 2: starts an element when u16 = '0', else follows bytes 0..1

    byte_two: if (k / 8) = 2 generate

      gate0 : AND2 port map (
        A => tmp_cry1(k), B => nu16,
        Y => tmp_cry2(k));

      gate1 : AND4 port map (
        A => tmp_cry1(k), B => tmp_cryo(0), C => tmp_cryo(1), D => u16,
        Y => tmp_cry3(k));

    end generate byte_two;

    -- byte 3: alone (u08 = '0'), after byte 2 (u08 = '1', u16 = '0')
    -- or after bytes 0..2 (u16 = '1')

    byte_three : if (k / 8) = 3 generate

      gate0 : AND2 port map (
        A => tmp_cry1(k), B => nu08,
        Y => tmp_cry2(k));

      gate1 : AND4 port map (
        A => tmp_cry1(k), B => tmp_cryo(2), C => nu16, D => u08,
        Y => tmp_cry3(k));

      gate2 : AND5 port map (
        A => tmp_cry1(k), B => tmp_cryo(0), C => tmp_cryo(1),
        D => tmp_cryo(2), E => u16,
        Y => tmp_cry4(k));

    end generate byte_three;

    -- byte 4: starts an element when u32 = '0', else follows bytes 0..3

    byte_four: if (k / 8) = 4 generate

      gate0 : AND2 port map (
        A => tmp_cry1(k), B => nu32,
        Y => tmp_cry2(k));

      gate1 : AND6 port map (
        A => tmp_cry1(k), B => tmp_cryo(0), C => tmp_cryo(1),
        D => tmp_cryo(2), E => tmp_cryo(3), F => u32,
        Y => tmp_cry3(k));

    end generate byte_four;

    -- byte 5: alone (u08 = '0'), after byte 4 (u08 = '1', u32 = '0')
    -- or after bytes 0..4 (u32 = '1')

    byte_five: if (k / 8) = 5 generate

      gate0 : AND2 port map (
        A => tmp_cry1(k), B => nu08,
        Y => tmp_cry2(k));

      gate1 : AND4 port map (
        A => tmp_cry1(k), B => tmp_cryo(4), C => nu32, D => u08,
        Y => tmp_cry3(k));

      gate2 : AND7 port map (
        A => tmp_cry1(k), B => tmp_cryo(0), C => tmp_cryo(1),
        D => tmp_cryo(2), E => tmp_cryo(3), F => tmp_cryo(4), G => u32,
        Y => tmp_cry4(k));

    end generate byte_five;

    -- byte 6: starts an element when u16 = '0', follows bytes 4..5
    -- (u16 = '1', u32 = '0') or bytes 0..5 (u32 = '1')

    byte_six: if (k / 8) = 6 generate

      gate0 : AND2 port map (
        A => tmp_cry1(k), B => nu16,
        Y => tmp_cry2(k));

      gate1 : AND5 port map (
        A => tmp_cry1(k), B => tmp_cryo(4), C => tmp_cryo(5),
        D => u16, E => nu32,
        Y => tmp_cry3(k));

      gate2 : AND8 port map (
        A => tmp_cry1(k), B => tmp_cryo(0), C => tmp_cryo(1),
        D => tmp_cryo(2), E => tmp_cryo(3), F => tmp_cryo(4),
        G => tmp_cryo(5), H => u32,
        Y => tmp_cry4(k));

    end generate byte_six;

    -- byte 7: alone (u08 = '0'), after byte 6 (u08 = '1', u16 = '0'),
    -- after bytes 4..6 (u16 = '1', u32 = '0') or after bytes 0..6
    -- (u32 = '1')

    byte_seven: if (k / 8) = 7 generate

      gate0 : AND2 port map (
        A => tmp_cry1(k), B => nu08,
        Y => tmp_cry2(k));

      gate1 : AND4 port map (
        A => tmp_cry1(k), B => tmp_cryo(6), C => nu16, D => u08,
        Y => tmp_cry3(k));

      gate2 : AND6 port map (
        A => tmp_cry1(k), B => tmp_cryo(4), C => tmp_cryo(5),
        D => tmp_cryo(6), E => u16, F => nu32,
        Y => tmp_cry4(k));

      gate3 : AND9 port map (
        A => tmp_cry1(k), B => tmp_cryo(0), C => tmp_cryo(1),
        D => tmp_cryo(2), E => tmp_cryo(3), F => tmp_cryo(4),
        G => tmp_cryo(5), H => tmp_cryo(6), I => u32,
        Y => tmp_cry5(k));

    end generate byte_seven;

  end generate carry_partII;

  -- abs, continued: per byte enable for the incremented value while
  -- sabs = '1'; set for the bytes whose tmp_abs_sel2 bit is '1'.

  sel_direct_value_abs: for byte_no in 7 downto 0 generate

    gate : AND2 port map (
      A => tmp_abs_sel2(byte_no), B => sabs,
      Y => tmp_sel_dir1(byte_no));

  end generate sel_direct_value_abs;

  -- Enable for the plain inc/dec/neg result: active only while cmp,
  -- scan and sabs are all '0'.  Note that, despite its name, the
  -- output signal also includes not(sabs).

  inc_dec : AND3 port map (A => ncmp, B => nscan, C => nsabs, Y => nscan_and_ncmp);

  -- Inverted byte carries; consumed by the scan logic of stage seven
  -- at the top bit of every byte.

  carryo_inv: for byte_no in 7 downto 0 generate

    gate : NOT1 port map (A => tmp_cryo(byte_no), Y => tmp_ncryo(byte_no));

  end generate carryo_inv;

  -----------------------------------------------------------------------------
  -- Stage six (d=6)
  -- carry part III
  -----------------------------------------------------------------------------

  -- Final carry: ORs the size specific terms produced in stage five.
  -- The number of terms depends on the byte:
  --   byte 0          1 term  (wire)
  --   bytes 1, 2, 4   2 terms (tmp_cry2, tmp_cry3)
  --   bytes 3, 5, 6   3 terms (tmp_cry2 .. tmp_cry4)
  --   byte 7          4 terms (tmp_cry2 .. tmp_cry5)

  carry_part_III: for k in 63 downto 0 generate

    byte_0: if (k/8) = 0 generate
      tmp_cry(k) <= tmp_cry2(k);
    end generate byte_0;

    byte_124: if ((k/8) = 1) or ((k/8) = 2) or ((k/8) = 4) generate
      gate : OR2 port map (
        A => tmp_cry2(k), B => tmp_cry3(k),
        Y => tmp_cry(k));
    end generate byte_124;

    byte_356: if ((k/8) = 3) or ((k/8) = 5) or ((k/8) = 6)  generate
      gate : OR3 port map (
        A => tmp_cry2(k), B => tmp_cry3(k), C => tmp_cry4(k),
        Y => tmp_cry(k));
    end generate byte_356;

    byte_7: if (k/8) = 7 generate
      gate : OR4 port map (
        A => tmp_cry2(k), B => tmp_cry3(k), C => tmp_cry4(k), D => tmp_cry5(k),
        Y => tmp_cry(k));
    end generate byte_7;

  end generate carry_part_III;

  -- Per byte enable of the incremented value in the output mux:
  -- either the global inc/dec/neg enable (nscan_and_ncmp) or the abs
  -- specific enable of that byte (tmp_sel_dir1).

  sel_direct_value: for byte_no in 7 downto 0 generate

    gate : OR2 port map (
      A => nscan_and_ncmp, B => tmp_sel_dir1(byte_no),
      Y => tmp_sel_dir2(byte_no));

  end generate sel_direct_value;

  -----------------------------------------------------------------------------
  -- Stage seven (d=7)
  -- inc, dec, scan, ...
  -----------------------------------------------------------------------------

  -- The core of the unit: each operand bit is XORed with its carry,
  -- which adds one to every element of tmp_inp when inc = '1'.

  increment: for k in 63 downto 0 generate

    gate : XOR2 port map (A => tmp_inp(k), B => tmp_cry(k), Y => tmp_inc(k));

  end generate increment;

  -- Scan the carry bits for the first zero of tmp_inp.
  -- The carry chain is '1' up to and including the position of the
  -- lowest '0' bit of an element, so the scan bit of a position is set
  -- where the carry is '1' but is not passed on to the next position:
  --   * inside a byte the carry of the next bit (tmp_cry(n+1)) is
  --     compared with the carry of this bit (XOR2),
  --   * at the top bit of a byte there is no tmp_cry(n+1) within the
  --     byte, therefore the inverted byte carry tmp_ncryo is used (AND2).
  -- The result is a one hot position per element; it stays all zero
  -- for an element that contains no '0' bit.

  scanner: for bit_index in 63 downto 0 generate

  begin

    -- top bit of each byte

    msb: if (bit_index mod 8) = 7 generate

    begin

      gate : AND2 port map (
        A => tmp_cry(bit_index),
        B => tmp_ncryo(bit_index/8),
        Y => tmp_scan(bit_index));

    end generate msb;

    -- bits 0..6 of each byte

    other: if (bit_index mod 8) /= 7 generate

    begin

      gate: XOR2 port map (
        A => tmp_cry(bit_index),
        B => tmp_cry(bit_index+1),
        Y => tmp_scan(bit_index));

    end generate other;

  end generate scanner;

  -----------------------------------------------------------------------------
  -- Stage eight (d=8)
  -- invert output, cmp part II
  -----------------------------------------------------------------------------

  -- Conditional inversion of the incremented value: every bit of
  -- tmp_inc goes through an XOR2 cell together with oin, so
  --   oin = '1'  =>  tmp_oin = not(tmp_inc)
  --   oin = '0'  =>  tmp_oin = tmp_inc

  output_inverter: for k in 63 downto 0 generate

    gate : XOR2 port map (A => tmp_inc(k), B => oin, Y => tmp_oin(k));

  end generate output_inverter;

  -- Compare part II: r1 and msb1(r1 xor r2).
  -- The scan result is read in mirrored order (63-k), which undoes the
  -- bit reversal applied to tmp_xnor in the input mux, and is then
  -- masked with r1.  The result is either zero or not.

  compare_ii: for k in 63 downto 0 generate

    gate : AND2 port map (A => tmp_scan(63-k), B => r1(k), Y => tmp_cmp2(k));

  end generate compare_ii;
  
  -----------------------------------------------------------------------------
  -- Stage nine (d=9)
  -- compare results
  -----------------------------------------------------------------------------

  -- Byte wise zero check: tmp_cmp3(b) is the OR of the eight tmp_cmp2
  -- bits of byte b.

  compare_iii: for b in 7 downto 0 generate

    constant lsb : natural := b*8;      -- lowest bit position of byte b

  begin

    gate : OR8 port map (
      A => tmp_cmp2(lsb),   B => tmp_cmp2(lsb+1),
      C => tmp_cmp2(lsb+2), D => tmp_cmp2(lsb+3),
      E => tmp_cmp2(lsb+4), F => tmp_cmp2(lsb+5),
      G => tmp_cmp2(lsb+6), H => tmp_cmp2(lsb+7),
      Y => tmp_cmp3(b));

  end generate compare_iii;


  -----------------------------------------------------------------------------
  -- Stage ten (d=10)
  -- compare for simd modes
  -----------------------------------------------------------------------------

  -- Zero check per element for the wider data sizes, built from the
  -- byte results tmp_cmp3:
  --   tmp_cmp4(0..3) : the four 16 bit words
  --   tmp_cmp4(4..5) : the two 32 bit words
  --   tmp_cmp4(6)    : the whole 64 bit value

  -- 16 bit elements

  word_zero  : OR2 port map (A => tmp_cmp3(0), B => tmp_cmp3(1), Y => tmp_cmp4(0));
  word_one   : OR2 port map (A => tmp_cmp3(2), B => tmp_cmp3(3), Y => tmp_cmp4(1));
  word_two   : OR2 port map (A => tmp_cmp3(4), B => tmp_cmp3(5), Y => tmp_cmp4(2));
  word_three : OR2 port map (A => tmp_cmp3(6), B => tmp_cmp3(7), Y => tmp_cmp4(3));

  -- 32 bit elements

  dword_zero : OR4 port map (
    A => tmp_cmp3(0), B => tmp_cmp3(1), C => tmp_cmp3(2), D => tmp_cmp3(3),
    Y => tmp_cmp4(4));

  dword_one : OR4 port map (
    A => tmp_cmp3(4), B => tmp_cmp3(5), C => tmp_cmp3(6), D => tmp_cmp3(7),
    Y => tmp_cmp4(5));

  -- 64 bit element

  qword : OR8 port map (
    A => tmp_cmp3(0), B => tmp_cmp3(1), C => tmp_cmp3(2), D => tmp_cmp3(3),
    E => tmp_cmp3(4), F => tmp_cmp3(5), G => tmp_cmp3(6), H => tmp_cmp3(7),
    Y => tmp_cmp4(6));
  

  -----------------------------------------------------------------------------
  -- Stage eleven (d=11)
  -- omux part I
  -----------------------------------------------------------------------------

  -- AND plane of the output mux: eight gated candidates per result bit.
  -- More inputs will be needed once the min(i), max(i) and sort
  -- functions are implemented.
  --
  -- The compare candidates for 8, 16 and 32 bit are not disabled when
  -- a wider size is selected (and_cmp08 has no size qualifier at all);
  -- since every wider tmp_cmp4 result is an OR over the narrower ones,
  -- the OR plane of stage twelve still yields the widest enabled result.

  output_mux_ands: for k in 63 downto 0 generate

    -- incremented (and possibly inverted) value, enabled per byte

    and_direct : AND2 port map (
      A => tmp_sel_dir2(k/8), B => tmp_oin(k),
      Y => tmp_out_dir(k));

    -- scan value for the msbx functions: mirrored back (63 - k)

    and_reversed : AND3 port map (
      A => scan, B => rev, C => tmp_scan(63 - k),
      Y => tmp_out_scan_r(k));

    -- scan value for the lsbx functions

    and_sca : AND3 port map (
      A => scan, B => nrev, C => tmp_scan(k),
      Y => tmp_out_scan(k));

    -- compare result for 8 bit data: every bit of a byte gets the
    -- result of that byte

    and_cmp08 : AND2 port map (
      A => cmp, B => tmp_cmp3(k/8),
      Y => tmp_out_cmp08(k));

    -- compare result for 16 bit data

    and_cmp16 : AND3 port map (
      A => cmp, B => U08, C => tmp_cmp4((k/16)),
      Y => tmp_out_cmp16(k));

    -- compare result for 32 bit data

    and_cmp32 : AND3 port map (
      A => cmp, B => U16, C => tmp_cmp4((k/32)+4),
      Y => tmp_out_cmp32(k));

    -- compare result for 64 bit data

    and_cmp64 : AND3 port map (
      A => cmp, B => U32, C => tmp_cmp4(6),
      Y => tmp_out_cmp64(k));

    -- unchanged r1 for abs, for the bytes whose tmp_abs_sel3 bit is '1'

    and_abs_n : AND3 port map (
      A => sabs, B => tmp_abs_sel3(k/8), C => r1(k),
      Y => tmp_out_r1(k));

  end generate output_mux_ands;

  -----------------------------------------------------------------------------
  -- Stage twelve (d=12)
  -- omux part II
  -----------------------------------------------------------------------------

  -- OR plane of the output mux: the eight gated candidates of stage
  -- eleven are merged into result port r3.

  output_mux_ors: for k in 63 downto 0 generate

    gate : OR8 port map (
      A => tmp_out_dir(k),   B => tmp_out_scan(k),
      C => tmp_out_cmp08(k), D => tmp_out_cmp16(k),
      E => tmp_out_cmp32(k), F => tmp_out_cmp64(k),
      G => tmp_out_r1(k),    H => tmp_out_scan_r(k),
      Y => r3(k));

  end generate output_mux_ors;
  
end struct_1;

-- DONE!!












