-- iadd.vhdl -- F-CPU 64-bit Add/Subtract Unit
-- Copyright (C) 2000 Michael Riepe <michael@s...>
--
-- This program is free software; you can redistribute it and/or modify
-- it under the terms of the GNU General Public License as published by
-- the Free Software Foundation; either version 2 of the License, or
-- (at your option) any later version.
--
-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with this program; if not, write to the Free Software
-- Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

-- $Id: iadd.vhdl,v 1.8 2000/11/03 20:28:20 michael Exp $

library IEEE;
use IEEE.std_logic_1164.all;

entity IAdd is
        generic (
                -- operand/result width in bits
                WIDTH : natural := 64   -- do not change!
        );
        port (
                -- first operand (inverted internally when Sub = '1')
                A : in std_ulogic_vector(WIDTH-1 downto 0) := (others => '0');
                -- second operand
                B : in std_ulogic_vector(WIDTH-1 downto 0) := (others => '0');
                -- subtract mode enable
                Sub : in std_ulogic := '0';
                -- saturate/floor mode enable
                Sat : in std_ulogic := '0';
                -- SIMD mode switches (see `SIMD Modes' for the encoding)
                U08 : in std_ulogic := '0';
                U16 : in std_ulogic := '0';
                U32 : in std_ulogic := '0';
                -- low output: sum/difference
                Yl : out std_ulogic_vector(WIDTH-1 downto 0);
                -- high output: carry/borrow
                Yh : out std_ulogic_vector(WIDTH-1 downto 0)
        );
end entity IAdd;

-- Known limitations:
--      1: Not fully tested.
--      2: 8-bit SIMD mode doesn't work yet.
--      3: There was no space (or rather, delay time) to include the avg
--      and diff instructions.  They will probably need an additional
--      output port, unless the MUX components are fast enough.  Different
--      avg/diff rounding modes won't work either, but `truncate' should
--      be sufficient anyway.
--      4: subb mode differs from F-CPU manual.  IMHO the manual should
--      be changed :)  See the rationale in the code below.

-- Operating Modes:
--      Sub = '0', Sat = '0': add operation
--      Sub = '0', Sat = '1': add operation with unsigned saturation (ceiling)
--      Sub = '1', Sat = '0': sub operation
--      Sub = '1', Sat = '1': sub operation with unsigned saturation (floor)
--      carry/borrow is always available on the second output port (Yh);
--      that means all operating modes from the manual are supported:
--      add, addc, adds, sub, subb and subf.

-- SIMD Modes:
--      U08 = '0', U16 = '0', U32 = '0': 8-bit mode (does not work yet)
--      U08 = '1', U16 = '0', U32 = '0': 16-bit mode
--      U08 = '1', U16 = '1', U32 = '0': 32-bit mode
--      U08 = '1', U16 = '1', U32 = '1': 64-bit mode
--      (other combinations are invalid)
--      Note: I intend to use this encoding scheme everywhere; it seems
--      to be the most appropriate one.

-- Modus Operandi:
--      The IAdd unit is a carry-select adder with some special features.
--      The first part calculates 4-bit results both with and without carry.
--      These 4-bit results are put together to form 16-bit results (again,
--      both with and without carry) in the first carry-select stage.  The
--      second (final) carry-select stage creates larger results, and also
--      does the SIMD selection for results >= 16 bits.  8-bit SIMD is not
--      supported yet due to timing problems -- I guess I have to tap
--      the 4-bit results and handle 8-bit mode separately.  Ideas welcome.
--
--      Subtraction is implemented as `not ((not A) + B)' rather than the
--      usual `A + (not B) + 1' because that makes the saturation modes
--      easier -- and we don't need a carry input either, which simplifies
--      the SIMD stuff in the final carry-select stage.  Expressed as a
--      simple equation, this unit calculates:
--
--              Yl := (((A xor Sub) + B) or (Sat and Carry)) xor Sub
--              Yh := 1 when Carry is set, 0 otherwise
--
--      where `Carry' is the appropriate carry output from the adder;
--      any other signals can be found in the entity declaration.

-- Implementation:
--      The whole unit consists of ordinary and/or gates with up to 4 inputs,
--      2-input xor gates, inverters and 2:1 muxes.  This also applies to the
--      HA (half adder), MULTI_CLA and SIMD_CLA (carry look-ahead) subunits.
--      In timing calculations, all gates are assumed to have a delay of 1,
--      muxes count as 2 delays (in FPGAs, they will probably occupy a
--      single cell; in standard CMOS, they can be made faster, too).
--      Some basic elements may be optimized further if the target supports
--      arbitrary functions of 3 or 4 inputs, e.g. the half adders can
--      be combined with the input inverters.  With the conservative
--      assumptions above, the unit has a delay of 12, and it can be split
--      in the middle (at d=6) to form two pipeline stages.  *schwitz* :)

-- Structural implementation of IAdd.
-- Everything is built from the primitive components declared below
-- (gates, 2:1 mux, half adder and the two carry look-ahead units);
-- their entities are not part of this file and must be supplied
-- separately.  The "d=N" remarks give the accumulated delay under the
-- assumptions stated next to each component declaration.
architecture Struct_1 of IAdd is
        component AND2 is -- assume d=1
                port (A, B : in std_ulogic; Y : out std_ulogic);
        end component;

        component AND3 is -- assume d=1
                port (A, B, C : in std_ulogic; Y : out std_ulogic);
        end component;

        component AND4 is -- assume d=1
                port (A, B, C, D : in std_ulogic; Y : out std_ulogic);
        end component;

        component XOR2 is -- assume d=1
                port (A, B : in std_ulogic; Y : out std_ulogic);
        end component;

        component OR2 is -- assume d=1
                port (A, B : in std_ulogic; Y : out std_ulogic);
        end component;

        component OR3 is -- assume d=1
                port (A, B, C : in std_ulogic; Y : out std_ulogic);
        end component;

        component OR4 is -- assume d=1
                port (A, B, C, D : in std_ulogic; Y : out std_ulogic);
        end component;

        component NOT1 is -- assume d=1
                port (A : in std_ulogic; Y : out std_ulogic);
        end component;

        component MUX2 is -- assume d=2
                port (A0, A1, Sel : in std_ulogic; Y : out std_ulogic);
        end component;

        component HA is -- assume d=1
                port (A, B : in std_ulogic; Sum, Carry : out std_ulogic);
        end component;

        component MULTI_CLA is -- assume d=2
                port (
                        Gi, Pi : in std_ulogic_vector(3 downto 0) := (others => '0');
                        CoNC : out std_ulogic_vector(3 downto 0);
                        CoCY : out std_ulogic_vector(3 downto 0);
                        Go, Po : out std_ulogic
                );
        end component;

        component SIMD_CLA is -- assume d=2
                port (
                        Gi, Pi : in std_ulogic_vector(3 downto 0) := (others => '0');
                        U1, U2 : in std_ulogic := '0';
                        Co : out std_ulogic_vector(3 downto 0)
                );
        end component;

        -- per-bit generate/propagate and the carries inside each 4-bit group
        -- (_nc = no carry into the group, _cy = carry into the group)
        signal G0, P0, C1_nc, C1_cy : std_ulogic_vector(WIDTH-1 downto 0);
        -- per-4-bit-group generate/propagate and the group carries inside
        -- each 16-bit chunk
        signal G1, P1, C2_nc, C2_cy : std_ulogic_vector(WIDTH/4-1 downto 0);
        -- per-16-bit-chunk generate/propagate and the SIMD-aware chunk carries
        signal G2, P2, C3 : std_ulogic_vector(WIDTH/16-1 downto 0);
        -- 4-bit partial sums, without/with carry-in
        signal Y4_nc, Y4_cy : std_ulogic_vector(WIDTH-1 downto 0);
        -- 16-bit partial sums, without/with carry-in
        signal Y16_nc, Y16_cy : std_ulogic_vector(WIDTH-1 downto 0);
        -- saturation flag, one per byte of the result
        signal S : std_ulogic_vector(WIDTH/8-1 downto 0) := (others => '0');
        -- inverted SIMD mode switches
        signal un08, un16, un32 : std_ulogic;
begin
        -- input stage
        -- d=2
        input : for i in 0 to WIDTH-1 generate
                -- Behaviour:
                -- Sum(i) <= (A(i) xor Sub) xor B(i);
                -- Carry(i) <= (A(i) xor Sub) and B(i);
                bl : block
                        -- local signals
                        signal An : std_ulogic;
                begin
                        -- invert operand for subtraction
                        inv_a : XOR2
                                port map (A => A(i), B => Sub, Y => An);

                        -- a row of half adders
                        add : HA
                                port map (A => An, B => B(i), Sum => P0(i), Carry => G0(i));
                end block;
        end generate;

        -- first-level CLA, w/ and w/o carry-in
        -- d=4
        cla1 : for i in 0 to WIDTH/4-1 generate
                -- Behaviour: see multicla.vhdl
                multi : MULTI_CLA
                        port map (
                                Gi => G0(4*i+3 downto 4*i),
                                Pi => P0(4*i+3 downto 4*i),
                                CoNC => C1_nc(4*i+3 downto 4*i),
                                CoCY => C1_cy(4*i+3 downto 4*i),
                                Go => G1(i),
                                Po => P1(i)
                        );
        end generate;

        -- precomputed 4-bit partial results
        -- d=5
        res4 : for i in 0 to WIDTH-1 generate
                -- Behaviour:
                -- Y4_nc(i) <= P0(i) xor C1_nc(i);
                -- Y4_cy(i) <= P0(i) xor C1_cy(i);
                nc : XOR2
                        port map (A => P0(i), B => C1_nc(i), Y => Y4_nc(i));
                cy : XOR2
                        port map (A => P0(i), B => C1_cy(i), Y => Y4_cy(i));
        end generate;

        -- second-level CLA, w/ and w/o carry
        -- d=6
        cla2 : for i in 0 to WIDTH/16-1 generate
                -- Behaviour: see multicla.vhdl
                multi : MULTI_CLA
                        port map (
                                Gi => G1(4*i+3 downto 4*i),
                                Pi => P1(4*i+3 downto 4*i),
                                CoNC => C2_nc(4*i+3 downto 4*i),
                                CoCY => C2_cy(4*i+3 downto 4*i),
                                Go => G2(i),
                                Po => P2(i)
                        );
        end generate;

        -- TODO: ADD PIPELINE REGISTER HERE !!!

        -- 16-bit partial result muxes
        -- d=8
        res16 : for i in 0 to WIDTH-1 generate
                -- Behaviour:
                --      Y16_nc(i) <= Y4_cy(i) when C2_nc(i/4) = '1' else Y4_nc(i);
                --      Y16_cy(i) <= Y4_cy(i) when C2_cy(i/4) = '1' else Y4_nc(i);
                nc : MUX2
                        port map (
                                A0 => Y4_nc(i),
                                A1 => Y4_cy(i),
                                Sel => C2_nc(i/4),
                                Y => Y16_nc(i)
                        );
                cy : MUX2
                        port map (
                                A0 => Y4_nc(i),
                                A1 => Y4_cy(i),
                                Sel => C2_cy(i/4),
                                Y => Y16_cy(i)
                        );
        end generate;

        -- third-level CLA, w/ variable carry
        -- d=8
        cla3 : for i in 0 to WIDTH/64-1 generate
                -- Similar to the `normal' CLA entity, but this
                -- one is influenced by the SIMD mode lines.
                -- Behavior:
                --      Co <= (
                --              0 => '0',
                --              1 => (U1 and Gi(0)),
                --              2 => (U2 and Gi(1))
                --                or (U2 and Pi(1) and Gi(0)),
                --              3 => (U1 and Gi(2))
                --                or (U2 and Pi(2) and Gi(1))
                --                or (U2 and Pi(2) and Pi(1) and Gi(0))
                --      );
                -- This used to be a block statement, but Symphony EDA
                -- doesn't support blocks with ports in them, yet :(
                -- Therefore it's now a separate entity.
                nc : SIMD_CLA
                        port map (
                                Gi => G2(4*i+3 downto 4*i),
                                Pi => P2(4*i+3 downto 4*i),
                                Co => C3(4*i+3 downto 4*i),
                                U1 => U16,
                                U2 => U32
                        );
        end generate;

        -- saturate/carry logic
        -- d=10
        sat_and_carry : block
                -- Carry-out vectors, one element per byte, for each SIMD width.
                -- Note!!! some signals are not driven explicitly!
                -- Only C08 is driven in every element; C16, C32 and C64 are
                -- driven at every 2nd, 4th and 8th index respectively, and the
                -- remaining elements keep the initial value '0' given here.
                signal C08, C16, C32, C64 : std_ulogic_vector(WIDTH/8-1 downto 0)
                        := (others => '0');
        begin
                -- mode switches inverted
                -- (only used here, but can be moved anywhere)
                modeinv : block
                begin
                        u08_n : NOT1 port map (U08, un08);
                        u16_n : NOT1 port map (U16, un16);
                        u32_n : NOT1 port map (U32, un32);
                end block;

                -- carry output vectors for each width
                -- d=8
                carry08 : for i in WIDTH/8-1 downto 0 generate
                        -- 8-bit carry out
                        --      C08(i) <= G1(2*i+1) or (P1(2*i+1) and G1(2*i));
                        bl : block
                                signal t : std_ulogic;
                        begin
                                tmp : AND2 port map (P1(2*i+1), G1(2*i), t);
                                co  : OR2  port map (G1(2*i+1), t, C08(i));
                        end block;
                end generate;

                carry16 : for i in WIDTH/16-1 downto 0 generate
                        -- 16-bit carry out
                        C16(2*i) <= G2(i);
                end generate;

                carry32 : for i in WIDTH/32-1 downto 0 generate
                        -- 32-bit carry out
                        --      C32(4*i) <= G2(2*i+1) or (P2(2*i+1) and G2(2*i));
                        bl : block
                                signal t : std_ulogic;
                        begin
                                tmp : AND2 port map (P2(2*i+1), G2(2*i), t);
                                co  : OR2  port map (G2(2*i+1), t, C32(4*i));
                        end block;
                end generate;

                carry64 : for i in WIDTH/64-1 downto 0 generate
                        -- 64-bit carry out
                        --      C64(8*i) <= (G2(4*i+3))
                        --                       or (P2(4*i+3) and G2(4*i+2))
                        --                       or (P2(4*i+3) and P2(4*i+2) and G2(4*i+1))
                        --                       or (P2(4*i+3) and P2(4*i+2) and P2(4*i+1) and G2(4*i+0));
                        bl : block
                                signal t1, t2, t3 : std_ulogic;
                        begin
                                tmp1 : AND2 port map (P2(4*i+3), G2(4*i+2), t1);
                                tmp2 : AND3 port map (P2(4*i+3), P2(4*i+2), G2(4*i+1), t2);
                                tmp3 : AND4 port map (P2(4*i+3), P2(4*i+2), P2(4*i+1), G2(4*i+0), t3);
                                co   : OR4  port map (G2(4*i+3), t1, t2, t3, C64(8*i));
                        end block;
                end generate;

                -- saturate vector (taken from carry outputs)
                -- d=10
                saturate : for i in WIDTH/8-1 downto 0 generate
                        bl : block
                                -- carry out of the 8/16/32/64-bit chunk
                                -- that contains byte i
                                alias S08 : std_ulogic is C08(i);
                                alias S16 : std_ulogic is C16(2*(i/2));
                                alias S32 : std_ulogic is C32(4*(i/4));
                                alias S64 : std_ulogic is C64(8*(i/8));

                                signal t1, t2, t3, t4 : std_ulogic;
                        begin
                                -- Behaviour:
                                --      S(i) <= (Sat and S08         and not U08)
                                --               or (Sat and S16 and U08 and not U16)
                                --               or (Sat and S32 and U16 and not U32)
                                --               or (Sat and S64 and U32);
                                tmp1  : AND3 port map (Sat, S08, un08, t1);
                                tmp2  : AND4 port map (Sat, S16, U08, un16, t2);
                                tmp3  : AND4 port map (Sat, S32, U16, un32, t3);
                                tmp4  : AND3 port map (Sat, S64, U32, t4);
                                -- The OR gate must not be labelled `sat':
                                -- VHDL is case-insensitive, and a label declared
                                -- in this block would hide the entity port Sat
                                -- that the AND gates above are connected to.
                                sat_o : OR4  port map (t1, t2, t3, t4, S(i));
                        end block;
                end generate;

                -- high output vector
                -- d=10
                high_out : for i in WIDTH/8-1 downto 0 generate
                        bl : block
                                signal t1, t2, t3, t4, t5 : std_ulogic;
                        begin
                                -- Behaviour:
                                --      Yh(8*i) <= (C08(i)         and not U08)
                                --                      or (C16(i) and U08 and not U16)
                                --                      or (C32(i) and U16 and not U32)
                                --                      or (C64(i) and U32);
                                tmp1 : AND2 port map (C08(i), un08, t1);
                                tmp2 : AND3 port map (C16(i), U08, un16, t2);
                                tmp3 : AND3 port map (C32(i), U16, un32, t3);
                                tmp4 : AND2 port map (C64(i), U32, t4);
                                tmp5 : OR4  port map (t1, t2, t3, t4, t5);
                                --
                                -- Note that this differs from the F-CPU
                                -- manual, Rev.0.2.  In the manual, the
                                -- `subb' borrow output is set to all 1's
                                -- (numeric value -1) while this unit
                                -- sets it to the numeric value 1.
                                -- This is much easier to do in the
                                -- presence of SIMD, and it's also
                                -- more logical: `borrow -1' actually
                                -- means `add 1', which is wrong.
                                --
                                -- bit 0 of each byte carries the flag,
                                -- the other seven bits are tied to '0'
                                Yh(8*i) <= t5;
                                Yh(8*i+7 downto 8*i+1) <= (others => '0');
                        end block;
                end generate;
        end block;

        -- output stage
        -- d=12
        output : for i in 0 to WIDTH-1 generate
                bl : block
                        -- local signals
                        signal Y64, Y64_sat : std_ulogic;
                begin
                        -- 64-bit result mux
                        -- d=10
                        mux : MUX2
                                port map (
                                        A0 => Y16_nc(i),
                                        A1 => Y16_cy(i),
                                        Sel => C3(i/16),
                                        Y => Y64
                                );

                        -- handle saturate/floor mode
                        -- d=11
                        sat_or : OR2
                                port map (A => Y64, B => S(i/8), Y => Y64_sat);

                        -- invert output for subtraction
                        -- d=12
                        inv_y : XOR2
                                port map (A => Y64_sat, B => Sub, Y => Yl(i));
                end block;
        end generate;
end Struct_1;

-- vi: set ts=4 sw=4 : please
