-- imul64.vhdl - F-CPU 64-Bit SIMD Integer Multiplication Unit
-- Copyright (C) 2000 Michael Riepe <michael@stud.uni-hannover.de>
--
-- This program is free software; you can redistribute it and/or modify
-- it under the terms of the GNU General Public License as published by
-- the Free Software Foundation; either version 2 of the License, or
-- (at your option) any later version.
--
-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with this program; if not, write to the Free Software
-- Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

-- $Id: imul64.vhdl,v 1.26 2000/12/01 16:14:36 michael Exp $

library IEEE;
use IEEE.std_logic_1164.all;

-- IMul64: 64-bit SIMD integer multiplier with an optional add (MAC) input.
--
-- A and B are multiplied as one 64-bit pair or as several narrower pairs,
-- depending on the U08/U16/U32 switches.  The products for every chunk
-- size are delivered on their own pair of output ports (low / high half).
entity IMul64 is
	port (
		-- multiplication operands
		A : in std_ulogic_vector(63 downto 0);
		B : in std_ulogic_vector(63 downto 0);
		-- addend for the MAC modes (all zeroes when left unconnected)
		X : in std_ulogic_vector(63 downto 0) := (others => '0');
		-- '1' enables the sign-correction terms, '0' (default) leaves the
		-- plain partial product sum
		SignedMode : in std_ulogic := '0';
		-- MAC enables: MacLo gates X into product columns 0..63,
		-- MacHi gates X into product columns 64..127
		MacLo : in std_ulogic := '0';
		MacHi : in std_ulogic := '0';
		-- SIMD switches.  Each one enables the partial products that cross
		-- a chunk boundary: U08 between the two bytes of a 16-bit chunk,
		-- U16 between the two halves of a 32-bit chunk, U32 between the
		-- two halves of the 64-bit word.  With all three at '1' (default)
		-- every partial product is enabled.
		U08 : in std_ulogic := '1';
		U16 : in std_ulogic := '1';
		U32 : in std_ulogic := '1';
		-- eight 8x8 products: low bytes in Y08l, high bytes in Y08h
		Y08l : out std_ulogic_vector(63 downto 0);
		Y08h : out std_ulogic_vector(63 downto 0);
		-- four 16x16 products: low halves in Y16l, high halves in Y16h
		Y16l : out std_ulogic_vector(63 downto 0);
		Y16h : out std_ulogic_vector(63 downto 0);
		-- two 32x32 products: low halves in Y32l, high halves in Y32h
		Y32l : out std_ulogic_vector(63 downto 0);
		Y32h : out std_ulogic_vector(63 downto 0);
		-- one 64x64 product: low half in Y64l, high half in Y64h
		Y64l : out std_ulogic_vector(63 downto 0);
		Y64h : out std_ulogic_vector(63 downto 0)
	);
end IMul64;

architecture Struct_1 of IMul64 is
	-- Gate primitives.  Only the interfaces are declared here; the
	-- implementations live outside this file (the names suggest the usual
	-- boolean functions -- confirm against the cell library).
	component AND2
		port (
			A : in std_ulogic;
			B : in std_ulogic;
			Y : out std_ulogic
		);
	end component;
	component AND3
		port (
			A : in std_ulogic;
			B : in std_ulogic;
			C : in std_ulogic;
			Y : out std_ulogic
		);
	end component;
	component OR3
		port (
			A : in std_ulogic;
			B : in std_ulogic;
			C : in std_ulogic;
			Y : out std_ulogic
		);
	end component;
	component XOR2
		port (
			A : in std_ulogic;
			B : in std_ulogic;
			Y : out std_ulogic
		);
	end component;
	component XOR3
		port (
			A : in std_ulogic;
			B : in std_ulogic;
			C : in std_ulogic;
			Y : out std_ulogic
		);
	end component;
	-- used below as the carry output of a full adder (presumably a
	-- 2-out-of-3 majority gate)
	component MAJ23
		port (
			A : in std_ulogic;
			B : in std_ulogic;
			C : in std_ulogic;
			Y : out std_ulogic
		);
	end component;

	-- Line-count reducer: takes ILINES lines and returns OLINES lines,
	-- each WIDTH columns wide, packed into flat vectors.
	component ReduceTree
		generic (
			WIDTH : natural := 128;
			ILINES : natural := 4;
			OLINES : natural := 3
		);
		port (
			A : in std_ulogic_vector(WIDTH*ILINES-1 downto 0);
			Y : out std_ulogic_vector(WIDTH*OLINES-1 downto 0)
		);
	end component;

	-- Final adders.  This architecture only uses the Y output of CIAdd
	-- and leaves C, G and P unconnected.
	component CIAdd
		generic (
			WIDTH : natural := 64
		);
		port (
			A : in std_ulogic_vector(WIDTH-1 downto 0);
			B : in std_ulogic_vector(WIDTH-1 downto 0);
			Y : out std_ulogic_vector(WIDTH-1 downto 0);
			C : out std_ulogic_vector(WIDTH-1 downto 0);
			G : out std_ulogic;
			P : out std_ulogic
		);
	end component;
	component CIAddSmall
		generic (
			WIDTH : natural range 1 to 16 := 8
		);
		port (
			A : in std_ulogic_vector(WIDTH-1 downto 0);
			B : in std_ulogic_vector(WIDTH-1 downto 0);
			Y : out std_ulogic_vector(WIDTH-1 downto 0)
		);
	end component;

	-- number of columns of the product array (twice the operand width)
	constant w : natural := 128;

	-- Bit arrays between the reduction levels.  Every array is stored as
	-- a flat vector of <lines>*w elements; the code below addresses an
	-- element as <lines>*column + line.
	--
	-- raw partial products, 64 lines; elements without a driver keep the
	-- initial value '0'
	signal v0 : std_ulogic_vector(64*w-1 downto 0) := (others => '0');
	-- output of level_1, 32 lines
	signal v1 : std_ulogic_vector(32*w-1 downto 0);
	-- reduced correction / MAC terms from signed_corr, 16 lines
	signal vc : std_ulogic_vector(16*w-1 downto 0);
	-- v1 merged with vc by the irregular 3:2 step, 32 lines
	signal v2 : std_ulogic_vector(32*w-1 downto 0);
	-- output of level_2, 16 lines (source of the 8-bit results)
	signal v3 : std_ulogic_vector(16*w-1 downto 0);
	-- output of level_3, 8 lines (source of the 16-bit results)
	signal v4 : std_ulogic_vector( 8*w-1 downto 0);
	-- output of level_4a, 6 lines
	signal v4a : std_ulogic_vector( 6*w-1 downto 0);
	-- output of level_4b, 4 lines (source of the 32-bit results)
	signal v5 : std_ulogic_vector( 4*w-1 downto 0);
	-- output of level_5, 2 lines (source of the 64-bit result)
	signal v6 : std_ulogic_vector( 2*w-1 downto 0);
begin
	-- "In the beginning, there was..."
	-- Partial product generation (d=1).
	--
	-- Each bit of A is ANDed with each bit of B and with a SIMD enable.
	-- The product A(ia)*B(ib) is stored in column ia+ib, line ia of v0
	-- (element 64*column + line).  Only 64*64 of the 128*64 elements are
	-- driven, i.e. half of the "unrolled" array; the others stay at the
	-- initial value '0' of v0.
	input : block
		type mode_table is array(7 downto 0, 7 downto 0) of std_ulogic;
		signal simd : mode_table;
	begin
		-- simd(p, q) enables the products between byte p of B and byte q
		-- of A.  The table is symmetric: bytes within the same 16-bit
		-- chunk need U08, within the same 32-bit chunk U16, everything
		-- else U32; a byte with itself is always enabled.
		simd <= (
			--7    6    5    4    3    2    1    0
			('1', U08, U16, U16, U32, U32, U32, U32), --7
			(U08, '1', U16, U16, U32, U32, U32, U32), --6
			(U16, U16, '1', U08, U32, U32, U32, U32), --5
			(U16, U16, U08, '1', U32, U32, U32, U32), --4
			(U32, U32, U32, U32, '1', U08, U16, U16), --3
			(U32, U32, U32, U32, U08, '1', U16, U16), --2
			(U32, U32, U32, U32, U16, U16, '1', U08), --1
			(U32, U32, U32, U32, U16, U16, U08, '1')  --0
		);
		outer : for ia in 0 to 63 generate
			inner : for ib in 0 to 63 generate
				mul_1x1 : AND3 port map (
					A => A(ia),
					B => B(ib),
					C => simd(ib/8, ia/8),
					Y => v0(64*(ia+ib)+ia)
				);
			end generate;
		end generate;
	end block;

	-- First 4:2 reduction level: 64 partial product lines -> 32 lines
	-- d=5
	level_1 : ReduceTree
		generic map (
			WIDTH  => w,
			ILINES => 64,
			OLINES => 32
		)
		port map (
			A => v0,
			Y => v1
		);

	-- Signed/unsigned correction terms and MAC input
	-- d=6
	--
	-- This block fills a second bit array `vb' (32 lines per column,
	-- element = 32*column + line) with
	--   * the X operand, gated by MacLo (columns 0..63) and MacHi
	--     (columns 64..127), and
	--   * when SignedMode = '1': for every chunk, the inverted operand
	--     bits gated by the other operand's sign bit, plus carry-in bits,
	--     all placed in the upper half of the chunk's product columns.
	-- The 32 lines are then reduced to the 16 lines of `vc'.  Elements of
	-- vb without a driver keep their initial value '0'.
	--
	-- The wider generate blocks (mul16/mul32/mul64) only supply the low
	-- half of the inverted operand bits; the upper half is supplied by
	-- the narrower blocks belonging to the chunk's top byte, which corr08
	-- keeps enabled in the wider modes.
	signed_corr : block
		component AND4
			port (A, B, C, D : in std_ulogic; Y : out std_ulogic);
		end component;
		component NOT1
			port (A : in std_ulogic; Y : out std_ulogic);
		end component;

		-- inverted operands
		signal an, bn : std_ulogic_vector(63 downto 0);
		-- per-byte enable for the correction vectors (see below)
		signal corr08 : std_ulogic_vector(7 downto 0);
		-- inverted SIMD switches
		signal un08, un16, un32 : std_ulogic;
		-- correction / MAC bit array, 32 lines x w columns
		signal vb : std_ulogic_vector(32*w-1 downto 0) := (others => '0');
	begin
		-- negated size flags
		-- d=1
		un_08 : NOT1 port map (U08, un08);
		un_16 : NOT1 port map (U16, un16);
		un_32 : NOT1 port map (U32, un32);

		-- 8-bit correction gate vector
		-- d=1
		-- corr08(k) enables the correction vectors that use the sign bit
		-- of byte k: byte 7 always, byte 3 while U32 = '0', bytes 5 and 1
		-- while U16 = '0', bytes 6, 4, 2 and 0 while U08 = '0'.
		corr08 <= (7 => '1', 3 => un32, 5|1 => un16, others => un08);

		-- both operands, inverted (for subtraction)
		-- d=1
		invert : for i in 63 downto 0 generate
			inv_a : NOT1 port map (A(i), an(i));
			inv_b : NOT1 port map (B(i), bn(i));
		end generate;

		-- gated X input
		-- d=1
		-- X(n) goes to column n (MacLo) and to column n+64 (MacHi); the
		-- line number 4*i+3 (resp. 4*i+19) is not used by any of the
		-- correction vectors below.
		input_x : for i in 0 to 3 generate
			bits : for j in 0 to 15 generate
				x_lo : AND2 port map (
					MacLo, X(16*i+j), vb(32*(16*i+j+ 0)+4*i+ 0+3)
				);
				x_hi : AND2 port map (
					MacHi, X(16*i+j), vb(32*(16*i+j+64)+4*i+16+3)
				);
			end generate;
		end generate;

		-- 8-bit correction vectors
		-- d=2
		-- ci_1/ci_2 put a '1' into the first (exactly one sign bit set)
		-- or second (both set) correction column, i.e. they add the
		-- number of inverted operands -- presumably the "+1" that
		-- completes each two's complement negation.  They are only
		-- active in 8-bit mode (un08).
		mul08 : for i in 0 to 7 generate
			bl : block
				signal g, t1, t2 : std_ulogic;
			begin
				gate : g <= SignedMode;
				tmp1 : XOR2 port map (A(8*i+7), B(8*i+7), t1);
				tmp2 : AND2 port map (A(8*i+7), B(8*i+7), t2);
				ci_1 : AND3 port map (
					t1, g, un08, vb(32*(16*i+0+8)+4*i+0)
				);
				ci_2 : AND3 port map (
					t2, g, un08, vb(32*(16*i+1+8)+4*i+0)
				);
				bits : for j in 0 to 7 generate
					corr_a : AND4 port map (
						B(8*i+7), g, corr08(i), an(8*i+j),
						vb(32*(16*i+j+8)+4*i+1)
					);
					corr_b : AND4 port map (
						A(8*i+7), g, corr08(i), bn(8*i+j),
						vb(32*(16*i+j+8)+4*i+2)
					);
				end generate;
			end block;
		end generate;

		-- 16-bit correction vectors
		-- d=2
		-- Enabled only with U08 = '1'; carry-ins additionally need
		-- U16 = '0' (exactly 16-bit mode).  Covers operand bits 0..7 of
		-- each 16-bit chunk.
		mul16 : for i in 0 to 3 generate
			bl : block
				signal g, t1, t2 : std_ulogic;
			begin
				gate : AND2 port map (SignedMode, U08, g);
				tmp1 : XOR2 port map (A(16*i+15), B(16*i+15), t1);
				tmp2 : AND2 port map (A(16*i+15), B(16*i+15), t2);
				ci_1 : AND3 port map (
					t1, g, un16, vb(32*(32*i+0+16)+8*i+4)
				);
				ci_2 : AND3 port map (
					t2, g, un16, vb(32*(32*i+1+16)+8*i+4)
				);
				bits : for j in 0 to 7 generate
					corr_a : AND4 port map (
						B(16*i+15), g, corr08(2*i+1), an(16*i+j),
						vb(32*(32*i+j+16)+8*i+5)
					);
					corr_b : AND4 port map (
						A(16*i+15), g, corr08(2*i+1), bn(16*i+j),
						vb(32*(32*i+j+16)+8*i+6)
					);
				end generate;
			end block;
		end generate;

		-- 32-bit correction vectors
		-- d=2
		-- Enabled only with U16 = '1'; carry-ins additionally need
		-- U32 = '0'.  Covers operand bits 0..15 of each 32-bit chunk.
		mul32 : for i in 0 to 1 generate
			bl : block
				signal g, t1, t2 : std_ulogic;
			begin
				gate : AND2 port map (SignedMode, U16, g);
				tmp1 : XOR2 port map (A(32*i+31), B(32*i+31), t1);
				tmp2 : AND2 port map (A(32*i+31), B(32*i+31), t2);
				ci_1 : AND3 port map (
					t1, g, un32, vb(32*(64*i+0+32)+16*i+12)
				);
				ci_2 : AND3 port map (
					t2, g, un32, vb(32*(64*i+1+32)+16*i+12)
				);
				bits : for j in 0 to 15 generate
					corr_a : AND4 port map (
						B(32*i+31), g, corr08(4*i+3), an(32*i+j),
						vb(32*(64*i+j+32)+16*i+13)
					);
					corr_b : AND4 port map (
						A(32*i+31), g, corr08(4*i+3), bn(32*i+j),
						vb(32*(64*i+j+32)+16*i+14)
					);
				end generate;
			end block;
		end generate;

		-- 64-bit correction vectors
		-- d=2
		-- Enabled only with U32 = '1'.  Covers operand bits 0..31.  The
		-- single-iteration generate keeps the structure parallel to the
		-- narrower blocks.
		mul64 : for i in 0 to 0 generate
			bl : block
				signal g, t1, t2 : std_ulogic;
			begin
				gate : AND2 port map (SignedMode, U32, g);
				tmp1 : XOR2 port map (A(64*i+63), B(64*i+63), t1);
				tmp2 : AND2 port map (A(64*i+63), B(64*i+63), t2);
				ci_1 : AND2 port map (
					t1, g, vb(32*(128*i+0+64)+32*i+28)
				);
				ci_2 : AND2 port map (
					t2, g, vb(32*(128*i+1+64)+32*i+28)
				);
				bits : for j in 0 to 31 generate
					corr_a : AND4 port map (
						B(64*i+63), g, corr08(8*i+7), an(64*i+j),
						vb(32*(128*i+j+64)+32*i+29)
					);
					corr_b : AND4 port map (
						A(64*i+63), g, corr08(8*i+7), bn(64*i+j),
						vb(32*(128*i+j+64)+32*i+30)
					);
				end generate;
			end block;
		end generate;

		-- 4:2 reducer
		-- d=6
		-- WIDTH uses the architecture-wide column count `w' (the same
		-- constant that sizes vb and vc) instead of a literal 128, in
		-- line with the other ReduceTree instances.
		reduce : ReduceTree
			generic map (WIDTH => w, ILINES => 32, OLINES => 16)
			port map (A => vb, Y => vc);
	end block;

	-- PIPELINE REGISTER: d=6(6)

	-- Irregular 3:2 reduction step that merges v1 with vc
	-- d=8
	--
	-- Group `grp' takes four lines of v1 and two lines of vc from the
	-- same column and passes them through two full adders.  The sums
	-- stay in the column (lines +2 and +3 of the group in v2), the
	-- carries move to lines +0 and +1 of the same group in the next
	-- column (element offset +32).
	reduce : for grp in 0 to 8*w-1 generate
		-- column 0 has no incoming carries: tie its carry lines to '0'
		init : if grp < 8 generate
			v2(4*grp+0) <= '0';
			v2(4*grp+1) <= '0';
		end generate;
		x1 : XOR3 port map (
			A => v1(4*grp+0),
			B => v1(4*grp+1),
			C => v1(4*grp+2),
			Y => v2(4*grp+2)
		);
		x2 : XOR3 port map (
			A => v1(4*grp+3),
			B => vc(2*grp+0),
			C => vc(2*grp+1),
			Y => v2(4*grp+3)
		);
		-- no carries are generated out of the last column
		carry : if grp < 8*(w-1) generate
			m1 : MAJ23 port map (
				A => v1(4*grp+0),
				B => v1(4*grp+1),
				C => v1(4*grp+2),
				Y => v2(4*grp+32)
			);
			m2 : MAJ23 port map (
				A => v1(4*grp+3),
				B => vc(2*grp+0),
				C => vc(2*grp+1),
				Y => v2(4*grp+33)
			);
		end generate;
	end generate;

	-- Second 4:2 reduction level: 32 lines -> 16 lines
	-- d=12
	level_2 : ReduceTree
		generic map (
			WIDTH  => w,
			ILINES => 32,
			OLINES => 16
		)
		port map (
			A => v2,
			Y => v3
		);

	-- PIPELINE REGISTER: d=12(12)

	-- Third 4:2 reduction level: 16 lines -> 8 lines
	-- d=16
	level_3 : ReduceTree
		generic map (
			WIDTH  => w,
			ILINES => 16,
			OLINES => 8
		)
		port map (
			A => v3,
			Y => v4
		);

	-- First half of a split 4:2 level (4:3 step): 8 lines -> 6 lines
	-- d=18
	level_4a : ReduceTree
		generic map (
			WIDTH  => w,
			ILINES => 8,
			OLINES => 6
		)
		port map (
			A => v4,
			Y => v4a
		);

	-- PIPELINE REGISTER: d=18(18)

	-- Second half of the split 4:2 level (3:2 step): 6 lines -> 4 lines
	-- d=20
	level_4b : ReduceTree
		generic map (
			WIDTH  => w,
			ILINES => 6,
			OLINES => 4
		)
		port map (
			A => v4a,
			Y => v5
		);

	-- Last 4:2 reduction level: 4 lines -> 2 lines
	-- d=24
	level_5 : ReduceTree
		generic map (
			WIDTH  => w,
			ILINES => 4,
			OLINES => 2
		)
		port map (
			A => v5,
			Y => v6
		);

	-- PIPELINE REGISTER: d=24(24)

	-- 8-bit results (located in stage 3)
	-- d=12+6=18(18)
	--
	-- Chunk n owns columns 16*n .. 16*n+15 and lines 2*n, 2*n+1 of v3
	-- (16 lines per column); the two lines are summed by a 16-bit adder.
	res_08 : for n in 0 to 7 generate
		bl : block
			-- the two remaining lines and their sum
			signal lin0, lin1, res : std_ulogic_vector(15 downto 0);
		begin
			inputs : for k in 0 to 15 generate
				lin0(k) <= v3(256*n+16*k+2*n+0);
				lin1(k) <= v3(256*n+16*k+2*n+1);
			end generate;

			-- d=+6
			adder : CIAddSmall
				generic map (
					WIDTH => 16
				)
				port map (
					A => lin0,
					B => lin1,
					Y => res
				);

			-- low byte of the product to Y08l, high byte to Y08h
			res_lo : Y08l(8*n+7 downto 8*n) <= res( 7 downto 0);
			res_hi : Y08h(8*n+7 downto 8*n) <= res(15 downto 8);
		end block;
	end generate;

	-- 16-bit results (located in stage 3-4)
	-- d=16+8=24(24)
	--
	-- Chunk n owns columns 32*n .. 32*n+31 and lines 2*n, 2*n+1 of v4
	-- (8 lines per column).
	res_16 : for n in 0 to 3 generate
		--
		-- The 32-bit adder is assembled by hand: CIAdd is too slow here,
		-- it cannot be split at d=2 and d=8, and it should not occupy
		-- three pipeline stages.
		--
		bl : block
			component CIA_Stage
				generic (
					WIDTH : natural := 64;
					STEP : natural := 4
				);
				port (
					Gi : in std_ulogic_vector((WIDTH-1)/STEP downto 0);
					Pi : in std_ulogic_vector((WIDTH-1)/STEP downto 0);
					Yi : in std_ulogic_vector(WIDTH-1 downto 0);
					Ci : in std_ulogic_vector(WIDTH-1 downto 0);
					Go : out std_ulogic_vector((WIDTH-1)/STEP/4 downto 0);
					Po : out std_ulogic_vector((WIDTH-1)/STEP/4 downto 0);
					Yo : out std_ulogic_vector(WIDTH-1 downto 0);
					Co : out std_ulogic_vector(WIDTH-1 downto 0)
				);
			end component;

			-- per-bit propagate / generate
			signal P0, G0 : std_ulogic_vector(31 downto 0);
			-- 2-bit group level: partial sums, carry vector, group P/G
			signal Y1, C1 : std_ulogic_vector(31 downto 0);
			signal P1, G1 : std_ulogic_vector(15 downto 0);
			-- after the first CIA_Stage
			signal Y2, C2 : std_ulogic_vector(31 downto 0);
			signal P2, G2 : std_ulogic_vector( 3 downto 0);

			-- the two remaining lines and the final sum
			signal lin0, lin1, res : std_ulogic_vector(31 downto 0);
		begin
			-- d=16
			inputs : for k in 0 to 31 generate
				lin0(k) <= v4(256*n+8*k+2*n+0);
				lin1(k) <= v4(256*n+8*k+2*n+1);
			end generate;

			-- d=17
			half_adders : for k in 0 to 31 generate
				sum : XOR2 port map (
					A => lin0(k),
					B => lin1(k),
					Y => P0(k)
				);
				carry : AND2 port map (
					A => lin0(k),
					B => lin1(k),
					Y => G0(k)
				);
			end generate;

			-- 2-bit partial results, optimized for low delay
			-- d=18
			-- The group generate is built directly from the adder
			-- inputs (ga, gb) rather than from P0/G0.  C1 is handed to
			-- CIA_Stage as its Ci input; its exact meaning is defined
			-- by CIA_Stage, which is not part of this file.
			two_bit : for k in 0 to 15 generate
				bl : block
					signal ga, gb : std_ulogic;
				begin
					y_0 : Y1(2*k+0) <= P0(2*k+0);
					y_1 : XOR2 port map (
						A => P0(2*k+1),
						B => G0(2*k+0),
						Y => Y1(2*k+1)
					);
					c_0 : C1(2*k+0) <= '1';
					c_1 : C1(2*k+1) <= P0(2*k+0);
					p_o : AND2 port map (
						A => P0(2*k+1),
						B => P0(2*k+0),
						Y => P1(k)
					);
					t_1 : AND3 port map (
						A => lin0(2*k+1),
						B => lin0(2*k+0),
						C => lin1(2*k+0),
						Y => ga
					);
					t_2 : AND3 port map (
						A => lin1(2*k+1),
						B => lin0(2*k+0),
						C => lin1(2*k+0),
						Y => gb
					);
					g_o : OR3 port map (
						A => G0(2*k+1),
						B => ga,
						C => gb,
						Y => G1(k)
					);
				end block;
			end generate;

			-- PIPELINE REGISTER: d=18(18)

			-- d=22 (Go/Po: d=20)
			level_1 : CIA_Stage
				generic map (
					WIDTH => 32,
					STEP  => 2
				)
				port map (
					Gi => G1,
					Pi => P1,
					Yi => Y1,
					Ci => C1,
					Go => G2,
					Po => P2,
					Yo => Y2,
					Co => C2
				);

			-- d=24
			-- last stage: only the sum output is used
			level_2 : CIA_Stage
				generic map (
					WIDTH => 32,
					STEP  => 8
				)
				port map (
					Gi => G2,
					Pi => P2,
					Yi => Y2,
					Ci => C2,
					Go => open,
					Po => open,
					Yo => res,
					Co => open
				);

			-- low half of the product to Y16l, high half to Y16h
			res_lo : Y16l(16*n+15 downto 16*n) <= res(15 downto  0);
			res_hi : Y16h(16*n+15 downto 16*n) <= res(31 downto 16);
		end block;
	end generate;

	-- 32-bit results (located in stage 4-5; TODO: NEEDS SPLIT!)
	-- d=20+9=29(30)
	--
	-- Chunk n owns columns 64*n .. 64*n+63 and lines 2*n, 2*n+1 of v5
	-- (4 lines per column).
	res_32 : for n in 0 to 1 generate
		bl : block
			-- the two remaining lines and their sum
			signal lin0, lin1, res : std_ulogic_vector(63 downto 0);
		begin
			inputs : for k in 0 to 63 generate
				lin0(k) <= v5(256*n+4*k+2*n+0);
				lin1(k) <= v5(256*n+4*k+2*n+1);
			end generate;

			-- d=+9
			-- TODO: SPLIT AT d=+3
			-- (the C, G and P outputs of CIAdd stay unconnected)
			adder : CIAdd
				generic map (
					WIDTH => 64
				)
				port map (
					A => lin0,
					B => lin1,
					Y => res
				);

			-- low half of the product to Y32l, high half to Y32h
			res_lo : Y32l(32*n+31 downto 32*n) <= res(31 downto  0);
			res_hi : Y32h(32*n+31 downto 32*n) <= res(63 downto 32);
		end block;
	end generate;

	-- 64-bit results (located in stage 5-6; TODO: NEEDS SPLIT!)
	-- d=24+11=35(36)
	--
	-- The single chunk spans all 128 columns of v6 (2 lines per column).
	-- The one-iteration generate mirrors the structure of the narrower
	-- result blocks.
	res_64 : for n in 0 to 0 generate
		bl : block
			-- the two remaining lines and their sum
			signal lin0, lin1, res : std_ulogic_vector(127 downto 0);
		begin
			inputs : for k in 0 to 127 generate
				lin0(k) <= v6(256*n+2*k+2*n+0);
				lin1(k) <= v6(256*n+2*k+2*n+1);
			end generate;

			-- d=+11
			-- TODO: SPLIT AT d=+5
			-- (the C, G and P outputs of CIAdd stay unconnected)
			adder : CIAdd
				generic map (
					WIDTH => 128
				)
				port map (
					A => lin0,
					B => lin1,
					Y => res
				);

			-- low half of the product to Y64l, high half to Y64h
			res_lo : Y64l(64*n+63 downto 64*n) <= res( 63 downto  0);
			res_hi : Y64h(64*n+63 downto 64*n) <= res(127 downto 64);
		end block;
	end generate;
end Struct_1;

-- vi: set ts=4 sw=4 equalprg="fmt -72 -p--": please
