-- imul64.vhdl - F-CPU 64-Bit SIMD Integer Multiplication Unit
-- Copyright (C) 2000 - 2002 Michael Riepe <michael@stud.uni-hannover.de>
--
-- This program is free software; you can redistribute it and/or modify
-- it under the terms of the GNU General Public License as published by
-- the Free Software Foundation; either version 2 of the License, or
-- (at your option) any later version.
--
-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with this program; if not, write to the Free Software
-- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

-- $Id: imul64.vhdl,v 1.51 2002/06/12 20:42:02 michael Exp $

library IEEE;
use IEEE.std_logic_1164.all;
use work.Generic_Adder.all;
use work.Bit_Manipulation.all;
use work.Misc.all;

-- 64-bit SIMD integer multiplier with an optional accumulate input.
--
-- A and B are multiplied lane by lane, the lane width being selected by U;
-- X can be added into the products as selected by Mac.  Each lane width
-- has its own pair of result ports carrying the low halves (Y..l) and the
-- high halves (Y..h) of all products of that width.
--
-- The simulation-only assertions at the end of the entity restrict the
-- generics: PIPE_AFTER must be 0 (no pipeline registers) or 6, and
-- PIPE_DELAY must be 0.
entity IMul64 is
	generic (
		PIPE_AFTER : natural := 0;	-- gates per stage; 0 means no pipelining
		PIPE_DELAY : natural := 0	-- additional delay before 1st stage
	);
	port (
		-- inputs (multiplicand and multiplier)
		A : in std_ulogic_vector(63 downto 0);
		B : in std_ulogic_vector(63 downto 0);
		-- optional add input (only used when a Mac bit is set)
		X : in std_ulogic_vector(63 downto 0);
		-- signed/unsigned mode switch ('1' enables the signed corrections)
		SignedMode : in std_ulogic;
		-- MAC modes
		-- "000" => normal operation
		-- "001" => low-part mac
		-- "010" => high-part mac
		-- "100" => alternative (same-size) mac
		Mac : in std_ulogic_vector(2 downto 0);
		-- SIMD mode switches, as usual
		-- "000" => 8-bit
		-- "001" => 16-bit
		-- "011" => 32-bit
		-- "111" => 64-bit
		U : in std_ulogic_vector(2 downto 0);
		-- clock/reset/Enable inputs
		-- Rst is active high and is tested before the clock edge in every
		-- register, i.e. it acts asynchronously.  En enables the first
		-- pipeline stage and is passed on from stage to stage.  None of the
		-- three has any effect on the results when PIPE_AFTER = 0.
		Clk : in std_ulogic;
		Rst : in std_ulogic;
		En : in std_ulogic;
	-- outputs; d = gate delay estimate, stages apply to PIPE_AFTER = 6
		-- 8-bit results
		Y08l : out std_ulogic_vector(63 downto 0);	-- d=18 (3 stages)
		Y08h : out std_ulogic_vector(63 downto 0);	-- d=20 (4 stages)
		-- 16-bit results
		Y16l : out std_ulogic_vector(63 downto 0);	-- d=24 (4 stages)
		Y16h : out std_ulogic_vector(63 downto 0);	-- d=26 (5 stages)
		-- 32-bit results
		Y32l : out std_ulogic_vector(63 downto 0);	-- d=28 (5 stages)
		Y32h : out std_ulogic_vector(63 downto 0);	-- d=28 (5 stages)
		-- 64-bit results
		Y64l : out std_ulogic_vector(63 downto 0);	-- d=32 (6 stages)
		Y64h : out std_ulogic_vector(63 downto 0)	-- d=34 (6 stages)
	);
--pragma synthesis_off
begin
	-- pipelining restrictions (checked in simulation only)
	assert (PIPE_AFTER = 6) or (PIPE_AFTER = 0)
		report "PIPE_AFTER must be 0 or 6 in this version"
		severity failure;
	assert PIPE_DELAY = 0
		report "PIPE_DELAY must be 0 in this version"
		severity failure;
--pragma synthesis_on
end IMul64;

architecture Behave_1 of IMul64 is
	constant w : natural := 128;

	type matrix is array (natural range <>)
		of std_ulogic_vector(w-1 downto 0);
	type small_matrix is array (natural range <>)
		of std_ulogic_vector(w/2+3 downto 0);

	-- Compresses every group of four matrix rows into three rows.
	--
	-- With the default wiring, rows 4*i+0, 4*i+1 and 4*i+2 of a group are
	-- combined into a shifted maj23() word and a three-way XOR word, and
	-- row 4*i+3 is passed through unchanged.  (maj23 and lshift are not
	-- defined in this file; presumably bitwise majority and left shift,
	-- which would make this a carry-save adder row -- confirm against the
	-- imported packages.)
	--
	-- X       matrix whose row count is a multiple of 4 (any index range)
	-- returns matrix(3*X'length/4-1 downto 0)
	function reduce_4_3 (X : in matrix) return matrix is
		constant L : natural := X'length;
		-- Wiring selector.  TRUE assumes rows 4*i+1 and 4*i+3 arrive late:
		-- row 4*i+1 feeds the last gate and row 4*i+3 bypasses the adder.
		-- FALSE selects the alternative wiring that bypasses row 4*i+0.
		constant LATE_ODD_ROWS : boolean := TRUE;
		variable t0 : matrix(L-1 downto 0);
		variable t1 : matrix(3*L/4-1 downto 0);
	begin
--pragma synthesis_off
		-- a partial group would be silently ignored by the loop below, so
		-- treat it as a hard error, like the generic checks in the entity
		assert L mod 4 = 0
			report "reduce_4_3: row count must be a multiple of 4"
			severity failure;
--pragma synthesis_on
		-- renumber the rows to L-1 downto 0, whatever range X came with
		t0 := X;
		for i in L/4-1 downto 0 loop
			if LATE_ODD_ROWS then
				t1(3*i+0) := lshift(maj23(t0(4*i+0), t0(4*i+2), t0(4*i+1)), 1);
				-- explicit grouping keeps the late row on the final XOR
				t1(3*i+1) := (t0(4*i+0) xor t0(4*i+2)) xor t0(4*i+1);
				t1(3*i+2) := t0(4*i+3);
			else
				t1(3*i+0) := t0(4*i+0);
				t1(3*i+1) := xor3(t0(4*i+1), t0(4*i+2), t0(4*i+3));
				t1(3*i+2) := lshift(maj23(t0(4*i+1), t0(4*i+2), t0(4*i+3)), 1);
			end if;
		end loop;
		return t1;
	end reduce_4_3;

	-- Compresses every group of three matrix rows into two rows.
	--
	-- Each group yields a shifted maj23() word and a three-way XOR word.
	-- (maj23 and lshift are not defined in this file; presumably bitwise
	-- majority and left shift, which would make this a carry-save adder
	-- row -- confirm against the imported packages.)
	--
	-- X       matrix whose row count is a multiple of 3 (any index range)
	-- returns matrix(2*X'length/3-1 downto 0)
	function reduce_3_2 (X : in matrix) return matrix is
		constant L : natural := X'length;
		-- Wiring selector.  TRUE assumes row 3*i+1 arrives late and feeds
		-- it into the last gate; FALSE selects the plain xor3/maj23 form
		-- (which also swaps the positions of the two result rows).
		constant LATE_MIDDLE_ROW : boolean := TRUE;
		variable t1 : matrix(L-1 downto 0);
		variable t2 : matrix(2*L/3-1 downto 0);
	begin
--pragma synthesis_off
		-- a partial group would be silently ignored by the loop below, so
		-- treat it as a hard error, like the generic checks in the entity
		assert L mod 3 = 0
			report "reduce_3_2: row count must be a multiple of 3"
			severity failure;
--pragma synthesis_on
		-- renumber the rows to L-1 downto 0, whatever range X came with
		t1 := X;
		for i in L/3-1 downto 0 loop
			if LATE_MIDDLE_ROW then
				t2(2*i+0) := lshift(maj23(t1(3*i+0), t1(3*i+2), t1(3*i+1)), 1);
				-- explicit grouping keeps the late row on the final XOR
				t2(2*i+1) := (t1(3*i+0) xor t1(3*i+2)) xor t1(3*i+1);
			else
				t2(2*i+0) := xor3(t1(3*i+0), t1(3*i+1), t1(3*i+2));
				t2(2*i+1) := lshift(maj23(t1(3*i+0), t1(3*i+1), t1(3*i+2)), 1);
			end if;
		end loop;
		return t2;
	end reduce_3_2;

	-- pipeline enable
	-- En_1 follows the En port; En_2 .. En_5 are driven by the processes
	-- input, tree_1, tree_2 and tree_3 respectively
	signal En_1, En_2, En_3, En_4, En_5 : std_ulogic;
	-- input stage data
	-- va: squeezed partial-product rows, driven by process input
	-- v1: the same rows expanded back to full width by process unsq
	-- vc: correction/MAC rows, driven by process signed_corr
	signal v1 : matrix(31 downto 0);
	signal va : small_matrix(31 downto 0);
	signal vc : matrix(15 downto 0);
	-- reducer data (outputs of tree_1, tree_2 and tree_3)
	signal vr1 : matrix(15 downto 0);
	signal vr2 : matrix( 5 downto 0);
	signal vr3 : matrix( 1 downto 0);
	-- reducer branches: the two remaining operand words per lane width
	-- (08/16 from tree_2, 32 from tree_3, 64 from tree_4)
	signal v_08_a, v_08_b : std_ulogic_vector(w-1 downto 0);
	signal v_16_a, v_16_b : std_ulogic_vector(w-1 downto 0);
	signal v_32_a, v_32_b : std_ulogic_vector(w-1 downto 0);
	signal v_64_a, v_64_b : std_ulogic_vector(w-1 downto 0);
	-- signals used in 8-bit output branch (res_08_1 -> res_08_2)
	signal r08_Y2, r08_C2 : std_ulogic_vector(w-1 downto 0);
	signal r08_G2 : std_ulogic_vector(w/8-1 downto 0);
	-- signals used in 16-bit output branch
	-- (res_08_3 -> res_16_1 -> res_16_2)
	signal r16_G0, r16_P0 : std_ulogic_vector(w-1 downto 0);
	signal r16_Y2, r16_C2 : std_ulogic_vector(w-1 downto 0);
	signal r16_G2 : std_ulogic_vector(w/16-1 downto 0);
	-- signals used in 32-bit output branch (res_32_1 -> res_32_2)
	signal r32_Y1, r32_C1 : std_ulogic_vector(w-1 downto 0);
	signal r32_S1, r32_I1 : std_ulogic_vector(w/4-1 downto 0);
	signal r32_G2, r32_P2 : std_ulogic_vector(w/16-1 downto 0);
	-- signals used in 64-bit output branch (res_64_1 -> res_64_2)
	signal r64_Y2, r64_C2 : std_ulogic_vector(w-1 downto 0);
	signal r64_S2, r64_I2 : std_ulogic_vector(w/16-1 downto 0);
	signal r64_G3 : std_ulogic_vector(w/64-1 downto 0);
begin
	-- enable signal
	-- d=0
	En_1 <= En;

	-- gated 1x1-bit products
	-- d=6
	-- Builds the 64x64 matrix of 1-bit products A(j) and B(i), blanks the
	-- products that cross a SIMD lane boundary, pre-reduces the 64 rows to
	-- 32 (4:3 followed by 3:2) and stores a narrowed copy of every row in
	-- va.  Also forwards the stage enable (En_1 -> En_2).
	input : process (A, B, U, Clk, Rst, En_1)
		-- SIMD gating
		-- x and y are byte indices.  Returns '1' when both bits lie in the
		-- same byte; otherwise the U bit of the smallest chunk (2, 4, 8
		-- bytes for U'low upwards) that contains both bytes, or '0' when
		-- no such chunk is found within U's range.
		function simd (U : in std_ulogic_vector;
					   x, y : in natural) return std_ulogic is
			variable n : natural;
		begin
			if x = y then
				return '1';
			else
				n := 2;
				for i in U'low to U'high loop
					if x / n = y / n then
						return U(i);
					end if;
					n := 2 * n;
				end loop;
				return '0';
			end if;
		end simd;

		variable t0 : matrix(63 downto 0);
		variable t1 : matrix(47 downto 0);
		variable t2 : matrix(31 downto 0);
		variable t3 : small_matrix(31 downto 0);
	begin
		-- product matrix
		-- row j holds A(j) times all bits of B, placed at weight j+i
		-- d=1
		t0 := (others => (others => '0'));
		for j in 63 downto 0 loop
			for i in 63 downto 0 loop
				t0(j)(j+i) := A(j) and B(i) and simd(U, i/8, j/8);
			end loop;
		end loop;

		-- reducers
		t1 := reduce_4_3(t0);
		t2 := reduce_3_2(t1);

		-- squeezer
		-- keep only a (w/2+4)-bit window of each row, starting at bit 4*i
		-- for row pair i; process unsq puts the window back in place
		for i in t2'length/2-1 downto 0 loop
			t3(2*i+0) := t2(2*i+0)(w/2+3+4*i downto 4*i);
			t3(2*i+1) := t2(2*i+1)(w/2+3+4*i downto 4*i);
		end loop;

		-- output register
		-- bypassed when PIPE_AFTER = 0; otherwise reset has priority over
		-- the clock edge, data is held while En_1 is not '1', and the
		-- enable itself is passed on with every clock edge
		if PIPE_AFTER = 0 then
			va <= t3;
			En_2 <= En_1;
		elsif to_X01(Rst) = '1' then
			va <= (others => (others => '0'));
			En_2 <= '0';
		elsif rising_edge(Clk) then
			if to_X01(En_1) = '1' then
				va <= t3;
			end if;
			En_2 <= En_1;
		end if;
	end process;

	-- Expands the narrowed rows in va back to full width: the rows of
	-- pair p are placed at bit offset 4*p, every other bit of v1 is '0'.
	unsq : process (va)
		variable wide : matrix(v1'range);
	begin
		-- unsqueezer
		wide := (others => (others => '0'));
		for pair in 0 to va'length/2-1 loop
			for k in 0 to 1 loop
				wide(2*pair+k)(w/2+3+4*pair downto 4*pair) := va(2*pair+k);
			end loop;
		end loop;
		v1 <= wide;
	end process;

	-- signed/unsigned correction, MAC input
	-- d=6
	-- Builds the 32 extra matrix rows that are summed together with the
	-- partial products: the accumulate input X, gated by the Mac bits, and
	-- the correction terms that are enabled by SignedMode.  The rows are
	-- pre-reduced from 32 to 16 (4:3 followed by 3:2) and stored in vc.
	-- Every assignment below targets its own row/bit positions of t00.
	signed_corr : process (A, B, X, U, SignedMode, Mac, Clk, Rst, En_1)
		variable t00 : matrix(31 downto 0);
		variable t01 : matrix(23 downto 0);
		variable t02 : matrix(15 downto 0);
		variable un : std_ulogic_vector(2 downto 0);
		variable corr08 : std_ulogic_vector(7 downto 0);
		variable an, bn : std_ulogic_vector(63 downto 0);
		variable gate_mac : std_ulogic_vector(7 downto 0);
		variable g, t1, t2 : std_ulogic;
	begin
		-- negated size flags
		-- d=1
		un := not U;

		-- 8-bit correction gate vector
		-- entry i tells whether byte i is the topmost byte of its lane in
		-- the current SIMD mode (byte 7 always is)
		-- d=1
		corr08 := (7 => '1', 3 => un(2), 5|1 => un(1), others => un(0));

		-- both operands, inverted (for subtraction)
		-- d=1
		an := not A;
		bn := not B;

		-- default output vector
		t00 := (others => (others => '0'));

		-- gated X input (standard F-CPU `widening' MAC instruction)
		-- Mac(0) places X in the lower 64 bits, Mac(1) in the upper 64 bits
		-- d=1
		for i in 0 to 3 loop
			for j in 0 to 15 loop
				t00(4*i+ 0+3)(16*i+j+ 0) := Mac(0) and X(16*i+j);
				t00(4*i+16+3)(16*i+j+64) := Mac(1) and X(16*i+j);
			end loop;
		end loop;

		-- gated X input (MR's alternative `same-size' MAC instruction)
		-- gate_mac(i) tells whether byte i is the lowest byte of its lane;
		-- the four loops add bits 0-7, 8-15, 16-31 and 32-63 of each lane
		-- d=2
		gate_mac := (0 => '1', 4 => un(2), 6|2 => un(1), others => un(0));
		for i in 0 to 7 loop
			for j in 0 to 7 loop
				t00(4*i+ 0)(16*i+j) :=
					Mac(2) and X(8*i+j) and gate_mac(i);
			end loop;
		end loop;
		for i in 0 to 3 loop
			for j in 8 to 15 loop
				t00(8*i+ 4)(32*i+j) :=
					Mac(2) and X(16*i+j) and gate_mac(2*i) and U(0);
			end loop;
		end loop;
		for i in 0 to 1 loop
			for j in 16 to 31 loop
				t00(16*i+ 8)(64*i+j) :=
					Mac(2) and X(32*i+j) and gate_mac(4*i) and U(1);
			end loop;
		end loop;
		for i in 0 to 0 loop
			for j in 32 to 63 loop
				t00(32*i+16)(128*i+j) :=
					Mac(2) and X(64*i+j) and gate_mac(8*i) and U(2);
			end loop;
		end loop;

		-- 8-bit correction vectors
		-- t1/t2 are the XOR/AND of the two sign bits and are only used in
		-- 8-bit mode (un(0)); the inner loop adds the inverted operand
		-- bytes, gated by the other operand's sign bit and corr08
		-- d=2
		g := SignedMode;
		for i in 0 to 7 loop
			t1 := A(8*i+7) xor B(8*i+7);
			t2 := A(8*i+7) and B(8*i+7);
			t00(4*i+0)(16*i+0+8) := t1 and un(0) and g;
			t00(4*i+0)(16*i+1+8) := t2 and un(0) and g;
			for j in 0 to 7 loop
				t00(4*i+1)(16*i+j+8) :=
					B(8*i+7) and g and corr08(i) and an(8*i+j);
				t00(4*i+2)(16*i+j+8) :=
					A(8*i+7) and g and corr08(i) and bn(8*i+j);
			end loop;
		end loop;

		-- 16-bit correction vectors
		-- only the low byte of each halfword is handled here (j = 0..7);
		-- the high byte is covered by the 8-bit loop above via corr08
		-- d=2
		g := SignedMode and U(0);
		for i in 0 to 3 loop
			t1 := A(16*i+15) xor B(16*i+15);
			t2 := A(16*i+15) and B(16*i+15);
			t00(8*i+0)(32*i+0+16) := t1 and un(1) and g;
			t00(8*i+0)(32*i+1+16) := t2 and un(1) and g;
			for j in 0 to 7 loop
				t00(8*i+1)(32*i+j+16) :=
					B(16*i+15) and g and corr08(2*i+1) and an(16*i+j);
				t00(8*i+2)(32*i+j+16) :=
					A(16*i+15) and g and corr08(2*i+1) and bn(16*i+j);
			end loop;
		end loop;

		-- 32-bit correction vectors
		-- handles the low 16 bits of each word (j = 0..15); the upper
		-- bytes are covered by the 16-bit and 8-bit loops above
		-- d=2
		g := SignedMode and U(1);
		for i in 0 to 1 loop
			t1 := A(32*i+31) xor B(32*i+31);
			t2 := A(32*i+31) and B(32*i+31);
			t00(16*i+4)(64*i+0+32) := t1 and un(2) and g;
			t00(16*i+4)(64*i+1+32) := t2 and un(2) and g;
			for j in 0 to 15 loop
				t00(16*i+5)(64*i+j+32) :=
					B(32*i+31) and g and corr08(4*i+3) and an(32*i+j);
				t00(16*i+6)(64*i+j+32) :=
					A(32*i+31) and g and corr08(4*i+3) and bn(32*i+j);
			end loop;
		end loop;

		-- 64-bit correction vectors
		-- handles the low 32 bits (j = 0..31); the upper bytes are covered
		-- by the 32-, 16- and 8-bit loops above
		-- d=2
		g := SignedMode and U(2);
		for i in 0 to 0 loop
			t1 := A(64*i+63) xor B(64*i+63);
			t2 := A(64*i+63) and B(64*i+63);
			t00(32*i+12)(128*i+0+64) := t1 and g;
			t00(32*i+12)(128*i+1+64) := t2 and g;
			for j in 0 to 31 loop
				t00(32*i+13)(128*i+j+64) :=
					B(64*i+63) and g and corr08(8*i+7) and an(64*i+j);
				t00(32*i+14)(128*i+j+64) :=
					A(64*i+63) and g and corr08(8*i+7) and bn(64*i+j);
			end loop;
		end loop;

		-- reducers
		t01 := reduce_4_3(t00);
		t02 := reduce_3_2(t01);

		-- output register
		-- bypassed when PIPE_AFTER = 0; otherwise reset has priority over
		-- the clock edge and data is held while En_1 is not '1'
		if PIPE_AFTER = 0 then
			vc <= t02;
		elsif to_X01(Rst) = '1' then
			vc <= (others => (others => '0'));
		elsif rising_edge(Clk) then
			if to_X01(En_1) = '1' then
				vc <= t02;
			end if;
		end if;
	end process;

	-- reducer tree part #1 (located in stage 2)
	-- Merges the 32 product rows (v1) with the 16 correction rows (vc) and
	-- reduces the resulting 48 rows to 16 (3:2, 4:3, 3:2).
	tree_1 : process (v1, vc, Clk, Rst, En_2)
		variable merged : matrix(47 downto 0);
		variable rows32 : matrix(31 downto 0);
		variable rows24 : matrix(23 downto 0);
		variable rows16 : matrix(15 downto 0);
	begin
		-- reorder signals: each group of six rows takes four product rows
		-- followed by two correction rows
		-- d=6
		for grp in 0 to merged'length/6-1 loop
			for k in 0 to 3 loop
				merged(6*grp+k) := v1(4*grp+k);
			end loop;
			for k in 0 to 1 loop
				merged(6*grp+4+k) := vc(2*grp+k);
			end loop;
		end loop;

		-- reducers
		rows32 := reduce_3_2(merged);
		rows24 := reduce_4_3(rows32);
		rows16 := reduce_3_2(rows24);

		-- pipeline register (stage 2)
		-- d=12
		if PIPE_AFTER /= 0 then
			if to_X01(Rst) = '1' then
				-- reset wins over the clock edge
				En_3 <= '0';
				vr1 <= (others => (others => '0'));
			elsif rising_edge(Clk) then
				En_3 <= En_2;
				if to_X01(En_2) = '1' then
					vr1 <= rows16;
				end if;
			end if;
		else
			-- no pipelining: pass the data straight through
			En_3 <= '1';
			vr1 <= rows16;
		end if;
	end process;

	-- reducer tree part #2 (located in stage 3)
	-- Taps the operand pairs for the 8-bit and 16-bit result branches and
	-- reduces the 16 rows of vr1 to 6 (4:3, 3:2, 4:3).
	tree_2 : process (vr1, Clk, Rst, En_3)
		variable rows12 : matrix(11 downto 0);
		variable rows8 : matrix( 7 downto 0);
		variable rows6 : matrix( 5 downto 0);
		variable a08, b08 : std_ulogic_vector(w-1 downto 0);
		variable a16, b16 : std_ulogic_vector(w-1 downto 0);
	begin
		-- branch output for 8-bit results: 16-bit chunk n is taken from
		-- row pair n of the unreduced input
		-- d=12
		for n in 7 downto 0 loop
			a08(16*n+15 downto 16*n) := vr1(2*n+0)(16*n+15 downto 16*n);
			b08(16*n+15 downto 16*n) := vr1(2*n+1)(16*n+15 downto 16*n);
		end loop;
		v_08_a <= a08;
		v_08_b <= b08;

		-- reducers
		rows12 := reduce_4_3(vr1);
		rows8 := reduce_3_2(rows12);

		-- branch output for 16-bit results: 32-bit chunk n is taken from
		-- row pair n after two reduction steps
		-- d=16
		for n in 3 downto 0 loop
			a16(32*n+31 downto 32*n) := rows8(2*n+0)(32*n+31 downto 32*n);
			b16(32*n+31 downto 32*n) := rows8(2*n+1)(32*n+31 downto 32*n);
		end loop;
		v_16_a <= a16;
		v_16_b <= b16;

		-- reducer
		rows6 := reduce_4_3(rows8);

		-- pipeline register (stage 3)
		-- d=18
		if PIPE_AFTER /= 0 then
			if to_X01(Rst) = '1' then
				-- reset wins over the clock edge
				En_4 <= '0';
				vr2 <= (others => (others => '0'));
			elsif rising_edge(Clk) then
				En_4 <= En_3;
				if to_X01(En_3) = '1' then
					vr2 <= rows6;
				end if;
			end if;
		else
			-- no pipelining: pass the data straight through
			En_4 <= '1';
			vr2 <= rows6;
		end if;
	end process;

	-- reducer tree part #3 (located in stage 4)
	-- d=12/16/20/24 (depending on output)
	-- Reduces the 6 rows of vr2 to 4, taps the operand pair for the 32-bit
	-- result branch, then reduces further to the final 2 rows (4:3, 3:2).
	tree_3 : process (vr2, Clk, Rst, En_4)
		variable rows4 : matrix(3 downto 0);
		variable rows3 : matrix(2 downto 0);
		variable rows2 : matrix(1 downto 0);
		variable a32, b32 : std_ulogic_vector(w-1 downto 0);
	begin
		-- reducer
		rows4 := reduce_3_2(vr2);

		-- branch output for 32-bit results: 64-bit chunk n is taken from
		-- row pair n
		-- d=20
		for n in 1 downto 0 loop
			a32(64*n+63 downto 64*n) := rows4(2*n+0)(64*n+63 downto 64*n);
			b32(64*n+63 downto 64*n) := rows4(2*n+1)(64*n+63 downto 64*n);
		end loop;
		v_32_a <= a32;
		v_32_b <= b32;

		-- reducers
		rows3 := reduce_4_3(rows4);
		rows2 := reduce_3_2(rows3);

		-- pipeline register (stage 4)
		-- d=24
		if PIPE_AFTER /= 0 then
			if to_X01(Rst) = '1' then
				-- reset wins over the clock edge
				En_5 <= '0';
				vr3 <= (others => (others => '0'));
			elsif rising_edge(Clk) then
				En_5 <= En_4;
				if to_X01(En_4) = '1' then
					vr3 <= rows2;
				end if;
			end if;
		else
			-- no pipelining: pass the data straight through
			En_5 <= '1';
			vr3 <= rows2;
		end if;
	end process;

	-- reducer tree part #4
	-- d=24 (depending on output)
	-- Hands the two fully reduced rows to the 64-bit result branch.
	tree_4 : process (vr3)
	begin
		-- output for 64-bit results: the single 128-bit lane spans the
		-- complete rows, so both are forwarded as they are
		v_64_a <= vr3(0);
		v_64_b <= vr3(1);
	end process;

	-- 8-bit results part #1 (located in stage 3)
	-- Adds the two 8-bit branch operands.  The low byte of every 16-bit
	-- chunk is complete here and goes to Y08l; the intermediate vectors
	-- needed for the high bytes are registered for res_08_2.
	-- NOTE(review): CIA_Row is not defined in this file (presumably from
	-- work.Generic_Adder); it appears to produce per-bit S0/C1 vectors and
	-- one G1/P1 pair per 4-bit group -- confirm against the package.
	res_08_1 : process (v_08_a, v_08_b, Clk, Rst, En_3)
		variable G0, P0 : std_ulogic_vector(w-1 downto 0);
		variable S0 : std_ulogic_vector(w-1 downto 0);
		variable Y1, C1 : std_ulogic_vector(w-1 downto 0);
		variable G1, P1 : std_ulogic_vector(w/4-1 downto 0);
		variable Y2, C2 : std_ulogic_vector(w-1 downto 0);
		variable G2 : std_ulogic_vector(w/8-1 downto 0);
		variable yl : std_ulogic_vector(63 downto 0);
	begin
		-- a row of half adders
		-- d=13
		G0 := v_08_a and v_08_b;
		P0 := v_08_a xor v_08_b;

		-- 4-bit results
		-- d=15
		CIA_Row(G0, P0, S0, C1, G1, P1);

		-- d=16
		Y1 := P0 xor S0;

		-- 8-bit results
		-- j runs over the 16 bytes: G2 merges the G/P of the byte's two
		-- 4-bit groups, the upper four bits are corrected with the lower
		-- group's G1, the lower four bits are passed through
		-- d=17
		for j in 15 downto 0 loop
			G2(j) := G1(2*j+1) or (P1(2*j+1) and G1(2*j));
			for i in 7 downto 4 loop
				Y2(8*j+i) := Y1(8*j+i) xor (C1(8*j+i) and G1(2*j));
				C2(8*j+i) := C1(8*j+i) and P1(2*j);
			end loop;
			for i in 3 downto 0 loop
				Y2(8*j+i) := Y1(8*j+i);
				C2(8*j+i) := C1(8*j+i);
			end loop;
		end loop;

		-- 8-bit results (lower half), reordered
		-- byte j of the output is the low byte of 16-bit chunk j
		-- d=17
		for j in 7 downto 0 loop
			yl(8*j+7 downto 8*j) := Y2(16*j+ 7 downto 16*j+0);
		end loop;
		Y08l <= yl;

		-- pipeline register (stage 3)
		-- bypassed when PIPE_AFTER = 0; otherwise reset has priority over
		-- the clock edge and data is held while En_3 is not '1'
		-- d=18
		if PIPE_AFTER = 0 then
			r08_Y2 <= Y2;
			r08_C2 <= C2;
			r08_G2 <= G2;
		elsif to_X01(Rst) = '1' then
			r08_Y2 <= (others => '0');
			r08_C2 <= (others => '0');
			r08_G2 <= (others => '0');
		elsif rising_edge(Clk) then
			if to_X01(En_3) = '1' then
				r08_Y2 <= Y2;
				r08_C2 <= C2;
				r08_G2 <= G2;
			end if;
		end if;
	end process;

	-- 8-bit results part #2 (located in stage 4)
	-- Finishes the high byte of every 8-bit product and drives Y08h.
	res_08_2 : process (r08_Y2, r08_C2, r08_G2)
		variable hi_bytes : std_ulogic_vector(63 downto 0);
		variable low_gen : std_ulogic;
	begin
		-- 8-bit outputs (upper half), reordered
		-- d=20
		for lane in 0 to 7 loop
			-- G2 entry of the low byte of this 16-bit chunk
			low_gen := r08_G2(2*lane);
			for k in 0 to 7 loop
				hi_bytes(8*lane+k) :=
					r08_Y2(16*lane+k+8) xor (r08_C2(16*lane+k+8) and low_gen);
			end loop;
		end loop;
		Y08h <= hi_bytes;
	end process;

	-- 16-bit results part #1 (located in stage 3)
	-- First step of the 16-bit result branch: AND/XOR of the two branch
	-- operands, registered for res_16_1.
	-- NOTE(review): the label says 08 although the process handles the
	-- 16-bit branch; kept unchanged in case something refers to it.
	res_08_3 : process (v_16_a, v_16_b, Clk, Rst, En_3)
		variable gen, prop : std_ulogic_vector(w-1 downto 0);
	begin
		-- a row of half adders
		-- d=17
		prop := v_16_a xor v_16_b;
		gen := v_16_a and v_16_b;

		-- pipeline register (stage 3)
		-- d=18
		if PIPE_AFTER /= 0 then
			if to_X01(Rst) = '1' then
				-- reset wins over the clock edge
				r16_P0 <= (others => '0');
				r16_G0 <= (others => '0');
			elsif rising_edge(Clk) then
				if to_X01(En_3) = '1' then
					r16_P0 <= prop;
					r16_G0 <= gen;
				end if;
			end if;
		else
			-- no pipelining: pass the data straight through
			r16_P0 <= prop;
			r16_G0 <= gen;
		end if;
	end process;

	-- 16-bit results part #2 (located in stage 4)
	-- Continues the 16-bit addition from the registered G0/P0 vectors.
	-- The low halfword of every 32-bit chunk is complete here and goes to
	-- Y16l; Y2, C2 and G2 are registered for res_16_2.
	-- NOTE(review): CIA_Row and CIA_Inc are not defined in this file
	-- (presumably from work.Generic_Adder); the trailing 4 passed to
	-- CIA_Inc looks like a group size -- confirm against the package.
	res_16_1 : process (r16_G0, r16_P0, Clk, Rst, En_4)
		variable G0, P0 : std_ulogic_vector(w-1 downto 0);
		variable S0 : std_ulogic_vector(w-1 downto 0);
		variable Y1, C1 : std_ulogic_vector(w-1 downto 0);
		variable G1, P1 : std_ulogic_vector(w/4-1 downto 0);
		variable S1, I1 : std_ulogic_vector(w/4-1 downto 0);
		variable Y2, C2 : std_ulogic_vector(w-1 downto 0);
		variable G2, P2 : std_ulogic_vector(w/16-1 downto 0);
		variable yl : std_ulogic_vector(63 downto 0);
	begin
		G0 := r16_G0;
		P0 := r16_P0;

		-- 4-bit results
		-- d=20
		CIA_Row(G0, P0, S0, C1, G1, P1);

		-- d=21
		Y1 := P0 xor S0;

		-- second level, operating on the per-group G1/P1 vectors
		-- d=22
		CIA_Row(G1, P1, S1, I1, G2, P2);

		-- d=24
		CIA_Inc(Y1, C1, S1, I1, Y2, C2, 4);

		-- 16-bit results (lower half), reordered
		-- halfword j of the output is the low half of 32-bit chunk j
		-- d=24
		for j in 3 downto 0 loop
			yl(16*j+15 downto 16*j) := Y2(32*j+15 downto 32*j);
		end loop;
		Y16l <= yl;

		-- pipeline register (stage 4)
		-- bypassed when PIPE_AFTER = 0; otherwise reset has priority over
		-- the clock edge and data is held while En_4 is not '1'
		-- (P2 is computed above but not passed on)
		-- d=24
		if PIPE_AFTER = 0 then
			r16_Y2 <= Y2;
			r16_C2 <= C2;
			r16_G2 <= G2;
		elsif to_X01(Rst) = '1' then
			r16_Y2 <= (others => '0');
			r16_C2 <= (others => '0');
			r16_G2 <= (others => '0');
		elsif rising_edge(Clk) then
			if to_X01(En_4) = '1' then
				r16_Y2 <= Y2;
				r16_C2 <= C2;
				r16_G2 <= G2;
			end if;
		end if;
	end process;

	-- 16-bit results part #3 (located in stage 5)
	-- Finishes the high halfword of every 16-bit product and drives Y16h.
	res_16_2 : process (r16_Y2, r16_C2, r16_G2)
		variable hi_halves : std_ulogic_vector(63 downto 0);
		variable low_gen : std_ulogic;
	begin
		-- 16-bit outputs (upper half), reordered
		-- d=26
		for lane in 0 to 3 loop
			-- G2 entry of the low halfword of this 32-bit chunk
			low_gen := r16_G2(2*lane);
			for k in 0 to 15 loop
				hi_halves(16*lane+k) :=
					r16_Y2(32*lane+k+16) xor (r16_C2(32*lane+k+16) and low_gen);
			end loop;
		end loop;
		Y16h <= hi_halves;
	end process;

	-- 32-bit results part #1 (located in stage 4)
	-- Starts the addition of the two 32-bit branch operands and registers
	-- the intermediate vectors for res_32_2; no output port is driven here.
	-- NOTE(review): CIA_Row is not defined in this file (presumably from
	-- work.Generic_Adder) -- confirm its parameter order against the
	-- package before touching the calls below.
	res_32_1 : process (v_32_a, v_32_b, Clk, Rst, En_4)
		variable G0, P0 : std_ulogic_vector(w-1 downto 0);
		variable S0 : std_ulogic_vector(w-1 downto 0);
		variable Y1, C1 : std_ulogic_vector(w-1 downto 0);
		variable G1, P1 : std_ulogic_vector(w/4-1 downto 0);
		variable S1, I1 : std_ulogic_vector(w/4-1 downto 0);
		variable G2, P2 : std_ulogic_vector(w/16-1 downto 0);
	begin
		-- a row of half adders
		-- d=20
		G0 := v_32_a and v_32_b;
		P0 := v_32_a xor v_32_b;

		-- 4-bit results
		-- d=22
		CIA_Row(G0, P0, S0, C1, G1, P1);

		-- d=23
		Y1 := P0 xor S0;

		-- second level, operating on the per-group G1/P1 vectors
		-- d=24
		CIA_Row(G1, P1, S1, I1, G2, P2);

		-- pipeline register (stage 4)
		-- bypassed when PIPE_AFTER = 0; otherwise reset has priority over
		-- the clock edge and data is held while En_4 is not '1'
		-- d=24
		if PIPE_AFTER = 0 then
			r32_Y1 <= Y1;
			r32_C1 <= C1;
			r32_S1 <= S1;
			r32_I1 <= I1;
			r32_G2 <= G2;
			r32_P2 <= P2;
		elsif to_X01(Rst) = '1' then
			r32_Y1 <= (others => '0');
			r32_C1 <= (others => '0');
			r32_S1 <= (others => '0');
			r32_I1 <= (others => '0');
			r32_G2 <= (others => '0');
			r32_P2 <= (others => '0');
		elsif rising_edge(Clk) then
			if to_X01(En_4) = '1' then
				r32_Y1 <= Y1;
				r32_C1 <= C1;
				r32_S1 <= S1;
				r32_I1 <= I1;
				r32_G2 <= G2;
				r32_P2 <= P2;
			end if;
		end if;
	end process;

	-- 32-bit results part #2 (located in stage 5)
	-- Completes the 32-bit addition from the vectors registered by
	-- res_32_1 and drives both Y32l and Y32h.
	-- NOTE(review): CIA_Inc and CIA_Row are not defined in this file
	-- (presumably from work.Generic_Adder); the trailing 4 and 16 passed
	-- to CIA_Inc look like group sizes -- confirm against the package.
	res_32_2 : process (r32_Y1, r32_C1, r32_S1, r32_I1, r32_G2, r32_P2)
		variable Y1, C1 : std_ulogic_vector(w-1 downto 0);
		variable S1, I1 : std_ulogic_vector(w/4-1 downto 0);
		variable Y2, C2 : std_ulogic_vector(w-1 downto 0);
		variable G2, P2 : std_ulogic_vector(w/16-1 downto 0);
		variable S2, I2 : std_ulogic_vector(w/16-1 downto 0);
		variable Y3, C3 : std_ulogic_vector(w-1 downto 0);
		variable G3, P3 : std_ulogic_vector(w/64-1 downto 0);
		variable yh, yl : std_ulogic_vector(63 downto 0);
	begin
		-- local copies of the registered inputs
		Y1 := r32_Y1;
		C1 := r32_C1;
		S1 := r32_S1;
		I1 := r32_I1;
		G2 := r32_G2;
		P2 := r32_P2;

		-- d=26
		CIA_Inc(Y1, C1, S1, I1, Y2, C2, 4);

		-- d=26
		CIA_Row(G2, P2, S2, I2, G3, P3);

		-- d=28
		CIA_Inc(Y2, C2, S2, I2, Y3, C3, 16);

		-- 32-bit results, reordered
		-- word j of yl/yh is the low/high half of 64-bit chunk j
		-- (C3, G3 and P3 are computed above but not used)
		-- d=28
		for j in 1 downto 0 loop
			yl(32*j+31 downto 32*j) := Y3(64*j+31 downto 64*j+ 0);
			yh(32*j+31 downto 32*j) := Y3(64*j+63 downto 64*j+32);
		end loop;
		Y32l <= yl;
		Y32h <= yh;
	end process;

	-- 64-bit results part #1 (located in stage 5)
	-- Starts the addition of the two 64-bit branch operands and registers
	-- the intermediate vectors for res_64_2; no output port is driven here.
	-- NOTE(review): CIA_Row and CIA_Inc are not defined in this file
	-- (presumably from work.Generic_Adder); the trailing 4 passed to
	-- CIA_Inc looks like a group size -- confirm against the package.
	res_64_1 : process (v_64_a, v_64_b, Clk, Rst, En_5)
		variable G0, P0 : std_ulogic_vector(w-1 downto 0);
		variable S0 : std_ulogic_vector(w-1 downto 0);
		variable Y1, C1 : std_ulogic_vector(w-1 downto 0);
		variable G1, P1 : std_ulogic_vector(w/4-1 downto 0);
		variable S1, I1 : std_ulogic_vector(w/4-1 downto 0);
		variable Y2, C2 : std_ulogic_vector(w-1 downto 0);
		variable G2, P2 : std_ulogic_vector(w/16-1 downto 0);
		variable S2, I2 : std_ulogic_vector(w/16-1 downto 0);
		variable G3, P3 : std_ulogic_vector(w/64-1 downto 0);
	begin
		-- a row of half adders
		-- d=24
		G0 := v_64_a and v_64_b;
		P0 := v_64_a xor v_64_b;

		-- 4-bit results
		-- d=26
		CIA_Row(G0, P0, S0, C1, G1, P1);

		-- d=27
		Y1 := P0 xor S0;

		-- second level, operating on the per-group G1/P1 vectors
		-- d=28
		CIA_Row(G1, P1, S1, I1, G2, P2);

		-- d=30
		CIA_Inc(Y1, C1, S1, I1, Y2, C2, 4);

		-- third level, operating on the G2/P2 vectors
		-- d=30
		CIA_Row(G2, P2, S2, I2, G3, P3);

		-- pipeline register (stage 5)
		-- bypassed when PIPE_AFTER = 0; otherwise reset has priority over
		-- the clock edge and data is held while En_5 is not '1'
		-- (P3 is computed above but not passed on)
		-- d=30
		if PIPE_AFTER = 0 then
			r64_Y2 <= Y2;
			r64_C2 <= C2;
			r64_S2 <= S2;
			r64_I2 <= I2;
			r64_G3 <= G3;
		elsif to_X01(Rst) = '1' then
			r64_Y2 <= (others => '0');
			r64_C2 <= (others => '0');
			r64_S2 <= (others => '0');
			r64_I2 <= (others => '0');
			r64_G3 <= (others => '0');
		elsif rising_edge(Clk) then
			if to_X01(En_5) = '1' then
				r64_Y2 <= Y2;
				r64_C2 <= C2;
				r64_S2 <= S2;
				r64_I2 <= I2;
				r64_G3 <= G3;
			end if;
		end if;
	end process;

	-- 64-bit results part #2 (located in stage 6)
	-- Completes the 64-bit addition from the vectors registered by
	-- res_64_1 and drives Y64l and Y64h.
	-- NOTE(review): CIA_Inc is not defined in this file (presumably from
	-- work.Generic_Adder); the trailing 16 looks like a group size --
	-- confirm against the package.
	res_64_2 : process (r64_Y2, r64_C2, r64_S2, r64_I2, r64_G3)
		variable Y2, C2 : std_ulogic_vector(w-1 downto 0);
		variable S2, I2 : std_ulogic_vector(w/16-1 downto 0);
		variable Y3, C3 : std_ulogic_vector(w-1 downto 0);
		variable G3 : std_ulogic_vector(w/64-1 downto 0);
		variable yh, yl : std_ulogic_vector(63 downto 0);
	begin
		-- local copies of the registered inputs
		Y2 := r64_Y2;
		C2 := r64_C2;
		S2 := r64_S2;
		I2 := r64_I2;
		G3 := r64_G3;

		-- d=32
		CIA_Inc(Y2, C2, S2, I2, Y3, C3, 16);

		-- 64-bit results, reordered
		-- the loop runs once (j = 0): there is a single 128-bit product
		-- d=32/34
		for j in 0 downto 0 loop
			-- lower half is taken from Y3 as it is
			-- d=32
			yl(64*j+63 downto 64*j) := Y3(128*j+ 63 downto 128*j+ 0);
			-- upper half is corrected with the lower half's G3 entry
			-- d=34
			for i in 63 downto 0 loop
				yh(64*j+i) := Y3(128*j+i+64) xor (C3(128*j+i+64) and G3(2*j));
			end loop;
		end loop;
		Y64l <= yl;
		Y64h <= yh;
	end process;
end Behave_1;

-- vi: set ts=4 sw=4 equalprg="fmt -72 -p--": please
