-- idiv64.vhdl - 64-bit Integer Divider
-- Copyright (C) 2001 - 2003 Michael Riepe <michael@stud.uni-hannover.de>
--
-- This program is free software; you can redistribute it and/or modify
-- it under the terms of the GNU General Public License as published by
-- the Free Software Foundation; either version 2 of the License, or
-- (at your option) any later version.
--
-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with this program; if not, write to the Free Software
-- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

-- @(#) $Id: idiv64.vhdl,v 1.13 2003/04/07 03:07:00 michael Exp $

library IEEE;
use IEEE.std_logic_1164.all;
use work.Bit_Manipulation.all;

-- IDiv64: clocked SIMD integer divider (interface declaration).
--
-- The operands are WIDTH bits wide.  The architecture in this file
-- decodes the size flags U as "111" = 64-bit, "011" = 32-bit,
-- "001" = 16-bit and "000" = 8-bit chunks; any other value makes the
-- internal data paths 'X'.
entity IDiv64 is
	generic (
		-- operand width in bits; must be a positive multiple of 64
		WIDTH : natural := 64
	);
	port (
		-- dividend
		A : in std_ulogic_vector(WIDTH-1 downto 0);
		-- divisor
		B : in std_ulogic_vector(WIDTH-1 downto 0);
		-- signed flag ('1' selects signed interpretation of A and B)
		S : in std_ulogic;
		-- SIMD size flags
		U : in std_ulogic_vector(2 downto 0);
		-- clock/reset/Enable inputs
		-- (Rst is tested before the clock edge, i.e. it acts asynchronously)
		Clk : in std_ulogic;
		Rst : in std_ulogic;
		En : in std_ulogic;
	--
		-- quotient
		Y : out std_ulogic_vector(WIDTH-1 downto 0);
		-- remainder
		-- (architecture Behave_1 drives this with 'U': not implemented yet)
		Z : out std_ulogic_vector(WIDTH-1 downto 0)
	);
--pragma synthesis_off
begin
	-- WIDTH = 0 would satisfy "WIDTH mod 64 = 0" but leaves every
	-- vector with a null range, so reject it explicitly as well.
	assert (WIDTH > 0) and (WIDTH mod 64 = 0)
		report "IDiv64: WIDTH must be a positive multiple of 64"
		severity error;
--pragma synthesis_on
end IDiv64;

architecture Behave_1 of IDiv64 is
	-- shortcuts
	-- Applies cascade_or to every N-bit chunk of A in bit-reversed
	-- order: each chunk is bit-reversed, passed through cascade_or,
	-- and the result is bit-reversed back.  Bits of the result that
	-- are not covered by a full chunk stay 'X'.
	--   A : input vector (any range; normalized to A'length-1 downto 0)
	--   N : chunk size in bits (expected > 1 and a divisor of A'length)
	function reverse_cascade_or (A : in std_ulogic_vector;
								 N : in natural) return std_ulogic_vector is
		constant L : natural := A'length;
		variable src : std_ulogic_vector(L-1 downto 0);
		variable dst : std_ulogic_vector(L-1 downto 0) := (others => 'X');
		variable lo, hi : natural;
	begin
--pragma synthesis_off
		assert N > 1;
		assert L mod N = 0;
--pragma synthesis_on
		src := A;
		-- the chunks are independent, so the visiting order is irrelevant
		for k in 0 to L/N-1 loop
			lo := N*k;
			hi := lo + N - 1;
			dst(hi downto lo) :=
				bit_reverse(cascade_or(bit_reverse(src(hi downto lo))));
		end loop;
		return dst;
	end reverse_cascade_or;

	-- Bit-reverses every N-bit chunk of A in place (chunk positions
	-- are kept, only the bit order inside each chunk is mirrored).
	-- Bits of the result not covered by a full chunk stay 'X'.
	--   A : input vector (any range; normalized to A'length-1 downto 0)
	--   N : chunk size in bits (expected > 1 and a divisor of A'length)
	function reverse_chunks (A : in std_ulogic_vector;
							 N : in natural) return std_ulogic_vector is
		constant L : natural := A'length;
		variable src : std_ulogic_vector(L-1 downto 0);
		variable dst : std_ulogic_vector(L-1 downto 0) := (others => 'X');
		variable lo, hi : natural;
	begin
--pragma synthesis_off
		assert N > 1;
		assert L mod N = 0;
--pragma synthesis_on
		src := A;
		for k in 0 to L/N-1 loop
			lo := N*k;
			hi := lo + N - 1;
			dst(hi downto lo) := bit_reverse(src(hi downto lo));
		end loop;
		return dst;
	end reverse_chunks;

	-- n:n `AND' combiner circuit
	--
	-- Every N-bit chunk of A is reduced with reduce_and and the single
	-- result bit is replicated over all N bits of that chunk.  Bits
	-- not covered by a full chunk stay 'X'.
	--   A : input vector (any range; normalized to A'length-1 downto 0)
	--   N : chunk size in bits (expected > 1 and a divisor of A'length)
	function combine_and (A : in std_ulogic_vector;
						  N : in natural) return std_ulogic_vector is
		constant L : natural := A'length;
		variable src : std_ulogic_vector(L-1 downto 0);
		variable dst : std_ulogic_vector(L-1 downto 0) := (others => 'X');
		variable all_set : std_ulogic;
		variable lo, hi : natural;
	begin
--pragma synthesis_off
		assert N > 1;
		assert L mod N = 0;
--pragma synthesis_on
		src := A;
		for k in 0 to L/N-1 loop
			lo := N*k;
			hi := lo + N - 1;
			all_set := reduce_and(src(hi downto lo));
			-- broadcast the reduction result over the whole chunk
			for b in lo to hi loop
				dst(b) := all_set;
			end loop;
		end loop;
		return dst;
	end combine_and;

	-- n:n `OR' combiner circuit
	--
	-- Every N-bit chunk of A is reduced with reduce_or and the single
	-- result bit is replicated over all N bits of that chunk.  Bits
	-- not covered by a full chunk stay 'X'.
	--   A : input vector (any range; normalized to A'length-1 downto 0)
	--   N : chunk size in bits (expected > 1 and a divisor of A'length)
	function combine_or (A : in std_ulogic_vector;
						 N : in natural) return std_ulogic_vector is
		constant L : natural := A'length;
		variable src : std_ulogic_vector(L-1 downto 0);
		variable dst : std_ulogic_vector(L-1 downto 0) := (others => 'X');
		variable any_set : std_ulogic;
		variable lo, hi : natural;
	begin
--pragma synthesis_off
		assert N > 1;
		assert L mod N = 0;
--pragma synthesis_on
		src := A;
		for k in 0 to L/N-1 loop
			lo := N*k;
			hi := lo + N - 1;
			any_set := reduce_or(src(hi downto lo));
			-- broadcast the reduction result over the whole chunk
			for b in lo to hi loop
				dst(b) := any_set;
			end loop;
		end loop;
		return dst;
	end combine_or;

	-- single stage of an omega network
	--
	-- Output pair k (bits 2k+1 and 2k) is fed from input bit k of the
	-- lower half and input bit k of the upper half of A.  Control bit
	-- B(k) decides which of the two lands in the odd position:
	--   B(k) = '1'  : bit 2k+1 <- lower half, bit 2k <- upper half
	--   otherwise   : bit 2k+1 <- upper half, bit 2k <- lower half
	--   A : data vector of length L
	--   B : control vector of length L/2
	function omega_1 (A : in std_ulogic_vector;
					  B : in std_ulogic_vector) return std_ulogic_vector is
		constant L : natural := A'length;
		constant HALF : natural := L/2;
		variable data : std_ulogic_vector(L-1 downto 0);
		variable ctrl : std_ulogic_vector(HALF-1 downto 0);
		variable res : std_ulogic_vector(L-1 downto 0) := (others => 'X');
		variable from_lo, from_hi : std_ulogic;
	begin
--pragma synthesis_off
		assert A'length = L;
		assert 2 * B'length = L;
--pragma synthesis_on
		data := A;
		ctrl := B;
		for k in 0 to HALF-1 loop
			from_lo := data(k);
			from_hi := data(HALF + k);
			if to_X01(ctrl(k)) /= '1' then
				-- straight: upper-half bit goes to the odd position
				res(2*k+1) := from_hi;
				res(2*k+0) := from_lo;
			else
				-- exchanged
				res(2*k+1) := from_lo;
				res(2*k+0) := from_hi;
			end if;
		end loop;
		return res;
	end omega_1;

	-- carry look-ahead circuitry
	--
	-- Collapses every group of four generate/propagate pairs into one
	-- group generate and one group propagate signal.
	--   GI, PI : per-bit generate / propagate, range L-1 downto 0
	--   GO, PO : per-group generate / propagate, range L/4-1 downto 0
	-- L (= GI'length) is expected to be a multiple of 4.
	procedure CLA(GI, PI : in std_ulogic_vector;
				  GO, PO : out std_ulogic_vector) is
		constant L : natural := GI'length;
		variable base : natural;
		-- running products of the propagate bits, from the top bit down
		variable p32, p321 : std_ulogic;
	begin
--pragma synthesis_off
		assert L mod 4 = 0;
		assert (GI'left = L-1) and (GI'right = 0);
		assert (PI'left = L-1) and (PI'right = 0);
		assert (GO'left = L/4-1) and (GO'right = 0);
		assert (PO'left = L/4-1) and (PO'right = 0);
--pragma synthesis_on
		for grp in 0 to L/4-1 loop
			base := 4*grp;
			p32 := PI(base+3) and PI(base+2);
			p321 := p32 and PI(base+1);
			-- a carry leaves the group if some bit generates it and
			-- all higher bits of the group propagate it
			GO(grp) := GI(base+3)
				or (PI(base+3) and GI(base+2))
				or (p32 and GI(base+1))
				or (p321 and GI(base+0));
			-- the group propagates only if all four bits do
			PO(grp) := p321 and PI(base+0);
		end loop;
	end CLA;

	-- Width of the "10 bits per byte" vectors: every 8-bit slice of a
	-- WIDTH-bit operand is stored as a 10-bit slice (norm_2 fills bits
	-- 8 and 9 of each slice with sign extension bits).
	constant W10 : natural := 10 * (WIDTH / 8);

	-- Control words for the omega network stages; one control bit per
	-- pair of data bits, hence WIDTH/2 bits per stage.
	type omega_ctrl is array (natural range <>)
		of std_ulogic_vector(WIDTH/2-1 downto 0);

	-- Naming convention: rN_xxx is a register written by the process
	-- that forms pipeline stage N.

	-- norm_1
	-- registered copies of A, B, U, S and En plus the mask vector BX
	signal r1_A, r1_B : std_ulogic_vector(WIDTH-1 downto 0);
	signal r1_BX : std_ulogic_vector(WIDTH-1 downto 0);
	signal r1_Size : std_ulogic_vector(2 downto 0);
	signal r1_S : std_ulogic;
	signal r1_En : std_ulogic;

	-- norm_2
	-- r2_U/r2_V hold the inverted / plain 10-bit-sliced dividend part,
	-- r2_AS the per-byte sign extension bits of A
	signal r2_A : std_ulogic_vector(WIDTH-1 downto 0);
	signal r2_B : std_ulogic_vector(W10-1 downto 0);
	signal r2_U, r2_V : std_ulogic_vector(W10-1 downto 0);
	signal r2_AS : std_ulogic_vector(WIDTH/8-1 downto 0);
	signal r2_Size : std_ulogic_vector(2 downto 0);
	signal r2_En : std_ulogic;

	-- core_3
	-- iteration state of the divider core; r3_P/r3_Q are the result
	-- shift registers, r3_Done flags the end of the iteration
	signal r3_A : std_ulogic_vector(WIDTH-1 downto 0);
	signal r3_B : std_ulogic_vector(W10-1 downto 0);
	signal r3_U, r3_V : std_ulogic_vector(W10-1 downto 0);
	signal r3_P, r3_Q : std_ulogic_vector(WIDTH-1 downto 0);
	signal r3_AS : std_ulogic_vector(WIDTH/8-1 downto 0);
	signal r3_Size : std_ulogic_vector(2 downto 0);
	signal r3_En : std_ulogic;
	signal r3_Done : std_ulogic;

	-- corr_4
	-- operand pair for the corrected result
	signal r4_P, r4_Q : std_ulogic_vector(WIDTH-1 downto 0);
	signal r4_Size : std_ulogic_vector(2 downto 0);
	signal r4_Done : std_ulogic;

	-- check_4
	-- partial carries (per chunk size), sign/zero flags of the remainder
	signal r4_C08, r4_C16, r4_C32 : std_ulogic_vector(WIDTH/8-1 downto 0);
	signal r4_Sign, r4_Zero, r4_BX : std_ulogic_vector(WIDTH/8-1 downto 0);
	signal r4_G3 : std_ulogic_vector(WIDTH/64-1 downto 0);

	-- check_5
	-- per-byte select: '1' where the corrected result has to be used
	signal r5_Sel : std_ulogic_vector(WIDTH/8-1 downto 0);

	-- quot_5
	-- intermediate sums and carry-select controls of the result adder
	signal r5_Y1, r5_Z1 : std_ulogic_vector(WIDTH-1 downto 0);
	signal r5_S1, r5_T1 : std_ulogic_vector(WIDTH/4-1 downto 0);
	signal r5_G2, r5_P2 : std_ulogic_vector(WIDTH/16-1 downto 0);
	signal r5_Size : std_ulogic_vector(2 downto 0);
	signal r5_Done : std_ulogic;

	-- quot_6
	-- uncorrected result, held while the corrected one is computed
	signal r6_Y3 : std_ulogic_vector(WIDTH-1 downto 0);
begin
	-- normalizer part #1
	--
	-- Derives the mask vector BX from the divisor B for the selected
	-- chunk size and registers it together with A, B, U and S.
	-- The "d=n" remarks are the author's gate delay estimates.
	norm_1 : process (A, B, S, U, Clk, Rst, En)
		-- inputs
		variable aa, bb : std_ulogic_vector(WIDTH-1 downto 0);
		variable sz : std_ulogic_vector(2 downto 0);
		-- locals
		variable bx : std_ulogic_vector(WIDTH-1 downto 0);
		variable b08, b16, b32, b64 : std_ulogic_vector(WIDTH-1 downto 0);
	begin
		-- inputs
		-- d=0
		aa := A;
		bb := B;
		sz := to_X01(U);

		-- d=1
		-- XOR of B with B shifted right by one
		-- (rshift comes from Bit_Manipulation; fill value not visible here)
		bx := bb xor rshift(bb, 1);
		-- NOTE: bx is patched cumulatively below.  Each step overrides
		-- the top bit of the chunks of the next smaller size and keeps
		-- the overrides of the previous steps, so the order of the
		-- four sections must not be changed.
		-- d=2
		for i in WIDTH/64-1 downto 0 loop
			bx(64*i+63) := bb(64*i+63) and not S;
		end loop;
		-- d=5
		b64 := reverse_cascade_or(bx, 64);
		-- d=2
		for i in WIDTH/64-1 downto 0 loop
			bx(64*i+31) := bb(64*i+31) and not S;
		end loop;
		-- d=5
		b32 := reverse_cascade_or(bx, 32);
		-- d=2
		for i in WIDTH/32-1 downto 0 loop
			bx(32*i+15) := bb(32*i+15) and not S;
		end loop;
		-- d=4
		b16 := reverse_cascade_or(bx, 16);
		-- d=2
		for i in WIDTH/16-1 downto 0 loop
			bx(16*i+ 7) := bb(16*i+ 7) and not S;
		end loop;
		-- d=4
		b08 := reverse_cascade_or(bx,  8);

		-- d=6
		-- pick the variant that matches the chunk size
		case sz is
			when "111" => bx := b64;
			when "011" => bx := b32;
			when "001" => bx := b16;
			when "000" => bx := b08;
			when others => bx := (others => 'X');
		end case;

		-- pipeline register: reset has priority over the clock edge;
		-- data registers load only when En = '1', r1_En follows En
		-- on every rising edge
		if to_X01(Rst) = '1' then
			r1_A <= (others => '0');
			r1_B <= (others => '0');
			r1_BX <= (others => '0');
			r1_Size <= (others => '0');
			r1_S <= '0';
			r1_En <= '0';
		elsif rising_edge(Clk) then
			if to_X01(En) = '1' then
				r1_A <= aa;
				r1_B <= bb;
				r1_BX <= bx;
				r1_Size <= sz;
				r1_S <= S;
			end if;
			r1_En <= En;
		end if;
	end process;

	-- normalizer part #2
	--
	-- Derives the control words of a six-stage omega network from
	-- r1_BX, routes both operands through that network, masks them
	-- with the chunk-reversed BX and packs the results into the
	-- 10-bits-per-byte format used by the divider core.
	norm_2 : process (r1_A, r1_B, r1_BX, r1_Size, r1_S, Clk, Rst, r1_En)
		-- inputs
		variable aa, bb, bx : std_ulogic_vector(WIDTH-1 downto 0);
		variable sz : std_ulogic_vector(2 downto 0);
		-- locals
		variable cc : omega_ctrl(5 downto 0);
		variable as, bs, cs : std_ulogic_vector(WIDTH/8-1 downto 0);
		variable a2 : std_ulogic_vector(WIDTH-1 downto 0);
		variable b2 : std_ulogic_vector(W10-1 downto 0);
		variable u2 : std_ulogic_vector(W10-1 downto 0);
		variable x, y : natural;
	begin
		-- inputs
		aa := r1_A;
		bb := r1_B;
		bx := r1_BX;
		sz := to_X01(r1_Size);

		-- d=1
		-- raw control bits: XOR of two bx bits that are x = 2**step
		-- positions apart, computed per 64-bit word
		cc := (others => (others => 'X'));
		for step in 5 downto 0 loop
			x := 2 ** step;
			y := 32 / x;
			for wd in WIDTH/64-1 downto 0 loop
				for j in x-1 downto 0 loop
					for i in y-1 downto 0 loop
						cc(step)(32*wd+y*j+i) :=
							bx(64*wd+2*x*i+j+x) xor bx(64*wd+2*x*i+j);
					end loop;
				end loop;
			end loop;
		end loop;

		-- d=3-5
		-- OR-combine the control bits in groups whose size depends on
		-- the chunk size; stages that are not needed for the selected
		-- size are forced to '0' (cc(5) for "111" is used as computed)
		case sz is
			when "111" =>
				-- d=2
				cc(4) := combine_or(cc(4),  2);
				cc(3) := combine_or(cc(3),  4);
				-- d=3
				cc(2) := combine_or(cc(2),  8);
				cc(1) := combine_or(cc(1), 16);
				-- d=4
				cc(0) := combine_or(cc(0), 32);
			when "011" =>
				cc(5) := (others => '0');
				-- d=2
				cc(3) := combine_or(cc(3),  2);
				cc(2) := combine_or(cc(2),  4);
				-- d=3
				cc(1) := combine_or(cc(1),  8);
				cc(0) := combine_or(cc(0), 16);
			when "001" =>
				cc(5) := (others => '0');
				cc(4) := (others => '0');
				-- d=2
				cc(2) := combine_or(cc(2),  2);
				cc(1) := combine_or(cc(1),  4);
				-- d=3
				cc(0) := combine_or(cc(0),  8);
			when "000" =>
				cc(5) := (others => '0');
				cc(4) := (others => '0');
				cc(3) := (others => '0');
				-- d=2
				cc(1) := combine_or(cc(1),  2);
				cc(0) := combine_or(cc(0),  4);
			when others =>
				-- unknown size: cc keeps the raw control bits
				null;
		end case;

		-- sign extension bits
		cs := (others => r1_S);
		-- d=1
		-- one bit per byte, taken from the top bit of the chunk
		-- (bit_extract/bit_duplicate come from Bit_Manipulation)
		case sz is
			when "111" =>
				as := bit_duplicate(bit_extract(aa, 64, 63), 8);
				bs := bit_duplicate(bit_extract(bb, 64, 63), 8);
			when "011" =>
				as := bit_duplicate(bit_extract(aa, 32, 31), 4);
				bs := bit_duplicate(bit_extract(bb, 32, 31), 4);
			when "001" =>
				as := bit_duplicate(bit_extract(aa, 16, 15), 2);
				bs := bit_duplicate(bit_extract(bb, 16, 15), 2);
			when "000" =>
				as := bit_duplicate(bit_extract(aa,  8,  7), 1);
				bs := bit_duplicate(bit_extract(bb,  8,  7), 1);
			when others =>
				as := (others => 'X');
				bs := (others => 'X');
		end case;
		-- d=2
		-- sign bits count only for signed operation (r1_S = '1')
		as := as and cs;
		bs := bs and cs;

		-- shifting networks for A and B
		-- both operands pass through the same six stages with the
		-- same control words, one 64-bit word at a time
		for step in 5 downto 0 loop	-- range MUST be descending
			for wd in WIDTH/64-1 downto 0 loop
				aa(64*wd+63 downto 64*wd) :=
					omega_1(aa(64*wd+63 downto 64*wd),
							cc(step)(32*wd+31 downto 32*wd));
				bb(64*wd+63 downto 64*wd) :=
					omega_1(bb(64*wd+63 downto 64*wd),
							cc(step)(32*wd+31 downto 32*wd));
			end loop;
		end loop;

		-- d=1
		-- mirror the mask inside each chunk
		case sz is
			when "111" => bx := reverse_chunks(bx, 64);
			when "011" => bx := reverse_chunks(bx, 32);
			when "001" => bx := reverse_chunks(bx, 16);
			when "000" => bx := reverse_chunks(bx,  8);
			when others => bx := (others => 'X');
		end case;

		-- a2 must be taken BEFORE aa is patched with sign bits below
		a2 := aa and bx;
		-- masked positions of aa are replaced by the sign extension
		-- bit of the byte they belong to
		for i in WIDTH-1 downto 0 loop
			if to_X01(bx(i)) = '1' then
				aa(i) := as(i/8);
			end if;
		end loop;
		bb := bb and bx;
		-- pack each byte into a 10-bit slice: bits 7..0 carry the
		-- data, bits 9 and 8 both carry the sign extension bit
		for i in WIDTH/8-1 downto 0 loop
			b2(10*i+7 downto 10*i) := bb(8*i+7 downto 8*i);
			b2(10*i+8) := bs(i);
			b2(10*i+9) := bs(i);
			u2(10*i+7 downto 10*i) := aa(8*i+7 downto 8*i);
			u2(10*i+8) := as(i);
			u2(10*i+9) := as(i);
		end loop;

		-- pipeline register
		-- (reset has priority; data loads only when r1_En = '1')
		if to_X01(Rst) = '1' then
			r2_A <= (others => '0');
			r2_B <= (others => '0');
			r2_U <= (others => '0');
			r2_V <= (others => '0');
			r2_Size <= (others => '0');
			r2_AS <= (others => '0');
			r2_En <= '0';
		elsif rising_edge(Clk) then
			if to_X01(r1_En) = '1' then
				r2_A <= a2;
				r2_B <= b2;
				r2_U <= not u2;	-- u2 xor (others => '1')
				r2_V <= u2;		-- u2 and (others => '1')
				r2_Size <= sz;
				r2_AS <= as;
			end if;
			r2_En <= r1_En;
		end if;
	end process;

	-- SRT divider core
	--
	-- Iterative stage: while r2_En = '1' it loads a new operation
	-- from the norm_2 registers, otherwise it continues with its own
	-- r3_* state.  Each evaluation produces one quotient digit per
	-- chunk (encoded in the qp/qn bit pair), shifts the partial
	-- remainder (uu/vv) and adds, subtracts or skips the divisor.
	-- The registers are updated while "enable" is set, i.e. on load
	-- and as long as the previous iteration was not the last one.
	core_3 : process (r2_A, r2_B, r2_U, r2_V, r2_Size, r2_AS,
					  Clk, Rst, r2_En,
					  r3_A, r3_B, r3_U, r3_V, r3_P, r3_Q,
					  r3_Size, r3_AS, r3_En, r3_Done)
		variable aa : std_ulogic_vector(WIDTH-1 downto 0);
		variable bb : std_ulogic_vector(W10-1 downto 0);
		variable uu, vv : std_ulogic_vector(W10-1 downto 0);
		variable r0, r1 : std_ulogic_vector(W10-1 downto 0);
		variable r2, r3 : std_ulogic_vector(W10-1 downto 0);
		variable pp, dd : std_ulogic_vector(WIDTH/8-1 downto 0);
		variable qp, qn : std_ulogic_vector(WIDTH/8-1 downto 0);
		variable sp, sn : std_ulogic_vector(WIDTH-1 downto 0);
		variable sz : std_ulogic_vector(2 downto 0);
		variable cv : std_ulogic_vector(WIDTH/8-1 downto 0);
		variable as : std_ulogic_vector(WIDTH/8-1 downto 0);
		variable cc : std_ulogic;
		variable done : std_ulogic;
		variable enable : std_ulogic;
	begin
		-- input selector
		-- d=1
		if to_X01(r2_En) = '1' then
			-- The size must be loaded BEFORE the marker bits are
			-- seeded from it.  Reading sz first would use the value
			-- left over from the previous evaluation of this process.
			sz := to_X01(r2_Size);
			aa := r2_A;
			bb := r2_B;
			uu := r2_U;
			vv := r2_V;
			-- Seed exactly one marker bit in sp; its position depends
			-- on the chunk size ("111" -> 0, "011" -> 32,
			-- "001" -> 48, "000" -> 56).  The marker is shifted left
			-- once per iteration and read back at bit 63 below.
			sp := (others => '0');
			sp( 0) := sz(2);
			sp(32) := sz(1) xor sz(2);
			sp(48) := sz(0) xor sz(1);
			sp(56) := not sz(0);
			sn := (others => '0');
			as := r2_AS;
		else
			aa := r3_A;
			bb := r3_B;
			uu := r3_U;
			vv := r3_V;
			sp := r3_P;
			sn := r3_Q;
			sz := to_X01(r3_Size);
			as := r3_AS;
		end if;
		-- end marker
		-- (taken before sp is shifted in this evaluation)
		done := sp(63);
		-- stage output register enable
		enable := r2_En or (r3_En and not r3_Done);

		-- chaining vector
		-- d=1
		-- cv(i) = '1' means byte i is chained to byte i-1;
		-- the size flags cut the chain at chunk boundaries
		cv := (others => '1');
		for i in WIDTH/16-1 downto 0 loop
			cv(2*i+1) := sz(0);
		end loop;
		for i in WIDTH/32-1 downto 0 loop
			cv(4*i+2) := sz(1);
		end loop;
		for i in WIDTH/64-1 downto 0 loop
			cv(8*i+4) := sz(2);
		end loop;

		-- SRT decision logic
		-- d=2/4
		for i in WIDTH/8-1 downto 0 loop
			-- pp = 1 if remainder is (approximately) zero
			-- d=2
			pp(i) := uu(10*i+9) and uu(10*i+8) and uu(10*i+7) and uu(10*i+6);
			-- d=3
			cc := vv(10*i+8)
				or (uu(10*i+8) and vv(10*i+7))
				or (uu(10*i+8) and uu(10*i+7) and vv(10*i+6));
			-- dd = 1 if bb and remainder differ in sign
			-- d=4
			dd(i) := (bb(10*i+9) xor uu(10*i+9)) xor cc;
		end loop;

		-- result bits
		-- d=5
		--
		-- Encoding:
		--
		--  q | pp | dd | qp | qn | action
		-- ===#====#====#====#====#========
		-- -1 |  0 |  1 |  1 |  1 | add
		--  0 |  1 |  ? |  1 |  0 | nop
		-- +1 |  0 |  0 |  0 |  0 | sub
		--
		-- Note that the values for qp/qn are chosen in a
		-- way that simplifies post-processing.
		--
		qp := dd or pp;
		qn := dd and not pp;

		-- result shift register
		-- d=6
		-- the new digit of each chunk comes from the decision bits
		-- of the chunk's most significant byte
		sp := lshift(sp, 1);
		sn := lshift(sn, 1);
		case sz is
			when "111" =>
				for i in WIDTH/64-1 downto 0 loop
					sp(64*i) := qp(8*i+7);
					sn(64*i) := qn(8*i+7);
				end loop;
			when "011" =>
				for i in WIDTH/32-1 downto 0 loop
					sp(32*i) := qp(4*i+3);
					sn(32*i) := qn(4*i+3);
				end loop;
			when "001" =>
				for i in WIDTH/16-1 downto 0 loop
					sp(16*i) := qp(2*i+1);
					sn(16*i) := qn(2*i+1);
				end loop;
			when "000" =>
				for i in WIDTH/8-1 downto 0 loop
					sp(8*i) := qp(1*i+0);
					sn(8*i) := qn(1*i+0);
				end loop;
			when others =>
				sp := (others => 'X');
				sn := (others => 'X');
		end case;

		-- left shift
		-- d=1
		uu := lshift(uu, 1);
		vv := lshift(vv, 1);

		-- chain slices
		-- d=1
		-- bit 0 of slice i receives what was bit 7 of slice i-1
		-- (after the shift above that bit sits at position 10*i-2)
		for i in WIDTH/8-1 downto 1 loop
			uu(10*i) := uu(10*i-2);
			vv(10*i) := vv(10*i-2);
		end loop;
		for i in WIDTH/64-1 downto 0 loop
			uu(80*i) := '0';
			vv(80*i) := '0';
		end loop;

		-- break the chains for SIMD
		-- d=2
		-- the lowest bit of every chunk is fed from the top bit of
		-- the matching chunk of aa (inverted for uu, plain for vv)
		case sz is
			when "111" =>
				for i in WIDTH/64-1 downto 0 loop
					uu(80*i) := not aa(64*i+63);
					vv(80*i) := aa(64*i+63);
				end loop;
			when "011" =>
				for i in WIDTH/32-1 downto 0 loop
					uu(40*i) := not aa(32*i+31);
					vv(40*i) := aa(32*i+31);
				end loop;
			when "001" =>
				for i in WIDTH/16-1 downto 0 loop
					uu(20*i) := not aa(16*i+15);
					vv(20*i) := aa(16*i+15);
				end loop;
			when "000" =>
				for i in WIDTH/8-1 downto 0 loop
					uu(10*i) := not aa( 8*i+ 7);
					vv(10*i) := aa( 8*i+ 7);
				end loop;
			when others =>
				uu := (others => 'X');
				vv := (others => 'X');
		end case;

		-- left shift, part two
		-- d=1
		aa := lshift(aa, 1);

		-- chain slices
		-- d=2
		-- clear the bit shifted across a chunk boundary
		for i in WIDTH/8-1 downto 0 loop
			aa(8*i) := aa(8*i) and cv(i);
		end loop;
		for i in WIDTH/64-1 downto 0 loop
			aa(64*i) := '0';
		end loop;

		-- precompute sum
		-- d=2/3
		r0 := uu xor bb;
		-- d=3/4
		r1 := vv or (uu and bb);
		r1 := lshift(r1, 1, '0');
		-- d=4
		for i in WIDTH/8-1 downto 1 loop
			r1(10*i) := r1(10*i-2) and cv(i);
		end loop;
		for i in WIDTH/64-1 downto 0 loop
			r1(80*i) := '0';
		end loop;

		-- precompute difference
		-- d=2/3
		r2 := uu xor not bb;
		-- d=3/4
		r3 := vv or (uu and not bb);
		r3 := lshift(r3, 1, '1');
		-- d=4
		-- a '1' is injected at the bottom of every chunk
		for i in WIDTH/8-1 downto 1 loop
			r3(10*i) := r3(10*i-2) or not cv(i);
		end loop;
		for i in WIDTH/64-1 downto 0 loop
			r3(80*i) := '1';
		end loop;

		-- add/sub selectors
		-- d=2/4
		-- all bytes of a chunk follow the decision of its top byte
		case sz is
			when "000" =>
				null;
			when "001" =>
				for i in WIDTH/16-1 downto 0 loop
					dd(2*i+1 downto 2*i) := (2*i+1 downto 2*i => dd(2*i+1));
					pp(2*i+1 downto 2*i) := (2*i+1 downto 2*i => pp(2*i+1));
				end loop;
			when "011" =>
				for i in WIDTH/32-1 downto 0 loop
					dd(4*i+3 downto 4*i) := (4*i+3 downto 4*i => dd(4*i+3));
					pp(4*i+3 downto 4*i) := (4*i+3 downto 4*i => pp(4*i+3));
				end loop;
			when "111" =>
				for i in WIDTH/64-1 downto 0 loop
					dd(8*i+7 downto 8*i) := (8*i+7 downto 8*i => dd(8*i+7));
					pp(8*i+7 downto 8*i) := (8*i+7 downto 8*i => pp(8*i+7));
				end loop;
			when others =>
				dd := (others => 'X');
				pp := (others => 'X');
		end case;

		-- result selectors
		-- d<=6
		for i in WIDTH/8-1 downto 0 loop
			if to_X01(pp(i)) = '1' then
				-- remainder is (almost) zero => do nothing
				null;
			elsif to_X01(dd(i)) = '1' then
				-- add B
				uu(10*i+9 downto 10*i) :=
					r0(10*i+9 downto 10*i) xor r1(10*i+9 downto 10*i);
				vv(10*i+9 downto 10*i) :=
					r0(10*i+9 downto 10*i) and r1(10*i+9 downto 10*i);
			else
				-- subtract B
				uu(10*i+9 downto 10*i) :=
					r2(10*i+9 downto 10*i) xor r3(10*i+9 downto 10*i);
				vv(10*i+9 downto 10*i) :=
					r2(10*i+9 downto 10*i) and r3(10*i+9 downto 10*i);
			end if;
		end loop;

		-- outputs
		-- (reset has priority; state loads only while enable = '1',
		-- r3_En follows enable on every rising edge)
		if to_X01(Rst) = '1' then
			r3_A <= (others => '0');
			r3_B <= (others => '0');
			r3_U <= (others => '0');
			r3_V <= (others => '0');
			r3_P <= (others => '0');
			r3_Q <= (others => '0');
			r3_Size <= (others => '0');
			r3_AS <= (others => '0');
			r3_Done <= '0';
			r3_En <= '0';
		elsif rising_edge(Clk) then
			if to_X01(enable) = '1' then
				r3_A <= aa;
				r3_B <= bb;
				r3_U <= uu;
				r3_V <= vv;
				r3_P <= sp;
				r3_Q <= sn;
				r3_Size <= sz;
				r3_AS <= as;
				r3_Done <= done;
			end if;
			r3_En <= enable;
		end if;
	end process;

	-- result pre-correction
	--
	-- Folds a per-chunk +1 / -1 into the redundant result pair
	-- (r3_P, r3_Q) with one carry-save step and registers the new
	-- pair in r4_P/r4_Q when r3_Done is set.
	corr_4 : process (r3_B, r3_P, r3_Q, r3_Size, r3_AS, r3_Done, Clk, Rst)
		variable as, bs, incr : std_ulogic_vector(WIDTH/8-1 downto 0);
		variable pp, qq, xx : std_ulogic_vector(WIDTH-1 downto 0);
		variable r0, r1 : std_ulogic_vector(WIDTH-1 downto 0);
		variable sz : std_ulogic_vector(2 downto 0);
	begin
		-- d=0
		as := r3_AS;
		-- bit 9 of every 10-bit slice of B (the sign extension bit)
		bs := bit_extract(r3_B, 10, 9);
		pp := r3_P;
		qq := r3_Q;
		sz := to_X01(r3_Size);

		-- Note: we don't know whether we will have to correct
		-- the result (that will not be known until the end
		-- of the next cycle).	But we know HOW to correct it
		-- if we have to.  Therefore, we can calculate both
		-- results in parallel and select the correct one in
		-- stage 6.  In order to save space, a single adder
		-- calculates the uncorrected and corrected results one
		-- after another.  Since correcting takes one clock
		-- cycle, the latency is not affected.	Nor is the
		-- throughput: We can't issue two divide instructions
		-- in two consecutive cycles anyway, therefore the
		-- result adder pipeline is guaranteed to be free.

		-- Increment result if A/B is negative.
		-- Otherwise, decrement it.
		-- d=1
		incr := as xor bs;
		xx := bit_duplicate(incr, 8);

		-- d=2
		-- force the lowest bit of the correction word of every chunk
		-- to '1'
		case sz is
			when "111" =>
				for i in WIDTH/64-1 downto 0 loop
					xx(64*i) := '1';
				end loop;
			when "011" =>
				for i in WIDTH/32-1 downto 0 loop
					xx(32*i) := '1';
				end loop;
			when "001" =>
				for i in WIDTH/16-1 downto 0 loop
					xx(16*i) := '1';
				end loop;
			when "000" =>
				for i in WIDTH/8-1 downto 0 loop
					xx(8*i) := '1';
				end loop;
			when others =>
				null;
		end case;

		-- inc/dec circuit
		-- r1 is used twice: first as pp xor qq, then (after r0 has
		-- been derived from it) as the shifted carry word
		-- d=1
		r1 := pp xor qq;
		-- d=3
		r0 := r1 xor xx;
		-- d=4
		r1 := (pp and qq) or (r1 and xx);
		r1 := lshift(r1, 1);

		-- SIMD chunk separation
		-- d=5
		-- no carry may enter the lowest bit of a chunk
		case sz is
			when "111" =>
				for i in WIDTH/64-1 downto 0 loop
					r1(64*i) := '0';
				end loop;
			when "011" =>
				for i in WIDTH/32-1 downto 0 loop
					r1(32*i) := '0';
				end loop;
			when "001" =>
				for i in WIDTH/16-1 downto 0 loop
					r1(16*i) := '0';
				end loop;
			when "000" =>
				for i in WIDTH/8-1 downto 0 loop
					r1(8*i) := '0';
				end loop;
			when others =>
				null;
		end case;

		-- pipeline register (reset has priority; data loads only
		-- when r3_Done = '1', r4_Done follows r3_Done every cycle)
		if to_X01(Rst) = '1' then
			r4_P <= (others => '0');
			r4_Q <= (others => '0');
			r4_Size <= (others => '0');
			r4_Done <= '0';
		elsif rising_edge(Clk) then
			if to_X01(r3_Done) = '1' then
				r4_P <= r0;
				r4_Q <= r1;
				r4_Size <= r3_Size;
			end if;
			r4_Done <= r3_Done;
		end if;
	end process;

	-- correction check part #1
	--
	-- First half of the remainder sign/zero evaluation.  It extracts
	-- the 8 data bits of every 10-bit remainder slice, runs three
	-- carry look-ahead levels over them and registers the partial
	-- carries for all chunk sizes together with raw sign and zero
	-- flags.  check_5 finishes the computation one cycle later.
	check_4 : process (r3_B, r3_U, r3_V, r3_Size, r3_AS, r3_Done, Clk, Rst)
		variable g0, p0 : std_ulogic_vector(WIDTH-1 downto 0);
		variable g1, p1 : std_ulogic_vector(WIDTH/4-1 downto 0);
		variable g2, p2 : std_ulogic_vector(WIDTH/16-1 downto 0);
		variable g3, p3 : std_ulogic_vector(WIDTH/64-1 downto 0);
		variable sz : std_ulogic_vector(2 downto 0);
		variable as : std_ulogic_vector(WIDTH/8-1 downto 0);
		variable bx : std_ulogic_vector(WIDTH/8-1 downto 0);
		variable sign : std_ulogic_vector(WIDTH/8-1 downto 0);
		variable zero : std_ulogic_vector(WIDTH/8-1 downto 0);
		-- partial carries into the sign position, one set per chunk
		-- size (the 64-bit carry is formed in check_5 from r4_G3)
		variable c08, c16, c32 : std_ulogic_vector(WIDTH/8-1 downto 0);
	begin
		-- If the remainder is not zero, and the sign bits
		-- of dividend (A) and remainder differ, the result
		-- has to be corrected.  Since the remainder is
		-- dual-rail encoded, calculation of its sign bit
		-- takes two cycles.  Note that the calculation has
		-- to be performed for each chunk independently.
		-- Some of them may need correction while others don't.

		-- d=0
		as := r3_AS;
		sz := to_X01(r3_Size);
		-- r3_U acts as propagate, r3_V as generate vector
		for i in WIDTH/8-1 downto 0 loop
			p0(8*i+7 downto 8*i) := r3_U(10*i+7 downto 10*i);
			g0(8*i+7 downto 8*i) := r3_V(10*i+7 downto 10*i);
		end loop;
		-- bit 8 of every slice of r3_U gates the carries below
		bx := bit_extract(r3_U, 10, 8);

		-- d=2
		sign := bit_extract(r3_U, 10, 9) xor bit_extract(r3_V, 10, 8);
		sign := sign xor as;

		-- d=4
		-- a chunk counts as zero when all its propagate bits are set
		case sz is
			when "111" => zero := bit_extract(combine_and(p0, 64), 8, 0);
			when "011" => zero := bit_extract(combine_and(p0, 32), 8, 0);
			when "001" => zero := bit_extract(combine_and(p0, 16), 8, 0);
			when "000" => zero := bit_extract(combine_and(p0,  8), 8, 0);
			when others => zero := (others => 'X');
		end case;

		-- d=6
		-- group generate/propagate over 4, 16 and 64 bits
		-- (p3 is produced by the last call but not used afterwards)
		CLA(g0, p0, g1, p1);
		CLA(g1, p1, g2, p2);
		CLA(g2, p2, g3, p3);

		-- d=4
		-- carry out of each byte (two 4-bit groups)
		c08 := (others => 'X');
		for i in WIDTH/8-1 downto 0 loop
			c08(i) := (bx(i) and g1(2*i+1))
				or (bx(i) and p1(2*i+1) and g1(2*i+0));
		end loop;

		-- d=5
		-- carry out of each 16-bit chunk; only the entry of the top
		-- byte is defined, the others stay 'X'
		c16 := (others => 'X');
		for i in WIDTH/16-1 downto 0 loop
			c16(2*i+1) := bx(2*i+1) and g2(i);
		end loop;

		-- d=6
		-- carry out of each 32-bit chunk (two 16-bit groups); only
		-- the entry of the top byte is defined
		c32 := (others => 'X');
		for i in WIDTH/32-1 downto 0 loop
			c32(4*i+3) := (bx(4*i+3) and g2(2*i+1))
				or (bx(4*i+3) and p2(2*i+1) and g2(2*i+0));
		end loop;

		-- pipeline register (reset has priority; loads only when
		-- r3_Done = '1')
		if to_X01(Rst) = '1' then
			r4_C08 <= (others => '0');
			r4_C16 <= (others => '0');
			r4_C32 <= (others => '0');
			r4_BX <= (others => '0');
			r4_G3 <= (others => '0');
			r4_Sign <= (others => '0');
			r4_Zero <= (others => '0');
		elsif rising_edge(Clk) then
			if to_X01(r3_Done) = '1' then
				r4_C08 <= c08;
				r4_C16 <= c16;
				r4_C32 <= c32;
				r4_BX <= bx;
				r4_G3 <= g3;
				r4_Sign <= sign;
				r4_Zero <= zero;
			end if;
		end if;
	end process;

	-- correction check part #2
	--
	-- Finishes the sign computation started in check_4: the carry
	-- that matches the chunk size is XORed into the raw sign, the
	-- sign of each chunk's top byte is spread over the whole chunk,
	-- and r5_Sel is set for every byte whose chunk has the sign flag
	-- set and is not flagged as zero.
	check_5 : process (r4_C08, r4_C16, r4_C32, r4_BX, r4_G3, r4_Sign, r4_Zero,
					   r4_Size, r4_Done, Clk, Rst)
		variable sign : std_ulogic_vector(WIDTH/8-1 downto 0);
		variable zero : std_ulogic_vector(WIDTH/8-1 downto 0);
		variable c08, c16, c32, c64 : std_ulogic_vector(WIDTH/8-1 downto 0);
		variable sz : std_ulogic_vector(2 downto 0);
	begin
		-- d=0
		c08 := r4_c08;
		c16 := r4_c16;
		c32 := r4_c32;
		sign := r4_Sign;
		zero := r4_Zero;
		sz := to_X01(r4_Size);

		-- d=1
		-- carry out of each 64-bit chunk; only the entry of the top
		-- byte is defined, the others stay 'X'
		c64 := (others => 'X');
		for i in WIDTH/64-1 downto 0 loop
			c64(8*i+7) := r4_BX(8*i+7) and r4_G3(i);
		end loop;

		-- d=3
		-- bit_extract picks the top-byte entry of every chunk (the
		-- only one that is defined in c16/c32/c64), bit_duplicate
		-- spreads it back over all bytes of the chunk
		case sz is
			when "111" =>
				sign := bit_duplicate(bit_extract(sign xor c64, 8, 7), 8);
			when "011" =>
				sign := bit_duplicate(bit_extract(sign xor c32, 4, 3), 4);
			when "001" =>
				sign := bit_duplicate(bit_extract(sign xor c16, 2, 1), 2);
			when "000" =>
				sign := bit_duplicate(bit_extract(sign xor c08, 1, 0), 1);
			when others =>
				sign := (others => 'X');
		end case;

		-- pipeline register (reset has priority; loads only when
		-- r4_Done = '1')
		if to_X01(Rst) = '1' then
			r5_Sel <= (others => '0');
		elsif rising_edge(Clk) then
			if to_X01(r4_Done) = '1' then
				-- d=4
				r5_Sel <= sign and not zero;
			end if;
		end if;
	end process;

	-- result adder part #1
	--
	-- First half of the adder that turns the redundant result pair
	-- (P, Q) into a plain binary number.  It serves two requests in
	-- a row: the pair from core_3 (r3_P/r3_Q) and, one cycle later
	-- when r4_Done is set, the pre-corrected pair from corr_4
	-- (r4_P/r4_Q).  For every 4-bit group two candidate sums are
	-- formed (y1 with s0, z1 with t0); s1/t1 are the matching
	-- group-level carries that quot_6 uses to choose between them.
	quot_5 : process (r3_P, r3_Q, r3_Size, r3_Done, Clk, Rst,
					   r4_P, r4_Q, r4_Size, r4_Done)
		variable g0, p0 : std_ulogic_vector(WIDTH-1 downto 0);
		variable s0, t0 : std_ulogic_vector(WIDTH-1 downto 0);
		variable y1, z1 : std_ulogic_vector(WIDTH-1 downto 0);
		variable g1, p1 : std_ulogic_vector(WIDTH/4-1 downto 0);
		variable s1, t1 : std_ulogic_vector(WIDTH/4-1 downto 0);
		variable g2, p2 : std_ulogic_vector(WIDTH/16-1 downto 0);
		variable sz : std_ulogic_vector(2 downto 0);
	begin
		-- choose uncorrected or corrected result
		-- d=2
		if to_X01(r4_Done) = '1' then
			p0 := r4_P xor r4_Q;
			g0 := r4_P and r4_Q;
			sz := to_X01(r4_Size);
		else
			p0 := r3_P xor r3_Q;
			g0 := r3_P and r3_Q;
			sz := to_X01(r3_Size);
		end if;

		-- d=4
		-- s0: carries inside each 4-bit group for group carry-in '0'
		-- t0: the same carries for group carry-in '1'
		CLA(g0, p0, g1, p1);
		for i in WIDTH/4-1 downto 0 loop
			s0(4*i+0) := '0';
			s0(4*i+1) := g0(4*i+0);
			s0(4*i+2) := g0(4*i+1)
				or (p0(4*i+1) and g0(4*i+0));
			s0(4*i+3) := g0(4*i+2)
				or (p0(4*i+2) and g0(4*i+1))
				or (p0(4*i+2) and p0(4*i+1) and g0(4*i+0));
			t0(4*i+0) := '1';
			t0(4*i+1) := g0(4*i+0) or p0(4*i+0);
			t0(4*i+2) := g0(4*i+1)
				or (p0(4*i+1) and g0(4*i+0))
				or (p0(4*i+1) and p0(4*i+0));
			t0(4*i+3) := g0(4*i+2)
				or (p0(4*i+2) and g0(4*i+1))
				or (p0(4*i+2) and p0(4*i+1) and g0(4*i+0))
				or (p0(4*i+2) and p0(4*i+1) and p0(4*i+0));
		end loop;

		-- d=5
		-- candidate sums; note that the propagate vector enters
		-- inverted
		y1 := (not p0) xor s0;
		z1 := (not p0) xor t0;

		-- d=6
		-- the same scheme one level up (groups of four 4-bit groups);
		-- sz(0) = '0' (8-bit chunks) stops carries from crossing the
		-- byte boundary in the middle of each 16-bit group
		CLA(g1, p1, g2, p2);
		for i in WIDTH/16-1 downto 0 loop
			s1(4*i+0) := '0';
			s1(4*i+1) := g1(4*i+0);
			s1(4*i+2) := (sz(0) and g1(4*i+1))
				or (sz(0) and p1(4*i+1) and g1(4*i+0));
			s1(4*i+3) := g1(4*i+2)
				or (sz(0) and p1(4*i+2) and g1(4*i+1))
				or (sz(0) and p1(4*i+2) and p1(4*i+1) and g1(4*i+0));
			t1(4*i+0) := '1';
			t1(4*i+1) := g1(4*i+0) or p1(4*i+0);
			t1(4*i+2) := g1(4*i+1)
				or (not sz(0))
				or (p1(4*i+1) and g1(4*i+0))
				or (p1(4*i+1) and p1(4*i+0));
			t1(4*i+3) := (g1(4*i+2) or (p1(4*i+2) and not sz(0)))
				or (p1(4*i+2) and g1(4*i+1))
				or (p1(4*i+2) and p1(4*i+1) and g1(4*i+0))
				or (p1(4*i+2) and p1(4*i+1) and p1(4*i+0));
		end loop;

		-- pipeline register (reset has priority; loads when either
		-- request is present, r5_Done follows r4_Done every cycle)
		if to_X01(Rst) = '1' then
			r5_Y1 <= (others => '0');
			r5_Z1 <= (others => '0');
			r5_S1 <= (others => '0');
			r5_T1 <= (others => '0');
			r5_G2 <= (others => '0');
			r5_P2 <= (others => '0');
			r5_Size <= (others => '0');
			r5_Done <= '0';
		elsif rising_edge(Clk) then
			if to_X01(r3_Done or r4_Done) = '1' then
				r5_Y1 <= y1;
				r5_Z1 <= z1;
				r5_S1 <= s1;
				r5_T1 <= t1;
				r5_G2 <= g2;
				r5_P2 <= p2;
				r5_Size <= sz;
			end if;
			r5_Done <= r4_Done;
		end if;
	end process;

	-- result adder part #2
	--
	-- Second half of the result adder.  It resolves the carry-select
	-- choices prepared by quot_5 (first per 4-bit group, then per
	-- 16-bit group) and drives the quotient output Y.  The
	-- uncorrected result is computed first and kept in r6_Y3; when
	-- the corrected result arrives (r5_Done = '1'), each byte takes
	-- the corrected value only where r5_Sel is set and otherwise
	-- falls back to the stored uncorrected value.
	-- The remainder output Z is not implemented and is driven 'U'.
	quot_6 : process (r5_Y1, r5_Z1, r5_S1, r5_T1, r5_G2, r5_P2, r5_Size,
					  Clk, Rst, r4_Done, r5_Done, r5_Sel, r6_Y3)
		variable y1, z1 : std_ulogic_vector(WIDTH-1 downto 0);
		variable y2, z2 : std_ulogic_vector(WIDTH-1 downto 0);
		variable y3 : std_ulogic_vector(WIDTH-1 downto 0);
		variable s1, t1 : std_ulogic_vector(WIDTH/4-1 downto 0);
		variable g2, p2 : std_ulogic_vector(WIDTH/16-1 downto 0);
		variable s2 : std_ulogic_vector(WIDTH/16-1 downto 0);
		variable sz : std_ulogic_vector(2 downto 0);
	begin
		-- d=0
		y1 := r5_Y1;
		z1 := r5_Z1;
		s1 := r5_S1;
		t1 := r5_T1;
		g2 := r5_G2;
		p2 := r5_P2;
		sz := to_X01(r5_Size);

		-- d=1
		-- first selection level: per 4-bit group choose between the
		-- two candidate sums; y2 uses s1, z2 uses t1 as selector
		for i in WIDTH/4-1 downto 0 loop
			if to_X01(s1(i)) = '1' then
				y2(4*i+3 downto 4*i) := z1(4*i+3 downto 4*i);
			else
				y2(4*i+3 downto 4*i) := y1(4*i+3 downto 4*i);
			end if;
			if to_X01(t1(i)) = '1' then
				z2(4*i+3 downto 4*i) := z1(4*i+3 downto 4*i);
			else
				z2(4*i+3 downto 4*i) := y1(4*i+3 downto 4*i);
			end if;
		end loop;

		-- d=2
		-- carries into the four 16-bit groups of every 64-bit word;
		-- the size flags block carries at chunk boundaries (sz(1)
		-- opens the 16-bit boundaries inside a 32-bit half, sz(2)
		-- the 32-bit boundary and every path that crosses it)
		for i in WIDTH/64-1 downto 0 loop
			s2(4*i+0) := '0';
			s2(4*i+1) := sz(1) and g2(4*i+0);
			s2(4*i+2) := (sz(2) and g2(4*i+1))
				or (sz(2) and p2(4*i+1) and g2(4*i+0));
			s2(4*i+3) := (sz(1) and g2(4*i+2))
				or (sz(2) and p2(4*i+2) and g2(4*i+1))
				or (sz(2) and p2(4*i+2) and p2(4*i+1) and g2(4*i+0));
		end loop;

		-- d=3
		-- second selection level: per 16-bit group
		for i in WIDTH/16-1 downto 0 loop
			if to_X01(s2(i)) = '1' then
				y3(16*i+15 downto 16*i) := z2(16*i+15 downto 16*i);
			else
				y3(16*i+15 downto 16*i) := y2(16*i+15 downto 16*i);
			end if;
		end loop;

		-- select correct result
		-- d=4
		-- while the corrected result is at the output (r5_Done),
		-- bytes that need no correction take the stored value
		for i in WIDTH/8-1 downto 0 loop
			if to_X01(r5_Done and not r5_Sel(i)) = '1' then
				y3(8*i+7 downto 8*i) := r6_Y3(8*i+7 downto 8*i);
			end if;
		end loop;

		-- output
		Y <= y3;
		Z <= (others => 'U');	-- XXX: not implemented yet

		-- push uncorrected result into the pipe
		-- (reset has priority; loads only when r4_Done = '1')
		if to_X01(Rst) = '1' then
			r6_Y3 <= (others => '0');
		elsif rising_edge(Clk) then
			if to_X01(r4_Done) = '1' then
				r6_Y3 <= y3;
			end if;
		end if;
	end process;
end Behave_1;

-- vi: set ts=4 sw=4 equalprg="fmt -72 -p--": please
