-- icmp64.vhdl -- F-CPU 64-bit Integer Compare Unit
-- Copyright (C) 2002, 2003 Michael Riepe <michael@stud.uni-hannover.de>
--
-- This program is free software; you can redistribute it and/or modify
-- it under the terms of the GNU General Public License as published by
-- the Free Software Foundation; either version 2 of the License, or
-- (at your option) any later version.
--
-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with this program; if not, write to the Free Software
-- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

-- @(#) $Id: icmp64.vhdl,v 1.5 2003/03/25 13:43:54 michael Exp $

library IEEE;
use IEEE.std_logic_1164.all;
use work.Bit_Manipulation.all;

-- ICmp64: SIMD integer compare / sort / find-MSB unit.
--
-- The operation is chosen by the mode inputs Comp, Sort, Msb0 and Msb1;
-- architecture Behave_1 treats them as a one-hot code and drives 'X' on
-- Y and Z for any other combination.  In that architecture the inputs
-- are captured in one pipeline register and Y/Z are produced
-- combinationally from the registered values.
entity ICmp64 is
	generic (
		-- total operand width in bits; the assertion in the entity
		-- statement part below requires a multiple of 64
		WIDTH : natural := 64
	);
	port (
		-- operand inputs
		A : in std_ulogic_vector(WIDTH-1 downto 0);
		B : in std_ulogic_vector(WIDTH-1 downto 0);
		-- operating modes
		Comp : in std_ulogic;	-- compare A against B
		Sort : in std_ulogic;	-- output min/max of A and B
		Msb0 : in std_ulogic;	-- find most significant `1' bit of `not A'
		Msb1 : in std_ulogic;	-- find most significant `1' bit of `A'
		-- flags
		-- SignedMode = '1' makes Comp/Sort treat each chunk's top bit
		-- as a sign bit (Behave_1 inverts that bit before comparing)
		SignedMode : in std_ulogic;
		-- SIMD mode switches
		-- Behave_1 decodes U(2 downto 0) as: "000" = 8-bit chunks,
		-- "001" = 16-bit, "011" = 32-bit, "111" = 64-bit; any other
		-- value yields 'X' results
		U : in std_ulogic_vector(2 downto 0);
		-- clock/reset inputs
		Clk : in std_ulogic;	-- registers update on the rising edge
		Rst : in std_ulogic;	-- '1' clears the pipeline register (asynchronous in Behave_1)
		En : in std_ulogic;	-- '1' lets the pipeline register load new data
	-- result outputs (meaning depends on the operating mode)
		Y : out std_ulogic_vector(WIDTH-1 downto 0);
		Z : out std_ulogic_vector(WIDTH-1 downto 0)
	);
--pragma synthesis_off
begin
	-- simulation-only sanity check on the generic
	assert WIDTH mod 64 = 0
		report "width of ICmp must be an integer multiple of 64"
		severity failure;
--pragma synthesis_on
end ICmp64;

-- Operating Modes:
--
--  mode | Y           | Z
--  =====#=============#==============
--  Comp | -(A > B)    | -(A <= B) [*]
--  Sort | min(A, B)   | max(A, B) [*]
--  Msb0 | findmsb(~A) | maskmsb(~A)
--  Msb1 | findmsb(A)  | maskmsb(A)
--
--  [*] compares signed or unsigned numbers, depending on `SignedMode'
--
--  `findmsb(x)' returns 1 plus the index of the most significant
--  `1' bit in `x' while `maskmsb(x)' returns a bit mask that selects
--  that bit.  If no bit is found, both return 0.

-- Two-stage behavioural implementation: process `stage_1' builds the
-- per-chunk MSB masks and registers them together with the operands,
-- process `stage_2' derives min/max/compare/MSB-index per chunk, and
-- process `out_mux' selects what is driven onto Y and Z.
architecture Behave_1 of ICmp64 is
	-- all mode/flag/SIMD inputs bundled into one vector (layout below)
	signal M : std_ulogic_vector(7 downto 0);
	-- pipeline register contents (written by `stage_1')
	signal r_M : std_ulogic_vector(7 downto 0);	-- registered copy of M
	signal r_A : std_ulogic_vector(WIDTH-1 downto 0);	-- registered operand A
	signal r_B : std_ulogic_vector(WIDTH-1 downto 0);	-- registered operand B
	-- registered A with the chunk sign bits inverted in signed mode
	signal r_C : std_ulogic_vector(WIDTH-1 downto 0);
	-- registered MSB masks for 8/16/32/64-bit chunk sizes
	signal r_X08 : std_ulogic_vector(WIDTH-1 downto 0);
	signal r_X16 : std_ulogic_vector(WIDTH-1 downto 0);
	signal r_X32 : std_ulogic_vector(WIDTH-1 downto 0);
	signal r_X64 : std_ulogic_vector(WIDTH-1 downto 0);
	-- registered copy of En; written by `stage_1' but not read
	-- anywhere in this architecture
	signal r_En : std_ulogic;
	-- stage-2 results for the selected chunk size (inputs of `out_mux')
	signal Ymin : std_ulogic_vector(WIDTH-1 downto 0);
	signal Ymax : std_ulogic_vector(WIDTH-1 downto 0);
	signal Ycmp : std_ulogic_vector(WIDTH-1 downto 0);
	signal Ymsb : std_ulogic_vector(WIDTH-1 downto 0);
	signal Ymsk : std_ulogic_vector(WIDTH-1 downto 0);
begin
	-- Bit layout of M (and therefore r_M):
	--   3..0 = operating mode, 4 = signed flag, 7..5 = SIMD chunk size
	M <= (
		0 => Comp,
		1 => Sort,
		2 => Msb0,
		3 => Msb1,
		4 => SignedMode,
		5 => U(0),
		6 => U(1),
		7 => U(2)
	);

	-- First pipeline stage.
	--
	-- Computes, in variables, a bit mask per SIMD chunk size (8/16/32/64)
	-- that marks the most significant `1' bit of the selected tree input
	-- (A, not A, or A xor B), plus a copy of A whose chunk sign bits are
	-- inverted in signed mode.  The results are stored in the r_*
	-- pipeline register on the rising clock edge while En = '1'.
	-- The `d=N' notes look like gate-depth annotations kept from the
	-- original author -- TODO confirm.
	stage_1 : process (A, B, M, Clk, Rst, En)
		-- reversed single stage of work.Bit_Manipulation.cascade_or
		--
		-- The vector is viewed with ascending indices, so index 0 is
		-- the leftmost element of A.  It is split into blocks of M*N
		-- elements, each made of M groups of N elements.  Every element
		-- is ORed with the last element of each preceding group in the
		-- same block; elements of the first group pass through unchanged.
		-- If each N-wide group of the input already holds its own
		-- left-to-right running OR, the result holds the running OR over
		-- each M*N-wide block.
		function tree_layer (A : in std_ulogic_vector;
							 N, M : in natural) return std_ulogic_vector is
			constant L : natural := A'length;
			-- *** Note the index ranges! ***
			alias aa : std_ulogic_vector(0 to L-1) is A;
			variable yy : std_ulogic_vector(0 to L-1);
			-- ******************************
			variable j : natural;
		begin
--pragma synthesis_off
			assert L mod 64 = 0;
			assert L >= 64;
			assert N >= 1;
			assert (M = 2) or (M = 4);
--pragma synthesis_on
			for i in L-1 downto 0 loop
				-- j = index of the last element of the first group in
				-- the M*N-wide block that contains i
				j := i - i mod (M * N) + N - 1;
				-- (i / N) mod M = position of i's group inside its block
				case (i / N) mod M is
					when 3 =>
						yy(i) := aa(i) or aa(j) or aa(j+N) or aa(j+2*N);
					when 2 =>
						yy(i) := aa(i) or aa(j) or aa(j+N);
					when 1 =>
						yy(i) := aa(i) or aa(j);
					when others =>
						-- first group of the block: nothing to merge in
						yy(i) := aa(i);
				end case;
			end loop;
			return yy;
		end tree_layer;

		-- X01-normalised copies of the inputs
		variable mm : std_ulogic_vector(7 downto 0);
		variable aa : std_ulogic_vector(WIDTH-1 downto 0);
		variable bb : std_ulogic_vector(WIDTH-1 downto 0);
		-- compare helper: A with sign bits adjusted (see below)
		variable cc : std_ulogic_vector(WIDTH-1 downto 0);
		-- OR-tree results for block sizes 4, 8, 16, 32 and 64
		variable x04 : std_ulogic_vector(WIDTH-1 downto 0);
		variable x08 : std_ulogic_vector(WIDTH-1 downto 0);
		variable x16 : std_ulogic_vector(WIDTH-1 downto 0);
		variable x32 : std_ulogic_vector(WIDTH-1 downto 0);
		variable x64 : std_ulogic_vector(WIDTH-1 downto 0);
	begin
		aa := to_X01(A);
		bb := to_X01(B);
		mm := to_X01(M);

		-- tree input select
		-- (Msb1 takes priority over Msb0; everything else compares)
		-- d=2
		if mm(3) = '1' then
			-- Msb1
			x04 := aa;
		elsif mm(2) = '1' then
			-- Msb0
			x04 := not aa;
		else
			-- Comp/Sort: a `1' marks every bit position where A and B differ
			x04 := aa xor bb;
		end if;

		-- OR tree, layer by layer
		-- d=3
		x04 := tree_layer(x04, 1, 4);
		-- d=4
		x08 := tree_layer(x04, 4, 2);
		x16 := tree_layer(x04, 4, 4);
		-- d=5
		x32 := tree_layer(x16, 16, 2);
		x64 := tree_layer(x16, 16, 4);

		-- xor with right shifted chunk
		-- Every bit except the top bit of a chunk is XORed with its upper
		-- neighbour.  On a running OR from the top of the chunk this
		-- leaves a `1' only where the OR first becomes `1', i.e. at the
		-- most significant set bit of the tree input.
		-- d=5
		for i in WIDTH/8-1 downto 0 loop
			x08(8*i+6 downto 8*i+0) := x08(8*i+6 downto 8*i+0)
				xor x08(8*i+7 downto 8*i+1);
		end loop;
		for i in WIDTH/16-1 downto 0 loop
			x16(16*i+14 downto 16*i+0) := x16(16*i+14 downto 16*i+0)
				xor x16(16*i+15 downto 16*i+1);
		end loop;
		-- d=6
		for i in WIDTH/32-1 downto 0 loop
			x32(32*i+30 downto 32*i+0) := x32(32*i+30 downto 32*i+0)
				xor x32(32*i+31 downto 32*i+1);
		end loop;
		for i in WIDTH/64-1 downto 0 loop
			x64(64*i+62 downto 64*i+0) := x64(64*i+62 downto 64*i+0)
				xor x64(64*i+63 downto 64*i+1);
		end loop;

		-- handle signed compare/sort
		-- d=2
		cc := aa;	-- `aa' for cmpg; `not aa' for cmple.
		--cc := bb;	-- `bb' for cmpl; `not bb' for cmpge.
		-- Note: also see `gtr' below.
		-- in signed mode: invert MSBs
		-- (mm(4) is the signed flag; XORing it into the top bit of every
		-- chunk of the selected size flips that bit only when it is '1')
		case mm(7 downto 5) is
			when "111" =>
				-- 64-bit chunks
				for i in WIDTH/64-1 downto 0 loop
					cc(64*(i+1)-1) := mm(4) xor cc(64*(i+1)-1);
				end loop;
			when "011" =>
				-- 32-bit chunks
				for i in WIDTH/32-1 downto 0 loop
					cc(32*(i+1)-1) := mm(4) xor cc(32*(i+1)-1);
				end loop;
			when "001" =>
				-- 16-bit chunks
				for i in WIDTH/16-1 downto 0 loop
					cc(16*(i+1)-1) := mm(4) xor cc(16*(i+1)-1);
				end loop;
			when "000" =>
				-- 8-bit chunks
				for i in WIDTH/8-1 downto 0 loop
					cc(8*(i+1)-1) := mm(4) xor cc(8*(i+1)-1);
				end loop;
			when others =>
				-- this should trigger a simulation error
				cc := (others => 'X');
		end case;

		-- pipeline register
		-- Reset is tested before the clock edge, so it acts without a
		-- clock; data is loaded only while En = '1'.
		if to_X01(Rst) = '1' then
			r_A <= (others => '0');
			r_B <= (others => '0');
			r_C <= (others => '0');
			r_M <= (others => '0');
			r_X08 <= (others => '0');
			r_X16 <= (others => '0');
			r_X32 <= (others => '0');
			r_X64 <= (others => '0');
			r_En <= '0';
		elsif rising_edge(Clk) then
			if to_X01(En) = '1' then
				r_A <= aa;
				r_B <= bb;
				r_C <= cc;
				r_M <= mm;
				r_X08 <= x08;
				r_X16 <= x16;
				r_X32 <= x32;
				r_X64 <= x64;
			end if;
			-- r_En follows En on every clock edge, independent of En's value
			r_En <= En;
		end if;
	end process;

	-- Second stage (combinational, fed by the r_* pipeline register).
	--
	-- Evaluates min/max/compare/MSB-index for every chunk at all four
	-- chunk sizes, then forwards the set matching the registered SIMD
	-- size r_M(7 downto 5) to Ymin/Ymax/Ycmp/Ymsb/Ymsk.
	stage_2 : process (r_A, r_B, r_C, r_M, r_X08, r_X16, r_X32, r_X64)
		-- Processes one chunk.  All vector arguments are expected to
		-- have the same length as X.
		--   A, B : the two operands of the chunk
		--   C    : compare helper (r_C slice)
		--   X    : MSB mask of the chunk (r_X* slice)
		--   YMin, YMax : B/A when `gtr' is '1', otherwise A/B
		--   YCmp : every bit set to `gtr'
		--   YMsb : binary encoding of the set bit position(s) in X,
		--          counted from 1; 0 when X is all zero
		procedure chunk (A, B, C, X : in std_ulogic_vector;
						 YMin, YMax, YCmp, YMsb : out std_ulogic_vector) is
			constant w : natural := X'length;
			alias aa : std_ulogic_vector(w-1 downto 0) is A;
			alias bb : std_ulogic_vector(w-1 downto 0) is B;
			alias cc : std_ulogic_vector(w-1 downto 0) is C;
			alias xx : std_ulogic_vector(w-1 downto 0) is X;
			variable min : std_ulogic_vector(w-1 downto 0);
			variable max : std_ulogic_vector(w-1 downto 0);
			variable cmp : std_ulogic_vector(w-1 downto 0);
			variable msb : std_ulogic_vector(w-1 downto 0);
			-- scratch vector collecting the mask bits feeding one msb bit
			variable tt : std_ulogic_vector(w/2-1 downto 0);
			variable gtr : std_ulogic;
			variable k : natural;
		begin
			-- gtr = '1' when C has a `1' at a position selected by the
			-- mask X (in Comp/Sort mode: the highest bit where A and B
			-- differ), which is taken to mean A > B
			-- d=2-4 (depends on chunk size)
			gtr := reduce_or(cc and xx);
			cmp := (others => gtr);
			-- d=5 (worst case)
			-- note: any non-'1' value of gtr (including 'X') takes
			-- the else branch
			if to_X01(gtr) = '1' then
				max := aa;
				min := bb;
			else
				max := bb;
				min := aa;
			end if;
			-- d=3 (worst case)
			-- Encode the mask position: result bit i is the OR of all
			-- mask bits xx(j-1) whose 1-based position j has bit i set.
			-- Result bits that are never written stay '0'.
			msb := (others => '0');
			for i in 0 to 31 loop
				exit when 2**i > w;
				k := 0;
				for j in 1 to w loop
					if j mod 2**(i+1) >= 2**i then
						tt(k) := xx(j - 1);
						k := k + 1;
					end if;
				end loop;
				-- sanity checks: at least one and at most w/2 bits collected
				assert k > 0;
				assert k <= w/2;
				msb(i) := reduce_or(tt(k-1 downto 0));
			end loop;
			YMin := min;
			YMax := max;
			YCmp := cmp;
			YMsb := msb;
		end chunk;

		variable mm : std_ulogic_vector(7 downto 0);
		-- per-chunk results, one set for each supported chunk size
		variable min08, max08, cmp08, msb08 : std_ulogic_vector(WIDTH-1 downto 0);
		variable min16, max16, cmp16, msb16 : std_ulogic_vector(WIDTH-1 downto 0);
		variable min32, max32, cmp32, msb32 : std_ulogic_vector(WIDTH-1 downto 0);
		variable min64, max64, cmp64, msb64 : std_ulogic_vector(WIDTH-1 downto 0);
	begin
		-- postprocessing
		-- (all four chunk sizes are evaluated; the selection happens below)
		-- d=5
		for i in WIDTH/8-1 downto 0 loop
			chunk(
				r_A( 8*i+ 7 downto  8*i),
				r_B( 8*i+ 7 downto  8*i),
				r_C( 8*i+ 7 downto  8*i),
				r_X08( 8*i+ 7 downto  8*i),
				min08( 8*i+ 7 downto  8*i),
				max08( 8*i+ 7 downto  8*i),
				cmp08( 8*i+ 7 downto  8*i),
				msb08( 8*i+ 7 downto  8*i));
		end loop;
		for i in WIDTH/16-1 downto 0 loop
			chunk(
				r_A(16*i+15 downto 16*i),
				r_B(16*i+15 downto 16*i),
				r_C(16*i+15 downto 16*i),
				r_X16(16*i+15 downto 16*i),
				min16(16*i+15 downto 16*i),
				max16(16*i+15 downto 16*i),
				cmp16(16*i+15 downto 16*i),
				msb16(16*i+15 downto 16*i));
		end loop;
		for i in WIDTH/32-1 downto 0 loop
			chunk(
				r_A(32*i+31 downto 32*i),
				r_B(32*i+31 downto 32*i),
				r_C(32*i+31 downto 32*i),
				r_X32(32*i+31 downto 32*i),
				min32(32*i+31 downto 32*i),
				max32(32*i+31 downto 32*i),
				cmp32(32*i+31 downto 32*i),
				msb32(32*i+31 downto 32*i));
		end loop;
		for i in WIDTH/64-1 downto 0 loop
			chunk(
				r_A(64*i+63 downto 64*i),
				r_B(64*i+63 downto 64*i),
				r_C(64*i+63 downto 64*i),
				r_X64(64*i+63 downto 64*i),
				min64(64*i+63 downto 64*i),
				max64(64*i+63 downto 64*i),
				cmp64(64*i+63 downto 64*i),
				msb64(64*i+63 downto 64*i));
		end loop;

		-- outputs
		-- select the result set for the registered SIMD chunk size;
		-- Ymsk is the registered MSB mask itself
		-- d=6
		mm := to_X01(r_M);
		case mm(7 downto 5) is
			when "111" =>
				-- 64-bit chunks
				Ymin <= min64;
				Ymax <= max64;
				Ycmp <= cmp64;
				Ymsb <= msb64;
				Ymsk <= r_X64;
			when "011" =>
				-- 32-bit chunks
				Ymin <= min32;
				Ymax <= max32;
				Ycmp <= cmp32;
				Ymsb <= msb32;
				Ymsk <= r_X32;
			when "001" =>
				-- 16-bit chunks
				Ymin <= min16;
				Ymax <= max16;
				Ycmp <= cmp16;
				Ymsb <= msb16;
				Ymsk <= r_X16;
			when "000" =>
				-- 8-bit chunks
				Ymin <= min08;
				Ymax <= max08;
				Ycmp <= cmp08;
				Ymsb <= msb08;
				Ymsk <= r_X08;
			when others =>
				-- unsupported SIMD size code: flag it with 'X'
				Ymin <= (others => 'X');
				Ymax <= (others => 'X');
				Ycmp <= (others => 'X');
				Ymsb <= (others => 'X');
				Ymsk <= (others => 'X');
		end case;
	end process;

	-- Output multiplexer (combinational).
	--
	-- Routes the stage-2 results onto Y and Z according to the
	-- registered operating mode r_M(3 downto 0), which must be one-hot:
	--   Msb1 / Msb0 : Y = Ymsb, Z = Ymsk
	--   Sort        : Y = Ymin, Z = Ymax
	--   Comp        : Y = Ycmp, Z = not Ycmp
	-- Any other mode code drives 'X' on both outputs.
	--
	-- Every signal read by the process is in the sensitivity list.
	-- Ymsk is included because Z follows it in the Msb0/Msb1 modes;
	-- without it Z could keep a stale mask in simulation.
	out_mux : process (r_M, Ymin, Ymax, Ycmp, Ymsb, Ymsk)
		variable mm : std_ulogic_vector(7 downto 0);
	begin
		-- XXX: I violate the `6 Gate Rule' here.  Since there's
		-- no XOR gate in the datapath of the second stage,
		-- I may get away with it...
		-- d=7
		mm := to_X01(r_M);
		case mm(3 downto 0) is
			when "1000" | "0100" => -- Msb1/Msb0
				Y <= Ymsb;
				Z <= Ymsk;
			when "0010" => -- Sort
				Y <= Ymin;
				Z <= Ymax;
			when "0001" => -- Comp
				Y <= Ycmp;
				Z <= not Ycmp;
			when others =>
				-- no mode or several modes selected at once
				Y <= (others => 'X');
				Z <= (others => 'X');
		end case;
	end process;
end Behave_1;

-- vi: set ts=4 sw=4 equalprg="fmt -72 -p--": please
