-- shuffle64.vhdl -- 64-Bit F-CPU Bit Shuffling Unit
-- Copyright (C) 2001 Michael Riepe <michael@stud.uni-hannover.de>
--
-- This program is free software; you can redistribute it and/or modify
-- it under the terms of the GNU General Public License as published by
-- the Free Software Foundation; either version 2 of the License, or
-- (at your option) any later version.
--
-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with this program; if not, write to the Free Software
-- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

-- $Id: shuffle64.vhdl,v 1.7 2001/09/27 13:47:15 michael Exp $

library IEEE;
use IEEE.std_logic_1164.all;
use work.Bit_Manipulation.all;
use work.Shuffle64_Config.all;

-- Shuffle64: port interface of the bit shuffling unit.
-- Exactly one of the operating-mode inputs is expected to be active at a
-- time; it selects which of the shift/rotate, bit-reverse or bytewise
-- results is driven onto Y.
entity Shuffle64 is
	generic (
		-- operand width in bits; the assertion below only accepts 64
		WIDTH : natural := 64
	);
	port (
		-- shiftee (first data operand)
		A : in std_ulogic_vector(WIDTH-1 downto 0);
		-- shift count; the bytewise modes also read it as a second
		-- data operand (byte selectors 8..F pick bytes from B)
		B : in std_ulogic_vector(WIDTH-1 downto 0);
		-- operating mode (mutually exclusive)
		ShiftL : in std_ulogic;
		ShiftR : in std_ulogic;
		ShiftRA : in std_ulogic;
		RotL : in std_ulogic;
		RotR : in std_ulogic;
		Bitrev : in std_ulogic;
		Byterev : in std_ulogic;
		Sdup : in std_ulogic;
		Mix : in std_ulogic;
		Expand : in std_ulogic;
		-- `high' flag for mix/expand instructions; the mode decoding
		-- in the architecture matches every other mode only with High = '0'
		High : in std_ulogic;
		-- SIMD mode flags: "000" = 8-bit, "001" = 16-bit,
		-- "011" = 32-bit, "111" = 64-bit chunks
		U : in std_ulogic_vector(2 downto 0);
		-- clock/reset/enable inputs (unused)
		Clk : in std_ulogic;
		Rst : in std_ulogic;
		En : in std_ulogic;
	--
		-- output: result of the selected operation
		Y : out std_ulogic_vector(WIDTH-1 downto 0)
	);
--pragma synthesis_off
begin
	-- simulation-only check: the architecture hard-codes 64-bit slices
	assert WIDTH = 64
		report "WIDTH must be 64"
		severity failure;
--pragma synthesis_on
end Shuffle64;

architecture Behave_1 of Shuffle64 is
	-- byte-wide 16:1 mux
	-- se(3) chooses the operand ('0' = aa, '1' = bb), se(2 downto 0)
	-- chooses the byte lane within it.  Any selector bit that is not a
	-- plain '0' or '1' yields an all-'X' byte.
	function ab_sel (aa : in std_ulogic_vector(WIDTH-1 downto 0);
					 bb : in std_ulogic_vector(WIDTH-1 downto 0);
					 se : in std_ulogic_vector(3 downto 0)) return std_ulogic_vector is
		variable src : std_ulogic_vector(WIDTH-1 downto 0);
		variable lane : std_ulogic_vector(2 downto 0);
		variable res : std_ulogic_vector(7 downto 0);
	begin
		-- operand select
		if se(3) = '0' then
			src := aa;
		elsif se(3) = '1' then
			src := bb;
		else
			src := (others => 'X'); -- don't care
		end if;

		-- byte lane select
		lane := se(2 downto 0);
		case lane is
			when "000" => res := src( 7 downto  0);
			when "001" => res := src(15 downto  8);
			when "010" => res := src(23 downto 16);
			when "011" => res := src(31 downto 24);
			when "100" => res := src(39 downto 32);
			when "101" => res := src(47 downto 40);
			when "110" => res := src(55 downto 48);
			when "111" => res := src(63 downto 56);
			when others => res := (others => 'X'); -- don't care
		end case;
		return res;
	end ab_sel;

	-- byte-wide 12:1 mux (for upper part of bit-shifted data)
	-- Selectors 0..7 return the corresponding byte of aa; the odd
	-- selectors 9, B, D and F return a byte filled with xx(1), xx(3),
	-- xx(5) and xx(7) respectively.  Everything else is all-'X'.
	function hi_sel (aa : in std_ulogic_vector(WIDTH-1 downto 0);
					 xx : in std_ulogic_vector(WIDTH/8-1 downto 0);
					 se : in std_ulogic_vector(3 downto 0)) return std_ulogic_vector is
		variable lane : std_ulogic_vector(2 downto 0);
		variable res : std_ulogic_vector(7 downto 0);
	begin
		lane := se(2 downto 0);
		res := (others => 'X'); -- don't care unless a branch below matches
		if se(3) = '0' then
			-- data byte
			case lane is
				when "000" => res := aa( 7 downto  0);
				when "001" => res := aa(15 downto  8);
				when "010" => res := aa(23 downto 16);
				when "011" => res := aa(31 downto 24);
				when "100" => res := aa(39 downto 32);
				when "101" => res := aa(47 downto 40);
				when "110" => res := aa(55 downto 48);
				when "111" => res := aa(63 downto 56);
				when others => null;
			end case;
		elsif se(3) = '1' then
			-- fill byte
			case lane is
				when "001" => res := (others => xx(1));
				when "011" => res := (others => xx(3));
				when "101" => res := (others => xx(5));
				when "111" => res := (others => xx(7));
				when others => null;
			end case;
		end if;
		return res;
	end hi_sel;

	-- byte-wide 9:1 mux (for lower part of bit-shifted data)
	-- Selectors 0..7 return the corresponding byte of aa, selector 8
	-- returns an all-zero byte, everything else is all-'X'.
	function lo_sel (aa : in std_ulogic_vector(WIDTH-1 downto 0);
					 se : in std_ulogic_vector(3 downto 0)) return std_ulogic_vector is
		variable lane : std_ulogic_vector(2 downto 0);
		variable res : std_ulogic_vector(7 downto 0);
	begin
		lane := se(2 downto 0);
		if se = "1000" then
			-- nothing is shifted in
			res := (others => '0');
		elsif se(3) = '0' then
			case lane is
				when "000" => res := aa( 7 downto  0);
				when "001" => res := aa(15 downto  8);
				when "010" => res := aa(23 downto 16);
				when "011" => res := aa(31 downto 24);
				when "100" => res := aa(39 downto 32);
				when "101" => res := aa(47 downto 40);
				when "110" => res := aa(55 downto 48);
				when "111" => res := aa(63 downto 56);
				when others => res := (others => 'X'); -- don't care
			end case;
		else
			res := (others => 'X'); -- don't care
		end if;
		return res;
	end lo_sel;

	-- basic bitwise right shift/rotate operation
	-- NOTE: all bitwise operations use this.
	--
	-- A is the shiftee, B the shift count (bits 2..0 = bit shift inside
	-- a byte, bits 5..3 = byte shift), U the SIMD chunk size.  Rot turns
	-- the shift into a rotate, Ext requests sign extension (ignored when
	-- rotating).  Every byte is first shifted on its own into a 16-bit
	-- window; the two window halves are then routed to their final byte
	-- positions by per-byte selectors taken from a lookup table.
	function shuffle_8x8 (A, B, U : in std_ulogic_vector;
						  Rot, Ext : in std_ulogic) return std_ulogic_vector is
		constant w : natural := A'length;
		constant nbytes : natural := w / 8;
		alias src : std_ulogic_vector(w-1 downto 0) is A;
		alias cnt : std_ulogic_vector(B'length-1 downto 0) is B;
		alias simd : std_ulogic_vector(U'length-1 downto 0) is U;
		-- per-byte halves of the 16-bit shift window
		variable upper : std_ulogic_vector(w-1 downto 0);
		variable lower : std_ulogic_vector(w-1 downto 0);
		variable res : std_ulogic_vector(w-1 downto 0);
		variable win : std_ulogic_vector(15 downto 0);
		variable bitcnt : std_ulogic_vector(2 downto 0);
		-- table index and the nibble-per-byte selectors it yields
		variable bytesel : std_ulogic_vector(6 downto 0);
		variable upper_pick : std_ulogic_vector(4*nbytes-1 downto 0);
		variable lower_pick : std_ulogic_vector(4*nbytes-1 downto 0);
		-- per-byte fill bit used for sign extension
		variable sgn : std_ulogic_vector(nbytes-1 downto 0);
		variable gate : std_ulogic;
		variable stride : natural;
		variable pos : natural;
	begin
--pragma synthesis_off
		--assert w mod 8 = 0 report "w must be a multiple of 8" severity failure;
		assert w = 64 report "w must be 64" severity failure;
		assert B'length >= 6 report "B is too short" severity failure;
		assert U'length >= 3 report "U is too short" severity failure;
--pragma synthesis_on

		-- byte routing: index = Rot & chunk size & (byte count masked
		-- by chunk size); each hex digit below is the selector for one
		-- output byte, most significant byte first
		bytesel(6) := Rot;
		bytesel(5 downto 3) := simd(2 downto 0);
		bytesel(2 downto 0) := cnt(5 downto 3) and simd(2 downto 0);
		bytesel := to_X01(bytesel);
		case bytesel is
			-- shift.8
			when "0000000" => upper_pick := X"76543210"; lower_pick := X"88888888";
			-- rot.8
			when "1000000" => upper_pick := X"76543210"; lower_pick := X"76543210";
			-- shift.16
			when "0001000" => upper_pick := X"76543210"; lower_pick := X"87858381";
			when "0001001" => upper_pick := X"F7D5B391"; lower_pick := X"88888888";
			-- rot.16
			when "1001000" => upper_pick := X"76543210"; lower_pick := X"67452301";
			when "1001001" => upper_pick := X"67452301"; lower_pick := X"76543210";
			-- shift.32
			when "0011000" => upper_pick := X"76543210"; lower_pick := X"87658321";
			when "0011001" => upper_pick := X"F765B321"; lower_pick := X"88768832";
			when "0011010" => upper_pick := X"FF76BB32"; lower_pick := X"88878883";
			when "0011011" => upper_pick := X"FFF7BBB3"; lower_pick := X"88888888";
			-- rot.32
			when "1011000" => upper_pick := X"76543210"; lower_pick := X"47650321";
			when "1011001" => upper_pick := X"47650321"; lower_pick := X"54761032";
			when "1011010" => upper_pick := X"54761032"; lower_pick := X"65472103";
			when "1011011" => upper_pick := X"65472103"; lower_pick := X"76543210";
			-- shift.64
			when "0111000" => upper_pick := X"76543210"; lower_pick := X"87654321";
			when "0111001" => upper_pick := X"F7654321"; lower_pick := X"88765432";
			when "0111010" => upper_pick := X"FF765432"; lower_pick := X"88876543";
			when "0111011" => upper_pick := X"FFF76543"; lower_pick := X"88887654";
			when "0111100" => upper_pick := X"FFFF7654"; lower_pick := X"88888765";
			when "0111101" => upper_pick := X"FFFFF765"; lower_pick := X"88888876";
			when "0111110" => upper_pick := X"FFFFFF76"; lower_pick := X"88888887";
			when "0111111" => upper_pick := X"FFFFFFF7"; lower_pick := X"88888888";
			-- rot.64
			when "1111000" => upper_pick := X"76543210"; lower_pick := X"07654321";
			when "1111001" => upper_pick := X"07654321"; lower_pick := X"10765432";
			when "1111010" => upper_pick := X"10765432"; lower_pick := X"21076543";
			when "1111011" => upper_pick := X"21076543"; lower_pick := X"32107654";
			when "1111100" => upper_pick := X"32107654"; lower_pick := X"43210765";
			when "1111101" => upper_pick := X"43210765"; lower_pick := X"54321076";
			when "1111110" => upper_pick := X"54321076"; lower_pick := X"65432107";
			when "1111111" => upper_pick := X"65432107"; lower_pick := X"76543210";
			-- don't care
			when others =>
				upper_pick := (others => 'X');
				lower_pick := (others => 'X');
		end case;

		-- SIMD sign extension: start with every byte enabled, then
		-- disable the bytes that are not the top byte of a chunk
		sgn := (others => '1');
		for j in simd'range loop
			stride := 2 ** j;
			for i in 0 to sgn'length/stride/2-1 loop
				pos := 2*i*stride + stride - 1;
				if pos < sgn'length then
					sgn(pos) := not simd(j);
				end if;
			end loop;
		end loop;
		-- keep the sign bit only for an arithmetic shift
		gate := Ext and not Rot;
		for i in 0 to nbytes-1 loop
			sgn(i) := sgn(i) and gate and src(8*i+7);
		end loop;
		sgn := to_X01(sgn);

		-- bit-shift individual bytes: the byte sits in the upper half
		-- of the window, bits shifted out land in the lower half
		bitcnt := to_X01(cnt(2 downto 0));
		for i in 0 to nbytes-1 loop
			win := (others => '0');
			win(15 downto 8) := src(8*i+7 downto 8*i);
			win := to_X01(win);
			case bitcnt is
				when "000" => win := rshift(win, 0, sgn(i));
				when "001" => win := rshift(win, 1, sgn(i));
				when "010" => win := rshift(win, 2, sgn(i));
				when "011" => win := rshift(win, 3, sgn(i));
				when "100" => win := rshift(win, 4, sgn(i));
				when "101" => win := rshift(win, 5, sgn(i));
				when "110" => win := rshift(win, 6, sgn(i));
				when "111" => win := rshift(win, 7, sgn(i));
				when others => win := (others => 'X'); -- don't care
			end case;
			upper(8*i+7 downto 8*i) := win(15 downto 8);
			lower(8*i+7 downto 8*i) := win( 7 downto 0);
		end loop;

		-- output stage: merge the routed upper and lower halves
		for i in 0 to nbytes-1 loop
			res(8*i+7 downto 8*i) :=
				hi_sel(upper, sgn, upper_pick(4*i+3 downto 4*i))
				or lo_sel(lower, lower_pick(4*i+3 downto 4*i));
		end loop;
		return res;
	end shuffle_8x8;

	-- alternative bitwise right shift/rotate operation
	--
	-- Same interface as shuffle_8x8: A is the shiftee, B the shift
	-- count (bits 5..0 are used), U the SIMD chunk size, Rot selects
	-- rotate and Ext sign extension.  The shift is carried out in three
	-- phases working on 4-bit, 16-bit and 64-bit groups.  Each phase
	-- produces an H vector (bits that stay inside the chunk) and an L
	-- vector (bits that left the chunk at the bottom, placed where a
	-- rotate would re-insert them).  A separate sign-extension mask is
	-- OR-ed in for arithmetic shifts.
	function shuffle_4x4 (A, B, U : in std_ulogic_vector;
						  Rot, Ext : in std_ulogic) return std_ulogic_vector is
		-- phase 1: shift every nibble right by B (0 to 3 bits).
		-- H gets the bits remaining in the nibble, L the bits that
		-- were shifted out (left-aligned in a nibble of their own).
		procedure phase1 (A : in std_ulogic_vector;
						  B : in std_ulogic_vector(1 downto 0);
						  H : out std_ulogic_vector;
						  L : out std_ulogic_vector) is
			constant w : natural := A'length;
			alias aa : std_ulogic_vector(w-1 downto 0) is A;
			alias hh : std_ulogic_vector(w-1 downto 0) is H;
			alias ll : std_ulogic_vector(w-1 downto 0) is L;
			variable tt : std_ulogic_vector(7 downto 0);
			variable bb : std_ulogic_vector(1 downto 0);
		begin
			assert w mod 4 = 0;
			assert A'length = w;
			assert H'length = w;
			assert L'length = w;

			bb := to_X01(B);
			for i in w/4-1 downto 0 loop
				-- nibble in the upper half of an 8-bit window
				tt := (others => '0');
				tt(7 downto 4) := aa(4*i+3 downto 4*i);
				case bb is
					when "00" => null;
					when "01" => tt := rshift(tt, 1);
					when "10" => tt := rshift(tt, 2);
					when "11" => tt := rshift(tt, 3);
					when others => tt := (others => 'X');
				end case;
				hh(4*i+3 downto 4*i) := tt(7 downto 4);
				ll(4*i+3 downto 4*i) := tt(3 downto 0);
			end loop;
		end phase1;

		-- phase 2: combine nibbles inside every 16-bit group and apply
		-- the nibble-granular part of the shift.  U(0) = '0' selects
		-- 8-bit chunks (only B(0) is relevant), U(0) = '1' selects
		-- 16-bit chunks (B gives the shift in units of 4 bits).
		procedure phase2 (A1, A0 : in std_ulogic_vector;
						  B : in std_ulogic_vector(1 downto 0);
						  U : in std_ulogic_vector(2 downto 0);
						  H : out std_ulogic_vector;
						  L : out std_ulogic_vector) is
			constant w : natural := A1'length;
			alias ah : std_ulogic_vector(w-1 downto 0) is A1;
			alias al : std_ulogic_vector(w-1 downto 0) is A0;
			alias hh : std_ulogic_vector(w-1 downto 0) is H;
			alias ll : std_ulogic_vector(w-1 downto 0) is L;
			variable ubb : std_ulogic_vector(2 downto 0);

			-- process one 16-bit group; xh/xl are the phase 1 outputs
			procedure slice (xh, xl : in std_ulogic_vector(15 downto 0);
							 ubb : in std_ulogic_vector(2 downto 0);
							 yh, yl : out std_ulogic_vector(15 downto 0)) is
				variable sel : std_ulogic_vector(2 downto 0);
			begin
				yh := (others => '0');
				yl := (others => '0');
				sel := to_X01(ubb);
				case sel is
					-- 8-bit mode, shift = 0 to 3
					when "000" | "010" | "0X0" =>
						yh(15 downto 12) := xh(15 downto 12);
						yh(11 downto  8) := xh(11 downto  8) or xl(15 downto 12);
						yl(15 downto 12) :=                     xl(11 downto  8);
						--
						yh( 7 downto  4) := xh( 7 downto  4);
						yh( 3 downto  0) := xh( 3 downto  0) or xl( 7 downto  4);
						yl( 7 downto  4) :=                     xl( 3 downto  0);
					-- 8-bit mode, shift = 4 to 7
					when "001" | "011" | "0X1" =>
						yh(11 downto  8) := xh(15 downto 12);
						yl(15 downto 12) := xh(11 downto  8) or xl(15 downto 12);
						yl(11 downto  8) :=                     xl(11 downto  8);
						--
						yh( 3 downto  0) := xh( 7 downto  4);
						yl( 7 downto  4) := xh( 3 downto  0) or xl( 7 downto  4);
						yl( 3 downto  0) :=                     xl( 3 downto  0);
					-- 16-bit mode, shift = 0 to 3
					when "100" =>
						yh(15 downto 12) := xh(15 downto 12);
						yh(11 downto  8) := xh(11 downto  8) or xl(15 downto 12);
						yh( 7 downto  4) := xh( 7 downto  4) or xl(11 downto  8);
						yh( 3 downto  0) := xh( 3 downto  0) or xl( 7 downto  4);
						yl(15 downto 12) :=                     xl( 3 downto  0);
					-- 16-bit mode, shift = 4 to 7
					when "101" =>
						yh(11 downto  8) := xh(15 downto 12);
						yh( 7 downto  4) := xh(11 downto  8) or xl(15 downto 12);
						yh( 3 downto  0) := xh( 7 downto  4) or xl(11 downto  8);
						yl(15 downto 12) := xh( 3 downto  0) or xl( 7 downto  4);
						yl(11 downto  8) :=                     xl( 3 downto  0);
					-- 16-bit mode, shift = 8 to 11
					when "110" =>
						yh( 7 downto  4) := xh(15 downto 12);
						yh( 3 downto  0) := xh(11 downto  8) or xl(15 downto 12);
						yl(15 downto 12) := xh( 7 downto  4) or xl(11 downto  8);
						yl(11 downto  8) := xh( 3 downto  0) or xl( 7 downto  4);
						yl( 7 downto  4) :=                     xl( 3 downto  0);
					-- 16-bit mode, shift = 12 to 15
					when "111" =>
						yh( 3 downto  0) := xh(15 downto 12);
						yl(15 downto 12) := xh(11 downto  8) or xl(15 downto 12);
						yl(11 downto  8) := xh( 7 downto  4) or xl(11 downto  8);
						yl( 7 downto  4) := xh( 3 downto  0) or xl( 7 downto  4);
						yl( 3 downto  0) :=                     xl( 3 downto  0);
					-- don't care
					when others =>
						yh := (others => 'X');
						yl := (others => 'X');
				end case;
			end slice;
		begin
			assert w mod 16 = 0;
			assert A1'length = w;
			assert A0'length = w;
			assert H'length = w;
			assert L'length = w;

			-- selector = chunk-size flag followed by the nibble count
			ubb := U(0) & B;
			for i in w/16-1 downto 0 loop
				slice(ah(16*i+15 downto 16*i), al(16*i+15 downto 16*i),
					ubb, hh(16*i+15 downto 16*i), ll(16*i+15 downto 16*i));
			end loop;
		end phase2;

		-- phase 3: combine 16-bit groups inside every 64-bit group and
		-- apply the 16-bit-granular part of the shift.  U(2 downto 1)
		-- = "01" selects 32-bit chunks, "11" selects 64-bit chunks;
		-- any other selector passes the phase 2 result through.
		procedure phase3 (A1, A0 : in std_ulogic_vector;
						  B : in std_ulogic_vector(1 downto 0);
						  U : in std_ulogic_vector(2 downto 0);
						  H : out std_ulogic_vector;
						  L : out std_ulogic_vector) is
			constant w : natural := A1'length;
			alias ah : std_ulogic_vector(w-1 downto 0) is A1;
			alias al : std_ulogic_vector(w-1 downto 0) is A0;
			alias hh : std_ulogic_vector(w-1 downto 0) is H;
			alias ll : std_ulogic_vector(w-1 downto 0) is L;
			variable ubb : std_ulogic_vector(3 downto 0);

			-- process one 64-bit group; xh/xl are the phase 2 outputs
			procedure slice (xh, xl : in std_ulogic_vector(63 downto 0);
							 ubb : in std_ulogic_vector(3 downto 0);
							 yh, yl : out std_ulogic_vector(63 downto 0)) is
				variable sel : std_ulogic_vector(3 downto 0);
			begin
				yh := (others => '0');
				yl := (others => '0');
				sel := to_X01(ubb);
				case sel is
					-- 32-bit mode, shift = 0 to 15
					when "0100" | "0110" | "01X0" =>
						yh(63 downto 48) := xh(63 downto 48);
						yh(47 downto 32) := xh(47 downto 32) or xl(63 downto 48);
						yl(63 downto 48) :=                     xl(47 downto 32);
						yh(31 downto 16) := xh(31 downto 16);
						yh(15 downto  0) := xh(15 downto  0) or xl(31 downto 16);
						yl(31 downto 16) :=                     xl(15 downto  0);
					-- 32-bit mode, shift = 16 to 31
					when "0101" | "0111" | "01X1" =>
						yh(47 downto 32) := xh(63 downto 48);
						yl(63 downto 48) := xh(47 downto 32) or xl(63 downto 48);
						yl(47 downto 32) :=                     xl(47 downto 32);
						yh(15 downto  0) := xh(31 downto 16);
						yl(31 downto 16) := xh(15 downto  0) or xl(31 downto 16);
						yl(15 downto  0) :=                     xl(15 downto  0);
					-- 64-bit mode, shift = 0 to 15
					when "1100" =>
						yh(63 downto 48) := xh(63 downto 48);
						yh(47 downto 32) := xh(47 downto 32) or xl(63 downto 48);
						yh(31 downto 16) := xh(31 downto 16) or xl(47 downto 32);
						yh(15 downto  0) := xh(15 downto  0) or xl(31 downto 16);
						yl(63 downto 48) :=                     xl(15 downto  0);
					-- 64-bit mode, shift = 16 to 31
					when "1101" =>
						yh(47 downto 32) := xh(63 downto 48);
						yh(31 downto 16) := xh(47 downto 32) or xl(63 downto 48);
						yh(15 downto  0) := xh(31 downto 16) or xl(47 downto 32);
						yl(63 downto 48) := xh(15 downto  0) or xl(31 downto 16);
						yl(47 downto 32) :=                     xl(15 downto  0);
					-- 64-bit mode, shift = 32 to 47
					when "1110" =>
						yh(31 downto 16) := xh(63 downto 48);
						yh(15 downto  0) := xh(47 downto 32) or xl(63 downto 48);
						yl(63 downto 48) := xh(31 downto 16) or xl(47 downto 32);
						yl(47 downto 32) := xh(15 downto  0) or xl(31 downto 16);
						yl(31 downto 16) :=                     xl(15 downto  0);
					-- 64-bit mode, shift = 48 to 63
					when "1111" =>
						yh(15 downto  0) := xh(63 downto 48);
						yl(63 downto 48) := xh(47 downto 32) or xl(63 downto 48);
						yl(47 downto 32) := xh(31 downto 16) or xl(47 downto 32);
						yl(31 downto 16) := xh(15 downto  0) or xl(31 downto 16);
						yl(15 downto  0) :=                     xl(15 downto  0);
					-- pass through
					when others =>
						yh := xh;
						yl := xl;
				end case;
			end slice;
		begin
			-- The loop below walks whole 64-bit groups, so the width
			-- must be a multiple of 64 (not merely of 16); otherwise
			-- the remaining bits of H and L would never be assigned.
			assert w mod 64 = 0;
			assert A1'length = w;
			assert A0'length = w;
			assert H'length = w;
			assert L'length = w;

			-- selector = chunk-size flags followed by the 16-bit count
			ubb := U(2 downto 1) & B;
			for i in w/64-1 downto 0 loop
				slice(ah(64*i+63 downto 64*i), al(64*i+63 downto 64*i),
					ubb, hh(64*i+63 downto 64*i), ll(64*i+63 downto 64*i));
			end loop;
		end phase3;

		-- sign extension mask: for every chunk, Y has the chunk's sign
		-- bit replicated into the bit positions vacated by the shift
		-- (plus the sign position itself) and '0' everywhere else.
		-- Built in the same three steps as the data path.
		procedure signext (A : in std_ulogic_vector;
						   B : in std_ulogic_vector(5 downto 0);
						   U : in std_ulogic_vector(2 downto 0);
						   Y : out std_ulogic_vector) is
			constant w : natural := A'length;
			alias aa : std_ulogic_vector(w-1 downto 0) is A;
			alias bb : std_ulogic_vector(B'length-1 downto 0) is B;
			alias uu : std_ulogic_vector(U'length-1 downto 0) is U;
			alias yy : std_ulogic_vector(w-1 downto 0) is Y;
			variable bb1 : std_ulogic_vector(1 downto 0);
			variable bb2 : std_ulogic_vector(2 downto 0);
			variable bb3 : std_ulogic_vector(3 downto 0);
			variable t04 : std_ulogic_vector(3 downto 0);
			variable t16 : std_ulogic_vector(15 downto 0);
			variable t64 : std_ulogic_vector(63 downto 0);
			-- ex1 holds one nibble per input byte
			variable ex1 : std_ulogic_vector(w/2-1 downto 0);
			variable ex2 : std_ulogic_vector(w-1 downto 0);
			variable ex3 : std_ulogic_vector(w-1 downto 0);
		begin
			assert w mod 64 = 0;
			assert A'length = w;
			assert Y'length = w;

			-- step 1: per byte, sign bit copied into the top
			-- (bit shift + 1) positions of a nibble
			bb1 := to_X01(bb(1 downto 0));
			for i in w/8-1 downto 0 loop
				t04 := (others => '0');
				case bb1 is
					when "00" => t04(3 downto 3) := (3 downto 3 => aa(8*i+7));
					when "01" => t04(3 downto 2) := (3 downto 2 => aa(8*i+7));
					when "10" => t04(3 downto 1) := (3 downto 1 => aa(8*i+7));
					when "11" => t04(3 downto 0) := (3 downto 0 => aa(8*i+7));
					when others => t04 := (others => 'X');
				end case;
				ex1(4*i+3 downto 4*i) := t04;
			end loop;

			-- step 2: widen to 8-bit or 16-bit chunks
			bb2 := to_X01(uu(0) & bb(3 downto 2));
			for i in w/16-1 downto 0 loop
				t16 := (others => '0');
				case bb2 is
					-- 8-bit mode, shift = 0 to 3
					when "000" | "010" | "0X0" =>
						t16(15 downto 12) := ex1(8*i+7 downto 8*i+4);
						t16( 7 downto  4) := ex1(8*i+3 downto 8*i+0);
					-- 8-bit mode, shift = 4 to 7
					when "001" | "011" | "0X1" =>
						t16(15 downto 12) := ex1(8*i+7 downto 8*i+4);
						t16(15 downto 8) := rshifta(t16(15 downto 8), 4);
						t16( 7 downto  4) := ex1(8*i+3 downto 8*i+0);
						t16( 7 downto 0) := rshifta(t16( 7 downto 0), 4);
					-- 16-bit mode, shift = 0 to 3
					when "100" =>
						t16(15 downto 12) := ex1(8*i+7 downto 8*i+4);
					-- 16-bit mode, shift = 4 to 7
					when "101" =>
						t16(15 downto 12) := ex1(8*i+7 downto 8*i+4);
						t16 := rshifta(t16,  4);
					-- 16-bit mode, shift = 8 to 11
					when "110" =>
						t16(15 downto 12) := ex1(8*i+7 downto 8*i+4);
						t16 := rshifta(t16,  8);
					-- 16-bit mode, shift = 12 to 15
					when "111" =>
						t16(15 downto 12) := ex1(8*i+7 downto 8*i+4);
						t16 := rshifta(t16, 12);
					when others =>
						t16 := (others => 'X');
				end case;
				ex2(16*i+15 downto 16*i) := t16;
			end loop;

			-- step 3: widen to 32-bit or 64-bit chunks
			bb3 := to_X01(uu(2 downto 1) & bb(5 downto 4));
			for i in w/64-1 downto 0 loop
				t64 := (others => '0');
				case bb3 is
					-- 32-bit mode, shift = 0 to 15
					when "0100" | "0110" | "01X0" =>
						t64(63 downto 48) := ex2(64*i+63 downto 64*i+48);
						t64(31 downto 16) := ex2(64*i+31 downto 64*i+16);
					-- 32-bit mode, shift = 16 to 31
					when "0101" | "0111" | "01X1" =>
						t64(63 downto 48) := ex2(64*i+63 downto 64*i+48);
						t64(63 downto 32) := rshifta(t64(63 downto 32), 16);
						t64(31 downto 16) := ex2(64*i+31 downto 64*i+16);
						t64(31 downto  0) := rshifta(t64(31 downto  0), 16);
					-- 64-bit mode, shift = 0 to 15
					when "1100" =>
						t64(63 downto 48) := ex2(64*i+63 downto 64*i+48);
					-- 64-bit mode, shift = 16 to 31
					when "1101" =>
						t64(63 downto 48) := ex2(64*i+63 downto 64*i+48);
						t64 := rshifta(t64, 16);
					-- 64-bit mode, shift = 32 to 47
					when "1110" =>
						t64(63 downto 48) := ex2(64*i+63 downto 64*i+48);
						t64 := rshifta(t64, 32);
					-- 64-bit mode, shift = 48 to 63
					when "1111" =>
						t64(63 downto 48) := ex2(64*i+63 downto 64*i+48);
						t64 := rshifta(t64, 48);
					-- pass through
					when others =>
						t64 := ex2(64*i+63 downto 64*i);
				end case;
				ex3(64*i+63 downto 64*i) := t64;
			end loop;

			Y := ex3;
		end signext;

		constant w : natural := A'length;
		alias aa : std_ulogic_vector(w-1 downto 0) is A;
		-- B only needs to supply bits 5..0; it does not have to be
		-- as wide as A (same convention as shuffle_8x8)
		alias bb : std_ulogic_vector(B'length-1 downto 0) is B;
		alias uu : std_ulogic_vector(U'length-1 downto 0) is U;
		variable h1, l1 : std_ulogic_vector(w-1 downto 0);
		variable h2, l2 : std_ulogic_vector(w-1 downto 0);
		variable h3, l3 : std_ulogic_vector(w-1 downto 0);
		variable sx : std_ulogic_vector(w-1 downto 0);
	begin
		phase1(aa, bb(1 downto 0), h1, l1);
		phase2(h1, l1, bb(3 downto 2), uu, h2, l2);
		phase3(h2, l2, bb(5 downto 4), uu, h3, l3);
		signext(aa, bb(5 downto 0), uu, sx);
		if to_X01(Rot) = '1' then
			-- rotate: re-insert the bits that dropped out
			return h3 or l3;
		elsif to_X01(Ext) = '1' then
			-- arithmetic shift: fill the vacated bits with the sign
			return h3 or sx;
		else
			-- logical shift
			return h3;
		end if;
	end shuffle_4x4;

	-- Operating mode as one vector: bits 10 downto 1 carry the ten
	-- mode inputs in port order (ShiftL first), bit 0 carries `High'.
	subtype mode_type is std_ulogic_vector(10 downto 0);

	-- Legal mode encodings as matched by the output mux.  Only the
	-- mix/expand encodings exist in a variant with bit 0 (`High') set.
	constant MODE_SHIFTL  : mode_type := "10000000000";
	constant MODE_SHIFTR  : mode_type := "01000000000";
	constant MODE_SHIFTRA : mode_type := "00100000000";
	constant MODE_ROTL    : mode_type := "00010000000";
	constant MODE_ROTR    : mode_type := "00001000000";
	constant MODE_BITREV  : mode_type := "00000100000";
	constant MODE_BYTEREV : mode_type := "00000010000";
	constant MODE_SDUP    : mode_type := "00000001000";
	constant MODE_MIXL    : mode_type := "00000000100";
	constant MODE_MIXH    : mode_type := "00000000101";
	constant MODE_EXPANDL : mode_type := "00000000010";
	constant MODE_EXPANDH : mode_type := "00000000011";

	-- mode inputs gathered into one vector for the output mux
	signal Mode : mode_type;

	-- intermediate results, one per datapath; the output mux picks one
	signal Y_ShiftRotL : std_ulogic_vector(WIDTH-1 downto 0);
	signal Y_ShiftRotR : std_ulogic_vector(WIDTH-1 downto 0);
	signal Y_Bitrev : std_ulogic_vector(WIDTH-1 downto 0);
	signal Y_Bytewise : std_ulogic_vector(WIDTH-1 downto 0);
begin
	-- mode vector: the ten mode inputs followed by the `High' flag,
	-- most significant bit (ShiftL, bit 10) first
	Mode <= ShiftL & ShiftR & ShiftRA & RotL & RotR & Bitrev
		  & Byterev & Sdup & Mix & Expand & High;

	-- shift / rotate / bitrev
	-- Computes the right-shift family, the left-shift family and the
	-- bit-reverse result in parallel; the output mux selects later.
	process (A, B, U, ShiftRA, RotL, RotR)
		variable right_res : std_ulogic_vector(WIDTH-1 downto 0);
		variable left_res : std_ulogic_vector(WIDTH-1 downto 0);
		variable rev_res : std_ulogic_vector(WIDTH-1 downto 0);
		variable chunk : std_ulogic_vector(2 downto 0);
	begin
		-- A left shift is a right shift of the mirrored operand,
		-- mirrored back afterwards.
		-- shiftla ("arithmetic" left-shift) is also possible
		left_res := bit_reverse(A);
		if FOUR_BY_FOUR then
			-- shiftr / shiftra / rotr
			right_res := shuffle_4x4(A, B, U, RotR, ShiftRA);
			-- shiftl / rotl
			left_res := shuffle_4x4(left_res, B, U, RotL, '0');
		else
			-- shiftr / shiftra / rotr
			right_res := shuffle_8x8(A, B, U, RotR, ShiftRA);
			-- shiftl / rotl
			left_res := shuffle_8x8(left_res, B, U, RotL, '0');
		end if;
		left_res := bit_reverse(left_res);

		Y_ShiftRotR <= right_res;
		Y_ShiftRotL <= left_res;

		-- bitrev: mirror the lowest chunk of the left-shifted value,
		-- all higher bits are zero
		-- NOTE: `rev(shl(x))' = `shr(rev(x))'
		rev_res := (others => '0');
		chunk := to_X01(U);
		if chunk = "000" then
			rev_res( 7 downto 0) := bit_reverse(left_res( 7 downto 0));
		elsif chunk = "001" then
			rev_res(15 downto 0) := bit_reverse(left_res(15 downto 0));
		elsif chunk = "011" then
			rev_res(31 downto 0) := bit_reverse(left_res(31 downto 0));
		elsif chunk = "111" then
			rev_res(63 downto 0) := bit_reverse(left_res(63 downto 0));
		else
			rev_res := (others => 'X');
		end if;
		Y_Bitrev <= rev_res;
	end process;

	-- bytewise stuff (byterev, sdup, mix, expand)
	-- Every operation is a byte permutation of A and B.  The table
	-- below yields one hex digit per output byte (most significant
	-- byte first): 0..7 pick a byte of A, 8..F pick a byte of B.
	process (A, B, U, Byterev, Sdup, Mix, Expand, High)
		variable op : std_ulogic_vector(7 downto 0);
		variable pick : std_ulogic_vector(WIDTH/2-1 downto 0);
		variable res : std_ulogic_vector(WIDTH-1 downto 0);
	begin
		-- table index: mode bits, `High' flag, chunk size
		op := Byterev & Sdup & Mix & Expand & High & U;
		op := to_X01(op);
		case op is
			-- byterev
			when "10000000" => pick := X"76543210";
			when "10000001" => pick := X"67452301";
			when "10000011" => pick := X"45670123";
			when "10000111" => pick := X"01234567";
			-- sdup
			when "01000000" => pick := X"00000000";
			when "01000001" => pick := X"10101010";
			when "01000011" => pick := X"32103210";
			when "01000111" => pick := X"76543210";
			-- mixh
			-- XXX: swap input registers?
			when "00101000" => pick := X"F7E6D5C4";
			when "00101001" => pick := X"FE76DC54";
			when "00101011" => pick := X"FEDC7654";
			when "00101111" => pick := X"FEDCBA98"; -- XXX: undefined
			-- mixl
			-- XXX: swap input registers?
			when "00100000" => pick := X"B3A29180";
			when "00100001" => pick := X"BA329810";
			when "00100011" => pick := X"BA983210";
			when "00100111" => pick := X"76543210"; -- XXX: undefined
			-- expandh
			-- XXX: swap input registers?
			when "00011000" => pick := X"F7D5B391";
			when "00011001" => pick := X"FE76BA32";
			when "00011011" => pick := X"FEDC7654";
			when "00011111" => pick := X"FEDCBA98"; -- XXX: undefined
			-- expandl
			-- XXX: swap input registers?
			when "00010000" => pick := X"E6C4A280";
			when "00010001" => pick := X"DC549810";
			when "00010011" => pick := X"BA983210";
			when "00010111" => pick := X"76543210"; -- XXX: undefined
			-- don't care
			when others => pick := (others => 'X');
		end case;

		-- route one source byte to every output byte
		for lane in 0 to WIDTH/8-1 loop
			res(8*lane+7 downto 8*lane) :=
				ab_sel(A, B, pick(4*lane+3 downto 4*lane));
		end loop;
		Y_Bytewise <= res;
	end process;

	-- output mux
	-- Drives Y with the intermediate result that belongs to the
	-- active mode; any mode vector that is not one of the legal
	-- encodings produces all-'X'.
	process (Mode, Y_ShiftRotL, Y_ShiftRotR, Y_Bitrev, Y_Bytewise)
		variable op : mode_type;
	begin
		op := to_X01(Mode);
		if op = MODE_SHIFTL or op = MODE_ROTL then
			Y <= Y_ShiftRotL;
		elsif op = MODE_SHIFTR or op = MODE_SHIFTRA or op = MODE_ROTR then
			Y <= Y_ShiftRotR;
		elsif op = MODE_BITREV then
			Y <= Y_Bitrev;
		elsif op = MODE_BYTEREV or op = MODE_SDUP
		   or op = MODE_MIXL or op = MODE_MIXH
		   or op = MODE_EXPANDL or op = MODE_EXPANDH then
			Y <= Y_Bytewise;
		else
			Y <= (others => 'X'); -- don't care
		end if;
	end process;
end Behave_1;

-- vi: set ts=4 sw=4 equalprg="fmt -72 -p--": please
