-- shuffle64.vhdl -- 64-Bit F-CPU Bit Shuffling Unit
-- Copyright (C) 2001 - 2003 Michael Riepe <michael@stud.uni-hannover.de>
--
-- This program is free software; you can redistribute it and/or modify
-- it under the terms of the GNU General Public License as published by
-- the Free Software Foundation; either version 2 of the License, or
-- (at your option) any later version.
--
-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with this program; if not, write to the Free Software
-- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

-- @(#) $Id: shuffle64.vhdl,v 1.43 2003/04/17 02:22:49 michael Exp $

library IEEE;
use IEEE.std_logic_1164.all;

use work.Bit_Manipulation.all;

-- Shuffle64: shift / rotate / bit-reverse and byte-shuffling unit.
--
-- Exactly one operating-mode input is expected to be asserted at a time.
-- In architecture Behave_1 the modes Widen, Byterev, Permute, Mix, Expand
-- and Cshift select the bytewise datapath; every other mode selects the
-- bitwise (shift/rotate) datapath.
entity Shuffle64 is
	generic (
		-- operand width in bits; must be a multiple of 64 (asserted below)
		WIDTH : natural := 64
	);
	port (
		-- shiftee
		A : in std_ulogic_vector(WIDTH-1 downto 0);
		-- shift count
		B : in std_ulogic_vector(WIDTH-1 downto 0);
		-- common shift count on/off switch
		-- ('1': the low byte of B is used as the count for every byte lane)
		CommonShiftCount : in std_ulogic;
		-- operating mode (mutually exclusive)
		ShiftL : in std_ulogic;
		ShiftR : in std_ulogic;
		ShiftRA : in std_ulogic;
		RotL : in std_ulogic;
		RotR : in std_ulogic;
		Bitrev : in std_ulogic;
		Byterev : in std_ulogic;
		Permute : in std_ulogic;
		Mix : in std_ulogic;
		Expand : in std_ulogic;
		Cshift : in std_ulogic;
		Widen : in std_ulogic;
		-- SIMD mode flags
		-- (Behave_1 treats U(0), U(1), U(2) as the enables for 16-, 32-
		-- and 64-bit chunks; with none set it works on 8-bit chunks)
		U : in std_ulogic_vector(2 downto 0);
		-- clock/reset/enable inputs (unused)
		Clk : in std_ulogic;
		Rst : in std_ulogic;
		En : in std_ulogic;
	--
		-- output
		Y : out std_ulogic_vector(WIDTH-1 downto 0);
		-- alt. output (for double-width shifts)
		Z : out std_ulogic_vector(WIDTH-1 downto 0)
	);
-- simulation-only sanity check of the WIDTH generic
--pragma synthesis_off
begin
	assert (WIDTH >= 64) and (WIDTH mod 64 = 0)
		report "WIDTH must be an integer multiple of 64"
		severity failure;
--pragma synthesis_on
end Shuffle64;

architecture Behave_1 of Shuffle64 is
	-- One omega-network stage.
	--   A : data word of even length n
	--   B : n/2 exchange controls, one per input pair
	-- Output position p (and its partner p + n/2) is fed from input pair
	-- (2p, 2p+1).  With the control at '0' the lower output takes the even
	-- input and the upper output the odd input; a '1' exchanges the two.
	-- Any control value other than '1' (after to_X01) acts like '0'.
	function omega_1 (A, B : in std_ulogic_vector) return std_ulogic_vector is
		constant n : natural := A'length;
		variable data : std_ulogic_vector(n-1 downto 0);
		variable ctrl : std_ulogic_vector(n/2-1 downto 0);
		variable result : std_ulogic_vector(n-1 downto 0);
		variable straight, crossed : natural;
	begin
--pragma synthesis_off
		assert A'length = n;
		assert 2 * B'length = n;
--pragma synthesis_on
		data := A;
		ctrl := B;
		for pos in 0 to n-1 loop
			-- pick the input pair feeding this output position
			if 2 * pos < n then
				straight := 2 * pos;
				crossed := straight + 1;
			else
				crossed := 2 * pos - n;
				straight := crossed + 1;
			end if;
			-- the pair's control bit decides which member is routed here
			case to_X01(ctrl(straight / 2)) is
				when '1' =>
					result(pos) := data(crossed);
				when others =>
					result(pos) := data(straight);
			end case;
		end loop;
		return result;
	end omega_1;

	-- Thermometer decoder for a shift count.
	--   A : binary count of nbits bits
	-- Returns a vector of 2**nbits bits in which, for '0'/'1' inputs,
	-- bit j is '1' exactly when the unsigned value of A is greater than j
	-- (so the top bit is always '0').
	function shift_mask (A : in std_ulogic_vector) return std_ulogic_vector is
		constant nbits : natural := A'length;
		constant span : natural := 2 ** nbits;
		variable cnt : std_ulogic_vector(nbits-1 downto 0);
		variable mask : std_ulogic_vector(span-1 downto 0);
		variable half : natural;
	begin
		cnt := A;
		if nbits = 0 then
			mask(0) := '0';
		elsif nbits = 1 then
			mask(0) := cnt(0);
			mask(1) := '0';
		elsif nbits = 2 then
			mask(0) := cnt(1) or cnt(0);
			mask(1) := cnt(1);
			mask(2) := cnt(1) and cnt(0);
			mask(3) := '0';
		else
			-- hand-written pattern for the three low count bits
			mask(0) := cnt(2) or cnt(1) or cnt(0);
			mask(1) := cnt(2) or cnt(1);
			mask(2) := cnt(2) or (cnt(1) and cnt(0));
			mask(3) := cnt(2);
			mask(4) := cnt(2) and (cnt(1) or cnt(0));
			mask(5) := cnt(2) and cnt(1);
			mask(6) := cnt(2) and cnt(1) and cnt(0);
			mask(7) := '0';
			-- every further count bit doubles the mask: the new upper half
			-- is the old mask gated by that bit, the lower half is forced
			-- to '1' when the bit is set (empty range when nbits = 3)
			for bit_no in 3 to nbits-1 loop
				half := 2 ** bit_no;
				for pos in 0 to half-1 loop
					-- upper half must be derived before the lower is updated
					mask(pos+half) := mask(pos) and cnt(bit_no);
					mask(pos) := mask(pos) or cnt(bit_no);
				end loop;
			end loop;
		end if;
		return mask;
	end shift_mask;

	-- Derive the exchange controls of one omega stage for one chunk.
	--   B     : the w low-order count bits relevant to this stage
	--   Rev   : bit-reverse flag (callers pass the Bitrev mode bit)
	--   Right : right shift/rotate flag
	-- Returns 2**(w-1) control bits.
	function omega_ctrl (B : in std_ulogic_vector;
						 Rev, Right : in std_ulogic) return std_ulogic_vector is
		constant w : natural := B'length;
		constant ww : natural := 2**(w-1);
		variable bb : std_ulogic_vector(w-1 downto 0);
		variable bx : std_ulogic_vector(w-1 downto 0);
		variable yy : std_ulogic_vector(ww-1 downto 0);
	begin
--pragma synthesis_off
		assert w > 0;
--pragma synthesis_on
		bb := B;
		-- start with every control equal to the top count bit (inverted
		-- when Rev is set)
		yy := (others => bb(w-1) xor Rev);
		if w > 1 then
			-- invert the count when reversing or moving right
			bx := (others => Rev or Right);
			bx := bx xor bb;
			-- toggle the controls selected by the thermometer mask of the
			-- remaining (lower) count bits
			yy := yy xor shift_mask(bx(w-2 downto 0));
			if to_X01(Rev or Right) = '1' then
				-- NOTE(review): 3-argument lshift comes from
				-- work.Bit_Manipulation; presumably it shifts left by one and
				-- shifts in the third argument -- confirm against the package
				yy := lshift(not yy, 1, yy(ww-1));
			end if;
		end if;
		return yy;
	end omega_ctrl;

	-- SIMD rotate of a 64-bit word built from six omega_1 stages.
	--   A          : data word (64 bits, asserted)
	--   B          : per-chunk counts, same length as A
	--   U          : SIMD flags; U(0..2) gate stages 3..5
	--   Rev, Right : forwarded unchanged to omega_ctrl
	-- Returns the output of the last omega stage.
	function rotate (A : in std_ulogic_vector;
					 B : in std_ulogic_vector;
					 U : in std_ulogic_vector;
					 Rev, Right : in std_ulogic) return std_ulogic_vector is
		constant w : natural := A'length;
		constant hw : natural := w / 2;
		constant chunks : natural := w / 8;
		variable aa : std_ulogic_vector(w-1 downto 0);
		variable bb : std_ulogic_vector(w-1 downto 0);
		variable uu : std_ulogic_vector(U'length-1 downto 0);
		variable yy : std_ulogic_vector(w-1 downto 0);
		-- xx: controls for a whole stage, xt: controls of a single chunk
		variable xx : std_ulogic_vector(hw-1 downto 0);
		variable xt : std_ulogic_vector(hw-1 downto 0);
		variable pi : natural;
	begin
--pragma synthesis_off
		assert w = 64;
		assert B'length = w;
		assert U'length >= 3;
--pragma synthesis_on
		aa := A;
		bb := B;
		uu := U;
		-- omega stages
		yy := aa;
		-- stages 0..2: controls come from the low i+1 count bits of every
		-- byte of B and are always active
		for i in 0 to 2 loop
			pi := 2**i;
			for j in chunks-1 downto 0 loop
				xt(pi-1 downto 0) := omega_ctrl(bb(8*j+i downto 8*j), Rev, Right);
				-- interleave: control k of byte j lands at position chunks*k+j
				for k in pi-1 downto 0 loop
					xx(chunks*k+j) := xt(k);
				end loop;
			end loop;
			-- expand the chunks*pi collected controls to the hw controls of
			-- the stage (bit_duplicate is from work.Bit_Manipulation;
			-- presumably repeats each bit hw/8/pi times -- confirm)
			xx := bit_duplicate(xx(chunks*pi-1 downto 0), hw/8/pi);
			yy := omega_1(yy, xx);
		end loop;
		-- stages 3..5: controls come from the low i+1 count bits of every
		-- 2*pi-bit chunk of B and are forced to '0' unless U(i-3) is set
		for i in 3 to 5 loop
			pi := 2**i;
			for j in hw/pi-1 downto 0 loop
				xt(pi-1 downto 0) := omega_ctrl(bb(2*pi*j+i downto 2*pi*j), Rev, Right);
				for k in pi-1 downto 0 loop
					xx((hw/pi)*k+j) := xt(k) and uu(i-3);
				end loop;
			end loop;
			yy := omega_1(yy, xx);
		end loop;
		return yy;
	end rotate;

	-- operating-mode inputs gathered into one vector
	signal Mode : std_ulogic_vector(11 downto 0);
	-- asserted when one of the bytewise operations is selected
	signal Mode_Byte : std_ulogic;

	-- primary / alternate results of the bitwise (shift/rotate) process
	signal Y_Bitwise  : std_ulogic_vector(WIDTH-1 downto 0);
	signal Y_BitExt   : std_ulogic_vector(WIDTH-1 downto 0);
	-- primary / alternate results of the bytewise process
	signal Y_Bytewise : std_ulogic_vector(WIDTH-1 downto 0);
	signal Y_ByteExt  : std_ulogic_vector(WIDTH-1 downto 0);
begin
	-- mode vector, bit 11 down to bit 0:
	--   Widen, ShiftL, ShiftR, ShiftRA, RotL, RotR,
	--   Bitrev, Byterev, Permute, Mix, Expand, Cshift
	Mode <= Widen & ShiftL & ShiftR & ShiftRA & RotL & RotR &
			Bitrev & Byterev & Permute & Mix & Expand & Cshift;

	-- any of these modes routes the bytewise results to the outputs
	Mode_Byte <= Widen or Byterev or Permute or Mix or Expand or Cshift;

	-- shift / rotate / bitrev
	process (A, B, U, Mode, CommonShiftCount)
		-- Fill mask for one SIMD chunk.
		--   B          : shift count of the chunk (nbits bits)
		--   Rev, Right : when either is '1' the count is inverted before
		--                decoding and the decoded mask is inverted and
		--                passed through lshift(.., 1)
		-- Returns 2**nbits mask bits; the caller replaces masked positions
		-- by fill bits.
		function chunk_mask (B : in std_ulogic_vector;
							 Rev, Right : in std_ulogic) return std_ulogic_vector is
			constant nbits : natural := B'length;
			variable count : std_ulogic_vector(nbits-1 downto 0);
			variable invert : std_ulogic_vector(nbits-1 downto 0);
			variable mask : std_ulogic_vector(2**nbits-1 downto 0);
			variable flip : std_ulogic;
		begin
			flip := Rev or Right;
			count := B;
			invert := (others => flip);
			mask := shift_mask(count xor invert);
			if to_X01(flip) = '1' then
				mask := lshift(not mask, 1);
			end if;
			return mask;
		end chunk_mask;

		-- Shift / rotate / bit-reverse datapath for one 64-bit slice.
		--   A : data slice (64 bits, asserted)
		--   B : per-byte replicated shift counts for the slice
		--   U : SIMD flags; the highest set flag selects the chunk size
		--   M : mode vector (same bit layout as signal Mode)
		--   Y : primary result
		--   Z : alternate result (bits moved out of a shift; 'X' for rotates)
		procedure bitwise (A, B, U : in std_ulogic_vector;
						   M : in std_ulogic_vector;
						   Y, Z : out std_ulogic_vector) is
			constant L : natural := A'length;
			variable aa : std_ulogic_vector(L-1 downto 0);
			variable bx : std_ulogic_vector(L-1 downto 0);
			-- ee: fill value per bit position, mm: positions to be filled
			variable ee : std_ulogic_vector(L-1 downto 0);
			variable mm : std_ulogic_vector(L-1 downto 0);
			variable uu : std_ulogic_vector(U'length-1 downto 0);
			-- xx: rotated data from the omega network
			variable xx : std_ulogic_vector(L-1 downto 0);
			variable yy : std_ulogic_vector(L-1 downto 0);
			variable zz : std_ulogic_vector(L-1 downto 0);
			variable Right : std_ulogic;
		begin
--pragma synthesis_off
			assert L = 64;
			assert A'length = L;
			assert B'length = L;
			assert Y'length = L;
			assert Z'length = L;
--pragma synthesis_on

			-- inputs
			-- d=0
			aa := A;
			bx := B;
			uu := to_X01(U);
			-- d=1
			-- right-going operations: ShiftR (9), ShiftRA (8), RotR (6)
			Right := M(9) or M(8) or M(6);

			-- omega network (SIMD left/right rotate)
			-- M(5) is the Bitrev mode bit
			xx := rotate(aa, bx, uu, M(5), Right);

			-- SIMD mask replication / sign extension
			-- the fill value is non-zero only for ShiftRA (M(8)); it is then
			-- ANDed with a per-chunk replicated bit taken from aa
			-- (bit_extract / bit_duplicate are from work.Bit_Manipulation;
			-- presumably they pick and replicate each chunk's top bit --
			-- confirm against the package)
			ee := (others => M(8));
			if uu(2) = '1' then
				-- one 64-bit chunk, 6-bit count
				mm(63 downto  0) := chunk_mask(bx( 5 downto  0), M(5), Right);
				ee := ee and bit_duplicate(bit_extract(aa, 64, 63), 64);
			elsif uu(1) = '1' then
				-- two 32-bit chunks, 5-bit counts
				mm(31 downto  0) := chunk_mask(bx( 4 downto  0), M(5), Right);
				mm(63 downto 32) := chunk_mask(bx(36 downto 32), M(5), Right);
				ee := ee and bit_duplicate(bit_extract(aa, 32, 31), 32);
			elsif uu(0) = '1' then
				-- four 16-bit chunks, 4-bit counts
				mm(15 downto  0) := chunk_mask(bx( 3 downto  0), M(5), Right);
				mm(31 downto 16) := chunk_mask(bx(19 downto 16), M(5), Right);
				mm(47 downto 32) := chunk_mask(bx(35 downto 32), M(5), Right);
				mm(63 downto 48) := chunk_mask(bx(51 downto 48), M(5), Right);
				ee := ee and bit_duplicate(bit_extract(aa, 16, 15), 16);
			else
				-- eight 8-bit chunks, 3-bit counts
				mm( 7 downto  0) := chunk_mask(bx( 2 downto  0), M(5), Right);
				mm(15 downto  8) := chunk_mask(bx(10 downto  8), M(5), Right);
				mm(23 downto 16) := chunk_mask(bx(18 downto 16), M(5), Right);
				mm(31 downto 24) := chunk_mask(bx(26 downto 24), M(5), Right);
				mm(39 downto 32) := chunk_mask(bx(34 downto 32), M(5), Right);
				mm(47 downto 40) := chunk_mask(bx(42 downto 40), M(5), Right);
				mm(55 downto 48) := chunk_mask(bx(50 downto 48), M(5), Right);
				mm(63 downto 56) := chunk_mask(bx(58 downto 56), M(5), Right);
				ee := ee and bit_duplicate(bit_extract(aa,  8,  7),  8);
			end if;

			-- select operation
			if to_X01(M(6) or M(7)) = '1' then
				-- rotate operation
				-- (RotR / RotL: pass the omega result through unmasked)
				yy := xx;
				zz := (others => 'X');
			else
				-- shift operation
				-- masked positions receive the fill bit on Y while the rotated
				-- bit is diverted to Z; unmasked positions keep the rotated
				-- bit on Y and give '0' on Z
				for i in L-1 downto 0 loop
					if to_X01(mm(i)) = '1' then
						yy(i) := ee(i);
						zz(i) := xx(i);
					else
						yy(i) := xx(i);
						zz(i) := '0';
					end if;
				end loop;
			end if;
			Y := yy;
			Z := zz;
		end bitwise;

		-- bb: shift counts after replication, yy/zz: results of all slices
		variable bb : std_ulogic_vector(WIDTH-1 downto 0);
		variable yy : std_ulogic_vector(WIDTH-1 downto 0);
		variable zz : std_ulogic_vector(WIDTH-1 downto 0);
	begin
		bb := B;

		-- shift count replicator
		-- Copies the low byte of each count into every byte of its chunk.
		-- (vector_duplicate is from work.Bit_Manipulation; presumably it
		-- repeats its first argument the given number of times -- confirm)
		if to_X01(CommonShiftCount) = '1' then
			-- one count (low byte of B) for the whole word
			bb := vector_duplicate(bb(7 downto 0), WIDTH/8);
		elsif to_X01(U(2)) = '1' then
			-- one count per 64-bit chunk
			for i in WIDTH/64-1 downto 0 loop
				bb(64*i+63 downto 64*i) :=
					vector_duplicate(bb(64*i+7 downto 64*i), 64/8);
			end loop;
		elsif to_X01(U(1)) = '1' then
			-- one count per 32-bit chunk
			for i in WIDTH/32-1 downto 0 loop
				bb(32*i+31 downto 32*i) :=
					vector_duplicate(bb(32*i+7 downto 32*i), 32/8);
			end loop;
		elsif to_X01(U(0)) = '1' then
			-- one count per 16-bit chunk
			for i in WIDTH/16-1 downto 0 loop
				bb(16*i+15 downto 16*i) :=
					vector_duplicate(bb(16*i+7 downto 16*i), 16/8);
			end loop;
		end if;
		-- otherwise (8-bit chunks) every byte already holds its own count

		-- independent 64-bit slices
		for i in WIDTH/64-1 downto 0 loop
			bitwise(A(64*i+63 downto 64*i), bb(64*i+63 downto 64*i), U,
				Mode, yy(64*i+63 downto 64*i), zz(64*i+63 downto 64*i));
		end loop;

		-- output signals
		Y_Bitwise <= yy;
		Y_BitExt <= zz;
	end process;

	-- bytewise stuff (byterev, mix, expand, permute/sdup, cshift, widen)
	process (A, B, U, Mode)
		-- mix[l/h] instructions
		-- Interleaves A and B chunk by chunk into a double-width result:
		-- A's chunk i ends up at chunk position 2*i, B's chunk i at 2*i+1.
		-- The byte interleave is always performed; each set U flag then
		-- regroups the result for the next larger chunk size (16/32/64).
		function do_mix (A, B : in std_ulogic_vector;
						 U : in std_ulogic_vector) return std_ulogic_vector is
			constant n : natural := A'length;
			variable even_src : std_ulogic_vector(n-1 downto 0);
			variable odd_src : std_ulogic_vector(n-1 downto 0);
			variable mixed : std_ulogic_vector(2*n-1 downto 0);
		begin
--pragma synthesis_off
			assert n mod 64 = 0;
			assert A'length = n;
			assert B'length = n;
			assert (U'left = 2) and (U'right = 0);
--pragma synthesis_on
			even_src := A;
			odd_src := B;
			mixed := (others => 'X');
			-- mix.8 (d=0): byte i of B goes above byte i of A
			for i in 0 to n/8-1 loop
				mixed(16*i+15 downto 16*i) :=
					odd_src(8*i+7 downto 8*i) & even_src(8*i+7 downto 8*i);
			end loop;
			-- mix.16 (d=1): exchange the two middle bytes of every 32 bits
			if to_X01(U(0)) = '1' then
				for i in 0 to n/16-1 loop
					mixed(32*i+23 downto 32*i+8) :=
						lrotate(mixed(32*i+23 downto 32*i+8), 8);
				end loop;
			end if;
			-- mix.32 (d=2): exchange the two middle 16-bit words of every
			-- 64 bits
			if to_X01(U(1)) = '1' then
				for i in 0 to n/32-1 loop
					mixed(64*i+47 downto 64*i+16) :=
						lrotate(mixed(64*i+47 downto 64*i+16), 16);
				end loop;
			end if;
			-- mix.64 (d=3): exchange the two middle 32-bit words of every
			-- 128 bits
			if to_X01(U(2)) = '1' then
				for i in 0 to n/64-1 loop
					mixed(128*i+95 downto 128*i+32) :=
						lrotate(mixed(128*i+95 downto 128*i+32), 32);
				end loop;
			end if;
			return mixed;
		end do_mix;

		-- expand[l/h] instructions
		-- Starts from the double-width word B & A and, for the chunk size
		-- selected by U ("000" = 8, "001" = 16, "011" = 32, "111" = 64
		-- bits), exchanges every odd-numbered chunk of the A half with the
		-- even-numbered chunk just below it (same pair index) in the B half.
		-- Any other U value yields all 'X'.
		function do_expand (A, B : in std_ulogic_vector;
							U : in std_ulogic_vector) return std_ulogic_vector is
			constant n : natural := A'length;
			variable lo_in : std_ulogic_vector(n-1 downto 0);
			variable hi_in : std_ulogic_vector(n-1 downto 0);
			variable res : std_ulogic_vector(2*n-1 downto 0);
			variable size_sel : std_ulogic_vector(2 downto 0);
		begin
--pragma synthesis_off
			assert n mod 64 = 0;
			assert A'length = n;
			assert B'length = n;
			assert (U'left = 2) and (U'right = 0);
--pragma synthesis_on
			lo_in := A;
			hi_in := B;
			res := hi_in & lo_in;
			size_sel := to_X01(U);
			case size_sel is
				when "000" =>
					-- 8-bit chunks
					for i in 0 to n/16-1 loop
						res(16*i+15 downto 16*i+8) := hi_in(16*i+7 downto 16*i);
						res(n+16*i+7 downto n+16*i) := lo_in(16*i+15 downto 16*i+8);
					end loop;
				when "001" =>
					-- 16-bit chunks
					for i in 0 to n/32-1 loop
						res(32*i+31 downto 32*i+16) := hi_in(32*i+15 downto 32*i);
						res(n+32*i+15 downto n+32*i) := lo_in(32*i+31 downto 32*i+16);
					end loop;
				when "011" =>
					-- 32-bit chunks
					for i in 0 to n/64-1 loop
						res(64*i+63 downto 64*i+32) := hi_in(64*i+31 downto 64*i);
						res(n+64*i+31 downto n+64*i) := lo_in(64*i+63 downto 64*i+32);
					end loop;
				when "111" =>
					-- 64-bit chunks; nothing to exchange in a 64-bit word
					if n > 64 then
						for i in 0 to n/128-1 loop
							res(128*i+127 downto 128*i+64) := hi_in(128*i+63 downto 128*i);
							res(n+128*i+63 downto n+128*i) := lo_in(128*i+127 downto 128*i+64);
						end loop;
					end if;
				when others =>
					res := (others => 'X');
			end case;
			return res;
		end do_expand;

		-- byterev instruction
		-- Reverses the byte order inside every chunk.  With no U flag set
		-- (8-bit chunks) the input is returned unchanged.  Stage s (s = 0,
		-- 1, 2) is enabled by U(s) and rotates every 16/32/64-bit group by
		-- half its width, i.e. exchanges the two halves of the group.
		function do_byterev (A : in std_ulogic_vector;
							 U : in std_ulogic_vector) return std_ulogic_vector is
			constant n : natural := A'length;
			variable res : std_ulogic_vector(n-1 downto 0);
		begin
--pragma synthesis_off
			assert n mod 64 = 0;
			assert A'length = n;
			assert (U'left = 2) and (U'right = 0);
--pragma synthesis_on
			-- byterev.8 (no-op), d=0
			res := A;
			-- byterev.16 / .32 / .64, d=1 / 2 / 3
			for stage in 0 to 2 loop
				if to_X01(U(stage)) = '1' then
					-- group width is 16 * 2**stage bits
					for i in 0 to n/(16*2**stage)-1 loop
						res((16*2**stage)*(i+1)-1 downto (16*2**stage)*i) :=
							lrotate(res((16*2**stage)*(i+1)-1 downto (16*2**stage)*i),
									8*2**stage);
					end loop;
				end if;
			end loop;
			return res;
		end do_byterev;

		-- widen instruction
		-- Sign-extends the lowest chunk of every 64-bit group of A to the
		-- full 64 bits.  The chunk size follows the first clear U flag:
		--   U(0) = '0'                 : bit  7 is copied to bits 63..8
		--   U(0) = '1', U(1) = '0'     : bit 15 is copied to bits 63..16
		--   U(1..0) = "11", U(2) = '0' : bit 31 is copied to bits 63..32
		-- If no flag reads as '0', A is returned unchanged.
		--
		-- The loops are bounded by the argument length L rather than by the
		-- entity generic WIDTH, so the function is self-contained like the
		-- other do_* helpers and stays correct for any multiple-of-64 input.
		function do_widen (A : in std_ulogic_vector;
						   U : in std_ulogic_vector) return std_ulogic_vector is
			constant L : natural := A'length;
			variable yy : std_ulogic_vector(L-1 downto 0);
		begin
--pragma synthesis_off
			assert L mod 64 = 0;
			assert A'length = L;
			assert (U'left = 2) and (U'right = 0);
--pragma synthesis_on
			yy := A;
			if to_X01(U(0)) = '0' then
				-- 8 -> 64 bits
				for i in L/64-1 downto 0 loop
					yy(64*i+63 downto 64*i+8) :=
						(64*i+63 downto 64*i+8 => yy(64*i+7));
				end loop;
			elsif to_X01(U(1)) = '0' then
				-- 16 -> 64 bits
				for i in L/64-1 downto 0 loop
					yy(64*i+63 downto 64*i+16) :=
						(64*i+63 downto 64*i+16 => yy(64*i+15));
				end loop;
			elsif to_X01(U(2)) = '0' then
				-- 32 -> 64 bits
				for i in L/64-1 downto 0 loop
					yy(64*i+63 downto 64*i+32) :=
						(64*i+63 downto 64*i+32 => yy(64*i+31));
				end loop;
			end if;
			return yy;
		end do_widen;

		-- select arbitrary chunks
		-- Treats A as L/N chunks of N bits (N = B'length) and returns the
		-- chunk whose index is given by the low bits of B, using a binary
		-- tree of 2:1 selections: level j keeps the odd member of each pair
		-- when B(j) is '1', the even member otherwise.  Levels stop once
		-- fewer than two chunks remain.
		function vsel (A, B : in std_ulogic_vector) return std_ulogic_vector is
			constant L : natural := A'length;
			constant N : natural := B'length;
			variable pool : std_ulogic_vector(L-1 downto 0);
			variable idx : std_ulogic_vector(N-1 downto 0);
			variable pair_bits : natural;
		begin
--pragma synthesis_off
			assert N >= 8;
			assert L mod N = 0;
			assert A'length = L;
			assert B'length = N;
--pragma synthesis_on
			pool := A;
			idx := to_X01(B);
			for level in 0 to 7 loop	-- limited to 256 chunks
				pair_bits := N * 2 ** (level + 1);
				exit when L < pair_bits;
				-- survivors are compacted towards the low end, so the pairs
				-- MUST be processed in ascending order
				if idx(level) = '1' then
					for i in 0 to L/pair_bits-1 loop
						pool(N*(i+1)-1 downto N*i) :=
							pool(N*(2*i+2)-1 downto N*(2*i+1));
					end loop;
				else
					for i in 0 to L/pair_bits-1 loop
						pool(N*(i+1)-1 downto N*i) :=
							pool(N*(2*i+1)-1 downto N*(2*i+0));
					end loop;
				end if;
			end loop;
			return pool(N-1 downto 0);
		end vsel;

		-- permute (vsel) instruction
		-- Every result byte is picked from A by vsel, using the matching
		-- byte of a selector vector derived from B.  For chunk sizes above
		-- 8 bits (U flags set) the selectors are first rewritten, one
		-- doubling step per set flag, before the byte-wise selection.
		function do_permute (A, B : in std_ulogic_vector;
							 U : in std_ulogic_vector) return std_ulogic_vector is
			constant L : natural := A'length;
			variable aa : std_ulogic_vector(L-1 downto 0);
			variable bb : std_ulogic_vector(L-1 downto 0);
			variable uu : std_ulogic_vector(2 downto 0);
			variable yy : std_ulogic_vector(L-1 downto 0);
		begin
--pragma synthesis_off
			assert L mod 64 = 0;
			assert A'length = L;
			assert B'length = L;
			assert (U'left = 2) and (U'right = 0);
--pragma synthesis_on
			aa := A;
			bb := B;
			uu := to_X01(U);
			yy := (others => 'X');
			-- Each step below: after the shift, bit 0 of every chunk is
			-- cleared in the lower half, and the upper half receives a copy
			-- of the lower half with its lowest bit set instead.
			-- NOTE(review): lshift is from work.Bit_Manipulation; presumably
			-- a logical left shift of the whole vector -- confirm.
			if uu(2) = '1' then
				-- 64-bit chunks -> pairs of 32-bit selectors
				bb := lshift(bb, 1);
				for i in L/64-1 downto 0 loop
					bb(64*i+63 downto 64*i+33) := bb(64*i+31 downto 64*i+1);
					bb(64*i+32) := '1';
					bb(64*i) := '0';
				end loop;
			end if;
			if uu(1) = '1' then
				-- 32-bit chunks -> pairs of 16-bit selectors
				bb := lshift(bb, 1);
				for i in L/32-1 downto 0 loop
					bb(32*i+31 downto 32*i+17) := bb(32*i+15 downto 32*i+1);
					bb(32*i+16) := '1';
					bb(32*i) := '0';
				end loop;
			end if;
			if uu(0) = '1' then
				-- 16-bit chunks -> pairs of 8-bit selectors
				bb := lshift(bb, 1);
				for i in L/16-1 downto 0 loop
					bb(16*i+15 downto 16*i+ 9) := bb(16*i+ 7 downto 16*i+1);
					bb(16*i+ 8) := '1';
					bb(16*i) := '0';
				end loop;
			end if;
			-- byte-wise selection from the whole of A
			for i in L/8-1 downto 0 loop
				yy(8*i+7 downto 8*i) := vsel(aa, bb(8*i+7 downto 8*i));
			end loop;
			return yy;
		end do_permute;

		-- cshiftl instruction
		-- Shifts A left by one chunk and fills the vacated low chunk from
		-- the low chunk of B.  The chunk is 8 bits wide with no U flag set;
		-- each set flag adds a further shift step (8, 16, 32 bits) and
		-- re-inserts the correspondingly wider low part of B.
		-- NOTE(review): lshift is from work.Bit_Manipulation; presumably a
		-- logical left shift by the given number of bits -- confirm.
		function do_cshiftl (A, B : in std_ulogic_vector;
							 U : in std_ulogic_vector) return std_ulogic_vector is
			constant L : natural := A'length;
			variable aa : std_ulogic_vector(L-1 downto 0);
			variable bb : std_ulogic_vector(L-1 downto 0);
			variable uu : std_ulogic_vector(2 downto 0);
			variable yy : std_ulogic_vector(L-1 downto 0);
		begin
--pragma synthesis_off
			assert L mod 64 = 0;
			assert A'length = L;
			assert B'length = L;
			assert (U'left = 2) and (U'right = 0);
--pragma synthesis_on
			aa := A;
			bb := B;
			-- normalize the SIMD flags once; the stages below test uu (as
			-- do_permute does) instead of converting each flag again
			uu := to_X01(U);
			-- cshiftl.8
			-- d=0
			yy := lshift(aa, 8);
			yy(7 downto 0) := bb(7 downto 0);
			-- cshiftl.16
			-- d=1
			if uu(0) = '1' then
				yy := lshift(yy, 8);
				yy(15 downto 0) := bb(15 downto 0);
			end if;
			-- cshiftl.32
			-- d=2
			if uu(1) = '1' then
				yy := lshift(yy, 16);
				yy(31 downto 0) := bb(31 downto 0);
			end if;
			-- cshiftl.64
			-- d=3
			if uu(2) = '1' then
				yy := lshift(yy, 32);
				yy(63 downto 0) := bb(63 downto 0);
			end if;
			return yy;
		end do_cshiftl;

		-- cshiftr instruction
		-- Shifts A right by one chunk and fills the vacated top chunk from
		-- the low chunk of B.  The chunk is 8 bits wide with no U flag set;
		-- each set flag adds a further shift step (8, 16, 32 bits) and
		-- inserts the next higher part of B's low chunk at the top.
		-- NOTE(review): rshift is from work.Bit_Manipulation; presumably a
		-- logical right shift by the given number of bits -- confirm.
		function do_cshiftr (A, B : in std_ulogic_vector;
							 U : in std_ulogic_vector) return std_ulogic_vector is
			constant L : natural := A'length;
			variable aa : std_ulogic_vector(L-1 downto 0);
			variable bb : std_ulogic_vector(L-1 downto 0);
			variable uu : std_ulogic_vector(2 downto 0);
			variable yy : std_ulogic_vector(L-1 downto 0);
		begin
--pragma synthesis_off
			assert L mod 64 = 0;
			assert A'length = L;
			assert B'length = L;
			assert (U'left = 2) and (U'right = 0);
--pragma synthesis_on
			aa := A;
			bb := B;
			-- normalize the SIMD flags once; the stages below test uu (as
			-- do_permute does) instead of converting each flag again
			uu := to_X01(U);
			-- cshiftr.8
			-- d=0
			yy := rshift(aa, 8);
			yy(L-1 downto L-8) := bb(7 downto 0);
			-- cshiftr.16
			-- d=1
			if uu(0) = '1' then
				yy := rshift(yy, 8);
				yy(L-1 downto L-8) := bb(15 downto 8);
			end if;
			-- cshiftr.32
			-- d=2
			if uu(1) = '1' then
				yy := rshift(yy, 16);
				yy(L-1 downto L-16) := bb(31 downto 16);
			end if;
			-- cshiftr.64
			-- d=3
			if uu(2) = '1' then
				yy := rshift(yy, 32);
				yy(L-1 downto L-32) := bb(63 downto 32);
			end if;
			return yy;
		end do_cshiftr;

		-- op: one-hot opcode (widen, byterev, permute, mix, expand, cshift)
		variable op : std_ulogic_vector(5 downto 0);
		-- lo / hi: primary and alternate result, wide: double-width result
		variable lo : std_ulogic_vector(WIDTH-1 downto 0);
		variable hi : std_ulogic_vector(WIDTH-1 downto 0);
		variable wide : std_ulogic_vector(2*WIDTH-1 downto 0);
	begin
		-- Mode(11) is widen; Mode(4 downto 0) are byterev, permute, mix,
		-- expand and cshift
		op := to_X01(Mode(11) & Mode(4 downto 0));

		if op = "100000" then
			-- widen: both operands are widened independently
			lo := do_widen(A, U);
			hi := do_widen(B, U);
		elsif op = "010000" then
			-- byterev: both operands are reversed independently
			lo := do_byterev(A, U);
			hi := do_byterev(B, U);
		elsif op = "001000" then
			-- permute
			lo := do_permute(A, B, U);
			-- Z = sdup(A, B): permute with B's low byte as every selector
			hi := vector_duplicate(B(7 downto 0), WIDTH/8);
			hi := do_permute(A, hi, U);
		elsif op = "000100" then
			-- mix
			-- XXX: swap input registers?
			wide := do_mix(A, B, U);
			lo := wide(WIDTH-1 downto 0);
			hi := wide(2*WIDTH-1 downto WIDTH);
		elsif op = "000010" then
			-- expand
			-- XXX: swap input registers?
			wide := do_expand(A, B, U);
			lo := wide(WIDTH-1 downto 0);
			hi := wide(2*WIDTH-1 downto WIDTH);
		elsif op = "000001" then
			-- cshift: left variant on Y, right variant on Z
			lo := do_cshiftl(A, B, U);
			hi := do_cshiftr(A, B, U);
		else
			-- no (or more than one, or an unknown) bytewise mode: don't care
			lo := (others => 'X');
			hi := (others => 'X');
		end if;

		-- output signals
		Y_Bytewise <= lo;
		Y_ByteExt <= hi;
	end process;

	-- output mux: forward the bytewise results when a bytewise mode is
	-- active, the bitwise (shift/rotate) results otherwise
	process (Mode_Byte, Y_Bitwise, Y_BitExt, Y_Bytewise, Y_ByteExt)
	begin
		if to_X01(Mode_Byte) = '1' then
			Y <= Y_Bytewise;
			Z <= Y_ByteExt;
		else
			Y <= Y_Bitwise;
			Z <= Y_BitExt;
		end if;
	end process;
end Behave_1;

-- vi: set ts=4 sw=4 equalprg="fmt -72 -p--": please
