-- imul3.vhdl - F-CPU 8x8-Bit Integer Multiplication Unit
-- Copyright (C) 2000 Michael Riepe <michael@s...>
--
-- This program is free software; you can redistribute it and/or modify
-- it under the terms of the GNU General Public License as published by
-- the Free Software Foundation; either version 2 of the License, or
-- (at your option) any later version.
--
-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with this program; if not, write to the Free Software
-- Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

-- $Id: imul3.vhdl,v 1.1 2000/10/07 23:40:07 michael Exp $

library IEEE;
use IEEE.std_logic_1164.all;
use IEEE.numeric_std.all;

--
-- Mul9x9 : the 9x9(8x8)-bit multiplier with optional accumulate input
--
-- Y receives the low 16 bits of A*B + X.  The architecture in this file
-- recodes A into three signed radix-8 digits and converts B with
-- signed(), so both operands are handled as two's-complement numbers.
--
entity Mul9x9 is
        port (
                -- multiplier operand, recoded digit by digit
                -- (will be changed to 8-bit later)
                A : in std_ulogic_vector(8 downto 0);
                -- multiplicand operand, multiples of which are summed
                -- (will be changed to 8-bit later)
                B : in std_ulogic_vector(8 downto 0);
                -- optional double-width `add' input; reads as zero when
                -- the port is left unconnected
                X : in std_ulogic_vector(15 downto 0) := (others => '0');
                -- double-width output
                Y : out std_ulogic_vector(15 downto 0)
        );
end Mul9x9;

architecture Arch_1 of Mul9x9 is
        --
        -- bmux : partial-product multiplexer (called in stage 2)
        --
        --   A  : 4-bit window of the multiplier.  A(3) requests negation;
        --        A(2 downto 0) xor A(3) picks the multiple of B.
        --   B  : multiplicand
        --   B3 : precomputed 3*B
        --
        -- Returns 0, B, 2*B, 3*B or 4*B sign-extended to 11 bits, bitwise
        -- inverted when A(3) = '1'.  The inversion is a one's complement
        -- only; the matching `+1' is injected separately through `z1'.
        --
        function bmux (
                A : in std_ulogic_vector(3 downto 0);
                B : in std_ulogic_vector(8 downto 0);
                B3 : in std_ulogic_vector(10 downto 0))
        return std_ulogic_vector is
                -- magnitude selector derived from the window
                variable pick : std_ulogic_vector(2 downto 0);
                -- selected (and possibly inverted) partial product
                variable pp : std_ulogic_vector(10 downto 0);
        begin
                -- NOTE: calculation of `pick' can be put in first stage if necessary
                for i in pick'range loop
                        pick(i) := A(i) xor A(3);
                end loop;

                case pick is
                        when "000" =>
                                -- 0
                                pp := (others => '0');
                        when "001" | "010" =>
                                -- 1*B, two sign bits in front
                                pp := B(B'left) & B(B'left) & B;
                        when "011" | "100" =>
                                -- 2*B, one sign bit in front, one zero behind
                                pp := B(B'left) & B & '0';
                        when "101" | "110" =>
                                -- 3*B already has full width
                                pp := B3;
                        when "111" =>
                                -- 4*B, two zeros behind
                                pp := B & "00";
                        when others =>
                                -- selector contains a non-'0'/'1' value
                                pp := (others => 'X');
                end case;

                -- subtract: deliver the one's complement
                if A(3) = '1' then
                        pp := not pp;
                end if;
                return pp;
        end bmux;
        --
        -- stage 1 : setup
        --
        signal a1 : std_ulogic_vector(8 downto 0);
        signal b1 : std_ulogic_vector(8 downto 0);
        signal b1_3 : std_ulogic_vector(10 downto 0);
        signal y1, z1 : std_ulogic_vector(16 downto 0);
        --
        -- stage 2 : muxes
        --
        signal b2a, b2b, b2c : std_ulogic_vector(10 downto 0);
        signal y2, z2 : std_ulogic_vector(16 downto 0);
        --
        -- stage 3 : carry-save adders
        --
        signal y3, z3 : std_ulogic_vector(16 downto 0);
        --
        -- stage 4 : final adder (drives Y)
        --
begin
        --
        -- setup stage
        --
        -- Forwards the operands, precalculates 3*B for the multiplexers,
        -- zero-extends the add input to 17 bits and builds the correction
        -- word that completes the two's complement of negated digits.
        --
        stage1 : process (A, B, X)
                -- `B' sign-extended to the width of `b1_3'
                variable b_ext : signed(b1_3'range);
        begin
                -- pass `A' and `B' through
                -- will handle signed/unsigned mode switching here (later)
                a1 <= A;
                b1 <= B;
                -- precalculate `3*B' as B + 2*B directly on the vector.
                -- Going through to_integer() would turn any metavalue in `B'
                -- into a plain 0 and hide unknown inputs from simulation;
                -- numeric_std "+" yields 'X' in that case instead.
                -- 11 bits hold the full range of 3*B for a 9-bit signed B.
                b_ext := resize(signed(B), b1_3'length);
                b1_3 <= std_ulogic_vector(b_ext + shift_left(b_ext, 1));
                -- optional input for multiply-and-add
                y1 <= "0" & X;
                -- `magic' correction value for subtractions: bmux only
                -- inverts, so add 1 at the weight of every negated digit
                -- (digit sign bits are A(2), A(5), A(8); weights 2**0, 2**3, 2**6)
                z1 <= (6 => A(8), 3 => A(5), 0 => A(2), others => '0');
        end process;

        --
        -- input mux stage
        --
        -- Cuts `a1' into three overlapping 4-bit windows (the low window
        -- is padded with a '0' below bit 0) and lets bmux pick one
        -- partial product per window.
        --
        stage2 : process (a1, b1, b1_3, y1, z1)
                -- multiplier windows, least significant first
                variable win0, win1, win2 : std_ulogic_vector(3 downto 0);
        begin
                win0 := a1(2 downto 0) & '0';
                win1 := a1(5 downto 2);
                win2 := a1(8 downto 5);

                -- input multiplexers
                b2a <= bmux(A => win0, B => b1, B3 => b1_3);
                b2b <= bmux(A => win1, B => b1, B3 => b1_3);
                b2c <= bmux(A => win2, B => b1, B3 => b1_3);

                -- `y1' and `z1' are handed on unchanged
                z2 <= z1;
                y2 <= y1;
        end process;

        --
        -- adder farm stage
        --
        -- Three chained carry-save adder rows reduce the add input, the
        -- correction word and the three partial products to one sum
        -- vector (`y3') and one carry vector (`z3').
        --
        stage3 : process (b2a, b2b, b2c, y2, z2)
                -- width of one adder row
                subtype row_t is std_ulogic_vector(16 downto 0);

                -- Sign-extend an 11-bit partial product to row width and move
                -- it `sh' positions to the left (wired shift: zeros enter at
                -- the bottom, bits leaving at the top are dropped).
                function align (
                        pp : in std_ulogic_vector(10 downto 0);
                        sh : in natural)
                return row_t is
                        variable w : row_t;
                begin
                        for i in w'range loop
                                if i < sh then
                                        w(i) := '0';
                                elsif i - sh <= pp'left then
                                        w(i) := pp(i - sh);
                                else
                                        w(i) := pp(pp'left);
                                end if;
                        end loop;
                        return w;
                end align;

                -- One carry-save adder row: bitwise sum of three operands plus
                -- their majority shifted left by one bit (wired shift).
                procedure csa (
                        p, q, r : in row_t;
                        sum, carry : out row_t) is
                        variable maj : row_t;
                begin
                        sum := p xor q xor r;
                        maj := (p and q) or (p and r) or (q and r);
                        carry := maj(maj'left-1 downto 0) & "0";
                end csa;

                -- sum outputs
                variable s1, s2, s3 : row_t;
                -- carry outputs
                variable c1, c2, c3 : row_t;
        begin
                -- adder row #1: low digit, weight 2**0
                csa(y2, z2, align(b2a, 0), s1, c1);
                -- adder row #2: middle digit, weight 2**3
                csa(s1, c1, align(b2b, 3), s2, c2);
                -- adder row #3: high digit, weight 2**6
                csa(s2, c2, align(b2c, 6), s3, c3);

                -- output drivers
                y3 <= s3;
                z3 <= c3;
        end process;

        --
        -- output adder stage
        --
        -- Resolves the carry-save pair from stage 3 with one ordinary
        -- addition and drives the low 16 bits onto `Y'.
        --
        stage4 : process (y3, z3)
                -- full-width sum of the sum and carry vectors
                variable total : unsigned(y3'range);
        begin
                -- high-bit adder
                total := unsigned(y3) + unsigned(z3);
                -- output driver (bit 16 is discarded)
                Y <= std_ulogic_vector(total(Y'length - 1 downto 0));
        end process;
end Arch_1;

-- vi: set ts=4 sw=4 : please
