--------------------------------------------------------------------------
-- f-cpu/vhdl/eu_popc/popc64_RTL.vhdl - POPCOUNT Execution Unit for the F-CPU
-- Copyright (C) 2002 Yann GUIDON (whygee@f-cpu.org)
--
-- created Sat Jun 29 23:47:54 CEST 2002 by whygee@f-cpu.org
-- version Sun Jun 30 05:43:05 CEST 2002 compiles.
-- version Fri Jul  5 06:31:46 CEST 2002 with MR's RTL advices
--
--------------------------BEGIN-VHDL-LICENSE-----------------------------
-- This program is free software; you can redistribute it and/or modify
-- it under the terms of the GNU General Public License as published by
-- the Free Software Foundation; either version 2 of the License, or
-- (at your option) any later version.
--
-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with this program; if not, write to the Free Software
-- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
---------------------------END-VHDL-LICENSE------------------------------
--
-- This is the 64-bit "building block" for the POPCOUNT unit, it is
-- replicated as needed to form larger SIMD operators.
--
-- This version is more complex than the first one because it goes down
-- to the gate level. It may or may not synthesize well, and it simulates
-- more slowly than the first version.
--
--
-- Note : insert pipeline stages wherever needed !
--
--------------------------------------------------------------------------

LIBRARY ieee;
    USE ieee.std_logic_1164.ALL;
    USE work.popc64;
    USE work.Generic_Adder.CIAdd;

Architecture RTL of popc64 is
  -- Bitwise difference of the two operands.  Counting its set bits gives
  -- the Hamming distance between POPC_in_A and POPC_in_B.
  signal tmp_XOR : std_ulogic_vector(WIDTH-1 downto 0);

  -- Every level of the adder tree needs one more result bit than the
  -- level below it, because the maximum count doubles each time.
  subtype count_04 is std_ulogic_vector(2 downto 0); -- nibble results, 0..4
  subtype count_08 is std_ulogic_vector(3 downto 0); -- byte granularity, 0..8
  subtype count_16 is std_ulogic_vector(4 downto 0); -- 16-bit results, 0..16
  subtype count_32 is std_ulogic_vector(5 downto 0); -- 32-bit results, 0..32
  subtype count_64 is std_ulogic_vector(6 downto 0); -- 64-bit results, 0..64

  -- One partial count per chunk of the operand.  The ranges hold exactly
  -- the elements driven by the generate loops below (WIDTH/n of them), so
  -- that no element is left undriven.
  type au_08 is array (WIDTH/8-1  downto 0) of count_08;
  type au_16 is array (WIDTH/16-1 downto 0) of count_16;
  type au_32 is array (WIDTH/32-1 downto 0) of count_32;
  type au_64 is array (WIDTH/64-1 downto 0) of count_64;

  signal tmp_08 : au_08;
  signal tmp_16 : au_16;
  signal tmp_32 : au_32;
  signal tmp_64 : au_64;

  -- Draft (disabled) : signals for the size-selection MUX.
--  signal
--    result_0, result_1, result_2, result_3, -- the partial results for the MUX.
--    cascade1, cascade2, cascade3 : std_ulogic_vector(WIDTH-1 downto 0);

  -- Michael's "magic" function :
  -- Adds two vectors of equal length and returns a result that is ONE BIT
  -- WIDER than the operands, so the carry is never lost and the result
  -- matches the next count_xx subtype of the adder tree.
  --
  --   A, B   : operands, both of length w (B'length must equal A'length)
  --   return : (w downto 0) vector holding A + B
  --
  -- Both operands are zero-extended by one bit before being handed to
  -- CIAdd, so the sum always fits in the w+1 result bits.  This only
  -- relies on the first output of CIAdd being the plain sum; the other
  -- outputs are ignored.
  function "+" (A, B : in std_ulogic_vector) return std_ulogic_vector
  is
    constant w : natural := A'length;
    constant a_ext : std_ulogic_vector(w downto 0) := '0' & A;
    constant b_ext : std_ulogic_vector(w downto 0) := '0' & B;
    variable sum, inc : std_ulogic_vector(w downto 0);
    variable gen, prop : std_ulogic;
  begin
    CIAdd(a_ext, b_ext, sum, inc, gen, prop);
      -- we only care about the sum here
    return sum;
  end "+";

  -- this is the very first stage of the popcount operation :
  -- reduce a 4-bit vector into a 3-bit number (0 to 4).
  --
  --   bv     : the 4 input bits (count_08 is used here only because it is
  --            a 4-bit subtype, the argument is a raw nibble, not a count)
  --   return : number of bits of bv that are '1'
  function reduce4to3 (bv : in count_08)
    return count_04
  is
    variable t : count_04;
  begin
    -- bit 2 : all four inputs are set (count = 4)
    t(2) := bv(0) and bv(1) and bv(2) and bv(3);
    -- bit 1 : at least two inputs are set, but not all four (count = 2 or 3)
    t(1) := (( bv(0) and bv(1))
          or ( bv(0) and bv(2))
          or ( bv(0) and bv(3))
          or ( bv(1) and bv(2))
          or ( bv(1) and bv(3))
          or ( bv(2) and bv(3)))
          and not t(2);
    -- bit 0 : parity of the inputs (count is odd)
    t(0) := bv(0) xor bv(1) xor bv(2) xor bv(3);
    return t;
  end;

begin

  -- the Hamming stuff :
  tmp_XOR <= POPC_in_A xor POPC_in_B;

  -- the first level of popcount :
  -- each byte is split into two nibbles, each nibble is reduced to a
  -- 3-bit count and both counts are added into a 4-bit byte count.
  first_byte : for i in WIDTH/8-1 downto 0 generate
    tmp_08(i) <= reduce4to3(tmp_XOR(i*8+7 downto i*8+4))
               + reduce4to3(tmp_XOR(i*8+3 downto i*8));
  end generate first_byte;

  -- adder tree : pairs of neighbouring counts are summed at each level.
  loop_add16 : for i in WIDTH/16-1 downto 0 generate
    tmp_16(i) <= tmp_08(i*2+1) + tmp_08(i*2);
  end generate loop_add16;

  loop_add32 : for i in WIDTH/32-1 downto 0 generate
    tmp_32(i) <= tmp_16(i*2+1) + tmp_16(i*2);
  end generate loop_add32;

  loop_add64 : for i in WIDTH/64-1 downto 0 generate
    tmp_64(i) <= tmp_32(i*2+1) + tmp_32(i*2);
    -- Draft (disabled) : size-dependent output selection with POPC_size.
--    with POPC_size(2) select
--      POPC_out(i*64+6 downto i*64) <=
--        tmp_64(i) when '1',
--        tmp_32(i*2) & '0' when others;
--    POPC_out(i*64+63 downto i*64+38) <= (others => '0');
--    POPC_out(i*64+37 downto i*64+32) <= tmp_32(i*2+1) and (others => POPC_size(2));
--    POPC_out(i*64+37 downto i*64+32) <= (others => '0');
--      POPC_out(i*64+63 downto i*64) <=
--        tmp_64(i) & std_ulogic_vector(6 downto 0)'(others => '0') when '1',
--        (5 downto 0 => tmp_32(i*2), 37 downto 32 => tmp_32(i*2), others => '0') when others;
  end generate loop_add64;

  -- Only the full 64-bit count of the lowest 64-bit chunk is output for
  -- now : 7 result bits, with the remaining bits up to bit 63 tied to zero.
  POPC_out(6 downto 0) <= tmp_64(0);
  POPC_out(63 downto 7) <= (others => '0');

end;

--    result_0(i*8+7   downto  i*8) <= (3 downto 0 => tmp_08(i), others => '0');
--    result_1(i*16+15 downto i*16) <=
--      std_ulogic_vector(to_unsigned(tmp_16(i), 16));
--  with POPC_size(0) select
--    cascade1 <= result_1 when '1',
--                result_0 when others;
-- 
--    result_2(i*32+31 downto i*32) <=
--      std_ulogic_vector(to_unsigned(tmp_32(i), 32));
--
--  with POPC_size(1) select
--    cascade2 <= result_2 when '1',
--                cascade1 when others;
--
--    result_3(i*64+63 downto i*64) <=
--        std_ulogic_vector(to_unsigned(tmp_64(i), 64));
--
--  with POPC_size(2) select
--    POPC_out <= result_3 when '1',
--                cascade2 when others;
