--------------------------------------------------------------------------
-- f-cpu/vhdl/eu_popc/popc64.vhdl - POPCOUNT Execution Unit for the F-CPU
-- Copyright (C) 2002 Yann GUIDON (whygee@f-cpu.org)
--
-- created Sat Jun 29 23:47:54 CEST 2002 by whygee@f-cpu.org
-- version Sun Jun 30 05:43:05 CEST 2002 compiles.
--
--------------------------BEGIN-VHDL-LICENSE-----------------------------
-- This program is free software; you can redistribute it and/or modify
-- it under the terms of the GNU General Public License as published by
-- the Free Software Foundation; either version 2 of the License, or
-- (at your option) any later version.
--
-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with this program; if not, write to the Free Software
-- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
---------------------------END-VHDL-LICENSE------------------------------
--
-- This is the 64-bit "building block" for the POPCOUNT unit, it is
-- replicated as needed to form larger SIMD operators.
--
-- It is not particularly well optimised for anything, be careful...
-- I hope however that it is not dog slow for simulations.
-- Synthesis should be ok when Michael Riepe finds a way to reuse
-- his generic adders...
-- 
-- A larger version (128+) would be cool but is it necessary or implementable ?
--
--------------------------------------------------------------------------

LIBRARY ieee;
    USE ieee.std_logic_1164.ALL;
    USE ieee.numeric_std.all;

-- popc64 : interface of the 64-bit POPCOUNT building block.
--
-- Generics :
--   WIDTH     : operand/result width in bits.  Only 64 is supported; any
--               other value is rejected by the assertion below.
--   PIPELINED : must be 0, no pipelined version exists yet (also asserted).
--
-- Ports :
--   POPC_in_A, POPC_in_B : the two operands.
--   POPC_size            : SIMD lane size selector (see the table below).
--   POPC_out             : the result, same width as the operands.
Entity popc64 is
  generic (
    WIDTH : natural := 64;  -- do not change!
    PIPELINED : integer := 0
  );
  port(
    POPC_in_A,
    POPC_in_B : in std_ulogic_vector(WIDTH-1 downto 0); -- the 2 operands
    POPC_size : in std_ulogic_vector(2 downto 0);

-- Size :
-- "000" =>  8 bits
-- "001" => 16 bits
-- "010" => 32 bits
-- "100" => 64 bits
--
-- Architecture arch1 tests each bit separately and the highest bit that
-- is set wins : bit 2 selects 64-bit lanes, otherwise bit 1 selects
-- 32-bit lanes, otherwise bit 0 selects 16-bit lanes, otherwise the
-- lanes are 8 bits wide.

--    Clk       : in std_ulogic;

    POPC_out  : out std_ulogic_vector(WIDTH-1 downto 0) -- the result
  );

-- Elaboration-time sanity checks on the generics (simulation only).
--pragma synthesis_off
  begin
    assert WIDTH = 64
      report "width of popc64 must be 64"
      severity failure;
    assert PIPELINED = 0
      report "POPC is not yet pipelined !"
      severity failure;
--pragma synthesis_on
end popc64;

-- arch1 : behavioural description of the POPCOUNT unit.
--
-- Data flow :
--   1. the two operands are XORed together,
--   2. the set bits of every byte of that XOR are counted (lazy_popc),
--   3. a tree of adders merges the byte counts into 16-, 32- and 64-bit
--      lane counts,
--   4. a chain of 2-input multiplexers, steered by the bits of POPC_size,
--      picks which layer of the tree is sent to POPC_out.
-- Every count is zero-extended to the width of its lane.
Architecture arch1 of popc64 is
  -- Bitwise XOR of the two operands; the bits that are set in this
  -- vector are the ones being counted.
  signal tmp_XOR : std_ulogic_vector(WIDTH-1 downto 0);

  subtype count_08 is integer range 0 to  8; -- byte granularity
  subtype count_16 is integer range 0 to 16; -- 16-bit results
  subtype count_32 is integer range 0 to 32; -- 32-bit results
  subtype count_64 is integer range 0 to 64; -- 64-bit results

  -- One counter per lane.  The ranges end at N-1 so that every element
  -- is driven by exactly one iteration of the generate loops below
  -- (no spare, undriven element).
  type au_08 is array (WIDTH/8 -1 downto 0) of count_08;
  type au_16 is array (WIDTH/16-1 downto 0) of count_16;
  type au_32 is array (WIDTH/32-1 downto 0) of count_32;
  type au_64 is array (WIDTH/64-1 downto 0) of count_64;

  signal tmp_08 : au_08;
  signal tmp_16 : au_16;
  signal tmp_32 : au_32;
  signal tmp_64 : au_64;

  signal
    result_0, result_1, result_2, result_3, -- the partial results for the MUX.
    cascade1, cascade2 : std_ulogic_vector(WIDTH-1 downto 0); -- MUX chain


  -- lazy_popc : number of bits set to '1' in one byte.
  --
  --   bv      : the byte to examine.
  --   returns : the count, in the range 0 to 8.
  --
  -- The byte is split into two nibbles, each one is looked up in a
  -- 16-entry table, and the two table values are added.
  function lazy_popc (bv : in std_ulogic_vector(7 downto 0))
    return count_08
  is
    type t_lut is array (0 to 15) of integer range 0 to 4;
    -- popc_lut(n) = number of ones in the 4-bit binary code of n
    constant popc_lut : t_lut :=
     ( 0, 1, 1, 2,
       1, 2, 2, 3,  -- so the synthesizer is free to optimize
       1, 2, 2, 3,  -- the way it wants...
       2, 3, 3, 4);

  begin
    -- should be replaced with a carry-increment
    --  or carry-select unit...

    -- The first "working" version was a plain loop over bv'range that
    -- incremented a counter for every bit equal to '1'.

    return popc_lut(to_integer(UNSIGNED(bv(7 downto 4))))
         + popc_lut(to_integer(UNSIGNED(bv(3 downto 0))));

    -- shortcuts :
    -- MSB = and-reduce(bv)
    -- LSB = xor-reduce(bv)
  end lazy_popc;

begin

  -- the Hamming stuff :
  tmp_XOR <= POPC_in_A xor POPC_in_B;

  -- the first level of popcount : one count per byte,
  -- result_0 holds each count zero-extended to 8 bits.
  first_byte : for i in WIDTH/8-1 downto 0 generate
    tmp_08(i) <= lazy_popc(tmp_XOR(i*8+7 downto i*8));
    result_0(i*8+7   downto  i*8) <=
      std_ulogic_vector(to_unsigned(tmp_08(i),  8));
  end generate first_byte;

  -- the tree of adders (layer by layer)
  -- and the distributed MUX :

  -- 16-bit lanes : sum of two neighbouring byte counts.
  loop_add16 : for i in WIDTH/16-1 downto 0 generate
    tmp_16(i) <= tmp_08(i*2+1) + tmp_08(i*2);
    result_1(i*16+15 downto i*16) <=
      std_ulogic_vector(to_unsigned(tmp_16(i), 16));
  end generate loop_add16;

  -- insert pipeline stage here

  -- POPC_size(0) = '1' : 16-bit lanes, otherwise 8-bit lanes.
  with POPC_size(0) select
    cascade1 <= result_1 when '1',
                result_0 when others;

  -- 32-bit lanes : sum of two neighbouring 16-bit counts.
  loop_add32 : for i in WIDTH/32-1 downto 0 generate
    tmp_32(i) <= tmp_16(i*2+1) + tmp_16(i*2);
    result_2(i*32+31 downto i*32) <=
      std_ulogic_vector(to_unsigned(tmp_32(i), 32));
  end generate loop_add32;

  -- POPC_size(1) = '1' : 32-bit lanes, whatever POPC_size(0) says.
  with POPC_size(1) select
    cascade2 <= result_2 when '1',
                cascade1 when others;

  -- 64-bit lane : sum of the two 32-bit counts.
  loop_add64 : for i in WIDTH/64-1 downto 0 generate
    tmp_64(i) <= tmp_32(i*2+1) + tmp_32(i*2);
    result_3(i*64+63 downto i*64) <=
        std_ulogic_vector(to_unsigned(tmp_64(i), 64));
  end generate loop_add64;

  -- POPC_size(2) = '1' : 64-bit lane, whatever the two lower bits say.
  with POPC_size(2) select
    POPC_out <= result_3 when '1',
                cascade2 when others;

end;
