From: "Carl Brannen" <carl.brannen@terabeam.com>
Newsgroups: comp.arch.fpga
Subject: Divide by 3, with remainder, efficient and fast, for Altera or Xilinx
Date: Tue, 18 Dec 2001 23:13:45 +0000 (UTC)
Organization: Mailgate.ORG Server - http://www.Mailgate.ORG
Lines: 496
Message-ID: <48a39dc55b77e0cdb6c1f9895bfe4eb6.51709@mygate.mailgate.org>
NNTP-Posting-Host: firewall.terabeam.com
Content-Type: text/plain; charset=ISO-8859-1
Content-Transfer-Encoding: 8bit
X-Trace: news.mailgate.org 1008715124 10527 216.137.15.2 (Wed Dec 19 00:13:45 2001)
X-Complaints-To: abuse@mailgate.org
NNTP-Posting-Date: Tue, 18 Dec 2001 23:13:45 +0000 (UTC)
Injector-Info: news.mailgate.org; posting-host=firewall.terabeam.com; posting-account=51709; posting-date=1008715124
User-Agent: Mailgate Web Server
X-URL: http://www.Mailgate.ORG
Path: chonsp.franklin.ch!pfaff.ethz.ch!news-zh.switch.ch!news.mailgate.org!web2news!firewall.terabeam.com!not-for-mail
Xref: chonsp.franklin.ch comp.arch.fpga:12515

Let's see how long a VHDL chunk will fit in this forum...

library IEEE;
use IEEE.std_logic_1164.all;

-- Divide by 3 circuit example.  Uses only 85 LUTs
-- (up to 4-input) to compute the quotient and
-- remainder when a 32-bit input is divided by 3.
--
-- Designer:  Carl Brannen
--
-- Feel free to modify this circuit and use it in
-- your own designs. I am aware of no patents that
-- it infringes on, but you will have to make your
-- own determination of this.  My only request is
-- that you leave a comment to the effect that your
-- knowledge of the algorithm is through me.
--
-- Synthesize with optimize set for "low", and 
-- "area".  This circuit is already optimized,
-- the computer will only be waste its time (and likely
-- increase the size and delay of the result) if it
-- tries to optimize further.
--
-- This code was written in response to this post
-- on the comp.arch.FPGA thread:
--
-- <<<
-- "I need to implement in an fpga an algorithm that will divide an integer
-- by 3.  The dividend length is still to be determined but will be
-- somewhere between 20 and 30 bits, and the divisor is always the number
-- 3.
-- 
-- Does anyone know an efficient combinatoric algorithm that can accomplish
-- this?
-- >>>
--
-- http://www.fpga-faq.com/archives/11400.html#11409

entity DIV32_3 is
    port (
        CLK:    in  STD_LOGIC;
        AIN:    in  STD_LOGIC_VECTOR(31 downto 0);
        REMOUT: out STD_LOGIC_VECTOR( 1 downto 0);
        QOUT:   out STD_LOGIC_VECTOR(30 downto 0);
        TEST:   out STD_LOGIC_VECTOR(53 downto 0)
    );
end DIV32_3;

architecture DIV32_3_arch of DIV32_3 is

-- Partial remainders:
signal R1V:  STD_LOGIC_VECTOR(15 downto 0);  --  16 LUTs
signal R2V:  STD_LOGIC_VECTOR( 7 downto 0);  --   8 LUTs
signal R4V:  STD_LOGIC_VECTOR( 3 downto 0);  --   4 LUTs
signal R8V:  STD_LOGIC_VECTOR( 1 downto 0);  --   2 LUTs
signal P4V:  STD_LOGIC_VECTOR( 1 downto 0);  --   2 LUTs
signal P3V:  STD_LOGIC_VECTOR( 1 downto 0);  --   2 LUTs
signal P2V:  STD_LOGIC_VECTOR( 1 downto 0);  --   2 LUTs
signal P1V:  STD_LOGIC_VECTOR( 1 downto 0);  --   2 LUTs
signal P0V:  STD_LOGIC_VECTOR( 1 downto 0);  --   2 LUTs

-- Rearrangement of partial remainders only:
signal PRV:  STD_LOGIC_VECTOR(15 downto 0);  --  16 LUTs

-- carries internal to blocks:
signal X0V:  STD_LOGIC_VECTOR(13 downto 0);  --  14 LUTs

-- Flip-flop for QOUT:
signal QOUTQ,QOUTD: STD_LOGIC_VECTOR(30 downto 0);  --  31 LUTs

-- Flip-flop for Remainder:
signal REMQ,REMD:   STD_LOGIC_VECTOR( 1 downto 0);


--                                           --  -------

-- Total LUT count:                          --  85 LUTs (45 slices or 23 CLBs)

-- Force tool to not "optimize" (i.e. bloat) the design
-- by creating a set of flip-flop outputs.
signal FKD,FKQ: STD_LOGIC_VECTOR(53 downto 0);


begin

-- Scheme for quickest determination of remainder when dividing by 3.
-- Example, 32-bit input, R80 provides the 2-bit remainder result in
-- just 4 stages of 4-input LUTs:
--
--                   AIN
-- 3322 2222 2222 1111 1111 1100 0000 0000
-- 1098 7654 3210 9876 5432 1098 7654 3210
-- ---- ---- ---- ---- ---- ---- ---- ----
--  R17  R16  R15  R14  R13  R12  R11  R10
--   \   /     \   /     \   /     \   /
--    R23       R22       R21       R20
--        \    /              \    /
--         R41                 R40
--            -----\    /-----
--                  R80
--
-- Scheme for computing quotients from the above
-- remainder scheme.  More partial remainders have
-- to be computed, as compared to the above remainders,
-- these ones are called PRs.
--
-- The quotient for the highest four bits is computed
-- directly from (no greater than 4-input) LUTs.  The
-- lower quotients all require a remainder input.  That
-- remainder allows direct computation of the quotient
-- for the high two bits, and a partial remainder needs
-- to be computed to get the lower 2 bits as well.  The
-- following diagram suppresses Rxx that aren't used, and
-- only shows how the PRs are calculated.  The lowest
-- value in each column gives the Rxx or PRx that computes
-- the partial remainder at that column:
--
--                   AIN
-- 3322 2222 2222 1111 1111 1100 0000 0000
-- 1098 7654 3210 9876 5432 1098 7654 3210
-- ---- ---- ---- ---- ---- ---- ---- ----
--  R17       R15       R13       R11
--             |         |         |
--            P4         |   R21   |
--           /           |    |  \ |
--       R23             P3   P2   P1
--                      /     /    |
--                     /     /     P0
--                    /-----/-----/
--                 R41
--
-- ---- ---- ---- ---- ---- ---- ---- ----
--  R17  R23  P4   R41  P3   P2   P0   R8
--  PR7  PR6  PR5  PR4  PR3  PR2  PR1  PR0
--
--
-- From the above tables, it's clear that the
-- longest computation is that of the quotient
-- at position 0, as would be expected.  The
-- number of stages of logic is only 6, and the
-- longest paths are as follows:
--
-- R17 R16 R15 R14 R13 R12
--   \  /    \  /    \  /
--   R23     R22     R21 
--      \    /         \ 
--       R41            P1
--            \     /
--              P0
--               |
--              Remainder[3:2]
--               |
--              Quotient[1:0]
--
--
-- In order to make the VHDL shorter, I've packed
-- the remainders into longer STD_LOGIC_VECTORs
-- as follows:
-- R1V <= R17 & R16 & ... R10
-- R2V <= R23 & R22 & R21 & R20
-- R4V <= R41 & R40
-- R8V <= R80
--
--
-- I normally don't like to complicate things any more
-- than they have to, but I hate to have to create all
-- those unnecessary "SEL" assignments.

-- If Xilinx would support a select statement like this:
--
--     with AIN(4*I+3 downto 4*I)
--
-- I wouldn't have to do this this way, but this is the first
-- way to implement this that comes to mind.  I guess I could
-- define LUT4s, since none of these are trivial, but that wouldn't
-- port to Altera.

-- Generate the R1x logic (16 LUTs)
G1: for I in 0 to 7 generate
R1V(I*2 + 0)
 <= ((not AIN(4*I+3)) and (not AIN(4*I+2)) and (not AIN(4*I+1)) and (
AIN(4*I+0)))
 or ((not AIN(4*I+3)) and (    AIN(4*I+2)) and (not AIN(4*I+1)) and (not
AIN(4*I+0)))
 or ((not AIN(4*I+3)) and (    AIN(4*I+2)) and (    AIN(4*I+1)) and (
AIN(4*I+0)))
 or ((    AIN(4*I+3)) and (not AIN(4*I+2)) and (    AIN(4*I+1)) and (not
AIN(4*I+0)))
 or ((    AIN(4*I+3)) and (    AIN(4*I+2)) and (not AIN(4*I+1)) and (
AIN(4*I+0)));
R1V(I*2 + 1)
 <= ((not AIN(4*I+3)) and (not AIN(4*I+2)) and (    AIN(4*I+1)) and (not
AIN(4*I+0)))
 or ((not AIN(4*I+3)) and (    AIN(4*I+2)) and (not AIN(4*I+1)) and (
AIN(4*I+0)))
 or ((    AIN(4*I+3)) and (not AIN(4*I+2)) and (not AIN(4*I+1)) and (not
AIN(4*I+0)))
 or ((    AIN(4*I+3)) and (not AIN(4*I+2)) and (    AIN(4*I+1)) and (
AIN(4*I+0)))
 or ((    AIN(4*I+3)) and (    AIN(4*I+2)) and (    AIN(4*I+1)) and (not
AIN(4*I+0)));
end generate;

-- Generate the R2x logic (8 LUTs)
G2: for I in 0 to 3 generate
R2V(I*2 + 0)
 <= ((not R1V(4*I+3)) and (not R1V(4*I+2)) and (not R1V(4*I+1)) and (
R1V(4*I+0)))
 or ((not R1V(4*I+3)) and (    R1V(4*I+2)) and (not R1V(4*I+1)) and (not
R1V(4*I+0)))
 or ((not R1V(4*I+3)) and (    R1V(4*I+2)) and (    R1V(4*I+1)) and (
R1V(4*I+0)))
 or ((    R1V(4*I+3)) and (not R1V(4*I+2)) and (    R1V(4*I+1)) and (not
R1V(4*I+0)))
 or ((    R1V(4*I+3)) and (    R1V(4*I+2)) and (not R1V(4*I+1)) and (
R1V(4*I+0)));
R2V(I*2 + 1)
 <= ((not R1V(4*I+3)) and (not R1V(4*I+2)) and (    R1V(4*I+1)) and (not
R1V(4*I+0)))
 or ((not R1V(4*I+3)) and (    R1V(4*I+2)) and (not R1V(4*I+1)) and (
R1V(4*I+0)))
 or ((    R1V(4*I+3)) and (not R1V(4*I+2)) and (not R1V(4*I+1)) and (not
R1V(4*I+0)))
 or ((    R1V(4*I+3)) and (not R1V(4*I+2)) and (    R1V(4*I+1)) and (
R1V(4*I+0)))
 or ((    R1V(4*I+3)) and (    R1V(4*I+2)) and (    R1V(4*I+1)) and (not
R1V(4*I+0)));
end generate;

-- Generate the R4x logic (4 LUTs)
G4: for I in 0 to 1 generate
R4V(I*2 + 0)
 <= ((not R2V(4*I+3)) and (not R2V(4*I+2)) and (not R2V(4*I+1)) and (
R2V(4*I+0)))
 or ((not R2V(4*I+3)) and (    R2V(4*I+2)) and (not R2V(4*I+1)) and (not
R2V(4*I+0)))
 or ((not R2V(4*I+3)) and (    R2V(4*I+2)) and (    R2V(4*I+1)) and (
R2V(4*I+0)))
 or ((    R2V(4*I+3)) and (not R2V(4*I+2)) and (    R2V(4*I+1)) and (not
R2V(4*I+0)))
 or ((    R2V(4*I+3)) and (    R2V(4*I+2)) and (not R2V(4*I+1)) and (
R2V(4*I+0)));
R4V(I*2 + 1)
 <= ((not R2V(4*I+3)) and (not R2V(4*I+2)) and (    R2V(4*I+1)) and (not
R2V(4*I+0)))
 or ((not R2V(4*I+3)) and (    R2V(4*I+2)) and (not R2V(4*I+1)) and (
R2V(4*I+0)))
 or ((    R2V(4*I+3)) and (not R2V(4*I+2)) and (not R2V(4*I+1)) and (not
R2V(4*I+0)))
 or ((    R2V(4*I+3)) and (not R2V(4*I+2)) and (    R2V(4*I+1)) and (
R2V(4*I+0)))
 or ((    R2V(4*I+3)) and (    R2V(4*I+2)) and (    R2V(4*I+1)) and (not
R2V(4*I+0)));
end generate;

-- The R80 logic: (2 LUTs)
R8V(0)
 <= ((not R4V(3)) and (not R4V(2)) and (not R4V(1)) and (    R4V(0)))
 or ((not R4V(3)) and (    R4V(2)) and (not R4V(1)) and (not R4V(0)))
 or ((not R4V(3)) and (    R4V(2)) and (    R4V(1)) and (    R4V(0)))
 or ((    R4V(3)) and (not R4V(2)) and (    R4V(1)) and (not R4V(0)))
 or ((    R4V(3)) and (    R4V(2)) and (not R4V(1)) and (    R4V(0)));
R8V(1)
 <= ((not R4V(3)) and (not R4V(2)) and (    R4V(1)) and (not R4V(0)))
 or ((not R4V(3)) and (    R4V(2)) and (not R4V(1)) and (    R4V(0)))
 or ((    R4V(3)) and (not R4V(2)) and (not R4V(1)) and (not R4V(0)))
 or ((    R4V(3)) and (not R4V(2)) and (    R4V(1)) and (    R4V(0)))
 or ((    R4V(3)) and (    R4V(2)) and (    R4V(1)) and (not R4V(0)));

-- P4 = R23 # R15 (2 LUTs)
P4V(0)
 <= ((not R2V(7)) and (not R2V(6)) and (not R1V(11)) and (    R1V(10)))
 or ((not R2V(7)) and (    R2V(6)) and (not R1V(11)) and (not R1V(10)))
 or ((not R2V(7)) and (    R2V(6)) and (    R1V(11)) and (    R1V(10)))
 or ((    R2V(7)) and (not R2V(6)) and (    R1V(11)) and (not R1V(10)))
 or ((    R2V(7)) and (    R2V(6)) and (not R1V(11)) and (    R1V(10)));
P4V(1)
 <= ((not R2V(7)) and (not R2V(6)) and (    R1V(11)) and (not R1V(10)))
 or ((not R2V(7)) and (    R2V(6)) and (not R1V(11)) and (    R1V(10)))
 or ((    R2V(7)) and (not R2V(6)) and (not R1V(11)) and (not R1V(10)))
 or ((    R2V(7)) and (not R2V(6)) and (    R1V(11)) and (    R1V(10)))
 or ((    R2V(7)) and (    R2V(6)) and (    R1V(11)) and (not R1V(10)));

-- P3 = R41 # R13 (2 LUTs)
P3V(0)
 <= ((not R4V(3)) and (not R4V(2)) and (not R1V(7)) and (    R1V(6)))
 or ((not R4V(3)) and (    R4V(2)) and (not R1V(7)) and (not R1V(6)))
 or ((not R4V(3)) and (    R4V(2)) and (    R1V(7)) and (    R1V(6)))
 or ((    R4V(3)) and (not R4V(2)) and (    R1V(7)) and (not R1V(6)))
 or ((    R4V(3)) and (    R4V(2)) and (not R1V(7)) and (    R1V(6)));
P3V(1)
 <= ((not R4V(3)) and (not R4V(2)) and (    R1V(7)) and (not R1V(6)))
 or ((not R4V(3)) and (    R4V(2)) and (not R1V(7)) and (    R1V(6)))
 or ((    R4V(3)) and (not R4V(2)) and (not R1V(7)) and (not R1V(6)))
 or ((    R4V(3)) and (not R4V(2)) and (    R1V(7)) and (    R1V(6)))
 or ((    R4V(3)) and (    R4V(2)) and (    R1V(7)) and (not R1V(6)));

-- P2 = R41 # R21 (2 LUTs)
P2V(0)
 <= ((not R4V(3)) and (not R4V(2)) and (not R2V(3)) and (    R2V(2)))
 or ((not R4V(3)) and (    R4V(2)) and (not R2V(3)) and (not R2V(2)))
 or ((not R4V(3)) and (    R4V(2)) and (    R2V(3)) and (    R2V(2)))
 or ((    R4V(3)) and (not R4V(2)) and (    R2V(3)) and (not R2V(2)))
 or ((    R4V(3)) and (    R4V(2)) and (not R2V(3)) and (    R2V(2)));
P2V(1)
 <= ((not R4V(3)) and (not R4V(2)) and (    R2V(3)) and (not R2V(2)))
 or ((not R4V(3)) and (    R4V(2)) and (not R2V(3)) and (    R2V(2)))
 or ((    R4V(3)) and (not R4V(2)) and (not R2V(3)) and (not R2V(2)))
 or ((    R4V(3)) and (not R4V(2)) and (    R2V(3)) and (    R2V(2)))
 or ((    R4V(3)) and (    R4V(2)) and (    R2V(3)) and (not R2V(2)));

-- P1 = R11 # R21 (2 LUTs)
P1V(0)
 <= ((not R1V(3)) and (not R1V(2)) and (not R2V(3)) and (    R2V(2)))
 or ((not R1V(3)) and (    R1V(2)) and (not R2V(3)) and (not R2V(2)))
 or ((not R1V(3)) and (    R1V(2)) and (    R2V(3)) and (    R2V(2)))
 or ((    R1V(3)) and (not R1V(2)) and (    R2V(3)) and (not R2V(2)))
 or ((    R1V(3)) and (    R1V(2)) and (not R2V(3)) and (    R2V(2)));
P1V(1)
 <= ((not R1V(3)) and (not R1V(2)) and (    R2V(3)) and (not R2V(2)))
 or ((not R1V(3)) and (    R1V(2)) and (not R2V(3)) and (    R2V(2)))
 or ((    R1V(3)) and (not R1V(2)) and (not R2V(3)) and (not R2V(2)))
 or ((    R1V(3)) and (not R1V(2)) and (    R2V(3)) and (    R2V(2)))
 or ((    R1V(3)) and (    R1V(2)) and (    R2V(3)) and (not R2V(2)));

-- P0 = R41 # PR1 (2 LUTs)
P0V(0)
 <= ((not R4V(3)) and (not R4V(2)) and (not P1V(1)) and (    P1V(0)))
 or ((not R4V(3)) and (    R4V(2)) and (not P1V(1)) and (not P1V(0)))
 or ((not R4V(3)) and (    R4V(2)) and (    P1V(1)) and (    P1V(0)))
 or ((    R4V(3)) and (not R4V(2)) and (    P1V(1)) and (not P1V(0)))
 or ((    R4V(3)) and (    R4V(2)) and (not P1V(1)) and (    P1V(0)));
P0V(1)
 <= ((not R4V(3)) and (not R4V(2)) and (    P1V(1)) and (not P1V(0)))
 or ((not R4V(3)) and (    R4V(2)) and (not P1V(1)) and (    P1V(0)))
 or ((    R4V(3)) and (not R4V(2)) and (not P1V(1)) and (not P1V(0)))
 or ((    R4V(3)) and (not R4V(2)) and (    P1V(1)) and (    P1V(0)))
 or ((    R4V(3)) and (    R4V(2)) and (    P1V(1)) and (not P1V(0)));

-- Assemble the partial remainders into the inputs for the quotient
-- calculations:
PRV(15 downto 0)         --      Remainder of 
  <= R1V(15 downto 14)   -- R17  AIN[31:28]
  &  R2V( 7 downto  6)   -- R23  AIN[31:24]
  &  P4V( 1 downto  0)   -- P4   AIN[31:20]
  &  R4V( 3 downto  2)   -- R41  AIN[31:16]
  &  P3V( 1 downto  0)   -- P3   AIN[31:12]
  &  P2V( 1 downto  0)   -- P2   AIN[31:8]
  &  P0V( 1 downto  0)   -- P0   AIN[31:4]
  &  R8V( 1 downto  0);  -- R8   AIN[31:0]

-- The highest quotient block has no remainder coming in,
-- so compute it directly: (3 LUTs)
with AIN(31 downto 28) select
  QOUTD(30 downto 28) <=
    "000" when "0000" | "0001" | "0010",
    "001" when "0011" | "0100" | "0101",
    "010" when "0110" | "0111" | "1000",
    "011" when "1001" | "1010" | "1011",
    "100" when "1100" | "1101" | "1110",
    "101" when others;

-- 0000 00
-- 0001 00
-- 0010 00

-- 0011 01
-- 0100 01
-- 0101 01

-- 0110 10
-- 0111 10
-- 1000 10

-- 1001 11
-- 1010 11
-- 1011 11


-- Compute the quotient in blocks of 4 bits
Q: for I in 0 to 6 generate
  -- Top two bits are computed directly from the carry-in and AIN
 QOUTD(4*I + 2)
  <= ((not PRV(2*I+3)) and (not PRV(2*I+2)) and (    AIN(4*I+3)) and (
AIN(4*I+2)))
  or ((not PRV(2*I+3)) and (    PRV(2*I+2)) and (not AIN(4*I+3)) and (not
AIN(4*I+2)))
  or ((not PRV(2*I+3)) and (    PRV(2*I+2)) and (not AIN(4*I+3)) and (
AIN(4*I+2)))
  or ((    PRV(2*I+3)) and (not PRV(2*I+2)) and (not AIN(4*I+3)) and (
AIN(4*I+2)))
  or ((    PRV(2*I+3)) and (not PRV(2*I+2)) and (    AIN(4*I+3)) and (not
AIN(4*I+2)))
  or ((    PRV(2*I+3)) and (not PRV(2*I+2)) and (    AIN(4*I+3)) and (
AIN(4*I+2)));
 QOUTD(4*I + 3)
  <= ((not PRV(2*I+3)) and (    PRV(2*I+2)) and (    AIN(4*I+3)) and (not
AIN(4*I+2)))
  or ((not PRV(2*I+3)) and (    PRV(2*I+2)) and (    AIN(4*I+3)) and (
AIN(4*I+2)))
  or ((    PRV(2*I+3)) and (not PRV(2*I+2)) and (not AIN(4*I+3)) and (not
AIN(4*I+2)))
  or ((    PRV(2*I+3)) and (not PRV(2*I+2)) and (not AIN(4*I+3)) and (
AIN(4*I+2)))
  or ((    PRV(2*I+3)) and (not PRV(2*I+2)) and (    AIN(4*I+3)) and (not
AIN(4*I+2)))
  or ((    PRV(2*I+3)) and (not PRV(2*I+2)) and (    AIN(4*I+3)) and (
AIN(4*I+2)));
  -- I need to compute the remainder out of the top two bits for the lower two
bits
 X0V(2*I + 0)
  <= ((not PRV(2*I+3)) and (not PRV(2*I+2)) and (not AIN(4*I+3)) and (
AIN(4*I+2)))
  or ((not PRV(2*I+3)) and (    PRV(2*I+2)) and (not AIN(4*I+3)) and (not
AIN(4*I+2)))
  or ((not PRV(2*I+3)) and (    PRV(2*I+2)) and (    AIN(4*I+3)) and (
AIN(4*I+2)))
  or ((    PRV(2*I+3)) and (not PRV(2*I+2)) and (    AIN(4*I+3)) and (not
AIN(4*I+2)))
  or ((    PRV(2*I+3)) and (    PRV(2*I+2)) and (not AIN(4*I+3)) and (
AIN(4*I+2)));
 X0V(2*I + 1)
  <= ((not PRV(2*I+3)) and (not PRV(2*I+2)) and (    AIN(4*I+3)) and (not
AIN(4*I+2)))
  or ((not PRV(2*I+3)) and (    PRV(2*I+2)) and (not AIN(4*I+3)) and (
AIN(4*I+2)))
  or ((    PRV(2*I+3)) and (not PRV(2*I+2)) and (not AIN(4*I+3)) and (not
AIN(4*I+2)))
  or ((    PRV(2*I+3)) and (not PRV(2*I+2)) and (    AIN(4*I+3)) and (
AIN(4*I+2)))
  or ((    PRV(2*I+3)) and (    PRV(2*I+2)) and (    AIN(4*I+3)) and (not
AIN(4*I+2)));
  -- Now I can compute the lowest two bits:
 QOUTD(4*I + 0)
  <= ((not X0V(2*I+1)) and (not X0V(2*I+0)) and (    AIN(4*I+1)) and (
AIN(4*I+0)))
  or ((not X0V(2*I+1)) and (    X0V(2*I+0)) and (not AIN(4*I+1)) and (not
AIN(4*I+0)))
  or ((not X0V(2*I+1)) and (    X0V(2*I+0)) and (not AIN(4*I+1)) and (
AIN(4*I+0)))
  or ((    X0V(2*I+1)) and (not X0V(2*I+0)) and (not AIN(4*I+1)) and (
AIN(4*I+0)))
  or ((    X0V(2*I+1)) and (not X0V(2*I+0)) and (    AIN(4*I+1)) and (not
AIN(4*I+0)))
  or ((    X0V(2*I+1)) and (not X0V(2*I+0)) and (    AIN(4*I+1)) and (
AIN(4*I+0)));
 QOUTD(4*I + 1)
  <= ((not X0V(2*I+1)) and (    X0V(2*I+0)) and (    AIN(4*I+1)) and (not
AIN(4*I+0)))
  or ((not X0V(2*I+1)) and (    X0V(2*I+0)) and (    AIN(4*I+1)) and (
AIN(4*I+0)))
  or ((    X0V(2*I+1)) and (not X0V(2*I+0)) and (not AIN(4*I+1)) and (not
AIN(4*I+0)))
  or ((    X0V(2*I+1)) and (not X0V(2*I+0)) and (not AIN(4*I+1)) and (
AIN(4*I+0)))
  or ((    X0V(2*I+1)) and (not X0V(2*I+0)) and (    AIN(4*I+1)) and (not
AIN(4*I+0)))
  or ((    X0V(2*I+1)) and (not X0V(2*I+0)) and (    AIN(4*I+1)) and (
AIN(4*I+0)));
end generate;


-- The following flip-flop implication of all the partial logic is included
only
-- to prevent the synthesis tool from optimizing out the beautiful logic I've
-- created.
FKD(53 downto 0) <= R1V & R2V & R4V & R8V & P4V & P3V & P2V & P1V & P0V & X0V;

-- Register the remainder
REMD(1 downto 0) <= PRV(1 downto 0);

process (CLK)
begin
  if CLK'event and CLK='1' then
    FKQ   <= FKD;
    QOUTQ <= QOUTD;
    REMQ  <= REMD;
  end if;
end process;


-- Output assignment:

REMOUT <= REMQ(1 downto 0);
QOUT   <= QOUTQ(30 downto 0);

-- Test output (unused, but included for synthesis restriction).
TEST   <= FKQ;

end DIV32_3_arch;


-- 
Posted from firewall.terabeam.com [216.137.15.2] 
via Mailgate.ORG Server - http://www.Mailgate.ORG

######

From: "Carl Brannen" <carl.brannen@terabeam.com>
Newsgroups: comp.arch.fpga
Subject: Re: Divide by 3, with remainder, efficient and fast, for Altera or Xilinx
Date: Wed, 19 Dec 2001 04:58:05 +0000 (UTC)
Organization: Mailgate.ORG Server - http://www.Mailgate.ORG
Lines: 72
Message-ID: <da41417fa3901d20f4822f851463130a.51709@mygate.mailgate.org>
References: <48a39dc55b77e0cdb6c1f9895bfe4eb6.51709@mygate.mailgate.org>
NNTP-Posting-Host: firewall.terabeam.com
Content-Type: text/plain; charset=ISO-8859-1
Content-Transfer-Encoding: 8bit
X-Trace: news.mailgate.org 1008715124 10527 216.137.15.2 (Wed Dec 19 05:58:05 2001)
X-Complaints-To: abuse@mailgate.org
NNTP-Posting-Date: Wed, 19 Dec 2001 04:58:05 +0000 (UTC)
Injector-Info: news.mailgate.org; posting-host=firewall.terabeam.com; posting-account=51709; posting-date=1008715124
User-Agent: Mailgate Web Server
X-URL: http://www.Mailgate.ORG
Path: chonsp.franklin.ch!pfaff.ethz.ch!news-zh.switch.ch!news.mailgate.org!web2news!firewall.terabeam.com!not-for-mail
Xref: chonsp.franklin.ch comp.arch.fpga:12501

Performance of above divide by 3 circuit.  Period objective is 5ns,
all inputs and outputs are registered (with internal flip-flops).
The thing actually achieved 103.907MHz, which is very good for a
fall through divide by 3 circuit.  No floor planning.  Part is
a small VirtexE-8:

<<<
Xilinx Mapping Report File for Design 'divide'
Copyright (c) 1995-2000 Xilinx, Inc.  All rights reserved.

Design Information
------------------
Command Line   : map -p xcv50e-8-cs144 -o map.ncd divide.ngd divide.pcf 
Target Device  : xv50e
Target Package : cs144
Target Speed   : -8
Mapper Version : virtexe -- D.27
Mapped Date    : Tue Dec 18 20:51:33 2001

Design Summary
--------------
   Number of errors:      0
   Number of warnings:    1
   Number of Slices:                 76 out of    768    9%
   Number of Slices containing
      unrelated logic:                0 out of     76    0%
   Number of Slice Flip Flops:       98 out of  1,536    6%
   Number of 4 input LUTs:           85 out of  1,536    5%
   Number of bonded IOBs:            65 out of     94   69%
   Number of GCLKs:                   1 out of      4   25%
   Number of GCLKIOBs:                1 out of      4   25%
Total equivalent gate count for design:  1,294
Additional JTAG gate count for IOBs:  3,168
>>>

<<<
The Number of signals not completely routed for this design is: 0

   The Average Connection Delay for this design is:        0.836 ns
   The Maximum Pin Delay is:                               2.583 ns
   The Average Connection Delay on the 10 Worst Nets is:   1.968 ns


--------------------------------------------------------------------------------
  Constraint                                | Requested  | Actual     | Logic 
                                            |            |            | Levels

--------------------------------------------------------------------------------
* NET "CLK" PERIOD =  5 nS   LOW 50.000 %   | 5.000ns    | 9.624ns    | 7    

--------------------------------------------------------------------------------

1 constraint not met.
Dumping design to file divide.ncd.
>>>

<<<
Constraints cover 8188 paths, 0 nets, and 447 connections (93.1% coverage)

Design statistics:
   Minimum period:   9.624ns (Maximum frequency: 103.907MHz)

Analysis completed Tue Dec 18 20:52:40 2001
>>>

Carl


-- 
Posted from firewall.terabeam.com [216.137.15.2] 
via Mailgate.ORG Server - http://www.Mailgate.ORG

######

From: "Eric Pearson" <ecp@mgl.ca>
Newsgroups: comp.arch.fpga
Subject: Re: Divide by 3, with remainder, efficient and fast, for Altera or Xilinx
Date: Wed, 19 Dec 2001 08:28:37 -0500
Organization: Posted via Supernews, http://www.supernews.com
Message-ID: <u215h0681n965@corp.supernews.com>
References: <48a39dc55b77e0cdb6c1f9895bfe4eb6.51709@mygate.mailgate.org>
X-Priority: 3
X-MSMail-Priority: Normal
X-Newsreader: Microsoft Outlook Express 5.50.4133.2400
X-MimeOLE: Produced By Microsoft MimeOLE V5.50.4133.2400
X-Complaints-To: newsabuse@supernews.com
Lines: 525
Path: chonsp.franklin.ch!pfaff.ethz.ch!news-zh.switch.ch!news.mailgate.org!news-spur1.maxwell.syr.edu!news.maxwell.syr.edu!newsfeed.stanford.edu!sn-xit-01!sn-post-01!supernews.com!corp.supernews.com!not-for-mail
Xref: chonsp.franklin.ch comp.arch.fpga:12518

My favorite "divide by 3" circuit is to multiply by 85 and then divide by
256.
85 is 0x55. To multiply by 0x55, only 2 adders are required.

Input A,
Output B = A / 3

t1 = A + (A<<2);    -- multiply by 0x05
t2 = t1 + (t1<<4);    -- multiply by 0x11
B = t2 >> 8;            -- divide by 256

Not quite as elegant as below, but very fast and small. It is useful when
the remainder is not required as in the orginal post.

Eric Pearson


"Carl Brannen" <carl.brannen@terabeam.com> wrote in message
news:48a39dc55b77e0cdb6c1f9895bfe4eb6.51709@mygate.mailgate.org...
> Let's see how long a VHDL chunk will fit in this forum...
>
> library IEEE;
> use IEEE.std_logic_1164.all;
>
> -- Divide by 3 circuit example.  Uses only 85 LUTs
> -- (up to 4-input) to compute the quotient and
> -- remainder when a 32-bit input is divided by 3.
> --
> -- Designer:  Carl Brannen
> --
> -- Feel free to modify this circuit and use it in
> -- your own designs. I am aware of no patents that
> -- it infringes on, but you will have to make your
> -- own determination of this.  My only request is
> -- that you leave a comment to the effect that your
> -- knowledge of the algorithm is through me.
> --
> -- Synthesize with optimize set for "low", and
> -- "area".  This circuit is already optimized,
> -- the computer will only be waste its time (and likely
> -- increase the size and delay of the result) if it
> -- tries to optimize further.
> --
> -- This code was written in response to this post
> -- on the comp.arch.FPGA thread:
> --
> -- <<<
> -- "I need to implement in an fpga an algorithm that will divide an
integer
> -- by 3.  The dividend length is still to be determined but will be
> -- somewhere between 20 and 30 bits, and the divisor is always the number
> -- 3.
> --
> -- Does anyone know an efficient combinatoric algorithm that can
accomplish
> -- this?
> -- >>>
> --
> -- http://www.fpga-faq.com/archives/11400.html#11409
>
> entity DIV32_3 is
>     port (
>         CLK:    in  STD_LOGIC;
>         AIN:    in  STD_LOGIC_VECTOR(31 downto 0);
>         REMOUT: out STD_LOGIC_VECTOR( 1 downto 0);
>         QOUT:   out STD_LOGIC_VECTOR(30 downto 0);
>         TEST:   out STD_LOGIC_VECTOR(53 downto 0)
>     );
> end DIV32_3;
>
> architecture DIV32_3_arch of DIV32_3 is
>
> -- Partial remainders:
> signal R1V:  STD_LOGIC_VECTOR(15 downto 0);  --  16 LUTs
> signal R2V:  STD_LOGIC_VECTOR( 7 downto 0);  --   8 LUTs
> signal R4V:  STD_LOGIC_VECTOR( 3 downto 0);  --   4 LUTs
> signal R8V:  STD_LOGIC_VECTOR( 1 downto 0);  --   2 LUTs
> signal P4V:  STD_LOGIC_VECTOR( 1 downto 0);  --   2 LUTs
> signal P3V:  STD_LOGIC_VECTOR( 1 downto 0);  --   2 LUTs
> signal P2V:  STD_LOGIC_VECTOR( 1 downto 0);  --   2 LUTs
> signal P1V:  STD_LOGIC_VECTOR( 1 downto 0);  --   2 LUTs
> signal P0V:  STD_LOGIC_VECTOR( 1 downto 0);  --   2 LUTs
>
> -- Rearrangement of partial remainders only:
> signal PRV:  STD_LOGIC_VECTOR(15 downto 0);  --  16 LUTs
>
> -- carries internal to blocks:
> signal X0V:  STD_LOGIC_VECTOR(13 downto 0);  --  14 LUTs
>
> -- Flip-flop for QOUT:
> signal QOUTQ,QOUTD: STD_LOGIC_VECTOR(30 downto 0);  --  31 LUTs
>
> -- Flip-flop for Remainder:
> signal REMQ,REMD:   STD_LOGIC_VECTOR( 1 downto 0);
>
>
> --                                           --  -------
>
> -- Total LUT count:                          --  85 LUTs (45 slices or 23
CLBs)
>
> -- Force tool to not "optimize" (i.e. bloat) the design
> -- by creating a set of flip-flop outputs.
> signal FKD,FKQ: STD_LOGIC_VECTOR(53 downto 0);
>
>
> begin
>
> -- Scheme for quickest determination of remainder when dividing by 3.
> -- Example, 32-bit input, R80 provides the 2-bit remainder result in
> -- just 4 stages of 4-input LUTs:
> --
> --                   AIN
> -- 3322 2222 2222 1111 1111 1100 0000 0000
> -- 1098 7654 3210 9876 5432 1098 7654 3210
> -- ---- ---- ---- ---- ---- ---- ---- ----
> --  R17  R16  R15  R14  R13  R12  R11  R10
> --   \   /     \   /     \   /     \   /
> --    R23       R22       R21       R20
> --        \    /              \    /
> --         R41                 R40
> --            -----\    /-----
> --                  R80
> --
> -- Scheme for computing quotients from the above
> -- remainder scheme.  More partial remainders have
> -- to be computed, as compared to the above remainders,
> -- these ones are called PRs.
> --
> -- The quotient for the highest four bits is computed
> -- directly from (no greater than 4-input) LUTs.  The
> -- lower quotients all require a remainder input.  That
> -- remainder allows direct computation of the quotient
> -- for the high two bits, and a partial remainder needs
> -- to be computed to get the lower 2 bits as well.  The
> -- following diagram suppresses Rxx that aren't used, and
> -- only shows how the PRs are calculated.  The lowest
> -- value in each column gives the Rxx or PRx that computes
> -- the partial remainder at that column:
> --
> --                   AIN
> -- 3322 2222 2222 1111 1111 1100 0000 0000
> -- 1098 7654 3210 9876 5432 1098 7654 3210
> -- ---- ---- ---- ---- ---- ---- ---- ----
> --  R17       R15       R13       R11
> --             |         |         |
> --            P4         |   R21   |
> --           /           |    |  \ |
> --       R23             P3   P2   P1
> --                      /     /    |
> --                     /     /     P0
> --                    /-----/-----/
> --                 R41
> --
> -- ---- ---- ---- ---- ---- ---- ---- ----
> --  R17  R23  P4   R41  P3   P2   P0   R8
> --  PR7  PR6  PR5  PR4  PR3  PR2  PR1  PR0
> --
> --
> -- From the above tables, it's clear that the
> -- longest computation is that of the quotient
> -- at position 0, as would be expected.  The
> -- number of stages of logic is only 6, and the
> -- longest paths are as follows:
> --
> -- R17 R16 R15 R14 R13 R12
> --   \  /    \  /    \  /
> --   R23     R22     R21
> --      \    /         \
> --       R41            P1
> --            \     /
> --              P0
> --               |
> --              Remainder[3:2]
> --               |
> --              Quotient[1:0]
> --
> --
> -- In order to make the VHDL shorter, I've packed
> -- the remainders into longer STD_LOGIC_VECTORs
> -- as follows:
> -- R1V <= R17 & R16 & ... R10
> -- R2V <= R23 & R22 & R21 & R20
> -- R4V <= R41 & R40
> -- R8V <= R80
> --
> --
> -- I normally don't like to complicate things any more
> -- than they have to, but I hate to have to create all
> -- those unnecessary "SEL" assignments.
>
> -- If Xilinx would support a select statement like this:
> --
> --     with AIN(4*I+3 downto 4*I)
> --
> -- I wouldn't have to do this this way, but this is the first
> -- way to implement this that comes to mind.  I guess I could
> -- define LUT4s, since none of these are trivial, but that wouldn't
> -- port to Altera.
>
> -- Generate the R1x logic (16 LUTs)
> G1: for I in 0 to 7 generate
> R1V(I*2 + 0)
>  <= ((not AIN(4*I+3)) and (not AIN(4*I+2)) and (not AIN(4*I+1)) and (
> AIN(4*I+0)))
>  or ((not AIN(4*I+3)) and (    AIN(4*I+2)) and (not AIN(4*I+1)) and (not
> AIN(4*I+0)))
>  or ((not AIN(4*I+3)) and (    AIN(4*I+2)) and (    AIN(4*I+1)) and (
> AIN(4*I+0)))
>  or ((    AIN(4*I+3)) and (not AIN(4*I+2)) and (    AIN(4*I+1)) and (not
> AIN(4*I+0)))
>  or ((    AIN(4*I+3)) and (    AIN(4*I+2)) and (not AIN(4*I+1)) and (
> AIN(4*I+0)));
> R1V(I*2 + 1)
>  <= ((not AIN(4*I+3)) and (not AIN(4*I+2)) and (    AIN(4*I+1)) and (not
> AIN(4*I+0)))
>  or ((not AIN(4*I+3)) and (    AIN(4*I+2)) and (not AIN(4*I+1)) and (
> AIN(4*I+0)))
>  or ((    AIN(4*I+3)) and (not AIN(4*I+2)) and (not AIN(4*I+1)) and (not
> AIN(4*I+0)))
>  or ((    AIN(4*I+3)) and (not AIN(4*I+2)) and (    AIN(4*I+1)) and (
> AIN(4*I+0)))
>  or ((    AIN(4*I+3)) and (    AIN(4*I+2)) and (    AIN(4*I+1)) and (not
> AIN(4*I+0)));
> end generate;
>
> -- Generate the R2x logic (8 LUTs)
> G2: for I in 0 to 3 generate
> R2V(I*2 + 0)
>  <= ((not R1V(4*I+3)) and (not R1V(4*I+2)) and (not R1V(4*I+1)) and (
> R1V(4*I+0)))
>  or ((not R1V(4*I+3)) and (    R1V(4*I+2)) and (not R1V(4*I+1)) and (not
> R1V(4*I+0)))
>  or ((not R1V(4*I+3)) and (    R1V(4*I+2)) and (    R1V(4*I+1)) and (
> R1V(4*I+0)))
>  or ((    R1V(4*I+3)) and (not R1V(4*I+2)) and (    R1V(4*I+1)) and (not
> R1V(4*I+0)))
>  or ((    R1V(4*I+3)) and (    R1V(4*I+2)) and (not R1V(4*I+1)) and (
> R1V(4*I+0)));
> R2V(I*2 + 1)
>  <= ((not R1V(4*I+3)) and (not R1V(4*I+2)) and (    R1V(4*I+1)) and (not
> R1V(4*I+0)))
>  or ((not R1V(4*I+3)) and (    R1V(4*I+2)) and (not R1V(4*I+1)) and (
> R1V(4*I+0)))
>  or ((    R1V(4*I+3)) and (not R1V(4*I+2)) and (not R1V(4*I+1)) and (not
> R1V(4*I+0)))
>  or ((    R1V(4*I+3)) and (not R1V(4*I+2)) and (    R1V(4*I+1)) and (
> R1V(4*I+0)))
>  or ((    R1V(4*I+3)) and (    R1V(4*I+2)) and (    R1V(4*I+1)) and (not
> R1V(4*I+0)));
> end generate;
>
> -- Generate the R4x logic (4 LUTs)
> G4: for I in 0 to 1 generate
> R4V(I*2 + 0)
>  <= ((not R2V(4*I+3)) and (not R2V(4*I+2)) and (not R2V(4*I+1)) and (
> R2V(4*I+0)))
>  or ((not R2V(4*I+3)) and (    R2V(4*I+2)) and (not R2V(4*I+1)) and (not
> R2V(4*I+0)))
>  or ((not R2V(4*I+3)) and (    R2V(4*I+2)) and (    R2V(4*I+1)) and (
> R2V(4*I+0)))
>  or ((    R2V(4*I+3)) and (not R2V(4*I+2)) and (    R2V(4*I+1)) and (not
> R2V(4*I+0)))
>  or ((    R2V(4*I+3)) and (    R2V(4*I+2)) and (not R2V(4*I+1)) and (
> R2V(4*I+0)));
> R4V(I*2 + 1)
>  <= ((not R2V(4*I+3)) and (not R2V(4*I+2)) and (    R2V(4*I+1)) and (not
> R2V(4*I+0)))
>  or ((not R2V(4*I+3)) and (    R2V(4*I+2)) and (not R2V(4*I+1)) and (
> R2V(4*I+0)))
>  or ((    R2V(4*I+3)) and (not R2V(4*I+2)) and (not R2V(4*I+1)) and (not
> R2V(4*I+0)))
>  or ((    R2V(4*I+3)) and (not R2V(4*I+2)) and (    R2V(4*I+1)) and (
> R2V(4*I+0)))
>  or ((    R2V(4*I+3)) and (    R2V(4*I+2)) and (    R2V(4*I+1)) and (not
> R2V(4*I+0)));
> end generate;
>
> -- The R80 logic: (2 LUTs)
> R8V(0)
>  <= ((not R4V(3)) and (not R4V(2)) and (not R4V(1)) and (    R4V(0)))
>  or ((not R4V(3)) and (    R4V(2)) and (not R4V(1)) and (not R4V(0)))
>  or ((not R4V(3)) and (    R4V(2)) and (    R4V(1)) and (    R4V(0)))
>  or ((    R4V(3)) and (not R4V(2)) and (    R4V(1)) and (not R4V(0)))
>  or ((    R4V(3)) and (    R4V(2)) and (not R4V(1)) and (    R4V(0)));
> R8V(1)
>  <= ((not R4V(3)) and (not R4V(2)) and (    R4V(1)) and (not R4V(0)))
>  or ((not R4V(3)) and (    R4V(2)) and (not R4V(1)) and (    R4V(0)))
>  or ((    R4V(3)) and (not R4V(2)) and (not R4V(1)) and (not R4V(0)))
>  or ((    R4V(3)) and (not R4V(2)) and (    R4V(1)) and (    R4V(0)))
>  or ((    R4V(3)) and (    R4V(2)) and (    R4V(1)) and (not R4V(0)));
>
> -- P4 = R23 # R15 (2 LUTs)
> P4V(0)
>  <= ((not R2V(7)) and (not R2V(6)) and (not R1V(11)) and (    R1V(10)))
>  or ((not R2V(7)) and (    R2V(6)) and (not R1V(11)) and (not R1V(10)))
>  or ((not R2V(7)) and (    R2V(6)) and (    R1V(11)) and (    R1V(10)))
>  or ((    R2V(7)) and (not R2V(6)) and (    R1V(11)) and (not R1V(10)))
>  or ((    R2V(7)) and (    R2V(6)) and (not R1V(11)) and (    R1V(10)));
> P4V(1)
>  <= ((not R2V(7)) and (not R2V(6)) and (    R1V(11)) and (not R1V(10)))
>  or ((not R2V(7)) and (    R2V(6)) and (not R1V(11)) and (    R1V(10)))
>  or ((    R2V(7)) and (not R2V(6)) and (not R1V(11)) and (not R1V(10)))
>  or ((    R2V(7)) and (not R2V(6)) and (    R1V(11)) and (    R1V(10)))
>  or ((    R2V(7)) and (    R2V(6)) and (    R1V(11)) and (not R1V(10)));
>
> -- P3 = R41 # R13 (2 LUTs)
> P3V(0)
>  <= ((not R4V(3)) and (not R4V(2)) and (not R1V(7)) and (    R1V(6)))
>  or ((not R4V(3)) and (    R4V(2)) and (not R1V(7)) and (not R1V(6)))
>  or ((not R4V(3)) and (    R4V(2)) and (    R1V(7)) and (    R1V(6)))
>  or ((    R4V(3)) and (not R4V(2)) and (    R1V(7)) and (not R1V(6)))
>  or ((    R4V(3)) and (    R4V(2)) and (not R1V(7)) and (    R1V(6)));
> P3V(1)
>  <= ((not R4V(3)) and (not R4V(2)) and (    R1V(7)) and (not R1V(6)))
>  or ((not R4V(3)) and (    R4V(2)) and (not R1V(7)) and (    R1V(6)))
>  or ((    R4V(3)) and (not R4V(2)) and (not R1V(7)) and (not R1V(6)))
>  or ((    R4V(3)) and (not R4V(2)) and (    R1V(7)) and (    R1V(6)))
>  or ((    R4V(3)) and (    R4V(2)) and (    R1V(7)) and (not R1V(6)));
>
> -- P2 = R41 # R21 (2 LUTs)
> P2V(0)
>  <= ((not R4V(3)) and (not R4V(2)) and (not R2V(3)) and (    R2V(2)))
>  or ((not R4V(3)) and (    R4V(2)) and (not R2V(3)) and (not R2V(2)))
>  or ((not R4V(3)) and (    R4V(2)) and (    R2V(3)) and (    R2V(2)))
>  or ((    R4V(3)) and (not R4V(2)) and (    R2V(3)) and (not R2V(2)))
>  or ((    R4V(3)) and (    R4V(2)) and (not R2V(3)) and (    R2V(2)));
> P2V(1)
>  <= ((not R4V(3)) and (not R4V(2)) and (    R2V(3)) and (not R2V(2)))
>  or ((not R4V(3)) and (    R4V(2)) and (not R2V(3)) and (    R2V(2)))
>  or ((    R4V(3)) and (not R4V(2)) and (not R2V(3)) and (not R2V(2)))
>  or ((    R4V(3)) and (not R4V(2)) and (    R2V(3)) and (    R2V(2)))
>  or ((    R4V(3)) and (    R4V(2)) and (    R2V(3)) and (not R2V(2)));
>
> -- P1 = R11 # R21 (2 LUTs)
> P1V(0)
>  <= ((not R1V(3)) and (not R1V(2)) and (not R2V(3)) and (    R2V(2)))
>  or ((not R1V(3)) and (    R1V(2)) and (not R2V(3)) and (not R2V(2)))
>  or ((not R1V(3)) and (    R1V(2)) and (    R2V(3)) and (    R2V(2)))
>  or ((    R1V(3)) and (not R1V(2)) and (    R2V(3)) and (not R2V(2)))
>  or ((    R1V(3)) and (    R1V(2)) and (not R2V(3)) and (    R2V(2)));
> P1V(1)
>  <= ((not R1V(3)) and (not R1V(2)) and (    R2V(3)) and (not R2V(2)))
>  or ((not R1V(3)) and (    R1V(2)) and (not R2V(3)) and (    R2V(2)))
>  or ((    R1V(3)) and (not R1V(2)) and (not R2V(3)) and (not R2V(2)))
>  or ((    R1V(3)) and (not R1V(2)) and (    R2V(3)) and (    R2V(2)))
>  or ((    R1V(3)) and (    R1V(2)) and (    R2V(3)) and (not R2V(2)));
>
> -- P0 = R41 # PR1 (2 LUTs)
> P0V(0)
>  <= ((not R4V(3)) and (not R4V(2)) and (not P1V(1)) and (    P1V(0)))
>  or ((not R4V(3)) and (    R4V(2)) and (not P1V(1)) and (not P1V(0)))
>  or ((not R4V(3)) and (    R4V(2)) and (    P1V(1)) and (    P1V(0)))
>  or ((    R4V(3)) and (not R4V(2)) and (    P1V(1)) and (not P1V(0)))
>  or ((    R4V(3)) and (    R4V(2)) and (not P1V(1)) and (    P1V(0)));
> P0V(1)
>  <= ((not R4V(3)) and (not R4V(2)) and (    P1V(1)) and (not P1V(0)))
>  or ((not R4V(3)) and (    R4V(2)) and (not P1V(1)) and (    P1V(0)))
>  or ((    R4V(3)) and (not R4V(2)) and (not P1V(1)) and (not P1V(0)))
>  or ((    R4V(3)) and (not R4V(2)) and (    P1V(1)) and (    P1V(0)))
>  or ((    R4V(3)) and (    R4V(2)) and (    P1V(1)) and (not P1V(0)));
>
> -- Assemble the partial remainders into the inputs for the quotient
> -- calculations:
> PRV(15 downto 0)         --      Remainder of
>   <= R1V(15 downto 14)   -- R17  AIN[31:28]
>   &  R2V( 7 downto  6)   -- R23  AIN[31:24]
>   &  P4V( 1 downto  0)   -- P4   AIN[31:20]
>   &  R4V( 3 downto  2)   -- R41  AIN[31:16]
>   &  P3V( 1 downto  0)   -- P3   AIN[31:12]
>   &  P2V( 1 downto  0)   -- P2   AIN[31:8]
>   &  P0V( 1 downto  0)   -- P0   AIN[31:4]
>   &  R8V( 1 downto  0);  -- R8   AIN[31:0]
>
> -- The highest quotient block has no remainder coming in,
> -- so compute it directly: (3 LUTs)
> with AIN(31 downto 28) select
>   QOUTD(30 downto 28) <=
>     "000" when "0000" | "0001" | "0010",
>     "001" when "0011" | "0100" | "0101",
>     "010" when "0110" | "0111" | "1000",
>     "011" when "1001" | "1010" | "1011",
>     "100" when "1100" | "1101" | "1110",
>     "101" when others;
>
> -- 0000 00
> -- 0001 00
> -- 0010 00
>
> -- 0011 01
> -- 0100 01
> -- 0101 01
>
> -- 0110 10
> -- 0111 10
> -- 1000 10
>
> -- 1001 11
> -- 1010 11
> -- 1011 11
>
>
> -- Compute the quotient in blocks of 4 bits
> Q: for I in 0 to 6 generate
>   -- Top two bits are computed directly from the carry-in and AIN
>  QOUTD(4*I + 2)
>   <= ((not PRV(2*I+3)) and (not PRV(2*I+2)) and (    AIN(4*I+3)) and (
> AIN(4*I+2)))
>   or ((not PRV(2*I+3)) and (    PRV(2*I+2)) and (not AIN(4*I+3)) and (not
> AIN(4*I+2)))
>   or ((not PRV(2*I+3)) and (    PRV(2*I+2)) and (not AIN(4*I+3)) and (
> AIN(4*I+2)))
>   or ((    PRV(2*I+3)) and (not PRV(2*I+2)) and (not AIN(4*I+3)) and (
> AIN(4*I+2)))
>   or ((    PRV(2*I+3)) and (not PRV(2*I+2)) and (    AIN(4*I+3)) and (not
> AIN(4*I+2)))
>   or ((    PRV(2*I+3)) and (not PRV(2*I+2)) and (    AIN(4*I+3)) and (
> AIN(4*I+2)));
>  QOUTD(4*I + 3)
>   <= ((not PRV(2*I+3)) and (    PRV(2*I+2)) and (    AIN(4*I+3)) and (not
> AIN(4*I+2)))
>   or ((not PRV(2*I+3)) and (    PRV(2*I+2)) and (    AIN(4*I+3)) and (
> AIN(4*I+2)))
>   or ((    PRV(2*I+3)) and (not PRV(2*I+2)) and (not AIN(4*I+3)) and (not
> AIN(4*I+2)))
>   or ((    PRV(2*I+3)) and (not PRV(2*I+2)) and (not AIN(4*I+3)) and (
> AIN(4*I+2)))
>   or ((    PRV(2*I+3)) and (not PRV(2*I+2)) and (    AIN(4*I+3)) and (not
> AIN(4*I+2)))
>   or ((    PRV(2*I+3)) and (not PRV(2*I+2)) and (    AIN(4*I+3)) and (
> AIN(4*I+2)));
>   -- I need to compute the remainder out of the top two bits for the lower
two
> bits
>  X0V(2*I + 0)
>   <= ((not PRV(2*I+3)) and (not PRV(2*I+2)) and (not AIN(4*I+3)) and (
> AIN(4*I+2)))
>   or ((not PRV(2*I+3)) and (    PRV(2*I+2)) and (not AIN(4*I+3)) and (not
> AIN(4*I+2)))
>   or ((not PRV(2*I+3)) and (    PRV(2*I+2)) and (    AIN(4*I+3)) and (
> AIN(4*I+2)))
>   or ((    PRV(2*I+3)) and (not PRV(2*I+2)) and (    AIN(4*I+3)) and (not
> AIN(4*I+2)))
>   or ((    PRV(2*I+3)) and (    PRV(2*I+2)) and (not AIN(4*I+3)) and (
> AIN(4*I+2)));
>  X0V(2*I + 1)
>   <= ((not PRV(2*I+3)) and (not PRV(2*I+2)) and (    AIN(4*I+3)) and (not
> AIN(4*I+2)))
>   or ((not PRV(2*I+3)) and (    PRV(2*I+2)) and (not AIN(4*I+3)) and (
> AIN(4*I+2)))
>   or ((    PRV(2*I+3)) and (not PRV(2*I+2)) and (not AIN(4*I+3)) and (not
> AIN(4*I+2)))
>   or ((    PRV(2*I+3)) and (not PRV(2*I+2)) and (    AIN(4*I+3)) and (
> AIN(4*I+2)))
>   or ((    PRV(2*I+3)) and (    PRV(2*I+2)) and (    AIN(4*I+3)) and (not
> AIN(4*I+2)));
>   -- Now I can compute the lowest two bits:
>  QOUTD(4*I + 0)
>   <= ((not X0V(2*I+1)) and (not X0V(2*I+0)) and (    AIN(4*I+1)) and (
> AIN(4*I+0)))
>   or ((not X0V(2*I+1)) and (    X0V(2*I+0)) and (not AIN(4*I+1)) and (not
> AIN(4*I+0)))
>   or ((not X0V(2*I+1)) and (    X0V(2*I+0)) and (not AIN(4*I+1)) and (
> AIN(4*I+0)))
>   or ((    X0V(2*I+1)) and (not X0V(2*I+0)) and (not AIN(4*I+1)) and (
> AIN(4*I+0)))
>   or ((    X0V(2*I+1)) and (not X0V(2*I+0)) and (    AIN(4*I+1)) and (not
> AIN(4*I+0)))
>   or ((    X0V(2*I+1)) and (not X0V(2*I+0)) and (    AIN(4*I+1)) and (
> AIN(4*I+0)));
>  QOUTD(4*I + 1)
>   <= ((not X0V(2*I+1)) and (    X0V(2*I+0)) and (    AIN(4*I+1)) and (not
> AIN(4*I+0)))
>   or ((not X0V(2*I+1)) and (    X0V(2*I+0)) and (    AIN(4*I+1)) and (
> AIN(4*I+0)))
>   or ((    X0V(2*I+1)) and (not X0V(2*I+0)) and (not AIN(4*I+1)) and (not
> AIN(4*I+0)))
>   or ((    X0V(2*I+1)) and (not X0V(2*I+0)) and (not AIN(4*I+1)) and (
> AIN(4*I+0)))
>   or ((    X0V(2*I+1)) and (not X0V(2*I+0)) and (    AIN(4*I+1)) and (not
> AIN(4*I+0)))
>   or ((    X0V(2*I+1)) and (not X0V(2*I+0)) and (    AIN(4*I+1)) and (
> AIN(4*I+0)));
> end generate;
>
>
> -- The following flip-flop implication of all the partial logic is
included
> only
> -- to prevent the synthesis tool from optimizing out the beautiful logic
I've
> -- created.
> FKD(53 downto 0) <= R1V & R2V & R4V & R8V & P4V & P3V & P2V & P1V & P0V &
X0V;
>
> -- Register the remainder
> REMD(1 downto 0) <= PRV(1 downto 0);
>
> process (CLK)
> begin
>   if CLK'event and CLK='1' then
>     FKQ   <= FKD;
>     QOUTQ <= QOUTD;
>     REMQ  <= REMD;
>   end if;
> end process;
>
>
> -- Output assignment:
>
> REMOUT <= REMQ(1 downto 0);
> QOUT   <= QOUTQ(30 downto 0);
>
> -- Test output (unused, but included for synthesis restriction).
> TEST   <= FKQ;
>
> end DIV32_3_arch;
>
>
> --
> Posted from firewall.terabeam.com [216.137.15.2]
> via Mailgate.ORG Server - http://www.Mailgate.ORG