FFT engine: merge SHIFT into WRITE (5→4 cycle butterfly, 20% fewer COMPUTE cycles) + barrel-shift twiddle index
Opt 1: Eliminated the ST_BF_SHIFT state — an arithmetic right-shift by the constant (TWIDDLE_W-1) is pure bit-selection (zero logic levels), so it is merged into the combinational add/subtract used in BF_WRITE. Saves one cycle per butterfly: LOG2N * N/2 = 10 * 512 = 5120 cycles per 1024-pt FFT. Opt 2: Replaced the general multiply idx_val * tw_stride_reg with the barrel shift idx_val << (LOG2N-1-stage). tw_stride_reg was initialised to FFT_N_HALF and halved after every stage, so (with FFT_N_HALF = N/2) it always equalled 2^(LOG2N-1-stage); the shift is therefore mathematically identical and frees a multiplier. Regression: 18/18 pass on FPGA (bit-exact results).
This commit is contained in:
@@ -8,15 +8,18 @@
|
|||||||
*
|
*
|
||||||
* Architecture:
|
* Architecture:
|
||||||
* - LOAD: Accept N input samples, store bit-reversed in BRAM
|
* - LOAD: Accept N input samples, store bit-reversed in BRAM
|
||||||
* - COMPUTE: LOG2N stages x N/2 butterflies, 5-cycle pipeline:
|
* - COMPUTE: LOG2N stages x N/2 butterflies, 4-cycle pipeline:
|
||||||
* BF_READ: Present BRAM addresses; register twiddle index
|
* BF_READ: Present BRAM addresses; register twiddle index
|
||||||
* BF_TW: BRAM data valid → capture; twiddle ROM lookup from
|
* BF_TW: BRAM data valid → capture; twiddle ROM lookup from
|
||||||
* registered index → capture cos/sin
|
* registered index → capture cos/sin
|
||||||
* BF_MULT2: DSP multiply from registered data + twiddle
|
* BF_MULT2: DSP multiply from registered data + twiddle → PREG
|
||||||
* BF_SHIFT: Arithmetic shift of DSP products
|
* BF_WRITE: Shift (bit-select from PREG, pure wiring) +
|
||||||
* BF_WRITE: Add/subtract + BRAM writeback
|
* add/subtract + BRAM writeback
|
||||||
* - OUTPUT: Stream N results (1/N scaling for IFFT)
|
* - OUTPUT: Stream N results (1/N scaling for IFFT)
|
||||||
*
|
*
|
||||||
|
* Twiddle index computed via barrel shift (idx << (LOG2N-1-stage)) instead
|
||||||
|
* of general multiply, since the stride is always a power of 2.
|
||||||
|
*
|
||||||
* Data memory uses xpm_memory_tdpram (Xilinx Parameterized Macros) for
|
* Data memory uses xpm_memory_tdpram (Xilinx Parameterized Macros) for
|
||||||
* guaranteed BRAM mapping in synthesis. Under `ifdef SIMULATION, a
|
* guaranteed BRAM mapping in synthesis. Under `ifdef SIMULATION, a
|
||||||
* behavioral Verilog-2001 model replaces the XPM so the design compiles
|
* behavioral Verilog-2001 model replaces the XPM so the design compiles
|
||||||
@@ -67,23 +70,20 @@ localparam [LOG2N:0] FFT_N_M1 = N - 1;
|
|||||||
// ============================================================================
|
// ============================================================================
|
||||||
// STATES
|
// STATES
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
// Butterfly pipeline: READ → TW → MULT2 → SHIFT → WRITE (5 cycles)
|
// Butterfly pipeline: READ → TW → MULT2 → WRITE (4 cycles)
|
||||||
// READ: Present BRAM addresses; register twiddle index (bf_tw_idx)
|
// READ: Present BRAM addresses; register twiddle index (bf_tw_idx)
|
||||||
// TW: BRAM data valid → capture rd_a/rd_b; ROM lookup from registered
|
// TW: BRAM data valid → capture rd_a/rd_b; twiddle ROM lookup from
|
||||||
// twiddle index → capture rd_tw_cos/sin. This splits the combinational
|
// registered index → capture cos/sin
|
||||||
// path (address calc + multiply + ROM + quarter-wave mux) into two cycles.
|
// MULT2: DSP multiply from registered data + twiddle → products in PREG
|
||||||
// MULT2: DSP multiply from registered data
|
// WRITE: Shift (bit-select from PREG, pure wiring) + add/sub + BRAM writeback
|
||||||
// SHIFT: Arithmetic shift of DSP products
|
|
||||||
// WRITE: Add/subtract + BRAM writeback
|
|
||||||
localparam [3:0] ST_IDLE = 4'd0,
|
localparam [3:0] ST_IDLE = 4'd0,
|
||||||
ST_LOAD = 4'd1,
|
ST_LOAD = 4'd1,
|
||||||
ST_BF_READ = 4'd2,
|
ST_BF_READ = 4'd2,
|
||||||
ST_BF_TW = 4'd3,
|
ST_BF_TW = 4'd3,
|
||||||
ST_BF_MULT2 = 4'd4,
|
ST_BF_MULT2 = 4'd4,
|
||||||
ST_BF_SHIFT = 4'd5,
|
ST_BF_WRITE = 4'd5,
|
||||||
ST_BF_WRITE = 4'd6,
|
ST_OUTPUT = 4'd6,
|
||||||
ST_OUTPUT = 4'd7,
|
ST_DONE = 4'd7;
|
||||||
ST_DONE = 4'd8;
|
|
||||||
|
|
||||||
reg [3:0] state;
|
reg [3:0] state;
|
||||||
assign busy = (state != ST_IDLE);
|
assign busy = (state != ST_IDLE);
|
||||||
@@ -135,9 +135,8 @@ reg [LOG2N-1:0] rd_addr_even, rd_addr_odd;
|
|||||||
reg rd_inverse;
|
reg rd_inverse;
|
||||||
reg [LOG2N-1:0] rd_tw_idx; // registered twiddle index (breaks addr→ROM path)
|
reg [LOG2N-1:0] rd_tw_idx; // registered twiddle index (breaks addr→ROM path)
|
||||||
|
|
||||||
// Half and twiddle stride
|
// Half register (twiddle stride replaced by barrel shift — see bf_addr_calc)
|
||||||
reg [LOG2N-1:0] half_reg;
|
reg [LOG2N-1:0] half_reg;
|
||||||
reg [LOG2N-1:0] tw_stride_reg;
|
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
// BUTTERFLY ADDRESS COMPUTATION (combinational)
|
// BUTTERFLY ADDRESS COMPUTATION (combinational)
|
||||||
@@ -158,7 +157,7 @@ always @(*) begin : bf_addr_calc
|
|||||||
bf_addr_even = (grp_val << 1) | idx_val;
|
bf_addr_even = (grp_val << 1) | idx_val;
|
||||||
bf_addr_odd = bf_addr_even + half_val;
|
bf_addr_odd = bf_addr_even + half_val;
|
||||||
|
|
||||||
bf_tw_idx = idx_val * tw_stride_reg;
|
bf_tw_idx = idx_val << (LOG2N - 1 - stage);
|
||||||
end
|
end
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
@@ -215,13 +214,12 @@ endfunction
|
|||||||
// ============================================================================
|
// ============================================================================
|
||||||
// BUTTERFLY PIPELINE REGISTERS
|
// BUTTERFLY PIPELINE REGISTERS
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
// Stage 1 (BF_MULT): Capture BRAM read data into rd_a, rd_b
|
// Stage 1 (BF_TW): Capture BRAM read data into rd_a, rd_b
|
||||||
// Stage 2 (BF_MULT2): DSP multiply + accumulate → raw products (bf_prod_re/im)
|
// Stage 2 (BF_MULT2): DSP multiply + accumulate → raw products (bf_prod_re/im)
|
||||||
// Stage 3 (BF_WRITE): Shift (bit-select) + add/subtract + BRAM writeback
|
// Stage 3 (BF_WRITE): Shift (bit-select, pure wiring) + add/subtract + BRAM writeback
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
reg signed [INTERNAL_W-1:0] rd_a_re, rd_a_im; // registered BRAM port A data
|
reg signed [INTERNAL_W-1:0] rd_a_re, rd_a_im; // registered BRAM port A data
|
||||||
reg signed [INTERNAL_W-1:0] rd_b_re, rd_b_im; // registered BRAM port B data (for twiddle multiply)
|
reg signed [INTERNAL_W-1:0] rd_b_re, rd_b_im; // registered BRAM port B data (for twiddle multiply)
|
||||||
reg signed [INTERNAL_W-1:0] bf_t_re, bf_t_im; // twiddle products (after shift)
|
|
||||||
|
|
||||||
// Raw DSP products — full precision, registered to break DSP→CARRY4 path
|
// Raw DSP products — full precision, registered to break DSP→CARRY4 path
|
||||||
// Width: 32-bit x 16-bit multiply -> 48-bit product; sum of two products = 49 bits max
|
// Width: 32-bit x 16-bit multiply -> 48-bit product; sum of two products = 49 bits max
|
||||||
@@ -233,10 +231,12 @@ reg signed [INTERNAL_W-1:0] bf_sum_re, bf_sum_im;
|
|||||||
reg signed [INTERNAL_W-1:0] bf_dif_re, bf_dif_im;
|
reg signed [INTERNAL_W-1:0] bf_dif_re, bf_dif_im;
|
||||||
|
|
||||||
always @(*) begin : bf_addsub
|
always @(*) begin : bf_addsub
|
||||||
bf_sum_re = rd_a_re + bf_t_re;
|
// Shift is pure bit-selection from DSP PREG (zero logic levels in HW).
|
||||||
bf_sum_im = rd_a_im + bf_t_im;
|
// Path: PREG → wiring → 32-bit CARRY4 adder → BRAM write (~3 ns total).
|
||||||
bf_dif_re = rd_a_re - bf_t_re;
|
bf_sum_re = rd_a_re + (bf_prod_re >>> (TWIDDLE_W - 1));
|
||||||
bf_dif_im = rd_a_im - bf_t_im;
|
bf_sum_im = rd_a_im + (bf_prod_im >>> (TWIDDLE_W - 1));
|
||||||
|
bf_dif_re = rd_a_re - (bf_prod_re >>> (TWIDDLE_W - 1));
|
||||||
|
bf_dif_im = rd_a_im - (bf_prod_im >>> (TWIDDLE_W - 1));
|
||||||
end
|
end
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
@@ -288,10 +288,6 @@ always @(*) begin : bram_port_mux
|
|||||||
// Twiddle multiply from registered BRAM data (rd_b_re/im)
|
// Twiddle multiply from registered BRAM data (rd_b_re/im)
|
||||||
// No BRAM access needed this cycle
|
// No BRAM access needed this cycle
|
||||||
end
|
end
|
||||||
ST_BF_SHIFT: begin
|
|
||||||
// Shift (bit-select) from registered DSP products
|
|
||||||
// No BRAM access needed this cycle
|
|
||||||
end
|
|
||||||
ST_BF_WRITE: begin
|
ST_BF_WRITE: begin
|
||||||
bram_we_a = 1'b1;
|
bram_we_a = 1'b1;
|
||||||
bram_addr_a = rd_addr_even;
|
bram_addr_a = rd_addr_even;
|
||||||
@@ -547,7 +543,6 @@ always @(posedge clk or negedge reset_n) begin
|
|||||||
bfly_count <= 0;
|
bfly_count <= 0;
|
||||||
stage <= 0;
|
stage <= 0;
|
||||||
half_reg <= 1;
|
half_reg <= 1;
|
||||||
tw_stride_reg <= FFT_N_HALF[LOG2N-1:0];
|
|
||||||
dout_re <= 0;
|
dout_re <= 0;
|
||||||
dout_im <= 0;
|
dout_im <= 0;
|
||||||
dout_valid <= 0;
|
dout_valid <= 0;
|
||||||
@@ -572,7 +567,6 @@ always @(posedge clk or negedge reset_n) begin
|
|||||||
stage <= 0;
|
stage <= 0;
|
||||||
bfly_count <= 0;
|
bfly_count <= 0;
|
||||||
half_reg <= 1;
|
half_reg <= 1;
|
||||||
tw_stride_reg <= FFT_N_HALF[LOG2N-1:0];
|
|
||||||
end else begin
|
end else begin
|
||||||
load_count <= load_count + 1;
|
load_count <= load_count + 1;
|
||||||
end
|
end
|
||||||
@@ -588,10 +582,6 @@ always @(posedge clk or negedge reset_n) begin
|
|||||||
end
|
end
|
||||||
|
|
||||||
ST_BF_MULT2: begin
|
ST_BF_MULT2: begin
|
||||||
state <= ST_BF_SHIFT;
|
|
||||||
end
|
|
||||||
|
|
||||||
ST_BF_SHIFT: begin
|
|
||||||
state <= ST_BF_WRITE;
|
state <= ST_BF_WRITE;
|
||||||
end
|
end
|
||||||
|
|
||||||
@@ -604,7 +594,6 @@ always @(posedge clk or negedge reset_n) begin
|
|||||||
end else begin
|
end else begin
|
||||||
stage <= stage + 1;
|
stage <= stage + 1;
|
||||||
half_reg <= half_reg << 1;
|
half_reg <= half_reg << 1;
|
||||||
tw_stride_reg <= tw_stride_reg >> 1;
|
|
||||||
state <= ST_BF_READ;
|
state <= ST_BF_READ;
|
||||||
end
|
end
|
||||||
end else begin
|
end else begin
|
||||||
@@ -652,8 +641,8 @@ end
|
|||||||
// - rd_tw_cos/sin → DSP48E1 BREG (butterfly multiply B-port input)
|
// - rd_tw_cos/sin → DSP48E1 BREG (butterfly multiply B-port input)
|
||||||
// - bf_prod_re/im → DSP48E1 PREG (multiply output register)
|
// - bf_prod_re/im → DSP48E1 PREG (multiply output register)
|
||||||
// - rd_a_re/im → BRAM output register (REGCE)
|
// - rd_a_re/im → BRAM output register (REGCE)
|
||||||
// - rd_tw_idx → DSP48E1 PREG (twiddle index multiply output)
|
// - rd_tw_idx → pipeline register (twiddle index)
|
||||||
// - bf_t_re/im, rd_addr_even/odd, rd_inverse — internal pipeline
|
// - rd_addr_even/odd, rd_inverse — internal pipeline
|
||||||
//
|
//
|
||||||
// These registers are only meaningful during COMPUTE states (BF_READ through
|
// These registers are only meaningful during COMPUTE states (BF_READ through
|
||||||
// BF_WRITE). Their values are always overwritten before use after every FSM
|
// BF_WRITE). Their values are always overwritten before use after every FSM
|
||||||
@@ -671,8 +660,6 @@ always @(posedge clk) begin
|
|||||||
rd_a_im <= 0;
|
rd_a_im <= 0;
|
||||||
rd_b_re <= 0;
|
rd_b_re <= 0;
|
||||||
rd_b_im <= 0;
|
rd_b_im <= 0;
|
||||||
bf_t_re <= 0;
|
|
||||||
bf_t_im <= 0;
|
|
||||||
bf_prod_re <= 0;
|
bf_prod_re <= 0;
|
||||||
bf_prod_im <= 0;
|
bf_prod_im <= 0;
|
||||||
end else begin
|
end else begin
|
||||||
@@ -705,9 +692,9 @@ always @(posedge clk) begin
|
|||||||
|
|
||||||
ST_BF_MULT2: begin
|
ST_BF_MULT2: begin
|
||||||
// Compute raw twiddle products from registered BRAM data.
|
// Compute raw twiddle products from registered BRAM data.
|
||||||
// Path: register -> DSP48E1 multiply-accumulate -> register
|
// Path: register -> DSP48E1 multiply-accumulate -> PREG
|
||||||
// The shift is deferred to the next cycle to break the
|
// The arithmetic shift and add/subtract are handled combinationally
|
||||||
// DSP -> CARRY4 path.
|
// in BF_WRITE (shift is pure bit-select, zero logic levels).
|
||||||
if (!rd_inverse) begin
|
if (!rd_inverse) begin
|
||||||
bf_prod_re <= rd_b_re * rd_tw_cos + rd_b_im * rd_tw_sin;
|
bf_prod_re <= rd_b_re * rd_tw_cos + rd_b_im * rd_tw_sin;
|
||||||
bf_prod_im <= rd_b_im * rd_tw_cos - rd_b_re * rd_tw_sin;
|
bf_prod_im <= rd_b_im * rd_tw_cos - rd_b_re * rd_tw_sin;
|
||||||
@@ -717,14 +704,6 @@ always @(posedge clk) begin
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
ST_BF_SHIFT: begin
|
|
||||||
// Apply arithmetic right shift to registered DSP products.
|
|
||||||
// Register -> bit-select/sign-extend -> register
|
|
||||||
// (near-zero logic: pure wiring + sign extension).
|
|
||||||
bf_t_re <= bf_prod_re >>> (TWIDDLE_W - 1);
|
|
||||||
bf_t_im <= bf_prod_im >>> (TWIDDLE_W - 1);
|
|
||||||
end
|
|
||||||
|
|
||||||
default: begin
|
default: begin
|
||||||
// No datapath update in other states — registers hold values
|
// No datapath update in other states — registers hold values
|
||||||
end
|
end
|
||||||
|
|||||||
Reference in New Issue
Block a user