Split fft_engine FSM: async reset for control, sync reset for DSP/BRAM datapath (Build 11)

Split monolithic always block into two:
- Block 1 (async reset): FSM state, counters, output interface
  (dout_re/im, dout_valid, done) — deterministic startup
- Block 2 (sync reset): DSP/BRAM pipeline registers (rd_b_re/im,
  rd_tw_cos/sin, bf_prod_re/im, rd_a_re/im, bf_t_re/im, rd_tw_idx,
  rd_addr_even/odd, rd_inverse) — enables hard block absorption

Also convert output pipeline (out_pipe_valid/inverse) to sync reset.

Expected synthesis impact:
- DSP48E1 AREG/BREG absorption for butterfly multiply inputs
- DSP48E1 PREG absorption for multiply outputs (bf_prod_re/im)
- BRAM output register absorption for rd_a_re/im
- Eliminate ~300 DPIR-1 methodology warnings per FFT instance
- Resolve DPOP-2 (PREG=0), RBOR-1 (BRAM DOA), REQP-1839/1840

13/13 regression suites pass. Integration golden: 2048/2048 exact match.
This commit is contained in:
Jason
2026-03-17 21:40:09 +02:00
parent d8a8532097
commit 36ad15247c
+97 -54
View File
@@ -521,7 +521,9 @@ xpm_memory_tdpram #(
reg out_pipe_valid;
reg out_pipe_inverse;
always @(posedge clk or negedge reset_n) begin
// Sync reset: pure internal pipeline -- no functional need for async reset.
// Enables downstream register absorption.
always @(posedge clk) begin
if (!reset_n) begin
out_pipe_valid <= 1'b0;
out_pipe_inverse <= 1'b0;
@@ -532,7 +534,10 @@ always @(posedge clk or negedge reset_n) begin
end
// ============================================================================
// MAIN FSM
// MAIN FSM -- Block 1: Control / FSM / Output Interface (async reset)
// ============================================================================
// Retains async reset for deterministic startup of FSM state and external
// output interface signals (dout_re/im, dout_valid, done).
// ============================================================================
always @(posedge clk or negedge reset_n) begin
if (!reset_n) begin
@@ -547,20 +552,6 @@ always @(posedge clk or negedge reset_n) begin
dout_im <= 0;
dout_valid <= 0;
done <= 0;
rd_tw_cos <= 0;
rd_tw_sin <= 0;
rd_addr_even <= 0;
rd_addr_odd <= 0;
rd_inverse <= 0;
rd_tw_idx <= 0;
rd_a_re <= 0;
rd_a_im <= 0;
rd_b_re <= 0;
rd_b_im <= 0;
bf_t_re <= 0;
bf_t_im <= 0;
bf_prod_re <= 0;
bf_prod_im <= 0;
end else begin
dout_valid <= 1'b0;
done <= 1'b0;
@@ -589,58 +580,22 @@ always @(posedge clk or negedge reset_n) begin
end
ST_BF_READ: begin
// Register butterfly addresses and twiddle index.
// BRAM read initiated by bram_port_mux (addresses presented
// combinationally); data arrives next cycle (ST_BF_TW).
// Twiddle ROM lookup uses rd_tw_idx next cycle, breaking the
// address-calc -> ROM -> quarter-wave-mux combinational path.
rd_addr_even <= bf_addr_even;
rd_addr_odd <= bf_addr_odd;
rd_inverse <= inverse;
rd_tw_idx <= bf_tw_idx;
state <= ST_BF_TW;
state <= ST_BF_TW;
end
ST_BF_TW: begin
// BRAM data valid this cycle (1-cycle read latency).
// Capture BRAM data into pipeline regs.
// Twiddle ROM lookup is combinational from registered rd_tw_idx
// -- capture the result into rd_tw_cos/sin.
rd_a_re <= mem_rdata_a_re;
rd_a_im <= mem_rdata_a_im;
rd_b_re <= mem_rdata_b_re;
rd_b_im <= mem_rdata_b_im;
rd_tw_cos <= tw_cos_lookup;
rd_tw_sin <= tw_sin_lookup;
state <= ST_BF_MULT2;
state <= ST_BF_MULT2;
end
ST_BF_MULT2: begin
// Compute raw twiddle products from registered BRAM data.
// Path: register -> DSP48E1 multiply-accumulate -> register (bf_prod_re/im)
// The shift is deferred to the next cycle to break the DSP -> CARRY4 path.
if (!rd_inverse) begin
bf_prod_re <= rd_b_re * rd_tw_cos + rd_b_im * rd_tw_sin;
bf_prod_im <= rd_b_im * rd_tw_cos - rd_b_re * rd_tw_sin;
end else begin
bf_prod_re <= rd_b_re * rd_tw_cos - rd_b_im * rd_tw_sin;
bf_prod_im <= rd_b_im * rd_tw_cos + rd_b_re * rd_tw_sin;
end
state <= ST_BF_SHIFT;
end
ST_BF_SHIFT: begin
// Apply arithmetic right shift to registered DSP products.
// This is now register -> bit-select/sign-extend -> register,
// which should be near-zero logic (pure wiring + sign extension).
bf_t_re <= bf_prod_re >>> (TWIDDLE_W - 1);
bf_t_im <= bf_prod_im >>> (TWIDDLE_W - 1);
state <= ST_BF_WRITE;
end
ST_BF_WRITE: begin
// bf_sum/bf_dif are combinational from registered rd_a and bf_t.
// BRAM write data driven by bram_port_mux using bf_sum/bf_dif.
if (bfly_count == FFT_N_HALF_M1[LOG2N-1:0]) begin
bfly_count <= 0;
if (stage == LOG2N - 1) begin
@@ -689,4 +644,92 @@ always @(posedge clk or negedge reset_n) begin
end
end
// ============================================================================
// MAIN FSM -- Block 2: DSP/BRAM Datapath Pipeline (sync reset)
// ============================================================================
// Purpose: holds the butterfly datapath pipeline registers. This block only
// READS `state`; it never assigns it. Each register is loaded in exactly one
// state and holds its value in every other state.
//
// Sync reset enables Vivado to absorb these registers into hard blocks:
// - rd_b_re/im     -> DSP48E1 AREG (butterfly multiply A-port input)
// - rd_tw_cos/sin  -> DSP48E1 BREG (butterfly multiply B-port input)
// - bf_prod_re/im  -> DSP48E1 PREG (multiply output register)
// - rd_a_re/im     -> BRAM output register (REGCE)
// - rd_tw_idx      -> DSP48E1 PREG (twiddle index multiply output)
// - bf_t_re/im, rd_addr_even/odd, rd_inverse -> internal pipeline
// NOTE(review): the mapping above is the expected synthesis outcome, not
// something this RTL guarantees -- confirm against the utilization/DRC report.
//
// These registers are only meaningful during COMPUTE states (BF_READ through
// BF_WRITE). Their values are always overwritten before use after every FSM
// transition, so sync reset is functionally equivalent to async reset.
// NOTE(review): the "always overwritten before use" property depends on the
// state ordering in the control block, which is not fully visible here --
// confirm if the FSM sequence ever changes.
// ============================================================================
always @(posedge clk) begin
if (!reset_n) begin
// Synchronous clear: reset_n is not in the sensitivity list, so it is
// sampled only on the rising edge of clk.
rd_tw_cos <= 0;
rd_tw_sin <= 0;
rd_addr_even <= 0;
rd_addr_odd <= 0;
rd_inverse <= 0;
rd_tw_idx <= 0;
rd_a_re <= 0;
rd_a_im <= 0;
rd_b_re <= 0;
rd_b_im <= 0;
bf_t_re <= 0;
bf_t_im <= 0;
bf_prod_re <= 0;
bf_prod_im <= 0;
end else begin
// One pipeline stage is loaded per state; there are no default
// assignments, so every register not named in the active branch holds.
case (state)
ST_BF_READ: begin
// Register butterfly addresses and twiddle index.
// BRAM read initiated by bram_port_mux (addresses presented
// combinationally); data arrives next cycle (ST_BF_TW).
// Twiddle ROM lookup uses rd_tw_idx next cycle, breaking the
// address-calc -> ROM -> quarter-wave-mux combinational path.
// rd_inverse latches `inverse` here so the direction used in
// ST_BF_MULT2 is the one sampled at the start of this butterfly.
rd_addr_even <= bf_addr_even;
rd_addr_odd <= bf_addr_odd;
rd_inverse <= inverse;
rd_tw_idx <= bf_tw_idx;
end
ST_BF_TW: begin
// BRAM data valid this cycle (1-cycle read latency).
// Capture BRAM data into pipeline regs.
// Twiddle ROM lookup is combinational from registered rd_tw_idx
// -- capture the result into rd_tw_cos/sin.
rd_a_re <= mem_rdata_a_re;
rd_a_im <= mem_rdata_a_im;
rd_b_re <= mem_rdata_b_re;
rd_b_im <= mem_rdata_b_im;
rd_tw_cos <= tw_cos_lookup;
rd_tw_sin <= tw_sin_lookup;
end
ST_BF_MULT2: begin
// Compute raw twiddle products from registered BRAM data.
// Path: register -> DSP48E1 multiply-accumulate -> register
// The shift is deferred to the next cycle to break the
// DSP -> CARRY4 path.
//
// Forward (rd_inverse == 0): (b_re + j*b_im) * (cos - j*sin)
// Inverse (rd_inverse == 1): (b_re + j*b_im) * (cos + j*sin)
// i.e. the two branches differ only in the sign of the sin terms.
if (!rd_inverse) begin
bf_prod_re <= rd_b_re * rd_tw_cos + rd_b_im * rd_tw_sin;
bf_prod_im <= rd_b_im * rd_tw_cos - rd_b_re * rd_tw_sin;
end else begin
bf_prod_re <= rd_b_re * rd_tw_cos - rd_b_im * rd_tw_sin;
bf_prod_im <= rd_b_im * rd_tw_cos + rd_b_re * rd_tw_sin;
end
end
ST_BF_SHIFT: begin
// Apply arithmetic right shift to registered DSP products.
// Register -> bit-select/sign-extend -> register
// (near-zero logic: pure wiring + sign extension).
// Divides the product by 2^(TWIDDLE_W-1).
// NOTE(review): `>>>` sign-extends only if bf_prod_re/im are
// declared signed -- declarations are outside this view; confirm.
bf_t_re <= bf_prod_re >>> (TWIDDLE_W - 1);
bf_t_im <= bf_prod_im >>> (TWIDDLE_W - 1);
end
default: begin
// No datapath update in other states -- registers hold values
end
endcase
end
end
endmodule