Achieve full timing closure on xc7a100tcsg324-1 at 400 MHz (0 violations)

Complete FPGA timing closure across all clock domains after 9 iterative
Vivado builds. WNS improved from -48.325ns to +0.018ns (107,886 endpoints).

RTL fixes for 400 MHz timing:
- NCO: 6-stage pipeline with DSP48E1 phase accumulator, registered LUT
  index (Fix D splits address decode from ROM read), distributed RAM
- CIC: explicit DSP48E1 PCOUT->PCIN cascade for 5 integrator stages,
  CREG=1 on integrator_0 to eliminate fabric->DSP setup violation
- DDC: 400 MHz reset synchronizer (async-assert/sync-deassert),
  active-high reset register for DSP48E1 RST ports, posedge output stage
- FIR: 5-stage binary adder tree pipeline (7-cycle latency)
- FFT: 5-cycle butterfly pipeline with registered twiddle index,
  XPM_MEMORY_TDPRAM for data storage
- XDC: CDC false paths, async reset false paths, CIC comb multicycle paths

Final Build 9 timing (all MET):
  adc_dco_p (400 MHz): WNS = +0.278ns
  clk_100m  (100 MHz): WNS = +0.018ns
  clk_120m_dac (120 MHz): WNS = +0.992ns
  ft601_clk_in (100 MHz): WNS = +5.229ns
  Cross-domain (adc_dco_p->clk_100m): WNS = +7.105ns
This commit is contained in:
Jason
2026-03-16 15:02:35 +02:00
parent 692b6a3bfa
commit 00fbab6c9d
7 changed files with 1150 additions and 410 deletions
+117 -31
View File
@@ -8,9 +8,13 @@
*
* Architecture:
* - LOAD: Accept N input samples, store bit-reversed in BRAM
* - COMPUTE: LOG2N stages x N/2 butterflies, 2-cycle pipeline:
* BF_READ: Present BRAM addresses, capture twiddle
* BF_CALC: BRAM data valid; butterfly compute + writeback
* - COMPUTE: LOG2N stages x N/2 butterflies, 5-cycle pipeline:
* BF_READ: Present BRAM addresses; register twiddle index
* BF_TW: BRAM data valid -> capture; twiddle ROM lookup from
* registered index -> capture cos/sin
* BF_MULT2: DSP multiply from registered data + twiddle
* BF_SHIFT: Arithmetic shift of DSP products
* BF_WRITE: Add/subtract + BRAM writeback
* - OUTPUT: Stream N results (1/N scaling for IFFT)
*
* Data memory uses xpm_memory_tdpram (Xilinx Parameterized Macros) for
@@ -63,14 +67,25 @@ localparam [LOG2N:0] FFT_N_M1 = N - 1;
// ============================================================================
// STATES
// ============================================================================
localparam [2:0] ST_IDLE = 3'd0,
ST_LOAD = 3'd1,
ST_BF_READ = 3'd2,
ST_BF_CALC = 3'd3,
ST_OUTPUT = 3'd4,
ST_DONE = 3'd5;
// Butterfly pipeline: READ → TW → MULT2 → SHIFT → WRITE (5 cycles)
// READ: Present BRAM addresses; register twiddle index (bf_tw_idx)
// TW: BRAM data valid → capture rd_a/rd_b; ROM lookup from registered
// twiddle index → capture rd_tw_cos/sin. This splits the combinational
// path (address calc + multiply + ROM + quarter-wave mux) into two cycles.
// MULT2: DSP multiply from registered data
// SHIFT: Arithmetic shift of DSP products
// WRITE: Add/subtract + BRAM writeback
localparam [3:0] ST_IDLE = 4'd0,
ST_LOAD = 4'd1,
ST_BF_READ = 4'd2,
ST_BF_TW = 4'd3,
ST_BF_MULT2 = 4'd4,
ST_BF_SHIFT = 4'd5,
ST_BF_WRITE = 4'd6,
ST_OUTPUT = 4'd7,
ST_DONE = 4'd8;
reg [2:0] state;
reg [3:0] state;
assign busy = (state != ST_IDLE);
// ============================================================================
@@ -114,10 +129,11 @@ reg [LOG2N:0] out_count;
reg [LOG2N-1:0] bfly_count;
reg [3:0] stage;
// Registered values (captured in BF_READ, used in BF_CALC)
// Butterfly read-side registers.
// rd_addr_even/odd, rd_inverse and rd_tw_idx are captured in ST_BF_READ;
// rd_tw_cos/sin are captured one cycle later, in ST_BF_TW, from the twiddle
// lookup that is driven by rd_tw_idx.
reg signed [TWIDDLE_W-1:0] rd_tw_cos, rd_tw_sin;
reg [LOG2N-1:0] rd_addr_even, rd_addr_odd;
reg rd_inverse;
reg [LOG2N-1:0] rd_tw_idx; // registered twiddle index (breaks addr->ROM path)
// Half and twiddle stride
reg [LOG2N-1:0] half_reg;
@@ -155,7 +171,7 @@ always @(*) begin : tw_lookup
reg [LOG2N-1:0] k;
reg [LOG2N-1:0] rom_idx;
k = bf_tw_idx;
k = rd_tw_idx; // use registered index (set in ST_BF_READ)
tw_cos_lookup = 0;
tw_sin_lookup = 0;
@@ -197,24 +213,30 @@ function signed [DATA_W-1:0] saturate;
endfunction
// ============================================================================
// BUTTERFLY COMPUTATION (combinational, for BF_CALC write data)
// BUTTERFLY PIPELINE REGISTERS
// ============================================================================
reg signed [INTERNAL_W-1:0] bf_t_re, bf_t_im;
// ST_BF_TW:    capture BRAM read data into rd_a_*/rd_b_* and the twiddle
//              lookup result into rd_tw_cos/rd_tw_sin
// ST_BF_MULT2: multiply + accumulate raw products into bf_prod_re/im
// ST_BF_SHIFT: arithmetic right shift of bf_prod_* by (TWIDDLE_W - 1) into bf_t_*
// ST_BF_WRITE: add/subtract (bf_sum_*/bf_dif_*) + BRAM writeback
// ============================================================================
reg signed [INTERNAL_W-1:0] rd_a_re, rd_a_im; // registered BRAM port A data
reg signed [INTERNAL_W-1:0] rd_b_re, rd_b_im; // registered BRAM port B data (for twiddle multiply)
reg signed [INTERNAL_W-1:0] bf_t_re, bf_t_im; // twiddle products (after shift)
// Raw DSP products at full precision, registered to break the DSP->CARRY4 path.
// Width: each product is INTERNAL_W + TWIDDLE_W bits (48, presumably a 32-bit
// by 16-bit multiply; parameter values are not visible here, TODO confirm);
// the sum of two products needs one more bit = 49 bits max.
localparam PROD_W = INTERNAL_W + TWIDDLE_W; // 48
reg signed [PROD_W:0] bf_prod_re, bf_prod_im; // 49 bits to hold sum of two products
// Combinational add/subtract from registered values (used in BF_WRITE)
reg signed [INTERNAL_W-1:0] bf_sum_re, bf_sum_im;
reg signed [INTERNAL_W-1:0] bf_dif_re, bf_dif_im;
always @(*) begin : bf_compute
if (!rd_inverse) begin
bf_t_re = (mem_rdata_b_re * rd_tw_cos + mem_rdata_b_im * rd_tw_sin) >>> (TWIDDLE_W - 1);
bf_t_im = (mem_rdata_b_im * rd_tw_cos - mem_rdata_b_re * rd_tw_sin) >>> (TWIDDLE_W - 1);
end else begin
bf_t_re = (mem_rdata_b_re * rd_tw_cos - mem_rdata_b_im * rd_tw_sin) >>> (TWIDDLE_W - 1);
bf_t_im = (mem_rdata_b_im * rd_tw_cos + mem_rdata_b_re * rd_tw_sin) >>> (TWIDDLE_W - 1);
end
bf_sum_re = mem_rdata_a_re + bf_t_re;
bf_sum_im = mem_rdata_a_im + bf_t_im;
bf_dif_re = mem_rdata_a_re - bf_t_re;
bf_dif_im = mem_rdata_a_im - bf_t_im;
// Combinational butterfly add/subtract on registered operands:
//   bf_sum = rd_a + bf_t, bf_dif = rd_a - bf_t (real and imaginary parts
//   handled separately). bf_sum_re drives bram_wdata_a_re in ST_BF_WRITE.
always @(*) begin : bf_addsub
bf_sum_re = rd_a_re + bf_t_re;
bf_sum_im = rd_a_im + bf_t_im;
bf_dif_re = rd_a_re - bf_t_re;
bf_dif_im = rd_a_im - bf_t_im;
end
// ============================================================================
@@ -258,7 +280,19 @@ always @(*) begin : bram_port_mux
bram_addr_a = bf_addr_even;
bram_addr_b = bf_addr_odd;
end
ST_BF_CALC: begin
ST_BF_TW: begin
// BRAM outputs are being read; addresses were set in BF_READ
// Data is being captured into pipeline regs (rd_a, rd_b)
end
ST_BF_MULT2: begin
// Twiddle multiply from registered BRAM data (rd_b_re/im)
// No BRAM access needed this cycle
end
ST_BF_SHIFT: begin
// Shift (bit-select) from registered DSP products
// No BRAM access needed this cycle
end
ST_BF_WRITE: begin
bram_we_a = 1'b1;
bram_addr_a = rd_addr_even;
bram_wdata_a_re = bf_sum_re;
@@ -518,6 +552,15 @@ always @(posedge clk or negedge reset_n) begin
rd_addr_even <= 0;
rd_addr_odd <= 0;
rd_inverse <= 0;
rd_tw_idx <= 0;
rd_a_re <= 0;
rd_a_im <= 0;
rd_b_re <= 0;
rd_b_im <= 0;
bf_t_re <= 0;
bf_t_im <= 0;
bf_prod_re <= 0;
bf_prod_im <= 0;
end else begin
dout_valid <= 1'b0;
done <= 1'b0;
@@ -546,15 +589,58 @@ always @(posedge clk or negedge reset_n) begin
end
ST_BF_READ: begin
rd_tw_cos <= tw_cos_lookup;
rd_tw_sin <= tw_sin_lookup;
// Register butterfly addresses and twiddle index.
// BRAM read initiated by bram_port_mux (addresses presented
// combinationally); data arrives next cycle (ST_BF_TW).
// Twiddle ROM lookup uses rd_tw_idx next cycle, breaking the
// address-calc -> ROM -> quarter-wave-mux combinational path.
rd_addr_even <= bf_addr_even;
rd_addr_odd <= bf_addr_odd;
rd_inverse <= inverse;
state <= ST_BF_CALC;
rd_tw_idx <= bf_tw_idx;
state <= ST_BF_TW;
end
ST_BF_CALC: begin
ST_BF_TW: begin
// BRAM data valid this cycle (1-cycle read latency).
// Capture BRAM data into pipeline regs.
// Twiddle ROM lookup is combinational from registered rd_tw_idx;
// capture the result into rd_tw_cos/sin.
rd_a_re <= mem_rdata_a_re;
rd_a_im <= mem_rdata_a_im;
rd_b_re <= mem_rdata_b_re;
rd_b_im <= mem_rdata_b_im;
rd_tw_cos <= tw_cos_lookup;
rd_tw_sin <= tw_sin_lookup;
state <= ST_BF_MULT2;
end
ST_BF_MULT2: begin
// Compute raw twiddle products from registered BRAM data.
// Path: register -> DSP48E1 multiply-accumulate -> register (bf_prod_re/im)
// The shift is deferred to the next cycle to break the DSP->CARRY4 path.
if (!rd_inverse) begin
bf_prod_re <= rd_b_re * rd_tw_cos + rd_b_im * rd_tw_sin;
bf_prod_im <= rd_b_im * rd_tw_cos - rd_b_re * rd_tw_sin;
end else begin
bf_prod_re <= rd_b_re * rd_tw_cos - rd_b_im * rd_tw_sin;
bf_prod_im <= rd_b_im * rd_tw_cos + rd_b_re * rd_tw_sin;
end
state <= ST_BF_SHIFT;
end
ST_BF_SHIFT: begin
// Apply arithmetic right shift to registered DSP products.
// This is now register -> bit-select/sign-extend -> register,
// which should be near-zero logic (pure wiring + sign extension).
bf_t_re <= bf_prod_re >>> (TWIDDLE_W - 1);
bf_t_im <= bf_prod_im >>> (TWIDDLE_W - 1);
state <= ST_BF_WRITE;
end
ST_BF_WRITE: begin
// bf_sum/bf_dif are combinational from registered rd_a and bf_t.
// BRAM write data driven by bram_port_mux using bf_sum/bf_dif.
if (bfly_count == FFT_N_HALF_M1[LOG2N-1:0]) begin
bfly_count <= 0;
if (stage == LOG2N - 1) begin