Achieve full timing closure on xc7a100tcsg324-1 at 400 MHz (0 violations)

Complete FPGA timing closure across all clock domains after 9 iterative Vivado builds. WNS improved from -48.325ns to +0.018ns (107,886 endpoints).

RTL fixes for 400 MHz timing:
- NCO: 6-stage pipeline with DSP48E1 phase accumulator, registered LUT index (Fix D splits address decode from ROM read), distributed RAM
- CIC: explicit DSP48E1 PCOUT->PCIN cascade for 5 integrator stages, CREG=1 on integrator_0 to eliminate fabric->DSP setup violation
- DDC: 400 MHz reset synchronizer (async-assert/sync-deassert), active-high reset register for DSP48E1 RST ports, posedge output stage
- FIR: 5-stage binary adder tree pipeline (7-cycle latency)
- FFT: 5-cycle butterfly pipeline with registered twiddle index, XPM_MEMORY_TDPRAM for data storage
- XDC: CDC false paths, async reset false paths, CIC comb multicycle paths

Final Build 9 timing (all MET):
- adc_dco_p (400 MHz): WNS = +0.278ns
- clk_100m (100 MHz): WNS = +0.018ns
- clk_120m_dac (120 MHz): WNS = +0.992ns
- ft601_clk_in (100 MHz): WNS = +5.229ns
- Cross-domain (adc_dco_p->clk_100m): WNS = +7.105ns
2026-03-16 15:02:35 +02:00
parent 692b6a3bfa
commit 00fbab6c9d
7 changed files with 1150 additions and 410 deletions
@@ -16,23 +16,57 @@ parameter COEFF_WIDTH = 18;
 parameter DATA_WIDTH = 18;
 parameter ACCUM_WIDTH = 36;

-// Filter coefficients
+// ============================================================================
+// Pipelined FIR filter for 100 MHz timing closure
+//
+// Problem: The original fully-combinatorial adder tree for 32 multiply products
+// created a 31-deep DSP48E1 PCOUT cascade chain taking 56.6ns (WNS = -48.325ns).
+//
+// Solution: 5-stage pipelined binary adder tree with registered outputs at
+// each level. Each stage performs at most one pairwise addition (~1.7ns DSP hop),
+// easily fitting in the 10ns clock period.
+//
+// Pipeline stages:
+//   Cycle 0: data_valid → shift delay line, start multiplies (combinatorial)
+//   Cycle 1: 32 combinatorial products are added pairwise and registered as 16 sums (level 0)
+//   Cycle 2: 8 pairwise sums (level 1)
+//   Cycle 3: 4 pairwise sums (level 2)
+//   Cycle 4: 2 pairwise sums (level 3)
+//   Cycle 5: 1 final sum → accumulator_reg (level 4)
+//   Cycle 6: Output saturation/rounding (existing output stage)
+//
+// Total latency: 7 cycles from data_valid to data_out_valid
+// Throughput: 1 sample per cycle (fully pipelined)
+// FIR runs at 100 MHz on data decimated 4:1 from 400 MHz — valid samples
+// arrive every ~4 cycles, so the 7-cycle latency is transparent.
+// ============================================================================
+
+// Filter coefficients (symmetric: coeff[k] == coeff[31-k])
 reg signed [COEFF_WIDTH-1:0] coeff [0:TAPS-1];

 // Parallel delay line
 reg signed [DATA_WIDTH-1:0] delay_line [0:TAPS-1];

-// Parallel multiply-accumulate structure
+// Parallel multiply results (combinatorial)
 wire signed [DATA_WIDTH+COEFF_WIDTH-1:0] mult_result [0:TAPS-1];

-// Wires for parallel addition (combinatorial)
-wire signed [ACCUM_WIDTH-1:0] sum_stage1_0, sum_stage1_1, sum_stage1_2, sum_stage1_3;
-wire signed [ACCUM_WIDTH-1:0] sum_stage2_0, sum_stage2_1;
-wire signed [ACCUM_WIDTH-1:0] sum_stage3;
-
-// Registered accumulator
+// Pipelined adder tree registers
+// Level 0: 16 pairwise sums of 32 products
+reg signed [ACCUM_WIDTH-1:0] add_l0 [0:15];
+// Level 1: 8 pairwise sums
+reg signed [ACCUM_WIDTH-1:0] add_l1 [0:7];
+// Level 2: 4 pairwise sums
+reg signed [ACCUM_WIDTH-1:0] add_l2 [0:3];
+// Level 3: 2 pairwise sums
+reg signed [ACCUM_WIDTH-1:0] add_l3 [0:1];
+// Level 4: final sum
 reg signed [ACCUM_WIDTH-1:0] accumulator_reg;

+// Valid pipeline: 7-stage shift register
+// [0]=multiply done, [1]=L0 done, [2]=L1 done, [3]=L2 done,
+// [4]=L3 done, [5]=L4/accum done, [6]=output done
+reg [6:0] valid_pipe;  // bit 0 is loaded from data_valid; bits [4:0] gate add_l0..accumulator_reg, bit 5 gates the output stage; bit 6 is not read in the code visible here
+
 // Initialize coefficients
 initial begin
    // Proper low-pass filter coefficients
@@ -46,7 +80,7 @@ initial begin
    coeff[28] = 18'sh02A6; coeff[29] = 18'sh3FD87; coeff[30] = 18'sh00CE; coeff[31] = 18'sh00AD;
 end

-// Generate parallel multipliers
+// Generate parallel multipliers (combinatorial — DSP48E1 will absorb these)
 genvar k;
 generate
    for (k = 0; k < TAPS; k = k + 1) begin : mult_gen
@@ -54,71 +88,135 @@ generate
    end
 endgenerate

-// COMBINATORIAL PARALLEL ADDITION TREE
-// Stage 1: Group of 8
-assign sum_stage1_0 = mult_result[0] + mult_result[1] + mult_result[2] + mult_result[3] +
-                     mult_result[4] + mult_result[5] + mult_result[6] + mult_result[7];
-assign sum_stage1_1 = mult_result[8] + mult_result[9] + mult_result[10] + mult_result[11] +
-                     mult_result[12] + mult_result[13] + mult_result[14] + mult_result[15];
-assign sum_stage1_2 = mult_result[16] + mult_result[17] + mult_result[18] + mult_result[19] +
-                     mult_result[20] + mult_result[21] + mult_result[22] + mult_result[23];
-assign sum_stage1_3 = mult_result[24] + mult_result[25] + mult_result[26] + mult_result[27] +
-                     mult_result[28] + mult_result[29] + mult_result[30] + mult_result[31];
-
-// Stage 2: Combine groups of 2
-assign sum_stage2_0 = sum_stage1_0 + sum_stage1_1;
-assign sum_stage2_1 = sum_stage1_2 + sum_stage1_3;
-
-// Stage 3: Final sum
-assign sum_stage3 = sum_stage2_0 + sum_stage2_1;
-
 integer i;

-// SINGLE-CYCLE PIPELINE PROCESSING
+// ============================================================================
+// Pipeline Stage 0: Shift delay line on data_valid
+// ============================================================================
 always @(posedge clk or negedge reset_n) begin
     if (!reset_n) begin
-        // Reset delay line
         for (i = 0; i < TAPS; i = i + 1) begin
             delay_line[i] <= 0;
         end
-        accumulator_reg <= 0;
-        data_out <= 0;
-        data_out_valid <= 0;
-    end else begin
-        // Always shift in new data when valid
-        if (data_valid) begin
-            // Shift delay line
-            for (i = TAPS-1; i > 0; i = i - 1) begin
-                delay_line[i] <= delay_line[i-1];
-            end
-            delay_line[0] <= data_in;
-            
-            // Register the combinatorial sum
-            accumulator_reg <= sum_stage3;
-            
-            // Output with 1-cycle latency
-            data_out_valid <= 1'b1;
-        end else begin
-            data_out_valid <= 1'b0;
+    end else if (data_valid) begin  // the delay line only advances on a valid input sample; otherwise it holds
+        for (i = TAPS-1; i > 0; i = i - 1) begin  // move every tap one position toward the oldest slot (index TAPS-1)
+            delay_line[i] <= delay_line[i-1];
         end
-        
-        // Output saturation logic (registered)
-        if (accumulator_reg > (2**(ACCUM_WIDTH-2)-1)) begin
-            data_out <= (2**(DATA_WIDTH-1))-1;
-        end else if (accumulator_reg < -(2**(ACCUM_WIDTH-2))) begin
-            data_out <= -(2**(DATA_WIDTH-1));
-        end else begin
-            // Round and truncate (keep middle bits)
-            data_out <= accumulator_reg[ACCUM_WIDTH-2:DATA_WIDTH-1];
+        delay_line[0] <= data_in;  // newest sample enters at tap 0
+    end
+end
+
+// ============================================================================
+// Pipeline Stage 1 (Level 0): Register 16 pairwise sums of 32 multiply results
+// Each sum is a single 36-bit add of two mult_result wires (author: one DSP48E1 hop, ~1.7ns, fits 10ns); NOTE(review): the multiply is combinatorial, so confirm it is included in that figure.
+// ============================================================================
+always @(posedge clk or negedge reset_n) begin
+    if (!reset_n) begin
+        for (i = 0; i < 16; i = i + 1) begin
+            add_l0[i] <= 0;
+        end
+    end else if (valid_pipe[0]) begin  // valid_pipe[0] is data_valid delayed by one clock
+        for (i = 0; i < 16; i = i + 1) begin  // add_l0[i] = product[2i] + product[2i+1], each sign-extended to ACCUM_WIDTH; NOTE(review): the replication count is 0 when ACCUM_WIDTH == DATA_WIDTH+COEFF_WIDTH (as with the visible 36/18/18) — confirm tool support
+            add_l0[i] <= {{(ACCUM_WIDTH-DATA_WIDTH-COEFF_WIDTH){mult_result[2*i][DATA_WIDTH+COEFF_WIDTH-1]}}, mult_result[2*i]} +
+                          {{(ACCUM_WIDTH-DATA_WIDTH-COEFF_WIDTH){mult_result[2*i+1][DATA_WIDTH+COEFF_WIDTH-1]}}, mult_result[2*i+1]};
         end
     end
 end

-// Always ready to accept new data
+// ============================================================================
+// Pipeline Stage 2 (Level 1): 8 pairwise sums of 16 Level-0 results
+// ============================================================================
+always @(posedge clk or negedge reset_n) begin
+    if (!reset_n) begin
+        for (i = 0; i < 8; i = i + 1) begin
+            add_l1[i] <= 0;
+        end
+    end else if (valid_pipe[1]) begin  // update only when valid_pipe[1] is set; otherwise add_l1 holds
+        for (i = 0; i < 8; i = i + 1) begin  // add_l1[i] combines the adjacent pair add_l0[2i], add_l0[2i+1]
+            add_l1[i] <= add_l0[2*i] + add_l0[2*i+1];
+        end
+    end
+end
+
+// ============================================================================
+// Pipeline Stage 3 (Level 2): 4 pairwise sums of 8 Level-1 results
+// ============================================================================
+always @(posedge clk or negedge reset_n) begin
+    if (!reset_n) begin
+        for (i = 0; i < 4; i = i + 1) begin
+            add_l2[i] <= 0;
+        end
+    end else if (valid_pipe[2]) begin  // update only when valid_pipe[2] is set; otherwise add_l2 holds
+        for (i = 0; i < 4; i = i + 1) begin  // add_l2[i] combines the adjacent pair add_l1[2i], add_l1[2i+1]
+            add_l2[i] <= add_l1[2*i] + add_l1[2*i+1];
+        end
+    end
+end
+
+// ============================================================================
+// Pipeline Stage 4 (Level 3): 2 pairwise sums of 4 Level-2 results
+// ============================================================================
+always @(posedge clk or negedge reset_n) begin
+    if (!reset_n) begin
+        add_l3[0] <= 0;
+        add_l3[1] <= 0;
+    end else if (valid_pipe[3]) begin  // update only when valid_pipe[3] is set; otherwise add_l3 holds
+        add_l3[0] <= add_l2[0] + add_l2[1];  // sum of the lower two level-2 results
+        add_l3[1] <= add_l2[2] + add_l2[3];  // sum of the upper two level-2 results
+    end
+end
+
+// ============================================================================
+// Pipeline Stage 5 (Level 4): Final sum of 2 Level-3 results
+// ============================================================================
+always @(posedge clk or negedge reset_n) begin
+    if (!reset_n) begin
+        accumulator_reg <= 0;
+    end else if (valid_pipe[4]) begin  // update only when valid_pipe[4] is set; otherwise the previous total is held
+        accumulator_reg <= add_l3[0] + add_l3[1];  // final sum of the adder tree
+    end
+end
+
+// ============================================================================
+// Pipeline Stage 6: Output saturation/rounding (registered)
+// ============================================================================
+always @(posedge clk or negedge reset_n) begin
+    if (!reset_n) begin
+        data_out <= 0;
+        data_out_valid <= 0;
+    end else begin
+        data_out_valid <= valid_pipe[5];  // follows valid_pipe[5] one clock later, in step with the data_out update below
+        
+        if (valid_pipe[5]) begin  // data_out keeps its previous value when no new sum is due
+            // Output saturation logic
+            if (accumulator_reg > (2**(ACCUM_WIDTH-2)-1)) begin  // above the positive threshold: clamp to the maximum
+                data_out <= (2**(DATA_WIDTH-1))-1;
+            end else if (accumulator_reg < -(2**(ACCUM_WIDTH-2))) begin  // below the negative threshold: clamp to the minimum
+                data_out <= -(2**(DATA_WIDTH-1));
+            end else begin
+                // Truncate, no rounding term is added: keep bits [ACCUM_WIDTH-2:DATA_WIDTH-1]
+                data_out <= accumulator_reg[ACCUM_WIDTH-2:DATA_WIDTH-1];
+            end
+        end
+    end
+end
+
+// ============================================================================
+// Valid pipeline shift register
+// ============================================================================
+always @(posedge clk or negedge reset_n) begin
+    if (!reset_n) begin
+        valid_pipe <= 7'b0000000;  // no samples in flight after reset
+    end else begin
+        valid_pipe <= {valid_pipe[5:0], data_valid};  // shift toward the MSB every clock; data_valid enters at bit 0
+    end
+end
+
+// Always ready to accept new data (fully pipelined)
 assign fir_ready = 1'b1;

-// Overflow detection (simplified)
+// Overflow detection: combinational compare of accumulator_reg against the same thresholds used by the saturation stage
 assign filter_overflow = (accumulator_reg > (2**(ACCUM_WIDTH-2)-1)) || 
                          (accumulator_reg < -(2**(ACCUM_WIDTH-2)));

-endmodule
+endmodule