fix: close all FPGA timing — CFAR pipeline + CIC reset path (Build 19)

CFAR pipeline fix (clk_100m WNS: -0.331ns → +0.156ns):
- Pre-register col_buf reads during ST_CFAR_THR pipeline stage
- 8 pipeline registers (4 values + 4 valids) break 15-level mux tree
- Delta wires use registered values, eliminating combinatorial depth

CIC reset path fix (clk_mmcm_out0 WNS: -0.074ns → +0.068ns):
- Add reset_h input port to cic_decimator_4x_enhanced.v
- Remove internal wire reset_h = ~reset_n (LUT1 inverter was root cause)
- Wire pre-registered reset_400m from ddc_400m.v into both CIC instances
- 3 sync reset blocks changed from if(!reset_n) to if(reset_h)

Build 19 results (xc7a50tftg256-2, Vivado 2025.2):
- All 5 clock domains timing met, 0 failing endpoints
- WNS: +0.068ns (400MHz), +0.156ns (100MHz), +0.627ns (120MHz)
- Utilization: 66.67% LUT, 22.36% FF, 74% BRAM, 93.33% DSP
- Bitstream: 2,140 KB
This commit is contained in:
Jason
2026-04-16 23:09:31 +05:45
parent e9705e40b7
commit 2401f5f89e
4 changed files with 55 additions and 15 deletions
+40 -8
View File
@@ -208,20 +208,31 @@ wire lead_rem_valid = (lead_rem_idx >= 0) && (lead_rem_idx < NUM_RANGE_BINS);
wire lag_rem_valid = (lag_rem_idx >= 0) && (lag_rem_idx < NUM_RANGE_BINS);
wire lag_add_valid = (lag_add_idx >= 0) && (lag_add_idx < NUM_RANGE_BINS);
// Safe col_buf read with bounds checking (combinational)
// Safe col_buf read with bounds checking (combinational; feeds pipeline regs)
wire [MAG_WIDTH-1:0] lead_add_val = lead_add_valid ? col_buf[lead_add_idx[ROW_BITS-1:0]] : {MAG_WIDTH{1'b0}};
wire [MAG_WIDTH-1:0] lead_rem_val = lead_rem_valid ? col_buf[lead_rem_idx[ROW_BITS-1:0]] : {MAG_WIDTH{1'b0}};
wire [MAG_WIDTH-1:0] lag_rem_val = lag_rem_valid ? col_buf[lag_rem_idx[ROW_BITS-1:0]] : {MAG_WIDTH{1'b0}};
wire [MAG_WIDTH-1:0] lag_add_val = lag_add_valid ? col_buf[lag_add_idx[ROW_BITS-1:0]] : {MAG_WIDTH{1'b0}};
// Net deltas
wire signed [SUM_WIDTH:0] lead_delta = (lead_add_valid ? $signed({1'b0, lead_add_val}) : 0)
- (lead_rem_valid ? $signed({1'b0, lead_rem_val}) : 0);
wire signed [1:0] lead_cnt_delta = (lead_add_valid ? 1 : 0) - (lead_rem_valid ? 1 : 0);
// ============================================================================
// PIPELINE REGISTERS: Break col_buf mux tree out of ST_CFAR_CMP critical path
// ============================================================================
// Captured in ST_CFAR_THR (col_buf indices depend only on cut_idx/r_guard/r_train,
// all stable during THR). Used in ST_CFAR_CMP for delta/sum computation.
// This removes ~6-8 logic levels (9-level mux tree) from the CMP critical path.
reg [MAG_WIDTH-1:0] lead_add_val_r, lead_rem_val_r;
reg [MAG_WIDTH-1:0] lag_rem_val_r, lag_add_val_r;
reg lead_add_valid_r, lead_rem_valid_r;
reg lag_rem_valid_r, lag_add_valid_r;
wire signed [SUM_WIDTH:0] lag_delta = (lag_add_valid ? $signed({1'b0, lag_add_val}) : 0)
- (lag_rem_valid ? $signed({1'b0, lag_rem_val}) : 0);
wire signed [1:0] lag_cnt_delta = (lag_add_valid ? 1 : 0) - (lag_rem_valid ? 1 : 0);
// Net deltas (computed from registered col_buf values; combinational in CMP)
wire signed [SUM_WIDTH:0] lead_delta = (lead_add_valid_r ? $signed({1'b0, lead_add_val_r}) : 0)
- (lead_rem_valid_r ? $signed({1'b0, lead_rem_val_r}) : 0);
wire signed [1:0] lead_cnt_delta = (lead_add_valid_r ? 1 : 0) - (lead_rem_valid_r ? 1 : 0);
wire signed [SUM_WIDTH:0] lag_delta = (lag_add_valid_r ? $signed({1'b0, lag_add_val_r}) : 0)
- (lag_rem_valid_r ? $signed({1'b0, lag_rem_val_r}) : 0);
wire signed [1:0] lag_cnt_delta = (lag_add_valid_r ? 1 : 0) - (lag_rem_valid_r ? 1 : 0);
// ============================================================================
// NOISE ESTIMATE COMPUTATION (combinational for CFAR mode selection)
@@ -290,6 +301,14 @@ always @(posedge clk or negedge reset_n) begin
noise_sum_reg <= 0;
noise_product <= 0;
adaptive_thr <= 0;
lead_add_val_r <= 0;
lead_rem_val_r <= 0;
lag_rem_val_r <= 0;
lag_add_val_r <= 0;
lead_add_valid_r <= 0;
lead_rem_valid_r <= 0;
lag_rem_valid_r <= 0;
lag_add_valid_r <= 0;
r_guard <= 4'd2;
r_train <= 5'd8;
r_alpha <= 8'h30;
@@ -443,6 +462,19 @@ always @(posedge clk or negedge reset_n) begin
cfar_status <= {4'd4, 1'b0, col_idx[2:0]};
noise_sum_reg <= noise_sum_comb;
// Pipeline: register col_buf reads for next CUT's window update.
// Indices depend only on cut_idx/r_guard/r_train (all stable here).
// Breaks the 9-level col_buf mux tree out of ST_CFAR_CMP.
lead_add_val_r <= lead_add_val;
lead_rem_val_r <= lead_rem_val;
lag_rem_val_r <= lag_rem_val;
lag_add_val_r <= lag_add_val;
lead_add_valid_r <= lead_add_valid;
lead_rem_valid_r <= lead_rem_valid;
lag_rem_valid_r <= lag_rem_valid;
lag_add_valid_r <= lag_add_valid;
state <= ST_CFAR_MUL;
end
@@ -1,6 +1,7 @@
module cic_decimator_4x_enhanced (
input wire clk, // 400MHz input clock
input wire reset_n,
input wire reset_h, // Pre-registered active-high reset from parent (avoids LUT1 inverter)
input wire signed [17:0] data_in, // 18-bit input
input wire data_valid,
output reg signed [17:0] data_out, // 18-bit output
@@ -32,11 +33,15 @@ localparam COMB_WIDTH = 28;
// adjacent DSP48E1 tiles — zero fabric delay, guaranteed to meet 400+ MHz
// on 7-series regardless of speed grade.
//
// Active-high reset derived from reset_n (inverted).
// Active-high reset provided by parent module (pre-registered).
// CEP (clock enable for P register) gated by data_valid.
// ============================================================================
wire reset_h = ~reset_n; // active-high reset for DSP48E1 RSTP
// reset_h is now an input port from parent module (pre-registered active-high).
// Previously: wire reset_h = ~reset_n; — this LUT1 inverter + long routing to
// 8 DSP48E1 RSTB pins was the root cause of 400 MHz timing failure (WNS=-0.074ns).
// The parent ddc_400m.v already has a registered reset_400m derived from
// the 2-stage sync reset, so we use that directly.
// Sign-extended input for integrator_0 C port (48-bit)
wire [ACC_WIDTH-1:0] data_in_c = {{(ACC_WIDTH-18){data_in[17]}}, data_in};
@@ -702,7 +707,7 @@ end
// Sync reset: enables FDRE inference for better timing at 400 MHz.
// Reset is already synchronous to clk via reset synchronizer in parent module.
always @(posedge clk) begin
if (!reset_n) begin
if (reset_h) begin
integrator_sampled <= 0;
decimation_counter <= 0;
data_valid_delayed <= 0;
@@ -757,7 +762,7 @@ end
// Pipeline the valid signal for comb section
// Sync reset: matches decimation control block reset style.
always @(posedge clk) begin
if (!reset_n) begin
if (reset_h) begin
data_valid_comb <= 0;
data_valid_comb_pipe <= 0;
data_valid_comb_0_out <= 0;
@@ -792,7 +797,7 @@ end
// - Each stage: comb[i] = comb[i-1] - comb_delay[i][last]
always @(posedge clk) begin
if (!reset_n) begin
if (reset_h) begin
for (i = 0; i < STAGES; i = i + 1) begin
comb[i] <= 0;
for (j = 0; j < COMB_DELAY; j = j + 1) begin
+4 -2
View File
@@ -565,7 +565,8 @@ wire cic_valid_i, cic_valid_q;
cic_decimator_4x_enhanced cic_i_inst (
.clk(clk_400m),
.reset_n(reset_n_400m),
.reset_n(reset_n_400m),
.reset_h(reset_400m),
.data_in(mixed_i[33:16]),
.data_valid(mixed_valid),
.data_out(cic_i_out),
@@ -574,7 +575,8 @@ cic_decimator_4x_enhanced cic_i_inst (
cic_decimator_4x_enhanced cic_q_inst (
.clk(clk_400m),
.reset_n(reset_n_400m),
.reset_n(reset_n_400m),
.reset_h(reset_400m),
.data_in(mixed_q[33:16]),
.data_valid(mixed_valid),
.data_out(cic_q_out),
@@ -36,6 +36,7 @@ module tb_cic_decimator;
cic_decimator_4x_enhanced uut (
.clk (clk),
.reset_n (reset_n),
.reset_h (~reset_n),
.data_in (data_in),
.data_valid (data_valid),
.data_out (data_out),