Improve timing margins with targeted datapath register tuning

Reduce routing pressure on CIC/NCO critical paths and move Doppler BRAM read-address registers to sync-reset datapath logic so Build 13 closes with stronger setup/hold slack while preserving functional behavior.
This commit is contained in:
Jason
2026-03-17 23:51:04 +02:00
parent 36ad15247c
commit 254c0e6f03
3 changed files with 58 additions and 30 deletions
@@ -481,8 +481,8 @@ reg signed [COMB_WIDTH-1:0] comb_delay [0:STAGES-1][0:COMB_DELAY-1];
// Enhanced control and monitoring
reg [1:0] decimation_counter;
reg data_valid_delayed;
reg data_valid_comb;
(* keep = "true", max_fanout = 4 *) reg data_valid_delayed;
(* keep = "true", max_fanout = 4 *) reg data_valid_comb;
reg [7:0] output_counter;
reg [ACC_WIDTH-1:0] max_integrator_value;
reg overflow_detected;
+46 -21
View File
@@ -195,8 +195,8 @@ always @(posedge clk or negedge reset_n) begin
state <= S_IDLE;
write_range_bin <= 0;
write_chirp_index <= 0;
read_range_bin <= 0;
read_doppler_index <= 0;
// read_range_bin, read_doppler_index moved to Block 2 (sync reset)
// to enable BRAM address register absorption (REQP-1839 fix)
frame_buffer_full <= 0;
doppler_valid <= 0;
fft_start <= 0;
@@ -251,8 +251,7 @@ always @(posedge clk or negedge reset_n) begin
frame_buffer_full <= 1;
chirp_state <= 0;
state <= S_PRE_READ;
read_range_bin <= 0;
read_doppler_index <= 0;
// read_range_bin/read_doppler_index zeroed in Block 2
fft_sample_counter <= 0;
// Reset write pointers no longer needed for
// this frame, and prevents stale overflow of
@@ -273,7 +272,7 @@ always @(posedge clk or negedge reset_n) begin
// Advance read_doppler_index to 1 so the NEXT BRAM read
// (which happens every cycle in the memory block) will
// fetch chirp 1.
read_doppler_index <= 1;
// read_doppler_index <= 1 moved to Block 2
fft_start <= 1;
state <= S_LOAD_FFT;
end
@@ -311,14 +310,13 @@ always @(posedge clk or negedge reset_n) begin
//
// We reuse fft_sample_counter as the sub-counter (0..32).
// read_doppler_index updates moved to Block 2 (sync reset)
if (fft_sample_counter == 0) begin
// Sub 0: pre-multiply. mem_rdata_i = data[chirp=0][rbin].
// (mult_i/mult_q computed in Block 2)
// Present BRAM addr for chirp 2 (sub=1 reads chirp 1
// from the BRAM read we triggered in S_PRE_READ;
// we need chirp 2 ready for sub=2).
read_doppler_index <= (2 < DOPPLER_FFT_SIZE) ? 2
: DOPPLER_FFT_SIZE - 1;
fft_sample_counter <= 1;
end else if (fft_sample_counter <= DOPPLER_FFT_SIZE) begin
// Sub 1..32
@@ -331,19 +329,12 @@ always @(posedge clk or negedge reset_n) begin
state <= S_FFT_WAIT;
fft_sample_counter <= 0;
processing_timeout <= 1000;
// Reset read index to prevent stale OOB address
// on BRAM read port during S_FFT_WAIT
read_doppler_index <= 0;
end else begin
// Sub 1..31: also compute new mult from current BRAM data
// (mult_i/mult_q computed in Block 2)
// Advance BRAM read to chirp fft_sample_counter+2
// (so data is ready two cycles later when we need it)
// Clamp to DOPPLER_FFT_SIZE-1 to prevent OOB memory read
if (fft_sample_counter + 2 < DOPPLER_FFT_SIZE)
read_doppler_index <= fft_sample_counter + 2;
else
read_doppler_index <= DOPPLER_FFT_SIZE - 1;
fft_sample_counter <= fft_sample_counter + 1;
end
end
@@ -371,8 +362,7 @@ always @(posedge clk or negedge reset_n) begin
S_OUTPUT: begin
if (read_range_bin < RANGE_BINS - 1) begin
read_range_bin <= read_range_bin + 1;
read_doppler_index <= 0;
// read_range_bin/read_doppler_index updated in Block 2
fft_sample_counter <= 0;
state <= S_PRE_READ;
end else begin
@@ -392,9 +382,10 @@ end
// Uses always @(posedge clk) so Vivado can absorb multipliers
// into DSP48 primitives and does not flag REQP-1839/1840 on
// BRAM address registers. Replicates the same state/condition
// structure as Block 1 for the eight registers:
// structure as Block 1 for the registers:
// mem_we, mem_waddr_r, mem_wdata_i, mem_wdata_q,
// mult_i, mult_q, fft_input_i, fft_input_q
// mult_i, mult_q, fft_input_i, fft_input_q,
// read_range_bin, read_doppler_index
// ----------------------------------------------------------
always @(posedge clk) begin
if (!reset_n) begin
@@ -406,6 +397,8 @@ always @(posedge clk) begin
mult_q <= 0;
fft_input_i <= 0;
fft_input_q <= 0;
read_range_bin <= 0;
read_doppler_index <= 0;
end else begin
mem_we <= 0;
@@ -429,9 +422,22 @@ always @(posedge clk) begin
mem_waddr_r <= mem_write_addr;
mem_wdata_i <= range_data[15:0];
mem_wdata_q <= range_data[31:16];
// Transition to S_PRE_READ when frame complete
if (write_range_bin >= RANGE_BINS - 1 &&
write_chirp_index >= CHIRPS_PER_FRAME - 1) begin
read_range_bin <= 0;
read_doppler_index <= 0;
end
end
end
S_PRE_READ: begin
// Advance read_doppler_index to 1 so next BRAM read
// fetches chirp 1
read_doppler_index <= 1;
end
S_LOAD_FFT: begin
if (fft_sample_counter == 0) begin
// Sub 0: pre-multiply. mem_rdata_i = data[chirp=0][rbin].
@@ -439,25 +445,44 @@ always @(posedge clk) begin
$signed(window_coeff[0]);
mult_q <= $signed(mem_rdata_q) *
$signed(window_coeff[0]);
// Present BRAM addr for chirp 2
read_doppler_index <= (2 < DOPPLER_FFT_SIZE) ? 2
: DOPPLER_FFT_SIZE - 1;
end else if (fft_sample_counter <= DOPPLER_FFT_SIZE) begin
// Sub 1..32: capture previous mult into fft_input
fft_input_i <= (mult_i + (1 << 14)) >>> 15;
fft_input_q <= (mult_q + (1 << 14)) >>> 15;
if (fft_sample_counter < DOPPLER_FFT_SIZE) begin
if (fft_sample_counter == DOPPLER_FFT_SIZE) begin
// Sub 32: flush reset read index to prevent
// stale OOB address on BRAM read port
read_doppler_index <= 0;
end else begin
// Sub 1..31: also compute new mult from current BRAM data
// mem_rdata_i = data[chirp = fft_sample_counter][rbin]
mult_i <= $signed(mem_rdata_i) *
$signed(window_coeff[fft_sample_counter]);
mult_q <= $signed(mem_rdata_q) *
$signed(window_coeff[fft_sample_counter]);
// Advance BRAM read to chirp fft_sample_counter+2
if (fft_sample_counter + 2 < DOPPLER_FFT_SIZE)
read_doppler_index <= fft_sample_counter + 2;
else
read_doppler_index <= DOPPLER_FFT_SIZE - 1;
end
end
end
S_OUTPUT: begin
if (read_range_bin < RANGE_BINS - 1) begin
read_range_bin <= read_range_bin + 1;
read_doppler_index <= 0;
end
end
default: begin
// S_PRE_READ, S_FFT_WAIT, S_OUTPUT:
// no BRAM-write or DSP operations needed
// S_IDLE, S_FFT_WAIT:
// no BRAM-write, DSP, or read-address operations needed
end
endcase
end
+10 -7
View File
@@ -43,7 +43,8 @@ reg [31:0] phase_accum_reg; // Stage 1 output: registered DSP48E1 P[31:0]
reg [31:0] phase_with_offset; // Stage 2 output: phase_accum_reg + offset
// Stage 3a pipeline registers: registered LUT address + quadrant
reg [5:0] lut_index_pipe;
reg [5:0] lut_index_pipe_sin;
reg [5:0] lut_index_pipe_cos;
reg [1:0] quadrant_pipe;
// Stage 3b pipeline registers: LUT output + quadrant
@@ -105,8 +106,8 @@ wire [5:0] lut_index = (quadrant_w[0] ^ quadrant_w[1]) ? ~lut_address[5:0] : lut
// These wires are driven by lut_index_pipe (registered in Stage 3a), so the
// combinational path is just: lut_index_pipe_reg LUT6 (distributed RAM read)
// This eliminates the LUT3LUT6 two-level critical path from Build 8.
wire [15:0] sin_abs_w = sin_lut[lut_index_pipe];
wire [15:0] cos_abs_w = sin_lut[63 - lut_index_pipe];
wire [15:0] sin_abs_w = sin_lut[lut_index_pipe_sin];
wire [15:0] cos_abs_w = sin_lut[63 - lut_index_pipe_cos];
// ============================================================================
// Stage 1: Phase accumulator (DSP48E1) accumulates FTW each cycle
@@ -265,11 +266,13 @@ end
// ============================================================================
always @(posedge clk_400m or negedge reset_n) begin
if (!reset_n) begin
lut_index_pipe <= 6'b000000;
quadrant_pipe <= 2'b00;
lut_index_pipe_sin <= 6'b000000;
lut_index_pipe_cos <= 6'b000000;
quadrant_pipe <= 2'b00;
end else if (valid_pipe[1]) begin
lut_index_pipe <= lut_index;
quadrant_pipe <= quadrant_w;
lut_index_pipe_sin <= lut_index;
lut_index_pipe_cos <= lut_index;
quadrant_pipe <= quadrant_w;
end
end