Improve timing margins with targeted datapath register tuning
Reduce routing pressure on CIC/NCO critical paths and move Doppler BRAM read-address registers to sync-reset datapath logic so Build 13 closes with stronger setup/hold slack while preserving functional behavior.
This commit is contained in:
@@ -481,8 +481,8 @@ reg signed [COMB_WIDTH-1:0] comb_delay [0:STAGES-1][0:COMB_DELAY-1];
|
||||
|
||||
// Enhanced control and monitoring
|
||||
reg [1:0] decimation_counter;
|
||||
reg data_valid_delayed;
|
||||
reg data_valid_comb;
|
||||
(* keep = "true", max_fanout = 4 *) reg data_valid_delayed;
|
||||
(* keep = "true", max_fanout = 4 *) reg data_valid_comb;
|
||||
reg [7:0] output_counter;
|
||||
reg [ACC_WIDTH-1:0] max_integrator_value;
|
||||
reg overflow_detected;
|
||||
|
||||
@@ -195,8 +195,8 @@ always @(posedge clk or negedge reset_n) begin
|
||||
state <= S_IDLE;
|
||||
write_range_bin <= 0;
|
||||
write_chirp_index <= 0;
|
||||
read_range_bin <= 0;
|
||||
read_doppler_index <= 0;
|
||||
// read_range_bin, read_doppler_index moved to Block 2 (sync reset)
|
||||
// to enable BRAM address register absorption (REQP-1839 fix)
|
||||
frame_buffer_full <= 0;
|
||||
doppler_valid <= 0;
|
||||
fft_start <= 0;
|
||||
@@ -251,8 +251,7 @@ always @(posedge clk or negedge reset_n) begin
|
||||
frame_buffer_full <= 1;
|
||||
chirp_state <= 0;
|
||||
state <= S_PRE_READ;
|
||||
read_range_bin <= 0;
|
||||
read_doppler_index <= 0;
|
||||
// read_range_bin/read_doppler_index zeroed in Block 2
|
||||
fft_sample_counter <= 0;
|
||||
// Reset write pointers — no longer needed for
|
||||
// this frame, and prevents stale overflow of
|
||||
@@ -273,7 +272,7 @@ always @(posedge clk or negedge reset_n) begin
|
||||
// Advance read_doppler_index to 1 so the NEXT BRAM read
|
||||
// (which happens every cycle in the memory block) will
|
||||
// fetch chirp 1.
|
||||
read_doppler_index <= 1;
|
||||
// read_doppler_index <= 1 moved to Block 2
|
||||
fft_start <= 1;
|
||||
state <= S_LOAD_FFT;
|
||||
end
|
||||
@@ -311,14 +310,13 @@ always @(posedge clk or negedge reset_n) begin
|
||||
//
|
||||
// We reuse fft_sample_counter as the sub-counter (0..32).
|
||||
|
||||
// read_doppler_index updates moved to Block 2 (sync reset)
|
||||
if (fft_sample_counter == 0) begin
|
||||
// Sub 0: pre-multiply. mem_rdata_i = data[chirp=0][rbin].
|
||||
// (mult_i/mult_q computed in Block 2)
|
||||
// Present BRAM addr for chirp 2 (sub=1 reads chirp 1
|
||||
// from the BRAM read we triggered in S_PRE_READ;
|
||||
// we need chirp 2 ready for sub=2).
|
||||
read_doppler_index <= (2 < DOPPLER_FFT_SIZE) ? 2
|
||||
: DOPPLER_FFT_SIZE - 1;
|
||||
fft_sample_counter <= 1;
|
||||
end else if (fft_sample_counter <= DOPPLER_FFT_SIZE) begin
|
||||
// Sub 1..32
|
||||
@@ -331,19 +329,12 @@ always @(posedge clk or negedge reset_n) begin
|
||||
state <= S_FFT_WAIT;
|
||||
fft_sample_counter <= 0;
|
||||
processing_timeout <= 1000;
|
||||
// Reset read index to prevent stale OOB address
|
||||
// on BRAM read port during S_FFT_WAIT
|
||||
read_doppler_index <= 0;
|
||||
end else begin
|
||||
// Sub 1..31: also compute new mult from current BRAM data
|
||||
// (mult_i/mult_q computed in Block 2)
|
||||
// Advance BRAM read to chirp fft_sample_counter+2
|
||||
// (so data is ready two cycles later when we need it)
|
||||
// Clamp to DOPPLER_FFT_SIZE-1 to prevent OOB memory read
|
||||
if (fft_sample_counter + 2 < DOPPLER_FFT_SIZE)
|
||||
read_doppler_index <= fft_sample_counter + 2;
|
||||
else
|
||||
read_doppler_index <= DOPPLER_FFT_SIZE - 1;
|
||||
fft_sample_counter <= fft_sample_counter + 1;
|
||||
end
|
||||
end
|
||||
@@ -371,8 +362,7 @@ always @(posedge clk or negedge reset_n) begin
|
||||
|
||||
S_OUTPUT: begin
|
||||
if (read_range_bin < RANGE_BINS - 1) begin
|
||||
read_range_bin <= read_range_bin + 1;
|
||||
read_doppler_index <= 0;
|
||||
// read_range_bin/read_doppler_index updated in Block 2
|
||||
fft_sample_counter <= 0;
|
||||
state <= S_PRE_READ;
|
||||
end else begin
|
||||
@@ -392,9 +382,10 @@ end
|
||||
// Uses always @(posedge clk) so Vivado can absorb multipliers
|
||||
// into DSP48 primitives and does not flag REQP-1839/1840 on
|
||||
// BRAM address registers. Replicates the same state/condition
|
||||
// structure as Block 1 for the eight registers:
|
||||
// structure as Block 1 for the registers:
|
||||
// mem_we, mem_waddr_r, mem_wdata_i, mem_wdata_q,
|
||||
// mult_i, mult_q, fft_input_i, fft_input_q
|
||||
// mult_i, mult_q, fft_input_i, fft_input_q,
|
||||
// read_range_bin, read_doppler_index
|
||||
// ----------------------------------------------------------
|
||||
always @(posedge clk) begin
|
||||
if (!reset_n) begin
|
||||
@@ -406,6 +397,8 @@ always @(posedge clk) begin
|
||||
mult_q <= 0;
|
||||
fft_input_i <= 0;
|
||||
fft_input_q <= 0;
|
||||
read_range_bin <= 0;
|
||||
read_doppler_index <= 0;
|
||||
end else begin
|
||||
mem_we <= 0;
|
||||
|
||||
@@ -429,9 +422,22 @@ always @(posedge clk) begin
|
||||
mem_waddr_r <= mem_write_addr;
|
||||
mem_wdata_i <= range_data[15:0];
|
||||
mem_wdata_q <= range_data[31:16];
|
||||
|
||||
// Transition to S_PRE_READ when frame complete
|
||||
if (write_range_bin >= RANGE_BINS - 1 &&
|
||||
write_chirp_index >= CHIRPS_PER_FRAME - 1) begin
|
||||
read_range_bin <= 0;
|
||||
read_doppler_index <= 0;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
S_PRE_READ: begin
|
||||
// Advance read_doppler_index to 1 so next BRAM read
|
||||
// fetches chirp 1
|
||||
read_doppler_index <= 1;
|
||||
end
|
||||
|
||||
S_LOAD_FFT: begin
|
||||
if (fft_sample_counter == 0) begin
|
||||
// Sub 0: pre-multiply. mem_rdata_i = data[chirp=0][rbin].
|
||||
@@ -439,25 +445,44 @@ always @(posedge clk) begin
|
||||
$signed(window_coeff[0]);
|
||||
mult_q <= $signed(mem_rdata_q) *
|
||||
$signed(window_coeff[0]);
|
||||
// Present BRAM addr for chirp 2
|
||||
read_doppler_index <= (2 < DOPPLER_FFT_SIZE) ? 2
|
||||
: DOPPLER_FFT_SIZE - 1;
|
||||
end else if (fft_sample_counter <= DOPPLER_FFT_SIZE) begin
|
||||
// Sub 1..32: capture previous mult into fft_input
|
||||
fft_input_i <= (mult_i + (1 << 14)) >>> 15;
|
||||
fft_input_q <= (mult_q + (1 << 14)) >>> 15;
|
||||
|
||||
if (fft_sample_counter < DOPPLER_FFT_SIZE) begin
|
||||
if (fft_sample_counter == DOPPLER_FFT_SIZE) begin
|
||||
// Sub 32: flush — reset read index to prevent
|
||||
// stale OOB address on BRAM read port
|
||||
read_doppler_index <= 0;
|
||||
end else begin
|
||||
// Sub 1..31: also compute new mult from current BRAM data
|
||||
// mem_rdata_i = data[chirp = fft_sample_counter][rbin]
|
||||
mult_i <= $signed(mem_rdata_i) *
|
||||
$signed(window_coeff[fft_sample_counter]);
|
||||
mult_q <= $signed(mem_rdata_q) *
|
||||
$signed(window_coeff[fft_sample_counter]);
|
||||
// Advance BRAM read to chirp fft_sample_counter+2
|
||||
if (fft_sample_counter + 2 < DOPPLER_FFT_SIZE)
|
||||
read_doppler_index <= fft_sample_counter + 2;
|
||||
else
|
||||
read_doppler_index <= DOPPLER_FFT_SIZE - 1;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
S_OUTPUT: begin
|
||||
if (read_range_bin < RANGE_BINS - 1) begin
|
||||
read_range_bin <= read_range_bin + 1;
|
||||
read_doppler_index <= 0;
|
||||
end
|
||||
end
|
||||
|
||||
default: begin
|
||||
// S_PRE_READ, S_FFT_WAIT, S_OUTPUT:
|
||||
// no BRAM-write or DSP operations needed
|
||||
// S_IDLE, S_FFT_WAIT:
|
||||
// no BRAM-write, DSP, or read-address operations needed
|
||||
end
|
||||
endcase
|
||||
end
|
||||
|
||||
@@ -43,7 +43,8 @@ reg [31:0] phase_accum_reg; // Stage 1 output: registered DSP48E1 P[31:0]
|
||||
reg [31:0] phase_with_offset; // Stage 2 output: phase_accum_reg + offset
|
||||
|
||||
// Stage 3a pipeline registers: registered LUT address + quadrant
|
||||
reg [5:0] lut_index_pipe;
|
||||
reg [5:0] lut_index_pipe_sin;
|
||||
reg [5:0] lut_index_pipe_cos;
|
||||
reg [1:0] quadrant_pipe;
|
||||
|
||||
// Stage 3b pipeline registers: LUT output + quadrant
|
||||
@@ -105,8 +106,8 @@ wire [5:0] lut_index = (quadrant_w[0] ^ quadrant_w[1]) ? ~lut_address[5:0] : lut
|
||||
// These wires are driven by lut_index_pipe_sin / lut_index_pipe_cos (registered in Stage 3a), so the
|
||||
// combinational path is just: lut_index_pipe_sin/_cos register → LUT6 (distributed RAM read)
|
||||
// This eliminates the LUT3→LUT6 two-level critical path from Build 8.
|
||||
wire [15:0] sin_abs_w = sin_lut[lut_index_pipe];
|
||||
wire [15:0] cos_abs_w = sin_lut[63 - lut_index_pipe];
|
||||
wire [15:0] sin_abs_w = sin_lut[lut_index_pipe_sin];
|
||||
wire [15:0] cos_abs_w = sin_lut[63 - lut_index_pipe_cos];
|
||||
|
||||
// ============================================================================
|
||||
// Stage 1: Phase accumulator (DSP48E1) — accumulates FTW each cycle
|
||||
@@ -265,11 +266,13 @@ end
|
||||
// ============================================================================
|
||||
always @(posedge clk_400m or negedge reset_n) begin
|
||||
if (!reset_n) begin
|
||||
lut_index_pipe <= 6'b000000;
|
||||
quadrant_pipe <= 2'b00;
|
||||
lut_index_pipe_sin <= 6'b000000;
|
||||
lut_index_pipe_cos <= 6'b000000;
|
||||
quadrant_pipe <= 2'b00;
|
||||
end else if (valid_pipe[1]) begin
|
||||
lut_index_pipe <= lut_index;
|
||||
quadrant_pipe <= quadrant_w;
|
||||
lut_index_pipe_sin <= lut_index;
|
||||
lut_index_pipe_cos <= lut_index;
|
||||
quadrant_pipe <= quadrant_w;
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
Reference in New Issue
Block a user