Improve timing margins with targeted datapath register tuning
Reduce routing pressure on CIC/NCO critical paths and move Doppler BRAM read-address registers to sync-reset datapath logic so Build 13 closes with stronger setup/hold slack while preserving functional behavior.
This commit is contained in:
@@ -481,8 +481,8 @@ reg signed [COMB_WIDTH-1:0] comb_delay [0:STAGES-1][0:COMB_DELAY-1];
|
|||||||
|
|
||||||
// Enhanced control and monitoring
|
// Enhanced control and monitoring
|
||||||
reg [1:0] decimation_counter;
|
reg [1:0] decimation_counter;
|
||||||
reg data_valid_delayed;
|
(* keep = "true", max_fanout = 4 *) reg data_valid_delayed;
|
||||||
reg data_valid_comb;
|
(* keep = "true", max_fanout = 4 *) reg data_valid_comb;
|
||||||
reg [7:0] output_counter;
|
reg [7:0] output_counter;
|
||||||
reg [ACC_WIDTH-1:0] max_integrator_value;
|
reg [ACC_WIDTH-1:0] max_integrator_value;
|
||||||
reg overflow_detected;
|
reg overflow_detected;
|
||||||
|
|||||||
@@ -195,8 +195,8 @@ always @(posedge clk or negedge reset_n) begin
|
|||||||
state <= S_IDLE;
|
state <= S_IDLE;
|
||||||
write_range_bin <= 0;
|
write_range_bin <= 0;
|
||||||
write_chirp_index <= 0;
|
write_chirp_index <= 0;
|
||||||
read_range_bin <= 0;
|
// read_range_bin, read_doppler_index moved to Block 2 (sync reset)
|
||||||
read_doppler_index <= 0;
|
// to enable BRAM address register absorption (REQP-1839 fix)
|
||||||
frame_buffer_full <= 0;
|
frame_buffer_full <= 0;
|
||||||
doppler_valid <= 0;
|
doppler_valid <= 0;
|
||||||
fft_start <= 0;
|
fft_start <= 0;
|
||||||
@@ -251,8 +251,7 @@ always @(posedge clk or negedge reset_n) begin
|
|||||||
frame_buffer_full <= 1;
|
frame_buffer_full <= 1;
|
||||||
chirp_state <= 0;
|
chirp_state <= 0;
|
||||||
state <= S_PRE_READ;
|
state <= S_PRE_READ;
|
||||||
read_range_bin <= 0;
|
// read_range_bin/read_doppler_index zeroed in Block 2
|
||||||
read_doppler_index <= 0;
|
|
||||||
fft_sample_counter <= 0;
|
fft_sample_counter <= 0;
|
||||||
// Reset write pointers — no longer needed for
|
// Reset write pointers — no longer needed for
|
||||||
// this frame, and prevents stale overflow of
|
// this frame, and prevents stale overflow of
|
||||||
@@ -273,7 +272,7 @@ always @(posedge clk or negedge reset_n) begin
|
|||||||
// Advance read_doppler_index to 1 so the NEXT BRAM read
|
// Advance read_doppler_index to 1 so the NEXT BRAM read
|
||||||
// (which happens every cycle in the memory block) will
|
// (which happens every cycle in the memory block) will
|
||||||
// fetch chirp 1.
|
// fetch chirp 1.
|
||||||
read_doppler_index <= 1;
|
// read_doppler_index <= 1 moved to Block 2
|
||||||
fft_start <= 1;
|
fft_start <= 1;
|
||||||
state <= S_LOAD_FFT;
|
state <= S_LOAD_FFT;
|
||||||
end
|
end
|
||||||
@@ -311,14 +310,13 @@ always @(posedge clk or negedge reset_n) begin
|
|||||||
//
|
//
|
||||||
// We reuse fft_sample_counter as the sub-counter (0..32).
|
// We reuse fft_sample_counter as the sub-counter (0..32).
|
||||||
|
|
||||||
|
// read_doppler_index updates moved to Block 2 (sync reset)
|
||||||
if (fft_sample_counter == 0) begin
|
if (fft_sample_counter == 0) begin
|
||||||
// Sub 0: pre-multiply. mem_rdata_i = data[chirp=0][rbin].
|
// Sub 0: pre-multiply. mem_rdata_i = data[chirp=0][rbin].
|
||||||
// (mult_i/mult_q computed in Block 2)
|
// (mult_i/mult_q computed in Block 2)
|
||||||
// Present BRAM addr for chirp 2 (sub=1 reads chirp 1
|
// Present BRAM addr for chirp 2 (sub=1 reads chirp 1
|
||||||
// from the BRAM read we triggered in S_PRE_READ;
|
// from the BRAM read we triggered in S_PRE_READ;
|
||||||
// we need chirp 2 ready for sub=2).
|
// we need chirp 2 ready for sub=2).
|
||||||
read_doppler_index <= (2 < DOPPLER_FFT_SIZE) ? 2
|
|
||||||
: DOPPLER_FFT_SIZE - 1;
|
|
||||||
fft_sample_counter <= 1;
|
fft_sample_counter <= 1;
|
||||||
end else if (fft_sample_counter <= DOPPLER_FFT_SIZE) begin
|
end else if (fft_sample_counter <= DOPPLER_FFT_SIZE) begin
|
||||||
// Sub 1..32
|
// Sub 1..32
|
||||||
@@ -331,19 +329,12 @@ always @(posedge clk or negedge reset_n) begin
|
|||||||
state <= S_FFT_WAIT;
|
state <= S_FFT_WAIT;
|
||||||
fft_sample_counter <= 0;
|
fft_sample_counter <= 0;
|
||||||
processing_timeout <= 1000;
|
processing_timeout <= 1000;
|
||||||
// Reset read index to prevent stale OOB address
|
|
||||||
// on BRAM read port during S_FFT_WAIT
|
|
||||||
read_doppler_index <= 0;
|
|
||||||
end else begin
|
end else begin
|
||||||
// Sub 1..31: also compute new mult from current BRAM data
|
// Sub 1..31: also compute new mult from current BRAM data
|
||||||
// (mult_i/mult_q computed in Block 2)
|
// (mult_i/mult_q computed in Block 2)
|
||||||
// Advance BRAM read to chirp fft_sample_counter+2
|
// Advance BRAM read to chirp fft_sample_counter+2
|
||||||
// (so data is ready two cycles later when we need it)
|
// (so data is ready two cycles later when we need it)
|
||||||
// Clamp to DOPPLER_FFT_SIZE-1 to prevent OOB memory read
|
// Clamp to DOPPLER_FFT_SIZE-1 to prevent OOB memory read
|
||||||
if (fft_sample_counter + 2 < DOPPLER_FFT_SIZE)
|
|
||||||
read_doppler_index <= fft_sample_counter + 2;
|
|
||||||
else
|
|
||||||
read_doppler_index <= DOPPLER_FFT_SIZE - 1;
|
|
||||||
fft_sample_counter <= fft_sample_counter + 1;
|
fft_sample_counter <= fft_sample_counter + 1;
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
@@ -371,8 +362,7 @@ always @(posedge clk or negedge reset_n) begin
|
|||||||
|
|
||||||
S_OUTPUT: begin
|
S_OUTPUT: begin
|
||||||
if (read_range_bin < RANGE_BINS - 1) begin
|
if (read_range_bin < RANGE_BINS - 1) begin
|
||||||
read_range_bin <= read_range_bin + 1;
|
// read_range_bin/read_doppler_index updated in Block 2
|
||||||
read_doppler_index <= 0;
|
|
||||||
fft_sample_counter <= 0;
|
fft_sample_counter <= 0;
|
||||||
state <= S_PRE_READ;
|
state <= S_PRE_READ;
|
||||||
end else begin
|
end else begin
|
||||||
@@ -392,9 +382,10 @@ end
|
|||||||
// Uses always @(posedge clk) so Vivado can absorb multipliers
|
// Uses always @(posedge clk) so Vivado can absorb multipliers
|
||||||
// into DSP48 primitives and does not flag REQP-1839/1840 on
|
// into DSP48 primitives and does not flag REQP-1839/1840 on
|
||||||
// BRAM address registers. Replicates the same state/condition
|
// BRAM address registers. Replicates the same state/condition
|
||||||
// structure as Block 1 for the eight registers:
|
// structure as Block 1 for the registers:
|
||||||
// mem_we, mem_waddr_r, mem_wdata_i, mem_wdata_q,
|
// mem_we, mem_waddr_r, mem_wdata_i, mem_wdata_q,
|
||||||
// mult_i, mult_q, fft_input_i, fft_input_q
|
// mult_i, mult_q, fft_input_i, fft_input_q,
|
||||||
|
// read_range_bin, read_doppler_index
|
||||||
// ----------------------------------------------------------
|
// ----------------------------------------------------------
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (!reset_n) begin
|
if (!reset_n) begin
|
||||||
@@ -406,6 +397,8 @@ always @(posedge clk) begin
|
|||||||
mult_q <= 0;
|
mult_q <= 0;
|
||||||
fft_input_i <= 0;
|
fft_input_i <= 0;
|
||||||
fft_input_q <= 0;
|
fft_input_q <= 0;
|
||||||
|
read_range_bin <= 0;
|
||||||
|
read_doppler_index <= 0;
|
||||||
end else begin
|
end else begin
|
||||||
mem_we <= 0;
|
mem_we <= 0;
|
||||||
|
|
||||||
@@ -429,8 +422,21 @@ always @(posedge clk) begin
|
|||||||
mem_waddr_r <= mem_write_addr;
|
mem_waddr_r <= mem_write_addr;
|
||||||
mem_wdata_i <= range_data[15:0];
|
mem_wdata_i <= range_data[15:0];
|
||||||
mem_wdata_q <= range_data[31:16];
|
mem_wdata_q <= range_data[31:16];
|
||||||
|
|
||||||
|
// Transition to S_PRE_READ when frame complete
|
||||||
|
if (write_range_bin >= RANGE_BINS - 1 &&
|
||||||
|
write_chirp_index >= CHIRPS_PER_FRAME - 1) begin
|
||||||
|
read_range_bin <= 0;
|
||||||
|
read_doppler_index <= 0;
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
S_PRE_READ: begin
|
||||||
|
// Advance read_doppler_index to 1 so next BRAM read
|
||||||
|
// fetches chirp 1
|
||||||
|
read_doppler_index <= 1;
|
||||||
|
end
|
||||||
|
|
||||||
S_LOAD_FFT: begin
|
S_LOAD_FFT: begin
|
||||||
if (fft_sample_counter == 0) begin
|
if (fft_sample_counter == 0) begin
|
||||||
@@ -439,25 +445,44 @@ always @(posedge clk) begin
|
|||||||
$signed(window_coeff[0]);
|
$signed(window_coeff[0]);
|
||||||
mult_q <= $signed(mem_rdata_q) *
|
mult_q <= $signed(mem_rdata_q) *
|
||||||
$signed(window_coeff[0]);
|
$signed(window_coeff[0]);
|
||||||
|
// Present BRAM addr for chirp 2
|
||||||
|
read_doppler_index <= (2 < DOPPLER_FFT_SIZE) ? 2
|
||||||
|
: DOPPLER_FFT_SIZE - 1;
|
||||||
end else if (fft_sample_counter <= DOPPLER_FFT_SIZE) begin
|
end else if (fft_sample_counter <= DOPPLER_FFT_SIZE) begin
|
||||||
// Sub 1..32: capture previous mult into fft_input
|
// Sub 1..32: capture previous mult into fft_input
|
||||||
fft_input_i <= (mult_i + (1 << 14)) >>> 15;
|
fft_input_i <= (mult_i + (1 << 14)) >>> 15;
|
||||||
fft_input_q <= (mult_q + (1 << 14)) >>> 15;
|
fft_input_q <= (mult_q + (1 << 14)) >>> 15;
|
||||||
|
|
||||||
if (fft_sample_counter < DOPPLER_FFT_SIZE) begin
|
if (fft_sample_counter == DOPPLER_FFT_SIZE) begin
|
||||||
|
// Sub 32: flush — reset read index to prevent
|
||||||
|
// stale OOB address on BRAM read port
|
||||||
|
read_doppler_index <= 0;
|
||||||
|
end else begin
|
||||||
// Sub 1..31: also compute new mult from current BRAM data
|
// Sub 1..31: also compute new mult from current BRAM data
|
||||||
// mem_rdata_i = data[chirp = fft_sample_counter][rbin]
|
// mem_rdata_i = data[chirp = fft_sample_counter][rbin]
|
||||||
mult_i <= $signed(mem_rdata_i) *
|
mult_i <= $signed(mem_rdata_i) *
|
||||||
$signed(window_coeff[fft_sample_counter]);
|
$signed(window_coeff[fft_sample_counter]);
|
||||||
mult_q <= $signed(mem_rdata_q) *
|
mult_q <= $signed(mem_rdata_q) *
|
||||||
$signed(window_coeff[fft_sample_counter]);
|
$signed(window_coeff[fft_sample_counter]);
|
||||||
|
// Advance BRAM read to chirp fft_sample_counter+2
|
||||||
|
if (fft_sample_counter + 2 < DOPPLER_FFT_SIZE)
|
||||||
|
read_doppler_index <= fft_sample_counter + 2;
|
||||||
|
else
|
||||||
|
read_doppler_index <= DOPPLER_FFT_SIZE - 1;
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
S_OUTPUT: begin
|
||||||
|
if (read_range_bin < RANGE_BINS - 1) begin
|
||||||
|
read_range_bin <= read_range_bin + 1;
|
||||||
|
read_doppler_index <= 0;
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
default: begin
|
default: begin
|
||||||
// S_PRE_READ, S_FFT_WAIT, S_OUTPUT:
|
// S_IDLE, S_FFT_WAIT:
|
||||||
// no BRAM-write or DSP operations needed
|
// no BRAM-write, DSP, or read-address operations needed
|
||||||
end
|
end
|
||||||
endcase
|
endcase
|
||||||
end
|
end
|
||||||
|
|||||||
@@ -43,7 +43,8 @@ reg [31:0] phase_accum_reg; // Stage 1 output: registered DSP48E1 P[31:0]
|
|||||||
reg [31:0] phase_with_offset; // Stage 2 output: phase_accum_reg + offset
|
reg [31:0] phase_with_offset; // Stage 2 output: phase_accum_reg + offset
|
||||||
|
|
||||||
// Stage 3a pipeline registers: registered LUT address + quadrant
|
// Stage 3a pipeline registers: registered LUT address + quadrant
|
||||||
reg [5:0] lut_index_pipe;
|
reg [5:0] lut_index_pipe_sin;
|
||||||
|
reg [5:0] lut_index_pipe_cos;
|
||||||
reg [1:0] quadrant_pipe;
|
reg [1:0] quadrant_pipe;
|
||||||
|
|
||||||
// Stage 3b pipeline registers: LUT output + quadrant
|
// Stage 3b pipeline registers: LUT output + quadrant
|
||||||
@@ -105,8 +106,8 @@ wire [5:0] lut_index = (quadrant_w[0] ^ quadrant_w[1]) ? ~lut_address[5:0] : lut
|
|||||||
// These wires are driven by lut_index_pipe (registered in Stage 3a), so the
|
// These wires are driven by lut_index_pipe_sin / lut_index_pipe_cos (both registered in Stage 3a), so the
|
||||||
// combinational path is just: lut_index_pipe_reg → LUT6 (distributed RAM read)
|
// combinational path is just: lut_index_pipe_sin/_cos register → LUT6 (distributed RAM read)
|
||||||
// This eliminates the LUT3→LUT6 two-level critical path from Build 8.
|
// This eliminates the LUT3→LUT6 two-level critical path from Build 8.
|
||||||
wire [15:0] sin_abs_w = sin_lut[lut_index_pipe];
|
wire [15:0] sin_abs_w = sin_lut[lut_index_pipe_sin];
|
||||||
wire [15:0] cos_abs_w = sin_lut[63 - lut_index_pipe];
|
wire [15:0] cos_abs_w = sin_lut[63 - lut_index_pipe_cos];
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
// Stage 1: Phase accumulator (DSP48E1) — accumulates FTW each cycle
|
// Stage 1: Phase accumulator (DSP48E1) — accumulates FTW each cycle
|
||||||
@@ -265,10 +266,12 @@ end
|
|||||||
// ============================================================================
|
// ============================================================================
|
||||||
always @(posedge clk_400m or negedge reset_n) begin
|
always @(posedge clk_400m or negedge reset_n) begin
|
||||||
if (!reset_n) begin
|
if (!reset_n) begin
|
||||||
lut_index_pipe <= 6'b000000;
|
lut_index_pipe_sin <= 6'b000000;
|
||||||
|
lut_index_pipe_cos <= 6'b000000;
|
||||||
quadrant_pipe <= 2'b00;
|
quadrant_pipe <= 2'b00;
|
||||||
end else if (valid_pipe[1]) begin
|
end else if (valid_pipe[1]) begin
|
||||||
lut_index_pipe <= lut_index;
|
lut_index_pipe_sin <= lut_index;
|
||||||
|
lut_index_pipe_cos <= lut_index;
|
||||||
quadrant_pipe <= quadrant_w;
|
quadrant_pipe <= quadrant_w;
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|||||||
Reference in New Issue
Block a user