From 254c0e6f0373df3f21564135afb513266fcbc587 Mon Sep 17 00:00:00 2001 From: Jason <83615043+JJassonn69@users.noreply.github.com> Date: Tue, 17 Mar 2026 23:51:04 +0200 Subject: [PATCH] Improve timing margins with targeted datapath register tuning Reduce routing pressure on CIC/NCO critical paths and move Doppler BRAM read-address registers to sync-reset datapath logic so Build 13 closes with stronger setup/hold slack while preserving functional behavior. --- .../9_2_FPGA/cic_decimator_4x_enhanced.v | 4 +- 9_Firmware/9_2_FPGA/doppler_processor.v | 67 +++++++++++++------ 9_Firmware/9_2_FPGA/nco_400m_enhanced.v | 17 +++-- 3 files changed, 58 insertions(+), 30 deletions(-) diff --git a/9_Firmware/9_2_FPGA/cic_decimator_4x_enhanced.v b/9_Firmware/9_2_FPGA/cic_decimator_4x_enhanced.v index 6300fe0..c53a828 100644 --- a/9_Firmware/9_2_FPGA/cic_decimator_4x_enhanced.v +++ b/9_Firmware/9_2_FPGA/cic_decimator_4x_enhanced.v @@ -481,8 +481,8 @@ reg signed [COMB_WIDTH-1:0] comb_delay [0:STAGES-1][0:COMB_DELAY-1]; // Enhanced control and monitoring reg [1:0] decimation_counter; -reg data_valid_delayed; -reg data_valid_comb; +(* keep = "true", max_fanout = 4 *) reg data_valid_delayed; +(* keep = "true", max_fanout = 4 *) reg data_valid_comb; reg [7:0] output_counter; reg [ACC_WIDTH-1:0] max_integrator_value; reg overflow_detected; diff --git a/9_Firmware/9_2_FPGA/doppler_processor.v b/9_Firmware/9_2_FPGA/doppler_processor.v index 097579b..23e4e1c 100644 --- a/9_Firmware/9_2_FPGA/doppler_processor.v +++ b/9_Firmware/9_2_FPGA/doppler_processor.v @@ -195,8 +195,8 @@ always @(posedge clk or negedge reset_n) begin state <= S_IDLE; write_range_bin <= 0; write_chirp_index <= 0; - read_range_bin <= 0; - read_doppler_index <= 0; + // read_range_bin, read_doppler_index moved to Block 2 (sync reset) + // to enable BRAM address register absorption (REQP-1839 fix) frame_buffer_full <= 0; doppler_valid <= 0; fft_start <= 0; @@ -251,8 +251,7 @@ always @(posedge clk or negedge reset_n) begin frame_buffer_full <= 1; chirp_state <= 0; state <= S_PRE_READ; - read_range_bin <= 0; - read_doppler_index <= 0; + // read_range_bin/read_doppler_index zeroed in Block 2 fft_sample_counter <= 0; // Reset write pointers — no longer needed for // this frame, and prevents stale overflow of @@ -273,7 +272,7 @@ always @(posedge clk or negedge reset_n) begin // Advance read_doppler_index to 1 so the NEXT BRAM read // (which happens every cycle in the memory block) will // fetch chirp 1. - read_doppler_index <= 1; + // read_doppler_index <= 1 moved to Block 2 fft_start <= 1; state <= S_LOAD_FFT; end @@ -311,14 +310,13 @@ always @(posedge clk or negedge reset_n) begin // // We reuse fft_sample_counter as the sub-counter (0..32). + // read_doppler_index updates moved to Block 2 (sync reset) if (fft_sample_counter == 0) begin // Sub 0: pre-multiply. mem_rdata_i = data[chirp=0][rbin]. // (mult_i/mult_q computed in Block 2) // Present BRAM addr for chirp 2 (sub=1 reads chirp 1 // from the BRAM read we triggered in S_PRE_READ; // we need chirp 2 ready for sub=2). - read_doppler_index <= (2 < DOPPLER_FFT_SIZE) ? 2 - : DOPPLER_FFT_SIZE - 1; fft_sample_counter <= 1; end else if (fft_sample_counter <= DOPPLER_FFT_SIZE) begin // Sub 1..32 @@ -331,19 +329,12 @@ always @(posedge clk or negedge reset_n) begin state <= S_FFT_WAIT; fft_sample_counter <= 0; processing_timeout <= 1000; - // Reset read index to prevent stale OOB address - // on BRAM read port during S_FFT_WAIT - read_doppler_index <= 0; end else begin // Sub 1..31: also compute new mult from current BRAM data // (mult_i/mult_q computed in Block 2) // Advance BRAM read to chirp fft_sample_counter+2 // (so data is ready two cycles later when we need it) // Clamp to DOPPLER_FFT_SIZE-1 to prevent OOB memory read - if (fft_sample_counter + 2 < DOPPLER_FFT_SIZE) - read_doppler_index <= fft_sample_counter + 2; - else - read_doppler_index <= DOPPLER_FFT_SIZE - 1; fft_sample_counter <= fft_sample_counter + 1; end end @@ -371,8 +362,7 @@ always @(posedge clk or negedge reset_n) begin S_OUTPUT: begin if (read_range_bin < RANGE_BINS - 1) begin - read_range_bin <= read_range_bin + 1; - read_doppler_index <= 0; + // read_range_bin/read_doppler_index updated in Block 2 fft_sample_counter <= 0; state <= S_PRE_READ; end else begin @@ -392,9 +382,10 @@ end // Uses always @(posedge clk) so Vivado can absorb multipliers // into DSP48 primitives and does not flag REQP-1839/1840 on // BRAM address registers. Replicates the same state/condition -// structure as Block 1 for the eight registers: +// structure as Block 1 for the registers: // mem_we, mem_waddr_r, mem_wdata_i, mem_wdata_q, -// mult_i, mult_q, fft_input_i, fft_input_q +// mult_i, mult_q, fft_input_i, fft_input_q, +// read_range_bin, read_doppler_index // ---------------------------------------------------------- always @(posedge clk) begin if (!reset_n) begin @@ -406,6 +397,8 @@ always @(posedge clk) begin mult_q <= 0; fft_input_i <= 0; fft_input_q <= 0; + read_range_bin <= 0; + read_doppler_index <= 0; end else begin mem_we <= 0; @@ -429,9 +422,22 @@ always @(posedge clk) begin mem_waddr_r <= mem_write_addr; mem_wdata_i <= range_data[15:0]; mem_wdata_q <= range_data[31:16]; + + // Transition to S_PRE_READ when frame complete + if (write_range_bin >= RANGE_BINS - 1 && + write_chirp_index >= CHIRPS_PER_FRAME - 1) begin + read_range_bin <= 0; + read_doppler_index <= 0; + end end end + S_PRE_READ: begin + // Advance read_doppler_index to 1 so next BRAM read + // fetches chirp 1 + read_doppler_index <= 1; + end + S_LOAD_FFT: begin if (fft_sample_counter == 0) begin // Sub 0: pre-multiply. mem_rdata_i = data[chirp=0][rbin]. @@ -439,25 +445,44 @@ always @(posedge clk) begin $signed(window_coeff[0]); mult_q <= $signed(mem_rdata_q) * $signed(window_coeff[0]); + // Present BRAM addr for chirp 2 + read_doppler_index <= (2 < DOPPLER_FFT_SIZE) ? 2 + : DOPPLER_FFT_SIZE - 1; end else if (fft_sample_counter <= DOPPLER_FFT_SIZE) begin // Sub 1..32: capture previous mult into fft_input fft_input_i <= (mult_i + (1 << 14)) >>> 15; fft_input_q <= (mult_q + (1 << 14)) >>> 15; - if (fft_sample_counter < DOPPLER_FFT_SIZE) begin + if (fft_sample_counter == DOPPLER_FFT_SIZE) begin + // Sub 32: flush — reset read index to prevent + // stale OOB address on BRAM read port + read_doppler_index <= 0; + end else begin // Sub 1..31: also compute new mult from current BRAM data // mem_rdata_i = data[chirp = fft_sample_counter][rbin] mult_i <= $signed(mem_rdata_i) * $signed(window_coeff[fft_sample_counter]); mult_q <= $signed(mem_rdata_q) * $signed(window_coeff[fft_sample_counter]); + // Advance BRAM read to chirp fft_sample_counter+2 + if (fft_sample_counter + 2 < DOPPLER_FFT_SIZE) + read_doppler_index <= fft_sample_counter + 2; + else + read_doppler_index <= DOPPLER_FFT_SIZE - 1; end end end + S_OUTPUT: begin + if (read_range_bin < RANGE_BINS - 1) begin + read_range_bin <= read_range_bin + 1; + read_doppler_index <= 0; + end + end + default: begin - // S_PRE_READ, S_FFT_WAIT, S_OUTPUT: - // no BRAM-write or DSP operations needed + // S_IDLE, S_FFT_WAIT: + // no BRAM-write, DSP, or read-address operations needed end endcase end diff --git a/9_Firmware/9_2_FPGA/nco_400m_enhanced.v b/9_Firmware/9_2_FPGA/nco_400m_enhanced.v index 7707a9d..52e18ec 100644 --- a/9_Firmware/9_2_FPGA/nco_400m_enhanced.v +++ b/9_Firmware/9_2_FPGA/nco_400m_enhanced.v @@ -43,7 +43,8 @@ reg [31:0] phase_accum_reg; // Stage 1 output: registered DSP48E1 P[31:0] reg [31:0] phase_with_offset; // Stage 2 output: phase_accum_reg + offset // Stage 3a pipeline registers: registered LUT address + quadrant -reg [5:0] lut_index_pipe; +reg [5:0] lut_index_pipe_sin; +reg [5:0] lut_index_pipe_cos; reg [1:0] quadrant_pipe; // Stage 3b pipeline registers: LUT output + quadrant @@ -105,8 +106,8 @@ wire [5:0] lut_index = (quadrant_w[0] ^ quadrant_w[1]) ? ~lut_address[5:0] : lut // These wires are driven by lut_index_pipe (registered in Stage 3a), so the // combinational path is just: lut_index_pipe_reg → LUT6 (distributed RAM read) // This eliminates the LUT3→LUT6 two-level critical path from Build 8. -wire [15:0] sin_abs_w = sin_lut[lut_index_pipe]; -wire [15:0] cos_abs_w = sin_lut[63 - lut_index_pipe]; +wire [15:0] sin_abs_w = sin_lut[lut_index_pipe_sin]; +wire [15:0] cos_abs_w = sin_lut[63 - lut_index_pipe_cos]; // ============================================================================ // Stage 1: Phase accumulator (DSP48E1) — accumulates FTW each cycle @@ -265,11 +266,13 @@ end // ============================================================================ always @(posedge clk_400m or negedge reset_n) begin if (!reset_n) begin - lut_index_pipe <= 6'b000000; - quadrant_pipe <= 2'b00; + lut_index_pipe_sin <= 6'b000000; + lut_index_pipe_cos <= 6'b000000; + quadrant_pipe <= 2'b00; end else if (valid_pipe[1]) begin - lut_index_pipe <= lut_index; - quadrant_pipe <= quadrant_w; + lut_index_pipe_sin <= lut_index; + lut_index_pipe_cos <= lut_index; + quadrant_pipe <= quadrant_w; end end