Improve timing margins with targeted datapath register tuning

Reduce routing pressure on CIC/NCO critical paths and move Doppler BRAM read-address registers to sync-reset datapath logic so Build 13 closes with stronger setup/hold slack while preserving functional behavior.
2026-03-17 23:51:04 +02:00
parent 36ad15247c
commit 254c0e6f03
3 changed files with 58 additions and 30 deletions
@@ -481,8 +481,8 @@ reg signed [COMB_WIDTH-1:0] comb_delay [0:STAGES-1][0:COMB_DELAY-1];

 // Enhanced control and monitoring
 reg [1:0] decimation_counter;
-reg data_valid_delayed;
-reg data_valid_comb;
+(* keep = "true", max_fanout = 4 *) reg data_valid_delayed;
+(* keep = "true", max_fanout = 4 *) reg data_valid_comb;
 reg [7:0] output_counter;
 reg [ACC_WIDTH-1:0] max_integrator_value;
 reg overflow_detected;
@@ -195,8 +195,8 @@ always @(posedge clk or negedge reset_n) begin
        state <= S_IDLE;
        write_range_bin <= 0;
        write_chirp_index <= 0;
-        read_range_bin <= 0;
-        read_doppler_index <= 0;
+        // read_range_bin, read_doppler_index moved to Block 2 (sync reset)
+        // to enable BRAM address register absorption (REQP-1839 fix)
        frame_buffer_full <= 0;
        doppler_valid <= 0;
        fft_start <= 0;
@@ -251,8 +251,7 @@ always @(posedge clk or negedge reset_n) begin
                            frame_buffer_full <= 1;
                            chirp_state <= 0;
                            state <= S_PRE_READ;
-                            read_range_bin <= 0;
-                            read_doppler_index <= 0;
+                            // read_range_bin/read_doppler_index zeroed in Block 2
                            fft_sample_counter <= 0;
                            // Reset write pointers — no longer needed for
                            // this frame, and prevents stale overflow of
@@ -273,7 +272,7 @@ always @(posedge clk or negedge reset_n) begin
                // Advance read_doppler_index to 1 so the NEXT BRAM read
                // (which happens every cycle in the memory block) will
                // fetch chirp 1.
-                read_doppler_index <= 1;
+                // read_doppler_index <= 1 moved to Block 2
                fft_start <= 1;
                state <= S_LOAD_FFT;
            end
@@ -311,14 +310,13 @@ always @(posedge clk or negedge reset_n) begin
                //
                // We reuse fft_sample_counter as the sub-counter (0..32).

+                // read_doppler_index updates moved to Block 2 (sync reset)
                if (fft_sample_counter == 0) begin
                    // Sub 0: pre-multiply.  mem_rdata_i = data[chirp=0][rbin].
                    // (mult_i/mult_q computed in Block 2)
                    // Present BRAM addr for chirp 2 (sub=1 reads chirp 1
                    // from the BRAM read we triggered in S_PRE_READ;
                    // we need chirp 2 ready for sub=2).
-                    read_doppler_index <= (2 < DOPPLER_FFT_SIZE) ? 2
-                                          : DOPPLER_FFT_SIZE - 1;
                    fft_sample_counter <= 1;
                end else if (fft_sample_counter <= DOPPLER_FFT_SIZE) begin
                    // Sub 1..32
@@ -331,19 +329,12 @@ always @(posedge clk or negedge reset_n) begin
                        state <= S_FFT_WAIT;
                        fft_sample_counter <= 0;
                        processing_timeout <= 1000;
-                        // Reset read index to prevent stale OOB address
-                        // on BRAM read port during S_FFT_WAIT
-                        read_doppler_index <= 0;
                    end else begin
                        // Sub 1..31: also compute new mult from current BRAM data
                        // (mult_i/mult_q computed in Block 2)
                        // Advance BRAM read to chirp fft_sample_counter+2
                        // (so data is ready two cycles later when we need it)
                        // Clamp to DOPPLER_FFT_SIZE-1 to prevent OOB memory read
-                        if (fft_sample_counter + 2 < DOPPLER_FFT_SIZE)
-                            read_doppler_index <= fft_sample_counter + 2;
-                        else
-                            read_doppler_index <= DOPPLER_FFT_SIZE - 1;
                        fft_sample_counter <= fft_sample_counter + 1;
                    end
                end
@@ -371,8 +362,7 @@ always @(posedge clk or negedge reset_n) begin
            
            S_OUTPUT: begin
                if (read_range_bin < RANGE_BINS - 1) begin
-                    read_range_bin <= read_range_bin + 1;
-                    read_doppler_index <= 0;
+                    // read_range_bin/read_doppler_index updated in Block 2
                    fft_sample_counter <= 0;
                    state <= S_PRE_READ;
                end else begin
@@ -392,9 +382,10 @@ end
 // Uses always @(posedge clk) so Vivado can absorb multipliers
 // into DSP48 primitives and does not flag REQP-1839/1840 on
 // BRAM address registers.  Replicates the same state/condition
-// structure as Block 1 for the eight registers:
+// structure as Block 1 for the registers:
 //   mem_we, mem_waddr_r, mem_wdata_i, mem_wdata_q,
-//   mult_i, mult_q, fft_input_i, fft_input_q
+//   mult_i, mult_q, fft_input_i, fft_input_q,
+//   read_range_bin, read_doppler_index
 // ----------------------------------------------------------
 always @(posedge clk) begin
    if (!reset_n) begin
@@ -406,6 +397,8 @@ always @(posedge clk) begin
        mult_q      <= 0;
        fft_input_i <= 0;
        fft_input_q <= 0;
+        read_range_bin     <= 0;
+        read_doppler_index <= 0;
    end else begin
        mem_we <= 0;
        
@@ -429,9 +422,22 @@ always @(posedge clk) begin
                    mem_waddr_r <= mem_write_addr;
                    mem_wdata_i <= range_data[15:0];
                    mem_wdata_q <= range_data[31:16];
+
+                    // Transition to S_PRE_READ when frame complete
+                    if (write_range_bin >= RANGE_BINS - 1 &&
+                        write_chirp_index >= CHIRPS_PER_FRAME - 1) begin
+                        read_range_bin     <= 0;
+                        read_doppler_index <= 0;
+                    end
                end
            end
            
+            S_PRE_READ: begin
+                // Advance read_doppler_index to 1 so next BRAM read
+                // fetches chirp 1
+                read_doppler_index <= 1;
+            end
+
            S_LOAD_FFT: begin
                if (fft_sample_counter == 0) begin
                    // Sub 0: pre-multiply.  mem_rdata_i = data[chirp=0][rbin].
@@ -439,25 +445,44 @@ always @(posedge clk) begin
                                   $signed(window_coeff[0]);
                    mult_q <= $signed(mem_rdata_q) *
                                   $signed(window_coeff[0]);
+                    // Present BRAM addr for chirp 2
+                    read_doppler_index <= (2 < DOPPLER_FFT_SIZE) ? 2
+                                          : DOPPLER_FFT_SIZE - 1;
                end else if (fft_sample_counter <= DOPPLER_FFT_SIZE) begin
                    // Sub 1..32: capture previous mult into fft_input
                    fft_input_i <= (mult_i + (1 << 14)) >>> 15;
                    fft_input_q <= (mult_q + (1 << 14)) >>> 15;

-                    if (fft_sample_counter < DOPPLER_FFT_SIZE) begin
+                    if (fft_sample_counter == DOPPLER_FFT_SIZE) begin
+                        // Sub 32: flush — reset read index to prevent
+                        // stale OOB address on BRAM read port
+                        read_doppler_index <= 0;
+                    end else begin
                        // Sub 1..31: also compute new mult from current BRAM data
                        // mem_rdata_i = data[chirp = fft_sample_counter][rbin]
                        mult_i <= $signed(mem_rdata_i) *
                                       $signed(window_coeff[fft_sample_counter]);
                        mult_q <= $signed(mem_rdata_q) *
                                       $signed(window_coeff[fft_sample_counter]);
+                        // Advance BRAM read to chirp fft_sample_counter+2
+                        if (fft_sample_counter + 2 < DOPPLER_FFT_SIZE)
+                            read_doppler_index <= fft_sample_counter + 2;
+                        else
+                            read_doppler_index <= DOPPLER_FFT_SIZE - 1;
                    end
                end
            end

+            S_OUTPUT: begin
+                if (read_range_bin < RANGE_BINS - 1) begin
+                    read_range_bin     <= read_range_bin + 1;
+                    read_doppler_index <= 0;
+                end
+            end
+
            default: begin
-                // S_PRE_READ, S_FFT_WAIT, S_OUTPUT:
-                // no BRAM-write or DSP operations needed
+                // S_IDLE, S_FFT_WAIT:
+                // no BRAM-write, DSP, or read-address operations needed
            end
        endcase
    end
@@ -43,7 +43,8 @@ reg [31:0] phase_accum_reg;     // Stage 1 output: registered DSP48E1 P[31:0]
 reg [31:0] phase_with_offset;   // Stage 2 output: phase_accum_reg + offset

 // Stage 3a pipeline registers: registered LUT address + quadrant
-reg [5:0] lut_index_pipe;
+reg [5:0] lut_index_pipe_sin;
+reg [5:0] lut_index_pipe_cos;
 reg [1:0] quadrant_pipe;

 // Stage 3b pipeline registers: LUT output + quadrant
@@ -105,8 +106,8 @@ wire [5:0] lut_index = (quadrant_w[0] ^ quadrant_w[1]) ? ~lut_address[5:0] : lut
 // These wires are driven by lut_index_pipe (registered in Stage 3a), so the
 // combinational path is just: lut_index_pipe_reg → LUT6 (distributed RAM read)
 // This eliminates the LUT3→LUT6 two-level critical path from Build 8.
-wire [15:0] sin_abs_w = sin_lut[lut_index_pipe];
-wire [15:0] cos_abs_w = sin_lut[63 - lut_index_pipe];
+wire [15:0] sin_abs_w = sin_lut[lut_index_pipe_sin];
+wire [15:0] cos_abs_w = sin_lut[63 - lut_index_pipe_cos];

 // ============================================================================
 // Stage 1: Phase accumulator (DSP48E1) — accumulates FTW each cycle
@@ -265,11 +266,13 @@ end
 // ============================================================================
 always @(posedge clk_400m or negedge reset_n) begin
    if (!reset_n) begin
-        lut_index_pipe <= 6'b000000;
-        quadrant_pipe  <= 2'b00;
+        lut_index_pipe_sin <= 6'b000000;
+        lut_index_pipe_cos <= 6'b000000;
+        quadrant_pipe      <= 2'b00;
    end else if (valid_pipe[1]) begin
-        lut_index_pipe <= lut_index;
-        quadrant_pipe  <= quadrant_w;
+        lut_index_pipe_sin <= lut_index;
+        lut_index_pipe_cos <= lut_index;
+        quadrant_pipe      <= quadrant_w;
    end
 end