FIR DSP48 pipelining (BREG+MREG) + matched filter BRAM migration with overlap cache

FIR: Add coeff_reg/mult_reg pipeline stages to fix 68 DPIP-1 + 35 DPOP-2
DRC warnings. Valid pipeline widened 7→9 bits (+2 cycle latency).

Matched filter: Migrate input_buffer_i/q from register arrays to BRAM
(~33K FF savings). Overlap-save uses register cache captured during
ST_PROCESSING to avoid BRAM read/write conflicts during overlap copy.
New ST_OVERLAP_COPY state writes cached tail samples back sequentially.

Both changes pass 18/18 FPGA regression. Golden data regenerated for
+2 FIR latency baseline.
This commit is contained in:
Jason
2026-03-19 20:39:01 +02:00
parent 4e3c20066b
commit ed6f79c6d3
5 changed files with 4538 additions and 4395 deletions
+100 -27
View File
@@ -23,22 +23,25 @@ parameter ACCUM_WIDTH = 36;
// created a 31-deep DSP48E1 PCOUT cascade chain taking 56.6ns (WNS = -48.325ns).
//
// Solution: 5-stage pipelined binary adder tree with registered outputs at
// each level. Each stage performs at most one pairwise addition (~1.7ns DSP hop),
// easily fitting in the 10ns clock period.
// each level, plus BREG (coefficient register) and MREG (multiply output
// register) stages for DSP48E1 absorption. Each stage performs at most one
// pairwise addition (~1.7ns DSP hop), easily fitting in the 10ns clock period.
//
// Pipeline stages:
// Cycle 0: data_valid shift delay line, start multiplies (combinatorial)
// Cycle 1: Register 32 multiply results + 16 pairwise sums (level 0)
// Cycle 2: 8 pairwise sums (level 1)
// Cycle 3: 4 pairwise sums (level 2)
// Cycle 4: 2 pairwise sums (level 3)
// Cycle 5: 1 final sum accumulator_reg (level 4)
// Cycle 6: Output saturation/rounding (existing output stage)
// Cycle 0: data_valid shift delay line + latch coefficients (BREG)
// Cycle 1: Combinatorial multiply latched into mult_reg (MREG)
// Cycle 2: 16 pairwise sums of 32 multiply results (level 0)
// Cycle 3: 8 pairwise sums (level 1)
// Cycle 4: 4 pairwise sums (level 2)
// Cycle 5: 2 pairwise sums (level 3)
// Cycle 6: 1 final sum accumulator_reg (level 4)
// Cycle 7: Output saturation/rounding
//
// Total latency: 7 cycles from data_valid to data_out_valid
// Total latency: 8 cycles from data_valid to data_out_valid
// (was 7; the MREG stage adds one cycle. BREG is loaded on the data_valid
// edge itself, so it adds none: data_out_valid is registered from
// valid_pipe[6].)
// Throughput: 1 sample per cycle (fully pipelined)
// FIR runs at 100 MHz on data decimated 4:1 from 400 MHz - valid samples
// arrive every ~4 cycles, so the 7-cycle latency is transparent.
// arrive every ~4 cycles, so the 8-cycle latency is transparent.
// ============================================================================
// Filter coefficients (symmetric: coeff[k] == coeff[31-k])
@@ -62,10 +65,11 @@ reg signed [ACCUM_WIDTH-1:0] add_l3 [0:1];
// Level 4: final sum
reg signed [ACCUM_WIDTH-1:0] accumulator_reg;
// Valid pipeline: 7-stage shift register
// [0]=multiply done, [1]=L0 done, [2]=L1 done, [3]=L2 done,
// [4]=L3 done, [5]=L4/accum done, [6]=output done
reg [6:0] valid_pipe;
// Valid pipeline: 9-bit shift register (was 7 bits before BREG+MREG addition)
// [0]=BREG done, [1]=MREG done, [2]=L0 done, [3]=L1 done, [4]=L2 done,
// [5]=L3 done, [6]=L4/accum done, [7]=output done, [8]=spare
// data_out_valid is registered from valid_pipe[6], so it rises together with
// valid_pipe[7]: 8 cycles after data_valid (one more than before, from MREG).
reg [8:0] valid_pipe;
// Initialize coefficients
initial begin
@@ -80,11 +84,45 @@ initial begin
coeff[28] = 18'sh02A6; coeff[29] = 18'sh3FD87; coeff[30] = 18'sh00CE; coeff[31] = 18'sh00AD;
end
// Generate parallel multipliers (combinatorial DSP48E1 will absorb these)
// ============================================================================
// DSP48E1 PIPELINE REGISTERS (BREG + MREG)
// ============================================================================
// Vivado DRC warnings DPIP-1 (input not pipelined) and DPOP-2 (output not
// pipelined) indicate that the DSP48E1 internal BREG and MREG pipeline stages
// are not being used.
//
// Solution: Add explicit registered stages that Vivado can absorb into DSP48E1:
// BREG: coeff_reg[k] - registered copy of coeff[k], feeds the multiplier B input
// MREG: mult_reg[k] - registered multiply output (DSP48E1 M register)
//
// With these registers, Vivado sets BREG=1 and MREG=1 inside DSP48E1,
// eliminating 68 DPIP-1 + 35 DPOP-2 warnings and improving timing.
//
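// Pipeline impact: +1 cycle of latency. MREG inserts one new stage; BREG is
// loaded on the data_valid edge itself, so it adds none. Total FIR latency
// goes from 7 to 8 cycles (data_out_valid is registered from valid_pipe[6]).
// Still transparent since FIR input arrives every ~4 clocks
// (100 MHz / 4:1 CIC decimation).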
// ============================================================================
// Registered coefficients (BREG absorbed into DSP48E1 B-port register)
reg signed [COEFF_WIDTH-1:0] coeff_reg [0:TAPS-1];
// Registered multiply outputs (MREG absorbed into DSP48E1 M-register)
reg signed [DATA_WIDTH+COEFF_WIDTH-1:0] mult_reg [0:TAPS-1];
// Combinatorial multiply (between BREG and MREG)
wire signed [DATA_WIDTH+COEFF_WIDTH-1:0] mult_comb [0:TAPS-1];
genvar k;
generate
for (k = 0; k < TAPS; k = k + 1) begin : mult_gen
assign mult_result[k] = delay_line[k] * coeff[k];
assign mult_comb[k] = delay_line[k] * coeff_reg[k];
end
endgenerate
// mult_result now comes from the registered multiply output (MREG stage)
// Per-tap alias: drive mult_result[m] from the registered product mult_reg[m],
// so anything that reads mult_result sees the MREG-stage value instead of a
// combinatorial multiply.
// NOTE(review): the declaration of mult_result is outside this view; it is
// presumably a wire array of the same width as mult_reg - confirm.
genvar m;
generate
for (m = 0; m < TAPS; m = m + 1) begin : mult_alias
// Continuous assignment only; no logic is added on this path.
assign mult_result[m] = mult_reg[m];
end
endgenerate
@@ -111,17 +149,52 @@ always @(posedge clk) begin
end
end
// ============================================================================
// Pipeline Stage 0b (BREG): Register coefficients
// Runs on data_valid alongside delay_line shift.
// Vivado absorbs into DSP48E1 B-port pipeline register (BREG=1).
// ============================================================================
// BREG stage: keeps a registered copy of every coefficient in coeff_reg,
// which is the operand used by the combinatorial multiply (mult_comb).
// NOTE(review): loop variable i is declared outside this view - presumably a
// module-level integer shared with the other always blocks; confirm.
always @(posedge clk) begin
// Synchronous, active-low reset: clear all registered coefficients.
if (!reset_n) begin
for (i = 0; i < TAPS; i = i + 1) begin
coeff_reg[i] <= 0;
end
// Load enable is data_valid itself (not a delayed valid_pipe bit), so
// coeff_reg is updated on the same clock edge that accepts the sample.
end else if (data_valid) begin
for (i = 0; i < TAPS; i = i + 1) begin
coeff_reg[i] <= coeff[i];
end
end
// No final else: coeff_reg holds its value when data_valid is low.
end
// ============================================================================
// Pipeline Stage 0c (MREG): Register multiply outputs
// Captures combinatorial multiply results one cycle after BREG.
// Vivado absorbs into DSP48E1 M-register (MREG=1).
// ============================================================================
// MREG stage: registers the combinatorial products mult_comb into mult_reg.
// mult_reg is what mult_result is aliased to for the adder tree.
// NOTE(review): loop variable i is declared outside this view - presumably a
// module-level integer shared with the other always blocks; confirm.
always @(posedge clk) begin
// Synchronous, active-low reset: clear all registered products.
if (!reset_n) begin
for (i = 0; i < TAPS; i = i + 1) begin
mult_reg[i] <= 0;
end
// valid_pipe[0] is data_valid delayed by one clock, so the products are
// captured one cycle after the sample (and coeff_reg) were registered.
end else if (valid_pipe[0]) begin
for (i = 0; i < TAPS; i = i + 1) begin
mult_reg[i] <= mult_comb[i];
end
end
// No final else: mult_reg holds its value when valid_pipe[0] is low.
end
// ============================================================================
// Pipeline Stage 1 (Level 0): Register 16 pairwise sums of 32 multiply results
// Each addition is a single 36-bit add - one DSP48E1 hop (~1.7ns), fits 10ns.
// Sync reset enables DSP48E1 absorption (fixes DPOR-1 warnings)
// Now uses mult_result (from mult_reg/MREG stage) instead of combinatorial multiply.
// ============================================================================
always @(posedge clk) begin
if (!reset_n) begin
for (i = 0; i < 16; i = i + 1) begin
add_l0[i] <= 0;
end
end else if (valid_pipe[0]) begin
end else if (valid_pipe[1]) begin
for (i = 0; i < 16; i = i + 1) begin
// mult_result is (DATA_WIDTH + COEFF_WIDTH) = 36 bits = ACCUM_WIDTH,
// so no sign extension is needed. Direct assignment preserves the
@@ -141,7 +214,7 @@ always @(posedge clk) begin
for (i = 0; i < 8; i = i + 1) begin
add_l1[i] <= 0;
end
end else if (valid_pipe[1]) begin
end else if (valid_pipe[2]) begin
for (i = 0; i < 8; i = i + 1) begin
add_l1[i] <= add_l0[2*i] + add_l0[2*i+1];
end
@@ -157,7 +230,7 @@ always @(posedge clk) begin
for (i = 0; i < 4; i = i + 1) begin
add_l2[i] <= 0;
end
end else if (valid_pipe[2]) begin
end else if (valid_pipe[3]) begin
for (i = 0; i < 4; i = i + 1) begin
add_l2[i] <= add_l1[2*i] + add_l1[2*i+1];
end
@@ -172,7 +245,7 @@ always @(posedge clk) begin
if (!reset_n) begin
add_l3[0] <= 0;
add_l3[1] <= 0;
end else if (valid_pipe[3]) begin
end else if (valid_pipe[4]) begin
add_l3[0] <= add_l2[0] + add_l2[1];
add_l3[1] <= add_l2[2] + add_l2[3];
end
@@ -185,7 +258,7 @@ end
always @(posedge clk) begin
if (!reset_n) begin
accumulator_reg <= 0;
end else if (valid_pipe[4]) begin
end else if (valid_pipe[5]) begin
accumulator_reg <= add_l3[0] + add_l3[1];
end
end
@@ -199,9 +272,9 @@ always @(posedge clk) begin
data_out <= 0;
data_out_valid <= 0;
end else begin
data_out_valid <= valid_pipe[5];
data_out_valid <= valid_pipe[6];
if (valid_pipe[5]) begin
if (valid_pipe[6]) begin
// Output saturation logic
if (accumulator_reg > (2**(ACCUM_WIDTH-2)-1)) begin
data_out <= (2**(DATA_WIDTH-1))-1;
@@ -216,14 +289,14 @@ always @(posedge clk) begin
end
// ============================================================================
// Valid pipeline shift register
// Valid pipeline shift register (9 bits: BREG + MREG + 5-level adder + output + 1 spare)
// Sync reset: no DSP48 involvement, but keeps reset style consistent with datapath
// ============================================================================
always @(posedge clk) begin
if (!reset_n) begin
valid_pipe <= 7'b0000000;
valid_pipe <= 9'b000000000;
end else begin
valid_pipe <= {valid_pipe[5:0], data_valid};
valid_pipe <= {valid_pipe[7:0], data_valid};
end
end