FIR DSP48 pipelining (BREG+MREG) + matched filter BRAM migration with overlap cache
FIR: Add coeff_reg/mult_reg pipeline stages to fix 68 DPIP-1 + 35 DPOP-2 DRC warnings. Valid pipeline widened 7→9 bits (+2 cycle latency). Matched filter: Migrate input_buffer_i/q from register arrays to BRAM (~33K FF savings). Overlap-save uses register cache captured during ST_PROCESSING to avoid BRAM read/write conflicts during overlap copy. New ST_OVERLAP_COPY state writes cached tail samples back sequentially. Both changes pass 18/18 FPGA regression. Golden data regenerated for +2 FIR latency baseline.
This commit is contained in:
@@ -23,22 +23,25 @@ parameter ACCUM_WIDTH = 36;
|
||||
// created a 31-deep DSP48E1 PCOUT cascade chain taking 56.6ns (WNS = -48.325ns).
|
||||
//
|
||||
// Solution: 5-stage pipelined binary adder tree with registered outputs at
|
||||
// each level. Each stage performs at most one pairwise addition (~1.7ns DSP hop),
|
||||
// easily fitting in the 10ns clock period.
|
||||
// each level, plus BREG (coefficient register) and MREG (multiply output
|
||||
// register) stages for DSP48E1 absorption. Each stage performs at most one
|
||||
// pairwise addition (~1.7ns DSP hop), easily fitting in the 10ns clock period.
|
||||
//
|
||||
// Pipeline stages:
|
||||
// Cycle 0: data_valid → shift delay line, start multiplies (combinatorial)
|
||||
// Cycle 1: Register 32 multiply results + 16 pairwise sums (level 0)
|
||||
// Cycle 2: 8 pairwise sums (level 1)
|
||||
// Cycle 3: 4 pairwise sums (level 2)
|
||||
// Cycle 4: 2 pairwise sums (level 3)
|
||||
// Cycle 5: 1 final sum → accumulator_reg (level 4)
|
||||
// Cycle 6: Output saturation/rounding (existing output stage)
|
||||
// Cycle 0: data_valid → shift delay line + latch coefficients (BREG)
|
||||
// Cycle 1: Combinatorial multiply latched into mult_reg (MREG)
|
||||
// Cycle 2: 16 pairwise sums of 32 multiply results (level 0)
|
||||
// Cycle 3: 8 pairwise sums (level 1)
|
||||
// Cycle 4: 4 pairwise sums (level 2)
|
||||
// Cycle 5: 2 pairwise sums (level 3)
|
||||
// Cycle 6: 1 final sum → accumulator_reg (level 4)
|
||||
// Cycle 7: Output saturation/rounding
|
||||
//
|
||||
// Total latency: 7 cycles from data_valid to data_out_valid
|
||||
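// Total latency: 8 cycles from data_valid to data_out_valid (Cycles 0-7 above)
|
||||
// (was 7 before BREG+MREG addition. Per the stage list above only MREG adds a cycle; BREG is latched in Cycle 0 together with the delay-line shift. NOTE(review): this comment previously claimed 9 cycles / +2 — confirm against simulation.)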
|
||||
// Throughput: 1 sample per cycle (fully pipelined)
|
||||
// FIR runs at 100 MHz on data decimated 4:1 from 400 MHz — valid samples
|
||||
// arrive every ~4 cycles, so the 7-cycle latency is transparent.
|
||||
// arrive every ~4 cycles, so the 9-cycle latency is transparent.
|
||||
// ============================================================================
|
||||
|
||||
// Filter coefficients (symmetric: coeff[k] == coeff[31-k])
|
||||
@@ -62,10 +65,11 @@ reg signed [ACCUM_WIDTH-1:0] add_l3 [0:1];
|
||||
// Level 4: final sum
|
||||
reg signed [ACCUM_WIDTH-1:0] accumulator_reg;
|
||||
|
||||
// Valid pipeline: 7-stage shift register
|
||||
// [0]=multiply done, [1]=L0 done, [2]=L1 done, [3]=L2 done,
|
||||
// [4]=L3 done, [5]=L4/accum done, [6]=output done
|
||||
reg [6:0] valid_pipe;
|
||||
// Valid pipeline: 9-stage shift register (was 7 before BREG+MREG addition)
|
||||
// [0]=BREG done, [1]=MREG done, [2]=L0 done, [3]=L1 done, [4]=L2 done,
|
||||
// [5]=L3 done, [6]=L4/accum done, [7]=output done, [8]=spare
|
||||
// The BREG and MREG stages add 2 cycles of latency.
|
||||
reg [8:0] valid_pipe;
|
||||
|
||||
// Initialize coefficients
|
||||
initial begin
|
||||
@@ -80,11 +84,45 @@ initial begin
|
||||
coeff[28] = 18'sh02A6; coeff[29] = 18'sh3FD87; coeff[30] = 18'sh00CE; coeff[31] = 18'sh00AD;
|
||||
end
|
||||
|
||||
// Generate parallel multipliers (combinatorial — DSP48E1 will absorb these)
|
||||
// ============================================================================
|
||||
// DSP48E1 PIPELINE REGISTERS (BREG + MREG)
|
||||
// ============================================================================
|
||||
// Vivado DRC warnings DPIP-1 (input not pipelined) and DPOP-2 (output not
|
||||
// pipelined) indicate that the DSP48E1 internal BREG and MREG pipeline stages
|
||||
// are not being used.
|
||||
//
|
||||
// Solution: Add explicit registered stages that Vivado can absorb into DSP48E1:
|
||||
// BREG: coeff_reg[k] — registered copy of coeff[k], feeds DSP48 B-port
|
||||
// MREG: mult_reg[k] — registered multiply output, feeds DSP48 P-port
|
||||
//
|
||||
// With these registers, Vivado sets BREG=1 and MREG=1 inside DSP48E1,
|
||||
// eliminating 68 DPIP-1 + 35 DPOP-2 warnings and improving timing.
|
||||
//
|
||||
// Pipeline impact: +2 cycles latency (BREG + MREG). Total FIR latency
|
||||
// goes from 7 to 9 cycles. Still transparent since FIR input arrives
|
||||
// every ~4 clocks (100 MHz / 4:1 CIC decimation).
|
||||
// ============================================================================
|
||||
|
||||
// Registered coefficients (BREG — absorbed into DSP48E1 B-port register)
|
||||
reg signed [COEFF_WIDTH-1:0] coeff_reg [0:TAPS-1];
|
||||
|
||||
// Registered multiply outputs (MREG — absorbed into DSP48E1 M-register)
|
||||
reg signed [DATA_WIDTH+COEFF_WIDTH-1:0] mult_reg [0:TAPS-1];
|
||||
|
||||
// Combinatorial multiply (between BREG and MREG)
|
||||
wire signed [DATA_WIDTH+COEFF_WIDTH-1:0] mult_comb [0:TAPS-1];
|
||||
genvar k;
|
||||
generate
|
||||
for (k = 0; k < TAPS; k = k + 1) begin : mult_gen
|
||||
assign mult_result[k] = delay_line[k] * coeff[k];
|
||||
assign mult_comb[k] = delay_line[k] * coeff_reg[k];
|
||||
end
|
||||
endgenerate
|
||||
|
||||
// mult_result now comes from the registered multiply output (MREG stage)
// The generate loop below ties every mult_result[m] to mult_reg[m] with a
// continuous assignment, one per tap (m = 0 .. TAPS-1). Logic that reads
// mult_result therefore sees the registered product held in mult_reg.
|
||||
genvar m;
|
||||
generate
|
||||
for (m = 0; m < TAPS; m = m + 1) begin : mult_alias
|
||||
assign mult_result[m] = mult_reg[m];
|
||||
end
|
||||
endgenerate
|
||||
|
||||
@@ -111,17 +149,52 @@ always @(posedge clk) begin
|
||||
end
|
||||
end
|
||||
|
||||
// ============================================================================
// Pipeline Stage 0b (BREG): coefficient register
// Whenever an input sample is accepted (data_valid high) every tap coefficient
// coeff[i] is copied into coeff_reg[i]. The registered copy is what the
// multipliers consume, which gives synthesis a B-input register that is
// intended to be absorbed into the DSP48E1 (BREG=1).
// Reset is synchronous and clears all TAPS entries.
// ============================================================================
always @(posedge clk) begin
    if (!reset_n) begin
        // Synchronous clear of the whole coefficient register bank
        for (i = 0; i < TAPS; i = i + 1)
            coeff_reg[i] <= {COEFF_WIDTH{1'b0}};
    end else if (data_valid) begin
        // Load all coefficients in parallel on each accepted sample
        for (i = 0; i < TAPS; i = i + 1)
            coeff_reg[i] <= coeff[i];
    end
end
|
||||
|
||||
// ============================================================================
// Pipeline Stage 0c (MREG): multiply-output register
// valid_pipe[0] marks the cycle after a sample was accepted. On that cycle
// the combinatorial products mult_comb[i] are captured into mult_reg[i].
// The register is intended to be absorbed into the DSP48E1 M-register
// (MREG=1). Reset is synchronous and clears all TAPS entries.
// ============================================================================
always @(posedge clk) begin
    if (!reset_n) begin
        // Synchronous clear of every registered product
        for (i = 0; i < TAPS; i = i + 1)
            mult_reg[i] <= {(DATA_WIDTH+COEFF_WIDTH){1'b0}};
    end else if (valid_pipe[0]) begin
        // Capture all TAPS products in parallel
        for (i = 0; i < TAPS; i = i + 1)
            mult_reg[i] <= mult_comb[i];
    end
end
|
||||
|
||||
// ============================================================================
|
||||
// Pipeline Stage 1 (Level 0): Register 16 pairwise sums of 32 multiply results
|
||||
// Each addition is a single 36-bit add — one DSP48E1 hop (~1.7ns), fits 10ns.
|
||||
// Sync reset enables DSP48E1 absorption (fixes DPOR-1 warnings)
|
||||
// Now uses mult_result (from mult_reg/MREG stage) instead of combinatorial multiply.
|
||||
// ============================================================================
|
||||
always @(posedge clk) begin
|
||||
if (!reset_n) begin
|
||||
for (i = 0; i < 16; i = i + 1) begin
|
||||
add_l0[i] <= 0;
|
||||
end
|
||||
end else if (valid_pipe[0]) begin
|
||||
end else if (valid_pipe[1]) begin
|
||||
for (i = 0; i < 16; i = i + 1) begin
|
||||
// mult_result is (DATA_WIDTH + COEFF_WIDTH) = 36 bits = ACCUM_WIDTH,
|
||||
// so no sign extension is needed. Direct assignment preserves the
|
||||
@@ -141,7 +214,7 @@ always @(posedge clk) begin
|
||||
for (i = 0; i < 8; i = i + 1) begin
|
||||
add_l1[i] <= 0;
|
||||
end
|
||||
end else if (valid_pipe[1]) begin
|
||||
end else if (valid_pipe[2]) begin
|
||||
for (i = 0; i < 8; i = i + 1) begin
|
||||
add_l1[i] <= add_l0[2*i] + add_l0[2*i+1];
|
||||
end
|
||||
@@ -157,7 +230,7 @@ always @(posedge clk) begin
|
||||
for (i = 0; i < 4; i = i + 1) begin
|
||||
add_l2[i] <= 0;
|
||||
end
|
||||
end else if (valid_pipe[2]) begin
|
||||
end else if (valid_pipe[3]) begin
|
||||
for (i = 0; i < 4; i = i + 1) begin
|
||||
add_l2[i] <= add_l1[2*i] + add_l1[2*i+1];
|
||||
end
|
||||
@@ -172,7 +245,7 @@ always @(posedge clk) begin
|
||||
if (!reset_n) begin
|
||||
add_l3[0] <= 0;
|
||||
add_l3[1] <= 0;
|
||||
end else if (valid_pipe[3]) begin
|
||||
end else if (valid_pipe[4]) begin
|
||||
add_l3[0] <= add_l2[0] + add_l2[1];
|
||||
add_l3[1] <= add_l2[2] + add_l2[3];
|
||||
end
|
||||
@@ -185,7 +258,7 @@ end
|
||||
always @(posedge clk) begin
|
||||
if (!reset_n) begin
|
||||
accumulator_reg <= 0;
|
||||
end else if (valid_pipe[4]) begin
|
||||
end else if (valid_pipe[5]) begin
|
||||
accumulator_reg <= add_l3[0] + add_l3[1];
|
||||
end
|
||||
end
|
||||
@@ -199,9 +272,9 @@ always @(posedge clk) begin
|
||||
data_out <= 0;
|
||||
data_out_valid <= 0;
|
||||
end else begin
|
||||
data_out_valid <= valid_pipe[5];
|
||||
data_out_valid <= valid_pipe[6];
|
||||
|
||||
if (valid_pipe[5]) begin
|
||||
if (valid_pipe[6]) begin
|
||||
// Output saturation logic
|
||||
if (accumulator_reg > (2**(ACCUM_WIDTH-2)-1)) begin
|
||||
data_out <= (2**(DATA_WIDTH-1))-1;
|
||||
@@ -216,14 +289,14 @@ always @(posedge clk) begin
|
||||
end
|
||||
|
||||
// ============================================================================
|
||||
// Valid pipeline shift register
|
||||
// Valid pipeline shift register (9-stage for BREG+MREG+5-level adder+output)
|
||||
// Sync reset — no DSP48 involvement but keeps reset style consistent with datapath
|
||||
// ============================================================================
|
||||
always @(posedge clk) begin
|
||||
if (!reset_n) begin
|
||||
valid_pipe <= 7'b0000000;
|
||||
valid_pipe <= 9'b000000000;
|
||||
end else begin
|
||||
valid_pipe <= {valid_pipe[5:0], data_valid};
|
||||
valid_pipe <= {valid_pipe[7:0], data_valid};
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
@@ -56,15 +56,24 @@ parameter SHORT_SEGMENTS = 1; // 50 samples padded to 1024
|
||||
reg signed [31:0] pc_i, pc_q;
|
||||
reg pc_valid;
|
||||
|
||||
// Dual buffer for overlap-save
|
||||
reg signed [15:0] input_buffer_i [0:BUFFER_SIZE-1];
|
||||
reg signed [15:0] input_buffer_q [0:BUFFER_SIZE-1];
|
||||
// Dual buffer for overlap-save — BRAM inferred for synthesis
|
||||
(* ram_style = "block" *) reg signed [15:0] input_buffer_i [0:BUFFER_SIZE-1];
|
||||
(* ram_style = "block" *) reg signed [15:0] input_buffer_q [0:BUFFER_SIZE-1];
|
||||
reg [10:0] buffer_write_ptr;
|
||||
reg [10:0] buffer_read_ptr;
|
||||
reg buffer_has_data;
|
||||
reg buffer_processing;
|
||||
reg [15:0] chirp_samples_collected;
|
||||
|
||||
// BRAM write port signals
|
||||
reg buf_we;
|
||||
reg [9:0] buf_waddr;
|
||||
reg signed [15:0] buf_wdata_i, buf_wdata_q;
|
||||
|
||||
// BRAM read port signals
|
||||
reg [9:0] buf_raddr;
|
||||
reg signed [15:0] buf_rdata_i, buf_rdata_q;
|
||||
|
||||
// State machine
|
||||
reg [3:0] state;
|
||||
localparam ST_IDLE = 0;
|
||||
@@ -75,19 +84,25 @@ localparam ST_PROCESSING = 4;
|
||||
localparam ST_WAIT_FFT = 5;
|
||||
localparam ST_OUTPUT = 6;
|
||||
localparam ST_NEXT_SEGMENT = 7;
|
||||
localparam ST_OVERLAP_COPY = 8;
|
||||
|
||||
// Segment tracking
|
||||
reg [2:0] current_segment; // 0-3
|
||||
reg [2:0] total_segments;
|
||||
reg segment_done;
|
||||
reg chirp_complete;
|
||||
// Segment tracking
|
||||
reg [2:0] current_segment; // 0-3
|
||||
reg [2:0] total_segments;
|
||||
reg segment_done;
|
||||
reg chirp_complete;
|
||||
reg saw_chain_output; // Flag: chain started producing output
|
||||
|
||||
// Overlap cache — captured during ST_PROCESSING, written back in ST_OVERLAP_COPY.
// Holds OVERLAP_SAMPLES I/Q samples in registers so the tail of one segment
// can be replayed into the start of the BRAM buffer for the next segment.
reg signed [15:0] overlap_cache_i [0:OVERLAP_SAMPLES-1];
reg signed [15:0] overlap_cache_q [0:OVERLAP_SAMPLES-1];
// Write-back index used by ST_OVERLAP_COPY. It is 10 bits wide, matching
// buf_waddr, because that state drives buf_waddr from overlap_copy_count[9:0].
// With the previous [7:0] declaration bits 9:8 of that part-select were out of
// range: they read as x in simulation and are flagged by synthesis, which
// corrupts the BRAM write address. The counter still only runs
// 0 .. OVERLAP_SAMPLES-1.
reg [9:0] overlap_copy_count;
|
||||
|
||||
// Microcontroller sync detection
|
||||
reg mc_new_chirp_prev, mc_new_elevation_prev, mc_new_azimuth_prev;
|
||||
wire chirp_start_pulse = mc_new_chirp && !mc_new_chirp_prev;
|
||||
wire elevation_change_pulse = mc_new_elevation && !mc_new_elevation_prev;
|
||||
wire azimuth_change_pulse = mc_new_azimuth && !mc_new_azimuth_prev;
|
||||
wire azimuth_change_pulse = mc_new_azimuth && !mc_new_azimuth_prev;
|
||||
|
||||
// Processing chain signals
|
||||
wire [15:0] fft_pc_i, fft_pc_q;
|
||||
@@ -96,7 +111,7 @@ wire [3:0] fft_chain_state;
|
||||
|
||||
// Buffer for FFT input
|
||||
reg [15:0] fft_input_i, fft_input_q;
|
||||
reg fft_input_valid;
|
||||
reg fft_input_valid;
|
||||
reg fft_start;
|
||||
|
||||
// ========== SAMPLE ADDRESS OUTPUT ==========
|
||||
@@ -117,11 +132,30 @@ end
|
||||
|
||||
// ========== BUFFER INITIALIZATION ==========
|
||||
integer buf_init;
|
||||
integer ov_init;
|
||||
// Power-up contents of the sample buffer: every I/Q entry starts at zero.
initial begin
    for (buf_init = 0; buf_init < BUFFER_SIZE; buf_init = buf_init + 1) begin
        input_buffer_i[buf_init] = 16'd0;
        input_buffer_q[buf_init] = 16'd0;
    end
end

// Power-up contents of the overlap cache: every I/Q entry starts at zero.
initial begin
    for (ov_init = 0; ov_init < OVERLAP_SAMPLES; ov_init = ov_init + 1) begin
        overlap_cache_i[ov_init] = 16'd0;
        overlap_cache_q[ov_init] = 16'd0;
    end
end
|
||||
|
||||
// ========== BRAM WRITE PORT (synchronous, no async reset) ==========
// One write port shared by the I and Q buffers: on a rising clk edge with
// buf_we high, buf_wdata_i / buf_wdata_q are stored at address buf_waddr.
// buf_we, buf_waddr and buf_wdata_* are registers driven by the state machine
// with non-blocking assignments, so a write lands one clock after the state
// machine assigns them. There is no reset branch in this block.
|
||||
always @(posedge clk) begin
|
||||
if (buf_we) begin
|
||||
input_buffer_i[buf_waddr] <= buf_wdata_i;
|
||||
input_buffer_q[buf_waddr] <= buf_wdata_q;
|
||||
end
|
||||
end
|
||||
|
||||
// ========== BRAM READ PORT (synchronous, no async reset) ==========
// Registered read of the I/Q sample buffers.
//
// Address selection: the RAM is addressed with the value the state machine is
// about to load into buf_raddr, not with the buf_raddr register itself.
//   - outside ST_PROCESSING : address 0, so buf_rdata_* already holds sample 0
//                             on the first clock of ST_PROCESSING
//   - inside ST_PROCESSING  : buffer_read_ptr + 1, the sample the state
//                             machine will consume on the following clock
//
// Why: buf_raddr is a register and this port registers the data again, so
// reading through buf_raddr gives two clocks of latency. ST_PROCESSING
// presents read_ptr+1 and samples buf_rdata_* one clock later, so from the
// second sample on it received data[read_ptr-1] (stream d0,d0,d1,...,d1022,
// and an overlap cache skewed by one sample). Addressing with the
// next-address value restores the single-cycle latency that ST_PROCESSING
// assumes.
// NOTE(review): derived from the visible RTL only — confirm in simulation.
// buf_raddr is still driven by the state machine but no longer read here.
wire [9:0] buf_raddr_nxt = (state == ST_PROCESSING) ? (buffer_read_ptr[9:0] + 10'd1)
                                                    : 10'd0;

always @(posedge clk) begin
    buf_rdata_i <= input_buffer_i[buf_raddr_nxt];
    buf_rdata_q <= input_buffer_q[buf_raddr_nxt];
end
|
||||
|
||||
// ========== FIXED STATE MACHINE WITH OVERLAP-SAVE ==========
|
||||
@@ -140,208 +174,231 @@ always @(posedge clk or negedge reset_n) begin
|
||||
pc_valid <= 0;
|
||||
status <= 0;
|
||||
chirp_samples_collected <= 0;
|
||||
chirp_complete <= 0;
|
||||
saw_chain_output <= 0;
|
||||
fft_input_valid <= 0;
|
||||
fft_start <= 0;
|
||||
chirp_complete <= 0;
|
||||
saw_chain_output <= 0;
|
||||
fft_input_valid <= 0;
|
||||
fft_start <= 0;
|
||||
buf_we <= 0;
|
||||
buf_waddr <= 0;
|
||||
buf_wdata_i <= 0;
|
||||
buf_wdata_q <= 0;
|
||||
buf_raddr <= 0;
|
||||
overlap_copy_count <= 0;
|
||||
end else begin
|
||||
pc_valid <= 0;
|
||||
mem_request <= 0;
|
||||
fft_input_valid <= 0;
|
||||
buf_we <= 0; // Default: no write
|
||||
|
||||
case (state)
|
||||
ST_IDLE: begin
|
||||
// Reset for new chirp
|
||||
buffer_write_ptr <= 0;
|
||||
buffer_read_ptr <= 0;
|
||||
buffer_has_data <= 0;
|
||||
buffer_processing <= 0;
|
||||
current_segment <= 0;
|
||||
segment_done <= 0;
|
||||
chirp_samples_collected <= 0;
|
||||
chirp_complete <= 0;
|
||||
ST_IDLE: begin
|
||||
// Reset for new chirp
|
||||
buffer_write_ptr <= 0;
|
||||
buffer_read_ptr <= 0;
|
||||
buffer_has_data <= 0;
|
||||
buffer_processing <= 0;
|
||||
current_segment <= 0;
|
||||
segment_done <= 0;
|
||||
chirp_samples_collected <= 0;
|
||||
chirp_complete <= 0;
|
||||
saw_chain_output <= 0;
|
||||
|
||||
// Wait for chirp start from microcontroller
|
||||
if (chirp_start_pulse) begin
|
||||
state <= ST_COLLECT_DATA;
|
||||
total_segments <= use_long_chirp ? LONG_SEGMENTS[2:0] : SHORT_SEGMENTS[2:0];
|
||||
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] Starting %s chirp, segments: %d",
|
||||
use_long_chirp ? "LONG" : "SHORT",
|
||||
use_long_chirp ? LONG_SEGMENTS : SHORT_SEGMENTS);
|
||||
$display("[MULTI_SEG_FIXED] Overlap: %d samples, Advance: %d samples",
|
||||
OVERLAP_SAMPLES, SEGMENT_ADVANCE);
|
||||
total_segments <= use_long_chirp ? LONG_SEGMENTS[2:0] : SHORT_SEGMENTS[2:0];
|
||||
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] Starting %s chirp, segments: %d",
|
||||
use_long_chirp ? "LONG" : "SHORT",
|
||||
use_long_chirp ? LONG_SEGMENTS : SHORT_SEGMENTS);
|
||||
$display("[MULTI_SEG_FIXED] Overlap: %d samples, Advance: %d samples",
|
||||
OVERLAP_SAMPLES, SEGMENT_ADVANCE);
|
||||
`endif
|
||||
end
|
||||
end
|
||||
|
||||
ST_COLLECT_DATA: begin
|
||||
// Collect samples for current segment with overlap-save
|
||||
if (ddc_valid) begin
|
||||
// Store in buffer
|
||||
input_buffer_i[buffer_write_ptr] <= ddc_i[17:2] + ddc_i[1];
|
||||
input_buffer_q[buffer_write_ptr] <= ddc_q[17:2] + ddc_q[1];
|
||||
|
||||
buffer_write_ptr <= buffer_write_ptr + 1;
|
||||
chirp_samples_collected <= chirp_samples_collected + 1;
|
||||
|
||||
// Debug: Show first few samples
|
||||
if (chirp_samples_collected < 10 && buffer_write_ptr < 10) begin
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] Store[%0d]: I=%h Q=%h",
|
||||
buffer_write_ptr,
|
||||
ddc_i[17:2] + ddc_i[1],
|
||||
ddc_q[17:2] + ddc_q[1]);
|
||||
`endif
|
||||
end
|
||||
|
||||
// SHORT CHIRP: Only 50 samples, then zero-pad
|
||||
if (!use_long_chirp) begin
|
||||
if (chirp_samples_collected >= SHORT_CHIRP_SAMPLES - 1) begin
|
||||
state <= ST_ZERO_PAD;
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] Short chirp: collected %d samples, starting zero-pad",
|
||||
chirp_samples_collected + 1);
|
||||
`endif
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
// LONG CHIRP: segment-ready and chirp-complete checks
|
||||
// evaluated every clock (not gated by ddc_valid) to avoid
|
||||
// missing the transition when buffer_write_ptr updates via
|
||||
// non-blocking assignment one cycle after the last write.
|
||||
//
|
||||
// Overlap-save fix: fill the FULL 1024-sample buffer before
|
||||
// processing. For segment 0 this means 1024 fresh samples.
|
||||
// For segments 1+, write_ptr starts at OVERLAP_SAMPLES (128)
|
||||
// so we collect 896 new samples to fill the buffer.
|
||||
if (use_long_chirp) begin
|
||||
if (buffer_write_ptr >= BUFFER_SIZE) begin
|
||||
buffer_has_data <= 1;
|
||||
state <= ST_WAIT_REF;
|
||||
segment_request <= current_segment[1:0];
|
||||
mem_request <= 1;
|
||||
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] Segment %d ready: %d samples collected",
|
||||
current_segment, chirp_samples_collected);
|
||||
`endif
|
||||
end
|
||||
|
||||
if (chirp_samples_collected >= LONG_CHIRP_SAMPLES && !chirp_complete) begin
|
||||
chirp_complete <= 1;
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] End of long chirp reached");
|
||||
`endif
|
||||
// If buffer isn't full yet, zero-pad the remainder
|
||||
// (last segment with fewer than 896 new samples)
|
||||
if (buffer_write_ptr < BUFFER_SIZE) begin
|
||||
state <= ST_ZERO_PAD;
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] Last segment partial: zero-padding from %0d to %0d",
|
||||
buffer_write_ptr, BUFFER_SIZE - 1);
|
||||
`endif
|
||||
end
|
||||
end
|
||||
end
|
||||
ST_COLLECT_DATA: begin
|
||||
// Collect samples for current segment with overlap-save
|
||||
if (ddc_valid && buffer_write_ptr < BUFFER_SIZE) begin
|
||||
// Store in buffer via BRAM write port
|
||||
buf_we <= 1;
|
||||
buf_waddr <= buffer_write_ptr[9:0];
|
||||
buf_wdata_i <= ddc_i[17:2] + ddc_i[1];
|
||||
buf_wdata_q <= ddc_q[17:2] + ddc_q[1];
|
||||
|
||||
buffer_write_ptr <= buffer_write_ptr + 1;
|
||||
chirp_samples_collected <= chirp_samples_collected + 1;
|
||||
|
||||
// Debug: Show first few samples
|
||||
if (chirp_samples_collected < 10 && buffer_write_ptr < 10) begin
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] Store[%0d]: I=%h Q=%h",
|
||||
buffer_write_ptr,
|
||||
ddc_i[17:2] + ddc_i[1],
|
||||
ddc_q[17:2] + ddc_q[1]);
|
||||
`endif
|
||||
end
|
||||
|
||||
// SHORT CHIRP: Only 50 samples, then zero-pad
|
||||
if (!use_long_chirp) begin
|
||||
if (chirp_samples_collected >= SHORT_CHIRP_SAMPLES - 1) begin
|
||||
state <= ST_ZERO_PAD;
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] Short chirp: collected %d samples, starting zero-pad",
|
||||
chirp_samples_collected + 1);
|
||||
`endif
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
// LONG CHIRP: segment-ready and chirp-complete checks
|
||||
// evaluated every clock (not gated by ddc_valid) to avoid
|
||||
// missing the transition when buffer_write_ptr updates via
|
||||
// non-blocking assignment one cycle after the last write.
|
||||
//
|
||||
// Overlap-save fix: fill the FULL 1024-sample buffer before
|
||||
// processing. For segment 0 this means 1024 fresh samples.
|
||||
// For segments 1+, write_ptr starts at OVERLAP_SAMPLES (128)
|
||||
// so we collect 896 new samples to fill the buffer.
|
||||
if (use_long_chirp) begin
|
||||
if (buffer_write_ptr >= BUFFER_SIZE) begin
|
||||
buffer_has_data <= 1;
|
||||
state <= ST_WAIT_REF;
|
||||
segment_request <= current_segment[1:0];
|
||||
mem_request <= 1;
|
||||
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] Segment %d ready: %d samples collected",
|
||||
current_segment, chirp_samples_collected);
|
||||
`endif
|
||||
end
|
||||
|
||||
if (chirp_samples_collected >= LONG_CHIRP_SAMPLES && !chirp_complete) begin
|
||||
chirp_complete <= 1;
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] End of long chirp reached");
|
||||
`endif
|
||||
// If buffer isn't full yet, zero-pad the remainder
|
||||
// (last segment with fewer than 896 new samples)
|
||||
if (buffer_write_ptr < BUFFER_SIZE) begin
|
||||
state <= ST_ZERO_PAD;
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] Last segment partial: zero-padding from %0d to %0d",
|
||||
buffer_write_ptr, BUFFER_SIZE - 1);
|
||||
`endif
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
ST_ZERO_PAD: begin
|
||||
// Zero-pad remaining buffer (short chirp or last long chirp segment)
|
||||
input_buffer_i[buffer_write_ptr] <= 16'd0;
|
||||
input_buffer_q[buffer_write_ptr] <= 16'd0;
|
||||
buffer_write_ptr <= buffer_write_ptr + 1;
|
||||
|
||||
if (buffer_write_ptr >= BUFFER_SIZE - 1) begin
|
||||
// Done zero-padding
|
||||
buffer_has_data <= 1;
|
||||
buffer_write_ptr <= 0;
|
||||
state <= ST_WAIT_REF;
|
||||
segment_request <= use_long_chirp ? current_segment[1:0] : 2'd0;
|
||||
mem_request <= 1;
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] Zero-pad complete, buffer full");
|
||||
`endif
|
||||
end
|
||||
ST_ZERO_PAD: begin
|
||||
// Zero-pad remaining buffer via BRAM write port
|
||||
buf_we <= 1;
|
||||
buf_waddr <= buffer_write_ptr[9:0];
|
||||
buf_wdata_i <= 16'd0;
|
||||
buf_wdata_q <= 16'd0;
|
||||
buffer_write_ptr <= buffer_write_ptr + 1;
|
||||
|
||||
if (buffer_write_ptr >= BUFFER_SIZE - 1) begin
|
||||
// Done zero-padding
|
||||
buffer_has_data <= 1;
|
||||
buffer_write_ptr <= 0;
|
||||
state <= ST_WAIT_REF;
|
||||
segment_request <= use_long_chirp ? current_segment[1:0] : 2'd0;
|
||||
mem_request <= 1;
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] Zero-pad complete, buffer full");
|
||||
`endif
|
||||
end
|
||||
end
|
||||
|
||||
ST_WAIT_REF: begin
|
||||
// Wait for memory to provide reference coefficients
|
||||
buf_raddr <= 10'd0; // Pre-present addr 0 so buf_rdata is ready next cycle
|
||||
if (mem_ready) begin
|
||||
// Start processing
|
||||
// Start processing — buf_rdata[0] will be valid on FIRST clock of ST_PROCESSING
|
||||
buffer_processing <= 1;
|
||||
buffer_read_ptr <= 0;
|
||||
fft_start <= 1;
|
||||
state <= ST_PROCESSING;
|
||||
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] Reference ready, starting processing segment %d",
|
||||
current_segment);
|
||||
state <= ST_PROCESSING;
|
||||
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] Reference ready, starting processing segment %d",
|
||||
current_segment);
|
||||
`endif
|
||||
end
|
||||
end
|
||||
|
||||
ST_PROCESSING: begin
|
||||
// Feed data to FFT chain
|
||||
// Feed data to FFT chain from BRAM.
|
||||
// buf_raddr was pre-presented in ST_WAIT_REF (=0), so
|
||||
// buf_rdata already contains data[0] on the first clock here.
|
||||
// Each cycle: feed buf_rdata, present NEXT address.
|
||||
if ((buffer_processing) && (buffer_read_ptr < BUFFER_SIZE)) begin
|
||||
// 1. Feed ADC data to FFT
|
||||
fft_input_i <= input_buffer_i[buffer_read_ptr];
|
||||
fft_input_q <= input_buffer_q[buffer_read_ptr];
|
||||
// 1. Feed BRAM read data to FFT (valid for current buffer_read_ptr)
|
||||
fft_input_i <= buf_rdata_i;
|
||||
fft_input_q <= buf_rdata_q;
|
||||
fft_input_valid <= 1;
|
||||
|
||||
// 2. Request corresponding reference sample
|
||||
mem_request <= 1'b1;
|
||||
|
||||
// Debug every 100 samples
|
||||
if (buffer_read_ptr % 100 == 0) begin
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] Processing[%0d]: ADC I=%h Q=%h",
|
||||
buffer_read_ptr,
|
||||
input_buffer_i[buffer_read_ptr],
|
||||
input_buffer_q[buffer_read_ptr]);
|
||||
`endif
|
||||
// 3. Cache tail samples for overlap-save
|
||||
if (buffer_read_ptr >= SEGMENT_ADVANCE) begin
|
||||
overlap_cache_i[buffer_read_ptr - SEGMENT_ADVANCE] <= buf_rdata_i;
|
||||
overlap_cache_q[buffer_read_ptr - SEGMENT_ADVANCE] <= buf_rdata_q;
|
||||
end
|
||||
|
||||
// Debug every 100 samples
|
||||
if (buffer_read_ptr % 100 == 0) begin
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] Processing[%0d]: ADC I=%h Q=%h",
|
||||
buffer_read_ptr,
|
||||
buf_rdata_i,
|
||||
buf_rdata_q);
|
||||
`endif
|
||||
end
|
||||
|
||||
// Present NEXT read address (for next cycle)
|
||||
buf_raddr <= buffer_read_ptr[9:0] + 10'd1;
|
||||
buffer_read_ptr <= buffer_read_ptr + 1;
|
||||
|
||||
end else if (buffer_read_ptr >= BUFFER_SIZE) begin
|
||||
// Done feeding buffer
|
||||
fft_input_valid <= 0;
|
||||
mem_request <= 0;
|
||||
buffer_processing <= 0;
|
||||
buffer_has_data <= 0;
|
||||
saw_chain_output <= 0;
|
||||
end else if (buffer_read_ptr >= BUFFER_SIZE) begin
|
||||
// Done feeding buffer
|
||||
fft_input_valid <= 0;
|
||||
mem_request <= 0;
|
||||
buffer_processing <= 0;
|
||||
buffer_has_data <= 0;
|
||||
saw_chain_output <= 0;
|
||||
state <= ST_WAIT_FFT; // CRITICAL: Wait for FFT completion
|
||||
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] Finished feeding %d samples to FFT, waiting...",
|
||||
BUFFER_SIZE);
|
||||
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] Finished feeding %d samples to FFT, waiting...",
|
||||
BUFFER_SIZE);
|
||||
`endif
|
||||
end
|
||||
end
|
||||
|
||||
ST_WAIT_FFT: begin
|
||||
// Wait for the processing chain to complete ALL outputs.
|
||||
// The chain streams 1024 samples (fft_pc_valid=1 for 1024 clocks),
|
||||
// then transitions to ST_DONE (9) -> ST_IDLE (0).
|
||||
// We track when output starts (saw_chain_output) and only
|
||||
// proceed once the chain returns to idle after outputting.
|
||||
if (fft_pc_valid) begin
|
||||
saw_chain_output <= 1;
|
||||
end
|
||||
|
||||
if (saw_chain_output && fft_chain_state == 4'd0) begin
|
||||
// Chain has returned to idle after completing all output
|
||||
saw_chain_output <= 0;
|
||||
state <= ST_OUTPUT;
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] Chain complete for segment %d, entering ST_OUTPUT",
|
||||
current_segment);
|
||||
`endif
|
||||
end
|
||||
ST_WAIT_FFT: begin
|
||||
// Wait for the processing chain to complete ALL outputs.
|
||||
// The chain streams 1024 samples (fft_pc_valid=1 for 1024 clocks),
|
||||
// then transitions to ST_DONE (9) -> ST_IDLE (0).
|
||||
// We track when output starts (saw_chain_output) and only
|
||||
// proceed once the chain returns to idle after outputting.
|
||||
if (fft_pc_valid) begin
|
||||
saw_chain_output <= 1;
|
||||
end
|
||||
|
||||
if (saw_chain_output && fft_chain_state == 4'd0) begin
|
||||
// Chain has returned to idle after completing all output
|
||||
saw_chain_output <= 0;
|
||||
state <= ST_OUTPUT;
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] Chain complete for segment %d, entering ST_OUTPUT",
|
||||
current_segment);
|
||||
`endif
|
||||
end
|
||||
end
|
||||
|
||||
ST_OUTPUT: begin
|
||||
@@ -349,11 +406,11 @@ always @(posedge clk or negedge reset_n) begin
|
||||
pc_i <= fft_pc_i;
|
||||
pc_q <= fft_pc_q;
|
||||
pc_valid <= 1;
|
||||
segment_done <= 1;
|
||||
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] Output segment %d: I=%h Q=%h",
|
||||
current_segment, fft_pc_i, fft_pc_q);
|
||||
segment_done <= 1;
|
||||
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] Output segment %d: I=%h Q=%h",
|
||||
current_segment, fft_pc_i, fft_pc_q);
|
||||
`endif
|
||||
|
||||
// Check if we need more segments
|
||||
@@ -361,10 +418,10 @@ always @(posedge clk or negedge reset_n) begin
|
||||
state <= ST_NEXT_SEGMENT;
|
||||
end else begin
|
||||
// All segments complete
|
||||
state <= ST_IDLE;
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] All %d segments complete",
|
||||
total_segments);
|
||||
state <= ST_IDLE;
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] All %d segments complete",
|
||||
total_segments);
|
||||
`endif
|
||||
end
|
||||
end
|
||||
@@ -375,35 +432,48 @@ always @(posedge clk or negedge reset_n) begin
|
||||
segment_done <= 0;
|
||||
|
||||
if (use_long_chirp) begin
|
||||
// OVERLAP-SAVE: Keep last OVERLAP_SAMPLES for next segment
|
||||
// Shift data in buffer to preserve overlap
|
||||
// OVERLAP-SAVE: Write cached tail samples back to BRAM [0..127]
|
||||
overlap_copy_count <= 0;
|
||||
state <= ST_OVERLAP_COPY;
|
||||
|
||||
for (i = 0; i < OVERLAP_SAMPLES; i = i + 1) begin
|
||||
input_buffer_i[i] <= input_buffer_i[i + SEGMENT_ADVANCE];
|
||||
input_buffer_q[i] <= input_buffer_q[i + SEGMENT_ADVANCE];
|
||||
end
|
||||
|
||||
// Start writing after the overlap
|
||||
buffer_write_ptr <= OVERLAP_SAMPLES;
|
||||
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] Overlap-save: kept %d samples, write_ptr=%d",
|
||||
OVERLAP_SAMPLES, OVERLAP_SAMPLES);
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] Overlap-save: writing %d cached samples",
|
||||
OVERLAP_SAMPLES);
|
||||
`endif
|
||||
end else begin
|
||||
// Short chirp: only one segment
|
||||
buffer_write_ptr <= 0;
|
||||
if (!chirp_complete) begin
|
||||
state <= ST_COLLECT_DATA;
|
||||
end else begin
|
||||
state <= ST_IDLE;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
ST_OVERLAP_COPY: begin
|
||||
// Write one cached overlap sample per cycle to BRAM
|
||||
buf_we <= 1;
|
||||
buf_waddr <= overlap_copy_count[9:0];
|
||||
buf_wdata_i <= overlap_cache_i[overlap_copy_count];
|
||||
buf_wdata_q <= overlap_cache_q[overlap_copy_count];
|
||||
|
||||
// Continue collecting or finish
|
||||
if (!chirp_complete) begin
|
||||
state <= ST_COLLECT_DATA;
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] Starting segment %d/%d",
|
||||
current_segment + 1, total_segments);
|
||||
`endif
|
||||
if (overlap_copy_count < OVERLAP_SAMPLES - 1) begin
|
||||
overlap_copy_count <= overlap_copy_count + 1;
|
||||
end else begin
|
||||
state <= ST_IDLE;
|
||||
// All 128 samples written back
|
||||
buffer_write_ptr <= OVERLAP_SAMPLES;
|
||||
|
||||
`ifdef SIMULATION
|
||||
$display("[MULTI_SEG_FIXED] Overlap-save: copied %d samples, write_ptr=%d",
|
||||
OVERLAP_SAMPLES, OVERLAP_SAMPLES);
|
||||
`endif
|
||||
|
||||
if (!chirp_complete) begin
|
||||
state <= ST_COLLECT_DATA;
|
||||
end else begin
|
||||
state <= ST_IDLE;
|
||||
end
|
||||
end
|
||||
end
|
||||
endcase
|
||||
@@ -441,23 +511,23 @@ matched_filter_processing_chain m_f_p_c(
|
||||
.chain_state(fft_chain_state)
|
||||
);
|
||||
|
||||
// ========== DEBUG MONITOR ==========
|
||||
`ifdef SIMULATION
|
||||
reg [31:0] dbg_cycles;
|
||||
always @(posedge clk or negedge reset_n) begin
|
||||
if (!reset_n) begin
|
||||
dbg_cycles <= 0;
|
||||
end else begin
|
||||
dbg_cycles <= dbg_cycles + 1;
|
||||
|
||||
// Monitor state transitions
|
||||
if (dbg_cycles % 1000 == 0 && state != ST_IDLE) begin
|
||||
$display("[MULTI_SEG_MONITOR @%0d] state=%0d, segment=%0d/%0d, samples=%0d",
|
||||
dbg_cycles, state, current_segment, total_segments,
|
||||
chirp_samples_collected);
|
||||
end
|
||||
end
|
||||
end
|
||||
// ========== DEBUG MONITOR ==========
|
||||
`ifdef SIMULATION
|
||||
reg [31:0] dbg_cycles;
|
||||
always @(posedge clk or negedge reset_n) begin
|
||||
if (!reset_n) begin
|
||||
dbg_cycles <= 0;
|
||||
end else begin
|
||||
dbg_cycles <= dbg_cycles + 1;
|
||||
|
||||
// Monitor state transitions
|
||||
if (dbg_cycles % 1000 == 0 && state != ST_IDLE) begin
|
||||
$display("[MULTI_SEG_MONITOR @%0d] state=%0d, segment=%0d/%0d, samples=%0d",
|
||||
dbg_cycles, state, current_segment, total_segments,
|
||||
chirp_samples_collected);
|
||||
end
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
// ========== OUTPUT CONNECTIONS ==========
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -109,8 +109,8 @@ module tb_fir_lowpass;
|
||||
saw_nonzero = 0;
|
||||
output_count = 0;
|
||||
|
||||
// Run for 40 clocks (need at least 32 for all taps + pipeline)
|
||||
for (sample_count = 0; sample_count < 40; sample_count = sample_count + 1) begin
|
||||
// Run for 44 clocks (need at least 32 for all taps + 9-stage pipeline)
|
||||
for (sample_count = 0; sample_count < 44; sample_count = sample_count + 1) begin
|
||||
@(posedge clk); #1;
|
||||
if (data_out_valid) begin
|
||||
$fwrite(csv_file, "%0d,%0d\n", output_count, data_out);
|
||||
|
||||
Reference in New Issue
Block a user