Replace FFT stubs with synthesizable radix-2 DIT engine, fix BRAM inference

Implement iterative single-butterfly FFT engine (fft_engine.v) supporting
1024-pt and 32-pt transforms with quarter-wave twiddle ROM, XPM_MEMORY_TDPRAM
for guaranteed BRAM mapping in Vivado, and behavioral model for simulation.

Add xfft_32.v AXI-Stream wrapper for doppler_processor integration and
dual-branch matched_filter_processing_chain.v (behavioral + synthesis paths).

Fix placement failure caused by 68K+ registers from dissolved memory arrays:
- doppler_processor.v: extract mem writes to sync-only always block for BRAM
- xfft_32.v: extract buffer writes to sync-only always block for LUTRAM

Post-implementation: 37K regs (29%), 23K LUTs (37%), 10 BRAM (7%), fully routed.
All testbenches pass: fft_engine 12/12, xfft_32 10/10, mf_chain 27/27.
This commit is contained in:
Jason
2026-03-16 10:25:07 +02:00
parent deb2e81ec4
commit 692b6a3bfa
9 changed files with 3428 additions and 190 deletions
+179 -148
View File
@@ -124,157 +124,188 @@ always @(posedge clk or negedge reset_n) begin
end
wire frame_start_pulse = new_chirp_frame & ~new_chirp_frame_d1;
// ==============================================
// Main State Machine - FIXED
// ==============================================
reg [5:0] fft_sample_counter;
reg [9:0] processing_timeout;
always @(posedge clk or negedge reset_n) begin
if (!reset_n) begin
state <= S_IDLE;
write_range_bin <= 0;
write_chirp_index <= 0;
read_range_bin <= 0;
read_doppler_index <= 0;
frame_buffer_full <= 0;
doppler_valid <= 0;
fft_start <= 0;
fft_input_valid <= 0;
fft_input_last <= 0;
fft_sample_counter <= 0;
processing_timeout <= 0;
status <= 0;
chirps_received <= 0;
chirp_state <= 0;
end else begin
doppler_valid <= 0;
fft_input_valid <= 0;
fft_input_last <= 0;
if (processing_timeout > 0) begin
processing_timeout <= processing_timeout - 1;
end
case (state)
S_IDLE: begin
if (frame_start_pulse) begin
// Start new frame
write_chirp_index <= 0;
write_range_bin <= 0;
frame_buffer_full <= 0;
chirps_received <= 0;
//chirp_state <= 1; // Start accumulating
end
if (data_valid && !frame_buffer_full) begin
// ==============================================
// Main State Machine - FIXED
// ==============================================
reg [5:0] fft_sample_counter;
reg [9:0] processing_timeout;
// Memory write enable and data signals (extracted for BRAM inference)
reg mem_we;
reg [10:0] mem_waddr_r;
reg [DATA_WIDTH-1:0] mem_wdata_i, mem_wdata_q;
// Memory read data (registered for BRAM read latency)
reg [DATA_WIDTH-1:0] mem_rdata_i, mem_rdata_q;
// ----------------------------------------------------------
// Separate always block for memory writes NO async reset
// in sensitivity list, so Vivado can infer Block RAM.
// ----------------------------------------------------------
always @(posedge clk) begin
if (mem_we) begin
doppler_i_mem[mem_waddr_r] <= mem_wdata_i;
doppler_q_mem[mem_waddr_r] <= mem_wdata_q;
end
// Registered read address driven by mem_read_addr from FSM
mem_rdata_i <= doppler_i_mem[mem_read_addr];
mem_rdata_q <= doppler_q_mem[mem_read_addr];
end
// ----------------------------------------------------------
// Main FSM async reset for control registers only.
// Memory arrays are NOT touched here.
// ----------------------------------------------------------
always @(posedge clk or negedge reset_n) begin
if (!reset_n) begin
state <= S_IDLE;
write_range_bin <= 0;
write_chirp_index <= 0;
read_range_bin <= 0;
read_doppler_index <= 0;
frame_buffer_full <= 0;
doppler_valid <= 0;
fft_start <= 0;
fft_input_valid <= 0;
fft_input_last <= 0;
fft_sample_counter <= 0;
processing_timeout <= 0;
status <= 0;
chirps_received <= 0;
chirp_state <= 0;
mem_we <= 0;
mem_waddr_r <= 0;
mem_wdata_i <= 0;
mem_wdata_q <= 0;
mult_i <= 0;
mult_q <= 0;
fft_input_i <= 0;
fft_input_q <= 0;
doppler_output <= 0;
doppler_bin <= 0;
end else begin
doppler_valid <= 0;
fft_input_valid <= 0;
fft_input_last <= 0;
mem_we <= 0;
if (processing_timeout > 0) begin
processing_timeout <= processing_timeout - 1;
end
case (state)
S_IDLE: begin
if (frame_start_pulse) begin
// Start new frame
write_chirp_index <= 0;
write_range_bin <= 0;
frame_buffer_full <= 0;
chirps_received <= 0;
end
if (data_valid && !frame_buffer_full) begin
state <= S_ACCUMULATE;
write_range_bin <= 0;
end
end
S_ACCUMULATE: begin
if (data_valid) begin
// Store with proper addressing
doppler_i_mem[mem_write_addr] <= range_data[15:0];
doppler_q_mem[mem_write_addr] <= range_data[31:16];
// Debug output to see what's being written
// $display("Time=%t: Write addr=%d (chirp=%d, range=%d), Data=%h",
// $time, mem_write_addr, write_chirp_index, write_range_bin, range_data);
// Increment range bin
if (write_range_bin < RANGE_BINS - 1) begin
write_range_bin <= write_range_bin + 1;
end else begin
// Completed one chirp
write_range_bin <= 0;
write_chirp_index <= write_chirp_index + 1;
chirps_received <= chirps_received + 1;
// Check if frame is complete
if (write_chirp_index >= CHIRPS_PER_FRAME - 1) begin
frame_buffer_full <= 1;
chirp_state <= 0; // Stop accumulating
// Could automatically start processing here:
state <= S_LOAD_FFT;
read_range_bin <= 0;
read_doppler_index <= 0;
fft_sample_counter <= 0;
fft_start <= 1;
end
end
end
end
// [Rest of S_LOAD_FFT, S_FFT_WAIT, S_OUTPUT states remain similar]
// But with fixed addressing in S_LOAD_FFT:
S_LOAD_FFT: begin
fft_start <= 0;
if (fft_sample_counter < DOPPLER_FFT_SIZE) begin
// Use correct addressing for reading
mult_i <= $signed(doppler_i_mem[mem_read_addr]) *
$signed(window_coeff[read_doppler_index]);
mult_q <= $signed(doppler_q_mem[mem_read_addr]) *
$signed(window_coeff[read_doppler_index]);
write_range_bin <= 0;
end
end
S_ACCUMULATE: begin
if (data_valid) begin
// Drive memory write signals (actual write in separate block)
mem_we <= 1;
mem_waddr_r <= mem_write_addr;
mem_wdata_i <= range_data[15:0];
mem_wdata_q <= range_data[31:16];
// Round instead of truncate
fft_input_i <= (mult_i + (1 << 14)) >>> 15; // Round to nearest
fft_input_q <= (mult_q + (1 << 14)) >>> 15;
// Increment range bin
if (write_range_bin < RANGE_BINS - 1) begin
write_range_bin <= write_range_bin + 1;
end else begin
// Completed one chirp
write_range_bin <= 0;
write_chirp_index <= write_chirp_index + 1;
chirps_received <= chirps_received + 1;
// Check if frame is complete
if (write_chirp_index >= CHIRPS_PER_FRAME - 1) begin
frame_buffer_full <= 1;
chirp_state <= 0;
state <= S_LOAD_FFT;
read_range_bin <= 0;
read_doppler_index <= 0;
fft_sample_counter <= 0;
fft_start <= 1;
end
end
end
end
S_LOAD_FFT: begin
fft_start <= 0;
if (fft_sample_counter < DOPPLER_FFT_SIZE) begin
// Use registered read data (one cycle latency from BRAM)
mult_i <= $signed(mem_rdata_i) *
$signed(window_coeff[read_doppler_index]);
mult_q <= $signed(mem_rdata_q) *
$signed(window_coeff[read_doppler_index]);
fft_input_valid <= 1;
if (fft_sample_counter == DOPPLER_FFT_SIZE - 1) begin
fft_input_last <= 1;
end
// Increment chirp index for next sample
read_doppler_index <= read_doppler_index + 1;
fft_sample_counter <= fft_sample_counter + 1;
end else begin
state <= S_FFT_WAIT;
fft_sample_counter <= 0;
processing_timeout <= 100;
end
end
S_FFT_WAIT: begin
if (fft_output_valid) begin
doppler_output <= {fft_output_q[15:0], fft_output_i[15:0]};
doppler_bin <= fft_sample_counter;
range_bin <= read_range_bin;
doppler_valid <= 1;
fft_sample_counter <= fft_sample_counter + 1;
if (fft_output_last) begin
state <= S_OUTPUT;
fft_sample_counter <= 0;
end
end
if (processing_timeout == 0) begin
state <= S_OUTPUT;
end
end
S_OUTPUT: begin
if (read_range_bin < RANGE_BINS - 1) begin
read_range_bin <= read_range_bin + 1;
read_doppler_index <= 0;
state <= S_LOAD_FFT;
fft_start <= 1;
end else begin
state <= S_IDLE;
frame_buffer_full <= 0;
end
end
endcase
status <= {state, frame_buffer_full};
end
// Round instead of truncate
fft_input_i <= (mult_i + (1 << 14)) >>> 15;
fft_input_q <= (mult_q + (1 << 14)) >>> 15;
fft_input_valid <= 1;
if (fft_sample_counter == DOPPLER_FFT_SIZE - 1) begin
fft_input_last <= 1;
end
// Increment chirp index for next sample
read_doppler_index <= read_doppler_index + 1;
fft_sample_counter <= fft_sample_counter + 1;
end else begin
state <= S_FFT_WAIT;
fft_sample_counter <= 0;
processing_timeout <= 100;
end
end
S_FFT_WAIT: begin
if (fft_output_valid) begin
doppler_output <= {fft_output_q[15:0], fft_output_i[15:0]};
doppler_bin <= fft_sample_counter;
range_bin <= read_range_bin;
doppler_valid <= 1;
fft_sample_counter <= fft_sample_counter + 1;
if (fft_output_last) begin
state <= S_OUTPUT;
fft_sample_counter <= 0;
end
end
if (processing_timeout == 0) begin
state <= S_OUTPUT;
end
end
S_OUTPUT: begin
if (read_range_bin < RANGE_BINS - 1) begin
read_range_bin <= read_range_bin + 1;
read_doppler_index <= 0;
state <= S_LOAD_FFT;
fft_start <= 1;
end else begin
state <= S_IDLE;
frame_buffer_full <= 0;
end
end
endcase
status <= {state, frame_buffer_full};
end
end
// ==============================================