Files
PLFM_RADAR/9_Firmware/9_2_FPGA/fft_engine.v
Jason a3e1996833 FFT engine: merge SHIFT into WRITE (5→4 cycle butterfly, 20% throughput) + barrel-shift twiddle index
Opt 1: Eliminated ST_BF_SHIFT state — arithmetic right-shift is pure
bit-selection (zero logic levels), merged into BF_WRITE combinational
add/subtract. Saves LOG2N * N/2 = 5120 cycles per 1024-pt FFT.

Opt 2: Replaced idx_val * tw_stride_reg general multiply with
idx_val << (LOG2N-1-stage) barrel shift. tw_stride_reg is always a
power of 2, so this is mathematically identical and frees a multiplier.

Regression: 18/18 FPGA pass (bit-exact results).
2026-03-20 00:20:59 +02:00

715 lines
25 KiB
Verilog

`timescale 1ns / 1ps
/**
* fft_engine.v
*
* Synthesizable parameterized radix-2 DIT FFT/IFFT engine.
* Iterative single-butterfly architecture with quarter-wave twiddle ROM.
*
* Architecture:
* - LOAD: Accept N input samples, store bit-reversed in BRAM
* - COMPUTE: LOG2N stages x N/2 butterflies, 4-cycle pipeline:
* BF_READ: Present BRAM addresses; register twiddle index
* BF_TW: BRAM data valid → capture; twiddle ROM lookup from
* registered index → capture cos/sin
* BF_MULT2: DSP multiply from registered data + twiddle → PREG
* BF_WRITE: Shift (bit-select from PREG, pure wiring) +
* add/subtract + BRAM writeback
* - OUTPUT: Stream N results (1/N scaling for IFFT)
*
* Twiddle index computed via barrel shift (idx << (LOG2N-1-stage)) instead
* of general multiply, since the stride is always a power of 2.
*
* Data memory uses xpm_memory_tdpram (Xilinx Parameterized Macros) for
* guaranteed BRAM mapping in synthesis when `FFT_XPM_BRAM is defined.
* Without that define (the default), a behavioral Verilog-2001 model
* replaces the XPM so the design compiles with Icarus Verilog or any
* non-Xilinx simulator.
*
* Clock domain: single clock (clk), active-low async reset (reset_n).
*/
// Iterative FFT/IFFT core. A transform runs as: start (while idle) -> N
// samples accepted on din_* -> in-place butterfly passes -> N results
// streamed on dout_* -> one-cycle done pulse.
module fft_engine #(
parameter N = 1024, // transform length: samples loaded and results streamed per run
parameter LOG2N = 10, // data-memory address width and number of butterfly stages; NOTE(review): presumably must equal log2(N) — nothing here checks it
parameter DATA_W = 16, // width of the din_*/dout_* samples
parameter INTERNAL_W = 32, // width of each word in the working data memory
parameter TWIDDLE_W = 16, // width of each twiddle ROM entry
parameter TWIDDLE_FILE = "fft_twiddle_1024.mem" // hex file loaded into the N/4-entry cosine ROM
)(
input wire clk,
input wire reset_n, // active-low reset
// Control
input wire start, // acted on only while idle; starts the load phase
input wire inverse, // selects the inverse butterfly and 1/N output scaling; sampled at every butterfly read and every output cycle, so hold it stable while busy
// Data input
input wire signed [DATA_W-1:0] din_re,
input wire signed [DATA_W-1:0] din_im,
input wire din_valid, // qualifies din_* during the load phase; not used in any other state
// Data output
output reg signed [DATA_W-1:0] dout_re,
output reg signed [DATA_W-1:0] dout_im,
output reg dout_valid, // high in each cycle that dout_* holds a new result
// Status
output wire busy, // high whenever the FSM is not idle
output reg done // one-clock pulse registered when the FSM passes through its done state
);
// ============================================================================
// SAFE WIDTH CONSTANTS
// ============================================================================
// Size-derived constants, each declared LOG2N+1 bits wide so that the value
// N itself is representable. Users slice [LOG2N-1:0] when comparing against
// LOG2N-bit counters.
localparam [LOG2N:0] FFT_N = N;
localparam [LOG2N:0] FFT_N_HALF = N / 2; // first twiddle index past the table-driven range
localparam [LOG2N:0] FFT_N_QTR = N / 4; // quadrant boundary for the twiddle lookup
localparam [LOG2N:0] FFT_N_HALF_M1 = N / 2 - 1; // last butterfly index within a stage
localparam [LOG2N:0] FFT_N_M1 = N - 1; // last sample index for load and output
// ============================================================================
// STATES
// ============================================================================
// Butterfly pipeline: READ → TW → MULT2 → WRITE (4 cycles)
// READ: Present BRAM addresses; register twiddle index (bf_tw_idx)
// TW: BRAM data valid → capture rd_a/rd_b; twiddle ROM lookup from
// registered index → capture cos/sin
// MULT2: DSP multiply from registered data + twiddle → products in PREG
// WRITE: Shift (bit-select from PREG, pure wiring) + add/sub + BRAM writeback
// FSM state encodings (4-bit binary, one declaration per state)
localparam [3:0] ST_IDLE     = 4'd0; // waiting for start
localparam [3:0] ST_LOAD     = 4'd1; // accepting input samples
localparam [3:0] ST_BF_READ  = 4'd2; // butterfly cycle 1
localparam [3:0] ST_BF_TW    = 4'd3; // butterfly cycle 2
localparam [3:0] ST_BF_MULT2 = 4'd4; // butterfly cycle 3
localparam [3:0] ST_BF_WRITE = 4'd5; // butterfly cycle 4
localparam [3:0] ST_OUTPUT   = 4'd6; // streaming results
localparam [3:0] ST_DONE     = 4'd7; // single-cycle completion state

// Current FSM state
reg [3:0] state;

// The engine reports busy in every state other than idle.
assign busy = !(state == ST_IDLE);
// ============================================================================
// DATA MEMORY DECLARATIONS
// ============================================================================
// Data-memory read values for ports A and B (real / imaginary), as signed
// quantities. These regs are driven combinationally from the memory read
// outputs; the one-cycle read register lives inside the memory itself.
reg signed [INTERNAL_W-1:0] mem_rdata_a_re, mem_rdata_a_im;
reg signed [INTERNAL_W-1:0] mem_rdata_b_re, mem_rdata_b_im;
// ============================================================================
// TWIDDLE ROM
// ============================================================================
// Quarter-wave cosine table: N/4 entries addressed with LOG2N-2 bits.
// Sine values and the second-quadrant cosine are derived from this single
// table by the twiddle lookup logic; entry 0 doubles as the sine value at
// index N/4.
// NOTE(review): presumably entry i holds cos(2*pi*i/N) in signed fixed
// point — confirm against the generator of the .mem file.
localparam TW_QUARTER = N / 4;
localparam TW_ADDR_W = LOG2N - 2;
(* rom_style = "block" *) reg signed [TWIDDLE_W-1:0] cos_rom [0:TW_QUARTER-1];
// Table contents are read from a hexadecimal text file.
initial begin
$readmemh(TWIDDLE_FILE, cos_rom);
end
// ============================================================================
// BIT-REVERSE
// ============================================================================
// Return `val` with its LOG2N bits in mirrored order
// (bit 0 <-> bit LOG2N-1, bit 1 <-> bit LOG2N-2, ...).
function [LOG2N-1:0] bit_reverse;
    input [LOG2N-1:0] val;
    integer dst;
    begin
        bit_reverse = {LOG2N{1'b0}};
        // Walk the destination bits from MSB down; each takes the mirrored
        // source bit.
        for (dst = LOG2N - 1; dst >= 0; dst = dst - 1)
            bit_reverse[dst] = val[LOG2N-1-dst];
    end
endfunction
// ============================================================================
// COUNTERS AND PIPELINE REGISTERS
// ============================================================================
// Progress counters owned by the control FSM
reg [LOG2N-1:0] load_count; // index of the next input sample (before bit reversal)
reg [LOG2N:0] out_count; // output read index; one extra bit so it can step past N-1
reg [LOG2N-1:0] bfly_count; // butterfly index within the current stage (0 .. N/2-1)
reg [3:0] stage; // current butterfly stage (0 .. LOG2N-1)
// Pipeline registers. Addresses, inverse flag and twiddle index are captured
// in BF_READ; the twiddle cos/sin values are captured one cycle later, in BF_TW.
reg signed [TWIDDLE_W-1:0] rd_tw_cos, rd_tw_sin;
reg [LOG2N-1:0] rd_addr_even, rd_addr_odd;
reg rd_inverse;
reg [LOG2N-1:0] rd_tw_idx; // registered twiddle index (breaks addr→ROM path)
// Distance between the two legs of a butterfly: 1 at stage 0, doubled at each
// stage change. The twiddle stride is not stored; it is a barrel shift in
// bf_addr_calc.
reg [LOG2N-1:0] half_reg;
// ============================================================================
// BUTTERFLY ADDRESS COMPUTATION (combinational)
// ============================================================================
reg [LOG2N-1:0] bf_addr_even; // memory address of the butterfly's first leg
reg [LOG2N-1:0] bf_addr_odd;  // memory address of the second leg (first + half_reg)
reg [LOG2N-1:0] bf_tw_idx;    // twiddle index for this butterfly
// Derive both operand addresses and the twiddle index from the butterfly
// counter, the current half-span and the stage number.
always @(*) begin : bf_addr_calc
    reg [LOG2N-1:0] span_mask;   // ones below the half-span bit
    reg [LOG2N-1:0] pos_in_grp;  // butterfly position inside its group
    reg [LOG2N-1:0] grp_base;    // butterfly counter with the position bits cleared
    span_mask  = half_reg - 1;
    pos_in_grp = bfly_count & span_mask;
    // Clearing the position bits is the same as subtracting the position.
    grp_base   = bfly_count & ~span_mask;
    // Doubling the group part opens a gap of one half-span for the odd leg.
    bf_addr_even = (grp_base << 1) | pos_in_grp;
    bf_addr_odd  = bf_addr_even + half_reg;
    // Stride is a power of two, so scaling the position is a left shift.
    bf_tw_idx    = pos_in_grp << (LOG2N - 1 - stage);
end
// ============================================================================
// TWIDDLE LOOKUP (combinational)
// ============================================================================
reg signed [TWIDDLE_W-1:0] tw_cos_lookup;
reg signed [TWIDDLE_W-1:0] tw_sin_lookup;
// Combinational twiddle lookup from the registered index rd_tw_idx.
// Produces a cosine/sine pair from the quarter-wave table cos_rom:
//   k == 0        : cos = rom[0],        sin = 0
//   k == N/4      : cos = 0,             sin = rom[0]
//   0 < k < N/4   : cos = rom[k],        sin = rom[N/4 - k]
//   otherwise     : cos = -rom[N/2 - k], sin = rom[k - N/4]
always @(*) begin : tw_lookup
    reg [LOG2N-1:0] k;
    reg [LOG2N-1:0] rom_idx;
    k = rd_tw_idx; // use registered index (set in ST_BF_READ)
    // Every variable written in this block gets a default so that no branch
    // leaves one unassigned. rom_idx was previously written in only two of
    // the four branches, which makes tools report an inferred latch.
    rom_idx       = {LOG2N{1'b0}};
    tw_cos_lookup = 0;
    tw_sin_lookup = 0;
    if (k == 0) begin
        tw_cos_lookup = cos_rom[0];
        tw_sin_lookup = {TWIDDLE_W{1'b0}};
    end else if (k == FFT_N_QTR[LOG2N-1:0]) begin
        tw_cos_lookup = {TWIDDLE_W{1'b0}};
        tw_sin_lookup = cos_rom[0];
    end else if (k < FFT_N_QTR[LOG2N-1:0]) begin
        // First quadrant: cosine read directly, sine read mirrored about N/4.
        tw_cos_lookup = cos_rom[k[TW_ADDR_W-1:0]];
        rom_idx = FFT_N_QTR[LOG2N-1:0] - k;
        tw_sin_lookup = cos_rom[rom_idx[TW_ADDR_W-1:0]];
    end else begin
        // Past N/4: sine read at the offset from N/4, cosine read mirrored
        // about N/2 and negated. rom_idx is reused for the two addresses.
        rom_idx = k - FFT_N_QTR[LOG2N-1:0];
        tw_sin_lookup = cos_rom[rom_idx[TW_ADDR_W-1:0]];
        rom_idx = FFT_N_HALF[LOG2N-1:0] - k;
        tw_cos_lookup = -cos_rom[rom_idx[TW_ADDR_W-1:0]];
    end
end
// ============================================================================
// SATURATION
// ============================================================================
// Clamp a signed INTERNAL_W-bit value into the signed DATA_W-bit range.
// Values above 2^(DATA_W-1)-1 or below -2^(DATA_W-1) are replaced by the
// respective limit; anything in range is passed through (low DATA_W bits).
function signed [DATA_W-1:0] saturate;
    input signed [INTERNAL_W-1:0] val;
    reg signed [INTERNAL_W-1:0] lo_limit;
    reg signed [INTERNAL_W-1:0] hi_limit;
    begin
        lo_limit = -(1 << (DATA_W - 1));
        hi_limit = (1 << (DATA_W - 1)) - 1;
        // The two out-of-range tests are mutually exclusive, so their order
        // does not matter.
        if (val < lo_limit)
            saturate = lo_limit[DATA_W-1:0];
        else if (val > hi_limit)
            saturate = hi_limit[DATA_W-1:0];
        else
            saturate = val[DATA_W-1:0];
    end
endfunction
// ============================================================================
// BUTTERFLY PIPELINE REGISTERS
// ============================================================================
// Stage 1 (BF_TW): Capture BRAM read data into rd_a, rd_b
// Stage 2 (BF_MULT2): DSP multiply + accumulate → raw products (bf_prod_re/im)
// Stage 3 (BF_WRITE): Shift (bit-select, pure wiring) + add/subtract + BRAM writeback
// ============================================================================
// Operands captured from the data memory
reg signed [INTERNAL_W-1:0] rd_a_re;  // port A word, real part
reg signed [INTERNAL_W-1:0] rd_a_im;  // port A word, imaginary part
reg signed [INTERNAL_W-1:0] rd_b_re;  // port B word, real part (twiddle multiplicand)
reg signed [INTERNAL_W-1:0] rd_b_im;  // port B word, imaginary part (twiddle multiplicand)
// Registered full-precision twiddle products.
// One INTERNAL_W x TWIDDLE_W signed product needs INTERNAL_W + TWIDDLE_W bits
// (48 with the default parameters); one more bit holds the sum or difference
// of two such products.
localparam PROD_W = INTERNAL_W + TWIDDLE_W;
reg signed [PROD_W:0] bf_prod_re;
reg signed [PROD_W:0] bf_prod_im;
// Butterfly results, combinational from the registers above (consumed in BF_WRITE)
reg signed [INTERNAL_W-1:0] bf_sum_re;
reg signed [INTERNAL_W-1:0] bf_sum_im;
reg signed [INTERNAL_W-1:0] bf_dif_re;
reg signed [INTERNAL_W-1:0] bf_dif_im;
always @(*) begin : bf_addsub
    reg signed [PROD_W:0] scaled_re;
    reg signed [PROD_W:0] scaled_im;
    // Remove the twiddle scaling: an arithmetic right shift by a constant is
    // only a selection of product bits, no logic.
    scaled_re = bf_prod_re >>> (TWIDDLE_W - 1);
    scaled_im = bf_prod_im >>> (TWIDDLE_W - 1);
    // Sum and difference are truncated to INTERNAL_W bits on assignment.
    bf_sum_re = rd_a_re + scaled_re;
    bf_dif_re = rd_a_re - scaled_re;
    bf_sum_im = rd_a_im + scaled_im;
    bf_dif_im = rd_a_im - scaled_im;
end
// ============================================================================
// BRAM PORT ADDRESS / WE / WDATA — combinational mux (registered signals)
// ============================================================================
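// Drives port A and port B control signals from FSM state.
// The mux itself is combinational, but the internal signals it selects from
// (state, counters, pipeline registers) are updated via NBA, so its outputs
// are stable at the next posedge when the BRAM template blocks sample them.
// This avoids any NBA race.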
// ============================================================================
// Port A controls
reg bram_we_a;
reg [LOG2N-1:0] bram_addr_a;
reg signed [INTERNAL_W-1:0] bram_wdata_a_re;
reg signed [INTERNAL_W-1:0] bram_wdata_a_im;
// Port B controls
reg bram_we_b;
reg [LOG2N-1:0] bram_addr_b;
reg signed [INTERNAL_W-1:0] bram_wdata_b_re;
reg signed [INTERNAL_W-1:0] bram_wdata_b_im;
// Select what each memory port does in the current FSM state.
always @(*) begin : bram_port_mux
    // Inactive values for both ports: no write, address 0, zero write data.
    // States that do not appear in the case below (idle, BF_TW, BF_MULT2,
    // done) leave the ports like this.
    {bram_we_a, bram_we_b} = 2'b00;
    bram_addr_a     = {LOG2N{1'b0}};
    bram_addr_b     = {LOG2N{1'b0}};
    bram_wdata_a_re = {INTERNAL_W{1'b0}};
    bram_wdata_a_im = {INTERNAL_W{1'b0}};
    bram_wdata_b_re = {INTERNAL_W{1'b0}};
    bram_wdata_b_im = {INTERNAL_W{1'b0}};
    case (state)
        ST_LOAD: begin
            // Store each accepted input sample at its bit-reversed index.
            bram_addr_a     = bit_reverse(load_count);
            bram_we_a       = din_valid;
            // Signed source assigned to a wider signed target: sign-extended.
            bram_wdata_a_re = din_re;
            bram_wdata_a_im = din_im;
        end
        ST_BF_READ: begin
            // Present both butterfly operand addresses (read only).
            bram_addr_a = bf_addr_even;
            bram_addr_b = bf_addr_odd;
        end
        ST_BF_WRITE: begin
            // Write the sum through port A and the difference through port B,
            // to the addresses registered when the operands were read.
            bram_addr_a     = rd_addr_even;
            bram_wdata_a_re = bf_sum_re;
            bram_wdata_a_im = bf_sum_im;
            bram_we_a       = 1'b1;
            bram_addr_b     = rd_addr_odd;
            bram_wdata_b_re = bf_dif_re;
            bram_wdata_b_im = bf_dif_im;
            bram_we_b       = 1'b1;
        end
        ST_OUTPUT: begin
            // Sequential read-out through port A.
            bram_addr_a = out_count[LOG2N-1:0];
        end
        default: ; // inactive values already applied
    endcase
end
// ============================================================================
// DATA MEMORY — True Dual-Port BRAM
// ============================================================================
// For synthesis: xpm_memory_tdpram (Xilinx Parameterized Macros)
// For simulation: behavioral Verilog-2001 model (Icarus-compatible)
// ============================================================================
// XPM read-data wires (directly assigned to rdata regs below)
// Raw (unsigned-typed) read outputs of the data memory, one wire per port
// and per component.
wire [INTERNAL_W-1:0] xpm_douta_re;
wire [INTERNAL_W-1:0] xpm_douta_im;
wire [INTERNAL_W-1:0] xpm_doutb_re;
wire [INTERNAL_W-1:0] xpm_doutb_im;
// Signed view of the same bits. Source and target have equal width, so the
// assignment is a plain bit-for-bit copy.
always @(*) begin : rdata_signed_view
    mem_rdata_a_re = xpm_douta_re;
    mem_rdata_b_re = xpm_doutb_re;
    mem_rdata_a_im = xpm_douta_im;
    mem_rdata_b_im = xpm_doutb_im;
end
`ifndef FFT_XPM_BRAM
// ----------------------------------------------------------------------------
// Default: behavioral TDP model (works with Icarus Verilog -g2001)
// For Vivado synthesis, define FFT_XPM_BRAM to use xpm_memory_tdpram.
// ----------------------------------------------------------------------------
// Storage: separate arrays for the real and imaginary parts
reg [INTERNAL_W-1:0] sim_mem_re [0:N-1];
reg [INTERNAL_W-1:0] sim_mem_im [0:N-1];

// ---- Port A ----
reg [INTERNAL_W-1:0] sim_douta_re;
reg [INTERNAL_W-1:0] sim_douta_im;
always @(posedge clk) begin : sim_port_a
    // Registered read: the non-blocking read samples the array before any
    // write from this edge lands, so a write returns the old word.
    sim_douta_re <= sim_mem_re[bram_addr_a];
    sim_douta_im <= sim_mem_im[bram_addr_a];
    if (bram_we_a) begin
        sim_mem_re[bram_addr_a] <= bram_wdata_a_re;
        sim_mem_im[bram_addr_a] <= bram_wdata_a_im;
    end
end
assign xpm_douta_re = sim_douta_re;
assign xpm_douta_im = sim_douta_im;

// ---- Port B ---- (same behavior as port A)
reg [INTERNAL_W-1:0] sim_doutb_re;
reg [INTERNAL_W-1:0] sim_doutb_im;
always @(posedge clk) begin : sim_port_b
    sim_doutb_re <= sim_mem_re[bram_addr_b];
    sim_doutb_im <= sim_mem_im[bram_addr_b];
    if (bram_we_b) begin
        sim_mem_re[bram_addr_b] <= bram_wdata_b_re;
        sim_mem_im[bram_addr_b] <= bram_wdata_b_im;
    end
end
assign xpm_doutb_re = sim_doutb_re;
assign xpm_doutb_im = sim_doutb_im;

// Start with every word cleared.
integer init_i;
initial begin
    for (init_i = N - 1; init_i >= 0; init_i = init_i - 1) begin
        sim_mem_re[init_i] = 0;
        sim_mem_im[init_i] = 0;
    end
end
`else
// ----------------------------------------------------------------------------
// Synthesis: xpm_memory_tdpram — guaranteed BRAM mapping
// Enabled when FFT_XPM_BRAM is defined (e.g. in Vivado TCL script).
// ----------------------------------------------------------------------------
// Note: Vivado auto-finds XPM library; no `include needed.
// Two instances: one for real, one for imaginary.
// WRITE_MODE = "read_first" matches the behavioral TDP model (a read returns the old data).
// READ_LATENCY = 1 (registered output).
// ----------------------------------------------------------------------------
// Real-part data memory: N words of INTERNAL_W bits, true dual port, both
// ports clocked by clk. Both ports are permanently enabled with their resets
// tied off, so writes are controlled solely by bram_we_a / bram_we_b.
// Read data appears one clock after the address, and "read_first" returns
// the previous contents when a port writes, like the behavioral model.
xpm_memory_tdpram #(
.ADDR_WIDTH_A (LOG2N),
.ADDR_WIDTH_B (LOG2N),
.AUTO_SLEEP_TIME (0),
.BYTE_WRITE_WIDTH_A (INTERNAL_W), // equal to the data width: one write-enable bit per port
.BYTE_WRITE_WIDTH_B (INTERNAL_W),
.CASCADE_HEIGHT (0),
.CLOCKING_MODE ("common_clock"),
.ECC_BIT_RANGE ("7:0"),
.ECC_MODE ("no_ecc"),
.ECC_TYPE ("none"),
.IGNORE_INIT_SYNTH (0),
.MEMORY_INIT_FILE ("none"),
.MEMORY_INIT_PARAM ("0"),
.MEMORY_OPTIMIZATION ("true"),
.MEMORY_PRIMITIVE ("block"),
.MEMORY_SIZE (N * INTERNAL_W), // total size in bits
.MESSAGE_CONTROL (0),
.RAM_DECOMP ("auto"),
.READ_DATA_WIDTH_A (INTERNAL_W),
.READ_DATA_WIDTH_B (INTERNAL_W),
.READ_LATENCY_A (1),
.READ_LATENCY_B (1),
.READ_RESET_VALUE_A ("0"),
.READ_RESET_VALUE_B ("0"),
.RST_MODE_A ("SYNC"),
.RST_MODE_B ("SYNC"),
.SIM_ASSERT_CHK (0),
.USE_EMBEDDED_CONSTRAINT (0),
.USE_MEM_INIT (1),
.USE_MEM_INIT_MMI (0),
.WAKEUP_TIME ("disable_sleep"),
.WRITE_DATA_WIDTH_A (INTERNAL_W),
.WRITE_DATA_WIDTH_B (INTERNAL_W),
.WRITE_MODE_A ("read_first"),
.WRITE_MODE_B ("read_first"),
.WRITE_PROTECT (1)
) u_bram_re (
.clka (clk),
.clkb (clk),
.rsta (1'b0),
.rstb (1'b0),
.ena (1'b1),
.enb (1'b1),
.regcea (1'b1),
.regceb (1'b1),
.addra (bram_addr_a),
.addrb (bram_addr_b),
.dina (bram_wdata_a_re),
.dinb (bram_wdata_b_re),
.wea (bram_we_a),
.web (bram_we_b),
.douta (xpm_douta_re),
.doutb (xpm_doutb_re),
// Error injection and error status are unused (ECC is disabled above)
.injectdbiterra (1'b0),
.injectdbiterrb (1'b0),
.injectsbiterra (1'b0),
.injectsbiterrb (1'b0),
.sbiterra (),
.sbiterrb (),
.dbiterra (),
.dbiterrb (),
.sleep (1'b0)
);
// Imaginary-part data memory. Configured identically to the real-part
// instance and driven by the same addresses and write enables; only the
// write-data and read-data connections differ.
xpm_memory_tdpram #(
.ADDR_WIDTH_A (LOG2N),
.ADDR_WIDTH_B (LOG2N),
.AUTO_SLEEP_TIME (0),
.BYTE_WRITE_WIDTH_A (INTERNAL_W), // equal to the data width: one write-enable bit per port
.BYTE_WRITE_WIDTH_B (INTERNAL_W),
.CASCADE_HEIGHT (0),
.CLOCKING_MODE ("common_clock"),
.ECC_BIT_RANGE ("7:0"),
.ECC_MODE ("no_ecc"),
.ECC_TYPE ("none"),
.IGNORE_INIT_SYNTH (0),
.MEMORY_INIT_FILE ("none"),
.MEMORY_INIT_PARAM ("0"),
.MEMORY_OPTIMIZATION ("true"),
.MEMORY_PRIMITIVE ("block"),
.MEMORY_SIZE (N * INTERNAL_W), // total size in bits
.MESSAGE_CONTROL (0),
.RAM_DECOMP ("auto"),
.READ_DATA_WIDTH_A (INTERNAL_W),
.READ_DATA_WIDTH_B (INTERNAL_W),
.READ_LATENCY_A (1),
.READ_LATENCY_B (1),
.READ_RESET_VALUE_A ("0"),
.READ_RESET_VALUE_B ("0"),
.RST_MODE_A ("SYNC"),
.RST_MODE_B ("SYNC"),
.SIM_ASSERT_CHK (0),
.USE_EMBEDDED_CONSTRAINT (0),
.USE_MEM_INIT (1),
.USE_MEM_INIT_MMI (0),
.WAKEUP_TIME ("disable_sleep"),
.WRITE_DATA_WIDTH_A (INTERNAL_W),
.WRITE_DATA_WIDTH_B (INTERNAL_W),
.WRITE_MODE_A ("read_first"),
.WRITE_MODE_B ("read_first"),
.WRITE_PROTECT (1)
) u_bram_im (
.clka (clk),
.clkb (clk),
.rsta (1'b0),
.rstb (1'b0),
.ena (1'b1),
.enb (1'b1),
.regcea (1'b1),
.regceb (1'b1),
.addra (bram_addr_a),
.addrb (bram_addr_b),
.dina (bram_wdata_a_im),
.dinb (bram_wdata_b_im),
.wea (bram_we_a),
.web (bram_we_b),
.douta (xpm_douta_im),
.doutb (xpm_doutb_im),
// Error injection and error status are unused (ECC is disabled above)
.injectdbiterra (1'b0),
.injectdbiterrb (1'b0),
.injectsbiterra (1'b0),
.injectsbiterrb (1'b0),
.sbiterra (),
.sbiterrb (),
.dbiterra (),
.dbiterrb (),
.sleep (1'b0)
);
`endif
// ============================================================================
// OUTPUT PIPELINE
// ============================================================================
reg out_pipe_valid;   // a memory read was issued in the previous cycle of the output phase
reg out_pipe_inverse; // value of `inverse` one clock earlier
// One-stage delay line that follows the memory's one-cycle read latency.
// Synchronous reset only: these are internal pipeline bits.
always @(posedge clk) begin : out_pipe
    if (!reset_n) begin
        {out_pipe_valid, out_pipe_inverse} <= 2'b00;
    end else begin
        out_pipe_inverse <= inverse;
        // A read is in flight while the FSM is in the output phase and the
        // read index has not yet moved past the last sample.
        out_pipe_valid <= (state == ST_OUTPUT) && !(out_count > FFT_N_M1[LOG2N-1:0]);
    end
end
// ============================================================================
// MAIN FSM — Block 1: Control / FSM / Output Interface (async reset)
// ============================================================================
// Retains async reset for deterministic startup of FSM state and external
// output interface signals (dout_re/im, dout_valid, done).
// ============================================================================
// Control FSM. Sequences load -> butterfly passes -> output -> done, owns the
// progress counters (load_count, bfly_count, stage, half_reg, out_count) and
// drives the registered output interface (dout_*, dout_valid, done).
always @(posedge clk or negedge reset_n) begin
if (!reset_n) begin
state <= ST_IDLE;
load_count <= 0;
out_count <= 0;
bfly_count <= 0;
stage <= 0;
half_reg <= 1;
dout_re <= 0;
dout_im <= 0;
dout_valid <= 0;
done <= 0;
end else begin
// Defaults: both strobes drop every cycle unless re-asserted below.
dout_valid <= 1'b0;
done <= 1'b0;
case (state)
ST_IDLE: begin
// Wait for start; input data is not examined in this state.
if (start) begin
state <= ST_LOAD;
load_count <= 0;
end
end
ST_LOAD: begin
// Count accepted samples. The memory write itself (to the bit-reversed
// address) is issued by the port mux in the same cycle.
if (din_valid) begin
if (load_count == FFT_N_M1[LOG2N-1:0]) begin
// Last sample accepted: begin stage 0 with a half-span of 1.
state <= ST_BF_READ;
stage <= 0;
bfly_count <= 0;
half_reg <= 1;
end else begin
load_count <= load_count + 1;
end
end
end
// The next three states only step through the butterfly sequence; the
// datapath work for each is done in the datapath pipeline block.
ST_BF_READ: begin
state <= ST_BF_TW;
end
ST_BF_TW: begin
state <= ST_BF_MULT2;
end
ST_BF_MULT2: begin
state <= ST_BF_WRITE;
end
ST_BF_WRITE: begin
if (bfly_count == FFT_N_HALF_M1[LOG2N-1:0]) begin
// Last butterfly of this stage.
bfly_count <= 0;
if (stage == LOG2N - 1) begin
// Last stage finished: start streaming results from index 0.
state <= ST_OUTPUT;
out_count <= 0;
end else begin
// Next stage: the half-span doubles.
stage <= stage + 1;
half_reg <= half_reg << 1;
state <= ST_BF_READ;
end
end else begin
bfly_count <= bfly_count + 1;
state <= ST_BF_READ;
end
end
ST_OUTPUT: begin
// out_count is the memory read address; it stops advancing once it
// has stepped past the last sample index.
if (out_count <= FFT_N_M1[LOG2N-1:0]) begin
out_count <= out_count + 1;
end
// Read data for the address issued one cycle ago is available now.
if (out_pipe_valid) begin
if (out_pipe_inverse) begin
// Inverse transform: arithmetic shift right by LOG2N before clamping.
dout_re <= saturate(mem_rdata_a_re >>> LOG2N);
dout_im <= saturate(mem_rdata_a_im >>> LOG2N);
end else begin
dout_re <= saturate(mem_rdata_a_re);
dout_im <= saturate(mem_rdata_a_im);
end
dout_valid <= 1'b1;
end
// Leave only after every read has been issued and the final one has
// been transferred to dout_*.
if (out_count > FFT_N_M1[LOG2N-1:0] && !out_pipe_valid) begin
state <= ST_DONE;
end
end
ST_DONE: begin
// done is registered, so it is seen high in the cycle after this state.
done <= 1'b1;
state <= ST_IDLE;
end
default: state <= ST_IDLE;
endcase
end
end
// ============================================================================
// MAIN FSM — Block 2: DSP/BRAM Datapath Pipeline (sync reset)
// ============================================================================
// Sync reset enables Vivado to absorb these registers into hard blocks:
// - rd_b_re/im → DSP48E1 AREG (butterfly multiply A-port input)
// - rd_tw_cos/sin → DSP48E1 BREG (butterfly multiply B-port input)
// - bf_prod_re/im → DSP48E1 PREG (multiply output register)
// - rd_a_re/im → BRAM output register (REGCE)
// - rd_tw_idx → pipeline register (twiddle index)
// - rd_addr_even/odd, rd_inverse — internal pipeline
//
// These registers are only meaningful during COMPUTE states (BF_READ through
// BF_WRITE). Their values are always overwritten before use after every FSM
// transition, so sync reset is functionally equivalent to async reset.
// ============================================================================
// Datapath pipeline registers, advanced according to the FSM state:
//   BF_READ  : latch operand addresses, inverse flag and twiddle index
//   BF_TW    : latch memory read data and the looked-up twiddle cos/sin
//   BF_MULT2 : latch the twiddle product of the port-B operand
// All other states leave every register unchanged.
always @(posedge clk) begin
if (!reset_n) begin
rd_tw_cos <= 0;
rd_tw_sin <= 0;
rd_addr_even <= 0;
rd_addr_odd <= 0;
rd_inverse <= 0;
rd_tw_idx <= 0;
rd_a_re <= 0;
rd_a_im <= 0;
rd_b_re <= 0;
rd_b_im <= 0;
bf_prod_re <= 0;
bf_prod_im <= 0;
end else begin
case (state)
ST_BF_READ: begin
// Register butterfly addresses and twiddle index.
// BRAM read initiated by bram_port_mux (addresses presented
// combinationally); data arrives next cycle (ST_BF_TW).
// Twiddle ROM lookup uses rd_tw_idx next cycle, breaking the
// address-calc -> ROM -> quarter-wave-mux combinational path.
rd_addr_even <= bf_addr_even;
rd_addr_odd <= bf_addr_odd;
// The inverse input is re-sampled here for every butterfly.
rd_inverse <= inverse;
rd_tw_idx <= bf_tw_idx;
end
ST_BF_TW: begin
// BRAM data valid this cycle (1-cycle read latency).
// Capture BRAM data into pipeline regs.
// Twiddle ROM lookup is combinational from registered rd_tw_idx
// -- capture the result into rd_tw_cos/sin.
rd_a_re <= mem_rdata_a_re;
rd_a_im <= mem_rdata_a_im;
rd_b_re <= mem_rdata_b_re;
rd_b_im <= mem_rdata_b_im;
rd_tw_cos <= tw_cos_lookup;
rd_tw_sin <= tw_sin_lookup;
end
ST_BF_MULT2: begin
// Compute raw twiddle products from registered BRAM data.
// Path: register -> DSP48E1 multiply-accumulate -> PREG
// The arithmetic shift and add/subtract are handled combinationally
// in BF_WRITE (shift is pure bit-select, zero logic levels).
if (!rd_inverse) begin
// Forward: (b_re + j*b_im) * (cos - j*sin)
bf_prod_re <= rd_b_re * rd_tw_cos + rd_b_im * rd_tw_sin;
bf_prod_im <= rd_b_im * rd_tw_cos - rd_b_re * rd_tw_sin;
end else begin
// Inverse: (b_re + j*b_im) * (cos + j*sin)
bf_prod_re <= rd_b_re * rd_tw_cos - rd_b_im * rd_tw_sin;
bf_prod_im <= rd_b_im * rd_tw_cos + rd_b_re * rd_tw_sin;
end
end
default: begin
// No datapath update in other states — registers hold values
end
endcase
end
end
endmodule