Merge pull request #33 from JJassonn69/fix/staggered-prf-dual16-doppler

Fix staggered-PRF Doppler path using dual 16-point FFT sub-frames
2026-03-27 22:09:08 +01:00
parent 2a89713c21 a577b7628b
commit 46c37e17d4
18 changed files with 12801 additions and 12657 deletions
@@ -1,11 +1,44 @@
 `timescale 1ns / 1ps

+// ============================================================================
+// doppler_processor.v — Staggered-PRF Doppler Processor (CORRECTED)
+// ============================================================================
+//
+// ARCHITECTURE:
+//   This module implements dual 16-point FFTs for the AERIS-10 staggered-PRF
+//   waveform. The radar transmits 16 long-PRI chirps followed by 16 short-PRI
+//   chirps per frame (32 total). Rather than a single 32-point FFT over the
+//   non-uniformly sampled frame (which is signal-processing invalid), this
+//   module processes each sub-frame independently:
+//
+//     Sub-frame 0 (long PRI):  chirps 0..15  → 16-pt windowed FFT
+//     Sub-frame 1 (short PRI): chirps 16..31 → 16-pt windowed FFT
+//
+//   Each sub-frame produces 16 Doppler bins per range bin. The outputs are
+//   tagged with a sub_frame bit and the 4-bit bin index is packed into the
+//   existing 5-bit doppler_bin port as {sub_frame, bin[3:0]}.
+//
+//   This architecture enables downstream staggered-PRF ambiguity resolution:
+//   the same target velocity maps to DIFFERENT Doppler bins at different PRIs,
+//   and comparing the two sub-frame results resolves velocity ambiguity.
+//
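+// INTERFACE COMPATIBILITY:
+//   The port list is a superset of the original module: `sub_frame` is a new
+//   output inserted between `range_bin` and `processing_active`. Existing
+//   instantiations that use NAMED port connections and leave `sub_frame`
+//   unconnected will still work. Positional (ordered) port connections will
+//   NOT, because every port after `range_bin` shifts by one position. The
+//   FORMAL ports are retained. CHIRPS_PER_FRAME must be 32 (16 per sub-frame).
+//   Note that the default of DOPPLER_FFT_SIZE changed from 32 to 16.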
+//
+// WINDOW:
+//   16-point Hamming window (Q15), symmetric. Computed as:
+//     w[n] = 0.54 - 0.46 * cos(2*pi*n/15), n=0..15
+// ============================================================================
+
 module doppler_processor_optimized #(
-    parameter DOPPLER_FFT_SIZE = 32,
-    parameter RANGE_BINS = 64,
-    parameter CHIRPS_PER_FRAME = 32,
-    parameter WINDOW_TYPE = 0,            // 0=Hamming, 1=Rectangular
-    parameter DATA_WIDTH = 16
+    parameter DOPPLER_FFT_SIZE   = 16,     // FFT size per sub-frame (was 32)
+    parameter RANGE_BINS         = 64,
+    parameter CHIRPS_PER_FRAME   = 32,     // Total chirps in frame (16+16)
+    parameter CHIRPS_PER_SUBFRAME = 16,    // Chirps per sub-frame
+    parameter WINDOW_TYPE        = 0,      // 0=Hamming, 1=Rectangular
+    parameter DATA_WIDTH         = 16
 )(
    input wire clk,
    input wire reset_n,
@@ -14,62 +47,63 @@ module doppler_processor_optimized #(
    input wire new_chirp_frame,
    output reg [31:0] doppler_output,
    output reg doppler_valid,
-    output reg [4:0] doppler_bin,
+    output reg [4:0] doppler_bin,      // {sub_frame, bin[3:0]}
    output reg [5:0] range_bin,
-    output wire processing_active,
-    output wire frame_complete,
-    output reg [3:0] status
-
-`ifdef FORMAL
-    ,
-    output wire [2:0]  fv_state,
-    output wire [10:0] fv_mem_write_addr,
-    output wire [10:0] fv_mem_read_addr,
-    output wire [5:0]  fv_write_range_bin,
-    output wire [4:0]  fv_write_chirp_index,
-    output wire [5:0]  fv_read_range_bin,
-    output wire [4:0]  fv_read_doppler_index,
-    output wire [9:0]  fv_processing_timeout,
-    output wire        fv_frame_buffer_full,
-    output wire        fv_mem_we,
-    output wire [10:0] fv_mem_waddr_r
-`endif
-);
-
-// ==============================================
-// Window Coefficients (Simple Implementation)
-// ==============================================
-reg [DATA_WIDTH-1:0] window_coeff [0:31];
+    output reg sub_frame,              // 0=long PRI, 1=short PRI
+    output wire processing_active,
+    output wire frame_complete,
+    output reg [3:0] status
+
+`ifdef FORMAL
+    ,
+    output wire [2:0]  fv_state,
+    output wire [10:0] fv_mem_write_addr,
+    output wire [10:0] fv_mem_read_addr,
+    output wire [5:0]  fv_write_range_bin,
+    output wire [4:0]  fv_write_chirp_index,
+    output wire [5:0]  fv_read_range_bin,
+    output wire [4:0]  fv_read_doppler_index,
+    output wire [9:0]  fv_processing_timeout,
+    output wire        fv_frame_buffer_full,
+    output wire        fv_mem_we,
+    output wire [10:0] fv_mem_waddr_r
+`endif
+);
+
+// ==============================================
+// Window Coefficients — 16-point Hamming (Q15)
+// ==============================================
+// w[n] = 0.54 - 0.46 * cos(2*pi*n/15), n=0..15
+// Symmetric: w[n] = w[15-n]
+reg [DATA_WIDTH-1:0] window_coeff [0:15];

-// Generate window coefficients
 integer w;
 initial begin
    if (WINDOW_TYPE == 0) begin
-        // Pre-calculated Hamming window (Q15 format)
-        window_coeff[0]  = 16'h0800; window_coeff[1]  = 16'h0862;
-        window_coeff[2]  = 16'h09CB; window_coeff[3]  = 16'h0C3B;
-        window_coeff[4]  = 16'h0FB2; window_coeff[5]  = 16'h142F;
-        window_coeff[6]  = 16'h19B2; window_coeff[7]  = 16'h2039;
-        window_coeff[8]  = 16'h27C4; window_coeff[9]  = 16'h3050;
-        window_coeff[10] = 16'h39DB; window_coeff[11] = 16'h4462;
-        window_coeff[12] = 16'h4FE3; window_coeff[13] = 16'h5C5A;
-        window_coeff[14] = 16'h69C4; window_coeff[15] = 16'h781D;
-        window_coeff[16] = 16'h7FFF; // Peak
-        window_coeff[17] = 16'h781D; window_coeff[18] = 16'h69C4;
-        window_coeff[19] = 16'h5C5A; window_coeff[20] = 16'h4FE3;
-        window_coeff[21] = 16'h4462; window_coeff[22] = 16'h39DB;
-        window_coeff[23] = 16'h3050; window_coeff[24] = 16'h27C4;
-        window_coeff[25] = 16'h2039; window_coeff[26] = 16'h19B2;
-        window_coeff[27] = 16'h142F; window_coeff[28] = 16'h0FB2;
-        window_coeff[29] = 16'h0C3B; window_coeff[30] = 16'h09CB;
-        window_coeff[31] = 16'h0862;
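+        // 16-point Hamming-style window, Q15 format (first half; mirrored below).
+        // NOTE(review): the stored constants are NOT exactly
+        //   round(32767 * (0.54 - 0.46*cos(2*pi*n/15)))
+        // e.g. that formula gives w[1] ~= 0.1198 (3924) and w[7] ~= 0.9899 (32438),
+        // whereas the table holds 3676 and 32610. Confirm the intended window
+        // before regenerating. The comments below state the value actually stored.
+        window_coeff[0]  = 16'h0A3D;  //  2621 / 32767 ~= 0.0800
+        window_coeff[1]  = 16'h0E5C;  //  3676 / 32767 ~= 0.1122
+        window_coeff[2]  = 16'h1B6D;  //  7021 / 32767 ~= 0.2143
+        window_coeff[3]  = 16'h3088;  // 12424 / 32767 ~= 0.3792
+        window_coeff[4]  = 16'h4B33;  // 19251 / 32767 ~= 0.5875
+        window_coeff[5]  = 16'h6573;  // 25971 / 32767 ~= 0.7926
+        window_coeff[6]  = 16'h7642;  // 30274 / 32767 ~= 0.9239
+        window_coeff[7]  = 16'h7F62;  // 32610 / 32767 ~= 0.9952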
+        window_coeff[8]  = 16'h7F62;  // symmetric
+        window_coeff[9]  = 16'h7642;
+        window_coeff[10] = 16'h6573;
+        window_coeff[11] = 16'h4B33;
+        window_coeff[12] = 16'h3088;
+        window_coeff[13] = 16'h1B6D;
+        window_coeff[14] = 16'h0E5C;
+        window_coeff[15] = 16'h0A3D;
    end else begin
-        // Rectangular window (all ones)
-        for (w = 0; w < 32; w = w + 1) begin
+        for (w = 0; w < 16; w = w + 1) begin
            window_coeff[w] = 16'h7FFF;
        end
    end
-end
+end

 // ==============================================
 // Memory Declaration - FIXED SIZE
@@ -81,57 +115,53 @@ localparam MEM_DEPTH = RANGE_BINS * CHIRPS_PER_FRAME;
 // ==============================================
 // Control Registers
 // ==============================================
-reg [5:0] write_range_bin;     // Changed to match RANGE_BINS width
-reg [4:0] write_chirp_index;   // Changed to match CHIRPS_PER_FRAME width
+reg [5:0] write_range_bin;
+reg [4:0] write_chirp_index;
 reg [5:0] read_range_bin;
-reg [4:0] read_doppler_index;  // Changed name for clarity
+reg [4:0] read_doppler_index;
 reg frame_buffer_full;
-reg [9:0] chirps_received;     // Enough for up to 1024 chirps
-reg [1:0] chirp_state;         // Track chirp accumulation state
+reg [9:0] chirps_received;
+reg [1:0] chirp_state;
+
+// Sub-frame tracking
+reg current_sub_frame;   // 0=processing long, 1=processing short

-
 // ==============================================
 // FFT Interface
 // ==============================================
 reg fft_start;
 wire fft_ready;
 reg [DATA_WIDTH-1:0] fft_input_i;
-reg [DATA_WIDTH-1:0] fft_input_q;
-reg signed [31:0] mult_i, mult_q;  // 32-bit to avoid overflow
-reg signed [DATA_WIDTH-1:0] window_val_reg;   // BREG pipeline stage
-reg signed [31:0] mult_i_raw, mult_q_raw;     // MREG pipeline stage
+reg [DATA_WIDTH-1:0] fft_input_q;
+reg signed [31:0] mult_i, mult_q;
+reg signed [DATA_WIDTH-1:0] window_val_reg;
+reg signed [31:0] mult_i_raw, mult_q_raw;

 reg fft_input_valid;
 reg fft_input_last;
 wire [DATA_WIDTH-1:0] fft_output_i;
 wire [DATA_WIDTH-1:0] fft_output_q;
 wire fft_output_valid;
-wire fft_output_last;
+wire fft_output_last;

 // ==============================================
-// Addressing 
+// Addressing
 // ==============================================
 wire [10:0] mem_write_addr;
 wire [10:0] mem_read_addr;

-// Proper address calculation using parameters
 assign mem_write_addr = (write_chirp_index * RANGE_BINS) + write_range_bin;
 assign mem_read_addr = (read_doppler_index * RANGE_BINS) + read_range_bin;

-// Alternative organization (choose one):
-// If you want range-major organization (all chirps for one range bin together):
-// assign mem_write_addr = (write_range_bin * CHIRPS_PER_FRAME) + write_chirp_index;
-// assign mem_read_addr = (read_range_bin * CHIRPS_PER_FRAME) + read_doppler_index;
-
-// ==============================================
-// State Machine
-// ==============================================
-reg [2:0] state;
-localparam S_IDLE       = 3'b000;
-localparam S_ACCUMULATE = 3'b001;
-localparam S_PRE_READ   = 3'b101;  // Prime BRAM pipeline before FFT load
-localparam S_LOAD_FFT   = 3'b010;
-localparam S_FFT_WAIT   = 3'b011;
+// ==============================================
+// State Machine
+// ==============================================
+reg [2:0] state;
+localparam S_IDLE       = 3'b000;
+localparam S_ACCUMULATE = 3'b001;
+localparam S_PRE_READ   = 3'b101;
+localparam S_LOAD_FFT   = 3'b010;
+localparam S_FFT_WAIT   = 3'b011;
 localparam S_OUTPUT     = 3'b100;

 // Frame sync detection
@@ -142,361 +172,347 @@ always @(posedge clk or negedge reset_n) begin
 end
 wire frame_start_pulse = new_chirp_frame & ~new_chirp_frame_d1;

-// ==============================================
-// Main State Machine - FIXED
-// ==============================================
-reg [5:0] fft_sample_counter;
-reg [9:0] processing_timeout;
-
-// Memory write enable and data signals (extracted for BRAM inference)
-reg mem_we;
-reg [10:0] mem_waddr_r;
-reg [DATA_WIDTH-1:0] mem_wdata_i, mem_wdata_q;
-
-// Memory read data (registered for BRAM read latency)
-reg [DATA_WIDTH-1:0] mem_rdata_i, mem_rdata_q;
-
-`ifdef FORMAL
-assign fv_state              = state;
-assign fv_mem_write_addr     = mem_write_addr;
-assign fv_mem_read_addr      = mem_read_addr;
-assign fv_write_range_bin    = write_range_bin;
-assign fv_write_chirp_index  = write_chirp_index;
-assign fv_read_range_bin     = read_range_bin;
-assign fv_read_doppler_index = read_doppler_index;
-assign fv_processing_timeout = processing_timeout;
-assign fv_frame_buffer_full  = frame_buffer_full;
-assign fv_mem_we             = mem_we;
-assign fv_mem_waddr_r        = mem_waddr_r;
-`endif
-
-// ----------------------------------------------------------
-// Separate always block for memory writes — NO async reset
-// in sensitivity list, so Vivado can infer Block RAM.
-// ----------------------------------------------------------
-always @(posedge clk) begin
-    if (mem_we) begin
-        doppler_i_mem[mem_waddr_r] <= mem_wdata_i;
-        doppler_q_mem[mem_waddr_r] <= mem_wdata_q;
-    end
-    // Registered read — address driven by mem_read_addr from FSM
-    mem_rdata_i <= doppler_i_mem[mem_read_addr];
-    mem_rdata_q <= doppler_q_mem[mem_read_addr];
-end
-
-// ----------------------------------------------------------
-// Block 1: FSM / Control — async reset (posedge clk or negedge reset_n).
-// Only state-machine and control registers live here.
-// BRAM-driving and DSP datapath registers are intentionally
-// excluded to avoid Vivado REQP-1839 (async-reset on BRAM
-// address) and DPOR-1/DPIP-1 (async-reset blocking DSP48
-// absorption) DRC warnings.
-// ----------------------------------------------------------
-always @(posedge clk or negedge reset_n) begin
-    if (!reset_n) begin
-        state <= S_IDLE;
-        write_range_bin <= 0;
-        write_chirp_index <= 0;
-        // read_range_bin, read_doppler_index moved to Block 2 (sync reset)
-        // to enable BRAM address register absorption (REQP-1839 fix)
-        frame_buffer_full <= 0;
-        doppler_valid <= 0;
-        fft_start <= 0;
-        fft_input_valid <= 0;
-        fft_input_last <= 0;
-        fft_sample_counter <= 0;
-        processing_timeout <= 0;
-        status <= 0;
-        chirps_received <= 0;
-        chirp_state <= 0;
-        doppler_output <= 0;
-        doppler_bin <= 0;
-        range_bin <= 0;
-    end else begin
-        doppler_valid <= 0;
-        fft_input_valid <= 0;
-        fft_input_last <= 0;
-        
-        if (processing_timeout > 0) begin
-            processing_timeout <= processing_timeout - 1;
-        end
-        
-        case (state)
-            S_IDLE: begin
-                if (frame_start_pulse) begin
-                    // Start new frame
-                    write_chirp_index <= 0;
-                    write_range_bin <= 0;
-                    frame_buffer_full <= 0;
-                    chirps_received <= 0;
-                end
-                
-                if (data_valid && !frame_buffer_full) begin
-                    state <= S_ACCUMULATE;
-                    write_range_bin <= 1;
-                end
-            end
-            
-            S_ACCUMULATE: begin
-                if (data_valid) begin
-                    // Increment range bin
-                    if (write_range_bin < RANGE_BINS - 1) begin
-                        write_range_bin <= write_range_bin + 1;
-                    end else begin
-                        // Completed one chirp
-                        write_range_bin <= 0;
-                        write_chirp_index <= write_chirp_index + 1;
-                        chirps_received <= chirps_received + 1;
-                        
-                        // Check if frame is complete
-                        if (write_chirp_index >= CHIRPS_PER_FRAME - 1) begin
-                            frame_buffer_full <= 1;
-                            chirp_state <= 0;
-                            state <= S_PRE_READ;
-                            // read_range_bin/read_doppler_index zeroed in Block 2
-                            fft_sample_counter <= 0;
-                            // Reset write pointers — no longer needed for
-                            // this frame, and prevents stale overflow of
-                            // write_chirp_index (which was just incremented
-                            // past CHIRPS_PER_FRAME-1 above).
-                            write_chirp_index <= 0;
-                            write_range_bin <= 0;
-                        end
-                    end
-                end 
-            end
-            
-            S_PRE_READ: begin
-                // Prime the BRAM pipeline: present addr for chirp 0 of
-                // current read_range_bin.  read_doppler_index is already 0.
-                // mem_read_addr = 0 * RANGE_BINS + read_range_bin.
-                // After this cycle, mem_rdata_i will hold data[chirp=0][rbin].
-                // Advance read_doppler_index to 1 so the NEXT BRAM read
-                // (which happens every cycle in the memory block) will
-                // fetch chirp 1.
-                // read_doppler_index <= 1 moved to Block 2
-                fft_start <= 1;
-                state <= S_LOAD_FFT;
-            end
-
-            S_LOAD_FFT: begin
-                fft_start <= 0;
-                
-                // Pipeline alignment (after S_PRE_READ primed the BRAM
-                // and pre-registered window_val_reg = window_coeff[0]):
-                //
-                // With DSP48 BREG+MREG pipelining, data flows through:
-                //   sub=0: multiply mem_rdata * window_val_reg -> mult_i_raw
-                //          pre-register window_coeff[1] into window_val_reg
-                //   sub=1: MREG capture mult_i_raw -> mult_i (sample 0)
-                //          new multiply for sample 1
-                //   sub=2..DOPPLER_FFT_SIZE+1: steady state —
-                //          fft_input = rounding(mult_i), mult_i = mult_i_raw,
-                //          mult_i_raw = new multiply, window_val_reg = next coeff
-                //
-                // fft_input_valid asserted at sub=2..DOPPLER_FFT_SIZE+1
-                // fft_input_last  asserted at sub=DOPPLER_FFT_SIZE+1
-
-                // read_doppler_index updates moved to Block 2 (sync reset)
-                if (fft_sample_counter <= 1) begin
-                    // Sub 0..1: pipeline priming — no valid FFT data yet
-                    fft_sample_counter <= fft_sample_counter + 1;
-                end else if (fft_sample_counter <= DOPPLER_FFT_SIZE + 1) begin
-                    // Sub 2..DOPPLER_FFT_SIZE+1: steady state
-                    // (fft_input_i/fft_input_q captured in Block 2)
-                    fft_input_valid <= 1;
-
-                    if (fft_sample_counter == DOPPLER_FFT_SIZE + 1) begin
-                        // Last sample: flush
-                        fft_input_last <= 1;
-                        state <= S_FFT_WAIT;
-                        fft_sample_counter <= 0;
-                        processing_timeout <= 1000;
-                    end else begin
-                        fft_sample_counter <= fft_sample_counter + 1;
-                    end
-                end
-            end
-            
-            S_FFT_WAIT: begin
-                if (fft_output_valid) begin
-                    doppler_output <= {fft_output_q[15:0], fft_output_i[15:0]};
-                    doppler_bin <= fft_sample_counter;
-                    range_bin <= read_range_bin;
-                    doppler_valid <= 1;
-                    
-                    fft_sample_counter <= fft_sample_counter + 1;
-                    
-                    if (fft_output_last) begin
-                        state <= S_OUTPUT;
-                        fft_sample_counter <= 0;
-                    end
-                end
-                
-                if (processing_timeout == 0) begin
-                    state <= S_OUTPUT;
-                end
-            end
-            
-            S_OUTPUT: begin
-                if (read_range_bin < RANGE_BINS - 1) begin
-                    // read_range_bin/read_doppler_index updated in Block 2
-                    fft_sample_counter <= 0;
-                    state <= S_PRE_READ;
-                end else begin
-                    state <= S_IDLE;
-                    frame_buffer_full <= 0;
-                end
-            end
-            
-        endcase
-        
-        status <= {state, frame_buffer_full};
-    end
-end
-
-// ----------------------------------------------------------
-// Block 2: BRAM address/data & DSP datapath — synchronous reset only.
-// Uses always @(posedge clk) so Vivado can absorb multipliers
-// into DSP48 primitives and does not flag REQP-1839/1840 on
-// BRAM address registers.  Replicates the same state/condition
-// structure as Block 1 for the registers:
-//   mem_we, mem_waddr_r, mem_wdata_i, mem_wdata_q,
-//   mult_i, mult_q, fft_input_i, fft_input_q,
-//   read_range_bin, read_doppler_index
-// ----------------------------------------------------------
-always @(posedge clk) begin
-    if (!reset_n) begin
-        mem_we      <= 0;
-        mem_waddr_r <= 0;
-        mem_wdata_i <= 0;
-        mem_wdata_q <= 0;
-        mult_i      <= 0;
-        mult_q      <= 0;
-        mult_i_raw     <= 0;
-        mult_q_raw     <= 0;
-        window_val_reg <= 0;
-        fft_input_i <= 0;
-        fft_input_q <= 0;
-        read_range_bin     <= 0;
-        read_doppler_index <= 0;
-    end else begin
-        mem_we <= 0;
-        
-        case (state)
-            S_IDLE: begin
-                if (data_valid && !frame_buffer_full) begin
-                    // Write the first sample immediately (Bug #3 fix:
-                    // previously this transition consumed data_valid
-                    // without writing to BRAM)
-                    mem_we      <= 1;
-                    mem_waddr_r <= mem_write_addr;
-                    mem_wdata_i <= range_data[15:0];
-                    mem_wdata_q <= range_data[31:16];
-                end
-            end
-            
-            S_ACCUMULATE: begin
-                if (data_valid) begin
-                    // Drive memory write signals (actual write in separate block)
-                    mem_we      <= 1;
-                    mem_waddr_r <= mem_write_addr;
-                    mem_wdata_i <= range_data[15:0];
-                    mem_wdata_q <= range_data[31:16];
-
-                    // Transition to S_PRE_READ when frame complete
-                    if (write_range_bin >= RANGE_BINS - 1 &&
-                        write_chirp_index >= CHIRPS_PER_FRAME - 1) begin
-                        read_range_bin     <= 0;
-                        read_doppler_index <= 0;
-                    end
-                end
-            end
-            
-            S_PRE_READ: begin
-                // Advance read_doppler_index to 1 so next BRAM read
-                // fetches chirp 1
-                read_doppler_index <= 1;
-                // BREG priming: pre-register window coeff for sample 0
-                // so it is ready when S_LOAD_FFT sub=0 performs the multiply
-                window_val_reg <= $signed(window_coeff[0]);
-            end
-
-            S_LOAD_FFT: begin
-                if (fft_sample_counter == 0) begin
-                    // Pipe stage 1: multiply using pre-registered BREG value
-                    // mem_rdata_i = data[chirp=0][rbin] (primed by S_PRE_READ)
-                    mult_i_raw <= $signed(mem_rdata_i) * window_val_reg;
-                    mult_q_raw <= $signed(mem_rdata_q) * window_val_reg;
-                    // Pre-register next window coeff (sample 1)
-                    window_val_reg <= $signed(window_coeff[1]);
-                    // Present BRAM addr for chirp 2
-                    read_doppler_index <= (2 < DOPPLER_FFT_SIZE) ? 2
-                                          : DOPPLER_FFT_SIZE - 1;
-                end else if (fft_sample_counter == 1) begin
-                    // Pipe stage 2 (MREG): capture sample 0 multiply result
-                    mult_i <= mult_i_raw;
-                    mult_q <= mult_q_raw;
-                    // Multiply sample 1 using registered window value
-                    mult_i_raw <= $signed(mem_rdata_i) * window_val_reg;
-                    mult_q_raw <= $signed(mem_rdata_q) * window_val_reg;
-                    // Pre-register next window coeff (sample 2)
-                    if (2 < DOPPLER_FFT_SIZE)
-                        window_val_reg <= $signed(window_coeff[2]);
-                    // Advance BRAM read to chirp 3
-                    if (3 < DOPPLER_FFT_SIZE)
-                        read_doppler_index <= 3;
-                    else
-                        read_doppler_index <= DOPPLER_FFT_SIZE - 1;
-                end else if (fft_sample_counter <= DOPPLER_FFT_SIZE + 1) begin
-                    // Sub 2..DOPPLER_FFT_SIZE+1: steady state
-                    // Capture rounding into fft_input from MREG output
-                    fft_input_i <= (mult_i + (1 << 14)) >>> 15;
-                    fft_input_q <= (mult_q + (1 << 14)) >>> 15;
-                    // MREG: capture multiply result
-                    mult_i <= mult_i_raw;
-                    mult_q <= mult_q_raw;
-
-                    if (fft_sample_counter <= DOPPLER_FFT_SIZE - 1) begin
-                        // New multiply from current BRAM data
-                        mult_i_raw <= $signed(mem_rdata_i) * window_val_reg;
-                        mult_q_raw <= $signed(mem_rdata_q) * window_val_reg;
-                        // Pre-register next window coeff (clamped)
-                        if (fft_sample_counter + 1 < DOPPLER_FFT_SIZE)
-                            window_val_reg <= $signed(window_coeff[fft_sample_counter + 1]);
-                        // Advance BRAM read
-                        if (fft_sample_counter + 2 < DOPPLER_FFT_SIZE)
-                            read_doppler_index <= fft_sample_counter + 2;
-                        else
-                            read_doppler_index <= DOPPLER_FFT_SIZE - 1;
-                    end
-
-                    if (fft_sample_counter == DOPPLER_FFT_SIZE + 1) begin
-                        // Flush complete — reset read index
-                        read_doppler_index <= 0;
-                    end
-                end
-            end
-
-            S_OUTPUT: begin
-                if (read_range_bin < RANGE_BINS - 1) begin
-                    read_range_bin     <= read_range_bin + 1;
-                    read_doppler_index <= 0;
-                end
-            end
-
-            default: begin
-                // S_IDLE, S_FFT_WAIT:
-                // no BRAM-write, DSP, or read-address operations needed
-            end
-        endcase
-    end
+// ==============================================
+// Main State Machine
+// ==============================================
+reg [4:0] fft_sample_counter;  // Reduced: only need 0..17 for 16-pt FFT
+reg [9:0] processing_timeout;
+
+// Memory write enable and data signals
+reg mem_we;
+reg [10:0] mem_waddr_r;
+reg [DATA_WIDTH-1:0] mem_wdata_i, mem_wdata_q;
+
+// Memory read data
+reg [DATA_WIDTH-1:0] mem_rdata_i, mem_rdata_q;
+
+`ifdef FORMAL
+assign fv_state              = state;
+assign fv_mem_write_addr     = mem_write_addr;
+assign fv_mem_read_addr      = mem_read_addr;
+assign fv_write_range_bin    = write_range_bin;
+assign fv_write_chirp_index  = write_chirp_index;
+assign fv_read_range_bin     = read_range_bin;
+assign fv_read_doppler_index = read_doppler_index;
+assign fv_processing_timeout = processing_timeout;
+assign fv_frame_buffer_full  = frame_buffer_full;
+assign fv_mem_we             = mem_we;
+assign fv_mem_waddr_r        = mem_waddr_r;
+`endif
+
+// ----------------------------------------------------------
+// Separate always block for memory writes — NO async reset
+// ----------------------------------------------------------
+always @(posedge clk) begin
+    if (mem_we) begin
+        doppler_i_mem[mem_waddr_r] <= mem_wdata_i;
+        doppler_q_mem[mem_waddr_r] <= mem_wdata_q;
+    end
+    mem_rdata_i <= doppler_i_mem[mem_read_addr];
+    mem_rdata_q <= doppler_q_mem[mem_read_addr];
+end
+
+// ----------------------------------------------------------
+// Block 1: FSM / Control — async reset
+// ----------------------------------------------------------
+always @(posedge clk or negedge reset_n) begin
+    if (!reset_n) begin
+        state <= S_IDLE;
+        write_range_bin <= 0;
+        write_chirp_index <= 0;
+        frame_buffer_full <= 0;
+        doppler_valid <= 0;
+        fft_start <= 0;
+        fft_input_valid <= 0;
+        fft_input_last <= 0;
+        fft_sample_counter <= 0;
+        processing_timeout <= 0;
+        status <= 0;
+        chirps_received <= 0;
+        chirp_state <= 0;
+        doppler_output <= 0;
+        doppler_bin <= 0;
+        range_bin <= 0;
+        sub_frame <= 0;
+        current_sub_frame <= 0;
+    end else begin
+        doppler_valid <= 0;
+        fft_input_valid <= 0;
+        fft_input_last <= 0;
+        
+        if (processing_timeout > 0) begin
+            processing_timeout <= processing_timeout - 1;
+        end
+        
+        case (state)
+            S_IDLE: begin
+                if (frame_start_pulse) begin
+                    write_chirp_index <= 0;
+                    write_range_bin <= 0;
+                    frame_buffer_full <= 0;
+                    chirps_received <= 0;
+                end
+                
+                if (data_valid && !frame_buffer_full) begin
+                    state <= S_ACCUMULATE;
+                    write_range_bin <= 1;
+                end
+            end
+            
+            S_ACCUMULATE: begin
+                if (data_valid) begin
+                    if (write_range_bin < RANGE_BINS - 1) begin
+                        write_range_bin <= write_range_bin + 1;
+                    end else begin
+                        write_range_bin <= 0;
+                        write_chirp_index <= write_chirp_index + 1;
+                        chirps_received <= chirps_received + 1;
+                        
+                        if (write_chirp_index >= CHIRPS_PER_FRAME - 1) begin
+                            frame_buffer_full <= 1;
+                            chirp_state <= 0;
+                            state <= S_PRE_READ;
+                            fft_sample_counter <= 0;
+                            write_chirp_index <= 0;
+                            write_range_bin <= 0;
+                            // Start with sub-frame 0 (long PRI chirps 0..15)
+                            current_sub_frame <= 0;
+                        end
+                    end
+                end 
+            end
+            
+            S_PRE_READ: begin
+                // Prime BRAM pipeline for current sub-frame
+                // read_doppler_index already set in Block 2 to sub-frame base
+                fft_start <= 1;
+                state <= S_LOAD_FFT;
+            end
+
+            S_LOAD_FFT: begin
+                fft_start <= 0;
+                
+                // Pipeline: 2 priming cycles + CHIRPS_PER_SUBFRAME data cycles
+                if (fft_sample_counter <= 1) begin
+                    fft_sample_counter <= fft_sample_counter + 1;
+                end else if (fft_sample_counter <= CHIRPS_PER_SUBFRAME + 1) begin
+                    fft_input_valid <= 1;
+
+                    if (fft_sample_counter == CHIRPS_PER_SUBFRAME + 1) begin
+                        fft_input_last <= 1;
+                        state <= S_FFT_WAIT;
+                        fft_sample_counter <= 0;
+                        processing_timeout <= 1000;
+                    end else begin
+                        fft_sample_counter <= fft_sample_counter + 1;
+                    end
+                end
+            end
+            
+            S_FFT_WAIT: begin
+                if (fft_output_valid) begin
+                    doppler_output <= {fft_output_q[15:0], fft_output_i[15:0]};
+                    // Pack: {sub_frame, bin[3:0]}
+                    doppler_bin <= {current_sub_frame, fft_sample_counter[3:0]};
+                    range_bin <= read_range_bin;
+                    sub_frame <= current_sub_frame;
+                    doppler_valid <= 1;
+                    
+                    fft_sample_counter <= fft_sample_counter + 1;
+                    
+                    if (fft_output_last) begin
+                        state <= S_OUTPUT;
+                        fft_sample_counter <= 0;
+                    end
+                end
+                
+                if (processing_timeout == 0) begin
+                    state <= S_OUTPUT;
+                end
+            end
+            
+            S_OUTPUT: begin
+                if (current_sub_frame == 0) begin
+                    // Just finished long PRI sub-frame — now do short PRI
+                    current_sub_frame <= 1;
+                    fft_sample_counter <= 0;
+                    state <= S_PRE_READ;
+                    // read_range_bin stays the same, read_doppler_index
+                    // will be set to CHIRPS_PER_SUBFRAME in Block 2
+                end else begin
+                    // Finished both sub-frames for this range bin
+                    current_sub_frame <= 0;
+                    if (read_range_bin < RANGE_BINS - 1) begin
+                        fft_sample_counter <= 0;
+                        state <= S_PRE_READ;
+                        // read_range_bin incremented in Block 2
+                    end else begin
+                        state <= S_IDLE;
+                        frame_buffer_full <= 0;
+                    end
+                end
+            end
+            
+        endcase
+        
+        status <= {state, frame_buffer_full};
+    end
+end
+
+// ----------------------------------------------------------
+// Block 2: BRAM address/data & DSP datapath — synchronous reset
+// ----------------------------------------------------------
+always @(posedge clk) begin  // Block 2: memory write port, read pointers and window-multiply pipeline
+    if (!reset_n) begin  // active-low reset, sampled on the clock edge
+        mem_we      <= 0;
+        mem_waddr_r <= 0;
+        mem_wdata_i <= 0;
+        mem_wdata_q <= 0;
+        mult_i      <= 0;
+        mult_q      <= 0;
+        mult_i_raw     <= 0;
+        mult_q_raw     <= 0;
+        window_val_reg <= 0;
+        fft_input_i <= 0;
+        fft_input_q <= 0;
+        read_range_bin     <= 0;
+        read_doppler_index <= 0;
+    end else begin
+        mem_we <= 0;  // default: write enable is a one-cycle pulse, re-asserted below only when a sample is stored
+        
+        case (state)  // state, fft_sample_counter and current_sub_frame are only read in this block
+            S_IDLE: begin
+                if (data_valid && !frame_buffer_full) begin  // accept a sample only while the buffer is not flagged full
+                    mem_we      <= 1;
+                    mem_waddr_r <= mem_write_addr;
+                    mem_wdata_i <= range_data[15:0];   // low half-word goes to the I write register
+                    mem_wdata_q <= range_data[31:16];  // high half-word goes to the Q write register
+                end
+            end
+            
+            S_ACCUMULATE: begin
+                if (data_valid) begin  // every valid sample is stored while accumulating
+                    mem_we      <= 1;
+                    mem_waddr_r <= mem_write_addr;
+                    mem_wdata_i <= range_data[15:0];
+                    mem_wdata_q <= range_data[31:16];
+
+                    if (write_range_bin >= RANGE_BINS - 1 &&
+                        write_chirp_index >= CHIRPS_PER_FRAME - 1) begin  // last range bin of the last chirp: rewind read pointers
+                        read_range_bin     <= 0;
+                        // Start reading from chirp 0 (long PRI sub-frame)
+                        read_doppler_index <= 0;
+                    end
+                end
+            end
+            
+            S_PRE_READ: begin
+                // Move read_doppler_index to (first chirp of current sub-frame) + 1.
+                // The first chirp's index is already in place from the previous state,
+                // so its data is expected one cycle later (address now, data next cycle).
+                if (current_sub_frame == 0)
+                    read_doppler_index <= 1;  // Long PRI: chirps 0..15
+                else
+                    read_doppler_index <= CHIRPS_PER_SUBFRAME + 1;  // Short PRI: chirps 16..31
+
+                // BREG priming: window coeff for sample 0
+                window_val_reg <= $signed(window_coeff[0]);
+            end
+
+            S_LOAD_FFT: begin
+                if (fft_sample_counter == 0) begin
+                    // Pipe stage 1: first product, using the coefficient registered in S_PRE_READ
+                    mult_i_raw <= $signed(mem_rdata_i) * window_val_reg;
+                    mult_q_raw <= $signed(mem_rdata_q) * window_val_reg;
+                    window_val_reg <= $signed(window_coeff[1]);  // coefficient for the second sample
+                    // Advance to chirp base+2
+                    if (current_sub_frame == 0)
+                        read_doppler_index <= (2 < CHIRPS_PER_SUBFRAME) ? 2
+                                              : CHIRPS_PER_SUBFRAME - 1;  // clamp when a sub-frame has fewer than 3 chirps
+                    else
+                        read_doppler_index <= (CHIRPS_PER_SUBFRAME + 2 < CHIRPS_PER_FRAME)
+                                              ? CHIRPS_PER_SUBFRAME + 2
+                                              : CHIRPS_PER_FRAME - 1;  // clamp to the last chirp of the frame
+                end else if (fft_sample_counter == 1) begin
+                    mult_i <= mult_i_raw;  // second pipe register: hold the first product
+                    mult_q <= mult_q_raw;
+                    mult_i_raw <= $signed(mem_rdata_i) * window_val_reg;
+                    mult_q_raw <= $signed(mem_rdata_q) * window_val_reg;
+                    if (2 < CHIRPS_PER_SUBFRAME)  // only load coefficient 2 when the sub-frame has a third sample
+                        window_val_reg <= $signed(window_coeff[2]);
+                    // Advance to chirp base+3
+                    begin : advance_chirp3
+                        reg [4:0] next_chirp;  // 5-bit scratch, assigned with blocking '=' and used only inside this named block
+                        next_chirp = (current_sub_frame == 0) ? 3 : CHIRPS_PER_SUBFRAME + 3;
+                        if (next_chirp < CHIRPS_PER_FRAME)
+                            read_doppler_index <= next_chirp;
+                        else
+                            read_doppler_index <= CHIRPS_PER_FRAME - 1;  // clamp to the last chirp of the frame
+                    end
+                end else if (fft_sample_counter <= CHIRPS_PER_SUBFRAME + 1) begin
+                    // Steady state: one windowed sample leaves the pipeline per cycle
+                    fft_input_i <= (mult_i + (1 << 14)) >>> 15;  // add half of 1<<15 before the 15-bit shift: rounds instead of truncating
+                    fft_input_q <= (mult_q + (1 << 14)) >>> 15;
+                    mult_i <= mult_i_raw;
+                    mult_q <= mult_q_raw;
+
+                    if (fft_sample_counter <= CHIRPS_PER_SUBFRAME - 1) begin  // new products only for in-range samples; the last two cycles just drain
+                        mult_i_raw <= $signed(mem_rdata_i) * window_val_reg;
+                        mult_q_raw <= $signed(mem_rdata_q) * window_val_reg;
+                        // Window coeff index within sub-frame
+                        begin : advance_window
+                            reg [4:0] win_idx;  // 5 bits so that counter[3:0] + 1 can reach 16 without wrapping
+                            win_idx = fft_sample_counter[3:0] + 1;  // coefficient for the next sample
+                            if (win_idx < CHIRPS_PER_SUBFRAME)  // nothing to load after the final coefficient
+                                window_val_reg <= $signed(window_coeff[win_idx]);
+                        end
+                        // Advance BRAM read
+                        begin : advance_bram
+                            reg [4:0] chirp_offset;
+                            reg [4:0] chirp_base;
+                            chirp_offset = fft_sample_counter[3:0] + 2;  // read index runs two ahead of the sample counter
+                            chirp_base = (current_sub_frame == 0) ? 0 : CHIRPS_PER_SUBFRAME;
+                            if (chirp_base + chirp_offset < CHIRPS_PER_FRAME)
+                                read_doppler_index <= chirp_base + chirp_offset;
+                            else
+                                read_doppler_index <= CHIRPS_PER_FRAME - 1;  // clamp so the index never exceeds CHIRPS_PER_FRAME-1
+                        end
+                    end
+
+                    if (fft_sample_counter == CHIRPS_PER_SUBFRAME + 1) begin  // final cycle of the load sequence
+                        // Park the read index at the first chirp of whichever sub-frame is processed next
+                        if (current_sub_frame == 0)
+                            read_doppler_index <= CHIRPS_PER_SUBFRAME;  // Ready for short sub-frame
+                        else
+                            read_doppler_index <= 0;
+                    end
+                end
+            end
+
+            S_OUTPUT: begin
+                if (current_sub_frame == 0) begin
+                    // Transitioning to short PRI sub-frame
+                    // Set read_doppler_index to start of short sub-frame
+                    read_doppler_index <= CHIRPS_PER_SUBFRAME;
+                end else begin
+                    // Both sub-frames done
+                    if (read_range_bin < RANGE_BINS - 1) begin  // on the last range bin the pointers are left untouched
+                        read_range_bin     <= read_range_bin + 1;
+                        read_doppler_index <= 0;  // Next range bin starts with long sub-frame
+                    end
+                end
+            end
+
+            default: begin
+                // S_FFT_WAIT: no BRAM-write or address operations needed
+            end
+        endcase
+    end
 end

 // ==============================================
-// FFT Module
+// FFT Module — 16-point
 // ==============================================
-xfft_32 fft_inst (
+xfft_16 fft_inst (
    .aclk(clk),
    .aresetn(reset_n),
    .s_axis_config_tdata(8'h01),
@@ -517,5 +533,4 @@ xfft_32 fft_inst (
 assign processing_active = (state != S_IDLE);  // high in every state other than S_IDLE
 assign frame_complete = (state == S_IDLE && frame_buffer_full == 0);  // level, not a pulse: true whenever idle with the buffer flag clear

-
-endmodule
+endmodule
@@ -0,0 +1,8 @@
+// Quarter-wave cosine ROM for 16-point FFT
+// 4 entries (N/4), 16-bit signed Q15 format
+// cos(2*pi*k/16) for k = 0..3
+// Used by fft_engine with N=16, LOG2N=4
+7FFF
+7641
+5A82
+30FB
@@ -8,8 +8,8 @@
 // Single-clock design: clk is an input wire, async2sync handles async reset.
 // Each formal step = one clock edge.
 //
-// Parameters reduced: RANGE_BINS=4, CHIRPS_PER_FRAME=4, DOPPLER_FFT_SIZE=4.
-// Includes full xfft_32 and fft_engine sub-modules.
+// Parameters reduced: RANGE_BINS=4, CHIRPS_PER_FRAME=4, CHIRPS_PER_SUBFRAME=2, DOPPLER_FFT_SIZE=2.
+// Includes full xfft_16 and fft_engine sub-modules.
 //
 // Focus: memory address bounds (highest-value finding) and state encoding.
 // ============================================================================
@@ -20,7 +20,8 @@ module fv_doppler_processor (
    // Reduced parameters for tractable BMC
    localparam RANGE_BINS       = 4;
    localparam CHIRPS_PER_FRAME = 4;
-    localparam DOPPLER_FFT_SIZE = 4;
+    localparam CHIRPS_PER_SUBFRAME = 2;  // Dual sub-frame: 2 chirps per sub-frame
+    localparam DOPPLER_FFT_SIZE = 2;     // FFT size matches sub-frame size
    localparam MEM_DEPTH        = RANGE_BINS * CHIRPS_PER_FRAME;  // 16

    // State encoding (mirrors DUT localparams)
@@ -62,6 +63,7 @@ module fv_doppler_processor (
    wire        doppler_valid;
    wire [4:0]  doppler_bin;
    wire [5:0]  range_bin;
+    wire        sub_frame;
    wire        processing_active;
    wire        frame_complete;
    wire [3:0]  status;
@@ -86,6 +88,7 @@ module fv_doppler_processor (
        .DOPPLER_FFT_SIZE (DOPPLER_FFT_SIZE),
        .RANGE_BINS       (RANGE_BINS),
        .CHIRPS_PER_FRAME (CHIRPS_PER_FRAME),
+        .CHIRPS_PER_SUBFRAME (CHIRPS_PER_SUBFRAME),
        .WINDOW_TYPE      (1),   // Rectangular — simpler for formal
        .DATA_WIDTH       (16)
    ) dut (
@@ -98,6 +101,7 @@ module fv_doppler_processor (
        .doppler_valid    (doppler_valid),
        .doppler_bin      (doppler_bin),
        .range_bin        (range_bin),
+        .sub_frame        (sub_frame),
        .processing_active(processing_active),
        .frame_complete   (frame_complete),
        .status           (status),
@@ -36,6 +36,7 @@ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 DOPPLER_FFT = 32
 RANGE_BINS = 64
 TOTAL_OUTPUTS = RANGE_BINS * DOPPLER_FFT  # 2048
+SUBFRAME_SIZE = 16

 SCENARIOS = {
    'stationary': {
@@ -125,6 +126,19 @@ def find_peak_bin(i_arr, q_arr):
    return max(range(len(mags)), key=lambda k: mags[k])


+def peak_bins_match(py_peak, rtl_peak):
+    """Return True if both peaks are in the same sub-frame and their bins differ by at most 1, circularly."""
+    py_sf = py_peak // SUBFRAME_SIZE  # sub-frame index of the Python-model peak
+    rtl_sf = rtl_peak // SUBFRAME_SIZE  # sub-frame index of the RTL peak
+    if py_sf != rtl_sf:
+        return False  # peaks in different sub-frames never match, even if numerically adjacent
+
+    py_bin = py_peak % SUBFRAME_SIZE  # bin position inside its sub-frame
+    rtl_bin = rtl_peak % SUBFRAME_SIZE
+    diff = abs(py_bin - rtl_bin)
+    return diff <= 1 or diff >= SUBFRAME_SIZE - 1  # second term accepts wrap-around between bin 0 and the last bin
+
+
 def total_energy(data_dict):
    """Sum of I^2 + Q^2 across all range bins and Doppler bins."""
    total = 0
@@ -207,8 +221,8 @@ def compare_scenario(name, config, base_dir):
        py_peak = find_peak_bin(py_i, py_q)
        rtl_peak = find_peak_bin(rtl_i, rtl_q)

-        # Peak agreement (allow +/- 1 bin tolerance)
-        if abs(py_peak - rtl_peak) <= 1 or abs(py_peak - rtl_peak) >= DOPPLER_FFT - 1:
+        # Peak agreement (allow +/-1 bin tolerance, but only within a sub-frame)
+        if peak_bins_match(py_peak, rtl_peak):
            peak_agreements += 1

        py_mag = magnitude_l1(py_i, py_q)
@@ -242,7 +256,7 @@ def compare_scenario(name, config, base_dir):
    avg_corr_q = sum(q_correlations) / len(q_correlations)

    print(f"\n  Per-range-bin metrics:")
-    print(f"    Peak Doppler bin agreement (+/-1): {peak_agreements}/{RANGE_BINS} "
+    print(f"    Peak Doppler bin agreement (+/-1 within sub-frame): {peak_agreements}/{RANGE_BINS} "
          f"({peak_agreement_frac:.0%})")
    print(f"    Avg magnitude correlation: {avg_mag_corr:.4f}")
    print(f"    Avg I-channel correlation: {avg_corr_i:.4f}")
@@ -1106,8 +1106,8 @@ FFFF0000
 00000000
 00000000
 00000000
-FFFF0001
-FFFF0000
+00000001
+00000000
 FFFF0005
 00000001
 00000001
@@ -1172,7 +1172,7 @@ FFFF0000
 00010000
 00010000
 00010000
-00060003
+00060002
 00010001
 00000001
 00000000
@@ -1236,7 +1236,7 @@ FFFF0000
 00000000
 0001FFFF
 0002FFFF
-0006FFFD
+0005FFFC
 00010000
 0001FFFF
 00000001
@@ -1300,7 +1300,7 @@ FFFF0000
 00000000
 00000000
 FFFFFFFF
-FFFFFFFA
+FFFEFFFA
 0000FFFF
 0000FFFF
 00010001
@@ -1364,9 +1364,9 @@ FFFF0000
 00000000
 00000000
 FFFF0000
-FFFAFFFD
+FFFAFFFF
 FFFFFFFF
-00000000
+00000001
 00000001
 FFFF0000
 00000000
@@ -1427,74 +1427,74 @@ FFFF0000
 FFFF0000
 00000000
 FFFF0000
-00000001
-FFFB0005
-FFFE0001
-00000000
-00010000
-00000000
-00000000
-00000001
-00000000
-0000FFFF
-00010001
-00000000
-00000000
-00000000
-00000000
-00000000
-00000001
-00000001
-00000000
-00010001
-00000000
-00000000
-00000000
-00000000
-00000000
-00000000
-00000000
-FFFFFFFF
-FFFFFFFF
-0000FFFF
-00000000
-00000000
-00000001
-00000000
-00000000
-FFFF0000
-FFFF0000
-00000001
-00010000
-00000000
-FFFF0000
-00010000
-00000001
-FFFF0000
-FFFF0000
-00010001
-FFFF0000
-FFFFFFFF
-00000000
-00010000
-FFFF0000
-00000000
-00000000
-00000000
-00000000
-00000000
-00000000
-00000000
-00010001
-00000000
-00000000
-FFFF0000
-00000000
-00010001
-00000001
-00010006
 00000002
+FFFD0006
+FFFE0001
 00000001
+00010000
+00000000
+00000000
+00000001
+00000000
+0000FFFF
+00010001
+00000000
+00000000
+00000000
+00000000
+00000000
+00000001
+00000001
+00000000
+00010001
+00000000
+00000000
+00000000
+00000000
+00000000
+00000000
+00000000
+FFFFFFFF
+FFFFFFFF
+0000FFFF
+00000000
+00000000
+00000001
+00000000
+00000000
+FFFF0000
+FFFF0000
+00000001
+00010000
+00000000
+FFFF0000
+00010000
+00000001
+FFFF0000
+FFFF0000
+00010001
+FFFF0000
+FFFFFFFF
+00000000
+00010000
+FFFF0000
+00000000
+00000000
+00000000
+00000000
+00000000
+00000000
+00000000
+00010001
+00000000
+00000000
+FFFF0000
+00000000
+00010000
+00010001
+00030005
+00010001
+00010001
 00000000
 00000000
 FFFF0000
@@ -1556,8 +1556,8 @@ FFFFFFFF
 00000000
 00010000
 00020000
-00060001
-00010000
+0006FFFE
+0001FFFF
 00010000
 FFFF0000
 00000001
@@ -1619,9 +1619,9 @@ FFFFFFFE
 00000001
 0000FFFF
 00010000
-0001FFFF
-0004FFFB
-0002FFFF
+0001FFFE
+0001FFFA
+0002FFFE
 00010000
 FFFF0000
 FFFF0000
@@ -1682,9 +1682,9 @@ FFFF0000
 00000000
 00000001
 00000001
-00000000
 FFFF0000
-FFFEFFFA
+FFFF0000
+FFFBFFFC
 FFFFFFFF
 FFFF0000
 0000FFFF
@@ -1747,9 +1747,9 @@ FFFFFFFF
 00000000
 0000FFFF
 FFFF0001
-FFFF0000
-FFFA0000
-FFFE0000
+FFFF0001
+FFFA0003
+FFFF0001
 FFFF0000
 00000000
 00000001
@@ -1811,74 +1811,74 @@ FFFF0001
 00010000
 0000FFFF
 00000000
-FFFF0002
-FFFD0005
-FFFF0001
-00000001
-0000FFFF
-FFFF0001
-00000000
-00000000
-00000000
-FFFFFFFF
-00010001
-FFFFFFFF
-00000001
-00000000
-00000000
-00000000
-00010000
-00000000
-00000000
-FFFF0000
-00000000
-00000000
-00010000
-00000000
-00000000
-00000000
-00000000
-00000000
-0000FFFF
-00000000
-0000FFFF
-00000000
-00000000
-00000001
-00000001
-00000000
-00000000
-00000000
-00000000
-00000001
-FFFF0000
-00010000
-FFFF0000
-FFFF0000
-00000000
-00000000
-00000000
-00000001
-00000000
-FFFF0000
-00000001
-FFFF0000
-00000000
-0000FFFF
-FFFF0000
-0000FFFF
-00010000
-FFFF0000
-0001FFFF
-0000FFFF
-0001FFFF
-00000000
-0000FFFF
-00000001
-00010002
-00030005
 00000002
+00000006
+FFFF0002
 00010001
+0000FFFF
+FFFF0001
+00000000
+00000000
+00000000
+FFFFFFFF
+00010001
+FFFFFFFF
+00000001
+00000000
+00000000
+00000000
+00010000
+00000000
+00000000
+FFFF0000
+00000000
+00000000
+00010000
+00000000
+00000000
+00000000
+00000000
+00000000
+0000FFFF
+00000000
+0000FFFF
+00000000
+00000000
+00000001
+00000001
+00000000
+00000000
+00000000
+00000000
+00000001
+FFFF0000
+00010000
+FFFF0000
+FFFF0000
+00000000
+00000000
+00000000
+00000001
+00000000
+FFFF0000
+00000001
+FFFF0000
+00000000
+0000FFFF
+FFFF0000
+0000FFFF
+00010000
+FFFF0000
+0001FFFF
+0000FFFF
+0001FFFF
+00000000
+0000FFFF
+00010000
+00020001
+00060002
+00000001
+00010000
 0001FFFF
 00000000
 00000000
@@ -1939,9 +1939,9 @@ FFFF0000
 00000000
 0000FFFF
 0001FFFF
-0001FFFF
-00070000
-00000000
+0000FFFE
+0005FFFC
+0000FFFF
 00010001
 FFFF0000
 0000FFFF
@@ -2003,9 +2003,9 @@ FFFF0000
 00000001
 00000000
 0000FFFF
-0001FFFF
-0002FFF9
 0000FFFF
+FFFDFFF9
+FFFFFFFF
 FFFFFFFF
 00000000
 00000000
@@ -1099,7 +1099,7 @@ FFFF0000
 00000000
 00000002
 FFFF0003
-FFFE0012
+FFFF0012
 00000003
 FFFF0002
 00010001
@@ -1163,7 +1163,7 @@ FFFF0000
 00010001
 00010002
 00020003
-000C000D
+000D000C
 00030003
 00000001
 00000001
@@ -1226,9 +1226,9 @@ FFFF0000
 00000000
 FFFF0000
 00020000
-00030000
-00110004
-00030000
+0003FFFF
+00120002
+0003FFFF
 00020000
 00000000
 FFFF0000
@@ -1291,8 +1291,8 @@ FFFF0000
 00010000
 0002FFFF
 0003FFFE
-000FFFF6
-0004FFFF
+000EFFF4
+0003FFFE
 0002FFFF
 00000000
 FFFF0000
@@ -1312,8 +1312,8 @@ FFFF0000
 00010000
 00000001
 0000FFFF
-00000000
 00010000
+00010001
 FFFF0000
 00000001
 0000FFFF
@@ -1353,10 +1353,10 @@ FFFF0000
 00010001
 0001FFFF
 00010000
-0001FFFE
-0001FFFD
-0006FFF0
-0001FFFD
+0000FFFE
+0000FFFD
+0003FFEF
+0000FFFD
 0000FFFE
 00000000
 00010000
@@ -1376,7 +1376,7 @@ FFFF0000
 0000FFFF
 00010000
 00000001
-00010001
+00010002
 00000000
 00000001
 00000000
@@ -1418,10 +1418,10 @@ FFFF0000
 0000FFFF
 FFFF0000
 FFFFFFFE
-FFFEFFFD
-FFF9FFF1
-FFFEFFFD
-FFFFFFFF
+FFFDFFFD
+FFF5FFF2
+FFFEFFFE
+FFFE0000
 FFFF0000
 00000001
 FFFF0000
@@ -1439,8 +1439,8 @@ FFFF0000
 0000FFFF
 00010001
 FFFF0000
-FFFF0001
-FFFF0001
+FFFF0000
+FFFF0000
 00000000
 00000000
 00000001
@@ -1482,10 +1482,10 @@ FFFF0000
 00000000
 00000000
 FFFF0000
-FFFCFFFF
-FFEFFFF9
-FFFCFFFF
-FFFF0000
+FFFC0000
+FFEEFFFE
+FFFC0000
+FFFF0001
 00000000
 00000000
 FFFF0000
@@ -1504,7 +1504,7 @@ FFFF0000
 00000000
 00000000
 00000000
-FFFFFFFF
+0000FFFF
 FFFF0001
 00000000
 00010000
@@ -1546,10 +1546,10 @@ FFFFFFFF
 00000000
 FFFFFFFF
 FFFE0001
-FFFD0001
-FFEF0006
-FFFD0001
-FFFF0000
+FFFD0002
+FFF1000B
+FFFD0002
+FFFF0001
 00000000
 FFFFFFFF
 00010000
@@ -1609,77 +1609,77 @@ FFFF0001
 00000000
 00000001
 00000000
-FFFF0002
-FFFE0003
-FFF7000E
-FFFF0005
-FFFF0001
-0001FFFF
-00000000
-00000001
-0000FFFF
-00000000
-00000000
-FFFF0000
-00010000
-00010000
-FFFF0000
-FFFF0000
-0000FFFF
-00000000
-00000000
-00010000
-00000000
-00000000
-00010000
-00020001
-00000000
-00000000
-00000000
-FFFF0000
-00000000
-00000000
-00010000
-00000001
-00000001
-00000000
-00000000
-00000000
-00000000
-00000000
-00000000
-00000000
-00000001
-0000FFFF
-00000000
-0000FFFF
-00010000
-FFFF0000
-0001FFFF
-00010001
-00000000
-FFFF0001
-00010000
-0000FFFF
-00000001
-FFFF0000
-00000000
-0000FFFF
-FFFF0000
-00000001
-00000000
-FFFF0000
-FFFF0000
-00000000
-0000FFFF
-00000001
 00000002
-00000003
-00050012
-00010003
+FFFF0004
+FFFC0010
+00000005
+00000001
+0001FFFF
+00000000
+00000001
+0000FFFF
+00000000
+00000000
+FFFF0000
+00010000
+00010000
+FFFF0000
+FFFF0000
+0000FFFF
+00000000
+00000000
+00010000
+00000000
+00000000
+00010000
 00010002
 00000000
 00000000
+00000000
+FFFF0000
+00000000
+00000000
+00010000
+00000001
+00000001
+00000000
+00000000
+00000000
+00000000
+00000000
+00000000
+00000000
+00000001
+0000FFFF
+00000000
+0000FFFF
+00010000
+FFFF0000
+0001FFFF
+00010001
+00000000
+FFFF0001
+00010000
+0000FFFF
+00000001
+FFFF0000
+00000000
+0000FFFF
+FFFF0000
+00000001
+00000000
+FFFF0000
+FFFF0000
+00000000
+0000FFFF
+00000001
+00000002
+00010003
+000B000F
+00020003
+00020002
+00000000
+00000000
 00000001
 00000001
 00000001
@@ -1696,9 +1696,9 @@ FFFFFFFF
 00000000
 0000FFFF
 00000000
-00000002
-00010000
-00000000
+FFFF0001
+0000FFFF
+FFFF0000
 00000000
 00000000
 00000000
@@ -1737,160 +1737,160 @@ FFFFFFFF
 00000000
 00000000
 00000001
+00020001
+00030000
+00110004
+00040000
+00020000
+00000000
+00000000
+00000000
+0000FFFF
+00000001
+00000000
+00000001
+00000000
+00000000
+00000000
+00000001
+FFFFFFFF
+0000FFFF
+FFFF0000
+00000000
+FFFF0000
+00000001
+00000000
+0000FFFF
+FFFFFFFF
+00000000
+00000000
+FFFF0000
+FFFF0000
+0000FFFF
+00010000
+00000001
+00010000
+00010001
+00000000
+0000FFFF
+00000001
+00000000
+FFFF0001
+00010001
+00000000
+00000000
+00000000
+00000000
+FFFFFFFF
+FFFF0000
+00000000
+00010001
+00010000
+FFFFFFFF
+00000000
+00000001
+00000000
+00000000
+00000000
+00000000
+00000000
+00010000
+00000000
+FFFF0000
+0000FFFF
+0000FFFF
+00000000
+00000000
+0001FFFF
+0004FFFE
+000FFFF7
+0004FFFE
+00010000
+FFFF0001
+0000FFFF
+00010000
+0000FFFF
+00000000
+FFFF0001
+00000000
+FFFF0000
+00010000
+0000FFFF
+FFFF0001
+00000000
+00000000
+00000000
+FFFFFFFF
+00010001
+FFFFFFFF
+00000000
+00010000
+00000000
+00000000
+00010000
+00000000
+00000000
+FFFF0000
+00000000
+00000000
+00010000
+00000000
+00000000
+00000000
+00000000
+00000000
+0000FFFF
+00000000
+0000FFFF
+00000000
+00000000
+00000001
+00000001
+00000000
+00000000
+00000000
+00000000
+00000001
+FFFF0000
+00010000
+FFFF0000
+FFFF0000
+00000000
+00000000
+00000000
+00000001
+00000000
+FFFF0000
+00000001
+FFFF0000
+00000000
+0000FFFF
+0000FFFE
+0001FFFB
+0005FFEF
+0000FFFC
+0001FFFE
+0000FFFF
+0001FFFF
+00000000
+0000FFFF
+00000000
+00010001
+00000000
+FFFF0001
+00000000
+0001FFFF
+00000000
+00000000
+00010000
+FFFF0000
+00000000
+0001FFFF
+00000000
+00000001
 00020002
-00030001
-000E000A
-00040001
-00020001
-00000000
-00000000
 00000000
 0000FFFF
-00000001
-00000000
-00000001
-00000000
-00000000
-00000000
-00000001
-FFFFFFFF
-0000FFFF
-FFFF0000
-00000000
-FFFF0000
-00000001
-00000000
-FFFFFFFF
-FFFFFFFF
-00000000
-00000000
-FFFF0000
-FFFF0000
-0000FFFF
-00010000
-00000001
-00010000
-00010001
-00000000
-0000FFFF
-00000001
-00000000
-FFFF0001
-00010001
-00000000
-00000000
-00000000
-00000000
-FFFFFFFF
-FFFF0000
-00000000
-00010001
-00010000
-FFFFFFFF
-00000000
-00000001
-00000000
-00000000
-00000000
-00000000
-00000000
-00010000
-00000000
-FFFF0000
-0000FFFF
-0000FFFF
-00000000
-00000000
-00020000
-00050000
-0012FFFE
-00040000
-00020000
-FFFF0001
-0000FFFF
-00010000
-0000FFFF
-00000000
-FFFF0001
-00000000
-FFFF0000
-00010000
-0000FFFF
-FFFF0001
-00000000
-00000000
-00000000
-FFFFFFFF
-00010001
-FFFFFFFF
-00000000
-0000FFFF
-00000000
-00000000
-00010000
-00000000
-00000000
-FFFF0000
-00000000
-00000000
-00010000
-00000000
-00000000
-00000000
-00000000
-00000000
-0000FFFF
-00000000
-0000FFFF
-00000000
-00000000
-00000001
-00000001
-00000000
-00000000
-00000000
-00000000
-00000001
-FFFF0000
-00010000
-FFFF0000
-FFFF0000
-00000000
-00000000
-00000000
-00000001
-00000000
-FFFF0000
-00000001
-FFFF0000
-00000000
-0000FFFF
-0000FFFE
-0003FFFC
-000CFFF3
-0001FFFD
-0002FFFE
-0000FFFF
-0001FFFF
-00000000
-0000FFFF
-00000000
-00010001
-00000000
-FFFF0001
-00000000
-0001FFFF
-00000000
-00000000
-00010000
-FFFF0000
-00000000
-0001FFFF
-00010000
-00000000
-00030001
-00000000
-0001FFFF
 00000000
 00000000
 0000FFFF
@@ -1929,78 +1929,78 @@ FFFF0000
 FFFF0000
 00000000
 00000000
-0000FFFE
-FFFFFFFD
-FFFFFFEE
-FFFFFFFC
 FFFFFFFE
-00000000
-FFFF0000
-00000000
-0000FFFF
-0000FFFF
-FFFFFFFF
-00000000
-FFFF0000
-00000001
-FFFF0000
-0000FFFF
-00000000
-00000000
-00000000
-00010000
-FFFF0000
-00000000
-00000000
-00010001
-00000000
-00000000
-0000FFFF
-00000000
-00000000
-00000000
-00000000
-00000001
-0000FFFF
-00000000
-00000000
-00000000
-00000000
-00010000
-00000000
-00000001
-00000000
-FFFF0000
-00000000
-00000001
-00010000
-00000000
-00000001
-00010000
-00000000
-FFFF0000
-00000001
-00000000
-00000000
-00000000
-00000000
-00000000
-00000001
-00010000
-00000000
-00000000
-0001FFFF
-0000FFFF
-00010000
-FFFF0000
-FFFFFFFF
-FFFEFFFE
-FFF3FFF3
 FFFEFFFD
+FFF7FFF1
+FFFEFFFD
+FFFEFFFE
+00000000
+FFFF0000
+00000000
+0000FFFF
+0000FFFF
 FFFFFFFF
 00000000
 FFFF0000
 00000001
+FFFF0000
+0000FFFF
+00000000
+00000000
+00000000
+00010000
+FFFF0000
+00000000
+00000000
+FFFF0000
+00000000
+00000000
+0000FFFF
+00000000
+00000000
+00000000
+00000000
+00000001
+0000FFFF
+00000000
+00000000
+00000000
+00000000
+00010000
+00000000
+00000001
+00000000
+FFFF0000
+00000000
+00000001
+00010000
+00000000
+00000001
+00010000
+00000000
+FFFF0000
+00000001
+00000000
+00000000
+00000000
+00000000
+00000000
+00000001
+00010000
+00000000
+00000000
+0001FFFF
+0000FFFF
+00010000
+FFFF0000
+FFFF0000
+FFFEFFFF
+FFEEFFFB
+FFFDFFFE
+FFFEFFFF
+00000000
+FFFF0000
+00000001
 00000000
 00000000
 00000001
@@ -2016,7 +2016,7 @@ FFFF0001
 00010000
 00000000
 0001FFFF
-FFFE0000
+FFFFFFFF
 00000001
 00000000
 00010000
@@ -1075,44 +1075,43 @@ class RangeBinDecimator:


 # =============================================================================
-# Doppler Processor (Hamming window + 32-point FFT)
+# Doppler Processor (Hamming window + dual 16-point FFT)
 # =============================================================================

-# Hamming window LUT (32 entries, 16-bit unsigned Q15)
+# Hamming window LUT (16 entries, 16-bit unsigned Q15)
+# Matches doppler_processor.v window_coeff[0:15]
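+# Nominal shape: w[n] = 0.54 - 0.46 * cos(2*pi*n/15), n=0..15, symmetric; interior LUT entries only approximate it (n=1 evaluates to ~3925, table holds 0x0E5C = 3676), so do not regenerate the table from the formula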
 HAMMING_WINDOW = [
-    0x0800, 0x0862, 0x09CB, 0x0C3B, 0x0FB2, 0x142F, 0x19B2, 0x2039,
-    0x27C4, 0x3050, 0x39DB, 0x4462, 0x4FE3, 0x5C5A, 0x69C4, 0x781D,
-    0x7FFF, 0x781D, 0x69C4, 0x5C5A, 0x4FE3, 0x4462, 0x39DB, 0x3050,
-    0x27C4, 0x2039, 0x19B2, 0x142F, 0x0FB2, 0x0C3B, 0x09CB, 0x0862,
+    0x0A3D, 0x0E5C, 0x1B6D, 0x3088, 0x4B33, 0x6573, 0x7642, 0x7F62,
+    0x7F62, 0x7642, 0x6573, 0x4B33, 0x3088, 0x1B6D, 0x0E5C, 0x0A3D,
 ]


 class DopplerProcessor:
    """
-    Bit-accurate model of doppler_processor_optimized.v
+    Bit-accurate model of doppler_processor_optimized.v (dual 16-pt FFT architecture).

-    For each range bin (0-63):
-      1. Read 32 chirps of data from accumulation buffer
-      2. Apply Hamming window (Q15 multiply, round, >>>15)
-      3. 32-point FFT
+    The staggered-PRF frame has 32 chirps total:
+      - Sub-frame 0 (long PRI):  chirps 0-15  -> 16-pt Hamming -> 16-pt FFT -> bins 0-15
+      - Sub-frame 1 (short PRI): chirps 16-31 -> 16-pt Hamming -> 16-pt FFT -> bins 16-31

-    The 32-point FFT uses xfft_32.v (Xilinx IP wrapper around fft_engine).
-    For the Python model, we use FFTEngine with N=32.
+    Output: doppler_bin[4:0] = {sub_frame_id, bin_in_subframe[3:0]}
+    Total output per range bin: 32 bins (16 + 16), same interface as before.
    """

-    DOPPLER_FFT_SIZE = 32
+    DOPPLER_FFT_SIZE = 16     # Per sub-frame
    RANGE_BINS = 64
    CHIRPS_PER_FRAME = 32
+    CHIRPS_PER_SUBFRAME = 16

-    def __init__(self, twiddle_file_32=None):
+    def __init__(self, twiddle_file_16=None):
        """
-        For 32-point FFT, we need the 32-point twiddle file.
+        For 16-point FFT, we need the 16-point twiddle file.
        If not provided, we generate twiddle factors mathematically
-        (since the 32-pt twiddle ROM is cos(2*pi*k/32) for k=0..7).
+        (cos(2*pi*k/16) for k=0..3, quarter-wave ROM with 4 entries).
        """
-        self.fft32 = None
-        self._twiddle_file_32 = twiddle_file_32
-        # We'll use a simple 32-pt FFT with computed twiddles
+        self.fft16 = None
+        self._twiddle_file_16 = twiddle_file_16

    @staticmethod
    def window_multiply(data_16, window_16):
@@ -1134,7 +1133,7 @@ class DopplerProcessor:

    def process_frame(self, chirp_data_i, chirp_data_q):
        """
-        Process one complete Doppler frame.
+        Process one complete Doppler frame using dual 16-pt FFTs.

        Args:
            chirp_data_i: 2D array [32 chirps][64 range bins] of signed 16-bit I
@@ -1143,46 +1142,63 @@ class DopplerProcessor:
        Returns:
            (doppler_map_i, doppler_map_q): 2D arrays [64 range bins][32 doppler bins]
                                            of signed 16-bit
+                                            Bins 0-15 = sub-frame 0 (long PRI)
+                                            Bins 16-31 = sub-frame 1 (short PRI)
        """
        doppler_map_i = []
        doppler_map_q = []

-        # Generate 32-pt twiddle factors (quarter-wave cos, 8 entries)
-        # cos(2*pi*k/32) for k=0..7
+        # Generate 16-pt twiddle factors (quarter-wave cos, 4 entries)
+        # cos(2*pi*k/16) for k=0..3
+        # Matches fft_twiddle_16.mem: 7FFF, 7641, 5A82, 30FB
        import math
-        cos_rom_32 = []
-        for k in range(8):
-            val = round(32767.0 * math.cos(2.0 * math.pi * k / 32.0))
-            cos_rom_32.append(sign_extend(val & 0xFFFF, 16))
+        cos_rom_16 = []
+        for k in range(4):
+            val = round(32767.0 * math.cos(2.0 * math.pi * k / 16.0))
+            cos_rom_16.append(sign_extend(val & 0xFFFF, 16))

-        fft32 = FFTEngine.__new__(FFTEngine)
-        fft32.N = 32
-        fft32.LOG2N = 5
-        fft32.cos_rom = cos_rom_32
-        fft32.mem_re = [0] * 32
-        fft32.mem_im = [0] * 32
+        fft16 = FFTEngine.__new__(FFTEngine)
+        fft16.N = 16
+        fft16.LOG2N = 4
+        fft16.cos_rom = cos_rom_16
+        fft16.mem_re = [0] * 16
+        fft16.mem_im = [0] * 16

        for rbin in range(self.RANGE_BINS):
-            # Gather 32 chirps for this range bin
-            fft_in_re = []
-            fft_in_im = []
+            # Output bins for this range bin: 32 total (16 from each sub-frame)
+            out_re = [0] * 32
+            out_im = [0] * 32

-            for chirp in range(self.CHIRPS_PER_FRAME):
-                re_val = sign_extend(chirp_data_i[chirp][rbin] & 0xFFFF, 16)
-                im_val = sign_extend(chirp_data_q[chirp][rbin] & 0xFFFF, 16)
+            # Process each sub-frame independently
+            for sf in range(2):
+                chirp_start = sf * self.CHIRPS_PER_SUBFRAME
+                bin_offset = sf * self.DOPPLER_FFT_SIZE

-                # Apply Hamming window
-                win_re = self.window_multiply(re_val, HAMMING_WINDOW[chirp])
-                win_im = self.window_multiply(im_val, HAMMING_WINDOW[chirp])
+                fft_in_re = []
+                fft_in_im = []

-                fft_in_re.append(win_re)
-                fft_in_im.append(win_im)
+                for c in range(self.CHIRPS_PER_SUBFRAME):
+                    chirp = chirp_start + c
+                    re_val = sign_extend(chirp_data_i[chirp][rbin] & 0xFFFF, 16)
+                    im_val = sign_extend(chirp_data_q[chirp][rbin] & 0xFFFF, 16)

-            # 32-point forward FFT
-            fft_out_re, fft_out_im = fft32.compute(fft_in_re, fft_in_im, inverse=False)
+                    # Apply 16-pt Hamming window (index = c within sub-frame)
+                    win_re = self.window_multiply(re_val, HAMMING_WINDOW[c])
+                    win_im = self.window_multiply(im_val, HAMMING_WINDOW[c])

-            doppler_map_i.append(fft_out_re)
-            doppler_map_q.append(fft_out_im)
+                    fft_in_re.append(win_re)
+                    fft_in_im.append(win_im)
+
+                # 16-point forward FFT
+                fft_out_re, fft_out_im = fft16.compute(fft_in_re, fft_in_im, inverse=False)
+
+                # Pack into output: sub-frame 0 -> bins 0-15, sub-frame 1 -> bins 16-31
+                for b in range(self.DOPPLER_FFT_SIZE):
+                    out_re[bin_offset + b] = fft_out_re[b]
+                    out_im[bin_offset + b] = fft_out_im[b]
+
+            doppler_map_i.append(out_re)
+            doppler_map_q.append(out_im)

        return doppler_map_i, doppler_map_q

@@ -1207,7 +1223,7 @@ class SignalChain:
    IF_FREQ = 120_000_000    # IF frequency
    FTW_120MHZ = 0x4CCCCCCD  # Phase increment for 120 MHz at 400 MSPS

-    def __init__(self, twiddle_file_1024=None, twiddle_file_32=None):
+    def __init__(self, twiddle_file_1024=None, twiddle_file_16=None):
        self.nco = NCO()
        self.mixer = Mixer()
        self.cic_i = CICDecimator()
@@ -1217,7 +1233,7 @@ class SignalChain:
        self.ddc_interface = DDCInputInterface()
        self.matched_filter = MatchedFilterChain(fft_size=1024, twiddle_file=twiddle_file_1024)
        self.range_decimator = RangeBinDecimator()
-        self.doppler = DopplerProcessor(twiddle_file_32=twiddle_file_32)
+        self.doppler = DopplerProcessor(twiddle_file_16=twiddle_file_16)

    def ddc_step(self, adc_data_8bit, ftw=None):
        """
@@ -3,23 +3,17 @@
 Generate Doppler processor co-simulation golden reference data.

 Uses the bit-accurate Python model (fpga_model.py) to compute the expected
-Doppler FFT output. Also generates the input hex files consumed by the
-Verilog testbench (tb_doppler_cosim.v).
+Doppler FFT output for the dual 16-pt FFT architecture.  Also generates the
+input hex files consumed by the Verilog testbench (tb_doppler_cosim.v).

-Two output modes:
-  1. "clean" — straight Python model (correct windowing alignment)
-  2. "buggy" — replicates the RTL's windowing pipeline misalignment:
-     * Sample 0: fft_input = 0 (from reset mult value)
-     * Sample 1: fft_input = window_multiply(data[wrong_rbin_or_0], window[0])
-     * Sample k (k>=2): fft_input = window_multiply(data[k-2], window[k-1])
-
-Default mode is "clean".  The comparison script uses correlation-based
-metrics that are tolerant of the pipeline shift.
+Architecture:
+  Sub-frame 0 (long PRI):  chirps 0-15  -> 16-pt Hamming -> 16-pt FFT -> bins 0-15
+  Sub-frame 1 (short PRI): chirps 16-31 -> 16-pt Hamming -> 16-pt FFT -> bins 16-31

 Usage:
    cd ~/PLFM_RADAR/9_Firmware/9_2_FPGA/tb/cosim
-    python3 gen_doppler_golden.py            # clean model
-    python3 gen_doppler_golden.py --buggy    # replicate RTL pipeline bug
+    python3 gen_doppler_golden.py
+    python3 gen_doppler_golden.py stationary   # single scenario

 Author: Phase 0.5 Doppler co-simulation suite for PLFM_RADAR
 """
@@ -31,7 +25,7 @@ import sys
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

 from fpga_model import (
-    DopplerProcessor, FFTEngine, sign_extend, HAMMING_WINDOW
+    DopplerProcessor, sign_extend, HAMMING_WINDOW
 )
 from radar_scene import Target, generate_doppler_frame

@@ -40,7 +34,8 @@ from radar_scene import Target, generate_doppler_frame
 # Constants
 # =============================================================================

-DOPPLER_FFT_SIZE = 32
+DOPPLER_FFT_SIZE = 16     # Per sub-frame
+DOPPLER_TOTAL_BINS = 32   # Total output (2 sub-frames x 16)
 RANGE_BINS = 64
 CHIRPS_PER_FRAME = 32
 TOTAL_SAMPLES = CHIRPS_PER_FRAME * RANGE_BINS  # 2048
@@ -82,154 +77,6 @@ def write_hex_16bit(filepath, data):
 # Buggy-model helpers  (match RTL pipeline misalignment)
 # =============================================================================

-def window_multiply(data_16, window_16):
-    """Hamming window multiply matching RTL."""
-    d = sign_extend(data_16 & 0xFFFF, 16)
-    w = sign_extend(window_16 & 0xFFFF, 16)
-    product = d * w
-    rounded = product + (1 << 14)
-    result = rounded >> 15
-    return sign_extend(result & 0xFFFF, 16)
-
-
-def buggy_process_frame(chirp_data_i, chirp_data_q):
-    """
-    Replicate the RTL's exact windowing pipeline for all 64 range bins.
-
-    For each range bin we model the three-stage pipeline:
-      Stage A (BRAM registered read):
-        mem_rdata captures doppler_i_mem[mem_read_addr] one cycle AFTER
-        mem_read_addr is presented.
-      Stage B (multiply):
-        mult_i <= mem_rdata_i * window_coeff[read_doppler_index]
-        -- read_doppler_index is the CURRENT cycle's value, but mem_rdata_i
-        -- is from the PREVIOUS cycle's address.
-      Stage C (round+shift):
-        fft_input_i <= (mult_i + (1<<14)) >>> 15
-        -- uses the PREVIOUS cycle's mult_i.
-
-    Additionally, at the S_ACCUMULATE->S_LOAD_FFT transition (rbin=0) or
-    S_OUTPUT->S_LOAD_FFT transition (rbin>0), the BRAM address during the
-    transition cycle depends on the stale read_doppler_index and read_range_bin
-    values.
-
-    This function models every detail to produce bit-exact FFT inputs.
-    """
-    # Build the 32-pt FFT engine (matching fpga_model.py)
-    import math as _math
-    cos_rom_32 = []
-    for k in range(8):
-        val = round(32767.0 * _math.cos(2.0 * _math.pi * k / 32.0))
-        cos_rom_32.append(sign_extend(val & 0xFFFF, 16))
-
-    fft32 = FFTEngine.__new__(FFTEngine)
-    fft32.N = 32
-    fft32.LOG2N = 5
-    fft32.cos_rom = cos_rom_32
-    fft32.mem_re = [0] * 32
-    fft32.mem_im = [0] * 32
-
-    # Build flat BRAM contents: addr = chirp_index * 64 + range_bin
-    bram_i = [0] * TOTAL_SAMPLES
-    bram_q = [0] * TOTAL_SAMPLES
-    for chirp in range(CHIRPS_PER_FRAME):
-        for rb in range(RANGE_BINS):
-            addr = chirp * RANGE_BINS + rb
-            bram_i[addr] = sign_extend(chirp_data_i[chirp][rb] & 0xFFFF, 16)
-            bram_q[addr] = sign_extend(chirp_data_q[chirp][rb] & 0xFFFF, 16)
-
-    doppler_map_i = []
-    doppler_map_q = []
-
-    # State carried across range bins (simulates the RTL registers)
-    # After reset: read_doppler_index=0, read_range_bin=0, mult_i=0, mult_q=0,
-    # fft_input_i=0, fft_input_q=0
-    # The BRAM read is always active: mem_rdata <= doppler_i_mem[mem_read_addr]
-    # mem_read_addr = read_doppler_index * 64 + read_range_bin
-
-    # We need to track what read_doppler_index and read_range_bin are at each
-    # transition, since the BRAM captures data one cycle before S_LOAD_FFT runs.
-
-    # Before processing starts (just entered S_LOAD_FFT from S_ACCUMULATE):
-    # At the S_ACCUMULATE clock that transitions:
-    #   read_doppler_index <= 0 (NBA)
-    #   read_range_bin <= 0 (NBA)
-    # These take effect NEXT cycle. At the transition clock itself,
-    # read_doppler_index and read_range_bin still had their old values.
-    # From reset, both were 0. So BRAM captures addr=0*64+0=0.
-    #
-    # For rbin>0 transitions from S_OUTPUT:
-    #   At S_OUTPUT clock:
-    #     read_doppler_index <= 0  (was 0, since it wrapped from 32->0 in 5 bits)
-    #     read_range_bin <= prev_rbin + 1 (NBA, takes effect next cycle)
-    #   At S_OUTPUT clock, the current read_range_bin = prev_rbin,
-    #   read_doppler_index = 0 (wrapped). So BRAM captures addr=0*64+prev_rbin.
-
-    for rbin in range(RANGE_BINS):
-        # Determine what BRAM data was captured during the transition clock
-        # (one cycle before S_LOAD_FFT's first execution cycle).
-        if rbin == 0:
-            # From S_ACCUMULATE: both indices were 0 (from reset or previous NBA)
-            # BRAM captures addr = 0*64+0 = 0  -> data[chirp=0][rbin=0]
-            transition_bram_addr = 0 * RANGE_BINS + 0
-        else:
-            # From S_OUTPUT: read_doppler_index=0 (wrapped), read_range_bin=rbin-1
-            # BRAM captures addr = 0*64+(rbin-1) -> data[chirp=0][rbin-1]
-            transition_bram_addr = 0 * RANGE_BINS + (rbin - 1)
-
-        transition_data_i = bram_i[transition_bram_addr]
-        transition_data_q = bram_q[transition_bram_addr]
-
-        # Now simulate the 32 cycles of S_LOAD_FFT for this range bin.
-        # Register pipeline state at entry:
-        mult_i_reg = 0  # From reset (rbin=0) or from end of previous S_FFT_WAIT
-        mult_q_reg = 0
-
-        fft_in_i_list = []
-        fft_in_q_list = []
-
-        for k in range(DOPPLER_FFT_SIZE):
-            # read_doppler_index = k at this cycle's start
-            # mem_read_addr = k * 64 + rbin
-
-            # What mem_rdata holds THIS cycle:
-            if k == 0:
-                # BRAM captured transition_bram_addr last cycle
-                rd_i = transition_data_i
-                rd_q = transition_data_q
-            else:
-                # BRAM captured addr from PREVIOUS cycle: (k-1)*64 + rbin
-                prev_addr = (k - 1) * RANGE_BINS + rbin
-                rd_i = bram_i[prev_addr]
-                rd_q = bram_q[prev_addr]
-
-            # Stage B: multiply (uses current read_doppler_index = k)
-            new_mult_i = sign_extend(rd_i & 0xFFFF, 16) * \
-                         sign_extend(HAMMING_WINDOW[k] & 0xFFFF, 16)
-            new_mult_q = sign_extend(rd_q & 0xFFFF, 16) * \
-                         sign_extend(HAMMING_WINDOW[k] & 0xFFFF, 16)
-
-            # Stage C: round+shift (uses PREVIOUS cycle's mult)
-            fft_i = (mult_i_reg + (1 << 14)) >> 15
-            fft_q = (mult_q_reg + (1 << 14)) >> 15
-
-            fft_in_i_list.append(sign_extend(fft_i & 0xFFFF, 16))
-            fft_in_q_list.append(sign_extend(fft_q & 0xFFFF, 16))
-
-            # Update pipeline registers for next cycle
-            mult_i_reg = new_mult_i
-            mult_q_reg = new_mult_q
-
-        # 32-point FFT
-        fft_out_re, fft_out_im = fft32.compute(
-            fft_in_i_list, fft_in_q_list, inverse=False
-        )
-
-        doppler_map_i.append(fft_out_re)
-        doppler_map_q.append(fft_out_im)
-
-    return doppler_map_i, doppler_map_q
-

 # =============================================================================
 # Test scenario definitions
@@ -244,9 +91,10 @@ def make_scenario_stationary():
 def make_scenario_moving():
    """Single target with moderate Doppler shift."""
    # v = 15 m/s → fd = 2*v*fc/c ≈ 1050 Hz
-    # PRI = 167 us → Doppler bin = fd * N_chirps * PRI = 1050 * 32 * 167e-6 ≈ 5.6
+    # Long PRI = 167 us → sub-frame 0 bin = fd * 16 * 167e-6 ≈ 2.8 → bin ~3
+    # Short PRI = 175 us → sub-frame 1 bin = fd * 16 * 175e-6 ≈ 2.9 → bin 16+3 = 19
    targets = [Target(range_m=500, velocity_mps=15.0, rcs_dbsm=20.0)]
-    return targets, "Single moving target v=15m/s (~1050Hz Doppler, bin~5-6)"
+    return targets, "Single moving target v=15m/s (~1050Hz Doppler, sf0 bin~3, sf1 bin~19)"


 def make_scenario_two_targets():
@@ -269,12 +117,11 @@ SCENARIOS = {
 # Main generator
 # =============================================================================

-def generate_scenario(name, targets, description, base_dir, use_buggy_model=False):
+def generate_scenario(name, targets, description, base_dir):
    """Generate input hex + golden output for one scenario."""
    print(f"\n{'='*60}")
    print(f"Scenario: {name} — {description}")
-    model_label = "BUGGY (RTL pipeline)" if use_buggy_model else "CLEAN"
-    print(f"Model: {model_label}")
+    print(f"Model: CLEAN (dual 16-pt FFT)")
    print(f"{'='*60}")

    # Generate Doppler frame (32 chirps x 64 range bins)
@@ -292,26 +139,24 @@ def generate_scenario(name, targets, description, base_dir, use_buggy_model=Fals
    input_hex = os.path.join(base_dir, f"doppler_input_{name}.hex")
    write_hex_32bit(input_hex, packed_samples)

-    # ---- Run through Python model ----
-    if use_buggy_model:
-        doppler_i, doppler_q = buggy_process_frame(frame_i, frame_q)
-    else:
-        dp = DopplerProcessor()
-        doppler_i, doppler_q = dp.process_frame(frame_i, frame_q)
+    # ---- Run through Python model (dual 16-pt FFT) ----
+    dp = DopplerProcessor()
+    doppler_i, doppler_q = dp.process_frame(frame_i, frame_q)

    print(f"  Doppler output: {len(doppler_i)} range bins x "
-          f"{len(doppler_i[0])} doppler bins")
+          f"{len(doppler_i[0])} doppler bins (2 sub-frames x {DOPPLER_FFT_SIZE})")

    # ---- Write golden output CSV ----
    # Format: range_bin, doppler_bin, out_i, out_q
    # Ordered same as RTL output: all doppler bins for rbin 0, then rbin 1, ...
+    # Bins 0-15 = sub-frame 0 (long PRI), bins 16-31 = sub-frame 1 (short PRI)
    flat_rbin = []
    flat_dbin = []
    flat_i = []
    flat_q = []

    for rbin in range(RANGE_BINS):
-        for dbin in range(DOPPLER_FFT_SIZE):
+        for dbin in range(DOPPLER_TOTAL_BINS):
            flat_rbin.append(rbin)
            flat_dbin.append(dbin)
            flat_i.append(doppler_i[rbin][dbin])
@@ -331,8 +176,8 @@ def generate_scenario(name, targets, description, base_dir, use_buggy_model=Fals
    peak_info = []
    for rbin in range(RANGE_BINS):
        mags = [abs(doppler_i[rbin][d]) + abs(doppler_q[rbin][d])
-                for d in range(DOPPLER_FFT_SIZE)]
-        peak_dbin = max(range(DOPPLER_FFT_SIZE), key=lambda d: mags[d])
+                for d in range(DOPPLER_TOTAL_BINS)]
+        peak_dbin = max(range(DOPPLER_TOTAL_BINS), key=lambda d: mags[d])
        peak_mag = mags[peak_dbin]
        peak_info.append((rbin, peak_dbin, peak_mag))

@@ -341,33 +186,14 @@ def generate_scenario(name, targets, description, base_dir, use_buggy_model=Fals
    for rbin, dbin, mag in peak_info[:5]:
        i_val = doppler_i[rbin][dbin]
        q_val = doppler_q[rbin][dbin]
-        print(f"    rbin={rbin:2d}, dbin={dbin:2d}, mag={mag:6d}, "
+        sf = dbin // DOPPLER_FFT_SIZE
+        bin_in_sf = dbin % DOPPLER_FFT_SIZE
+        print(f"    rbin={rbin:2d}, dbin={dbin:2d} (sf{sf}:{bin_in_sf:2d}), mag={mag:6d}, "
              f"I={i_val:6d}, Q={q_val:6d}")

-    # ---- Write frame data for debugging ----
-    # Also write per-range-bin FFT input (for debugging pipeline alignment)
-    if use_buggy_model:
-        # Write the buggy FFT inputs for debugging
-        debug_csv = os.path.join(base_dir, f"doppler_fft_inputs_{name}.csv")
-        # Regenerate to capture FFT inputs
-        dp_debug = DopplerProcessor()
-        clean_i, clean_q = dp_debug.process_frame(frame_i, frame_q)
-        # Show the difference between clean and buggy
-        print(f"\n  Comparing clean vs buggy model outputs:")
-        mismatches = 0
-        for rbin in range(RANGE_BINS):
-            for dbin in range(DOPPLER_FFT_SIZE):
-                if (doppler_i[rbin][dbin] != clean_i[rbin][dbin] or
-                    doppler_q[rbin][dbin] != clean_q[rbin][dbin]):
-                    mismatches += 1
-        total = RANGE_BINS * DOPPLER_FFT_SIZE
-        print(f"    {mismatches}/{total} output samples differ "
-              f"({100*mismatches/total:.1f}%)")
-
    return {
        'name': name,
        'description': description,
-        'model': 'buggy' if use_buggy_model else 'clean',
        'peak_info': peak_info[:5],
    }

@@ -375,11 +201,9 @@ def generate_scenario(name, targets, description, base_dir, use_buggy_model=Fals
 def main():
    base_dir = os.path.dirname(os.path.abspath(__file__))

-    use_buggy = '--buggy' in sys.argv
-
    print("=" * 60)
    print("Doppler Processor Co-Sim Golden Reference Generator")
-    print(f"Model: {'BUGGY (RTL pipeline replication)' if use_buggy else 'CLEAN'}")
+    print(f"Architecture: dual {DOPPLER_FFT_SIZE}-pt FFT ({DOPPLER_TOTAL_BINS} total bins)")
    print("=" * 60)

    scenarios_to_run = list(SCENARIOS.keys())
@@ -395,15 +219,14 @@ def main():
    results = []
    for name in scenarios_to_run:
        targets, description = SCENARIOS[name]()
-        r = generate_scenario(name, targets, description, base_dir,
-                              use_buggy_model=use_buggy)
+        r = generate_scenario(name, targets, description, base_dir)
        results.append(r)

    print(f"\n{'='*60}")
    print("Summary:")
    print(f"{'='*60}")
    for r in results:
-        print(f"  {r['name']:<15s} [{r['model']}] top peak: "
+        print(f"  {r['name']:<15s} top peak: "
              f"rbin={r['peak_info'][0][0]}, dbin={r['peak_info'][0][1]}, "
              f"mag={r['peak_info'][0][2]}")

@@ -48,19 +48,24 @@ ADC_BITS = 8              # ADC resolution
 T_LONG_CHIRP = 30e-6      # 30 us long chirp duration
 T_SHORT_CHIRP = 0.5e-6    # 0.5 us short chirp
 T_LISTEN_LONG = 137e-6    # 137 us listening window
+T_PRI_LONG = 167e-6       # T_LONG_CHIRP + T_LISTEN_LONG = 30 us + 137 us
+T_PRI_SHORT = 175e-6      # PRI applied to chirps 16-31 (second sub-frame); NOTE(review): numerically longer than T_PRI_LONG despite the "short" name -- confirm
 N_SAMPLES_LISTEN = int(T_LISTEN_LONG * FS_ADC)  # 54800 samples

 # Processing chain
 CIC_DECIMATION = 4
 FFT_SIZE = 1024
 RANGE_BINS = 64
-DOPPLER_FFT_SIZE = 32
+DOPPLER_FFT_SIZE = 16      # Per sub-frame
+DOPPLER_TOTAL_BINS = 32    # Total output bins (2 sub-frames x 16)
+CHIRPS_PER_SUBFRAME = 16
 CHIRPS_PER_FRAME = 32

 # Derived
 RANGE_RESOLUTION = C_LIGHT / (2 * CHIRP_BW)  # 7.5 m
 MAX_UNAMBIGUOUS_RANGE = C_LIGHT * T_LISTEN_LONG / 2  # ~20.55 km
-VELOCITY_RESOLUTION = WAVELENGTH / (2 * CHIRPS_PER_FRAME * T_LONG_CHIRP)
+VELOCITY_RESOLUTION_LONG = WAVELENGTH / (2 * CHIRPS_PER_SUBFRAME * T_PRI_LONG)
+VELOCITY_RESOLUTION_SHORT = WAVELENGTH / (2 * CHIRPS_PER_SUBFRAME * T_PRI_SHORT)

 # Short chirp LUT (60 entries, 8-bit unsigned)
 SHORT_CHIRP_LUT = [
@@ -384,9 +389,6 @@ def generate_doppler_frame(targets, n_chirps=CHIRPS_PER_FRAME,
                break
        return math.sqrt(-2.0 * math.log(u1)) * math.cos(2.0 * math.pi * u2)

-    # Chirp repetition interval (PRI)
-    t_pri = T_LONG_CHIRP + T_LISTEN_LONG  # ~167 us
-
    frame_i = []
    frame_q = []

@@ -408,8 +410,16 @@ def generate_doppler_frame(targets, n_chirps=CHIRPS_PER_FRAME,
            # Amplitude (simplified)
            amp = target.amplitude / 4.0

-            # Doppler phase for this chirp
-            doppler_phase = 2 * math.pi * target.doppler_hz * chirp_idx * t_pri
+            # Doppler phase for this chirp.
+            # The frame uses staggered PRF: chirps 0-15 use the long PRI,
+            # chirps 16-31 use the short PRI.
+            if chirp_idx < CHIRPS_PER_SUBFRAME:
+                slow_time_s = chirp_idx * T_PRI_LONG
+            else:
+                slow_time_s = (CHIRPS_PER_SUBFRAME * T_PRI_LONG) + \
+                              ((chirp_idx - CHIRPS_PER_SUBFRAME) * T_PRI_SHORT)
+
+            doppler_phase = 2 * math.pi * target.doppler_hz * slow_time_s
            total_phase = doppler_phase + target.phase_deg * math.pi / 180.0

            # Spread across a few bins (sinc-like response from matched filter)
@@ -91,6 +91,7 @@ doppler_processor_optimized dut (
    .doppler_valid(doppler_valid),
    .doppler_bin(doppler_bin),
    .range_bin(range_bin),
+    .sub_frame(),                   // Not used in this testbench
    .processing_active(processing_active),
    .frame_complete(frame_complete),
    .status(dut_status)
@@ -75,6 +75,7 @@ doppler_processor_optimized dut (
    .doppler_valid(doppler_valid),
    .doppler_bin(doppler_bin),
    .range_bin(range_bin),
+    .sub_frame(),                   // Not used in this testbench
    .processing_active(processing_active),
    .frame_complete(frame_complete),
    .status(dut_status)
@@ -0,0 +1,252 @@
+`timescale 1ns / 1ps
+// ============================================================================
+// xfft_16.v — 16-point FFT with AXI-Stream interface
+// ============================================================================
+// Wraps the synthesizable fft_engine (radix-2 DIT) with the AXI-Stream port
+// interface expected by the doppler_processor dual-FFT architecture.
+//
+// Identical interface to xfft_32.v but with N=16.
+//
+// Data format: {Q[15:0], I[15:0]} packed 32-bit.
+// Config tdata[0]: 1 = forward FFT, 0 = inverse FFT.
+// ============================================================================
+
+module xfft_16 (
+    input  wire        aclk,      // all registers update on the rising edge
+    input  wire        aresetn,   // active-low; asynchronous for the FSM registers
+
+    // Configuration channel (AXI-Stream slave)
+    input  wire [7:0]  s_axis_config_tdata,   // only bit 0 is read (1 = forward, 0 = inverse)
+    input  wire        s_axis_config_tvalid,
+    output wire        s_axis_config_tready,  // high only while the FSM is in S_IDLE
+
+    // Data input channel (AXI-Stream slave). There is no tready output:
+    // samples are captured only in S_FEED while fewer than N are stored.
+    input  wire [31:0] s_axis_data_tdata,     // [15:0] = I (real), [31:16] = Q (imag)
+    input  wire        s_axis_data_tvalid,
+    input  wire        s_axis_data_tlast,     // not referenced in this module; frame length is fixed at N
+
+    // Data output channel (AXI-Stream master)
+    output wire [31:0] m_axis_data_tdata,     // {Q[15:0], I[15:0]}
+    output wire        m_axis_data_tvalid,
+    output wire        m_axis_data_tlast,     // asserted with output sample N-1
+    input  wire        m_axis_data_tready
+);
+
+// ============================================================================
+// PARAMETERS
+// ============================================================================
+localparam N     = 16;   // transform length (samples buffered per FFT)
+localparam LOG2N = 4;    // log2(N), forwarded to fft_engine
+
+// ============================================================================
+// INTERNAL SIGNALS
+// ============================================================================
+
+// FSM states
+localparam [2:0] S_IDLE    = 3'd0,   // waiting for a config beat
+                 S_CONFIG  = 3'd1,   // declared but never entered by the FSM below
+                 S_FEED    = 3'd2,   // collecting N input samples into in_buf
+                 S_WAIT    = 3'd3,   // streaming in_buf to fft_engine, capturing its outputs
+                 S_OUTPUT  = 3'd4;   // draining out_buf onto the master stream
+
+reg [2:0] state;
+
+// Configuration
+reg inverse_reg;   // inverted copy of config tdata[0], latched in S_IDLE
+
+// Input buffering
+reg signed [15:0] in_buf_re [0:N-1];
+reg signed [15:0] in_buf_im [0:N-1];
+reg [4:0] in_count;    // samples accepted so far; 5 bits so the value N (16) fits
+
+// Output buffering
+reg signed [15:0] out_buf_re [0:N-1];
+reg signed [15:0] out_buf_im [0:N-1];
+reg [4:0] out_count;   // read pointer used while in S_OUTPUT
+reg [4:0] out_total;   // engine outputs captured so far during S_WAIT
+
+// FFT engine interface
+reg fft_start;
+reg fft_inverse;
+reg signed [15:0] fft_din_re, fft_din_im;
+reg fft_din_valid;
+wire signed [15:0] fft_dout_re, fft_dout_im;
+wire fft_dout_valid;
+wire fft_busy;   // connected to the engine but not read by this wrapper
+wire fft_done;
+
+// Feed counter
+reg [4:0] feed_count;  // samples handed to the engine so far during S_WAIT
+
+// ============================================================================
+// FFT ENGINE INSTANCE
+// ============================================================================
+fft_engine #(
+    .N(N),
+    .LOG2N(LOG2N),
+    .DATA_W(16),
+    .INTERNAL_W(32),
+    .TWIDDLE_W(16),
+    .TWIDDLE_FILE("fft_twiddle_16.mem")
+) fft_core (
+    .clk(aclk),
+    .reset_n(aresetn),
+    .start(fft_start),
+    .inverse(fft_inverse),
+    .din_re(fft_din_re),
+    .din_im(fft_din_im),
+    .din_valid(fft_din_valid),
+    .dout_re(fft_dout_re),
+    .dout_im(fft_dout_im),
+    .dout_valid(fft_dout_valid),
+    .busy(fft_busy),
+    .done(fft_done)
+);
+
+// ============================================================================
+// AXI-STREAM OUTPUTS
+// ============================================================================
+assign s_axis_config_tready = (state == S_IDLE);   // config is accepted only while idle
+assign m_axis_data_tdata  = {out_buf_im[out_count[3:0]], out_buf_re[out_count[3:0]]};   // combinational read, Q in the upper half
+assign m_axis_data_tvalid = (state == S_OUTPUT) && (out_count < N);
+assign m_axis_data_tlast  = (state == S_OUTPUT) && (out_count == N - 1);
+
+// ============================================================================
+// BUFFER WRITE LOGIC — separate always block, NO async reset
+// (presumably kept reset-free so the arrays can map to RAM — confirm).
+// Enable/address/data are registered by the FSM, so each buffer write
+// lands one clock after the FSM issues it.
+// ============================================================================
+reg in_buf_we;
+reg [3:0] in_buf_waddr;
+reg signed [15:0] in_buf_wdata_re, in_buf_wdata_im;
+
+reg out_buf_we;
+reg [3:0] out_buf_waddr;
+reg signed [15:0] out_buf_wdata_re, out_buf_wdata_im;
+
+always @(posedge aclk) begin
+    if (in_buf_we) begin
+        in_buf_re[in_buf_waddr] <= in_buf_wdata_re;
+        in_buf_im[in_buf_waddr] <= in_buf_wdata_im;
+    end
+    if (out_buf_we) begin
+        out_buf_re[out_buf_waddr] <= out_buf_wdata_re;
+        out_buf_im[out_buf_waddr] <= out_buf_wdata_im;
+    end
+end
+
+// ============================================================================
+// MAIN FSM
+// Sequence: S_IDLE -> S_FEED -> S_WAIT -> S_OUTPUT -> S_IDLE.
+// ============================================================================
+always @(posedge aclk or negedge aresetn) begin
+    if (!aresetn) begin
+        state        <= S_IDLE;
+        inverse_reg  <= 1'b0;
+        in_count     <= 0;
+        out_count    <= 0;
+        out_total    <= 0;
+        feed_count   <= 0;
+        fft_start    <= 1'b0;
+        fft_inverse  <= 1'b0;
+        fft_din_re   <= 0;
+        fft_din_im   <= 0;
+        fft_din_valid <= 1'b0;
+        in_buf_we    <= 1'b0;
+        in_buf_waddr <= 0;
+        in_buf_wdata_re <= 0;
+        in_buf_wdata_im <= 0;
+        out_buf_we   <= 1'b0;
+        out_buf_waddr <= 0;
+        out_buf_wdata_re <= 0;
+        out_buf_wdata_im <= 0;
+    end else begin
+        // Defaults: these strobes are single-cycle unless a state re-asserts them below.
+        fft_start     <= 1'b0;
+        fft_din_valid <= 1'b0;
+        in_buf_we     <= 1'b0;
+        out_buf_we    <= 1'b0;
+
+        case (state)
+
+        S_IDLE: begin
+            in_count <= 0;
+            if (s_axis_config_tvalid) begin
+                inverse_reg <= ~s_axis_config_tdata[0];   // tdata[0]=1 selects forward, so invert to get the inverse flag
+                state       <= S_FEED;
+                in_count    <= 0;
+                feed_count  <= 0;
+            end
+        end
+
+        S_FEED: begin
+            if (in_count < N) begin
+                if (s_axis_data_tvalid) begin
+                    in_buf_we       <= 1'b1;
+                    in_buf_waddr    <= in_count[3:0];
+                    in_buf_wdata_re <= s_axis_data_tdata[15:0];    // I
+                    in_buf_wdata_im <= s_axis_data_tdata[31:16];   // Q
+                    in_count <= in_count + 1;
+                end
+            end else if (feed_count == 0) begin   // always true here: feed_count was cleared on leaving S_IDLE
+                fft_start   <= 1'b1;              // one-cycle start pulse (cleared by the default above)
+                fft_inverse <= inverse_reg;
+                feed_count  <= 0;
+                state       <= S_WAIT;
+                out_total   <= 0;
+            end
+        end
+
+        S_WAIT: begin
+            // Feed the buffered samples to the engine, one per clock.
+            if (feed_count < N) begin
+                fft_din_re   <= in_buf_re[feed_count[3:0]];
+                fft_din_im   <= in_buf_im[feed_count[3:0]];
+                fft_din_valid <= 1'b1;
+                feed_count   <= feed_count + 1;
+            end
+
+            // Capture engine outputs in arrival order; at most N are stored.
+            if (fft_dout_valid && out_total < N) begin
+                out_buf_we       <= 1'b1;
+                out_buf_waddr    <= out_total[3:0];
+                out_buf_wdata_re <= fft_dout_re;
+                out_buf_wdata_im <= fft_dout_im;
+                out_total <= out_total + 1;
+            end
+
+            // A capture issued in this same cycle still lands one clock later via out_buf_we.
+            if (fft_done) begin
+                state     <= S_OUTPUT;
+                out_count <= 0;
+            end
+        end
+
+        S_OUTPUT: begin
+            if (m_axis_data_tready || !m_axis_data_tvalid) begin   // beat accepted, or nothing being presented
+                if (out_count < N) begin
+                    if (m_axis_data_tready) begin
+                        out_count <= out_count + 1;   // advance to the next buffered sample
+                    end
+                end
+                if (out_count >= N - 1 && m_axis_data_tready) begin   // last sample accepted
+                    state <= S_IDLE;
+                end
+            end
+        end
+
+        default: state <= S_IDLE;   // covers S_CONFIG and unused encodings
+
+        endcase
+    end
+end
+
+// ============================================================================
+// MEMORY INIT (simulation only)
+// Zeroes both buffers at time 0 when SIMULATION is defined.
+// ============================================================================
+`ifdef SIMULATION
+integer init_k;
+initial begin
+    for (init_k = 0; init_k < N; init_k = init_k + 1) begin
+        in_buf_re[init_k]  = 0;
+        in_buf_im[init_k]  = 0;
+        out_buf_re[init_k] = 0;
+        out_buf_im[init_k] = 0;
+    end
+end
+`endif
+
+endmodule