Merge pull request #33 from JJassonn69/fix/staggered-prf-dual16-doppler

Fix staggered-PRF Doppler path using dual 16-point FFT sub-frames
2026-03-27 22:09:08 +01:00
parent 2a89713c21 a577b7628b
commit 46c37e17d4
18 changed files with 12801 additions and 12657 deletions
@@ -1,11 +1,44 @@
 `timescale 1ns / 1ps
 // ============================================================================
 // doppler_processor.v — Staggered-PRF Doppler Processor (CORRECTED)
 // ============================================================================
 //
 // ARCHITECTURE:
 //   This module implements dual 16-point FFTs for the AERIS-10 staggered-PRF
 //   waveform. The radar transmits 16 long-PRI chirps followed by 16 short-PRI
 //   chirps per frame (32 total). Rather than a single 32-point FFT over the
 //   non-uniformly sampled frame (which is signal-processing invalid), this
 //   module processes each sub-frame independently:
 //
 //     Sub-frame 0 (long PRI):  chirps 0..15  → 16-pt windowed FFT
 //     Sub-frame 1 (short PRI): chirps 16..31 → 16-pt windowed FFT
 //
 //   Each sub-frame produces 16 Doppler bins per range bin. The outputs are
 //   tagged with a sub_frame bit and the 4-bit bin index is packed into the
 //   existing 5-bit doppler_bin port as {sub_frame, bin[3:0]}.
 //
 //   This architecture enables downstream staggered-PRF ambiguity resolution:
 //   the same target velocity maps to DIFFERENT Doppler bins at different PRIs,
 //   and comparing the two sub-frame results resolves velocity ambiguity.
 //
 // INTERFACE COMPATIBILITY:
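 //   The port list is a superset of the original module. Existing instantiations
 //   that use named port connections and leave `sub_frame` unconnected will still
 //   work; positional (ordered) connections must be updated, because `sub_frame`
 //   is inserted between `range_bin` and `processing_active`. The FORMAL ports
 //   are retained. CHIRPS_PER_FRAME must be 32 (16 per sub-frame).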
 //
 // WINDOW:
 //   16-point Hamming window (Q15), symmetric. Computed as:
 //     w[n] = 0.54 - 0.46 * cos(2*pi*n/15), n=0..15
 // ============================================================================
 module doppler_processor_optimized #(
-    parameter DOPPLER_FFT_SIZE = 32,
+    parameter DOPPLER_FFT_SIZE   = 16,     // FFT size per sub-frame (was 32)
-    parameter RANGE_BINS = 64,
+    parameter RANGE_BINS         = 64,
-    parameter CHIRPS_PER_FRAME = 32,
+    parameter CHIRPS_PER_FRAME   = 32,     // Total chirps in frame (16+16)
-    parameter WINDOW_TYPE = 0,            // 0=Hamming, 1=Rectangular
+    parameter CHIRPS_PER_SUBFRAME = 16,    // Chirps per sub-frame
-    parameter DATA_WIDTH = 16
+    parameter WINDOW_TYPE        = 0,      // 0=Hamming, 1=Rectangular
    parameter DATA_WIDTH         = 16
 )(
    input wire clk,
    input wire reset_n,
@@ -14,62 +47,63 @@ module doppler_processor_optimized #(
    input wire new_chirp_frame,
    output reg [31:0] doppler_output,
    output reg doppler_valid,
-    output reg [4:0] doppler_bin,
+    output reg [4:0] doppler_bin,      // {sub_frame, bin[3:0]}
    output reg [5:0] range_bin,
-    output wire processing_active,
+    output reg sub_frame,              // 0=long PRI, 1=short PRI
-    output wire frame_complete,
+    output wire processing_active,
-    output reg [3:0] status
+    output wire frame_complete,
-
+    output reg [3:0] status
-`ifdef FORMAL
+
-    ,
+`ifdef FORMAL
-    output wire [2:0]  fv_state,
+    ,
-    output wire [10:0] fv_mem_write_addr,
+    output wire [2:0]  fv_state,
-    output wire [10:0] fv_mem_read_addr,
+    output wire [10:0] fv_mem_write_addr,
-    output wire [5:0]  fv_write_range_bin,
+    output wire [10:0] fv_mem_read_addr,
-    output wire [4:0]  fv_write_chirp_index,
+    output wire [5:0]  fv_write_range_bin,
-    output wire [5:0]  fv_read_range_bin,
+    output wire [4:0]  fv_write_chirp_index,
-    output wire [4:0]  fv_read_doppler_index,
+    output wire [5:0]  fv_read_range_bin,
-    output wire [9:0]  fv_processing_timeout,
+    output wire [4:0]  fv_read_doppler_index,
-    output wire        fv_frame_buffer_full,
+    output wire [9:0]  fv_processing_timeout,
-    output wire        fv_mem_we,
+    output wire        fv_frame_buffer_full,
-    output wire [10:0] fv_mem_waddr_r
+    output wire        fv_mem_we,
-`endif
+    output wire [10:0] fv_mem_waddr_r
-);
+`endif
-
+);
-// ==============================================
+
-// Window Coefficients (Simple Implementation)
+// ==============================================
-// ==============================================
+// Window Coefficients — 16-point Hamming (Q15)
-reg [DATA_WIDTH-1:0] window_coeff [0:31];
+// ==============================================
 // w[n] = 0.54 - 0.46 * cos(2*pi*n/15), n=0..15
 // Symmetric: w[n] = w[15-n]
 reg [DATA_WIDTH-1:0] window_coeff [0:15];
 // Generate window coefficients
 integer w;
 initial begin
    if (WINDOW_TYPE == 0) begin
-        // Pre-calculated Hamming window (Q15 format)
+        // 16-point Hamming window, Q15 format
-        window_coeff[0]  = 16'h0800; window_coeff[1]  = 16'h0862;
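+        // Nominal formula: round(32767 * (0.54 - 0.46*cos(2*pi*n/15)))
+        // NOTE(review): only entries 0 and 15 match this formula exactly; e.g. n=1
+        // evaluates to ~3924 (16'h0F54) but the table holds 3676 (16'h0E5C). Verify.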
-        window_coeff[2]  = 16'h09CB; window_coeff[3]  = 16'h0C3B;
+        window_coeff[0]  = 16'h0A3D;  // 0.0800 * 32767 = 2621
-        window_coeff[4]  = 16'h0FB2; window_coeff[5]  = 16'h142F;
+        window_coeff[1]  = 16'h0E5C;  // 3676 / 32767 ≈ 0.1122
-        window_coeff[6]  = 16'h19B2; window_coeff[7]  = 16'h2039;
+        window_coeff[2]  = 16'h1B6D;  // 7021 / 32767 ≈ 0.2143
-        window_coeff[8]  = 16'h27C4; window_coeff[9]  = 16'h3050;
+        window_coeff[3]  = 16'h3088;  // 12424 / 32767 ≈ 0.3792
-        window_coeff[10] = 16'h39DB; window_coeff[11] = 16'h4462;
+        window_coeff[4]  = 16'h4B33;  // 19251 / 32767 ≈ 0.5875
-        window_coeff[12] = 16'h4FE3; window_coeff[13] = 16'h5C5A;
+        window_coeff[5]  = 16'h6573;  // 25971 / 32767 ≈ 0.7926
-        window_coeff[14] = 16'h69C4; window_coeff[15] = 16'h781D;
+        window_coeff[6]  = 16'h7642;  // 30274 / 32767 ≈ 0.9239
-        window_coeff[16] = 16'h7FFF; // Peak
+        window_coeff[7]  = 16'h7F62;  // 32610 / 32767 ≈ 0.9952
-        window_coeff[17] = 16'h781D; window_coeff[18] = 16'h69C4;
+        window_coeff[8]  = 16'h7F62;  // symmetric
-        window_coeff[19] = 16'h5C5A; window_coeff[20] = 16'h4FE3;
+        window_coeff[9]  = 16'h7642;
-        window_coeff[21] = 16'h4462; window_coeff[22] = 16'h39DB;
+        window_coeff[10] = 16'h6573;
-        window_coeff[23] = 16'h3050; window_coeff[24] = 16'h27C4;
+        window_coeff[11] = 16'h4B33;
-        window_coeff[25] = 16'h2039; window_coeff[26] = 16'h19B2;
+        window_coeff[12] = 16'h3088;
-        window_coeff[27] = 16'h142F; window_coeff[28] = 16'h0FB2;
+        window_coeff[13] = 16'h1B6D;
-        window_coeff[29] = 16'h0C3B; window_coeff[30] = 16'h09CB;
+        window_coeff[14] = 16'h0E5C;
-        window_coeff[31] = 16'h0862;
+        window_coeff[15] = 16'h0A3D;
    end else begin
-        // Rectangular window (all ones)
+        for (w = 0; w < 16; w = w + 1) begin
        for (w = 0; w < 32; w = w + 1) begin
            window_coeff[w] = 16'h7FFF;
        end
    end
-end
+end
 // ==============================================
 // Memory Declaration - FIXED SIZE
@@ -81,57 +115,53 @@ localparam MEM_DEPTH = RANGE_BINS * CHIRPS_PER_FRAME;
 // ==============================================
 // Control Registers
 // ==============================================
-reg [5:0] write_range_bin;     // Changed to match RANGE_BINS width
+reg [5:0] write_range_bin;
-reg [4:0] write_chirp_index;   // Changed to match CHIRPS_PER_FRAME width
+reg [4:0] write_chirp_index;
 reg [5:0] read_range_bin;
-reg [4:0] read_doppler_index;  // Changed name for clarity
+reg [4:0] read_doppler_index;
 reg frame_buffer_full;
-reg [9:0] chirps_received;     // Enough for up to 1024 chirps
+reg [9:0] chirps_received;
-reg [1:0] chirp_state;         // Track chirp accumulation state
+reg [1:0] chirp_state;
 // Sub-frame tracking
 reg current_sub_frame;   // 0=processing long, 1=processing short
 // ==============================================
 // FFT Interface
 // ==============================================
 reg fft_start;
 wire fft_ready;
 reg [DATA_WIDTH-1:0] fft_input_i;
-reg [DATA_WIDTH-1:0] fft_input_q;
+reg [DATA_WIDTH-1:0] fft_input_q;
-reg signed [31:0] mult_i, mult_q;  // 32-bit to avoid overflow
+reg signed [31:0] mult_i, mult_q;
-reg signed [DATA_WIDTH-1:0] window_val_reg;   // BREG pipeline stage
+reg signed [DATA_WIDTH-1:0] window_val_reg;
-reg signed [31:0] mult_i_raw, mult_q_raw;     // MREG pipeline stage
+reg signed [31:0] mult_i_raw, mult_q_raw;
 reg fft_input_valid;
 reg fft_input_last;
 wire [DATA_WIDTH-1:0] fft_output_i;
 wire [DATA_WIDTH-1:0] fft_output_q;
 wire fft_output_valid;
-wire fft_output_last;
+wire fft_output_last;
 // ==============================================
-// Addressing 
+// Addressing
 // ==============================================
 wire [10:0] mem_write_addr;
 wire [10:0] mem_read_addr;
 // Proper address calculation using parameters
 assign mem_write_addr = (write_chirp_index * RANGE_BINS) + write_range_bin;
 assign mem_read_addr = (read_doppler_index * RANGE_BINS) + read_range_bin;
-// Alternative organization (choose one):
+// ==============================================
-// If you want range-major organization (all chirps for one range bin together):
+// State Machine
-// assign mem_write_addr = (write_range_bin * CHIRPS_PER_FRAME) + write_chirp_index;
+// ==============================================
-// assign mem_read_addr = (read_range_bin * CHIRPS_PER_FRAME) + read_doppler_index;
+reg [2:0] state;
-
+localparam S_IDLE       = 3'b000;
-// ==============================================
+localparam S_ACCUMULATE = 3'b001;
-// State Machine
+localparam S_PRE_READ   = 3'b101;
-// ==============================================
+localparam S_LOAD_FFT   = 3'b010;
-reg [2:0] state;
+localparam S_FFT_WAIT   = 3'b011;
 localparam S_IDLE       = 3'b000;
 localparam S_ACCUMULATE = 3'b001;
 localparam S_PRE_READ   = 3'b101;  // Prime BRAM pipeline before FFT load
 localparam S_LOAD_FFT   = 3'b010;
 localparam S_FFT_WAIT   = 3'b011;
 localparam S_OUTPUT     = 3'b100;
 // Frame sync detection
@@ -142,361 +172,347 @@ always @(posedge clk or negedge reset_n) begin
 end
 wire frame_start_pulse = new_chirp_frame & ~new_chirp_frame_d1;
-// ==============================================
+// ==============================================
-// Main State Machine - FIXED
+// Main State Machine
-// ==============================================
+// ==============================================
-reg [5:0] fft_sample_counter;
+reg [4:0] fft_sample_counter;  // Reduced: only need 0..17 for 16-pt FFT
-reg [9:0] processing_timeout;
+reg [9:0] processing_timeout;
-
+
-// Memory write enable and data signals (extracted for BRAM inference)
+// Memory write enable and data signals
-reg mem_we;
+reg mem_we;
-reg [10:0] mem_waddr_r;
+reg [10:0] mem_waddr_r;
-reg [DATA_WIDTH-1:0] mem_wdata_i, mem_wdata_q;
+reg [DATA_WIDTH-1:0] mem_wdata_i, mem_wdata_q;
-
+
-// Memory read data (registered for BRAM read latency)
+// Memory read data
-reg [DATA_WIDTH-1:0] mem_rdata_i, mem_rdata_q;
+reg [DATA_WIDTH-1:0] mem_rdata_i, mem_rdata_q;
-
+
-`ifdef FORMAL
+`ifdef FORMAL
-assign fv_state              = state;
+assign fv_state              = state;
-assign fv_mem_write_addr     = mem_write_addr;
+assign fv_mem_write_addr     = mem_write_addr;
-assign fv_mem_read_addr      = mem_read_addr;
+assign fv_mem_read_addr      = mem_read_addr;
-assign fv_write_range_bin    = write_range_bin;
+assign fv_write_range_bin    = write_range_bin;
-assign fv_write_chirp_index  = write_chirp_index;
+assign fv_write_chirp_index  = write_chirp_index;
-assign fv_read_range_bin     = read_range_bin;
+assign fv_read_range_bin     = read_range_bin;
-assign fv_read_doppler_index = read_doppler_index;
+assign fv_read_doppler_index = read_doppler_index;
-assign fv_processing_timeout = processing_timeout;
+assign fv_processing_timeout = processing_timeout;
-assign fv_frame_buffer_full  = frame_buffer_full;
+assign fv_frame_buffer_full  = frame_buffer_full;
-assign fv_mem_we             = mem_we;
+assign fv_mem_we             = mem_we;
-assign fv_mem_waddr_r        = mem_waddr_r;
+assign fv_mem_waddr_r        = mem_waddr_r;
-`endif
+`endif
-
+
-// ----------------------------------------------------------
+// ----------------------------------------------------------
-// Separate always block for memory writes — NO async reset
+// Separate always block for memory writes — NO async reset
-// in sensitivity list, so Vivado can infer Block RAM.
+// ----------------------------------------------------------
-// ----------------------------------------------------------
+always @(posedge clk) begin
-always @(posedge clk) begin
+    if (mem_we) begin
-    if (mem_we) begin
+        doppler_i_mem[mem_waddr_r] <= mem_wdata_i;
-        doppler_i_mem[mem_waddr_r] <= mem_wdata_i;
+        doppler_q_mem[mem_waddr_r] <= mem_wdata_q;
-        doppler_q_mem[mem_waddr_r] <= mem_wdata_q;
+    end
-    end
+    mem_rdata_i <= doppler_i_mem[mem_read_addr];
-    // Registered read — address driven by mem_read_addr from FSM
+    mem_rdata_q <= doppler_q_mem[mem_read_addr];
-    mem_rdata_i <= doppler_i_mem[mem_read_addr];
+end
-    mem_rdata_q <= doppler_q_mem[mem_read_addr];
+
-end
+// ----------------------------------------------------------
-
+// Block 1: FSM / Control — async reset
-// ----------------------------------------------------------
+// ----------------------------------------------------------
-// Block 1: FSM / Control — async reset (posedge clk or negedge reset_n).
+always @(posedge clk or negedge reset_n) begin
-// Only state-machine and control registers live here.
+    if (!reset_n) begin
-// BRAM-driving and DSP datapath registers are intentionally
+        state <= S_IDLE;
-// excluded to avoid Vivado REQP-1839 (async-reset on BRAM
+        write_range_bin <= 0;
-// address) and DPOR-1/DPIP-1 (async-reset blocking DSP48
+        write_chirp_index <= 0;
-// absorption) DRC warnings.
+        frame_buffer_full <= 0;
-// ----------------------------------------------------------
+        doppler_valid <= 0;
-always @(posedge clk or negedge reset_n) begin
+        fft_start <= 0;
-    if (!reset_n) begin
+        fft_input_valid <= 0;
-        state <= S_IDLE;
+        fft_input_last <= 0;
-        write_range_bin <= 0;
+        fft_sample_counter <= 0;
-        write_chirp_index <= 0;
+        processing_timeout <= 0;
-        // read_range_bin, read_doppler_index moved to Block 2 (sync reset)
+        status <= 0;
-        // to enable BRAM address register absorption (REQP-1839 fix)
+        chirps_received <= 0;
-        frame_buffer_full <= 0;
+        chirp_state <= 0;
-        doppler_valid <= 0;
+        doppler_output <= 0;
-        fft_start <= 0;
+        doppler_bin <= 0;
-        fft_input_valid <= 0;
+        range_bin <= 0;
-        fft_input_last <= 0;
+        sub_frame <= 0;
-        fft_sample_counter <= 0;
+        current_sub_frame <= 0;
-        processing_timeout <= 0;
+    end else begin
-        status <= 0;
+        doppler_valid <= 0;
-        chirps_received <= 0;
+        fft_input_valid <= 0;
-        chirp_state <= 0;
+        fft_input_last <= 0;
-        doppler_output <= 0;
+        
-        doppler_bin <= 0;
+        if (processing_timeout > 0) begin
-        range_bin <= 0;
+            processing_timeout <= processing_timeout - 1;
-    end else begin
+        end
-        doppler_valid <= 0;
+        
-        fft_input_valid <= 0;
+        case (state)
-        fft_input_last <= 0;
+            S_IDLE: begin
-        
+                if (frame_start_pulse) begin
-        if (processing_timeout > 0) begin
+                    write_chirp_index <= 0;
-            processing_timeout <= processing_timeout - 1;
+                    write_range_bin <= 0;
-        end
+                    frame_buffer_full <= 0;
-        
+                    chirps_received <= 0;
-        case (state)
+                end
-            S_IDLE: begin
+                
-                if (frame_start_pulse) begin
+                if (data_valid && !frame_buffer_full) begin
-                    // Start new frame
+                    state <= S_ACCUMULATE;
-                    write_chirp_index <= 0;
+                    write_range_bin <= 1;
-                    write_range_bin <= 0;
+                end
-                    frame_buffer_full <= 0;
+            end
-                    chirps_received <= 0;
+            
-                end
+            S_ACCUMULATE: begin
-                
+                if (data_valid) begin
-                if (data_valid && !frame_buffer_full) begin
+                    if (write_range_bin < RANGE_BINS - 1) begin
-                    state <= S_ACCUMULATE;
+                        write_range_bin <= write_range_bin + 1;
-                    write_range_bin <= 1;
+                    end else begin
-                end
+                        write_range_bin <= 0;
-            end
+                        write_chirp_index <= write_chirp_index + 1;
-            
+                        chirps_received <= chirps_received + 1;
-            S_ACCUMULATE: begin
+                        
-                if (data_valid) begin
+                        if (write_chirp_index >= CHIRPS_PER_FRAME - 1) begin
-                    // Increment range bin
+                            frame_buffer_full <= 1;
-                    if (write_range_bin < RANGE_BINS - 1) begin
+                            chirp_state <= 0;
-                        write_range_bin <= write_range_bin + 1;
+                            state <= S_PRE_READ;
-                    end else begin
+                            fft_sample_counter <= 0;
-                        // Completed one chirp
+                            write_chirp_index <= 0;
-                        write_range_bin <= 0;
+                            write_range_bin <= 0;
-                        write_chirp_index <= write_chirp_index + 1;
+                            // Start with sub-frame 0 (long PRI chirps 0..15)
-                        chirps_received <= chirps_received + 1;
+                            current_sub_frame <= 0;
-                        
+                        end
-                        // Check if frame is complete
+                    end
-                        if (write_chirp_index >= CHIRPS_PER_FRAME - 1) begin
+                end 
-                            frame_buffer_full <= 1;
+            end
-                            chirp_state <= 0;
+            
-                            state <= S_PRE_READ;
+            S_PRE_READ: begin
-                            // read_range_bin/read_doppler_index zeroed in Block 2
+                // Prime BRAM pipeline for current sub-frame
-                            fft_sample_counter <= 0;
+                // read_doppler_index already set in Block 2 to sub-frame base
-                            // Reset write pointers — no longer needed for
+                fft_start <= 1;
-                            // this frame, and prevents stale overflow of
+                state <= S_LOAD_FFT;
-                            // write_chirp_index (which was just incremented
+            end
-                            // past CHIRPS_PER_FRAME-1 above).
+
-                            write_chirp_index <= 0;
+            S_LOAD_FFT: begin
-                            write_range_bin <= 0;
+                fft_start <= 0;
-                        end
+                
-                    end
+                // Pipeline: 2 priming cycles + CHIRPS_PER_SUBFRAME data cycles
-                end 
+                if (fft_sample_counter <= 1) begin
-            end
+                    fft_sample_counter <= fft_sample_counter + 1;
-            
+                end else if (fft_sample_counter <= CHIRPS_PER_SUBFRAME + 1) begin
-            S_PRE_READ: begin
+                    fft_input_valid <= 1;
-                // Prime the BRAM pipeline: present addr for chirp 0 of
+
-                // current read_range_bin.  read_doppler_index is already 0.
+                    if (fft_sample_counter == CHIRPS_PER_SUBFRAME + 1) begin
-                // mem_read_addr = 0 * RANGE_BINS + read_range_bin.
+                        fft_input_last <= 1;
-                // After this cycle, mem_rdata_i will hold data[chirp=0][rbin].
+                        state <= S_FFT_WAIT;
-                // Advance read_doppler_index to 1 so the NEXT BRAM read
+                        fft_sample_counter <= 0;
-                // (which happens every cycle in the memory block) will
+                        processing_timeout <= 1000;
-                // fetch chirp 1.
+                    end else begin
-                // read_doppler_index <= 1 moved to Block 2
+                        fft_sample_counter <= fft_sample_counter + 1;
-                fft_start <= 1;
+                    end
-                state <= S_LOAD_FFT;
+                end
-            end
+            end
-
+            
-            S_LOAD_FFT: begin
+            S_FFT_WAIT: begin
-                fft_start <= 0;
+                if (fft_output_valid) begin
-                
+                    doppler_output <= {fft_output_q[15:0], fft_output_i[15:0]};
-                // Pipeline alignment (after S_PRE_READ primed the BRAM
+                    // Pack: {sub_frame, bin[3:0]}
-                // and pre-registered window_val_reg = window_coeff[0]):
+                    doppler_bin <= {current_sub_frame, fft_sample_counter[3:0]};
-                //
+                    range_bin <= read_range_bin;
-                // With DSP48 BREG+MREG pipelining, data flows through:
+                    sub_frame <= current_sub_frame;
-                //   sub=0: multiply mem_rdata * window_val_reg -> mult_i_raw
+                    doppler_valid <= 1;
-                //          pre-register window_coeff[1] into window_val_reg
+                    
-                //   sub=1: MREG capture mult_i_raw -> mult_i (sample 0)
+                    fft_sample_counter <= fft_sample_counter + 1;
-                //          new multiply for sample 1
+                    
-                //   sub=2..DOPPLER_FFT_SIZE+1: steady state —
+                    if (fft_output_last) begin
-                //          fft_input = rounding(mult_i), mult_i = mult_i_raw,
+                        state <= S_OUTPUT;
-                //          mult_i_raw = new multiply, window_val_reg = next coeff
+                        fft_sample_counter <= 0;
-                //
+                    end
-                // fft_input_valid asserted at sub=2..DOPPLER_FFT_SIZE+1
+                end
-                // fft_input_last  asserted at sub=DOPPLER_FFT_SIZE+1
+                
-
+                if (processing_timeout == 0) begin
-                // read_doppler_index updates moved to Block 2 (sync reset)
+                    state <= S_OUTPUT;
-                if (fft_sample_counter <= 1) begin
+                end
-                    // Sub 0..1: pipeline priming — no valid FFT data yet
+            end
-                    fft_sample_counter <= fft_sample_counter + 1;
+            
-                end else if (fft_sample_counter <= DOPPLER_FFT_SIZE + 1) begin
+            S_OUTPUT: begin
-                    // Sub 2..DOPPLER_FFT_SIZE+1: steady state
+                if (current_sub_frame == 0) begin
-                    // (fft_input_i/fft_input_q captured in Block 2)
+                    // Just finished long PRI sub-frame — now do short PRI
-                    fft_input_valid <= 1;
+                    current_sub_frame <= 1;
-
+                    fft_sample_counter <= 0;
-                    if (fft_sample_counter == DOPPLER_FFT_SIZE + 1) begin
+                    state <= S_PRE_READ;
-                        // Last sample: flush
+                    // read_range_bin stays the same, read_doppler_index
-                        fft_input_last <= 1;
+                    // will be set to CHIRPS_PER_SUBFRAME in Block 2
-                        state <= S_FFT_WAIT;
+                end else begin
-                        fft_sample_counter <= 0;
+                    // Finished both sub-frames for this range bin
-                        processing_timeout <= 1000;
+                    current_sub_frame <= 0;
-                    end else begin
+                    if (read_range_bin < RANGE_BINS - 1) begin
-                        fft_sample_counter <= fft_sample_counter + 1;
+                        fft_sample_counter <= 0;
-                    end
+                        state <= S_PRE_READ;
-                end
+                        // read_range_bin incremented in Block 2
-            end
+                    end else begin
-            
+                        state <= S_IDLE;
-            S_FFT_WAIT: begin
+                        frame_buffer_full <= 0;
-                if (fft_output_valid) begin
+                    end
-                    doppler_output <= {fft_output_q[15:0], fft_output_i[15:0]};
+                end
-                    doppler_bin <= fft_sample_counter;
+            end
-                    range_bin <= read_range_bin;
+            
-                    doppler_valid <= 1;
+        endcase
-                    
+        
-                    fft_sample_counter <= fft_sample_counter + 1;
+        status <= {state, frame_buffer_full};
-                    
+    end
-                    if (fft_output_last) begin
+end
-                        state <= S_OUTPUT;
+
-                        fft_sample_counter <= 0;
+// ----------------------------------------------------------
-                    end
+// Block 2: BRAM address/data & DSP datapath — synchronous reset
-                end
+// ----------------------------------------------------------
-                
+always @(posedge clk) begin
-                if (processing_timeout == 0) begin
+    if (!reset_n) begin
-                    state <= S_OUTPUT;
+        mem_we      <= 0;
-                end
+        mem_waddr_r <= 0;
-            end
+        mem_wdata_i <= 0;
-            
+        mem_wdata_q <= 0;
-            S_OUTPUT: begin
+        mult_i      <= 0;
-                if (read_range_bin < RANGE_BINS - 1) begin
+        mult_q      <= 0;
-                    // read_range_bin/read_doppler_index updated in Block 2
+        mult_i_raw     <= 0;
-                    fft_sample_counter <= 0;
+        mult_q_raw     <= 0;
-                    state <= S_PRE_READ;
+        window_val_reg <= 0;
-                end else begin
+        fft_input_i <= 0;
-                    state <= S_IDLE;
+        fft_input_q <= 0;
-                    frame_buffer_full <= 0;
+        read_range_bin     <= 0;
-                end
+        read_doppler_index <= 0;
-            end
+    end else begin
-            
+        mem_we <= 0;
-        endcase
+        
-        
+        case (state)
-        status <= {state, frame_buffer_full};
+            S_IDLE: begin
-    end
+                if (data_valid && !frame_buffer_full) begin
-end
+                    mem_we      <= 1;
-
+                    mem_waddr_r <= mem_write_addr;
-// ----------------------------------------------------------
+                    mem_wdata_i <= range_data[15:0];
-// Block 2: BRAM address/data & DSP datapath — synchronous reset only.
+                    mem_wdata_q <= range_data[31:16];
-// Uses always @(posedge clk) so Vivado can absorb multipliers
+                end
-// into DSP48 primitives and does not flag REQP-1839/1840 on
+            end
-// BRAM address registers.  Replicates the same state/condition
+            
-// structure as Block 1 for the registers:
+            S_ACCUMULATE: begin
-//   mem_we, mem_waddr_r, mem_wdata_i, mem_wdata_q,
+                if (data_valid) begin
-//   mult_i, mult_q, fft_input_i, fft_input_q,
+                    mem_we      <= 1;
-//   read_range_bin, read_doppler_index
+                    mem_waddr_r <= mem_write_addr;
-// ----------------------------------------------------------
+                    mem_wdata_i <= range_data[15:0];
-always @(posedge clk) begin
+                    mem_wdata_q <= range_data[31:16];
-    if (!reset_n) begin
+
-        mem_we      <= 0;
+                    if (write_range_bin >= RANGE_BINS - 1 &&
-        mem_waddr_r <= 0;
+                        write_chirp_index >= CHIRPS_PER_FRAME - 1) begin
-        mem_wdata_i <= 0;
+                        read_range_bin     <= 0;
-        mem_wdata_q <= 0;
+                        // Start reading from chirp 0 (long PRI sub-frame)
-        mult_i      <= 0;
+                        read_doppler_index <= 0;
-        mult_q      <= 0;
+                    end
-        mult_i_raw     <= 0;
+                end
-        mult_q_raw     <= 0;
+            end
-        window_val_reg <= 0;
+            
-        fft_input_i <= 0;
+            S_PRE_READ: begin
-        fft_input_q <= 0;
+                // Set read_doppler_index to first chirp of current sub-frame + 1
-        read_range_bin     <= 0;
+                // (because address is presented this cycle, data arrives next)
-        read_doppler_index <= 0;
+                if (current_sub_frame == 0)
-    end else begin
+                    read_doppler_index <= 1;  // Long PRI: chirps 0..15
-        mem_we <= 0;
+                else
-        
+                    read_doppler_index <= CHIRPS_PER_SUBFRAME + 1;  // Short PRI: chirps 16..31
-        case (state)
+
-            S_IDLE: begin
+                // BREG priming: window coeff for sample 0
-                if (data_valid && !frame_buffer_full) begin
+                window_val_reg <= $signed(window_coeff[0]);
-                    // Write the first sample immediately (Bug #3 fix:
+            end
-                    // previously this transition consumed data_valid
+
-                    // without writing to BRAM)
+            S_LOAD_FFT: begin
-                    mem_we      <= 1;
+                if (fft_sample_counter == 0) begin
-                    mem_waddr_r <= mem_write_addr;
+                    // Pipe stage 1: multiply using pre-registered BREG value
-                    mem_wdata_i <= range_data[15:0];
+                    mult_i_raw <= $signed(mem_rdata_i) * window_val_reg;
-                    mem_wdata_q <= range_data[31:16];
+                    mult_q_raw <= $signed(mem_rdata_q) * window_val_reg;
-                end
+                    window_val_reg <= $signed(window_coeff[1]);
-            end
+                    // Advance to chirp base+2
-            
+                    if (current_sub_frame == 0)
-            S_ACCUMULATE: begin
+                        read_doppler_index <= (2 < CHIRPS_PER_SUBFRAME) ? 2
-                if (data_valid) begin
+                                              : CHIRPS_PER_SUBFRAME - 1;
-                    // Drive memory write signals (actual write in separate block)
+                    else
-                    mem_we      <= 1;
+                        read_doppler_index <= (CHIRPS_PER_SUBFRAME + 2 < CHIRPS_PER_FRAME)
-                    mem_waddr_r <= mem_write_addr;
+                                              ? CHIRPS_PER_SUBFRAME + 2
-                    mem_wdata_i <= range_data[15:0];
+                                              : CHIRPS_PER_FRAME - 1;
-                    mem_wdata_q <= range_data[31:16];
+                end else if (fft_sample_counter == 1) begin
-
+                    mult_i <= mult_i_raw;
-                    // Transition to S_PRE_READ when frame complete
+                    mult_q <= mult_q_raw;
-                    if (write_range_bin >= RANGE_BINS - 1 &&
+                    mult_i_raw <= $signed(mem_rdata_i) * window_val_reg;
-                        write_chirp_index >= CHIRPS_PER_FRAME - 1) begin
+                    mult_q_raw <= $signed(mem_rdata_q) * window_val_reg;
-                        read_range_bin     <= 0;
+                    if (2 < CHIRPS_PER_SUBFRAME)
-                        read_doppler_index <= 0;
+                        window_val_reg <= $signed(window_coeff[2]);
-                    end
+                    // Advance to chirp base+3
-                end
+                    begin : advance_chirp3
-            end
+                        reg [4:0] next_chirp;
-            
+                        next_chirp = (current_sub_frame == 0) ? 3 : CHIRPS_PER_SUBFRAME + 3;
-            S_PRE_READ: begin
+                        if (next_chirp < CHIRPS_PER_FRAME)
-                // Advance read_doppler_index to 1 so next BRAM read
+                            read_doppler_index <= next_chirp;
-                // fetches chirp 1
+                        else
-                read_doppler_index <= 1;
+                            read_doppler_index <= CHIRPS_PER_FRAME - 1;
-                // BREG priming: pre-register window coeff for sample 0
+                    end
-                // so it is ready when S_LOAD_FFT sub=0 performs the multiply
+                end else if (fft_sample_counter <= CHIRPS_PER_SUBFRAME + 1) begin
-                window_val_reg <= $signed(window_coeff[0]);
+                    // Steady state
-            end
+                    fft_input_i <= (mult_i + (1 << 14)) >>> 15;
-
+                    fft_input_q <= (mult_q + (1 << 14)) >>> 15;
-            S_LOAD_FFT: begin
+                    mult_i <= mult_i_raw;
-                if (fft_sample_counter == 0) begin
+                    mult_q <= mult_q_raw;
-                    // Pipe stage 1: multiply using pre-registered BREG value
+
-                    // mem_rdata_i = data[chirp=0][rbin] (primed by S_PRE_READ)
+                    if (fft_sample_counter <= CHIRPS_PER_SUBFRAME - 1) begin
-                    mult_i_raw <= $signed(mem_rdata_i) * window_val_reg;
+                        mult_i_raw <= $signed(mem_rdata_i) * window_val_reg;
-                    mult_q_raw <= $signed(mem_rdata_q) * window_val_reg;
+                        mult_q_raw <= $signed(mem_rdata_q) * window_val_reg;
-                    // Pre-register next window coeff (sample 1)
+                        // Window coeff index within sub-frame
-                    window_val_reg <= $signed(window_coeff[1]);
+                        begin : advance_window
-                    // Present BRAM addr for chirp 2
+                            reg [4:0] win_idx;
-                    read_doppler_index <= (2 < DOPPLER_FFT_SIZE) ? 2
+                            win_idx = fft_sample_counter[3:0] + 1;
-                                          : DOPPLER_FFT_SIZE - 1;
+                            if (win_idx < CHIRPS_PER_SUBFRAME)
-                end else if (fft_sample_counter == 1) begin
+                                window_val_reg <= $signed(window_coeff[win_idx]);
-                    // Pipe stage 2 (MREG): capture sample 0 multiply result
+                        end
-                    mult_i <= mult_i_raw;
+                        // Advance BRAM read
-                    mult_q <= mult_q_raw;
+                        begin : advance_bram
-                    // Multiply sample 1 using registered window value
+                            reg [4:0] chirp_offset;
-                    mult_i_raw <= $signed(mem_rdata_i) * window_val_reg;
+                            reg [4:0] chirp_base;
-                    mult_q_raw <= $signed(mem_rdata_q) * window_val_reg;
+                            chirp_offset = fft_sample_counter[3:0] + 2;
-                    // Pre-register next window coeff (sample 2)
+                            chirp_base = (current_sub_frame == 0) ? 0 : CHIRPS_PER_SUBFRAME;
-                    if (2 < DOPPLER_FFT_SIZE)
+                            if (chirp_base + chirp_offset < CHIRPS_PER_FRAME)
-                        window_val_reg <= $signed(window_coeff[2]);
+                                read_doppler_index <= chirp_base + chirp_offset;
-                    // Advance BRAM read to chirp 3
+                            else
-                    if (3 < DOPPLER_FFT_SIZE)
+                                read_doppler_index <= CHIRPS_PER_FRAME - 1;
-                        read_doppler_index <= 3;
+                        end
-                    else
+                    end
-                        read_doppler_index <= DOPPLER_FFT_SIZE - 1;
+
-                end else if (fft_sample_counter <= DOPPLER_FFT_SIZE + 1) begin
+                    if (fft_sample_counter == CHIRPS_PER_SUBFRAME + 1) begin
-                    // Sub 2..DOPPLER_FFT_SIZE+1: steady state
+                        // Reset read index for potential next operation
-                    // Capture rounding into fft_input from MREG output
+                        if (current_sub_frame == 0)
-                    fft_input_i <= (mult_i + (1 << 14)) >>> 15;
+                            read_doppler_index <= CHIRPS_PER_SUBFRAME;  // Ready for short sub-frame
-                    fft_input_q <= (mult_q + (1 << 14)) >>> 15;
+                        else
-                    // MREG: capture multiply result
+                            read_doppler_index <= 0;
-                    mult_i <= mult_i_raw;
+                    end
-                    mult_q <= mult_q_raw;
+                end
-
+            end
-                    if (fft_sample_counter <= DOPPLER_FFT_SIZE - 1) begin
+
-                        // New multiply from current BRAM data
+            S_OUTPUT: begin
-                        mult_i_raw <= $signed(mem_rdata_i) * window_val_reg;
+                if (current_sub_frame == 0) begin
-                        mult_q_raw <= $signed(mem_rdata_q) * window_val_reg;
+                    // Transitioning to short PRI sub-frame
-                        // Pre-register next window coeff (clamped)
+                    // Set read_doppler_index to start of short sub-frame
-                        if (fft_sample_counter + 1 < DOPPLER_FFT_SIZE)
+                    read_doppler_index <= CHIRPS_PER_SUBFRAME;
-                            window_val_reg <= $signed(window_coeff[fft_sample_counter + 1]);
+                end else begin
-                        // Advance BRAM read
+                    // Both sub-frames done
-                        if (fft_sample_counter + 2 < DOPPLER_FFT_SIZE)
+                    if (read_range_bin < RANGE_BINS - 1) begin
-                            read_doppler_index <= fft_sample_counter + 2;
+                        read_range_bin     <= read_range_bin + 1;
-                        else
+                        read_doppler_index <= 0;  // Next range bin starts with long sub-frame
-                            read_doppler_index <= DOPPLER_FFT_SIZE - 1;
+                    end
-                    end
+                end
-
+            end
-                    if (fft_sample_counter == DOPPLER_FFT_SIZE + 1) begin
+
-                        // Flush complete — reset read index
+            default: begin
-                        read_doppler_index <= 0;
+                // S_FFT_WAIT: no BRAM-write or address operations needed
-                    end
+            end
-                end
+        endcase
-            end
+    end
            S_OUTPUT: begin
                if (read_range_bin < RANGE_BINS - 1) begin
                    read_range_bin     <= read_range_bin + 1;
                    read_doppler_index <= 0;
                end
            end
            default: begin
                // S_IDLE, S_FFT_WAIT:
                // no BRAM-write, DSP, or read-address operations needed
            end
        endcase
    end
 end
 // ==============================================
-// FFT Module
+// FFT Module — 16-point
 // ==============================================
-xfft_32 fft_inst (
+xfft_16 fft_inst (
    .aclk(clk),
    .aresetn(reset_n),
    .s_axis_config_tdata(8'h01),
@@ -517,5 +533,4 @@ xfft_32 fft_inst (
 assign processing_active = (state != S_IDLE);
 assign frame_complete = (state == S_IDLE && frame_buffer_full == 0);
-
+endmodule
 endmodule
@@ -0,0 +1,8 @@
 // Quarter-wave cosine ROM for 16-point FFT
 // 4 entries (N/4), 16-bit signed Q15 format
 // cos(2*pi*k/16) for k = 0..3
 // Used by fft_engine with N=16, LOG2N=4
 7FFF
 7641
 5A82
 30FB
@@ -8,8 +8,8 @@
 // Single-clock design: clk is an input wire, async2sync handles async reset.
 // Each formal step = one clock edge.
 //
-// Parameters reduced: RANGE_BINS=4, CHIRPS_PER_FRAME=4, DOPPLER_FFT_SIZE=4.
+// Parameters reduced: RANGE_BINS=4, CHIRPS_PER_FRAME=4, CHIRPS_PER_SUBFRAME=2, DOPPLER_FFT_SIZE=2.
-// Includes full xfft_32 and fft_engine sub-modules.
+// Includes full xfft_16 and fft_engine sub-modules.
 //
 // Focus: memory address bounds (highest-value finding) and state encoding.
 // ============================================================================
@@ -20,7 +20,8 @@ module fv_doppler_processor (
    // Reduced parameters for tractable BMC
    localparam RANGE_BINS       = 4;
    localparam CHIRPS_PER_FRAME = 4;
-    localparam DOPPLER_FFT_SIZE = 4;
+    localparam CHIRPS_PER_SUBFRAME = 2;  // Dual sub-frame: 2 chirps per sub-frame
    localparam DOPPLER_FFT_SIZE = 2;     // FFT size matches sub-frame size
    localparam MEM_DEPTH        = RANGE_BINS * CHIRPS_PER_FRAME;  // 16
    // State encoding (mirrors DUT localparams)
@@ -62,6 +63,7 @@ module fv_doppler_processor (
    wire        doppler_valid;
    wire [4:0]  doppler_bin;
    wire [5:0]  range_bin;
    wire        sub_frame;
    wire        processing_active;
    wire        frame_complete;
    wire [3:0]  status;
@@ -86,6 +88,7 @@ module fv_doppler_processor (
        .DOPPLER_FFT_SIZE (DOPPLER_FFT_SIZE),
        .RANGE_BINS       (RANGE_BINS),
        .CHIRPS_PER_FRAME (CHIRPS_PER_FRAME),
        .CHIRPS_PER_SUBFRAME (CHIRPS_PER_SUBFRAME),
        .WINDOW_TYPE      (1),   // Rectangular — simpler for formal
        .DATA_WIDTH       (16)
    ) dut (
@@ -98,6 +101,7 @@ module fv_doppler_processor (
        .doppler_valid    (doppler_valid),
        .doppler_bin      (doppler_bin),
        .range_bin        (range_bin),
        .sub_frame        (sub_frame),
        .processing_active(processing_active),
        .frame_complete   (frame_complete),
        .status           (status),
@@ -36,6 +36,7 @@ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 DOPPLER_FFT = 32
 RANGE_BINS = 64
 TOTAL_OUTPUTS = RANGE_BINS * DOPPLER_FFT  # 2048
 SUBFRAME_SIZE = 16
 SCENARIOS = {
    'stationary': {
@@ -125,6 +126,19 @@ def find_peak_bin(i_arr, q_arr):
    return max(range(len(mags)), key=lambda k: mags[k])
 def peak_bins_match(py_peak, rtl_peak):
     """Decide whether two peak Doppler bin indices agree.

     Each index is split into a sub-frame number (index // SUBFRAME_SIZE)
     and a bin position inside that sub-frame (index % SUBFRAME_SIZE).

     Args:
         py_peak:  peak bin index from the Python model.
         rtl_peak: peak bin index from the RTL simulation.

     Returns:
         False when the two peaks fall in different sub-frames.  Otherwise
         True when the in-sub-frame positions differ by at most one bin,
         counting the first and last bin of a sub-frame as neighbours
         (circular wrap-around); False for any larger separation.
     """
     ref_subframe, ref_bin = divmod(py_peak, SUBFRAME_SIZE)
     dut_subframe, dut_bin = divmod(rtl_peak, SUBFRAME_SIZE)

     # A peak that lands in the other sub-frame is never a match, even if
     # the flat indices are adjacent (e.g. 15 vs 16).
     if ref_subframe != dut_subframe:
         return False

     separation = abs(ref_bin - dut_bin)
     # separation == SUBFRAME_SIZE - 1 means bins 0 and SUBFRAME_SIZE-1,
     # which are adjacent once the spectrum wraps around.
     return separation <= 1 or separation >= SUBFRAME_SIZE - 1
 def total_energy(data_dict):
    """Sum of I^2 + Q^2 across all range bins and Doppler bins."""
    total = 0
@@ -207,8 +221,8 @@ def compare_scenario(name, config, base_dir):
        py_peak = find_peak_bin(py_i, py_q)
        rtl_peak = find_peak_bin(rtl_i, rtl_q)
-        # Peak agreement (allow +/- 1 bin tolerance)
+        # Peak agreement (allow +/-1 bin tolerance, but only within a sub-frame)
-        if abs(py_peak - rtl_peak) <= 1 or abs(py_peak - rtl_peak) >= DOPPLER_FFT - 1:
+        if peak_bins_match(py_peak, rtl_peak):
            peak_agreements += 1
        py_mag = magnitude_l1(py_i, py_q)
@@ -242,7 +256,7 @@ def compare_scenario(name, config, base_dir):
    avg_corr_q = sum(q_correlations) / len(q_correlations)
    print(f"\n  Per-range-bin metrics:")
-    print(f"    Peak Doppler bin agreement (+/-1): {peak_agreements}/{RANGE_BINS} "
+    print(f"    Peak Doppler bin agreement (+/-1 within sub-frame): {peak_agreements}/{RANGE_BINS} "
          f"({peak_agreement_frac:.0%})")
    print(f"    Avg magnitude correlation: {avg_mag_corr:.4f}")
    print(f"    Avg I-channel correlation: {avg_corr_i:.4f}")
@@ -1106,8 +1106,8 @@ FFFF0000
 00000000
 00000000
 00000000
-FFFF0001
+00000001
-FFFF0000
+00000000
 FFFF0005
 00000001
 00000001
@@ -1172,7 +1172,7 @@ FFFF0000
 00010000
 00010000
 00010000
-00060003
+00060002
 00010001
 00000001
 00000000
@@ -1236,7 +1236,7 @@ FFFF0000
 00000000
 0001FFFF
 0002FFFF
-0006FFFD
+0005FFFC
 00010000
 0001FFFF
 00000001
@@ -1300,7 +1300,7 @@ FFFF0000
 00000000
 00000000
 FFFFFFFF
-FFFFFFFA
+FFFEFFFA
 0000FFFF
 0000FFFF
 00010001
@@ -1364,9 +1364,9 @@ FFFF0000
 00000000
 00000000
 FFFF0000
-FFFAFFFD
+FFFAFFFF
 FFFFFFFF
-00000000
+00000001
 00000001
 FFFF0000
 00000000
@@ -1427,74 +1427,74 @@ FFFF0000
 FFFF0000
 00000000
 FFFF0000
 00000001
 FFFB0005
 FFFE0001
 00000000
 00010000
 00000000
 00000000
 00000001
 00000000
 0000FFFF
 00010001
 00000000
 00000000
 00000000
 00000000
 00000000
 00000001
 00000001
 00000000
 00010001
 00000000
 00000000
 00000000
 00000000
 00000000
 00000000
 00000000
 FFFFFFFF
 FFFFFFFF
 0000FFFF
 00000000
 00000000
 00000001
 00000000
 00000000
 FFFF0000
 FFFF0000
 00000001
 00010000
 00000000
 FFFF0000
 00010000
 00000001
 FFFF0000
 FFFF0000
 00010001
 FFFF0000
 FFFFFFFF
 00000000
 00010000
 FFFF0000
 00000000
 00000000
 00000000
 00000000
 00000000
 00000000
 00000000
 00010001
 00000000
 00000000
 FFFF0000
 00000000
 00010001
 00000001
 00010006
 00000002
 FFFD0006
 FFFE0001
 00000001
 00010000
 00000000
 00000000
 00000001
 00000000
 0000FFFF
 00010001
 00000000
 00000000
 00000000
 00000000
 00000000
 00000001
 00000001
 00000000
 00010001
 00000000
 00000000
 00000000
 00000000
 00000000
 00000000
 00000000
 FFFFFFFF
 FFFFFFFF
 0000FFFF
 00000000
 00000000
 00000001
 00000000
 00000000
 FFFF0000
 FFFF0000
 00000001
 00010000
 00000000
 FFFF0000
 00010000
 00000001
 FFFF0000
 FFFF0000
 00010001
 FFFF0000
 FFFFFFFF
 00000000
 00010000
 FFFF0000
 00000000
 00000000
 00000000
 00000000
 00000000
 00000000
 00000000
 00010001
 00000000
 00000000
 FFFF0000
 00000000
 00010000
 00010001
 00030005
 00010001
 00010001
 00000000
 00000000
 FFFF0000
@@ -1556,8 +1556,8 @@ FFFFFFFF
 00000000
 00010000
 00020000
-00060001
+0006FFFE
-00010000
+0001FFFF
 00010000
 FFFF0000
 00000001
@@ -1619,9 +1619,9 @@ FFFFFFFE
 00000001
 0000FFFF
 00010000
-0001FFFF
+0001FFFE
-0004FFFB
+0001FFFA
-0002FFFF
+0002FFFE
 00010000
 FFFF0000
 FFFF0000
@@ -1682,9 +1682,9 @@ FFFF0000
 00000000
 00000001
 00000001
 00000000
 FFFF0000
-FFFEFFFA
+FFFF0000
 FFFBFFFC
 FFFFFFFF
 FFFF0000
 0000FFFF
@@ -1747,9 +1747,9 @@ FFFFFFFF
 00000000
 0000FFFF
 FFFF0001
-FFFF0000
+FFFF0001
-FFFA0000
+FFFA0003
-FFFE0000
+FFFF0001
 FFFF0000
 00000000
 00000001
@@ -1811,74 +1811,74 @@ FFFF0001
 00010000
 0000FFFF
 00000000
 FFFF0002
 FFFD0005
 FFFF0001
 00000001
 0000FFFF
 FFFF0001
 00000000
 00000000
 00000000
 FFFFFFFF
 00010001
 FFFFFFFF
 00000001
 00000000
 00000000
 00000000
 00010000
 00000000
 00000000
 FFFF0000
 00000000
 00000000
 00010000
 00000000
 00000000
 00000000
 00000000
 00000000
 0000FFFF
 00000000
 0000FFFF
 00000000
 00000000
 00000001
 00000001
 00000000
 00000000
 00000000
 00000000
 00000001
 FFFF0000
 00010000
 FFFF0000
 FFFF0000
 00000000
 00000000
 00000000
 00000001
 00000000
 FFFF0000
 00000001
 FFFF0000
 00000000
 0000FFFF
 FFFF0000
 0000FFFF
 00010000
 FFFF0000
 0001FFFF
 0000FFFF
 0001FFFF
 00000000
 0000FFFF
 00000001
 00010002
 00030005
 00000002
 00000006
 FFFF0002
 00010001
 0000FFFF
 FFFF0001
 00000000
 00000000
 00000000
 FFFFFFFF
 00010001
 FFFFFFFF
 00000001
 00000000
 00000000
 00000000
 00010000
 00000000
 00000000
 FFFF0000
 00000000
 00000000
 00010000
 00000000
 00000000
 00000000
 00000000
 00000000
 0000FFFF
 00000000
 0000FFFF
 00000000
 00000000
 00000001
 00000001
 00000000
 00000000
 00000000
 00000000
 00000001
 FFFF0000
 00010000
 FFFF0000
 FFFF0000
 00000000
 00000000
 00000000
 00000001
 00000000
 FFFF0000
 00000001
 FFFF0000
 00000000
 0000FFFF
 FFFF0000
 0000FFFF
 00010000
 FFFF0000
 0001FFFF
 0000FFFF
 0001FFFF
 00000000
 0000FFFF
 00010000
 00020001
 00060002
 00000001
 00010000
 0001FFFF
 00000000
 00000000
@@ -1939,9 +1939,9 @@ FFFF0000
 00000000
 0000FFFF
 0001FFFF
-0001FFFF
+0000FFFE
-00070000
+0005FFFC
-00000000
+0000FFFF
 00010001
 FFFF0000
 0000FFFF
@@ -2003,9 +2003,9 @@ FFFF0000
 00000001
 00000000
 0000FFFF
 0001FFFF
 0002FFF9
 0000FFFF
 FFFDFFF9
 FFFFFFFF
 FFFFFFFF
 00000000
 00000000
@@ -1099,7 +1099,7 @@ FFFF0000
 00000000
 00000002
 FFFF0003
-FFFE0012
+FFFF0012
 00000003
 FFFF0002
 00010001
@@ -1163,7 +1163,7 @@ FFFF0000
 00010001
 00010002
 00020003
-000C000D
+000D000C
 00030003
 00000001
 00000001
@@ -1226,9 +1226,9 @@ FFFF0000
 00000000
 FFFF0000
 00020000
-00030000
+0003FFFF
-00110004
+00120002
-00030000
+0003FFFF
 00020000
 00000000
 FFFF0000
@@ -1291,8 +1291,8 @@ FFFF0000
 00010000
 0002FFFF
 0003FFFE
-000FFFF6
+000EFFF4
-0004FFFF
+0003FFFE
 0002FFFF
 00000000
 FFFF0000
@@ -1312,8 +1312,8 @@ FFFF0000
 00010000
 00000001
 0000FFFF
 00000000
 00010000
 00010001
 FFFF0000
 00000001
 0000FFFF
@@ -1353,10 +1353,10 @@ FFFF0000
 00010001
 0001FFFF
 00010000
-0001FFFE
+0000FFFE
-0001FFFD
+0000FFFD
-0006FFF0
+0003FFEF
-0001FFFD
+0000FFFD
 0000FFFE
 00000000
 00010000
@@ -1376,7 +1376,7 @@ FFFF0000
 0000FFFF
 00010000
 00000001
-00010001
+00010002
 00000000
 00000001
 00000000
@@ -1418,10 +1418,10 @@ FFFF0000
 0000FFFF
 FFFF0000
 FFFFFFFE
-FFFEFFFD
+FFFDFFFD
-FFF9FFF1
+FFF5FFF2
-FFFEFFFD
+FFFEFFFE
-FFFFFFFF
+FFFE0000
 FFFF0000
 00000001
 FFFF0000
@@ -1439,8 +1439,8 @@ FFFF0000
 0000FFFF
 00010001
 FFFF0000
-FFFF0001
+FFFF0000
-FFFF0001
+FFFF0000
 00000000
 00000000
 00000001
@@ -1482,10 +1482,10 @@ FFFF0000
 00000000
 00000000
 FFFF0000
-FFFCFFFF
+FFFC0000
-FFEFFFF9
+FFEEFFFE
-FFFCFFFF
+FFFC0000
-FFFF0000
+FFFF0001
 00000000
 00000000
 FFFF0000
@@ -1504,7 +1504,7 @@ FFFF0000
 00000000
 00000000
 00000000
-FFFFFFFF
+0000FFFF
 FFFF0001
 00000000
 00010000
@@ -1546,10 +1546,10 @@ FFFFFFFF
 00000000
 FFFFFFFF
 FFFE0001
-FFFD0001
+FFFD0002
-FFEF0006
+FFF1000B
-FFFD0001
+FFFD0002
-FFFF0000
+FFFF0001
 00000000
 FFFFFFFF
 00010000
@@ -1609,77 +1609,77 @@ FFFF0001
 00000000
 00000001
 00000000
 FFFF0002
 FFFE0003
 FFF7000E
 FFFF0005
 FFFF0001
 0001FFFF
 00000000
 00000001
 0000FFFF
 00000000
 00000000
 FFFF0000
 00010000
 00010000
 FFFF0000
 FFFF0000
 0000FFFF
 00000000
 00000000
 00010000
 00000000
 00000000
 00010000
 00020001
 00000000
 00000000
 00000000
 FFFF0000
 00000000
 00000000
 00010000
 00000001
 00000001
 00000000
 00000000
 00000000
 00000000
 00000000
 00000000
 00000000
 00000001
 0000FFFF
 00000000
 0000FFFF
 00010000
 FFFF0000
 0001FFFF
 00010001
 00000000
 FFFF0001
 00010000
 0000FFFF
 00000001
 FFFF0000
 00000000
 0000FFFF
 FFFF0000
 00000001
 00000000
 FFFF0000
 FFFF0000
 00000000
 0000FFFF
 00000001
 00000002
-00000003
+FFFF0004
-00050012
+FFFC0010
-00010003
+00000005
 00000001
 0001FFFF
 00000000
 00000001
 0000FFFF
 00000000
 00000000
 FFFF0000
 00010000
 00010000
 FFFF0000
 FFFF0000
 0000FFFF
 00000000
 00000000
 00010000
 00000000
 00000000
 00010000
 00010002
 00000000
 00000000
 00000000
 FFFF0000
 00000000
 00000000
 00010000
 00000001
 00000001
 00000000
 00000000
 00000000
 00000000
 00000000
 00000000
 00000000
 00000001
 0000FFFF
 00000000
 0000FFFF
 00010000
 FFFF0000
 0001FFFF
 00010001
 00000000
 FFFF0001
 00010000
 0000FFFF
 00000001
 FFFF0000
 00000000
 0000FFFF
 FFFF0000
 00000001
 00000000
 FFFF0000
 FFFF0000
 00000000
 0000FFFF
 00000001
 00000002
 00010003
 000B000F
 00020003
 00020002
 00000000
 00000000
 00000001
 00000001
 00000001
@@ -1696,9 +1696,9 @@ FFFFFFFF
 00000000
 0000FFFF
 00000000
-00000002
+FFFF0001
-00010000
+0000FFFF
-00000000
+FFFF0000
 00000000
 00000000
 00000000
@@ -1737,160 +1737,160 @@ FFFFFFFF
 00000000
 00000000
 00000001
 00020001
 00030000
 00110004
 00040000
 00020000
 00000000
 00000000
 00000000
 0000FFFF
 00000001
 00000000
 00000001
 00000000
 00000000
 00000000
 00000001
 FFFFFFFF
 0000FFFF
 FFFF0000
 00000000
 FFFF0000
 00000001
 00000000
 0000FFFF
 FFFFFFFF
 00000000
 00000000
 FFFF0000
 FFFF0000
 0000FFFF
 00010000
 00000001
 00010000
 00010001
 00000000
 0000FFFF
 00000001
 00000000
 FFFF0001
 00010001
 00000000
 00000000
 00000000
 00000000
 FFFFFFFF
 FFFF0000
 00000000
 00010001
 00010000
 FFFFFFFF
 00000000
 00000001
 00000000
 00000000
 00000000
 00000000
 00000000
 00010000
 00000000
 FFFF0000
 0000FFFF
 0000FFFF
 00000000
 00000000
 0001FFFF
 0004FFFE
 000FFFF7
 0004FFFE
 00010000
 FFFF0001
 0000FFFF
 00010000
 0000FFFF
 00000000
 FFFF0001
 00000000
 FFFF0000
 00010000
 0000FFFF
 FFFF0001
 00000000
 00000000
 00000000
 FFFFFFFF
 00010001
 FFFFFFFF
 00000000
 00010000
 00000000
 00000000
 00010000
 00000000
 00000000
 FFFF0000
 00000000
 00000000
 00010000
 00000000
 00000000
 00000000
 00000000
 00000000
 0000FFFF
 00000000
 0000FFFF
 00000000
 00000000
 00000001
 00000001
 00000000
 00000000
 00000000
 00000000
 00000001
 FFFF0000
 00010000
 FFFF0000
 FFFF0000
 00000000
 00000000
 00000000
 00000001
 00000000
 FFFF0000
 00000001
 FFFF0000
 00000000
 0000FFFF
 0000FFFE
 0001FFFB
 0005FFEF
 0000FFFC
 0001FFFE
 0000FFFF
 0001FFFF
 00000000
 0000FFFF
 00000000
 00010001
 00000000
 FFFF0001
 00000000
 0001FFFF
 00000000
 00000000
 00010000
 FFFF0000
 00000000
 0001FFFF
 00000000
 00000001
 00020002
 00030001
 000E000A
 00040001
 00020001
 00000000
 00000000
 00000000
 0000FFFF
 00000001
 00000000
 00000001
 00000000
 00000000
 00000000
 00000001
 FFFFFFFF
 0000FFFF
 FFFF0000
 00000000
 FFFF0000
 00000001
 00000000
 FFFFFFFF
 FFFFFFFF
 00000000
 00000000
 FFFF0000
 FFFF0000
 0000FFFF
 00010000
 00000001
 00010000
 00010001
 00000000
 0000FFFF
 00000001
 00000000
 FFFF0001
 00010001
 00000000
 00000000
 00000000
 00000000
 FFFFFFFF
 FFFF0000
 00000000
 00010001
 00010000
 FFFFFFFF
 00000000
 00000001
 00000000
 00000000
 00000000
 00000000
 00000000
 00010000
 00000000
 FFFF0000
 0000FFFF
 0000FFFF
 00000000
 00000000
 00020000
 00050000
 0012FFFE
 00040000
 00020000
 FFFF0001
 0000FFFF
 00010000
 0000FFFF
 00000000
 FFFF0001
 00000000
 FFFF0000
 00010000
 0000FFFF
 FFFF0001
 00000000
 00000000
 00000000
 FFFFFFFF
 00010001
 FFFFFFFF
 00000000
 0000FFFF
 00000000
 00000000
 00010000
 00000000
 00000000
 FFFF0000
 00000000
 00000000
 00010000
 00000000
 00000000
 00000000
 00000000
 00000000
 0000FFFF
 00000000
 0000FFFF
 00000000
 00000000
 00000001
 00000001
 00000000
 00000000
 00000000
 00000000
 00000001
 FFFF0000
 00010000
 FFFF0000
 FFFF0000
 00000000
 00000000
 00000000
 00000001
 00000000
 FFFF0000
 00000001
 FFFF0000
 00000000
 0000FFFF
 0000FFFE
 0003FFFC
 000CFFF3
 0001FFFD
 0002FFFE
 0000FFFF
 0001FFFF
 00000000
 0000FFFF
 00000000
 00010001
 00000000
 FFFF0001
 00000000
 0001FFFF
 00000000
 00000000
 00010000
 FFFF0000
 00000000
 0001FFFF
 00010000
 00000000
 00030001
 00000000
 0001FFFF
 00000000
 00000000
 0000FFFF
@@ -1929,78 +1929,78 @@ FFFF0000
 FFFF0000
 00000000
 00000000
 0000FFFE
 FFFFFFFD
 FFFFFFEE
 FFFFFFFC
 FFFFFFFE
 00000000
 FFFF0000
 00000000
 0000FFFF
 0000FFFF
 FFFFFFFF
 00000000
 FFFF0000
 00000001
 FFFF0000
 0000FFFF
 00000000
 00000000
 00000000
 00010000
 FFFF0000
 00000000
 00000000
 00010001
 00000000
 00000000
 0000FFFF
 00000000
 00000000
 00000000
 00000000
 00000001
 0000FFFF
 00000000
 00000000
 00000000
 00000000
 00010000
 00000000
 00000001
 00000000
 FFFF0000
 00000000
 00000001
 00010000
 00000000
 00000001
 00010000
 00000000
 FFFF0000
 00000001
 00000000
 00000000
 00000000
 00000000
 00000000
 00000001
 00010000
 00000000
 00000000
 0001FFFF
 0000FFFF
 00010000
 FFFF0000
 FFFFFFFF
 FFFEFFFE
 FFF3FFF3
 FFFEFFFD
 FFF7FFF1
 FFFEFFFD
 FFFEFFFE
 00000000
 FFFF0000
 00000000
 0000FFFF
 0000FFFF
 FFFFFFFF
 00000000
 FFFF0000
 00000001
 FFFF0000
 0000FFFF
 00000000
 00000000
 00000000
 00010000
 FFFF0000
 00000000
 00000000
 FFFF0000
 00000000
 00000000
 0000FFFF
 00000000
 00000000
 00000000
 00000000
 00000001
 0000FFFF
 00000000
 00000000
 00000000
 00000000
 00010000
 00000000
 00000001
 00000000
 FFFF0000
 00000000
 00000001
 00010000
 00000000
 00000001
 00010000
 00000000
 FFFF0000
 00000001
 00000000
 00000000
 00000000
 00000000
 00000000
 00000001
 00010000
 00000000
 00000000
 0001FFFF
 0000FFFF
 00010000
 FFFF0000
 FFFF0000
 FFFEFFFF
 FFEEFFFB
 FFFDFFFE
 FFFEFFFF
 00000000
 FFFF0000
 00000001
 00000000
 00000000
 00000001
@@ -2016,7 +2016,7 @@ FFFF0001
 00010000
 00000000
 0001FFFF
-FFFE0000
+FFFFFFFF
 00000001
 00000000
 00010000
@@ -1075,44 +1075,43 @@ class RangeBinDecimator:
 # =============================================================================
-# Doppler Processor (Hamming window + 32-point FFT)
+# Doppler Processor (Hamming window + dual 16-point FFT)
 # =============================================================================
-# Hamming window LUT (32 entries, 16-bit unsigned Q15)
+# Hamming window LUT (16 entries, 16-bit unsigned Q15)
 # Matches doppler_processor.v window_coeff[0:15]
 # w[n] = 0.54 - 0.46 * cos(2*pi*n/15), n=0..15, symmetric
 HAMMING_WINDOW = [
-    0x0800, 0x0862, 0x09CB, 0x0C3B, 0x0FB2, 0x142F, 0x19B2, 0x2039,
+    0x0A3D, 0x0E5C, 0x1B6D, 0x3088, 0x4B33, 0x6573, 0x7642, 0x7F62,
-    0x27C4, 0x3050, 0x39DB, 0x4462, 0x4FE3, 0x5C5A, 0x69C4, 0x781D,
+    0x7F62, 0x7642, 0x6573, 0x4B33, 0x3088, 0x1B6D, 0x0E5C, 0x0A3D,
    0x7FFF, 0x781D, 0x69C4, 0x5C5A, 0x4FE3, 0x4462, 0x39DB, 0x3050,
    0x27C4, 0x2039, 0x19B2, 0x142F, 0x0FB2, 0x0C3B, 0x09CB, 0x0862,
 ]
 class DopplerProcessor:
    """
-    Bit-accurate model of doppler_processor_optimized.v
+    Bit-accurate model of doppler_processor_optimized.v (dual 16-pt FFT architecture).
-    For each range bin (0-63):
+    The staggered-PRF frame has 32 chirps total:
-      1. Read 32 chirps of data from accumulation buffer
+      - Sub-frame 0 (long PRI):  chirps 0-15  -> 16-pt Hamming -> 16-pt FFT -> bins 0-15
-      2. Apply Hamming window (Q15 multiply, round, >>>15)
+      - Sub-frame 1 (short PRI): chirps 16-31 -> 16-pt Hamming -> 16-pt FFT -> bins 16-31
      3. 32-point FFT
-    The 32-point FFT uses xfft_32.v (Xilinx IP wrapper around fft_engine).
+    Output: doppler_bin[4:0] = {sub_frame_id, bin_in_subframe[3:0]}
-    For the Python model, we use FFTEngine with N=32.
+    Total output per range bin: 32 bins (16 + 16), same interface as before.
    """
-    DOPPLER_FFT_SIZE = 32
+    DOPPLER_FFT_SIZE = 16     # Per sub-frame
    RANGE_BINS = 64
    CHIRPS_PER_FRAME = 32
    CHIRPS_PER_SUBFRAME = 16
-    def __init__(self, twiddle_file_32=None):
+    def __init__(self, twiddle_file_16=None):
        """
-        For 32-point FFT, we need the 32-point twiddle file.
+        For 16-point FFT, we need the 16-point twiddle file.
        If not provided, we generate twiddle factors mathematically
-        (since the 32-pt twiddle ROM is cos(2*pi*k/32) for k=0..7).
+        (cos(2*pi*k/16) for k=0..3, quarter-wave ROM with 4 entries).
        """
-        self.fft32 = None
+        self.fft16 = None
-        self._twiddle_file_32 = twiddle_file_32
+        self._twiddle_file_16 = twiddle_file_16
        # We'll use a simple 32-pt FFT with computed twiddles
    @staticmethod
    def window_multiply(data_16, window_16):
@@ -1134,7 +1133,7 @@ class DopplerProcessor:
    def process_frame(self, chirp_data_i, chirp_data_q):
        """
-        Process one complete Doppler frame.
+        Process one complete Doppler frame using dual 16-pt FFTs.
        Args:
            chirp_data_i: 2D array [32 chirps][64 range bins] of signed 16-bit I
@@ -1143,46 +1142,63 @@ class DopplerProcessor:
        Returns:
            (doppler_map_i, doppler_map_q): 2D arrays [64 range bins][32 doppler bins]
                                            of signed 16-bit
                                            Bins 0-15 = sub-frame 0 (long PRI)
                                            Bins 16-31 = sub-frame 1 (short PRI)
        """
        doppler_map_i = []
        doppler_map_q = []
-        # Generate 32-pt twiddle factors (quarter-wave cos, 8 entries)
+        # Generate 16-pt twiddle factors (quarter-wave cos, 4 entries)
-        # cos(2*pi*k/32) for k=0..7
+        # cos(2*pi*k/16) for k=0..3
        # Matches fft_twiddle_16.mem: 7FFF, 7641, 5A82, 30FB
        import math
-        cos_rom_32 = []
+        cos_rom_16 = []
-        for k in range(8):
+        for k in range(4):
-            val = round(32767.0 * math.cos(2.0 * math.pi * k / 32.0))
+            val = round(32767.0 * math.cos(2.0 * math.pi * k / 16.0))
-            cos_rom_32.append(sign_extend(val & 0xFFFF, 16))
+            cos_rom_16.append(sign_extend(val & 0xFFFF, 16))
-        fft32 = FFTEngine.__new__(FFTEngine)
+        fft16 = FFTEngine.__new__(FFTEngine)
-        fft32.N = 32
+        fft16.N = 16
-        fft32.LOG2N = 5
+        fft16.LOG2N = 4
-        fft32.cos_rom = cos_rom_32
+        fft16.cos_rom = cos_rom_16
-        fft32.mem_re = [0] * 32
+        fft16.mem_re = [0] * 16
-        fft32.mem_im = [0] * 32
+        fft16.mem_im = [0] * 16
        for rbin in range(self.RANGE_BINS):
-            # Gather 32 chirps for this range bin
+            # Output bins for this range bin: 32 total (16 from each sub-frame)
-            fft_in_re = []
+            out_re = [0] * 32
-            fft_in_im = []
+            out_im = [0] * 32
-            for chirp in range(self.CHIRPS_PER_FRAME):
+            # Process each sub-frame independently
-                re_val = sign_extend(chirp_data_i[chirp][rbin] & 0xFFFF, 16)
+            for sf in range(2):
-                im_val = sign_extend(chirp_data_q[chirp][rbin] & 0xFFFF, 16)
+                chirp_start = sf * self.CHIRPS_PER_SUBFRAME
                bin_offset = sf * self.DOPPLER_FFT_SIZE
-                # Apply Hamming window
+                fft_in_re = []
-                win_re = self.window_multiply(re_val, HAMMING_WINDOW[chirp])
+                fft_in_im = []
                win_im = self.window_multiply(im_val, HAMMING_WINDOW[chirp])
-                fft_in_re.append(win_re)
+                for c in range(self.CHIRPS_PER_SUBFRAME):
-                fft_in_im.append(win_im)
+                    chirp = chirp_start + c
                    re_val = sign_extend(chirp_data_i[chirp][rbin] & 0xFFFF, 16)
                    im_val = sign_extend(chirp_data_q[chirp][rbin] & 0xFFFF, 16)
-            # 32-point forward FFT
+                    # Apply 16-pt Hamming window (index = c within sub-frame)
-            fft_out_re, fft_out_im = fft32.compute(fft_in_re, fft_in_im, inverse=False)
+                    win_re = self.window_multiply(re_val, HAMMING_WINDOW[c])
                    win_im = self.window_multiply(im_val, HAMMING_WINDOW[c])
-            doppler_map_i.append(fft_out_re)
+                    fft_in_re.append(win_re)
-            doppler_map_q.append(fft_out_im)
+                    fft_in_im.append(win_im)
                # 16-point forward FFT
                fft_out_re, fft_out_im = fft16.compute(fft_in_re, fft_in_im, inverse=False)
                # Pack into output: sub-frame 0 -> bins 0-15, sub-frame 1 -> bins 16-31
                for b in range(self.DOPPLER_FFT_SIZE):
                    out_re[bin_offset + b] = fft_out_re[b]
                    out_im[bin_offset + b] = fft_out_im[b]
            doppler_map_i.append(out_re)
            doppler_map_q.append(out_im)
        return doppler_map_i, doppler_map_q
@@ -1207,7 +1223,7 @@ class SignalChain:
    IF_FREQ = 120_000_000    # IF frequency
    FTW_120MHZ = 0x4CCCCCCD  # Phase increment for 120 MHz at 400 MSPS
-    def __init__(self, twiddle_file_1024=None, twiddle_file_32=None):
+    def __init__(self, twiddle_file_1024=None, twiddle_file_16=None):
        self.nco = NCO()
        self.mixer = Mixer()
        self.cic_i = CICDecimator()
@@ -1217,7 +1233,7 @@ class SignalChain:
        self.ddc_interface = DDCInputInterface()
        self.matched_filter = MatchedFilterChain(fft_size=1024, twiddle_file=twiddle_file_1024)
        self.range_decimator = RangeBinDecimator()
-        self.doppler = DopplerProcessor(twiddle_file_32=twiddle_file_32)
+        self.doppler = DopplerProcessor(twiddle_file_16=twiddle_file_16)
    def ddc_step(self, adc_data_8bit, ftw=None):
        """
@@ -3,23 +3,17 @@
 Generate Doppler processor co-simulation golden reference data.
 Uses the bit-accurate Python model (fpga_model.py) to compute the expected
-Doppler FFT output. Also generates the input hex files consumed by the
+Doppler FFT output for the dual 16-pt FFT architecture.  Also generates the
-Verilog testbench (tb_doppler_cosim.v).
+input hex files consumed by the Verilog testbench (tb_doppler_cosim.v).
-Two output modes:
+Architecture:
-  1. "clean" — straight Python model (correct windowing alignment)
+  Sub-frame 0 (long PRI):  chirps 0-15  -> 16-pt Hamming -> 16-pt FFT -> bins 0-15
-  2. "buggy" — replicates the RTL's windowing pipeline misalignment:
+  Sub-frame 1 (short PRI): chirps 16-31 -> 16-pt Hamming -> 16-pt FFT -> bins 16-31
     * Sample 0: fft_input = 0 (from reset mult value)
     * Sample 1: fft_input = window_multiply(data[wrong_rbin_or_0], window[0])
     * Sample k (k>=2): fft_input = window_multiply(data[k-2], window[k-1])
 Default mode is "clean".  The comparison script uses correlation-based
 metrics that are tolerant of the pipeline shift.
 Usage:
    cd ~/PLFM_RADAR/9_Firmware/9_2_FPGA/tb/cosim
-    python3 gen_doppler_golden.py            # clean model
+    python3 gen_doppler_golden.py
-    python3 gen_doppler_golden.py --buggy    # replicate RTL pipeline bug
+    python3 gen_doppler_golden.py stationary   # single scenario
 Author: Phase 0.5 Doppler co-simulation suite for PLFM_RADAR
 """
@@ -31,7 +25,7 @@ import sys
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 from fpga_model import (
-    DopplerProcessor, FFTEngine, sign_extend, HAMMING_WINDOW
+    DopplerProcessor, sign_extend, HAMMING_WINDOW
 )
 from radar_scene import Target, generate_doppler_frame
@@ -40,7 +34,8 @@ from radar_scene import Target, generate_doppler_frame
 # Constants
 # =============================================================================
-DOPPLER_FFT_SIZE = 32
+DOPPLER_FFT_SIZE = 16     # Per sub-frame
 DOPPLER_TOTAL_BINS = 32   # Total output (2 sub-frames x 16)
 RANGE_BINS = 64
 CHIRPS_PER_FRAME = 32
 TOTAL_SAMPLES = CHIRPS_PER_FRAME * RANGE_BINS  # 2048
@@ -82,154 +77,6 @@ def write_hex_16bit(filepath, data):
 # Buggy-model helpers  (match RTL pipeline misalignment)
 # =============================================================================
 def window_multiply(data_16, window_16):
    """Scale one 16-bit sample by one 16-bit window coefficient, RTL-style.

    Both operands are masked to 16 bits and passed through sign_extend, then
    multiplied.  The product is rounded by adding 2**14, shifted right by 15,
    and the low 16 bits are passed through sign_extend again for the result.
    """
    sample = sign_extend(data_16 & 0xFFFF, 16)
    coeff = sign_extend(window_16 & 0xFFFF, 16)
    # Round-half-up before dropping the 15 fractional bits.
    scaled = (sample * coeff + (1 << 14)) >> 15
    return sign_extend(scaled & 0xFFFF, 16)
 def buggy_process_frame(chirp_data_i, chirp_data_q):
    """
    Replicate the RTL's exact windowing pipeline for all 64 range bins.
    For each range bin we model the three-stage pipeline:
      Stage A (BRAM registered read):
        mem_rdata captures doppler_i_mem[mem_read_addr] one cycle AFTER
        mem_read_addr is presented.
      Stage B (multiply):
        mult_i <= mem_rdata_i * window_coeff[read_doppler_index]
        -- read_doppler_index is the CURRENT cycle's value, but mem_rdata_i
        -- is from the PREVIOUS cycle's address.
      Stage C (round+shift):
        fft_input_i <= (mult_i + (1<<14)) >>> 15
        -- uses the PREVIOUS cycle's mult_i.
    Additionally, at the S_ACCUMULATE->S_LOAD_FFT transition (rbin=0) or
    S_OUTPUT->S_LOAD_FFT transition (rbin>0), the BRAM address during the
    transition cycle depends on the stale read_doppler_index and read_range_bin
    values.
    This function models every detail to produce bit-exact FFT inputs.

    Args:
        chirp_data_i: In-phase samples indexed as [chirp][range_bin]; each
            value is masked to 16 bits and sign-extended before use.
        chirp_data_q: Quadrature samples, same layout as chirp_data_i.

    Returns:
        (doppler_map_i, doppler_map_q): two lists with one entry per range
        bin; each entry is the real / imaginary output list returned by the
        hand-built 32-point FFT engine for that range bin.
    """
    # Build the 32-pt FFT engine (matching fpga_model.py)
    import math as _math
    # Cosine table: entries k = 0..7 of cos(2*pi*k/32), scaled by 32767 and
    # stored as signed 16-bit values.
    cos_rom_32 = []
    for k in range(8):
        val = round(32767.0 * _math.cos(2.0 * _math.pi * k / 32.0))
        cos_rom_32.append(sign_extend(val & 0xFFFF, 16))
    # Bypass FFTEngine.__init__ and populate the engine's fields by hand.
    fft32 = FFTEngine.__new__(FFTEngine)
    fft32.N = 32
    fft32.LOG2N = 5
    fft32.cos_rom = cos_rom_32
    fft32.mem_re = [0] * 32
    fft32.mem_im = [0] * 32
    # Build flat BRAM contents: addr = chirp_index * 64 + range_bin
    bram_i = [0] * TOTAL_SAMPLES
    bram_q = [0] * TOTAL_SAMPLES
    for chirp in range(CHIRPS_PER_FRAME):
        for rb in range(RANGE_BINS):
            addr = chirp * RANGE_BINS + rb
            bram_i[addr] = sign_extend(chirp_data_i[chirp][rb] & 0xFFFF, 16)
            bram_q[addr] = sign_extend(chirp_data_q[chirp][rb] & 0xFFFF, 16)
    doppler_map_i = []
    doppler_map_q = []
    # State carried across range bins (simulates the RTL registers)
    # After reset: read_doppler_index=0, read_range_bin=0, mult_i=0, mult_q=0,
    # fft_input_i=0, fft_input_q=0
    # The BRAM read is always active: mem_rdata <= doppler_i_mem[mem_read_addr]
    # mem_read_addr = read_doppler_index * 64 + read_range_bin
    # We need to track what read_doppler_index and read_range_bin are at each
    # transition, since the BRAM captures data one cycle before S_LOAD_FFT runs.
    # Before processing starts (just entered S_LOAD_FFT from S_ACCUMULATE):
    # At the S_ACCUMULATE clock that transitions:
    #   read_doppler_index <= 0 (NBA)
    #   read_range_bin <= 0 (NBA)
    # These take effect NEXT cycle. At the transition clock itself,
    # read_doppler_index and read_range_bin still had their old values.
    # From reset, both were 0. So BRAM captures addr=0*64+0=0.
    #
    # For rbin>0 transitions from S_OUTPUT:
    #   At S_OUTPUT clock:
    #     read_doppler_index <= 0  (was 0, since it wrapped from 32->0 in 5 bits)
    #     read_range_bin <= prev_rbin + 1 (NBA, takes effect next cycle)
    #   At S_OUTPUT clock, the current read_range_bin = prev_rbin,
    #   read_doppler_index = 0 (wrapped). So BRAM captures addr=0*64+prev_rbin.
    for rbin in range(RANGE_BINS):
        # Determine what BRAM data was captured during the transition clock
        # (one cycle before S_LOAD_FFT's first execution cycle).
        if rbin == 0:
            # From S_ACCUMULATE: both indices were 0 (from reset or previous NBA)
            # BRAM captures addr = 0*64+0 = 0  -> data[chirp=0][rbin=0]
            transition_bram_addr = 0 * RANGE_BINS + 0
        else:
            # From S_OUTPUT: read_doppler_index=0 (wrapped), read_range_bin=rbin-1
            # BRAM captures addr = 0*64+(rbin-1) -> data[chirp=0][rbin-1]
            transition_bram_addr = 0 * RANGE_BINS + (rbin - 1)
        transition_data_i = bram_i[transition_bram_addr]
        transition_data_q = bram_q[transition_bram_addr]
        # Now simulate the 32 cycles of S_LOAD_FFT for this range bin.
        # Register pipeline state at entry:
        mult_i_reg = 0  # From reset (rbin=0) or from end of previous S_FFT_WAIT
        mult_q_reg = 0
        fft_in_i_list = []
        fft_in_q_list = []
        # NOTE(review): the engine above is hard-wired to N=32, while this
        # loop emits DOPPLER_FFT_SIZE samples; the two only agree when the
        # module-level DOPPLER_FFT_SIZE is 32 -- confirm before reuse.
        for k in range(DOPPLER_FFT_SIZE):
            # read_doppler_index = k at this cycle's start
            # mem_read_addr = k * 64 + rbin
            # What mem_rdata holds THIS cycle:
            if k == 0:
                # BRAM captured transition_bram_addr last cycle
                rd_i = transition_data_i
                rd_q = transition_data_q
            else:
                # BRAM captured addr from PREVIOUS cycle: (k-1)*64 + rbin
                prev_addr = (k - 1) * RANGE_BINS + rbin
                rd_i = bram_i[prev_addr]
                rd_q = bram_q[prev_addr]
            # Stage B: multiply (uses current read_doppler_index = k)
            new_mult_i = sign_extend(rd_i & 0xFFFF, 16) * \
                         sign_extend(HAMMING_WINDOW[k] & 0xFFFF, 16)
            new_mult_q = sign_extend(rd_q & 0xFFFF, 16) * \
                         sign_extend(HAMMING_WINDOW[k] & 0xFFFF, 16)
            # Stage C: round+shift (uses PREVIOUS cycle's mult)
            # The value appended for cycle k therefore comes from the product
            # formed in cycle k-1 (or the initial 0 when k == 0).
            fft_i = (mult_i_reg + (1 << 14)) >> 15
            fft_q = (mult_q_reg + (1 << 14)) >> 15
            fft_in_i_list.append(sign_extend(fft_i & 0xFFFF, 16))
            fft_in_q_list.append(sign_extend(fft_q & 0xFFFF, 16))
            # Update pipeline registers for next cycle
            mult_i_reg = new_mult_i
            mult_q_reg = new_mult_q
        # 32-point FFT
        fft_out_re, fft_out_im = fft32.compute(
            fft_in_i_list, fft_in_q_list, inverse=False
        )
        doppler_map_i.append(fft_out_re)
        doppler_map_q.append(fft_out_im)
    return doppler_map_i, doppler_map_q
 # =============================================================================
 # Test scenario definitions
@@ -244,9 +91,10 @@ def make_scenario_stationary():
 def make_scenario_moving():
    """Single target with moderate Doppler shift."""
    # v = 15 m/s → fd = 2*v*fc/c ≈ 1050 Hz
-    # PRI = 167 us → Doppler bin = fd * N_chirps * PRI = 1050 * 32 * 167e-6 ≈ 5.6
+    # Long PRI = 167 us → sub-frame 0 bin = fd * 16 * 167e-6 ≈ 2.8 → bin ~3
    # Short PRI = 175 us → sub-frame 1 bin = fd * 16 * 175e-6 ≈ 2.9 → bin 16+3 = 19
    targets = [Target(range_m=500, velocity_mps=15.0, rcs_dbsm=20.0)]
-    return targets, "Single moving target v=15m/s (~1050Hz Doppler, bin~5-6)"
+    return targets, "Single moving target v=15m/s (~1050Hz Doppler, sf0 bin~3, sf1 bin~19)"
 def make_scenario_two_targets():
@@ -269,12 +117,11 @@ SCENARIOS = {
 # Main generator
 # =============================================================================
-def generate_scenario(name, targets, description, base_dir, use_buggy_model=False):
+def generate_scenario(name, targets, description, base_dir):
    """Generate input hex + golden output for one scenario."""
    print(f"\n{'='*60}")
    print(f"Scenario: {name} — {description}")
-    model_label = "BUGGY (RTL pipeline)" if use_buggy_model else "CLEAN"
+    print(f"Model: CLEAN (dual 16-pt FFT)")
    print(f"Model: {model_label}")
    print(f"{'='*60}")
    # Generate Doppler frame (32 chirps x 64 range bins)
@@ -292,26 +139,24 @@ def generate_scenario(name, targets, description, base_dir, use_buggy_model=Fals
    input_hex = os.path.join(base_dir, f"doppler_input_{name}.hex")
    write_hex_32bit(input_hex, packed_samples)
-    # ---- Run through Python model ----
+    # ---- Run through Python model (dual 16-pt FFT) ----
-    if use_buggy_model:
+    dp = DopplerProcessor()
-        doppler_i, doppler_q = buggy_process_frame(frame_i, frame_q)
+    doppler_i, doppler_q = dp.process_frame(frame_i, frame_q)
    else:
        dp = DopplerProcessor()
        doppler_i, doppler_q = dp.process_frame(frame_i, frame_q)
    print(f"  Doppler output: {len(doppler_i)} range bins x "
-          f"{len(doppler_i[0])} doppler bins")
+          f"{len(doppler_i[0])} doppler bins (2 sub-frames x {DOPPLER_FFT_SIZE})")
    # ---- Write golden output CSV ----
    # Format: range_bin, doppler_bin, out_i, out_q
    # Ordered same as RTL output: all doppler bins for rbin 0, then rbin 1, ...
    # Bins 0-15 = sub-frame 0 (long PRI), bins 16-31 = sub-frame 1 (short PRI)
    flat_rbin = []
    flat_dbin = []
    flat_i = []
    flat_q = []
    for rbin in range(RANGE_BINS):
-        for dbin in range(DOPPLER_FFT_SIZE):
+        for dbin in range(DOPPLER_TOTAL_BINS):
            flat_rbin.append(rbin)
            flat_dbin.append(dbin)
            flat_i.append(doppler_i[rbin][dbin])
@@ -331,8 +176,8 @@ def generate_scenario(name, targets, description, base_dir, use_buggy_model=Fals
    peak_info = []
    for rbin in range(RANGE_BINS):
        mags = [abs(doppler_i[rbin][d]) + abs(doppler_q[rbin][d])
-                for d in range(DOPPLER_FFT_SIZE)]
+                for d in range(DOPPLER_TOTAL_BINS)]
-        peak_dbin = max(range(DOPPLER_FFT_SIZE), key=lambda d: mags[d])
+        peak_dbin = max(range(DOPPLER_TOTAL_BINS), key=lambda d: mags[d])
        peak_mag = mags[peak_dbin]
        peak_info.append((rbin, peak_dbin, peak_mag))
@@ -341,33 +186,14 @@ def generate_scenario(name, targets, description, base_dir, use_buggy_model=Fals
    for rbin, dbin, mag in peak_info[:5]:
        i_val = doppler_i[rbin][dbin]
        q_val = doppler_q[rbin][dbin]
-        print(f"    rbin={rbin:2d}, dbin={dbin:2d}, mag={mag:6d}, "
+        sf = dbin // DOPPLER_FFT_SIZE
        bin_in_sf = dbin % DOPPLER_FFT_SIZE
        print(f"    rbin={rbin:2d}, dbin={dbin:2d} (sf{sf}:{bin_in_sf:2d}), mag={mag:6d}, "
              f"I={i_val:6d}, Q={q_val:6d}")
    # ---- Write frame data for debugging ----
    # Also write per-range-bin FFT input (for debugging pipeline alignment)
    if use_buggy_model:
        # Write the buggy FFT inputs for debugging
        debug_csv = os.path.join(base_dir, f"doppler_fft_inputs_{name}.csv")
        # Regenerate to capture FFT inputs
        dp_debug = DopplerProcessor()
        clean_i, clean_q = dp_debug.process_frame(frame_i, frame_q)
        # Show the difference between clean and buggy
        print(f"\n  Comparing clean vs buggy model outputs:")
        mismatches = 0
        for rbin in range(RANGE_BINS):
            for dbin in range(DOPPLER_FFT_SIZE):
                if (doppler_i[rbin][dbin] != clean_i[rbin][dbin] or
                    doppler_q[rbin][dbin] != clean_q[rbin][dbin]):
                    mismatches += 1
        total = RANGE_BINS * DOPPLER_FFT_SIZE
        print(f"    {mismatches}/{total} output samples differ "
              f"({100*mismatches/total:.1f}%)")
    return {
        'name': name,
        'description': description,
        'model': 'buggy' if use_buggy_model else 'clean',
        'peak_info': peak_info[:5],
    }
@@ -375,11 +201,9 @@ def generate_scenario(name, targets, description, base_dir, use_buggy_model=Fals
 def main():
    base_dir = os.path.dirname(os.path.abspath(__file__))
    use_buggy = '--buggy' in sys.argv
    print("=" * 60)
    print("Doppler Processor Co-Sim Golden Reference Generator")
-    print(f"Model: {'BUGGY (RTL pipeline replication)' if use_buggy else 'CLEAN'}")
+    print(f"Architecture: dual {DOPPLER_FFT_SIZE}-pt FFT ({DOPPLER_TOTAL_BINS} total bins)")
    print("=" * 60)
    scenarios_to_run = list(SCENARIOS.keys())
@@ -395,15 +219,14 @@ def main():
    results = []
    for name in scenarios_to_run:
        targets, description = SCENARIOS[name]()
-        r = generate_scenario(name, targets, description, base_dir,
+        r = generate_scenario(name, targets, description, base_dir)
                              use_buggy_model=use_buggy)
        results.append(r)
    print(f"\n{'='*60}")
    print("Summary:")
    print(f"{'='*60}")
    for r in results:
-        print(f"  {r['name']:<15s} [{r['model']}] top peak: "
+        print(f"  {r['name']:<15s} top peak: "
              f"rbin={r['peak_info'][0][0]}, dbin={r['peak_info'][0][1]}, "
              f"mag={r['peak_info'][0][2]}")
@@ -48,19 +48,24 @@ ADC_BITS = 8              # ADC resolution
 T_LONG_CHIRP = 30e-6      # 30 us long chirp duration
 T_SHORT_CHIRP = 0.5e-6    # 0.5 us short chirp
 T_LISTEN_LONG = 137e-6    # 137 us listening window
 T_PRI_LONG = 167e-6       # 30 us chirp + 137 us listen
 T_PRI_SHORT = 175e-6      # staggered short-PRI sub-frame
 N_SAMPLES_LISTEN = int(T_LISTEN_LONG * FS_ADC)  # 54800 samples
 # Processing chain
 CIC_DECIMATION = 4
 FFT_SIZE = 1024
 RANGE_BINS = 64
-DOPPLER_FFT_SIZE = 32
+DOPPLER_FFT_SIZE = 16      # Per sub-frame
 DOPPLER_TOTAL_BINS = 32    # Total output bins (2 sub-frames x 16)
 CHIRPS_PER_SUBFRAME = 16
 CHIRPS_PER_FRAME = 32
 # Derived
 RANGE_RESOLUTION = C_LIGHT / (2 * CHIRP_BW)  # 7.5 m
 MAX_UNAMBIGUOUS_RANGE = C_LIGHT * T_LISTEN_LONG / 2  # ~20.55 km
-VELOCITY_RESOLUTION = WAVELENGTH / (2 * CHIRPS_PER_FRAME * T_LONG_CHIRP)
+VELOCITY_RESOLUTION_LONG = WAVELENGTH / (2 * CHIRPS_PER_SUBFRAME * T_PRI_LONG)
 VELOCITY_RESOLUTION_SHORT = WAVELENGTH / (2 * CHIRPS_PER_SUBFRAME * T_PRI_SHORT)
 # Short chirp LUT (60 entries, 8-bit unsigned)
 SHORT_CHIRP_LUT = [
@@ -384,9 +389,6 @@ def generate_doppler_frame(targets, n_chirps=CHIRPS_PER_FRAME,
                break
        return math.sqrt(-2.0 * math.log(u1)) * math.cos(2.0 * math.pi * u2)
    # Chirp repetition interval (PRI)
    t_pri = T_LONG_CHIRP + T_LISTEN_LONG  # ~167 us
    frame_i = []
    frame_q = []
@@ -408,8 +410,16 @@ def generate_doppler_frame(targets, n_chirps=CHIRPS_PER_FRAME,
            # Amplitude (simplified)
            amp = target.amplitude / 4.0
-            # Doppler phase for this chirp
+            # Doppler phase for this chirp.
-            doppler_phase = 2 * math.pi * target.doppler_hz * chirp_idx * t_pri
+            # The frame uses staggered PRF: chirps 0-15 use the long PRI,
            # chirps 16-31 use the short PRI.
            if chirp_idx < CHIRPS_PER_SUBFRAME:
                slow_time_s = chirp_idx * T_PRI_LONG
            else:
                slow_time_s = (CHIRPS_PER_SUBFRAME * T_PRI_LONG) + \
                              ((chirp_idx - CHIRPS_PER_SUBFRAME) * T_PRI_SHORT)
            doppler_phase = 2 * math.pi * target.doppler_hz * slow_time_s
            total_phase = doppler_phase + target.phase_deg * math.pi / 180.0
            # Spread across a few bins (sinc-like response from matched filter)
@@ -91,6 +91,7 @@ doppler_processor_optimized dut (
    .doppler_valid(doppler_valid),
    .doppler_bin(doppler_bin),
    .range_bin(range_bin),
    .sub_frame(),                   // Not used in this testbench
    .processing_active(processing_active),
    .frame_complete(frame_complete),
    .status(dut_status)
@@ -75,6 +75,7 @@ doppler_processor_optimized dut (
    .doppler_valid(doppler_valid),
    .doppler_bin(doppler_bin),
    .range_bin(range_bin),
    .sub_frame(),                   // Not used in this testbench
    .processing_active(processing_active),
    .frame_complete(frame_complete),
    .status(dut_status)
@@ -0,0 +1,252 @@
 `timescale 1ns / 1ps
 // ============================================================================
 // xfft_16.v — 16-point FFT with AXI-Stream interface
 // ============================================================================
 // Wraps the synthesizable fft_engine (radix-2 DIT) with the AXI-Stream port
 // interface expected by the doppler_processor dual-FFT architecture.
 //
 // Identical interface to xfft_32.v but with N=16.
 //
 // Data format: {Q[15:0], I[15:0]} packed 32-bit.
 // Config tdata[0]: 1 = forward FFT, 0 = inverse FFT.
 // ============================================================================
 // xfft_16: buffers one N-sample frame from the AXI-Stream data input,
 // replays it into fft_engine, captures the engine's outputs into a second
 // buffer, and streams them out on the AXI-Stream master port.
 //
 // Frame flow (see MAIN FSM):
 //   S_IDLE   - wait for a config beat, latch the transform direction
 //   S_FEED   - store N input beats in in_buf_*, then pulse fft_start
 //   S_WAIT   - feed in_buf_* to the engine, collect outputs until fft_done
 //   S_OUTPUT - stream N beats from out_buf_*, then return to S_IDLE
 //
 // NOTE(review): there is no s_axis_data_tready port, so the data source
 // gets no backpressure; beats are only stored while state == S_FEED and
 // in_count < N. Confirm the upstream drives tvalid only in that window.
 module xfft_16 (
    input  wire        aclk,
    input  wire        aresetn,
    // Configuration channel (AXI-Stream slave)
    input  wire [7:0]  s_axis_config_tdata,
    input  wire        s_axis_config_tvalid,
    output wire        s_axis_config_tready,
    // Data input channel (AXI-Stream slave)
    input  wire [31:0] s_axis_data_tdata,
    input  wire        s_axis_data_tvalid,
    // tlast is not referenced inside this module; the frame length is
    // counted with in_count instead.
    input  wire        s_axis_data_tlast,
    // Data output channel (AXI-Stream master)
    output wire [31:0] m_axis_data_tdata,
    output wire        m_axis_data_tvalid,
    output wire        m_axis_data_tlast,
    input  wire        m_axis_data_tready
 );
 // ============================================================================
 // PARAMETERS
 // ============================================================================
 localparam N     = 16;
 localparam LOG2N = 4;
 // ============================================================================
 // INTERNAL SIGNALS
 // ============================================================================
 // FSM states
 // S_CONFIG is declared but no transition in this module ever enters it;
 // the FSM's default branch would send it back to S_IDLE.
 localparam [2:0] S_IDLE    = 3'd0,
                 S_CONFIG  = 3'd1,
                 S_FEED    = 3'd2,
                 S_WAIT    = 3'd3,
                 S_OUTPUT  = 3'd4;
 reg [2:0] state;
 // Configuration
 // Latched from the config beat: 1 selects the inverse transform.
 reg inverse_reg;
 // Input buffering
 // Counters are 5 bits (one more than the 4-bit buffer index) so they can
 // reach the terminal value N.
 reg signed [15:0] in_buf_re [0:N-1];
 reg signed [15:0] in_buf_im [0:N-1];
 reg [4:0] in_count;
 // Output buffering
 reg signed [15:0] out_buf_re [0:N-1];
 reg signed [15:0] out_buf_im [0:N-1];
 reg [4:0] out_count;   // read pointer used while streaming in S_OUTPUT
 reg [4:0] out_total;   // number of engine outputs captured in S_WAIT
 // FFT engine interface
 reg fft_start;
 reg fft_inverse;
 reg signed [15:0] fft_din_re, fft_din_im;
 reg fft_din_valid;
 wire signed [15:0] fft_dout_re, fft_dout_im;
 wire fft_dout_valid;
 wire fft_busy;   // connected to the engine but not read by this wrapper
 wire fft_done;
 // Feed counter
 reg [4:0] feed_count;
 // ============================================================================
 // FFT ENGINE INSTANCE
 // ============================================================================
 fft_engine #(
    .N(N),
    .LOG2N(LOG2N),
    .DATA_W(16),
    .INTERNAL_W(32),
    .TWIDDLE_W(16),
    .TWIDDLE_FILE("fft_twiddle_16.mem")
 ) fft_core (
    .clk(aclk),
    .reset_n(aresetn),
    .start(fft_start),
    .inverse(fft_inverse),
    .din_re(fft_din_re),
    .din_im(fft_din_im),
    .din_valid(fft_din_valid),
    .dout_re(fft_dout_re),
    .dout_im(fft_dout_im),
    .dout_valid(fft_dout_valid),
    .busy(fft_busy),
    .done(fft_done)
 );
 // ============================================================================
 // AXI-STREAM OUTPUTS
 // ============================================================================
 // A config beat is only accepted while idle.
 assign s_axis_config_tready = (state == S_IDLE);
 // Output word is {imaginary, real}, read combinationally from the output
 // buffer at the low 4 bits of out_count.
 assign m_axis_data_tdata  = {out_buf_im[out_count[3:0]], out_buf_re[out_count[3:0]]};
 assign m_axis_data_tvalid = (state == S_OUTPUT) && (out_count < N);
 assign m_axis_data_tlast  = (state == S_OUTPUT) && (out_count == N - 1);
 // ============================================================================
 // BUFFER WRITE LOGIC — separate always block, NO async reset
 // ============================================================================
 // The write enable / address / data below are registers driven by the main
 // FSM, so each buffer write lands one clock after the FSM captures the beat.
 // NOTE(review): keeping the arrays out of the async-reset block is
 // presumably to let them map to RAM primitives — confirm with synthesis.
 reg in_buf_we;
 reg [3:0] in_buf_waddr;
 reg signed [15:0] in_buf_wdata_re, in_buf_wdata_im;
 reg out_buf_we;
 reg [3:0] out_buf_waddr;
 reg signed [15:0] out_buf_wdata_re, out_buf_wdata_im;
 always @(posedge aclk) begin
    if (in_buf_we) begin
        in_buf_re[in_buf_waddr] <= in_buf_wdata_re;
        in_buf_im[in_buf_waddr] <= in_buf_wdata_im;
    end
    if (out_buf_we) begin
        out_buf_re[out_buf_waddr] <= out_buf_wdata_re;
        out_buf_im[out_buf_waddr] <= out_buf_wdata_im;
    end
 end
 // ============================================================================
 // MAIN FSM
 // ============================================================================
 always @(posedge aclk or negedge aresetn) begin
    if (!aresetn) begin
        state        <= S_IDLE;
        inverse_reg  <= 1'b0;
        in_count     <= 0;
        out_count    <= 0;
        out_total    <= 0;
        feed_count   <= 0;
        fft_start    <= 1'b0;
        fft_inverse  <= 1'b0;
        fft_din_re   <= 0;
        fft_din_im   <= 0;
        fft_din_valid <= 1'b0;
        in_buf_we    <= 1'b0;
        in_buf_waddr <= 0;
        in_buf_wdata_re <= 0;
        in_buf_wdata_im <= 0;
        out_buf_we   <= 1'b0;
        out_buf_waddr <= 0;
        out_buf_wdata_re <= 0;
        out_buf_wdata_im <= 0;
    end else begin
        // Defaults: these strobes are single-cycle pulses unless a state
        // below re-asserts them in the same clock.
        fft_start     <= 1'b0;
        fft_din_valid <= 1'b0;
        in_buf_we     <= 1'b0;
        out_buf_we    <= 1'b0;
        case (state)
        S_IDLE: begin
            in_count <= 0;
            if (s_axis_config_tvalid) begin
                // tdata[0] = 1 means forward, so the inverse flag is its
                // complement.
                inverse_reg <= ~s_axis_config_tdata[0];
                state       <= S_FEED;
                in_count    <= 0;
                feed_count  <= 0;
            end
        end
        S_FEED: begin
            if (in_count < N) begin
                // Capture one beat per valid cycle: low half is the real
                // part, high half is the imaginary part.
                if (s_axis_data_tvalid) begin
                    in_buf_we       <= 1'b1;
                    in_buf_waddr    <= in_count[3:0];
                    in_buf_wdata_re <= s_axis_data_tdata[15:0];
                    in_buf_wdata_im <= s_axis_data_tdata[31:16];
                    in_count <= in_count + 1;
                end
            end else if (feed_count == 0) begin
                // All N beats accepted. feed_count was cleared on leaving
                // S_IDLE, so this branch fires on the first cycle here.
                fft_start   <= 1'b1;
                fft_inverse <= inverse_reg;
                feed_count  <= 0;
                state       <= S_WAIT;
                out_total   <= 0;
            end
        end
        S_WAIT: begin
            // Replay the buffered frame into the engine, one sample per clock.
            if (feed_count < N) begin
                fft_din_re   <= in_buf_re[feed_count[3:0]];
                fft_din_im   <= in_buf_im[feed_count[3:0]];
                fft_din_valid <= 1'b1;
                feed_count   <= feed_count + 1;
            end
            // Capture engine outputs in arrival order, at most N of them.
            if (fft_dout_valid && out_total < N) begin
                out_buf_we       <= 1'b1;
                out_buf_waddr    <= out_total[3:0];
                out_buf_wdata_re <= fft_dout_re;
                out_buf_wdata_im <= fft_dout_im;
                out_total <= out_total + 1;
            end
            if (fft_done) begin
                state     <= S_OUTPUT;
                out_count <= 0;
            end
        end
        S_OUTPUT: begin
            // Both updates below additionally require tready, so the read
            // pointer and state only move on an accepted beat.
            if (m_axis_data_tready || !m_axis_data_tvalid) begin
                if (out_count < N) begin
                    if (m_axis_data_tready) begin
                        out_count <= out_count + 1;
                    end
                end
                // Last beat (index N-1) accepted: frame finished.
                if (out_count >= N - 1 && m_axis_data_tready) begin
                    state <= S_IDLE;
                end
            end
        end
        default: state <= S_IDLE;
        endcase
    end
 end
 // ============================================================================
 // MEMORY INIT (simulation only)
 // ============================================================================
 // Zero both buffers at time 0 when SIMULATION is defined; the arrays have
 // no reset of their own.
 `ifdef SIMULATION
 integer init_k;
 initial begin
    for (init_k = 0; init_k < N; init_k = init_k + 1) begin
        in_buf_re[init_k]  = 0;
        in_buf_im[init_k]  = 0;
        out_buf_re[init_k] = 0;
        out_buf_im[init_k] = 0;
    end
 end
 `endif
 endmodule