feat: 2048-pt FFT upgrade with decimation=4, 512 output bins, 6m spacing

Complete cross-layer upgrade from 1024-pt/64-bin to 2048-pt/512-bin FFT:

FPGA RTL (14+ modules):
- radar_params.vh: FFT_SIZE=2048, RANGE_BINS=512, 9-bit range, 6-bit stream
- fft_engine.v: 2048-pt FFT with XPM BRAM
- chirp_memory_loader_param.v: 2 segments x 2048 (was 4 x 1024)
- matched_filter_multi_segment.v: BRAM inference for overlap_cache, explicit ov_waddr
- mti_canceller.v: BRAM inference for prev_i/q arrays (was fabric FFs)
- doppler_processor.v: 16384-deep memory, 14-bit addressing
- cfar_ca.v: 512 rows, indentation fix
- radar_receiver_final.v: rising-edge detector for frame_complete, 11-bit sample_addr
- range_bin_decimator.v: 512 output bins
- usb_data_interface_ft2232h.v: bulk per-frame with Manhattan magnitude
- radar_mode_controller.v: XOR edge detector for toggle signals
- rx_gain_control.v: updated for new bin count

Python GUI + Protocol (8 files):
- radar_protocol.py: 512-bin bulk frame parser, LSB-first bitmap
- GUI_V65_Tk.py, v7/*.py: updated for 512 bins, 6m range resolution

Golden data + tests:
- All .hex/.csv/.npy golden references regenerated for 2048/512
- fft_twiddle_2048.mem added
- Deleted stale seg2/seg3 chirp mem files
- 9 new bulk frame cross-layer tests, deleted 6 stale per-sample tests
- Deleted stale tb_cross_layer_ft2232h.v and dead contract_parser functions
- Updated validate_mem_files.py for 2048/2-segment config

MCU: RadarSettings.cpp max_distance/map_size 1536->3072

All 4 CI jobs pass: 285 tests, 0 failures, 0 skips
2026-04-16 17:27:55 +05:45
parent affa40a9d3
commit e9705e40b7
178 changed files with 687738 additions and 122880 deletions
@@ -34,8 +34,8 @@ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 # =============================================================================

 DOPPLER_FFT = 32
-RANGE_BINS = 64
-TOTAL_OUTPUTS = RANGE_BINS * DOPPLER_FFT  # 2048
+RANGE_BINS = 512
+TOTAL_OUTPUTS = RANGE_BINS * DOPPLER_FFT  # 16384
 SUBFRAME_SIZE = 16

 SCENARIOS = {
@@ -246,7 +246,7 @@ def compare_scenario(name, config, base_dir):
    # ---- Pass/Fail ----
    checks = []

-    checks.append(('RTL output count == 2048', count_ok))
+    checks.append(('RTL output count == 16384', count_ok))

    energy_ok = (ENERGY_RATIO_MIN < energy_ratio < ENERGY_RATIO_MAX)
    checks.append((f'Energy ratio in bounds '
@@ -36,7 +36,7 @@ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 # Configuration
 # =============================================================================

-FFT_SIZE = 1024
+FFT_SIZE = 2048

 SCENARIOS = {
    'chirp': {
@@ -243,7 +243,7 @@ def compare_scenario(scenario_name, config, base_dir):

    # Check 2: RTL produced expected sample count
    correct_count = len(rtl_i) == FFT_SIZE
-    checks.append(('Correct output count (1024)', correct_count))
+    checks.append(('Correct output count (2048)', correct_count))

    # Check 3: Energy ratio within generous bounds
    # Allow very wide range since twiddle differences cause large gain variation
@@ -709,15 +709,24 @@ class DDCInputInterface:
 # FFT Engine (2048-point radix-2 DIT, in-place, 32-bit internal)
 # =============================================================================

-def load_twiddle_rom(filepath=None):
+def load_twiddle_rom(filepath=None, n=2048):
    """
-    Load 256-entry quarter-wave cosine ROM from hex file.
-    Returns list of 256 signed 16-bit integers.
+    Load quarter-wave cosine ROM from hex file.
+    Returns list of N/4 signed 16-bit integers.
+
+    For N=2048: loads fft_twiddle_2048.mem (512 entries).
+    For N=1024: loads fft_twiddle_1024.mem (256 entries).
+    For N=16:   loads fft_twiddle_16.mem (4 entries).
    """
    if filepath is None:
        # Default path relative to this file
        base = os.path.dirname(os.path.abspath(__file__))
-        filepath = os.path.join(base, '..', '..', 'fft_twiddle_1024.mem')
+        if n == 2048:
+            filepath = os.path.join(base, '..', '..', 'fft_twiddle_2048.mem')
+        elif n == 16:
+            filepath = os.path.join(base, '..', '..', 'fft_twiddle_16.mem')
+        else:
+            filepath = os.path.join(base, '..', '..', 'fft_twiddle_1024.mem')

    values = []
    with open(filepath) as f:
@@ -759,17 +768,17 @@ class FFTEngine:
    """
    Bit-accurate model of fft_engine.v

-    1024-point radix-2 DIT FFT/IFFT.
+    2048-point radix-2 DIT FFT/IFFT.
    Internal: 32-bit signed working data.
    Twiddle: 16-bit Q15 from quarter-wave cosine ROM.
    Butterfly: multiply 32x16->49 bits, >>>15, add/subtract.
    Output: saturate 32->16 bits. IFFT also >>>LOG2N before saturate.
    """

-    def __init__(self, n=1024, twiddle_file=None):
+    def __init__(self, n=2048, twiddle_file=None):
        self.N = n
        self.LOG2N = n.bit_length() - 1
-        self.cos_rom = load_twiddle_rom(twiddle_file)
+        self.cos_rom = load_twiddle_rom(twiddle_file, n=n)
        # Working memory (32-bit signed I/Q pairs)
        self.mem_re = [0] * n
        self.mem_im = [0] * n
@@ -942,21 +951,21 @@ class MatchedFilterChain:
    Uses a single FFTEngine instance (as in RTL, engine is reused).
    """

-    def __init__(self, fft_size=1024, twiddle_file=None):
+    def __init__(self, fft_size=2048, twiddle_file=None):
        self.fft_size = fft_size
        self.fft = FFTEngine(n=fft_size, twiddle_file=twiddle_file)
        self.conj_mult = FreqMatchedFilter()

    def process(self, sig_re, sig_im, ref_re, ref_im):
        """
-        Run matched filter on 1024-sample signal + reference.
+        Run matched filter on signal + reference.

        Args:
-            sig_re/im: signal I/Q (16-bit signed, 1024 samples)
-            ref_re/im: reference chirp I/Q (16-bit signed, 1024 samples)
+            sig_re/im: signal I/Q (16-bit signed, fft_size samples)
+            ref_re/im: reference chirp I/Q (16-bit signed, fft_size samples)

        Returns:
-            (range_profile_re, range_profile_im): 1024 x 16-bit signed
+            (range_profile_re, range_profile_im): fft_size x 16-bit signed
        """
        # Forward FFT of signal
        sig_fft_re, sig_fft_im = self.fft.compute(sig_re, sig_im, inverse=False)
@@ -984,27 +993,27 @@ class RangeBinDecimator:
    Bit-accurate model of range_bin_decimator.v

    Three modes:
-      00: Simple decimation (take center sample at index 8)
+      00: Simple decimation (take center sample at index 2)
      01: Peak detection (max |I|+|Q|)
-      10: Averaging (sum >> 4, truncation)
+      10: Averaging (sum >> 2, truncation)
      11: Reserved (output 0)
    """

-    DECIMATION_FACTOR = 16
-    OUTPUT_BINS = 64
+    DECIMATION_FACTOR = 4
+    OUTPUT_BINS = 512

    @staticmethod
    def decimate(range_re, range_im, mode=1, start_bin=0):
        """
-        Decimate 1024 range bins to 64.
+        Decimate 2048 range bins to 512.

        Args:
-            range_re/im: 1024 x signed 16-bit
+            range_re/im: 2048 x signed 16-bit
            mode: 0=center, 1=peak, 2=average, 3=zero
-            start_bin: first input bin to process (0-1023)
+            start_bin: first input bin to process (0-2047)

        Returns:
-            (out_re, out_im): 64 x signed 16-bit
+            (out_re, out_im): 512 x signed 16-bit
        """
        out_re = []
        out_im = []
@@ -1052,9 +1061,9 @@ class RangeBinDecimator:
                    if idx < len(range_re):
                        sum_re += sign_extend(range_re[idx] & 0xFFFF, 16)
                        sum_im += sign_extend(range_im[idx] & 0xFFFF, 16)
-                # Truncate (arithmetic right shift by 4), take 16 bits
-                out_re.append(sign_extend((sum_re >> 4) & 0xFFFF, 16))
-                out_im.append(sign_extend((sum_im >> 4) & 0xFFFF, 16))
+                # Truncate (arithmetic right shift by 2), take 16 bits
+                out_re.append(sign_extend((sum_re >> 2) & 0xFFFF, 16))
+                out_im.append(sign_extend((sum_im >> 2) & 0xFFFF, 16))

            else:
                # Mode 3: reserved, output 0
@@ -1090,7 +1099,7 @@ class DopplerProcessor:
    """

    DOPPLER_FFT_SIZE = 16     # Per sub-frame
-    RANGE_BINS = 64
+    RANGE_BINS = 512
    CHIRPS_PER_FRAME = 32
    CHIRPS_PER_SUBFRAME = 16

@@ -1126,11 +1135,11 @@ class DopplerProcessor:
        Process one complete Doppler frame using dual 16-pt FFTs.

        Args:
-            chirp_data_i: 2D array [32 chirps][64 range bins] of signed 16-bit I
-            chirp_data_q: 2D array [32 chirps][64 range bins] of signed 16-bit Q
+            chirp_data_i: 2D array [32 chirps][512 range bins] of signed 16-bit I
+            chirp_data_q: 2D array [32 chirps][512 range bins] of signed 16-bit Q

        Returns:
-            (doppler_map_i, doppler_map_q): 2D arrays [64 range bins][32 doppler bins]
+            (doppler_map_i, doppler_map_q): 2D arrays [512 range bins][32 doppler bins]
                                            of signed 16-bit
                                            Bins 0-15 = sub-frame 0 (long PRI)
                                            Bins 16-31 = sub-frame 1 (short PRI)
@@ -1213,7 +1222,7 @@ class SignalChain:
    IF_FREQ = 120_000_000    # IF frequency
    FTW_120MHZ = 0x4CCCCCCD  # Phase increment for 120 MHz at 400 MSPS

-    def __init__(self, twiddle_file_1024=None, twiddle_file_16=None):
+    def __init__(self, twiddle_file_2048=None, twiddle_file_16=None):
        self.nco = NCO()
        self.mixer = Mixer()
        self.cic_i = CICDecimator()
@@ -1221,7 +1230,7 @@ class SignalChain:
        self.fir_i = FIRFilter()
        self.fir_q = FIRFilter()
        self.ddc_interface = DDCInputInterface()
-        self.matched_filter = MatchedFilterChain(fft_size=1024, twiddle_file=twiddle_file_1024)
+        self.matched_filter = MatchedFilterChain(fft_size=2048, twiddle_file=twiddle_file_2048)
        self.range_decimator = RangeBinDecimator()
        self.doppler = DopplerProcessor(twiddle_file_16=twiddle_file_16)

@@ -2,34 +2,22 @@
 """
 gen_chirp_mem.py — Generate all chirp .mem files for AERIS-10 FPGA.

-Generates the 10 chirp .mem files used by chirp_memory_loader_param.v:
-  - long_chirp_seg{0,1,2,3}_{i,q}.mem  (8 files, 1024 lines each)
-  - short_chirp_{i,q}.mem              (2 files, 50 lines each)
+Generates the 6 chirp .mem files used by chirp_memory_loader_param.v:
+  - long_chirp_seg{0,1}_{i,q}.mem  (4 files, 2048 lines each)
+  - short_chirp_{i,q}.mem          (2 files, 50 lines each)

 Long chirp:
  The 3000-sample baseband chirp (30 us at 100 MHz system clock) is
-  segmented into 4 blocks of 1024 samples.  Each segment covers a
+  segmented into 2 blocks of 2048 samples.  Each segment covers a
  different time window of the chirp:
-    seg0: samples   0 .. 1023
-    seg1: samples 1024 .. 2047
-    seg2: samples 2048 .. 3071  (only 952 valid chirp samples; 72 zeros)
-    seg3: all zeros (seg3 starts at sample 3072, past chirp end at 3000)
+    seg0: samples    0 .. 2047
+    seg1: samples 2048 .. 4095  (only 952 valid chirp samples; 1096 zeros)

-  Wait — actually the memory loader stores 4*1024 = 4096 contiguous
-  samples indexed by {segment_select[1:0], sample_addr[9:0]}.  The
-  long chirp has 3000 samples, so:
-    seg0: chirp[0..1023]
-    seg1: chirp[1024..2047]
-    seg2: chirp[2048..2999] + 24 zeros  (samples 2048..3071 but chirp
-          ends at 2999, so indices 3000..3071 relative to full chirp
-          => mem indices 952..1023 in seg2 file are zero)
-
-  Wait, let me re-count.  seg2 covers global indices 2048..3071.
-  The chirp has samples 0..2999 (3000 samples).  So seg2 has valid
-  data at global indices 2048..2999 = 952 valid samples (seg2 file
-  indices 0..951), then zeros at file indices 952..1023 (72 zeros).
-
-  seg3 covers global indices 3072..4095, all past chirp end => all zeros.
+  The memory loader stores 2*2048 = 4096 contiguous samples indexed
+  by {segment_select[0], sample_addr[10:0]}.  The long chirp has
+  3000 samples, so:
+    seg0: chirp[0..2047] — all valid data
+    seg1: chirp[2048..2999] + 1096 zeros (samples past chirp end)

 Short chirp:
  50 samples (0.5 us at 100 MHz), same chirp formula with
@@ -56,10 +44,10 @@ CHIRP_BW = 20e6           # 20 MHz sweep bandwidth
 FS_SYS = 100e6            # System clock (100 MHz, post-CIC)
 T_LONG_CHIRP = 30e-6      # 30 us long chirp duration
 T_SHORT_CHIRP = 0.5e-6    # 0.5 us short chirp duration
-FFT_SIZE = 1024
+FFT_SIZE = 2048
 LONG_CHIRP_SAMPLES = int(T_LONG_CHIRP * FS_SYS)   # 3000
 SHORT_CHIRP_SAMPLES = int(T_SHORT_CHIRP * FS_SYS)  # 50
-LONG_SEGMENTS = 4
+LONG_SEGMENTS = 2
 SCALE = 0.9               # Q15 scaling factor (matches radar_scene.py)
 Q15_MAX = 32767

@@ -187,13 +175,14 @@ def main():
    # Check magnitude envelope
    max(math.sqrt(i*i + q*q) for i, q in zip(long_i, long_q, strict=False))

-    # Check seg3 zero padding
-    seg3_i_path = os.path.join(MEM_DIR, 'long_chirp_seg3_i.mem')
-    with open(seg3_i_path) as f:
-        seg3_lines = [line.strip() for line in f if line.strip()]
-    nonzero_seg3 = sum(1 for line in seg3_lines if line != '0000')
+    # Check seg1 zero padding (samples 3000-4095 should be zero)
+    seg1_i_path = os.path.join(MEM_DIR, 'long_chirp_seg1_i.mem')
+    with open(seg1_i_path) as f:
+        seg1_lines = [line.strip() for line in f if line.strip()]
+    # Indices 952..2047 in seg1 (global 3000..4095) should be zero
+    nonzero_tail = sum(1 for line in seg1_lines[952:] if line != '0000')

-    if nonzero_seg3 == 0:
+    if nonzero_tail == 0:
        pass
    else:
        pass
@@ -35,9 +35,9 @@ from radar_scene import Target, generate_doppler_frame

 DOPPLER_FFT_SIZE = 16     # Per sub-frame
 DOPPLER_TOTAL_BINS = 32   # Total output (2 sub-frames x 16)
-RANGE_BINS = 64
+RANGE_BINS = 512
 CHIRPS_PER_FRAME = 32
-TOTAL_SAMPLES = CHIRPS_PER_FRAME * RANGE_BINS  # 2048
+TOTAL_SAMPLES = CHIRPS_PER_FRAME * RANGE_BINS  # 16384


 # =============================================================================
@@ -30,7 +30,7 @@ from fpga_model import (
 )


-FFT_SIZE = 1024
+FFT_SIZE = 2048


 def load_hex_16bit(filepath):
@@ -143,9 +143,13 @@ def main():
        bb_q = load_hex_16bit(bb_q_path)
        ref_i = load_hex_16bit(ref_i_path)
        ref_q = load_hex_16bit(ref_q_path)
+        # Zero-pad to FFT_SIZE if shorter (legacy 1024-entry files → 2048)
+        for lst in [bb_i, bb_q, ref_i, ref_q]:
+            while len(lst) < FFT_SIZE:
+                lst.append(0)
        r = generate_case("chirp", bb_i, bb_q, ref_i, ref_q,
                          "Radar chirp: 2 targets (500m, 1500m) vs ref chirp",
-                          base_dir)
+                          base_dir, write_inputs=True)
        results.append(r)
    else:
        pass
@@ -5,8 +5,8 @@ gen_multiseg_golden.py
 Generate golden reference data for matched_filter_multi_segment co-simulation.

 Tests the overlap-save segmented convolution wrapper:
-  - Long chirp: 3072 samples (4 segments x 1024, with 128-sample overlap)
-  - Short chirp: 50 samples zero-padded to 1024 (1 segment)
+  - Long chirp: 3072 samples (2 segments x 2048, with 128-sample overlap)
+  - Short chirp: 50 samples zero-padded to 2048 (1 segment)

 The matched_filter_processing_chain is already verified bit-perfect.
 This test validates that the multi_segment wrapper:
@@ -17,7 +17,7 @@ This test validates that the multi_segment wrapper:

 Strategy:
  - Generate known input data (identifiable per-segment patterns)
-  - Generate per-segment reference chirp data (1024 samples each)
+  - Generate per-segment reference chirp data (2048 samples each)
  - Run each segment through MatchedFilterChain independently in Python
  - Compare RTL multi-segment outputs against per-segment Python outputs

@@ -64,7 +64,7 @@ def generate_long_chirp_test():
      - buffer_write_ptr starts at 0 (from ST_IDLE reset)
      - Collects 1920 samples into positions [0:1919]
      - Positions [1920:2047] remain zero (from initial block)
-      - Processes full 1024-sample buffer
+      - Processes full 2048-sample buffer

    For segment 1 (ST_NEXT_SEGMENT):
      - Copies input_buffer[SEGMENT_ADVANCE+i] to input_buffer[i] for i=0..127
@@ -89,7 +89,7 @@ def generate_long_chirp_test():
                          positions 0-1919: input data
                          positions 1920-2047: zeros from initial block

-    Processing chain sees: 1024 samples = [data[0:895], zeros[896:1023]]
+    Processing chain sees: 2048 samples = [data[0:1919], zeros[1920:2047]]

    OVERLAP-SAVE (ST_NEXT_SEGMENT):
      - Copies buffer[SEGMENT_ADVANCE+i] -> buffer[i] for i=0..OVERLAP-1
@@ -105,12 +105,12 @@ def generate_long_chirp_test():
        It was 896 after segment 0, then continues: 896+768 = 1664

    Actually I realize the overlap-save implementation in this RTL has an issue:
-    For segment 0, the buffer is only partially filled (896 out of 1024),
+    For segment 0, the buffer is only partially filled (1920 out of 2048),
    with zeros in positions 1920-2047. The "overlap" that gets carried to
    segment 1 is those zeros, not actual signal data.

    A proper overlap-save would:
-    1. Fill the entire 1024-sample buffer for each segment
+    1. Fill the entire 2048-sample buffer for each segment
    2. The overlap region is the LAST 128 samples of the previous segment

    But this RTL only fills 1920 samples per segment and relies on the
@@ -140,7 +140,7 @@ def generate_long_chirp_test():
    [1792 new data samples at positions [128:1919]] +
    [128 stale/zero samples at positions [1920:2047]]

-    This is NOT standard overlap-save. It's a 1024-pt buffer but only
+    This is NOT standard overlap-save. It's a 2048-pt buffer but only
    1920 positions are "active" for triggering, and positions 1920-2047
    are never filled after init.

@@ -153,22 +153,16 @@ def generate_long_chirp_test():
    """

    # Parameters matching RTL
-    BUFFER_SIZE = 1024
+    BUFFER_SIZE = 2048
    OVERLAP_SAMPLES = 128
-    SEGMENT_ADVANCE = BUFFER_SIZE - OVERLAP_SAMPLES  # 896
-    LONG_SEGMENTS = 4
+    SEGMENT_ADVANCE = BUFFER_SIZE - OVERLAP_SAMPLES  # 1920
+    LONG_SEGMENTS = 2

-    # Total input samples needed:
-    # Segment 0: 896 samples (ptr goes from 0 to 896)
-    # Segment 1: 768 samples (ptr goes from 128 to 896)
-    # Segment 2: 768 samples (ptr goes from 128 to 896)
-    # Segment 3: 768 samples (ptr goes from 128 to 896)
-    # Total: 896 + 3*768 = 896 + 2304 = 3200
-    # But chirp_complete triggers at chirp_samples_collected >= LONG_CHIRP_SAMPLES-1 = 2999
-    # So the last segment may be truncated.
-    # Let's generate 3072 input samples (to be safe, more than 3000).
+    # Total input samples needed: seg0 needs 1920, seg1 needs 1792 (3712 total).
+    # chirp_complete triggers at chirp_samples_collected >= LONG_CHIRP_SAMPLES-1 (2999),
+    # so the last segment may be truncated. We generate 3800 samples to be safe.

-    TOTAL_SAMPLES = 3200  # More than enough for 4 segments
+    TOTAL_SAMPLES = 3800  # More than enough for 2 segments

    # Generate input signal: identifiable pattern per segment
    # Use a tone at different frequencies for each expected segment region
@@ -184,7 +178,7 @@ def generate_long_chirp_test():
        input_q.append(saturate(val_q, 16))

    # Generate per-segment reference chirps (just use known patterns)
-    # Each segment gets a different reference (1024 samples each)
+    # Each segment gets a different reference (2048 samples each)
    ref_segs_i = []
    ref_segs_q = []
    for seg in range(LONG_SEGMENTS):
@@ -202,7 +196,7 @@ def generate_long_chirp_test():
        ref_segs_q.append(ref_q)

    # Now simulate the RTL's overlap-save algorithm in Python
-    mf_chain = MatchedFilterChain(fft_size=1024)
+    mf_chain = MatchedFilterChain(fft_size=2048)

    # Simulate the buffer exactly as RTL does it
    input_buffer_i = [0] * BUFFER_SIZE
@@ -310,7 +304,7 @@ def generate_long_chirp_test():
        f.write('segment,bin,golden_i,golden_q\n')
        for seg in range(LONG_SEGMENTS):
            out_re, out_im = segment_results[seg]
-            for b in range(1024):
+            for b in range(2048):
                f.write(f'{seg},{b},{out_re[b]},{out_im[b]}\n')


@@ -321,9 +315,9 @@ def generate_short_chirp_test():
    """
    Generate test data for single-segment short chirp.

-    Short chirp: 50 samples of data, zero-padded to 1024.
+    Short chirp: 50 samples of data, zero-padded to 2048.
    """
-    BUFFER_SIZE = 1024
+    BUFFER_SIZE = 2048
    SHORT_SAMPLES = 50

    # Generate 50-sample input
@@ -336,7 +330,7 @@ def generate_short_chirp_test():
        input_i.append(saturate(val_i, 16))
        input_q.append(saturate(val_q, 16))

-    # Zero-pad to 1024 (as RTL does in ST_ZERO_PAD)
+    # Zero-pad to 2048 (as RTL does in ST_ZERO_PAD)
    # Note: padding computed here for documentation; actual buffer uses buf_i/buf_q below
    _padded_i = list(input_i) + [0] * (BUFFER_SIZE - SHORT_SAMPLES)
    _padded_q = list(input_q) + [0] * (BUFFER_SIZE - SHORT_SAMPLES)
@@ -359,7 +353,7 @@ def generate_short_chirp_test():
            buf_i.append(0)
            buf_q.append(0)

-    # Reference chirp (1024 samples)
+    # Reference chirp (2048 samples)
    ref_i = []
    ref_q = []
    for n in range(BUFFER_SIZE):
@@ -370,7 +364,7 @@ def generate_short_chirp_test():
        ref_q.append(saturate(val_q, 16))

    # Process through MF chain
-    mf_chain = MatchedFilterChain(fft_size=1024)
+    mf_chain = MatchedFilterChain(fft_size=2048)
    out_re, out_im = mf_chain.process(buf_i, buf_q, ref_i, ref_q)

    # Write hex files
@@ -394,7 +388,7 @@ def generate_short_chirp_test():
    csv_path = os.path.join(out_dir, 'multiseg_short_golden.csv')
    with open(csv_path, 'w') as f:
        f.write('bin,golden_i,golden_q\n')
-        for b in range(1024):
+        for b in range(2048):
            f.write(f'{b},{out_re[b]},{out_im[b]}\n')

    return out_re, out_im
@@ -409,7 +403,7 @@ if __name__ == '__main__':
        # Find peak
        max_mag = 0
        peak_bin = 0
-        for b in range(1024):
+        for b in range(2048):
            mag = abs(out_re[b]) + abs(out_im[b])
            if mag > max_mag:
                max_mag = mag
@@ -418,7 +412,7 @@ if __name__ == '__main__':
    short_re, short_im = generate_short_chirp_test()
    max_mag = 0
    peak_bin = 0
-    for b in range(1024):
+    for b in range(2048):
        mag = abs(short_re[b]) + abs(short_im[b])
        if mag > max_mag:
            max_mag = mag
@@ -53,8 +53,8 @@ N_SAMPLES_LISTEN = int(T_LISTEN_LONG * FS_ADC)  # 54800 samples

 # Processing chain
 CIC_DECIMATION = 4
-FFT_SIZE = 1024
-RANGE_BINS = 64
+FFT_SIZE = 2048
+RANGE_BINS = 512
 DOPPLER_FFT_SIZE = 16      # Per sub-frame
 DOPPLER_TOTAL_BINS = 32    # Total output bins (2 sub-frames x 16)
 CHIRPS_PER_SUBFRAME = 16
@@ -69,7 +69,7 @@ FIR_COEFFS_HEX = [
 # DDC output interface
 DDC_OUT_BITS = 16                # 18 → 16 bit with rounding + saturation

-FFT_SIZE = 1024
+FFT_SIZE = 2048
 FFT_DATA_W = 16
 FFT_INTERNAL_W = 32
 FFT_TWIDDLE_W = 16
@@ -77,7 +77,7 @@ FFT_TWIDDLE_W = 16
 # Doppler — dual 16-pt FFT architecture
 DOPPLER_FFT_SIZE = 16            # per sub-frame
 DOPPLER_TOTAL_BINS = 32          # total output (2 sub-frames x 16)
-DOPPLER_RANGE_BINS = 64
+DOPPLER_RANGE_BINS = 512
 DOPPLER_CHIRPS = 32
 CHIRPS_PER_SUBFRAME = 16
 DOPPLER_WINDOW_TYPE = 0          # Hamming
@@ -109,8 +109,8 @@ AERIS_RX_LO_HZ = 10.38e9        # RX LO (ADF4382)
 AERIS_CHIRP_BW = 20e6           # Chirp bandwidth (target: 30 MHz Phase 1)
 AERIS_LONG_CHIRP_S = 30e-6      # Long chirp duration
 AERIS_PRI_S = 167e-6            # Pulse repetition interval
-AERIS_DECIMATION = 16           # Range bin decimation (1024 → 64)
-AERIS_RANGE_PER_BIN = 24.0      # Meters per decimated bin
+AERIS_DECIMATION = 4            # Range bin decimation (2048 → 512)
+AERIS_RANGE_PER_BIN = 6.0       # Meters per decimated bin


 # ===========================================================================
@@ -152,7 +152,7 @@ def load_and_quantize_adi_data(data_path, config_path, frame_idx=0):
    with a 120 MHz IF. We need to:
    1. Take one frame of 256 chirps x 1079 samples
    2. Use only 32 chirps (matching AERIS-10 CHIRPS_PER_FRAME)
-    3. Truncate to 1024 samples (matching FFT_SIZE)
+    3. Zero-pad to 2048 samples (matching FFT_SIZE)
    4. Upconvert to 120 MHz IF (add I*cos - Q*sin) to create real signal
    5. Quantize to 8-bit unsigned (matching AD9484)
    """
@@ -163,8 +163,10 @@ def load_and_quantize_adi_data(data_path, config_path, frame_idx=0):
    # Extract one frame
    frame = data[frame_idx]  # (256, 1079) complex
    
-    # Use first 32 chirps, first 1024 samples
-    iq_block = frame[:DOPPLER_CHIRPS, :FFT_SIZE]  # (32, 1024) complex
+    # Use first 32 chirps, zero-pad to FFT_SIZE samples
+    n_available = min(frame.shape[1], FFT_SIZE)
+    iq_block = np.zeros((DOPPLER_CHIRPS, FFT_SIZE), dtype=np.complex128)
+    iq_block[:, :n_available] = frame[:DOPPLER_CHIRPS, :n_available]
    
    # The ADI data is baseband complex IQ at 4 MSPS.
    # AERIS-10 sees a real signal at 400 MSPS with 120 MHz IF.
@@ -202,7 +204,10 @@ def load_and_quantize_adi_data(data_path, config_path, frame_idx=0):
    
    # Also create 8-bit ADC stimulus for DDC validation
    # Use just one chirp of real-valued data (I channel only, shifted to unsigned)
-    chirp0_real = np.real(frame[0, :FFT_SIZE])
+    # Zero-pad if needed (ADI has 1079 samples, FFT_SIZE may be larger)
+    chirp0_real = np.zeros(FFT_SIZE)
+    n_avail = min(frame.shape[1], FFT_SIZE)
+    chirp0_real[:n_avail] = np.real(frame[0, :n_avail])
    chirp0_norm = chirp0_real / np.max(np.abs(chirp0_real))
    adc_8bit = np.round(chirp0_norm * 127 + 128).astype(np.uint8)
    adc_8bit = np.clip(adc_8bit, 0, 255)
@@ -451,21 +456,21 @@ def fft_twiddle_lookup(k, N, cos_rom):

 def run_range_fft(iq_i, iq_q, twiddle_file=None):
    """
-    Bit-accurate 1024-point radix-2 DIT FFT matching fft_engine.v.
+    Bit-accurate radix-2 DIT FFT matching fft_engine.v.
    
-    Input: 16-bit signed I/Q arrays (1024 samples)
-    Output: 16-bit signed I/Q arrays (1024 bins, saturated from 32-bit internal)
+    Input: 16-bit signed I/Q arrays (N samples, N must be power of 2)
+    Output: 16-bit signed I/Q arrays (N bins, saturated from 32-bit internal)
    
    Matches RTL:
    - Bit-reversed input loading → sign-extended to 32-bit internal
-    - 10 stages of radix-2 butterflies
+    - LOG2(N) stages of radix-2 butterflies
    - Twiddle multiply: 32-bit * 16-bit = 48-bit, shift >>> 15
    - Add/subtract in 32-bit
    - Output: saturate 32-bit → 16-bit
    """
-    N = FFT_SIZE
+    N = len(iq_i)
    LOG2N = int(np.log2(N))
-    assert N == 1024 and LOG2N == 10
+    assert N == (1 << LOG2N), f"FFT size {N} is not a power of 2"
    
    # Load twiddle ROM
    if twiddle_file and os.path.exists(twiddle_file):
@@ -542,18 +547,18 @@ def run_range_fft(iq_i, iq_q, twiddle_file=None):
 # ===========================================================================
 def run_range_bin_decimator(range_fft_i, range_fft_q,
                            mode=1, start_bin=0,
-                            input_bins=1024, output_bins=64,
-                            decimation_factor=16):
+                            input_bins=2048, output_bins=512,
+                            decimation_factor=4):
    """
    Bit-accurate model of range_bin_decimator.v (peak detection mode).

-    Input:  range_fft_i/q — shape (N_chirps, 1024), 16-bit signed
-    Output: decimated_i/q — shape (N_chirps, 64), 16-bit signed
+    Input:  range_fft_i/q — shape (N_chirps, input_bins), 16-bit signed
+    Output: decimated_i/q — shape (N_chirps, output_bins), 16-bit signed

    Modes:
        0 = simple decimation (take center sample of each group)
-        1 = peak detection   (select max |I|+|Q| from each group of 16)
-        2 = averaging        (sum group >> 4)
+        1 = peak detection   (select max |I|+|Q| from each group)
+        2 = averaging        (sum group >> log2(decimation_factor))

    RTL detail: abs_i = I[15] ? (~I + 1) : I   (unsigned 16-bit)
                cur_mag = {1'b0, abs_i} + {1'b0, abs_q}   (17-bit)
@@ -621,9 +626,10 @@ def run_range_bin_decimator(range_fft_i, range_fft_q,
                    sum_i += int(range_fft_i[c, in_idx])
                    sum_q += int(range_fft_q[c, in_idx])
                    in_idx += 1
-                # RTL: sum_i[19:4], truncation (not rounding)
-                decimated_i[c, obin] = int(sum_i) >> 4
-                decimated_q[c, obin] = int(sum_q) >> 4
+                # RTL: sum_i >> log2(decimation_factor), truncation (not rounding)
+                avg_shift = int(np.log2(decimation_factor))
+                decimated_i[c, obin] = int(sum_i) >> avg_shift
+                decimated_q[c, obin] = int(sum_q) >> avg_shift


    return decimated_i, decimated_q
@@ -636,7 +642,7 @@ def run_doppler_fft(range_data_i, range_data_q, twiddle_file_16=None):
    """
    Bit-accurate Doppler processor matching doppler_processor.v (dual 16-pt FFT).

-    Input: range_data_i/q shape (DOPPLER_CHIRPS, FFT_SIZE) — 16-bit signed
+    Input: range_data_i/q shape (DOPPLER_CHIRPS, N_range_bins) — 16-bit signed
           Only first DOPPLER_RANGE_BINS columns are processed.
    Output: doppler_map_i/q shape (DOPPLER_RANGE_BINS, DOPPLER_TOTAL_BINS) — 16-bit signed

@@ -1129,7 +1135,7 @@ def main():
        "amp_radar",
        "phaser_amp_4MSPS_500M_300u_256_m3dB_config.npy"
    )
-    twiddle_1024 = os.path.join(fpga_dir, "fft_twiddle_1024.mem")
+    twiddle_range = os.path.join(fpga_dir, "fft_twiddle_2048.mem")
    output_dir = os.path.join(script_dir, "hex")
    
    
@@ -1140,7 +1146,7 @@ def main():
        amp_data, amp_config, frame_idx=args.frame
    )
    
-    # iq_i, iq_q: (32, 1024) int64, 16-bit range — post-DDC equivalent
+    # iq_i, iq_q: (32, 2048) int64, 16-bit range — post-DDC equivalent (zero-padded)
    
    # -----------------------------------------------------------------------
    # Write stimulus files
@@ -1158,7 +1164,7 @@ def main():
    # -----------------------------------------------------------------------
    # Run range FFT on first chirp (bit-accurate)
    # -----------------------------------------------------------------------
-    range_fft_i, range_fft_q = run_range_fft(iq_i[0], iq_q[0], twiddle_1024)
+    range_fft_i, range_fft_q = run_range_fft(iq_i[0], iq_q[0], twiddle_range)
    write_hex_files(output_dir, range_fft_i, range_fft_q, "range_fft_chirp0")
    
    # Run range FFT on all 32 chirps
@@ -1166,7 +1172,7 @@ def main():
    all_range_q = np.zeros((DOPPLER_CHIRPS, FFT_SIZE), dtype=np.int64)
    
    for c in range(DOPPLER_CHIRPS):
-        ri, rq = run_range_fft(iq_i[c], iq_q[c], twiddle_1024)
+        ri, rq = run_range_fft(iq_i[c], iq_q[c], twiddle_range)
        all_range_i[c] = ri
        all_range_q[c] = rq
        if (c + 1) % 8 == 0:
@@ -1192,7 +1198,7 @@ def main():
        decimation_factor=FFT_SIZE // DOPPLER_RANGE_BINS
    )
    
-    # Write full-chain range FFT input: all 32 chirps x 1024 bins = 32768 samples
+    # Write full-chain range FFT input: all 32 chirps x 2048 bins = 65536 samples
    # This is the stimulus for the range_bin_decimator in the full-chain testbench.
    # Format: packed {Q[31:16], I[15:0]} per RTL range_data bus format
    fc_input_file = os.path.join(output_dir, "fullchain_range_input.hex")
@@ -1248,7 +1254,7 @@ def main():
    np.save(os.path.join(output_dir, "fullchain_mti_doppler_q.npy"), mti_doppler_q)
    
    # DC notch on MTI-Doppler data
-    DC_NOTCH_WIDTH = 2  # Default test value: zero bins {0, 1, 31}
+    DC_NOTCH_WIDTH = 2  # Default test value: zero bins {0, 1, 15, 16, 17, 31}
    notched_i, notched_q = run_dc_notch(mti_doppler_i, mti_doppler_q, width=DC_NOTCH_WIDTH)
    write_hex_files(output_dir, notched_i, notched_q, "fullchain_notched_ref")
    
@@ -1274,7 +1280,7 @@ def main():
    )
    
    # Write CFAR reference files
-    # 1. Magnitude map (17-bit unsigned, row-major: 64 range x 32 Doppler = 2048)
+    # 1. Magnitude map (17-bit unsigned, row-major: 512 range x 32 Doppler = 16384)
    cfar_mag_file = os.path.join(output_dir, "fullchain_cfar_mag.hex")
    with open(cfar_mag_file, 'w') as f:
        for rbin in range(DOPPLER_RANGE_BINS):
@@ -2,7 +2,7 @@
 # Chain: decim -> MTI -> Doppler -> DC notch(w=2) -> CA-CFAR
 # CFAR: guard=2, train=8, alpha=0x30, mode=CA
 # Format: range_bin doppler_bin magnitude threshold
-2 14 57128 48153
-2 29 20281 15318
-2 30 44783 22389
-3 26 19423 19422
+22 5 48799 43698
+22 6 52972 45294
+22 11 65534 38193
+22 12 51509 49542
--- a/Show More
+++ b/Show More