Achieve timing closure: DSP48E1 pipelines, 4-stage NCO, 28-bit CIC, ASYNC_REG

Phase 0+ timing optimization (attempts #13-22 + implementation): NCO (nco_400m_enhanced.v): - 4-stage pipeline: DSP48E1 accumulate -> LUT read -> negate -> quadrant MUX - DSP48E1 phase accumulator in P=P+C mode (eliminates 8-stage CARRY4 chain) - Registered phase_inc_dithered to break cascaded 32-bit add path DDC (ddc_400m.v): - Direct DSP48E1 instantiation for I/Q mixers (AREG=1, BREG=1, MREG=1, PREG=1) - CEP=1, RSTP=!reset_n for proper pipeline control - 3-stage dsp_valid_pipe for PREG=1 latency - Behavioral sim model under ifdef SIMULATION for Icarus compatibility CIC (cic_decimator_4x_enhanced.v): - 28-bit accumulators (was 36) per CIC width formula: 18 + 5*log2(4) = 28 - Removed integrator/comb saturation (CIC uses wrapping arithmetic by design) - Pipelined output saturation comparison CDC/ASYNC_REG: - ASYNC_REG attribute on all CDC synchronizer registers (cdc_modules.v, radar_system_top.v, usb_data_interface.v) - Sync reset in generate blocks (cdc_modules.v) Results: Vivado post-implementation WNS=+1.196ns, 0 failing endpoints, 850 LUTs (1.34%), 466 FFs (0.37%), 2 DSP48E1 (0.83%) on xc7a100t. All testbenches pass: 241/244 (3 known stub failures).
2026-03-16 01:02:07 +02:00
parent 1e51b739a7
commit c983a3c705
15 changed files with 5883 additions and 5466 deletions
@@ -11,11 +11,38 @@ module nco_400m_enhanced (
    output reg dds_ready
 );

-// Phase accumulator with registered outputs for better timing
-reg [31:0] phase_accumulator;
-reg [31:0] phase_accumulator_reg;
+// ============================================================================
+// 4-stage pipelined NCO for 400 MHz timing closure
+//
+// Stage 1: Phase accumulator update (DSP48E1 in P=P+C mode) + offset addition
+//          DSP48E1 does: P_reg <= P_reg + C_port (frequency_tuning_word)
+//          The P register output IS the phase accumulator — no CARRY4 chain.
+//          phase_with_offset = P_output + {phase_offset, 16'b0} (registered)
+// Stage 2: LUT address decode + LUT read → register abs values + quadrant
+// Stage 3: Compute negations from registered abs values → register neg values
+//          (CARRY4 x4 chain has registered inputs, fits in 2.5ns easily)
+// Stage 4: Quadrant sign application → sin_out, cos_out (pure MUX, no arith)
+//
+// Total latency: 4 cycles from phase_valid to sin/cos output
+// Max logic levels per stage: Stage 1=DSP48E1(internal), Stage 2=2(LUT3+LUT6),
+//   Stage 3=4(CARRY4 chain), Stage 4=1(MUX)
+// ============================================================================
+
+// Phase accumulator — DSP48E1 P output provides the accumulated phase
+// In simulation: behavioral reg. In synthesis: DSP48E1 P[31:0].
 reg [31:0] phase_with_offset;
-reg phase_valid_delayed;
+
+// Stage 2 pipeline registers: LUT output + quadrant
+reg [15:0] sin_abs_reg, cos_abs_reg;
+reg [1:0] quadrant_reg;
+
+// Stage 3 pipeline registers: pre-computed negations + abs copies + quadrant
+reg signed [15:0] sin_neg_reg, cos_neg_reg;
+reg [15:0] sin_abs_reg2, cos_abs_reg2;  // Pass-through for Stage 4 MUX
+reg [1:0] quadrant_reg2;                 // Pass-through for Stage 4 MUX
+
+// Valid pipeline: tracks 4-stage latency
+reg [3:0] valid_pipe;

 // Use only the top 8 bits for LUT addressing (256-entry LUT equivalent)
 wire [7:0] lut_address = phase_with_offset[31:24];
@@ -51,61 +78,229 @@ initial begin
    sin_lut[60] = 16'h7F61; sin_lut[61] = 16'h7FA6; sin_lut[62] = 16'h7FD8; sin_lut[63] = 16'h7FF5;
 end

-// Quadrant determination
-wire [1:0] quadrant = lut_address[7:6]; // 00: Q1, 01: Q2, 10: Q3, 11: Q4
-wire [5:0] lut_index = (quadrant[1] ? ~lut_address[5:0] : lut_address[5:0]); // Mirror for Q2/Q3
+// Combinational: quadrant determination and LUT index (feeds Stage 2 registers)
+wire [1:0] quadrant_w = lut_address[7:6];
+wire [5:0] lut_index = (quadrant_w[0] ^ quadrant_w[1]) ? ~lut_address[5:0] : lut_address[5:0];

-// Sine and cosine calculation with quadrant mapping
-wire [15:0] sin_abs = sin_lut[lut_index];
-wire [15:0] cos_abs = sin_lut[63 - lut_index]; // Cosine is phase-shifted sine
+// Combinational LUT read (will be registered in Stage 2)
+wire [15:0] sin_abs_w = sin_lut[lut_index];
+wire [15:0] cos_abs_w = sin_lut[63 - lut_index];
+
+// ============================================================================
+// Stage 1: Phase accumulator (DSP48E1) + offset addition (fabric register)
+//
+// The phase accumulator is the critical path bottleneck: a 32-bit addition
+// requires 8 CARRY4 stages in fabric (2.826 ns > 2.5 ns budget at 400 MHz).
+// Solution: Use DSP48E1 in P = P + C accumulate mode.
+//   - C-port carries frequency_tuning_word (zero-extended to 48 bits)
+//   - CREG=1 registers the tuning word inside the DSP
+//   - PREG=1 registers the accumulator output (P = P + C each cycle)
+//   - The DSP48E1 48-bit ALU performs the add internally at full speed
+//   - Only P[31:0] is used (32-bit phase accumulator)
+//
+// phase_with_offset is computed in fabric: DSP48E1 P output + {phase_offset, 16'b0}
+// This is OK because both operands are registered (P is PREG output, phase_offset
+// is a stable input), and the result feeds Stage 2 LUT which is also registered.
+// ============================================================================
+
+`ifdef SIMULATION
+// ---- Behavioral model for Icarus Verilog simulation ----
+// Mimics DSP48E1 accumulator: P <= P + C, with CREG=1, PREG=1
+reg [31:0] phase_accumulator;

-// Pipeline stage for better timing
 always @(posedge clk_400m or negedge reset_n) begin
    if (!reset_n) begin
        phase_accumulator <= 32'h00000000;
-        phase_accumulator_reg <= 32'h00000000;
        phase_with_offset <= 32'h00000000;
-        phase_valid_delayed <= 1'b0;
-        dds_ready <= 1'b0;
+    end else if (phase_valid) begin
+        phase_accumulator <= phase_accumulator + frequency_tuning_word;
+        phase_with_offset <= phase_accumulator + {phase_offset, 16'b0};
+    end
+end
+
+`else
+// ---- DSP48E1 phase accumulator for Vivado synthesis ----
+// P = P + C mode: accumulates frequency_tuning_word each clock cycle
+// Uses 1 DSP48E1 (total design: 5 of 240 available = 2.08%)
+wire [47:0] phase_accum_p;          // DSP48E1 P output (48 bits, use [31:0])
+
+DSP48E1 #(
+    // Feature control
+    .A_INPUT("DIRECT"),
+    .B_INPUT("DIRECT"),
+    .USE_DPORT("FALSE"),
+    .USE_MULT("NONE"),           // No multiplier — pure ALU accumulate
+    .USE_SIMD("ONE48"),
+    // Pipeline registers
+    .AREG(0),                    // A-port unused for accumulate
+    .BREG(0),                    // B-port unused for accumulate
+    .CREG(1),                    // Register frequency_tuning_word on C-port
+    .MREG(0),                    // No multiplier
+    .PREG(1),                    // P register IS the phase accumulator
+    .ADREG(0),
+    .ACASCREG(0),
+    .BCASCREG(0),
+    .ALUMODEREG(0),
+    .CARRYINREG(0),
+    .CARRYINSELREG(0),
+    .DREG(0),
+    .INMODEREG(0),
+    .OPMODEREG(0),
+    // Pattern detector (unused)
+    .AUTORESET_PATDET("NO_RESET"),
+    .MASK(48'h3fffffffffff),
+    .PATTERN(48'h000000000000),
+    .SEL_MASK("MASK"),
+    .SEL_PATTERN("PATTERN"),
+    .USE_PATTERN_DETECT("NO_PATDET")
+) dsp_phase_accum (
+    // Clock and reset
+    .CLK(clk_400m),
+    .RSTA(1'b0),
+    .RSTB(1'b0),
+    .RSTM(1'b0),
+    .RSTP(!reset_n),             // Reset P register (phase accumulator) on !reset_n
+    .RSTC(!reset_n),             // Reset C register (tuning word) on !reset_n
+    .RSTALLCARRYIN(1'b0),
+    .RSTALUMODE(1'b0),
+    .RSTCTRL(1'b0),
+    .RSTD(1'b0),
+    .RSTINMODE(1'b0),
+    // Clock enables
+    .CEA1(1'b0),
+    .CEA2(1'b0),
+    .CEB1(1'b0),
+    .CEB2(1'b0),
+    .CEC(1'b1),                  // Always register C (tuning word updates)
+    .CEM(1'b0),
+    .CEP(phase_valid),           // Only accumulate when phase_valid is asserted
+    .CEAD(1'b0),
+    .CEALUMODE(1'b0),
+    .CECARRYIN(1'b0),
+    .CECTRL(1'b0),
+    .CED(1'b0),
+    .CEINMODE(1'b0),
+    // Data ports
+    .A(30'b0),                   // Unused for P = P + C
+    .B(18'b0),                   // Unused for P = P + C
+    .C({16'b0, frequency_tuning_word}),  // Zero-extend 32-bit FTW to 48 bits
+    .D(25'b0),
+    .CARRYIN(1'b0),
+    // Control ports
+    .OPMODE(7'b0010011),         // Z=P (010), Y=0 (00), X=C_reg (11) → P = P + C
+    .ALUMODE(4'b0000),           // Z + X + Y + CIN (standard add)
+    .INMODE(5'b00000),
+    .CARRYINSEL(3'b000),
+    // Output ports
+    .P(phase_accum_p),
+    .PATTERNDETECT(),
+    .PATTERNBDETECT(),
+    .OVERFLOW(),
+    .UNDERFLOW(),
+    .CARRYOUT(),
+    // Cascade ports (unused)
+    .ACIN(30'b0),
+    .BCIN(18'b0),
+    .CARRYCASCIN(1'b0),
+    .MULTSIGNIN(1'b0),
+    .PCIN(48'b0),
+    .ACOUT(),
+    .BCOUT(),
+    .CARRYCASCOUT(),
+    .MULTSIGNOUT(),
+    .PCOUT()
+);
+
+// phase_with_offset: add phase_offset to the DSP48E1 accumulator output
+// Both operands are registered (phase_accum_p from PREG, phase_offset is stable input)
+// This fabric add feeds Stage 2 LUT which is also registered — timing is fine
+always @(posedge clk_400m or negedge reset_n) begin
+    if (!reset_n) begin
+        phase_with_offset <= 32'h00000000;
+    end else if (phase_valid) begin
+        phase_with_offset <= phase_accum_p[31:0] + {phase_offset, 16'b0};
+    end
+end
+
+`endif
+
+// ============================================================================
+// Stage 2: LUT read + register absolute values and quadrant
+//          Only LUT decode here — negation is deferred to Stage 3
+// ============================================================================
+always @(posedge clk_400m or negedge reset_n) begin
+    if (!reset_n) begin
+        sin_abs_reg <= 16'h0000;
+        cos_abs_reg <= 16'h7FFF;
+        quadrant_reg <= 2'b00;
+    end else if (valid_pipe[0]) begin
+        sin_abs_reg <= sin_abs_w;
+        cos_abs_reg <= cos_abs_w;
+        quadrant_reg <= quadrant_w;
+    end
+end
+
+// ============================================================================
+// Stage 3: Compute negations from registered abs values
+//          CARRY4 x4 chain has registered inputs — easily fits in 2.5ns
+//          Also pass through abs values and quadrant for Stage 4
+// ============================================================================
+always @(posedge clk_400m or negedge reset_n) begin
+    if (!reset_n) begin
+        sin_neg_reg <= 16'h0000;
+        cos_neg_reg <= -16'h7FFF;
+        sin_abs_reg2 <= 16'h0000;
+        cos_abs_reg2 <= 16'h7FFF;
+        quadrant_reg2 <= 2'b00;
+    end else if (valid_pipe[1]) begin
+        sin_neg_reg <= -sin_abs_reg;
+        cos_neg_reg <= -cos_abs_reg;
+        sin_abs_reg2 <= sin_abs_reg;
+        cos_abs_reg2 <= cos_abs_reg;
+        quadrant_reg2 <= quadrant_reg;
+    end
+end
+
+// ============================================================================
+// Stage 4: Quadrant sign application → final sin/cos output
+//          Uses pre-computed negated values from Stage 3 — pure MUX, no arithmetic
+// ============================================================================
+always @(posedge clk_400m or negedge reset_n) begin
+    if (!reset_n) begin
        sin_out <= 16'h0000;
        cos_out <= 16'h7FFF;
+    end else if (valid_pipe[2]) begin
+        case (quadrant_reg2)
+            2'b00: begin // Quadrant I: sin+, cos+
+                sin_out <= sin_abs_reg2;
+                cos_out <= cos_abs_reg2;
+            end
+            2'b01: begin // Quadrant II: sin+, cos-
+                sin_out <= sin_abs_reg2;
+                cos_out <= cos_neg_reg;
+            end
+            2'b10: begin // Quadrant III: sin-, cos-
+                sin_out <= sin_neg_reg;
+                cos_out <= cos_neg_reg;
+            end
+            2'b11: begin // Quadrant IV: sin-, cos+
+                sin_out <= sin_neg_reg;
+                cos_out <= cos_abs_reg2;
+            end
+        endcase
+    end
+end
+
+// ============================================================================
+// Valid pipeline and dds_ready (4-stage latency)
+// ============================================================================
+always @(posedge clk_400m or negedge reset_n) begin
+    if (!reset_n) begin
+        valid_pipe <= 4'b0000;
+        dds_ready <= 1'b0;
    end else begin
-        phase_valid_delayed <= phase_valid;
-        
-        if (phase_valid) begin
-            // Update phase accumulator with dithered frequency tuning word
-            phase_accumulator <= phase_accumulator + frequency_tuning_word;
-            phase_accumulator_reg <= phase_accumulator;
-            
-            // Apply phase offset
-            phase_with_offset <= phase_accumulator + {phase_offset, 16'b0};
-            dds_ready <= 1'b1;
-        end else begin
-            dds_ready <= 1'b0;
-        end
-        
-        // Generate outputs with one cycle delay for pipelining
-        if (phase_valid_delayed) begin
-            // Calculate sine and cosine with proper quadrant signs
-            case (quadrant)
-                2'b00: begin // Quadrant I: sin+, cos+
-                    sin_out <= sin_abs;
-                    cos_out <= cos_abs;
-                end
-                2'b01: begin // Quadrant II: sin+, cos-
-                    sin_out <= sin_abs;
-                    cos_out <= -cos_abs;
-                end
-                2'b10: begin // Quadrant III: sin-, cos-
-                    sin_out <= -sin_abs;
-                    cos_out <= -cos_abs;
-                end
-                2'b11: begin // Quadrant IV: sin-, cos+
-                    sin_out <= -sin_abs;
-                    cos_out <= cos_abs;
-                end
-            endcase
-        end
+        valid_pipe <= {valid_pipe[2:0], phase_valid};
+        dds_ready <= valid_pipe[3];
    end
 end