DeepWok · idrees-mahmood · Feb 2, 2026 · Feb 7, 2026 · Feb 7, 2026 · Feb 7, 2026
diff --git a/.gitignore b/.gitignore
@@ -172,4 +172,7 @@ mase-trainer/
 test-trainer/
 
 # DiffLogic: tutorial files
-docs/tutorials/difflogic/data-mnist/
+docs/tutorials/difflogic/data-mnist/
+
+# Generated SystemVerilog output produced when testing the emit passes
+generated-SV
diff --git a/docs/labs/bram/hardware/rtl/fc1_bias_source.sv b/docs/labs/bram/hardware/rtl/fc1_bias_source.sv
@@ -0,0 +1,115 @@
+
+// =====================================
+//     Mase Hardware
+//     Parameter: fc1_bias
+//     04/02/2026 21:26:55
+// =====================================
+
+`timescale 1 ns / 1 ps
+// Synchronous read-only memory holding the fc1_bias parameter words.
+//
+// Parameters:
+//   DWIDTH   - width in bits of one stored word.
+//   MEM_SIZE - number of words in the memory.
+//   AWIDTH   - address width; defaults to $clog2(MEM_SIZE) + 1, i.e. one bit
+//              more than is strictly needed to index MEM_SIZE words.
+// Ports:
+//   clk   - clock; all registers update on its rising edge.
+//   addr0 - word address to read.
+//   ce0   - read enable; while low, both read stages hold their value.
+//   q0    - read data, available two enabled clock edges after addr0 is sampled.
+//
+// The memory is initialised from a hex image with $readmemh. The path is
+// relative (presumably to the simulator's working directory - confirm for
+// your flow).
+module fc1_bias_rom #(
+  parameter DWIDTH = 32,
+  parameter MEM_SIZE = 2,
+  parameter AWIDTH = $clog2(MEM_SIZE) + 1
+) (
+    input clk,
+    input logic [AWIDTH-1:0] addr0,
+    input ce0,
+    output logic [DWIDTH-1:0] q0
+);
+
+  logic [DWIDTH-1:0] ram[0:MEM_SIZE-1];
+  logic [DWIDTH-1:0] rd_stage0;  // word fetched from ram on the last enabled edge
+  logic [DWIDTH-1:0] rd_stage1;  // rd_stage0 delayed by one more enabled edge
+
+  initial $readmemh("./bram/hardware/rtl/fc1_bias_rom.dat", ram);
+
+  // Two-stage read pipeline. Non-blocking assignments make the statement
+  // order irrelevant: rd_stage1 always captures the previous rd_stage0.
+  always_ff @(posedge clk) begin
+    if (ce0) begin
+      rd_stage0 <= ram[addr0];
+      rd_stage1 <= rd_stage0;
+    end
+  end
+
+  assign q0 = rd_stage1;
+
+endmodule
+
+`timescale 1 ns / 1 ps
+// Thin wrapper around fc1_bias_rom that exposes the memory under the
+// DATA_WIDTH / ADDR_RANGE / ADDR_WIDTH parameter names.
+//
+// Parameters:
+//   DATA_WIDTH - width in bits of one memory word (forwarded as DWIDTH).
+//   ADDR_RANGE - number of words in the memory (forwarded as MEM_SIZE).
+//   ADDR_WIDTH - address width (forwarded as AWIDTH); defaults to
+//                $clog2(ADDR_RANGE) + 1.
+// Ports:
+//   reset    - not connected to anything inside this wrapper; kept so the
+//              port list stays compatible with existing instantiations.
+//   clk      - clock, passed to the ROM.
+//   address0 - word address, passed to the ROM.
+//   ce0      - read enable, passed to the ROM.
+//   q0       - read data from the ROM.
+module fc1_bias #(
+  parameter DATA_WIDTH = 32'd32,
+  parameter ADDR_RANGE = 32'd2,
+  parameter ADDR_WIDTH = $clog2(ADDR_RANGE) + 1
+) (
+  input reset,
+  input clk,
+  input logic [ADDR_WIDTH - 1:0] address0,
+  input ce0,
+  output logic [DATA_WIDTH - 1:0] q0
+);
+
+  // Forward the wrapper's parameters so the ROM is sized to match the ports
+  // connected to it. Without this the ROM keeps its own defaults and any
+  // non-default DATA_WIDTH / ADDR_RANGE silently truncates address and data.
+  fc1_bias_rom #(
+      .DWIDTH(DATA_WIDTH),
+      .MEM_SIZE(ADDR_RANGE),
+      .AWIDTH(ADDR_WIDTH)
+  ) fc1_bias_rom_U (
+      .clk(clk),
+      .addr0(address0),
+      .ce0(ce0),
+      .q0(q0)
+  );
+
+endmodule
+
+
+`timescale 1ns / 1ps
+// Cyclic stream source for the fc1_bias parameter.
+//
+// Reads words from the fc1_bias memory in address order, wrapping back to
+// address 0 after OUT_DEPTH words, and presents each word on data_out as an
+// unpacked array of BIAS_PRECISION_0-bit elements with a valid/ready handshake.
+//
+// Parameters:
+//   BIAS_TENSOR_SIZE_DIM_0/1 - tensor extent along each dimension.
+//   BIAS_PRECISION_0         - width in bits of one output element.
+//   BIAS_PRECISION_1         - not referenced inside this module.
+//   BIAS_PARALLELISM_DIM_0/1 - elements per beat along each dimension.
+//   OUT_DEPTH                - number of beats per pass over the tensor:
+//                              product of ceil(size / parallelism) per dimension.
+// Ports:
+//   clk, rst       - clock and synchronous, active-high reset.
+//   data_out       - one beat of elements.
+//   data_out_valid - high once the memory read pipeline has been filled.
+//   data_out_ready - consumer ready; also advances the address counter and
+//                    enables the memory read pipeline.
+module fc1_bias_source #(
+    parameter BIAS_TENSOR_SIZE_DIM_0  = 32,
+    parameter BIAS_TENSOR_SIZE_DIM_1  = 1,
+    parameter BIAS_PRECISION_0 = 16,
+    parameter BIAS_PRECISION_1 = 3,
+
+    parameter BIAS_PARALLELISM_DIM_0 = 1,
+    parameter BIAS_PARALLELISM_DIM_1 = 1,
+    parameter OUT_DEPTH = ((BIAS_TENSOR_SIZE_DIM_0 + BIAS_PARALLELISM_DIM_0 - 1) / BIAS_PARALLELISM_DIM_0) * ((BIAS_TENSOR_SIZE_DIM_1 + BIAS_PARALLELISM_DIM_1 - 1) / BIAS_PARALLELISM_DIM_1)
+) (
+    input clk,
+    input rst,
+
+    output logic [BIAS_PRECISION_0-1:0] data_out      [BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1-1:0],
+    output                       data_out_valid,
+    input                        data_out_ready
+);
+  // The counter is declared [COUNTER_WIDTH:0], i.e. one bit wider than
+  // $clog2(OUT_DEPTH), so the value OUT_DEPTH itself is also representable.
+  // This matches the default address width of fc1_bias ($clog2(ADDR_RANGE) + 1).
+  localparam COUNTER_WIDTH = $clog2(OUT_DEPTH);
+  // Number of enabled clock edges between presenting an address and seeing
+  // its word on the memory output (the memory has two read registers).
+  localparam ROM_LATENCY = 2;
+
+  // Read address: advances on every ready cycle and wraps after OUT_DEPTH - 1.
+  logic [COUNTER_WIDTH:0] counter;
+  always_ff @(posedge clk) begin
+    if (rst) counter <= 0;
+    else if (data_out_ready) counter <= (counter == OUT_DEPTH - 1) ? 0 : counter + 1;
+  end
+
+  // Counts ready cycles since reset and saturates at ROM_LATENCY; only a
+  // reset clears it again.
+  logic [1:0] fill_count;
+  always_ff @(posedge clk) begin
+    if (rst) fill_count <= 0;
+    else if (data_out_ready && (fill_count != ROM_LATENCY)) fill_count <= fill_count + 1;
+  end
+
+  // The output is valid once the read pipeline holds real data.
+  assign data_out_valid = (fill_count == ROM_LATENCY);
+
+  // Packed memory word; data_out_ready is used directly as the read enable
+  // so the pipeline stalls whenever the consumer stalls.
+  logic [BIAS_PRECISION_0*BIAS_PARALLELISM_DIM_0*BIAS_PARALLELISM_DIM_1-1:0] data_vector;
+  fc1_bias #(
+      .DATA_WIDTH(BIAS_PRECISION_0 * BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1),
+      .ADDR_RANGE(OUT_DEPTH)
+  ) fc1_bias_mem (
+      .clk(clk),
+      .reset(rst),
+      .address0(counter),
+      .ce0(data_out_ready),
+      .q0(data_vector)
+  );
+
+  // Cocotb/Verilator does not support array flattening, so the packed word
+  // is split into the unpacked output array by hand: element `lane` takes
+  // bits [BIAS_PRECISION_0*lane +: BIAS_PRECISION_0].
+  for (genvar lane = 0; lane < BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1; lane++) begin : gen_unpack
+    assign data_out[lane] = data_vector[BIAS_PRECISION_0*lane+:BIAS_PRECISION_0];
+  end
+
+endmodule
diff --git a/docs/labs/bram/hardware/rtl/fc1_weight_source.sv b/docs/labs/bram/hardware/rtl/fc1_weight_source.sv
@@ -0,0 +1,115 @@
+
+// =====================================
+//     Mase Hardware
+//     Parameter: fc1_weight
+//     04/02/2026 21:26:55
+// =====================================
+
+`timescale 1 ns / 1 ps
+// Synchronous read-only memory holding the fc1_weight parameter words.
+//
+// Parameters:
+//   DWIDTH   - width in bits of one stored word.
+//   MEM_SIZE - number of words in the memory.
+//   AWIDTH   - address width; defaults to $clog2(MEM_SIZE) + 1, i.e. one bit
+//              more than is strictly needed to index MEM_SIZE words.
+// Ports:
+//   clk   - clock; all registers update on its rising edge.
+//   addr0 - word address to read.
+//   ce0   - read enable; while low, both read stages hold their value.
+//   q0    - read data, available two enabled clock edges after addr0 is sampled.
+//
+// The memory is initialised from a hex image with $readmemh. The path is
+// relative (presumably to the simulator's working directory - confirm for
+// your flow).
+module fc1_weight_rom #(
+  parameter DWIDTH = 128,
+  parameter MEM_SIZE = 2,
+  parameter AWIDTH = $clog2(MEM_SIZE) + 1
+) (
+    input clk,
+    input logic [AWIDTH-1:0] addr0,
+    input ce0,
+    output logic [DWIDTH-1:0] q0
+);
+
+  logic [DWIDTH-1:0] ram[0:MEM_SIZE-1];
+  logic [DWIDTH-1:0] rd_stage0;  // word fetched from ram on the last enabled edge
+  logic [DWIDTH-1:0] rd_stage1;  // rd_stage0 delayed by one more enabled edge
+
+  initial $readmemh("./bram/hardware/rtl/fc1_weight_rom.dat", ram);
+
+  // Two-stage read pipeline. Non-blocking assignments make the statement
+  // order irrelevant: rd_stage1 always captures the previous rd_stage0.
+  always_ff @(posedge clk) begin
+    if (ce0) begin
+      rd_stage0 <= ram[addr0];
+      rd_stage1 <= rd_stage0;
+    end
+  end
+
+  assign q0 = rd_stage1;
+
+endmodule
+
+`timescale 1 ns / 1 ps
+// Thin wrapper around fc1_weight_rom that exposes the memory under the
+// DATA_WIDTH / ADDR_RANGE / ADDR_WIDTH parameter names.
+//
+// Parameters:
+//   DATA_WIDTH - width in bits of one memory word (forwarded as DWIDTH).
+//   ADDR_RANGE - number of words in the memory (forwarded as MEM_SIZE).
+//   ADDR_WIDTH - address width (forwarded as AWIDTH); defaults to
+//                $clog2(ADDR_RANGE) + 1.
+// Ports:
+//   reset    - not connected to anything inside this wrapper; kept so the
+//              port list stays compatible with existing instantiations.
+//   clk      - clock, passed to the ROM.
+//   address0 - word address, passed to the ROM.
+//   ce0      - read enable, passed to the ROM.
+//   q0       - read data from the ROM.
+module fc1_weight #(
+  parameter DATA_WIDTH = 32'd128,
+  parameter ADDR_RANGE = 32'd2,
+  parameter ADDR_WIDTH = $clog2(ADDR_RANGE) + 1
+) (
+  input reset,
+  input clk,
+  input logic [ADDR_WIDTH - 1:0] address0,
+  input ce0,
+  output logic [DATA_WIDTH - 1:0] q0
+);
+
+  // Forward the wrapper's parameters so the ROM is sized to match the ports
+  // connected to it. Without this the ROM keeps its own defaults and any
+  // non-default DATA_WIDTH / ADDR_RANGE silently truncates address and data.
+  fc1_weight_rom #(
+      .DWIDTH(DATA_WIDTH),
+      .MEM_SIZE(ADDR_RANGE),
+      .AWIDTH(ADDR_WIDTH)
+  ) fc1_weight_rom_U (
+      .clk(clk),
+      .addr0(address0),
+      .ce0(ce0),
+      .q0(q0)
+  );
+
+endmodule
+
+
+`timescale 1ns / 1ps
+// Cyclic stream source for the fc1_weight parameter.
+//
+// Reads words from the fc1_weight memory in address order, wrapping back to
+// address 0 after OUT_DEPTH words, and presents each word on data_out as an
+// unpacked array of WEIGHT_PRECISION_0-bit elements with a valid/ready handshake.
+//
+// Parameters:
+//   WEIGHT_TENSOR_SIZE_DIM_0/1 - tensor extent along each dimension.
+//   WEIGHT_PRECISION_0         - width in bits of one output element.
+//   WEIGHT_PRECISION_1         - not referenced inside this module.
+//   WEIGHT_PARALLELISM_DIM_0/1 - elements per beat along each dimension.
+//   OUT_DEPTH                  - number of beats per pass over the tensor:
+//                                product of ceil(size / parallelism) per dimension.
+// Ports:
+//   clk, rst       - clock and synchronous, active-high reset.
+//   data_out       - one beat of elements.
+//   data_out_valid - high once the memory read pipeline has been filled.
+//   data_out_ready - consumer ready; also advances the address counter and
+//                    enables the memory read pipeline.
+module fc1_weight_source #(
+    parameter WEIGHT_TENSOR_SIZE_DIM_0  = 32,
+    parameter WEIGHT_TENSOR_SIZE_DIM_1  = 1,
+    parameter WEIGHT_PRECISION_0 = 16,
+    parameter WEIGHT_PRECISION_1 = 3,
+
+    parameter WEIGHT_PARALLELISM_DIM_0 = 1,
+    parameter WEIGHT_PARALLELISM_DIM_1 = 1,
+    parameter OUT_DEPTH = ((WEIGHT_TENSOR_SIZE_DIM_0 + WEIGHT_PARALLELISM_DIM_0 - 1) / WEIGHT_PARALLELISM_DIM_0) * ((WEIGHT_TENSOR_SIZE_DIM_1 + WEIGHT_PARALLELISM_DIM_1 - 1) / WEIGHT_PARALLELISM_DIM_1)
+) (
+    input clk,
+    input rst,
+
+    output logic [WEIGHT_PRECISION_0-1:0] data_out      [WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1-1:0],
+    output                       data_out_valid,
+    input                        data_out_ready
+);
+  // The counter is declared [COUNTER_WIDTH:0], i.e. one bit wider than
+  // $clog2(OUT_DEPTH), so the value OUT_DEPTH itself is also representable.
+  // This matches the default address width of fc1_weight ($clog2(ADDR_RANGE) + 1).
+  localparam COUNTER_WIDTH = $clog2(OUT_DEPTH);
+  // Number of enabled clock edges between presenting an address and seeing
+  // its word on the memory output (the memory has two read registers).
+  localparam ROM_LATENCY = 2;
+
+  // Read address: advances on every ready cycle and wraps after OUT_DEPTH - 1.
+  logic [COUNTER_WIDTH:0] counter;
+  always_ff @(posedge clk) begin
+    if (rst) counter <= 0;
+    else if (data_out_ready) counter <= (counter == OUT_DEPTH - 1) ? 0 : counter + 1;
+  end
+
+  // Counts ready cycles since reset and saturates at ROM_LATENCY; only a
+  // reset clears it again.
+  logic [1:0] fill_count;
+  always_ff @(posedge clk) begin
+    if (rst) fill_count <= 0;
+    else if (data_out_ready && (fill_count != ROM_LATENCY)) fill_count <= fill_count + 1;
+  end
+
+  // The output is valid once the read pipeline holds real data.
+  assign data_out_valid = (fill_count == ROM_LATENCY);
+
+  // Packed memory word; data_out_ready is used directly as the read enable
+  // so the pipeline stalls whenever the consumer stalls.
+  logic [WEIGHT_PRECISION_0*WEIGHT_PARALLELISM_DIM_0*WEIGHT_PARALLELISM_DIM_1-1:0] data_vector;
+  fc1_weight #(
+      .DATA_WIDTH(WEIGHT_PRECISION_0 * WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1),
+      .ADDR_RANGE(OUT_DEPTH)
+  ) fc1_weight_mem (
+      .clk(clk),
+      .reset(rst),
+      .address0(counter),
+      .ce0(data_out_ready),
+      .q0(data_vector)
+  );
+
+  // Cocotb/Verilator does not support array flattening, so the packed word
+  // is split into the unpacked output array by hand: element `lane` takes
+  // bits [WEIGHT_PRECISION_0*lane +: WEIGHT_PRECISION_0].
+  for (genvar lane = 0; lane < WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1; lane++) begin : gen_unpack
+    assign data_out[lane] = data_vector[WEIGHT_PRECISION_0*lane+:WEIGHT_PRECISION_0];
+  end
+
+endmodule
diff --git a/docs/labs/dram-minimal-sv-dependencies.md b/docs/labs/dram-minimal-sv-dependencies.md
@@ -0,0 +1,120 @@
+# Minimal New `.sv` Dependencies for DRAM-Streamed MLP (with Testbench)
+
+## Goal
+Make the DRAM-based MLP path compile and run with the existing cocotb testbench, with the minimum number of new RTL files.
+
+## Short Answer
+- For the MLP compute path itself, **0 new compute `.sv` files are required**.
+- `fixed_linear.sv` already supports streamed `weight`/`bias` with valid/ready.
+- DRAM mode only changes what drives those ports (an external stream instead of an on-chip parameter source); the linear-layer arithmetic hardware itself is unchanged.
+
+## What to change in codegen (so no new compute module is needed)
+1. Remove the `_dram` module renaming in `src/chop/passes/graph/transforms/verilog/emit_top.py`.
+2. Keep module name as `fixed_linear`.
+3. Keep DRAM behavior controlled by interface metadata (`storage="DRAM"`) and top-level parameter ports.
+
+## Where to add DRAM-only deployment dependencies
+The primary file to control per-node dependency lists is:
+- `src/chop/passes/graph/analysis/add_metadata/add_hardware_metadata.py`
+
+Use `add_component_source(...)` to switch dependency files only when DRAM mode is requested.
+
+Simplest DRAM-only pattern:
+```python
+elif mase_op in INTERNAL_COMP.keys():
+    node.meta["mase"]["hardware"]["toolchain"] = "INTERNAL_RTL"
+    node.meta["mase"]["hardware"]["module"] = INTERNAL_COMP[mase_op][0]["name"]
+    node.meta["mase"]["hardware"]["dependence_files"] = INTERNAL_COMP[mase_op][0]["dependence_files"]
+
+    if pass_args.get("interface", {}).get("storage", "BRAM") == "DRAM":
+        dram_extra = pass_args.get("interface", {}).get("dram_deployment_deps", [])
+        node.meta["mase"]["hardware"]["dependence_files"] = (
+            node.meta["mase"]["hardware"]["dependence_files"] + dram_extra
+        )
+```
+
+Notes:
+- Keep compute dependencies unchanged for simulation.
+- Append only deployment adapters (AXIS formatter/scheduler) under DRAM.
+- Keep `storage` explicit as `"DRAM"` for off-chip flow.
+
+If you want this keyed by op type (for example only `linear`), the cleanest place for static lists is:
+- `src/chop/passes/graph/analysis/add_metadata/hardware_metadata_layers.py`
+
+Example structure (conceptual):
+```python
+DRAM_EXTRA_DEPENDENCIES = {
+    "linear": [
+        "memory_adapters/rtl/axis_param_reformatter.sv",
+        "memory_adapters/rtl/param_stream_scheduler.sv",
+    ]
+}
+```
+
+Then in `add_component_source(...)`, append `DRAM_EXTRA_DEPENDENCIES.get(mase_op, [])` when `storage == "DRAM"`.
+
+## `emit_dram` function (deployment only)
+The `emit_dram_transform_pass` sketched below is only needed for real FPGA deployment; cocotb simulation does not use it.
+
+In simulation, cocotb already quantizes and streams parameter blocks directly into DRAM-backed top-level ports.
+In deployment, we need deployable parameter images that PS software and DMA can read from DDR and stream to the accelerator.
+
+```python
+def emit_dram_transform_pass(graph, pass_args={}):
+    """
+    Emit deployable DRAM parameter images (not BRAM ROM files) for real FPGA deployment.
+
+    Args:
+        graph: the graph whose DRAM-backed parameters are emitted
+            (presumably the same graph object handed to the other passes - TODO confirm).
+        pass_args: dict of options. Recommended keys, as listed in this document:
+            `project_dir` (output root), `format` (`bin` or `hex`),
+            `endianness` (`little` or `big`), `align_bytes` (beat alignment for DMA),
+            `emit_metadata` (`True/False`), `target` (board/deployment tag).
+
+    Returns:
+        Not specified by this stub - TODO define alongside the implementation.
+
+    Expected outputs per DRAM-backed parameter:
+    - packed tensor stream image (e.g. .bin/.hex)
+    - metadata sidecar (shape, precision, parallelism, beat count, ordering)
+
+    Notes:
+    - Not required for cocotb simulation, because cocotb drivers already stream quantized parameters directly.
+    - Required for deployment where PS software / DMA reads from DDR and streams to top-level parameter ports.
+    """
+```
+
+Recommended `pass_args` fields:
+- `project_dir`: output root
+- `format`: `bin` or `hex`
+- `endianness`: `little` or `big`
+- `align_bytes`: beat alignment for DMA
+- `emit_metadata`: `True/False`
+- `target`: board/deployment tag
+
+## Minimum deployment stack (real FPGA)
+Vivado IP helps with transport, but Vivado IP alone is not enough to satisfy model-specific packing, ordering, replay, and control requirements.
+
+1. **Parameter image emitter (software/pass side):**
+   Converts quantized tensors into stream-ready images matching hardware beat format and ordering.
+2. **DDR buffer allocation + loader software (PS side):**
+   Allocates contiguous buffers, loads emitted images, and passes addresses/lengths to DMA/control plane.
+3. **AXI DMA (Vivado IP):**
+   Moves parameter beats from DDR to AXI-Stream.
+4. **AXIS packet/beat reformatter (custom RTL/HLS):**
+   Adapts DMA stream width/packet structure to `fc1_weight` / `fc1_bias` port beat shape.
+5. **Parameter stream scheduler/control (custom RTL/HLS or PS-driven FSM):**
+   Controls when each parameter stream starts, stops, and repeats per inference, while respecting back-pressure.
+6. **Optional AXIS FIFO/width converter (Vivado IP):**
+   Used when clock-domain crossing, buffering, or width adaptation is needed.
+
+### Vivado IP vs custom logic
+Vivado can provide:
+- AXI DMA
+- AXIS FIFO
+- AXIS data width converter
+- AXI interconnect / SmartConnect
+- Clocking/reset and standard infrastructure
+
+You still need to implement:
+- Parameter block ordering/packing contract used by `fixed_linear`
+- Tensor replay policy (e.g. re-stream full weights per sample/batch)
+- Stream scheduling across weight/bias channels
+- Control/status integration between PS and accelerator
+- Model-specific handshaking and sequencing correctness
+
+## Final handoff list for teammate (today)
+- Do not create `fixed_linear_dram.sv`.
+- Remove `_dram` renaming in `src/chop/passes/graph/transforms/verilog/emit_top.py` so top instantiates `fixed_linear`.
+- Add DRAM-only deployment dependencies in `src/chop/passes/graph/analysis/add_metadata/add_hardware_metadata.py` by appending an extra list only when `storage == "DRAM"`.
+- Keep deployment adapters separate from compute RTL (compute stays `fixed_linear`).