Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
3e10177
Good work on tutorials
idrees-mahmood Feb 2, 2026
2eca174
HEGOIADHOIHAIOHIRGH
idrees-mahmood Feb 7, 2026
6681387
HELP ME
idrees-mahmood Feb 7, 2026
51153e2
sdhgishgoih
idrees-mahmood Feb 7, 2026
1eb6854
Help me seriosusly
idrees-mahmood Feb 7, 2026
be0c360
Lab 3 results
idrees-mahmood Feb 9, 2026
2d446d7
Fixing compression comparison
idrees-mahmood Feb 9, 2026
9d76508
Merge branch 'ism' of https://github.com/idrees-mahmood/mase into ism
idrees-mahmood Feb 9, 2026
bbfd14c
Old lab garbage
idrees-mahmood Mar 11, 2026
5b2cd4b
Trying to add hardware metadata
idrees-mahmood Mar 23, 2026
794c8d1
Working metadata pass
idrees-mahmood Mar 23, 2026
c8a1494
We now understand DRAM
idrees-mahmood Mar 23, 2026
ad332c3
Have a read of dram weight streaming, detailed how we might start ope…
idrees-mahmood Mar 23, 2026
136a5a2
Didn't add the md file
idrees-mahmood Mar 24, 2026
f5d2c24
added a comment in the VerilogInterfaceEmitter: "this is for DRAM"
MaminM Mar 24, 2026
e813cfd
towards constructing my own top file
MaminM Mar 25, 2026
afeaaf5
initial commit for training model
MaminM Mar 25, 2026
2a36096
includes plan for Ali
MaminM Mar 25, 2026
867a6e4
added bram mlp for help with testbenches
MaminM Mar 25, 2026
9150e9c
added bram mlp testbench
MaminM Mar 25, 2026
4d4ec54
new notebook for generating the bram mlp
MaminM Mar 25, 2026
d9f66c5
Decent progress on hardware metadata pass to the classification model
idrees-mahmood Mar 25, 2026
4c07530
DRAM and BRAM data files
idrees-mahmood Mar 25, 2026
42527ea
initial commit on new DRAM tb generation
CrazyPharaoh Mar 26, 2026
6ec0f83
Ali's Stuff
CrazyPharaoh Mar 26, 2026
debea21
working testbench
MaminM Mar 27, 2026
32b9fcc
started on the report
MaminM Mar 27, 2026
8fc5d0a
Merge branch 'mlp-model-for-adls' into ism
idrees-mahmood Mar 27, 2026
e8b4607
Report progress, increasing QAM precision
Mar 27, 2026
489679a
Updated QAM model
idrees-mahmood Mar 27, 2026
9e79dee
Report changes
Mar 27, 2026
44544bf
report update
Eclyps365 Mar 27, 2026
e07644b
fixed BRAM not working on tb
CrazyPharaoh Mar 27, 2026
572eb21
Random latex files
Mar 27, 2026
39dab92
Merge branch 'ism' into the-final-final-testbench-with-no-bugs-iA
CrazyPharaoh Mar 27, 2026
1d0b4d5
Merge pull request #1 from idrees-mahmood/the-final-final-testbench-w…
CrazyPharaoh Mar 27, 2026
a950e24
cleaning up repo for PR
MaminM Mar 27, 2026
ee479ab
still cleaning up
MaminM Mar 27, 2026
d67f06a
even more cleaning
MaminM Mar 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -172,4 +172,7 @@ mase-trainer/
test-trainer/

# DiffLogic: tutorial files
docs/tutorials/difflogic/data-mnist/
docs/tutorials/difflogic/data-mnist/

# For testing the emit
generated-SV
115 changes: 115 additions & 0 deletions docs/labs/bram/hardware/rtl/fc1_bias_source.sv
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@

// =====================================
// Mase Hardware
// Parameter: fc1_bias
// 04/02/2026 21:26:55
// =====================================

`timescale 1 ns / 1 ps
// Read-only memory holding the fc1 bias words.
//
// Parameters:
//   DWIDTH   - width in bits of one stored word.
//   MEM_SIZE - number of words in the memory.
//   AWIDTH   - width of the address port ($clog2(MEM_SIZE) + 1).
//
// Ports:
//   clk   - clock; all registers update on its rising edge.
//   addr0 - word address to read.
//   ce0   - read enable; both read stages hold their value while it is low.
//   q0    - read data, taken from the second read stage.
//
// The contents are loaded once at time zero from a hex image. A word read
// from `addr0` reaches `q0` after two rising clock edges on which `ce0` is
// high, because the read passes through two registers.
module fc1_bias_rom #(
    parameter DWIDTH   = 32,
    parameter MEM_SIZE = 2,
    parameter AWIDTH   = $clog2(MEM_SIZE) + 1
) (
    input                     clk,
    input  logic [AWIDTH-1:0] addr0,
    input                     ce0,
    output logic [DWIDTH-1:0] q0
);

  // Backing storage, one entry per word.
  logic [DWIDTH-1:0] ram[0:MEM_SIZE-1];

  // Two-deep read pipeline: rd_stage0 captures the array output,
  // rd_stage1 re-registers it before it leaves the module.
  logic [DWIDTH-1:0] rd_stage0;
  logic [DWIDTH-1:0] rd_stage1;

  // Path is relative to the simulator's working directory.
  initial begin
    $readmemh("./bram/hardware/rtl/fc1_bias_rom.dat", ram);
  end

  // Both stages advance together, and only when the read is enabled.
  always_ff @(posedge clk) begin
    if (ce0) begin
      rd_stage0 <= ram[addr0];
      rd_stage1 <= rd_stage0;
    end
  end

  assign q0 = rd_stage1;

endmodule

`timescale 1 ns / 1 ps
// Thin wrapper around fc1_bias_rom that exposes the memory under the
// DATA_WIDTH / ADDR_RANGE / ADDR_WIDTH parameter names.
//
// Parameters:
//   DATA_WIDTH - width in bits of one stored word (forwarded as DWIDTH).
//   ADDR_RANGE - number of words in the memory (forwarded as MEM_SIZE).
//   ADDR_WIDTH - width of the address port (forwarded as AWIDTH).
//
// Ports:
//   reset    - not used inside this module; kept so the port list stays
//              unchanged for existing instantiations.
//   clk      - clock passed straight to the ROM.
//   address0 - word address passed straight to the ROM.
//   ce0      - read enable passed straight to the ROM.
//   q0       - read data driven by the ROM.
//
// The parameters are forwarded to the ROM so that an instantiation which
// overrides them resizes the memory together with the wrapper ports.
// Without the forwarding the ROM keeps its own defaults and its address and
// data ports no longer match the wrapper's widths.
module fc1_bias #(
    parameter DATA_WIDTH = 32'd32,
    parameter ADDR_RANGE = 32'd2,
    parameter ADDR_WIDTH = $clog2(ADDR_RANGE) + 1
) (
    input                           reset,
    input                           clk,
    input  logic [ADDR_WIDTH - 1:0] address0,
    input                           ce0,
    output logic [DATA_WIDTH - 1:0] q0
);

  fc1_bias_rom #(
      .DWIDTH  (DATA_WIDTH),
      .MEM_SIZE(ADDR_RANGE),
      .AWIDTH  (ADDR_WIDTH)
  ) fc1_bias_rom_U (
      .clk  (clk),
      .addr0(address0),
      .ce0  (ce0),
      .q0   (q0)
  );

endmodule


`timescale 1ns / 1ps
// Streams the fc1 bias out of on-chip memory over a valid/ready interface.
//
// Parameters:
//   BIAS_TENSOR_SIZE_DIM_0/1 - tensor extent per dimension.
//   BIAS_PRECISION_0         - bit width of one output element.
//   BIAS_PRECISION_1         - not referenced inside this module.
//   BIAS_PARALLELISM_DIM_0/1 - elements emitted per beat in each dimension.
//   OUT_DEPTH                - number of beats per pass over the tensor:
//                              the product of ceil(size / parallelism)
//                              over both dimensions.
//
// Ports:
//   clk, rst       - clock and synchronous reset.
//   data_out       - one beat, unpacked into individual elements.
//   data_out_valid - high once the start-up counter has reached 2.
//   data_out_ready - downstream ready; also gates every internal register.
module fc1_bias_source #(
    parameter BIAS_TENSOR_SIZE_DIM_0 = 32,
    parameter BIAS_TENSOR_SIZE_DIM_1 = 1,
    parameter BIAS_PRECISION_0 = 16,
    parameter BIAS_PRECISION_1 = 3,

    parameter BIAS_PARALLELISM_DIM_0 = 1,
    parameter BIAS_PARALLELISM_DIM_1 = 1,
    parameter OUT_DEPTH = ((BIAS_TENSOR_SIZE_DIM_0 + BIAS_PARALLELISM_DIM_0 - 1) / BIAS_PARALLELISM_DIM_0) * ((BIAS_TENSOR_SIZE_DIM_1 + BIAS_PARALLELISM_DIM_1 - 1) / BIAS_PARALLELISM_DIM_1)
) (
    input clk,
    input rst,

    output logic [BIAS_PRECISION_0-1:0] data_out [BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1-1:0],
    output data_out_valid,
    input data_out_ready
);

  // The address register is declared [COUNTER_WIDTH:0], i.e. one bit wider
  // than $clog2(OUT_DEPTH), so the value OUT_DEPTH itself is representable.
  localparam COUNTER_WIDTH = $clog2(OUT_DEPTH);
  // Number of elements carried by one beat.
  localparam LANES = BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1;

  // Read address: steps on every ready cycle and wraps after the last beat.
  logic [COUNTER_WIDTH:0] beat_addr;
  always_ff @(posedge clk) begin
    if (rst) beat_addr <= 0;
    else if (data_out_ready) beat_addr <= (beat_addr == OUT_DEPTH - 1) ? 0 : beat_addr + 1;
  end

  // Start-up counter: counts ready cycles after reset and saturates at 2.
  // data_out_valid is derived from it, so valid stays low until two ready
  // cycles have elapsed and then remains high until the next reset.
  logic [1:0] fill_cnt;
  always_ff @(posedge clk) begin
    if (rst) fill_cnt <= 0;
    else if (data_out_ready && (fill_cnt != 2)) fill_cnt <= fill_cnt + 1;
  end

  assign data_out_valid = (fill_cnt == 2);

  // Parameter memory; its read enable is tied to downstream ready so the
  // memory only advances when the consumer can accept data.
  logic [BIAS_PRECISION_0*LANES-1:0] data_vector;
  fc1_bias #(
      .DATA_WIDTH(BIAS_PRECISION_0 * BIAS_PARALLELISM_DIM_0 * BIAS_PARALLELISM_DIM_1),
      .ADDR_RANGE(OUT_DEPTH)
  ) fc1_bias_mem (
      .clk     (clk),
      .reset   (rst),
      .address0(beat_addr),
      .ce0     (data_out_ready),
      .q0      (data_vector)
  );

  // Cocotb/verilator does not support array flattening, so the packed
  // memory word is sliced into the unpacked output array by hand.
  for (genvar lane = 0; lane < LANES; lane++) begin : unpack
    assign data_out[lane] = data_vector[BIAS_PRECISION_0*lane +: BIAS_PRECISION_0];
  end

endmodule
115 changes: 115 additions & 0 deletions docs/labs/bram/hardware/rtl/fc1_weight_source.sv
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@

// =====================================
// Mase Hardware
// Parameter: fc1_weight
// 04/02/2026 21:26:55
// =====================================

`timescale 1 ns / 1 ps
// Read-only memory holding the fc1 weight words.
//
// Parameters:
//   DWIDTH   - width in bits of one stored word.
//   MEM_SIZE - number of words in the memory.
//   AWIDTH   - width of the address port ($clog2(MEM_SIZE) + 1).
//
// Ports:
//   clk   - clock; all registers update on its rising edge.
//   addr0 - word address to read.
//   ce0   - read enable; both read stages hold their value while it is low.
//   q0    - read data, taken from the second read stage.
//
// The contents are loaded once at time zero from a hex image. A word read
// from `addr0` reaches `q0` after two rising clock edges on which `ce0` is
// high, because the read passes through two registers.
module fc1_weight_rom #(
    parameter DWIDTH   = 128,
    parameter MEM_SIZE = 2,
    parameter AWIDTH   = $clog2(MEM_SIZE) + 1
) (
    input                     clk,
    input  logic [AWIDTH-1:0] addr0,
    input                     ce0,
    output logic [DWIDTH-1:0] q0
);

  // Backing storage, one entry per word.
  logic [DWIDTH-1:0] ram[0:MEM_SIZE-1];

  // Two-deep read pipeline: rd_stage0 captures the array output,
  // rd_stage1 re-registers it before it leaves the module.
  logic [DWIDTH-1:0] rd_stage0;
  logic [DWIDTH-1:0] rd_stage1;

  // Path is relative to the simulator's working directory.
  initial begin
    $readmemh("./bram/hardware/rtl/fc1_weight_rom.dat", ram);
  end

  // Both stages advance together, and only when the read is enabled.
  always_ff @(posedge clk) begin
    if (ce0) begin
      rd_stage0 <= ram[addr0];
      rd_stage1 <= rd_stage0;
    end
  end

  assign q0 = rd_stage1;

endmodule

`timescale 1 ns / 1 ps
// Thin wrapper around fc1_weight_rom that exposes the memory under the
// DATA_WIDTH / ADDR_RANGE / ADDR_WIDTH parameter names.
//
// Parameters:
//   DATA_WIDTH - width in bits of one stored word (forwarded as DWIDTH).
//   ADDR_RANGE - number of words in the memory (forwarded as MEM_SIZE).
//   ADDR_WIDTH - width of the address port (forwarded as AWIDTH).
//
// Ports:
//   reset    - not used inside this module; kept so the port list stays
//              unchanged for existing instantiations.
//   clk      - clock passed straight to the ROM.
//   address0 - word address passed straight to the ROM.
//   ce0      - read enable passed straight to the ROM.
//   q0       - read data driven by the ROM.
//
// The parameters are forwarded to the ROM so that an instantiation which
// overrides them resizes the memory together with the wrapper ports.
// Without the forwarding the ROM keeps its own defaults and its address and
// data ports no longer match the wrapper's widths.
module fc1_weight #(
    parameter DATA_WIDTH = 32'd128,
    parameter ADDR_RANGE = 32'd2,
    parameter ADDR_WIDTH = $clog2(ADDR_RANGE) + 1
) (
    input                           reset,
    input                           clk,
    input  logic [ADDR_WIDTH - 1:0] address0,
    input                           ce0,
    output logic [DATA_WIDTH - 1:0] q0
);

  fc1_weight_rom #(
      .DWIDTH  (DATA_WIDTH),
      .MEM_SIZE(ADDR_RANGE),
      .AWIDTH  (ADDR_WIDTH)
  ) fc1_weight_rom_U (
      .clk  (clk),
      .addr0(address0),
      .ce0  (ce0),
      .q0   (q0)
  );

endmodule


`timescale 1ns / 1ps
// Streams the fc1 weights out of on-chip memory over a valid/ready interface.
//
// Parameters:
//   WEIGHT_TENSOR_SIZE_DIM_0/1 - tensor extent per dimension.
//   WEIGHT_PRECISION_0         - bit width of one output element.
//   WEIGHT_PRECISION_1         - not referenced inside this module.
//   WEIGHT_PARALLELISM_DIM_0/1 - elements emitted per beat in each dimension.
//   OUT_DEPTH                  - number of beats per pass over the tensor:
//                                the product of ceil(size / parallelism)
//                                over both dimensions.
//
// Ports:
//   clk, rst       - clock and synchronous reset.
//   data_out       - one beat, unpacked into individual elements.
//   data_out_valid - high once the start-up counter has reached 2.
//   data_out_ready - downstream ready; also gates every internal register.
module fc1_weight_source #(
    parameter WEIGHT_TENSOR_SIZE_DIM_0 = 32,
    parameter WEIGHT_TENSOR_SIZE_DIM_1 = 1,
    parameter WEIGHT_PRECISION_0 = 16,
    parameter WEIGHT_PRECISION_1 = 3,

    parameter WEIGHT_PARALLELISM_DIM_0 = 1,
    parameter WEIGHT_PARALLELISM_DIM_1 = 1,
    parameter OUT_DEPTH = ((WEIGHT_TENSOR_SIZE_DIM_0 + WEIGHT_PARALLELISM_DIM_0 - 1) / WEIGHT_PARALLELISM_DIM_0) * ((WEIGHT_TENSOR_SIZE_DIM_1 + WEIGHT_PARALLELISM_DIM_1 - 1) / WEIGHT_PARALLELISM_DIM_1)
) (
    input clk,
    input rst,

    output logic [WEIGHT_PRECISION_0-1:0] data_out [WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1-1:0],
    output data_out_valid,
    input data_out_ready
);

  // The address register is declared [COUNTER_WIDTH:0], i.e. one bit wider
  // than $clog2(OUT_DEPTH), so the value OUT_DEPTH itself is representable.
  localparam COUNTER_WIDTH = $clog2(OUT_DEPTH);
  // Number of elements carried by one beat.
  localparam LANES = WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1;

  // Read address: steps on every ready cycle and wraps after the last beat.
  logic [COUNTER_WIDTH:0] beat_addr;
  always_ff @(posedge clk) begin
    if (rst) beat_addr <= 0;
    else if (data_out_ready) beat_addr <= (beat_addr == OUT_DEPTH - 1) ? 0 : beat_addr + 1;
  end

  // Start-up counter: counts ready cycles after reset and saturates at 2.
  // data_out_valid is derived from it, so valid stays low until two ready
  // cycles have elapsed and then remains high until the next reset.
  logic [1:0] fill_cnt;
  always_ff @(posedge clk) begin
    if (rst) fill_cnt <= 0;
    else if (data_out_ready && (fill_cnt != 2)) fill_cnt <= fill_cnt + 1;
  end

  assign data_out_valid = (fill_cnt == 2);

  // Parameter memory; its read enable is tied to downstream ready so the
  // memory only advances when the consumer can accept data.
  logic [WEIGHT_PRECISION_0*LANES-1:0] data_vector;
  fc1_weight #(
      .DATA_WIDTH(WEIGHT_PRECISION_0 * WEIGHT_PARALLELISM_DIM_0 * WEIGHT_PARALLELISM_DIM_1),
      .ADDR_RANGE(OUT_DEPTH)
  ) fc1_weight_mem (
      .clk     (clk),
      .reset   (rst),
      .address0(beat_addr),
      .ce0     (data_out_ready),
      .q0      (data_vector)
  );

  // Cocotb/verilator does not support array flattening, so the packed
  // memory word is sliced into the unpacked output array by hand.
  for (genvar lane = 0; lane < LANES; lane++) begin : unpack
    assign data_out[lane] = data_vector[WEIGHT_PRECISION_0*lane +: WEIGHT_PRECISION_0];
  end

endmodule
120 changes: 120 additions & 0 deletions docs/labs/dram-minimal-sv-dependencies.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# Minimal New `.sv` Dependencies for DRAM-Streamed MLP (with Testbench)

## Goal
Make the DRAM-based MLP path compile and run with the existing cocotb testbench, with the minimum number of new RTL files.

## Short Answer
- For the MLP compute path itself, **0 new compute `.sv` files are required**.
- `fixed_linear.sv` already supports streamed `weight`/`bias` with valid/ready.
- DRAM mode changes who drives those ports (external stream) rather than changing linear math hardware.

## What to change in codegen (so no new compute module is needed)
1. Remove the `_dram` module renaming in `src/chop/passes/graph/transforms/verilog/emit_top.py`.
2. Keep module name as `fixed_linear`.
3. Keep DRAM behavior controlled by interface metadata (`storage="DRAM"`) and top-level parameter ports.

## Where to add DRAM-only deployment dependencies
The primary file to control per-node dependency lists is:
- `src/chop/passes/graph/analysis/add_metadata/add_hardware_metadata.py`

Use `add_component_source(...)` to switch dependency files only when DRAM mode is requested.

Simplest DRAM-only pattern:
```python
elif mase_op in INTERNAL_COMP.keys():
node.meta["mase"]["hardware"]["toolchain"] = "INTERNAL_RTL"
node.meta["mase"]["hardware"]["module"] = INTERNAL_COMP[mase_op][0]["name"]
node.meta["mase"]["hardware"]["dependence_files"] = INTERNAL_COMP[mase_op][0]["dependence_files"]

if pass_args.get("interface", {}).get("storage", "BRAM") == "DRAM":
dram_extra = pass_args.get("interface", {}).get("dram_deployment_deps", [])
node.meta["mase"]["hardware"]["dependence_files"] = (
node.meta["mase"]["hardware"]["dependence_files"] + dram_extra
)
```

Notes:
- Keep compute dependencies unchanged for simulation.
- Append only deployment adapters (AXIS formatter/scheduler) under DRAM.
- Keep `storage` explicit as `"DRAM"` for off-chip flow.

If you want this keyed by op type (for example only `linear`), the cleanest place for static lists is:
- `src/chop/passes/graph/analysis/add_metadata/hardware_metadata_layers.py`

Example structure (conceptual):
```python
DRAM_EXTRA_DEPENDENCIES = {
"linear": [
"memory_adapters/rtl/axis_param_reformatter.sv",
"memory_adapters/rtl/param_stream_scheduler.sv",
]
}
```

Then in `add_component_source(...)`, append `DRAM_EXTRA_DEPENDENCIES.get(mase_op, [])` when `storage == "DRAM"`.

## `emit_dram_transform_pass` function (deployment only)
This function is only for real FPGA deployment. It is not needed for cocotb simulation.

In simulation, cocotb already quantizes and streams parameter blocks directly into DRAM-backed top-level ports.
In deployment, we need deployable parameter images that PS software and DMA can read from DDR and stream to the accelerator.

```python
def emit_dram_transform_pass(graph, pass_args={}):
"""
Emit deployable DRAM parameter images (not BRAM ROM files) for real FPGA deployment.

Expected outputs per DRAM-backed parameter:
- packed tensor stream image (e.g. .bin/.hex)
- metadata sidecar (shape, precision, parallelism, beat count, ordering)

Notes:
- Not required for cocotb simulation, because cocotb drivers already stream quantized parameters directly.
- Required for deployment where PS software / DMA reads from DDR and streams to top-level parameter ports.
"""
```

Recommended `pass_args` fields:
- `project_dir`: output root
- `format`: `bin` or `hex`
- `endianness`: `little` or `big`
- `align_bytes`: beat alignment for DMA
- `emit_metadata`: `True/False`
- `target`: board/deployment tag

## Minimum deployment stack (real FPGA)
Vivado IP helps with transport, but Vivado IP alone is not enough to satisfy model-specific packing, ordering, replay, and control requirements.

1. Parameter image emitter (software/pass side)
Converts quantized tensors into stream-ready images matching hardware beat format and ordering.
2. DDR buffer allocation + loader software (PS side)
Allocates contiguous buffers, loads emitted images, and passes addresses/lengths to DMA/control plane.
3. AXI DMA (Vivado IP)
Moves parameter beats from DDR to AXI-Stream.
4. AXIS packet/beat reformatter (custom RTL/HLS)
Adapts DMA stream width/packet structure to `fc1_weight` / `fc1_bias` port beat shape.
5. Parameter stream scheduler/control (custom RTL/HLS or PS-driven FSM)
Controls when each parameter stream starts, stops, and repeats per inference, while respecting back-pressure.
6. Optional AXIS FIFO/width converter (Vivado IP)
Used when clock-domain crossing, buffering, or width adaptation is needed.

### Vivado IP vs custom logic
Vivado can provide:
- AXI DMA
- AXIS FIFO
- AXIS data width converter
- AXI interconnect / SmartConnect
- Clocking/reset and standard infrastructure

You still need to implement:
- Parameter block ordering/packing contract used by `fixed_linear`
- Tensor replay policy (e.g. re-stream full weights per sample/batch)
- Stream scheduling across weight/bias channels
- Control/status integration between PS and accelerator
- Model-specific handshaking and sequencing correctness

## Final handoff list for teammate (today)
- Do not create `fixed_linear_dram.sv`.
- Remove `_dram` renaming in `src/chop/passes/graph/transforms/verilog/emit_top.py` so top instantiates `fixed_linear`.
- Add DRAM-only deployment dependencies in `src/chop/passes/graph/analysis/add_metadata/add_hardware_metadata.py` by appending an extra list only when `storage == "DRAM"`.
- Keep deployment adapters separate from compute RTL (compute stays `fixed_linear`).
Loading