
[experimental] Add lowering for aiex.memcpy inside runtime_sequence for npu#16

Draft
fifield wants to merge 5 commits into main from aiex_memcpy_seq

Conversation


@fifield fifield commented Jan 14, 2026

This branch uses the existing token and memcpy operations from the aiex dialect inside runtime sequences to program DMAs.

  • Add write_bd lowering for tile DMAs to allow tile DMA programming from runtime_sequence
  • Extensive modifications to aie-lower-memcpy to support this use case:
    • Emit npu.writebd and npu.push_queue for tile-side BD configuration
    • Require buffer address attributes for physical offset calculation
    • Add npu.dma_wait after shim DMAs that issue tokens
    • Use channel 0 for both S2MM and MM2S directions
  • Add test: npu-xrt/add_one_runtime_memcpy with aie-opt preprocessing
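As an illustration of the lowering rules above, the sketch below models (in Python, purely for exposition; this is not the actual pass code, and `writebd_fields` is a name invented here) how a tile-side memcpy endpoint could map onto `npu.writebd` fields: the buffer's `address` attribute becomes the physical `buffer_offset`, and the token acquire/release values become lock settings on the descriptor.

```python
# Hypothetical model of the BD-field derivation described above.
# Not the pass implementation; field names mirror npu.writebd attributes.

def writebd_fields(buffer_address, length, acquire_val=None, release_val=0,
                   lock_id=0, bd_id=0):
    """Derive a minimal set of BD fields for a tile buffer endpoint.

    buffer_address comes from the buffer's `address` attribute
    (e.g. 0x400 -> buffer_offset = 1024). acquire_val=None means the BD
    does not acquire a lock before running.
    """
    return {
        "bd_id": bd_id,
        "buffer_offset": buffer_address,  # physical offset from address attr
        "buffer_length": length,
        "lock_acq_enable": 1 if acquire_val is not None else 0,
        "lock_acq_id": lock_id,
        "lock_acq_val": acquire_val if acquire_val is not None else 0,
        "lock_rel_id": lock_id,
        "lock_rel_val": release_val,
        "valid_bd": 1,
    }

# Inbound memcpy (S2MM into the tile): no acquire, release token_in's lock to 1.
inbound = writebd_fields(0x400, 64, acquire_val=None, release_val=1, lock_id=0)

# Outbound memcpy (MM2S out of the tile): acquire token_out's lock at 1,
# release it back to 0.
outbound = writebd_fields(0x1000, 64, acquire_val=1, release_val=0,
                          lock_id=1, bd_id=1)
```

These two dictionaries line up with the `bd_id = 0` and `bd_id = 1` `npu.writebd` ops in the lowered IR further down.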

The test looks like:

  aie.device(npu1_1col) {
    %shim = aie.tile(0, 0)
    %core = aie.tile(0, 2)

    // Tile-local storage for the core.
    %core_in = aie.buffer(%core) {address = 0x400 : i32} : memref<64xi32>
    %core_out = aie.buffer(%core) {address = 0x1000 : i32} : memref<64xi32>

    // Tokens gate the pipeline: input memcpy → core compute → output memcpy.
    aiex.token(0) { sym_name = "token_in" }
    aiex.token(0) { sym_name = "token_out" }

    // Core adds 1 to every element in the tile-local buffer.
    %core_func = aie.core(%core) {
      // Wait for the inbound memcpy to finish.
      aiex.useToken @token_in(Acquire, 1)

      %c0 = arith.constant 0 : index
      %c64 = arith.constant 64 : index
      %c1 = arith.constant 1 : index
      %c1_i32 = arith.constant 1 : i32
      scf.for %i = %c0 to %c64 step %c1 {
        %val = memref.load %core_in[%i] : memref<64xi32>
        %inc = arith.addi %val, %c1_i32 : i32
        memref.store %inc, %core_out[%i] : memref<64xi32>
      }

      // Signal that output is ready for the outbound memcpy.
      aiex.useToken @token_out(Release, 1)
      aie.end
    }

    // Move host → core buffer, then core buffer → host, using token-gated memcpy
    // inside the runtime sequence.
    aie.runtime_sequence @run(%input: memref<64xi32>, %output: memref<64xi32>) {
      // Kick off inbound DMA immediately and bump token_in to 1 when done.
      aiex.memcpy @token_in(0, 1) (%shim : <%input, 0, 64>, %core : <%core_in, 0, 64>) : (memref<64xi32>, memref<64xi32>)

      // Wait for the core to signal completion (token_out reaches 1), then
      // copy results back to host.
      aiex.memcpy @token_out(1, 0) (%core : <%core_out, 0, 64>, %shim : <%output, 0, 64>) : (memref<64xi32>, memref<64xi32>)
    }
  }

which lowers to:

  aie.device(npu1_1col) {
    %shim_noc_tile_0_0 = aie.tile(0, 0)
    aie.shim_dma_allocation @shim_dma_token_out_0_0_0(%shim_noc_tile_0_0, S2MM, 0)
    aie.shim_dma_allocation @shim_dma_token_in_0_0_0(%shim_noc_tile_0_0, MM2S, 0)
    %tile_0_2 = aie.tile(0, 2)
    %token_out_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "token_out_lock"}
    %token_in_lock = aie.lock(%tile_0_2, 0) {init = 0 : i32, sym_name = "token_in_lock"}
    %buffer_0_2 = aie.buffer(%tile_0_2) {address = 1024 : i32} : memref<64xi32> 
    %buffer_0_2_0 = aie.buffer(%tile_0_2) {address = 4096 : i32} : memref<64xi32> 
    aiex.token(0) {sym_name = "token_in"}
    aiex.token(0) {sym_name = "token_out"}
    %core_0_2 = aie.core(%tile_0_2) {
      aie.use_lock(%token_in_lock, AcquireGreaterEqual, 1)
      %c0 = arith.constant 0 : index
      %c64 = arith.constant 64 : index
      %c1 = arith.constant 1 : index
      %c1_i32 = arith.constant 1 : i32
      scf.for %arg0 = %c0 to %c64 step %c1 {
        %0 = memref.load %buffer_0_2[%arg0] : memref<64xi32>
        %1 = arith.addi %0, %c1_i32 : i32
        memref.store %1, %buffer_0_2_0[%arg0] : memref<64xi32>
      }
      aie.use_lock(%token_out_lock, Release, 1)
      aie.end
    }
    aie.runtime_sequence @run(%arg0: memref<64xi32>, %arg1: memref<64xi32>) {
      aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 64 : i32, buffer_offset = 1024 : i32, column = 0 : i32, d0_size = 64 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 1 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 1 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 1 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 2 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
      aiex.npu.push_queue(0, 2, S2MM : 0) {bd_id = 0 : i32, issue_token = true, repeat_count = 0 : i32}
      aiex.npu.dma_memcpy_nd(%arg0[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0, 1]) {id = 0 : i64, metadata = @shim_dma_token_in_0_0_0} : memref<64xi32>
      aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 64 : i32, buffer_offset = 4096 : i32, column = 0 : i32, d0_size = 64 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 1 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_size = 1 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 1 : i32, lock_acq_id = 1 : i32, lock_acq_val = 1 : i32, lock_rel_id = 1 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 2 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
      aiex.npu.push_queue(0, 2, MM2S : 0) {bd_id = 1 : i32, issue_token = false, repeat_count = 0 : i32}
      aiex.npu.dma_memcpy_nd(%arg1[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0, 1]) {id = 1 : i64, issue_token = true, metadata = @shim_dma_token_out_0_0_0} : memref<64xi32>
      aiex.npu.dma_wait {symbol = @shim_dma_token_out_0_0_0}
    }
    aie.flow(%shim_noc_tile_0_0, DMA : 0, %tile_0_2, DMA : 0)
    aie.flow(%tile_0_2, DMA : 0, %shim_noc_tile_0_0, DMA : 0)
  }

The pass is proof-of-concept quality.
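For intuition, the token-gated ordering that the lowered locks enforce can be modeled as a straight-line program (a sketch only; the names and structure here are ours, not part of the dialect):

```python
# Illustrative model of the pipeline: inbound DMA -> core add-one -> outbound
# DMA, gated by two token counters standing in for the lowered locks.

def run_pipeline(host_input):
    token_in = 0   # models token_in_lock (lock 0, init 0)
    token_out = 0  # models token_out_lock (lock 1, init 0)

    # Inbound shim DMA: host -> core_in, then release token_in to 1.
    core_in = list(host_input)
    token_in += 1

    # Core: acquire token_in >= 1, add one to each element, release token_out.
    assert token_in >= 1
    core_out = [v + 1 for v in core_in]
    token_out += 1

    # Outbound DMA: acquire token_out at 1 (consuming it back to 0),
    # then copy core_out -> host.
    assert token_out == 1
    token_out -= 1
    return core_out
```

With the host test's `i + 24` input pattern, every output element comes back as `i + 25`.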

Comment on lines +44 to +45
std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>()
<< "\n";

[clang-format] reported by reviewdog 🐶

Suggested change
std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>()
<< "\n";
std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";

Comment on lines +49 to +50
std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>()
<< "\n";

[clang-format] reported by reviewdog 🐶

Suggested change
std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>()
<< "\n";
std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << "\n";

std::vector<uint32_t> srcVec;
srcVec.reserve(IN_SIZE);
for (int i = 0; i < IN_SIZE; i++)
srcVec.push_back(static_cast<uint32_t>(i+24));

[clang-format] reported by reviewdog 🐶

Suggested change
srcVec.push_back(static_cast<uint32_t>(i+24));
srcVec.push_back(static_cast<uint32_t>(i + 24));

std::cout << "\nPASS!\n\n";
return 0;
}
std::cout << "\nfailed. (errors: " << errors << "/" << OUT_SIZE << ")" << "\n\n";

[clang-format] reported by reviewdog 🐶

Suggested change
std::cout << "\nfailed. (errors: " << errors << "/" << OUT_SIZE << ")" << "\n\n";
std::cout << "\nfailed. (errors: " << errors << "/" << OUT_SIZE << ")"
<< "\n\n";

fifield and others added 5 commits January 15, 2026 20:12
- Add npu.writebd lowering for tile DMAs in AIEDmaToNpu
- Add tile_dmas integration tests (blockwrite_using_locks, writebd, writebd_tokens)
- Add test coverage for tile DMA buffer descriptor format differences
- Extensive modifications to aie-lower-memcpy to support runtime BD configuration:
  - Emit npu.writebd and npu.push_queue for tile-side BD configuration
  - Use buffer address attributes for physical offset calculation
  - Add npu.dma_wait after shim DMAs that issue tokens
  - Use channel 0 for both S2MM and MM2S directions
- Add test: npu-xrt/add_one_runtime_memcpy with aie-opt preprocessing
- Modify AIEDialect to expose buffer addresses