From 7c6fff0804f1db48fe4f53eade4b63a95a8bd5c5 Mon Sep 17 00:00:00 2001
From: Soren Soe <2106410+stsoe@users.noreply.github.com>
Date: Tue, 1 Apr 2025 13:54:47 -0700
Subject: [PATCH 1/4] Extend runner with an execution profile

This is WIP.  Adding execution profile data that constrains how
a run recipe is executed by the xrt::runner class.

The profile is a separate json that is paired with a recipe when
instantiating an xrt::runner.

The profile is optional, while the recipe is required.  The
application can use the xrt::runner APIs to bind external resources or
use the profile json to specify the binding and otherwise constrain
the execution of a recipe.

Signed-off-by: Soren Soe <2106410+stsoe@users.noreply.github.com>
---
 src/runtime_src/core/common/runner/README.md  | 565 +-----------------
 src/runtime_src/core/common/runner/profile.md |  21 +
 src/runtime_src/core/common/runner/recipe.md  | 562 +++++++++++++++++
 src/runtime_src/core/common/runner/runner.cpp | 205 ++++++-
 src/runtime_src/core/common/runner/runner.h   |   4 +
 .../core/common/runner/test/profile.json      |  23 +
 .../core/common/runner/test/recipe.json       |   3 +
 7 files changed, 810 insertions(+), 573 deletions(-)
 create mode 100644 src/runtime_src/core/common/runner/profile.md
 create mode 100644 src/runtime_src/core/common/runner/recipe.md
 create mode 100644 src/runtime_src/core/common/runner/test/profile.json

diff --git a/src/runtime_src/core/common/runner/README.md b/src/runtime_src/core/common/runner/README.md
index 891fcf7f4a5..91bda995032 100644
--- a/src/runtime_src/core/common/runner/README.md
+++ b/src/runtime_src/core/common/runner/README.md
@@ -1,561 +1,12 @@
 <!-- SPDX-License-Identifier: Apache-2.0 -->
-<!-- Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. -->
-# Run recipe for XRT
-
-A run-recipe defines how to execute a graph model using XRT.
-
-This directory contains a stand-alone `xrt::runner` class that reads and 
-executes a run-recipe json file.   The idea is to have tools, e.g. VAIML
-geneate the run-recipe along with xclbin and control code for kernels.
-
-The format (schema) of the recipe json is loosely defined.  The
-implementation of the runner drove some of the defintion of the json
-format.
-
-A run-recipe is associated with exactly one xclbin which, when loaded into
-a region (partition) on the device, can run the recipe.
-
-# JSON format
-
-There are three sections in the run-recipe.
-
-1. [header](#header)
-2. [resources](#resources)
-3. [execution](#execution)
-
-The `header` trivially contains the path (full name) of the xclbin that should
-be loaded before resources can be created or the recipe can be executed.
-
-The `resources` section defines all buffer objects, kernel objects,
-and cpu function objects used to execute the recipe. The resources are
-created as the run recipe is loaded. External input and output buffer
-may be bound later during the execution stage of recipe.
-
-The `execution` section defines how the resources are connected
-together during execution. It simply executes kernels and cpu
-functions that were previously defined in the resource section with
-arguments that were also defined in the resource section.  Execution
-of kernels can consume partial buffer input and produce partial buffer
-output per `size` and `offset` fields define as part of specifying the
-kernel arguments.
-
-## Header
-
-For the time being, the header stores nothing but the path to the
-xclbin.  The xclbin contains the kernel meta data used by XRT when
-xrt::kernel objects are created.  The xclbin contains PDIs for each
-kernel, the PDIs are loaded by firmware prior to running a kernel.
-
-The header section can be amended with other meta data as needed.
-
-```
-{
-  "header": {
-    "xclbin_path": "design.xclbin",
-  },
-  
-  ...
-}
-```
-
-The runner will use the xclbin from the `header` section to create an
-xrt::hw_context, which is subsequently used to create xrt::kernel
-objects.
-
-## Resources
-
-The resources section is a complete list of all objects that are used
-when the recipe is executed. Each kernel used in the `execution`
-section must be listed in the resources section.  All kernel argument
-buffers used by kernels in the `execution` section must be listed in
-the resources section.  Also all functions executed on the CPU must
-be listed in the resources section.
-
-### Kernel functions
-
-Kernels listed in the resoruces section result in runner creating
-`xrt::kernel` objects.  In XRT, the kernel objects are identified by
-name, which must match a kernel name in the xclbin.
-
-Kernels are constructed from the xclbin name and by specifying which
-xrt::hw_context should execute the kernel and what control code the
-kernel should execute.  The hardware context is created by the runner
-from the xclbin specified in the recipe `header` section, so kernels
-in the resources section must contain just the xclbin kernel name
-and the full path to an ELF with the control code.
-
-```
-  "resources": {
-    "kernels": [
-      {
-        "name": "k1",
-        "xclbin_kernel_name": "DPU",
-        "ctrlcode": "no-ctrl-packet.elf"
-      }
-    ]
-  },
-```
-
-The name of the kernel in resources section must be unique in the list
-of kernel instances, the name is used in the `execution` section to refer 
-to which instance should be executed.
-
-If a kernel is instantiated from the same xclbin kernel name and same
-control code, then only one such kernel isntance needs to be listed in
-the resources section.  Listing multiple kernel instances referring to
-the same xclbin kernel and using the same control code is not error,
-but is not necessary.
-
-### CPU functions
-
-Functions to be executed on the CPU are listed in the resource section
-along with a path to a library containing the individual function.
-The library will be runtime loaded (dlopen); it will expose functions
-through a function pointer that is returned through a query lookup
-method, which it returned through a library entry (extern "C") function.
-
-CPU function arguments are expected to be `xrt::bo` objects, for
-example format converting functions will take an input buffer and
-and populate an output buffer, both buffers must be specified in the
-resource buffer section of the recipe.
-
-A library path is relative to the install location of XRT based on 
-the environment value of `XILINX_XRT` or from its inferred location if
-not set.  On windows, the inferred location would be the driver store.
-
-```
-  "resources": {
-    "cpus": [
-      {
-        "name": "convert_ifm",
-        "library_path": "umd/convert.dll"
-      },
-      {
-        "name": "convert_ofm",
-        "library_path": "umd/convert.dll"
-      },
-      {
-        "name": "average_pool",
-        "library_path": "umd/operators.dll"
-      }
-    ]
-  },
-```
-
-### Buffer
-
-The buffer instances listed in the resources section refer to
-`xrt::bo` objects that are used during execution of kernels. The
-buffers can be graph inputs or outputs, which refer to application
-created input and output tensors, or they can be internal buffers used
-during execution of the compiled graph at the discretion of the
-compiler (VAIML).
-
-#### External buffers (graph input and output)
-
-External buffers (input and output) are created by the framework /
-application outside of the runner and bound to the recipe during
-execution.  The runner itself does not create `xrt::bo` objects for
-external buffers, but does rely on the framework to bind these buffers
-to runner object created from the recipe.   The external buffers must
-still be listed in the resources section and specify a name that can 
-be used when execution sets kernel arguments.
-
-```
-  "resources": {
-    "buffers": [
-      {
-        "name": "wts",
-        "type": "input",
-      },
-      {
-        "name": "ifm",
-        "type": "input",
-      },
-      {
-        "name": "ofm",
-        "type": "output",
-      }
-    ]
-  }
-
-``` 
-
-The `name` of the buffers in the resources section must be unique.
-The name is used in the `execution` section to refer to kernel or cpu
-buffer arguments.
-
-<!-- The `src` of the buffers is meant to refer to a tensor name in the
-graph, but the use of this field is TBD as it does not appear to be
-required.  The `name` itself is enough to identify the buffer, both
-within the recipe and for external frame works to bind external
-created buffers to the graph. -->
-
-#### Internal buffers
-
-Internal buffers are created and managed by the runner. These are
-buffers that are used internally within a graph to carry data from one
-kernel or cpu execution to another.
-
-These buffers are created and managed by runner, hence unlike the
-external buffers, the size of internal buffer size must be specified
-in the recipe.
-
-```
-  "resources": {
-    "buffers": [
-      {
-        "name": "ifm_int",
-        "type": "internal",
-        "size": "1024"
-      },
-      {
-        "name": "ofm_int",
-        "type": "internal",
-        "size": "1024"
-      },
-      {
-        "name": "b0",
-        "type": "internal",
-        "size": "1024"
-      },
-      {
-        "name": "b1",
-        "type": "internal:,
-        "size": "1024"
-      },
-      {
-        "name": "b2",
-        "type": "internal",
-        "size": "1024"
-      }
-    ]
-  }
-
-``` 
-The `size` is currently specified in bytes, we could add support
-K/M, e.g. `1048576 = 1024K = 1M`
-
-## Execution
-
-The execution section is an ordered list of xrt::kernel or cpu runs
-with arguments from the resources section.
-
-Before the runner can execute the recipe in the execution section, all
-graph inputs and outputs must be bound to the recipe. As mentioned
-earlier, external inputs and outputs are defined by the framework that
-uses the runner.  Typically these external inputs and outputs are not
-available at the time when the runner is initialized from the recipe
-json.  In other words, the runner can be created even before the
-framework has created input and output tensors, but it can of course
-not be executed until the inputs and outputs are defined. The runner
-API has methods that must be called to bind the external inputs and
-outputs.
-
-Arguments to a run can be a sub-buffer of the corresponding
-resource.  A buffer in the resources section refer to the full buffer,
-but a run can use just a portion of the resource.  By default
-a run argument will use the full buffer, but optional attributes in
-the json for a buffer can specify the size and an offset into the
-resource buffer.
-
-As an example below, the kernel resource `k1` is executed twice with 
-3 arguments. The 3rd input is a sub-buffer of the `ifm_int` resource, the
-4th is the full resource `wts`, and the finally the 5th is a
-sub-buffer of `ofm_int`.
-
-The example illustrates the calling of a CPU function from the `cpu`
-resources section.  The CPU function calls are passed buffers from the
-resources section and scalar values as needed.
-
-```
-  "execution": {
-    "runs": [
-      {
-        "name": "convert_ifm",
-        "where": "cpu",
-        "arguments" : [
-            { "name": "ifm", "argidx": 0 },
-            { "name": "ifm_int", "argidx": 1 }
-         ],
-         "constants" : [
-            { "value": "nchw2nchw4c", "type": "string", "argidx": 2 }
-         ]
-        ]
-      },
-      {
-        "name": "k1",
-        "arguments" : [
-            { "name": "ifm_int", "size": 512, "offset": 0, "argidx": 3 },
-            { "name": "wts", "argidx": 4 },
-            { "name": "ofm_int", "size": 512, "offset": 512, "argidx": 5 }
-        ]
-      },
-      {
-        "name": "k1",
-        "arguments" : [
-            { "name": "ifm_int", "size": 512, "offset": 512, "argidx": 3 },
-            { "name": "wts", "argidx": 4 },
-            { "name": "ofm_int", "size": 512, "offset": 0, "argidx": 5 }
-        ]
-      },
-      {
-        "name": "convert_ofm",
-        "where": "cpu"
-        "arguments" : [
-            { "name": "ofm_int", "argidx": 0 },
-            { "name": "ofm", "argidx": 1 }
-         ],
-         "constants" : [
-            { "value": "nchw4c2nchw", "argidx": 2 }
-         ]
-        ]
-      },
-      ...
-    ]
-  }
-```
-
-The runner internally creates sub-buffers out of the specified
-resource buffers for each run. Both external and internal
-resource buffers can be sliced and diced as required.
-
-The runner creates `xrt::run` or `xrt_core::cpu::run` objects out of
-the specified execution runs.  The runner creates a CPU or NPU runlist
-for each contiguous sequence of CPU runs or NPU runs specified in the
-run recipe. The runlist is inserted into a vector of runlists where
-each individual runlist will be executed in sequence, when the
-framework calls the runner API execute method.
-
-In addition to the buffer arguments referring to resource buffers, the
-xclbin kernels and cpu functions may have additional arguments that
-need to be set. For example the current DPU kernel have 8 arguments
-and some of these must be set to some sentinel value.  Here the
-argument with index 0, represents the kernel opcode which specifies
-the type of control packet used for the kernel resource object.  The
-value `3` implies transaction buffer.
-
-```
-  "execution": {
-    "runs": [
-      {
-        "name": "k1",
-        "arguments" : [
-            { "name": "wts", "argidx": 4 },
-            { "name": "ifm", "argidx": 3 },
-            { "name": "ofm", "argidx": 5 }
-        ],
-        "constants" : [
-            { "value": "3", "type": "int", "argidx": 0 },
-            { "value": "0", "type": "int", "argidx": 1 },
-            { "value": "0", "type": "int", "argidx": 2 },
-            { "value": "0", "type": "int", "argidx": 6 },
-            { "value": "0", "type": "int", "argidx": 7 }
-        ]
-      }
-    ]
-  }
-```
-
-# Complete run recipe
-
-For illustration here is a simple complete run-recipe.json file that
-has been validated on NPU.  There are no internal buffer and external
-input and output are consumed during one kernel execution.  See the 
-`runner/test/recipe.json` for an example leveraging cpu functions.
-
-```
-{
-  "header": {
-    "xclbin_path": "design.xclbin",
-  },
-  "resources": {
-    "buffers": [
-      {
-        "name": "wts",
-        "type": "input",
-      },
-      {
-        "name": "ifm",
-        "type": "input",
-      },
-      {
-        "name": "ofm",
-        "type": "output",
-      }
-    ],
-    "kernels": [
-      {
-        "name": "k1",
-        "xclbin_kernel_name": "DPU",
-        "ctrlcode": "no-ctrl-packet.elf"
-      }
-    ]
-  },
-  "execution": {
-    "runs": [
-      {
-        "name": "k1",
-        "arguments" : [
-            { "name": "wts", "argidx": 4 },
-            { "name": "ifm", "argidx": 3 },
-            { "name": "ofm", "argidx": 5 }
-         ],
-         "constants": [
-            { "value": "3", "type": "int", "argidx": 0 },
-            { "value": "0", "type": "int", "argidx": 1 },
-            { "value": "0", "type": "int", "argidx": 2 },
-            { "value": "0", "type": "int", "argidx": 6 },
-            { "value": "0", "type": "int", "argidx": 7 }
-        ]
-      }
-    ]
-  }
-}
-```
-
-# Runner API
-
-The runner is contructed from a recipe json file and a device object.
-The runner is a standard XRT C++ first class object with the following
-API.  Include documentation will be beefed up when the runner code is 
-moved to public XRT.
-
-```
-class runner_impl;
-class runner
-{
-  std::shared_ptr<runner_impl> m_impl;  // probably unique_ptr is enough
-public:
-  // ctor - Create runner from a recipe json
-  runner(const xrt::device& device, const std::string& recipe);
-
-  // bind_input() - Bind a buffer object to an input tensor
-  void
-  bind_input(const std::string& name, const xrt::bo& bo);
-
-  // bind_output() - Bind a buffer object to an output tensor
-  void
-  bind_output(const std::string& name, const xrt::bo& bo);
-
-  // execute() - Execute the runner
-  void
-  execute();
-
-  // wait() - Wait for the execution to complete
-  void
-  wait();
-};
-```
-
-# CPU library requirements
-
-The run recipe can refer to functions executed on the CPU.  These
-functions should be implemented in a shared library that can be 
-loaded at runtime by the runner based on `resources/cpus` section.
-
-A referenced library is loaded by the runner, which subsequently looks
-for exported entry point (symbol) called `open` to initialize the shared 
-library. The `open()` is supposed to return function objects for callback 
-functions within the library.   At present time, only one callback function
-is required is the `lookup()` function, which the runner 
-uses to lookup functions referenced in the recipe resources section.
-
-The `lookup()` function must return the callable function that the
-runner is requesting along with the number of arguments this function
-expects.  If the function the runner is looking for is not available,
-then the `lookup()` function should throw an exception (TODO: define
-the exact exception to throw).  The reason the `lookup()` function is
-not itself an exported "extern C" function like `open()` is that the
-call semantics must be C++ with the bells and whistles that follow
-(exceptions).
-
-The signature of the `extern "C"` exported `open()` function and the 
-C++ signature of the `lookup()` function is defined in `xrt_runner.h`
-under `namespace xrt::cpu { ... }`.
-
-```
-/**
- * The xrt::runner supports execution of CPU functions as well
- * as xrt::kernel objects.
- *
- * The CPU functions are implemented in runtime loaded dynamic
- * libraries. A library must define and export a function that
- * initializes a callback structure with a lookup function.
- *
- * The signature of the lookup function must be
- * @code
- *  void lookup_fn(const std::string& name, xrt::cpu::lookup_args* args)
- * @endcode
- * where the name is the name of the function to lookup and args is a
- * structure that the lookup function must populate with the function
- * information.
- *
- * The arguments to the CPU functions are elided via std::any and
- * the signature of the CPU functions is fixed to
- * @code
- *  void cpu_function(std::vector<std::any>& args)
- * @endcode
- * Internally, the CPU library unwraps the arguments and calls the
- * actual function.
- */
-namespace xrt::cpu {
-/**
- * struct lookup_args - argument structure for the lookup function
- *
- * The lookup function takes as arguments the name of the function
- * to lookup along with lookup_args to be populated with information
- * about the function.
- *
- * @num_args - number of arguments to function
- * @callable - a C++ function object wrapping the function
- *
- * The callable library functions uses type erasure on their arguments
- * through a std::vector of std::any objects.  The callable must
- * unwrap the std::any objects to its expected type, which is
- * cumbersome, but type safe. The type erased arguments allow the
- * runner to be generic and not tied to a specific function signature.
-*/
-struct lookup_args
-{
-  std::uint32_t num_args;
-  std::function<void(std::vector<std::any>&)> callable;
-};
-
-/**
- * struct library_init_args - argument structure for libray initialization
- *
- * The library initialization function is the only function exported
- * from the run time loaded library.  The library initialization
- * function is called by the runner when a resource references a
- * function in a library and the library is not already loaded.
- *
- * @lookup_fn - a callback function to be populated with the
- *   lookup function.
- *
- * The library initialization function is C callable exported symbol,
- * but returns a C++ function pointer to the lookup function.
-*/
-struct library_init_args
-{
-  std::function<void(const std::string&, lookup_args*)> lookup_fn;
-};
-
-/**
- * library_init_fn - type of the library initialization function
- * The name of the library initialization function is fixed to
- * "library_init".
-*/
-using library_init_fn = void (*)(library_init_args*);
-} // xrt::cpu
-
-```
-
-A unit test for the cpu library and corresponding sample run recipe
-that references the cpu library is under `test/cpulib.cpp` and
-`test/main.cpp`
-
-
+<!-- Copyright (C) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. -->
+# Runner infrastructure
 
+This directory contains xrt::runner infrastructure. The runner is
+broken into two json components.  First is the recipe that defines a
+model executed by the xrt::runner.  Second is the profile that defines
+the constraints under which the model is executed.
 
+- [recipe](recipe.md)
+- [profile](profile.md)
 
diff --git a/src/runtime_src/core/common/runner/profile.md b/src/runtime_src/core/common/runner/profile.md
new file mode 100644
index 00000000000..387c60f4186
--- /dev/null
+++ b/src/runtime_src/core/common/runner/profile.md
@@ -0,0 +1,21 @@
+<!-- SPDX-License-Identifier: Apache-2.0 -->
+<!-- Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved. -->
+# Execution profile for XRT runner
+
+An execution profile is an extension to a run recipe (see
+[recipe](recipe.md)).  It automates the run recipe by binding
+resources to the XRT runner that executes the run recipe.
+
+While the `xrt::runner` class can be used stand-alone by an
+application or framework that explicitly manages external resources,
+the execution profile extends the runner to also manage the external
+resources.
+
+An execution profile is useful for testing of a run recipe.  It allows
+for one external application controlling execution of a run recipe by
+defining:
+
+- how data is bound to resources
+- how validation is performed
+- how many times a run-recipe is executed and with what data
+
diff --git a/src/runtime_src/core/common/runner/recipe.md b/src/runtime_src/core/common/runner/recipe.md
new file mode 100644
index 00000000000..b1d325f47b4
--- /dev/null
+++ b/src/runtime_src/core/common/runner/recipe.md
@@ -0,0 +1,562 @@
+<!-- SPDX-License-Identifier: Apache-2.0 -->
+<!-- Copyright (C) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. -->
+# Run recipe for XRT
+
+A run recipe defines a graph model that can be executed by XRT.
+
+This directory contains a stand-alone `xrt::runner` class that reads and 
+executes a run recipe json file.   The idea is to have tools, e.g. VAIML
+generate the run recipe along with xclbin and control code for kernels.
+
+The schema of the recipe json is defined in `schema/recipe.schema.json`. The
+implementation of the runner drove some of the definition of the json
+format.
+
+A run recipe is associated with exactly one configuration (xclbin or
+config elf) which, when loaded into a region (partition) on the
+device, can run the recipe.
+
+# JSON format
+
+There are three sections in the run recipe.
+
+1. [header](#header)
+2. [resources](#resources)
+3. [execution](#execution)
+
+The `header` trivially contains the path (full name) of the
+configuration data that should be loaded before resources can be
+created or the recipe can be executed.
+
+The `resources` section defines all buffer objects, kernel objects,
+and cpu function objects used to execute the recipe. The resources are
+created as the run recipe is loaded. External input and output buffer
+may be bound later during the execution stage of recipe.
+
+The `execution` section defines how the resources are connected
+together during execution. It simply executes kernels and cpu
+functions that were previously defined in the resource section with
+arguments that were also defined in the resource section.  Execution
+of kernels can consume partial buffer input and produce partial buffer
+output per `size` and `offset` fields defined as part of specifying the
+kernel arguments.
+
+## Header
+
+For the time being, the header stores nothing but the path to the
+xclbin.  The xclbin contains the kernel meta data used by XRT when
+xrt::kernel objects are created.  The xclbin contains PDIs for each
+kernel, the PDIs are loaded by firmware prior to running a kernel.
+
+The header section can be amended with other meta data as needed.
+
+```
+{
+  "header": {
+    "xclbin": "design.xclbin",
+  },
+  
+  ...
+}
+```
+
+The runner will use the xclbin from the `header` section to create an
+xrt::hw_context, which is subsequently used to create xrt::kernel
+objects.
+
+## Resources
+
+The resources section is a complete list of all objects that are used
+when the recipe is executed. Each kernel used in the `execution`
+section must be listed in the resources section.  All kernel argument
+buffers used by kernels in the `execution` section must be listed in
+the resources section.  Also all functions executed on the CPU must
+be listed in the resources section.
+
+### Kernel functions
+
+Kernels listed in the resources section result in runner creating
+`xrt::kernel` objects.  In XRT, the kernel objects are identified by
+name, which must match a kernel instance name in the xclbin.
+
+Kernels are constructed from the instance name and what control code
+the kernel should execute.  The hardware context associated with the
+kernel is created by the runner from the xclbin specified in the
+recipe `header` section, so kernels in the resources section must
+contain just the kernel instance name and the full path to an ELF with
+the control code.
+
+```
+  "resources": {
+    "kernels": [
+      {
+        "name": "k1",
+        "instance": "DPU",
+        "ctrlcode": "no-ctrl-packet.elf"
+      }
+    ]
+  },
+```
+
+The name of the kernel in resources section must be unique in the list
+of kernel instances, the name is used in the `execution` section to refer 
+to which instance should be executed.
+
+If a kernel is instantiated from the same kernel instance name and same
+control code, then only one such kernel instance needs to be listed in
+the resources section.  Listing multiple kernel instances referring to
+the same xclbin kernel and using the same control code is not an error,
+but is not necessary.
+
+### CPU functions
+
+Functions to be executed on the CPU are listed in the resource section
+along with a path to a library containing the individual function.
+The library will be runtime loaded (dlopen); it will expose functions
+through a function pointer that is returned through a query lookup
+method, which is returned through a library entry (extern "C") function.
+
+CPU function arguments are expected to be `xrt::bo` objects, for
+example format converting functions will take an input buffer and
+populate an output buffer, both buffers must be specified in the
+resource buffer section of the recipe.
+
+A library path is relative to the install location of XRT based on 
+the environment value of `XILINX_XRT` or from its inferred location if
+not set.  On windows, the inferred location would be the driver store.
+
+```
+  "resources": {
+    "cpus": [
+      {
+        "name": "convert_ifm",
+        "library_path": "umd/convert.dll"
+      },
+      {
+        "name": "convert_ofm",
+        "library_path": "umd/convert.dll"
+      },
+      {
+        "name": "average_pool",
+        "library_path": "umd/operators.dll"
+      }
+    ]
+  },
+```
+
+### Buffer
+
+The buffer instances listed in the resources section refer to
+`xrt::bo` objects that are used during execution of kernels. The
+buffers can be graph inputs or outputs, which refer to application
+created input and output tensors, or they can be internal buffers used
+during execution of the compiled graph at the discretion of the
+compiler (VAIML).
+
+#### External buffers (graph input and output)
+
+External buffers (input and output) are created by the framework /
+application outside of the runner and bound to the recipe during
+execution.  The runner itself does not create `xrt::bo` objects for
+external buffers, but does rely on the framework to bind these buffers
+to runner object created from the recipe.   The external buffers must
+still be listed in the resources section and specify a name that can 
+be used when execution sets kernel arguments.
+
+```
+  "resources": {
+    "buffers": [
+      {
+        "name": "wts",
+        "type": "input",
+      },
+      {
+        "name": "ifm",
+        "type": "input",
+      },
+      {
+        "name": "ofm",
+        "type": "output",
+      }
+    ]
+  }
+
+``` 
+
+The `name` of the buffers in the resources section must be unique.
+The name is used in the `execution` section to refer to kernel or cpu
+buffer arguments.
+
+<!-- The `src` of the buffers is meant to refer to a tensor name in the
+graph, but the use of this field is TBD as it does not appear to be
+required.  The `name` itself is enough to identify the buffer, both
+within the recipe and for external frame works to bind external
+created buffers to the graph. -->
+
+#### Internal buffers
+
+Internal buffers are created and managed by the runner. These are
+buffers that are used internally within a graph to carry data from one
+kernel or cpu execution to another.
+
+These buffers are created and managed by runner, hence unlike the
+external buffers, the size of an internal buffer must be specified
+in the recipe.
+
+```
+  "resources": {
+    "buffers": [
+      {
+        "name": "ifm_int",
+        "type": "internal",
+        "size": "1024"
+      },
+      {
+        "name": "ofm_int",
+        "type": "internal",
+        "size": "1024"
+      },
+      {
+        "name": "b0",
+        "type": "internal",
+        "size": "1024"
+      },
+      {
+        "name": "b1",
+        "type": "internal",
+        "size": "1024"
+      },
+      {
+        "name": "b2",
+        "type": "internal",
+        "size": "1024"
+      }
+    ]
+  }
+
+``` 
+The `size` is currently specified in bytes.
+
+## Execution
+
+The execution section is an ordered list of kernel or cpu instances
+with arguments from the resources section. 
+
+Before the runner can execute the recipe in the execution section, all
+graph inputs and outputs must be bound to the recipe. As mentioned
+earlier, external inputs and outputs are defined by the framework that
+uses the runner.  Typically these external inputs and outputs are not
+available at the time when the runner is initialized from the recipe
+json.  In other words, the runner can be created even before the
+framework has created input and output tensors, but it can of course
+not be executed until the inputs and outputs are defined. The runner
+API has methods that must be called to bind the external inputs and
+outputs.
+
+Arguments to a run can be a sub-buffer of the corresponding
+resource.  A buffer in the resources section refers to the full buffer,
+but a run can use just a portion of the resource.  By default
+a run argument will use the full buffer, but optional attributes in
+the json for a buffer can specify the size and an offset into the
+resource buffer.
+
+As an example below, the kernel resource `k1` is executed twice with 
+3 arguments. The 3rd input is a sub-buffer of the `ifm_int` resource, the
+4th is the full resource `wts`, and finally the 5th is a
+sub-buffer of `ofm_int`.
+
+The example illustrates the calling of a CPU function from the `cpu`
+resources section.  The CPU function calls are passed buffers from the
+resources section and scalar values as needed.
+
+```
+  "execution": {
+    "runs": [
+      {
+        "name": "convert_ifm",
+        "where": "cpu",
+        "arguments" : [
+            { "name": "ifm", "argidx": 0 },
+            { "name": "ifm_int", "argidx": 1 }
+         ],
+         "constants" : [
+            { "value": "nchw2nchw4c", "type": "string", "argidx": 2 }
+         ]
+        ]
+      },
+      {
+        "name": "k1",
+        "arguments" : [
+            { "name": "ifm_int", "size": 512, "offset": 0, "argidx": 3 },
+            { "name": "wts", "argidx": 4 },
+            { "name": "ofm_int", "size": 512, "offset": 512, "argidx": 5 }
+        ]
+      },
+      {
+        "name": "k1",
+        "arguments" : [
+            { "name": "ifm_int", "size": 512, "offset": 512, "argidx": 3 },
+            { "name": "wts", "argidx": 4 },
+            { "name": "ofm_int", "size": 512, "offset": 0, "argidx": 5 }
+        ]
+      },
+      {
+        "name": "convert_ofm",
+        "where": "cpu",
+        "arguments" : [
+            { "name": "ofm_int", "argidx": 0 },
+            { "name": "ofm", "argidx": 1 }
+         ],
+         "constants" : [
+            { "value": "nchw4c2nchw", "type": "string",
+              "argidx": 2 }
+         ]
+      },
+      ...
+    ]
+  }
+```
+
+The runner internally creates sub-buffers out of the specified
+resource buffers for each run. Both external and internal
+resource buffers can be sliced and diced as required.
+
+The runner creates `xrt::run` or `xrt_core::cpu::run` objects out of
+the specified execution runs.  The runner creates a CPU or NPU runlist
+for each contiguous sequence of CPU runs or NPU runs specified in the
+run recipe. The runlist is inserted into a vector of runlists where
+each individual runlist will be executed in sequence, when the
+framework calls the runner API execute method.
+
+In addition to the buffer arguments referring to resource buffers, the
+xclbin kernels and cpu functions may have additional arguments that
+need to be set. For example the current DPU kernel has 8 arguments
+and some of these must be set to some sentinel value.  Here the
+argument with index 0 represents the kernel opcode, which specifies
+the type of control packet used for the kernel resource object.  The
+value `3` implies transaction buffer.
+
+```
+  "execution": {
+    "runs": [
+      {
+        "name": "k1",
+        "arguments" : [
+            { "name": "wts", "argidx": 4 },
+            { "name": "ifm", "argidx": 3 },
+            { "name": "ofm", "argidx": 5 }
+        ],
+        "constants" : [
+            { "value": "3", "type": "int", "argidx": 0 },
+            { "value": "0", "type": "int", "argidx": 1 },
+            { "value": "0", "type": "int", "argidx": 2 },
+            { "value": "0", "type": "int", "argidx": 6 },
+            { "value": "0", "type": "int", "argidx": 7 }
+        ]
+      }
+    ]
+  }
+```
+
+# Complete run recipe
+
+For illustration here is a simple complete run recipe.json file that
+has been validated on NPU.  There are no internal buffers, and the
+external inputs and output are consumed during one kernel execution.
+See the `runner/test/recipe.json` for an example leveraging cpu functions.
+
+```
+{
+  "header": {
+    "xclbin": "design.xclbin"
+  },
+  "resources": {
+    "buffers": [
+      {
+        "name": "wts",
+        "type": "input"
+      },
+      {
+        "name": "ifm",
+        "type": "input"
+      },
+      {
+        "name": "ofm",
+        "type": "output"
+      }
+    ],
+    "kernels": [
+      {
+        "name": "k1",
+        "instance": "DPU",
+        "ctrlcode": "no-ctrl-packet.elf"
+      }
+    ]
+  },
+  "execution": {
+    "runs": [
+      {
+        "name": "k1",
+        "arguments" : [
+            { "name": "wts", "argidx": 4 },
+            { "name": "ifm", "argidx": 3 },
+            { "name": "ofm", "argidx": 5 }
+         ],
+         "constants": [
+            { "value": "3", "type": "int", "argidx": 0 },
+            { "value": "0", "type": "int", "argidx": 1 },
+            { "value": "0", "type": "int", "argidx": 2 },
+            { "value": "0", "type": "int", "argidx": 6 },
+            { "value": "0", "type": "int", "argidx": 7 }
+        ]
+      }
+    ]
+  }
+}
+```
+
+# Runner API
+
+The runner is constructed from a recipe json file and a device object.
+The runner is a standard XRT C++ first class object with the following
+API.  Include documentation will be beefed up when the runner code is 
+moved to public XRT.
+
+```
+class runner_impl;
+class runner
+{
+  std::shared_ptr<runner_impl> m_impl;  // probably unique_ptr is enough
+public:
+  // ctor - Create runner from a recipe json
+  runner(const xrt::device& device, const std::string& recipe);
+
+  // bind_input() - Bind a buffer object to an input tensor
+  void
+  bind_input(const std::string& name, const xrt::bo& bo);
+
+  // bind_output() - Bind a buffer object to an output tensor
+  void
+  bind_output(const std::string& name, const xrt::bo& bo);
+
+  // execute() - Execute the runner
+  void
+  execute();
+
+  // wait() - Wait for the execution to complete
+  void
+  wait();
+};
+```
+
+# CPU library requirements
+
+The run recipe can refer to functions executed on the CPU.  These
+functions should be implemented in a shared library that can be 
+loaded at runtime by the runner based on `resources/cpus` section.
+
+A referenced library is loaded by the runner, which subsequently looks
+for an exported entry point (symbol) called `library_init` to initialize
+the shared library. `library_init()` is expected to return function
+objects for callback functions within the library.  At present, the only
+callback function required is the `lookup()` function, which the runner
+uses to look up functions referenced in the recipe resources section.
+
+The `lookup()` function must return the callable function that the
+runner is requesting along with the number of arguments this function
+expects.  If the function the runner is looking for is not available,
+then the `lookup()` function should throw an exception (TODO: define
+the exact exception to throw).  The reason the `lookup()` function is
+not itself an exported "extern C" function like `library_init()` is that
+the call semantics must be C++ with the bells and whistles that follow
+(exceptions).
+
+The signature of the `extern "C"` exported `library_init()` function and
+the C++ signature of the `lookup()` function are defined in `xrt_runner.h`
+under `namespace xrt::cpu { ... }`.
+
+```
+/**
+ * The xrt::runner supports execution of CPU functions as well
+ * as xrt::kernel objects.
+ *
+ * The CPU functions are implemented in runtime loaded dynamic
+ * libraries. A library must define and export a function that
+ * initializes a callback structure with a lookup function.
+ *
+ * The signature of the lookup function must be
+ * @code
+ *  void lookup_fn(const std::string& name, xrt::cpu::lookup_args* args)
+ * @endcode
+ * where the name is the name of the function to lookup and args is a
+ * structure that the lookup function must populate with the function
+ * information.
+ *
+ * The arguments to the CPU functions are elided via std::any and
+ * the signature of the CPU functions is fixed to
+ * @code
+ *  void cpu_function(std::vector<std::any>& args)
+ * @endcode
+ * Internally, the CPU library unwraps the arguments and calls the
+ * actual function.
+ */
+namespace xrt::cpu {
+/**
+ * struct lookup_args - argument structure for the lookup function
+ *
+ * The lookup function takes as arguments the name of the function
+ * to lookup along with lookup_args to be populated with information
+ * about the function.
+ *
+ * @num_args - number of arguments to function
+ * @callable - a C++ function object wrapping the function
+ *
+ * The callable library functions use type erasure on their arguments
+ * through a std::vector of std::any objects.  The callable must
+ * unwrap the std::any objects to its expected type, which is
+ * cumbersome, but type safe. The type erased arguments allow the
+ * runner to be generic and not tied to a specific function signature.
+*/
+struct lookup_args
+{
+  std::uint32_t num_args;
+  std::function<void(std::vector<std::any>&)> callable;
+};
+
+/**
+ * struct library_init_args - argument structure for library initialization
+ *
+ * The library initialization function is the only function exported
+ * from the run time loaded library.  The library initialization
+ * function is called by the runner when a resource references a
+ * function in a library and the library is not already loaded.
+ *
+ * @lookup_fn - a callback function to be populated with the
+ *   lookup function.
+ *
+ * The library initialization function is C callable exported symbol,
+ * but returns a C++ function pointer to the lookup function.
+*/
+struct library_init_args
+{
+  std::function<void(const std::string&, lookup_args*)> lookup_fn;
+};
+
+/**
+ * library_init_fn - type of the library initialization function
+ * The name of the library initialization function is fixed to
+ * "library_init".
+*/
+using library_init_fn = void (*)(library_init_args*);
+} // xrt::cpu
+
+```
+
+A unit test for the cpu library and corresponding sample run recipe
+that references the cpu library is under `test/cpulib.cpp` and
+`test/main.cpp`
+
+
+
+
+
diff --git a/src/runtime_src/core/common/runner/runner.cpp b/src/runtime_src/core/common/runner/runner.cpp
index d992c796519..064a491bc86 100644
--- a/src/runtime_src/core/common/runner/runner.cpp
+++ b/src/runtime_src/core/common/runner/runner.cpp
@@ -975,23 +975,169 @@ class recipe
   }
 }; // class recipe
 
+
+// A runner_impl (xrt::runner) always has a run recipe object and
+// optionally a execution profile object. The latter is optional and default
+// created from an in-mermory json.
+//
+// The profile implements the runner_impl bind APIs and
+// execute/wait APIs, these APIs forward to the run recipe object
+// and must be called for the default execution recipe.
+//
+// An external execution profile can be used to initialize run recipe
+// resources at runner initialization time bind
+// resources per the recipe.  The calling application can still
+// explicitly bind via the xrt::runner APIs, which may override
+// the binding done by the execution recipe.
+class profile
+{
+  class bindings
+  {
+    using name_t = std::string;
+    using path_t = std::string;
+
+    // Map of resource names to file paths. Ths comes directly from
+    // the profile json.
+    std::map<name_t, path_t> m_paths;
+
+    // Map of resource names to buffers.  The buffers are initialized
+    // with data loaded from the file path corresponding to the
+    // resource name.
+    std::map<name_t, xrt::bo> m_bindings;
+
+    // Create a map of resource names to file paths from the profile json
+    static std::map<name_t, path_t>
+    init_paths(const boost::property_tree::ptree& pt)
+    {
+      std::map<name_t, path_t> paths;
+      for (const auto& [name, node] : pt)
+        paths.emplace(name, node.get<std::string>("file"));
+
+      return paths;
+    }
+
+    // Create a map of resource names to buffers initialized with data
+    // from the file paths.  The data is cached in an artifacts::repo
+    static std::map<name_t, xrt::bo>
+    create_bindings(const xrt::device& device,
+                    const std::map<name_t, path_t>& paths,
+                    const artifacts::repo& repo)
+    {
+      std::map<name_t, xrt::bo> bindings;
+      for (const auto& [name, path] : paths) {
+        const auto& data = repo.get(path);
+        xrt::bo bo = xrt::ext::bo{device, data.size()};
+        auto bo_data = bo.map<char*>();
+        std::copy(data.data(), data.data() + data.size(), bo_data);
+        bindings.emplace(name, std::move(bo));
+      }
+      return bindings;
+    }
+
+    // Reset a specific binding to its original value.  The data is
+    // retrived from the artifacts repo data member that was cached
+    // during initialization of the profile bindings.
+    void
+    reset(const std::string& name, xrt::bo& bo, const artifacts::repo& repo)
+    {
+        const auto& data = repo.get(m_paths[name]);
+        if (bo.size() != data.size())
+          throw std::runtime_error("binding size mismatch during reset");
+
+        auto bo_data = bo.map<char*>();
+        std::copy(data.data(), data.data() + data.size(), bo_data);
+    }
+
+  public:
+    bindings() = default;
+
+    bindings(const xrt::device& device, const boost::property_tree::ptree& pt, const artifacts::repo& repo)
+      : m_paths{init_paths(pt)}
+      , m_bindings{create_bindings(device, m_paths, repo)}
+    {}
+
+    // Reset all bindings to their original values
+    void
+    reset(const artifacts::repo& repo)
+    {
+      for (auto& [name, bo] : m_bindings)
+        reset(name, bo, repo);
+    }
+
+    // Reset a specific binding to its original value
+    void
+    reset(const std::string& name, const artifacts::repo& repo)
+    {
+      auto& bo = m_bindings.at(name);
+      reset(name, bo, repo);
+    }
+
+    const std::map<std::string, xrt::bo>&
+    get_bindings() const
+    {
+      return m_bindings;
+    }
+  }; // class profile::bindings
+
+  class execution
+  {
+    size_t m_iterations = 1;
+    
+  }; // class profile::execution
+  
+private:
+  boost::property_tree::ptree m_profile;
+  artifacts::file_repo m_repo;
+  xrt::device m_device;
+  recipe* m_recipe = nullptr;
+  bindings m_bindings;
+
+  static boost::property_tree::ptree
+  load(const std::string& path)
+  {
+    boost::property_tree::ptree pt;
+    boost::property_tree::read_json(path, pt);
+    return pt;
+  }
+
+public:
+  profile(xrt::device device, recipe* rr, const std::string& profile)
+    : m_profile{load(profile)}
+    , m_device{std::move(device)}
+    , m_recipe{rr}
+    , m_bindings{m_device, m_profile.get_child("bindings"), m_repo}
+  {}
+
+  const std::map<std::string, xrt::bo>&
+  get_bo_bindings() const
+  {
+    return m_bindings.get_bindings();
+  }
+}; // class profile
+
 } // namespace
 
 namespace xrt_core {
 
-// class runner_impl -
+// class runner_impl - Insulated implementation of xrt::runner
+//
+// Manages a run recipe and an execution profile.
 //
-// A runner implementation is default created with one instance of a
-// recipe.  But the runner can be used by multiple threads and new
-// recipe instances are created for each thread as needed.
+// The recipe defines the resources and how to run a model.
 //
-// The runner can be created from any thread, but member functions
-// are thread specific.
+// The profile controls how resources are bound to the recipe and how
+// the recipe is executed, e.g. number of times, debug info,
+// validation, etc.
 class runner_impl
 {
-  //std::map<std::thread::id, recipe> m_recipes;
   recipe m_recipe;
-  //thread_local recipe m_thread_recipe;
+
+protected:
+  recipe*
+  get_recipe()
+  {
+    return &m_recipe;
+  }
 
 public:
   runner_impl(const xrt::device& device, const std::string& recipe)
@@ -1002,36 +1148,58 @@ class runner_impl
     : m_recipe{device, recipe, artifacts::ram_repo(artifacts)}
   {}
 
-  void
+  virtual ~runner_impl() = default;
+
+  virtual void
   bind_input(const std::string& name, const xrt::bo& bo)
   {
-    m_recipe.bind_input(name, bo);
+    m_recipe.bind(name, bo);
   }
 
-  void
+  virtual void
   bind_output(const std::string& name, const xrt::bo& bo)
   {
-    m_recipe.bind_output(name, bo);
+    m_recipe.bind(name, bo);
   }
 
-  void
+  virtual void
   bind(const std::string& name, const xrt::bo& bo)
   {
     m_recipe.bind(name, bo);
   }
 
-  void
+  virtual void
   execute()
   {
     m_recipe.execute();
   }
 
-  void
+  virtual void
   wait()
   {
     m_recipe.wait();
   }
-};
+}; // class runner_impl
+
+class profile_impl : public runner_impl
+{
+  profile m_profile;
+
+public:
+  profile_impl(const xrt::device& device, const std::string& recipe, const std::string& profile)
+    : runner_impl{device, recipe}
+    , m_profile{device, get_recipe(), profile}
+  {}
+
+  void
+  execute() override
+  {
+    for (auto& [name, bo] : m_profile.get_bo_bindings())
+      runner_impl::bind(name, bo);
+
+    runner_impl::execute();
+  }
+}; // class profile_impl
 
 ////////////////////////////////////////////////////////////////
 // Public runner interface APIs
@@ -1046,6 +1214,11 @@ runner(const xrt::device& device, const std::string& recipe, const artifacts_rep
   : m_impl{std::make_unique<runner_impl>(device, recipe, repo)}
 {}
 
+runner::
+runner(const xrt::device& device, const std::string& recipe, const std::string& profile)
+  : m_impl{std::make_unique<profile_impl>(device, recipe, profile)}
+{}
+
 void
 runner::
 bind_input(const std::string& name, const xrt::bo& bo)
diff --git a/src/runtime_src/core/common/runner/runner.h b/src/runtime_src/core/common/runner/runner.h
index 787c6b98c51..2d6d5d2c433 100644
--- a/src/runtime_src/core/common/runner/runner.h
+++ b/src/runtime_src/core/common/runner/runner.h
@@ -46,6 +46,10 @@ class runner
   XRT_CORE_COMMON_EXPORT
   runner(const xrt::device& device, const std::string& recipe, const artifacts_repository&);
 
+  // ctor - Create runner from a recipe json and execution profile json
+  XRT_CORE_COMMON_EXPORT
+  runner(const xrt::device& device, const std::string& recipe, const std::string& profile);
+
   // bind_input() - Bind a buffer object to an input tensor
   XRT_CORE_COMMON_EXPORT
   void
diff --git a/src/runtime_src/core/common/runner/test/profile.json b/src/runtime_src/core/common/runner/test/profile.json
new file mode 100644
index 00000000000..5044d03ed81
--- /dev/null
+++ b/src/runtime_src/core/common/runner/test/profile.json
@@ -0,0 +1,23 @@
+{
+  "version": "1.0",
+  "type": "execution profile",
+
+  "bindings": [
+    "wts": {
+      "file": "wts.bin"
+    },
+    "ifm": {
+      "file": "ifm.bin"
+    },
+    "ofm": {
+      "file": "ofm.bin"
+    }    
+  ]
+
+  "execution" : {
+    "iterations": 1,
+    "validation": {
+      "file": "gold.bin"
+    }
+  }
+}
diff --git a/src/runtime_src/core/common/runner/test/recipe.json b/src/runtime_src/core/common/runner/test/recipe.json
index fa4cf0896e8..c1bd3c0dd52 100644
--- a/src/runtime_src/core/common/runner/test/recipe.json
+++ b/src/runtime_src/core/common/runner/test/recipe.json
@@ -1,8 +1,10 @@
 {
   "version": "1.0",
+
   "header": {
     "xclbin": "design.xclbin"
   },
+
   "resources": {
     "buffers": [
       {
@@ -46,6 +48,7 @@
       }
     ]
   },
+
   "execution": {
     "runs": [
       {

From 908b3219b12992a3f4f993535fa0b4440e813638 Mon Sep 17 00:00:00 2001
From: Soren Soe <2106410+stsoe@users.noreply.github.com>
Date: Tue, 1 Apr 2025 16:08:18 -0700
Subject: [PATCH 2/4] Extend profile json with bind, init, and validate nodes

Signed-off-by: Soren Soe <2106410+stsoe@users.noreply.github.com>
---
 src/runtime_src/core/common/runner/runner.cpp | 283 +++++++++++++-----
 src/runtime_src/core/common/runner/runner.h   |  25 +-
 .../core/common/runner/test/CMakeLists.txt    |   7 +-
 .../core/common/runner/test/profile.json      |  39 ++-
 .../common/runner/test/runner-profile.cpp     | 102 +++++++
 5 files changed, 361 insertions(+), 95 deletions(-)
 create mode 100644 src/runtime_src/core/common/runner/test/runner-profile.cpp

diff --git a/src/runtime_src/core/common/runner/runner.cpp b/src/runtime_src/core/common/runner/runner.cpp
index 064a491bc86..ef31478afe4 100644
--- a/src/runtime_src/core/common/runner/runner.cpp
+++ b/src/runtime_src/core/common/runner/runner.cpp
@@ -33,6 +33,8 @@
 # pragma warning (pop)
 #endif
 
+#include <algorithm>
+#include <iostream>
 #include <istream>
 #include <map>
 #include <string>
@@ -48,6 +50,17 @@ namespace {
 
 const boost::property_tree::ptree default_ptree;
 
+template <typename OptionalType>
+static boost::property_tree::ptree
+get_optional(const OptionalType& node)
+{
+#if BOOST_VERSION >= 105600
+  return node.value();
+#else
+  return node.get();
+#endif
+}
+
 // struct streambuf - wrap a std::streambuf around an external buffer
 //
 // This is used create elf files from memory through a std::istream
@@ -110,22 +123,38 @@ class repo
 // Artifacts are loaded from disk and stored in persistent storage  
 class file_repo : public repo
 {
+  std::filesystem::path base_dir;
+
 public:
+  file_repo()
+    : base_dir{"."}
+  {}
+
+  file_repo(std::filesystem::path basedir)
+    : base_dir{std::move(basedir)}
+  {}
+
   const std::vector<char>&
   get(const std::string& path) const override
   {
-    if (auto it = m_data.find(path); it != m_data.end())
+    std::filesystem::path full_path = base_dir / path;
+    if (!std::filesystem::exists(full_path))
+      throw std::runtime_error{"File not found: " + full_path.string()};
+
+    auto key = full_path.string();
+    if (auto it = m_data.find(key); it != m_data.end())
       return (*it).second;
 
-    std::ifstream ifs(path, std::ios::binary);
+    std::ifstream ifs(key, std::ios::binary);
     if (!ifs)
-      throw std::runtime_error{"Failed to open file: " + path};
+      throw std::runtime_error{"Failed to open file: " + key};
 
     ifs.seekg(0, std::ios::end);
     std::vector<char> data(ifs.tellg());
     ifs.seekg(0, std::ios::beg);
     ifs.read(data.data(), data.size());
-    auto [itr, success] = m_data.emplace(path, std::move(data));
+    auto [itr, success] = m_data.emplace(key, std::move(data));
+    XRT_DEBUGF("artifacts::file_repo::get(%s) -> %s\n", path.c_str(), success ? "success" : "failure");
     
     return (*itr).second;
   }
@@ -149,6 +178,7 @@ class ram_repo : public repo
 
     if (auto it = m_reference.find(path); it != m_reference.end()) {
       auto [itr, success] = m_data.emplace(path, it->second);
+      XRT_DEBUGF("artifacts::ram_repo::get(%s) -> %s\n", path.c_str(), success ? "success" : "failure");
       return (*itr).second;
     }
 
@@ -177,12 +207,12 @@ get(const xrt::elf& elf)
 }
 
 static xrt::module
-get(const std::string& path, const artifacts::repo& repo)
+get(const std::string& path, const artifacts::repo* repo)
 {
   if (auto it = s_path2elf.find(path); it != s_path2elf.end())
     return get((*it).second);
 
-  auto& data = repo.get(path);
+  auto& data = repo->get(path);
   streambuf buf{data.data(), data.data() + data.size()};
   std::istream is{&buf};
   xrt::elf elf{is};
@@ -201,15 +231,15 @@ class recipe
     xrt::xclbin m_xclbin;
 
     static xrt::xclbin
-    read_xclbin(const boost::property_tree::ptree& pt, const artifacts::repo& repo)
+    read_xclbin(const boost::property_tree::ptree& pt, const artifacts::repo* repo)
     {
       auto path = pt.get<std::string>("xclbin");
-      auto& data = repo.get(path);
+      auto& data = repo->get(path);
       return xrt::xclbin{data};
     }
 
   public:
-    header(const boost::property_tree::ptree& pt, const artifacts::repo& repo)
+    header(const boost::property_tree::ptree& pt, const artifacts::repo* repo)
       : m_xclbin{read_xclbin(pt, repo)}
     {
       XRT_DEBUGF("Loaded xclbin: %s\n", m_xclbin.get_uuid().to_string().c_str());
@@ -352,7 +382,7 @@ class recipe
       // The kernel control module is created if necessary.
       static kernel
       create_kernel(const xrt::hw_context& hwctx, const boost::property_tree::ptree& pt,
-                    const artifacts::repo& repo)
+                    const artifacts::repo* repo)
       {
         auto name = pt.get<std::string>("name"); // required, default xclbin kernel name
         auto elf = pt.get<std::string>("ctrlcode", ""); // optional elf file
@@ -439,7 +469,7 @@ class recipe
     // create_kernels - create kernel objects from kernel property tree nodes
     static std::map<std::string, kernel>
     create_kernels(xrt::device device, const xrt::hw_context& hwctx,
-                   const boost::property_tree::ptree& pt, const artifacts::repo& repo)
+                   const boost::property_tree::ptree& pt, const artifacts::repo* repo)
     {
       std::map<std::string, kernel> kernels;
       for (const auto& [name, node] : pt)
@@ -461,7 +491,7 @@ class recipe
 
   public:
     resources(xrt::device device, const xrt::xclbin& xclbin,
-              const boost::property_tree::ptree& recipe, const artifacts::repo& repo)
+              const boost::property_tree::ptree& recipe, const artifacts::repo* repo)
       : m_device{std::move(device)}
       , m_hwctx{m_device, m_device.register_xclbin(xclbin)}
       , m_buffers{create_buffers(m_device, recipe.get_child("buffers"))}
@@ -923,7 +953,7 @@ class recipe
   }
 
 public:
-  recipe(xrt::device device, const std::string& path, const artifacts::repo& repo)
+  recipe(xrt::device device, const std::string& path, const artifacts::repo* repo)
     : m_device{std::move(device)}
     , m_recipe{load(path)}
     , m_header{m_recipe.get_child("header"), repo}
@@ -995,87 +1025,124 @@ class profile
   {
     using name_t = std::string;
     using path_t = std::string;
+    using binding_node = boost::property_tree::ptree;
+    using validate_node = boost::property_tree::ptree;
 
-    // Map of resource names to file paths. Ths comes directly from
-    // the profile json.
-    std::map<name_t, path_t> m_paths;
+    // Map of resource name to json binding element.  This comes
+    // directly from the profile json.
+    std::map<name_t, binding_node> m_bindings;
 
     // Map of resource names to buffers.  The buffers are initialized
     // with data loaded from the file path corresponding to the
     // resource name.
-    std::map<name_t, xrt::bo> m_bindings;
+    std::map<name_t, xrt::bo> m_bo_bindings;
 
-    // Create a map of resource names to file paths from the profile json
-    static std::map<name_t, path_t>
-    init_paths(const boost::property_tree::ptree& pt)
+    // Create a map of resource names to json binding nodes from the profile json
+    static std::map<name_t, binding_node>
+    init_bindings(const boost::property_tree::ptree& pt)
     {
-      std::map<name_t, path_t> paths;
+      std::map<name_t, binding_node> bindings;
       for (const auto& [name, node] : pt)
-        paths.emplace(name, node.get<std::string>("file"));
+        bindings.emplace(node.get<std::string>("name"), node);
 
-      return paths;
+      return bindings;
     }
 
     // Create a map of resource names to buffers initialized with data
     // from the file paths.  The data is cached in an artifacts::repo
     static std::map<name_t, xrt::bo>
-    create_bindings(const xrt::device& device,
-                    const std::map<name_t, path_t>& paths,
-                    const artifacts::repo& repo)
+    create_buffers(const xrt::device& device,
+                   const std::map<name_t, binding_node>& bindings,
+                   const artifacts::repo* repo)
     {
-      std::map<name_t, xrt::bo> bindings;
-      for (const auto& [name, path] : paths) {
-        const auto& data = repo.get(path);
+      std::map<name_t, xrt::bo> bos;
+      for (const auto& [name, node] : bindings) {
+        const auto& data = repo->get(node.get<std::string>("file"));
         xrt::bo bo = xrt::ext::bo{device, data.size()};
         auto bo_data = bo.map<char*>();
         std::copy(data.data(), data.data() + data.size(), bo_data);
-        bindings.emplace(name, std::move(bo));
+        bo.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+        bos.emplace(node.get<std::string>("name"), std::move(bo));
       }
-      return bindings;
+      return bos;
     }
 
-    // Reset a specific binding to its original value.  The data is
-    // retrived from the artifacts repo data member that was cached
-    // during initialization of the profile bindings.
-    void
-    reset(const std::string& name, xrt::bo& bo, const artifacts::repo& repo)
+    // Validate a resource buffer per the validate json node
+    static void
+    validate_buffer(xrt::bo& bo, const validate_node& node, const artifacts::repo* repo)
     {
-        const auto& data = repo.get(m_paths[name]);
-        if (bo.size() != data.size())
-          throw std::runtime_error("binding size mismatch during reset");
+      const auto& golden_data = repo->get(node.get<std::string>("file"));
+      // here we could extract offset and size of region to validate
+      
+      bo.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+      auto bo_data = bo.map<char*>();
+      if (bo.size() != golden_data.size())
+        throw std::runtime_error("Size mismatch during validation");
+
+      if (!std::equal(golden_data.data(), golden_data.data() + golden_data.size(), bo_data)) {
+        for (uint64_t i = 0; i < golden_data.size(); ++i) {
+          if (golden_data[i] != bo_data[i])
+            throw std::runtime_error("gold[" + std::to_string(i) + "] = " + std::to_string(golden_data[i])
+                                     + " does not match bo value " + std::to_string(bo_data[i]));
+        }
+      }
+    }
 
-        auto bo_data = bo.map<char*>();
-        std::copy(data.data(), data.data() + data.size(), bo_data);
+    // Initialize a resource buffer per the binding json node
+    static void
+    init_buffer(xrt::bo& bo, const binding_node& node)
+    {
+      // Get the pattern, which must be one character
+      auto pattern = node.get<std::string>("pattern");
+      if (pattern.size() != 1)
+        throw std::runtime_error("pattern size must be 1");
+      
+      // Fill the resource buffer with the pattern
+      auto bo_data = bo.map<char*>();
+      std::fill(bo_data, bo_data + bo.size(), pattern[0]);
+      bo.sync(XCL_BO_SYNC_BO_TO_DEVICE);
     }
 
   public:
     bindings() = default;
 
-    bindings(const xrt::device& device, const boost::property_tree::ptree& pt, const artifacts::repo& repo)
-      : m_paths{init_paths(pt)}
-      , m_bindings{create_bindings(device, m_paths, repo)}
+    bindings(const xrt::device& device, const boost::property_tree::ptree& pt, const artifacts::repo* repo)
+      : m_bindings{init_bindings(pt)}
+      , m_bo_bindings{create_buffers(device, m_bindings, repo)}
     {}
 
-    // Reset all bindings to their original values
+    // Validate resource buffers per json.  Validation is per bound buffer
+    // as defined in the profile json.
     void
-    reset(const artifacts::repo& repo)
+    validate(const artifacts::repo* repo)
     {
-      for (auto& [name, bo] : m_bindings)
-        reset(name, bo, repo);
+      for (auto& [name, node] : m_bindings) {
+        if (auto validate_node = node.get_child_optional("validate")) {
+          validate_buffer(m_bo_bindings.at(name), get_optional(validate_node), repo);
+        }
+      }
     }
 
-    // Reset a specific binding to its original value
+    // Init bindings per json.  Initialization is done by filling a
+    // pattern into a buffer that requires initialization.  The
+    // pattern is currently limited to a single character.
     void
-    reset(const std::string& name, const artifacts::repo& repo)
+    init()
     {
-      auto& bo = m_bindings.at(name);
-      reset(name, bo, repo);
+      for (auto& [name, node] : m_bindings) {
+        if (auto init_node = node.get_child_optional("init"))
+          init_buffer(m_bo_bindings.at(name), get_optional(init_node));
+      }
     }
 
-    const std::map<std::string, xrt::bo>&
-    get_bindings() const
+    // Bind resources to the recipe per json
+    void
+    bind(recipe* rr)
     {
-      return m_bindings;
+      for (auto& [name, node] : m_bindings) {
+        if (node.get<bool>("bind", false))
+          rr->bind(name, m_bo_bindings.at(name));
+      }
     }
   }; // class profile::bindings
 
@@ -1087,7 +1154,7 @@ class profile
   
 private:
   boost::property_tree::ptree m_profile;
-  artifacts::file_repo m_repo;
+  std::shared_ptr<artifacts::repo> m_repo;
   xrt::device m_device;
   recipe* m_recipe = nullptr;
   bindings m_bindings;
@@ -1101,17 +1168,50 @@ class profile
   }
 
 public:
-  profile(xrt::device device, recipe* rr, const std::string& profile)
+  profile(xrt::device device, recipe* rr, const std::string& profile,
+          std::shared_ptr<artifacts::repo> repo)
     : m_profile{load(profile)}
+    , m_repo{std::move(repo)}
     , m_device{std::move(device)}
     , m_recipe{rr}
-    , m_bindings{m_device, m_profile.get_child("bindings"), m_repo}
+    , m_bindings{m_device, m_profile.get_child("bindings"), m_repo.get()}
   {}
 
-  const std::map<std::string, xrt::bo>&
-  get_bo_bindings() const
+  void
+  bind()
   {
-    return m_bindings.get_bindings();
+    m_bindings.bind(m_recipe);
+  }
+
+  void
+  init()
+  {
+    m_bindings.init();
+  }
+
+  void
+  validate()
+  {
+    m_bindings.validate(m_repo.get());
+  }
+
+  void
+  execute()
+  {
+    // TBD, fill out execution control and pass control
+    // there.   This will handle iterations and other
+    bind();
+    init();
+
+    m_recipe->execute();
+  }
+
+  void
+  wait()
+  {
+    m_recipe->wait();
+
+    validate();
   }
 }; // class profile
 
@@ -1140,12 +1240,9 @@ class runner_impl
   }
 
 public:
-  runner_impl(const xrt::device& device, const std::string& recipe)
-    : m_recipe{device, recipe, artifacts::file_repo{}}
-  {}
-
-  runner_impl(const xrt::device& device, const std::string& recipe, const runner::artifacts_repository& artifacts)
-    : m_recipe{device, recipe, artifacts::ram_repo(artifacts)}
+  runner_impl(const xrt::device& device, const std::string& recipe,
+              const std::shared_ptr<artifacts::repo>& repo)
+    : m_recipe{device, recipe, repo.get()}
   {}
 
   virtual ~runner_impl() = default;
@@ -1186,18 +1283,23 @@ class profile_impl : public runner_impl
   profile m_profile;
 
 public:
-  profile_impl(const xrt::device& device, const std::string& recipe, const std::string& profile)
-    : runner_impl{device, recipe}
-    , m_profile{device, get_recipe(), profile}
+  profile_impl(const xrt::device& device,
+               const std::string& recipe, const std::string& profile,
+               const std::shared_ptr<artifacts::repo>& repo)
+    : runner_impl{device, recipe, repo}
+    , m_profile{device, get_recipe(), profile, repo}
   {}
 
   void
   execute() override
   {
-    for (auto& [name, bo] : m_profile.get_bo_bindings())
-      runner_impl::bind(name, bo);
+    m_profile.execute();
+  }
 
-    runner_impl::execute();
+  void
+  wait() override
+  {
+    m_profile.wait();
   }
 }; // class profile_impl
 
@@ -1205,18 +1307,41 @@ class profile_impl : public runner_impl
 // Public runner interface APIs
 ////////////////////////////////////////////////////////////////
 runner::
-runner(const xrt::device& device, const std::string& recipe)
-  : m_impl{std::make_unique<runner_impl>(device, recipe)}
+runner(const xrt::device& device,
+       const std::string& recipe)
+  : m_impl{std::make_unique<runner_impl>
+           (device, recipe, std::make_shared<artifacts::file_repo>())}
 {} 
   
 runner::
-runner(const xrt::device& device, const std::string& recipe, const artifacts_repository& repo)
-  : m_impl{std::make_unique<runner_impl>(device, recipe, repo)}
+runner(const xrt::device& device,
+       const std::string& recipe,
+       const std::filesystem::path& dir)
+  : m_impl{std::make_unique<runner_impl>
+           (device, recipe, std::make_shared<artifacts::file_repo>(dir))}
+{} 
+
+runner::
+runner(const xrt::device& device,
+       const std::string& recipe,
+       const artifacts_repository& repo)
+  : m_impl{std::make_unique<runner_impl>
+           (device, recipe, std::make_shared<artifacts::ram_repo>(repo))}
+{}
+
+runner::
+runner(const xrt::device& device,
+       const std::string& recipe, const std::string& profile)
+  : m_impl{std::make_unique<profile_impl>
+           (device, recipe, profile, std::make_shared<artifacts::file_repo>())}
 {}
 
 runner::
-runner(const xrt::device& device, const std::string& recipe, const std::string& profile)
-  : m_impl{std::make_unique<profile_impl>(device, recipe, profile)}
+runner(const xrt::device& device,
+       const std::string& recipe, const std::string& profile,
+       const std::filesystem::path& dir)
+  : m_impl{std::make_unique<profile_impl>
+           (device, recipe, profile, std::make_shared<artifacts::file_repo>(dir))}
 {}
 
 void
diff --git a/src/runtime_src/core/common/runner/runner.h b/src/runtime_src/core/common/runner/runner.h
index 2d6d5d2c433..c2684fb5798 100644
--- a/src/runtime_src/core/common/runner/runner.h
+++ b/src/runtime_src/core/common/runner/runner.h
@@ -6,6 +6,7 @@
 
 #include <any>
 #include <cstdint>
+#include <filesystem>
 #include <functional>
 #include <map>
 #include <memory>
@@ -37,19 +38,37 @@ class runner
    */
   using artifacts_repository = std::map<std::string, std::vector<char>>;
 
-  // ctor - Create runner from a recipe json
+  // ctor - Create runner from a recipe json.
+  // Any artifacts referenced by the recipe are looked up in the
+  // current directory.
   XRT_CORE_COMMON_EXPORT
   runner(const xrt::device& device, const std::string& recipe);
 
+  // ctor - Create runner from a recipe json and path to directory
+  // with artifacts
+  XRT_CORE_COMMON_EXPORT
+  runner(const xrt::device& device, const std::string& recipe,
+         const std::filesystem::path& artifacts_dir);
+
   // ctor - Create runner from a recipe json and artifacts repository
-  // The lifetime of the repo must extend the lifetime of the runner
+  // The repo is not copied so the lifetime of the repo must exceed
+  // the lifetime of the runner.
   XRT_CORE_COMMON_EXPORT
-  runner(const xrt::device& device, const std::string& recipe, const artifacts_repository&);
+  runner(const xrt::device& device, const std::string& recipe,
+         const artifacts_repository&);
 
   // ctor - Create runner from a recipe json and execution profile json
+  // Any artifacts referenced by recipe and profile are looked up in
+  // the current directory.
   XRT_CORE_COMMON_EXPORT
   runner(const xrt::device& device, const std::string& recipe, const std::string& profile);
 
+  // ctor - Create runner from a recipe json and execution profile
+  // json and path to directory with artifacts.
+  XRT_CORE_COMMON_EXPORT
+  runner(const xrt::device& device, const std::string& recipe, const std::string& profile,
+         const std::filesystem::path& artifacts_dir);
+
   // bind_input() - Bind a buffer object to an input tensor
   XRT_CORE_COMMON_EXPORT
   void
diff --git a/src/runtime_src/core/common/runner/test/CMakeLists.txt b/src/runtime_src/core/common/runner/test/CMakeLists.txt
index 1d519d5f40d..24a0ab9d57d 100644
--- a/src/runtime_src/core/common/runner/test/CMakeLists.txt
+++ b/src/runtime_src/core/common/runner/test/CMakeLists.txt
@@ -23,10 +23,15 @@ add_executable(recipe recipe.cpp)
 target_include_directories(recipe PRIVATE ${XRT_INCLUDE_DIRS} ${XRT_ROOT}/src/runtime_src)
 target_link_libraries(recipe PRIVATE XRT::xrt_coreutil)
 
+add_executable(runner-profile runner-profile.cpp)
+target_include_directories(runner-profile PRIVATE ${XRT_INCLUDE_DIRS} ${XRT_ROOT}/src/runtime_src)
+target_link_libraries(runner-profile PRIVATE XRT::xrt_coreutil)
+
 if (NOT WIN32)
   target_link_libraries(runner PRIVATE pthread uuid dl)
+  target_link_libraries(runner-profile PRIVATE pthread uuid dl)
   target_link_libraries(recipe PRIVATE pthread uuid dl)
 endif()
 
-install(TARGETS runner recipe)
+install(TARGETS runner runner-profile recipe)
 
diff --git a/src/runtime_src/core/common/runner/test/profile.json b/src/runtime_src/core/common/runner/test/profile.json
index 5044d03ed81..63fa38fe725 100644
--- a/src/runtime_src/core/common/runner/test/profile.json
+++ b/src/runtime_src/core/common/runner/test/profile.json
@@ -1,23 +1,38 @@
 {
   "version": "1.0",
-  "type": "execution profile",
 
   "bindings": [
-    "wts": {
-      "file": "wts.bin"
+    {
+      "name": "wts",
+      "file": "wts.bin",
+      "bind": true
     },
-    "ifm": {
-      "file": "ifm.bin"
+    {
+      "name": "ifm",
+      "file": "ifm.bin",
+      "bind": true
     },
-    "ofm": {
-      "file": "ofm.bin"
-    }    
-  ]
+    {
+      "name": "ofm",
+      "file": "ofm.bin",
+      "bind": true,
+      "init": {
+        "pattern": "A",
+      },
+      "validate": {
+        "size": 0,
+        "offset": 0,
+        "file": "gold.bin"
+      }
+    }
+  ],
 
   "execution" : {
-    "iterations": 1,
-    "validation": {
-      "file": "gold.bin"
+    "iterations": 2,
+    "iteration" : {
+      "bind": false,
+      "init": true,
+      "validate": true
     }
   }
 }
diff --git a/src/runtime_src/core/common/runner/test/runner-profile.cpp b/src/runtime_src/core/common/runner/test/runner-profile.cpp
new file mode 100644
index 00000000000..c5474ca011c
--- /dev/null
+++ b/src/runtime_src/core/common/runner/test/runner-profile.cpp
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: Apache-2.0
+// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.
+
+// This test configures and runs a recipe one time
+// g++ -g -std=c++17
+//   -I/home/stsoe/git/stsoe/XRT/build/Debug/opt/xilinx/xrt/include
+//   -I/home/stsoe/git/stsoe/XRT/src/runtime_src
+//   -L/home/stsoe/git/stsoe/XRT/build/Debug/opt/xilinx/xrt/lib
+//   -o runner-profile.exe runner-profile.cpp -lxrt_coreutil -pthread
+//
+// or
+//
+// mkdir build
+// cd build
+// cmake -DXILINX_XRT=/home/stsoe/git/stsoe/XRT/build/Debug/opt/xilinx/xrt
+//       -DXRT_ROOT=/home/stsoe/git/stsoe/XRT ..
+// cmake --build . --config Debug
+//
+// ./runner-profile.exe --recipe ... --profile ... [--dir ...]
+
+#include "xrt/xrt_device.h"
+#include "experimental/xrt_ext.h"
+#include "core/common/runner/runner.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <string>
+#include <vector>
+
+static void
+usage()
+{
+  std::cout << "usage: %s [options]\n";
+  std::cout << " --recipe <recipe.json> recipe file to run\n";
+  std::cout << " --profile <profile.json> execution profile\n";
+  std::cout << " [--dir <path>] directory containing artifacts (default: current dir)\n";
+  std::cout << "\n\n";
+  std::cout << "runner.exe --recipe recipe.json --profile profile.json\n";
+}
+
+static void
+run(const std::string& recipe,
+    const std::string& profile,
+    const std::string& dir)
+{
+  xrt::device device{0};
+  xrt_core::runner runner {device, recipe, profile, dir};
+  runner.execute();
+  runner.wait();
+}
+
+static void
+run(int argc, char* argv[])
+{
+  std::vector<std::string> args(argv+1,argv+argc);
+  std::string cur;
+  std::string recipe;
+  std::string profile;
+  std::string dir = ".";
+  for (auto& arg : args) {
+    if (arg == "-h") {
+      usage();
+      return;
+    }
+
+    if (arg[0] == '-') {
+      cur = arg;
+      continue;
+    }
+
+    if (cur == "--recipe")
+      recipe = arg;
+    else if (cur == "--profile")
+      profile = arg;
+    else if (cur == "--dir")
+      dir = arg;
+    else
+      throw std::runtime_error("Unknown option value " + cur + " " + arg);
+  }
+
+  run(recipe, profile, dir);
+}
+
+int
+main(int argc, char **argv)
+{
+  try {
+    run(argc, argv);
+    return 0;
+  }
+  catch (const std::exception& ex) {
+    std::cerr << "Error: " << ex.what() << '\n';
+  }
+  catch (...) {
+    std::cerr << "Unknown error\n";
+  }
+  return 1;
+
+}

From b9be79e669c44283b6df53aa4ef9a757e880c250 Mon Sep 17 00:00:00 2001
From: Soren Soe <2106410+stsoe@users.noreply.github.com>
Date: Thu, 3 Apr 2025 10:53:49 -0700
Subject: [PATCH 3/4] Few more keys to profile::execution

Signed-off-by: Soren Soe <2106410+stsoe@users.noreply.github.com>
---
 src/runtime_src/core/common/runner/runner.cpp | 91 +++++++++++++++----
 1 file changed, 72 insertions(+), 19 deletions(-)

diff --git a/src/runtime_src/core/common/runner/runner.cpp b/src/runtime_src/core/common/runner/runner.cpp
index ef31478afe4..cbc58da51c0 100644
--- a/src/runtime_src/core/common/runner/runner.cpp
+++ b/src/runtime_src/core/common/runner/runner.cpp
@@ -1148,16 +1148,61 @@ class profile
 
   class execution
   {
-    size_t m_iterations = 1;
+    using iteration_node = boost::property_tree::ptree;
+    profile* m_profile;
+    size_t m_iterations;
+    iteration_node m_iteration;
+
+    void
+    execute_iteration(size_t idx)
+    {
+      // (Re)bind buffers to recipe if requested
+      if (m_iteration.get<bool>("bind"))
+        m_profile->bind();
+      
+      // Initialize buffers if requested
+      if (m_iteration.get<bool>("init"))
+        m_profile->init();
+      
+      m_profile->execute_recipe();
+
+      // Wait for execution to complete if requested
+      if (m_iteration.get<bool>("wait"))
+        m_profile->wait_recipe();
+
+      // Validate if requested (implies wait)
+      if (m_iteration.get<bool>("validate"))
+        m_profile->validate();
+    }
+
+  public:
+    execution(profile* pr, const boost::property_tree::ptree& pt)
+      : m_profile(pr)
+      , m_iterations(pt.get<size_t>("iterations"))
+      , m_iteration(pt.get_child("iteration"))
+    {
+      // Bind buffers to the recipe prior to executing the recipe
+      m_profile->bind();
+    }
+      
+    void
+    execute()
+    {
+      for (size_t i = 0; i < m_iterations; ++i)
+        execute_iteration(i);
+    }
     
   }; // class profile::execution
   
 private:
+  friend class bindings;  // embedded class
+  friend class execution; // embedded class
   boost::property_tree::ptree m_profile;
   std::shared_ptr<artifacts::repo> m_repo;
   xrt::device m_device;
   recipe* m_recipe = nullptr;
   bindings m_bindings;
+  execution m_execution;
 
   static boost::property_tree::ptree
   load(const std::string& path)
@@ -1167,16 +1212,6 @@ class profile
     return pt;
   }
 
-public:
-  profile(xrt::device device, recipe* rr, const std::string& profile,
-          std::shared_ptr<artifacts::repo> repo)
-    : m_profile{load(profile)}
-    , m_repo{std::move(repo)}
-    , m_device{std::move(device)}
-    , m_recipe{rr}
-    , m_bindings{m_device, m_profile.get_child("bindings"), m_repo.get()}
-  {}
-
   void
   bind()
   {
@@ -1196,22 +1231,40 @@ class profile
   }
 
   void
-  execute()
+  execute_recipe()
   {
-    // TBD, fill out execution control and pass control
-    // there.   This will handle iterations and other
-    bind();
-    init();
-
     m_recipe->execute();
   }
 
   void
-  wait()
+  wait_recipe()
   {
     m_recipe->wait();
+  }
+  
+
+public:
+  profile(xrt::device device, recipe* rr, const std::string& profile,
+          std::shared_ptr<artifacts::repo> repo)
+    : m_profile{load(profile)}
+    , m_repo{std::move(repo)}
+    , m_device{std::move(device)}
+    , m_recipe{rr}
+    , m_bindings{m_device, m_profile.get_child("bindings"), m_repo.get()}
+    , m_execution(this, m_profile.get_child("execution"))
+  {}
 
-    validate();
+  void
+  execute()
+  {
+    m_execution.execute();
+  }
+
+  void
+  wait()
+  {
+    // waiting is controlled through execution in json
+    // so a noop here
   }
 }; // class profile
 

From 18750843eec81c52de1f3c73288a8d2b06a7b451 Mon Sep 17 00:00:00 2001
From: Soren Soe <2106410+stsoe@users.noreply.github.com>
Date: Thu, 3 Apr 2025 13:53:16 -0700
Subject: [PATCH 4/4] Make artifacts repo return string_view

The std::vector return type by reference was awkward.
Should really be a std::span (c++20).

Add an xrt::xclbin ctor that takes a std::string_view.

Add comments to runner.cpp.

Signed-off-by: Soren Soe <2106410+stsoe@users.noreply.github.com>
---
 .../core/common/api/xrt_device.cpp            |   2 +-
 .../core/common/api/xrt_xclbin.cpp            |   5 +
 src/runtime_src/core/common/runner/runner.cpp | 184 +++++++++++++-----
 .../include/xrt/experimental/xrt_xclbin.h     |  14 ++
 4 files changed, 151 insertions(+), 54 deletions(-)

diff --git a/src/runtime_src/core/common/api/xrt_device.cpp b/src/runtime_src/core/common/api/xrt_device.cpp
index dc2b2c3ea02..a627cf8d958 100644
--- a/src/runtime_src/core/common/api/xrt_device.cpp
+++ b/src/runtime_src/core/common/api/xrt_device.cpp
@@ -612,7 +612,7 @@ xrtDeviceLoadXclbinFile(xrtDeviceHandle dhdl, const char* fnm)
 {
   try {
     return xdp::native::profiling_wrapper(__func__, [dhdl, fnm]{
-      xrt::xclbin xclbin{fnm};
+      xrt::xclbin xclbin{std::string{fnm}};
       auto device = device_cache.get_or_error(dhdl);
       device->load_xclbin(xclbin);
       return 0;
diff --git a/src/runtime_src/core/common/api/xrt_xclbin.cpp b/src/runtime_src/core/common/api/xrt_xclbin.cpp
index 8f9d60f8914..fc857d6aa84 100644
--- a/src/runtime_src/core/common/api/xrt_xclbin.cpp
+++ b/src/runtime_src/core/common/api/xrt_xclbin.cpp
@@ -1035,6 +1035,11 @@ xclbin(const std::vector<char>& data)
   : detail::pimpl<xclbin_impl>(std::make_shared<xclbin_full>(data))
 {}
 
+xclbin::
+xclbin(const std::string_view& data)
+  : detail::pimpl<xclbin_impl>(std::make_shared<xclbin_full>(std::vector<char>{data.begin(), data.end()}))
+{}
+
 xclbin::
 xclbin(const axlf* top)
   : detail::pimpl<xclbin_impl>(std::make_shared<xclbin_full>(top))
diff --git a/src/runtime_src/core/common/runner/runner.cpp b/src/runtime_src/core/common/runner/runner.cpp
index cbc58da51c0..ed6ea6c2d65 100644
--- a/src/runtime_src/core/common/runner/runner.cpp
+++ b/src/runtime_src/core/common/runner/runner.cpp
@@ -38,6 +38,7 @@
 #include <istream>
 #include <map>
 #include <string>
+#include <string_view>
 #include <tuple>
 #include <utility>
 #include <variant>
@@ -115,8 +116,17 @@ class repo
 public:
   virtual ~repo() = default;
 
-  virtual const std::vector<char>&
+  // Should be std::span, but not until c++20
+  virtual const std::string_view
   get(const std::string& path) const = 0;
+
+  // Should be std::span, but not until c++20
+  static std::string_view
+  to_sv(const std::vector<char>& vec)
+  {
+    // return {vec.begin(), vec.end()};
+    return {vec.data(), vec.size()};
+  }
 };
 
 // class file_repo - file system artifact repository
@@ -134,7 +144,7 @@ class file_repo : public repo
     : base_dir{std::move(basedir)}
   {}
 
-  const std::vector<char>&
+  const std::string_view
   get(const std::string& path) const override
   {
     std::filesystem::path full_path = base_dir / path;
@@ -143,7 +153,7 @@ class file_repo : public repo
 
     auto key = full_path.string();
     if (auto it = m_data.find(key); it != m_data.end())
-      return (*it).second;
+      return to_sv((*it).second);
 
     std::ifstream ifs(key, std::ios::binary);
     if (!ifs)
@@ -156,7 +166,7 @@ class file_repo : public repo
     auto [itr, success] = m_data.emplace(key, std::move(data));
     XRT_DEBUGF("artifacts::file_repo::get(%s) -> %s\n", path.c_str(), success ? "success" : "failure");
     
-    return (*itr).second;
+    return to_sv((*itr).second);
   }
 };
 
@@ -170,16 +180,16 @@ class ram_repo : public repo
     : m_reference{data}
   {}
 
-  const std::vector<char>&
+  const std::string_view
   get(const std::string& path) const override
   {
     if (auto it = m_data.find(path); it != m_data.end())
-      return (*it).second;
+      return to_sv((*it).second);
 
     if (auto it = m_reference.find(path); it != m_reference.end()) {
       auto [itr, success] = m_data.emplace(path, it->second);
       XRT_DEBUGF("artifacts::ram_repo::get(%s) -> %s\n", path.c_str(), success ? "success" : "failure");
-      return (*itr).second;
+      return to_sv((*itr).second);
     }
 
     throw std::runtime_error{"Failed to find artifact: " + path};
@@ -212,7 +222,7 @@ get(const std::string& path, const artifacts::repo* repo)
   if (auto it = s_path2elf.find(path); it != s_path2elf.end())
     return get((*it).second);
 
-  auto& data = repo->get(path);
+  auto data = repo->get(path);
   streambuf buf{data.data(), data.data() + data.size()};
   std::istream is{&buf};
   xrt::elf elf{is};
@@ -234,7 +244,7 @@ class recipe
     read_xclbin(const boost::property_tree::ptree& pt, const artifacts::repo* repo)
     {
       auto path = pt.get<std::string>("xclbin");
-      auto& data = repo->get(path);
+      auto data = repo->get(path);
       return xrt::xclbin{data};
     }
 
@@ -1005,39 +1015,59 @@ class recipe
   }
 }; // class recipe
 
-
-// A runner_impl (xrt::runner) always has a run recipe object and
-// optionally a execution profile object. The latter is optional and default
-// created from an in-mermory json.
+// class profile - Execution profile
 //
-// The profile implements the runner_impl bind APIs and
-// execute/wait APIs, these APIs forward to the run recipe object
-// and must be called for the default execution recipe.
+// The profile class controls how a run recipe is bound to external
+// resources and how the recipe is executed.
 //
-// An external execution profile can be used to initialize run recipe
-// resources at runner initialization time bind
-// resources per the recipe.  The calling application can still
-// explicitly bind via the xrt::runner APIs, which may override
-// the binding done by the execution recipe.
+// An execution profile can be used to initialize run recipe resources
+// at runner initialization time by binding resources per the recipe.
+// The calling application can still explicitly bind via the
+// xrt::runner APIs, which may override the binding done by the
+// execution profile.
 class profile
 {
+  // class bindings - represents the bindings sections of a profile json
+  //
+  // {
+  //   "name": buffer name in recipe
+  //   "file": (optional with init) if present use to initialize the buffer
+  //   "size": (required if no file) the size of the buffer
+  //   "init": (optional) how to initialize a buffer
+  //   "validate": how to validate a buffer after execution
+  // }
+  // 
+  // The bindings section specifies what xrt::bo objects to create for
+  // external buffers. The buffers are bound to the recipe prior to
+  // first execution.
+  // 
+  // A binding can specify a file from which the buffer should be
+  // initialized.  If a "file" is specified, the buffer is created with
+  // the size of the file unless "size" is also specified, in which case
+  // the buffer has exactly "size" bytes and at most "size" bytes of the
+  // file are used to initialize the buffer.
+  //
+  // If "init" is specified, then it defines how the buffer should be
+  // initialized. "init" takes precedence over "file" if "file" is also
+  // specified, potentially overwriting an already initialized buffer.
+  //
+  // If "validate" is specified then it has instructions on how to
+  // validate a buffer after executing the recipe.
   class bindings
   {
+    // Convenience types for readability
     using name_t = std::string;
     using path_t = std::string;
     using binding_node = boost::property_tree::ptree;
     using validate_node = boost::property_tree::ptree;
 
-    // Map of resource name to json binding element.  This comes
-    // directly from the profile json.
+    // Map of resource name to json binding element.
     std::map<name_t, binding_node> m_bindings;
 
-    // Map of resource names to buffers.  The buffers are initialized
-    // with data loaded from the file path corresponding to the
-    // resource name.
-    std::map<name_t, xrt::bo> m_bo_bindings;
+    // Map of resource names to XRT buffer objects.
+    std::map<name_t, xrt::bo> m_xrt_bos;
 
-    // Create a map of resource names to json binding nodes from the profile json
+    // Create a map of resource names to json binding nodes
     static std::map<name_t, binding_node>
     init_bindings(const boost::property_tree::ptree& pt)
     {
@@ -1048,8 +1078,11 @@ class profile
       return bindings;
     }
 
-    // Create a map of resource names to buffers initialized with data
-    // from the file paths.  The data is cached in an artifacts::repo
+    // Create a map of resource names to XRT buffer objects.
+    // Initialize the BO with data from the file if any.
+    // The size of the xrt::bo is either the size of the "file"
+    // if present, or it is the "size" per json.  An explicit
+    // "size" always has precedence.
     static std::map<name_t, xrt::bo>
     create_buffers(const xrt::device& device,
                    const std::map<name_t, binding_node>& bindings,
@@ -1057,21 +1090,32 @@ class profile
     {
       std::map<name_t, xrt::bo> bos;
       for (const auto& [name, node] : bindings) {
-        const auto& data = repo->get(node.get<std::string>("file"));
-        xrt::bo bo = xrt::ext::bo{device, data.size()};
-        auto bo_data = bo.map<char*>();
-        std::copy(data.data(), data.data() + data.size(), bo_data);
-        bo.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+        auto size = node.get<size_t>("size", 0);
+        auto file = node.get<std::string>("file", "");
+        auto data = file.empty() ? std::string_view{} : repo->get(file);
+        size = size ? size : data.size(); // specified size has precedence
+        xrt::bo bo = xrt::ext::bo{device, size};
+        if (!data.empty()) {
+          auto bo_data = bo.map<char*>();
+          std::copy(data.data(), data.data() + std::min<size_t>(size, data.size()), bo_data);
+          bo.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+        }
         bos.emplace(node.get<std::string>("name"), std::move(bo));
       }
       return bos;
     }
 
-    // Validate a resource buffer per the validate json node
+    // Validate a resource buffer per profile.json validate json node
+    // "validate": {
+    //   "size": 0,   // unused for now
+    //   "offset": 0, // unused for now
+    //   "file": "gold.bin"
+    //  }
+
     static void
     validate_buffer(xrt::bo& bo, const validate_node& node, const artifacts::repo* repo)
     {
-      const auto& golden_data = repo->get(node.get<std::string>("file"));
+      auto golden_data = repo->get(node.get<std::string>("file"));
       // here we could extract offset and size of region to validate
       
       bo.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
@@ -1079,12 +1123,15 @@ class profile
       if (bo.size() != golden_data.size())
         throw std::runtime_error("Size mismatch during validation");
 
-      if (!std::equal(golden_data.data(), golden_data.data() + golden_data.size(), bo_data)) {
-        for (uint64_t i = 0; i < golden_data.size(); ++i) {
-          if (golden_data[i] != bo_data[i])
-            throw std::runtime_error("gold[" + std::to_string(i) + "] = " + std::to_string(golden_data[i])
-                                     + " does not match bo value " + std::to_string(bo_data[i]));
-        }
+      if (std::equal(golden_data.data(), golden_data.data() + golden_data.size(), bo_data))
+        return;
+
+      // Error
+      for (uint64_t i = 0; i < golden_data.size(); ++i) {
+        if (golden_data[i] != bo_data[i])
+          throw std::runtime_error
+            ("gold[" + std::to_string(i) + "] = " + std::to_string(golden_data[i])
+             + " does not match bo value in bo " + std::to_string(bo_data[i]));
       }
     }
 
@@ -1108,7 +1155,7 @@ class profile
 
     bindings(const xrt::device& device, const boost::property_tree::ptree& pt, const artifacts::repo* repo)
       : m_bindings{init_bindings(pt)}
-      , m_bo_bindings{create_buffers(device, m_bindings, repo)}
+      , m_xrt_bos{create_buffers(device, m_bindings, repo)}
     {}
 
     // Validate resource buffers per json.  Validation is per bound buffer
@@ -1118,7 +1165,7 @@ class profile
     {
       for (auto& [name, node] : m_bindings) {
         if (auto validate_node = node.get_child_optional("validate")) {
-          validate_buffer(m_bo_bindings.at(name), get_optional(validate_node), repo);
+          validate_buffer(m_xrt_bos.at(name), get_optional(validate_node), repo);
         }
       }
     }
@@ -1131,7 +1178,7 @@ class profile
     {
       for (auto& [name, node] : m_bindings) {
         if (auto init_node = node.get_child_optional("init"))
-          init_buffer(m_bo_bindings.at(name), get_optional(init_node));
+          init_buffer(m_xrt_bos.at(name), get_optional(init_node));
       }
     }
 
@@ -1141,11 +1188,37 @@ class profile
     {
       for (auto& [name, node] : m_bindings) {
         if (node.get<bool>("bind", false))
-          rr->bind(name, m_bo_bindings.at(name));
+          rr->bind(name, m_xrt_bos.at(name));
       }
     }
   }; // class profile::bindings
 
+  // class execution - represents the execution section of a profile json
+  //
+  // {
+  //  "execution" : {
+  //    "iterations": 2,
+  //    "iteration" : {
+  //      "bind": false,
+  //      "init": true,
+  //      "wait": true,
+  //      "validate": true
+  //    }
+  //  }
+  //
+  // The execution section specifies how a recipe should be executed.
+  // The number of iterations specifies how many times the recipe should
+  // be executed when the application calls xrt::runner::execute().
+  //
+  // The behavior of an iteration is specified within the iteration sub-node.
+  // - "bind" indicates if buffers should be re-bound to the
+  //   recipe before an iteration.
+  // - "init" indicates if buffers should be initialized per what is
+  //   specified in the binding element.
+  // - "wait" says that execution should wait for completion between
+  //   iterations and after the last iteration.
+  // - "validate" means buffer validation per what is specified in
+  //   the binding element.
   class execution
   {
     using iteration_node = boost::property_tree::ptree;
@@ -1181,10 +1254,12 @@ class profile
       , m_iterations(pt.get<size_t>("iterations"))
       , m_iteration(pt.get_child("iteration"))
     {
-      // Bind buffers to the recipe prior to executing the recipe
+      // Bind buffers to the recipe prior to executing the recipe. This
+      // will bind the buffers which have binding::bind set to true.
       m_profile->bind();
     }
-      
+
+    // Execute the profile
     void
     execute()
     {
@@ -1199,7 +1274,6 @@ class profile
   friend class execution; // embedded class
   boost::property_tree::ptree m_profile;
   std::shared_ptr<artifacts::repo> m_repo;
-  xrt::device m_device;
   recipe* m_recipe = nullptr;
   bindings m_bindings;
   execution m_execution;
@@ -1244,13 +1318,17 @@ class profile
   
 
 public:
-  profile(xrt::device device, recipe* rr, const std::string& profile,
+  // profile - constructor
+  //
+  // Reads json, creates xrt::bo bindings to recipe and initializes
+  // execution. The repository is used for looking up artifacts.
+  // The recipe is what the profile binds to and what it executes.
+  profile(const xrt::device& device, recipe* rr, const std::string& profile,
           std::shared_ptr<artifacts::repo> repo)
     : m_profile{load(profile)}
     , m_repo{std::move(repo)}
-    , m_device{std::move(device)}
     , m_recipe{rr}
-    , m_bindings{m_device, m_profile.get_child("bindings"), m_repo.get()}
+    , m_bindings{device, m_profile.get_child("bindings"), m_repo.get()}
     , m_execution(this, m_profile.get_child("execution"))
   {}
 
diff --git a/src/runtime_src/core/include/xrt/experimental/xrt_xclbin.h b/src/runtime_src/core/include/xrt/experimental/xrt_xclbin.h
index dc3d40289ad..d467b9909bb 100644
--- a/src/runtime_src/core/include/xrt/experimental/xrt_xclbin.h
+++ b/src/runtime_src/core/include/xrt/experimental/xrt_xclbin.h
@@ -18,6 +18,7 @@
 # include <utility>
 # include <vector>
 # include <string>
+# include <string_view>
 #endif
 
 /**
@@ -624,6 +625,19 @@ class xclbin : public detail::pimpl<xclbin_impl>
   explicit
   xclbin(const std::vector<char>& data);
 
+  /**
+   * xclbin() - Constructor from raw data
+   *
+   * @param data
+   *  Raw data of xclbin
+   *
+   * The raw data of the xclbin can be deleted after calling the
+   * constructor.
+   */
+  XRT_API_EXPORT
+  explicit
+  xclbin(const std::string_view& data);
+
   /**
    * xclbin() - Constructor from raw data
    *