From c22624b9eaf273619a9c07dd87c84e42d6f86853 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 18 Jan 2026 16:04:55 +0000
Subject: [PATCH 1/4] Initial plan

From f664d2a229dbad03f8657d3874426ce0e71a7202 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 18 Jan 2026 16:13:25 +0000
Subject: [PATCH 2/4] Fix tensor type mismatch by pre-scanning tensor metadata
 before allocation

This change addresses the issue where the allocated device memory doesn't
match the tensor type stored in the file. The fix implements a two-pass
approach:

1. Pre-scan: read all tensor metadata and data from the file upfront
2. Create tensors using the actual types from the file (not assumed types)
3. Load weights from the pre-scanned buffer

This ensures that memory allocation on the device corresponds to the correct
tensor type from the file, fixing mixed-precision model loading.

Co-authored-by: lhpqaq <63844184+lhpqaq@users.noreply.github.com>
---
 src/whisper.cpp | 175 ++++++++++++++++++++++++++----------------------
 1 file changed, 95 insertions(+), 80 deletions(-)

diff --git a/src/whisper.cpp b/src/whisper.cpp
index 082e7619e07..4326a59cbe6 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -1684,6 +1684,60 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
 
     const size_t n_tensors = 10 /* input */ + 15 + 15*n_audio_layer + 24*n_text_layer;
 
+    // Pre-scan tensor metadata and data from the file.
+    // This allows us to create tensors with the correct type from the file,
+    // ensuring proper memory allocation on the device.
+    struct tensor_info {
+        ggml_type type;
+        int32_t ne[4];
+        int32_t n_dims;
+        std::vector<uint8_t> data;
+    };
+    std::map<std::string, tensor_info> tensor_data_map;
+
+    // Read all tensor metadata and data from the file
+    {
+        while (true) {
+            int32_t n_dims;
+            int32_t length;
+            int32_t ttype;
+
+            read_safe(loader, n_dims);
+            read_safe(loader, length);
+            read_safe(loader, ttype);
+
+            if (loader->eof(loader->context)) {
+                break;
+            }
+
+            tensor_info info;
+            info.n_dims = n_dims;
+            info.type = ggml_type(ttype);
+            info.ne[0] = 1;
+            info.ne[1] = 1;
+            info.ne[2] = 1;
+            info.ne[3] = 1;
+
+            int32_t nelements = 1;
+            for (int i = 0; i < n_dims; ++i) {
+                read_safe(loader, info.ne[i]);
+                nelements *= info.ne[i];
+            }
+
+            std::string name;
+            std::vector<char> tmp(length);
+            loader->read(loader->context, &tmp[0], tmp.size());
+            name.assign(&tmp[0], tmp.size());
+
+            // Calculate and read tensor data
+            const size_t tensor_data_size = ggml_row_size(info.type, info.ne[0]) * (nelements / info.ne[0]);
+            info.data.resize(tensor_data_size);
+            loader->read(loader->context, info.data.data(), tensor_data_size);
+
+            tensor_data_map[name] = std::move(info);
+        }
+    }
+
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
     auto get_ctx = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
         auto it = ctx_map.find(buft);
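The pre-scan above sizes each tensor's payload as row_size(type, ne[0]) * (nelements / ne[0]), where ggml_row_size() accounts for block-quantized types whose rows are stored as whole blocks. A minimal standalone sketch of that arithmetic (the 384 x 1500 F16 shape is only an example, not taken from the patch):

    #include <cstdint>
    #include <cstdio>
    #include "ggml.h"

    int main() {
        // hypothetical header values as the loader would read them
        const int32_t ne[2]  = { 384, 1500 };
        const int32_t nelems = ne[0] * ne[1];
        const enum ggml_type ttype = GGML_TYPE_F16;

        // same formula as the pre-scan: bytes per row times number of rows;
        // ggml_row_size() is what keeps this correct for quantized types too
        const size_t nbytes = ggml_row_size(ttype, ne[0]) * (nelems / ne[0]);
        printf("%s %d x %d -> %zu bytes\n", ggml_type_name(ttype), ne[0], ne[1], nbytes);
        return 0;
    }

For F16 this is 384 * 2 bytes per row over 1500 rows, i.e. 1,152,000 bytes.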
@@ -1712,6 +1766,24 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
     buft_list_t buft_list = make_buft_list(wctx.params);
 
     auto create_tensor = [&](asr_tensor type, asr_system system, ggml_tensor * meta, int layer = 0) -> ggml_tensor * {
+        // Get the tensor name and look up its actual type from the pre-scanned file data
+        std::string tensor_name = format(ASR_TENSOR_NAMES.at(system).at(type), layer);
+        auto it = tensor_data_map.find(tensor_name);
+        if (it != tensor_data_map.end()) {
+            // Use the type from the file to ensure proper memory allocation
+            const tensor_info & info = it->second;
+            if (meta->type != info.type) {
+                // Update meta tensor type to match the file
+                meta->type = info.type;
+                // Update strides based on new type
+                meta->nb[0] = ggml_type_size(meta->type);
+                meta->nb[1] = meta->nb[0] * (meta->ne[0] / ggml_blck_size(meta->type));
+                for (int i = 2; i < GGML_MAX_DIMS; i++) {
+                    meta->nb[i] = meta->nb[i-1] * meta->ne[i-1];
+                }
+            }
+        }
+
         ggml_op op = ASR_TENSOR_INFO.at(type);
         ggml_backend_buffer_type_t buft = select_weight_buft(hparams, meta, op, buft_list);
         if (!buft) {
@@ -1721,7 +1793,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         ggml_context * ctx = get_ctx(buft);
         ggml_tensor * tensor = ggml_dup_tensor(ctx, meta);
 
-        model.tensors[format(ASR_TENSOR_NAMES.at(system).at(type), layer)] = tensor;
+        model.tensors[tensor_name] = tensor;
 
         return tensor;
     };
@@ -1858,39 +1930,13 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         }
     }
 
-    // load weights
+    // load weights from pre-scanned tensor data
     {
         size_t total_size = 0;
 
         model.n_loaded = 0;
 
-        std::vector<uint8_t> read_buf;
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ttype;
-
-            read_safe(loader, n_dims);
-            read_safe(loader, length);
-            read_safe(loader, ttype);
-
-            if (loader->eof(loader->context)) {
-                break;
-            }
-
-            int32_t nelements = 1;
-            int32_t ne[4] = { 1, 1, 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                read_safe(loader, ne[i]);
-                nelements *= ne[i];
-            }
-
-            std::string name;
-            std::vector<char> tmp(length); // create a buffer
-            loader->read(loader->context, &tmp[0], tmp.size()); // read to buffer
-            name.assign(&tmp[0], tmp.size());
-
+        for (auto & [name, info] : tensor_data_map) {
             if (model.tensors.find(name) == model.tensors.end()) {
                 WHISPER_LOG_ERROR("%s: unknown tensor '%s' in model file\n", __func__, name.data());
                 return false;
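The stride update inside create_tensor mirrors ggml's own layout rule: nb[0] is the size of one element block, nb[1] is one full row (ne[0] divided by the block size, counted in blocks), and higher dimensions multiply up from there. A self-contained sketch of that recomputation, assuming ne[0] is a multiple of the block size of the new type (the helper name is illustrative):

    #include "ggml.h"

    // Sketch only: retype a not-yet-allocated meta tensor and rebuild its strides,
    // using the same arithmetic as the create_tensor lambda above.
    static void retype_meta(struct ggml_tensor * meta, enum ggml_type new_type) {
        meta->type  = new_type;
        meta->nb[0] = ggml_type_size(new_type);                                // bytes per block
        meta->nb[1] = meta->nb[0] * (meta->ne[0] / ggml_blck_size(new_type));  // bytes per row
        for (int i = 2; i < GGML_MAX_DIMS; i++) {
            meta->nb[i] = meta->nb[i-1] * meta->ne[i-1];
        }
    }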
@@ -1898,77 +1944,46 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             }
 
             auto tensor = model.tensors[name.data()];
 
+            // Verify tensor properties match
+            const int32_t nelements = info.ne[0] * info.ne[1] * info.ne[2] * info.ne[3];
             if (ggml_nelements(tensor) != nelements) {
                 WHISPER_LOG_ERROR("%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
                 WHISPER_LOG_ERROR("%s: shape: [%d, %d, %d], expected: [%d, %d, %d]\n",
-                    __func__, ne[0], ne[1], ne[2], (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2]);
+                    __func__, info.ne[0], info.ne[1], info.ne[2], (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2]);
                 return false;
             }
 
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2]) {
+            if (tensor->ne[0] != info.ne[0] || tensor->ne[1] != info.ne[1] || tensor->ne[2] != info.ne[2]) {
                 WHISPER_LOG_ERROR("%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d], expected [%d, %d, %d]\n",
-                    __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], ne[0], ne[1], ne[2]);
+                    __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], info.ne[0], info.ne[1], info.ne[2]);
                 return false;
             }
 
-            // Calculate size based on file's tensor type
-            const size_t file_tensor_size = ggml_row_size(ggml_type(ttype), ne[0]) * (nelements / ne[0]);
-            const size_t expected_tensor_size = ggml_nbytes(tensor);
-
-            // For mixed precision models, the tensor type in file may differ from the type
-            // the tensor was created with. We need to handle this carefully.
-            if (tensor->type != ggml_type(ttype)) {
-                // Mixed precision: tensor created with one type, file has another
-                // We need to update the tensor's type to match the file
-                WHISPER_LOG_DEBUG("%s: tensor '%s' type mismatch (expected %s, file has %s)\n",
-                    __func__, name.data(), ggml_type_name(tensor->type), ggml_type_name(ggml_type(ttype)));
-
-                // Check if the allocated buffer is large enough for the file's data
-                if (file_tensor_size > expected_tensor_size) {
-                    WHISPER_LOG_ERROR("%s: tensor '%s' buffer too small: allocated %zu bytes for %s, but file needs %zu bytes for %s\n",
-                        __func__, name.data(), expected_tensor_size, ggml_type_name(tensor->type),
-                        file_tensor_size, ggml_type_name(ggml_type(ttype)));
-                    return false;
-                }
-
-                // Update tensor type to match the file
-                tensor->type = ggml_type(ttype);
-
-                // Update tensor strides (nb) based on new type
-                tensor->nb[0] = ggml_type_size(tensor->type);
-                tensor->nb[1] = tensor->nb[0] * (tensor->ne[0] / ggml_blck_size(tensor->type));
-                for (int i = 2; i < GGML_MAX_DIMS; i++) {
-                    tensor->nb[i] = tensor->nb[i-1] * tensor->ne[i-1];
-                }
-            } else {
-                // Normal case: types match, verify size
-                if (file_tensor_size != expected_tensor_size) {
-                    WHISPER_LOG_ERROR("%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                        __func__, name.data(), expected_tensor_size, file_tensor_size);
-                    return false;
-                }
+            // Type should already match since we used the file's type during tensor creation
+            if (tensor->type != info.type) {
+                WHISPER_LOG_ERROR("%s: tensor '%s' has wrong type: expected %s, got %s\n",
+                    __func__, name.data(), ggml_type_name(info.type), ggml_type_name(tensor->type));
+                return false;
             }
-
-            // Now read the data - use the file's size
-            const size_t bytes_to_read = file_tensor_size;
+
+            const size_t bytes_to_write = info.data.size();
 
             if (ggml_backend_buffer_is_host(tensor->buffer)) {
-                // for the CPU and Metal backend, we can read directly into the tensor
-                loader->read(loader->context, tensor->data, bytes_to_read);
+                // for the CPU and Metal backend, we can copy directly into the tensor
+                memcpy(tensor->data, info.data.data(), bytes_to_write);
                 BYTESWAP_TENSOR(tensor);
             } else {
-                // read into a temporary buffer first, then copy to device memory
-                read_buf.resize(bytes_to_read);
-
-                loader->read(loader->context, read_buf.data(), read_buf.size());
-
-                ggml_backend_tensor_set(tensor, read_buf.data(), 0, bytes_to_read);
+                // copy to device memory
+                ggml_backend_tensor_set(tensor, info.data.data(), 0, bytes_to_write);
             }
 
-            total_size += bytes_to_read;
+            total_size += bytes_to_write;
             model.n_loaded++;
         }
 
+        // Clear the pre-scanned data to free memory
+        tensor_data_map.clear();
+
         WHISPER_LOG_INFO("%s: model size = %7.2f MB\n", __func__, total_size/1e6);
 
         if (model.n_loaded == 0) {
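The copy path at the end of the new loading loop branches on where the tensor was allocated: host-visible buffers (CPU, Metal with unified memory) can be written with a plain memcpy, anything else goes through the backend API. A condensed sketch of that pattern, assuming src holds exactly ggml_nbytes(tensor) bytes (byte-swapping omitted):

    #include <cstdint>
    #include <cstring>
    #include <vector>
    #include "ggml.h"
    #include "ggml-backend.h"

    // Sketch of the upload branch used above.
    static void upload_weights(struct ggml_tensor * tensor, const std::vector<uint8_t> & src) {
        if (ggml_backend_buffer_is_host(tensor->buffer)) {
            memcpy(tensor->data, src.data(), src.size());                // direct write into host memory
        } else {
            ggml_backend_tensor_set(tensor, src.data(), 0, src.size()); // copy to device memory
        }
    }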
From f6fcf09064ff979d700267822bcb9d9c2b3d3639 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 18 Jan 2026 16:16:21 +0000
Subject: [PATCH 3/4] Address code review feedback

- Move the EOF check before reading partial header data
- Add a comment explaining the temporary memory-doubling tradeoff
- Add a warning when a tensor is not found in the pre-scanned data
- Use name.c_str() consistently instead of name.data()

Co-authored-by: lhpqaq <63844184+lhpqaq@users.noreply.github.com>
---
 src/whisper.cpp | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/src/whisper.cpp b/src/whisper.cpp
index 4326a59cbe6..e8ce7ca373d 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -1696,20 +1696,26 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
     std::map<std::string, tensor_info> tensor_data_map;
 
     // Read all tensor metadata and data from the file
+    // Note: This approach temporarily doubles memory usage during model loading.
+    // This is necessary because the loader interface doesn't support seeking,
+    // so we must read tensor data before we can determine the correct types for allocation.
     {
         while (true) {
             int32_t n_dims;
-            int32_t length;
-            int32_t ttype;
 
             read_safe(loader, n_dims);
-            read_safe(loader, length);
-            read_safe(loader, ttype);
 
+            // Check for EOF after reading the first field of the header
             if (loader->eof(loader->context)) {
                 break;
            }
 
+            int32_t length;
+            int32_t ttype;
+
+            read_safe(loader, length);
+            read_safe(loader, ttype);
+
             tensor_info info;
             info.n_dims = n_dims;
             info.type = ggml_type(ttype);
@@ -1782,6 +1788,9 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
                     meta->nb[i] = meta->nb[i-1] * meta->ne[i-1];
                 }
             }
+        } else {
+            // Tensor not found in pre-scanned data - this shouldn't happen with a valid model file
+            WHISPER_LOG_WARN("%s: tensor '%s' not found in model file, using default type\n", __func__, tensor_name.c_str());
         }
 
         ggml_op op = ASR_TENSOR_INFO.at(type);
@@ -1938,16 +1947,16 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
 
         for (auto & [name, info] : tensor_data_map) {
             if (model.tensors.find(name) == model.tensors.end()) {
-                WHISPER_LOG_ERROR("%s: unknown tensor '%s' in model file\n", __func__, name.data());
+                WHISPER_LOG_ERROR("%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
                 return false;
             }
 
-            auto tensor = model.tensors[name.data()];
+            auto tensor = model.tensors[name];
 
             // Verify tensor properties match
             const int32_t nelements = info.ne[0] * info.ne[1] * info.ne[2] * info.ne[3];
             if (ggml_nelements(tensor) != nelements) {
-                WHISPER_LOG_ERROR("%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
+                WHISPER_LOG_ERROR("%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
                 WHISPER_LOG_ERROR("%s: shape: [%d, %d, %d], expected: [%d, %d, %d]\n",
                     __func__, info.ne[0], info.ne[1], info.ne[2], (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2]);
                 return false;
@@ -1955,14 +1964,14 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             if (tensor->ne[0] != info.ne[0] || tensor->ne[1] != info.ne[1] || tensor->ne[2] != info.ne[2]) {
                 WHISPER_LOG_ERROR("%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d], expected [%d, %d, %d]\n",
-                    __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], info.ne[0], info.ne[1], info.ne[2]);
+                    __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], info.ne[0], info.ne[1], info.ne[2]);
                 return false;
             }
 
             // Type should already match since we used the file's type during tensor creation
             if (tensor->type != info.type) {
                 WHISPER_LOG_ERROR("%s: tensor '%s' has wrong type: expected %s, got %s\n",
-                    __func__, name.data(), ggml_type_name(info.type), ggml_type_name(tensor->type));
+                    __func__, name.c_str(), ggml_type_name(info.type), ggml_type_name(tensor->type));
                 return false;
             }
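The first review item above is about read ordering: the loop must check eof() after reading only the first header field, otherwise the final, empty iteration consumes three indeterminate values before noticing the end of the stream. A small illustration of the fixed ordering over a plain std::ifstream (the helper name is illustrative, not from the patch):

    #include <cstdint>
    #include <fstream>

    // Read one tensor header prefix; returns false on a clean end-of-file.
    static bool read_header_prefix(std::ifstream & fin, int32_t & n_dims, int32_t & length, int32_t & ttype) {
        fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
        if (!fin.good()) {
            return false;                    // EOF (or error) detected before trusting n_dims
        }
        fin.read(reinterpret_cast<char *>(&length), sizeof(length));
        fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
        return fin.good();
    }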
From 9422867400711a01f525c8cb86f42b9ca180167e Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 18 Jan 2026 16:38:04 +0000
Subject: [PATCH 4/4] Improve tensor type handling by deferring device memory
 allocation

Instead of pre-scanning all tensor data into memory (which doubles memory
usage), this change:

1. Adds skip, seek, and tell functions to the whisper_model_loader interface
2. First pass: scans tensor metadata and skips over the data using skip (no
   extra memory)
3. Creates tensors with the correct types from the file before buffer allocation
4. Seeks back to the tensor section for the second-pass data loading
5. Maintains backwards compatibility for loaders without seek support

This addresses the reviewer feedback to avoid doubling memory usage while
still fixing the tensor type mismatch issue.

Co-authored-by: lhpqaq <63844184+lhpqaq@users.noreply.github.com>
---
 include/whisper.h |  11 +++
 src/whisper.cpp   | 247 +++++++++++++++++++++++++++++++++++-----------
 2 files changed, 199 insertions(+), 59 deletions(-)

diff --git a/include/whisper.h b/include/whisper.h
index f4cc6bf7abd..dfed3dd431e 100644
--- a/include/whisper.h
+++ b/include/whisper.h
@@ -156,6 +156,17 @@ extern "C" {
         size_t (*read)(void * ctx, void * output, size_t read_size);
         bool (*eof)(void * ctx);
         void (*close)(void * ctx);
+        // Optional: skip forward by offset bytes.
+        // If NULL, skipping is not supported.
+        // Returns true on success, false on failure.
+        bool (*skip)(void * ctx, size_t offset);
+        // Optional: seek to absolute position in the file.
+        // If NULL, absolute seeking is not supported.
+        // Returns true on success, false on failure.
+        bool (*seek)(void * ctx, size_t offset);
+        // Optional: get current position in the file.
+        // If NULL, position tracking is not supported.
+        size_t (*tell)(void * ctx);
     } whisper_model_loader;
 
     // grammar element type
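The three new members are deliberately optional, so existing loaders continue to compile and run unchanged. A loader that cannot seek (for example a network stream) could still emulate skip() by reading into a scratch buffer; the patch does not do this, but such a read-based fallback would look roughly like the sketch below (the helper name and chunk size are illustrative only):

    #include <algorithm>
    #include <cstdint>
    #include <vector>
    #include "whisper.h"

    // Hypothetical fallback: skip `offset` bytes using only the read() callback.
    static bool skip_by_reading(whisper_model_loader * loader, size_t offset) {
        std::vector<uint8_t> scratch(std::min<size_t>(offset, 1u << 20)); // read in <= 1 MiB chunks
        while (offset > 0) {
            const size_t chunk = std::min(offset, scratch.size());
            if (loader->read(loader->context, scratch.data(), chunk) != chunk) {
                return false; // short read: EOF or I/O error
            }
            offset -= chunk;
        }
        return true;
    }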
diff --git a/src/whisper.cpp b/src/whisper.cpp
index e8ce7ca373d..392bb9490db 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -1684,28 +1684,26 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
 
     const size_t n_tensors = 10 /* input */ + 15 + 15*n_audio_layer + 24*n_text_layer;
 
-    // Pre-scan tensor metadata and data from the file.
-    // This allows us to create tensors with the correct type from the file,
-    // ensuring proper memory allocation on the device.
-    struct tensor_info {
+    // Pre-scan tensor metadata from file to determine actual types
+    // This allows us to allocate device memory with the correct sizes
+    struct tensor_meta {
         ggml_type type;
         int32_t ne[4];
-        int32_t n_dims;
-        std::vector<uint8_t> data;
     };
-    std::map<std::string, tensor_info> tensor_data_map;
+    std::map<std::string, tensor_meta> tensor_type_map;
+    size_t tensor_start_offset = 0; // file offset where tensor section begins
+
+    // If loader supports skip, seek, and tell, scan tensor metadata first (without loading data)
+    if (loader->skip && loader->seek && loader->tell) {
+        // Remember the current position (start of tensors section)
+        tensor_start_offset = loader->tell(loader->context);
 
-    // Read all tensor metadata and data from the file
-    // Note: This approach temporarily doubles memory usage during model loading.
-    // This is necessary because the loader interface doesn't support seeking,
-    // so we must read tensor data before we can determine the correct types for allocation.
-    {
         while (true) {
             int32_t n_dims;
 
             read_safe(loader, n_dims);
 
-            // Check for EOF after reading the first field of the header
+            // Check for EOF after reading the first field
             if (loader->eof(loader->context)) {
                 break;
             }
@@ -1716,18 +1714,17 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             read_safe(loader, length);
             read_safe(loader, ttype);
 
-            tensor_info info;
-            info.n_dims = n_dims;
-            info.type = ggml_type(ttype);
-            info.ne[0] = 1;
-            info.ne[1] = 1;
-            info.ne[2] = 1;
-            info.ne[3] = 1;
+            tensor_meta meta;
+            meta.type = ggml_type(ttype);
+            meta.ne[0] = 1;
+            meta.ne[1] = 1;
+            meta.ne[2] = 1;
+            meta.ne[3] = 1;
 
             int32_t nelements = 1;
             for (int i = 0; i < n_dims; ++i) {
-                read_safe(loader, info.ne[i]);
-                nelements *= info.ne[i];
+                read_safe(loader, meta.ne[i]);
+                nelements *= meta.ne[i];
             }
 
             std::string name;
@@ -1735,12 +1732,20 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             loader->read(loader->context, &tmp[0], tmp.size());
             name.assign(&tmp[0], tmp.size());
 
-            // Calculate and read tensor data
-            const size_t tensor_data_size = ggml_row_size(info.type, info.ne[0]) * (nelements / info.ne[0]);
-            info.data.resize(tensor_data_size);
-            loader->read(loader->context, info.data.data(), tensor_data_size);
+            // Calculate tensor data size and skip it (without loading into memory)
+            const size_t tensor_data_size = ggml_row_size(meta.type, meta.ne[0]) * (nelements / meta.ne[0]);
+            if (!loader->skip(loader->context, tensor_data_size)) {
+                WHISPER_LOG_ERROR("%s: failed to skip tensor data for '%s'\n", __func__, name.c_str());
+                return false;
+            }
 
-            tensor_data_map[name] = std::move(info);
+            tensor_type_map[name] = meta;
+        }
+
+        // Seek back to the start of tensors section for the actual data loading later
+        if (!loader->seek(loader->context, tensor_start_offset)) {
+            WHISPER_LOG_ERROR("%s: failed to seek back to tensor data\n", __func__);
+            return false;
         }
     }
@@ -1772,15 +1777,16 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
     buft_list_t buft_list = make_buft_list(wctx.params);
 
     auto create_tensor = [&](asr_tensor type, asr_system system, ggml_tensor * meta, int layer = 0) -> ggml_tensor * {
-        // Get the tensor name and look up its actual type from the pre-scanned file data
+        // Get the tensor name
         std::string tensor_name = format(ASR_TENSOR_NAMES.at(system).at(type), layer);
-        auto it = tensor_data_map.find(tensor_name);
-        if (it != tensor_data_map.end()) {
-            // Use the type from the file to ensure proper memory allocation
-            const tensor_info & info = it->second;
-            if (meta->type != info.type) {
+
+        // If we pre-scanned tensor types, update meta tensor to use the actual type from file
+        auto it = tensor_type_map.find(tensor_name);
+        if (it != tensor_type_map.end()) {
+            const tensor_meta & file_meta = it->second;
+            if (meta->type != file_meta.type) {
                 // Update meta tensor type to match the file
-                meta->type = info.type;
+                meta->type = file_meta.type;
                 // Update strides based on new type
                 meta->nb[0] = ggml_type_size(meta->type);
                 meta->nb[1] = meta->nb[0] * (meta->ne[0] / ggml_blck_size(meta->type));
@@ -1788,9 +1794,6 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
                     meta->nb[i] = meta->nb[i-1] * meta->ne[i-1];
                 }
             }
-        } else {
-            // Tensor not found in pre-scanned data - this shouldn't happen with a valid model file
-            WHISPER_LOG_WARN("%s: tensor '%s' not found in model file, using default type\n", __func__, tensor_name.c_str());
         }
 
         ggml_op op = ASR_TENSOR_INFO.at(type);
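Seen end to end, the first pass above reduces to: remember the current offset with tell(), walk the tensor headers while skip()-ping each payload, then seek() back so the unchanged loading loop can re-read from the start of the tensor section. A compressed outline of that control flow (the helper and its output map are illustrative, not names from the patch; the header parsing in the middle is elided):

    #include <map>
    #include <string>
    #include "ggml.h"
    #include "whisper.h"

    static bool prescan_tensor_types(whisper_model_loader * loader,
                                     std::map<std::string, ggml_type> & types) {
        if (!(loader->skip && loader->seek && loader->tell)) {
            return true; // cannot seek: caller falls back to the single-pass path
        }
        const size_t start = loader->tell(loader->context);
        // ... for each tensor: read n_dims/length/ttype/dims/name with read(),
        //     record types[name] = ggml_type(ttype), then loader->skip() the payload ...
        return loader->seek(loader->context, start);
    }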
@@ -1939,13 +1942,39 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         }
     }
 
-    // load weights from pre-scanned tensor data
+    // load weights
     {
         size_t total_size = 0;
 
         model.n_loaded = 0;
 
-        for (auto & [name, info] : tensor_data_map) {
+        std::vector<uint8_t> read_buf;
+
+        while (true) {
+            int32_t n_dims;
+            int32_t length;
+            int32_t ttype;
+
+            read_safe(loader, n_dims);
+            read_safe(loader, length);
+            read_safe(loader, ttype);
+
+            if (loader->eof(loader->context)) {
+                break;
+            }
+
+            int32_t nelements = 1;
+            int32_t ne[4] = { 1, 1, 1, 1 };
+            for (int i = 0; i < n_dims; ++i) {
+                read_safe(loader, ne[i]);
+                nelements *= ne[i];
+            }
+
+            std::string name;
+            std::vector<char> tmp(length); // create a buffer
+            loader->read(loader->context, &tmp[0], tmp.size()); // read to buffer
+            name.assign(&tmp[0], tmp.size());
+
             if (model.tensors.find(name) == model.tensors.end()) {
                 WHISPER_LOG_ERROR("%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
                 return false;
@@ -1953,46 +1982,85 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             auto tensor = model.tensors[name];
 
-            // Verify tensor properties match
-            const int32_t nelements = info.ne[0] * info.ne[1] * info.ne[2] * info.ne[3];
             if (ggml_nelements(tensor) != nelements) {
                 WHISPER_LOG_ERROR("%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
                 WHISPER_LOG_ERROR("%s: shape: [%d, %d, %d], expected: [%d, %d, %d]\n",
-                    __func__, info.ne[0], info.ne[1], info.ne[2], (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2]);
+                    __func__, ne[0], ne[1], ne[2], (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2]);
                 return false;
             }
 
-            if (tensor->ne[0] != info.ne[0] || tensor->ne[1] != info.ne[1] || tensor->ne[2] != info.ne[2]) {
+            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2]) {
                 WHISPER_LOG_ERROR("%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d], expected [%d, %d, %d]\n",
-                    __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], info.ne[0], info.ne[1], info.ne[2]);
+                    __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], ne[0], ne[1], ne[2]);
                 return false;
             }
 
-            // Type should already match since we used the file's type during tensor creation
-            if (tensor->type != info.type) {
-                WHISPER_LOG_ERROR("%s: tensor '%s' has wrong type: expected %s, got %s\n",
-                    __func__, name.c_str(), ggml_type_name(info.type), ggml_type_name(tensor->type));
-                return false;
-            }
+            // Calculate size based on file's tensor type
+            const size_t file_tensor_size = ggml_row_size(ggml_type(ttype), ne[0]) * (nelements / ne[0]);
+            const size_t expected_tensor_size = ggml_nbytes(tensor);
+
+            // If we pre-scanned types, the tensor type should already match
+            // Otherwise (loader doesn't support seek), we need to handle type mismatch here
+            if (tensor->type != ggml_type(ttype)) {
+                // Type mismatch - this happens when loader doesn't support seek
+                // or when tensor wasn't found during pre-scan
+                if (!tensor_type_map.empty()) {
+                    // We pre-scanned but types still don't match - this is unexpected
+                    WHISPER_LOG_ERROR("%s: tensor '%s' type mismatch after pre-scan: expected %s, file has %s\n",
+                        __func__, name.c_str(), ggml_type_name(tensor->type), ggml_type_name(ggml_type(ttype)));
+                    return false;
+                }
 
-            const size_t bytes_to_write = info.data.size();
+                // Loader doesn't support seek - handle type mismatch at runtime (legacy path)
+                WHISPER_LOG_DEBUG("%s: tensor '%s' type mismatch (expected %s, file has %s)\n",
+                    __func__, name.c_str(), ggml_type_name(tensor->type), ggml_type_name(ggml_type(ttype)));
+
+                // Check if the allocated buffer is large enough for the file's data
+                if (file_tensor_size > expected_tensor_size) {
+                    WHISPER_LOG_ERROR("%s: tensor '%s' buffer too small: allocated %zu bytes for %s, but file needs %zu bytes for %s\n",
+                        __func__, name.c_str(), expected_tensor_size, ggml_type_name(tensor->type),
+                        file_tensor_size, ggml_type_name(ggml_type(ttype)));
+                    return false;
+                }
+
+                // Update tensor type to match the file
+                tensor->type = ggml_type(ttype);
+
+                // Update tensor strides (nb) based on new type
+                tensor->nb[0] = ggml_type_size(tensor->type);
+                tensor->nb[1] = tensor->nb[0] * (tensor->ne[0] / ggml_blck_size(tensor->type));
+                for (int i = 2; i < GGML_MAX_DIMS; i++) {
+                    tensor->nb[i] = tensor->nb[i-1] * tensor->ne[i-1];
+                }
+            } else {
+                // Types match, verify size
+                if (file_tensor_size != expected_tensor_size) {
+                    WHISPER_LOG_ERROR("%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
+                        __func__, name.c_str(), expected_tensor_size, file_tensor_size);
+                    return false;
+                }
+            }
+
+            // Now read the data - use the file's size
+            const size_t bytes_to_read = file_tensor_size;
 
             if (ggml_backend_buffer_is_host(tensor->buffer)) {
-                // for the CPU and Metal backend, we can copy directly into the tensor
-                memcpy(tensor->data, info.data.data(), bytes_to_write);
+                // for the CPU and Metal backend, we can read directly into the tensor
+                loader->read(loader->context, tensor->data, bytes_to_read);
                 BYTESWAP_TENSOR(tensor);
             } else {
-                // copy to device memory
-                ggml_backend_tensor_set(tensor, info.data.data(), 0, bytes_to_write);
+                // read into a temporary buffer first, then copy to device memory
+                read_buf.resize(bytes_to_read);
+
+                loader->read(loader->context, read_buf.data(), read_buf.size());
+
+                ggml_backend_tensor_set(tensor, read_buf.data(), 0, bytes_to_read);
             }
 
-            total_size += bytes_to_write;
+            total_size += bytes_to_read;
             model.n_loaded++;
         }
 
-        // Clear the pre-scanned data to free memory
-        tensor_data_map.clear();
-
         WHISPER_LOG_INFO("%s: model size = %7.2f MB\n", __func__, total_size/1e6);
 
         if (model.n_loaded == 0) {
@@ -3713,6 +3781,25 @@ struct whisper_context * whisper_init_from_file_with_params_no_state(const char
         fin->close();
     };
 
+    loader.skip = [](void * ctx, size_t offset) {
+        std::ifstream * fin = (std::ifstream*)ctx;
+        fin->seekg(offset, std::ios::cur);
+        return fin->good();
+    };
+
+    loader.seek = [](void * ctx, size_t offset) {
+        std::ifstream * fin = (std::ifstream*)ctx;
+        fin->clear(); // clear any error flags
+        fin->seekg(offset, std::ios::beg);
+        return fin->good();
+    };
+
+    loader.tell = [](void * ctx) {
+        std::ifstream * fin = (std::ifstream*)ctx;
+        auto pos = fin->tellg();
+        return (pos == std::streampos(-1)) ? SIZE_MAX : static_cast<size_t>(pos);
+    };
+
     auto ctx = whisper_init_with_params_no_state(&loader, params);
 
     if (ctx) {
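Custom loaders written against the old three-callback interface keep working as long as the new members are left NULL, because whisper_model_load only takes the pre-scan path when skip, seek and tell are all present. A minimal sketch of such a legacy-style loader over a caller-owned buffer (the struct and function names are illustrative; value-initializing the loader zeroes the unused callbacks):

    #include <algorithm>
    #include <cstdint>
    #include <cstring>
    #include "whisper.h"

    struct my_buf_state { const uint8_t * data; size_t size; size_t off; };

    static whisper_model_loader make_minimal_loader(my_buf_state * st) {
        whisper_model_loader loader = {}; // skip/seek/tell stay NULL -> single-pass path
        loader.context = st;
        loader.read = [](void * ctx, void * output, size_t read_size) {
            auto * s = static_cast<my_buf_state *>(ctx);
            const size_t n = std::min(read_size, s->size - s->off);
            memcpy(output, s->data + s->off, n);
            s->off += n;
            return n;
        };
        loader.eof   = [](void * ctx) { auto * s = static_cast<my_buf_state *>(ctx); return s->off >= s->size; };
        loader.close = [](void * /*ctx*/) { };
        return loader;
    }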
@@ -3756,6 +3843,29 @@ struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * bu
     loader.close = [](void * /*ctx*/) { };
 
+    loader.skip = [](void * ctx, size_t offset) {
+        buf_context * buf = reinterpret_cast<buf_context *>(ctx);
+        if (buf->current_offset + offset > buf->size) {
+            return false;
+        }
+        buf->current_offset += offset;
+        return true;
+    };
+
+    loader.seek = [](void * ctx, size_t offset) {
+        buf_context * buf = reinterpret_cast<buf_context *>(ctx);
+        if (offset > buf->size) {
+            return false;
+        }
+        buf->current_offset = offset;
+        return true;
+    };
+
+    loader.tell = [](void * ctx) {
+        buf_context * buf = reinterpret_cast<buf_context *>(ctx);
+        return buf->current_offset;
+    };
+
     return whisper_init_with_params_no_state(&loader, params);
 }
 
@@ -4806,6 +4916,25 @@ struct whisper_vad_context * whisper_vad_init_from_file_with_params(
         fin->close();
     };
 
+    loader.skip = [](void * ctx, size_t offset) {
+        std::ifstream * fin = (std::ifstream*)ctx;
+        fin->seekg(offset, std::ios::cur);
+        return fin->good();
+    };
+
+    loader.seek = [](void * ctx, size_t offset) {
+        std::ifstream * fin = (std::ifstream*)ctx;
+        fin->clear();
+        fin->seekg(offset, std::ios::beg);
+        return fin->good();
+    };
+
+    loader.tell = [](void * ctx) {
+        std::ifstream * fin = (std::ifstream*)ctx;
+        auto pos = fin->tellg();
+        return (pos == std::streampos(-1)) ? SIZE_MAX : static_cast<size_t>(pos);
+    };
+
     auto ctx = whisper_vad_init_with_params(&loader, params);
     if (!ctx) {
         whisper_vad_free(ctx);