Mirror of https://github.com/leejet/stable-diffusion.cpp.git (synced 2025-12-12 13:28:37 +00:00)
feat: optimize tensor loading time (#790)
* opt tensor loading
* fix build failure
* revert the changes
* allow the use of n_threads
* fix lora loading
* optimize lora loading
* add mutex
* use atomic
* fix build
* fix potential duplicate issue
* avoid duplicate lookup of lora tensor
* fix progress bar
* remove unused remove_duplicates

Co-authored-by: leejet <leejet714@gmail.com>
This commit is contained in:
parent 52a97b3ac1
commit 55c2e05d98
lora.hpp (65 changed lines)
@@ -1,6 +1,7 @@
#ifndef __LORA_HPP__
#define __LORA_HPP__

#include <mutex>
#include "ggml_extend.hpp"

#define LORA_GRAPH_BASE_SIZE 10240
@@ -115,7 +116,7 @@ struct LoraModel : public GGMLRunner {
return "lora";
}

bool load_from_file(bool filter_tensor = false) {
bool load_from_file(bool filter_tensor = false, int n_threads = 0) {
LOG_INFO("loading LoRA from '%s'", file_path.c_str());

if (load_failed) {
@@ -123,41 +124,53 @@ struct LoraModel : public GGMLRunner {
return false;
}

std::unordered_map<std::string, TensorStorage> tensors_to_create;
std::mutex lora_mutex;
bool dry_run = true;
auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
const std::string& name = tensor_storage.name;
if (dry_run) {
const std::string& name = tensor_storage.name;

if (filter_tensor && !contains(name, "lora")) {
// LOG_INFO("skipping LoRA tesnor '%s'", name.c_str());
return true;
}
// LOG_INFO("lora_tensor %s", name.c_str());
for (int i = 0; i < LORA_TYPE_COUNT; i++) {
if (name.find(type_fingerprints[i]) != std::string::npos) {
type = (lora_t)i;
break;
if (filter_tensor && !contains(name, "lora")) {
return true;
}

{
std::lock_guard<std::mutex> lock(lora_mutex);
for (int i = 0; i < LORA_TYPE_COUNT; i++) {
if (name.find(type_fingerprints[i]) != std::string::npos) {
type = (lora_t)i;
break;
}
}
tensors_to_create[name] = tensor_storage;
}
} else {
const std::string& name = tensor_storage.name;
auto iter = lora_tensors.find(name);
if (iter != lora_tensors.end()) {
*dst_tensor = iter->second;
}
}

if (dry_run) {
struct ggml_tensor* real = ggml_new_tensor(params_ctx,
tensor_storage.type,
tensor_storage.n_dims,
tensor_storage.ne);
lora_tensors[name] = real;
} else {
auto real = lora_tensors[name];
*dst_tensor = real;
}

return true;
};

model_loader.load_tensors(on_new_tensor_cb);
model_loader.load_tensors(on_new_tensor_cb, n_threads);

for (const auto& pair : tensors_to_create) {
const auto& name = pair.first;
const auto& ts = pair.second;
struct ggml_tensor* real = ggml_new_tensor(params_ctx,
ts.type,
ts.n_dims,
ts.ne);
lora_tensors[name] = real;
}

alloc_params_buffer();
// exit(0);

dry_run = false;
model_loader.load_tensors(on_new_tensor_cb);
model_loader.load_tensors(on_new_tensor_cb, n_threads);

LOG_DEBUG("lora type: \"%s\"/\"%s\"", lora_downs[type].c_str(), lora_ups[type].c_str());
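In the lora.hpp hunk above, the dry-run pass no longer allocates ggml tensors from inside the callback (which may now run on several loader threads); it only records each TensorStorage in a map guarded by lora_mutex, and the tensors are created afterwards in a single-threaded loop over tensors_to_create. A minimal standalone sketch of that collect-then-allocate pattern, assuming a placeholder TensorMeta type instead of the project's TensorStorage:

#include <mutex>
#include <string>
#include <thread>
#include <unordered_map>
#include <vector>

struct TensorMeta {          // stand-in for TensorStorage
    std::string name;
    size_t      nbytes = 0;
};

int main() {
    std::unordered_map<std::string, TensorMeta> to_create;  // filled concurrently
    std::mutex mtx;

    // phase 1 (dry run): the callback may be invoked from many threads,
    // so it only records metadata under a lock
    auto on_tensor = [&](const TensorMeta& meta) {
        std::lock_guard<std::mutex> lock(mtx);
        to_create[meta.name] = meta;  // duplicates collapse, last write wins
    };

    std::vector<std::thread> workers;
    for (int t = 0; t < 4; ++t) {
        workers.emplace_back([&, t] { on_tensor({"tensor_" + std::to_string(t), 1024}); });
    }
    for (auto& w : workers) {
        w.join();
    }

    // phase 2: single-threaded, the safe place to call a non-thread-safe
    // allocator such as ggml_new_tensor() in the real code
    for (const auto& entry : to_create) {
        (void)entry;  // allocate one tensor per recorded entry here
    }
    return 0;
}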
model.cpp (547 changed lines)
@@ -1,8 +1,13 @@
#include <stdarg.h>
#include <atomic>
#include <chrono>
#include <fstream>
#include <functional>
#include <mutex>
#include <regex>
#include <set>
#include <string>
#include <thread>
#include <unordered_map>
#include <vector>

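The new <atomic>, <chrono>, <mutex>, and <thread> includes support the multi-threaded loader below; in particular, the per-phase timers (read, memcpy, convert, copy_to_backend) become std::atomic<int64_t> so each worker can add its elapsed time without taking a lock. A small self-contained illustration of that accumulation; the counter name echoes the diff, everything else is illustrative:

#include <atomic>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
    std::atomic<int64_t> read_time_ms(0);  // shared by all workers, no lock needed

    auto now_ms = [] {
        using namespace std::chrono;
        return (int64_t)duration_cast<milliseconds>(steady_clock::now().time_since_epoch()).count();
    };

    std::vector<std::thread> workers;
    for (int i = 0; i < 4; ++i) {
        workers.emplace_back([&] {
            int64_t t0 = now_ms();
            std::this_thread::sleep_for(std::chrono::milliseconds(10));  // pretend to read a tensor
            read_time_ms.fetch_add(now_ms() - t0);                       // lock-free accumulation
        });
    }
    for (auto& w : workers) {
        w.join();
    }

    // the sum counts overlapping work from all threads, which is why the loader
    // divides by the thread count when it prints its timing summary
    printf("accumulated read time: %lld ms\n", (long long)read_time_ms.load());
    return 0;
}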
@@ -1944,292 +1949,344 @@ std::string ModelLoader::load_umt5_tokenizer_json() {
return json_str;
}

std::vector<TensorStorage> remove_duplicates(const std::vector<TensorStorage>& vec) {
std::vector<TensorStorage> res;
std::unordered_map<std::string, size_t> name_to_index_map;
bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p) {
int64_t process_time_ms = 0;
std::atomic<int64_t> read_time_ms(0);
std::atomic<int64_t> memcpy_time_ms(0);
std::atomic<int64_t> copy_to_backend_time_ms(0);
std::atomic<int64_t> convert_time_ms(0);

for (size_t i = 0; i < vec.size(); ++i) {
const std::string& current_name = vec[i].name;
auto it = name_to_index_map.find(current_name);
int num_threads_to_use = n_threads_p > 0 ? n_threads_p : (int)std::thread::hardware_concurrency();

if (it != name_to_index_map.end()) {
res[it->second] = vec[i];
} else {
name_to_index_map[current_name] = i;
res.push_back(vec[i]);
}
}

// vec.resize(name_to_index_map.size());

return res;
}

bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) {
int64_t process_time_ms = 0;
int64_t read_time_ms = 0;
int64_t memcpy_time_ms = 0;
int64_t copy_to_backend_time_ms = 0;
int64_t convert_time_ms = 0;

int64_t prev_time_ms = 0;
int64_t curr_time_ms = 0;
int64_t start_time = ggml_time_ms();
prev_time_ms = start_time;
int64_t start_time = ggml_time_ms();
std::vector<TensorStorage> processed_tensor_storages;
for (auto& tensor_storage : tensor_storages) {
// LOG_DEBUG("%s", name.c_str());

if (is_unused_tensor(tensor_storage.name)) {
continue;
{
struct IndexedStorage {
size_t index;
TensorStorage ts;
};

std::mutex vec_mutex;
std::vector<IndexedStorage> all_results;

int n_threads = std::min(num_threads_to_use, (int)tensor_storages.size());
if (n_threads < 1) {
n_threads = 1;
}
std::vector<std::thread> workers;

for (int i = 0; i < n_threads; ++i) {
workers.emplace_back([&, thread_id = i]() {
std::vector<IndexedStorage> local_results;
std::vector<TensorStorage> temp_storages;

for (size_t j = thread_id; j < tensor_storages.size(); j += n_threads) {
const auto& tensor_storage = tensor_storages[j];
if (is_unused_tensor(tensor_storage.name)) {
continue;
}

temp_storages.clear();
preprocess_tensor(tensor_storage, temp_storages);

for (const auto& ts : temp_storages) {
local_results.push_back({j, ts});
}
}

if (!local_results.empty()) {
std::lock_guard<std::mutex> lock(vec_mutex);
all_results.insert(all_results.end(),
local_results.begin(), local_results.end());
}
});
}
for (auto& w : workers) {
w.join();
}

preprocess_tensor(tensor_storage, processed_tensor_storages);
}
std::vector<TensorStorage> dedup = remove_duplicates(processed_tensor_storages);
processed_tensor_storages = dedup;
curr_time_ms = ggml_time_ms();
process_time_ms = curr_time_ms - prev_time_ms;
prev_time_ms = curr_time_ms;
std::unordered_map<std::string, IndexedStorage> latest_map;
for (auto& entry : all_results) {
latest_map[entry.ts.name] = entry;
}

processed_tensor_storages.reserve(latest_map.size());
for (auto& [name, entry] : latest_map) {
processed_tensor_storages.push_back(entry.ts);
}
}

process_time_ms = ggml_time_ms() - start_time;

bool success = true;
size_t total_tensors_processed = 0;
const size_t total_tensors_to_process = processed_tensor_storages.size();
const int64_t t_start = ggml_time_ms();
int last_n_threads = 1;

bool success = true;
for (size_t file_index = 0; file_index < file_paths_.size(); file_index++) {
std::string file_path = file_paths_[file_index];
LOG_DEBUG("loading tensors from %s", file_path.c_str());

std::ifstream file(file_path, std::ios::binary);
if (!file.is_open()) {
LOG_ERROR("failed to open '%s'", file_path.c_str());
return false;
std::vector<const TensorStorage*> file_tensors;
for (const auto& ts : processed_tensor_storages) {
if (ts.file_index == file_index) {
file_tensors.push_back(&ts);
}
}
if (file_tensors.empty()) {
continue;
}

bool is_zip = false;
for (auto& tensor_storage : tensor_storages) {
if (tensor_storage.file_index != file_index) {
continue;
}
if (tensor_storage.index_in_zip >= 0) {
for (auto const& ts : file_tensors) {
if (ts->index_in_zip >= 0) {
is_zip = true;
break;
}
}

struct zip_t* zip = NULL;
if (is_zip) {
zip = zip_open(file_path.c_str(), 0, 'r');
if (zip == NULL) {
LOG_ERROR("failed to open zip '%s'", file_path.c_str());
return false;
}
int n_threads = is_zip ? 1 : std::min(num_threads_to_use, (int)file_tensors.size());
if (n_threads < 1) {
n_threads = 1;
}
last_n_threads = n_threads;

std::atomic<size_t> tensor_idx(0);
std::atomic<bool> failed(false);
std::vector<std::thread> workers;

for (int i = 0; i < n_threads; ++i) {
workers.emplace_back([&, file_path, is_zip]() {
std::ifstream file;
struct zip_t* zip = NULL;
if (is_zip) {
zip = zip_open(file_path.c_str(), 0, 'r');
if (zip == NULL) {
LOG_ERROR("failed to open zip '%s'", file_path.c_str());
failed = true;
return;
}
} else {
file.open(file_path, std::ios::binary);
if (!file.is_open()) {
LOG_ERROR("failed to open '%s'", file_path.c_str());
failed = true;
return;
}
}

std::vector<uint8_t> read_buffer;
std::vector<uint8_t> convert_buffer;

while (true) {
int64_t t0, t1;
size_t idx = tensor_idx.fetch_add(1);
if (idx >= file_tensors.size() || failed) {
break;
}

const TensorStorage& tensor_storage = *file_tensors[idx];
ggml_tensor* dst_tensor = NULL;

t0 = ggml_time_ms();

if (!on_new_tensor_cb(tensor_storage, &dst_tensor)) {
LOG_WARN("process tensor failed: '%s'", tensor_storage.name.c_str());
failed = true;
break;
}

if (dst_tensor == NULL) {
t1 = ggml_time_ms();
read_time_ms.fetch_add(t1 - t0);
continue;
}

size_t nbytes_to_read = tensor_storage.nbytes_to_read();

auto read_data = [&](char* buf, size_t n) {
if (zip != NULL) {
zip_entry_openbyindex(zip, tensor_storage.index_in_zip);
size_t entry_size = zip_entry_size(zip);
if (entry_size != n) {
int64_t t_memcpy_start;
read_buffer.resize(entry_size);
zip_entry_noallocread(zip, (void*)read_buffer.data(), entry_size);
t_memcpy_start = ggml_time_ms();
memcpy((void*)buf, (void*)(read_buffer.data() + tensor_storage.offset), n);
memcpy_time_ms.fetch_add(ggml_time_ms() - t_memcpy_start);
} else {
zip_entry_noallocread(zip, (void*)buf, n);
}
zip_entry_close(zip);
} else {
file.seekg(tensor_storage.offset);
file.read(buf, n);
if (!file) {
LOG_ERROR("read tensor data failed: '%s'", file_path.c_str());
failed = true;
}
}
};

if (dst_tensor->buffer == NULL || ggml_backend_buffer_is_host(dst_tensor->buffer)) {
if (tensor_storage.type == dst_tensor->type) {
GGML_ASSERT(ggml_nbytes(dst_tensor) == tensor_storage.nbytes());
if (tensor_storage.is_f64 || tensor_storage.is_i64) {
read_buffer.resize(tensor_storage.nbytes_to_read());
read_data((char*)read_buffer.data(), nbytes_to_read);
} else {
read_data((char*)dst_tensor->data, nbytes_to_read);
}
t1 = ggml_time_ms();
read_time_ms.fetch_add(t1 - t0);

t0 = ggml_time_ms();
if (tensor_storage.is_bf16) {
// inplace op
bf16_to_f32_vec((uint16_t*)dst_tensor->data, (float*)dst_tensor->data, tensor_storage.nelements());
} else if (tensor_storage.is_f8_e4m3) {
// inplace op
f8_e4m3_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements());
} else if (tensor_storage.is_f8_e5m2) {
// inplace op
f8_e5m2_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements());
} else if (tensor_storage.is_f64) {
f64_to_f32_vec((double*)read_buffer.data(), (float*)dst_tensor->data, tensor_storage.nelements());
} else if (tensor_storage.is_i64) {
i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)dst_tensor->data, tensor_storage.nelements());
}
t1 = ggml_time_ms();
convert_time_ms.fetch_add(t1 - t0);
} else {
read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read()));
read_data((char*)read_buffer.data(), nbytes_to_read);
t1 = ggml_time_ms();
read_time_ms.fetch_add(t1 - t0);

t0 = ggml_time_ms();
if (tensor_storage.is_bf16) {
// inplace op
bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_f8_e4m3) {
// inplace op
f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_f8_e5m2) {
// inplace op
f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_f64) {
// inplace op
f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_i64) {
// inplace op
i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements());
}
convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data, dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
t1 = ggml_time_ms();
convert_time_ms.fetch_add(t1 - t0);
}
} else {
read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read()));
read_data((char*)read_buffer.data(), nbytes_to_read);
t1 = ggml_time_ms();
read_time_ms.fetch_add(t1 - t0);

t0 = ggml_time_ms();
if (tensor_storage.is_bf16) {
// inplace op
bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_f8_e4m3) {
// inplace op
f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_f8_e5m2) {
// inplace op
f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_f64) {
// inplace op
f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_i64) {
// inplace op
i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements());
}

if (tensor_storage.type == dst_tensor->type) {
// copy to device memory
t1 = ggml_time_ms();
convert_time_ms.fetch_add(t1 - t0);
t0 = ggml_time_ms();
ggml_backend_tensor_set(dst_tensor, read_buffer.data(), 0, ggml_nbytes(dst_tensor));
t1 = ggml_time_ms();
copy_to_backend_time_ms.fetch_add(t1 - t0);
} else {
// convert first, then copy to device memory

convert_buffer.resize(ggml_nbytes(dst_tensor));
convert_tensor((void*)read_buffer.data(), tensor_storage.type, (void*)convert_buffer.data(), dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
t1 = ggml_time_ms();
convert_time_ms.fetch_add(t1 - t0);
t0 = ggml_time_ms();
ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor));
t1 = ggml_time_ms();
copy_to_backend_time_ms.fetch_add(t1 - t0);
}
}
}
if (zip != NULL) {
zip_close(zip);
}
});
}

std::vector<uint8_t> read_buffer;
std::vector<uint8_t> convert_buffer;

auto read_data = [&](const TensorStorage& tensor_storage, char* buf, size_t n) {
if (zip != NULL) {
zip_entry_openbyindex(zip, tensor_storage.index_in_zip);
size_t entry_size = zip_entry_size(zip);
if (entry_size != n) {
read_buffer.resize(entry_size);
prev_time_ms = ggml_time_ms();
zip_entry_noallocread(zip, (void*)read_buffer.data(), entry_size);
curr_time_ms = ggml_time_ms();
read_time_ms += curr_time_ms - prev_time_ms;
prev_time_ms = curr_time_ms;
memcpy((void*)buf, (void*)(read_buffer.data() + tensor_storage.offset), n);
curr_time_ms = ggml_time_ms();
memcpy_time_ms += curr_time_ms - prev_time_ms;
} else {
prev_time_ms = ggml_time_ms();
zip_entry_noallocread(zip, (void*)buf, n);
curr_time_ms = ggml_time_ms();
read_time_ms += curr_time_ms - prev_time_ms;
}
zip_entry_close(zip);
} else {
prev_time_ms = ggml_time_ms();
file.seekg(tensor_storage.offset);
file.read(buf, n);
curr_time_ms = ggml_time_ms();
read_time_ms += curr_time_ms - prev_time_ms;
if (!file) {
LOG_ERROR("read tensor data failed: '%s'", file_path.c_str());
return false;
}
}
return true;
};
int tensor_count = 0;
int64_t t0 = ggml_time_ms();
int64_t t1 = t0;
bool partial = true;
int tensor_max = (int)processed_tensor_storages.size();
pretty_progress(0, tensor_max, 0.0f);
for (auto& tensor_storage : processed_tensor_storages) {
if (tensor_storage.file_index != file_index) {
++tensor_count;
continue;
}
ggml_tensor* dst_tensor = NULL;

success = on_new_tensor_cb(tensor_storage, &dst_tensor);
if (!success) {
LOG_WARN("process tensor failed: '%s'", tensor_storage.name.c_str());
while (true) {
size_t current_idx = tensor_idx.load();
if (current_idx >= file_tensors.size() || failed) {
break;
}

if (dst_tensor == NULL) {
++tensor_count;
continue;
}

size_t nbytes_to_read = tensor_storage.nbytes_to_read();

if (dst_tensor->buffer == NULL || ggml_backend_buffer_is_host(dst_tensor->buffer)) {
// for the CPU and Metal backend, we can copy directly into the tensor
if (tensor_storage.type == dst_tensor->type) {
GGML_ASSERT(ggml_nbytes(dst_tensor) == tensor_storage.nbytes());
if (tensor_storage.is_f64 || tensor_storage.is_i64) {
read_buffer.resize(tensor_storage.nbytes_to_read());
read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read);
} else {
read_data(tensor_storage, (char*)dst_tensor->data, nbytes_to_read);
}

prev_time_ms = ggml_time_ms();
if (tensor_storage.is_bf16) {
// inplace op
bf16_to_f32_vec((uint16_t*)dst_tensor->data, (float*)dst_tensor->data, tensor_storage.nelements());
} else if (tensor_storage.is_f8_e4m3) {
// inplace op
f8_e4m3_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements());
} else if (tensor_storage.is_f8_e5m2) {
// inplace op
f8_e5m2_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements());
} else if (tensor_storage.is_f64) {
f64_to_f32_vec((double*)read_buffer.data(), (float*)dst_tensor->data, tensor_storage.nelements());
} else if (tensor_storage.is_i64) {
i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)dst_tensor->data, tensor_storage.nelements());
}
curr_time_ms = ggml_time_ms();
convert_time_ms += curr_time_ms - prev_time_ms;
} else {
read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read()));
read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read);

prev_time_ms = ggml_time_ms();
if (tensor_storage.is_bf16) {
// inplace op
bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_f8_e4m3) {
// inplace op
f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_f8_e5m2) {
// inplace op
f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_f64) {
// inplace op
f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_i64) {
// inplace op
i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements());
}

convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data,
dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
curr_time_ms = ggml_time_ms();
convert_time_ms += curr_time_ms - prev_time_ms;
}
} else {
read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read()));
read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read);

prev_time_ms = ggml_time_ms();
if (tensor_storage.is_bf16) {
// inplace op
bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_f8_e4m3) {
// inplace op
f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_f8_e5m2) {
// inplace op
f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_f64) {
// inplace op
f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_i64) {
// inplace op
i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements());
}

if (tensor_storage.type == dst_tensor->type) {
// copy to device memory
curr_time_ms = ggml_time_ms();
convert_time_ms += curr_time_ms - prev_time_ms;
prev_time_ms = curr_time_ms;
ggml_backend_tensor_set(dst_tensor, read_buffer.data(), 0, ggml_nbytes(dst_tensor));
curr_time_ms = ggml_time_ms();
copy_to_backend_time_ms += curr_time_ms - prev_time_ms;
} else {
// convert first, then copy to device memory
convert_buffer.resize(ggml_nbytes(dst_tensor));
convert_tensor((void*)read_buffer.data(), tensor_storage.type,
(void*)convert_buffer.data(), dst_tensor->type,
(int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
curr_time_ms = ggml_time_ms();
convert_time_ms += curr_time_ms - prev_time_ms;
prev_time_ms = curr_time_ms;
ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor));
curr_time_ms = ggml_time_ms();
copy_to_backend_time_ms += curr_time_ms - prev_time_ms;
}
}
++tensor_count;
int64_t t2 = ggml_time_ms();
if ((t2 - t1) >= 200) {
t1 = t2;
pretty_progress(tensor_count, tensor_max, (t1 - t0) / (1000.0f * tensor_count));
partial = tensor_count != tensor_max;
}
size_t curr_num = total_tensors_processed + current_idx;
pretty_progress(curr_num, total_tensors_to_process, (ggml_time_ms() - t_start) / 1000.0f / (curr_num + 1e-6f));
std::this_thread::sleep_for(std::chrono::milliseconds(200));
}

if (partial) {
if (tensor_count >= 1) {
t1 = ggml_time_ms();
pretty_progress(tensor_count, tensor_max, (t1 - t0) / (1000.0f * tensor_count));
}
if (tensor_count < tensor_max) {
printf("\n");
}
for (auto& w : workers) {
w.join();
}

if (zip != NULL) {
zip_close(zip);
}

if (!success) {
if (failed) {
success = false;
break;
}
total_tensors_processed += file_tensors.size();
pretty_progress(total_tensors_processed, total_tensors_to_process, (ggml_time_ms() - t_start) / 1000.0f / (total_tensors_processed + 1e-6f));
if (total_tensors_processed < total_tensors_to_process) {
printf("\n");
}
}

int64_t end_time = ggml_time_ms();
LOG_INFO("loading tensors completed, taking %.2fs (process: %.2fs, read: %.2fs, memcpy: %.2fs, convert: %.2fs, copy_to_backend: %.2fs)",
(end_time - start_time) / 1000.f,
process_time_ms / 1000.f,
read_time_ms / 1000.f,
memcpy_time_ms / 1000.f,
convert_time_ms / 1000.f,
copy_to_backend_time_ms / 1000.f);
(read_time_ms.load() / (float)last_n_threads) / 1000.f,
(memcpy_time_ms.load() / (float)last_n_threads) / 1000.f,
(convert_time_ms.load() / (float)last_n_threads) / 1000.f,
(copy_to_backend_time_ms.load() / (float)last_n_threads) / 1000.f);
return success;
}

bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
std::set<std::string> ignore_tensors) {
std::set<std::string> ignore_tensors,
int n_threads) {
std::set<std::string> tensor_names_in_file;
std::mutex tensor_names_mutex;
auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
const std::string& name = tensor_storage.name;
// LOG_DEBUG("%s", tensor_storage.to_string().c_str());
tensor_names_in_file.insert(name);
{
std::lock_guard<std::mutex> lock(tensor_names_mutex);
tensor_names_in_file.insert(name);
}

struct ggml_tensor* real;
if (tensors.find(name) != tensors.end()) {
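The core of the rewritten ModelLoader::load_tensors above is a per-file worker pool: each worker opens its own file or zip handle, then repeatedly claims the next tensor index with std::atomic<size_t>::fetch_add, so no two threads ever process the same entry and no work queue or lock is required; a shared atomic failed flag lets every worker stop early on error. A reduced sketch of that work-distribution scheme, with plain strings standing in for the per-file tensor list:

#include <algorithm>
#include <atomic>
#include <cstdio>
#include <string>
#include <thread>
#include <vector>

int main() {
    std::vector<std::string> items = {"a", "b", "c", "d", "e", "f", "g"};  // stand-ins for tensors
    std::atomic<size_t> next_idx(0);     // shared work index
    std::atomic<bool>   failed(false);   // lets all workers bail out early

    int n_threads = std::min<int>((int)std::thread::hardware_concurrency(), (int)items.size());
    if (n_threads < 1) {
        n_threads = 1;
    }

    std::vector<std::thread> workers;
    for (int i = 0; i < n_threads; ++i) {
        workers.emplace_back([&] {
            // each worker would open its own std::ifstream / zip handle here
            while (true) {
                size_t idx = next_idx.fetch_add(1);  // claim the next unprocessed item
                if (idx >= items.size() || failed) {
                    break;
                }
                printf("processing %s\n", items[idx].c_str());
            }
        });
    }
    for (auto& w : workers) {
        w.join();
    }
    return failed ? 1 : 0;
}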
@@ -2263,7 +2320,7 @@ bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tenso
return true;
};

bool success = load_tensors(on_new_tensor_cb);
bool success = load_tensors(on_new_tensor_cb, n_threads);
if (!success) {
LOG_ERROR("load tensors from file failed");
return false;

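Because load_tensors can now invoke on_new_tensor_cb from several worker threads at once, any shared state the callback touches (the tensor_names_in_file set above, or the LoRA maps earlier) has to be protected, which the diff does with a std::mutex and a scoped std::lock_guard. A minimal sketch of such a thread-safe callback, independent of the project's types:

#include <mutex>
#include <set>
#include <string>
#include <thread>
#include <vector>

int main() {
    std::set<std::string> seen;   // shared between workers through the callback
    std::mutex seen_mutex;

    // the callback may run concurrently, so every access to `seen` is locked
    auto on_new_tensor = [&](const std::string& name) -> bool {
        {
            std::lock_guard<std::mutex> lock(seen_mutex);
            seen.insert(name);
        }
        return true;  // the loader treats false as "processing this tensor failed"
    };

    std::vector<std::thread> workers;
    for (int i = 0; i < 4; ++i) {
        workers.emplace_back([&, i] { on_new_tensor("tensor_" + std::to_string(i)); });
    }
    for (auto& w : workers) {
        w.join();
    }
    return 0;
}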
model.h (5 changed lines)
@@ -247,9 +247,10 @@ public:
ggml_type get_diffusion_model_wtype();
ggml_type get_vae_wtype();
void set_wtype_override(ggml_type wtype, std::string prefix = "");
bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb);
bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0);
bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
std::set<std::string> ignore_tensors = {});
std::set<std::string> ignore_tensors = {},
int n_threads = 0);

bool save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules);
bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type);
@@ -576,7 +576,7 @@ public:
if (version == VERSION_SVD) {
ignore_tensors.insert("conditioner.embedders.3");
}
bool success = model_loader.load_tensors(tensors, ignore_tensors);
bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads);
if (!success) {
LOG_ERROR("load tensors from model loader failed");
ggml_free(ctx);
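Every new n_threads parameter in the diff defaults to 0, which the loader resolves to std::thread::hardware_concurrency() and then clamps to the number of tensors in the file (and to 1 for zip archives), so existing callers keep their old behavior unless they ask for a specific thread count. A hedged standalone sketch of that resolution logic; resolve_n_threads is an illustrative helper, not part of the project:

#include <algorithm>
#include <cstdio>
#include <thread>

// mirrors `n_threads_p > 0 ? n_threads_p : hardware_concurrency()` from the diff,
// clamped to the number of work items as in ModelLoader::load_tensors
static int resolve_n_threads(int n_threads_requested, int n_items) {
    int n = n_threads_requested > 0 ? n_threads_requested
                                    : (int)std::thread::hardware_concurrency();
    n = std::min(n, n_items);
    return n < 1 ? 1 : n;  // hardware_concurrency() may legally return 0
}

int main() {
    printf("%d\n", resolve_n_threads(0, 100));  // default: one worker per hardware thread
    printf("%d\n", resolve_n_threads(8, 3));    // never more workers than tensors
    printf("%d\n", resolve_n_threads(4, 0));    // degenerate case still yields one worker
    return 0;
}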