feat: optimize tensor loading time (#790)

* opt tensor loading

* fix build failure

* revert the changes

* allow the use of n_threads

* fix lora loading

* optimize lora loading

* add mutex

* use atomic

* fix build

* fix potential duplicate issue

* avoid duplicate lookup of lora tensor

* fix progress bar

* remove unused remove_duplicates

---------

Co-authored-by: leejet <leejet714@gmail.com>
rmatif 2025-09-14 16:48:35 +02:00 committed by GitHub
parent 52a97b3ac1
commit 55c2e05d98
4 changed files with 345 additions and 274 deletions
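The thrust of the change set is to stop reading tensors one by one on a single thread: workers claim work items from a shared atomic counter, keep per-thread scratch buffers, and merge their results under a mutex, with a configurable n_threads (0 = auto) controlling the fan-out. Below is a minimal, self-contained sketch of that work-queue pattern; the item type and names are illustrative, not taken from the repository.

#include <atomic>
#include <cstdio>
#include <mutex>
#include <string>
#include <thread>
#include <vector>

// Hypothetical work item standing in for a TensorStorage entry.
struct Item {
    std::string name;
};

int main() {
    std::vector<Item> items(100);
    for (size_t i = 0; i < items.size(); ++i) {
        items[i].name = "tensor_" + std::to_string(i);
    }

    int n_threads = (int)std::thread::hardware_concurrency();
    if (n_threads < 1) {
        n_threads = 1;
    }

    std::atomic<size_t> next_idx(0);  // shared work-queue cursor
    std::atomic<bool> failed(false);  // cooperative cancellation flag
    std::mutex result_mutex;
    std::vector<std::string> processed;

    std::vector<std::thread> workers;
    for (int t = 0; t < n_threads; ++t) {
        workers.emplace_back([&]() {
            std::vector<std::string> local;  // per-thread buffer, no locking needed
            while (!failed) {
                size_t idx = next_idx.fetch_add(1);  // claim the next item
                if (idx >= items.size()) {
                    break;
                }
                local.push_back(items[idx].name);  // stand-in for read + convert work
            }
            std::lock_guard<std::mutex> lock(result_mutex);  // merge once per thread
            processed.insert(processed.end(), local.begin(), local.end());
        });
    }
    for (auto& w : workers) {
        w.join();
    }
    printf("processed %zu items on %d threads\n", processed.size(), n_threads);
    return 0;
}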

lora.hpp

@@ -1,6 +1,7 @@
 #ifndef __LORA_HPP__
 #define __LORA_HPP__
 
+#include <mutex>
 #include "ggml_extend.hpp"
 
 #define LORA_GRAPH_BASE_SIZE 10240
@@ -115,7 +116,7 @@ struct LoraModel : public GGMLRunner {
         return "lora";
     }
 
-    bool load_from_file(bool filter_tensor = false) {
+    bool load_from_file(bool filter_tensor = false, int n_threads = 0) {
         LOG_INFO("loading LoRA from '%s'", file_path.c_str());
 
         if (load_failed) {
@@ -123,41 +124,53 @@ struct LoraModel : public GGMLRunner {
             return false;
         }
 
+        std::unordered_map<std::string, TensorStorage> tensors_to_create;
+        std::mutex lora_mutex;
+
         bool dry_run          = true;
         auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
-            const std::string& name = tensor_storage.name;
-
-            if (filter_tensor && !contains(name, "lora")) {
-                // LOG_INFO("skipping LoRA tesnor '%s'", name.c_str());
-                return true;
-            }
-            // LOG_INFO("lora_tensor %s", name.c_str());
-            for (int i = 0; i < LORA_TYPE_COUNT; i++) {
-                if (name.find(type_fingerprints[i]) != std::string::npos) {
-                    type = (lora_t)i;
-                    break;
-                }
-            }
-
             if (dry_run) {
-                struct ggml_tensor* real = ggml_new_tensor(params_ctx,
-                                                           tensor_storage.type,
-                                                           tensor_storage.n_dims,
-                                                           tensor_storage.ne);
-                lora_tensors[name]       = real;
+                const std::string& name = tensor_storage.name;
+                if (filter_tensor && !contains(name, "lora")) {
+                    return true;
+                }
+
+                {
+                    std::lock_guard<std::mutex> lock(lora_mutex);
+                    for (int i = 0; i < LORA_TYPE_COUNT; i++) {
+                        if (name.find(type_fingerprints[i]) != std::string::npos) {
+                            type = (lora_t)i;
+                            break;
+                        }
+                    }
+                    tensors_to_create[name] = tensor_storage;
+                }
             } else {
-                auto real   = lora_tensors[name];
-                *dst_tensor = real;
+                const std::string& name = tensor_storage.name;
+                auto iter               = lora_tensors.find(name);
+                if (iter != lora_tensors.end()) {
+                    *dst_tensor = iter->second;
+                }
             }
 
             return true;
         };
 
-        model_loader.load_tensors(on_new_tensor_cb);
+        model_loader.load_tensors(on_new_tensor_cb, n_threads);
+
+        for (const auto& pair : tensors_to_create) {
+            const auto& name         = pair.first;
+            const auto& ts           = pair.second;
+            struct ggml_tensor* real = ggml_new_tensor(params_ctx,
+                                                       ts.type,
+                                                       ts.n_dims,
+                                                       ts.ne);
+            lora_tensors[name]       = real;
+        }
+
         alloc_params_buffer();
         // exit(0);
 
         dry_run = false;
-        model_loader.load_tensors(on_new_tensor_cb);
+        model_loader.load_tensors(on_new_tensor_cb, n_threads);
 
         LOG_DEBUG("lora type: \"%s\"/\"%s\"", lora_downs[type].c_str(), lora_ups[type].c_str());

model.cpp

@@ -1,8 +1,13 @@
 #include <stdarg.h>
+#include <atomic>
+#include <chrono>
 #include <fstream>
 #include <functional>
+#include <mutex>
 #include <regex>
 #include <set>
 #include <string>
+#include <thread>
 #include <unordered_map>
 #include <vector>
@@ -1944,292 +1949,344 @@ std::string ModelLoader::load_umt5_tokenizer_json() {
     return json_str;
 }
 
-std::vector<TensorStorage> remove_duplicates(const std::vector<TensorStorage>& vec) {
-    std::vector<TensorStorage> res;
-    std::unordered_map<std::string, size_t> name_to_index_map;
-
-    for (size_t i = 0; i < vec.size(); ++i) {
-        const std::string& current_name = vec[i].name;
-        auto it = name_to_index_map.find(current_name);
-        if (it != name_to_index_map.end()) {
-            res[it->second] = vec[i];
-        } else {
-            name_to_index_map[current_name] = i;
-            res.push_back(vec[i]);
-        }
-    }
-
-    // vec.resize(name_to_index_map.size());
-
-    return res;
-}
-
-bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) {
-    int64_t process_time_ms = 0;
-    int64_t read_time_ms = 0;
-    int64_t memcpy_time_ms = 0;
-    int64_t copy_to_backend_time_ms = 0;
-    int64_t convert_time_ms = 0;
-
-    int64_t prev_time_ms = 0;
-    int64_t curr_time_ms = 0;
-    int64_t start_time = ggml_time_ms();
-    prev_time_ms = start_time;
+bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p) {
+    int64_t process_time_ms = 0;
+    std::atomic<int64_t> read_time_ms(0);
+    std::atomic<int64_t> memcpy_time_ms(0);
+    std::atomic<int64_t> copy_to_backend_time_ms(0);
+    std::atomic<int64_t> convert_time_ms(0);
+
+    int num_threads_to_use = n_threads_p > 0 ? n_threads_p : (int)std::thread::hardware_concurrency();
+
+    int64_t start_time = ggml_time_ms();
     std::vector<TensorStorage> processed_tensor_storages;
-    for (auto& tensor_storage : tensor_storages) {
-        // LOG_DEBUG("%s", name.c_str());
-
-        if (is_unused_tensor(tensor_storage.name)) {
-            continue;
-        }
-
-        preprocess_tensor(tensor_storage, processed_tensor_storages);
-    }
-    std::vector<TensorStorage> dedup = remove_duplicates(processed_tensor_storages);
-    processed_tensor_storages = dedup;
-
-    curr_time_ms = ggml_time_ms();
-    process_time_ms = curr_time_ms - prev_time_ms;
-    prev_time_ms = curr_time_ms;
+    {
+        struct IndexedStorage {
+            size_t index;
+            TensorStorage ts;
+        };
+
+        std::mutex vec_mutex;
+        std::vector<IndexedStorage> all_results;
+
+        int n_threads = std::min(num_threads_to_use, (int)tensor_storages.size());
+        if (n_threads < 1) {
+            n_threads = 1;
+        }
+        std::vector<std::thread> workers;
+
+        for (int i = 0; i < n_threads; ++i) {
+            workers.emplace_back([&, thread_id = i]() {
+                std::vector<IndexedStorage> local_results;
+                std::vector<TensorStorage> temp_storages;
+
+                for (size_t j = thread_id; j < tensor_storages.size(); j += n_threads) {
+                    const auto& tensor_storage = tensor_storages[j];
+                    if (is_unused_tensor(tensor_storage.name)) {
+                        continue;
+                    }
+
+                    temp_storages.clear();
+                    preprocess_tensor(tensor_storage, temp_storages);
+
+                    for (const auto& ts : temp_storages) {
+                        local_results.push_back({j, ts});
+                    }
+                }
+
+                if (!local_results.empty()) {
+                    std::lock_guard<std::mutex> lock(vec_mutex);
+                    all_results.insert(all_results.end(),
+                                       local_results.begin(), local_results.end());
+                }
+            });
+        }
+        for (auto& w : workers) {
+            w.join();
+        }
+
+        std::unordered_map<std::string, IndexedStorage> latest_map;
+        for (auto& entry : all_results) {
+            latest_map[entry.ts.name] = entry;
+        }
+
+        processed_tensor_storages.reserve(latest_map.size());
+        for (auto& [name, entry] : latest_map) {
+            processed_tensor_storages.push_back(entry.ts);
+        }
+    }
+    process_time_ms = ggml_time_ms() - start_time;
 
-    bool success = true;
+    size_t total_tensors_processed = 0;
+    const size_t total_tensors_to_process = processed_tensor_storages.size();
+    const int64_t t_start = ggml_time_ms();
+    int last_n_threads = 1;
+
+    bool success = true;
     for (size_t file_index = 0; file_index < file_paths_.size(); file_index++) {
         std::string file_path = file_paths_[file_index];
         LOG_DEBUG("loading tensors from %s", file_path.c_str());
 
-        std::ifstream file(file_path, std::ios::binary);
-        if (!file.is_open()) {
-            LOG_ERROR("failed to open '%s'", file_path.c_str());
-            return false;
-        }
+        std::vector<const TensorStorage*> file_tensors;
+        for (const auto& ts : processed_tensor_storages) {
+            if (ts.file_index == file_index) {
+                file_tensors.push_back(&ts);
+            }
+        }
+        if (file_tensors.empty()) {
+            continue;
+        }
 
         bool is_zip = false;
-        for (auto& tensor_storage : tensor_storages) {
-            if (tensor_storage.file_index != file_index) {
-                continue;
-            }
-            if (tensor_storage.index_in_zip >= 0) {
+        for (auto const& ts : file_tensors) {
+            if (ts->index_in_zip >= 0) {
                 is_zip = true;
                 break;
             }
         }
 
-        struct zip_t* zip = NULL;
-        if (is_zip) {
-            zip = zip_open(file_path.c_str(), 0, 'r');
-            if (zip == NULL) {
-                LOG_ERROR("failed to open zip '%s'", file_path.c_str());
-                return false;
-            }
-        }
+        int n_threads = is_zip ? 1 : std::min(num_threads_to_use, (int)file_tensors.size());
+        if (n_threads < 1) {
+            n_threads = 1;
+        }
+        last_n_threads = n_threads;
+        std::atomic<size_t> tensor_idx(0);
+        std::atomic<bool> failed(false);
+        std::vector<std::thread> workers;
+
+        for (int i = 0; i < n_threads; ++i) {
+            workers.emplace_back([&, file_path, is_zip]() {
+                std::ifstream file;
+                struct zip_t* zip = NULL;
+
+                if (is_zip) {
+                    zip = zip_open(file_path.c_str(), 0, 'r');
+                    if (zip == NULL) {
+                        LOG_ERROR("failed to open zip '%s'", file_path.c_str());
+                        failed = true;
+                        return;
+                    }
+                } else {
+                    file.open(file_path, std::ios::binary);
+                    if (!file.is_open()) {
+                        LOG_ERROR("failed to open '%s'", file_path.c_str());
+                        failed = true;
+                        return;
+                    }
+                }
+
+                std::vector<uint8_t> read_buffer;
+                std::vector<uint8_t> convert_buffer;
+
+                while (true) {
+                    int64_t t0, t1;
+                    size_t idx = tensor_idx.fetch_add(1);
+                    if (idx >= file_tensors.size() || failed) {
+                        break;
+                    }
+                    const TensorStorage& tensor_storage = *file_tensors[idx];
+                    ggml_tensor* dst_tensor = NULL;
+
+                    t0 = ggml_time_ms();
+
+                    if (!on_new_tensor_cb(tensor_storage, &dst_tensor)) {
+                        LOG_WARN("process tensor failed: '%s'", tensor_storage.name.c_str());
+                        failed = true;
+                        break;
+                    }
+
+                    if (dst_tensor == NULL) {
+                        t1 = ggml_time_ms();
+                        read_time_ms.fetch_add(t1 - t0);
+                        continue;
+                    }
+
+                    size_t nbytes_to_read = tensor_storage.nbytes_to_read();
+
+                    auto read_data = [&](char* buf, size_t n) {
+                        if (zip != NULL) {
+                            zip_entry_openbyindex(zip, tensor_storage.index_in_zip);
+                            size_t entry_size = zip_entry_size(zip);
+                            if (entry_size != n) {
+                                int64_t t_memcpy_start;
+                                read_buffer.resize(entry_size);
+                                zip_entry_noallocread(zip, (void*)read_buffer.data(), entry_size);
+                                t_memcpy_start = ggml_time_ms();
+                                memcpy((void*)buf, (void*)(read_buffer.data() + tensor_storage.offset), n);
+                                memcpy_time_ms.fetch_add(ggml_time_ms() - t_memcpy_start);
+                            } else {
+                                zip_entry_noallocread(zip, (void*)buf, n);
+                            }
+                            zip_entry_close(zip);
+                        } else {
+                            file.seekg(tensor_storage.offset);
+                            file.read(buf, n);
+                            if (!file) {
+                                LOG_ERROR("read tensor data failed: '%s'", file_path.c_str());
+                                failed = true;
+                            }
+                        }
+                    };
+
+                    if (dst_tensor->buffer == NULL || ggml_backend_buffer_is_host(dst_tensor->buffer)) {
+                        if (tensor_storage.type == dst_tensor->type) {
+                            GGML_ASSERT(ggml_nbytes(dst_tensor) == tensor_storage.nbytes());
+                            if (tensor_storage.is_f64 || tensor_storage.is_i64) {
+                                read_buffer.resize(tensor_storage.nbytes_to_read());
+                                read_data((char*)read_buffer.data(), nbytes_to_read);
+                            } else {
+                                read_data((char*)dst_tensor->data, nbytes_to_read);
+                            }
+                            t1 = ggml_time_ms();
+                            read_time_ms.fetch_add(t1 - t0);
+
+                            t0 = ggml_time_ms();
+                            if (tensor_storage.is_bf16) {
+                                // inplace op
+                                bf16_to_f32_vec((uint16_t*)dst_tensor->data, (float*)dst_tensor->data, tensor_storage.nelements());
+                            } else if (tensor_storage.is_f8_e4m3) {
+                                // inplace op
+                                f8_e4m3_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements());
+                            } else if (tensor_storage.is_f8_e5m2) {
+                                // inplace op
+                                f8_e5m2_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements());
+                            } else if (tensor_storage.is_f64) {
+                                f64_to_f32_vec((double*)read_buffer.data(), (float*)dst_tensor->data, tensor_storage.nelements());
+                            } else if (tensor_storage.is_i64) {
+                                i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)dst_tensor->data, tensor_storage.nelements());
+                            }
+                            t1 = ggml_time_ms();
+                            convert_time_ms.fetch_add(t1 - t0);
+                        } else {
+                            read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read()));
+                            read_data((char*)read_buffer.data(), nbytes_to_read);
+                            t1 = ggml_time_ms();
+                            read_time_ms.fetch_add(t1 - t0);
+
+                            t0 = ggml_time_ms();
+                            if (tensor_storage.is_bf16) {
+                                // inplace op
+                                bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
+                            } else if (tensor_storage.is_f8_e4m3) {
+                                // inplace op
+                                f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
+                            } else if (tensor_storage.is_f8_e5m2) {
+                                // inplace op
+                                f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
+                            } else if (tensor_storage.is_f64) {
+                                // inplace op
+                                f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
+                            } else if (tensor_storage.is_i64) {
+                                // inplace op
+                                i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements());
+                            }
+
+                            convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data, dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
+                            t1 = ggml_time_ms();
+                            convert_time_ms.fetch_add(t1 - t0);
+                        }
+                    } else {
+                        read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read()));
+                        read_data((char*)read_buffer.data(), nbytes_to_read);
+                        t1 = ggml_time_ms();
+                        read_time_ms.fetch_add(t1 - t0);
+
+                        t0 = ggml_time_ms();
+                        if (tensor_storage.is_bf16) {
+                            // inplace op
+                            bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
+                        } else if (tensor_storage.is_f8_e4m3) {
+                            // inplace op
+                            f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
+                        } else if (tensor_storage.is_f8_e5m2) {
+                            // inplace op
+                            f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
+                        } else if (tensor_storage.is_f64) {
+                            // inplace op
+                            f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
+                        } else if (tensor_storage.is_i64) {
+                            // inplace op
+                            i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements());
+                        }
+
+                        if (tensor_storage.type == dst_tensor->type) {
+                            // copy to device memory
+                            t1 = ggml_time_ms();
+                            convert_time_ms.fetch_add(t1 - t0);
+                            t0 = ggml_time_ms();
+                            ggml_backend_tensor_set(dst_tensor, read_buffer.data(), 0, ggml_nbytes(dst_tensor));
+                            t1 = ggml_time_ms();
+                            copy_to_backend_time_ms.fetch_add(t1 - t0);
+                        } else {
+                            // convert first, then copy to device memory
+                            convert_buffer.resize(ggml_nbytes(dst_tensor));
+                            convert_tensor((void*)read_buffer.data(), tensor_storage.type, (void*)convert_buffer.data(), dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
+                            t1 = ggml_time_ms();
+                            convert_time_ms.fetch_add(t1 - t0);
+                            t0 = ggml_time_ms();
+                            ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor));
+                            t1 = ggml_time_ms();
+                            copy_to_backend_time_ms.fetch_add(t1 - t0);
+                        }
+                    }
+                }
+
+                if (zip != NULL) {
+                    zip_close(zip);
+                }
+            });
+        }
-        std::vector<uint8_t> read_buffer;
-        std::vector<uint8_t> convert_buffer;
-
-        auto read_data = [&](const TensorStorage& tensor_storage, char* buf, size_t n) {
-            if (zip != NULL) {
-                zip_entry_openbyindex(zip, tensor_storage.index_in_zip);
-                size_t entry_size = zip_entry_size(zip);
-                if (entry_size != n) {
-                    read_buffer.resize(entry_size);
-                    prev_time_ms = ggml_time_ms();
-                    zip_entry_noallocread(zip, (void*)read_buffer.data(), entry_size);
-                    curr_time_ms = ggml_time_ms();
-                    read_time_ms += curr_time_ms - prev_time_ms;
-                    prev_time_ms = curr_time_ms;
-                    memcpy((void*)buf, (void*)(read_buffer.data() + tensor_storage.offset), n);
-                    curr_time_ms = ggml_time_ms();
-                    memcpy_time_ms += curr_time_ms - prev_time_ms;
-                } else {
-                    prev_time_ms = ggml_time_ms();
-                    zip_entry_noallocread(zip, (void*)buf, n);
-                    curr_time_ms = ggml_time_ms();
-                    read_time_ms += curr_time_ms - prev_time_ms;
-                }
-                zip_entry_close(zip);
-            } else {
-                prev_time_ms = ggml_time_ms();
-                file.seekg(tensor_storage.offset);
-                file.read(buf, n);
-                curr_time_ms = ggml_time_ms();
-                read_time_ms += curr_time_ms - prev_time_ms;
-                if (!file) {
-                    LOG_ERROR("read tensor data failed: '%s'", file_path.c_str());
-                    return false;
-                }
-            }
-            return true;
-        };
-
-        int tensor_count = 0;
-        int64_t t0 = ggml_time_ms();
-        int64_t t1 = t0;
-        bool partial = true;
-        int tensor_max = (int)processed_tensor_storages.size();
-        pretty_progress(0, tensor_max, 0.0f);
-
-        for (auto& tensor_storage : processed_tensor_storages) {
-            if (tensor_storage.file_index != file_index) {
-                ++tensor_count;
-                continue;
-            }
-
-            ggml_tensor* dst_tensor = NULL;
-
-            success = on_new_tensor_cb(tensor_storage, &dst_tensor);
-            if (!success) {
-                LOG_WARN("process tensor failed: '%s'", tensor_storage.name.c_str());
-                break;
-            }
-
-            if (dst_tensor == NULL) {
-                ++tensor_count;
-                continue;
-            }
-
-            size_t nbytes_to_read = tensor_storage.nbytes_to_read();
-
-            if (dst_tensor->buffer == NULL || ggml_backend_buffer_is_host(dst_tensor->buffer)) {
-                // for the CPU and Metal backend, we can copy directly into the tensor
-                if (tensor_storage.type == dst_tensor->type) {
-                    GGML_ASSERT(ggml_nbytes(dst_tensor) == tensor_storage.nbytes());
-                    if (tensor_storage.is_f64 || tensor_storage.is_i64) {
-                        read_buffer.resize(tensor_storage.nbytes_to_read());
-                        read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read);
-                    } else {
-                        read_data(tensor_storage, (char*)dst_tensor->data, nbytes_to_read);
-                    }
-                    prev_time_ms = ggml_time_ms();
-                    if (tensor_storage.is_bf16) {
-                        // inplace op
-                        bf16_to_f32_vec((uint16_t*)dst_tensor->data, (float*)dst_tensor->data, tensor_storage.nelements());
-                    } else if (tensor_storage.is_f8_e4m3) {
-                        // inplace op
-                        f8_e4m3_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements());
-                    } else if (tensor_storage.is_f8_e5m2) {
-                        // inplace op
-                        f8_e5m2_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements());
-                    } else if (tensor_storage.is_f64) {
-                        f64_to_f32_vec((double*)read_buffer.data(), (float*)dst_tensor->data, tensor_storage.nelements());
-                    } else if (tensor_storage.is_i64) {
-                        i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)dst_tensor->data, tensor_storage.nelements());
-                    }
-                    curr_time_ms = ggml_time_ms();
-                    convert_time_ms += curr_time_ms - prev_time_ms;
-                } else {
-                    read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read()));
-                    read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read);
-                    prev_time_ms = ggml_time_ms();
-                    if (tensor_storage.is_bf16) {
-                        // inplace op
-                        bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
-                    } else if (tensor_storage.is_f8_e4m3) {
-                        // inplace op
-                        f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
-                    } else if (tensor_storage.is_f8_e5m2) {
-                        // inplace op
-                        f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
-                    } else if (tensor_storage.is_f64) {
-                        // inplace op
-                        f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
-                    } else if (tensor_storage.is_i64) {
-                        // inplace op
-                        i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements());
-                    }
-                    convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data,
-                                   dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
-                    curr_time_ms = ggml_time_ms();
-                    convert_time_ms += curr_time_ms - prev_time_ms;
-                }
-            } else {
-                read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read()));
-                read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read);
-                prev_time_ms = ggml_time_ms();
-                if (tensor_storage.is_bf16) {
-                    // inplace op
-                    bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
-                } else if (tensor_storage.is_f8_e4m3) {
-                    // inplace op
-                    f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
-                } else if (tensor_storage.is_f8_e5m2) {
-                    // inplace op
-                    f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
-                } else if (tensor_storage.is_f64) {
-                    // inplace op
-                    f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
-                } else if (tensor_storage.is_i64) {
-                    // inplace op
-                    i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements());
-                }
-                if (tensor_storage.type == dst_tensor->type) {
-                    // copy to device memory
-                    curr_time_ms = ggml_time_ms();
-                    convert_time_ms += curr_time_ms - prev_time_ms;
-                    prev_time_ms = curr_time_ms;
-                    ggml_backend_tensor_set(dst_tensor, read_buffer.data(), 0, ggml_nbytes(dst_tensor));
-                    curr_time_ms = ggml_time_ms();
-                    copy_to_backend_time_ms += curr_time_ms - prev_time_ms;
-                } else {
-                    // convert first, then copy to device memory
-                    convert_buffer.resize(ggml_nbytes(dst_tensor));
-                    convert_tensor((void*)read_buffer.data(), tensor_storage.type,
-                                   (void*)convert_buffer.data(), dst_tensor->type,
-                                   (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
-                    curr_time_ms = ggml_time_ms();
-                    convert_time_ms += curr_time_ms - prev_time_ms;
-                    prev_time_ms = curr_time_ms;
-                    ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor));
-                    curr_time_ms = ggml_time_ms();
-                    copy_to_backend_time_ms += curr_time_ms - prev_time_ms;
-                }
-            }
-
-            ++tensor_count;
-            int64_t t2 = ggml_time_ms();
-            if ((t2 - t1) >= 200) {
-                t1 = t2;
-                pretty_progress(tensor_count, tensor_max, (t1 - t0) / (1000.0f * tensor_count));
-                partial = tensor_count != tensor_max;
-            }
-        }
-
-        if (partial) {
-            if (tensor_count >= 1) {
-                t1 = ggml_time_ms();
-                pretty_progress(tensor_count, tensor_max, (t1 - t0) / (1000.0f * tensor_count));
-            }
-            if (tensor_count < tensor_max) {
-                printf("\n");
-            }
-        }
-
-        if (zip != NULL) {
-            zip_close(zip);
-        }
-
-        if (!success) {
-            break;
-        }
+        while (true) {
+            size_t current_idx = tensor_idx.load();
+            if (current_idx >= file_tensors.size() || failed) {
+                break;
+            }
+            size_t curr_num = total_tensors_processed + current_idx;
+            pretty_progress(curr_num, total_tensors_to_process, (ggml_time_ms() - t_start) / 1000.0f / (curr_num + 1e-6f));
+            std::this_thread::sleep_for(std::chrono::milliseconds(200));
+        }
+
+        for (auto& w : workers) {
+            w.join();
+        }
+
+        if (failed) {
+            success = false;
+            break;
+        }
+
+        total_tensors_processed += file_tensors.size();
+        pretty_progress(total_tensors_processed, total_tensors_to_process, (ggml_time_ms() - t_start) / 1000.0f / (total_tensors_processed + 1e-6f));
+        if (total_tensors_processed < total_tensors_to_process) {
+            printf("\n");
+        }
     }
 
     int64_t end_time = ggml_time_ms();
     LOG_INFO("loading tensors completed, taking %.2fs (process: %.2fs, read: %.2fs, memcpy: %.2fs, convert: %.2fs, copy_to_backend: %.2fs)",
              (end_time - start_time) / 1000.f,
              process_time_ms / 1000.f,
-             read_time_ms / 1000.f,
-             memcpy_time_ms / 1000.f,
-             convert_time_ms / 1000.f,
-             copy_to_backend_time_ms / 1000.f);
+             (read_time_ms.load() / (float)last_n_threads) / 1000.f,
+             (memcpy_time_ms.load() / (float)last_n_threads) / 1000.f,
+             (convert_time_ms.load() / (float)last_n_threads) / 1000.f,
+             (copy_to_backend_time_ms.load() / (float)last_n_threads) / 1000.f);
 
     return success;
 }
 bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
-                               std::set<std::string> ignore_tensors) {
+                               std::set<std::string> ignore_tensors,
+                               int n_threads) {
     std::set<std::string> tensor_names_in_file;
+    std::mutex tensor_names_mutex;
 
     auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
         const std::string& name = tensor_storage.name;
         // LOG_DEBUG("%s", tensor_storage.to_string().c_str());
-        tensor_names_in_file.insert(name);
+        {
+            std::lock_guard<std::mutex> lock(tensor_names_mutex);
+            tensor_names_in_file.insert(name);
+        }
 
         struct ggml_tensor* real;
         if (tensors.find(name) != tensors.end()) {
@@ -2263,7 +2320,7 @@ bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
         return true;
     };
 
-    bool success = load_tensors(on_new_tensor_cb);
+    bool success = load_tensors(on_new_tensor_cb, n_threads);
     if (!success) {
         LOG_ERROR("load tensors from file failed");
         return false;
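Two details worth noting from the hunk above: the effective thread count is resolved per file (an explicit positive n_threads wins, otherwise std::thread::hardware_concurrency(), clamped to the number of tensors in that file, with zip-based checkpoints forced to a single reader thread), and the per-phase timers become atomics whose sums are divided by last_n_threads for the final log line. The snippet below is a hedged sketch of that thread-count resolution; resolve_n_threads is an illustrative helper, not part of the repository.

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <thread>

// Mirrors the selection logic in the hunk above: explicit request wins,
// otherwise auto-detect; zip inputs stay on one reader thread.
static int resolve_n_threads(int requested, size_t work_items, bool is_zip) {
    int n = requested > 0 ? requested : (int)std::thread::hardware_concurrency();
    if (is_zip) {
        return 1;  // zip checkpoints are read sequentially
    }
    n = std::min(n, (int)work_items);  // never spawn more workers than tensors
    return n < 1 ? 1 : n;
}

int main() {
    printf("flat file, auto:  %d threads\n", resolve_n_threads(0, 1000, false));
    printf("flat file, cap 4: %d threads\n", resolve_n_threads(4, 1000, false));
    printf("zip checkpoint:   %d threads\n", resolve_n_threads(0, 1000, true));
    return 0;
}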

model.h

@@ -247,9 +247,10 @@ public:
     ggml_type get_diffusion_model_wtype();
     ggml_type get_vae_wtype();
     void set_wtype_override(ggml_type wtype, std::string prefix = "");
-    bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb);
+    bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0);
     bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
-                      std::set<std::string> ignore_tensors = {});
+                      std::set<std::string> ignore_tensors = {},
+                      int n_threads = 0);
     bool save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules);
     bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type);
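Because the new n_threads parameters default to 0, existing call sites keep compiling and the loader falls back to auto-detection; only callers that want to cap I/O parallelism need to change. A toy mock with the same signature shape (MockLoader is illustrative, not part of the repository) shows the effect on call sites:

#include <cstdio>
#include <map>
#include <set>
#include <string>

// Mock mirroring the shape of ModelLoader::load_tensors() above, only to show
// how the defaulted n_threads keeps old call sites compiling.
struct MockLoader {
    bool load_tensors(std::map<std::string, int>& tensors,
                      std::set<std::string> ignore_tensors = {},
                      int n_threads = 0) {
        printf("loading %zu tensors with n_threads=%d (0 = auto)\n",
               tensors.size(), n_threads);
        return true;
    }
};

int main() {
    MockLoader loader;
    std::map<std::string, int> tensors = {{"model.diffusion_model.x", 1}};

    loader.load_tensors(tensors);         // old call site, unchanged: n_threads = 0 (auto)
    loader.load_tensors(tensors, {}, 4);  // new call site: cap tensor-loading I/O at 4 threads
    return 0;
}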

stable-diffusion.cpp

@@ -576,7 +576,7 @@ public:
         if (version == VERSION_SVD) {
             ignore_tensors.insert("conditioner.embedders.3");
         }
-        bool success = model_loader.load_tensors(tensors, ignore_tensors);
+        bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads);
         if (!success) {
             LOG_ERROR("load tensors from model loader failed");
             ggml_free(ctx);