Mirror of https://github.com/leejet/stable-diffusion.cpp.git (synced 2025-12-12 13:28:37 +00:00)
feat: optimize tensor loading time (#790)
* opt tensor loading
* fix build failure
* revert the changes
* allow the use of n_threads
* fix lora loading
* optimize lora loading
* add mutex
* use atomic
* fix build
* fix potential duplicate issue
* avoid duplicate lookup of lora tensor
* fix progress bar
* remove unused remove_duplicates

Co-authored-by: leejet <leejet714@gmail.com>
This commit is contained in:
parent 52a97b3ac1
commit 55c2e05d98
lora.hpp (65 changed lines)
@@ -1,6 +1,7 @@
#ifndef __LORA_HPP__
#define __LORA_HPP__

#include <mutex>
#include "ggml_extend.hpp"

#define LORA_GRAPH_BASE_SIZE 10240
@@ -115,7 +116,7 @@ struct LoraModel : public GGMLRunner {
return "lora";
}

bool load_from_file(bool filter_tensor = false) {
bool load_from_file(bool filter_tensor = false, int n_threads = 0) {
LOG_INFO("loading LoRA from '%s'", file_path.c_str());

if (load_failed) {
@@ -123,41 +124,53 @@ struct LoraModel : public GGMLRunner {
return false;
}

std::unordered_map<std::string, TensorStorage> tensors_to_create;
std::mutex lora_mutex;
bool dry_run = true;
auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
const std::string& name = tensor_storage.name;
if (dry_run) {
const std::string& name = tensor_storage.name;

if (filter_tensor && !contains(name, "lora")) {
// LOG_INFO("skipping LoRA tesnor '%s'", name.c_str());
return true;
}
// LOG_INFO("lora_tensor %s", name.c_str());
for (int i = 0; i < LORA_TYPE_COUNT; i++) {
if (name.find(type_fingerprints[i]) != std::string::npos) {
type = (lora_t)i;
break;
if (filter_tensor && !contains(name, "lora")) {
return true;
}

{
std::lock_guard<std::mutex> lock(lora_mutex);
for (int i = 0; i < LORA_TYPE_COUNT; i++) {
if (name.find(type_fingerprints[i]) != std::string::npos) {
type = (lora_t)i;
break;
}
}
tensors_to_create[name] = tensor_storage;
}
} else {
const std::string& name = tensor_storage.name;
auto iter = lora_tensors.find(name);
if (iter != lora_tensors.end()) {
*dst_tensor = iter->second;
}
}

if (dry_run) {
struct ggml_tensor* real = ggml_new_tensor(params_ctx,
tensor_storage.type,
tensor_storage.n_dims,
tensor_storage.ne);
lora_tensors[name] = real;
} else {
auto real = lora_tensors[name];
*dst_tensor = real;
}

return true;
};

model_loader.load_tensors(on_new_tensor_cb);
model_loader.load_tensors(on_new_tensor_cb, n_threads);

for (const auto& pair : tensors_to_create) {
const auto& name = pair.first;
const auto& ts = pair.second;
struct ggml_tensor* real = ggml_new_tensor(params_ctx,
ts.type,
ts.n_dims,
ts.ne);
lora_tensors[name] = real;
}

alloc_params_buffer();
// exit(0);

dry_run = false;
model_loader.load_tensors(on_new_tensor_cb);
model_loader.load_tensors(on_new_tensor_cb, n_threads);

LOG_DEBUG("lora type: \"%s\"/\"%s\"", lora_downs[type].c_str(), lora_ups[type].c_str());
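In the lora.hpp hunk above, the dry-run pass no longer allocates ggml tensors from inside the callback (which may now run on several loader threads); it only records each TensorStorage in a map guarded by lora_mutex, and the tensors are created afterwards in a single-threaded loop over tensors_to_create. A minimal standalone sketch of that collect-then-allocate pattern, assuming a placeholder TensorMeta type instead of the project's TensorStorage:

#include <mutex>
#include <string>
#include <thread>
#include <unordered_map>
#include <vector>

struct TensorMeta {          // stand-in for TensorStorage
    std::string name;
    size_t      nbytes = 0;
};

int main() {
    std::unordered_map<std::string, TensorMeta> to_create;  // filled concurrently
    std::mutex mtx;

    // phase 1 (dry run): the callback may be invoked from many threads,
    // so it only records metadata under a lock
    auto on_tensor = [&](const TensorMeta& meta) {
        std::lock_guard<std::mutex> lock(mtx);
        to_create[meta.name] = meta;  // duplicates collapse, last write wins
    };

    std::vector<std::thread> workers;
    for (int t = 0; t < 4; ++t) {
        workers.emplace_back([&, t] { on_tensor({"tensor_" + std::to_string(t), 1024}); });
    }
    for (auto& w : workers) {
        w.join();
    }

    // phase 2: single-threaded, the safe place to call a non-thread-safe
    // allocator such as ggml_new_tensor() in the real code
    for (const auto& entry : to_create) {
        (void)entry;  // allocate one tensor per recorded entry here
    }
    return 0;
}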
model.cpp (547 changed lines)
@@ -1,8 +1,13 @@
#include <stdarg.h>
#include <atomic>
#include <chrono>
#include <fstream>
#include <functional>
#include <mutex>
#include <regex>
#include <set>
#include <string>
#include <thread>
#include <unordered_map>
#include <vector>

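The new <atomic>, <chrono>, <mutex>, and <thread> includes support the multi-threaded loader below; in particular, the per-phase timers (read, memcpy, convert, copy_to_backend) become std::atomic<int64_t> so each worker can add its elapsed time without taking a lock. A small self-contained illustration of that accumulation; the counter name echoes the diff, everything else is illustrative:

#include <atomic>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
    std::atomic<int64_t> read_time_ms(0);  // shared by all workers, no lock needed

    auto now_ms = [] {
        using namespace std::chrono;
        return (int64_t)duration_cast<milliseconds>(steady_clock::now().time_since_epoch()).count();
    };

    std::vector<std::thread> workers;
    for (int i = 0; i < 4; ++i) {
        workers.emplace_back([&] {
            int64_t t0 = now_ms();
            std::this_thread::sleep_for(std::chrono::milliseconds(10));  // pretend to read a tensor
            read_time_ms.fetch_add(now_ms() - t0);                       // lock-free accumulation
        });
    }
    for (auto& w : workers) {
        w.join();
    }

    // the sum counts overlapping work from all threads, which is why the loader
    // divides by the thread count when it prints its timing summary
    printf("accumulated read time: %lld ms\n", (long long)read_time_ms.load());
    return 0;
}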
@@ -1944,292 +1949,344 @@ std::string ModelLoader::load_umt5_tokenizer_json() {
return json_str;
}

std::vector<TensorStorage> remove_duplicates(const std::vector<TensorStorage>& vec) {
std::vector<TensorStorage> res;
std::unordered_map<std::string, size_t> name_to_index_map;
bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p) {
int64_t process_time_ms = 0;
std::atomic<int64_t> read_time_ms(0);
std::atomic<int64_t> memcpy_time_ms(0);
std::atomic<int64_t> copy_to_backend_time_ms(0);
std::atomic<int64_t> convert_time_ms(0);

for (size_t i = 0; i < vec.size(); ++i) {
const std::string& current_name = vec[i].name;
auto it = name_to_index_map.find(current_name);
int num_threads_to_use = n_threads_p > 0 ? n_threads_p : (int)std::thread::hardware_concurrency();

if (it != name_to_index_map.end()) {
res[it->second] = vec[i];
} else {
name_to_index_map[current_name] = i;
res.push_back(vec[i]);
}
}

// vec.resize(name_to_index_map.size());

return res;
}

bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) {
int64_t process_time_ms = 0;
int64_t read_time_ms = 0;
int64_t memcpy_time_ms = 0;
int64_t copy_to_backend_time_ms = 0;
int64_t convert_time_ms = 0;

int64_t prev_time_ms = 0;
int64_t curr_time_ms = 0;
int64_t start_time = ggml_time_ms();
prev_time_ms = start_time;
int64_t start_time = ggml_time_ms();
std::vector<TensorStorage> processed_tensor_storages;
for (auto& tensor_storage : tensor_storages) {
// LOG_DEBUG("%s", name.c_str());

if (is_unused_tensor(tensor_storage.name)) {
continue;
{
struct IndexedStorage {
size_t index;
TensorStorage ts;
};

std::mutex vec_mutex;
std::vector<IndexedStorage> all_results;

int n_threads = std::min(num_threads_to_use, (int)tensor_storages.size());
if (n_threads < 1) {
n_threads = 1;
}
std::vector<std::thread> workers;

for (int i = 0; i < n_threads; ++i) {
workers.emplace_back([&, thread_id = i]() {
std::vector<IndexedStorage> local_results;
std::vector<TensorStorage> temp_storages;

for (size_t j = thread_id; j < tensor_storages.size(); j += n_threads) {
const auto& tensor_storage = tensor_storages[j];
if (is_unused_tensor(tensor_storage.name)) {
continue;
}

temp_storages.clear();
preprocess_tensor(tensor_storage, temp_storages);

for (const auto& ts : temp_storages) {
local_results.push_back({j, ts});
}
}

if (!local_results.empty()) {
std::lock_guard<std::mutex> lock(vec_mutex);
all_results.insert(all_results.end(),
local_results.begin(), local_results.end());
}
});
}
for (auto& w : workers) {
w.join();
}

preprocess_tensor(tensor_storage, processed_tensor_storages);
}
std::vector<TensorStorage> dedup = remove_duplicates(processed_tensor_storages);
processed_tensor_storages = dedup;
curr_time_ms = ggml_time_ms();
process_time_ms = curr_time_ms - prev_time_ms;
prev_time_ms = curr_time_ms;
std::unordered_map<std::string, IndexedStorage> latest_map;
for (auto& entry : all_results) {
latest_map[entry.ts.name] = entry;
}

processed_tensor_storages.reserve(latest_map.size());
for (auto& [name, entry] : latest_map) {
processed_tensor_storages.push_back(entry.ts);
}
}

process_time_ms = ggml_time_ms() - start_time;

bool success = true;
size_t total_tensors_processed = 0;
const size_t total_tensors_to_process = processed_tensor_storages.size();
const int64_t t_start = ggml_time_ms();
int last_n_threads = 1;

bool success = true;
for (size_t file_index = 0; file_index < file_paths_.size(); file_index++) {
std::string file_path = file_paths_[file_index];
LOG_DEBUG("loading tensors from %s", file_path.c_str());

std::ifstream file(file_path, std::ios::binary);
if (!file.is_open()) {
LOG_ERROR("failed to open '%s'", file_path.c_str());
return false;
std::vector<const TensorStorage*> file_tensors;
for (const auto& ts : processed_tensor_storages) {
if (ts.file_index == file_index) {
file_tensors.push_back(&ts);
}
}
if (file_tensors.empty()) {
continue;
}

bool is_zip = false;
for (auto& tensor_storage : tensor_storages) {
if (tensor_storage.file_index != file_index) {
continue;
}
if (tensor_storage.index_in_zip >= 0) {
for (auto const& ts : file_tensors) {
if (ts->index_in_zip >= 0) {
is_zip = true;
break;
}
}

struct zip_t* zip = NULL;
if (is_zip) {
zip = zip_open(file_path.c_str(), 0, 'r');
if (zip == NULL) {
LOG_ERROR("failed to open zip '%s'", file_path.c_str());
return false;
}
int n_threads = is_zip ? 1 : std::min(num_threads_to_use, (int)file_tensors.size());
if (n_threads < 1) {
n_threads = 1;
}
last_n_threads = n_threads;

std::atomic<size_t> tensor_idx(0);
std::atomic<bool> failed(false);
std::vector<std::thread> workers;

for (int i = 0; i < n_threads; ++i) {
workers.emplace_back([&, file_path, is_zip]() {
std::ifstream file;
struct zip_t* zip = NULL;
if (is_zip) {
zip = zip_open(file_path.c_str(), 0, 'r');
if (zip == NULL) {
LOG_ERROR("failed to open zip '%s'", file_path.c_str());
failed = true;
return;
}
} else {
file.open(file_path, std::ios::binary);
if (!file.is_open()) {
LOG_ERROR("failed to open '%s'", file_path.c_str());
failed = true;
return;
}
}

std::vector<uint8_t> read_buffer;
std::vector<uint8_t> convert_buffer;

while (true) {
int64_t t0, t1;
size_t idx = tensor_idx.fetch_add(1);
if (idx >= file_tensors.size() || failed) {
break;
}

const TensorStorage& tensor_storage = *file_tensors[idx];
ggml_tensor* dst_tensor = NULL;

t0 = ggml_time_ms();

if (!on_new_tensor_cb(tensor_storage, &dst_tensor)) {
LOG_WARN("process tensor failed: '%s'", tensor_storage.name.c_str());
failed = true;
break;
}

if (dst_tensor == NULL) {
t1 = ggml_time_ms();
read_time_ms.fetch_add(t1 - t0);
continue;
}

size_t nbytes_to_read = tensor_storage.nbytes_to_read();

auto read_data = [&](char* buf, size_t n) {
if (zip != NULL) {
zip_entry_openbyindex(zip, tensor_storage.index_in_zip);
size_t entry_size = zip_entry_size(zip);
if (entry_size != n) {
int64_t t_memcpy_start;
read_buffer.resize(entry_size);
zip_entry_noallocread(zip, (void*)read_buffer.data(), entry_size);
t_memcpy_start = ggml_time_ms();
memcpy((void*)buf, (void*)(read_buffer.data() + tensor_storage.offset), n);
memcpy_time_ms.fetch_add(ggml_time_ms() - t_memcpy_start);
} else {
zip_entry_noallocread(zip, (void*)buf, n);
}
zip_entry_close(zip);
} else {
file.seekg(tensor_storage.offset);
file.read(buf, n);
if (!file) {
LOG_ERROR("read tensor data failed: '%s'", file_path.c_str());
failed = true;
}
}
};

if (dst_tensor->buffer == NULL || ggml_backend_buffer_is_host(dst_tensor->buffer)) {
if (tensor_storage.type == dst_tensor->type) {
GGML_ASSERT(ggml_nbytes(dst_tensor) == tensor_storage.nbytes());
if (tensor_storage.is_f64 || tensor_storage.is_i64) {
read_buffer.resize(tensor_storage.nbytes_to_read());
read_data((char*)read_buffer.data(), nbytes_to_read);
} else {
read_data((char*)dst_tensor->data, nbytes_to_read);
}
t1 = ggml_time_ms();
read_time_ms.fetch_add(t1 - t0);

t0 = ggml_time_ms();
if (tensor_storage.is_bf16) {
// inplace op
bf16_to_f32_vec((uint16_t*)dst_tensor->data, (float*)dst_tensor->data, tensor_storage.nelements());
} else if (tensor_storage.is_f8_e4m3) {
// inplace op
f8_e4m3_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements());
} else if (tensor_storage.is_f8_e5m2) {
// inplace op
f8_e5m2_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements());
} else if (tensor_storage.is_f64) {
f64_to_f32_vec((double*)read_buffer.data(), (float*)dst_tensor->data, tensor_storage.nelements());
} else if (tensor_storage.is_i64) {
i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)dst_tensor->data, tensor_storage.nelements());
}
t1 = ggml_time_ms();
convert_time_ms.fetch_add(t1 - t0);
} else {
read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read()));
read_data((char*)read_buffer.data(), nbytes_to_read);
t1 = ggml_time_ms();
read_time_ms.fetch_add(t1 - t0);

t0 = ggml_time_ms();
if (tensor_storage.is_bf16) {
// inplace op
bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_f8_e4m3) {
// inplace op
f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_f8_e5m2) {
// inplace op
f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_f64) {
// inplace op
f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_i64) {
// inplace op
i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements());
}
convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data, dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
t1 = ggml_time_ms();
convert_time_ms.fetch_add(t1 - t0);
}
} else {
read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read()));
read_data((char*)read_buffer.data(), nbytes_to_read);
t1 = ggml_time_ms();
read_time_ms.fetch_add(t1 - t0);

t0 = ggml_time_ms();
if (tensor_storage.is_bf16) {
// inplace op
bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_f8_e4m3) {
// inplace op
f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_f8_e5m2) {
// inplace op
f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_f64) {
// inplace op
f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_i64) {
// inplace op
i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements());
}

if (tensor_storage.type == dst_tensor->type) {
// copy to device memory
t1 = ggml_time_ms();
convert_time_ms.fetch_add(t1 - t0);
t0 = ggml_time_ms();
ggml_backend_tensor_set(dst_tensor, read_buffer.data(), 0, ggml_nbytes(dst_tensor));
t1 = ggml_time_ms();
copy_to_backend_time_ms.fetch_add(t1 - t0);
} else {
// convert first, then copy to device memory

convert_buffer.resize(ggml_nbytes(dst_tensor));
convert_tensor((void*)read_buffer.data(), tensor_storage.type, (void*)convert_buffer.data(), dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
t1 = ggml_time_ms();
convert_time_ms.fetch_add(t1 - t0);
t0 = ggml_time_ms();
ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor));
t1 = ggml_time_ms();
copy_to_backend_time_ms.fetch_add(t1 - t0);
}
}
}
if (zip != NULL) {
zip_close(zip);
}
});
}

std::vector<uint8_t> read_buffer;
std::vector<uint8_t> convert_buffer;

auto read_data = [&](const TensorStorage& tensor_storage, char* buf, size_t n) {
if (zip != NULL) {
zip_entry_openbyindex(zip, tensor_storage.index_in_zip);
size_t entry_size = zip_entry_size(zip);
if (entry_size != n) {
read_buffer.resize(entry_size);
prev_time_ms = ggml_time_ms();
zip_entry_noallocread(zip, (void*)read_buffer.data(), entry_size);
curr_time_ms = ggml_time_ms();
read_time_ms += curr_time_ms - prev_time_ms;
prev_time_ms = curr_time_ms;
memcpy((void*)buf, (void*)(read_buffer.data() + tensor_storage.offset), n);
curr_time_ms = ggml_time_ms();
memcpy_time_ms += curr_time_ms - prev_time_ms;
} else {
prev_time_ms = ggml_time_ms();
zip_entry_noallocread(zip, (void*)buf, n);
curr_time_ms = ggml_time_ms();
read_time_ms += curr_time_ms - prev_time_ms;
}
zip_entry_close(zip);
} else {
prev_time_ms = ggml_time_ms();
file.seekg(tensor_storage.offset);
file.read(buf, n);
curr_time_ms = ggml_time_ms();
read_time_ms += curr_time_ms - prev_time_ms;
if (!file) {
LOG_ERROR("read tensor data failed: '%s'", file_path.c_str());
return false;
}
}
return true;
};
int tensor_count = 0;
int64_t t0 = ggml_time_ms();
int64_t t1 = t0;
bool partial = true;
int tensor_max = (int)processed_tensor_storages.size();
pretty_progress(0, tensor_max, 0.0f);
for (auto& tensor_storage : processed_tensor_storages) {
if (tensor_storage.file_index != file_index) {
++tensor_count;
continue;
}
ggml_tensor* dst_tensor = NULL;

success = on_new_tensor_cb(tensor_storage, &dst_tensor);
if (!success) {
LOG_WARN("process tensor failed: '%s'", tensor_storage.name.c_str());
while (true) {
size_t current_idx = tensor_idx.load();
if (current_idx >= file_tensors.size() || failed) {
break;
}

if (dst_tensor == NULL) {
++tensor_count;
continue;
}

size_t nbytes_to_read = tensor_storage.nbytes_to_read();

if (dst_tensor->buffer == NULL || ggml_backend_buffer_is_host(dst_tensor->buffer)) {
// for the CPU and Metal backend, we can copy directly into the tensor
if (tensor_storage.type == dst_tensor->type) {
GGML_ASSERT(ggml_nbytes(dst_tensor) == tensor_storage.nbytes());
if (tensor_storage.is_f64 || tensor_storage.is_i64) {
read_buffer.resize(tensor_storage.nbytes_to_read());
read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read);
} else {
read_data(tensor_storage, (char*)dst_tensor->data, nbytes_to_read);
}

prev_time_ms = ggml_time_ms();
if (tensor_storage.is_bf16) {
// inplace op
bf16_to_f32_vec((uint16_t*)dst_tensor->data, (float*)dst_tensor->data, tensor_storage.nelements());
} else if (tensor_storage.is_f8_e4m3) {
// inplace op
f8_e4m3_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements());
} else if (tensor_storage.is_f8_e5m2) {
// inplace op
f8_e5m2_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements());
} else if (tensor_storage.is_f64) {
f64_to_f32_vec((double*)read_buffer.data(), (float*)dst_tensor->data, tensor_storage.nelements());
} else if (tensor_storage.is_i64) {
i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)dst_tensor->data, tensor_storage.nelements());
}
curr_time_ms = ggml_time_ms();
convert_time_ms += curr_time_ms - prev_time_ms;
} else {
read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read()));
read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read);

prev_time_ms = ggml_time_ms();
if (tensor_storage.is_bf16) {
// inplace op
bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_f8_e4m3) {
// inplace op
f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_f8_e5m2) {
// inplace op
f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_f64) {
// inplace op
f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_i64) {
// inplace op
i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements());
}

convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data,
dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
curr_time_ms = ggml_time_ms();
convert_time_ms += curr_time_ms - prev_time_ms;
}
} else {
read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read()));
read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read);

prev_time_ms = ggml_time_ms();
if (tensor_storage.is_bf16) {
// inplace op
bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_f8_e4m3) {
// inplace op
f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_f8_e5m2) {
// inplace op
f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_f64) {
// inplace op
f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
} else if (tensor_storage.is_i64) {
// inplace op
i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements());
}

if (tensor_storage.type == dst_tensor->type) {
// copy to device memory
curr_time_ms = ggml_time_ms();
convert_time_ms += curr_time_ms - prev_time_ms;
prev_time_ms = curr_time_ms;
ggml_backend_tensor_set(dst_tensor, read_buffer.data(), 0, ggml_nbytes(dst_tensor));
curr_time_ms = ggml_time_ms();
copy_to_backend_time_ms += curr_time_ms - prev_time_ms;
} else {
// convert first, then copy to device memory
convert_buffer.resize(ggml_nbytes(dst_tensor));
convert_tensor((void*)read_buffer.data(), tensor_storage.type,
(void*)convert_buffer.data(), dst_tensor->type,
(int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
curr_time_ms = ggml_time_ms();
convert_time_ms += curr_time_ms - prev_time_ms;
prev_time_ms = curr_time_ms;
ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor));
curr_time_ms = ggml_time_ms();
copy_to_backend_time_ms += curr_time_ms - prev_time_ms;
}
}
++tensor_count;
int64_t t2 = ggml_time_ms();
if ((t2 - t1) >= 200) {
t1 = t2;
pretty_progress(tensor_count, tensor_max, (t1 - t0) / (1000.0f * tensor_count));
partial = tensor_count != tensor_max;
}
size_t curr_num = total_tensors_processed + current_idx;
pretty_progress(curr_num, total_tensors_to_process, (ggml_time_ms() - t_start) / 1000.0f / (curr_num + 1e-6f));
std::this_thread::sleep_for(std::chrono::milliseconds(200));
}

if (partial) {
if (tensor_count >= 1) {
t1 = ggml_time_ms();
pretty_progress(tensor_count, tensor_max, (t1 - t0) / (1000.0f * tensor_count));
}
if (tensor_count < tensor_max) {
printf("\n");
}
for (auto& w : workers) {
w.join();
}

if (zip != NULL) {
zip_close(zip);
}

if (!success) {
if (failed) {
success = false;
break;
}
total_tensors_processed += file_tensors.size();
pretty_progress(total_tensors_processed, total_tensors_to_process, (ggml_time_ms() - t_start) / 1000.0f / (total_tensors_processed + 1e-6f));
if (total_tensors_processed < total_tensors_to_process) {
printf("\n");
}
}

int64_t end_time = ggml_time_ms();
LOG_INFO("loading tensors completed, taking %.2fs (process: %.2fs, read: %.2fs, memcpy: %.2fs, convert: %.2fs, copy_to_backend: %.2fs)",
(end_time - start_time) / 1000.f,
process_time_ms / 1000.f,
read_time_ms / 1000.f,
memcpy_time_ms / 1000.f,
convert_time_ms / 1000.f,
copy_to_backend_time_ms / 1000.f);
(read_time_ms.load() / (float)last_n_threads) / 1000.f,
(memcpy_time_ms.load() / (float)last_n_threads) / 1000.f,
(convert_time_ms.load() / (float)last_n_threads) / 1000.f,
(copy_to_backend_time_ms.load() / (float)last_n_threads) / 1000.f);
return success;
}

bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
std::set<std::string> ignore_tensors) {
std::set<std::string> ignore_tensors,
int n_threads) {
std::set<std::string> tensor_names_in_file;
std::mutex tensor_names_mutex;
auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
const std::string& name = tensor_storage.name;
// LOG_DEBUG("%s", tensor_storage.to_string().c_str());
tensor_names_in_file.insert(name);
{
std::lock_guard<std::mutex> lock(tensor_names_mutex);
tensor_names_in_file.insert(name);
}

struct ggml_tensor* real;
if (tensors.find(name) != tensors.end()) {
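The core of the rewritten ModelLoader::load_tensors above is a per-file worker pool: each worker opens its own file or zip handle, then repeatedly claims the next tensor index with std::atomic<size_t>::fetch_add, so no two threads ever process the same entry and no work queue or lock is required; a shared atomic failed flag lets every worker stop early on error. A reduced sketch of that work-distribution scheme, with plain strings standing in for the per-file tensor list:

#include <algorithm>
#include <atomic>
#include <cstdio>
#include <string>
#include <thread>
#include <vector>

int main() {
    std::vector<std::string> items = {"a", "b", "c", "d", "e", "f", "g"};  // stand-ins for tensors
    std::atomic<size_t> next_idx(0);     // shared work index
    std::atomic<bool>   failed(false);   // lets all workers bail out early

    int n_threads = std::min<int>((int)std::thread::hardware_concurrency(), (int)items.size());
    if (n_threads < 1) {
        n_threads = 1;
    }

    std::vector<std::thread> workers;
    for (int i = 0; i < n_threads; ++i) {
        workers.emplace_back([&] {
            // each worker would open its own std::ifstream / zip handle here
            while (true) {
                size_t idx = next_idx.fetch_add(1);  // claim the next unprocessed item
                if (idx >= items.size() || failed) {
                    break;
                }
                printf("processing %s\n", items[idx].c_str());
            }
        });
    }
    for (auto& w : workers) {
        w.join();
    }
    return failed ? 1 : 0;
}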
@@ -2263,7 +2320,7 @@ bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tenso
return true;
};

bool success = load_tensors(on_new_tensor_cb);
bool success = load_tensors(on_new_tensor_cb, n_threads);
if (!success) {
LOG_ERROR("load tensors from file failed");
return false;

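Because load_tensors can now invoke on_new_tensor_cb from several worker threads at once, any shared state the callback touches (the tensor_names_in_file set above, or the LoRA maps earlier) has to be protected, which the diff does with a std::mutex and a scoped std::lock_guard. A minimal sketch of such a thread-safe callback, independent of the project's types:

#include <mutex>
#include <set>
#include <string>
#include <thread>
#include <vector>

int main() {
    std::set<std::string> seen;   // shared between workers through the callback
    std::mutex seen_mutex;

    // the callback may run concurrently, so every access to `seen` is locked
    auto on_new_tensor = [&](const std::string& name) -> bool {
        {
            std::lock_guard<std::mutex> lock(seen_mutex);
            seen.insert(name);
        }
        return true;  // the loader treats false as "processing this tensor failed"
    };

    std::vector<std::thread> workers;
    for (int i = 0; i < 4; ++i) {
        workers.emplace_back([&, i] { on_new_tensor("tensor_" + std::to_string(i)); });
    }
    for (auto& w : workers) {
        w.join();
    }
    return 0;
}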
model.h (5 changed lines)
@@ -247,9 +247,10 @@ public:
ggml_type get_diffusion_model_wtype();
ggml_type get_vae_wtype();
void set_wtype_override(ggml_type wtype, std::string prefix = "");
bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb);
bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0);
bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
std::set<std::string> ignore_tensors = {});
std::set<std::string> ignore_tensors = {},
int n_threads = 0);

bool save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules);
bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type);
@@ -576,7 +576,7 @@ public:
if (version == VERSION_SVD) {
ignore_tensors.insert("conditioner.embedders.3");
}
bool success = model_loader.load_tensors(tensors, ignore_tensors);
bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads);
if (!success) {
LOG_ERROR("load tensors from model loader failed");
ggml_free(ctx);
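Every new n_threads parameter in the diff defaults to 0, which the loader resolves to std::thread::hardware_concurrency() and then clamps to the number of tensors in the file (and to 1 for zip archives), so existing callers keep their old behavior unless they ask for a specific thread count. A hedged standalone sketch of that resolution logic; resolve_n_threads is an illustrative helper, not part of the project:

#include <algorithm>
#include <cstdio>
#include <thread>

// mirrors `n_threads_p > 0 ? n_threads_p : hardware_concurrency()` from the diff,
// clamped to the number of work items as in ModelLoader::load_tensors
static int resolve_n_threads(int n_threads_requested, int n_items) {
    int n = n_threads_requested > 0 ? n_threads_requested
                                    : (int)std::thread::hardware_concurrency();
    n = std::min(n, n_items);
    return n < 1 ? 1 : n;  // hardware_concurrency() may legally return 0
}

int main() {
    printf("%d\n", resolve_n_threads(0, 100));  // default: one worker per hardware thread
    printf("%d\n", resolve_n_threads(8, 3));    // never more workers than tensors
    printf("%d\n", resolve_n_threads(4, 0));    // degenerate case still yields one worker
    return 0;
}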