mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2026-05-15 03:40:34 +00:00
feat: support for memory-mapping model weights (#1414)
Co-authored-by: Piotr Wilkin <piotr.wilkin@syndatis.com> Co-authored-by: Junmo Kim <me@junmo.kim> Co-authored-by: leejet <leejet714@gmail.com>
This commit is contained in:
parent
9d683417cb
commit
57ff2eb0f4
@ -2567,6 +2567,23 @@ public:
|
|||||||
|
|
||||||
bool alloc_params_buffer() {
|
bool alloc_params_buffer() {
|
||||||
size_t num_tensors = ggml_tensor_num(params_ctx);
|
size_t num_tensors = ggml_tensor_num(params_ctx);
|
||||||
|
if (num_tensors > 0) {
|
||||||
|
// ggml_backend_alloc_ctx_tensors fails when all tensors are already allocated
|
||||||
|
// (typical for memory-mapped weights). See ggml-alloc.c n_buffers==0 branch.
|
||||||
|
bool all_have_data = true;
|
||||||
|
for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) {
|
||||||
|
if (t->data == nullptr) {
|
||||||
|
all_have_data = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (all_have_data) {
|
||||||
|
LOG_DEBUG("%s all params already mmap-allocated (no separate buffer needed)", get_desc().c_str());
|
||||||
|
params_buffer = nullptr;
|
||||||
|
rebuild_params_tensor_set();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend);
|
params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend);
|
||||||
if (params_buffer == nullptr) {
|
if (params_buffer == nullptr) {
|
||||||
LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i",
|
LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i",
|
||||||
|
|||||||
195
src/model.cpp
195
src/model.cpp
@ -730,16 +730,10 @@ void ModelLoader::set_wtype_override(ggml_type wtype, std::string tensor_type_ru
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p, bool enable_mmap) {
|
void ModelLoader::process_model_files(bool enable_mmap, bool writable_mmap) {
|
||||||
int64_t process_time_ms = 0;
|
if (model_files_processed) {
|
||||||
std::atomic<int64_t> read_time_ms(0);
|
return;
|
||||||
std::atomic<int64_t> memcpy_time_ms(0);
|
}
|
||||||
std::atomic<int64_t> copy_to_backend_time_ms(0);
|
|
||||||
std::atomic<int64_t> convert_time_ms(0);
|
|
||||||
std::atomic<uint64_t> bytes_processed(0);
|
|
||||||
|
|
||||||
int num_threads_to_use = n_threads_p > 0 ? n_threads_p : sd_get_num_physical_cores();
|
|
||||||
LOG_DEBUG("using %d threads for model loading", num_threads_to_use);
|
|
||||||
|
|
||||||
int64_t start_time = ggml_time_ms();
|
int64_t start_time = ggml_time_ms();
|
||||||
|
|
||||||
@ -751,22 +745,13 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
|
|||||||
processed_tensor_storages.push_back(tensor_storage);
|
processed_tensor_storages.push_back(tensor_storage);
|
||||||
}
|
}
|
||||||
|
|
||||||
process_time_ms = ggml_time_ms() - start_time;
|
|
||||||
|
|
||||||
bool success = true;
|
|
||||||
size_t total_tensors_processed = 0;
|
|
||||||
const size_t total_tensors_to_process = processed_tensor_storages.size();
|
|
||||||
const int64_t t_start = ggml_time_ms();
|
|
||||||
int last_n_threads = 1;
|
|
||||||
|
|
||||||
for (size_t file_index = 0; file_index < file_paths_.size(); file_index++) {
|
for (size_t file_index = 0; file_index < file_paths_.size(); file_index++) {
|
||||||
std::string file_path = file_paths_[file_index];
|
std::string file_path = file_paths_[file_index];
|
||||||
LOG_DEBUG("loading tensors from %s", file_path.c_str());
|
|
||||||
|
|
||||||
std::vector<const TensorStorage*> file_tensors;
|
std::vector<TensorStorage> file_tensors;
|
||||||
for (const auto& ts : processed_tensor_storages) {
|
for (const auto& ts : processed_tensor_storages) {
|
||||||
if (ts.file_index == file_index) {
|
if (ts.file_index == file_index) {
|
||||||
file_tensors.push_back(&ts);
|
file_tensors.push_back(ts);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (file_tensors.empty()) {
|
if (file_tensors.empty()) {
|
||||||
@ -775,20 +760,168 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
|
|||||||
|
|
||||||
bool is_zip = false;
|
bool is_zip = false;
|
||||||
for (auto const& ts : file_tensors) {
|
for (auto const& ts : file_tensors) {
|
||||||
if (ts->index_in_zip >= 0) {
|
if (ts.index_in_zip >= 0) {
|
||||||
is_zip = true;
|
is_zip = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::unique_ptr<MmapWrapper> mmapped;
|
ModelFileData fdata = {};
|
||||||
|
fdata.path = file_path;
|
||||||
|
fdata.is_zip = is_zip;
|
||||||
|
fdata.tensors = std::move(file_tensors);
|
||||||
|
|
||||||
if (enable_mmap && !is_zip) {
|
if (enable_mmap && !is_zip) {
|
||||||
LOG_DEBUG("using mmap for I/O");
|
LOG_DEBUG("using mmap for I/O");
|
||||||
mmapped = MmapWrapper::create(file_path);
|
std::unique_ptr<MmapWrapper> mmapped = MmapWrapper::create(file_path, writable_mmap);
|
||||||
if (!mmapped) {
|
if (mmapped) {
|
||||||
LOG_WARN("failed to memory-map '%s'", file_path.c_str());
|
uint8_t* mmap_data = static_cast<uint8_t*>(mmapped->writable_data());
|
||||||
|
ggml_backend_buffer_t buf_mmap = ggml_backend_cpu_buffer_from_ptr(mmap_data, mmapped->size());
|
||||||
|
if (buf_mmap) {
|
||||||
|
LOG_INFO("using mmap for '%s'", file_path.c_str());
|
||||||
|
fdata.mmbuffer = std::shared_ptr<struct ggml_backend_buffer>(buf_mmap, ggml_backend_buffer_free);
|
||||||
|
} else {
|
||||||
|
LOG_WARN("mmap: failed to create backend buffer for file %s", fdata.path.c_str());
|
||||||
|
}
|
||||||
|
fdata.mmapped = std::shared_ptr<MmapWrapper>(std::move(mmapped));
|
||||||
|
} else {
|
||||||
|
LOG_WARN("failed to memory-map '%s' (falling back to read())", file_path.c_str());
|
||||||
|
}
|
||||||
|
} else if (!is_zip) {
|
||||||
|
LOG_INFO("NOT using mmap for '%s' (mmap disabled by caller)",
|
||||||
|
file_path.c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
file_data.push_back(std::move(fdata));
|
||||||
|
}
|
||||||
|
|
||||||
|
model_files_processed = true;
|
||||||
|
|
||||||
|
int64_t end_time = ggml_time_ms();
|
||||||
|
int64_t process_time_ms = end_time - start_time;
|
||||||
|
|
||||||
|
LOG_INFO("model files processing completed in %.2fs", process_time_ms / 1000.f);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Points destination tensors directly at memory-mapped file contents instead
// of copying them.
//
// For every processed model file that has an mmap-backed backend buffer, each
// stored tensor is matched by name against `tensors`. A tensor is mapped only
// when it is not excluded by an `ignore_tensors` prefix, the destination is
// non-null, and type, all four dimensions and byte size agree exactly.
// Anything else is skipped here and left for load_tensors() to handle.
//
// Parameters:
//   tensors        - name -> destination tensor; matching entries get their
//                    `buffer` and `data` fields overwritten.
//   ignore_tensors - name prefixes that must never be mapped.
//   writable_mmap  - forwarded to process_model_files(); it only has an effect
//                    if the model files have not been processed yet.
//
// Returns one MmapTensorStore per file that had at least one tensor mapped.
// The caller must keep the returned stores alive for as long as the mapped
// tensors are in use, since they own the mapping and its backend buffer.
std::vector<MmapTensorStore> ModelLoader::mmap_tensors(std::map<std::string, ggml_tensor*>& tensors,
                                                       std::set<std::string> ignore_tensors,
                                                       bool writable_mmap) {
    process_model_files(true, writable_mmap);

    std::vector<MmapTensorStore> result;
    uint64_t mapped_bytes = 0;
    size_t mapped_tensors = 0;

    LOG_DEBUG("memory-mapping tensors...");

    int64_t t_start = ggml_time_ms();

    for (auto& fdata : file_data) {
        if (!fdata.mmbuffer)
            continue;

        // Base pointer and extent are per-file, so look them up once.
        ggml_backend_buffer_t buf_mmap = fdata.mmbuffer.get();
        uint8_t* mmap_data = static_cast<uint8_t*>(ggml_backend_buffer_get_base(buf_mmap));
        const size_t mmap_size = ggml_backend_buffer_get_size(buf_mmap);

        const std::vector<TensorStorage>& file_tensors = fdata.tensors;

        size_t file_mapped_bytes = 0;
        size_t file_mapped_tensors = 0;

        for (const auto& tensor_storage : file_tensors) {
            const std::string& name = tensor_storage.name;

            bool is_ignored = false;
            for (const auto& ignore_prefix : ignore_tensors) {
                if (starts_with(name, ignore_prefix)) {
                    is_ignored = true;
                    break;
                }
            }
            if (is_ignored)
                continue;

            auto it = tensors.find(name);
            if (it == tensors.end())
                continue;

            ggml_tensor* dst_tensor = it->second;
            if (dst_tensor == nullptr)
                continue;

            if (tensor_storage.type != dst_tensor->type)
                continue;

            size_t tensor_size = tensor_storage.nbytes();
            size_t tensor_offset = tensor_storage.offset;

            if (tensor_storage.ne[0] != dst_tensor->ne[0] ||
                tensor_storage.ne[1] != dst_tensor->ne[1] ||
                tensor_storage.ne[2] != dst_tensor->ne[2] ||
                tensor_storage.ne[3] != dst_tensor->ne[3] ||
                tensor_size != ggml_nbytes(dst_tensor)) {
                // let load_tensors worry about this
                continue;
            }

            // Never hand out a pointer that reaches past the end of the
            // mapping (truncated or corrupt file). Written so that
            // offset + size cannot overflow.
            if (tensor_offset > mmap_size || tensor_size > mmap_size - tensor_offset) {
                continue;
            }

            dst_tensor->buffer = buf_mmap;
            dst_tensor->data = mmap_data + tensor_offset;

            file_mapped_bytes += tensor_size;
            file_mapped_tensors++;
        }

        // Keep the mapping alive whenever any tensor references it, even if
        // every mapped tensor happened to be zero-sized.
        if (file_mapped_tensors > 0) {
            mapped_tensors += file_mapped_tensors;
            mapped_bytes += file_mapped_bytes;
            result.push_back({fdata.mmapped, fdata.mmbuffer});
        }
    }

    int64_t t_end = ggml_time_ms();
    int64_t duration_ms = t_end - t_start;

    LOG_INFO("memory-mapped %zu tensors in %zu files (%.2f MB), taking %.2fs",
             mapped_tensors,
             result.size(),
             mapped_bytes / (1024.0 * 1024.0),
             duration_ms / 1000.0);

    return result;
}
|
||||||
|
|
||||||
|
bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p, bool enable_mmap) {
|
||||||
|
process_model_files(enable_mmap, false);
|
||||||
|
|
||||||
|
std::atomic<int64_t> read_time_ms(0);
|
||||||
|
std::atomic<int64_t> memcpy_time_ms(0);
|
||||||
|
std::atomic<int64_t> copy_to_backend_time_ms(0);
|
||||||
|
std::atomic<int64_t> convert_time_ms(0);
|
||||||
|
std::atomic<uint64_t> bytes_processed(0);
|
||||||
|
|
||||||
|
int num_threads_to_use = n_threads_p > 0 ? n_threads_p : sd_get_num_physical_cores();
|
||||||
|
LOG_DEBUG("using %d threads for model loading", num_threads_to_use);
|
||||||
|
|
||||||
|
int64_t start_time = ggml_time_ms();
|
||||||
|
|
||||||
|
size_t total_tensors_to_process = 0;
|
||||||
|
for (const auto& fdata : file_data) {
|
||||||
|
total_tensors_to_process += fdata.tensors.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
bool success = true;
|
||||||
|
size_t total_tensors_processed = 0;
|
||||||
|
const int64_t t_start = start_time;
|
||||||
|
int last_n_threads = 1;
|
||||||
|
|
||||||
|
for (auto& fdata : file_data) {
|
||||||
|
const std::string& file_path = fdata.path;
|
||||||
|
LOG_DEBUG("loading tensors from %s", file_path.c_str());
|
||||||
|
|
||||||
|
const std::vector<TensorStorage>& file_tensors = fdata.tensors;
|
||||||
|
|
||||||
|
bool is_zip = fdata.is_zip;
|
||||||
|
|
||||||
|
std::shared_ptr<MmapWrapper> mmapped = fdata.mmapped;
|
||||||
|
|
||||||
int n_threads = is_zip ? 1 : std::min(num_threads_to_use, (int)file_tensors.size());
|
int n_threads = is_zip ? 1 : std::min(num_threads_to_use, (int)file_tensors.size());
|
||||||
if (n_threads < 1) {
|
if (n_threads < 1) {
|
||||||
@ -830,7 +963,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
const TensorStorage& tensor_storage = *file_tensors[idx];
|
const TensorStorage& tensor_storage = file_tensors[idx];
|
||||||
ggml_tensor* dst_tensor = nullptr;
|
ggml_tensor* dst_tensor = nullptr;
|
||||||
|
|
||||||
t0 = ggml_time_ms();
|
t0 = ggml_time_ms();
|
||||||
@ -847,6 +980,11 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// skip mmapped tensors
|
||||||
|
if (dst_tensor->buffer != nullptr && dst_tensor->buffer == fdata.mmbuffer.get()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
size_t nbytes_to_read = tensor_storage.nbytes_to_read();
|
size_t nbytes_to_read = tensor_storage.nbytes_to_read();
|
||||||
|
|
||||||
auto read_data = [&](char* buf, size_t n) {
|
auto read_data = [&](char* buf, size_t n) {
|
||||||
@ -990,9 +1128,8 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
|
|||||||
}
|
}
|
||||||
|
|
||||||
int64_t end_time = ggml_time_ms();
|
int64_t end_time = ggml_time_ms();
|
||||||
LOG_INFO("loading tensors completed, taking %.2fs (process: %.2fs, read: %.2fs, memcpy: %.2fs, convert: %.2fs, copy_to_backend: %.2fs)",
|
LOG_INFO("loading tensors completed, taking %.2fs (read: %.2fs, memcpy: %.2fs, convert: %.2fs, copy_to_backend: %.2fs)",
|
||||||
(end_time - start_time) / 1000.f,
|
(end_time - start_time) / 1000.f,
|
||||||
process_time_ms / 1000.f,
|
|
||||||
(read_time_ms.load() / (float)last_n_threads) / 1000.f,
|
(read_time_ms.load() / (float)last_n_threads) / 1000.f,
|
||||||
(memcpy_time_ms.load() / (float)last_n_threads) / 1000.f,
|
(memcpy_time_ms.load() / (float)last_n_threads) / 1000.f,
|
||||||
(convert_time_ms.load() / (float)last_n_threads) / 1000.f,
|
(convert_time_ms.load() / (float)last_n_threads) / 1000.f,
|
||||||
|
|||||||
21
src/model.h
21
src/model.h
@ -193,10 +193,27 @@ using TensorTypeRules = std::vector<std::pair<std::string, ggml_type>>;
|
|||||||
|
|
||||||
TensorTypeRules parse_tensor_type_rules(const std::string& tensor_type_rules);
|
TensorTypeRules parse_tensor_type_rules(const std::string& tensor_type_rules);
|
||||||
|
|
||||||
|
class MmapWrapper;
|
||||||
|
|
||||||
|
struct ModelFileData {
|
||||||
|
std::string path;
|
||||||
|
std::vector<TensorStorage> tensors;
|
||||||
|
std::shared_ptr<MmapWrapper> mmapped;
|
||||||
|
std::shared_ptr<struct ggml_backend_buffer> mmbuffer;
|
||||||
|
bool is_zip;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct MmapTensorStore {
|
||||||
|
std::shared_ptr<MmapWrapper> mmapped;
|
||||||
|
std::shared_ptr<struct ggml_backend_buffer> mmbuffer;
|
||||||
|
};
|
||||||
|
|
||||||
class ModelLoader {
|
class ModelLoader {
|
||||||
protected:
|
protected:
|
||||||
SDVersion version_ = VERSION_COUNT;
|
SDVersion version_ = VERSION_COUNT;
|
||||||
std::vector<std::string> file_paths_;
|
std::vector<std::string> file_paths_;
|
||||||
|
std::vector<ModelFileData> file_data;
|
||||||
|
bool model_files_processed = false;
|
||||||
String2TensorStorage tensor_storage_map;
|
String2TensorStorage tensor_storage_map;
|
||||||
|
|
||||||
void add_tensor_storage(const TensorStorage& tensor_storage);
|
void add_tensor_storage(const TensorStorage& tensor_storage);
|
||||||
@ -220,6 +237,10 @@ public:
|
|||||||
std::map<ggml_type, uint32_t> get_vae_wtype_stat();
|
std::map<ggml_type, uint32_t> get_vae_wtype_stat();
|
||||||
String2TensorStorage& get_tensor_storage_map() { return tensor_storage_map; }
|
String2TensorStorage& get_tensor_storage_map() { return tensor_storage_map; }
|
||||||
void set_wtype_override(ggml_type wtype, std::string tensor_type_rules = "");
|
void set_wtype_override(ggml_type wtype, std::string tensor_type_rules = "");
|
||||||
|
void process_model_files(bool enable_mmap = false, bool writable_mmap = true);
|
||||||
|
std::vector<MmapTensorStore> mmap_tensors(std::map<std::string, ggml_tensor*>& tensors,
|
||||||
|
std::set<std::string> ignore_tensors = {},
|
||||||
|
bool writable = true);
|
||||||
bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0, bool use_mmap = false);
|
bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0, bool use_mmap = false);
|
||||||
bool load_tensors(std::map<std::string, ggml_tensor*>& tensors,
|
bool load_tensors(std::map<std::string, ggml_tensor*>& tensors,
|
||||||
std::set<std::string> ignore_tensors = {},
|
std::set<std::string> ignore_tensors = {},
|
||||||
|
|||||||
@ -110,6 +110,7 @@ static float get_cache_reuse_threshold(const sd_cache_params_t& params) {
|
|||||||
|
|
||||||
class StableDiffusionGGML {
|
class StableDiffusionGGML {
|
||||||
public:
|
public:
|
||||||
|
std::vector<MmapTensorStore> mmap_tensor_store;
|
||||||
ggml_backend_t backend = nullptr; // general backend
|
ggml_backend_t backend = nullptr; // general backend
|
||||||
ggml_backend_t clip_backend = nullptr;
|
ggml_backend_t clip_backend = nullptr;
|
||||||
ggml_backend_t control_net_backend = nullptr;
|
ggml_backend_t control_net_backend = nullptr;
|
||||||
@ -362,6 +363,51 @@ public:
|
|||||||
apply_lora_immediately = false;
|
apply_lora_immediately = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::map<std::string, ggml_tensor*> mmap_able_tensors;
|
||||||
|
bool enable_mmap_tensors = false;
|
||||||
|
bool main_backend_mmap = false;
|
||||||
|
bool needs_writable_mmap = false;
|
||||||
|
if (sd_ctx_params->enable_mmap) {
|
||||||
|
if (apply_lora_immediately) {
|
||||||
|
needs_writable_mmap = true;
|
||||||
|
LOG_WARN("in mode 'immediately', LoRAs will cause extra memory usage with mmap");
|
||||||
|
}
|
||||||
|
enable_mmap_tensors = true;
|
||||||
|
if (offload_params_to_cpu) {
|
||||||
|
main_backend_mmap = true;
|
||||||
|
} else {
|
||||||
|
ggml_backend_dev_t dev = ggml_backend_get_device(backend);
|
||||||
|
struct ggml_backend_dev_props props;
|
||||||
|
ggml_backend_dev_get_props(dev, &props);
|
||||||
|
main_backend_mmap = props.caps.buffer_from_host_ptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// split definition to avoid msvc choking on the extra parameter handling
|
||||||
|
auto get_param_tensors_p = [&](auto&& model, bool force_cpu, const char* prefix) {
|
||||||
|
std::map<std::string, ggml_tensor*> temp;
|
||||||
|
model->get_param_tensors(temp, prefix);
|
||||||
|
bool do_mmap = enable_mmap_tensors && (main_backend_mmap || force_cpu);
|
||||||
|
for (const auto& [key, tensor] : temp) {
|
||||||
|
tensors[key] = tensor;
|
||||||
|
if (do_mmap) {
|
||||||
|
mmap_able_tensors[key] = tensor;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
auto get_param_tensors = [&](auto&& model, bool force_cpu = false) {
|
||||||
|
std::map<std::string, ggml_tensor*> temp;
|
||||||
|
model->get_param_tensors(temp);
|
||||||
|
bool do_mmap = enable_mmap_tensors && (main_backend_mmap || force_cpu);
|
||||||
|
for (const auto& [key, tensor] : temp) {
|
||||||
|
tensors[key] = tensor;
|
||||||
|
if (do_mmap) {
|
||||||
|
mmap_able_tensors[key] = tensor;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
if (sd_version_is_control(version)) {
|
if (sd_version_is_control(version)) {
|
||||||
// Might need vae encode for control cond
|
// Might need vae encode for control cond
|
||||||
vae_decode_only = false;
|
vae_decode_only = false;
|
||||||
@ -473,8 +519,7 @@ public:
|
|||||||
offload_params_to_cpu,
|
offload_params_to_cpu,
|
||||||
tensor_storage_map);
|
tensor_storage_map);
|
||||||
clip_vision->set_max_graph_vram_bytes(max_graph_vram_bytes);
|
clip_vision->set_max_graph_vram_bytes(max_graph_vram_bytes);
|
||||||
clip_vision->alloc_params_buffer();
|
get_param_tensors(clip_vision);
|
||||||
clip_vision->get_param_tensors(tensors);
|
|
||||||
}
|
}
|
||||||
} else if (sd_version_is_qwen_image(version)) {
|
} else if (sd_version_is_qwen_image(version)) {
|
||||||
bool enable_vision = false;
|
bool enable_vision = false;
|
||||||
@ -550,12 +595,10 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
cond_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
|
cond_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
|
||||||
cond_stage_model->alloc_params_buffer();
|
get_param_tensors(cond_stage_model, clip_on_cpu);
|
||||||
cond_stage_model->get_param_tensors(tensors);
|
|
||||||
|
|
||||||
diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
|
diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
|
||||||
diffusion_model->alloc_params_buffer();
|
get_param_tensors(diffusion_model);
|
||||||
diffusion_model->get_param_tensors(tensors);
|
|
||||||
|
|
||||||
if (sd_version_is_unet_edit(version)) {
|
if (sd_version_is_unet_edit(version)) {
|
||||||
vae_decode_only = false;
|
vae_decode_only = false;
|
||||||
@ -563,8 +606,7 @@ public:
|
|||||||
|
|
||||||
if (high_noise_diffusion_model) {
|
if (high_noise_diffusion_model) {
|
||||||
high_noise_diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
|
high_noise_diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
|
||||||
high_noise_diffusion_model->alloc_params_buffer();
|
get_param_tensors(high_noise_diffusion_model);
|
||||||
high_noise_diffusion_model->get_param_tensors(tensors);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu(backend)) {
|
if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu(backend)) {
|
||||||
@ -627,6 +669,8 @@ public:
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
bool force_vae_cpu = sd_ctx_params->keep_vae_on_cpu;
|
||||||
|
|
||||||
if (version == VERSION_CHROMA_RADIANCE) {
|
if (version == VERSION_CHROMA_RADIANCE) {
|
||||||
LOG_INFO("using FakeVAE");
|
LOG_INFO("using FakeVAE");
|
||||||
first_stage_model = std::make_shared<FakeVAE>(version,
|
first_stage_model = std::make_shared<FakeVAE>(version,
|
||||||
@ -636,20 +680,17 @@ public:
|
|||||||
LOG_INFO("using TAE for encoding / decoding");
|
LOG_INFO("using TAE for encoding / decoding");
|
||||||
first_stage_model = create_tae();
|
first_stage_model = create_tae();
|
||||||
first_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
|
first_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
|
||||||
first_stage_model->alloc_params_buffer();
|
get_param_tensors_p(first_stage_model, force_vae_cpu, "tae");
|
||||||
first_stage_model->get_param_tensors(tensors, "tae");
|
|
||||||
} else {
|
} else {
|
||||||
LOG_INFO("using VAE for encoding / decoding");
|
LOG_INFO("using VAE for encoding / decoding");
|
||||||
first_stage_model = create_vae();
|
first_stage_model = create_vae();
|
||||||
first_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
|
first_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes);
|
||||||
first_stage_model->alloc_params_buffer();
|
get_param_tensors_p(first_stage_model, force_vae_cpu, "first_stage_model");
|
||||||
first_stage_model->get_param_tensors(tensors, "first_stage_model");
|
|
||||||
if (use_tae && tae_preview_only) {
    LOG_INFO("using TAE for preview");
    preview_vae = create_tae();
    preview_vae->set_max_graph_vram_bytes(max_graph_vram_bytes);
    // Register the preview TAE's own parameters under the "tae" prefix.
    // Passing first_stage_model here would register the main VAE a second
    // time and leave the preview model's tensors out of the maps handed to
    // the model loader.
    get_param_tensors_p(preview_vae, force_vae_cpu, "tae");
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -714,11 +755,7 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (use_pmid) {
|
if (use_pmid) {
|
||||||
if (!pmid_model->alloc_params_buffer()) {
|
get_param_tensors_p(pmid_model, false, "pmid");
|
||||||
LOG_ERROR(" pmid model params buffer allocation failed");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
pmid_model->get_param_tensors(tensors, "pmid");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (sd_ctx_params->flash_attn) {
|
if (sd_ctx_params->flash_attn) {
|
||||||
@ -798,6 +835,41 @@ public:
|
|||||||
ignore_tensors.insert("text_encoders.llm.vision_tower.");
|
ignore_tensors.insert("text_encoders.llm.vision_tower.");
|
||||||
ignore_tensors.insert("text_encoders.llm.multi_modal_projector.");
|
ignore_tensors.insert("text_encoders.llm.multi_modal_projector.");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (enable_mmap_tensors) {
|
||||||
|
if (mmap_able_tensors.empty()) {
|
||||||
|
LOG_DEBUG("no tensors could be memory-mapped");
|
||||||
|
} else {
|
||||||
|
mmap_tensor_store = model_loader.mmap_tensors(mmap_able_tensors, ignore_tensors, needs_writable_mmap);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (clip_vision) {
|
||||||
|
clip_vision->alloc_params_buffer();
|
||||||
|
}
|
||||||
|
if (cond_stage_model) {
|
||||||
|
cond_stage_model->alloc_params_buffer();
|
||||||
|
}
|
||||||
|
if (diffusion_model) {
|
||||||
|
diffusion_model->alloc_params_buffer();
|
||||||
|
}
|
||||||
|
if (high_noise_diffusion_model) {
|
||||||
|
high_noise_diffusion_model->alloc_params_buffer();
|
||||||
|
}
|
||||||
|
if (first_stage_model) {
|
||||||
|
first_stage_model->alloc_params_buffer();
|
||||||
|
}
|
||||||
|
if (preview_vae) {
|
||||||
|
preview_vae->alloc_params_buffer();
|
||||||
|
}
|
||||||
|
if (use_pmid && pmid_model) {
|
||||||
|
if (!pmid_model->alloc_params_buffer()) {
|
||||||
|
LOG_ERROR(" pmid model params buffer allocation failed");
|
||||||
|
ggml_free(ctx);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->enable_mmap);
|
bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->enable_mmap);
|
||||||
if (!success) {
|
if (!success) {
|
||||||
LOG_ERROR("load tensors from model loader failed");
|
LOG_ERROR("load tensors from model loader failed");
|
||||||
|
|||||||
108
src/util.cpp
108
src/util.cpp
@ -112,7 +112,7 @@ private:
|
|||||||
HANDLE hmapping_;
|
HANDLE hmapping_;
|
||||||
};
|
};
|
||||||
|
|
||||||
std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
|
std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename, bool writable) {
|
||||||
void* mapped_data = nullptr;
|
void* mapped_data = nullptr;
|
||||||
size_t file_size = 0;
|
size_t file_size = 0;
|
||||||
|
|
||||||
@ -137,14 +137,18 @@ std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
|
|||||||
|
|
||||||
file_size = static_cast<size_t>(size.QuadPart);
|
file_size = static_cast<size_t>(size.QuadPart);
|
||||||
|
|
||||||
HANDLE mapping_handle = CreateFileMapping(file_handle, nullptr, PAGE_READONLY, 0, 0, nullptr);
|
DWORD page_prot = writable ? PAGE_WRITECOPY : PAGE_READONLY;
|
||||||
|
|
||||||
|
HANDLE mapping_handle = CreateFileMapping(file_handle, nullptr, page_prot, 0, 0, nullptr);
|
||||||
|
|
||||||
if (mapping_handle == nullptr) {
|
if (mapping_handle == nullptr) {
|
||||||
CloseHandle(file_handle);
|
CloseHandle(file_handle);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
mapped_data = MapViewOfFile(mapping_handle, FILE_MAP_READ, 0, 0, file_size);
|
DWORD view_access = writable ? FILE_MAP_COPY : FILE_MAP_READ;
|
||||||
|
|
||||||
|
mapped_data = MapViewOfFile(mapping_handle, view_access, 0, 0, file_size);
|
||||||
|
|
||||||
if (mapped_data == nullptr) {
|
if (mapped_data == nullptr) {
|
||||||
CloseHandle(mapping_handle);
|
CloseHandle(mapping_handle);
|
||||||
@ -172,28 +176,85 @@ bool is_directory(const std::string& path) {
|
|||||||
return (stat(path.c_str(), &buffer) == 0 && S_ISDIR(buffer.st_mode));
|
return (stat(path.c_str(), &buffer) == 0 && S_ISDIR(buffer.st_mode));
|
||||||
}
|
}
|
||||||
|
|
||||||
class MmapWrapperImpl : public MmapWrapper {
|
struct MmapFlags {
|
||||||
public:
|
bool sequential;
|
||||||
MmapWrapperImpl(void* data, size_t size)
|
bool populate;
|
||||||
: MmapWrapper(data, size) {}
|
bool willneed;
|
||||||
|
bool dontneed;
|
||||||
~MmapWrapperImpl() override {
|
|
||||||
munmap(data_, size_);
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
|
// Parses the SD_MMAP_FLAGS environment variable into an MmapFlags value.
//
// The variable is a comma-separated list. Each token is passed through trim()
// and lower-cased, then matched against "sequential", "populate", "willneed"
// and "dontneed". Unknown tokens are ignored. When the variable is unset or
// empty, every flag is false.
static MmapFlags get_mmap_flags() {
    MmapFlags result = {};
    const char* SD_MMAP_FLAGS = std::getenv("SD_MMAP_FLAGS");
    if (SD_MMAP_FLAGS && *SD_MMAP_FLAGS) {
        std::stringstream ss(SD_MMAP_FLAGS);
        std::string token;
        while (std::getline(ss, token, ',')) {
            std::string ntoken = trim(token);
            // tolower() is only defined for values representable as unsigned
            // char (or EOF); go through unsigned char so bytes >= 0x80 in the
            // environment cannot trigger undefined behaviour.
            std::transform(ntoken.begin(), ntoken.end(), ntoken.begin(),
                           [](unsigned char c) { return static_cast<char>(::tolower(c)); });
            if (ntoken == "sequential") {
                result.sequential = true;
            } else if (ntoken == "populate") {
                result.populate = true;
            } else if (ntoken == "willneed") {
                result.willneed = true;
            } else if (ntoken == "dontneed") {
                result.dontneed = true;
            }
        }
    }
    return result;
}
|
||||||
|
|
||||||
|
class MmapWrapperImpl : public MmapWrapper {
|
||||||
|
public:
|
||||||
|
MmapWrapperImpl(void* data, size_t size, int fd)
|
||||||
|
: MmapWrapper(data, size), fd_(fd) {}
|
||||||
|
|
||||||
|
~MmapWrapperImpl() override {
|
||||||
|
#ifdef __linux__
|
||||||
|
auto cfg_flags = get_mmap_flags();
|
||||||
|
|
||||||
|
// Drop the kernel pagecache pages for this file. madvise(DONTNEED)
|
||||||
|
// alone only unmaps from the process address space; pagecache
|
||||||
|
// entries persist (`free` reports them as buff/cache and the OOM
|
||||||
|
// killer doesn't touch them, but they ARE counted against
|
||||||
|
// overcommit and can starve other allocations on tight-RAM
|
||||||
|
// systems). posix_fadvise(POSIX_FADV_DONTNEED) is the documented
|
||||||
|
// way to evict pagecache for a specific fd's pages.
|
||||||
|
if (cfg_flags.dontneed) {
|
||||||
|
madvise(data_, size_, MADV_DONTNEED);
|
||||||
|
posix_fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
munmap(data_, size_);
|
||||||
|
close(fd_);
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
int fd_;
|
||||||
|
};
|
||||||
|
|
||||||
|
std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename, bool writable) {
|
||||||
int file_descriptor = open(filename.c_str(), O_RDONLY);
|
int file_descriptor = open(filename.c_str(), O_RDONLY);
|
||||||
if (file_descriptor == -1) {
|
if (file_descriptor == -1) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
auto cfg_flags = get_mmap_flags();
|
||||||
|
|
||||||
int mmap_flags = MAP_PRIVATE;
|
int mmap_flags = MAP_PRIVATE;
|
||||||
|
|
||||||
#ifdef __linux__
|
#ifdef __linux__
|
||||||
// performance flags used by llama.cpp
|
// Sequential access hint helps the kernel read-ahead efficiently and
|
||||||
// posix_fadvise(file_descriptor, 0, 0, POSIX_FADV_SEQUENTIAL);
|
// also encourages eviction of already-read pages (the kernel keeps
|
||||||
// mmap_flags |= MAP_POPULATE;
|
// a smaller working set when this is set).
|
||||||
|
if (cfg_flags.sequential) {
|
||||||
|
posix_fadvise(file_descriptor, 0, 0, POSIX_FADV_SEQUENTIAL);
|
||||||
|
}
|
||||||
|
if (cfg_flags.populate) {
|
||||||
|
mmap_flags |= MAP_POPULATE;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
struct stat sb;
|
struct stat sb;
|
||||||
@ -204,20 +265,27 @@ std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
|
|||||||
|
|
||||||
size_t file_size = sb.st_size;
|
size_t file_size = sb.st_size;
|
||||||
|
|
||||||
void* mapped_data = mmap(nullptr, file_size, PROT_READ, mmap_flags, file_descriptor, 0);
|
if (file_size == 0) {
|
||||||
|
|
||||||
close(file_descriptor);
|
close(file_descriptor);
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
int mmap_prot = PROT_READ | (writable ? PROT_WRITE : 0);
|
||||||
|
|
||||||
|
void* mapped_data = mmap(nullptr, file_size, mmap_prot, mmap_flags, file_descriptor, 0);
|
||||||
|
|
||||||
if (mapped_data == MAP_FAILED) {
|
if (mapped_data == MAP_FAILED) {
|
||||||
|
close(file_descriptor);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef __linux__
|
#ifdef __linux__
|
||||||
// performance flags used by llama.cpp
|
if (cfg_flags.willneed) {
|
||||||
// posix_madvise(mapped_data, file_size, POSIX_MADV_WILLNEED);
|
posix_madvise(mapped_data, file_size, POSIX_MADV_WILLNEED);
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return std::make_unique<MmapWrapperImpl>(mapped_data, file_size);
|
return std::make_unique<MmapWrapperImpl>(mapped_data, file_size, file_descriptor);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -42,7 +42,7 @@ sd::Tensor<float> clip_preprocess(const sd::Tensor<float>& image, int target_wid
|
|||||||
|
|
||||||
class MmapWrapper {
|
class MmapWrapper {
|
||||||
public:
|
public:
|
||||||
static std::unique_ptr<MmapWrapper> create(const std::string& filename);
|
static std::unique_ptr<MmapWrapper> create(const std::string& filename, bool writable = false);
|
||||||
|
|
||||||
virtual ~MmapWrapper() = default;
|
virtual ~MmapWrapper() = default;
|
||||||
|
|
||||||
@ -52,6 +52,7 @@ public:
|
|||||||
MmapWrapper& operator=(MmapWrapper&&) = delete;
|
MmapWrapper& operator=(MmapWrapper&&) = delete;
|
||||||
|
|
||||||
const uint8_t* data() const { return static_cast<uint8_t*>(data_); }
|
const uint8_t* data() const { return static_cast<uint8_t*>(data_); }
|
||||||
|
uint8_t* writable_data() { return static_cast<uint8_t*>(data_); }
|
||||||
size_t size() const { return size_; }
|
size_t size() const { return size_; }
|
||||||
bool copy_data(void* buf, size_t n, size_t offset) const;
|
bool copy_data(void* buf, size_t n, size_t offset) const;
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user