mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2026-06-23 14:46:39 +00:00
perf: --eager-load to pre-load params at model-load time (#1687)
This commit is contained in:
parent
b12098f5d0
commit
787d229d84
@ -496,6 +496,10 @@ ArgOptions SDContextParams::get_options() {
|
|||||||
"--stream-layers",
|
"--stream-layers",
|
||||||
"enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram; defaults to false)",
|
"enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram; defaults to false)",
|
||||||
true, &stream_layers},
|
true, &stream_layers},
|
||||||
|
{"",
|
||||||
|
"--eager-load",
|
||||||
|
"load all params into the params backend at model-load time instead of lazily on first use (defaults to false)",
|
||||||
|
true, &eager_load},
|
||||||
{"",
|
{"",
|
||||||
"--force-sdxl-vae-conv-scale",
|
"--force-sdxl-vae-conv-scale",
|
||||||
"force use of conv scale on sdxl vae",
|
"force use of conv scale on sdxl vae",
|
||||||
@ -799,6 +803,7 @@ std::string SDContextParams::to_string() const {
|
|||||||
<< " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n"
|
<< " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n"
|
||||||
<< " max_vram: \"" << max_vram << "\",\n"
|
<< " max_vram: \"" << max_vram << "\",\n"
|
||||||
<< " stream_layers: " << (stream_layers ? "true" : "false") << ",\n"
|
<< " stream_layers: " << (stream_layers ? "true" : "false") << ",\n"
|
||||||
|
<< " eager_load: " << (eager_load ? "true" : "false") << ",\n"
|
||||||
<< " backend: \"" << backend << "\",\n"
|
<< " backend: \"" << backend << "\",\n"
|
||||||
<< " params_backend: \"" << params_backend << "\",\n"
|
<< " params_backend: \"" << params_backend << "\",\n"
|
||||||
<< " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n"
|
<< " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n"
|
||||||
@ -878,6 +883,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool taesd_preview) {
|
|||||||
sd_ctx_params.vae_format = str_to_vae_format(vae_format);
|
sd_ctx_params.vae_format = str_to_vae_format(vae_format);
|
||||||
sd_ctx_params.max_vram = max_vram.c_str();
|
sd_ctx_params.max_vram = max_vram.c_str();
|
||||||
sd_ctx_params.stream_layers = stream_layers;
|
sd_ctx_params.stream_layers = stream_layers;
|
||||||
|
sd_ctx_params.eager_load = eager_load;
|
||||||
sd_ctx_params.backend = effective_backend.c_str();
|
sd_ctx_params.backend = effective_backend.c_str();
|
||||||
sd_ctx_params.params_backend = effective_params_backend.c_str();
|
sd_ctx_params.params_backend = effective_params_backend.c_str();
|
||||||
sd_ctx_params.rpc_servers = rpc_servers.c_str();
|
sd_ctx_params.rpc_servers = rpc_servers.c_str();
|
||||||
|
|||||||
@ -148,6 +148,7 @@ struct SDContextParams {
|
|||||||
bool offload_params_to_cpu = false;
|
bool offload_params_to_cpu = false;
|
||||||
std::string max_vram = "0";
|
std::string max_vram = "0";
|
||||||
bool stream_layers = false;
|
bool stream_layers = false;
|
||||||
|
bool eager_load = false;
|
||||||
std::string backend;
|
std::string backend;
|
||||||
std::string params_backend;
|
std::string params_backend;
|
||||||
std::string rpc_servers;
|
std::string rpc_servers;
|
||||||
|
|||||||
@ -219,6 +219,7 @@ typedef struct {
|
|||||||
enum sd_vae_format_t vae_format;
|
enum sd_vae_format_t vae_format;
|
||||||
const char* max_vram; // GiB budget or backend assignment spec for graph-cut segmented param offload (0 = disabled, -1 = auto)
|
const char* max_vram; // GiB budget or backend assignment spec for graph-cut segmented param offload (0 = disabled, -1 = auto)
|
||||||
bool stream_layers; // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram)
|
bool stream_layers; // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram)
|
||||||
|
bool eager_load; // Load all params into the params backend at model-load time instead of lazily on first use
|
||||||
const char* backend;
|
const char* backend;
|
||||||
const char* params_backend;
|
const char* params_backend;
|
||||||
const char* rpc_servers;
|
const char* rpc_servers;
|
||||||
|
|||||||
@ -147,6 +147,17 @@ bool ModelManager::register_param_tensors(const std::string& desc,
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Loads every registered tensor state in a single call rather than leaving
// each one to be loaded lazily on first use (this is what --eager-load enables).
//
// Collects a pointer to each non-null entry of tensor_states_ and passes the
// whole list to load_tensors_to_params_backend().
//
// Returns the result of load_tensors_to_params_backend() unchanged; the
// model-load path treats a false result as a load failure.
bool ModelManager::load_all_params_eagerly() {
|
||||||
|
std::vector<TensorState*> all_states;
|
||||||
|
// Upper bound on the element count: null entries are skipped below.
all_states.reserve(tensor_states_.size());
|
||||||
|
for (const auto& s : tensor_states_) {
|
||||||
|
if (s != nullptr) {
|
||||||
|
// NOTE(review): entries look like smart pointers (.get() is used), so this
// vector would only borrow them — confirm against the tensor_states_ declaration.
all_states.push_back(s.get());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return load_tensors_to_params_backend(all_states);
|
||||||
|
}
|
||||||
|
|
||||||
bool ModelManager::validate_registered_tensors() {
|
bool ModelManager::validate_registered_tensors() {
|
||||||
bool ok = true;
|
bool ok = true;
|
||||||
for (const auto& state : tensor_states_) {
|
for (const auto& state : tensor_states_) {
|
||||||
|
|||||||
@ -158,6 +158,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool validate_registered_tensors();
|
bool validate_registered_tensors();
|
||||||
|
bool load_all_params_eagerly();
|
||||||
|
|
||||||
bool prepare_params(const std::vector<ggml_tensor*>& tensors) override;
|
bool prepare_params(const std::vector<ggml_tensor*>& tensors) override;
|
||||||
void release_compute_backend_params(const std::vector<ggml_tensor*>& tensors) override;
|
void release_compute_backend_params(const std::vector<ggml_tensor*>& tensors) override;
|
||||||
|
|||||||
@ -199,6 +199,7 @@ public:
|
|||||||
bool enable_mmap = false;
|
bool enable_mmap = false;
|
||||||
sd::ggml_graph_cut::MaxVramAssignment max_vram_assignment;
|
sd::ggml_graph_cut::MaxVramAssignment max_vram_assignment;
|
||||||
bool stream_layers = false;
|
bool stream_layers = false;
|
||||||
|
bool eager_load = false;
|
||||||
std::string backend_spec;
|
std::string backend_spec;
|
||||||
std::string params_backend_spec;
|
std::string params_backend_spec;
|
||||||
|
|
||||||
@ -342,6 +343,7 @@ public:
|
|||||||
n_threads = sd_ctx_params->n_threads;
|
n_threads = sd_ctx_params->n_threads;
|
||||||
enable_mmap = sd_ctx_params->enable_mmap;
|
enable_mmap = sd_ctx_params->enable_mmap;
|
||||||
stream_layers = sd_ctx_params->stream_layers;
|
stream_layers = sd_ctx_params->stream_layers;
|
||||||
|
eager_load = sd_ctx_params->eager_load;
|
||||||
backend_spec = SAFE_STR(sd_ctx_params->backend);
|
backend_spec = SAFE_STR(sd_ctx_params->backend);
|
||||||
params_backend_spec = SAFE_STR(sd_ctx_params->params_backend);
|
params_backend_spec = SAFE_STR(sd_ctx_params->params_backend);
|
||||||
max_vram_assignment.reset(0.f);
|
max_vram_assignment.reset(0.f);
|
||||||
@ -1153,7 +1155,15 @@ public:
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_DEBUG("model metadata validated; weights will be prepared lazily");
|
if (eager_load) {
|
||||||
|
if (!model_manager->load_all_params_eagerly()) {
|
||||||
|
LOG_ERROR("model params eager load failed");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
LOG_DEBUG("model metadata validated; weights pre-loaded to params backend");
|
||||||
|
} else {
|
||||||
|
LOG_DEBUG("model metadata validated; weights will be prepared lazily");
|
||||||
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
size_t total_params_ram_size = 0;
|
size_t total_params_ram_size = 0;
|
||||||
@ -2696,6 +2706,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
|
|||||||
sd_ctx_params->lora_apply_mode = LORA_APPLY_AUTO;
|
sd_ctx_params->lora_apply_mode = LORA_APPLY_AUTO;
|
||||||
sd_ctx_params->max_vram = nullptr;
|
sd_ctx_params->max_vram = nullptr;
|
||||||
sd_ctx_params->stream_layers = false;
|
sd_ctx_params->stream_layers = false;
|
||||||
|
sd_ctx_params->eager_load = false;
|
||||||
sd_ctx_params->enable_mmap = false;
|
sd_ctx_params->enable_mmap = false;
|
||||||
sd_ctx_params->diffusion_flash_attn = false;
|
sd_ctx_params->diffusion_flash_attn = false;
|
||||||
sd_ctx_params->circular_x = false;
|
sd_ctx_params->circular_x = false;
|
||||||
@ -2742,6 +2753,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
|
|||||||
"prediction: %s\n"
|
"prediction: %s\n"
|
||||||
"max_vram: %s\n"
|
"max_vram: %s\n"
|
||||||
"stream_layers: %s\n"
|
"stream_layers: %s\n"
|
||||||
|
"eager_load: %s\n"
|
||||||
"backend: %s\n"
|
"backend: %s\n"
|
||||||
"params_backend: %s\n"
|
"params_backend: %s\n"
|
||||||
"flash_attn: %s\n"
|
"flash_attn: %s\n"
|
||||||
@ -2777,6 +2789,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
|
|||||||
sd_prediction_name(sd_ctx_params->prediction),
|
sd_prediction_name(sd_ctx_params->prediction),
|
||||||
SAFE_STR(sd_ctx_params->max_vram),
|
SAFE_STR(sd_ctx_params->max_vram),
|
||||||
BOOL_STR(sd_ctx_params->stream_layers),
|
BOOL_STR(sd_ctx_params->stream_layers),
|
||||||
|
BOOL_STR(sd_ctx_params->eager_load),
|
||||||
SAFE_STR(sd_ctx_params->backend),
|
SAFE_STR(sd_ctx_params->backend),
|
||||||
SAFE_STR(sd_ctx_params->params_backend),
|
SAFE_STR(sd_ctx_params->params_backend),
|
||||||
BOOL_STR(sd_ctx_params->flash_attn),
|
BOOL_STR(sd_ctx_params->flash_attn),
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user