From 563137a5926ac9455420240c2a7d0f3f15eb9bd0 Mon Sep 17 00:00:00 2001 From: leejet Date: Sat, 13 Jun 2026 13:19:13 +0800 Subject: [PATCH] refactor: centralize runner weight staging and cleanup (#1644) --- src/conditioning/conditioner.hpp | 384 ++++----- src/core/ggml_extend.hpp | 933 +++++++------------- src/core/ggml_graph_cut.cpp | 93 +- src/core/ggml_graph_cut.h | 1 - src/core/layer_registry.cpp | 132 --- src/core/layer_registry.h | 50 -- src/core/util.cpp | 14 +- src/extensions/generation_extension.h | 22 +- src/extensions/photomaker_extension.cpp | 82 +- src/model/adapter/lora.hpp | 51 +- src/model/adapter/pmid.hpp | 7 +- src/model/diffusion/anima.hpp | 2 +- src/model/diffusion/control.hpp | 17 +- src/model/diffusion/ernie_image.hpp | 2 +- src/model/diffusion/flux.hpp | 2 +- src/model/diffusion/hidream_o1.hpp | 35 +- src/model/diffusion/ideogram4.hpp | 2 +- src/model/diffusion/lens.hpp | 2 +- src/model/diffusion/ltxv.hpp | 2 +- src/model/diffusion/mmdit.hpp | 2 +- src/model/diffusion/pid.hpp | 2 +- src/model/diffusion/qwen_image.hpp | 2 +- src/model/diffusion/unet.hpp | 2 +- src/model/diffusion/wan.hpp | 2 +- src/model/diffusion/z_image.hpp | 2 +- src/model/te/clip.hpp | 7 +- src/model/te/llm.hpp | 14 +- src/model/te/t5.hpp | 7 +- src/model/upscaler/esrgan.hpp | 8 +- src/model/upscaler/ltx_latent_upscaler.hpp | 53 +- src/model/vae/auto_encoder_kl.hpp | 8 +- src/model/vae/ltx_audio_vae.hpp | 10 +- src/model/vae/ltx_vae.hpp | 14 +- src/model/vae/tae.hpp | 22 +- src/model/vae/vae.hpp | 9 +- src/model/vae/wan_vae.hpp | 16 +- src/model_loader.cpp | 194 ++++- src/model_loader.h | 15 +- src/model_manager.cpp | 944 +++++++++++++++++++++ src/model_manager.h | 131 +++ src/runtime/guidance.cpp | 4 +- src/stable-diffusion.cpp | 748 +++++++--------- src/weight_manager.h | 15 + 43 files changed, 2335 insertions(+), 1729 deletions(-) delete mode 100644 src/core/layer_registry.cpp delete mode 100644 src/core/layer_registry.h create mode 100644 src/model_manager.cpp create 
mode 100644 src/model_manager.h create mode 100644 src/weight_manager.h diff --git a/src/conditioning/conditioner.hpp b/src/conditioning/conditioner.hpp index 5e74af07..9eaa0e72 100644 --- a/src/conditioning/conditioner.hpp +++ b/src/conditioning/conditioner.hpp @@ -113,14 +113,13 @@ struct Conditioner { public: virtual SDCondition get_learned_condition(int n_threads, const ConditionerParams& conditioner_params) = 0; - virtual bool alloc_params_buffer() = 0; - virtual void free_params_buffer() = 0; virtual void get_param_tensors(std::map& tensors) = 0; - virtual size_t get_params_buffer_size() = 0; virtual void set_max_graph_vram_bytes(size_t max_vram_bytes) {} virtual void set_stream_layers_enabled(bool enabled) {} virtual void set_flash_attention_enabled(bool enabled) = 0; virtual void set_weight_adapter(const std::shared_ptr& adapter) {} + virtual void set_weight_manager(const std::shared_ptr& manager) {} + virtual void runner_done() {} }; // ldm.modules.encoders.modules.FrozenCLIPEmbedder @@ -167,33 +166,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { } } - bool alloc_params_buffer() override { - if (!text_model->alloc_params_buffer()) { - return false; - } - if (sd_version_is_sdxl(version)) { - if (!text_model2->alloc_params_buffer()) { - return false; - } - } - return true; - } - - void free_params_buffer() override { - text_model->free_params_buffer(); - if (sd_version_is_sdxl(version)) { - text_model2->free_params_buffer(); - } - } - - size_t get_params_buffer_size() override { - size_t buffer_size = text_model->get_params_buffer_size(); - if (sd_version_is_sdxl(version)) { - buffer_size += text_model2->get_params_buffer_size(); - } - return buffer_size; - } - void set_max_graph_vram_bytes(size_t max_vram_bytes) override { text_model->set_max_graph_vram_bytes(max_vram_bytes); if (sd_version_is_sdxl(version)) { @@ -222,6 +194,20 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { } } + void set_weight_manager(const 
std::shared_ptr& manager) override { + text_model->set_weight_manager(manager); + if (sd_version_is_sdxl(version)) { + text_model2->set_weight_manager(manager); + } + } + + void runner_done() override { + text_model->runner_done(); + if (sd_version_is_sdxl(version)) { + text_model2->runner_done(); + } + } + bool load_embedding(std::string embd_name, std::string embd_path, std::vector& bpe_tokens) { ModelLoader model_loader; if (!model_loader.init_from_file_and_convert_name(embd_path)) { @@ -263,7 +249,8 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { } return true; }; - model_loader.load_tensors(on_load, 1); + model_loader.set_n_threads(1); + model_loader.load_tensors(on_load); int pos_start = num_custom_embeddings; if (embd) { int64_t hidden_size = text_model->model.hidden_size; @@ -432,7 +419,10 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { token_embed_custom.data(), max_token_idx, false, - clip_skip); + clip_skip, + false, + true, + true); GGML_ASSERT(!chunk_hidden_states.empty()); if (sd_version_is_sdxl(version)) { auto chunk_hidden_states2 = text_model2->compute(n_threads, @@ -441,7 +431,10 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { token_embed_custom.data(), max_token_idx, false, - clip_skip); + clip_skip, + false, + true, + true); GGML_ASSERT(!chunk_hidden_states2.empty()); chunk_hidden_states = sd::ops::concat(chunk_hidden_states, chunk_hidden_states2, 0); @@ -452,7 +445,10 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { token_embed_custom.data(), max_token_idx, true, - clip_skip); + clip_skip, + false, + true, + true); GGML_ASSERT(!pooled.empty()); } } @@ -523,15 +519,15 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { struct FrozenCLIPVisionEmbedder : public GGMLRunner { CLIPVisionModelProjection vision_model; + std::string weight_prefix = "cond_stage_model.transformer"; FrozenCLIPVisionEmbedder(ggml_backend_t backend, ggml_backend_t params_backend, const 
String2TensorStorage& tensor_storage_map = {}) : GGMLRunner(backend, params_backend) { - std::string prefix = "cond_stage_model.transformer"; - bool proj_in = false; + bool proj_in = false; for (const auto& [name, tensor_storage] : tensor_storage_map) { - if (!starts_with(name, prefix)) { + if (!starts_with(name, weight_prefix)) { continue; } if (contains(name, "self_attn.in_proj")) { @@ -540,7 +536,7 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner { } } vision_model = CLIPVisionModelProjection(OPEN_CLIP_VIT_H_14, false, proj_in); - vision_model.init(params_ctx, tensor_storage_map, prefix); + vision_model.init(params_ctx, tensor_storage_map, weight_prefix); } std::string get_desc() override { @@ -548,7 +544,7 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner { } void get_param_tensors(std::map& tensors) { - vision_model.get_param_tensors(tensors, "cond_stage_model.transformer"); + vision_model.get_param_tensors(tensors, weight_prefix); } ggml_cgraph* build_graph(const sd::Tensor& pixel_values_tensor, bool return_pooled, int clip_skip) { @@ -571,7 +567,7 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner { auto get_graph = [&]() -> ggml_cgraph* { return build_graph(pixel_values, return_pooled, clip_skip); }; - return take_or_empty(GGMLRunner::compute(get_graph, n_threads, true)); + return take_or_empty(GGMLRunner::compute(get_graph, n_threads, true, true, true)); } }; @@ -626,51 +622,6 @@ struct SD3CLIPEmbedder : public Conditioner { } } - bool alloc_params_buffer() override { - if (clip_l) { - if (!clip_l->alloc_params_buffer()) { - return false; - } - } - if (clip_g) { - if (!clip_g->alloc_params_buffer()) { - return false; - } - } - if (t5) { - if (!t5->alloc_params_buffer()) { - return false; - } - } - return true; - } - - void free_params_buffer() override { - if (clip_l) { - clip_l->free_params_buffer(); - } - if (clip_g) { - clip_g->free_params_buffer(); - } - if (t5) { - t5->free_params_buffer(); - } - } - - size_t get_params_buffer_size() 
override { - size_t buffer_size = 0; - if (clip_l) { - buffer_size += clip_l->get_params_buffer_size(); - } - if (clip_g) { - buffer_size += clip_g->get_params_buffer_size(); - } - if (t5) { - buffer_size += t5->get_params_buffer_size(); - } - return buffer_size; - } - void set_max_graph_vram_bytes(size_t max_vram_bytes) override { if (clip_l) { clip_l->set_max_graph_vram_bytes(max_vram_bytes); @@ -719,6 +670,30 @@ struct SD3CLIPEmbedder : public Conditioner { } } + void set_weight_manager(const std::shared_ptr& manager) override { + if (clip_l) { + clip_l->set_weight_manager(manager); + } + if (clip_g) { + clip_g->set_weight_manager(manager); + } + if (t5) { + t5->set_weight_manager(manager); + } + } + + void runner_done() override { + if (clip_l) { + clip_l->runner_done(); + } + if (clip_g) { + clip_g->runner_done(); + } + if (t5) { + t5->runner_done(); + } + } + std::vector, std::vector>> tokenize(std::string text, size_t min_length = 0, size_t max_length = 0, @@ -834,7 +809,10 @@ struct SD3CLIPEmbedder : public Conditioner { nullptr, max_token_idx, false, - clip_skip); + clip_skip, + false, + true, + true); GGML_ASSERT(!chunk_hidden_states_l.empty()); chunk_hidden_states_l = ::apply_token_weights(std::move(chunk_hidden_states_l), chunk_weights); @@ -847,7 +825,10 @@ struct SD3CLIPEmbedder : public Conditioner { nullptr, max_token_idx, true, - clip_skip); + clip_skip, + false, + true, + true); GGML_ASSERT(!pooled_l.empty()); } } else { @@ -875,7 +856,10 @@ struct SD3CLIPEmbedder : public Conditioner { nullptr, max_token_idx, false, - clip_skip); + clip_skip, + false, + true, + true); GGML_ASSERT(!chunk_hidden_states_g.empty()); chunk_hidden_states_g = ::apply_token_weights(std::move(chunk_hidden_states_g), chunk_weights); @@ -888,7 +872,10 @@ struct SD3CLIPEmbedder : public Conditioner { nullptr, max_token_idx, true, - clip_skip); + clip_skip, + false, + true, + true); GGML_ASSERT(!pooled_g.empty()); } } else { @@ -910,7 +897,10 @@ struct SD3CLIPEmbedder : 
public Conditioner { chunk_hidden_states_t5 = t5->compute(n_threads, input_ids, - sd::Tensor()); + sd::Tensor(), + false, + true, + true); GGML_ASSERT(!chunk_hidden_states_t5.empty()); chunk_hidden_states_t5 = ::apply_token_weights(std::move(chunk_hidden_states_t5), chunk_weights); } else { @@ -1009,40 +999,6 @@ struct FluxCLIPEmbedder : public Conditioner { } } - bool alloc_params_buffer() override { - if (clip_l) { - if (!clip_l->alloc_params_buffer()) { - return false; - } - } - if (t5) { - if (!t5->alloc_params_buffer()) { - return false; - } - } - return true; - } - - void free_params_buffer() override { - if (clip_l) { - clip_l->free_params_buffer(); - } - if (t5) { - t5->free_params_buffer(); - } - } - - size_t get_params_buffer_size() override { - size_t buffer_size = 0; - if (clip_l) { - buffer_size += clip_l->get_params_buffer_size(); - } - if (t5) { - buffer_size += t5->get_params_buffer_size(); - } - return buffer_size; - } - void set_max_graph_vram_bytes(size_t max_vram_bytes) override { if (clip_l) { clip_l->set_max_graph_vram_bytes(max_vram_bytes); @@ -1070,7 +1026,7 @@ struct FluxCLIPEmbedder : public Conditioner { } } - void set_weight_adapter(const std::shared_ptr& adapter) { + void set_weight_adapter(const std::shared_ptr& adapter) override { if (clip_l) { clip_l->set_weight_adapter(adapter); } @@ -1079,6 +1035,24 @@ struct FluxCLIPEmbedder : public Conditioner { } } + void set_weight_manager(const std::shared_ptr& manager) override { + if (clip_l) { + clip_l->set_weight_manager(manager); + } + if (t5) { + t5->set_weight_manager(manager); + } + } + + void runner_done() override { + if (clip_l) { + clip_l->runner_done(); + } + if (t5) { + t5->runner_done(); + } + } + std::vector, std::vector>> tokenize(std::string text, size_t min_length = 0, size_t max_length = 0) { @@ -1177,7 +1151,10 @@ struct FluxCLIPEmbedder : public Conditioner { nullptr, max_token_idx, true, - clip_skip); + clip_skip, + false, + true, + true); GGML_ASSERT(!pooled.empty()); 
} else { pooled = sd::Tensor::zeros({768}); @@ -1195,7 +1172,10 @@ struct FluxCLIPEmbedder : public Conditioner { sd::Tensor input_ids({static_cast(chunk_tokens.size())}, chunk_tokens); chunk_hidden_states = t5->compute(n_threads, input_ids, - sd::Tensor()); + sd::Tensor(), + false, + true, + true); GGML_ASSERT(!chunk_hidden_states.empty()); chunk_hidden_states = ::apply_token_weights(std::move(chunk_hidden_states), chunk_weights); if (zero_out_masked) { @@ -1266,29 +1246,6 @@ struct T5CLIPEmbedder : public Conditioner { } } - bool alloc_params_buffer() override { - if (t5) { - if (!t5->alloc_params_buffer()) { - return false; - } - } - return true; - } - - void free_params_buffer() override { - if (t5) { - t5->free_params_buffer(); - } - } - - size_t get_params_buffer_size() override { - size_t buffer_size = 0; - if (t5) { - buffer_size += t5->get_params_buffer_size(); - } - return buffer_size; - } - void set_max_graph_vram_bytes(size_t max_vram_bytes) override { if (t5) { t5->set_max_graph_vram_bytes(max_vram_bytes); @@ -1313,6 +1270,18 @@ struct T5CLIPEmbedder : public Conditioner { } } + void set_weight_manager(const std::shared_ptr& manager) override { + if (t5) { + t5->set_weight_manager(manager); + } + } + + void runner_done() override { + if (t5) { + t5->runner_done(); + } + } + std::tuple, std::vector, std::vector> tokenize(std::string text, size_t min_length = 0, size_t max_length = 0) { @@ -1406,7 +1375,10 @@ struct T5CLIPEmbedder : public Conditioner { auto chunk_hidden_states = t5->compute(n_threads, input_ids, - t5_attn_mask_chunk); + t5_attn_mask_chunk, + false, + true, + true); GGML_ASSERT(!chunk_hidden_states.empty()); chunk_hidden_states = apply_token_weights(std::move(chunk_hidden_states), chunk_weights); @@ -1465,21 +1437,6 @@ struct AnimaConditioner : public Conditioner { llm->get_param_tensors(tensors, "text_encoders.llm"); } - bool alloc_params_buffer() override { - if (!llm->alloc_params_buffer()) { - return false; - } - return true; - } - - 
void free_params_buffer() override { - llm->free_params_buffer(); - } - - size_t get_params_buffer_size() override { - return llm->get_params_buffer_size(); - } - void set_max_graph_vram_bytes(size_t max_vram_bytes) override { llm->set_max_graph_vram_bytes(max_vram_bytes); } @@ -1496,6 +1453,14 @@ struct AnimaConditioner : public Conditioner { llm->set_weight_adapter(adapter); } + void set_weight_manager(const std::shared_ptr& manager) override { + llm->set_weight_manager(manager); + } + + void runner_done() override { + llm->runner_done(); + } + std::tuple, std::vector, std::vector, std::vector> tokenize(std::string text) { auto parsed_attention = parse_prompt_attention(text); @@ -1553,7 +1518,11 @@ struct AnimaConditioner : public Conditioner { input_ids, sd::Tensor(), {}, - {}); + {}, + false, + false, + true, + true); GGML_ASSERT(!hidden_states.empty()); hidden_states = apply_token_weights(std::move(hidden_states), qwen_weights); auto t5_ids_tensor = sd::Tensor::from_vector(t5_tokens); @@ -1617,23 +1586,6 @@ struct LLMEmbedder : public Conditioner { llm->get_param_tensors(tensors, "text_encoders.llm"); } - bool alloc_params_buffer() override { - if (!llm->alloc_params_buffer()) { - return false; - } - return true; - } - - void free_params_buffer() override { - llm->free_params_buffer(); - } - - size_t get_params_buffer_size() override { - size_t buffer_size = 0; - buffer_size += llm->get_params_buffer_size(); - return buffer_size; - } - void set_max_graph_vram_bytes(size_t max_vram_bytes) override { llm->set_max_graph_vram_bytes(max_vram_bytes); } @@ -1652,6 +1604,18 @@ struct LLMEmbedder : public Conditioner { } } + void set_weight_manager(const std::shared_ptr& manager) override { + if (llm) { + llm->set_weight_manager(manager); + } + } + + void runner_done() override { + if (llm) { + llm->runner_done(); + } + } + std::tuple, std::vector, std::vector> tokenize(std::string text, const std::pair& attn_range, size_t min_length = 0, @@ -1747,7 +1711,11 @@ struct 
LLMEmbedder : public Conditioner { input_ids, attention_mask, image_embeds, - out_layers); + out_layers, + false, + false, + true, + true); GGML_ASSERT(!hidden_states.empty()); hidden_states = apply_token_weights(std::move(hidden_states), weights); GGML_ASSERT(hidden_states.shape()[1] > prompt_template_encode_start_idx); @@ -1825,7 +1793,7 @@ struct LLMEmbedder : public Conditioner { auto resized_image = clip_preprocess(image, w_bar, h_bar); - auto image_embed = llm->encode_image(n_threads, resized_image); + auto image_embed = llm->encode_image(n_threads, resized_image, false, true, true); GGML_ASSERT(!image_embed.empty()); image_embeds.emplace_back(image_embed_idx, image_embed); image_embed_idx += 1 + static_cast(image_embed.shape()[1]) + 6; @@ -1895,7 +1863,7 @@ struct LLMEmbedder : public Conditioner { LOG_DEBUG("resize conditioner ref image %d from %dx%d to %dx%d", i, height, width, h_bar, w_bar); auto resized_image = clip_preprocess(image, w_bar, h_bar); - auto image_embed = llm->encode_image(n_threads, resized_image); + auto image_embed = llm->encode_image(n_threads, resized_image, false, true, true); GGML_ASSERT(!image_embed.empty()); image_embeds.emplace_back(image_embed_idx, image_embed); image_embed_idx += 1 + static_cast(image_embed.shape()[1]) + 6; @@ -2163,11 +2131,15 @@ struct LTXAVTextProjectionRunner : public GGMLRunner { return gf; } - sd::Tensor compute(int n_threads, const sd::Tensor& x) { + sd::Tensor compute(int n_threads, + const sd::Tensor& x, + bool auto_free = true, + bool free_compute_buffer = true, + bool free_compute_params = true) { auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x); }; - return take_or_empty(GGMLRunner::compute(get_graph, n_threads, true)); + return take_or_empty(GGMLRunner::compute(get_graph, n_threads, auto_free, free_compute_buffer, free_compute_params)); } }; @@ -2205,25 +2177,6 @@ struct LTXAVEmbedder : public Conditioner { projector->get_param_tensors(tensors, "text_embedding_projection"); } - bool 
alloc_params_buffer() override { - if (!llm->alloc_params_buffer()) { - return false; - } - if (!projector->alloc_params_buffer()) { - return false; - } - return true; - } - - void free_params_buffer() override { - llm->free_params_buffer(); - projector->free_params_buffer(); - } - - size_t get_params_buffer_size() override { - return llm->get_params_buffer_size() + projector->get_params_buffer_size(); - } - void set_flash_attention_enabled(bool enabled) override { llm->set_flash_attention_enabled(enabled); projector->set_flash_attention_enabled(enabled); @@ -2239,6 +2192,16 @@ struct LTXAVEmbedder : public Conditioner { projector->set_weight_adapter(adapter); } + void set_weight_manager(const std::shared_ptr& manager) override { + llm->set_weight_manager(manager); + projector->set_weight_manager(manager); + } + + void runner_done() override { + llm->runner_done(); + projector->runner_done(); + } + std::tuple, std::vector, std::vector> tokenize(std::string text, const std::pair& attn_range) { std::vector> parsed_attention; @@ -2302,6 +2265,9 @@ struct LTXAVEmbedder : public Conditioner { attention_mask, {}, {}, + true, + false, + true, true); GGML_ASSERT(!hidden_states.empty()); hidden_states = apply_token_weights(std::move(hidden_states), weights); @@ -2361,7 +2327,7 @@ struct LTXAVEmbedder : public Conditioner { } hidden_states.reshape_({kNumStates * kHiddenSize, valid_tokens}); - return projector->compute(n_threads, hidden_states); + return projector->compute(n_threads, hidden_states, false, true, true); } SDCondition get_learned_condition(int n_threads, diff --git a/src/core/ggml_extend.hpp b/src/core/ggml_extend.hpp index ee4b413c..70703d24 100644 --- a/src/core/ggml_extend.hpp +++ b/src/core/ggml_extend.hpp @@ -25,7 +25,6 @@ #include "core/ggml_extend_backend.h" #include "core/ggml_graph_cut.h" -#include "core/layer_registry.h" #include "ggml-alloc.h" #include "ggml-backend.h" #include "ggml.h" @@ -36,6 +35,7 @@ #include "core/rng.hpp" #include 
"core/tensor_ggml.hpp" #include "core/util.h" +#include "weight_manager.h" #define EPS 1e-05f @@ -1655,6 +1655,7 @@ struct GGMLRunnerContext { std::vector>* debug_tensors = nullptr; std::function get_cache_tensor; std::function cache_tensor; + std::function set_backend_tensor_data; void capture_tensor(const std::string& name, ggml_tensor* tensor) { if (debug_tensors == nullptr || tensor == nullptr) { @@ -1680,6 +1681,13 @@ struct GGMLRunnerContext { } cache_tensor(name, tensor); } + + void bind_backend_tensor_data(ggml_tensor* tensor, const void* data) const { + if (!set_backend_tensor_data || tensor == nullptr || data == nullptr) { + return; + } + set_backend_tensor_data(tensor, data); + } }; struct GGMLRunner { @@ -1691,11 +1699,8 @@ protected: ggml_backend_t params_backend = nullptr; ggml_backend_t runtime_backend = nullptr; - ggml_context* params_ctx = nullptr; - ggml_backend_buffer_t params_buffer = nullptr; - ggml_context* offload_ctx = nullptr; - ggml_backend_buffer_t runtime_params_buffer = nullptr; - bool params_on_runtime_backend = false; + ggml_context* params_ctx = nullptr; + ggml_backend_buffer_t params_buffer = nullptr; ggml_context* cache_ctx = nullptr; ggml_backend_buffer_t cache_buffer = nullptr; @@ -1703,24 +1708,16 @@ protected: ggml_context* compute_ctx = nullptr; ggml_gallocr* compute_allocr = nullptr; - ggml_context* partial_offload_ctx = nullptr; - ggml_backend_buffer_t partial_runtime_params_buffer = nullptr; - std::vector> partial_offload_pairs; - - // Params kept on the runtime backend across streaming segments. 
- ggml_context* resident_offload_ctx = nullptr; - std::vector> resident_offload_pairs; - ggml_backend_buffer_t resident_runtime_params_buffer = nullptr; - std::unordered_set resident_param_set; - uint64_t resident_state_token = 0; - size_t max_graph_vram_bytes = 0; bool stream_layers_enabled = false; size_t observed_max_effective_budget_ = 0; - sd::layer_registry::LayerRegistry layer_registry_; - std::shared_ptr weight_adapter = nullptr; + std::weak_ptr weight_manager; + std::unordered_set kept_compute_param_tensor_set; + std::vector runner_param_tensors; + std::unordered_set runner_param_tensor_set; + bool params_tensor_set_dirty_ = true; std::vector one_vec = {1.f}; ggml_tensor* one_tensor = nullptr; @@ -1776,10 +1773,7 @@ protected: params_ctx = ggml_init(params); GGML_ASSERT(params_ctx != nullptr); params_tensor_set_.clear(); - if (params_backend != runtime_backend) { - offload_ctx = ggml_init(params); - GGML_ASSERT(offload_ctx != nullptr); - } + params_tensor_set_dirty_ = true; } void free_params_ctx() { @@ -1788,14 +1782,7 @@ protected: params_ctx = nullptr; } params_tensor_set_.clear(); - if (offload_ctx != nullptr) { - ggml_free(offload_ctx); - offload_ctx = nullptr; - } - if (partial_offload_ctx != nullptr) { - ggml_free(partial_offload_ctx); - partial_offload_ctx = nullptr; - } + params_tensor_set_dirty_ = true; } void alloc_cache_ctx() { @@ -1835,6 +1822,9 @@ protected: } void rebuild_params_tensor_set() { + if (!params_tensor_set_dirty_) { + return; + } params_tensor_set_.clear(); if (params_ctx == nullptr) { return; @@ -1842,6 +1832,96 @@ protected: for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) { params_tensor_set_.insert(t); } + params_tensor_set_dirty_ = false; + } + + std::vector collect_used_param_tensors(ggml_cgraph* gf) { + std::vector used_params; + rebuild_params_tensor_set(); + if (gf == nullptr || params_tensor_set_.empty()) { + return used_params; + } + + std::unordered_set 
seen_params; + const int n_leafs = sd::ggml_graph_cut::leaf_count(gf); + seen_params.reserve(static_cast(n_leafs)); + for (int i = 0; i < n_leafs; ++i) { + ggml_tensor* leaf = sd::ggml_graph_cut::leaf_tensor(gf, i); + ggml_tensor* param_leaf = leaf; + if (param_leaf != nullptr && params_tensor_set_.find(param_leaf) == params_tensor_set_.end()) { + param_leaf = param_leaf->view_src; + } + if (param_leaf != nullptr && + params_tensor_set_.find(param_leaf) != params_tensor_set_.end() && + seen_params.insert(param_leaf).second) { + used_params.push_back(param_leaf); + } + } + return used_params; + } + + bool prepare_execute_graph_weights(ggml_cgraph* gf, + std::vector& graph_param_tensors, + std::vector& params_to_prepare, + bool keep_compute_params) { + graph_param_tensors = collect_used_param_tensors(gf); + params_to_prepare.clear(); + params_to_prepare.reserve(graph_param_tensors.size()); + for (ggml_tensor* param : graph_param_tensors) { + if (param == nullptr) { + continue; + } + if (keep_compute_params && + kept_compute_param_tensor_set.find(param) != kept_compute_param_tensor_set.end()) { + continue; + } + params_to_prepare.push_back(param); + } + auto manager = weight_manager.lock(); + if (manager == nullptr) { + if (!params_to_prepare.empty()) { + if (params_buffer != nullptr) { + return true; + } + LOG_ERROR("%s weight manager is not set for graph params", get_desc().c_str()); + return false; + } + return true; + } + + if (!manager->prepare_params(params_to_prepare)) { + LOG_ERROR("%s prepare graph weights failed", get_desc().c_str()); + return false; + } + for (ggml_tensor* param : params_to_prepare) { + if (param == nullptr) { + continue; + } + if (runner_param_tensor_set.insert(param).second) { + runner_param_tensors.push_back(param); + } + } + return true; + } + + void free_compute_backend_param_tensors(const std::vector& tensors) { + if (tensors.empty()) { + return; + } + auto manager = weight_manager.lock(); + if (manager != nullptr) { + 
manager->release_compute_backend_params(tensors); + } + } + + void free_params_backend_param_tensors(const std::vector& tensors) { + if (tensors.empty()) { + return; + } + auto manager = weight_manager.lock(); + if (manager != nullptr) { + manager->release_params_backend_params(tensors); + } } void prepare_build_in_tensor_before() { @@ -2109,316 +2189,6 @@ protected: } } - bool offload_all_params() { - restore_partial_params(); - if (params_backend == runtime_backend) { - return true; - } - if (params_on_runtime_backend) { - return true; - } - GGML_ASSERT(runtime_params_buffer == nullptr); - int64_t t0 = ggml_time_ms(); - size_t num_tensors = ggml_tensor_num(offload_ctx); - if (num_tensors == 0) { - for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) { - GGML_ASSERT(t->view_src == nullptr); - ggml_dup_tensor(offload_ctx, t); - } - } - num_tensors = ggml_tensor_num(offload_ctx); - GGML_ASSERT(num_tensors == ggml_tensor_num(params_ctx)); - - runtime_params_buffer = ggml_backend_alloc_ctx_tensors(offload_ctx, runtime_backend); - - if (runtime_params_buffer == nullptr) { - LOG_ERROR("%s alloc runtime params backend buffer failed, num_tensors = %i", - get_desc().c_str(), - num_tensors); - return false; - } - ggml_backend_buffer_set_usage(runtime_params_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); - - ggml_tensor* t = ggml_get_first_tensor(params_ctx); - ggml_tensor* offload_t = ggml_get_first_tensor(offload_ctx); - - while (t != nullptr && offload_t != nullptr) { - ggml_backend_tensor_copy(t, offload_t); - std::swap(t->buffer, offload_t->buffer); - std::swap(t->data, offload_t->data); - std::swap(t->extra, offload_t->extra); - - t = ggml_get_next_tensor(params_ctx, t); - offload_t = ggml_get_next_tensor(offload_ctx, offload_t); - } - - int64_t t1 = ggml_time_ms(); - - size_t params_buffer_size = ggml_backend_buffer_get_size(runtime_params_buffer); - LOG_INFO("%s offload params (%6.2f MB, %i tensors) to runtime 
backend (%s), taking %.2fs", - get_desc().c_str(), - params_buffer_size / (1024.f * 1024.f), - num_tensors, - ggml_backend_name(runtime_backend), - (t1 - t0) * 1.0f / 1000); - - params_on_runtime_backend = true; - - return true; - } - - bool offload_partial_params(const std::vector& tensors) { - restore_partial_params(); - if (params_backend == runtime_backend) { - return true; - } - if (tensors.empty()) { - return true; - } - GGML_ASSERT(!params_on_runtime_backend); - GGML_ASSERT(partial_runtime_params_buffer == nullptr); - - std::vector unique_tensors; - std::unordered_set seen_tensors; - unique_tensors.reserve(tensors.size()); - seen_tensors.reserve(tensors.size()); - for (ggml_tensor* tensor : tensors) { - if (tensor == nullptr) { - continue; - } - if (resident_param_set.find(tensor) != resident_param_set.end()) { - continue; - } - if (seen_tensors.insert(tensor).second) { - unique_tensors.push_back(tensor); - } - } - if (unique_tensors.empty()) { - return true; - } - - ggml_init_params params; - params.mem_size = std::max(1, unique_tensors.size()) * ggml_tensor_overhead(); - params.mem_buffer = nullptr; - params.no_alloc = true; - - partial_offload_ctx = ggml_init(params); - GGML_ASSERT(partial_offload_ctx != nullptr); - - partial_offload_pairs.clear(); - partial_offload_pairs.reserve(unique_tensors.size()); - - for (ggml_tensor* tensor : unique_tensors) { - GGML_ASSERT(tensor->view_src == nullptr); - ggml_tensor* offload_tensor = ggml_dup_tensor(partial_offload_ctx, tensor); - ggml_set_name(offload_tensor, tensor->name); - partial_offload_pairs.push_back({tensor, offload_tensor}); - } - - partial_runtime_params_buffer = ggml_backend_alloc_ctx_tensors(partial_offload_ctx, runtime_backend); - if (partial_runtime_params_buffer == nullptr) { - LOG_ERROR("%s alloc partial runtime params backend buffer failed, num_tensors = %zu", - get_desc().c_str(), - partial_offload_pairs.size()); - ggml_free(partial_offload_ctx); - partial_offload_ctx = nullptr; - 
partial_offload_pairs.clear(); - return false; - } - ggml_backend_buffer_set_usage(partial_runtime_params_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); - - for (auto& pair : partial_offload_pairs) { - ggml_tensor* tensor = pair.first; - ggml_tensor* offload_tensor = pair.second; - - ggml_backend_tensor_copy(tensor, offload_tensor); - std::swap(tensor->buffer, offload_tensor->buffer); - std::swap(tensor->data, offload_tensor->data); - std::swap(tensor->extra, offload_tensor->extra); - } - - size_t params_buffer_size = ggml_backend_buffer_get_size(partial_runtime_params_buffer); - LOG_DEBUG("%s offload partial params (%6.2f MB, %zu tensors) to runtime backend (%s)", - get_desc().c_str(), - params_buffer_size / (1024.f * 1024.f), - partial_offload_pairs.size(), - ggml_backend_name(runtime_backend)); - - return true; - } - - void restore_all_params() { - restore_partial_params(); - if (!params_on_runtime_backend) { - return; - } - ggml_tensor* t = ggml_get_first_tensor(params_ctx); - ggml_tensor* offload_t = ggml_get_first_tensor(offload_ctx); - - while (t != nullptr && offload_t != nullptr) { - t->buffer = offload_t->buffer; - t->data = offload_t->data; - t->extra = offload_t->extra; - offload_t->buffer = nullptr; - offload_t->data = nullptr; - offload_t->extra = nullptr; - - t = ggml_get_next_tensor(params_ctx, t); - offload_t = ggml_get_next_tensor(offload_ctx, offload_t); - } - - if (runtime_params_buffer != nullptr) { - ggml_backend_buffer_free(runtime_params_buffer); - runtime_params_buffer = nullptr; - } - params_on_runtime_backend = false; - } - - void restore_partial_params() { - if (partial_offload_pairs.empty()) { - if (partial_runtime_params_buffer != nullptr) { - ggml_backend_buffer_free(partial_runtime_params_buffer); - partial_runtime_params_buffer = nullptr; - } - if (partial_offload_ctx != nullptr) { - ggml_free(partial_offload_ctx); - partial_offload_ctx = nullptr; - } - return; - } - - for (auto& pair : partial_offload_pairs) { - ggml_tensor* tensor = 
pair.first; - ggml_tensor* offload_tensor = pair.second; - - tensor->buffer = offload_tensor->buffer; - tensor->data = offload_tensor->data; - tensor->extra = offload_tensor->extra; - offload_tensor->buffer = nullptr; - offload_tensor->data = nullptr; - offload_tensor->extra = nullptr; - } - - if (partial_runtime_params_buffer != nullptr) { - ggml_backend_buffer_free(partial_runtime_params_buffer); - partial_runtime_params_buffer = nullptr; - } - partial_offload_pairs.clear(); - - if (partial_offload_ctx != nullptr) { - ggml_free(partial_offload_ctx); - partial_offload_ctx = nullptr; - } - } - - bool offload_resident_params(const std::vector& tensors) { - if (params_backend == runtime_backend) { - return true; - } - if (tensors.empty()) { - return true; - } - GGML_ASSERT(resident_runtime_params_buffer == nullptr); - GGML_ASSERT(resident_offload_ctx == nullptr); - GGML_ASSERT(resident_offload_pairs.empty()); - GGML_ASSERT(resident_param_set.empty()); - - std::vector unique_tensors; - std::unordered_set seen; - unique_tensors.reserve(tensors.size()); - seen.reserve(tensors.size()); - for (ggml_tensor* t : tensors) { - if (t == nullptr) - continue; - if (seen.insert(t).second) - unique_tensors.push_back(t); - } - if (unique_tensors.empty()) - return true; - - ggml_init_params init = {}; - init.mem_size = std::max(1, unique_tensors.size()) * ggml_tensor_overhead(); - init.mem_buffer = nullptr; - init.no_alloc = true; - resident_offload_ctx = ggml_init(init); - GGML_ASSERT(resident_offload_ctx != nullptr); - - resident_offload_pairs.reserve(unique_tensors.size()); - for (ggml_tensor* t : unique_tensors) { - GGML_ASSERT(t->view_src == nullptr); - ggml_tensor* twin = ggml_dup_tensor(resident_offload_ctx, t); - ggml_set_name(twin, t->name); - resident_offload_pairs.push_back({t, twin}); - } - - resident_runtime_params_buffer = ggml_backend_alloc_ctx_tensors(resident_offload_ctx, runtime_backend); - if (resident_runtime_params_buffer == nullptr) { - LOG_ERROR("%s alloc 
resident runtime params backend buffer failed, num_tensors = %zu", - get_desc().c_str(), resident_offload_pairs.size()); - ggml_free(resident_offload_ctx); - resident_offload_ctx = nullptr; - resident_offload_pairs.clear(); - return false; - } - ggml_backend_buffer_set_usage(resident_runtime_params_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); - - for (auto& pair : resident_offload_pairs) { - ggml_tensor* t = pair.first; - ggml_tensor* twin = pair.second; - ggml_backend_tensor_copy(t, twin); - std::swap(t->buffer, twin->buffer); - std::swap(t->data, twin->data); - std::swap(t->extra, twin->extra); - resident_param_set.insert(t); - } - ggml_backend_synchronize(runtime_backend); - - size_t sz = ggml_backend_buffer_get_size(resident_runtime_params_buffer); - LOG_INFO("%s offload resident params (%6.2f MB, %zu tensors) to runtime backend (%s)", - get_desc().c_str(), - sz / (1024.f * 1024.f), - resident_offload_pairs.size(), - ggml_backend_name(runtime_backend)); - return true; - } - - void restore_resident_params() { - if (resident_offload_pairs.empty()) { - if (resident_runtime_params_buffer != nullptr) { - ggml_backend_buffer_free(resident_runtime_params_buffer); - resident_runtime_params_buffer = nullptr; - } - if (resident_offload_ctx != nullptr) { - ggml_free(resident_offload_ctx); - resident_offload_ctx = nullptr; - } - resident_param_set.clear(); - resident_state_token = 0; - return; - } - for (auto& pair : resident_offload_pairs) { - ggml_tensor* t = pair.first; - ggml_tensor* twin = pair.second; - t->buffer = twin->buffer; - t->data = twin->data; - t->extra = twin->extra; - twin->buffer = nullptr; - twin->data = nullptr; - twin->extra = nullptr; - } - if (resident_runtime_params_buffer != nullptr) { - ggml_backend_buffer_free(resident_runtime_params_buffer); - resident_runtime_params_buffer = nullptr; - } - resident_offload_pairs.clear(); - if (resident_offload_ctx != nullptr) { - ggml_free(resident_offload_ctx); - resident_offload_ctx = nullptr; - } - 
resident_param_set.clear(); - resident_state_token = 0; - } - bool should_use_graph_cut_segmented_compute(const GraphCutPlan& plan) { return plan.has_cuts && plan.valid && @@ -2440,18 +2210,12 @@ protected: GGML_ASSERT(plan_out != nullptr); GGML_ASSERT(gf != nullptr); - // Keep the plan and resident params under the same live-VRAM cap. - // Add back our own resident buffer so we don't see chunk-K's - // allocation as "taken" VRAM and shrink the budget on every step. size_t effective_budget = max_graph_vram_bytes; if (stream_layers_enabled && max_graph_vram_bytes > 0 && runtime_backend != nullptr) { ggml_backend_dev_t dev = ggml_backend_get_device(runtime_backend); if (dev != nullptr && ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) { size_t free_vram = 0, total_vram = 0; ggml_backend_dev_memory(dev, &free_vram, &total_vram); - if (resident_runtime_params_buffer != nullptr) { - free_vram += ggml_backend_buffer_get_size(resident_runtime_params_buffer); - } constexpr size_t safety_margin = 512ull * 1024 * 1024; size_t free_clamp = (free_vram > safety_margin) ? 
(free_vram - safety_margin) : 0; if (free_clamp < effective_budget) { @@ -2500,6 +2264,9 @@ protected: planner_budget, params_tensor_set_, get_desc().c_str()); + if (stream_layers_enabled) { + sd::ggml_graph_cut::annotate_residency(*plan_out, effective_budget); + } if (stream_layers_enabled) { if (budget_increased) { LOG_INFO("%s streaming budget = %.2f MB", @@ -2635,310 +2402,173 @@ protected: template std::optional> execute_graph(ggml_cgraph* gf, int n_threads, - bool free_compute_buffer_immediately, - const std::vector& runtime_param_tensors, + bool free_compute_buffer, + bool free_compute_params, bool preserve_backend_tensor_data_map, bool no_return = false, const std::unordered_set* cache_keep_names = nullptr) { - int64_t t_execute_begin = ggml_time_ms(); - const bool use_partial_param_offload = !runtime_param_tensors.empty(); - int64_t t_offload_begin = ggml_time_ms(); - if (use_partial_param_offload) { - if (!offload_partial_params(runtime_param_tensors)) { - LOG_ERROR("%s offload partial params to runtime backend failed", get_desc().c_str()); - return std::nullopt; - } - } else { - if (!offload_all_params()) { - LOG_ERROR("%s offload params to runtime backend failed", get_desc().c_str()); - return std::nullopt; - } - } - int64_t t_offload_end = ggml_time_ms(); - - int64_t t_alloc_begin = ggml_time_ms(); - if (!alloc_compute_buffer(gf)) { - LOG_ERROR("%s alloc compute buffer failed", get_desc().c_str()); - if (use_partial_param_offload) { - restore_partial_params(); - } + std::vector graph_param_tensors; + std::vector params_to_prepare; + if (!prepare_execute_graph_weights(gf, graph_param_tensors, params_to_prepare, !free_compute_params)) { return std::nullopt; } + struct GraphWeightDoneGuard { + GraphWeightDoneGuard(GGMLRunner* runner, const std::vector* tensors) + : runner(runner), + tensors(tensors) {} + + GGMLRunner* runner = nullptr; + const std::vector* tensors = nullptr; + bool enabled = true; + + ~GraphWeightDoneGuard() { + if (enabled && runner != 
nullptr && tensors != nullptr) { + runner->free_compute_backend_param_tensors(*tensors); + } + } + + void dismiss() { enabled = false; } + + GraphWeightDoneGuard(const GraphWeightDoneGuard&) = delete; + GraphWeightDoneGuard& operator=(const GraphWeightDoneGuard&) = delete; + }; + GraphWeightDoneGuard graph_weight_done_guard(this, &params_to_prepare); + + if (!alloc_compute_buffer(gf)) { + LOG_ERROR("%s alloc compute buffer failed", get_desc().c_str()); + return std::nullopt; + } + struct ComputeBufferGuard { + ComputeBufferGuard(GGMLRunner* runner, bool enabled) + : runner(runner), + enabled(enabled) {} + + GGMLRunner* runner = nullptr; + bool enabled = false; + + ~ComputeBufferGuard() { + if (enabled && runner != nullptr) { + runner->free_compute_buffer(); + } + } + + ComputeBufferGuard(const ComputeBufferGuard&) = delete; + ComputeBufferGuard& operator=(const ComputeBufferGuard&) = delete; + }; + ComputeBufferGuard compute_buffer_guard(this, free_compute_buffer); if (!ggml_gallocr_alloc_graph(compute_allocr, gf)) { LOG_ERROR("%s alloc compute graph failed", get_desc().c_str()); - if (free_compute_buffer_immediately) { - free_compute_buffer(); - } else if (use_partial_param_offload) { - restore_partial_params(); - } return std::nullopt; } - int64_t t_alloc_end = ggml_time_ms(); - int64_t t_copy_begin = ggml_time_ms(); copy_data_to_backend_tensor(gf, !preserve_backend_tensor_data_map); - int64_t t_copy_end = ggml_time_ms(); if (sd_backend_is_cpu(runtime_backend)) { sd_backend_cpu_set_n_threads(runtime_backend, n_threads); } - int64_t t_compute_begin = ggml_time_ms(); - ggml_status status = ggml_backend_graph_compute(runtime_backend, gf); - int64_t t_compute_end = ggml_time_ms(); + ggml_status status = ggml_backend_graph_compute(runtime_backend, gf); if (status != GGML_STATUS_SUCCESS) { LOG_ERROR("%s compute failed: %s", get_desc().c_str(), ggml_status_to_string(status)); - if (free_compute_buffer_immediately) { - free_compute_buffer(); - } else if 
(use_partial_param_offload) { - restore_partial_params(); - } return std::nullopt; } - std::unordered_set debug_graph_tensor_set; - const int n_debug_leafs = sd::ggml_graph_cut::leaf_count(gf); - const int n_debug_nodes = ggml_graph_n_nodes(gf); - debug_graph_tensor_set.reserve(static_cast(n_debug_leafs + n_debug_nodes)); - for (int i = 0; i < n_debug_leafs; ++i) { - debug_graph_tensor_set.insert(sd::ggml_graph_cut::leaf_tensor(gf, i)); - } - for (int i = 0; i < n_debug_nodes; ++i) { - debug_graph_tensor_set.insert(ggml_graph_node(gf, i)); + if (!debug_tensors.empty()) { + std::unordered_set debug_graph_tensor_set; + const int n_debug_leafs = sd::ggml_graph_cut::leaf_count(gf); + const int n_debug_nodes = ggml_graph_n_nodes(gf); + debug_graph_tensor_set.reserve(static_cast(n_debug_leafs + n_debug_nodes)); + for (int i = 0; i < n_debug_leafs; ++i) { + debug_graph_tensor_set.insert(sd::ggml_graph_cut::leaf_tensor(gf, i)); + } + for (int i = 0; i < n_debug_nodes; ++i) { + debug_graph_tensor_set.insert(ggml_graph_node(gf, i)); + } + + for (const auto& entry : debug_tensors) { + auto tensor = entry.first; + if (tensor == nullptr) { + continue; + } + if (debug_graph_tensor_set.find(tensor) == debug_graph_tensor_set.end()) { + continue; + } + ggml_backend_buffer_t tensor_buf = tensor->view_src ? 
tensor->view_src->buffer : tensor->buffer; + if (tensor_buf == nullptr) { + LOG_WARN("%s skip debug tensor '%s': tensor buffer not set", + get_desc().c_str(), + entry.second.c_str()); + continue; + } + if (tensor->type != GGML_TYPE_F32) { + LOG_WARN("%s skip debug tensor '%s': only GGML_TYPE_F32 is supported, got %s", + get_desc().c_str(), + entry.second.c_str(), + ggml_type_name(tensor->type)); + continue; + } + auto debug_tensor = sd::make_sd_tensor_from_ggml(tensor); + print_sd_tensor(debug_tensor, false, entry.second.c_str()); + } } - for (const auto& entry : debug_tensors) { - auto tensor = entry.first; - if (tensor == nullptr) { - continue; - } - if (debug_graph_tensor_set.find(tensor) == debug_graph_tensor_set.end()) { - continue; - } - ggml_backend_buffer_t tensor_buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; - if (tensor_buf == nullptr) { - LOG_WARN("%s skip debug tensor '%s': tensor buffer not set", - get_desc().c_str(), - entry.second.c_str()); - continue; - } - if (tensor->type != GGML_TYPE_F32) { - LOG_WARN("%s skip debug tensor '%s': only GGML_TYPE_F32 is supported, got %s", - get_desc().c_str(), - entry.second.c_str(), - ggml_type_name(tensor->type)); - continue; - } - auto debug_tensor = sd::make_sd_tensor_from_ggml(tensor); - print_sd_tensor(debug_tensor, false, entry.second.c_str()); - } - - int64_t t_cache_begin = ggml_time_ms(); if (!copy_cache_tensors_to_cache_buffer(cache_keep_names)) { - if (free_compute_buffer_immediately) { - free_compute_buffer(); - } else if (use_partial_param_offload) { - restore_partial_params(); - } return std::nullopt; } - int64_t t_cache_end = ggml_time_ms(); - auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str()); + auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str()); std::optional> output; if (!no_return) { output = read_graph_tensor(result, "output"); if (!output.has_value()) { - if (free_compute_buffer_immediately) { - free_compute_buffer(); - } else if 
(use_partial_param_offload) { - restore_partial_params(); - } return std::nullopt; } } else { output = sd::Tensor(); } - if (free_compute_buffer_immediately) { - free_compute_buffer(); - } else if (use_partial_param_offload) { - restore_partial_params(); - } - if (use_partial_param_offload) { - LOG_DEBUG("%s execute_graph timing: offload=%lld ms alloc=%lld ms copy_in=%lld ms compute=%lld ms cache=%lld ms total=%lld ms", - get_desc().c_str(), - t_offload_end - t_offload_begin, - t_alloc_end - t_alloc_begin, - t_copy_end - t_copy_begin, - t_compute_end - t_compute_begin, - t_cache_end - t_cache_begin, - ggml_time_ms() - t_execute_begin); - } - return output; - } - - template - std::optional> compute_with_graph_cuts(ggml_cgraph* gf, - const GraphCutPlan& plan, - int n_threads, - bool free_compute_buffer_immediately, - bool no_return = false) { - GGML_ASSERT(gf != nullptr); - - free_compute_buffer(); - free_cache_ctx_and_buffer(); - - std::unordered_map persistent_externals; - snapshot_persistent_externals(plan, gf, persistent_externals); - - std::optional> output = sd::Tensor(); - for (size_t seg_idx = 0; seg_idx < plan.segments.size(); ++seg_idx) { - int64_t t_segment_begin = ggml_time_ms(); - const auto& segment = plan.segments[seg_idx]; - auto future_cut_names = sd::ggml_graph_cut::collect_future_input_names(gf, plan, seg_idx); - LOG_DEBUG("%s graph cut executing segment %zu/%zu: %s", - get_desc().c_str(), - seg_idx + 1, - plan.segments.size(), - segment.group_name.c_str()); - - reset_segment_runtime_tensors(segment, gf, &persistent_externals); - if (!bind_segment_cached_inputs(gf, segment)) { - free_cache_ctx_and_buffer(); - free_compute_buffer(); - free_compute_ctx(); - return std::nullopt; - } - - const bool is_last_segment = seg_idx + 1 == plan.segments.size(); - if (!is_last_segment) { - for (size_t output_idx = 0; output_idx < segment.output_node_indices.size(); ++output_idx) { - ggml_tensor* output_tensor = sd::ggml_graph_cut::output_tensor(gf, segment, 
output_idx); - if (output_tensor != nullptr && - sd::ggml_graph_cut::is_graph_cut_tensor(output_tensor) && - future_cut_names.find(output_tensor->name) != future_cut_names.end()) { - cache(output_tensor->name, output_tensor); - } + if (!free_compute_params) { + for (ggml_tensor* param : params_to_prepare) { + if (param == nullptr) { + continue; } + kept_compute_param_tensor_set.insert(param); } - - ggml_context* segment_graph_ctx = nullptr; - ggml_cgraph* segment_graph = sd::ggml_graph_cut::build_segment_graph(gf, segment, &segment_graph_ctx); - auto segment_output = execute_graph(segment_graph, - n_threads, - true, - sd::ggml_graph_cut::runtime_param_tensors(gf, segment, get_desc().c_str()), - true, - !is_last_segment || no_return, - &future_cut_names); - ggml_free(segment_graph_ctx); - if (!segment_output.has_value()) { - free_cache_ctx_and_buffer(); - free_compute_buffer(); - free_compute_ctx(); - return std::nullopt; - } - output = std::move(segment_output); + graph_weight_done_guard.dismiss(); } - - backend_tensor_data_map.clear(); - free_cache_ctx_and_buffer(); - free_compute_ctx(); return output; } -public: - void release_streaming_residency() { - restore_resident_params(); - } - template - std::optional> compute_streaming_segments(ggml_cgraph* gf, + std::optional> compute_graph_cut_segments(ggml_cgraph* gf, const GraphCutPlan& plan, - size_t residency_budget_bytes, int n_threads, - bool free_compute_buffer_immediately, + bool log_residency, bool no_return = false) { GGML_ASSERT(gf != nullptr); - // Runtime LoRA composes `weight + diff` in the compute graph via - // ggml_add; the resident weight tensor's data is never mutated, so - // chunk-K residency stays valid across sampling steps. - // Reserve room for the worst merged segment so chunk-K can't grow - // large enough to starve later partial-param allocations. 
- size_t worst_merged_segment_footprint = 0; - for (const auto& seg : plan.segments) { - const size_t fp = seg.input_param_bytes + - seg.compute_buffer_size + - seg.output_bytes + - seg.input_previous_cut_bytes + - seg.input_external_bytes; - if (fp > worst_merged_segment_footprint) { - worst_merged_segment_footprint = fp; - } - } - const size_t residency_budget_for_annotate = - residency_budget_bytes > worst_merged_segment_footprint - ? residency_budget_bytes - worst_merged_segment_footprint - : 0; - - sd::ggml_graph_cut::Plan& base_plan = graph_cut_plan_cache_.graph_cut_plan; - if (base_plan.available) { - sd::ggml_graph_cut::annotate_residency(base_plan, residency_budget_for_annotate); - - std::vector resident_params; - uint64_t token = 0; - for (const auto& segment : base_plan.segments) { - if (segment.residency != sd::ggml_graph_cut::SegmentResidency::RESIDENT) { - continue; - } - auto seg_params = sd::ggml_graph_cut::param_tensors(gf, segment); - for (ggml_tensor* t : seg_params) { - if (t == nullptr) - continue; - resident_params.push_back(t); - token ^= reinterpret_cast(t) * 0x9E3779B97F4A7C15ull; - } - } - if (token != resident_state_token) { - restore_resident_params(); - if (!resident_params.empty()) { - if (offload_resident_params(resident_params)) { - resident_state_token = token; - } else { - LOG_ERROR("%s chunk-K: resident offload failed; continuing with per-segment streaming", - get_desc().c_str()); - restore_resident_params(); - } - } - } - } - free_compute_buffer(); free_cache_ctx_and_buffer(); - layer_registry_.move_layer_to_gpu("_global"); - std::unordered_map persistent_externals; snapshot_persistent_externals(plan, gf, persistent_externals); std::optional> output = sd::Tensor(); for (size_t seg_idx = 0; seg_idx < plan.segments.size(); ++seg_idx) { - int64_t t_segment_begin = ggml_time_ms(); - const auto& segment = plan.segments[seg_idx]; - const bool is_last = seg_idx + 1 == plan.segments.size(); - auto future_cut_names = 
sd::ggml_graph_cut::collect_future_input_names(gf, plan, seg_idx); - - LOG_DEBUG("%s streaming-cut executing segment %zu/%zu: %s (residency=%s)", - get_desc().c_str(), - seg_idx + 1, - plan.segments.size(), - segment.group_name.c_str(), - segment.residency == sd::ggml_graph_cut::SegmentResidency::RESIDENT ? "RESIDENT" : "STREAMED"); - - if (!layer_registry_.move_layer_to_gpu(segment.group_name)) { - LOG_DEBUG("%s streaming: no registry entry for group '%s' (using upstream offload path)", + const auto& segment = plan.segments[seg_idx]; + const bool is_last = seg_idx + 1 == plan.segments.size(); + auto future_cut_names = sd::ggml_graph_cut::collect_future_input_names(gf, plan, seg_idx); + if (log_residency) { + LOG_DEBUG("%s graph cut executing segment %zu/%zu: %s (residency=%s)", get_desc().c_str(), + seg_idx + 1, + plan.segments.size(), + segment.group_name.c_str(), + segment.residency == sd::ggml_graph_cut::SegmentResidency::RESIDENT ? "RESIDENT" : "STREAMED"); + } else { + LOG_DEBUG("%s graph cut executing segment %zu/%zu: %s", + get_desc().c_str(), + seg_idx + 1, + plan.segments.size(), segment.group_name.c_str()); } @@ -2952,23 +2582,24 @@ public: if (!is_last) { for (size_t output_idx = 0; output_idx < segment.output_node_indices.size(); ++output_idx) { - ggml_tensor* out_tensor = sd::ggml_graph_cut::output_tensor(gf, segment, output_idx); - if (out_tensor != nullptr && - sd::ggml_graph_cut::is_graph_cut_tensor(out_tensor) && - future_cut_names.find(out_tensor->name) != future_cut_names.end()) { - cache(out_tensor->name, out_tensor); + ggml_tensor* output_tensor = sd::ggml_graph_cut::output_tensor(gf, segment, output_idx); + if (output_tensor != nullptr && + sd::ggml_graph_cut::is_graph_cut_tensor(output_tensor) && + future_cut_names.find(output_tensor->name) != future_cut_names.end()) { + cache(output_tensor->name, output_tensor); } } } ggml_context* segment_graph_ctx = nullptr; ggml_cgraph* segment_graph = sd::ggml_graph_cut::build_segment_graph(gf, segment, 
&segment_graph_ctx); + const bool keep_segment_params = segment.residency == sd::ggml_graph_cut::SegmentResidency::RESIDENT; auto segment_output = execute_graph(segment_graph, n_threads, - /*free_compute_buffer_immediately=*/true, - sd::ggml_graph_cut::runtime_param_tensors(gf, segment, get_desc().c_str()), - /*preserve_backend_tensor_data_map=*/true, - /*no_return=*/!is_last || no_return, + true, + !keep_segment_params, + true, + !is_last || no_return, &future_cut_names); ggml_free(segment_graph_ctx); if (!segment_output.has_value()) { @@ -2978,11 +2609,6 @@ public: return std::nullopt; } output = std::move(segment_output); - - if (segment.residency == sd::ggml_graph_cut::SegmentResidency::STREAMED) { - layer_registry_.move_layer_to_cpu(segment.group_name); - } - (void)t_segment_begin; } backend_tensor_data_map.clear(); @@ -2991,6 +2617,17 @@ public: return output; } +public: + void runner_done() { + free_compute_buffer(); + std::vector tensors_to_release = std::move(this->runner_param_tensors); + this->runner_param_tensors.clear(); + runner_param_tensor_set.clear(); + kept_compute_param_tensor_set.clear(); + free_compute_backend_param_tensors(tensors_to_release); + free_params_backend_param_tensors(tensors_to_release); + } + public: virtual std::string get_desc() = 0; @@ -3000,11 +2637,9 @@ public: GGML_ASSERT(runtime_backend != nullptr); GGML_ASSERT(params_backend != nullptr); alloc_params_ctx(); - layer_registry_.set_backends(runtime_backend, params_backend); } virtual ~GGMLRunner() { - restore_resident_params(); free_params_buffer(); free_compute_buffer(); free_params_ctx(); @@ -3028,6 +2663,9 @@ public: runner_ctx.cache_tensor = [this](const std::string& name, ggml_tensor* tensor) { this->cache(name, tensor); }; + runner_ctx.set_backend_tensor_data = [this](ggml_tensor* tensor, const void* data) { + this->set_backend_tensor_data(tensor, data); + }; return runner_ctx; } @@ -3087,9 +2725,8 @@ public: return true; } +protected: void free_params_buffer() { - // 
Restore swapped resident params before freeing their backing buffer. - restore_resident_params(); if (params_buffer != nullptr) { ggml_backend_buffer_free(params_buffer); params_buffer = nullptr; @@ -3104,6 +2741,7 @@ public: return 0; } +public: void free_cache_ctx_and_buffer() { free_cache_buffer(); free_cache_ctx(); @@ -3114,8 +2752,6 @@ public: ggml_gallocr_free(compute_allocr); compute_allocr = nullptr; } - restore_partial_params(); - restore_all_params(); } // do copy after alloc graph @@ -3180,47 +2816,57 @@ public: template std::optional> compute(get_graph_cb_t get_graph, int n_threads, - bool free_compute_buffer_immediately, - bool no_return = false) { + bool auto_free = true, + bool free_compute_buffer = true, + bool free_compute_params = true, + bool no_return = false) { + struct RunnerDoneGuard { + RunnerDoneGuard(GGMLRunner* runner, bool enabled) + : runner(runner), + enabled(enabled) {} + + ~RunnerDoneGuard() { + if (enabled && runner != nullptr) { + runner->runner_done(); + } + } + + RunnerDoneGuard(const RunnerDoneGuard&) = delete; + RunnerDoneGuard& operator=(const RunnerDoneGuard&) = delete; + + GGMLRunner* runner = nullptr; + bool enabled = false; + }; + RunnerDoneGuard runner_done_guard(this, auto_free); + ggml_cgraph* gf = nullptr; if (!prepare_compute_graph(get_graph, &gf)) { return std::nullopt; } GGML_ASSERT(gf != nullptr); + rebuild_params_tensor_set(); if (can_attempt_graph_cut_segmented_compute()) { GraphCutPlan plan; - size_t effective_graph_vram_bytes = 0; - if (!resolve_graph_cut_plan(gf, &plan, &effective_graph_vram_bytes)) { + if (!resolve_graph_cut_plan(gf, &plan)) { free_compute_ctx(); return std::nullopt; } if (should_use_graph_cut_segmented_compute(plan)) { - if (stream_layers_enabled) { - return compute_streaming_segments(gf, - plan, - effective_graph_vram_bytes, - n_threads, - free_compute_buffer_immediately, - no_return); - } - return compute_with_graph_cuts(gf, - plan, - n_threads, - free_compute_buffer_immediately, - 
no_return); + return compute_graph_cut_segments(gf, + plan, + n_threads, + stream_layers_enabled, + no_return); } } - if (!alloc_compute_buffer(gf)) { - LOG_ERROR("%s alloc compute buffer failed", get_desc().c_str()); - return std::nullopt; - } return execute_graph(gf, n_threads, - free_compute_buffer_immediately, - {}, + free_compute_buffer, + free_compute_params, false, - no_return); + no_return, + nullptr); } void set_flash_attention_enabled(bool enabled) { @@ -3240,6 +2886,15 @@ public: weight_adapter = adapter; } + void set_weight_manager(const std::shared_ptr& manager) { + weight_manager = manager; + } + + void set_weight_manager(const std::shared_ptr& manager, + const std::string&) { + set_weight_manager(manager); + } + void set_max_graph_vram_bytes(size_t max_vram_bytes) { max_graph_vram_bytes = max_vram_bytes; } @@ -3248,8 +2903,6 @@ public: stream_layers_enabled = enabled; } - sd::layer_registry::LayerRegistry& get_layer_registry() { return layer_registry_; } - ggml_backend_t get_runtime_backend() { return runtime_backend; } diff --git a/src/core/ggml_graph_cut.cpp b/src/core/ggml_graph_cut.cpp index c77a4544..08312aab 100644 --- a/src/core/ggml_graph_cut.cpp +++ b/src/core/ggml_graph_cut.cpp @@ -44,7 +44,9 @@ namespace sd::ggml_graph_cut { if (tensor == nullptr) { return false; } - return params_tensor_set.find(tensor) != params_tensor_set.end(); + return params_tensor_set.find(tensor) != params_tensor_set.end() || + (tensor->view_src != nullptr && + params_tensor_set.find(tensor->view_src) != params_tensor_set.end()); } static int graph_node_index_by_name(ggml_cgraph* gf, const char* name) { @@ -135,6 +137,24 @@ namespace sd::ggml_graph_cut { return max_vram_bytes_to_gib(resolve_auto_max_vram_bytes(-max_vram, backend)); } + static bool is_segment_output_needed_after(const Plan& plan, + size_t end_segment_index, + int output_node_index) { + if (end_segment_index + 1 >= plan.segments.size()) { + return false; + } + for (size_t seg_idx = end_segment_index 
+ 1; seg_idx < plan.segments.size(); ++seg_idx) { + const auto& segment = plan.segments[seg_idx]; + for (const auto& input_ref : segment.input_refs) { + if (input_ref.type == Segment::INPUT_PREVIOUS_CUT && + input_ref.node_index == output_node_index) { + return true; + } + } + } + return false; + } + static Segment make_segment_seed(const Plan& plan, size_t start_segment_index, size_t end_segment_index) { @@ -147,8 +167,11 @@ namespace sd::ggml_graph_cut { const auto& target_segment = plan.segments[end_segment_index]; std::unordered_set seen_output_node_indices; for (size_t seg_idx = start_segment_index; seg_idx <= end_segment_index; ++seg_idx) { + const bool is_boundary_segment = seg_idx == end_segment_index; for (int output_node_index : plan.segments[seg_idx].output_node_indices) { - if (seen_output_node_indices.insert(output_node_index).second) { + if ((is_boundary_segment || + is_segment_output_needed_after(plan, end_segment_index, output_node_index)) && + seen_output_node_indices.insert(output_node_index).second) { seed.output_node_indices.push_back(output_node_index); } } @@ -400,23 +423,6 @@ namespace sd::ggml_graph_cut { return tensors; } - std::vector runtime_param_tensors(ggml_cgraph* gf, const Segment& segment, const char* log_desc) { - std::vector tensors = param_tensors(gf, segment); - std::vector filtered_tensors; - filtered_tensors.reserve(tensors.size()); - for (ggml_tensor* tensor : tensors) { - if (tensor_buffer(tensor) == nullptr) { - LOG_WARN("%s graph cut skipping param input without buffer: segment=%s tensor=%s", - log_desc == nullptr ? 
"unknown" : log_desc, - segment.group_name.c_str(), - tensor->name); - continue; - } - filtered_tensors.push_back(tensor); - } - return filtered_tensors; - } - std::unordered_set collect_future_input_names(ggml_cgraph* gf, const Plan& plan, size_t current_segment_index) { @@ -487,6 +493,44 @@ namespace sd::ggml_graph_cut { return 0; } + struct TensorRuntimeBinding { + ggml_backend_buffer_t buffer = nullptr; + void* data = nullptr; + void* extra = nullptr; + }; + std::unordered_map saved_bindings; + auto mark_measurement_external = [&](ggml_tensor* tensor) { + if (tensor == nullptr) { + return; + } + auto save_tensor = [&](ggml_tensor* t) { + if (t == nullptr || saved_bindings.find(t) != saved_bindings.end()) { + return; + } + saved_bindings[t] = {t->buffer, t->data, t->extra}; + // During real execution params and previous-cut inputs already + // have backend/cache buffers, so gallocr must not reserve them. + t->data = reinterpret_cast(static_cast(1)); + }; + save_tensor(tensor); + save_tensor(tensor->view_src); + }; + for (const auto& input : segment.input_refs) { + if (input.type != Segment::INPUT_PARAM && + input.type != Segment::INPUT_PREVIOUS_CUT) { + continue; + } + mark_measurement_external(input_tensor(gf, input)); + } + + std::unordered_map saved_output_flags; + for (int output_node_index : segment.output_node_indices) { + ggml_tensor* output = ggml_graph_node(gf, output_node_index); + if (output != nullptr && saved_output_flags.find(output) == saved_output_flags.end()) { + saved_output_flags[output] = output->flags; + } + } + ggml_context* graph_ctx = nullptr; ggml_cgraph* segment_graph = build_segment_graph(gf, segment, &graph_ctx); ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); @@ -502,6 +546,14 @@ namespace sd::ggml_graph_cut { ggml_gallocr_free(allocr); ggml_free(graph_ctx); + for (const auto& kv : saved_output_flags) { + kv.first->flags = kv.second; + } + for (const auto& kv : saved_bindings) { + 
kv.first->buffer = kv.second.buffer; + kv.first->data = kv.second.data; + kv.first->extra = kv.second.extra; + } return buffer_size; } @@ -669,7 +721,8 @@ namespace sd::ggml_graph_cut { GGML_ASSERT(!candidate_plan.segments.empty()); const auto& candidate_segment = candidate_plan.segments.back(); - if (graph_cut_segment_vram_bytes(candidate_segment) > max_graph_vram_bytes) { + const size_t candidate_bytes = graph_cut_segment_vram_bytes(candidate_segment); + if (candidate_bytes > max_graph_vram_bytes) { break; } diff --git a/src/core/ggml_graph_cut.h b/src/core/ggml_graph_cut.h index 7919acad..01e9b3ad 100644 --- a/src/core/ggml_graph_cut.h +++ b/src/core/ggml_graph_cut.h @@ -80,7 +80,6 @@ namespace sd::ggml_graph_cut { ggml_tensor* output_tensor(ggml_cgraph* gf, const Segment& segment, size_t output_index); ggml_tensor* input_tensor(ggml_cgraph* gf, const Segment::InputRef& input_ref); std::vector param_tensors(ggml_cgraph* gf, const Segment& segment); - std::vector runtime_param_tensors(ggml_cgraph* gf, const Segment& segment, const char* log_desc); std::unordered_set collect_future_input_names(ggml_cgraph* gf, const Plan& plan, size_t current_segment_index); diff --git a/src/core/layer_registry.cpp b/src/core/layer_registry.cpp deleted file mode 100644 index 65771ee9..00000000 --- a/src/core/layer_registry.cpp +++ /dev/null @@ -1,132 +0,0 @@ -#include "core/layer_registry.h" - -#include - -#include "core/util.h" - -namespace sd::layer_registry { - - void LayerRegistry::register_layer(const std::string& name, ggml_tensor* tensor) { - auto& info = layers_[name]; - info.tensors.push_back(tensor); - info.bytes += ggml_nbytes(tensor); - } - - bool LayerRegistry::move_layer_to_gpu(const std::string& name) { - auto it = layers_.find(name); - if (it == layers_.end()) - return false; - - LayerInfo& info = it->second; - if (info.on_gpu) - return true; - if (gpu_backend_ == nullptr || cpu_backend_ == nullptr) { - LOG_ERROR("layer_registry: backends not set; cannot move '%s' 
to GPU", - name.c_str()); - return false; - } - if (info.tensors.empty()) { - info.on_gpu = true; - return true; - } - - // 1. Build a no_alloc context big enough to hold one twin tensor per CPU - // tensor, plus a little overhead. - const size_t ctx_size = info.tensors.size() * ggml_tensor_overhead() + 1024; - ggml_init_params ctx_params{ctx_size, /*mem_buffer=*/nullptr, /*no_alloc=*/true}; - ggml_context* twin_ctx = ggml_init(ctx_params); - if (twin_ctx == nullptr) { - LOG_ERROR("layer_registry: failed to allocate twin context for '%s'", - name.c_str()); - return false; - } - - // 2. Create one GPU twin per CPU tensor. The twin shares the original - // name so any name-based lookup keeps working. - std::vector gpu_twins; - gpu_twins.reserve(info.tensors.size()); - for (ggml_tensor* cpu_t : info.tensors) { - ggml_tensor* twin = ggml_dup_tensor(twin_ctx, cpu_t); - if (cpu_t->name[0] != '\0') { - ggml_set_name(twin, cpu_t->name); - } - gpu_twins.push_back(twin); - } - - // 3. Back the twins with a GPU buffer in one alloc call. - ggml_backend_buffer_t gpu_buffer = ggml_backend_alloc_ctx_tensors(twin_ctx, gpu_backend_); - if (gpu_buffer == nullptr) { - LOG_ERROR("layer_registry: failed to allocate GPU buffer for '%s'", - name.c_str()); - ggml_free(twin_ctx); - return false; - } - - // 4. H2D copy + sync. - for (size_t i = 0; i < info.tensors.size(); ++i) { - ggml_backend_tensor_copy(info.tensors[i], gpu_twins[i]); - } - ggml_backend_synchronize(gpu_backend_); - - // 5. Swap buffer/data/extra so the originals now point at GPU memory. 
- for (size_t i = 0; i < info.tensors.size(); ++i) { - std::swap(info.tensors[i]->buffer, gpu_twins[i]->buffer); - std::swap(info.tensors[i]->data, gpu_twins[i]->data); - std::swap(info.tensors[i]->extra, gpu_twins[i]->extra); - } - - info.gpu_twins = std::move(gpu_twins); - info.twin_ctx = twin_ctx; - info.gpu_buffer = gpu_buffer; - info.on_gpu = true; - return true; - } - - bool LayerRegistry::move_layer_to_cpu(const std::string& name) { - auto it = layers_.find(name); - if (it == layers_.end()) - return false; - - LayerInfo& info = it->second; - if (!info.on_gpu) - return true; - if (info.tensors.size() != info.gpu_twins.size()) { - LOG_ERROR("layer_registry: twin/tensor count mismatch for '%s'", - name.c_str()); - return false; - } - - // 1. Swap back: originals point at CPU memory again. - for (size_t i = 0; i < info.tensors.size(); ++i) { - if (info.gpu_twins[i] == nullptr) - continue; - std::swap(info.tensors[i]->buffer, info.gpu_twins[i]->buffer); - std::swap(info.tensors[i]->data, info.gpu_twins[i]->data); - std::swap(info.tensors[i]->extra, info.gpu_twins[i]->extra); - } - - // 2. Free the GPU buffer + twin context. - if (info.gpu_buffer != nullptr) { - ggml_backend_buffer_free(info.gpu_buffer); - info.gpu_buffer = nullptr; - } - if (info.twin_ctx != nullptr) { - ggml_free(info.twin_ctx); - info.twin_ctx = nullptr; - } - info.gpu_twins.clear(); - info.on_gpu = false; - return true; - } - - bool LayerRegistry::is_layer_on_gpu(const std::string& name) const { - auto it = layers_.find(name); - return it != layers_.end() && it->second.on_gpu; - } - - size_t LayerRegistry::get_layer_size(const std::string& name) const { - auto it = layers_.find(name); - return it != layers_.end() ? 
it->second.bytes : 0; - } - -} // namespace sd::layer_registry diff --git a/src/core/layer_registry.h b/src/core/layer_registry.h deleted file mode 100644 index c0b980d0..00000000 --- a/src/core/layer_registry.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef __SD_CORE_LAYER_REGISTRY_H__ -#define __SD_CORE_LAYER_REGISTRY_H__ - -#include -#include -#include -#include - -#include "ggml-backend.h" -#include "ggml.h" - -namespace sd::layer_registry { - - struct LayerInfo { - std::vector tensors; - std::vector gpu_twins; - ggml_context* twin_ctx = nullptr; - ggml_backend_buffer_t gpu_buffer = nullptr; - bool on_gpu = false; - size_t bytes = 0; - }; - - class LayerRegistry { - public: - LayerRegistry() = default; - LayerRegistry(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend) - : gpu_backend_(gpu_backend), cpu_backend_(cpu_backend) {} - - void set_backends(ggml_backend_t gpu_backend, ggml_backend_t cpu_backend) { - gpu_backend_ = gpu_backend; - cpu_backend_ = cpu_backend; - } - void register_layer(const std::string& name, ggml_tensor* tensor); - bool move_layer_to_gpu(const std::string& name); - bool move_layer_to_cpu(const std::string& name); - bool is_layer_on_gpu(const std::string& name) const; - size_t get_layer_size(const std::string& name) const; - size_t get_layer_count() const { return layers_.size(); } - - const std::map& layers() const { return layers_; } - - private: - ggml_backend_t gpu_backend_ = nullptr; - ggml_backend_t cpu_backend_ = nullptr; - std::map layers_; - }; - -} // namespace sd::layer_registry - -#endif // __SD_CORE_LAYER_REGISTRY_H__ diff --git a/src/core/util.cpp b/src/core/util.cpp index 61101a08..7325607e 100644 --- a/src/core/util.cpp +++ b/src/core/util.cpp @@ -488,7 +488,7 @@ bool parse_strict_bool(const std::string& text, bool& value) { return false; } -static std::string build_progress_bar(int step, int steps) { +static std::string build_progress_bar(int step, int steps, char progress_char = '=', bool show_head = true) { std::string 
progress = " |"; int max_progress = 50; int32_t current = 0; @@ -498,21 +498,21 @@ static std::string build_progress_bar(int step, int steps) { for (int i = 0; i < 50; i++) { if (i > current) { progress += " "; - } else if (i == current && i != max_progress - 1) { + } else if (show_head && i == current && i != max_progress - 1) { progress += ">"; } else { - progress += "="; + progress += progress_char; } } progress += "|"; return progress; } -static void print_progress_line(int step, int steps, const std::string& speed_text) { +static void print_progress_line(int step, int steps, const std::string& speed_text, char progress_char = '=', bool show_head = true) { if (step == 0) { return; } - std::string progress = build_progress_bar(step, steps); + std::string progress = build_progress_bar(step, steps, progress_char, show_head); const char* lf = (step == steps ? "\n" : ""); printf("\r%s %i/%i - %s\033[K%s", progress.c_str(), step, steps, speed_text.c_str(), lf); fflush(stdout); // for linux @@ -552,9 +552,9 @@ void pretty_bytes_progress(int step, int steps, uint64_t bytes_processed, float double speed_mb = bytes_per_second / (1024.0 * 1024.0); if (speed_mb >= 1024.0) { - print_progress_line(step, steps, sd_format("%.2fGB/s", speed_mb / 1024.0)); + print_progress_line(step, steps, sd_format("%.2fGB/s", speed_mb / 1024.0), '#', false); } else { - print_progress_line(step, steps, sd_format("%.2fMB/s", speed_mb)); + print_progress_line(step, steps, sd_format("%.2fMB/s", speed_mb), '#', false); } } diff --git a/src/extensions/generation_extension.h b/src/extensions/generation_extension.h index 1e6d1341..0f8e1263 100644 --- a/src/extensions/generation_extension.h +++ b/src/extensions/generation_extension.h @@ -6,10 +6,12 @@ #include #include #include +#include #include "conditioning/conditioner.hpp" #include "core/ggml_extend_backend.h" #include "model_loader.h" +#include "model_manager.h" #include "stable-diffusion.h" struct GenerationExtensionInitContext { @@ -23,21 
+25,12 @@ struct GenerationExtensionInitContext { std::function params_backend_for; }; -struct GenerationExtensionTensorContext { - std::map& tensors; - std::map& mmap_able_tensors; - std::function module_can_mmap; -}; - struct GenerationExtensionConditionContext { Conditioner* conditioner; ConditionerParams& condition_params; const sd_pm_params_t& pm_params; - std::map& tensors; - SDVersion version; int n_threads; int total_steps; - bool free_params_immediately; }; struct GenerationExtension { @@ -50,14 +43,11 @@ struct GenerationExtension { virtual bool init(const GenerationExtensionInitContext&) { return true; } - virtual void collect_param_tensors(GenerationExtensionTensorContext&) {} + virtual void get_param_tensors(std::map&) {} + virtual void collect_loras(std::vector&) {} virtual void add_ignore_tensors(std::set&) const {} - virtual bool alloc_params_buffer() { - return true; - } - virtual size_t get_params_buffer_size() const { - return 0; - } + virtual void set_weight_manager(const std::shared_ptr&) {} + virtual void runner_done() {} virtual void reset_runtime_condition() {} virtual bool prepare_condition(GenerationExtensionConditionContext&) { return false; diff --git a/src/extensions/photomaker_extension.cpp b/src/extensions/photomaker_extension.cpp index ac3949a1..cbeb7c41 100644 --- a/src/extensions/photomaker_extension.cpp +++ b/src/extensions/photomaker_extension.cpp @@ -7,7 +7,6 @@ #include "core/tensor_ggml.hpp" #include "core/util.h" -#include "model/adapter/lora.hpp" #include "model/adapter/pmid.hpp" static std::tuple, std::vector, std::vector> @@ -103,7 +102,6 @@ static std::string remove_photomaker_trigger_from_prompt(FrozenCLIPEmbedderWithC struct PhotoMakerExtension : public GenerationExtension { std::shared_ptr pmid_model; - std::shared_ptr pmid_lora; bool enabled = false; std::string model_path; std::string trigger_word = "img"; @@ -129,7 +127,13 @@ struct PhotoMakerExtension : public GenerationExtension { } PMVersion pm_version = 
std::strstr(model_path.c_str(), "v2") != nullptr ? PM_VERSION_2 : PM_VERSION_1; - pmid_model = std::make_shared(ctx.backend_for(SDBackendModule::PHOTOMAKER), + LOG_INFO("loading stacked ID embedding (PHOTOMAKER) model file from '%s'", model_path.c_str()); + if (!ctx.model_loader.init_from_file_and_convert_name(model_path, "pmid.")) { + LOG_WARN("loading stacked ID embedding from '%s' failed", model_path.c_str()); + return true; + } + + pmid_model = std::make_shared(ctx.backend_for(SDBackendModule::PHOTOMAKER), ctx.params_backend_for(SDBackendModule::PHOTOMAKER), ctx.tensor_storage_map, "pmid", @@ -139,44 +143,28 @@ struct PhotoMakerExtension : public GenerationExtension { LOG_INFO("using PhotoMaker Version 2"); } - pmid_lora = std::make_shared("pmid", - ctx.backend_for(SDBackendModule::PHOTOMAKER), - ctx.params_backend_for(SDBackendModule::PHOTOMAKER), - model_path, - "", - ctx.version); - auto lora_tensor_filter = [&](const std::string& tensor_name) { - return starts_with(tensor_name, "lora.model"); - }; - if (!pmid_lora->load_from_file(ctx.n_threads, lora_tensor_filter)) { - LOG_WARN("load photomaker lora tensors from %s failed", model_path.c_str()); - return false; - } - - LOG_INFO("loading stacked ID embedding (PHOTOMAKER) model file from '%s'", model_path.c_str()); - if (!ctx.model_loader.init_from_file_and_convert_name(model_path, "pmid.")) { - LOG_WARN("loading stacked ID embedding from '%s' failed", model_path.c_str()); - return true; - } - enabled = true; return true; } - void collect_param_tensors(GenerationExtensionTensorContext& ctx) override { + void get_param_tensors(std::map& tensors) override { if (!enabled || pmid_model == nullptr) { return; } - std::map temp; - pmid_model->get_param_tensors(temp, "pmid"); - bool do_mmap = ctx.module_can_mmap(SDBackendModule::PHOTOMAKER); - for (const auto& [key, tensor] : temp) { - ctx.tensors[key] = tensor; - if (do_mmap) { - ctx.mmap_able_tensors[key] = tensor; - } + pmid_model->get_param_tensors(tensors, 
"pmid"); + } + + void collect_loras(std::vector& loras) override { + if (!enabled || model_path.empty()) { + return; } + ModelManager::LoraSpec lora; + lora.path = model_path; + lora.multiplier = 1.0f; + lora.tensor_name_prefix_filter = "lora.model"; + lora.required = true; + loras.push_back(std::move(lora)); } void add_ignore_tensors(std::set& ignore_tensors) const override { @@ -186,18 +174,16 @@ struct PhotoMakerExtension : public GenerationExtension { ignore_tensors.insert("pmid.unet."); } - bool alloc_params_buffer() override { - if (!enabled || pmid_model == nullptr) { - return true; + void set_weight_manager(const std::shared_ptr& manager) override { + if (pmid_model != nullptr) { + pmid_model->set_weight_manager(manager); } - return pmid_model->alloc_params_buffer(); } - size_t get_params_buffer_size() const override { - if (!enabled || pmid_model == nullptr) { - return 0; + void runner_done() override { + if (pmid_model != nullptr) { + pmid_model->runner_done(); } - return pmid_model->get_params_buffer_size(); } void reset_runtime_condition() override { @@ -207,21 +193,10 @@ struct PhotoMakerExtension : public GenerationExtension { bool prepare_condition(GenerationExtensionConditionContext& ctx) override { reset_runtime_condition(); - if (!enabled || pmid_model == nullptr || pmid_lora == nullptr) { + if (!enabled || pmid_model == nullptr) { return false; } - if (!pmid_lora->applied) { - int64_t t0 = ggml_time_ms(); - pmid_lora->apply(ctx.tensors, ctx.version, ctx.n_threads); - int64_t t1 = ggml_time_ms(); - pmid_lora->applied = true; - LOG_INFO("pmid_lora apply completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - if (ctx.free_params_immediately) { - pmid_lora->free_params_buffer(); - } - } - bool pmv2 = pmid_model->get_version() == PM_VERSION_2; if (ctx.pm_params.id_images_count <= 0 || ctx.pm_params.id_images == nullptr) { LOG_WARN("Provided PhotoMaker model file, but NO input ID images"); @@ -305,9 +280,6 @@ struct PhotoMakerExtension : public 
GenerationExtension { LOG_INFO("Photomaker ID Stacking, taking %" PRId64 " ms", t1 - t0); LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step); - if (ctx.free_params_immediately) { - pmid_model->free_params_buffer(); - } return true; } diff --git a/src/model/adapter/lora.hpp b/src/model/adapter/lora.hpp index 7ab8a8de..850f6c10 100644 --- a/src/model/adapter/lora.hpp +++ b/src/model/adapter/lora.hpp @@ -71,7 +71,8 @@ struct LoraModel : public GGMLRunner { return true; }; - model_loader.load_tensors(on_new_tensor_cb, n_threads); + model_loader.set_n_threads(n_threads); + model_loader.load_tensors(on_new_tensor_cb); if (tensors_to_create.empty()) { return true; @@ -93,19 +94,39 @@ struct LoraModel : public GGMLRunner { } dry_run = false; - model_loader.load_tensors(on_new_tensor_cb, n_threads); + model_loader.load_tensors(on_new_tensor_cb); LOG_DEBUG("finished loaded lora"); return true; } - void preprocess_lora_tensors(const std::map& model_tensors) { + void release_loaded_tensors() { + free_compute_buffer(); + free_params_buffer(); + free_params_ctx(); + alloc_params_ctx(); + lora_tensors.clear(); + original_tensor_to_final_tensor.clear(); + applied_lora_tensors.clear(); + applied = false; + tensor_preprocessed = false; + } + + static std::set tensor_names(const std::map& model_tensors) { + std::set names; + for (const auto& item : model_tensors) { + names.insert(item.first); + } + return names; + } + + void preprocess_lora_tensors(const std::set& model_tensor_names) { if (tensor_preprocessed) { return; } tensor_preprocessed = true; // I really hate these hardcoded processes. 
- if (model_tensors.find("cond_stage_model.1.transformer.text_model.encoder.layers.0.self_attn.in_proj.weight") != model_tensors.end()) { + if (model_tensor_names.find("cond_stage_model.1.transformer.text_model.encoder.layers.0.self_attn.in_proj.weight") != model_tensor_names.end()) { std::unordered_map new_lora_tensors; for (auto& [old_name, tensor] : lora_tensors) { std::string new_name = old_name; @@ -753,11 +774,13 @@ struct LoraModel : public GGMLRunner { return out_diff; } - ggml_cgraph* build_lora_graph(const std::map& model_tensors, SDVersion version) { + ggml_cgraph* build_lora_graph(const std::map& model_tensors, + const std::set& model_tensor_names, + SDVersion version) { size_t lora_graph_size = LORA_GRAPH_BASE_SIZE + lora_tensors.size() * 10; ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, lora_graph_size, false); - preprocess_lora_tensors(model_tensors); + preprocess_lora_tensors(model_tensor_names); original_tensor_to_final_tensor.clear(); applied_lora_tensors.clear(); @@ -794,12 +817,16 @@ struct LoraModel : public GGMLRunner { return gf; } - void apply(std::map model_tensors, SDVersion version, int n_threads) { + void apply(std::map model_tensors, + const std::set& model_tensor_names, + SDVersion version, + int n_threads, + bool warn_unused = true) { auto get_graph = [&]() -> ggml_cgraph* { - return build_lora_graph(model_tensors, version); + return build_lora_graph(model_tensors, model_tensor_names, version); }; - GGMLRunner::compute(get_graph, n_threads, false, true); - stat(); + GGMLRunner::compute(get_graph, n_threads, false, false, false, true); + stat(!warn_unused); for (auto item : original_tensor_to_final_tensor) { ggml_tensor* original_tensor = item.first; ggml_tensor* final_tensor = item.second; @@ -810,6 +837,10 @@ struct LoraModel : public GGMLRunner { GGMLRunner::free_compute_buffer(); } + void apply(std::map model_tensors, SDVersion version, int n_threads, bool warn_unused = true) { + apply(model_tensors, 
tensor_names(model_tensors), version, n_threads, warn_unused); + } + void stat(bool at_runntime = false) { size_t total_lora_tensors_count = 0; size_t applied_lora_tensors_count = 0; diff --git a/src/model/adapter/pmid.hpp b/src/model/adapter/pmid.hpp index 3cf59a47..6773734d 100644 --- a/src/model/adapter/pmid.hpp +++ b/src/model/adapter/pmid.hpp @@ -558,7 +558,7 @@ public: return build_graph(id_pixel_values, prompt_embeds, class_tokens_mask, id_embeds); }; - return take_or_empty(GGMLRunner::compute(get_graph, n_threads, true)); + return take_or_empty(GGMLRunner::compute(get_graph, n_threads, true, true, true)); } }; @@ -616,14 +616,15 @@ struct PhotoMakerIDEmbed : public GGMLRunner { return true; }; - model_loader->load_tensors(on_new_tensor_cb, n_threads); + model_loader->set_n_threads(n_threads); + model_loader->load_tensors(on_new_tensor_cb); if (!alloc_params_buffer()) { LOG_ERROR("PhotoMaker ID embeds buffer allocation failed"); return false; } dry_run = false; - model_loader->load_tensors(on_new_tensor_cb, n_threads); + model_loader->load_tensors(on_new_tensor_cb); LOG_DEBUG("finished loading PhotoMaker ID Embeds "); return true; diff --git a/src/model/diffusion/anima.hpp b/src/model/diffusion/anima.hpp index 49c2e45a..7bf765fe 100644 --- a/src/model/diffusion/anima.hpp +++ b/src/model/diffusion/anima.hpp @@ -697,7 +697,7 @@ namespace Anima { auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x, timesteps, context, t5_ids, t5_weights); }; - return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false, false, false), x.dim()); } sd::Tensor compute(int n_threads, diff --git a/src/model/diffusion/control.hpp b/src/model/diffusion/control.hpp index 2f5eb574..d8316b7b 100644 --- a/src/model/diffusion/control.hpp +++ b/src/model/diffusion/control.hpp @@ -309,6 +309,7 @@ public: struct ControlNet : public GGMLRunner { SDVersion 
version = VERSION_SD1; ControlNetBlock control_net; + std::string weight_prefix; ggml_backend_buffer_t control_buffer = nullptr; ggml_context* control_ctx = nullptr; @@ -321,9 +322,10 @@ struct ControlNet : public GGMLRunner { ControlNet(ggml_backend_t backend, ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, - SDVersion version = VERSION_SD1) - : GGMLRunner(backend, params_backend), control_net(version) { - control_net.init(params_ctx, tensor_storage_map, ""); + SDVersion version = VERSION_SD1, + const std::string& prefix = "") + : GGMLRunner(backend, params_backend), version(version), control_net(version), weight_prefix(prefix) { + control_net.init(params_ctx, tensor_storage_map, prefix); } ~ControlNet() override { @@ -374,8 +376,8 @@ struct ControlNet : public GGMLRunner { return "control_net"; } - void get_param_tensors(std::map& tensors, const std::string prefix) { - control_net.get_param_tensors(tensors, prefix); + void get_param_tensors(std::map& tensors) { + control_net.get_param_tensors(tensors, weight_prefix); } ggml_cgraph* build_graph(const sd::Tensor& x_tensor, @@ -435,7 +437,7 @@ struct ControlNet : public GGMLRunner { return build_graph(x, hint, timesteps, context, y); }; - auto compute_result = GGMLRunner::compute(get_graph, n_threads, false); + auto compute_result = GGMLRunner::compute(get_graph, n_threads, false, false, false); if (!compute_result.has_value()) { return std::nullopt; } @@ -472,7 +474,8 @@ struct ControlNet : public GGMLRunner { return false; } - bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads); + model_loader.set_n_threads(n_threads); + bool success = model_loader.load_tensors(tensors, ignore_tensors); if (!success) { LOG_ERROR("load control net tensors from model loader failed"); diff --git a/src/model/diffusion/ernie_image.hpp b/src/model/diffusion/ernie_image.hpp index 09bcba3b..abb14dab 100644 --- a/src/model/diffusion/ernie_image.hpp +++ 
b/src/model/diffusion/ernie_image.hpp @@ -440,7 +440,7 @@ namespace ErnieImage { auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x, timesteps, context); }; - return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false, false, false), x.dim()); } sd::Tensor compute(int n_threads, diff --git a/src/model/diffusion/flux.hpp b/src/model/diffusion/flux.hpp index 1d01041b..3181a113 100644 --- a/src/model/diffusion/flux.hpp +++ b/src/model/diffusion/flux.hpp @@ -1500,7 +1500,7 @@ namespace Flux { return build_graph(x, timesteps, context, c_concat, y, guidance, ref_latents, increase_ref_index, skip_layers); }; - auto result = restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); + auto result = restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false, false, false), x.dim()); return result; } diff --git a/src/model/diffusion/hidream_o1.hpp b/src/model/diffusion/hidream_o1.hpp index 3d384def..8ea4f7f5 100644 --- a/src/model/diffusion/hidream_o1.hpp +++ b/src/model/diffusion/hidream_o1.hpp @@ -323,11 +323,15 @@ namespace HiDreamO1 { return gf; } - sd::Tensor compute(int n_threads, const sd::Tensor& image) { + sd::Tensor compute(int n_threads, + const sd::Tensor& image, + bool auto_free = true, + bool free_compute_buffer = true, + bool free_compute_params = true) { auto get_graph = [&]() { return build_graph(image); }; - auto output = GGMLRunner::compute(get_graph, n_threads, false); + auto output = GGMLRunner::compute(get_graph, n_threads, auto_free, free_compute_buffer, free_compute_params); return output.has_value() ? 
std::move(output.value()) : sd::Tensor(); } }; @@ -455,7 +459,7 @@ namespace HiDreamO1 { auto get_graph = [&]() { return build_graph(x, timestep, input_ids, input_pos, token_types, vinput_mask, image_embeds, ref_images); }; - return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false, false, false), x.dim()); } sd::Tensor compute(int n_threads, @@ -494,21 +498,6 @@ namespace HiDreamO1 { vision_runner->get_param_tensors(tensors); } - bool alloc_params_buffer() override { - if (!vision_runner->alloc_params_buffer()) { - return false; - } - return true; - } - - void free_params_buffer() override { - vision_runner->free_params_buffer(); - } - - size_t get_params_buffer_size() override { - return vision_runner->get_params_buffer_size(); - } - void set_max_graph_vram_bytes(size_t max_graph_vram_bytes) override { vision_runner->set_max_graph_vram_bytes(max_graph_vram_bytes); } @@ -521,6 +510,14 @@ namespace HiDreamO1 { vision_runner->set_weight_adapter(adapter); } + void set_weight_manager(const std::shared_ptr& manager) override { + vision_runner->set_weight_manager(manager); + } + + void runner_done() override { + vision_runner->runner_done(); + } + SDCondition get_learned_condition(int n_threads, const ConditionerParams& conditioner_params) override { SDCondition result; @@ -666,7 +663,7 @@ namespace HiDreamO1 { result.c_vinput_mask = sd::Tensor(vinput_mask_shape, std::move(vinput_mask)); result.c_image_embeds.reserve(vlm_images.size()); for (const auto& vlm_image : vlm_images) { - auto image_embed = vision_runner->compute(n_threads, vlm_image.second); + auto image_embed = vision_runner->compute(n_threads, vlm_image.second, false, true, true); if (image_embed.empty()) { LOG_ERROR("hidream_o1 conditioner: encode VLM image failed"); return SDCondition(); diff --git a/src/model/diffusion/ideogram4.hpp b/src/model/diffusion/ideogram4.hpp index 
2f53c787..330543c3 100644 --- a/src/model/diffusion/ideogram4.hpp +++ b/src/model/diffusion/ideogram4.hpp @@ -537,7 +537,7 @@ namespace Ideogram4 { auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x, timesteps, context, use_uncond_model); }; - return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false, false, false), x.dim()); } sd::Tensor compute(int n_threads, diff --git a/src/model/diffusion/lens.hpp b/src/model/diffusion/lens.hpp index 740ff824..32de8537 100644 --- a/src/model/diffusion/lens.hpp +++ b/src/model/diffusion/lens.hpp @@ -408,7 +408,7 @@ namespace Lens { auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x, timesteps, context); }; - return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false, false, false), x.dim()); } sd::Tensor compute(int n_threads, diff --git a/src/model/diffusion/ltxv.hpp b/src/model/diffusion/ltxv.hpp index a86b4cf5..455dc4b2 100644 --- a/src/model/diffusion/ltxv.hpp +++ b/src/model/diffusion/ltxv.hpp @@ -1939,7 +1939,7 @@ namespace LTXV { auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x, timesteps, context, audio_x, audio_timesteps, audio_length, frame_rate, video_positions); }; - auto out = restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); + auto out = restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false, false, false), x.dim()); return out; } diff --git a/src/model/diffusion/mmdit.hpp b/src/model/diffusion/mmdit.hpp index 84433945..0f6c2d30 100644 --- a/src/model/diffusion/mmdit.hpp +++ b/src/model/diffusion/mmdit.hpp @@ -935,7 +935,7 @@ struct MMDiTRunner : public DiffusionModelRunner { return build_graph(x, timesteps, context, y, skip_layers); }; - return 
restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false, false, false), x.dim()); } sd::Tensor compute(int n_threads, diff --git a/src/model/diffusion/pid.hpp b/src/model/diffusion/pid.hpp index ac585151..a0dfb324 100644 --- a/src/model/diffusion/pid.hpp +++ b/src/model/diffusion/pid.hpp @@ -823,7 +823,7 @@ namespace Pid { auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x, timesteps, context, lq_latent, degrade_sigma); }; - return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false, false, false), x.dim()); } sd::Tensor compute(int n_threads, diff --git a/src/model/diffusion/qwen_image.hpp b/src/model/diffusion/qwen_image.hpp index 678c3467..1113a922 100644 --- a/src/model/diffusion/qwen_image.hpp +++ b/src/model/diffusion/qwen_image.hpp @@ -627,7 +627,7 @@ namespace Qwen { return build_graph(x, timesteps, context, ref_latents, increase_ref_index); }; - return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false, false, false), x.dim()); } sd::Tensor compute(int n_threads, diff --git a/src/model/diffusion/unet.hpp b/src/model/diffusion/unet.hpp index f1a96f9d..ab01a60b 100644 --- a/src/model/diffusion/unet.hpp +++ b/src/model/diffusion/unet.hpp @@ -772,7 +772,7 @@ struct UNetModelRunner : public DiffusionModelRunner { return build_graph(x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength); }; - return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false, false, false), x.dim()); } sd::Tensor compute(int n_threads, diff --git 
a/src/model/diffusion/wan.hpp b/src/model/diffusion/wan.hpp index 92f49dc2..fd56a0f5 100644 --- a/src/model/diffusion/wan.hpp +++ b/src/model/diffusion/wan.hpp @@ -950,7 +950,7 @@ namespace WAN { return build_graph(x, timesteps, context, clip_fea, c_concat, time_dim_concat, vace_context, vace_strength); }; - return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false, false, false), x.dim()); } sd::Tensor compute(int n_threads, diff --git a/src/model/diffusion/z_image.hpp b/src/model/diffusion/z_image.hpp index c35c4495..a7d08b09 100644 --- a/src/model/diffusion/z_image.hpp +++ b/src/model/diffusion/z_image.hpp @@ -634,7 +634,7 @@ namespace ZImage { return build_graph(x, timesteps, context, ref_latents, increase_ref_index); }; - return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false, false, false), x.dim()); } sd::Tensor compute(int n_threads, diff --git a/src/model/te/clip.hpp b/src/model/te/clip.hpp index 7b7c883e..6767a1b1 100644 --- a/src/model/te/clip.hpp +++ b/src/model/te/clip.hpp @@ -567,11 +567,14 @@ struct CLIPTextModelRunner : public GGMLRunner { void* custom_embeddings_data, size_t max_token_idx, bool return_pooled, - int clip_skip) { + int clip_skip, + bool auto_free = true, + bool free_compute_buffer = true, + bool free_compute_params = true) { auto get_graph = [&]() -> ggml_cgraph* { return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip); }; - auto result = GGMLRunner::compute(get_graph, n_threads, true); + auto result = GGMLRunner::compute(get_graph, n_threads, auto_free, free_compute_buffer, free_compute_params); if (return_pooled) { return take_or_empty(std::move(result)); } diff --git a/src/model/te/llm.hpp 
b/src/model/te/llm.hpp index 3a22e881..d8623bc3 100644 --- a/src/model/te/llm.hpp +++ b/src/model/te/llm.hpp @@ -1733,7 +1733,10 @@ namespace LLM { const sd::Tensor& attention_mask, const std::vector>>& image_embeds, std::set out_layers, - bool return_all_hidden_states = false) { + bool return_all_hidden_states = false, + bool auto_free = true, + bool free_compute_buffer = true, + bool free_compute_params = true) { auto get_graph = [&]() -> ggml_cgraph* { return build_graph(input_ids, attention_mask, @@ -1741,7 +1744,7 @@ namespace LLM { out_layers, return_all_hidden_states); }; - return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, true), + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, auto_free, free_compute_buffer, free_compute_params), input_ids.dim() + 1); } @@ -1802,11 +1805,14 @@ namespace LLM { } sd::Tensor encode_image(const int n_threads, - const sd::Tensor& image) { + const sd::Tensor& image, + bool auto_free = false, + bool free_compute_buffer = false, + bool free_compute_params = false) { auto get_graph = [&]() -> ggml_cgraph* { return build_encode_image_graph(image); }; - return take_or_empty(GGMLRunner::compute(get_graph, n_threads, false)); + return take_or_empty(GGMLRunner::compute(get_graph, n_threads, auto_free, free_compute_buffer, free_compute_params)); } }; diff --git a/src/model/te/t5.hpp b/src/model/te/t5.hpp index 41d9978e..9bde46fc 100644 --- a/src/model/te/t5.hpp +++ b/src/model/te/t5.hpp @@ -394,11 +394,14 @@ struct T5Runner : public GGMLRunner { sd::Tensor compute(const int n_threads, const sd::Tensor& input_ids, - const sd::Tensor& attention_mask) { + const sd::Tensor& attention_mask, + bool auto_free = true, + bool free_compute_buffer = true, + bool free_compute_params = true) { auto get_graph = [&]() -> ggml_cgraph* { return build_graph(input_ids, attention_mask); }; - return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, true), 3); + return 
restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, auto_free, free_compute_buffer, free_compute_params), 3); } static std::vector _relative_position_bucket(const std::vector& relative_position, diff --git a/src/model/upscaler/esrgan.hpp b/src/model/upscaler/esrgan.hpp index a56ebfe5..3bd07923 100644 --- a/src/model/upscaler/esrgan.hpp +++ b/src/model/upscaler/esrgan.hpp @@ -336,9 +336,11 @@ struct ESRGAN : public GGMLRunner { } } - success = model_loader.load_tensors(model_tensors, {}, n_threads); + model_loader.set_n_threads(n_threads); + success = model_loader.load_tensors(model_tensors); } else { - success = model_loader.load_tensors(esrgan_tensors, {}, n_threads); + model_loader.set_n_threads(n_threads); + success = model_loader.load_tensors(esrgan_tensors); } if (!success) { @@ -367,7 +369,7 @@ struct ESRGAN : public GGMLRunner { sd::Tensor compute(const int n_threads, const sd::Tensor& x) { auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x); }; - auto result = restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); + auto result = restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false, false, false), x.dim()); return result; } }; diff --git a/src/model/upscaler/ltx_latent_upscaler.hpp b/src/model/upscaler/ltx_latent_upscaler.hpp index b411e8aa..1c98b3fd 100644 --- a/src/model/upscaler/ltx_latent_upscaler.hpp +++ b/src/model/upscaler/ltx_latent_upscaler.hpp @@ -240,20 +240,25 @@ namespace LTXVUpsampler { protected: int64_t channels; int stride; - ggml_tensor* kernel = nullptr; std::vector kernel_data; + std::string kernel_name; void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + SD_UNUSED(ctx); SD_UNUSED(tensor_storage_map); if (stride == 1) { return; } - kernel = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 5, 5, 1, channels); - std::string name = prefix + "kernel"; - ggml_set_name(kernel, 
name.c_str()); + kernel_name = prefix + "kernel"; + } + public: + BlurDownsample(int64_t channels, int stride) + : channels(channels), + stride(stride) { + GGML_ASSERT(stride >= 1); static const float binomial[5] = {1.f, 4.f, 6.f, 4.f, 1.f}; kernel_data.resize(static_cast(5 * 5 * channels)); for (int64_t c = 0; c < channels; ++c) { @@ -266,26 +271,16 @@ namespace LTXVUpsampler { } } - public: - BlurDownsample(int64_t channels, int stride) - : channels(channels), - stride(stride) { - GGML_ASSERT(stride >= 1); - } - - void load_fixed_tensors() { - if (kernel == nullptr || kernel_data.empty()) { - return; - } - ggml_backend_tensor_set(kernel, kernel_data.data(), 0, kernel_data.size() * sizeof(float)); - } - ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { if (stride == 1) { return x; } - GGML_ASSERT(kernel != nullptr); + GGML_ASSERT(ctx != nullptr); + GGML_ASSERT(!kernel_data.empty()); GGML_ASSERT(x->ne[2] == channels); + ggml_tensor* kernel = ggml_new_tensor_4d(ctx->ggml_ctx, GGML_TYPE_F32, 5, 5, 1, channels); + ggml_set_name(kernel, kernel_name.empty() ? 
"blur_down.kernel" : kernel_name.c_str()); + ctx->bind_backend_tensor_data(kernel, kernel_data.data()); if (ctx->conv2d_direct_enabled) { return ggml_conv_2d_dw_direct(ctx->ggml_ctx, kernel, x, stride, stride, 2, 2, 1, 1); } @@ -311,11 +306,6 @@ namespace LTXVUpsampler { blocks["blur_down"] = std::shared_ptr(new BlurDownsample(mid_channels, den)); } - void load_fixed_tensors() { - auto blur_down = std::dynamic_pointer_cast(blocks["blur_down"]); - blur_down->load_fixed_tensors(); - } - ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { auto conv = std::dynamic_pointer_cast(blocks["conv"]); auto pixel_shuffle = std::dynamic_pointer_cast(blocks["pixel_shuffle"]); @@ -426,14 +416,6 @@ namespace LTXVUpsampler { sd::ggml_graph_cut::mark_graph_cut(x, "ltx_latent_upsampler.final", "x"); return x; } - - void load_fixed_tensors() { - if (!config.rational_resampler) { - return; - } - auto upsampler = std::dynamic_pointer_cast(blocks["upsampler"]); - upsampler->load_fixed_tensors(); - } }; struct LatentUpsamplerRunner : public GGMLRunner { @@ -490,12 +472,11 @@ namespace LTXVUpsampler { if (config.rational_resampler) { ignore_tensors.insert("upsampler.blur_down.kernel"); } - if (!model_loader.load_tensors(tensors, ignore_tensors, n_threads)) { + model_loader.set_n_threads(n_threads); + if (!model_loader.load_tensors(tensors, ignore_tensors)) { LOG_ERROR("load LTX latent upsampler tensors failed"); return false; } - model->load_fixed_tensors(); - LOG_INFO("LTX latent upsampler loaded: in_channels=%" PRId64 ", mid_channels=%" PRId64 ", blocks=%d, scale=%.3f, temporal_factor=%d, rational=%d", config.in_channels, config.mid_channels, @@ -542,7 +523,7 @@ namespace LTXVUpsampler { } size_t expected_dim = static_cast(x.dim()); auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x); }; - return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), expected_dim); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, 
n_threads, false, false, false), expected_dim); } }; diff --git a/src/model/vae/auto_encoder_kl.hpp b/src/model/vae/auto_encoder_kl.hpp index 49472ff4..443846fe 100644 --- a/src/model/vae/auto_encoder_kl.hpp +++ b/src/model/vae/auto_encoder_kl.hpp @@ -670,7 +670,7 @@ struct AutoEncoderKL : public VAE { bool decode_only = false, bool use_video_decoder = false, SDVersion version = VERSION_SD1) - : decode_only(decode_only), VAE(version, backend, params_backend) { + : VAE(version, backend, params_backend, prefix), decode_only(decode_only) { if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) { scale_factor = 0.18215f; shift_factor = 0.f; @@ -718,8 +718,8 @@ struct AutoEncoderKL : public VAE { return "vae"; } - void get_param_tensors(std::map& tensors, const std::string prefix) override { - ae.get_param_tensors(tensors, prefix); + void get_param_tensors(std::map& tensors) override { + ae.get_param_tensors(tensors, weight_prefix); } ggml_cgraph* build_graph(const sd::Tensor& z_tensor, bool decode_graph) { @@ -742,7 +742,7 @@ struct AutoEncoderKL : public VAE { auto get_graph = [&]() -> ggml_cgraph* { return build_graph(z, decode_graph); }; - return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), z.dim()); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false, false, false), z.dim()); } sd::Tensor gaussian_latent_sample(const sd::Tensor& moments, std::shared_ptr rng) { diff --git a/src/model/vae/ltx_audio_vae.hpp b/src/model/vae/ltx_audio_vae.hpp index d41a79a4..822386b9 100644 --- a/src/model/vae/ltx_audio_vae.hpp +++ b/src/model/vae/ltx_audio_vae.hpp @@ -997,6 +997,7 @@ namespace LTXV { struct LTXAudioVAERunner : public GGMLRunner { LTXAudioVAEConfig config; LTXAudioVAE model; + std::string weight_prefix; sd::Tensor bwe_skip_filter_tensor; LTXAudioVAERunner(ggml_backend_t backend, @@ -1004,6 +1005,7 @@ namespace LTXV { const String2TensorStorage& tensor_storage_map, const std::string& prefix 
= "") : GGMLRunner(backend, params_backend), + weight_prefix(prefix), config(LTXAudioVAEConfig::detect_from_weights(tensor_storage_map)), model(config) { model.init(params_ctx, tensor_storage_map, prefix); @@ -1013,8 +1015,8 @@ namespace LTXV { } } - void get_param_tensors(std::map& tensors, const std::string prefix) { - model.get_param_tensors(tensors, prefix); + void get_param_tensors(std::map& tensors) { + model.get_param_tensors(tensors, weight_prefix); } size_t get_params_buffer_size() { @@ -1037,7 +1039,7 @@ namespace LTXV { ggml_build_forward_expand(gf, waveform); return gf; }; - auto result = restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), 4); + auto result = restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false, false, false), 4); int64_t t1 = ggml_time_ms(); LOG_INFO("ltx audio vae decode completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); return result; @@ -1082,7 +1084,7 @@ namespace LTXV { } std::map tensors; - ltx_audio_vae->get_param_tensors(tensors, ""); + ltx_audio_vae->get_param_tensors(tensors); if (!model_loader.load_tensors(tensors)) { LOG_ERROR("load tensors from model loader failed"); diff --git a/src/model/vae/ltx_vae.hpp b/src/model/vae/ltx_vae.hpp index 86fcdcb0..59e38c32 100644 --- a/src/model/vae/ltx_vae.hpp +++ b/src/model/vae/ltx_vae.hpp @@ -1239,7 +1239,7 @@ struct LTXVideoVAE : public VAE { patch_size, tensor_storage_map, prefix), - VAE(version, backend, params_backend) { + VAE(version, backend, params_backend, prefix) { vae.init(params_ctx, tensor_storage_map, prefix); decode_timestep_tensor.values()[0] = vae.decode_timestep; } @@ -1271,8 +1271,8 @@ struct LTXVideoVAE : public VAE { } } - void get_param_tensors(std::map& tensors, const std::string prefix) override { - vae.get_param_tensors(tensors, prefix); + void get_param_tensors(std::map& tensors) override { + vae.get_param_tensors(tensors, weight_prefix); } struct TemporalTilePlan { @@ -1396,7 +1396,7 @@ struct 
LTXVideoVAE : public VAE { static_cast(start), chunk_overlap); }; - auto chunk = restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, true), + auto chunk = restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, true, true, true), expected_dim); if (chunk.empty()) { free_cache_ctx_and_buffer(); @@ -1452,7 +1452,7 @@ struct LTXVideoVAE : public VAE { auto get_graph = [&]() -> ggml_cgraph* { return build_graph(input, decode_graph); }; - auto result = restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), expected_dim); + auto result = restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false, false, false), expected_dim); if (result.empty()) { return {}; } @@ -1465,7 +1465,7 @@ struct LTXVideoVAE : public VAE { auto get_graph = [&]() -> ggml_cgraph* { return build_latent_statistics_graph(z, normalize); }; - return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false, false, false), static_cast(z.dim())); } @@ -1541,7 +1541,7 @@ struct LTXVideoVAE : public VAE { } std::map tensors; - vae->get_param_tensors(tensors, "first_stage_model"); + vae->get_param_tensors(tensors); if (!model_loader.load_tensors(tensors)) { LOG_ERROR("load tensors from model loader failed"); diff --git a/src/model/vae/tae.hpp b/src/model/vae/tae.hpp index fcb62c2e..95a2cd58 100644 --- a/src/model/vae/tae.hpp +++ b/src/model/vae/tae.hpp @@ -628,9 +628,9 @@ struct TinyImageAutoEncoder : public VAE { const std::string prefix, bool decoder_only = true, SDVersion version = VERSION_SD1) - : decode_only(decoder_only), - taesd(decoder_only, version), - VAE(version, backend, params_backend) { + : VAE(version, backend, params_backend, "tae"), + decode_only(decoder_only), + taesd(decoder_only, version) { scale_input = false; taesd.init(params_ctx, tensor_storage_map, prefix); } @@ -639,8 +639,8 @@ struct 
TinyImageAutoEncoder : public VAE { return "taesd"; } - void get_param_tensors(std::map& tensors, const std::string prefix) { - taesd.get_param_tensors(tensors, prefix); + void get_param_tensors(std::map& tensors) override { + taesd.get_param_tensors(tensors, weight_prefix); } sd::Tensor vae_output_to_latents(const sd::Tensor& vae_output, std::shared_ptr rng) override { @@ -676,7 +676,7 @@ struct TinyImageAutoEncoder : public VAE { return build_graph(z_tensor, decode_graph); }; - return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), z_tensor.dim()); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false, false, false), z_tensor.dim()); } }; @@ -691,8 +691,8 @@ struct TinyVideoAutoEncoder : public VAE { const std::string prefix, bool decoder_only = true, SDVersion version = VERSION_WAN2) - : decode_only(decoder_only), - VAE(version, backend, params_backend) { + : VAE(version, backend, params_backend, "tae"), + decode_only(decoder_only) { for (auto tensor_storage : tensor_storage_map) { if (tensor_storage.first.find(prefix + ".3.conv.6.weight") != std::string::npos) { is_wide = true; @@ -708,8 +708,8 @@ struct TinyVideoAutoEncoder : public VAE { return "taehv"; } - void get_param_tensors(std::map& tensors, const std::string prefix) { - taehv.get_param_tensors(tensors, prefix); + void get_param_tensors(std::map& tensors) override { + taehv.get_param_tensors(tensors, weight_prefix); } sd::Tensor vae_output_to_latents(const sd::Tensor& vae_output, std::shared_ptr rng) override { @@ -746,7 +746,7 @@ struct TinyVideoAutoEncoder : public VAE { return build_graph(z_tensor, decode_graph); }; - return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), z_tensor.dim()); + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false, false, false), z_tensor.dim()); } }; diff --git a/src/model/vae/vae.hpp b/src/model/vae/vae.hpp index bd0ce6c4..1f508b6e 
100644 --- a/src/model/vae/vae.hpp +++ b/src/model/vae/vae.hpp @@ -7,6 +7,7 @@ struct VAE : public GGMLRunner { protected: SDVersion version; + std::string weight_prefix; bool scale_input = true; virtual sd::Tensor _compute(const int n_threads, const sd::Tensor& z, @@ -62,8 +63,8 @@ protected: } public: - VAE(SDVersion version, ggml_backend_t backend, ggml_backend_t params_backend) - : version(version), GGMLRunner(backend, params_backend) {} + VAE(SDVersion version, ggml_backend_t backend, ggml_backend_t params_backend, const std::string& weight_prefix = "") + : version(version), weight_prefix(weight_prefix), GGMLRunner(backend, params_backend) {} int get_scale_factor() { int scale_factor = 8; @@ -214,7 +215,7 @@ public: virtual sd::Tensor vae_output_to_latents(const sd::Tensor& vae_output, std::shared_ptr rng) = 0; virtual sd::Tensor diffusion_to_vae_latents(const sd::Tensor& latents) = 0; virtual sd::Tensor vae_to_diffusion_latents(const sd::Tensor& latents) = 0; - virtual void get_param_tensors(std::map& tensors, const std::string prefix) = 0; + virtual void get_param_tensors(std::map& tensors) = 0; virtual void set_conv2d_scale(float scale) { SD_UNUSED(scale); }; virtual void set_temporal_tiling_enabled(bool enabled) { SD_UNUSED(enabled); }; virtual void set_tiling_params(const sd_tiling_params_t& params) { @@ -251,7 +252,7 @@ struct FakeVAE : public VAE { return latents; } - void get_param_tensors(std::map& tensors, const std::string prefix) override {} + void get_param_tensors(std::map& tensors) override {} std::string get_desc() override { return "fake_vae"; diff --git a/src/model/vae/wan_vae.hpp b/src/model/vae/wan_vae.hpp index c20764cd..36bb8696 100644 --- a/src/model/vae/wan_vae.hpp +++ b/src/model/vae/wan_vae.hpp @@ -1129,7 +1129,7 @@ namespace WAN { const std::string prefix = "", bool decode_only = false, SDVersion version = VERSION_WAN2) - : decode_only(decode_only), ae(decode_only, version == VERSION_WAN2_2_TI2V), VAE(version, backend, 
params_backend) { + : VAE(version, backend, params_backend, prefix), decode_only(decode_only), ae(decode_only, version == VERSION_WAN2_2_TI2V) { ae.init(params_ctx, tensor_storage_map, prefix); } @@ -1137,8 +1137,8 @@ namespace WAN { return "wan_vae"; } - void get_param_tensors(std::map& tensors, const std::string prefix) override { - ae.get_param_tensors(tensors, prefix); + void get_param_tensors(std::map& tensors) override { + ae.get_param_tensors(tensors, weight_prefix); } sd::Tensor vae_output_to_latents(const sd::Tensor& vae_output, std::shared_ptr rng) override { @@ -1255,7 +1255,7 @@ namespace WAN { return build_graph(input, decode_graph); } }; - auto result = restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, true), + auto result = restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, true, true, true), input.empty() ? z.dim() : input.dim()); if (!result.empty() && z.dim() == 4) { result.squeeze_(2); @@ -1268,7 +1268,7 @@ namespace WAN { auto get_graph = [&]() -> ggml_cgraph* { return build_graph_partial(z, decode_graph, i); }; - auto out_opt = GGMLRunner::compute(get_graph, n_threads, true); + auto out_opt = GGMLRunner::compute(get_graph, n_threads, true, true, true); if (!out_opt.has_value()) { return {}; } @@ -1281,7 +1281,7 @@ namespace WAN { sd::Tensor output = std::move(out); for (i = 1; i < t; i++) { - auto chunk_opt = GGMLRunner::compute(get_graph, n_threads, true); + auto chunk_opt = GGMLRunner::compute(get_graph, n_threads, true, true, true); if (!chunk_opt.has_value()) { return {}; } @@ -1327,7 +1327,7 @@ namespace WAN { // ggml_backend_t backend = ggml_backend_cuda_init(0); ggml_backend_t backend = sd_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_F16; - std::shared_ptr vae = std::make_shared(backend, backend, String2TensorStorage{}, "", false, VERSION_WAN2_2_TI2V); + std::shared_ptr vae = std::make_shared(backend, backend, String2TensorStorage{}, "first_stage_model", false, 
VERSION_WAN2_2_TI2V); { LOG_INFO("loading from '%s'", file_path.c_str()); @@ -1336,7 +1336,7 @@ namespace WAN { return; } std::map tensors; - vae->get_param_tensors(tensors, "first_stage_model"); + vae->get_param_tensors(tensors); ModelLoader model_loader; if (!model_loader.init_from_file_and_convert_name(file_path, "vae.")) { diff --git a/src/model_loader.cpp b/src/model_loader.cpp index 9c2d5cef..8d37d39a 100644 --- a/src/model_loader.cpp +++ b/src/model_loader.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -204,10 +205,28 @@ void convert_tensor(void* src, /*================================================= ModelLoader ==================================================*/ +ModelLoader::ModelLoader() + : n_threads_(sd_get_num_physical_cores()) { +} + +size_t ModelLoader::add_file_path(const std::string& file_path) { + if (model_files_processed) { + file_data.clear(); + model_files_processed = false; + } + file_paths_.push_back(file_path); + return file_paths_.size() - 1; +} + void ModelLoader::add_tensor_storage(const TensorStorage& tensor_storage) { tensor_storage_map[tensor_storage.name] = tensor_storage; } +void ModelLoader::set_n_threads(int n_threads) { + n_threads_ = n_threads > 0 ? 
n_threads : sd_get_num_physical_cores(); + LOG_DEBUG("using %d threads for model loading", n_threads_); +} + bool ModelLoader::init_from_file(const std::string& file_path, const std::string& prefix) { if (is_directory(file_path)) { LOG_INFO("load %s using diffusers format", file_path.c_str()); @@ -271,8 +290,7 @@ bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::s return false; } - file_paths_.push_back(file_path); - size_t file_index = file_paths_.size() - 1; + size_t file_index = add_file_path(file_path); for (auto& tensor_storage : tensor_storages) { // LOG_DEBUG("%s", tensor_storage.name.c_str()); @@ -300,8 +318,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const return false; } - file_paths_.push_back(file_path); - size_t file_index = file_paths_.size() - 1; + size_t file_index = add_file_path(file_path); for (auto& tensor_storage : tensor_storages) { if (is_unused_tensor(tensor_storage.name)) { @@ -335,8 +352,7 @@ bool ModelLoader::init_from_torch_legacy_file(const std::string& file_path, cons return false; } - file_paths_.push_back(file_path); - size_t file_index = file_paths_.size() - 1; + size_t file_index = add_file_path(file_path); for (auto& tensor_storage : tensor_storages) { if (is_unused_tensor(tensor_storage.name)) { @@ -366,8 +382,7 @@ bool ModelLoader::init_from_torch_zip_file(const std::string& file_path, const s return false; } - file_paths_.push_back(file_path); - size_t file_index = file_paths_.size() - 1; + size_t file_index = add_file_path(file_path); for (auto& tensor_storage : tensor_storages) { if (!starts_with(tensor_storage.name, prefix)) { @@ -760,8 +775,6 @@ void ModelLoader::process_model_files(bool enable_mmap, bool writable_mmap) { return; } - int64_t start_time = ggml_time_ms(); - std::vector processed_tensor_storages; for (const auto& [name, tensor_storage] : tensor_storage_map) { if (is_unused_tensor(tensor_storage.name)) { @@ -812,20 +825,12 @@ void 
ModelLoader::process_model_files(bool enable_mmap, bool writable_mmap) { } else { LOG_WARN("failed to memory-map '%s' (falling back to read())", file_path.c_str()); } - } else if (!is_zip) { - LOG_INFO("NOT using mmap for '%s' (mmap disabled by caller)", - file_path.c_str()); } file_data.push_back(std::move(fdata)); } model_files_processed = true; - - int64_t end_time = ggml_time_ms(); - int64_t process_time_ms = end_time - start_time; - - LOG_INFO("model files processing completed in %.2fs", process_time_ms / 1000.f); } std::vector ModelLoader::mmap_tensors(std::map& tensors, @@ -919,7 +924,9 @@ std::vector ModelLoader::mmap_tensors(std::map* target_tensor_names) { process_model_files(enable_mmap, false); std::atomic read_time_ms(0); @@ -928,14 +935,26 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread std::atomic convert_time_ms(0); std::atomic bytes_processed(0); - int num_threads_to_use = n_threads_p > 0 ? n_threads_p : sd_get_num_physical_cores(); - LOG_DEBUG("using %d threads for model loading", num_threads_to_use); + int num_threads_to_use = n_threads_; int64_t start_time = ggml_time_ms(); size_t total_tensors_to_process = 0; + std::vector file_tensors_to_process_counts; + file_tensors_to_process_counts.reserve(file_data.size()); for (const auto& fdata : file_data) { - total_tensors_to_process += fdata.tensors.size(); + size_t file_tensors_to_process = 0; + if (target_tensor_names == nullptr) { + file_tensors_to_process = fdata.tensors.size(); + } else { + for (const TensorStorage& tensor_storage : fdata.tensors) { + if (target_tensor_names->find(tensor_storage.name) != target_tensor_names->end()) { + file_tensors_to_process++; + } + } + } + file_tensors_to_process_counts.push_back(file_tensors_to_process); + total_tensors_to_process += file_tensors_to_process; } bool success = true; @@ -943,17 +962,38 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread const int64_t t_start = start_time; int 
last_n_threads = 1; - for (auto& fdata : file_data) { + for (size_t file_index = 0; file_index < file_data.size(); ++file_index) { + auto& fdata = file_data[file_index]; const std::string& file_path = fdata.path; - LOG_DEBUG("loading tensors from %s", file_path.c_str()); const std::vector& file_tensors = fdata.tensors; + std::vector tensors_to_process; + size_t file_tensors_to_process = file_tensors_to_process_counts[file_index]; + tensors_to_process.reserve(file_tensors_to_process); + if (target_tensor_names == nullptr) { + for (const TensorStorage& tensor_storage : file_tensors) { + tensors_to_process.push_back(&tensor_storage); + } + } else { + for (const TensorStorage& tensor_storage : file_tensors) { + if (target_tensor_names->find(tensor_storage.name) != target_tensor_names->end()) { + tensors_to_process.push_back(&tensor_storage); + } + } + } + if (tensors_to_process.empty()) { + continue; + } + LOG_DEBUG("loading %zu/%zu tensors from %s", + tensors_to_process.size(), + file_tensors.size(), + file_path.c_str()); bool is_zip = fdata.is_zip; std::shared_ptr mmapped = fdata.mmapped; - int n_threads = is_zip ? 1 : std::min(num_threads_to_use, (int)file_tensors.size()); + int n_threads = is_zip ? 
1 : std::min(num_threads_to_use, (int)tensors_to_process.size()); if (n_threads < 1) { n_threads = 1; } @@ -989,11 +1029,11 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread while (true) { int64_t t0, t1; size_t idx = tensor_idx.fetch_add(1); - if (idx >= file_tensors.size() || failed) { + if (idx >= tensors_to_process.size() || failed) { break; } - const TensorStorage& tensor_storage = file_tensors[idx]; + const TensorStorage& tensor_storage = *tensors_to_process[idx]; ggml_tensor* dst_tensor = nullptr; t0 = ggml_time_ms(); @@ -1133,16 +1173,18 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread while (true) { size_t current_idx = tensor_idx.load(); - if (current_idx >= file_tensors.size() || failed) { + if (current_idx >= tensors_to_process.size() || failed) { break; } size_t curr_num = total_tensors_processed + current_idx; float elapsed_seconds = (ggml_time_ms() - t_start) / 1000.0f; - pretty_bytes_progress(static_cast(curr_num), - static_cast(total_tensors_to_process), - bytes_processed.load(), - elapsed_seconds); - std::this_thread::sleep_for(std::chrono::milliseconds(200)); + if (total_tensors_to_process > 0) { + pretty_bytes_progress(static_cast(curr_num), + static_cast(total_tensors_to_process), + bytes_processed.load(), + elapsed_seconds); + } + std::this_thread::sleep_for(std::chrono::milliseconds(total_tensors_to_process <= 4 ? 
10 : 200)); } for (auto& w : workers) { @@ -1153,12 +1195,14 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread success = false; break; } - total_tensors_processed += file_tensors.size(); - pretty_bytes_progress(static_cast(total_tensors_processed), - static_cast(total_tensors_to_process), - bytes_processed.load(), - (ggml_time_ms() - t_start) / 1000.0f); - if (total_tensors_processed < total_tensors_to_process) { + total_tensors_processed += tensors_to_process.size(); + if (total_tensors_to_process > 0) { + pretty_bytes_progress(static_cast(total_tensors_processed), + static_cast(total_tensors_to_process), + bytes_processed.load(), + (ggml_time_ms() - t_start) / 1000.0f); + } + if (total_tensors_processed < total_tensors_to_process && total_tensors_to_process > 0) { printf("\n"); } } @@ -1173,9 +1217,77 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread return success; } +bool ModelLoader::load_float_tensor(const std::string& name, + std::vector& data, + int n_threads, + bool use_mmap) { + data.clear(); + + auto tensor_storage_it = tensor_storage_map.find(name); + if (tensor_storage_it == tensor_storage_map.end()) { + return false; + } + + const TensorStorage& tensor_storage = tensor_storage_it->second; + int64_t n_elements = tensor_storage.nelements(); + if (n_elements <= 0) { + LOG_ERROR("tensor '%s' has invalid element count: %" PRId64, name.c_str(), n_elements); + return false; + } + if (tensor_storage.n_dims <= 0 || tensor_storage.n_dims > GGML_MAX_DIMS) { + LOG_ERROR("tensor '%s' has unsupported dims: %d", name.c_str(), tensor_storage.n_dims); + return false; + } + + std::vector loaded_data(static_cast(n_elements)); + ggml_init_params params; + params.mem_size = ggml_tensor_overhead(); + params.mem_buffer = nullptr; + params.no_alloc = true; + + ggml_context* ctx = ggml_init(params); + if (ctx == nullptr) { + LOG_ERROR("failed to create context for tensor '%s'", name.c_str()); + return false; 
+ } + + ggml_tensor* tensor = ggml_new_tensor(ctx, GGML_TYPE_F32, tensor_storage.n_dims, tensor_storage.ne); + ggml_set_name(tensor, name.c_str()); + tensor->data = loaded_data.data(); + + bool loaded = false; + auto on_new_tensor_cb = [&](const TensorStorage& current_tensor_storage, ggml_tensor** dst_tensor) -> bool { + *dst_tensor = nullptr; + if (current_tensor_storage.name != name) { + return true; + } + if (current_tensor_storage.nelements() != n_elements) { + LOG_ERROR("tensor '%s' element count changed during load", name.c_str()); + return false; + } + *dst_tensor = tensor; + loaded = true; + return true; + }; + + std::set target_tensor_names{name}; + if (n_threads > 0) { + set_n_threads(n_threads); + } + bool success = load_tensors(on_new_tensor_cb, use_mmap, &target_tensor_names); + ggml_free(ctx); + + if (!success || !loaded) { + data.clear(); + return false; + } + + data = std::move(loaded_data); + return true; +} + bool ModelLoader::load_tensors(std::map& tensors, std::set ignore_tensors, - int n_threads, bool enable_mmap) { std::set tensor_names_in_file; std::mutex tensor_names_mutex; @@ -1219,7 +1331,7 @@ bool ModelLoader::load_tensors(std::map& tensors, return true; }; - bool success = load_tensors(on_new_tensor_cb, n_threads, enable_mmap); + bool success = load_tensors(on_new_tensor_cb, enable_mmap); if (!success) { LOG_ERROR("load tensors from file failed"); return false; diff --git a/src/model_loader.h b/src/model_loader.h index 8e0f4198..4dc700f2 100644 --- a/src/model_loader.h +++ b/src/model_loader.h @@ -34,7 +34,9 @@ protected: std::vector file_data; bool model_files_processed = false; String2TensorStorage tensor_storage_map; + int n_threads_; + size_t add_file_path(const std::string& file_path); void add_tensor_storage(const TensorStorage& tensor_storage); bool init_from_gguf_file(const std::string& file_path, const std::string& prefix = ""); @@ -44,6 +46,8 @@ protected: bool init_from_diffusers_file(const std::string& file_path, const 
std::string& prefix = ""); public: + ModelLoader(); + bool init_from_file(const std::string& file_path, const std::string& prefix = ""); void convert_tensors_name(); bool init_from_file_and_convert_name(const std::string& file_path, @@ -55,16 +59,23 @@ public: std::map get_diffusion_model_wtype_stat(); std::map get_vae_wtype_stat(); String2TensorStorage& get_tensor_storage_map() { return tensor_storage_map; } + const String2TensorStorage& get_tensor_storage_map() const { return tensor_storage_map; } + void set_n_threads(int n_threads); void set_wtype_override(ggml_type wtype, std::string tensor_type_rules = ""); void process_model_files(bool enable_mmap = false, bool writable_mmap = true); std::vector mmap_tensors(std::map& tensors, std::set ignore_tensors = {}, bool writable = true); - bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0, bool use_mmap = false); + bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, + bool use_mmap = false, + const std::set* target_tensor_names = nullptr); bool load_tensors(std::map& tensors, std::set ignore_tensors = {}, - int n_threads = 0, bool use_mmap = false); + bool load_float_tensor(const std::string& name, + std::vector& data, + int n_threads = 0, + bool use_mmap = false); std::vector get_tensor_names() const { std::vector names; diff --git a/src/model_manager.cpp b/src/model_manager.cpp new file mode 100644 index 00000000..328a478b --- /dev/null +++ b/src/model_manager.cpp @@ -0,0 +1,944 @@ +#include "model_manager.h" + +#include +#include +#include +#include +#include + +#include "core/ggml_extend_backend.h" +#include "core/util.h" +#include "model/adapter/lora.hpp" + +static size_t aligned_offset(const void* buffer, size_t offset, size_t alignment) { + GGML_ASSERT(alignment != 0 && (alignment & (alignment - 1)) == 0); + size_t align = (alignment - ((reinterpret_cast(buffer) + offset) % alignment)) % alignment; + return offset + align; +} + +static bool lora_specs_equal(const std::vector& lhs, + 
const std::vector& rhs) { + if (lhs.size() != rhs.size()) { + return false; + } + for (size_t i = 0; i < lhs.size(); ++i) { + if (lhs[i].path != rhs[i].path || + lhs[i].multiplier != rhs[i].multiplier || + lhs[i].is_high_noise != rhs[i].is_high_noise || + lhs[i].tensor_name_prefix_filter != rhs[i].tensor_name_prefix_filter || + lhs[i].required != rhs[i].required) { + return false; + } + } + return true; +} + +static std::string lora_id(const ModelManager::LoraSpec& lora) { + return lora.is_high_noise ? "|high_noise|" + lora.path : lora.path; +} + +static bool backend_supports_host_buffer(ggml_backend_t backend) { + if (backend == nullptr) { + return false; + } + if (sd_backend_is_cpu(backend)) { + return true; + } + ggml_backend_dev_t dev = ggml_backend_get_device(backend); + if (dev == nullptr) { + return false; + } + ggml_backend_dev_props props; + ggml_backend_dev_get_props(dev, &props); + return props.caps.buffer_from_host_ptr; +} + +ModelManager::~ModelManager() { + release_all(); +} + +void ModelManager::set_common_ignore_tensors(std::set ignore_tensors) { + common_ignore_tensors_ = std::move(ignore_tensors); +} + +void ModelManager::set_loras(std::vector loras, SDVersion version) { + if (loras.empty() && loras_.empty()) { + lora_version_ = version; + return; + } + if (lora_version_ == version && lora_specs_equal(loras_, loras)) { + return; + } + + loras_ = std::move(loras); + lora_version_ = version; + current_lora_epoch_++; + reset_lora_applied_params(); +} + +std::set ModelManager::tensor_names() const { + std::set names; + for (const auto& state : tensor_states_) { + if (state != nullptr) { + names.insert(state->name); + } + } + return names; +} + +size_t estimate_tensors_size(const std::map& tensors) { + size_t size = 0; + std::unordered_set seen; + for (const auto& pair : tensors) { + ggml_tensor* tensor = pair.second; + if (tensor == nullptr || seen.find(tensor) != seen.end()) { + continue; + } + seen.insert(tensor); + size += ggml_nbytes(tensor); + } 
+ return size; +} + +bool ModelManager::register_param_tensors(const std::string& desc, + std::map tensors, + ResidencyMode residency_mode, + ggml_backend_t compute_backend, + ggml_backend_t params_backend, + size_t* registered_tensor_size) { + if (desc.empty()) { + LOG_ERROR("model manager tensor desc is empty"); + return false; + } + if (registered_tensor_size != nullptr) { + *registered_tensor_size += estimate_tensors_size(tensors); + } + + std::vector> new_states; + new_states.reserve(tensors.size()); + + for (const auto& pair : tensors) { + const std::string& name = pair.first; + ggml_tensor* tensor = pair.second; + if (tensor == nullptr) { + continue; + } + if (tensor_states_by_name_.find(name) != tensor_states_by_name_.end()) { + LOG_ERROR("model manager tensor name '%s' is already registered", name.c_str()); + return false; + } + ggml_set_name(tensor, name.c_str()); + + auto state = std::make_unique(); + state->name = name; + state->tensor = tensor; + state->desc = desc; + state->residency_mode = residency_mode; + state->compute_backend = compute_backend; + state->params_backend = params_backend; + new_states.push_back(std::move(state)); + } + + for (auto& state : new_states) { + TensorState* registered_state = state.get(); + tensor_states_by_name_[registered_state->name] = registered_state; + tensor_states_.push_back(std::move(state)); + } + return true; +} + +bool ModelManager::validate_registered_tensors() { + bool ok = true; + for (const auto& state : tensor_states_) { + if (state == nullptr) { + ok = false; + continue; + } + bool state_ok = validate_tensor(*state); + if (state_ok) { + state->metadata_validated = true; + } + ok = state_ok && ok; + } + return ok; +} + +bool ModelManager::load_tensors_to_params_backend(const std::vector& states) { + std::vector need_load; + need_load.reserve(states.size()); + for (TensorState* state : states) { + if (state == nullptr || should_ignore(*state) || is_optional_missing_tensor(state->name)) { + continue; + } + 
if (!state->metadata_validated) { + if (!validate_tensor(*state)) { + return false; + } + state->metadata_validated = true; + } + if (!state->loaded_to_params_backend) { + need_load.push_back(state); + } + } + if (need_load.empty()) { + return true; + } + + std::vector created_storage_blocks; + if (!mmap_params(need_load, created_storage_blocks)) { + for (ParamsStorageBlock* block : created_storage_blocks) { + if (block != nullptr) { + free_params_storage_block(*block); + erase_params_storage_block(block); + } + } + return false; + } + + std::vector need_alloc; + need_alloc.reserve(need_load.size()); + for (TensorState* state : need_load) { + if (state->tensor != nullptr && state->tensor->data == nullptr && state->tensor->view_src == nullptr) { + need_alloc.push_back(state); + } + } + + if (!alloc_params_buffers(need_alloc, created_storage_blocks) || + !load_tensors(need_load)) { + for (ParamsStorageBlock* block : created_storage_blocks) { + if (block != nullptr) { + free_params_storage_block(*block); + erase_params_storage_block(block); + } + } + return false; + } + for (ParamsStorageBlock* block : created_storage_blocks) { + if (block != nullptr && block->buffer != nullptr) { + LOG_DEBUG("model manager prepared params backend buffer (%6.2f MB, %zu tensors, %s)", + ggml_backend_buffer_get_size(block->buffer) / (1024.f * 1024.f), + block->states.size(), + ggml_backend_buffer_is_host(block->buffer) ? 
"RAM" : "VRAM"); + } + } + + return true; +} + +bool ModelManager::stage_tensors_to_compute_backend(const std::vector& states) { + std::map> states_by_compute_backend; + for (TensorState* state : states) { + if (state == nullptr || should_ignore(*state) || is_optional_missing_tensor(state->name)) { + continue; + } + if (state->compute_backend == nullptr) { + LOG_ERROR("model manager compute backend is null for tensor '%s'", state->name.c_str()); + return false; + } + if (state->params_backend == nullptr) { + LOG_ERROR("model manager params backend is null for tensor '%s'", state->name.c_str()); + return false; + } + if (state->compute_backend == state->params_backend || state->staged_to_compute_backend) { + continue; + } + if (!state->loaded_to_params_backend || state->tensor == nullptr || state->tensor->data == nullptr) { + LOG_ERROR("model manager tensor '%s' is not loaded to params backend", state->name.c_str()); + return false; + } + states_by_compute_backend[state->compute_backend].push_back(state); + } + + for (const auto& pair : states_by_compute_backend) { + ggml_backend_t compute_backend = pair.first; + const std::vector& states = pair.second; + if (states.empty()) { + continue; + } + + int64_t t0 = ggml_time_ms(); + + ggml_init_params init_params; + init_params.mem_size = std::max(1, states.size()) * ggml_tensor_overhead(); + init_params.mem_buffer = nullptr; + init_params.no_alloc = true; + + ggml_context* staging_ctx = ggml_init(init_params); + GGML_ASSERT(staging_ctx != nullptr); + + std::vector> staged_tensors; + staged_tensors.reserve(states.size()); + for (TensorState* state : states) { + ggml_tensor* staging_tensor = ggml_dup_tensor(staging_ctx, state->tensor); + ggml_set_name(staging_tensor, state->tensor->name); + staged_tensors.push_back({state, staging_tensor}); + } + + ggml_backend_buffer_t compute_buffer = ggml_backend_alloc_ctx_tensors(staging_ctx, compute_backend); + if (compute_buffer == nullptr) { + LOG_ERROR("model manager alloc compute 
params backend buffer failed, num_tensors = %zu", + staged_tensors.size()); + ggml_free(staging_ctx); + return false; + } + ggml_backend_buffer_set_usage(compute_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + + for (auto& staged_tensor : staged_tensors) { + TensorState* state = staged_tensor.first; + ggml_tensor* managed_tensor = state->tensor; + ggml_tensor* staging_tensor = staged_tensor.second; + ggml_backend_tensor_copy(managed_tensor, staging_tensor); + std::swap(managed_tensor->buffer, staging_tensor->buffer); + std::swap(managed_tensor->data, staging_tensor->data); + std::swap(managed_tensor->extra, staging_tensor->extra); + } + ggml_backend_synchronize(compute_backend); + + auto block = std::make_unique(); + block->compute_backend = compute_backend; + block->buffer = compute_buffer; + block->staging_ctx = staging_ctx; + block->staged_tensors = std::move(staged_tensors); + for (auto& staged_tensor : block->staged_tensors) { + TensorState* state = staged_tensor.first; + state->staged_to_compute_backend = true; + } + compute_staging_blocks_.push_back(std::move(block)); + + int64_t t1 = ggml_time_ms(); + LOG_DEBUG("model manager staged compute params (%6.2f MB, %zu tensors) to %s, taking %.2fs", + ggml_backend_buffer_get_size(compute_buffer) / (1024.f * 1024.f), + states.size(), + ggml_backend_name(compute_backend), + (t1 - t0) * 1.0f / 1000); + } + + return true; +} + +bool ModelManager::apply_loras_to_params(const std::vector& states) { + if (loras_.empty()) { + return true; + } + + struct LoraApplyGroup { + std::map model_tensors; + std::vector states; + }; + + std::map groups; + for (TensorState* state : states) { + if (state == nullptr || state->tensor == nullptr || + should_ignore(*state) || is_optional_missing_tensor(state->name)) { + continue; + } + if (state->applied_lora_epoch == current_lora_epoch_) { + continue; + } + if (state->compute_backend == nullptr) { + LOG_ERROR("model manager compute backend is null for lora target tensor '%s'", 
state->name.c_str()); + return false; + } + if (state->tensor->data == nullptr) { + LOG_ERROR("model manager lora target tensor '%s' is not prepared", state->name.c_str()); + return false; + } + LoraApplyGroup& group = groups[state->compute_backend]; + group.model_tensors[state->name] = state->tensor; + group.states.push_back(state); + } + + if (groups.empty()) { + return true; + } + + std::set all_tensor_names = tensor_names(); + for (auto& group_pair : groups) { + ggml_backend_t compute_backend = group_pair.first; + LoraApplyGroup& group = group_pair.second; + for (const LoraSpec& lora_spec : loras_) { + if (group.model_tensors.empty()) { + continue; + } + + std::string id = lora_id(lora_spec); + auto lora = std::make_shared(id, + compute_backend, + compute_backend, + lora_spec.path, + lora_spec.is_high_noise ? "model.high_noise_" : "", + lora_version_); + + LoraModel::filter_t lora_tensor_filter = nullptr; + if (!lora_spec.tensor_name_prefix_filter.empty()) { + lora_tensor_filter = [&](const std::string& tensor_name) { + return starts_with(tensor_name, lora_spec.tensor_name_prefix_filter); + }; + } + if (!lora->load_from_file(n_threads_, lora_tensor_filter)) { + LOG_WARN("load lora tensors from %s failed", lora_spec.path.c_str()); + if (lora_spec.required) { + return false; + } + continue; + } + if (lora->lora_tensors.empty()) { + if (lora_spec.required) { + LOG_ERROR("required lora has no tensors: %s", lora_spec.path.c_str()); + return false; + } + continue; + } + lora->multiplier = lora_spec.multiplier; + lora->apply(group.model_tensors, all_tensor_names, lora_version_, n_threads_, false); + lora->release_loaded_tensors(); + } + + for (TensorState* state : group.states) { + if (state != nullptr) { + state->applied_lora_epoch = current_lora_epoch_; + } + } + } + return true; +} + +void ModelManager::reset_lora_applied_params() { + release_compute_staging_blocks(true); + release_params_storage_blocks(true); + for (auto& state : tensor_states_) { + 
state->applied_lora_epoch = UINT64_MAX; + } +} + +bool ModelManager::should_ignore(const TensorState& state) const { + for (const auto& ignore_prefix : common_ignore_tensors_) { + if (starts_with(state.name, ignore_prefix)) { + return true; + } + } + return false; +} + +bool ModelManager::is_optional_missing_tensor(const std::string& name) const { + return name.find("cond_stage_model.transformer.text_model.encoder.layers.23") != std::string::npos || + name.find("alphas_cumprod") != std::string::npos; +} + +bool ModelManager::validate_tensor(const TensorState& state) const { + if (state.tensor == nullptr || should_ignore(state) || is_optional_missing_tensor(state.name)) { + return true; + } + + const auto& tensor_storage_map = model_loader_.get_tensor_storage_map(); + auto ts_it = tensor_storage_map.find(state.name); + if (ts_it == tensor_storage_map.end()) { + LOG_ERROR("%s tensor '%s' not in model metadata", state.desc.c_str(), state.name.c_str()); + return false; + } + + const TensorStorage& tensor_storage = ts_it->second; + if (state.tensor->ne[0] != tensor_storage.ne[0] || + state.tensor->ne[1] != tensor_storage.ne[1] || + state.tensor->ne[2] != tensor_storage.ne[2] || + state.tensor->ne[3] != tensor_storage.ne[3]) { + LOG_ERROR( + "%s tensor '%s' has wrong shape in model metadata: got [%d, %d, %d, %d], expected [%d, %d, %d, %d]", + state.desc.c_str(), + state.name.c_str(), + (int)tensor_storage.ne[0], (int)tensor_storage.ne[1], (int)tensor_storage.ne[2], (int)tensor_storage.ne[3], + (int)state.tensor->ne[0], (int)state.tensor->ne[1], (int)state.tensor->ne[2], (int)state.tensor->ne[3]); + return false; + } + return true; +} + +bool ModelManager::mmap_params(const std::vector& states, + std::vector& created_storage_blocks) { + std::map mmap_candidates; + std::map mmap_states; + for (TensorState* state : states) { + if (state == nullptr || !can_mmap_storage(*state) || state->tensor == nullptr || + state->tensor->data != nullptr || state->tensor->view_src != 
nullptr) { + continue; + } + mmap_candidates[state->name] = state->tensor; + mmap_states[state->name] = state; + } + if (mmap_candidates.empty()) { + return true; + } + + auto mmap_store = model_loader_.mmap_tensors(mmap_candidates, {}, true); + if (mmap_store.empty()) { + return true; + } + + auto block = std::make_unique(); + block->mmap_tensor_stores = std::move(mmap_store); + ParamsStorageBlock* raw = block.get(); + for (const auto& pair : mmap_states) { + TensorState* state = pair.second; + if (state != nullptr && state->tensor != nullptr && state->tensor->data != nullptr) { + block->states.push_back(state); + } + } + + if (!block->states.empty()) { + params_storage_blocks_.push_back(std::move(block)); + created_storage_blocks.push_back(raw); + } + return true; +} + +bool ModelManager::can_mmap_storage(const TensorState& state) const { + if (!enable_mmap_ || state.residency_mode != ResidencyMode::Resident) { + return false; + } + if (state.compute_backend == nullptr || state.params_backend == nullptr) { + return false; + } + return sd_backend_is_cpu(state.compute_backend) || + sd_backend_is_cpu(state.params_backend) || + backend_supports_host_buffer(state.compute_backend); +} + +bool ModelManager::alloc_params_buffers(const std::vector& states, + std::vector& created_storage_blocks) { + std::map, std::vector> states_by_buffer_type; + for (TensorState* state : states) { + if (state == nullptr || state->tensor == nullptr) { + continue; + } + ggml_backend_buffer_type_t params_buft = params_buffer_type_for(*state); + if (params_buft == nullptr) { + return false; + } + states_by_buffer_type[{params_buft, static_cast(state->residency_mode)}].push_back(state); + } + + for (const auto& pair : states_by_buffer_type) { + ggml_backend_buffer_type_t params_buft = pair.first.first; + const std::vector& states = pair.second; + size_t alignment = ggml_backend_buft_get_alignment(params_buft); + size_t max_size = ggml_backend_buft_get_max_size(params_buft); + + auto 
alloc_chunk = [&](const std::vector& chunk, size_t chunk_size) -> bool { + if (chunk.empty() || chunk_size == 0) { + return true; + } + + ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(params_buft, chunk_size); + if (buffer == nullptr) { + LOG_ERROR("model manager alloc params backend buffer failed, size = %.2fMB", + chunk_size / (1024.0 * 1024.0)); + return false; + } + ggml_backend_buffer_set_usage(buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + + std::vector initialized_tensors; + void* base = ggml_backend_buffer_get_base(buffer); + size_t offset = aligned_offset(base, 0, ggml_backend_buffer_get_alignment(buffer)); + for (TensorState* state : chunk) { + ggml_tensor* tensor = state->tensor; + size_t tensor_size = GGML_PAD(ggml_backend_buffer_get_alloc_size(buffer, tensor), + ggml_backend_buffer_get_alignment(buffer)); + enum ggml_status status = ggml_backend_tensor_alloc(buffer, tensor, static_cast(base) + offset); + if (status != GGML_STATUS_SUCCESS) { + LOG_ERROR("model manager failed to initialize params tensor '%s'", ggml_get_name(tensor)); + for (ggml_tensor* initialized : initialized_tensors) { + initialized->buffer = nullptr; + initialized->data = nullptr; + initialized->extra = nullptr; + } + LOG_DEBUG("model manager releasing params backend buffer (%6.2f MB, %zu tensors, %s)", + ggml_backend_buffer_get_size(buffer) / (1024.f * 1024.f), + initialized_tensors.size(), + ggml_backend_buffer_is_host(buffer) ? 
"RAM" : "VRAM"); + ggml_backend_buffer_free(buffer); + return false; + } + initialized_tensors.push_back(tensor); + offset += tensor_size; + } + + auto block = std::make_unique(); + block->buffer = buffer; + block->states = chunk; + ParamsStorageBlock* raw = block.get(); + params_storage_blocks_.push_back(std::move(block)); + created_storage_blocks.push_back(raw); + + return true; + }; + + std::vector chunk; + size_t chunk_size = 0; + for (TensorState* state : states) { + ggml_tensor* tensor = state->tensor; + size_t tensor_size = GGML_PAD(ggml_backend_buft_get_alloc_size(params_buft, tensor), alignment); + if (max_size > 0 && tensor_size > max_size) { + LOG_ERROR("model manager tensor '%s' is too large for params buffer: %zu > %zu", + ggml_get_name(tensor), + tensor_size, + max_size); + return false; + } + if (!chunk.empty() && max_size > 0 && chunk_size + tensor_size > max_size) { + if (!alloc_chunk(chunk, chunk_size)) { + return false; + } + chunk.clear(); + chunk_size = 0; + } + chunk.push_back(state); + chunk_size += tensor_size; + } + + if (!alloc_chunk(chunk, chunk_size)) { + return false; + } + } + + return true; +} + +bool ModelManager::load_tensors(const std::vector& states) { + std::map states_by_name; + std::set target_tensor_names; + for (TensorState* state : states) { + if (state == nullptr) { + continue; + } + states_by_name[state->name] = state; + target_tensor_names.insert(state->name); + } + if (states_by_name.empty()) { + return true; + } + + std::set loaded_names; + std::mutex loaded_names_mutex; + auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool { + const std::string& name = tensor_storage.name; + *dst_tensor = nullptr; + + auto state_it = states_by_name.find(name); + if (state_it == states_by_name.end()) { + return true; + } + + TensorState* state = state_it->second; + if (state == nullptr || state->tensor == nullptr) { + LOG_ERROR("model manager tensor '%s' is null", name.c_str()); + return 
false; + } + + if (state->tensor->ne[0] != tensor_storage.ne[0] || + state->tensor->ne[1] != tensor_storage.ne[1] || + state->tensor->ne[2] != tensor_storage.ne[2] || + state->tensor->ne[3] != tensor_storage.ne[3]) { + LOG_ERROR( + "model manager tensor '%s' has wrong shape in model file: got [%d, %d, %d, %d], expected [%d, %d, %d, %d]", + name.c_str(), + (int)tensor_storage.ne[0], (int)tensor_storage.ne[1], (int)tensor_storage.ne[2], (int)tensor_storage.ne[3], + (int)state->tensor->ne[0], (int)state->tensor->ne[1], (int)state->tensor->ne[2], (int)state->tensor->ne[3]); + return false; + } + + { + std::lock_guard lock(loaded_names_mutex); + loaded_names.insert(name); + } + *dst_tensor = state->tensor; + return true; + }; + + if (!model_loader_.load_tensors(on_new_tensor_cb, enable_mmap_, &target_tensor_names)) { + LOG_ERROR("model manager load tensors failed"); + return false; + } + + bool missing = false; + for (const auto& pair : states_by_name) { + const std::string& name = pair.first; + if (loaded_names.find(name) == loaded_names.end()) { + LOG_ERROR("model manager tensor '%s' was not loaded", name.c_str()); + missing = true; + } + } + if (missing) { + return false; + } + + for (const auto& pair : states_by_name) { + pair.second->loaded_to_params_backend = true; + } + return true; +} + +ggml_backend_buffer_type_t ModelManager::params_buffer_type_for(const TensorState& state) const { + if (state.params_backend == nullptr) { + LOG_ERROR("model manager params backend is null for tensor '%s'", state.name.c_str()); + return nullptr; + } + ggml_backend_buffer_type_t params_buft = nullptr; + if (state.compute_backend != nullptr && state.params_backend != state.compute_backend) { + ggml_backend_dev_t compute_dev = ggml_backend_get_device(state.compute_backend); + if (compute_dev != nullptr) { + params_buft = ggml_backend_dev_host_buffer_type(compute_dev); + } + } + if (params_buft == nullptr) { + params_buft = ggml_backend_get_default_buffer_type(state.params_backend); 
+ } + return params_buft; +} + +void ModelManager::free_compute_staging_block(ComputeStagingBlock& block) { + for (auto& staged_tensor : block.staged_tensors) { + TensorState* state = staged_tensor.first; + ggml_tensor* staging_tensor = staged_tensor.second; + if (state == nullptr || state->tensor == nullptr || staging_tensor == nullptr) { + continue; + } + ggml_tensor* managed_tensor = state->tensor; + managed_tensor->buffer = staging_tensor->buffer; + managed_tensor->data = staging_tensor->data; + managed_tensor->extra = staging_tensor->extra; + staging_tensor->buffer = nullptr; + staging_tensor->data = nullptr; + staging_tensor->extra = nullptr; + + state->staged_to_compute_backend = false; + state->applied_lora_epoch = UINT64_MAX; + } + + if (block.buffer != nullptr) { + LOG_DEBUG("model manager releasing compute params (%6.2f MB, %zu tensors) from %s", + ggml_backend_buffer_get_size(block.buffer) / (1024.f * 1024.f), + block.staged_tensors.size(), + block.compute_backend != nullptr ? 
ggml_backend_name(block.compute_backend) : "unknown"); + ggml_backend_buffer_free(block.buffer); + block.buffer = nullptr; + } + if (block.staging_ctx != nullptr) { + ggml_free(block.staging_ctx); + block.staging_ctx = nullptr; + } + block.staged_tensors.clear(); +} + +void ModelManager::release_compute_staging_blocks(bool force, + const std::unordered_set* target_states) { + for (auto it = compute_staging_blocks_.begin(); it != compute_staging_blocks_.end();) { + ComputeStagingBlock* block = it->get(); + bool can_release = force; + if (!can_release) { + can_release = std::all_of(block->staged_tensors.begin(), + block->staged_tensors.end(), + [target_states](const std::pair& pair) { + TensorState* state = pair.first; + if (state == nullptr) { + return true; + } + if (target_states != nullptr && + target_states->find(state) == target_states->end()) { + return false; + } + return state->active_prepare_count == 0; + }); + } + + if (can_release) { + free_compute_staging_block(*block); + it = compute_staging_blocks_.erase(it); + } else { + ++it; + } + } +} + +void ModelManager::free_params_storage_block(ParamsStorageBlock& block) { + if (block.buffer != nullptr) { + LOG_DEBUG("model manager releasing params backend buffer (%6.2f MB, %zu tensors, %s)", + ggml_backend_buffer_get_size(block.buffer) / (1024.f * 1024.f), + block.states.size(), + ggml_backend_buffer_is_host(block.buffer) ? 
"RAM" : "VRAM"); + ggml_backend_buffer_free(block.buffer); + block.buffer = nullptr; + } + block.mmap_tensor_stores.clear(); + + for (TensorState* state : block.states) { + if (state == nullptr || state->tensor == nullptr) { + continue; + } + state->tensor->buffer = nullptr; + state->tensor->data = nullptr; + state->tensor->extra = nullptr; + + state->loaded_to_params_backend = false; + state->applied_lora_epoch = UINT64_MAX; + } + block.states.clear(); +} + +void ModelManager::release_params_storage_blocks(bool force, + const std::unordered_set* target_states) { + for (auto it = params_storage_blocks_.begin(); it != params_storage_blocks_.end();) { + ParamsStorageBlock* block = it->get(); + bool can_release = force; + if (!can_release) { + can_release = std::all_of(block->states.begin(), + block->states.end(), + [target_states](TensorState* state) { + if (state == nullptr) { + return true; + } + if (target_states != nullptr && + target_states->find(state) == target_states->end()) { + return false; + } + return state->active_prepare_count == 0 && + !state->staged_to_compute_backend && + state->residency_mode == ResidencyMode::Disk; + }); + } + + if (can_release) { + free_params_storage_block(*block); + it = params_storage_blocks_.erase(it); + } else { + ++it; + } + } +} + +void ModelManager::erase_params_storage_block(ParamsStorageBlock* block) { + auto it = std::find_if(params_storage_blocks_.begin(), + params_storage_blocks_.end(), + [block](const std::unique_ptr& item) { + return item.get() == block; + }); + if (it != params_storage_blocks_.end()) { + params_storage_blocks_.erase(it); + } +} + +void ModelManager::release_all() { + for (auto& state : tensor_states_) { + state->active_prepare_count = 0; + state->applied_lora_epoch = UINT64_MAX; + } + release_compute_staging_blocks(true); + release_params_storage_blocks(true); +} + +bool ModelManager::resolve_required_tensor_states(const std::vector& tensors, + std::vector& required_states) const { + 
required_states.clear(); + std::unordered_set seen; + for (ggml_tensor* tensor : tensors) { + if (tensor == nullptr) { + continue; + } + const char* raw_name = ggml_get_name(tensor); + if (raw_name == nullptr || raw_name[0] == '\0') { + LOG_ERROR("model manager unnamed tensor is not registered"); + return false; + } + auto state_it = tensor_states_by_name_.find(raw_name); + if (state_it == tensor_states_by_name_.end()) { + LOG_ERROR("model manager tensor '%s' is not registered", raw_name); + return false; + } + TensorState* state = state_it->second; + if (state == nullptr) { + LOG_ERROR("model manager tensor '%s' has no tensor state", raw_name); + return false; + } + if (seen.insert(state).second) { + required_states.push_back(state); + } + } + return true; +} + +bool ModelManager::prepare_params(const std::vector& tensors) { + if (tensors.empty()) { + return true; + } + + std::vector required_states; + if (!resolve_required_tensor_states(tensors, required_states)) { + return false; + } + + if (!load_tensors_to_params_backend(required_states)) { + return false; + } + + if (!stage_tensors_to_compute_backend(required_states)) { + release_compute_staging_blocks(false); + release_params_storage_blocks(false); + return false; + } + + if (!apply_loras_to_params(required_states)) { + release_compute_staging_blocks(false); + release_params_storage_blocks(false); + return false; + } + + for (TensorState* state : required_states) { + if (state == nullptr) { + continue; + } + state->active_prepare_count++; + } + return true; +} + +void ModelManager::finish_compute_backend_usage(const std::vector& states) { + if (states.empty()) { + return; + } + + std::unordered_set target_states; + for (TensorState* state : states) { + if (state == nullptr || !target_states.insert(state).second) { + continue; + } + if (state->active_prepare_count > 0) { + state->active_prepare_count--; + } + } + release_compute_staging_blocks(false, &target_states); +} + +void 
ModelManager::release_compute_backend_params(const std::vector& tensors) { + if (tensors.empty()) { + return; + } + std::vector required_states; + if (!resolve_required_tensor_states(tensors, required_states)) { + return; + } + finish_compute_backend_usage(required_states); +} + +void ModelManager::release_params_backend_params(const std::vector& tensors) { + if (tensors.empty()) { + return; + } + std::vector required_states; + if (!resolve_required_tensor_states(tensors, required_states)) { + return; + } + if (required_states.empty()) { + return; + } + std::unordered_set target_states(required_states.begin(), required_states.end()); + release_params_storage_blocks(false, &target_states); +} diff --git a/src/model_manager.h b/src/model_manager.h new file mode 100644 index 00000000..b3da8a36 --- /dev/null +++ b/src/model_manager.h @@ -0,0 +1,131 @@ +#ifndef __MODEL_MANAGER_H__ +#define __MODEL_MANAGER_H__ + +#include +#include +#include +#include +#include +#include +#include + +#include "model_loader.h" +#include "weight_manager.h" + +class ModelManager : public RunnerWeightManager { +public: + enum class ResidencyMode { + Disk, + Resident, + }; + + struct LoraSpec { + std::string path; + float multiplier = 1.0f; + bool is_high_noise = false; + std::string tensor_name_prefix_filter; + bool required = false; + }; + +private: + struct TensorState { + std::string name; + ggml_tensor* tensor = nullptr; + std::string desc; + + ResidencyMode residency_mode = ResidencyMode::Resident; + ggml_backend_t compute_backend = nullptr; + ggml_backend_t params_backend = nullptr; + bool metadata_validated = false; + + int active_prepare_count = 0; + + bool loaded_to_params_backend = false; + bool staged_to_compute_backend = false; + uint64_t applied_lora_epoch = UINT64_MAX; + }; + + struct ParamsStorageBlock { + ggml_backend_buffer_t buffer = nullptr; + std::vector mmap_tensor_stores; + std::vector states; + }; + + struct ComputeStagingBlock { + ggml_backend_t compute_backend = 
nullptr; + ggml_backend_buffer_t buffer = nullptr; + ggml_context* staging_ctx = nullptr; + std::vector> staged_tensors; + }; + + ModelLoader model_loader_; + std::vector> tensor_states_; + std::map tensor_states_by_name_; + std::vector> params_storage_blocks_; + std::vector> compute_staging_blocks_; + std::set common_ignore_tensors_; + std::vector loras_; + SDVersion lora_version_ = VERSION_COUNT; + uint64_t current_lora_epoch_ = 0; + int n_threads_ = 0; + bool enable_mmap_ = false; + + void finish_compute_backend_usage(const std::vector& states); + void release_all(); + + bool resolve_required_tensor_states(const std::vector& tensors, + std::vector& required_states) const; + bool should_ignore(const TensorState& state) const; + bool is_optional_missing_tensor(const std::string& name) const; + bool validate_tensor(const TensorState& state) const; + + bool load_tensors_to_params_backend(const std::vector& states); + bool apply_loras_to_params(const std::vector& states); + bool mmap_params(const std::vector& states, + std::vector& created_storage_blocks); + bool can_mmap_storage(const TensorState& state) const; + bool alloc_params_buffers(const std::vector& states, + std::vector& created_storage_blocks); + bool load_tensors(const std::vector& states); + bool stage_tensors_to_compute_backend(const std::vector& states); + + ggml_backend_buffer_type_t params_buffer_type_for(const TensorState& state) const; + void release_compute_staging_blocks(bool force = false, + const std::unordered_set* target_states = nullptr); + void release_params_storage_blocks(bool force = false, + const std::unordered_set* target_states = nullptr); + void free_compute_staging_block(ComputeStagingBlock& block); + void free_params_storage_block(ParamsStorageBlock& block); + void erase_params_storage_block(ParamsStorageBlock* block); + void reset_lora_applied_params(); + +public: + ~ModelManager() override; + + ModelLoader& loader() { return model_loader_; } + const ModelLoader& loader() const { 
return model_loader_; } + + void set_n_threads(int n_threads) { + n_threads_ = n_threads; + model_loader_.set_n_threads(n_threads); + } + void set_enable_mmap(bool enable_mmap) { enable_mmap_ = enable_mmap; } + void set_common_ignore_tensors(std::set ignore_tensors); + void set_loras(std::vector loras, SDVersion version); + + std::set tensor_names() const; + + bool register_param_tensors(const std::string& desc, + std::map tensors, + ResidencyMode residency_mode, + ggml_backend_t compute_backend, + ggml_backend_t params_backend, + size_t* registered_tensor_size = nullptr); + bool validate_registered_tensors(); + + bool prepare_params(const std::vector& tensors) override; + void release_compute_backend_params(const std::vector& tensors) override; + void release_params_backend_params(const std::vector& tensors) override; +}; + +#endif // __MODEL_MANAGER_H__ diff --git a/src/runtime/guidance.cpp b/src/runtime/guidance.cpp index a83680cd..f925b4b8 100644 --- a/src/runtime/guidance.cpp +++ b/src/runtime/guidance.cpp @@ -172,8 +172,8 @@ namespace sd::guidance { momentum_buffer_ = deltas; } - float diff_norm = 0.0f; - const int standard_res = 2 * 1024 / 8; // Use SDXL as the standard resolution (1024x1024, 8x8 patches, 4=2x2 channels) + float diff_norm = 0.0f; + const int standard_res = 2 * 1024 / 8; // Use SDXL as the standard resolution (1024x1024, 8x8 patches, 4=2x2 channels) if (params_.norm_threshold > 0.0f) { diff_norm = std::sqrt((deltas * deltas).sum()) * standard_res / std::sqrt(static_cast(deltas.numel())); } diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 916a93f2..04f0598c 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include "core/ggml_extend.hpp" #include "core/ggml_graph_cut.h" @@ -11,6 +12,7 @@ #include "core/rng_philox.hpp" #include "core/util.h" #include "model_loader.h" +#include "model_manager.h" #include "stable-diffusion.h" #include 
"conditioning/conditioner.hpp" @@ -157,7 +159,6 @@ static float get_cache_reuse_threshold(const sd_cache_params_t& params) { class StableDiffusionGGML { public: - std::vector mmap_tensor_store; SDBackendManager backend_manager; SDVersion version; @@ -182,14 +183,13 @@ public: std::shared_ptr audio_vae_model; std::shared_ptr control_net; std::vector> generation_extensions; - std::vector> cond_stage_lora_models; - std::vector> diffusion_lora_models; - std::vector> first_stage_lora_models; + std::vector> runtime_lora_models; bool apply_lora_immediately = false; std::string taesd_path; sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0, 0, nullptr}; bool offload_params_to_cpu = false; + bool enable_mmap = false; float max_vram = 0.f; bool stream_layers = false; std::string backend_spec; @@ -198,12 +198,10 @@ public: bool is_using_v_parameterization = false; bool is_using_edm_v_parameterization = false; - std::map tensors; - - // lora_name => multiplier - std::unordered_map curr_lora_state; + std::shared_ptr model_manager; std::shared_ptr denoiser = std::make_shared(); + std::vector file_alphas_cumprod; StableDiffusionGGML() = default; @@ -232,6 +230,28 @@ public: return params_backend_for(module) != nullptr; } + template + bool register_runner_params(const std::string& desc, + const std::shared_ptr& model, + SDBackendModule module, + size_t* params_mem_size = nullptr) { + if (model == nullptr) { + return true; + } + std::map group_tensors; + model->get_param_tensors(group_tensors); + model->set_weight_manager(model_manager); + if (model_manager == nullptr) { + return true; + } + return model_manager->register_param_tensors(desc, + std::move(group_tensors), + free_params_immediately ? 
ModelManager::ResidencyMode::Disk : ModelManager::ResidencyMode::Resident, + backend_for(module), + params_backend_for(module), + params_mem_size); + } + bool init_backend(const sd_ctx_params_t* sd_ctx_params) { std::string error; if (!backend_manager.init(sd_ctx_params->backend, @@ -257,11 +277,53 @@ public: } } + void refresh_compvis_denoiser_sigmas() { + auto comp_vis_denoiser = std::dynamic_pointer_cast(denoiser); + if (!comp_vis_denoiser) { + return; + } + std::vector alphas_cumprod(TIMESTEPS); + if (file_alphas_cumprod.size() == TIMESTEPS) { + alphas_cumprod = file_alphas_cumprod; + } else { + calculate_alphas_cumprod(alphas_cumprod.data()); + } + for (int i = 0; i < TIMESTEPS; i++) { + comp_vis_denoiser->sigmas[i] = std::sqrt((1 - alphas_cumprod[i]) / alphas_cumprod[i]); + comp_vis_denoiser->log_sigmas[i] = std::log(comp_vis_denoiser->sigmas[i]); + } + } + + void load_alphas_cumprod(ModelLoader& model_loader) { + file_alphas_cumprod.clear(); + + std::vector loaded_alphas; + if (!model_loader.load_float_tensor("alphas_cumprod", loaded_alphas, n_threads, enable_mmap)) { + return; + } + if (loaded_alphas.size() != TIMESTEPS) { + LOG_WARN("ignore alphas_cumprod from model file: expected %d values, got %zu", + TIMESTEPS, + loaded_alphas.size()); + return; + } + for (float alpha : loaded_alphas) { + if (!std::isfinite(alpha) || alpha <= 0.0f || alpha > 1.0f) { + LOG_WARN("ignore invalid alphas_cumprod from model file"); + return; + } + } + + file_alphas_cumprod = std::move(loaded_alphas); + LOG_DEBUG("loaded alphas_cumprod from model file"); + } + bool init(const sd_ctx_params_t* sd_ctx_params) { n_threads = sd_ctx_params->n_threads; vae_decode_only = sd_ctx_params->vae_decode_only; free_params_immediately = sd_ctx_params->free_params_immediately; offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu; + enable_mmap = sd_ctx_params->enable_mmap; max_vram = sd_ctx_params->max_vram; stream_layers = sd_ctx_params->stream_layers; backend_spec = 
SAFE_STR(sd_ctx_params->backend); @@ -276,8 +338,9 @@ public: stream_layers = false; } - bool use_tae = false; - bool use_audio_vae = false; + bool use_tae = false; + bool use_audio_vae = false; + bool use_control_net = false; rng = get_rng(sd_ctx_params->rng_type); if (sd_ctx_params->sampler_rng_type != RNG_TYPE_COUNT && sd_ctx_params->sampler_rng_type != sd_ctx_params->rng_type) { @@ -293,7 +356,10 @@ public: } max_vram = sd::ggml_graph_cut::resolve_max_vram_gib(max_vram, backend_for(SDBackendModule::DIFFUSION)); - ModelLoader model_loader; + model_manager = std::make_shared(); + model_manager->set_n_threads(n_threads); + model_manager->set_enable_mmap(enable_mmap); + ModelLoader& model_loader = model_manager->loader(); if (strlen(SAFE_STR(sd_ctx_params->model_path)) > 0) { LOG_INFO("loading model from '%s'", sd_ctx_params->model_path); @@ -403,6 +469,15 @@ public: } } + if (strlen(SAFE_STR(sd_ctx_params->control_net_path)) > 0) { + if (!model_loader.init_from_file(sd_ctx_params->control_net_path)) { + LOG_ERROR("init control net model loader from file failed: '%s'", sd_ctx_params->control_net_path); + return false; + } else { + use_control_net = true; + } + } + model_loader.convert_tensors_name(); version = model_loader.get_sd_version(); @@ -421,6 +496,7 @@ public: if (wtype != GGML_TYPE_COUNT || tensor_type_rules.size() > 0) { model_loader.set_wtype_override(wtype, tensor_type_rules); } + model_loader.process_model_files(enable_mmap, true); std::map wtype_stat = model_loader.get_wtype_stat(); std::map conditioner_wtype_stat = model_loader.get_conditioner_wtype_stat(); @@ -474,46 +550,16 @@ public: apply_lora_immediately = false; } - std::map mmap_able_tensors; - bool enable_mmap_tensors = false; - bool needs_writable_mmap = false; - if (sd_ctx_params->enable_mmap) { - if (apply_lora_immediately) { - needs_writable_mmap = true; - LOG_WARN("in mode 'immediately', LoRAs will cause extra memory usage with mmap"); - } - enable_mmap_tensors = true; + if (enable_mmap 
&& apply_lora_immediately) { + LOG_WARN("in mode 'immediately', LoRAs will cause extra memory usage with mmap"); } + load_alphas_cumprod(model_loader); - // split definition to avoid msvc choking on the extra parameter handling - auto module_can_mmap = [&](SDBackendModule module) { - return enable_mmap_tensors && - (backend_manager.runtime_backend_is_cpu(module) || - backend_manager.params_backend_is_cpu(module) || - backend_manager.runtime_backend_supports_host_buffer(module)); - }; - - auto get_param_tensors_p = [&](auto&& model, bool do_mmap, const char* prefix) { - std::map temp; - model->get_param_tensors(temp, prefix); - for (const auto& [key, tensor] : temp) { - tensors[key] = tensor; - if (do_mmap) { - mmap_able_tensors[key] = tensor; - } - } - }; - - auto get_param_tensors = [&](auto&& model, bool do_mmap) { - std::map temp; - model->get_param_tensors(temp); - for (const auto& [key, tensor] : temp) { - tensors[key] = tensor; - if (do_mmap) { - mmap_able_tensors[key] = tensor; - } - } - }; + size_t text_encoder_params_mem_size = 0; + size_t unet_params_mem_size = 0; + size_t vae_params_mem_size = 0; + size_t control_net_params_mem_size = 0; + size_t extension_params_mem_size = 0; if (sd_version_is_control(version)) { // Might need vae encode for control cond @@ -647,7 +693,11 @@ public: params_backend_for(SDBackendModule::CLIP_VISION), tensor_storage_map); clip_vision->set_max_graph_vram_bytes(max_graph_vram_bytes); - get_param_tensors(clip_vision, module_can_mmap(SDBackendModule::CLIP_VISION)); + if (!register_runner_params("CLIP vision", + clip_vision, + SDBackendModule::CLIP_VISION)) { + return false; + } } } else if (sd_version_is_qwen_image(version)) { bool enable_vision = false; @@ -749,11 +799,21 @@ public: } cond_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes); - get_param_tensors(cond_stage_model, module_can_mmap(SDBackendModule::TE)); + if (!register_runner_params("Conditioner model", + cond_stage_model, + SDBackendModule::TE, + 
&text_encoder_params_mem_size)) { + return false; + } diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes); diffusion_model->set_stream_layers_enabled(stream_layers); - get_param_tensors(diffusion_model, module_can_mmap(SDBackendModule::DIFFUSION)); + if (!register_runner_params("Diffusion model", + diffusion_model, + SDBackendModule::DIFFUSION, + &unet_params_mem_size)) { + return false; + } if (sd_version_is_unet_edit(version)) { vae_decode_only = false; @@ -762,7 +822,12 @@ public: if (high_noise_diffusion_model) { high_noise_diffusion_model->set_max_graph_vram_bytes(max_graph_vram_bytes); high_noise_diffusion_model->set_stream_layers_enabled(stream_layers); - get_param_tensors(high_noise_diffusion_model, module_can_mmap(SDBackendModule::DIFFUSION)); + if (!register_runner_params("High noise diffusion model", + high_noise_diffusion_model, + SDBackendModule::DIFFUSION, + &unet_params_mem_size)) { + return false; + } } if (!ensure_backend_pair(SDBackendModule::VAE)) { @@ -840,28 +905,47 @@ public: } }; - bool vae_mmap = module_can_mmap(SDBackendModule::VAE); - if (version == VERSION_CHROMA_RADIANCE || version == VERSION_HIDREAM_O1) { LOG_INFO("using FakeVAE"); first_stage_model = std::make_shared(version, backend_for(SDBackendModule::VAE), params_backend_for(SDBackendModule::VAE)); + if (!register_runner_params("VAE", + first_stage_model, + SDBackendModule::VAE, + &vae_params_mem_size)) { + return false; + } } else if (use_tae && !tae_preview_only) { LOG_INFO("using TAE for encoding / decoding"); first_stage_model = create_tae(); first_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes); - get_param_tensors_p(first_stage_model, vae_mmap, "tae"); + if (!register_runner_params("VAE", + first_stage_model, + SDBackendModule::VAE, + &vae_params_mem_size)) { + return false; + } } else { LOG_INFO("using VAE for encoding / decoding"); first_stage_model = create_vae(); first_stage_model->set_max_graph_vram_bytes(max_graph_vram_bytes); - 
get_param_tensors_p(first_stage_model, vae_mmap, "first_stage_model"); + if (!register_runner_params("VAE", + first_stage_model, + SDBackendModule::VAE, + &vae_params_mem_size)) { + return false; + } if (use_tae && tae_preview_only) { LOG_INFO("using TAE for preview"); preview_vae = create_tae(); preview_vae->set_max_graph_vram_bytes(max_graph_vram_bytes); - get_param_tensors_p(preview_vae, vae_mmap, "tae"); + if (!register_runner_params("preview VAE", + preview_vae, + SDBackendModule::VAE, + &vae_params_mem_size)) { + return false; + } } } @@ -869,7 +953,12 @@ public: audio_vae_model = std::make_shared(backend_for(SDBackendModule::VAE), params_backend_for(SDBackendModule::VAE), tensor_storage_map); - get_param_tensors_p(audio_vae_model, vae_mmap, ""); + if (!register_runner_params("LTX audio VAE", + audio_vae_model, + SDBackendModule::VAE, + &vae_params_mem_size)) { + return false; + } } if (sd_ctx_params->vae_conv_direct) { @@ -880,18 +969,24 @@ public: } } - if (strlen(SAFE_STR(sd_ctx_params->control_net_path)) > 0) { + if (use_control_net) { if (!ensure_backend_pair(SDBackendModule::CONTROL_NET)) { return false; } control_net = std::make_shared(backend_for(SDBackendModule::CONTROL_NET), params_backend_for(SDBackendModule::CONTROL_NET), - tensor_storage_map, + model_loader.get_tensor_storage_map(), version); if (sd_ctx_params->diffusion_conv_direct) { LOG_INFO("Using Conv2d direct in the control net"); control_net->set_conv2d_direct_enabled(true); } + if (!register_runner_params("ControlNet", + control_net, + SDBackendModule::CONTROL_NET, + &control_net_params_mem_size)) { + return false; + } } { @@ -914,14 +1009,12 @@ public: generation_extensions.push_back(photomaker_extension); } } - { - GenerationExtensionTensorContext extension_tensor_ctx{ - tensors, - mmap_able_tensors, - module_can_mmap, - }; - for (auto& extension : generation_extensions) { - extension->collect_param_tensors(extension_tensor_ctx); + for (auto& extension : generation_extensions) { + if 
(!register_runner_params(extension->name(), + extension, + SDBackendModule::PHOTOMAKER, + &extension_params_mem_size)) { + return false; } } @@ -958,21 +1051,9 @@ public: circular_y = sd_ctx_params->circular_y; } - ggml_init_params params; - params.mem_size = static_cast(10 * 1024) * 1024; // 10M - params.mem_buffer = nullptr; - params.no_alloc = false; - // LOG_DEBUG("mem_size %u ", params.mem_size); - ggml_context* ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check - GGML_ASSERT(ctx != nullptr); - ggml_tensor* alphas_cumprod_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, TIMESTEPS); - calculate_alphas_cumprod((float*)alphas_cumprod_tensor->data); - - // load weights - LOG_DEBUG("loading weights"); + LOG_DEBUG("validating model metadata"); std::set ignore_tensors; - tensors["alphas_cumprod"] = alphas_cumprod_tensor; if (use_tae && !tae_preview_only) { ignore_tensors.insert("first_stage_model."); } @@ -1021,93 +1102,15 @@ public: ignore_tensors.insert("model.visual.deepstack_merger_list."); } - if (enable_mmap_tensors) { - if (mmap_able_tensors.empty()) { - LOG_DEBUG("no tensors could be memory-mapped"); - } else { - mmap_tensor_store = model_loader.mmap_tensors(mmap_able_tensors, ignore_tensors, needs_writable_mmap); - } - } - - if (clip_vision && !clip_vision->alloc_params_buffer()) { - LOG_ERROR("CLIP vision params buffer allocation failed"); - ggml_free(ctx); - return false; - } - if (cond_stage_model && !cond_stage_model->alloc_params_buffer()) { - LOG_ERROR("Conditioner model params buffer allocation failed"); - ggml_free(ctx); - return false; - } - if (diffusion_model && !diffusion_model->alloc_params_buffer()) { - LOG_ERROR("Diffusion model params buffer allocation failed"); - ggml_free(ctx); - return false; - } - if (high_noise_diffusion_model && !high_noise_diffusion_model->alloc_params_buffer()) { - LOG_ERROR("High noise diffusion model params buffer allocation failed"); - ggml_free(ctx); - return false; - } - if 
(first_stage_model && !first_stage_model->alloc_params_buffer()) { - LOG_ERROR("VAE params buffer allocation failed"); - ggml_free(ctx); - return false; - } - if (preview_vae && !preview_vae->alloc_params_buffer()) { - LOG_ERROR("preview VAE params buffer allocation failed"); - ggml_free(ctx); - return false; - } - if (audio_vae_model && !audio_vae_model->alloc_params_buffer()) { - LOG_ERROR("LTX audio VAE params buffer allocation failed"); - ggml_free(ctx); - return false; - } - for (auto& extension : generation_extensions) { - if (!extension->alloc_params_buffer()) { - LOG_ERROR("%s params buffer allocation failed", extension->name()); - ggml_free(ctx); - return false; - } - } - - bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->enable_mmap); - if (!success) { - LOG_ERROR("load tensors from model loader failed"); - ggml_free(ctx); + model_manager->set_common_ignore_tensors(ignore_tensors); + if (!model_manager->validate_registered_tensors()) { + LOG_ERROR("model metadata validation failed"); return false; } - LOG_DEBUG("finished loaded file"); + LOG_DEBUG("model metadata validated; weights will be prepared lazily"); { - size_t clip_params_mem_size = cond_stage_model->get_params_buffer_size(); - size_t unet_params_mem_size = diffusion_model->get_params_buffer_size(); - if (high_noise_diffusion_model) { - unet_params_mem_size += high_noise_diffusion_model->get_params_buffer_size(); - } - size_t vae_params_mem_size = 0; - vae_params_mem_size = first_stage_model->get_params_buffer_size(); - if (preview_vae) { - vae_params_mem_size += preview_vae->get_params_buffer_size(); - } - if (audio_vae_model) { - vae_params_mem_size += audio_vae_model->get_params_buffer_size(); - } - size_t control_net_params_mem_size = 0; - if (control_net) { - if (!control_net->load_from_file(SAFE_STR(sd_ctx_params->control_net_path), n_threads)) { - ggml_free(ctx); - return false; - } - control_net_params_mem_size = 
control_net->get_params_buffer_size(); - } - size_t extension_params_mem_size = 0; - for (auto& extension : generation_extensions) { - extension_params_mem_size += extension->get_params_buffer_size(); - } - size_t total_params_ram_size = 0; size_t total_params_vram_size = 0; auto add_params_memory = [&](size_t size, SDBackendModule module) { @@ -1136,12 +1139,11 @@ public: return sd_backend_is_cpu(module_backend) ? "RAM" : "VRAM"; }; - if (!add_params_memory(clip_params_mem_size, SDBackendModule::TE) || + if (!add_params_memory(text_encoder_params_mem_size, SDBackendModule::TE) || !add_params_memory(extension_params_mem_size, SDBackendModule::PHOTOMAKER) || !add_params_memory(unet_params_mem_size, SDBackendModule::DIFFUSION) || !add_params_memory(vae_params_mem_size, SDBackendModule::VAE) || !add_params_memory(control_net_params_mem_size, SDBackendModule::CONTROL_NET)) { - ggml_free(ctx); return false; } @@ -1152,8 +1154,8 @@ public: total_params_size / 1024.0 / 1024.0, total_params_vram_size / 1024.0 / 1024.0, total_params_ram_size / 1024.0 / 1024.0, - clip_params_mem_size / 1024.0 / 1024.0, - params_memory_location(clip_params_mem_size, SDBackendModule::TE), + text_encoder_params_mem_size / 1024.0 / 1024.0, + params_memory_location(text_encoder_params_mem_size, SDBackendModule::TE), unet_params_mem_size / 1024.0 / 1024.0, params_memory_location(unet_params_mem_size, SDBackendModule::DIFFUSION), vae_params_mem_size / 1024.0 / 1024.0, @@ -1170,12 +1172,7 @@ public: if (pred_type == PREDICTION_COUNT) { if (sd_version_is_sd2(version)) { - // check is_using_v_parameterization_for_sd2 - if (is_using_v_parameterization_for_sd2(sd_version_is_inpaint(version))) { - pred_type = V_PRED; - } else { - pred_type = EPS_PRED; - } + pred_type = is_using_v_parameterization_for_sd2(sd_version_is_inpaint(version)) ? 
V_PRED : EPS_PRED; } else if (sd_version_is_sdxl(version)) { if (tensor_storage_map.find("edm_vpred.sigma_max") != tensor_storage_map.end()) { // CosXL models @@ -1268,25 +1265,27 @@ public: } default: { LOG_ERROR("Unknown predition type %i", pred_type); - ggml_free(ctx); return false; } } - auto comp_vis_denoiser = std::dynamic_pointer_cast(denoiser); - if (comp_vis_denoiser) { - for (int i = 0; i < TIMESTEPS; i++) { - comp_vis_denoiser->sigmas[i] = std::sqrt((1 - ((float*)alphas_cumprod_tensor->data)[i]) / ((float*)alphas_cumprod_tensor->data)[i]); - comp_vis_denoiser->log_sigmas[i] = std::log(comp_vis_denoiser->sigmas[i]); - } - } + refresh_compvis_denoiser_sigmas(); } - ggml_free(ctx); return true; } bool is_using_v_parameterization_for_sd2(bool is_inpaint = false) { + struct RunnerDoneOnExit { + GGMLRunner* runner = nullptr; + ~RunnerDoneOnExit() { + if (runner != nullptr) { + runner->runner_done(); + } + } + }; + RunnerDoneOnExit diffusion_runner_done{diffusion_model.get()}; + sd::Tensor x_t = sd::full({8, 8, 4, 1}, 0.5f); sd::Tensor c = sd::full({1024, 2, 1, 1}, 0.5f); sd::Tensor steps = sd::full({1}, 999.0f); @@ -1308,7 +1307,6 @@ public: auto out_opt = diffusion_model->compute(n_threads, diffusion_params); GGML_ASSERT(!out_opt.empty()); out = std::move(out_opt); - diffusion_model->free_compute_buffer(); double result = static_cast((out - x_t).mean()); int64_t t1 = ggml_time_ms(); @@ -1316,84 +1314,41 @@ public: return result < -1; } - std::shared_ptr load_lora_model_from_file(const std::string& lora_id, - float multiplier, - SDBackendModule module, - LoraModel::filter_t lora_tensor_filter = nullptr) { - std::string lora_path = lora_id; - static std::string high_noise_tag = "|high_noise|"; - bool is_high_noise = false; - if (starts_with(lora_path, high_noise_tag)) { - lora_path = lora_path.substr(high_noise_tag.size()); - is_high_noise = true; - LOG_DEBUG("high noise lora: %s", lora_path.c_str()); - } + static std::string lora_log_id(const 
ModelManager::LoraSpec& lora) { + return lora.is_high_noise ? "|high_noise|" + lora.path : lora.path; + } + + std::shared_ptr load_lora_model(const ModelManager::LoraSpec& lora_spec, + SDBackendModule module, + LoraModel::filter_t module_filter = nullptr) { if (!ensure_backend_pair(module)) { return nullptr; } - auto lora = std::make_shared(lora_id, + if (lora_spec.is_high_noise) { + LOG_DEBUG("high noise lora: %s", lora_spec.path.c_str()); + } + auto lora = std::make_shared(lora_log_id(lora_spec), backend_for(module), backend_for(module), - lora_path, - is_high_noise ? "model.high_noise_" : "", + lora_spec.path, + lora_spec.is_high_noise ? "model.high_noise_" : "", version); + LoraModel::filter_t lora_tensor_filter = module_filter; + if (!lora_spec.tensor_name_prefix_filter.empty()) { + lora_tensor_filter = [module_filter, prefix = lora_spec.tensor_name_prefix_filter](const std::string& tensor_name) { + return starts_with(tensor_name, prefix) && (!module_filter || module_filter(tensor_name)); + }; + } if (!lora->load_from_file(n_threads, lora_tensor_filter)) { - LOG_WARN("load lora tensors from %s failed", lora_path.c_str()); + LOG_WARN("load lora tensors from %s failed", lora_spec.path.c_str()); return nullptr; } - lora->multiplier = multiplier; + lora->multiplier = lora_spec.multiplier; return lora; } - void apply_loras_immediately(const std::unordered_map& lora_state) { - std::unordered_map lora_state_diff; - for (auto& kv : lora_state) { - const std::string& lora_name = kv.first; - float multiplier = kv.second; - lora_state_diff[lora_name] += multiplier; - } - for (auto& kv : curr_lora_state) { - const std::string& lora_name = kv.first; - float curr_multiplier = kv.second; - lora_state_diff[lora_name] -= curr_multiplier; - } - - if (lora_state_diff.empty()) { - return; - } - - LOG_INFO("apply lora immediately"); - - size_t rm = lora_state_diff.size() - lora_state.size(); - if (rm != 0) { - LOG_INFO("attempting to apply %lu LoRAs (removing %lu applied LoRAs)", 
lora_state.size(), rm); - } else { - LOG_INFO("attempting to apply %lu LoRAs", lora_state.size()); - } - - for (auto& kv : lora_state_diff) { - int64_t t0 = ggml_time_ms(); - - auto lora = load_lora_model_from_file(kv.first, kv.second, SDBackendModule::DIFFUSION); - if (!lora || lora->lora_tensors.empty()) { - continue; - } - lora->apply(tensors, version, n_threads); - lora->free_params_buffer(); - - int64_t t1 = ggml_time_ms(); - - LOG_INFO("lora '%s' applied, taking %.2fs", kv.first.c_str(), (t1 - t0) * 1.0f / 1000); - } - - curr_lora_state = lora_state; - } - - void apply_loras_at_runtime(const std::unordered_map& lora_state) { - cond_stage_lora_models.clear(); - diffusion_lora_models.clear(); - first_stage_lora_models.clear(); + void clear_lora_adapters() { if (cond_stage_model) { cond_stage_model->set_weight_adapter(nullptr); } @@ -1406,39 +1361,74 @@ public: if (first_stage_model) { first_stage_model->set_weight_adapter(nullptr); } - if (lora_state.empty()) { + } + + std::vector> load_runtime_loras_for_module(const std::vector& loras, + const std::set& model_tensor_names, + SDBackendModule module, + LoraModel::filter_t module_filter = nullptr) { + std::vector> module_lora_models; + for (const auto& lora_spec : loras) { + auto lora = load_lora_model(lora_spec, module, module_filter); + if (lora == nullptr) { + if (lora_spec.required) { + LOG_ERROR("required lora load failed: %s", lora_spec.path.c_str()); + } + continue; + } + if (lora->lora_tensors.empty()) { + continue; + } + + lora->preprocess_lora_tensors(model_tensor_names); + runtime_lora_models.push_back(lora); + module_lora_models.push_back(std::move(lora)); + } + return module_lora_models; + } + + void apply_loras_immediately(const std::vector& loras) { + if (model_manager == nullptr) { + if (!loras.empty()) { + LOG_WARN("model manager is not available for immediate lora"); + } return; } + + clear_lora_adapters(); + runtime_lora_models.clear(); + + model_manager->set_loras(loras, version); + } + + void 
apply_loras_at_runtime(const std::vector& loras) { + if (model_manager != nullptr) { + model_manager->set_loras({}, version); + } + runtime_lora_models.clear(); + clear_lora_adapters(); + if (loras.empty()) { + return; + } + + std::set model_tensor_names; + if (model_manager != nullptr) { + model_tensor_names = model_manager->tensor_names(); + } + LOG_INFO("apply lora at runtime"); if (cond_stage_model) { - std::vector> lora_models; - auto lora_state_diff = lora_state; - for (auto& lora_model : cond_stage_lora_models) { - auto iter = lora_state_diff.find(lora_model->lora_id); - - if (iter != lora_state_diff.end()) { - lora_model->multiplier = iter->second; - lora_models.push_back(lora_model); - lora_state_diff.erase(iter); - } - } - cond_stage_lora_models = lora_models; auto lora_tensor_filter = [&](const std::string& tensor_name) { if (is_cond_stage_model_name(tensor_name)) { return true; } return false; }; - for (auto& kv : lora_state_diff) { - const std::string& lora_id = kv.first; - float multiplier = kv.second; - - auto lora = load_lora_model_from_file(lora_id, multiplier, SDBackendModule::TE, lora_tensor_filter); - if (lora && !lora->lora_tensors.empty()) { - lora->preprocess_lora_tensors(tensors); - cond_stage_lora_models.push_back(lora); - } - } + auto cond_stage_lora_models = + load_runtime_loras_for_module(loras, + model_tensor_names, + SDBackendModule::TE, + lora_tensor_filter); // Only attach the adapter when there are LoRAs targeting the cond_stage model. // An empty MultiLoraAdapter still routes every linear/conv through // forward_with_lora() instead of the direct kernel path — slower for no benefit. 
@@ -1448,34 +1438,17 @@ public: } } if (diffusion_model) { - std::vector> lora_models; - auto lora_state_diff = lora_state; - for (auto& lora_model : diffusion_lora_models) { - auto iter = lora_state_diff.find(lora_model->lora_id); - - if (iter != lora_state_diff.end()) { - lora_model->multiplier = iter->second; - lora_models.push_back(lora_model); - lora_state_diff.erase(iter); - } - } - diffusion_lora_models = lora_models; auto lora_tensor_filter = [&](const std::string& tensor_name) { if (is_diffusion_model_name(tensor_name)) { return true; } return false; }; - for (auto& kv : lora_state_diff) { - const std::string& lora_name = kv.first; - float multiplier = kv.second; - - auto lora = load_lora_model_from_file(lora_name, multiplier, SDBackendModule::DIFFUSION, lora_tensor_filter); - if (lora && !lora->lora_tensors.empty()) { - lora->preprocess_lora_tensors(tensors); - diffusion_lora_models.push_back(lora); - } - } + auto diffusion_lora_models = + load_runtime_loras_for_module(loras, + model_tensor_names, + SDBackendModule::DIFFUSION, + lora_tensor_filter); if (!diffusion_lora_models.empty()) { auto multi_lora_adapter = std::make_shared(diffusion_lora_models); diffusion_model->set_weight_adapter(multi_lora_adapter); @@ -1486,34 +1459,17 @@ public: } if (first_stage_model) { - std::vector> lora_models; - auto lora_state_diff = lora_state; - for (auto& lora_model : first_stage_lora_models) { - auto iter = lora_state_diff.find(lora_model->lora_id); - - if (iter != lora_state_diff.end()) { - lora_model->multiplier = iter->second; - lora_models.push_back(lora_model); - lora_state_diff.erase(iter); - } - } - first_stage_lora_models = lora_models; auto lora_tensor_filter = [&](const std::string& tensor_name) { if (is_first_stage_model_name(tensor_name)) { return true; } return false; }; - for (auto& kv : lora_state_diff) { - const std::string& lora_name = kv.first; - float multiplier = kv.second; - - auto lora = load_lora_model_from_file(lora_name, multiplier, 
SDBackendModule::VAE, lora_tensor_filter); - if (lora && !lora->lora_tensors.empty()) { - lora->preprocess_lora_tensors(tensors); - first_stage_lora_models.push_back(lora); - } - } + auto first_stage_lora_models = + load_runtime_loras_for_module(loras, + model_tensor_names, + SDBackendModule::VAE, + lora_tensor_filter); if (!first_stage_lora_models.empty()) { auto multi_lora_adapter = std::make_shared(first_stage_lora_models); first_stage_model->set_weight_adapter(multi_lora_adapter); @@ -1522,46 +1478,42 @@ public: } void lora_stat() { - if (!cond_stage_lora_models.empty()) { - LOG_INFO("cond_stage_lora_models:"); - for (auto& lora_model : cond_stage_lora_models) { - lora_model->stat(); - } - } - - if (!diffusion_lora_models.empty()) { - LOG_INFO("diffusion_lora_models:"); - for (auto& lora_model : diffusion_lora_models) { - lora_model->stat(); - } - } - - if (!first_stage_lora_models.empty()) { - LOG_INFO("first_stage_lora_models:"); - for (auto& lora_model : first_stage_lora_models) { + if (!runtime_lora_models.empty()) { + LOG_INFO("runtime_lora_models:"); + for (auto& lora_model : runtime_lora_models) { lora_model->stat(); } } } void apply_loras(const sd_lora_t* loras, uint32_t lora_count) { - std::unordered_map lora_f2m; + std::vector all_loras; + all_loras.reserve(lora_count); for (uint32_t i = 0; i < lora_count; i++) { std::string lora_id = SAFE_STR(loras[i].path); + ModelManager::LoraSpec lora_spec; + lora_spec.path = lora_id; + lora_spec.multiplier = loras[i].multiplier; + lora_spec.is_high_noise = loras[i].is_high_noise; + all_loras.push_back(std::move(lora_spec)); if (loras[i].is_high_noise) { lora_id = "|high_noise|" + lora_id; } - lora_f2m[lora_id] = loras[i].multiplier; LOG_DEBUG("lora %s:%.2f", lora_id.c_str(), loras[i].multiplier); } + + for (auto& extension : generation_extensions) { + extension->collect_loras(all_loras); + } + int64_t t0 = ggml_time_ms(); if (apply_lora_immediately) { - apply_loras_immediately(lora_f2m); + 
apply_loras_immediately(all_loras); } else { - apply_loras_at_runtime(lora_f2m); + apply_loras_at_runtime(all_loras); } int64_t t1 = ggml_time_ms(); - if (!lora_f2m.empty()) { + if (!all_loras.empty()) { LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); } } @@ -1580,11 +1532,8 @@ public: cond_stage_model.get(), condition_params, pm_params, - tensors, - version, n_threads, total_steps, - free_params_immediately, }; for (auto& extension : generation_extensions) { @@ -1848,11 +1797,17 @@ public: sd_get_preview_mode()}; } - void report_sample_progress(int step, size_t total_steps, int64_t t0) { - int64_t t1 = ggml_time_us(); + void report_sample_progress(int step, size_t total_steps, int64_t* last_progress_us) { if (step > 0 || step == -(int)total_steps) { - int showstep = std::abs(step); - pretty_progress(showstep, (int)total_steps, (t1 - t0) / 1000000.f / showstep); + int64_t now = ggml_time_us(); + int showstep = std::abs(step); + float step_seconds = last_progress_us != nullptr && *last_progress_us > 0 + ? (now - *last_progress_us) / 1000000.f + : 0.f; + pretty_progress(showstep, (int)total_steps, step_seconds); + if (last_progress_us != nullptr) { + *last_progress_us = now; + } } } @@ -1906,6 +1861,18 @@ public: float frame_rate, const sd_cache_params_t* cache_params, const sd::Tensor& video_positions = {}) { + struct RunnerDoneOnExit { + GGMLRunner* runner = nullptr; + ~RunnerDoneOnExit() { + if (runner != nullptr) { + runner->runner_done(); + } + } + }; + RunnerDoneOnExit sample_diffusion_runner_done{work_diffusion_model.get()}; + + RunnerDoneOnExit sample_control_runner_done{!control_image.empty() && control_net != nullptr ? 
control_net.get() : nullptr}; + std::vector skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count); float cfg_scale = guidance.txt_cfg; float img_cfg_scale = guidance.img_cfg; @@ -1951,7 +1918,7 @@ public: noise *= eta; } - int64_t t0 = ggml_time_us(); + int64_t last_progress_us = ggml_time_us(); sd::Tensor x_t = !noise.empty() ? denoiser->noise_scaling(sigmas[0], noise, init_latent) : init_latent; @@ -1961,6 +1928,7 @@ public: auto denoise = [&](const sd::Tensor& x, float sigma, int step) -> sd::guidance::GuiderOutput { if (step == 1 || step == -1) { pretty_progress(0, (int)steps, 0); + last_progress_us = ggml_time_us(); } std::vector scaling = denoiser->get_scalings(sigma); @@ -1998,7 +1966,7 @@ public: if (sd_should_preview_denoised() && preview.callback != nullptr) { preview_image(step, denoised, version, preview.mode, preview.callback, preview.data, false); } - report_sample_progress(step, steps, t0); + report_sample_progress(step, steps, &last_progress_us); sd::guidance::GuiderOutput output; output.pred = denoised; return output; @@ -2181,7 +2149,7 @@ public: if (sd_should_preview_denoised() && preview.callback != nullptr) { preview_image(step, denoised, version, preview.mode, preview.callback, preview.data, false); } - report_sample_progress(step, steps, t0); + report_sample_progress(step, steps, &last_progress_us); output.pred = denoised; return output; }; @@ -2327,15 +2295,6 @@ public: if (sd_version_is_pid(version)) { return sd::ops::clamp((x + 1.f) * 0.5f, 0.0f, 1.0f); } - // Free resident diffusion params before VAE allocates its compute buffer. 
- if (stream_layers) { - if (diffusion_model) { - diffusion_model->release_streaming_residency(); - } - if (high_noise_diffusion_model) { - high_noise_diffusion_model->release_streaming_residency(); - } - } auto latents = first_stage_model->diffusion_to_vae_latents(x); first_stage_model->set_temporal_tiling_enabled(vae_tiling_params.temporal_tiling); return first_stage_model->decode(n_threads, latents, vae_tiling_params, decode_video, circular_x, circular_y); @@ -2364,9 +2323,6 @@ public: return {}; } auto waveform = audio_vae_model->decode(n_threads, audio_latent); - if (free_params_immediately) { - audio_vae_model->free_params_buffer(); - } return waveform; } @@ -3797,6 +3753,15 @@ struct ImageGenerationEmbeds { SDCondition img_uncond; }; +struct ConditionerRunnerDoneOnExit { + Conditioner* conditioner = nullptr; + ~ConditionerRunnerDoneOnExit() { + if (conditioner != nullptr) { + conditioner->runner_done(); + } + } +}; + struct CircularAxesState { bool circular_x = false; bool circular_y = false; @@ -4092,6 +4057,8 @@ static std::optional prepare_image_generation_embeds(sd_c GenerationRequest* request, SamplePlan* plan, ImageGenerationLatents* latents) { + ConditionerRunnerDoneOnExit conditioner_runner_done{sd_ctx->sd->cond_stage_model.get()}; + ConditionerParams condition_params; condition_params.text = request->prompt; condition_params.clip_skip = request->clip_skip; @@ -4163,10 +4130,6 @@ static std::optional prepare_image_generation_embeds(sd_c int64_t t1 = ggml_time_ms(); LOG_INFO("get_learned_condition completed, taking %.2fs", (t1 - prepare_start_ms) * 1.0f / 1000); - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->cond_stage_model->free_params_buffer(); - } - ImageGenerationEmbeds embeds; embeds.img_uncond = std::move(img_uncond); embeds.cond = std::move(cond); @@ -4191,9 +4154,6 @@ static sd_image_t* decode_image_outputs(sd_ctx_t* sd_ctx, sd::Tensor image = sd_ctx->sd->decode_first_stage(final_latents[i]); if (image.empty()) { 
LOG_ERROR("decode_first_stage failed for latent %" PRId64, i + 1); - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->first_stage_model->free_params_buffer(); - } return nullptr; } decoded_images.push_back(std::move(image)); @@ -4203,9 +4163,6 @@ static sd_image_t* decode_image_outputs(sd_ctx_t* sd_ctx, int64_t t4 = ggml_time_ms(); LOG_INFO("decode_first_stage completed, taking %.2fs", (t4 - t0) * 1.0f / 1000); - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->first_stage_model->free_params_buffer(); - } sd_image_t* result_images = (sd_image_t*)calloc(request.batch_count, sizeof(sd_image_t)); if (result_images == nullptr) { @@ -4479,14 +4436,8 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s b + 1, request.batch_count, (sampling_end - sampling_start) * 1.0f / 1000); - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->diffusion_model->free_params_buffer(); - } return nullptr; } - if (sd_ctx->sd->free_params_immediately && !request.hires.enabled) { - sd_ctx->sd->diffusion_model->free_params_buffer(); - } int64_t denoise_end = ggml_time_ms(); LOG_INFO("generating %zu latent images completed, taking %.2fs", final_latents.size(), @@ -4509,9 +4460,6 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s sd_ctx->sd->offload_params_to_cpu, sd_ctx->sd->n_threads)) { LOG_ERROR("load hires model upscaler failed"); - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->diffusion_model->free_params_buffer(); - } return nullptr; } } @@ -4543,9 +4491,6 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s request, hires_upscaler.get()); if (upscaled.empty()) { - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->diffusion_model->free_params_buffer(); - } return nullptr; } @@ -4600,14 +4545,8 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s b + 1, (int)final_latents.size(), (hires_sample_end - hires_sample_start) * 1.0f 
/ 1000); - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->diffusion_model->free_params_buffer(); - } return nullptr; } - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->diffusion_model->free_params_buffer(); - } int64_t hires_denoise_end = ggml_time_ms(); LOG_INFO("hires fix completed, taking %.2fs", (hires_denoise_end - hires_denoise_start) * 1.0f / 1000); @@ -4943,6 +4882,8 @@ static ImageGenerationEmbeds prepare_video_generation_embeds(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params, const GenerationRequest& request, const ImageGenerationLatents& latents) { + ConditionerRunnerDoneOnExit conditioner_runner_done{sd_ctx->sd->cond_stage_model.get()}; + ImageGenerationEmbeds embeds; ConditionerParams condition_params; condition_params.clip_skip = request.clip_skip; @@ -4965,9 +4906,6 @@ static ImageGenerationEmbeds prepare_video_generation_embeds(sd_ctx_t* sd_ctx, int64_t t1 = ggml_time_ms(); LOG_INFO("get_learned_condition completed, taking %.2fs", (t1 - prepare_start_ms) * 1.0f / 1000); - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->cond_stage_model->free_params_buffer(); - } return embeds; } @@ -4994,9 +4932,6 @@ static sd_image_t* decode_video_outputs(sd_ctx_t* sd_ctx, sd::Tensor vid = sd_ctx->sd->decode_first_stage(video_latent, true); int64_t t5 = ggml_time_ms(); LOG_INFO("decode_first_stage completed, taking %.2fs", (t5 - t4) * 1.0f / 1000); - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->first_stage_model->free_params_buffer(); - } if (vid.empty()) { LOG_ERROR("decode_first_stage failed for video"); return nullptr; @@ -5063,7 +4998,7 @@ static sd::Tensor upscale_ltx_spatial_video_latent(sd_ctx_t* sd_ctx, std::unique_ptr upsampler = std::make_unique(sd_ctx->sd->backend_for(SDBackendModule::UPSCALER), - sd_ctx->sd->params_backend_for(SDBackendModule::UPSCALER)); + sd_ctx->sd->backend_for(SDBackendModule::UPSCALER)); const size_t max_graph_vram_bytes = 
sd::ggml_graph_cut::max_vram_gib_to_bytes(sd_ctx->sd->max_vram); upsampler->set_max_graph_vram_bytes(max_graph_vram_bytes); if (!upsampler->load_from_file(model_path, sd_ctx->sd->n_threads)) { @@ -5304,18 +5239,12 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx, int64_t sampling_end = ggml_time_ms(); if (x_t_sampled.empty()) { LOG_ERROR("sampling(high noise) failed after %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->high_noise_diffusion_model->free_params_buffer(); - } return false; } x_t = std::move(x_t_sampled); noise = {}; LOG_INFO("sampling(high noise) completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->high_noise_diffusion_model->free_params_buffer(); - } } LOG_DEBUG("sample %dx%dx%d", W, H, T); @@ -5348,9 +5277,6 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx, int64_t sampling_end = ggml_time_ms(); if (final_latent.empty()) { - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->diffusion_model->free_params_buffer(); - } LOG_ERROR("sampling failed after %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); return false; } @@ -5364,9 +5290,6 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx, latents.audio_length); int64_t upscale_end = ggml_time_ms(); if (upscaled_latent.empty()) { - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->diffusion_model->free_params_buffer(); - } return false; } LOG_INFO("LTX latent spatial upscale completed, taking %.2fs", @@ -5399,9 +5322,6 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx, LOG_ERROR("failed to resize LTX audio latent for latent upscale: %d -> %d", latents.audio_length, target_audio_length); - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->diffusion_model->free_params_buffer(); - } return false; } x_t = pack_ltxav_audio_and_video_latents(video_latent, audio_latent); @@ -5429,9 +5349,6 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx, &x_t, 
&hires_denoise_mask, &hires_video_positions)) { - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->diffusion_model->free_params_buffer(); - } return false; } noise = sd::Tensor::randn_like(x_t, sd_ctx->sd->rng); @@ -5488,9 +5405,6 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx, hires_request.cache_params, hires_video_positions); sampling_end = ggml_time_ms(); - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->diffusion_model->free_params_buffer(); - } if (final_latent.empty()) { LOG_ERROR("sampling(latent upscale) failed after %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); @@ -5498,8 +5412,6 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx, } LOG_INFO("sampling(latent upscale) completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); - } else if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->diffusion_model->free_params_buffer(); } int64_t latent_end = ggml_time_ms(); diff --git a/src/weight_manager.h b/src/weight_manager.h new file mode 100644 index 00000000..28d6cf5c --- /dev/null +++ b/src/weight_manager.h @@ -0,0 +1,15 @@ +#ifndef __WEIGHT_MANAGER_H__ +#define __WEIGHT_MANAGER_H__ + +#include + +struct ggml_tensor; + +struct RunnerWeightManager { + virtual ~RunnerWeightManager() = default; + virtual bool prepare_params(const std::vector& tensors) = 0; + virtual void release_compute_backend_params(const std::vector& tensors) = 0; + virtual void release_params_backend_params(const std::vector& tensors) = 0; +}; + +#endif // __WEIGHT_MANAGER_H__