diff --git a/src/conditioning/conditioner.hpp b/src/conditioning/conditioner.hpp index 9eaa0e72..b5dda4c0 100644 --- a/src/conditioning/conditioner.hpp +++ b/src/conditioning/conditioner.hpp @@ -1,4 +1,4 @@ -#ifndef __SD_CONDITIONING_CONDITIONER_HPP__ +#ifndef __SD_CONDITIONING_CONDITIONER_HPP__ #define __SD_CONDITIONING_CONDITIONER_HPP__ #include @@ -118,7 +118,6 @@ public: virtual void set_stream_layers_enabled(bool enabled) {} virtual void set_flash_attention_enabled(bool enabled) = 0; virtual void set_weight_adapter(const std::shared_ptr& adapter) {} - virtual void set_weight_manager(const std::shared_ptr& manager) {} virtual void runner_done() {} }; @@ -137,10 +136,10 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { std::map> embedding_pos_map; FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend, - ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::map& orig_embedding_map, - SDVersion version = VERSION_SD1) + SDVersion version = VERSION_SD1, + std::shared_ptr weight_manager = nullptr) : version(version), tokenizer(sd_version_is_sd2(version) ? 0 : 49407) { for (const auto& kv : orig_embedding_map) { std::string name = kv.first; @@ -150,12 +149,12 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { } bool force_clip_f32 = !embedding_map.empty(); if (sd_version_is_sd1(version)) { - text_model = std::make_shared(backend, params_backend, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32); + text_model = std::make_shared(backend, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32, weight_manager); } else if (sd_version_is_sd2(version)) { - text_model = std::make_shared(backend, params_backend, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32); + text_model = std::make_shared(backend, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32, weight_manager); } else if (sd_version_is_sdxl(version)) { - text_model = std::make_shared(backend, params_backend, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32); - text_model2 = std::make_shared(backend, params_backend, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32); + text_model = std::make_shared(backend, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32, weight_manager); + text_model2 = std::make_shared(backend, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32, weight_manager); } } @@ -194,13 +193,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { } } - void set_weight_manager(const std::shared_ptr& manager) override { - text_model->set_weight_manager(manager); - if (sd_version_is_sdxl(version)) { - text_model2->set_weight_manager(manager); - } - } - void runner_done() override { text_model->runner_done(); if (sd_version_is_sdxl(version)) { @@ -522,9 +514,9 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner { std::string weight_prefix = "cond_stage_model.transformer"; FrozenCLIPVisionEmbedder(ggml_backend_t backend, - ggml_backend_t params_backend, - const String2TensorStorage& tensor_storage_map = {}) - : GGMLRunner(backend, params_backend) { + const String2TensorStorage& tensor_storage_map = {}, + std::shared_ptr weight_manager = nullptr) + : GGMLRunner(backend, weight_manager) { bool proj_in = false; for (const auto& [name, tensor_storage] : tensor_storage_map) { if (!starts_with(name, weight_prefix)) { @@ -580,8 +572,8 @@ struct SD3CLIPEmbedder : public Conditioner { std::shared_ptr t5; SD3CLIPEmbedder(ggml_backend_t backend, - ggml_backend_t params_backend, - const String2TensorStorage& tensor_storage_map = {}) + const String2TensorStorage& tensor_storage_map = {}, + std::shared_ptr weight_manager = nullptr) : clip_g_tokenizer(0) { bool use_clip_l = false; bool use_clip_g = false; @@ -600,13 +592,13 @@ struct SD3CLIPEmbedder : public Conditioner { return; } if (use_clip_l) { - clip_l = std::make_shared(backend, params_backend, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false); + clip_l = std::make_shared(backend, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, false, weight_manager); } if (use_clip_g) { - clip_g = std::make_shared(backend, params_backend, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false); + clip_g = std::make_shared(backend, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, false, weight_manager); } if (use_t5) { - t5 = std::make_shared(backend, params_backend, tensor_storage_map, "text_encoders.t5xxl.transformer"); + t5 = std::make_shared(backend, tensor_storage_map, "text_encoders.t5xxl.transformer", false, weight_manager); } } @@ -670,18 +662,6 @@ struct SD3CLIPEmbedder : public Conditioner { } } - void set_weight_manager(const std::shared_ptr& manager) override { - if (clip_l) { - clip_l->set_weight_manager(manager); - } - if (clip_g) { - clip_g->set_weight_manager(manager); - } - if (t5) { - t5->set_weight_manager(manager); - } - } - void runner_done() override { if (clip_l) { clip_l->runner_done(); @@ -961,8 +941,8 @@ struct FluxCLIPEmbedder : public Conditioner { size_t chunk_len = 256; FluxCLIPEmbedder(ggml_backend_t backend, - ggml_backend_t params_backend, - const String2TensorStorage& tensor_storage_map = {}) { + const String2TensorStorage& tensor_storage_map = {}, + std::shared_ptr weight_manager = nullptr) { bool use_clip_l = false; bool use_t5 = false; for (auto pair : tensor_storage_map) { @@ -979,12 +959,12 @@ struct FluxCLIPEmbedder : public Conditioner { } if (use_clip_l) { - clip_l = std::make_shared(backend, params_backend, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true); + clip_l = std::make_shared(backend, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, false, weight_manager); } else { LOG_WARN("clip_l text encoder not found! Prompt adherence might be degraded."); } if (use_t5) { - t5 = std::make_shared(backend, params_backend, tensor_storage_map, "text_encoders.t5xxl.transformer"); + t5 = std::make_shared(backend, tensor_storage_map, "text_encoders.t5xxl.transformer", false, weight_manager); } else { LOG_WARN("t5xxl text encoder not found! Prompt adherence might be degraded."); } @@ -1035,15 +1015,6 @@ struct FluxCLIPEmbedder : public Conditioner { } } - void set_weight_manager(const std::shared_ptr& manager) override { - if (clip_l) { - clip_l->set_weight_manager(manager); - } - if (t5) { - t5->set_weight_manager(manager); - } - } - void runner_done() override { if (clip_l) { clip_l->runner_done(); @@ -1219,11 +1190,11 @@ struct T5CLIPEmbedder : public Conditioner { bool is_umt5 = false; T5CLIPEmbedder(ggml_backend_t backend, - ggml_backend_t params_backend, - const String2TensorStorage& tensor_storage_map = {}, - bool use_mask = false, - int mask_pad = 0, - bool is_umt5 = false) + const String2TensorStorage& tensor_storage_map = {}, + bool use_mask = false, + int mask_pad = 0, + bool is_umt5 = false, + std::shared_ptr weight_manager = nullptr) : use_mask(use_mask), mask_pad(mask_pad), t5_tokenizer(is_umt5) { bool use_t5 = false; for (auto pair : tensor_storage_map) { @@ -1236,7 +1207,7 @@ struct T5CLIPEmbedder : public Conditioner { LOG_WARN("IMPORTANT NOTICE: No text encoders provided, cannot process prompts!"); return; } else { - t5 = std::make_shared(backend, params_backend, tensor_storage_map, "text_encoders.t5xxl.transformer", is_umt5); + t5 = std::make_shared(backend, tensor_storage_map, "text_encoders.t5xxl.transformer", is_umt5, weight_manager); } } @@ -1270,12 +1241,6 @@ struct T5CLIPEmbedder : public Conditioner { } } - void set_weight_manager(const std::shared_ptr& manager) override { - if (t5) { - t5->set_weight_manager(manager); - } - } - void runner_done() override { if (t5) { t5->runner_done(); @@ -1422,15 +1387,15 @@ struct AnimaConditioner : public Conditioner { std::shared_ptr llm; AnimaConditioner(ggml_backend_t backend, - ggml_backend_t params_backend, - const String2TensorStorage& tensor_storage_map = {}) { + const String2TensorStorage& tensor_storage_map = {}, + std::shared_ptr weight_manager = nullptr) { qwen_tokenizer = std::make_shared(); llm = std::make_shared(LLM::LLMArch::QWEN3, backend, - params_backend, tensor_storage_map, "text_encoders.llm", - false); + false, + weight_manager); } void get_param_tensors(std::map& tensors) override { @@ -1453,10 +1418,6 @@ struct AnimaConditioner : public Conditioner { llm->set_weight_adapter(adapter); } - void set_weight_manager(const std::shared_ptr& manager) override { - llm->set_weight_manager(manager); - } - void runner_done() override { llm->runner_done(); } @@ -1545,11 +1506,11 @@ struct LLMEmbedder : public Conditioner { std::shared_ptr llm; LLMEmbedder(ggml_backend_t backend, - ggml_backend_t params_backend, - const String2TensorStorage& tensor_storage_map = {}, - SDVersion version = VERSION_QWEN_IMAGE, - const std::string prefix = "", - bool enable_vision = false) + const String2TensorStorage& tensor_storage_map = {}, + SDVersion version = VERSION_QWEN_IMAGE, + const std::string prefix = "", + bool enable_vision = false, + std::shared_ptr weight_manager = nullptr) : version(version) { LLM::LLMArch arch = LLM::LLMArch::QWEN2_5_VL; if (version == VERSION_FLUX2) { @@ -1576,10 +1537,10 @@ struct LLMEmbedder : public Conditioner { } llm = std::make_shared(arch, backend, - params_backend, tensor_storage_map, "text_encoders.llm", - enable_vision); + enable_vision, + weight_manager); } void get_param_tensors(std::map& tensors) override { @@ -1604,12 +1565,6 @@ struct LLMEmbedder : public Conditioner { } } - void set_weight_manager(const std::shared_ptr& manager) override { - if (llm) { - llm->set_weight_manager(manager); - } - } - void runner_done() override { if (llm) { llm->runner_done(); @@ -2106,10 +2061,10 @@ struct LTXAVTextProjectionRunner : public GGMLRunner { LTXAVTextProjection model; LTXAVTextProjectionRunner(ggml_backend_t backend, - ggml_backend_t params_backend, - const String2TensorStorage& tensor_storage_map = {}, - const std::string& prefix = "") - : GGMLRunner(backend, params_backend), + const String2TensorStorage& tensor_storage_map = {}, + const std::string& prefix = "", + std::shared_ptr weight_manager = nullptr) + : GGMLRunner(backend, weight_manager), model(tensor_storage_map.find(prefix + ".video_aggregate_embed.weight") != tensor_storage_map.end()) { model.init(params_ctx, tensor_storage_map, prefix); } @@ -2154,22 +2109,22 @@ struct LTXAVEmbedder : public Conditioner { bool dual_projection = false; LTXAVEmbedder(ggml_backend_t backend, - ggml_backend_t params_backend, - const String2TensorStorage& tensor_storage_map = {}, - const std::string& llm_prefix = "text_encoders.llm", - const std::string& projector_prefix = "text_embedding_projection") { + const String2TensorStorage& tensor_storage_map = {}, + const std::string& llm_prefix = "text_encoders.llm", + const std::string& projector_prefix = "text_embedding_projection", + std::shared_ptr weight_manager = nullptr) { tokenizer = std::make_shared(); llm = std::make_shared(LLM::LLMArch::GEMMA3_12B, backend, - params_backend, tensor_storage_map, llm_prefix, - false); + false, + weight_manager); dual_projection = tensor_storage_map.find(projector_prefix + ".video_aggregate_embed.weight") != tensor_storage_map.end(); projector = std::make_shared(backend, - params_backend, tensor_storage_map, - projector_prefix); + projector_prefix, + weight_manager); } void get_param_tensors(std::map& tensors) override { @@ -2192,11 +2147,6 @@ struct LTXAVEmbedder : public Conditioner { projector->set_weight_adapter(adapter); } - void set_weight_manager(const std::shared_ptr& manager) override { - llm->set_weight_manager(manager); - projector->set_weight_manager(manager); - } - void runner_done() override { llm->runner_done(); projector->runner_done(); diff --git a/src/core/ggml_extend.hpp b/src/core/ggml_extend.hpp index 70703d24..76109d04 100644 --- a/src/core/ggml_extend.hpp +++ b/src/core/ggml_extend.hpp @@ -1696,11 +1696,9 @@ protected: using GraphCutSegment = sd::ggml_graph_cut::Segment; using GraphCutPlan = sd::ggml_graph_cut::Plan; - ggml_backend_t params_backend = nullptr; ggml_backend_t runtime_backend = nullptr; - ggml_context* params_ctx = nullptr; - ggml_backend_buffer_t params_buffer = nullptr; + ggml_context* params_ctx = nullptr; ggml_context* cache_ctx = nullptr; ggml_backend_buffer_t cache_buffer = nullptr; @@ -1880,9 +1878,6 @@ protected: auto manager = weight_manager.lock(); if (manager == nullptr) { if (!params_to_prepare.empty()) { - if (params_buffer != nullptr) { - return true; - } LOG_ERROR("%s weight manager is not set for graph params", get_desc().c_str()); return false; } @@ -2194,13 +2189,11 @@ protected: plan.valid && max_graph_vram_bytes > 0 && plan.segments.size() > 1 && - params_backend != runtime_backend && !sd_backend_is_cpu(runtime_backend); } bool can_attempt_graph_cut_segmented_compute() const { return max_graph_vram_bytes > 0 && - params_backend != runtime_backend && !sd_backend_is_cpu(runtime_backend); } @@ -2631,16 +2624,15 @@ public: public: virtual std::string get_desc() = 0; - GGMLRunner(ggml_backend_t backend, ggml_backend_t params_backend) - : params_backend(params_backend), - runtime_backend(backend) { + GGMLRunner(ggml_backend_t backend, + std::shared_ptr manager = nullptr) + : runtime_backend(backend), + weight_manager(manager) { GGML_ASSERT(runtime_backend != nullptr); - GGML_ASSERT(params_backend != nullptr); alloc_params_ctx(); } virtual ~GGMLRunner() { - free_params_buffer(); free_compute_buffer(); free_params_ctx(); free_compute_ctx(); @@ -2674,73 +2666,6 @@ public: alloc_compute_ctx(); } - bool alloc_params_buffer() { - size_t num_tensors = ggml_tensor_num(params_ctx); - if (num_tensors > 0) { - // ggml_backend_alloc_ctx_tensors fails when all tensors are already allocated - // (typical for memory-mapped weights). See ggml-alloc.c n_buffers==0 branch. - bool all_have_data = true; - for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) { - if (t->data == nullptr) { - all_have_data = false; - break; - } - } - if (all_have_data) { - LOG_DEBUG("%s all params already mmap-allocated (no separate buffer needed)", get_desc().c_str()); - params_buffer = nullptr; - rebuild_params_tensor_set(); - return true; - } - } else { - LOG_DEBUG("%s skipping params allocation (no tensors)", get_desc().c_str()); - return true; - } - // Pinned host buffer when CPU-offloaded for DMA-direct H2D. - ggml_backend_buffer_type_t params_buft = nullptr; - if (params_backend != runtime_backend) { - ggml_backend_dev_t runtime_dev = ggml_backend_get_device(runtime_backend); - if (runtime_dev != nullptr) { - params_buft = ggml_backend_dev_host_buffer_type(runtime_dev); - } - } - if (params_buft == nullptr) { - params_buft = ggml_backend_get_default_buffer_type(params_backend); - } - params_buffer = ggml_backend_alloc_ctx_tensors_from_buft(params_ctx, params_buft); - if (params_buffer == nullptr) { - LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i", - get_desc().c_str(), - num_tensors); - return false; - } - rebuild_params_tensor_set(); - ggml_backend_buffer_set_usage(params_buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); - size_t params_buffer_size = ggml_backend_buffer_get_size(params_buffer); - LOG_DEBUG("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)", - get_desc().c_str(), - params_buffer_size / (1024.f * 1024.f), - sd_backend_is_cpu(params_backend) ? "RAM" : "VRAM", - num_tensors); - return true; - } - -protected: - void free_params_buffer() { - if (params_buffer != nullptr) { - ggml_backend_buffer_free(params_buffer); - params_buffer = nullptr; - } - observed_max_effective_budget_ = 0; - } - - size_t get_params_buffer_size() { - if (params_buffer != nullptr) { - return ggml_backend_buffer_get_size(params_buffer); - } - return 0; - } - public: void free_cache_ctx_and_buffer() { free_cache_buffer(); @@ -2886,15 +2811,6 @@ public: weight_adapter = adapter; } - void set_weight_manager(const std::shared_ptr& manager) { - weight_manager = manager; - } - - void set_weight_manager(const std::shared_ptr& manager, - const std::string&) { - set_weight_manager(manager); - } - void set_max_graph_vram_bytes(size_t max_vram_bytes) { max_graph_vram_bytes = max_vram_bytes; } @@ -2902,14 +2818,6 @@ public: void set_stream_layers_enabled(bool enabled) { stream_layers_enabled = enabled; } - - ggml_backend_t get_runtime_backend() { - return runtime_backend; - } - - ggml_backend_t get_params_backend() { - return params_backend; - } }; class GGMLBlock { diff --git a/src/extensions/generation_extension.h b/src/extensions/generation_extension.h index 0f8e1263..dd0c51cc 100644 --- a/src/extensions/generation_extension.h +++ b/src/extensions/generation_extension.h @@ -19,6 +19,7 @@ struct GenerationExtensionInitContext { SDVersion version; const String2TensorStorage& tensor_storage_map; ModelLoader& model_loader; + std::shared_ptr model_manager; int n_threads; std::function ensure_backend_pair; std::function backend_for; @@ -46,7 +47,6 @@ struct GenerationExtension { virtual void get_param_tensors(std::map&) {} virtual void collect_loras(std::vector&) {} virtual void add_ignore_tensors(std::set&) const {} - virtual void set_weight_manager(const std::shared_ptr&) {} virtual void runner_done() {} virtual void reset_runtime_condition() {} virtual bool prepare_condition(GenerationExtensionConditionContext&) { diff --git a/src/extensions/photomaker_extension.cpp b/src/extensions/photomaker_extension.cpp index cbeb7c41..78c5cdb9 100644 --- a/src/extensions/photomaker_extension.cpp +++ b/src/extensions/photomaker_extension.cpp @@ -134,11 +134,12 @@ struct PhotoMakerExtension : public GenerationExtension { } pmid_model = std::make_shared(ctx.backend_for(SDBackendModule::PHOTOMAKER), - ctx.params_backend_for(SDBackendModule::PHOTOMAKER), ctx.tensor_storage_map, "pmid", ctx.version, - pm_version); + pm_version, + 20.f, + ctx.model_manager); if (pm_version == PM_VERSION_2) { LOG_INFO("using PhotoMaker Version 2"); } @@ -174,12 +175,6 @@ struct PhotoMakerExtension : public GenerationExtension { ignore_tensors.insert("pmid.unet."); } - void set_weight_manager(const std::shared_ptr& manager) override { - if (pmid_model != nullptr) { - pmid_model->set_weight_manager(manager); - } - } - void runner_done() override { if (pmid_model != nullptr) { pmid_model->runner_done(); diff --git a/src/model/adapter/lora.hpp b/src/model/adapter/lora.hpp index 850f6c10..6f0a5943 100644 --- a/src/model/adapter/lora.hpp +++ b/src/model/adapter/lora.hpp @@ -4,6 +4,7 @@ #include #include "core/ggml_extend.hpp" #include "model_loader.h" +#include "model_manager.h" #define LORA_GRAPH_BASE_SIZE 10240 @@ -14,22 +15,24 @@ struct LoraModel : public GGMLRunner { std::map original_tensor_to_final_tensor; std::set applied_lora_tensors; std::string file_path; - ModelLoader model_loader; - bool load_failed = false; - bool applied = false; - bool tensor_preprocessed = false; + std::shared_ptr model_manager; + ggml_backend_t params_backend = nullptr; + bool load_failed = false; + bool applied = false; + bool tensor_preprocessed = false; typedef std::function filter_t; LoraModel(const std::string& lora_id, ggml_backend_t backend, - ggml_backend_t params_backend, - const std::string& file_path = "", - std::string prefix = "", - SDVersion version = VERSION_COUNT) - : lora_id(lora_id), file_path(file_path), GGMLRunner(backend, params_backend) { + ggml_backend_t params_backend_, + const std::string& file_path = "", + std::string prefix = "", + SDVersion version = VERSION_COUNT, + std::shared_ptr manager = std::make_shared()) + : GGMLRunner(backend, manager), lora_id(lora_id), file_path(file_path), model_manager(std::move(manager)), params_backend(params_backend_) { prefix = "lora." + prefix; - if (!model_loader.init_from_file_and_convert_name(file_path, prefix, version)) { + if (model_manager == nullptr || !model_manager->loader().init_from_file_and_convert_name(file_path, prefix, version)) { load_failed = true; } } @@ -71,7 +74,10 @@ struct LoraModel : public GGMLRunner { return true; }; - model_loader.set_n_threads(n_threads); + if (model_manager != nullptr) { + model_manager->set_n_threads(n_threads); + } + ModelLoader& model_loader = model_manager->loader(); model_loader.load_tensors(on_new_tensor_cb); if (tensors_to_create.empty()) { @@ -88,23 +94,42 @@ struct LoraModel : public GGMLRunner { lora_tensors[name] = real; } - if (!alloc_params_buffer()) { - LOG_ERROR("lora model buffer allocation failed"); + std::map tensors; + for (const auto& pair : lora_tensors) { + tensors[pair.first] = pair.second; + } + if (model_manager == nullptr || + !model_manager->register_param_tensors("LoRA", + std::move(tensors), + ModelManager::ResidencyMode::Resident, + runtime_backend, + params_backend) || + !model_manager->validate_registered_tensors()) { + LOG_ERROR("lora model manager registration failed"); + return false; + } + std::vector lora_params; + lora_params.reserve(lora_tensors.size()); + for (const auto& pair : lora_tensors) { + lora_params.push_back(pair.second); + } + if (!model_manager->prepare_params(lora_params)) { + LOG_ERROR("lora model manager prepare params failed"); return false; } - - dry_run = false; - model_loader.load_tensors(on_new_tensor_cb); LOG_DEBUG("finished loaded lora"); return true; } void release_loaded_tensors() { + runner_done(); free_compute_buffer(); - free_params_buffer(); + model_manager.reset(); free_params_ctx(); alloc_params_ctx(); + model_manager = std::make_shared(); + weight_manager = model_manager; lora_tensors.clear(); original_tensor_to_final_tensor.clear(); applied_lora_tensors.clear(); diff --git a/src/model/adapter/pmid.hpp b/src/model/adapter/pmid.hpp index 6773734d..69191b74 100644 --- a/src/model/adapter/pmid.hpp +++ b/src/model/adapter/pmid.hpp @@ -413,13 +413,13 @@ public: public: PhotoMakerIDEncoder(ggml_backend_t backend, - ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::string prefix, - SDVersion version = VERSION_SDXL, - PMVersion pm_v = PM_VERSION_1, - float sty = 20.f) - : GGMLRunner(backend, params_backend), + SDVersion version = VERSION_SDXL, + PMVersion pm_v = PM_VERSION_1, + float sty = 20.f, + std::shared_ptr weight_manager = nullptr) + : GGMLRunner(backend, weight_manager), version(version), pm_version(pm_v), style_strength(sty) { @@ -565,17 +565,18 @@ public: struct PhotoMakerIDEmbed : public GGMLRunner { std::map tensors; std::string file_path; - ModelLoader* model_loader; - bool load_failed = false; - bool applied = false; + std::shared_ptr model_manager; + ggml_backend_t params_backend = nullptr; + bool load_failed = false; + bool applied = false; PhotoMakerIDEmbed(ggml_backend_t backend, - ggml_backend_t params_backend, - ModelLoader* ml, - const std::string& file_path = "", - const std::string& prefix = "") - : file_path(file_path), GGMLRunner(backend, params_backend), model_loader(ml) { - if (!model_loader->init_from_file_and_convert_name(file_path, prefix)) { + ggml_backend_t params_backend_, + std::shared_ptr manager = std::make_shared(), + const std::string& file_path = "", + const std::string& prefix = "") + : GGMLRunner(backend, manager), file_path(file_path), model_manager(std::move(manager)), params_backend(params_backend_) { + if (model_manager == nullptr || !model_manager->loader().init_from_file_and_convert_name(file_path, prefix)) { load_failed = true; } } @@ -616,15 +617,27 @@ struct PhotoMakerIDEmbed : public GGMLRunner { return true; }; - model_loader->set_n_threads(n_threads); - model_loader->load_tensors(on_new_tensor_cb); - if (!alloc_params_buffer()) { - LOG_ERROR("PhotoMaker ID embeds buffer allocation failed"); + model_manager->set_n_threads(n_threads); + ModelLoader& model_loader = model_manager->loader(); + model_loader.load_tensors(on_new_tensor_cb); + if (!model_manager->register_param_tensors("PhotoMaker ID embeds", + tensors, + ModelManager::ResidencyMode::Resident, + runtime_backend, + params_backend) || + !model_manager->validate_registered_tensors()) { + LOG_ERROR("PhotoMaker ID embeds model manager registration failed"); + return false; + } + std::vector id_embed_params; + id_embed_params.reserve(tensors.size()); + for (const auto& pair : tensors) { + id_embed_params.push_back(pair.second); + } + if (!model_manager->prepare_params(id_embed_params)) { + LOG_ERROR("PhotoMaker ID embeds model manager prepare params failed"); return false; } - - dry_run = false; - model_loader->load_tensors(on_new_tensor_cb); LOG_DEBUG("finished loading PhotoMaker ID Embeds "); return true; diff --git a/src/model/common/block.hpp b/src/model/common/block.hpp index 69db0a90..15bfa376 100644 --- a/src/model/common/block.hpp +++ b/src/model/common/block.hpp @@ -560,11 +560,11 @@ protected: params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1); } - float get_alpha() { + ggml_tensor* get_alpha(GGMLRunnerContext* ctx) { // image_only_indicator is always tensor([0.]) and since mix_factor.shape is [1,] // so learned_with_images is same as learned - float alpha = ggml_ext_backend_tensor_get_f32(params["mix_factor"]); - return sigmoid(alpha); + auto mix_factor = ggml_ext_cast_f32(ctx->ggml_ctx, ctx->backend, params["mix_factor"]); + return ggml_sigmoid(ctx->ggml_ctx, mix_factor); } public: @@ -578,11 +578,12 @@ public: ggml_tensor* x_spatial, ggml_tensor* x_temporal) { // image_only_indicator is always tensor([0.]) - float alpha = get_alpha(); - auto x = ggml_add(ctx->ggml_ctx, - ggml_ext_scale(ctx->ggml_ctx, x_spatial, alpha), - ggml_ext_scale(ctx->ggml_ctx, x_temporal, 1.0f - alpha)); - return x; + auto alpha = get_alpha(ctx); + return ggml_add(ctx->ggml_ctx, + x_temporal, + ggml_mul(ctx->ggml_ctx, + ggml_sub(ctx->ggml_ctx, x_spatial, x_temporal), + alpha)); } }; diff --git a/src/model/diffusion/anima.hpp b/src/model/diffusion/anima.hpp index 7bf765fe..6042516a 100644 --- a/src/model/diffusion/anima.hpp +++ b/src/model/diffusion/anima.hpp @@ -561,10 +561,10 @@ namespace Anima { AnimaNet net; AnimaRunner(ggml_backend_t backend, - ggml_backend_t params_backend, - const String2TensorStorage& tensor_storage_map = {}, - const std::string prefix = "model.diffusion_model") - : DiffusionModelRunner(backend, params_backend, prefix), + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "model.diffusion_model", + std::shared_ptr weight_manager = nullptr) + : DiffusionModelRunner(backend, prefix, weight_manager), config(AnimaConfig::detect_from_weights(tensor_storage_map, prefix + ".net")) { net = AnimaNet(config); net.init(params_ctx, tensor_storage_map, prefix + ".net"); diff --git a/src/model/diffusion/control.hpp b/src/model/diffusion/control.hpp index d8316b7b..7cf9370b 100644 --- a/src/model/diffusion/control.hpp +++ b/src/model/diffusion/control.hpp @@ -1,8 +1,9 @@ -#ifndef __SD_MODEL_DIFFUSION_CONTROL_HPP__ +#ifndef __SD_MODEL_DIFFUSION_CONTROL_HPP__ #define __SD_MODEL_DIFFUSION_CONTROL_HPP__ #include "model/common/block.hpp" #include "model_loader.h" +#include "model_manager.h" #define CONTROL_NET_GRAPH_SIZE 1536 @@ -318,13 +319,16 @@ struct ControlNet : public GGMLRunner { std::vector> controls; sd::Tensor guided_hint; bool guided_hint_cached = false; + std::shared_ptr owned_model_manager; + ggml_backend_t params_backend = nullptr; ControlNet(ggml_backend_t backend, - ggml_backend_t params_backend, - const String2TensorStorage& tensor_storage_map = {}, - SDVersion version = VERSION_SD1, - const std::string& prefix = "") - : GGMLRunner(backend, params_backend), version(version), control_net(version), weight_prefix(prefix) { + ggml_backend_t params_backend_, + const String2TensorStorage& tensor_storage_map = {}, + SDVersion version = VERSION_SD1, + const std::string& prefix = "", + std::shared_ptr weight_manager = nullptr) + : GGMLRunner(backend, weight_manager), version(version), control_net(version), weight_prefix(prefix), params_backend(params_backend_) { control_net.init(params_ctx, tensor_storage_map, prefix); } @@ -459,31 +463,35 @@ struct ControlNet : public GGMLRunner { bool load_from_file(const std::string& file_path, int n_threads) { LOG_INFO("loading control net from '%s'", file_path.c_str()); - if (!alloc_params_buffer()) { - LOG_ERROR("control net model buffer allocation failed"); - return false; - } - std::map tensors; control_net.get_param_tensors(tensors); - std::set ignore_tensors; - ModelLoader model_loader; + auto manager = std::dynamic_pointer_cast(weight_manager.lock()); + if (manager == nullptr) { + owned_model_manager = std::make_shared(); + weight_manager = owned_model_manager; + manager = owned_model_manager; + } + + ModelLoader& model_loader = manager->loader(); if (!model_loader.init_from_file_and_convert_name(file_path)) { LOG_ERROR("init control net model loader from file failed: '%s'", file_path.c_str()); return false; } - model_loader.set_n_threads(n_threads); - bool success = model_loader.load_tensors(tensors, ignore_tensors); - - if (!success) { - LOG_ERROR("load control net tensors from model loader failed"); + manager->set_n_threads(n_threads); + if (!manager->register_param_tensors("ControlNet", + std::move(tensors), + ModelManager::ResidencyMode::Resident, + runtime_backend, + params_backend) || + !manager->validate_registered_tensors()) { + LOG_ERROR("register control net tensors with model manager failed"); return false; } LOG_INFO("control net model loaded"); - return success; + return true; } }; diff --git a/src/model/diffusion/ernie_image.hpp b/src/model/diffusion/ernie_image.hpp index abb14dab..12fcada5 100644 --- a/src/model/diffusion/ernie_image.hpp +++ b/src/model/diffusion/ernie_image.hpp @@ -387,10 +387,10 @@ namespace ErnieImage { std::vector pe_vec; ErnieImageRunner(ggml_backend_t backend, - ggml_backend_t params_backend, - const String2TensorStorage& tensor_storage_map = {}, - const std::string prefix = "") - : DiffusionModelRunner(backend, params_backend, prefix), + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "", + std::shared_ptr weight_manager = nullptr) + : DiffusionModelRunner(backend, prefix, weight_manager), config(ErnieImageConfig::detect_from_weights(tensor_storage_map, prefix)) { ernie_image = ErnieImageModel(config); ernie_image.init(params_ctx, tensor_storage_map, prefix); diff --git a/src/model/diffusion/flux.hpp b/src/model/diffusion/flux.hpp index 3181a113..7efaf931 100644 --- a/src/model/diffusion/flux.hpp +++ b/src/model/diffusion/flux.hpp @@ -1301,12 +1301,12 @@ namespace Flux { bool use_mask = false; FluxRunner(ggml_backend_t backend, - ggml_backend_t params_backend, - const String2TensorStorage& tensor_storage_map = {}, - const std::string prefix = "", - SDVersion version = VERSION_FLUX, - bool use_mask = false) - : DiffusionModelRunner(backend, params_backend, prefix), + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "", + SDVersion version = VERSION_FLUX, + bool use_mask = false, + std::shared_ptr weight_manager = nullptr) + : DiffusionModelRunner(backend, prefix, weight_manager), config(FluxConfig::detect_from_weights(tensor_storage_map, prefix, version)), version(version), use_mask(use_mask) { @@ -1583,7 +1583,8 @@ namespace Flux { ggml_backend_t backend = sd_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_COUNT; - ModelLoader model_loader; + auto model_manager = std::make_shared(); + ModelLoader& model_loader = model_manager->loader(); if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) { LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str()); return; @@ -1599,24 +1600,20 @@ namespace Flux { } std::shared_ptr flux = std::make_shared(backend, - backend, tensor_storage_map, "model.diffusion_model", VERSION_FLUX2, - false); + false, + model_manager); - if (!flux->alloc_params_buffer()) { - LOG_ERROR("flux model allocation failed"); - return; - } - - std::map tensors; - flux->get_param_tensors(tensors, "model.diffusion_model"); - - bool success = model_loader.load_tensors(tensors); - - if (!success) { - LOG_ERROR("load tensors from model loader failed"); + if (!model_manager->register_runner_params("Flux test", + *flux, + "model.diffusion_model", + ModelManager::ResidencyMode::Resident, + backend, + backend) || + !model_manager->validate_registered_tensors()) { + LOG_ERROR("register flux tensors with model manager failed"); return; } diff --git a/src/model/diffusion/hidream_o1.hpp b/src/model/diffusion/hidream_o1.hpp index 8ea4f7f5..559f61bc 100644 --- a/src/model/diffusion/hidream_o1.hpp +++ b/src/model/diffusion/hidream_o1.hpp @@ -1,4 +1,4 @@ -#ifndef __SD_MODEL_DIFFUSION_HIDREAM_O1_HPP__ +#ifndef __SD_MODEL_DIFFUSION_HIDREAM_O1_HPP__ #define __SD_MODEL_DIFFUSION_HIDREAM_O1_HPP__ #include @@ -282,10 +282,10 @@ namespace HiDreamO1 { std::array, 4> pos_embed_weight_data_; HiDreamO1VisionRunner(ggml_backend_t backend, - ggml_backend_t params_backend, - const String2TensorStorage& tensor_storage_map = {}, - const std::string& prefix = "model.visual") - : GGMLRunner(backend, params_backend), + const String2TensorStorage& tensor_storage_map = {}, + const std::string& prefix = "model.visual", + std::shared_ptr weight_manager = nullptr) + : GGMLRunner(backend, weight_manager), config(HiDreamO1Config::detect_from_weights(tensor_storage_map, prefix)), model(std::make_shared(false, config.llm.vision)) { model->init(params_ctx, tensor_storage_map, prefix); @@ -343,10 +343,10 @@ namespace HiDreamO1 { std::vector attention_mask_vec; HiDreamO1Runner(ggml_backend_t backend, - ggml_backend_t params_backend, - const String2TensorStorage& tensor_storage_map = {}, - const std::string& prefix = "model") - : DiffusionModelRunner(backend, params_backend, prefix), + const String2TensorStorage& tensor_storage_map = {}, + const std::string& prefix = "model", + std::shared_ptr weight_manager = nullptr) + : DiffusionModelRunner(backend, prefix, weight_manager), config(HiDreamO1Config::detect_from_weights(tensor_storage_map, prefix)) { model = HiDreamO1Model(config); model.init(params_ctx, tensor_storage_map, prefix); @@ -490,9 +490,9 @@ namespace HiDreamO1 { std::shared_ptr vision_runner; HiDreamO1Conditioner(ggml_backend_t backend, - ggml_backend_t params_backend, - const String2TensorStorage& tensor_storage_map = {}) - : vision_runner(std::make_shared(backend, params_backend, tensor_storage_map)) {} + const String2TensorStorage& tensor_storage_map = {}, + std::shared_ptr weight_manager = nullptr) + : vision_runner(std::make_shared(backend, tensor_storage_map, "model.visual", weight_manager)) {} void get_param_tensors(std::map& tensors) override { vision_runner->get_param_tensors(tensors); @@ -510,10 +510,6 @@ namespace HiDreamO1 { vision_runner->set_weight_adapter(adapter); } - void set_weight_manager(const std::shared_ptr& manager) override { - vision_runner->set_weight_manager(manager); - } - void runner_done() override { vision_runner->runner_done(); } diff --git a/src/model/diffusion/ideogram4.hpp b/src/model/diffusion/ideogram4.hpp index 8c3a04ca..bfa2f86a 100644 --- a/src/model/diffusion/ideogram4.hpp +++ b/src/model/diffusion/ideogram4.hpp @@ -449,10 +449,10 @@ namespace Ideogram4 { std::vector image_indicator_vec; Ideogram4Runner(ggml_backend_t backend, - ggml_backend_t params_backend, - const String2TensorStorage& tensor_storage_map = {}, - const std::string prefix = "") - : DiffusionModelRunner(backend, params_backend, prefix), + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "", + std::shared_ptr weight_manager = nullptr) + : DiffusionModelRunner(backend, prefix, weight_manager), config(Ideogram4Config::detect_from_weights(tensor_storage_map, prefix)), uncond_prefix(prefix + ".uncond") { model = Ideogram4Transformer(config); diff --git a/src/model/diffusion/lens.hpp b/src/model/diffusion/lens.hpp index 32de8537..931a8527 100644 --- a/src/model/diffusion/lens.hpp +++ b/src/model/diffusion/lens.hpp @@ -356,10 +356,10 @@ namespace Lens { std::vector pe_vec; LensRunner(ggml_backend_t backend, - ggml_backend_t params_backend, - const String2TensorStorage& tensor_storage_map = {}, - const std::string prefix = "") - : DiffusionModelRunner(backend, params_backend, prefix), + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "", + std::shared_ptr weight_manager = nullptr) + : DiffusionModelRunner(backend, prefix, weight_manager), config(LensConfig::detect_from_weights(tensor_storage_map, prefix)) { lens = LensModel(config); lens.init(params_ctx, tensor_storage_map, prefix); diff --git a/src/model/diffusion/ltxv.hpp b/src/model/diffusion/ltxv.hpp index 455dc4b2..3535821d 100644 --- a/src/model/diffusion/ltxv.hpp +++ b/src/model/diffusion/ltxv.hpp @@ -1686,10 +1686,10 @@ namespace LTXV { sd::Tensor ax_input_cache; LTXAVRunner(ggml_backend_t backend, - ggml_backend_t params_backend, - const String2TensorStorage& tensor_storage_map = {}, - const std::string& prefix = "model.diffusion_model") - : DiffusionModelRunner(backend, params_backend, prefix), + const String2TensorStorage& tensor_storage_map = {}, + const std::string& prefix = "model.diffusion_model", + std::shared_ptr weight_manager = nullptr) + : DiffusionModelRunner(backend, prefix, weight_manager), config(LTXAVConfig::detect_from_weights(tensor_storage_map, prefix)), model(config) { model.init(params_ctx, tensor_storage_map, prefix); @@ -2025,7 +2025,8 @@ namespace LTXV { ggml_backend_t backend = sd_backend_cpu_init(); LOG_INFO("loading ltxav from '%s'", model_path.c_str()); - ModelLoader model_loader; + auto model_manager = std::make_shared(); + ModelLoader& model_loader = model_manager->loader(); if (!model_loader.init_from_file_and_convert_name(model_path, "model.diffusion_model.")) { LOG_ERROR("init model loader from file failed: '%s'", model_path.c_str()); return; @@ -2040,19 +2041,18 @@ namespace LTXV { auto& tensor_storage_map = model_loader.get_tensor_storage_map(); std::shared_ptr ltxav = std::make_shared(backend, - backend, tensor_storage_map, - "model.diffusion_model"); + "model.diffusion_model", + model_manager); - if (!ltxav->alloc_params_buffer()) { - LOG_ERROR("ltxav buffer allocation failed"); - return; - } - std::map tensors; - ltxav->get_param_tensors(tensors, "model.diffusion_model"); - - if (!model_loader.load_tensors(tensors)) { - LOG_ERROR("load tensors from model loader failed"); + if (!model_manager->register_runner_params("LTXAV test", + *ltxav, + "model.diffusion_model", + ModelManager::ResidencyMode::Resident, + backend, + backend) || + !model_manager->validate_registered_tensors()) { + LOG_ERROR("register ltxav tensors with model manager failed"); return; } diff --git a/src/model/diffusion/mmdit.hpp b/src/model/diffusion/mmdit.hpp index 0f6c2d30..b73a9fc7 100644 --- a/src/model/diffusion/mmdit.hpp +++ b/src/model/diffusion/mmdit.hpp @@ -879,10 +879,10 @@ struct MMDiTRunner : public DiffusionModelRunner { MMDiT mmdit; MMDiTRunner(ggml_backend_t backend, - ggml_backend_t params_backend, - const String2TensorStorage& tensor_storage_map = {}, - const std::string prefix = "") - : DiffusionModelRunner(backend, params_backend, prefix), + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "", + std::shared_ptr weight_manager = nullptr) + : DiffusionModelRunner(backend, prefix, weight_manager), config(MMDiTConfig::detect_from_weights(tensor_storage_map, prefix)), mmdit(config) { mmdit.init(params_ctx, tensor_storage_map, prefix); @@ -1001,28 +1001,25 @@ struct MMDiTRunner : public DiffusionModelRunner { // ggml_backend_t backend = ggml_backend_cuda_init(0); ggml_backend_t backend = sd_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_F16; - std::shared_ptr mmdit = std::make_shared(backend, backend); + auto model_manager = std::make_shared(); + std::shared_ptr mmdit = std::make_shared(backend, String2TensorStorage{}, "", model_manager); { LOG_INFO("loading from '%s'", file_path.c_str()); - if (!mmdit->alloc_params_buffer()) { - LOG_ERROR("mmdit embeds buffer allocation failed"); - return; - } - - std::map tensors; - mmdit->get_param_tensors(tensors, "model.diffusion_model"); - - ModelLoader model_loader; + ModelLoader& model_loader = model_manager->loader(); if (!model_loader.init_from_file_and_convert_name(file_path)) { LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str()); return; } - bool success = model_loader.load_tensors(tensors); - - if (!success) { - LOG_ERROR("load tensors from model loader failed"); + if (!model_manager->register_runner_params("MMDiT test", + *mmdit, + "model.diffusion_model", + ModelManager::ResidencyMode::Resident, + backend, + backend) || + !model_manager->validate_registered_tensors()) { + LOG_ERROR("register mmdit tensors with model manager failed"); return; } diff --git a/src/model/diffusion/model.hpp b/src/model/diffusion/model.hpp index b386711b..20afbe3f 100644 --- a/src/model/diffusion/model.hpp +++ b/src/model/diffusion/model.hpp @@ -1,4 +1,4 @@ -#ifndef __SD_MODEL_DIFFUSION_MODEL_HPP__ +#ifndef __SD_MODEL_DIFFUSION_MODEL_HPP__ #define __SD_MODEL_DIFFUSION_MODEL_HPP__ #include @@ -7,6 +7,7 @@ #include "core/ggml_extend.hpp" #include "core/tensor_ggml.hpp" +#include "model_manager.h" struct UNetDiffusionExtra { int num_video_frames = -1; @@ -88,9 +89,9 @@ protected: public: DiffusionModelRunner(ggml_backend_t backend, - ggml_backend_t params_backend, - const std::string& prefix) - : GGMLRunner(backend, params_backend), + const std::string& prefix, + std::shared_ptr weight_manager = nullptr) + : GGMLRunner(backend, weight_manager), prefix(prefix) {} virtual sd::Tensor compute(int n_threads, diff --git a/src/model/diffusion/pid.hpp b/src/model/diffusion/pid.hpp index a0dfb324..68dca00f 100644 --- a/src/model/diffusion/pid.hpp +++ b/src/model/diffusion/pid.hpp @@ -710,10 +710,10 @@ namespace Pid { std::vector pixel_pos_comp_vec; PiDRunner(ggml_backend_t backend, - ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, - const std::string prefix = "model.diffusion_model") - : DiffusionModelRunner(backend, params_backend, prefix), + const std::string prefix = "model.diffusion_model", + std::shared_ptr weight_manager = nullptr) + : DiffusionModelRunner(backend, prefix, weight_manager), config(PixelDiTConfig::detect_from_weights(tensor_storage_map, prefix)) { model = PixelDiT(config); model.init(params_ctx, tensor_storage_map, prefix); diff --git a/src/model/diffusion/qwen_image.hpp b/src/model/diffusion/qwen_image.hpp index 1113a922..5cee54c5 100644 --- a/src/model/diffusion/qwen_image.hpp +++ b/src/model/diffusion/qwen_image.hpp @@ -518,12 +518,12 @@ namespace Qwen { SDVersion version; QwenImageRunner(ggml_backend_t backend, - ggml_backend_t params_backend, - const String2TensorStorage& tensor_storage_map = {}, - const std::string prefix = "", - SDVersion version = VERSION_QWEN_IMAGE, - bool zero_cond_t = false) - : DiffusionModelRunner(backend, params_backend, prefix), + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "", + SDVersion version = VERSION_QWEN_IMAGE, + bool zero_cond_t = false, + std::shared_ptr weight_manager = nullptr) + : DiffusionModelRunner(backend, prefix, weight_manager), config(QwenImageConfig::detect_from_weights(tensor_storage_map, prefix)) { config.zero_cond_t = config.zero_cond_t || zero_cond_t; qwen_image = QwenImageModel(config); @@ -691,7 +691,8 @@ namespace Qwen { ggml_backend_t backend = sd_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_Q8_0; - ModelLoader model_loader; + auto model_manager = std::make_shared(); + ModelLoader& model_loader = model_manager->loader(); if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) { LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str()); return; @@ -705,23 +706,20 @@ namespace Qwen { } std::shared_ptr qwen_image = std::make_shared(backend, - backend, tensor_storage_map, "model.diffusion_model", - VERSION_QWEN_IMAGE); + VERSION_QWEN_IMAGE, + false, + model_manager); - if (!qwen_image->alloc_params_buffer()) { - LOG_ERROR("qwen_image buffer allocation failed"); - return; - } - - std::map tensors; - qwen_image->get_param_tensors(tensors, "model.diffusion_model"); - - bool success = model_loader.load_tensors(tensors); - - if (!success) { - LOG_ERROR("load tensors from model loader failed"); + if (!model_manager->register_runner_params("Qwen image test", + *qwen_image, + "model.diffusion_model", + ModelManager::ResidencyMode::Resident, + backend, + backend) || + !model_manager->validate_registered_tensors()) { + LOG_ERROR("register qwen_image tensors with model manager failed"); return; } diff --git a/src/model/diffusion/unet.hpp b/src/model/diffusion/unet.hpp index ab01a60b..253b3b4b 100644 --- a/src/model/diffusion/unet.hpp +++ b/src/model/diffusion/unet.hpp @@ -694,11 +694,11 @@ struct UNetModelRunner : public DiffusionModelRunner { UnetModelBlock unet; UNetModelRunner(ggml_backend_t backend, - ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::string prefix, - SDVersion version = VERSION_SD1) - : DiffusionModelRunner(backend, params_backend, prefix), + SDVersion version = VERSION_SD1, + std::shared_ptr weight_manager = nullptr) + : DiffusionModelRunner(backend, prefix, weight_manager), config(UNetConfig::detect_from_weights(tensor_storage_map, prefix, version)), unet(config) { unet.init(params_ctx, tensor_storage_map, prefix); diff --git a/src/model/diffusion/wan.hpp b/src/model/diffusion/wan.hpp index fd56a0f5..9e27807f 100644 --- a/src/model/diffusion/wan.hpp +++ b/src/model/diffusion/wan.hpp @@ -799,11 +799,11 @@ namespace WAN { SDVersion version; WanRunner(ggml_backend_t backend, - ggml_backend_t params_backend, - const String2TensorStorage& tensor_storage_map = {}, - const std::string prefix = "", - SDVersion version = VERSION_WAN2) - : DiffusionModelRunner(backend, params_backend, prefix), + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "", + SDVersion version = VERSION_WAN2, + std::shared_ptr weight_manager = nullptr) + : DiffusionModelRunner(backend, prefix, weight_manager), config(WanConfig::detect_from_weights(tensor_storage_map, prefix)) { if (config.num_layers == 30) { if (version == VERSION_WAN2_2_TI2V) { @@ -1017,7 +1017,8 @@ namespace WAN { ggml_type model_data_type = GGML_TYPE_F16; LOG_INFO("loading from '%s'", file_path.c_str()); - ModelLoader model_loader; + auto model_manager = std::make_shared(); + ModelLoader& model_loader = model_manager->loader(); if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) { LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str()); return; @@ -1031,23 +1032,19 @@ namespace WAN { } std::shared_ptr wan = std::make_shared(backend, - backend, tensor_storage_map, "model.diffusion_model", - VERSION_WAN2_2_TI2V); + VERSION_WAN2_2_TI2V, + model_manager); - if (!wan->alloc_params_buffer()) { - LOG_ERROR("wan buffer allocation failed"); - return; - } - - std::map tensors; - wan->get_param_tensors(tensors, "model.diffusion_model"); - - bool success = model_loader.load_tensors(tensors); - - if (!success) { - LOG_ERROR("load tensors from model loader failed"); + if (!model_manager->register_runner_params("Wan test", + *wan, + "model.diffusion_model", + ModelManager::ResidencyMode::Resident, + backend, + backend) || + !model_manager->validate_registered_tensors()) { + LOG_ERROR("register wan tensors with model manager failed"); return; } diff --git a/src/model/diffusion/z_image.hpp b/src/model/diffusion/z_image.hpp index a7d08b09..936da0f7 100644 --- a/src/model/diffusion/z_image.hpp +++ b/src/model/diffusion/z_image.hpp @@ -553,11 +553,11 @@ namespace ZImage { SDVersion version; ZImageRunner(ggml_backend_t backend, - ggml_backend_t params_backend, - const String2TensorStorage& tensor_storage_map = {}, - const std::string prefix = "", - SDVersion version = VERSION_Z_IMAGE) - : DiffusionModelRunner(backend, params_backend, prefix), + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "", + SDVersion version = VERSION_Z_IMAGE, + std::shared_ptr weight_manager = nullptr) + : DiffusionModelRunner(backend, prefix, weight_manager), config(ZImageConfig::detect_from_weights(tensor_storage_map, prefix)) { z_image = ZImageModel(config); z_image.init(params_ctx, tensor_storage_map, prefix); @@ -698,7 +698,8 @@ namespace ZImage { ggml_backend_t backend = sd_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_Q8_0; - ModelLoader model_loader; + auto model_manager = std::make_shared(); + ModelLoader& model_loader = model_manager->loader(); if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) { LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str()); return; @@ -714,22 +715,19 @@ namespace ZImage { } std::shared_ptr z_image = std::make_shared(backend, - backend, tensor_storage_map, "model.diffusion_model", - VERSION_QWEN_IMAGE); + VERSION_QWEN_IMAGE, + model_manager); - if (!z_image->alloc_params_buffer()) { - LOG_ERROR("z_image buffer allocation failed"); - return; - } - std::map tensors; - z_image->get_param_tensors(tensors, "model.diffusion_model"); - - bool success = model_loader.load_tensors(tensors); - - if (!success) { - LOG_ERROR("load tensors from model loader failed"); + if (!model_manager->register_runner_params("ZImage test", + *z_image, + "model.diffusion_model", + ModelManager::ResidencyMode::Resident, + backend, + backend) || + !model_manager->validate_registered_tensors()) { + LOG_ERROR("register z_image tensors with model manager failed"); return; } diff --git a/src/model/te/clip.hpp b/src/model/te/clip.hpp index 6767a1b1..6dc8a947 100644 --- a/src/model/te/clip.hpp +++ b/src/model/te/clip.hpp @@ -1,4 +1,4 @@ -#ifndef __SD_MODEL_TE_CLIP_HPP__ +#ifndef __SD_MODEL_TE_CLIP_HPP__ #define __SD_MODEL_TE_CLIP_HPP__ #include "core/ggml_extend.hpp" @@ -469,13 +469,13 @@ struct CLIPTextModelRunner : public GGMLRunner { std::vector attention_mask_vec; CLIPTextModelRunner(ggml_backend_t backend, - ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::string prefix, - CLIPVersion version = OPENAI_CLIP_VIT_L_14, - bool with_final_ln = true, - bool force_clip_f32 = false) - : GGMLRunner(backend, params_backend) { + CLIPVersion version = OPENAI_CLIP_VIT_L_14, + bool with_final_ln = true, + bool force_clip_f32 = false, + std::shared_ptr weight_manager = nullptr) + : GGMLRunner(backend, weight_manager) { bool proj_in = false; for (const auto& [name, tensor_storage] : tensor_storage_map) { if (!starts_with(name, prefix)) { diff --git a/src/model/te/llm.hpp b/src/model/te/llm.hpp index d8623bc3..3905d53a 100644 --- a/src/model/te/llm.hpp +++ b/src/model/te/llm.hpp @@ -1,4 +1,4 @@ -#ifndef __SD_MODEL_TE_LLM_HPP__ +#ifndef __SD_MODEL_TE_LLM_HPP__ #define __SD_MODEL_TE_LLM_HPP__ #include @@ -22,6 +22,7 @@ #include "json.hpp" #include "model/common/rope.hpp" #include "model_loader.h" +#include "model_manager.h" #include "tokenizers/bpe_tokenizer.h" #include "tokenizers/gemma_tokenizer.h" #include "tokenizers/gpt_oss_tokenizer.h" @@ -1571,11 +1572,11 @@ namespace LLM { public: LLMRunner(LLMArch arch, ggml_backend_t backend, - ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::string prefix, - bool enable_vision_ = false) - : GGMLRunner(backend, params_backend), + bool enable_vision_ = false, + std::shared_ptr weight_manager = nullptr) + : GGMLRunner(backend, weight_manager), config(LLMConfig::detect_from_weights(tensor_storage_map, prefix, arch)), enable_vision(enable_vision_) { if (enable_vision && !config.have_vision_weight) { @@ -1822,11 +1823,11 @@ namespace LLM { LLMEmbedder(LLMArch arch, ggml_backend_t backend, - ggml_backend_t params_backend, - const String2TensorStorage& tensor_storage_map = {}, - const std::string prefix = "", - bool enable_vision = false) - : model(arch, backend, params_backend, tensor_storage_map, prefix, enable_vision) { + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "", + bool enable_vision = false, + std::shared_ptr weight_manager = nullptr) + : model(arch, backend, tensor_storage_map, prefix, enable_vision, weight_manager) { if (arch == LLMArch::MISTRAL_SMALL_3_2 || arch == LLMArch::MINISTRAL_3_3B) { tokenizer = std::make_shared(); } else if (arch == LLMArch::GPT_OSS_20B) { @@ -1840,13 +1841,6 @@ namespace LLM { model.get_param_tensors(tensors, prefix); } - bool alloc_params_buffer() { - if (!model.alloc_params_buffer()) { - return false; - } - return true; - } - std::tuple, std::vector> tokenize(std::string text, std::pair attn_range, size_t max_length = 0, @@ -2062,7 +2056,8 @@ namespace LLM { ggml_backend_t backend = sd_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_COUNT; - ModelLoader model_loader; + auto model_manager = std::make_shared(); + ModelLoader& model_loader = model_manager->loader(); if (!model_loader.init_from_file_and_convert_name(file_path, "text_encoders.llm.")) { LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str()); return; @@ -2080,24 +2075,20 @@ namespace LLM { LLMArch arch = LLMArch::QWEN3; std::shared_ptr llm = std::make_shared(arch, - backend, backend, tensor_storage_map, "text_encoders.llm", - true); + true, + model_manager); - if (!llm->alloc_params_buffer()) { - LOG_ERROR("llm model allocation failed"); - return; - } - - std::map tensors; - llm->get_param_tensors(tensors, "text_encoders.llm"); - - bool success = model_loader.load_tensors(tensors); - - if (!success) { - LOG_ERROR("load tensors from model loader failed"); + if (!model_manager->register_runner_params("LLM test", + *llm, + "text_encoders.llm", + ModelManager::ResidencyMode::Resident, + backend, + backend) || + !model_manager->validate_registered_tensors()) { + LOG_ERROR("register llm tensors with model manager failed"); return; } diff --git a/src/model/te/t5.hpp b/src/model/te/t5.hpp index 9bde46fc..a8d1e869 100644 --- a/src/model/te/t5.hpp +++ b/src/model/te/t5.hpp @@ -1,4 +1,4 @@ -#ifndef __SD_MODEL_TE_T5_HPP__ +#ifndef __SD_MODEL_TE_T5_HPP__ #define __SD_MODEL_TE_T5_HPP__ #include @@ -12,6 +12,7 @@ #include "core/ggml_extend.hpp" #include "model_loader.h" +#include "model_manager.h" #include "tokenizers/t5_unigram_tokenizer.h" struct T5Config { @@ -334,11 +335,11 @@ struct T5Runner : public GGMLRunner { std::vector relative_position_bucket_vec; T5Runner(ggml_backend_t backend, - ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::string prefix, - bool is_umt5 = false) - : GGMLRunner(backend, params_backend), + bool is_umt5 = false, + std::shared_ptr weight_manager = nullptr) + : GGMLRunner(backend, weight_manager), config(T5Config::detect_from_weights(tensor_storage_map, prefix, is_umt5)) { model = T5(config); model.init(params_ctx, tensor_storage_map, prefix); @@ -477,24 +478,17 @@ struct T5Embedder { T5Runner model; T5Embedder(ggml_backend_t backend, - ggml_backend_t params_backend, - const String2TensorStorage& tensor_storage_map = {}, - const std::string prefix = "", - bool is_umt5 = false) - : model(backend, params_backend, tensor_storage_map, prefix, is_umt5), tokenizer(is_umt5) { + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "", + bool is_umt5 = false, + std::shared_ptr weight_manager = nullptr) + : model(backend, tensor_storage_map, prefix, is_umt5, weight_manager), tokenizer(is_umt5) { } void get_param_tensors(std::map& tensors, const std::string prefix) { model.get_param_tensors(tensors, prefix); } - bool alloc_params_buffer() { - if (!model.alloc_params_buffer()) { - return false; - } - return true; - } - std::tuple, std::vector, std::vector> tokenize(std::string text, size_t max_length = 0, bool padding = false) { @@ -579,7 +573,8 @@ struct T5Embedder { ggml_backend_t backend = sd_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_F16; - ModelLoader model_loader; + auto model_manager = std::make_shared(); + ModelLoader& model_loader = model_manager->loader(); if (!model_loader.init_from_file_and_convert_name(file_path)) { LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str()); return; @@ -592,19 +587,16 @@ struct T5Embedder { } } - std::shared_ptr t5 = std::make_shared(backend, backend, tensor_storage_map, "", true); + std::shared_ptr t5 = std::make_shared(backend, tensor_storage_map, "", true, model_manager); - if (!t5->alloc_params_buffer()) { - LOG_ERROR("t5 params buffer allocation failed"); - return; - } - std::map tensors; - t5->get_param_tensors(tensors, ""); - - bool success = model_loader.load_tensors(tensors); - - if (!success) { - LOG_ERROR("load tensors from model loader failed"); + if (!model_manager->register_runner_params("T5 test", + *t5, + "", + ModelManager::ResidencyMode::Resident, + backend, + backend) || + !model_manager->validate_registered_tensors()) { + LOG_ERROR("register t5 tensors with model manager failed"); return; } diff --git a/src/model/upscaler/esrgan.hpp b/src/model/upscaler/esrgan.hpp index 7fabd6ef..4afbab07 100644 --- a/src/model/upscaler/esrgan.hpp +++ b/src/model/upscaler/esrgan.hpp @@ -1,4 +1,4 @@ -#ifndef __SD_MODEL_UPSCALER_ESRGAN_HPP__ +#ifndef __SD_MODEL_UPSCALER_ESRGAN_HPP__ #define __SD_MODEL_UPSCALER_ESRGAN_HPP__ #include @@ -229,9 +229,9 @@ struct ESRGAN : public GGMLRunner { std::unique_ptr rrdb_net; ESRGAN(ggml_backend_t backend, - ggml_backend_t params_backend, - const String2TensorStorage& tensor_storage_map = {}) - : GGMLRunner(backend, params_backend), + const String2TensorStorage& tensor_storage_map = {}, + std::shared_ptr weight_manager = nullptr) + : GGMLRunner(backend, weight_manager), config(ESRGANConfig::detect_from_weights(tensor_storage_map)), rrdb_net(std::make_unique(config)) { rrdb_net->init(params_ctx, tensor_storage_map, ""); diff --git a/src/model/upscaler/ltx_latent_upscaler.hpp b/src/model/upscaler/ltx_latent_upscaler.hpp index 1bccae2b..5343ad03 100644 --- a/src/model/upscaler/ltx_latent_upscaler.hpp +++ b/src/model/upscaler/ltx_latent_upscaler.hpp @@ -1,4 +1,4 @@ -#ifndef __SD_MODEL_UPSCALER_LTX_LATENT_UPSCALER_HPP__ +#ifndef __SD_MODEL_UPSCALER_LTX_LATENT_UPSCALER_HPP__ #define __SD_MODEL_UPSCALER_LTX_LATENT_UPSCALER_HPP__ #include @@ -433,9 +433,9 @@ namespace LTXVUpsampler { std::unique_ptr model; LatentUpsamplerRunner(ggml_backend_t backend, - ggml_backend_t params_backend, - const String2TensorStorage& tensor_storage_map) - : GGMLRunner(backend, params_backend), + const String2TensorStorage& tensor_storage_map, + std::shared_ptr weight_manager = nullptr) + : GGMLRunner(backend, weight_manager), config(LatentUpsamplerConfig::detect_from_weights(tensor_storage_map)) { if (config.dims != 3 || (!config.spatial_upsample && !config.temporal_upsample) || config.spatial_up_num < 1 || config.spatial_down_den < 1 || config.temporal_up_factor < 1) { diff --git a/src/model/vae/auto_encoder_kl.hpp b/src/model/vae/auto_encoder_kl.hpp index 443846fe..478b18ed 100644 --- a/src/model/vae/auto_encoder_kl.hpp +++ b/src/model/vae/auto_encoder_kl.hpp @@ -213,9 +213,9 @@ protected: params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1); } - float get_alpha() { - float alpha = ggml_ext_backend_tensor_get_f32(params["mix_factor"]); - return sigmoid(alpha); + ggml_tensor* get_alpha(GGMLRunnerContext* ctx) { + auto mix_factor = ggml_ext_cast_f32(ctx->ggml_ctx, ctx->backend, params["mix_factor"]); + return ggml_sigmoid(ctx->ggml_ctx, mix_factor); } public: @@ -250,10 +250,12 @@ public: x = time_stack->forward(ctx, x); // b t c (h w) - float alpha = get_alpha(); - x = ggml_add(ctx->ggml_ctx, - ggml_ext_scale(ctx->ggml_ctx, x, alpha), - ggml_ext_scale(ctx->ggml_ctx, x_mix, 1.0f - alpha)); + auto alpha = get_alpha(ctx); + x = ggml_add(ctx->ggml_ctx, + x_mix, + ggml_mul(ctx->ggml_ctx, + ggml_sub(ctx->ggml_ctx, x, x_mix), + alpha)); x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w) x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w @@ -664,13 +666,13 @@ struct AutoEncoderKL : public VAE { AutoEncoderKLModel ae; AutoEncoderKL(ggml_backend_t backend, - ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::string prefix, - bool decode_only = false, - bool use_video_decoder = false, - SDVersion version = VERSION_SD1) - : VAE(version, backend, params_backend, prefix), decode_only(decode_only) { + bool decode_only = false, + bool use_video_decoder = false, + SDVersion version = VERSION_SD1, + std::shared_ptr weight_manager = nullptr) + : VAE(version, backend, prefix, weight_manager), decode_only(decode_only) { if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) { scale_factor = 0.18215f; shift_factor = 0.f; diff --git a/src/model/vae/ltx_audio_vae.hpp b/src/model/vae/ltx_audio_vae.hpp index 822386b9..bd0d18a9 100644 --- a/src/model/vae/ltx_audio_vae.hpp +++ b/src/model/vae/ltx_audio_vae.hpp @@ -1,4 +1,4 @@ -#ifndef __SD_MODEL_VAE_LTX_AUDIO_VAE_HPP__ +#ifndef __SD_MODEL_VAE_LTX_AUDIO_VAE_HPP__ #define __SD_MODEL_VAE_LTX_AUDIO_VAE_HPP__ #include @@ -9,6 +9,7 @@ #include "core/ggml_extend.hpp" #include "model_loader.h" +#include "model_manager.h" namespace LTXV { @@ -1001,10 +1002,10 @@ namespace LTXV { sd::Tensor bwe_skip_filter_tensor; LTXAudioVAERunner(ggml_backend_t backend, - ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, - const std::string& prefix = "") - : GGMLRunner(backend, params_backend), + const std::string& prefix = "", + std::shared_ptr weight_manager = nullptr) + : GGMLRunner(backend, weight_manager), weight_prefix(prefix), config(LTXAudioVAEConfig::detect_from_weights(tensor_storage_map)), model(config) { @@ -1019,7 +1020,7 @@ namespace LTXV { model.get_param_tensors(tensors, weight_prefix); } - size_t get_params_buffer_size() { + size_t get_params_mem_size() { return model.get_params_mem_size(); } @@ -1066,7 +1067,8 @@ namespace LTXV { // ggml_backend_t backend = ggml_backend_cuda_init(0); LOG_INFO("loading ltx audio vae from '%s'", model_path.c_str()); - ModelLoader model_loader; + auto model_manager = std::make_shared(); + ModelLoader& model_loader = model_manager->loader(); if (!model_loader.init_from_file(model_path)) { LOG_ERROR("init model loader from file failed: '%s'", model_path.c_str()); return; @@ -1074,20 +1076,17 @@ namespace LTXV { auto& tensor_storage_map = model_loader.get_tensor_storage_map(); auto ltx_audio_vae = std::make_shared(backend, - backend, tensor_storage_map, - prefix); + prefix, + model_manager); - if (!ltx_audio_vae->alloc_params_buffer()) { - LOG_ERROR("ltx audio vae buffer allocation failed"); - return; - } - - std::map tensors; - ltx_audio_vae->get_param_tensors(tensors); - - if (!model_loader.load_tensors(tensors)) { - LOG_ERROR("load tensors from model loader failed"); + if (!model_manager->register_runner_params("LTX audio VAE test", + *ltx_audio_vae, + ModelManager::ResidencyMode::Resident, + backend, + backend) || + !model_manager->validate_registered_tensors()) { + LOG_ERROR("register ltx audio vae tensors with model manager failed"); return; } diff --git a/src/model/vae/ltx_vae.hpp b/src/model/vae/ltx_vae.hpp index 59e38c32..77ce9656 100644 --- a/src/model/vae/ltx_vae.hpp +++ b/src/model/vae/ltx_vae.hpp @@ -957,8 +957,8 @@ namespace LTXVAE { ggml_tensor* scaled_timestep = timestep; if (timestep_conditioning) { - auto multiplier = ggml_ext_backend_tensor_get_f32(params["timestep_scale_multiplier"]); - scaled_timestep = ggml_ext_scale(ctx->ggml_ctx, timestep, multiplier); + auto multiplier = ggml_ext_cast_f32(ctx->ggml_ctx, ctx->backend, params["timestep_scale_multiplier"]); + scaled_timestep = ggml_mul(ctx->ggml_ctx, timestep, multiplier); } x = conv_in->forward(ctx, x, causal_decoder); @@ -1008,8 +1008,8 @@ namespace LTXVAE { ggml_tensor* scaled_timestep = timestep; if (timestep_conditioning && timestep != nullptr) { - auto multiplier = ggml_ext_backend_tensor_get_f32(params["timestep_scale_multiplier"]); - scaled_timestep = ggml_ext_scale(ctx->ggml_ctx, timestep, multiplier); + auto multiplier = ggml_ext_cast_f32(ctx->ggml_ctx, ctx->backend, params["timestep_scale_multiplier"]); + scaled_timestep = ggml_mul(ctx->ggml_ctx, timestep, multiplier); } // conv_in with feat_map for left temporal context @@ -1223,11 +1223,11 @@ struct LTXVideoVAE : public VAE { LTXVAE::VideoVAE vae; LTXVideoVAE(ggml_backend_t backend, - ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::string& prefix, - bool decode_only = true, - SDVersion version = VERSION_LTXAV) + bool decode_only = true, + SDVersion version = VERSION_LTXAV, + std::shared_ptr weight_manager = nullptr) : decode_only(decode_only), ltx_vae_version(LTXVAE::detect_ltx_vae_version(tensor_storage_map, prefix)), timestep_conditioning(LTXVAE::detect_ltx_vae_timestep_conditioning(tensor_storage_map, prefix)), @@ -1239,7 +1239,7 @@ struct LTXVideoVAE : public VAE { patch_size, tensor_storage_map, prefix), - VAE(version, backend, params_backend, prefix) { + VAE(version, backend, prefix, weight_manager) { vae.init(params_ctx, tensor_storage_map, prefix); decode_timestep_tensor.values()[0] = vae.decode_timestep; } @@ -1521,7 +1521,8 @@ struct LTXVideoVAE : public VAE { ggml_backend_t backend = sd_backend_cpu_init(); LOG_INFO("loading ltx vae from '%s'", model_path.c_str()); - ModelLoader model_loader; + auto model_manager = std::make_shared(); + ModelLoader& model_loader = model_manager->loader(); if (!model_loader.init_from_file_and_convert_name(model_path, "vae.")) { LOG_ERROR("init model loader from file failed: '%s'", model_path.c_str()); return; @@ -1529,22 +1530,19 @@ struct LTXVideoVAE : public VAE { auto& tensor_storage_map = model_loader.get_tensor_storage_map(); std::shared_ptr vae = std::make_shared(backend, - backend, tensor_storage_map, "first_stage_model", true, - VERSION_LTXAV); + VERSION_LTXAV, + model_manager); - if (!vae->alloc_params_buffer()) { - LOG_ERROR("vae buffer allocation failed"); - return; - } - - std::map tensors; - vae->get_param_tensors(tensors); - - if (!model_loader.load_tensors(tensors)) { - LOG_ERROR("load tensors from model loader failed"); + if (!model_manager->register_runner_params("LTX VAE test", + *vae, + ModelManager::ResidencyMode::Resident, + backend, + backend) || + !model_manager->validate_registered_tensors()) { + LOG_ERROR("register ltx vae tensors with model manager failed"); return; } diff --git a/src/model/vae/tae.hpp b/src/model/vae/tae.hpp index 95a2cd58..7c6e1d35 100644 --- a/src/model/vae/tae.hpp +++ b/src/model/vae/tae.hpp @@ -623,12 +623,12 @@ struct TinyImageAutoEncoder : public VAE { bool decode_only = false; TinyImageAutoEncoder(ggml_backend_t backend, - ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::string prefix, - bool decoder_only = true, - SDVersion version = VERSION_SD1) - : VAE(version, backend, params_backend, "tae"), + bool decoder_only = true, + SDVersion version = VERSION_SD1, + std::shared_ptr weight_manager = nullptr) + : VAE(version, backend, "tae", weight_manager), decode_only(decoder_only), taesd(decoder_only, version) { scale_input = false; @@ -686,12 +686,12 @@ struct TinyVideoAutoEncoder : public VAE { bool is_wide = false; TinyVideoAutoEncoder(ggml_backend_t backend, - ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map, const std::string prefix, - bool decoder_only = true, - SDVersion version = VERSION_WAN2) - : VAE(version, backend, params_backend, "tae"), + bool decoder_only = true, + SDVersion version = VERSION_WAN2, + std::shared_ptr weight_manager = nullptr) + : VAE(version, backend, "tae", weight_manager), decode_only(decoder_only) { for (auto tensor_storage : tensor_storage_map) { if (tensor_storage.first.find(prefix + ".3.conv.6.weight") != std::string::npos) { diff --git a/src/model/vae/vae.hpp b/src/model/vae/vae.hpp index 1f508b6e..af091bb5 100644 --- a/src/model/vae/vae.hpp +++ b/src/model/vae/vae.hpp @@ -1,8 +1,9 @@ -#ifndef __SD_MODEL_VAE_VAE_HPP__ +#ifndef __SD_MODEL_VAE_VAE_HPP__ #define __SD_MODEL_VAE_VAE_HPP__ #include "core/tensor_ggml.hpp" #include "model/common/block.hpp" +#include "model_manager.h" struct VAE : public GGMLRunner { protected: @@ -63,8 +64,11 @@ protected: } public: - VAE(SDVersion version, ggml_backend_t backend, ggml_backend_t params_backend, const std::string& weight_prefix = "") - : version(version), weight_prefix(weight_prefix), GGMLRunner(backend, params_backend) {} + VAE(SDVersion version, + ggml_backend_t backend, + const std::string& weight_prefix = "", + std::shared_ptr weight_manager = nullptr) + : version(version), weight_prefix(weight_prefix), GGMLRunner(backend, weight_manager) {} int get_scale_factor() { int scale_factor = 8; @@ -224,8 +228,10 @@ public: }; struct FakeVAE : public VAE { - FakeVAE(SDVersion version, ggml_backend_t backend, ggml_backend_t params_backend) - : VAE(version, backend, params_backend) {} + FakeVAE(SDVersion version, + ggml_backend_t backend, + std::shared_ptr weight_manager = nullptr) + : VAE(version, backend, "", weight_manager) {} int get_encoder_output_channels(int input_channels) { return input_channels; diff --git a/src/model/vae/wan_vae.hpp b/src/model/vae/wan_vae.hpp index 36bb8696..c8cfaa9d 100644 --- a/src/model/vae/wan_vae.hpp +++ b/src/model/vae/wan_vae.hpp @@ -1124,12 +1124,12 @@ namespace WAN { WanVAE ae; WanVAERunner(ggml_backend_t backend, - ggml_backend_t params_backend, - const String2TensorStorage& tensor_storage_map = {}, - const std::string prefix = "", - bool decode_only = false, - SDVersion version = VERSION_WAN2) - : VAE(version, backend, params_backend, prefix), decode_only(decode_only), ae(decode_only, version == VERSION_WAN2_2_TI2V) { + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "", + bool decode_only = false, + SDVersion version = VERSION_WAN2, + std::shared_ptr weight_manager = nullptr) + : VAE(version, backend, prefix, weight_manager), decode_only(decode_only), ae(decode_only, version == VERSION_WAN2_2_TI2V) { ae.init(params_ctx, tensor_storage_map, prefix); } @@ -1327,27 +1327,24 @@ namespace WAN { // ggml_backend_t backend = ggml_backend_cuda_init(0); ggml_backend_t backend = sd_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_F16; - std::shared_ptr vae = std::make_shared(backend, backend, String2TensorStorage{}, "first_stage_model", false, VERSION_WAN2_2_TI2V); + auto model_manager = std::make_shared(); + std::shared_ptr vae = std::make_shared(backend, String2TensorStorage{}, "first_stage_model", false, VERSION_WAN2_2_TI2V, model_manager); { LOG_INFO("loading from '%s'", file_path.c_str()); - if (!vae->alloc_params_buffer()) { - LOG_ERROR("vae buffer allocation failed"); - return; - } - std::map tensors; - vae->get_param_tensors(tensors); - - ModelLoader model_loader; + ModelLoader& model_loader = model_manager->loader(); if (!model_loader.init_from_file_and_convert_name(file_path, "vae.")) { LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str()); return; } - bool success = model_loader.load_tensors(tensors); - - if (!success) { - LOG_ERROR("load tensors from model loader failed"); + if (!model_manager->register_runner_params("Wan VAE test", + *vae, + ModelManager::ResidencyMode::Resident, + backend, + backend) || + !model_manager->validate_registered_tensors()) { + LOG_ERROR("register wan vae tensors with model manager failed"); return; } diff --git a/src/model_manager.h b/src/model_manager.h index b3da8a36..e18d4c5d 100644 --- a/src/model_manager.h +++ b/src/model_manager.h @@ -121,6 +121,42 @@ public: ggml_backend_t compute_backend, ggml_backend_t params_backend, size_t* registered_tensor_size = nullptr); + + template + bool register_runner_params(const std::string& desc, + Runner& runner, + ResidencyMode residency_mode, + ggml_backend_t compute_backend, + ggml_backend_t params_backend, + size_t* registered_tensor_size = nullptr) { + std::map tensors; + runner.get_param_tensors(tensors); + return register_param_tensors(desc, + std::move(tensors), + residency_mode, + compute_backend, + params_backend, + registered_tensor_size); + } + + template + bool register_runner_params(const std::string& desc, + Runner& runner, + const std::string& prefix, + ResidencyMode residency_mode, + ggml_backend_t compute_backend, + ggml_backend_t params_backend, + size_t* registered_tensor_size = nullptr) { + std::map tensors; + runner.get_param_tensors(tensors, prefix); + return register_param_tensors(desc, + std::move(tensors), + residency_mode, + compute_backend, + params_backend, + registered_tensor_size); + } + bool validate_registered_tensors(); bool prepare_params(const std::vector& tensors) override; diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 19f9e85e..c071fd29 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -241,7 +241,6 @@ public: } std::map group_tensors; model->get_param_tensors(group_tensors); - model->set_weight_manager(model_manager); if (model_manager == nullptr) { return true; } @@ -586,33 +585,35 @@ public: if (sd_version_is_sd3(version)) { cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), - params_backend_for(SDBackendModule::TE), - tensor_storage_map); + tensor_storage_map, + model_manager); diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), - params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, - "model.diffusion_model"); + "model.diffusion_model", + model_manager); } else if (sd_version_is_pid(version)) { vae_decode_only = false; cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), - params_backend_for(SDBackendModule::TE), - tensor_storage_map, - version); - diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), - params_backend_for(SDBackendModule::DIFFUSION), - tensor_storage_map, - "model.diffusion_model.net"); - } else if (sd_version_is_ideogram4(version)) { - cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), - params_backend_for(SDBackendModule::TE), tensor_storage_map, version, "", - false); + false, + model_manager); + diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), + tensor_storage_map, + "model.diffusion_model.net", + model_manager); + } else if (sd_version_is_ideogram4(version)) { + cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), + tensor_storage_map, + version, + "", + false, + model_manager); diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), - params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, - "model.diffusion_model"); + "model.diffusion_model", + model_manager); } else if (sd_version_is_flux(version)) { bool is_chroma = false; for (auto pair : tensor_storage_map) { @@ -623,66 +624,71 @@ public: } if (is_chroma) { cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), - params_backend_for(SDBackendModule::TE), tensor_storage_map, sd_ctx_params->chroma_use_t5_mask, - sd_ctx_params->chroma_t5_mask_pad); + sd_ctx_params->chroma_t5_mask_pad, + false, + model_manager); } else if (version == VERSION_OVIS_IMAGE) { cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), - params_backend_for(SDBackendModule::TE), tensor_storage_map, version, "", - false); + false, + model_manager); } else { cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), - params_backend_for(SDBackendModule::TE), - tensor_storage_map); + tensor_storage_map, + model_manager); } diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), - params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, "model.diffusion_model", version, - sd_ctx_params->chroma_use_dit_mask); + sd_ctx_params->chroma_use_dit_mask, + model_manager); } else if (sd_version_is_flux2(version)) { bool is_chroma = false; cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), - params_backend_for(SDBackendModule::TE), tensor_storage_map, - version); + version, + "", + false, + model_manager); diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), - params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, "model.diffusion_model", version, - sd_ctx_params->chroma_use_dit_mask); + sd_ctx_params->chroma_use_dit_mask, + model_manager); } else if (sd_version_is_ltxav(version)) { cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), - params_backend_for(SDBackendModule::TE), - tensor_storage_map); + tensor_storage_map, + "text_encoders.llm", + "text_embedding_projection", + model_manager); diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), - params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, - "model.diffusion_model"); + "model.diffusion_model", + model_manager); } else if (sd_version_is_wan(version)) { cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), - params_backend_for(SDBackendModule::TE), tensor_storage_map, true, 0, - true); + true, + model_manager); diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), - params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, "model.diffusion_model", - version); + version, + model_manager); if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) { high_noise_diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), - params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, "model.high_noise_diffusion_model", - version); + version, + model_manager); } if (diffusion_model->get_desc() == "Wan2.1-I2V-14B" || diffusion_model->get_desc() == "Wan2.1-FLF2V-14B" || @@ -691,8 +697,8 @@ public: return false; } clip_vision = std::make_shared(backend_for(SDBackendModule::CLIP_VISION), - params_backend_for(SDBackendModule::CLIP_VISION), - tensor_storage_map); + tensor_storage_map, + model_manager); clip_vision->set_max_graph_vram_bytes(max_graph_vram_bytes); if (!register_runner_params("CLIP vision", clip_vision, @@ -706,93 +712,99 @@ public: enable_vision = true; } cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), - params_backend_for(SDBackendModule::TE), tensor_storage_map, version, "", - enable_vision); + enable_vision, + model_manager); diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), - params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, "model.diffusion_model", version, - sd_ctx_params->qwen_image_zero_cond_t); + sd_ctx_params->qwen_image_zero_cond_t, + model_manager); } else if (sd_version_is_longcat(version)) { bool enable_vision = false; if (!vae_decode_only) { enable_vision = true; } cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), - params_backend_for(SDBackendModule::TE), tensor_storage_map, version, "", - enable_vision); + enable_vision, + model_manager); diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), - params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, "model.diffusion_model", version, - sd_ctx_params->chroma_use_dit_mask); + sd_ctx_params->chroma_use_dit_mask, + model_manager); } else if (version == VERSION_HIDREAM_O1) { cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), - params_backend_for(SDBackendModule::TE), - tensor_storage_map); + tensor_storage_map, + model_manager); diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), - params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, - "model"); + "model", + model_manager); } else if (sd_version_is_anima(version)) { cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), - params_backend_for(SDBackendModule::TE), - tensor_storage_map); + tensor_storage_map, + model_manager); diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), - params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, - "model.diffusion_model"); + "model.diffusion_model", + model_manager); } else if (sd_version_is_z_image(version)) { cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), - params_backend_for(SDBackendModule::TE), tensor_storage_map, - version); + version, + "", + false, + model_manager); diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), - params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, "model.diffusion_model", - version); + version, + model_manager); } else if (sd_version_is_ernie_image(version)) { cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), - params_backend_for(SDBackendModule::TE), tensor_storage_map, - version); + version, + "", + false, + model_manager); diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), - params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, - "model.diffusion_model"); + "model.diffusion_model", + model_manager); } else if (sd_version_is_lens(version)) { cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), - params_backend_for(SDBackendModule::TE), tensor_storage_map, - version); + version, + "", + false, + model_manager); diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), - params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, - "model.diffusion_model"); + "model.diffusion_model", + model_manager); } else { // SD1.x SD2.x SDXL std::map embbeding_map; for (uint32_t i = 0; i < sd_ctx_params->embedding_count; i++) { embbeding_map.emplace(SAFE_STR(sd_ctx_params->embeddings[i].name), SAFE_STR(sd_ctx_params->embeddings[i].path)); } cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), - params_backend_for(SDBackendModule::TE), tensor_storage_map, embbeding_map, - version); + version, + model_manager); diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION), - params_backend_for(SDBackendModule::DIFFUSION), tensor_storage_map, "model.diffusion_model", - version); + version, + model_manager); if (sd_ctx_params->diffusion_conv_direct) { LOG_INFO("Using Conv2d direct in the diffusion model"); diffusion_model->set_conv2d_direct_enabled(true); @@ -841,19 +853,19 @@ public: sd_version_is_anima(version) || sd_version_is_ltxav(version)) { return std::make_shared(backend_for(SDBackendModule::VAE), - params_backend_for(SDBackendModule::VAE), tensor_storage_map, "decoder", vae_decode_only, - version); + version, + model_manager); } else { auto model = std::make_shared(backend_for(SDBackendModule::VAE), - params_backend_for(SDBackendModule::VAE), tensor_storage_map, "decoder.layers", vae_decode_only, - version); + version, + model_manager); return model; } }; @@ -871,28 +883,28 @@ public: auto create_vae = [&]() -> std::shared_ptr { if (sd_version_is_ltxav(version)) { return std::make_shared(backend_for(SDBackendModule::VAE), - params_backend_for(SDBackendModule::VAE), tensor_storage_map, "first_stage_model", vae_decode_only, - version); + version, + model_manager); } else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { return std::make_shared(backend_for(SDBackendModule::VAE), - params_backend_for(SDBackendModule::VAE), tensor_storage_map, "first_stage_model", vae_decode_only, - version); + version, + model_manager); } else { auto model = std::make_shared(backend_for(SDBackendModule::VAE), - params_backend_for(SDBackendModule::VAE), tensor_storage_map, "first_stage_model", vae_decode_only, false, - vae_version); + vae_version, + model_manager); if (sd_version_is_sdxl(version) && (strlen(SAFE_STR(sd_ctx_params->vae_path)) == 0 || sd_ctx_params->force_sdxl_vae_conv_scale || external_vae_is_invalid)) { float vae_conv_2d_scale = 1.f / 32.f; @@ -910,7 +922,7 @@ public: LOG_INFO("using FakeVAE"); first_stage_model = std::make_shared(version, backend_for(SDBackendModule::VAE), - params_backend_for(SDBackendModule::VAE)); + model_manager); if (!register_runner_params("VAE", first_stage_model, SDBackendModule::VAE, @@ -952,8 +964,9 @@ public: if (use_audio_vae) { audio_vae_model = std::make_shared(backend_for(SDBackendModule::VAE), - params_backend_for(SDBackendModule::VAE), - tensor_storage_map); + tensor_storage_map, + "", + model_manager); if (!register_runner_params("LTX audio VAE", audio_vae_model, SDBackendModule::VAE, @@ -977,7 +990,9 @@ public: control_net = std::make_shared(backend_for(SDBackendModule::CONTROL_NET), params_backend_for(SDBackendModule::CONTROL_NET), model_loader.get_tensor_storage_map(), - version); + version, + "", + model_manager); if (sd_ctx_params->diffusion_conv_direct) { LOG_INFO("Using Conv2d direct in the control net"); control_net->set_conv2d_direct_enabled(true); @@ -998,6 +1013,7 @@ public: version, tensor_storage_map, model_loader, + model_manager, n_threads, [this](SDBackendModule module) { return ensure_backend_pair(module); }, [this](SDBackendModule module) { return backend_for(module); }, @@ -5008,8 +5024,8 @@ static sd::Tensor upscale_ltx_spatial_video_latent(sd_ctx_t* sd_ctx, std::unique_ptr upsampler = std::make_unique(sd_ctx->sd->backend_for(SDBackendModule::UPSCALER), - sd_ctx->sd->params_backend_for(SDBackendModule::UPSCALER), - model_loader.get_tensor_storage_map()); + model_loader.get_tensor_storage_map(), + upsampler_manager); const size_t max_graph_vram_bytes = sd::ggml_graph_cut::max_vram_gib_to_bytes(sd_ctx->sd->max_vram); upsampler->set_max_graph_vram_bytes(max_graph_vram_bytes); if (upsampler->model == nullptr) { @@ -5019,7 +5035,6 @@ static sd::Tensor upscale_ltx_spatial_video_latent(sd_ctx_t* sd_ctx, std::map tensors; upsampler->get_param_tensors(tensors); - upsampler->set_weight_manager(upsampler_manager); if (!upsampler_manager->register_param_tensors("LTX latent upsampler", std::move(tensors), ModelManager::ResidencyMode::Resident, diff --git a/src/upscaler.cpp b/src/upscaler.cpp index b2bc9a62..0a9182e9 100644 --- a/src/upscaler.cpp +++ b/src/upscaler.cpp @@ -90,8 +90,8 @@ bool UpscalerGGML::load_from_file(const std::string& esrgan_path, model_loader.set_wtype_override(model_data_type); LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type)); esrgan_upscaler = std::make_shared(backend_for(SDBackendModule::UPSCALER), - params_backend_for(SDBackendModule::UPSCALER), - model_loader.get_tensor_storage_map()); + model_loader.get_tensor_storage_map(), + model_manager); if (esrgan_upscaler == nullptr || esrgan_upscaler->rrdb_net == nullptr) { LOG_ERROR("init esrgan model from metadata failed: '%s'", esrgan_path.c_str()); return false; @@ -104,7 +104,6 @@ bool UpscalerGGML::load_from_file(const std::string& esrgan_path, std::map tensors; esrgan_upscaler->get_param_tensors(tensors); - esrgan_upscaler->set_weight_manager(model_manager); if (!model_manager->register_param_tensors("ESRGAN", std::move(tensors), ModelManager::ResidencyMode::Resident,