diff --git a/clip.hpp b/clip.hpp index 321f5f8..1ee942d 100644 --- a/clip.hpp +++ b/clip.hpp @@ -868,12 +868,13 @@ struct CLIPTextModelRunner : public GGMLRunner { CLIPTextModel model; CLIPTextModelRunner(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types, const std::string prefix, CLIPVersion version = OPENAI_CLIP_VIT_L_14, bool with_final_ln = true, int clip_skip_value = -1) - : GGMLRunner(backend), model(version, with_final_ln, clip_skip_value) { + : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, clip_skip_value) { model.init(params_ctx, tensor_types, prefix); } diff --git a/conditioner.hpp b/conditioner.hpp index e63169b..e5b5d35 100644 --- a/conditioner.hpp +++ b/conditioner.hpp @@ -57,6 +57,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { std::vector readed_embeddings; FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types, const std::string& embd_dir, SDVersion version = VERSION_SD1, @@ -64,12 +65,12 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { int clip_skip = -1) : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) { if (sd_version_is_sd1(version)) { - text_model = std::make_shared(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14); + text_model = std::make_shared(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14); } else if (sd_version_is_sd2(version)) { - text_model = std::make_shared(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14); + text_model = std::make_shared(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14); } else if (sd_version_is_sdxl(version)) { - text_model = std::make_shared(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false); - text_model2 = std::make_shared(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false); + text_model = std::make_shared(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false); + text_model2 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false); } set_clip_skip(clip_skip); } @@ -154,7 +155,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { } return true; }; - model_loader.load_tensors(on_load, NULL); + model_loader.load_tensors(on_load); readed_embeddings.push_back(embd_name); if (embd) { int64_t hidden_size = text_model->model.hidden_size; @@ -618,8 +619,10 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { struct FrozenCLIPVisionEmbedder : public GGMLRunner { CLIPVisionModelProjection vision_model; - FrozenCLIPVisionEmbedder(ggml_backend_t backend, const String2GGMLType& tensor_types = {}) - : vision_model(OPEN_CLIP_VIT_H_14, true), GGMLRunner(backend) { + FrozenCLIPVisionEmbedder(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2GGMLType& tensor_types = {}) + : vision_model(OPEN_CLIP_VIT_H_14, true), GGMLRunner(backend, offload_params_to_cpu) { vision_model.init(params_ctx, tensor_types, "cond_stage_model.transformer"); } @@ -663,12 +666,13 @@ struct SD3CLIPEmbedder : public Conditioner { std::shared_ptr t5; SD3CLIPEmbedder(ggml_backend_t 
backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}, int clip_skip = -1) : clip_g_tokenizer(0) { - clip_l = std::make_shared(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false); - clip_g = std::make_shared(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false); - t5 = std::make_shared(backend, tensor_types, "text_encoders.t5xxl.transformer"); + clip_l = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false); + clip_g = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false); + t5 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer"); set_clip_skip(clip_skip); } @@ -1010,10 +1014,11 @@ struct FluxCLIPEmbedder : public Conditioner { size_t chunk_len = 256; FluxCLIPEmbedder(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}, int clip_skip = -1) { - clip_l = std::make_shared(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true); - t5 = std::make_shared(backend, tensor_types, "text_encoders.t5xxl.transformer"); + clip_l = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true); + t5 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer"); set_clip_skip(clip_skip); } @@ -1232,13 +1237,14 @@ struct T5CLIPEmbedder : public Conditioner { bool is_umt5 = false; T5CLIPEmbedder(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}, int clip_skip = -1, bool use_mask = false, int mask_pad = 1, bool is_umt5 = false) : use_mask(use_mask), mask_pad(mask_pad), t5_tokenizer(is_umt5) { - t5 = std::make_shared(backend, tensor_types, "text_encoders.t5xxl.transformer", is_umt5); + t5 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer", is_umt5); } void set_clip_skip(int clip_skip) { diff --git a/control.hpp b/control.hpp index d8f81fc..19f9181 100644 --- a/control.hpp +++ b/control.hpp @@ -317,9 +317,10 @@ struct ControlNet : public GGMLRunner { bool guided_hint_cached = false; ControlNet(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}, SDVersion version = VERSION_SD1) - : GGMLRunner(backend), control_net(version) { + : GGMLRunner(backend, offload_params_to_cpu), control_net(version) { control_net.init(params_ctx, tensor_types, ""); } @@ -346,7 +347,7 @@ struct ControlNet : public GGMLRunner { control_buffer_size += ggml_nbytes(controls[i]); } - control_buffer = ggml_backend_alloc_ctx_tensors(control_ctx, backend); + control_buffer = ggml_backend_alloc_ctx_tensors(control_ctx, runtime_backend); LOG_DEBUG("control buffer size %.2fMB", control_buffer_size * 1.f / 1024.f / 1024.f); } @@ -443,7 +444,7 @@ struct ControlNet : public GGMLRunner { return false; } - bool success = model_loader.load_tensors(tensors, backend, ignore_tensors); + bool success = model_loader.load_tensors(tensors, ignore_tensors); if (!success) { LOG_ERROR("load control net tensors from model loader failed"); diff --git a/diffusion_model.hpp b/diffusion_model.hpp index 6ac5c9b..4a9f170 100644 --- a/diffusion_model.hpp +++ b/diffusion_model.hpp @@ -33,10 +33,11 @@ struct 
UNetModel : public DiffusionModel { UNetModelRunner unet; UNetModel(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}, SDVersion version = VERSION_SD1, bool flash_attn = false) - : unet(backend, tensor_types, "model.diffusion_model", version, flash_attn) { + : unet(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model", version, flash_attn) { } void alloc_params_buffer() { @@ -86,8 +87,9 @@ struct MMDiTModel : public DiffusionModel { MMDiTRunner mmdit; MMDiTModel(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}) - : mmdit(backend, tensor_types, "model.diffusion_model") { + : mmdit(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model") { } void alloc_params_buffer() { @@ -136,11 +138,12 @@ struct FluxModel : public DiffusionModel { Flux::FluxRunner flux; FluxModel(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}, SDVersion version = VERSION_FLUX, bool flash_attn = false, bool use_mask = false) - : flux(backend, tensor_types, "model.diffusion_model", version, flash_attn, use_mask) { + : flux(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model", version, flash_attn, use_mask) { } void alloc_params_buffer() { @@ -189,10 +192,11 @@ struct WanModel : public DiffusionModel { WAN::WanRunner wan; WanModel(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}, SDVersion version = VERSION_FLUX, bool flash_attn = false) - : wan(backend, tensor_types, "model.diffusion_model", version, flash_attn) { + : wan(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model", version, flash_attn) { } void alloc_params_buffer() { diff --git a/esrgan.hpp b/esrgan.hpp index 4215db1..154e51b 100644 --- a/esrgan.hpp +++ b/esrgan.hpp @@ -142,8 +142,10 @@ struct ESRGAN : public GGMLRunner { int scale = 4; int tile_size = 128; // avoid cuda OOM for 4gb VRAM - ESRGAN(ggml_backend_t backend, const String2GGMLType& tensor_types = {}) - : GGMLRunner(backend) { + ESRGAN(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2GGMLType& tensor_types = {}) + : GGMLRunner(backend, offload_params_to_cpu) { rrdb_net.init(params_ctx, tensor_types, ""); } @@ -164,7 +166,7 @@ struct ESRGAN : public GGMLRunner { return false; } - bool success = model_loader.load_tensors(esrgan_tensors, backend); + bool success = model_loader.load_tensors(esrgan_tensors); if (!success) { LOG_ERROR("load esrgan tensors from model loader failed"); diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 9bbe7c7..a20b9b4 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -95,6 +95,7 @@ struct SDParams { int64_t seed = 42; bool verbose = false; bool vae_tiling = false; + bool offload_params_to_cpu = false; bool control_net_cpu = false; bool normalize_input = false; bool clip_on_cpu = false; @@ -141,8 +142,9 @@ void print_params(SDParams params) { for (auto& path : params.ref_image_paths) { printf(" %s\n", path.c_str()); }; - printf(" clip on cpu: %s\n", params.clip_on_cpu ? "true" : "false"); - printf(" controlnet cpu: %s\n", params.control_net_cpu ? "true" : "false"); + printf(" offload_params_to_cpu: %s\n", params.offload_params_to_cpu ? "true" : "false"); + printf(" clip_on_cpu: %s\n", params.clip_on_cpu ? "true" : "false"); + printf(" control_net_cpu: %s\n", params.control_net_cpu ? "true" : "false"); printf(" vae decoder on cpu:%s\n", params.vae_on_cpu ? 
"true" : "false"); printf(" diffusion flash attention:%s\n", params.diffusion_flash_attn ? "true" : "false"); printf(" strength(control): %.2f\n", params.control_strength); @@ -461,6 +463,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { options.bool_options = { {"", "--vae-tiling", "", true, ¶ms.vae_tiling}, + {"", "--offload-to-cpu", "", true, ¶ms.offload_params_to_cpu}, {"", "--control-net-cpu", "", true, ¶ms.control_net_cpu}, {"", "--normalize-input", "", true, ¶ms.normalize_input}, {"", "--clip-on-cpu", "", true, ¶ms.clip_on_cpu}, @@ -943,6 +946,7 @@ int main(int argc, const char* argv[]) { params.wtype, params.rng_type, params.schedule, + params.offload_params_to_cpu, params.clip_on_cpu, params.control_net_cpu, params.vae_on_cpu, @@ -1058,6 +1062,7 @@ int main(int argc, const char* argv[]) { int upscale_factor = 4; // unused for RealESRGAN_x4plus_anime_6B.pth if (params.esrgan_path.size() > 0 && params.upscale_repeats > 0) { upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(), + params.offload_params_to_cpu, params.n_threads); if (upscaler_ctx == NULL) { diff --git a/flux.hpp b/flux.hpp index 17af38a..044ea82 100644 --- a/flux.hpp +++ b/flux.hpp @@ -881,12 +881,13 @@ namespace Flux { bool use_mask = false; FluxRunner(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}, const std::string prefix = "", SDVersion version = VERSION_FLUX, bool flash_attn = false, bool use_mask = false) - : GGMLRunner(backend), use_mask(use_mask) { + : GGMLRunner(backend, offload_params_to_cpu), use_mask(use_mask) { flux_params.flash_attn = flash_attn; flux_params.guidance_embed = false; flux_params.depth = 0; @@ -1085,7 +1086,7 @@ namespace Flux { // ggml_backend_t backend = ggml_backend_cuda_init(0); ggml_backend_t backend = ggml_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_Q8_0; - std::shared_ptr flux = std::shared_ptr(new FluxRunner(backend)); + std::shared_ptr flux = std::shared_ptr(new FluxRunner(backend, false)); { LOG_INFO("loading from '%s'", file_path.c_str()); @@ -1099,7 +1100,7 @@ namespace Flux { return; } - bool success = model_loader.load_tensors(tensors, backend); + bool success = model_loader.load_tensors(tensors); if (!success) { LOG_ERROR("load tensors from model loader failed"); diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 7563aed..20134c2 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -1230,16 +1230,20 @@ struct GGMLRunner { protected: typedef std::function get_graph_cb_t; - struct ggml_context* params_ctx = NULL; - ggml_backend_buffer_t params_buffer = NULL; + ggml_backend_t params_backend = NULL; + ggml_backend_t runtime_backend = NULL; + + struct ggml_context* params_ctx = NULL; + ggml_backend_buffer_t params_buffer = NULL; + struct ggml_context* offload_ctx = NULL; + ggml_backend_buffer_t runtime_params_buffer = NULL; + bool params_on_runtime_backend = false; struct ggml_context* compute_ctx = NULL; struct ggml_gallocr* compute_allocr = NULL; std::map backend_tensor_data_map; - ggml_backend_t backend = NULL; - void alloc_params_ctx() { struct ggml_init_params params; params.mem_size = static_cast(MAX_PARAMS_TENSOR_NUM * ggml_tensor_overhead()); @@ -1248,6 +1252,10 @@ protected: params_ctx = ggml_init(params); GGML_ASSERT(params_ctx != NULL); + if (params_backend != runtime_backend) { + offload_ctx = ggml_init(params); + GGML_ASSERT(offload_ctx != NULL); + } } void free_params_ctx() { @@ -1255,6 +1263,10 @@ protected: ggml_free(params_ctx); params_ctx = NULL; } + if 
(offload_ctx != NULL) {
+            ggml_free(offload_ctx);
+            offload_ctx = NULL;
+        }
     }

     void alloc_compute_ctx() {
@@ -1281,7 +1293,7 @@ protected:
         reset_compute_ctx();
         struct ggml_cgraph* gf = get_graph();
         backend_tensor_data_map.clear();
-        compute_allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+        compute_allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(runtime_backend));

         if (!ggml_gallocr_reserve(compute_allocr, gf)) {
             // failed to allocate the compute buffer
@@ -1295,7 +1307,7 @@
         LOG_DEBUG("%s compute buffer size: %.2f MB(%s)",
                   get_desc().c_str(),
                   compute_buffer_size / 1024.0 / 1024.0,
-                  ggml_backend_is_cpu(backend) ? "RAM" : "VRAM");
+                  ggml_backend_is_cpu(runtime_backend) ? "RAM" : "VRAM");
         return true;
     }

@@ -1310,12 +1322,96 @@ protected:
         backend_tensor_data_map.clear();
     }

+    bool offload_params_to_runtime_backend() {
+        if (params_backend == runtime_backend) {
+            return true;
+        }
+        if (params_on_runtime_backend) {
+            return true;
+        }
+        GGML_ASSERT(runtime_params_buffer == NULL);
+        int64_t t0 = ggml_time_ms();
+        size_t num_tensors = ggml_tensor_num(offload_ctx);
+        if (num_tensors == 0) {
+            for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != NULL; t = ggml_get_next_tensor(params_ctx, t)) {
+                GGML_ASSERT(t->view_src == NULL);
+                ggml_dup_tensor(offload_ctx, t);
+            }
+        }
+        num_tensors = ggml_tensor_num(offload_ctx);
+        GGML_ASSERT(num_tensors == ggml_tensor_num(params_ctx));
+
+        runtime_params_buffer = ggml_backend_alloc_ctx_tensors(offload_ctx, runtime_backend);
+
+        if (runtime_params_buffer == NULL) {
+            LOG_ERROR("%s alloc runtime params backend buffer failed, num_tensors = %i",
+                      get_desc().c_str(),
+                      num_tensors);
+            return false;
+        }
+
+        ggml_tensor* t = ggml_get_first_tensor(params_ctx);
+        ggml_tensor* offload_t = ggml_get_first_tensor(offload_ctx);
+
+        while (t != NULL && offload_t != NULL) {
+            ggml_backend_tensor_copy(t, offload_t);
+            std::swap(t->buffer, offload_t->buffer);
+            std::swap(t->data, offload_t->data);
+
+            t = ggml_get_next_tensor(params_ctx, t);
+            offload_t = ggml_get_next_tensor(offload_ctx, offload_t);
+        }
+
+        int64_t t1 = ggml_time_ms();
+
+        size_t params_buffer_size = ggml_backend_buffer_get_size(runtime_params_buffer);
+        LOG_INFO("%s offload params (%6.2f MB, %i tensors) to runtime backend (%s), taking %.2fs",
+                 get_desc().c_str(),
+                 params_buffer_size / (1024.f * 1024.f),
+                 num_tensors,
+                 ggml_backend_name(runtime_backend),
+                 (t1 - t0) * 1.0f / 1000);
+
+        params_on_runtime_backend = true;
+
+        return true;
+    }
+
+    void offload_params_to_params_backend() {
+        if (!params_on_runtime_backend) {
+            return;
+        }
+        ggml_tensor* t = ggml_get_first_tensor(params_ctx);
+        ggml_tensor* offload_t = ggml_get_first_tensor(offload_ctx);
+
+        while (t != NULL && offload_t != NULL) {
+            t->buffer = offload_t->buffer;
+            t->data = offload_t->data;
+            offload_t->buffer = NULL;
+            offload_t->data = NULL;
+
+            t = ggml_get_next_tensor(params_ctx, t);
+            offload_t = ggml_get_next_tensor(offload_ctx, offload_t);
+        }
+
+        if (runtime_params_buffer != NULL) {
+            ggml_backend_buffer_free(runtime_params_buffer);
+            runtime_params_buffer = NULL;
+        }
+        params_on_runtime_backend = false;
+    }
+
 public:
     virtual std::string get_desc() = 0;

-    GGMLRunner(ggml_backend_t backend)
-        : backend(backend) {
+    GGMLRunner(ggml_backend_t backend, bool offload_params_to_cpu = false)
+        : runtime_backend(backend) {
         alloc_params_ctx();
+        if (!ggml_backend_is_cpu(runtime_backend) && offload_params_to_cpu) {
+            params_backend = ggml_backend_cpu_init();
+        } else {
+            params_backend = runtime_backend;
+        }
     }

     virtual ~GGMLRunner() {
@@ -1323,6 +1419,9 @@ public:
         free_compute_buffer();
         free_params_ctx();
         free_compute_ctx();
+        if (params_backend != runtime_backend) {
+            ggml_backend_free(params_backend);
+        }
     }

     void reset_compute_ctx() {
@@ -1332,7 +1431,7 @@ public:
     bool alloc_params_buffer() {
         size_t num_tensors = ggml_tensor_num(params_ctx);
-        params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, backend);
+        params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend);
         if (params_buffer == NULL) {
             LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i",
                       get_desc().c_str(),
@@ -1342,14 +1441,9 @@
         size_t params_buffer_size = ggml_backend_buffer_get_size(params_buffer);
         LOG_DEBUG("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)",
                   get_desc().c_str(),
-                  params_buffer_size / (1024.0 * 1024.0),
-                  ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
+                  params_buffer_size / (1024.f * 1024.f),
+                  ggml_backend_is_cpu(params_backend) ? "RAM" : "VRAM",
                   num_tensors);
-        // printf("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)\n",
-        //        get_desc().c_str(),
-        //        params_buffer_size / (1024.0 * 1024.0),
-        //        ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
-        //        num_tensors);
         return true;
     }

@@ -1372,6 +1466,7 @@ public:
             ggml_gallocr_free(compute_allocr);
             compute_allocr = NULL;
         }
+        offload_params_to_params_backend();
     }

     // do copy after alloc graph
@@ -1385,7 +1480,7 @@ public:
             return NULL;
         }
         // it's performing a compute, check if backend isn't cpu
-        if (!ggml_backend_is_cpu(backend) && (tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer))) {
+        if (!ggml_backend_is_cpu(runtime_backend) && (tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer))) {
             // pass input tensors to gpu memory
             auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);
@@ -1401,16 +1496,20 @@ public:
                  bool free_compute_buffer_immediately = true,
                  struct ggml_tensor** output = NULL,
                  struct ggml_context* output_ctx = NULL) {
+        if (!offload_params_to_runtime_backend()) {
+            LOG_ERROR("%s offload params to runtime backend failed", get_desc().c_str());
+            return;
+        }
         alloc_compute_buffer(get_graph);
         reset_compute_ctx();
         struct ggml_cgraph* gf = get_graph();
         GGML_ASSERT(ggml_gallocr_alloc_graph(compute_allocr, gf));
         cpy_data_to_backend_tensor();
-        if (ggml_backend_is_cpu(backend)) {
-            ggml_backend_cpu_set_n_threads(backend, n_threads);
+        if (ggml_backend_is_cpu(runtime_backend)) {
+            ggml_backend_cpu_set_n_threads(runtime_backend, n_threads);
         }
-        ggml_backend_graph_compute(backend, gf);
+        ggml_backend_graph_compute(runtime_backend, gf);
 #ifdef GGML_PERF
         ggml_graph_print(gf);
 #endif
@@ -1420,7 +1519,7 @@
                 *output = ggml_dup_tensor(output_ctx, result);
             }
             if (*output != NULL) {
-                ggml_backend_tensor_get_and_sync(backend, result, (*output)->data, 0, ggml_nbytes(*output));
+                ggml_backend_tensor_get_and_sync(runtime_backend, result, (*output)->data, 0, ggml_nbytes(*output));
             }
         }
 diff --git a/lora.hpp b/lora.hpp index 35f5aac..b1a4971 100644 --- a/lora.hpp +++ b/lora.hpp @@ -92,6 +92,7 @@ struct LoraModel : public GGMLRunner {
     float multiplier = 1.0f;
     std::map<std::string, struct ggml_tensor*> lora_tensors;
+    std::map<ggml_tensor*, ggml_tensor*> original_weight_to_final_weight;
     std::string file_path;
     ModelLoader model_loader;
     bool load_failed = false;
@@ -103,7 +104,7 @@
     LoraModel(ggml_backend_t backend,
               const std::string& file_path = "",
               const std::string prefix = "")
-        : file_path(file_path), GGMLRunner(backend) {
+        : file_path(file_path), GGMLRunner(backend, false) {
if (!model_loader.init_from_file(file_path, prefix)) { load_failed = true; } @@ -151,11 +152,11 @@ struct LoraModel : public GGMLRunner { return true; }; - model_loader.load_tensors(on_new_tensor_cb, backend); + model_loader.load_tensors(on_new_tensor_cb); alloc_params_buffer(); // exit(0); dry_run = false; - model_loader.load_tensors(on_new_tensor_cb, backend); + model_loader.load_tensors(on_new_tensor_cb); LOG_DEBUG("lora type: \"%s\"/\"%s\"", lora_downs[type].c_str(), lora_ups[type].c_str()); @@ -790,6 +791,11 @@ struct LoraModel : public GGMLRunner { updown = ggml_merge_lora(compute_ctx, lora_down, lora_up, lora_mid); } scale_value *= multiplier; + ggml_tensor* original_weight = weight; + if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(weight->buffer)) { + weight = ggml_dup_tensor(compute_ctx, weight); + set_backend_tensor_data(weight, original_weight->data); + } updown = ggml_reshape(compute_ctx, updown, weight); GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight)); updown = ggml_scale_inplace(compute_ctx, updown, scale_value); @@ -805,6 +811,9 @@ struct LoraModel : public GGMLRunner { } // final_weight = ggml_add_inplace(compute_ctx, weight, updown); // apply directly ggml_build_forward_expand(gf, final_weight); + if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_weight->buffer)) { + original_weight_to_final_weight[original_weight] = final_weight; + } break; } } @@ -839,7 +848,14 @@ struct LoraModel : public GGMLRunner { auto get_graph = [&]() -> struct ggml_cgraph* { return build_lora_graph(model_tensors, version); }; - GGMLRunner::compute(get_graph, n_threads, true); + GGMLRunner::compute(get_graph, n_threads, false); + for (auto item : original_weight_to_final_weight) { + ggml_tensor* original_weight = item.first; + ggml_tensor* final_weight = item.second; + + ggml_backend_tensor_copy(final_weight, original_weight); + } + GGMLRunner::free_compute_buffer(); } }; diff --git a/mmdit.hpp b/mmdit.hpp index 5348808..904cda4 100644 --- a/mmdit.hpp +++ b/mmdit.hpp @@ -846,9 +846,10 @@ struct MMDiTRunner : public GGMLRunner { MMDiT mmdit; MMDiTRunner(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}, const std::string prefix = "") - : GGMLRunner(backend), mmdit(tensor_types) { + : GGMLRunner(backend, offload_params_to_cpu), mmdit(tensor_types) { mmdit.init(params_ctx, tensor_types, prefix); } @@ -946,7 +947,7 @@ struct MMDiTRunner : public GGMLRunner { // ggml_backend_t backend = ggml_backend_cuda_init(0); ggml_backend_t backend = ggml_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_F16; - std::shared_ptr mmdit = std::shared_ptr(new MMDiTRunner(backend)); + std::shared_ptr mmdit = std::shared_ptr(new MMDiTRunner(backend, false)); { LOG_INFO("loading from '%s'", file_path.c_str()); @@ -960,7 +961,7 @@ struct MMDiTRunner : public GGMLRunner { return; } - bool success = model_loader.load_tensors(tensors, backend); + bool success = model_loader.load_tensors(tensors); if (!success) { LOG_ERROR("load tensors from model loader failed"); diff --git a/model.cpp b/model.cpp index 89f9abc..1cb1507 100644 --- a/model.cpp +++ b/model.cpp @@ -1048,12 +1048,12 @@ bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::s } } for (int i = GGML_MAX_DIMS; i < n_dims; i++) { - shape->ne[GGML_MAX_DIMS - 1] *= ne[i]; // stack to last dim; + shape->ne[GGML_MAX_DIMS - 1] *= ne[i]; // stack to last dim; } return true; }; - ctx_gguf_ = 
gguf_init_from_file_ext(file_path.c_str(), {true, &ctx_meta_}, on_tensor_shape_read); + ctx_gguf_ = gguf_init_from_file_ext(file_path.c_str(), {true, &ctx_meta_}, on_tensor_shape_read); if (!ctx_gguf_) { LOG_ERROR("failed to open '%s'", file_path.c_str()); return false; @@ -1917,7 +1917,7 @@ std::vector remove_duplicates(const std::vector& v return res; } -bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend) { +bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) { std::vector processed_tensor_storages; for (auto& tensor_storage : tensor_storages) { // LOG_DEBUG("%s", name.c_str()); @@ -2115,7 +2115,6 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend } bool ModelLoader::load_tensors(std::map& tensors, - ggml_backend_t backend, std::set ignore_tensors) { std::set tensor_names_in_file; auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool { @@ -2155,7 +2154,7 @@ bool ModelLoader::load_tensors(std::map& tenso return true; }; - bool success = load_tensors(on_new_tensor_cb, backend); + bool success = load_tensors(on_new_tensor_cb); if (!success) { LOG_ERROR("load tensors from file failed"); return false; @@ -2299,7 +2298,7 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type return true; }; - bool success = load_tensors(on_new_tensor_cb, backend); + bool success = load_tensors(on_new_tensor_cb); ggml_backend_free(backend); LOG_INFO("load tensors done"); LOG_INFO("trying to save tensors to %s", file_path.c_str()); diff --git a/model.h b/model.h index 10a7449..8dd2e87 100644 --- a/model.h +++ b/model.h @@ -245,9 +245,8 @@ public: ggml_type get_diffusion_model_wtype(); ggml_type get_vae_wtype(); void set_wtype_override(ggml_type wtype, std::string prefix = ""); - bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend); + bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb); bool load_tensors(std::map& tensors, - ggml_backend_t backend, std::set ignore_tensors = {}); bool save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules); diff --git a/pmid.hpp b/pmid.hpp index e2a0f62..9b725de 100644 --- a/pmid.hpp +++ b/pmid.hpp @@ -624,12 +624,13 @@ public: public: PhotoMakerIDEncoder(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types, const std::string prefix, SDVersion version = VERSION_SDXL, PMVersion pm_v = PM_VERSION_1, float sty = 20.f) - : GGMLRunner(backend), + : GGMLRunner(backend, offload_params_to_cpu), version(version), pm_version(pm_v), style_strength(sty) { @@ -785,10 +786,11 @@ struct PhotoMakerIDEmbed : public GGMLRunner { bool applied = false; PhotoMakerIDEmbed(ggml_backend_t backend, + bool offload_params_to_cpu, ModelLoader* ml, const std::string& file_path = "", const std::string& prefix = "") - : file_path(file_path), GGMLRunner(backend), model_loader(ml) { + : file_path(file_path), GGMLRunner(backend, offload_params_to_cpu), model_loader(ml) { if (!model_loader->init_from_file(file_path, prefix)) { load_failed = true; } @@ -828,11 +830,11 @@ struct PhotoMakerIDEmbed : public GGMLRunner { return true; }; - model_loader->load_tensors(on_new_tensor_cb, backend); + model_loader->load_tensors(on_new_tensor_cb); alloc_params_buffer(); dry_run = false; - model_loader->load_tensors(on_new_tensor_cb, backend); + model_loader->load_tensors(on_new_tensor_cb); LOG_DEBUG("finished loading PhotoMaker ID Embeds "); return true; 
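[Editor's note, not part of the patch] A minimal sketch of how a GGMLRunner subclass picks up the new behavior, assuming the post-patch interface shown in the ggml_extend.hpp hunks above; "DummyRunner" is a hypothetical name, not something this patch adds:

    // Sketch only: constructing a runner with CPU offloading enabled.
    struct DummyRunner : public GGMLRunner {
        DummyRunner(ggml_backend_t backend, bool offload_params_to_cpu)
            : GGMLRunner(backend, offload_params_to_cpu) {}  // params_backend becomes a CPU backend, runtime_backend stays on the device
        std::string get_desc() { return "dummy"; }
    };

    // With offload_params_to_cpu == true and a non-CPU runtime backend:
    //   alloc_params_buffer()  allocates the weights on params_backend (RAM);
    //   compute(...)           first calls offload_params_to_runtime_backend(), which copies the
    //                          weights into a temporary runtime buffer (VRAM) and swaps the tensor
    //                          buffer/data pointers;
    //   free_compute_buffer()  calls offload_params_to_params_backend(), which swaps the pointers
    //                          back and frees the runtime copy.
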
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 08b9f4d..50796a5 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -104,9 +104,10 @@ public: std::shared_ptr pmid_id_embeds; std::string taesd_path; - bool use_tiny_autoencoder = false; - bool vae_tiling = false; - bool stacked_id = false; + bool use_tiny_autoencoder = false; + bool vae_tiling = false; + bool offload_params_to_cpu = false; + bool stacked_id = false; bool is_using_v_parameterization = false; bool is_using_edm_v_parameterization = false; @@ -180,6 +181,7 @@ public: taesd_path = SAFE_STR(sd_ctx_params->taesd_path); use_tiny_autoencoder = taesd_path.size() > 0; vae_tiling = sd_ctx_params->vae_tiling; + offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu; if (sd_ctx_params->rng_type == STD_DEFAULT_RNG) { rng = std::make_shared(); @@ -327,8 +329,12 @@ public: if (sd_ctx_params->diffusion_flash_attn) { LOG_WARN("flash attention in this diffusion model is currently unsupported!"); } - cond_stage_model = std::make_shared(clip_backend, model_loader.tensor_storages_types); - diffusion_model = std::make_shared(backend, model_loader.tensor_storages_types); + cond_stage_model = std::make_shared(clip_backend, + offload_params_to_cpu, + model_loader.tensor_storages_types); + diffusion_model = std::make_shared(backend, + offload_params_to_cpu, + model_loader.tensor_storages_types); } else if (sd_version_is_flux(version)) { bool is_chroma = false; for (auto pair : model_loader.tensor_storages_types) { @@ -339,43 +345,52 @@ public: } if (is_chroma) { cond_stage_model = std::make_shared(clip_backend, + offload_params_to_cpu, model_loader.tensor_storages_types, -1, sd_ctx_params->chroma_use_t5_mask, sd_ctx_params->chroma_t5_mask_pad); } else { - cond_stage_model = std::make_shared(clip_backend, model_loader.tensor_storages_types); + cond_stage_model = std::make_shared(clip_backend, + offload_params_to_cpu, + model_loader.tensor_storages_types); } diffusion_model = std::make_shared(backend, + offload_params_to_cpu, model_loader.tensor_storages_types, version, sd_ctx_params->diffusion_flash_attn, sd_ctx_params->chroma_use_dit_mask); } else if (sd_version_is_wan(version)) { cond_stage_model = std::make_shared(clip_backend, + offload_params_to_cpu, model_loader.tensor_storages_types, -1, true, 1, true); diffusion_model = std::make_shared(backend, + offload_params_to_cpu, model_loader.tensor_storages_types, version, sd_ctx_params->diffusion_flash_attn); } else { // SD1.x SD2.x SDXL if (strstr(SAFE_STR(sd_ctx_params->stacked_id_embed_dir), "v2")) { cond_stage_model = std::make_shared(clip_backend, + offload_params_to_cpu, model_loader.tensor_storages_types, SAFE_STR(sd_ctx_params->embedding_dir), version, PM_VERSION_2); } else { cond_stage_model = std::make_shared(clip_backend, + offload_params_to_cpu, model_loader.tensor_storages_types, SAFE_STR(sd_ctx_params->embedding_dir), version); } diffusion_model = std::make_shared(backend, + offload_params_to_cpu, model_loader.tensor_storages_types, version, sd_ctx_params->diffusion_flash_attn); @@ -396,6 +411,7 @@ public: if (sd_version_is_wan(version)) { first_stage_model = std::make_shared(vae_backend, + offload_params_to_cpu, model_loader.tensor_storages_types, "first_stage_model", vae_decode_only); @@ -403,6 +419,7 @@ public: first_stage_model->get_param_tensors(tensors, "first_stage_model"); } else if (!use_tiny_autoencoder) { first_stage_model = std::make_shared(vae_backend, + offload_params_to_cpu, model_loader.tensor_storages_types, "first_stage_model", 
vae_decode_only, @@ -412,6 +429,7 @@ public: first_stage_model->get_param_tensors(tensors, "first_stage_model"); } else { tae_first_stage = std::make_shared(vae_backend, + offload_params_to_cpu, model_loader.tensor_storages_types, "decoder.layers", vae_decode_only, @@ -427,14 +445,26 @@ public: } else { controlnet_backend = backend; } - control_net = std::make_shared(controlnet_backend, model_loader.tensor_storages_types, version); + control_net = std::make_shared(controlnet_backend, + offload_params_to_cpu, + model_loader.tensor_storages_types, + version); } if (strstr(SAFE_STR(sd_ctx_params->stacked_id_embed_dir), "v2")) { - pmid_model = std::make_shared(backend, model_loader.tensor_storages_types, "pmid", version, PM_VERSION_2); + pmid_model = std::make_shared(backend, + offload_params_to_cpu, + model_loader.tensor_storages_types, + "pmid", + version, + PM_VERSION_2); LOG_INFO("using PhotoMaker Version 2"); } else { - pmid_model = std::make_shared(backend, model_loader.tensor_storages_types, "pmid", version); + pmid_model = std::make_shared(backend, + offload_params_to_cpu, + model_loader.tensor_storages_types, + "pmid", + version); } if (strlen(SAFE_STR(sd_ctx_params->stacked_id_embed_dir)) > 0) { pmid_lora = std::make_shared(backend, sd_ctx_params->stacked_id_embed_dir, ""); @@ -489,7 +519,7 @@ public: if (version == VERSION_SVD) { ignore_tensors.insert("conditioner.embedders.3"); } - bool success = model_loader.load_tensors(tensors, backend, ignore_tensors); + bool success = model_loader.load_tensors(tensors, ignore_tensors); if (!success) { LOG_ERROR("load tensors from model loader failed"); ggml_free(ctx); @@ -1354,6 +1384,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->wtype = SD_TYPE_COUNT; sd_ctx_params->rng_type = CUDA_RNG; sd_ctx_params->schedule = DEFAULT; + sd_ctx_params->offload_params_to_cpu = false; sd_ctx_params->keep_clip_on_cpu = false; sd_ctx_params->keep_control_net_on_cpu = false; sd_ctx_params->keep_vae_on_cpu = false; @@ -1388,6 +1419,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "wtype: %s\n" "rng_type: %s\n" "schedule: %s\n" + "offload_params_to_cpu: %s\n" "keep_clip_on_cpu: %s\n" "keep_control_net_on_cpu: %s\n" "keep_vae_on_cpu: %s\n" @@ -1413,6 +1445,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { sd_type_name(sd_ctx_params->wtype), sd_rng_type_name(sd_ctx_params->rng_type), sd_schedule_name(sd_ctx_params->schedule), + BOOL_STR(sd_ctx_params->offload_params_to_cpu), BOOL_STR(sd_ctx_params->keep_clip_on_cpu), BOOL_STR(sd_ctx_params->keep_control_net_on_cpu), BOOL_STR(sd_ctx_params->keep_vae_on_cpu), diff --git a/stable-diffusion.h b/stable-diffusion.h index 644f930..a6a87dd 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -130,6 +130,7 @@ typedef struct { enum sd_type_t wtype; enum rng_type_t rng_type; enum schedule_t schedule; + bool offload_params_to_cpu; bool keep_clip_on_cpu; bool keep_control_net_on_cpu; bool keep_vae_on_cpu; @@ -236,10 +237,13 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s typedef struct upscaler_ctx_t upscaler_ctx_t; SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path, + bool offload_params_to_cpu, int n_threads); SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx); -SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor); +SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, + sd_image_t input_image, + uint32_t upscale_factor); SD_API bool 
convert(const char* input_path, const char* vae_path, diff --git a/t5.hpp b/t5.hpp index 408c256..f149dad 100644 --- a/t5.hpp +++ b/t5.hpp @@ -756,10 +756,11 @@ struct T5Runner : public GGMLRunner { std::vector relative_position_bucket_vec; T5Runner(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types, const std::string prefix, bool is_umt5 = false) - : GGMLRunner(backend) { + : GGMLRunner(backend, offload_params_to_cpu) { if (is_umt5) { params.vocab_size = 256384; params.relative_attention = false; @@ -900,10 +901,11 @@ struct T5Embedder { T5Runner model; T5Embedder(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}, const std::string prefix = "", bool is_umt5 = false) - : model(backend, tensor_types, prefix, is_umt5), tokenizer(is_umt5) { + : model(backend, offload_params_to_cpu, tensor_types, prefix, is_umt5), tokenizer(is_umt5) { } void get_param_tensors(std::map& tensors, const std::string prefix) { @@ -1012,13 +1014,13 @@ struct T5Embedder { } } - std::shared_ptr t5 = std::shared_ptr(new T5Embedder(backend, tensor_types, "", true)); + std::shared_ptr t5 = std::shared_ptr(new T5Embedder(backend, false, tensor_types, "", true)); t5->alloc_params_buffer(); std::map tensors; t5->get_param_tensors(tensors, ""); - bool success = model_loader.load_tensors(tensors, backend); + bool success = model_loader.load_tensors(tensors); if (!success) { LOG_ERROR("load tensors from model loader failed"); diff --git a/tae.hpp b/tae.hpp index 51fb94f..da5aa56 100644 --- a/tae.hpp +++ b/tae.hpp @@ -196,13 +196,14 @@ struct TinyAutoEncoder : public GGMLRunner { bool decode_only = false; TinyAutoEncoder(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types, const std::string prefix, bool decoder_only = true, SDVersion version = VERSION_SD1) : decode_only(decoder_only), taesd(decoder_only, version), - GGMLRunner(backend) { + GGMLRunner(backend, offload_params_to_cpu) { taesd.init(params_ctx, tensor_types, prefix); } @@ -226,7 +227,7 @@ struct TinyAutoEncoder : public GGMLRunner { return false; } - bool success = model_loader.load_tensors(taesd_tensors, backend, ignore_tensors); + bool success = model_loader.load_tensors(taesd_tensors, ignore_tensors); if (!success) { LOG_ERROR("load tae tensors from model loader failed"); diff --git a/unet.hpp b/unet.hpp index 7ab4934..847911d 100644 --- a/unet.hpp +++ b/unet.hpp @@ -538,11 +538,12 @@ struct UNetModelRunner : public GGMLRunner { UnetModelBlock unet; UNetModelRunner(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types, const std::string prefix, SDVersion version = VERSION_SD1, bool flash_attn = false) - : GGMLRunner(backend), unet(version, tensor_types, flash_attn) { + : GGMLRunner(backend, offload_params_to_cpu), unet(version, tensor_types, flash_attn) { unet.init(params_ctx, tensor_types, prefix); } diff --git a/upscaler.cpp b/upscaler.cpp index 1372134..c7fb305 100644 --- a/upscaler.cpp +++ b/upscaler.cpp @@ -14,7 +14,8 @@ struct UpscalerGGML { : n_threads(n_threads) { } - bool load_from_file(const std::string& esrgan_path) { + bool load_from_file(const std::string& esrgan_path, + bool offload_params_to_cpu) { #ifdef SD_USE_CUDA LOG_DEBUG("Using CUDA backend"); backend = ggml_backend_cuda_init(0); @@ -46,7 +47,7 @@ struct UpscalerGGML { backend = ggml_backend_cpu_init(); } LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type)); - esrgan_upscaler = std::make_shared(backend, 
model_loader.tensor_storages_types); + esrgan_upscaler = std::make_shared(backend, offload_params_to_cpu, model_loader.tensor_storages_types); if (!esrgan_upscaler->load_from_file(esrgan_path)) { return false; } @@ -104,6 +105,7 @@ struct upscaler_ctx_t { }; upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str, + bool offload_params_to_cpu, int n_threads) { upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t)); if (upscaler_ctx == NULL) { @@ -116,7 +118,7 @@ upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str, return NULL; } - if (!upscaler_ctx->upscaler->load_from_file(esrgan_path)) { + if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu)) { delete upscaler_ctx->upscaler; upscaler_ctx->upscaler = NULL; free(upscaler_ctx); diff --git a/vae.hpp b/vae.hpp index fcbe091..dc44dde 100644 --- a/vae.hpp +++ b/vae.hpp @@ -521,8 +521,8 @@ public: }; struct VAE : public GGMLRunner { - VAE(ggml_backend_t backend) - : GGMLRunner(backend) {} + VAE(ggml_backend_t backend, bool offload_params_to_cpu) + : GGMLRunner(backend, offload_params_to_cpu) {} virtual void compute(const int n_threads, struct ggml_tensor* z, bool decode_graph, @@ -536,12 +536,13 @@ struct AutoEncoderKL : public VAE { AutoencodingEngine ae; AutoEncoderKL(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types, const std::string prefix, bool decode_only = false, bool use_video_decoder = false, SDVersion version = VERSION_SD1) - : decode_only(decode_only), ae(decode_only, use_video_decoder, version), VAE(backend) { + : decode_only(decode_only), ae(decode_only, use_video_decoder, version), VAE(backend, offload_params_to_cpu) { ae.init(params_ctx, tensor_types, prefix); } diff --git a/wan.hpp b/wan.hpp index 25f2c17..f6dbaad 100644 --- a/wan.hpp +++ b/wan.hpp @@ -767,10 +767,11 @@ namespace WAN { std::vector _feat_vec_map; WanVAERunner(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}, const std::string prefix = "", bool decode_only = false) - : decode_only(decode_only), ae(decode_only), VAE(backend) { + : decode_only(decode_only), ae(decode_only), VAE(backend, offload_params_to_cpu) { ae.init(params_ctx, tensor_types, prefix); rest_feat_vec_map(); } @@ -857,7 +858,7 @@ namespace WAN { feat_cache_vec.is_rep = true; _feat_vec_map[feat_idx] = feat_cache_vec; } else if (feat_cache != NULL) { - _feat_vec_map[feat_idx] = FeatCache(backend, feat_cache); + _feat_vec_map[feat_idx] = FeatCache(runtime_backend, feat_cache); } } GGMLRunner::free_compute_buffer(); @@ -897,7 +898,7 @@ namespace WAN { feat_cache_vec.is_rep = true; _feat_vec_map[feat_idx] = feat_cache_vec; } else if (feat_cache != NULL) { - _feat_vec_map[feat_idx] = FeatCache(backend, feat_cache); + _feat_vec_map[feat_idx] = FeatCache(runtime_backend, feat_cache); } } @@ -943,7 +944,7 @@ namespace WAN { ggml_backend_t backend = ggml_backend_cuda_init(0); // ggml_backend_t backend = ggml_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_F16; - std::shared_ptr vae = std::shared_ptr(new WanVAERunner(backend)); + std::shared_ptr vae = std::shared_ptr(new WanVAERunner(backend, false)); { LOG_INFO("loading from '%s'", file_path.c_str()); @@ -957,7 +958,7 @@ namespace WAN { return; } - bool success = model_loader.load_tensors(tensors, backend); + bool success = model_loader.load_tensors(tensors); if (!success) { LOG_ERROR("load tensors from model loader failed"); @@ -1564,11 +1565,12 @@ namespace WAN { SDVersion version; 
WanRunner(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}, const std::string prefix = "", SDVersion version = VERSION_WAN2, bool flash_attn = false) - : GGMLRunner(backend) { + : GGMLRunner(backend, offload_params_to_cpu) { wan_params.flash_attn = flash_attn; wan_params.num_layers = 0; for (auto pair : tensor_types) { @@ -1747,6 +1749,7 @@ namespace WAN { } std::shared_ptr wan = std::shared_ptr(new WanRunner(backend, + false, tensor_types, "model.diffusion_model")); @@ -1754,7 +1757,7 @@ namespace WAN { std::map tensors; wan->get_param_tensors(tensors, "model.diffusion_model"); - bool success = model_loader.load_tensors(tensors, backend); + bool success = model_loader.load_tensors(tensors); if (!success) { LOG_ERROR("load tensors from model loader failed");
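
[Editor's note, not part of the patch] Hedged usage sketch of the new public API surface declared in stable-diffusion.h after this change; the file paths are placeholders, and the remaining sd_ctx_params_t fields are assumed to match the existing header:

    // C++ API: the new offload_params_to_cpu field and new_upscaler_ctx() parameter.
    sd_ctx_params_t params;
    sd_ctx_params_init(&params);
    params.model_path = "/path/to/model.safetensors";  // placeholder
    params.offload_params_to_cpu = true;  // keep weights in RAM, copy to the runtime backend per compute
    sd_ctx_t* sd_ctx = new_sd_ctx(&params);

    upscaler_ctx_t* up_ctx = new_upscaler_ctx("/path/to/esrgan.pth",           // placeholder
                                              /*offload_params_to_cpu=*/true,  // new parameter
                                              /*n_threads=*/8);

On the CLI side (examples/cli/main.cpp), the same behavior is toggled with the new --offload-to-cpu flag.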