add offload params to cpu support

leejet 2025-08-17 03:13:16 +08:00
parent b0833eb4d8
commit 9b29de27a8
21 changed files with 283 additions and 100 deletions

View File

@@ -868,12 +868,13 @@ struct CLIPTextModelRunner : public GGMLRunner {
     CLIPTextModel model;
     CLIPTextModelRunner(ggml_backend_t backend,
+                        bool offload_params_to_cpu,
                         const String2GGMLType& tensor_types,
                         const std::string prefix,
                         CLIPVersion version = OPENAI_CLIP_VIT_L_14,
                         bool with_final_ln  = true,
                         int clip_skip_value = -1)
-        : GGMLRunner(backend), model(version, with_final_ln, clip_skip_value) {
+        : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, clip_skip_value) {
         model.init(params_ctx, tensor_types, prefix);
     }
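
The same pattern repeats across every GGMLRunner subclass touched by this commit: a `bool offload_params_to_cpu` parameter is inserted right after the backend argument and forwarded unchanged to the `GGMLRunner` base constructor. A condensed, hedged sketch of the pattern (`MyRunner` is a placeholder, not a type from the repository):

```cpp
// Hypothetical GGMLRunner subclass showing how the new flag is threaded through.
struct MyRunner : public GGMLRunner {
    MyRunner(ggml_backend_t backend,
             bool offload_params_to_cpu,              // new argument, added after `backend` everywhere
             const String2GGMLType& tensor_types,
             const std::string& prefix)
        : GGMLRunner(backend, offload_params_to_cpu)  // the flag is only consumed by the base class
    {
        // model.init(params_ctx, tensor_types, prefix);  // subclass-specific setup is unchanged
    }

    std::string get_desc() override { return "my_runner"; }
};
```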

View File

@@ -57,6 +57,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
     std::vector<std::string> readed_embeddings;
     FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
+                                      bool offload_params_to_cpu,
                                       const String2GGMLType& tensor_types,
                                       const std::string& embd_dir,
                                       SDVersion version = VERSION_SD1,
@@ -64,12 +65,12 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       int clip_skip = -1)
         : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
         if (sd_version_is_sd1(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14);
         } else if (sd_version_is_sd2(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14);
         } else if (sd_version_is_sdxl(version)) {
-            text_model  = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
-            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
+            text_model  = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
+            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
         }
         set_clip_skip(clip_skip);
     }
@@ -154,7 +155,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             }
             return true;
         };
-        model_loader.load_tensors(on_load, NULL);
+        model_loader.load_tensors(on_load);
         readed_embeddings.push_back(embd_name);
         if (embd) {
             int64_t hidden_size = text_model->model.hidden_size;
@@ -618,8 +619,10 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
 struct FrozenCLIPVisionEmbedder : public GGMLRunner {
     CLIPVisionModelProjection vision_model;
-    FrozenCLIPVisionEmbedder(ggml_backend_t backend, const String2GGMLType& tensor_types = {})
-        : vision_model(OPEN_CLIP_VIT_H_14, true), GGMLRunner(backend) {
+    FrozenCLIPVisionEmbedder(ggml_backend_t backend,
+                             bool offload_params_to_cpu,
+                             const String2GGMLType& tensor_types = {})
+        : vision_model(OPEN_CLIP_VIT_H_14, true), GGMLRunner(backend, offload_params_to_cpu) {
         vision_model.init(params_ctx, tensor_types, "cond_stage_model.transformer");
     }
@@ -663,12 +666,13 @@ struct SD3CLIPEmbedder : public Conditioner {
     std::shared_ptr<T5Runner> t5;
     SD3CLIPEmbedder(ggml_backend_t backend,
+                    bool offload_params_to_cpu,
                     const String2GGMLType& tensor_types = {},
                     int clip_skip = -1)
         : clip_g_tokenizer(0) {
-        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
-        clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
-        t5     = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
+        clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
+        clip_g = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
+        t5     = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer");
         set_clip_skip(clip_skip);
     }
@@ -1010,10 +1014,11 @@ struct FluxCLIPEmbedder : public Conditioner {
     size_t chunk_len = 256;
     FluxCLIPEmbedder(ggml_backend_t backend,
+                     bool offload_params_to_cpu,
                      const String2GGMLType& tensor_types = {},
                      int clip_skip = -1) {
-        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
-        t5     = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
+        clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
+        t5     = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer");
         set_clip_skip(clip_skip);
     }
@@ -1232,13 +1237,14 @@ struct T5CLIPEmbedder : public Conditioner {
     bool is_umt5 = false;
     T5CLIPEmbedder(ggml_backend_t backend,
+                   bool offload_params_to_cpu,
                    const String2GGMLType& tensor_types = {},
                    int clip_skip = -1,
                    bool use_mask = false,
                    int mask_pad  = 1,
                    bool is_umt5  = false)
         : use_mask(use_mask), mask_pad(mask_pad), t5_tokenizer(is_umt5) {
-        t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer", is_umt5);
+        t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer", is_umt5);
     }
     void set_clip_skip(int clip_skip) {

View File

@@ -317,9 +317,10 @@ struct ControlNet : public GGMLRunner {
     bool guided_hint_cached = false;
     ControlNet(ggml_backend_t backend,
+               bool offload_params_to_cpu,
               const String2GGMLType& tensor_types = {},
               SDVersion version = VERSION_SD1)
-        : GGMLRunner(backend), control_net(version) {
+        : GGMLRunner(backend, offload_params_to_cpu), control_net(version) {
         control_net.init(params_ctx, tensor_types, "");
     }
@@ -346,7 +347,7 @@ struct ControlNet : public GGMLRunner {
             control_buffer_size += ggml_nbytes(controls[i]);
         }
-        control_buffer = ggml_backend_alloc_ctx_tensors(control_ctx, backend);
+        control_buffer = ggml_backend_alloc_ctx_tensors(control_ctx, runtime_backend);
         LOG_DEBUG("control buffer size %.2fMB", control_buffer_size * 1.f / 1024.f / 1024.f);
     }
@@ -443,7 +444,7 @@ struct ControlNet : public GGMLRunner {
             return false;
         }
-        bool success = model_loader.load_tensors(tensors, backend, ignore_tensors);
+        bool success = model_loader.load_tensors(tensors, ignore_tensors);
         if (!success) {
             LOG_ERROR("load control net tensors from model loader failed");

View File

@@ -33,10 +33,11 @@ struct UNetModel : public DiffusionModel {
     UNetModelRunner unet;
     UNetModel(ggml_backend_t backend,
+              bool offload_params_to_cpu,
              const String2GGMLType& tensor_types = {},
              SDVersion version = VERSION_SD1,
              bool flash_attn   = false)
-        : unet(backend, tensor_types, "model.diffusion_model", version, flash_attn) {
+        : unet(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model", version, flash_attn) {
     }
     void alloc_params_buffer() {
@@ -86,8 +87,9 @@ struct MMDiTModel : public DiffusionModel {
     MMDiTRunner mmdit;
     MMDiTModel(ggml_backend_t backend,
+               bool offload_params_to_cpu,
               const String2GGMLType& tensor_types = {})
-        : mmdit(backend, tensor_types, "model.diffusion_model") {
+        : mmdit(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model") {
     }
     void alloc_params_buffer() {
@@ -136,11 +138,12 @@ struct FluxModel : public DiffusionModel {
     Flux::FluxRunner flux;
     FluxModel(ggml_backend_t backend,
+              bool offload_params_to_cpu,
              const String2GGMLType& tensor_types = {},
              SDVersion version = VERSION_FLUX,
              bool flash_attn   = false,
              bool use_mask     = false)
-        : flux(backend, tensor_types, "model.diffusion_model", version, flash_attn, use_mask) {
+        : flux(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model", version, flash_attn, use_mask) {
     }
     void alloc_params_buffer() {
@@ -189,10 +192,11 @@ struct WanModel : public DiffusionModel {
     WAN::WanRunner wan;
     WanModel(ggml_backend_t backend,
+             bool offload_params_to_cpu,
             const String2GGMLType& tensor_types = {},
             SDVersion version = VERSION_FLUX,
             bool flash_attn   = false)
-        : wan(backend, tensor_types, "model.diffusion_model", version, flash_attn) {
+        : wan(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model", version, flash_attn) {
     }
     void alloc_params_buffer() {

View File

@@ -142,8 +142,10 @@ struct ESRGAN : public GGMLRunner {
     int scale     = 4;
     int tile_size = 128;  // avoid cuda OOM for 4gb VRAM
-    ESRGAN(ggml_backend_t backend, const String2GGMLType& tensor_types = {})
-        : GGMLRunner(backend) {
+    ESRGAN(ggml_backend_t backend,
+           bool offload_params_to_cpu,
+           const String2GGMLType& tensor_types = {})
+        : GGMLRunner(backend, offload_params_to_cpu) {
         rrdb_net.init(params_ctx, tensor_types, "");
     }
@@ -164,7 +166,7 @@ struct ESRGAN : public GGMLRunner {
             return false;
         }
-        bool success = model_loader.load_tensors(esrgan_tensors, backend);
+        bool success = model_loader.load_tensors(esrgan_tensors);
         if (!success) {
             LOG_ERROR("load esrgan tensors from model loader failed");

View File

@@ -95,6 +95,7 @@ struct SDParams {
     int64_t seed             = 42;
     bool verbose             = false;
     bool vae_tiling          = false;
+    bool offload_params_to_cpu = false;
     bool control_net_cpu     = false;
     bool normalize_input     = false;
     bool clip_on_cpu         = false;
@@ -141,8 +142,9 @@ void print_params(SDParams params) {
     for (auto& path : params.ref_image_paths) {
         printf(" %s\n", path.c_str());
     };
-    printf(" clip on cpu: %s\n", params.clip_on_cpu ? "true" : "false");
-    printf(" controlnet cpu: %s\n", params.control_net_cpu ? "true" : "false");
+    printf(" offload_params_to_cpu: %s\n", params.offload_params_to_cpu ? "true" : "false");
+    printf(" clip_on_cpu: %s\n", params.clip_on_cpu ? "true" : "false");
+    printf(" control_net_cpu: %s\n", params.control_net_cpu ? "true" : "false");
     printf(" vae decoder on cpu:%s\n", params.vae_on_cpu ? "true" : "false");
     printf(" diffusion flash attention:%s\n", params.diffusion_flash_attn ? "true" : "false");
     printf(" strength(control): %.2f\n", params.control_strength);
@@ -461,6 +463,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
     options.bool_options = {
         {"", "--vae-tiling", "", true, &params.vae_tiling},
+        {"", "--offload-to-cpu", "", true, &params.offload_params_to_cpu},
         {"", "--control-net-cpu", "", true, &params.control_net_cpu},
         {"", "--normalize-input", "", true, &params.normalize_input},
         {"", "--clip-on-cpu", "", true, &params.clip_on_cpu},
@@ -943,6 +946,7 @@ int main(int argc, const char* argv[]) {
                 params.wtype,
                 params.rng_type,
                 params.schedule,
+                params.offload_params_to_cpu,
                 params.clip_on_cpu,
                 params.control_net_cpu,
                 params.vae_on_cpu,
@@ -1058,6 +1062,7 @@ int main(int argc, const char* argv[]) {
     int upscale_factor = 4;  // unused for RealESRGAN_x4plus_anime_6B.pth
     if (params.esrgan_path.size() > 0 && params.upscale_repeats > 0) {
         upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(),
+                                                        params.offload_params_to_cpu,
                                                         params.n_threads);
         if (upscaler_ctx == NULL) {

View File

@@ -881,12 +881,13 @@ namespace Flux {
         bool use_mask = false;
         FluxRunner(ggml_backend_t backend,
+                   bool offload_params_to_cpu,
                   const String2GGMLType& tensor_types = {},
                   const std::string prefix = "",
                   SDVersion version = VERSION_FLUX,
                   bool flash_attn   = false,
                   bool use_mask     = false)
-            : GGMLRunner(backend), use_mask(use_mask) {
+            : GGMLRunner(backend, offload_params_to_cpu), use_mask(use_mask) {
             flux_params.flash_attn     = flash_attn;
             flux_params.guidance_embed = false;
             flux_params.depth          = 0;
@@ -1085,7 +1086,7 @@ namespace Flux {
         // ggml_backend_t backend = ggml_backend_cuda_init(0);
         ggml_backend_t backend         = ggml_backend_cpu_init();
         ggml_type model_data_type      = GGML_TYPE_Q8_0;
-        std::shared_ptr<FluxRunner> flux = std::shared_ptr<FluxRunner>(new FluxRunner(backend));
+        std::shared_ptr<FluxRunner> flux = std::shared_ptr<FluxRunner>(new FluxRunner(backend, false));
         {
             LOG_INFO("loading from '%s'", file_path.c_str());
@@ -1099,7 +1100,7 @@ namespace Flux {
                 return;
             }
-            bool success = model_loader.load_tensors(tensors, backend);
+            bool success = model_loader.load_tensors(tensors);
             if (!success) {
                 LOG_ERROR("load tensors from model loader failed");

View File

@@ -1230,16 +1230,20 @@ struct GGMLRunner {
 protected:
     typedef std::function<struct ggml_cgraph*()> get_graph_cb_t;
-    struct ggml_context* params_ctx     = NULL;
-    ggml_backend_buffer_t params_buffer = NULL;
+    ggml_backend_t params_backend               = NULL;
+    ggml_backend_t runtime_backend              = NULL;
+    struct ggml_context* params_ctx             = NULL;
+    ggml_backend_buffer_t params_buffer         = NULL;
+    struct ggml_context* offload_ctx            = NULL;
+    ggml_backend_buffer_t runtime_params_buffer = NULL;
+    bool params_on_runtime_backend              = false;
     struct ggml_context* compute_ctx    = NULL;
     struct ggml_gallocr* compute_allocr = NULL;
     std::map<struct ggml_tensor*, const void*> backend_tensor_data_map;
-    ggml_backend_t backend = NULL;
     void alloc_params_ctx() {
         struct ggml_init_params params;
         params.mem_size = static_cast<size_t>(MAX_PARAMS_TENSOR_NUM * ggml_tensor_overhead());
@@ -1248,6 +1252,10 @@ protected:
         params_ctx = ggml_init(params);
         GGML_ASSERT(params_ctx != NULL);
+        if (params_backend != runtime_backend) {
+            offload_ctx = ggml_init(params);
+            GGML_ASSERT(offload_ctx != NULL);
+        }
     }
     void free_params_ctx() {
@@ -1255,6 +1263,10 @@ protected:
             ggml_free(params_ctx);
             params_ctx = NULL;
         }
+        if (offload_ctx != NULL) {
+            ggml_free(offload_ctx);
+            offload_ctx = NULL;
+        }
     }
     void alloc_compute_ctx() {
@@ -1281,7 +1293,7 @@ protected:
         reset_compute_ctx();
         struct ggml_cgraph* gf = get_graph();
         backend_tensor_data_map.clear();
-        compute_allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+        compute_allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(runtime_backend));
         if (!ggml_gallocr_reserve(compute_allocr, gf)) {
             // failed to allocate the compute buffer
@@ -1295,7 +1307,7 @@ protected:
         LOG_DEBUG("%s compute buffer size: %.2f MB(%s)",
                   get_desc().c_str(),
                   compute_buffer_size / 1024.0 / 1024.0,
-                  ggml_backend_is_cpu(backend) ? "RAM" : "VRAM");
+                  ggml_backend_is_cpu(runtime_backend) ? "RAM" : "VRAM");
         return true;
     }
@@ -1310,12 +1322,96 @@ protected:
         backend_tensor_data_map.clear();
     }
+    bool offload_params_to_runtime_backend() {
+        if (params_backend == runtime_backend) {
+            return true;
+        }
+        if (params_on_runtime_backend) {
+            return true;
+        }
+        GGML_ASSERT(runtime_params_buffer == NULL);
+        int64_t t0         = ggml_time_ms();
+        size_t num_tensors = ggml_tensor_num(offload_ctx);
+        if (num_tensors == 0) {
+            for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != NULL; t = ggml_get_next_tensor(params_ctx, t)) {
+                GGML_ASSERT(t->view_src == NULL);
+                ggml_dup_tensor(offload_ctx, t);
+            }
+        }
+        num_tensors = ggml_tensor_num(offload_ctx);
+        GGML_ASSERT(num_tensors == ggml_tensor_num(params_ctx));
+        runtime_params_buffer = ggml_backend_alloc_ctx_tensors(offload_ctx, runtime_backend);
+        if (runtime_params_buffer == NULL) {
+            LOG_ERROR("%s alloc runtime params backend buffer failed, num_tensors = %i",
+                      get_desc().c_str(),
+                      num_tensors);
+            return false;
+        }
+        ggml_tensor* t         = ggml_get_first_tensor(params_ctx);
+        ggml_tensor* offload_t = ggml_get_first_tensor(offload_ctx);
+        while (t != NULL && offload_t != NULL) {
+            ggml_backend_tensor_copy(t, offload_t);
+            std::swap(t->buffer, offload_t->buffer);
+            std::swap(t->data, offload_t->data);
+            t         = ggml_get_next_tensor(params_ctx, t);
+            offload_t = ggml_get_next_tensor(offload_ctx, offload_t);
+        }
+        int64_t t1                = ggml_time_ms();
+        size_t params_buffer_size = ggml_backend_buffer_get_size(runtime_params_buffer);
+        LOG_INFO("%s offload params (%6.2f MB, %i tensors) to runtime backend (%s), taking %.2fs",
+                 get_desc().c_str(),
+                 params_buffer_size / (1024.f * 1024.f),
+                 num_tensors,
+                 ggml_backend_name(runtime_backend),
+                 (t1 - t0) * 1.0f / 1000);
+        params_on_runtime_backend = true;
+        return true;
+    }
+    void offload_params_to_params_backend() {
+        if (!params_on_runtime_backend) {
+            return;
+        }
+        ggml_tensor* t         = ggml_get_first_tensor(params_ctx);
+        ggml_tensor* offload_t = ggml_get_first_tensor(offload_ctx);
+        while (t != NULL && offload_t != NULL) {
+            t->buffer         = offload_t->buffer;
+            t->data           = offload_t->data;
+            offload_t->buffer = NULL;
+            offload_t->data   = NULL;
+            t         = ggml_get_next_tensor(params_ctx, t);
+            offload_t = ggml_get_next_tensor(offload_ctx, offload_t);
+        }
+        if (runtime_params_buffer != NULL) {
+            ggml_backend_buffer_free(runtime_params_buffer);
+            runtime_params_buffer = NULL;
+        }
+        params_on_runtime_backend = false;
+    }
 public:
     virtual std::string get_desc() = 0;
-    GGMLRunner(ggml_backend_t backend)
-        : backend(backend) {
+    GGMLRunner(ggml_backend_t backend, bool offload_params_to_cpu = false)
+        : runtime_backend(backend) {
         alloc_params_ctx();
+        if (!ggml_backend_is_cpu(runtime_backend) && offload_params_to_cpu) {
+            params_backend = ggml_backend_cpu_init();
+        } else {
+            params_backend = runtime_backend;
+        }
     }
     virtual ~GGMLRunner() {
@@ -1323,6 +1419,9 @@ public:
         free_compute_buffer();
         free_params_ctx();
         free_compute_ctx();
+        if (params_backend != runtime_backend) {
+            ggml_backend_free(params_backend);
+        }
     }
     void reset_compute_ctx() {
@@ -1332,7 +1431,7 @@ public:
     bool alloc_params_buffer() {
         size_t num_tensors = ggml_tensor_num(params_ctx);
-        params_buffer      = ggml_backend_alloc_ctx_tensors(params_ctx, backend);
+        params_buffer      = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend);
         if (params_buffer == NULL) {
             LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i",
                       get_desc().c_str(),
@@ -1342,14 +1441,9 @@ public:
         size_t params_buffer_size = ggml_backend_buffer_get_size(params_buffer);
         LOG_DEBUG("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)",
                   get_desc().c_str(),
-                  params_buffer_size / (1024.0 * 1024.0),
-                  ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
+                  params_buffer_size / (1024.f * 1024.f),
+                  ggml_backend_is_cpu(params_backend) ? "RAM" : "VRAM",
                   num_tensors);
-        // printf("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)\n",
-        //        get_desc().c_str(),
-        //        params_buffer_size / (1024.0 * 1024.0),
-        //        ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
-        //        num_tensors);
         return true;
     }
@@ -1372,6 +1466,7 @@ public:
             ggml_gallocr_free(compute_allocr);
             compute_allocr = NULL;
         }
+        offload_params_to_params_backend();
     }
     // do copy after alloc graph
@@ -1385,7 +1480,7 @@ public:
             return NULL;
         }
         // it's performing a compute, check if backend isn't cpu
-        if (!ggml_backend_is_cpu(backend) && (tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer))) {
+        if (!ggml_backend_is_cpu(runtime_backend) && (tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer))) {
             // pass input tensors to gpu memory
             auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);
@@ -1401,16 +1496,20 @@ public:
                  bool free_compute_buffer_immediately = true,
                  struct ggml_tensor** output          = NULL,
                  struct ggml_context* output_ctx      = NULL) {
+        if (!offload_params_to_runtime_backend()) {
+            LOG_ERROR("%s offload params to runtime backend failed", get_desc().c_str());
+            return;
+        }
         alloc_compute_buffer(get_graph);
         reset_compute_ctx();
         struct ggml_cgraph* gf = get_graph();
         GGML_ASSERT(ggml_gallocr_alloc_graph(compute_allocr, gf));
         cpy_data_to_backend_tensor();
-        if (ggml_backend_is_cpu(backend)) {
-            ggml_backend_cpu_set_n_threads(backend, n_threads);
+        if (ggml_backend_is_cpu(runtime_backend)) {
+            ggml_backend_cpu_set_n_threads(runtime_backend, n_threads);
         }
-        ggml_backend_graph_compute(backend, gf);
+        ggml_backend_graph_compute(runtime_backend, gf);
 #ifdef GGML_PERF
         ggml_graph_print(gf);
 #endif
@@ -1420,7 +1519,7 @@ public:
             *output = ggml_dup_tensor(output_ctx, result);
         }
         if (*output != NULL) {
-            ggml_backend_tensor_get_and_sync(backend, result, (*output)->data, 0, ggml_nbytes(*output));
+            ggml_backend_tensor_get_and_sync(runtime_backend, result, (*output)->data, 0, ggml_nbytes(*output));
         }
     }
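
Taken together, the GGMLRunner changes implement the offload scheme: when `offload_params_to_cpu` is set and the runtime backend is not the CPU, parameters are allocated on a separate CPU `params_backend`; right before a compute, `offload_params_to_runtime_backend()` duplicates the parameter tensors into `offload_ctx`, allocates them on the runtime backend, copies the weights over and swaps the `buffer`/`data` pointers so the graph transparently sees device-resident weights; `free_compute_buffer()` then calls `offload_params_to_params_backend()` to swap the pointers back and release the device copy. A minimal standalone sketch of the pointer-swap idea, using only public ggml APIs (two CPU backends stand in for the CPU params backend and the GPU runtime backend; the variable names are illustrative, not from the repository):

```cpp
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include <utility>

int main() {
    ggml_backend_t params_backend  = ggml_backend_cpu_init();  // where weights normally live
    ggml_backend_t runtime_backend = ggml_backend_cpu_init();  // stand-in for a GPU backend

    ggml_init_params ip = {ggml_tensor_overhead() * 8, nullptr, /*no_alloc=*/true};
    ggml_context* params_ctx  = ggml_init(ip);  // holds the "real" weight tensors
    ggml_context* offload_ctx = ggml_init(ip);  // holds the temporary runtime-side copies

    ggml_tensor* w = ggml_new_tensor_1d(params_ctx, GGML_TYPE_F32, 16);
    ggml_backend_buffer_t params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend);

    // 1. duplicate the weight into the offload context and allocate it on the runtime backend
    ggml_tensor* w_dev = ggml_dup_tensor(offload_ctx, w);
    ggml_backend_buffer_t runtime_buffer = ggml_backend_alloc_ctx_tensors(offload_ctx, runtime_backend);

    // 2. copy the data over and swap buffer/data pointers, so any code holding `w`
    //    now computes against the runtime-backend copy without being rewired
    ggml_backend_tensor_copy(w, w_dev);
    std::swap(w->buffer, w_dev->buffer);
    std::swap(w->data, w_dev->data);

    // ... build and run graphs that reference `w` here ...

    // 3. swap back and free the runtime copy (what offload_params_to_params_backend() does)
    std::swap(w->buffer, w_dev->buffer);
    std::swap(w->data, w_dev->data);
    ggml_backend_buffer_free(runtime_buffer);

    ggml_backend_buffer_free(params_buffer);
    ggml_free(offload_ctx);
    ggml_free(params_ctx);
    ggml_backend_free(runtime_backend);
    ggml_backend_free(params_backend);
    return 0;
}
```

The upshot is that peak VRAM only has to hold one runner's parameters at a time, traded against a host-to-device copy before each runner's compute.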

View File

@@ -92,6 +92,7 @@ struct LoraModel : public GGMLRunner {
     float multiplier = 1.0f;
     std::map<std::string, struct ggml_tensor*> lora_tensors;
+    std::map<ggml_tensor*, ggml_tensor*> original_weight_to_final_weight;
     std::string file_path;
     ModelLoader model_loader;
     bool load_failed = false;
@@ -103,7 +104,7 @@ struct LoraModel : public GGMLRunner {
     LoraModel(ggml_backend_t backend,
               const std::string& file_path = "",
               const std::string prefix     = "")
-        : file_path(file_path), GGMLRunner(backend) {
+        : file_path(file_path), GGMLRunner(backend, false) {
         if (!model_loader.init_from_file(file_path, prefix)) {
             load_failed = true;
         }
@@ -151,11 +152,11 @@ struct LoraModel : public GGMLRunner {
             return true;
         };
-        model_loader.load_tensors(on_new_tensor_cb, backend);
+        model_loader.load_tensors(on_new_tensor_cb);
         alloc_params_buffer();
         // exit(0);
         dry_run = false;
-        model_loader.load_tensors(on_new_tensor_cb, backend);
+        model_loader.load_tensors(on_new_tensor_cb);
         LOG_DEBUG("lora type: \"%s\"/\"%s\"", lora_downs[type].c_str(), lora_ups[type].c_str());
@@ -790,6 +791,11 @@ struct LoraModel : public GGMLRunner {
                     updown = ggml_merge_lora(compute_ctx, lora_down, lora_up, lora_mid);
                 }
                 scale_value *= multiplier;
+                ggml_tensor* original_weight = weight;
+                if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(weight->buffer)) {
+                    weight = ggml_dup_tensor(compute_ctx, weight);
+                    set_backend_tensor_data(weight, original_weight->data);
+                }
                 updown = ggml_reshape(compute_ctx, updown, weight);
                 GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight));
                 updown = ggml_scale_inplace(compute_ctx, updown, scale_value);
@@ -805,6 +811,9 @@ struct LoraModel : public GGMLRunner {
                 }
                 // final_weight = ggml_add_inplace(compute_ctx, weight, updown);  // apply directly
                 ggml_build_forward_expand(gf, final_weight);
+                if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_weight->buffer)) {
+                    original_weight_to_final_weight[original_weight] = final_weight;
+                }
                 break;
             }
         }
@@ -839,7 +848,14 @@ struct LoraModel : public GGMLRunner {
         auto get_graph = [&]() -> struct ggml_cgraph* {
             return build_lora_graph(model_tensors, version);
         };
-        GGMLRunner::compute(get_graph, n_threads, true);
+        GGMLRunner::compute(get_graph, n_threads, false);
+        for (auto item : original_weight_to_final_weight) {
+            ggml_tensor* original_weight = item.first;
+            ggml_tensor* final_weight    = item.second;
+            ggml_backend_tensor_copy(final_weight, original_weight);
+        }
+        GGMLRunner::free_compute_buffer();
     }
 };
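
The LoRA changes handle the new situation where a model weight may live in a host (CPU) buffer while the merge graph runs on a GPU runtime backend: each host-resident weight is duplicated into the compute context and fed in through `set_backend_tensor_data`, the merged result is remembered in `original_weight_to_final_weight`, and `GGMLRunner::compute` is called with `free_compute_buffer_immediately = false` so the results stay alive for a copy-back. A short annotated sketch of that copy-back step (it mirrors the loop added above; the comments describe where each tensor lives):

```cpp
// After the merge graph has run on the runtime backend, write each merged weight
// back into the host-resident original before releasing the compute buffer.
for (auto& item : original_weight_to_final_weight) {
    ggml_tensor* original_weight = item.first;   // lives in the CPU params buffer
    ggml_tensor* final_weight    = item.second;  // lives in the runtime (GPU) compute buffer
    ggml_backend_tensor_copy(final_weight, original_weight);
}
GGMLRunner::free_compute_buffer();               // only now is the compute buffer released
```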

View File

@@ -846,9 +846,10 @@ struct MMDiTRunner : public GGMLRunner {
     MMDiT mmdit;
     MMDiTRunner(ggml_backend_t backend,
+                bool offload_params_to_cpu,
                const String2GGMLType& tensor_types = {},
                const std::string prefix = "")
-        : GGMLRunner(backend), mmdit(tensor_types) {
+        : GGMLRunner(backend, offload_params_to_cpu), mmdit(tensor_types) {
         mmdit.init(params_ctx, tensor_types, prefix);
     }
@@ -946,7 +947,7 @@ struct MMDiTRunner : public GGMLRunner {
     // ggml_backend_t backend = ggml_backend_cuda_init(0);
     ggml_backend_t backend    = ggml_backend_cpu_init();
     ggml_type model_data_type = GGML_TYPE_F16;
-    std::shared_ptr<MMDiTRunner> mmdit = std::shared_ptr<MMDiTRunner>(new MMDiTRunner(backend));
+    std::shared_ptr<MMDiTRunner> mmdit = std::shared_ptr<MMDiTRunner>(new MMDiTRunner(backend, false));
     {
         LOG_INFO("loading from '%s'", file_path.c_str());
@@ -960,7 +961,7 @@ struct MMDiTRunner : public GGMLRunner {
             return;
         }
-        bool success = model_loader.load_tensors(tensors, backend);
+        bool success = model_loader.load_tensors(tensors);
         if (!success) {
             LOG_ERROR("load tensors from model loader failed");

View File

@@ -1048,12 +1048,12 @@ bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::s
             }
         }
         for (int i = GGML_MAX_DIMS; i < n_dims; i++) {
             shape->ne[GGML_MAX_DIMS - 1] *= ne[i];  // stack to last dim;
         }
         return true;
     };
     ctx_gguf_ = gguf_init_from_file_ext(file_path.c_str(), {true, &ctx_meta_}, on_tensor_shape_read);
     if (!ctx_gguf_) {
         LOG_ERROR("failed to open '%s'", file_path.c_str());
         return false;
@@ -1917,7 +1917,7 @@ std::vector<TensorStorage> remove_duplicates(const std::vector<TensorStorage>& v
     return res;
 }
-bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend) {
+bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) {
     std::vector<TensorStorage> processed_tensor_storages;
     for (auto& tensor_storage : tensor_storages) {
         // LOG_DEBUG("%s", name.c_str());
@@ -2115,7 +2115,6 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
 }
 bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
-                               ggml_backend_t backend,
                                std::set<std::string> ignore_tensors) {
     std::set<std::string> tensor_names_in_file;
     auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
@@ -2155,7 +2154,7 @@ bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tenso
         return true;
     };
-    bool success = load_tensors(on_new_tensor_cb, backend);
+    bool success = load_tensors(on_new_tensor_cb);
     if (!success) {
         LOG_ERROR("load tensors from file failed");
         return false;
@@ -2299,7 +2298,7 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type
         return true;
     };
-    bool success = load_tensors(on_new_tensor_cb, backend);
+    bool success = load_tensors(on_new_tensor_cb);
     ggml_backend_free(backend);
     LOG_INFO("load tensors done");
     LOG_INFO("trying to save tensors to %s", file_path.c_str());

View File

@@ -245,9 +245,8 @@ public:
     ggml_type get_diffusion_model_wtype();
     ggml_type get_vae_wtype();
     void set_wtype_override(ggml_type wtype, std::string prefix = "");
-    bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend);
+    bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb);
     bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
-                      ggml_backend_t backend,
                       std::set<std::string> ignore_tensors = {});
     bool save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules);
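
With placement now decided by where each destination tensor was allocated (the params buffer of its runner, CPU or device), `ModelLoader::load_tensors` no longer needs a backend argument; callers pass just the tensor map and an optional ignore set. A hedged sketch of the updated call shape (the `runner` variable and its prefix are illustrative):

```cpp
// Assumed usage after this change: the loader writes into whatever buffer each
// destination tensor already lives in, so no backend handle is needed.
std::map<std::string, ggml_tensor*> tensors;
runner->get_param_tensors(tensors, "model.diffusion_model");  // collect params from a GGMLRunner subclass

std::set<std::string> ignore_tensors = {"conditioner.embedders.3"};  // entry taken from this diff
if (!model_loader.load_tensors(tensors, ignore_tensors)) {
    LOG_ERROR("load tensors from model loader failed");
}
```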

View File

@@ -624,12 +624,13 @@ public:
 public:
     PhotoMakerIDEncoder(ggml_backend_t backend,
+                        bool offload_params_to_cpu,
                         const String2GGMLType& tensor_types,
                         const std::string prefix,
                         SDVersion version = VERSION_SDXL,
                         PMVersion pm_v    = PM_VERSION_1,
                         float sty         = 20.f)
-        : GGMLRunner(backend),
+        : GGMLRunner(backend, offload_params_to_cpu),
           version(version),
           pm_version(pm_v),
           style_strength(sty) {
@@ -785,10 +786,11 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
     bool applied = false;
     PhotoMakerIDEmbed(ggml_backend_t backend,
+                      bool offload_params_to_cpu,
                       ModelLoader* ml,
                       const std::string& file_path = "",
                       const std::string& prefix    = "")
-        : file_path(file_path), GGMLRunner(backend), model_loader(ml) {
+        : file_path(file_path), GGMLRunner(backend, offload_params_to_cpu), model_loader(ml) {
         if (!model_loader->init_from_file(file_path, prefix)) {
             load_failed = true;
         }
@@ -828,11 +830,11 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
             return true;
         };
-        model_loader->load_tensors(on_new_tensor_cb, backend);
+        model_loader->load_tensors(on_new_tensor_cb);
         alloc_params_buffer();
         dry_run = false;
-        model_loader->load_tensors(on_new_tensor_cb, backend);
+        model_loader->load_tensors(on_new_tensor_cb);
         LOG_DEBUG("finished loading PhotoMaker ID Embeds ");
         return true;

View File

@@ -104,9 +104,10 @@ public:
     std::shared_ptr<PhotoMakerIDEmbed> pmid_id_embeds;
     std::string taesd_path;
     bool use_tiny_autoencoder = false;
     bool vae_tiling           = false;
+    bool offload_params_to_cpu = false;
     bool stacked_id           = false;
     bool is_using_v_parameterization     = false;
     bool is_using_edm_v_parameterization = false;
@@ -180,6 +181,7 @@ public:
         taesd_path           = SAFE_STR(sd_ctx_params->taesd_path);
         use_tiny_autoencoder = taesd_path.size() > 0;
         vae_tiling           = sd_ctx_params->vae_tiling;
+        offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu;
         if (sd_ctx_params->rng_type == STD_DEFAULT_RNG) {
             rng = std::make_shared<STDDefaultRNG>();
@@ -327,8 +329,12 @@ public:
             if (sd_ctx_params->diffusion_flash_attn) {
                 LOG_WARN("flash attention in this diffusion model is currently unsupported!");
             }
-            cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
-            diffusion_model  = std::make_shared<MMDiTModel>(backend, model_loader.tensor_storages_types);
+            cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend,
+                                                                 offload_params_to_cpu,
+                                                                 model_loader.tensor_storages_types);
+            diffusion_model  = std::make_shared<MMDiTModel>(backend,
+                                                            offload_params_to_cpu,
+                                                            model_loader.tensor_storages_types);
         } else if (sd_version_is_flux(version)) {
             bool is_chroma = false;
             for (auto pair : model_loader.tensor_storages_types) {
@@ -339,43 +345,52 @@ public:
             }
             if (is_chroma) {
                 cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
+                                                                    offload_params_to_cpu,
                                                                     model_loader.tensor_storages_types,
                                                                     -1,
                                                                     sd_ctx_params->chroma_use_t5_mask,
                                                                     sd_ctx_params->chroma_t5_mask_pad);
             } else {
-                cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
+                cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend,
+                                                                      offload_params_to_cpu,
+                                                                      model_loader.tensor_storages_types);
             }
             diffusion_model = std::make_shared<FluxModel>(backend,
+                                                          offload_params_to_cpu,
                                                           model_loader.tensor_storages_types,
                                                           version,
                                                           sd_ctx_params->diffusion_flash_attn,
                                                           sd_ctx_params->chroma_use_dit_mask);
         } else if (sd_version_is_wan(version)) {
             cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
+                                                                offload_params_to_cpu,
                                                                 model_loader.tensor_storages_types,
                                                                 -1,
                                                                 true,
                                                                 1,
                                                                 true);
             diffusion_model = std::make_shared<WanModel>(backend,
+                                                         offload_params_to_cpu,
                                                          model_loader.tensor_storages_types,
                                                          version,
                                                          sd_ctx_params->diffusion_flash_attn);
         } else {  // SD1.x SD2.x SDXL
             if (strstr(SAFE_STR(sd_ctx_params->stacked_id_embed_dir), "v2")) {
                 cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend,
+                                                                                       offload_params_to_cpu,
                                                                                        model_loader.tensor_storages_types,
                                                                                        SAFE_STR(sd_ctx_params->embedding_dir),
                                                                                        version,
                                                                                        PM_VERSION_2);
             } else {
                 cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend,
+                                                                                       offload_params_to_cpu,
                                                                                        model_loader.tensor_storages_types,
                                                                                        SAFE_STR(sd_ctx_params->embedding_dir),
                                                                                        version);
             }
             diffusion_model = std::make_shared<UNetModel>(backend,
+                                                          offload_params_to_cpu,
                                                           model_loader.tensor_storages_types,
                                                           version,
                                                           sd_ctx_params->diffusion_flash_attn);
@@ -396,6 +411,7 @@ public:
         if (sd_version_is_wan(version)) {
             first_stage_model = std::make_shared<WAN::WanVAERunner>(vae_backend,
+                                                                    offload_params_to_cpu,
                                                                     model_loader.tensor_storages_types,
                                                                     "first_stage_model",
                                                                     vae_decode_only);
@@ -403,6 +419,7 @@ public:
             first_stage_model->get_param_tensors(tensors, "first_stage_model");
         } else if (!use_tiny_autoencoder) {
             first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend,
+                                                                offload_params_to_cpu,
                                                                 model_loader.tensor_storages_types,
                                                                 "first_stage_model",
                                                                 vae_decode_only,
@@ -412,6 +429,7 @@ public:
             first_stage_model->get_param_tensors(tensors, "first_stage_model");
         } else {
             tae_first_stage = std::make_shared<TinyAutoEncoder>(vae_backend,
+                                                                offload_params_to_cpu,
                                                                 model_loader.tensor_storages_types,
                                                                 "decoder.layers",
                                                                 vae_decode_only,
@@ -427,14 +445,26 @@ public:
             } else {
                 controlnet_backend = backend;
             }
-            control_net = std::make_shared<ControlNet>(controlnet_backend, model_loader.tensor_storages_types, version);
+            control_net = std::make_shared<ControlNet>(controlnet_backend,
+                                                       offload_params_to_cpu,
+                                                       model_loader.tensor_storages_types,
+                                                       version);
         }
         if (strstr(SAFE_STR(sd_ctx_params->stacked_id_embed_dir), "v2")) {
-            pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend, model_loader.tensor_storages_types, "pmid", version, PM_VERSION_2);
+            pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend,
+                                                               offload_params_to_cpu,
+                                                               model_loader.tensor_storages_types,
+                                                               "pmid",
+                                                               version,
+                                                               PM_VERSION_2);
             LOG_INFO("using PhotoMaker Version 2");
         } else {
-            pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend, model_loader.tensor_storages_types, "pmid", version);
+            pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend,
+                                                               offload_params_to_cpu,
+                                                               model_loader.tensor_storages_types,
+                                                               "pmid",
+                                                               version);
         }
         if (strlen(SAFE_STR(sd_ctx_params->stacked_id_embed_dir)) > 0) {
             pmid_lora = std::make_shared<LoraModel>(backend, sd_ctx_params->stacked_id_embed_dir, "");
@@ -489,7 +519,7 @@ public:
         if (version == VERSION_SVD) {
             ignore_tensors.insert("conditioner.embedders.3");
         }
-        bool success = model_loader.load_tensors(tensors, backend, ignore_tensors);
+        bool success = model_loader.load_tensors(tensors, ignore_tensors);
         if (!success) {
             LOG_ERROR("load tensors from model loader failed");
             ggml_free(ctx);
@@ -1354,6 +1384,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
     sd_ctx_params->wtype    = SD_TYPE_COUNT;
     sd_ctx_params->rng_type = CUDA_RNG;
     sd_ctx_params->schedule = DEFAULT;
+    sd_ctx_params->offload_params_to_cpu   = false;
     sd_ctx_params->keep_clip_on_cpu        = false;
     sd_ctx_params->keep_control_net_on_cpu = false;
     sd_ctx_params->keep_vae_on_cpu         = false;
@@ -1388,6 +1419,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
                      "wtype: %s\n"
                      "rng_type: %s\n"
                      "schedule: %s\n"
+                     "offload_params_to_cpu: %s\n"
                      "keep_clip_on_cpu: %s\n"
                      "keep_control_net_on_cpu: %s\n"
                      "keep_vae_on_cpu: %s\n"
@@ -1413,6 +1445,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
                      sd_type_name(sd_ctx_params->wtype),
                      sd_rng_type_name(sd_ctx_params->rng_type),
                      sd_schedule_name(sd_ctx_params->schedule),
+                     BOOL_STR(sd_ctx_params->offload_params_to_cpu),
                      BOOL_STR(sd_ctx_params->keep_clip_on_cpu),
                      BOOL_STR(sd_ctx_params->keep_control_net_on_cpu),
                      BOOL_STR(sd_ctx_params->keep_vae_on_cpu),
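
For library users, the behaviour is opt-in through the public context parameters: `sd_ctx_params_t` gains an `offload_params_to_cpu` field, initialised to false by `sd_ctx_params_init` and reported by `sd_ctx_params_to_str`. A hedged usage sketch (the `model_path` field and the `new_sd_ctx(&params)` entry point are assumed from the existing API, not part of this diff):

```cpp
#include <cstdio>
#include "stable-diffusion.h"

void create_context_with_offloading() {
    sd_ctx_params_t params;
    sd_ctx_params_init(&params);                 // offload_params_to_cpu defaults to false
    params.model_path            = "sd_xl_base_1.0.safetensors";  // illustrative path, assumed field
    params.offload_params_to_cpu = true;         // keep weights in RAM, stream them to VRAM per module

    printf("%s\n", sd_ctx_params_to_str(&params));  // now includes "offload_params_to_cpu: true"

    sd_ctx_t* ctx = new_sd_ctx(&params);         // assumed entry point; offloading applies to all runners
    // ... generate images, then free_sd_ctx(ctx) ...
}
```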

View File

@@ -130,6 +130,7 @@ typedef struct {
     enum sd_type_t wtype;
     enum rng_type_t rng_type;
     enum schedule_t schedule;
+    bool offload_params_to_cpu;
     bool keep_clip_on_cpu;
     bool keep_control_net_on_cpu;
     bool keep_vae_on_cpu;
@@ -236,10 +237,13 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
 typedef struct upscaler_ctx_t upscaler_ctx_t;
 SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
+                                        bool offload_params_to_cpu,
                                         int n_threads);
 SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
-SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor);
+SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx,
+                          sd_image_t input_image,
+                          uint32_t upscale_factor);
 SD_API bool convert(const char* input_path,
                     const char* vae_path,
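
The upscaler path gets the same switch: `new_upscaler_ctx` now takes the flag between the model path and the thread count, an API-breaking change for existing callers. A minimal sketch against the prototypes above (`input_image` is an `sd_image_t` assumed to be prepared elsewhere):

```cpp
// With offloading enabled, the ESRGAN weights stay in RAM and are copied to the
// runtime backend only while the upscale graph actually runs.
upscaler_ctx_t* up = new_upscaler_ctx("RealESRGAN_x4plus_anime_6B.pth",
                                      /*offload_params_to_cpu=*/true,
                                      /*n_threads=*/8);
if (up != NULL) {
    sd_image_t out = upscale(up, input_image, 4);
    // ... use out.data ...
    free_upscaler_ctx(up);
}
```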

t5.hpp
View File

@@ -756,10 +756,11 @@ struct T5Runner : public GGMLRunner {
     std::vector<int> relative_position_bucket_vec;
     T5Runner(ggml_backend_t backend,
+             bool offload_params_to_cpu,
             const String2GGMLType& tensor_types,
             const std::string prefix,
             bool is_umt5 = false)
-        : GGMLRunner(backend) {
+        : GGMLRunner(backend, offload_params_to_cpu) {
         if (is_umt5) {
             params.vocab_size        = 256384;
             params.relative_attention = false;
@@ -900,10 +901,11 @@ struct T5Embedder {
     T5Runner model;
     T5Embedder(ggml_backend_t backend,
+               bool offload_params_to_cpu,
               const String2GGMLType& tensor_types = {},
              const std::string prefix = "",
              bool is_umt5 = false)
-        : model(backend, tensor_types, prefix, is_umt5), tokenizer(is_umt5) {
+        : model(backend, offload_params_to_cpu, tensor_types, prefix, is_umt5), tokenizer(is_umt5) {
     }
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
@@ -1012,13 +1014,13 @@ struct T5Embedder {
         }
     }
-    std::shared_ptr<T5Embedder> t5 = std::shared_ptr<T5Embedder>(new T5Embedder(backend, tensor_types, "", true));
+    std::shared_ptr<T5Embedder> t5 = std::shared_ptr<T5Embedder>(new T5Embedder(backend, false, tensor_types, "", true));
     t5->alloc_params_buffer();
     std::map<std::string, ggml_tensor*> tensors;
     t5->get_param_tensors(tensors, "");
-    bool success = model_loader.load_tensors(tensors, backend);
+    bool success = model_loader.load_tensors(tensors);
     if (!success) {
         LOG_ERROR("load tensors from model loader failed");

View File

@@ -196,13 +196,14 @@ struct TinyAutoEncoder : public GGMLRunner {
     bool decode_only = false;
     TinyAutoEncoder(ggml_backend_t backend,
+                    bool offload_params_to_cpu,
                     const String2GGMLType& tensor_types,
                     const std::string prefix,
                     bool decoder_only = true,
                     SDVersion version = VERSION_SD1)
         : decode_only(decoder_only),
           taesd(decoder_only, version),
-          GGMLRunner(backend) {
+          GGMLRunner(backend, offload_params_to_cpu) {
         taesd.init(params_ctx, tensor_types, prefix);
     }
@@ -226,7 +227,7 @@ struct TinyAutoEncoder : public GGMLRunner {
             return false;
         }
-        bool success = model_loader.load_tensors(taesd_tensors, backend, ignore_tensors);
+        bool success = model_loader.load_tensors(taesd_tensors, ignore_tensors);
         if (!success) {
             LOG_ERROR("load tae tensors from model loader failed");

View File

@@ -538,11 +538,12 @@ struct UNetModelRunner : public GGMLRunner {
     UnetModelBlock unet;
     UNetModelRunner(ggml_backend_t backend,
+                    bool offload_params_to_cpu,
                     const String2GGMLType& tensor_types,
                     const std::string prefix,
                     SDVersion version = VERSION_SD1,
                     bool flash_attn   = false)
-        : GGMLRunner(backend), unet(version, tensor_types, flash_attn) {
+        : GGMLRunner(backend, offload_params_to_cpu), unet(version, tensor_types, flash_attn) {
         unet.init(params_ctx, tensor_types, prefix);
     }

View File

@@ -14,7 +14,8 @@ struct UpscalerGGML {
         : n_threads(n_threads) {
     }
-    bool load_from_file(const std::string& esrgan_path) {
+    bool load_from_file(const std::string& esrgan_path,
+                        bool offload_params_to_cpu) {
 #ifdef SD_USE_CUDA
         LOG_DEBUG("Using CUDA backend");
         backend = ggml_backend_cuda_init(0);
@@ -46,7 +47,7 @@ struct UpscalerGGML {
             backend = ggml_backend_cpu_init();
         }
         LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
-        esrgan_upscaler = std::make_shared<ESRGAN>(backend, model_loader.tensor_storages_types);
+        esrgan_upscaler = std::make_shared<ESRGAN>(backend, offload_params_to_cpu, model_loader.tensor_storages_types);
         if (!esrgan_upscaler->load_from_file(esrgan_path)) {
             return false;
         }
@@ -104,6 +105,7 @@ struct upscaler_ctx_t {
 };
 upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
+                                 bool offload_params_to_cpu,
                                  int n_threads) {
     upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t));
     if (upscaler_ctx == NULL) {
@@ -116,7 +118,7 @@ upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
         return NULL;
     }
-    if (!upscaler_ctx->upscaler->load_from_file(esrgan_path)) {
+    if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu)) {
         delete upscaler_ctx->upscaler;
         upscaler_ctx->upscaler = NULL;
         free(upscaler_ctx);

View File

@@ -521,8 +521,8 @@ public:
 };
 struct VAE : public GGMLRunner {
-    VAE(ggml_backend_t backend)
-        : GGMLRunner(backend) {}
+    VAE(ggml_backend_t backend, bool offload_params_to_cpu)
+        : GGMLRunner(backend, offload_params_to_cpu) {}
     virtual void compute(const int n_threads,
                          struct ggml_tensor* z,
                          bool decode_graph,
@@ -536,12 +536,13 @@ struct AutoEncoderKL : public VAE {
     AutoencodingEngine ae;
     AutoEncoderKL(ggml_backend_t backend,
+                  bool offload_params_to_cpu,
                   const String2GGMLType& tensor_types,
                   const std::string prefix,
                   bool decode_only       = false,
                   bool use_video_decoder = false,
                   SDVersion version      = VERSION_SD1)
-        : decode_only(decode_only), ae(decode_only, use_video_decoder, version), VAE(backend) {
+        : decode_only(decode_only), ae(decode_only, use_video_decoder, version), VAE(backend, offload_params_to_cpu) {
         ae.init(params_ctx, tensor_types, prefix);
     }

wan.hpp
View File

@@ -767,10 +767,11 @@ namespace WAN {
         std::vector<FeatCache> _feat_vec_map;
         WanVAERunner(ggml_backend_t backend,
+                     bool offload_params_to_cpu,
                      const String2GGMLType& tensor_types = {},
                      const std::string prefix = "",
                      bool decode_only = false)
-            : decode_only(decode_only), ae(decode_only), VAE(backend) {
+            : decode_only(decode_only), ae(decode_only), VAE(backend, offload_params_to_cpu) {
             ae.init(params_ctx, tensor_types, prefix);
             rest_feat_vec_map();
         }
@@ -857,7 +858,7 @@ namespace WAN {
                     feat_cache_vec.is_rep   = true;
                     _feat_vec_map[feat_idx] = feat_cache_vec;
                 } else if (feat_cache != NULL) {
-                    _feat_vec_map[feat_idx] = FeatCache(backend, feat_cache);
+                    _feat_vec_map[feat_idx] = FeatCache(runtime_backend, feat_cache);
                 }
             }
             GGMLRunner::free_compute_buffer();
@@ -897,7 +898,7 @@ namespace WAN {
                     feat_cache_vec.is_rep   = true;
                     _feat_vec_map[feat_idx] = feat_cache_vec;
                 } else if (feat_cache != NULL) {
-                    _feat_vec_map[feat_idx] = FeatCache(backend, feat_cache);
+                    _feat_vec_map[feat_idx] = FeatCache(runtime_backend, feat_cache);
                 }
             }
@@ -943,7 +944,7 @@ namespace WAN {
         ggml_backend_t backend = ggml_backend_cuda_init(0);
         // ggml_backend_t backend = ggml_backend_cpu_init();
         ggml_type model_data_type = GGML_TYPE_F16;
-        std::shared_ptr<WanVAERunner> vae = std::shared_ptr<WanVAERunner>(new WanVAERunner(backend));
+        std::shared_ptr<WanVAERunner> vae = std::shared_ptr<WanVAERunner>(new WanVAERunner(backend, false));
         {
             LOG_INFO("loading from '%s'", file_path.c_str());
@@ -957,7 +958,7 @@ namespace WAN {
                 return;
             }
-            bool success = model_loader.load_tensors(tensors, backend);
+            bool success = model_loader.load_tensors(tensors);
             if (!success) {
                 LOG_ERROR("load tensors from model loader failed");
@@ -1564,11 +1565,12 @@ namespace WAN {
         SDVersion version;
         WanRunner(ggml_backend_t backend,
+                  bool offload_params_to_cpu,
                   const String2GGMLType& tensor_types = {},
                   const std::string prefix = "",
                   SDVersion version = VERSION_WAN2,
                   bool flash_attn   = false)
-            : GGMLRunner(backend) {
+            : GGMLRunner(backend, offload_params_to_cpu) {
             wan_params.flash_attn = flash_attn;
             wan_params.num_layers = 0;
             for (auto pair : tensor_types) {
@@ -1747,6 +1749,7 @@ namespace WAN {
         }
         std::shared_ptr<WanRunner> wan = std::shared_ptr<WanRunner>(new WanRunner(backend,
+                                                                                  false,
                                                                                   tensor_types,
                                                                                   "model.diffusion_model"));
@@ -1754,7 +1757,7 @@ namespace WAN {
         std::map<std::string, ggml_tensor*> tensors;
         wan->get_param_tensors(tensors, "model.diffusion_model");
-        bool success = model_loader.load_tensors(tensors, backend);
+        bool success = model_loader.load_tensors(tensors);
         if (!success) {
             LOG_ERROR("load tensors from model loader failed");