diff --git a/clip.hpp b/clip.hpp index 321f5f8..1ee942d 100644 --- a/clip.hpp +++ b/clip.hpp @@ -868,12 +868,13 @@ struct CLIPTextModelRunner : public GGMLRunner { CLIPTextModel model; CLIPTextModelRunner(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types, const std::string prefix, CLIPVersion version = OPENAI_CLIP_VIT_L_14, bool with_final_ln = true, int clip_skip_value = -1) - : GGMLRunner(backend), model(version, with_final_ln, clip_skip_value) { + : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, clip_skip_value) { model.init(params_ctx, tensor_types, prefix); } diff --git a/conditioner.hpp b/conditioner.hpp index e63169b..e5b5d35 100644 --- a/conditioner.hpp +++ b/conditioner.hpp @@ -57,6 +57,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { std::vector readed_embeddings; FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types, const std::string& embd_dir, SDVersion version = VERSION_SD1, @@ -64,12 +65,12 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { int clip_skip = -1) : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) { if (sd_version_is_sd1(version)) { - text_model = std::make_shared(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14); + text_model = std::make_shared(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14); } else if (sd_version_is_sd2(version)) { - text_model = std::make_shared(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14); + text_model = std::make_shared(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14); } else if (sd_version_is_sdxl(version)) { - text_model = std::make_shared(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false); - text_model2 = std::make_shared(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false); + text_model = std::make_shared(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false); + text_model2 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false); } set_clip_skip(clip_skip); } @@ -154,7 +155,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { } return true; }; - model_loader.load_tensors(on_load, NULL); + model_loader.load_tensors(on_load); readed_embeddings.push_back(embd_name); if (embd) { int64_t hidden_size = text_model->model.hidden_size; @@ -618,8 +619,10 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { struct FrozenCLIPVisionEmbedder : public GGMLRunner { CLIPVisionModelProjection vision_model; - FrozenCLIPVisionEmbedder(ggml_backend_t backend, const String2GGMLType& tensor_types = {}) - : vision_model(OPEN_CLIP_VIT_H_14, true), GGMLRunner(backend) { + FrozenCLIPVisionEmbedder(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2GGMLType& tensor_types = {}) + : vision_model(OPEN_CLIP_VIT_H_14, true), GGMLRunner(backend, offload_params_to_cpu) { vision_model.init(params_ctx, tensor_types, "cond_stage_model.transformer"); } @@ -663,12 +666,13 @@ struct SD3CLIPEmbedder : public Conditioner { std::shared_ptr t5; SD3CLIPEmbedder(ggml_backend_t 
backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}, int clip_skip = -1) : clip_g_tokenizer(0) { - clip_l = std::make_shared(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false); - clip_g = std::make_shared(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false); - t5 = std::make_shared(backend, tensor_types, "text_encoders.t5xxl.transformer"); + clip_l = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false); + clip_g = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false); + t5 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer"); set_clip_skip(clip_skip); } @@ -1010,10 +1014,11 @@ struct FluxCLIPEmbedder : public Conditioner { size_t chunk_len = 256; FluxCLIPEmbedder(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}, int clip_skip = -1) { - clip_l = std::make_shared(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true); - t5 = std::make_shared(backend, tensor_types, "text_encoders.t5xxl.transformer"); + clip_l = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true); + t5 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer"); set_clip_skip(clip_skip); } @@ -1232,13 +1237,14 @@ struct T5CLIPEmbedder : public Conditioner { bool is_umt5 = false; T5CLIPEmbedder(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}, int clip_skip = -1, bool use_mask = false, int mask_pad = 1, bool is_umt5 = false) : use_mask(use_mask), mask_pad(mask_pad), t5_tokenizer(is_umt5) { - t5 = std::make_shared(backend, tensor_types, "text_encoders.t5xxl.transformer", is_umt5); + t5 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer", is_umt5); } void set_clip_skip(int clip_skip) { diff --git a/control.hpp b/control.hpp index d8f81fc..19f9181 100644 --- a/control.hpp +++ b/control.hpp @@ -317,9 +317,10 @@ struct ControlNet : public GGMLRunner { bool guided_hint_cached = false; ControlNet(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}, SDVersion version = VERSION_SD1) - : GGMLRunner(backend), control_net(version) { + : GGMLRunner(backend, offload_params_to_cpu), control_net(version) { control_net.init(params_ctx, tensor_types, ""); } @@ -346,7 +347,7 @@ struct ControlNet : public GGMLRunner { control_buffer_size += ggml_nbytes(controls[i]); } - control_buffer = ggml_backend_alloc_ctx_tensors(control_ctx, backend); + control_buffer = ggml_backend_alloc_ctx_tensors(control_ctx, runtime_backend); LOG_DEBUG("control buffer size %.2fMB", control_buffer_size * 1.f / 1024.f / 1024.f); } @@ -443,7 +444,7 @@ struct ControlNet : public GGMLRunner { return false; } - bool success = model_loader.load_tensors(tensors, backend, ignore_tensors); + bool success = model_loader.load_tensors(tensors, ignore_tensors); if (!success) { LOG_ERROR("load control net tensors from model loader failed"); diff --git a/diffusion_model.hpp b/diffusion_model.hpp index 6ac5c9b..4a9f170 100644 --- a/diffusion_model.hpp +++ b/diffusion_model.hpp @@ -33,10 +33,11 @@ struct 
UNetModel : public DiffusionModel { UNetModelRunner unet; UNetModel(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}, SDVersion version = VERSION_SD1, bool flash_attn = false) - : unet(backend, tensor_types, "model.diffusion_model", version, flash_attn) { + : unet(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model", version, flash_attn) { } void alloc_params_buffer() { @@ -86,8 +87,9 @@ struct MMDiTModel : public DiffusionModel { MMDiTRunner mmdit; MMDiTModel(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}) - : mmdit(backend, tensor_types, "model.diffusion_model") { + : mmdit(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model") { } void alloc_params_buffer() { @@ -136,11 +138,12 @@ struct FluxModel : public DiffusionModel { Flux::FluxRunner flux; FluxModel(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}, SDVersion version = VERSION_FLUX, bool flash_attn = false, bool use_mask = false) - : flux(backend, tensor_types, "model.diffusion_model", version, flash_attn, use_mask) { + : flux(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model", version, flash_attn, use_mask) { } void alloc_params_buffer() { @@ -189,10 +192,11 @@ struct WanModel : public DiffusionModel { WAN::WanRunner wan; WanModel(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}, SDVersion version = VERSION_FLUX, bool flash_attn = false) - : wan(backend, tensor_types, "model.diffusion_model", version, flash_attn) { + : wan(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model", version, flash_attn) { } void alloc_params_buffer() { diff --git a/esrgan.hpp b/esrgan.hpp index 4215db1..154e51b 100644 --- a/esrgan.hpp +++ b/esrgan.hpp @@ -142,8 +142,10 @@ struct ESRGAN : public GGMLRunner { int scale = 4; int tile_size = 128; // avoid cuda OOM for 4gb VRAM - ESRGAN(ggml_backend_t backend, const String2GGMLType& tensor_types = {}) - : GGMLRunner(backend) { + ESRGAN(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2GGMLType& tensor_types = {}) + : GGMLRunner(backend, offload_params_to_cpu) { rrdb_net.init(params_ctx, tensor_types, ""); } @@ -164,7 +166,7 @@ struct ESRGAN : public GGMLRunner { return false; } - bool success = model_loader.load_tensors(esrgan_tensors, backend); + bool success = model_loader.load_tensors(esrgan_tensors); if (!success) { LOG_ERROR("load esrgan tensors from model loader failed"); diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 9bbe7c7..a20b9b4 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -95,6 +95,7 @@ struct SDParams { int64_t seed = 42; bool verbose = false; bool vae_tiling = false; + bool offload_params_to_cpu = false; bool control_net_cpu = false; bool normalize_input = false; bool clip_on_cpu = false; @@ -141,8 +142,9 @@ void print_params(SDParams params) { for (auto& path : params.ref_image_paths) { printf(" %s\n", path.c_str()); }; - printf(" clip on cpu: %s\n", params.clip_on_cpu ? "true" : "false"); - printf(" controlnet cpu: %s\n", params.control_net_cpu ? "true" : "false"); + printf(" offload_params_to_cpu: %s\n", params.offload_params_to_cpu ? "true" : "false"); + printf(" clip_on_cpu: %s\n", params.clip_on_cpu ? "true" : "false"); + printf(" control_net_cpu: %s\n", params.control_net_cpu ? "true" : "false"); printf(" vae decoder on cpu:%s\n", params.vae_on_cpu ? 
"true" : "false"); printf(" diffusion flash attention:%s\n", params.diffusion_flash_attn ? "true" : "false"); printf(" strength(control): %.2f\n", params.control_strength); @@ -461,6 +463,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { options.bool_options = { {"", "--vae-tiling", "", true, ¶ms.vae_tiling}, + {"", "--offload-to-cpu", "", true, ¶ms.offload_params_to_cpu}, {"", "--control-net-cpu", "", true, ¶ms.control_net_cpu}, {"", "--normalize-input", "", true, ¶ms.normalize_input}, {"", "--clip-on-cpu", "", true, ¶ms.clip_on_cpu}, @@ -943,6 +946,7 @@ int main(int argc, const char* argv[]) { params.wtype, params.rng_type, params.schedule, + params.offload_params_to_cpu, params.clip_on_cpu, params.control_net_cpu, params.vae_on_cpu, @@ -1058,6 +1062,7 @@ int main(int argc, const char* argv[]) { int upscale_factor = 4; // unused for RealESRGAN_x4plus_anime_6B.pth if (params.esrgan_path.size() > 0 && params.upscale_repeats > 0) { upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(), + params.offload_params_to_cpu, params.n_threads); if (upscaler_ctx == NULL) { diff --git a/flux.hpp b/flux.hpp index 17af38a..044ea82 100644 --- a/flux.hpp +++ b/flux.hpp @@ -881,12 +881,13 @@ namespace Flux { bool use_mask = false; FluxRunner(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}, const std::string prefix = "", SDVersion version = VERSION_FLUX, bool flash_attn = false, bool use_mask = false) - : GGMLRunner(backend), use_mask(use_mask) { + : GGMLRunner(backend, offload_params_to_cpu), use_mask(use_mask) { flux_params.flash_attn = flash_attn; flux_params.guidance_embed = false; flux_params.depth = 0; @@ -1085,7 +1086,7 @@ namespace Flux { // ggml_backend_t backend = ggml_backend_cuda_init(0); ggml_backend_t backend = ggml_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_Q8_0; - std::shared_ptr flux = std::shared_ptr(new FluxRunner(backend)); + std::shared_ptr flux = std::shared_ptr(new FluxRunner(backend, false)); { LOG_INFO("loading from '%s'", file_path.c_str()); @@ -1099,7 +1100,7 @@ namespace Flux { return; } - bool success = model_loader.load_tensors(tensors, backend); + bool success = model_loader.load_tensors(tensors); if (!success) { LOG_ERROR("load tensors from model loader failed"); diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 7563aed..20134c2 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -1230,16 +1230,20 @@ struct GGMLRunner { protected: typedef std::function get_graph_cb_t; - struct ggml_context* params_ctx = NULL; - ggml_backend_buffer_t params_buffer = NULL; + ggml_backend_t params_backend = NULL; + ggml_backend_t runtime_backend = NULL; + + struct ggml_context* params_ctx = NULL; + ggml_backend_buffer_t params_buffer = NULL; + struct ggml_context* offload_ctx = NULL; + ggml_backend_buffer_t runtime_params_buffer = NULL; + bool params_on_runtime_backend = false; struct ggml_context* compute_ctx = NULL; struct ggml_gallocr* compute_allocr = NULL; std::map backend_tensor_data_map; - ggml_backend_t backend = NULL; - void alloc_params_ctx() { struct ggml_init_params params; params.mem_size = static_cast(MAX_PARAMS_TENSOR_NUM * ggml_tensor_overhead()); @@ -1248,6 +1252,10 @@ protected: params_ctx = ggml_init(params); GGML_ASSERT(params_ctx != NULL); + if (params_backend != runtime_backend) { + offload_ctx = ggml_init(params); + GGML_ASSERT(offload_ctx != NULL); + } } void free_params_ctx() { @@ -1255,6 +1263,10 @@ protected: ggml_free(params_ctx); params_ctx = NULL; } + if 
(offload_ctx != NULL) {
+            ggml_free(offload_ctx);
+            offload_ctx = NULL;
+        }
     }

     void alloc_compute_ctx() {
@@ -1281,7 +1293,7 @@ protected:
         reset_compute_ctx();
         struct ggml_cgraph* gf = get_graph();
         backend_tensor_data_map.clear();
-        compute_allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+        compute_allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(runtime_backend));

         if (!ggml_gallocr_reserve(compute_allocr, gf)) {
             // failed to allocate the compute buffer
@@ -1295,7 +1307,7 @@
         LOG_DEBUG("%s compute buffer size: %.2f MB(%s)",
                   get_desc().c_str(),
                   compute_buffer_size / 1024.0 / 1024.0,
-                  ggml_backend_is_cpu(backend) ? "RAM" : "VRAM");
+                  ggml_backend_is_cpu(runtime_backend) ? "RAM" : "VRAM");
         return true;
     }

@@ -1310,12 +1322,96 @@ protected:
         backend_tensor_data_map.clear();
     }

+    bool offload_params_to_runtime_backend() {
+        if (params_backend == runtime_backend) {
+            return true;
+        }
+        if (params_on_runtime_backend) {
+            return true;
+        }
+        GGML_ASSERT(runtime_params_buffer == NULL);
+        int64_t t0 = ggml_time_ms();
+        size_t num_tensors = ggml_tensor_num(offload_ctx);
+        if (num_tensors == 0) {
+            for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != NULL; t = ggml_get_next_tensor(params_ctx, t)) {
+                GGML_ASSERT(t->view_src == NULL);
+                ggml_dup_tensor(offload_ctx, t);
+            }
+        }
+        num_tensors = ggml_tensor_num(offload_ctx);
+        GGML_ASSERT(num_tensors == ggml_tensor_num(params_ctx));
+
+        runtime_params_buffer = ggml_backend_alloc_ctx_tensors(offload_ctx, runtime_backend);
+
+        if (runtime_params_buffer == NULL) {
+            LOG_ERROR("%s alloc runtime params backend buffer failed, num_tensors = %i",
+                      get_desc().c_str(),
+                      num_tensors);
+            return false;
+        }
+
+        ggml_tensor* t = ggml_get_first_tensor(params_ctx);
+        ggml_tensor* offload_t = ggml_get_first_tensor(offload_ctx);
+
+        while (t != NULL && offload_t != NULL) {
+            ggml_backend_tensor_copy(t, offload_t);
+            std::swap(t->buffer, offload_t->buffer);
+            std::swap(t->data, offload_t->data);
+
+            t = ggml_get_next_tensor(params_ctx, t);
+            offload_t = ggml_get_next_tensor(offload_ctx, offload_t);
+        }
+
+        int64_t t1 = ggml_time_ms();
+
+        size_t params_buffer_size = ggml_backend_buffer_get_size(runtime_params_buffer);
+        LOG_INFO("%s offload params (%6.2f MB, %i tensors) to runtime backend (%s), taking %.2fs",
+                 get_desc().c_str(),
+                 params_buffer_size / (1024.f * 1024.f),
+                 num_tensors,
+                 ggml_backend_name(runtime_backend),
+                 (t1 - t0) * 1.0f / 1000);
+
+        params_on_runtime_backend = true;
+
+        return true;
+    }
+
+    void offload_params_to_params_backend() {
+        if (!params_on_runtime_backend) {
+            return;
+        }
+        ggml_tensor* t = ggml_get_first_tensor(params_ctx);
+        ggml_tensor* offload_t = ggml_get_first_tensor(offload_ctx);
+
+        while (t != NULL && offload_t != NULL) {
+            t->buffer = offload_t->buffer;
+            t->data = offload_t->data;
+            offload_t->buffer = NULL;
+            offload_t->data = NULL;
+
+            t = ggml_get_next_tensor(params_ctx, t);
+            offload_t = ggml_get_next_tensor(offload_ctx, offload_t);
+        }
+
+        if (runtime_params_buffer != NULL) {
+            ggml_backend_buffer_free(runtime_params_buffer);
+            runtime_params_buffer = NULL;
+        }
+        params_on_runtime_backend = false;
+    }
+
 public:
     virtual std::string get_desc() = 0;

-    GGMLRunner(ggml_backend_t backend)
-        : backend(backend) {
+    GGMLRunner(ggml_backend_t backend, bool offload_params_to_cpu = false)
+        : runtime_backend(backend) {
         alloc_params_ctx();
+        if (!ggml_backend_is_cpu(runtime_backend) && offload_params_to_cpu) {
+            params_backend = ggml_backend_cpu_init();
+        } else {
+            params_backend = runtime_backend;
+        }
     }

     virtual ~GGMLRunner() {
@@ -1323,6 +1419,9 @@ public:
         free_compute_buffer();
         free_params_ctx();
         free_compute_ctx();
+        if (params_backend != runtime_backend) {
+            ggml_backend_free(params_backend);
+        }
     }

     void reset_compute_ctx() {
@@ -1332,7 +1431,7 @@ public:
     bool alloc_params_buffer() {
         size_t num_tensors = ggml_tensor_num(params_ctx);
-        params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, backend);
+        params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend);
         if (params_buffer == NULL) {
             LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i",
                       get_desc().c_str(),
@@ -1342,14 +1441,9 @@
         size_t params_buffer_size = ggml_backend_buffer_get_size(params_buffer);
         LOG_DEBUG("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)",
                   get_desc().c_str(),
-                  params_buffer_size / (1024.0 * 1024.0),
-                  ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
+                  params_buffer_size / (1024.f * 1024.f),
+                  ggml_backend_is_cpu(params_backend) ? "RAM" : "VRAM",
                   num_tensors);
-        // printf("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)\n",
-        //        get_desc().c_str(),
-        //        params_buffer_size / (1024.0 * 1024.0),
-        //        ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
-        //        num_tensors);
         return true;
     }

@@ -1372,6 +1466,7 @@ public:
             ggml_gallocr_free(compute_allocr);
             compute_allocr = NULL;
         }
+        offload_params_to_params_backend();
     }

     // do copy after alloc graph
@@ -1385,7 +1480,7 @@ public:
             return NULL;
         }
         // it's performing a compute, check if backend isn't cpu
-        if (!ggml_backend_is_cpu(backend) && (tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer))) {
+        if (!ggml_backend_is_cpu(runtime_backend) && (tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer))) {
             // pass input tensors to gpu memory
             auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);
@@ -1401,16 +1496,20 @@ public:
                  bool free_compute_buffer_immediately = true,
                  struct ggml_tensor** output = NULL,
                  struct ggml_context* output_ctx = NULL) {
+        if (!offload_params_to_runtime_backend()) {
+            LOG_ERROR("%s offload params to runtime backend failed", get_desc().c_str());
+            return;
+        }
         alloc_compute_buffer(get_graph);
         reset_compute_ctx();
         struct ggml_cgraph* gf = get_graph();
         GGML_ASSERT(ggml_gallocr_alloc_graph(compute_allocr, gf));
         cpy_data_to_backend_tensor();
-        if (ggml_backend_is_cpu(backend)) {
-            ggml_backend_cpu_set_n_threads(backend, n_threads);
+        if (ggml_backend_is_cpu(runtime_backend)) {
+            ggml_backend_cpu_set_n_threads(runtime_backend, n_threads);
         }
-        ggml_backend_graph_compute(backend, gf);
+        ggml_backend_graph_compute(runtime_backend, gf);
 #ifdef GGML_PERF
         ggml_graph_print(gf);
 #endif
@@ -1420,7 +1519,7 @@
                 *output = ggml_dup_tensor(output_ctx, result);
             }
             if (*output != NULL) {
-                ggml_backend_tensor_get_and_sync(backend, result, (*output)->data, 0, ggml_nbytes(*output));
+                ggml_backend_tensor_get_and_sync(runtime_backend, result, (*output)->data, 0, ggml_nbytes(*output));
             }
         }
 diff --git a/lora.hpp b/lora.hpp index 35f5aac..b1a4971 100644 --- a/lora.hpp +++ b/lora.hpp @@ -92,6 +92,7 @@ struct LoraModel : public GGMLRunner {
     float multiplier = 1.0f;
     std::map<std::string, struct ggml_tensor*> lora_tensors;
+    std::map<ggml_tensor*, ggml_tensor*> original_weight_to_final_weight;
     std::string file_path;
     ModelLoader model_loader;
     bool load_failed = false;
@@ -103,7 +104,7 @@
     LoraModel(ggml_backend_t backend,
               const std::string& file_path = "",
               const std::string prefix = "")
-        : file_path(file_path), GGMLRunner(backend) {
+        : file_path(file_path), GGMLRunner(backend, false) {
if (!model_loader.init_from_file(file_path, prefix)) { load_failed = true; } @@ -151,11 +152,11 @@ struct LoraModel : public GGMLRunner { return true; }; - model_loader.load_tensors(on_new_tensor_cb, backend); + model_loader.load_tensors(on_new_tensor_cb); alloc_params_buffer(); // exit(0); dry_run = false; - model_loader.load_tensors(on_new_tensor_cb, backend); + model_loader.load_tensors(on_new_tensor_cb); LOG_DEBUG("lora type: \"%s\"/\"%s\"", lora_downs[type].c_str(), lora_ups[type].c_str()); @@ -790,6 +791,11 @@ struct LoraModel : public GGMLRunner { updown = ggml_merge_lora(compute_ctx, lora_down, lora_up, lora_mid); } scale_value *= multiplier; + ggml_tensor* original_weight = weight; + if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(weight->buffer)) { + weight = ggml_dup_tensor(compute_ctx, weight); + set_backend_tensor_data(weight, original_weight->data); + } updown = ggml_reshape(compute_ctx, updown, weight); GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight)); updown = ggml_scale_inplace(compute_ctx, updown, scale_value); @@ -805,6 +811,9 @@ struct LoraModel : public GGMLRunner { } // final_weight = ggml_add_inplace(compute_ctx, weight, updown); // apply directly ggml_build_forward_expand(gf, final_weight); + if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_weight->buffer)) { + original_weight_to_final_weight[original_weight] = final_weight; + } break; } } @@ -839,7 +848,14 @@ struct LoraModel : public GGMLRunner { auto get_graph = [&]() -> struct ggml_cgraph* { return build_lora_graph(model_tensors, version); }; - GGMLRunner::compute(get_graph, n_threads, true); + GGMLRunner::compute(get_graph, n_threads, false); + for (auto item : original_weight_to_final_weight) { + ggml_tensor* original_weight = item.first; + ggml_tensor* final_weight = item.second; + + ggml_backend_tensor_copy(final_weight, original_weight); + } + GGMLRunner::free_compute_buffer(); } }; diff --git a/mmdit.hpp b/mmdit.hpp index 5348808..904cda4 100644 --- a/mmdit.hpp +++ b/mmdit.hpp @@ -846,9 +846,10 @@ struct MMDiTRunner : public GGMLRunner { MMDiT mmdit; MMDiTRunner(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}, const std::string prefix = "") - : GGMLRunner(backend), mmdit(tensor_types) { + : GGMLRunner(backend, offload_params_to_cpu), mmdit(tensor_types) { mmdit.init(params_ctx, tensor_types, prefix); } @@ -946,7 +947,7 @@ struct MMDiTRunner : public GGMLRunner { // ggml_backend_t backend = ggml_backend_cuda_init(0); ggml_backend_t backend = ggml_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_F16; - std::shared_ptr mmdit = std::shared_ptr(new MMDiTRunner(backend)); + std::shared_ptr mmdit = std::shared_ptr(new MMDiTRunner(backend, false)); { LOG_INFO("loading from '%s'", file_path.c_str()); @@ -960,7 +961,7 @@ struct MMDiTRunner : public GGMLRunner { return; } - bool success = model_loader.load_tensors(tensors, backend); + bool success = model_loader.load_tensors(tensors); if (!success) { LOG_ERROR("load tensors from model loader failed"); diff --git a/model.cpp b/model.cpp index 89f9abc..1cb1507 100644 --- a/model.cpp +++ b/model.cpp @@ -1048,12 +1048,12 @@ bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::s } } for (int i = GGML_MAX_DIMS; i < n_dims; i++) { - shape->ne[GGML_MAX_DIMS - 1] *= ne[i]; // stack to last dim; + shape->ne[GGML_MAX_DIMS - 1] *= ne[i]; // stack to last dim; } return true; }; - ctx_gguf_ = 
gguf_init_from_file_ext(file_path.c_str(), {true, &ctx_meta_}, on_tensor_shape_read); + ctx_gguf_ = gguf_init_from_file_ext(file_path.c_str(), {true, &ctx_meta_}, on_tensor_shape_read); if (!ctx_gguf_) { LOG_ERROR("failed to open '%s'", file_path.c_str()); return false; @@ -1917,7 +1917,7 @@ std::vector remove_duplicates(const std::vector& v return res; } -bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend) { +bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) { std::vector processed_tensor_storages; for (auto& tensor_storage : tensor_storages) { // LOG_DEBUG("%s", name.c_str()); @@ -2115,7 +2115,6 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend } bool ModelLoader::load_tensors(std::map& tensors, - ggml_backend_t backend, std::set ignore_tensors) { std::set tensor_names_in_file; auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool { @@ -2155,7 +2154,7 @@ bool ModelLoader::load_tensors(std::map& tenso return true; }; - bool success = load_tensors(on_new_tensor_cb, backend); + bool success = load_tensors(on_new_tensor_cb); if (!success) { LOG_ERROR("load tensors from file failed"); return false; @@ -2299,7 +2298,7 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type return true; }; - bool success = load_tensors(on_new_tensor_cb, backend); + bool success = load_tensors(on_new_tensor_cb); ggml_backend_free(backend); LOG_INFO("load tensors done"); LOG_INFO("trying to save tensors to %s", file_path.c_str()); diff --git a/model.h b/model.h index 10a7449..8dd2e87 100644 --- a/model.h +++ b/model.h @@ -245,9 +245,8 @@ public: ggml_type get_diffusion_model_wtype(); ggml_type get_vae_wtype(); void set_wtype_override(ggml_type wtype, std::string prefix = ""); - bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend); + bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb); bool load_tensors(std::map& tensors, - ggml_backend_t backend, std::set ignore_tensors = {}); bool save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules); diff --git a/pmid.hpp b/pmid.hpp index e2a0f62..9b725de 100644 --- a/pmid.hpp +++ b/pmid.hpp @@ -624,12 +624,13 @@ public: public: PhotoMakerIDEncoder(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types, const std::string prefix, SDVersion version = VERSION_SDXL, PMVersion pm_v = PM_VERSION_1, float sty = 20.f) - : GGMLRunner(backend), + : GGMLRunner(backend, offload_params_to_cpu), version(version), pm_version(pm_v), style_strength(sty) { @@ -785,10 +786,11 @@ struct PhotoMakerIDEmbed : public GGMLRunner { bool applied = false; PhotoMakerIDEmbed(ggml_backend_t backend, + bool offload_params_to_cpu, ModelLoader* ml, const std::string& file_path = "", const std::string& prefix = "") - : file_path(file_path), GGMLRunner(backend), model_loader(ml) { + : file_path(file_path), GGMLRunner(backend, offload_params_to_cpu), model_loader(ml) { if (!model_loader->init_from_file(file_path, prefix)) { load_failed = true; } @@ -828,11 +830,11 @@ struct PhotoMakerIDEmbed : public GGMLRunner { return true; }; - model_loader->load_tensors(on_new_tensor_cb, backend); + model_loader->load_tensors(on_new_tensor_cb); alloc_params_buffer(); dry_run = false; - model_loader->load_tensors(on_new_tensor_cb, backend); + model_loader->load_tensors(on_new_tensor_cb); LOG_DEBUG("finished loading PhotoMaker ID Embeds "); return true; 
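[Editor's note, not part of the patch] A minimal sketch of how a GGMLRunner subclass picks up the new behavior, assuming the post-patch interface shown in the ggml_extend.hpp hunks above; "DummyRunner" is a hypothetical name, not something this patch adds:

    // Sketch only: constructing a runner with CPU offloading enabled.
    struct DummyRunner : public GGMLRunner {
        DummyRunner(ggml_backend_t backend, bool offload_params_to_cpu)
            : GGMLRunner(backend, offload_params_to_cpu) {}  // params_backend becomes a CPU backend, runtime_backend stays on the device
        std::string get_desc() { return "dummy"; }
    };

    // With offload_params_to_cpu == true and a non-CPU runtime backend:
    //   alloc_params_buffer()  allocates the weights on params_backend (RAM);
    //   compute(...)           first calls offload_params_to_runtime_backend(), which copies the
    //                          weights into a temporary runtime buffer (VRAM) and swaps the tensor
    //                          buffer/data pointers;
    //   free_compute_buffer()  calls offload_params_to_params_backend(), which swaps the pointers
    //                          back and frees the runtime copy.
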
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 08b9f4d..50796a5 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -104,9 +104,10 @@ public: std::shared_ptr pmid_id_embeds; std::string taesd_path; - bool use_tiny_autoencoder = false; - bool vae_tiling = false; - bool stacked_id = false; + bool use_tiny_autoencoder = false; + bool vae_tiling = false; + bool offload_params_to_cpu = false; + bool stacked_id = false; bool is_using_v_parameterization = false; bool is_using_edm_v_parameterization = false; @@ -180,6 +181,7 @@ public: taesd_path = SAFE_STR(sd_ctx_params->taesd_path); use_tiny_autoencoder = taesd_path.size() > 0; vae_tiling = sd_ctx_params->vae_tiling; + offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu; if (sd_ctx_params->rng_type == STD_DEFAULT_RNG) { rng = std::make_shared(); @@ -327,8 +329,12 @@ public: if (sd_ctx_params->diffusion_flash_attn) { LOG_WARN("flash attention in this diffusion model is currently unsupported!"); } - cond_stage_model = std::make_shared(clip_backend, model_loader.tensor_storages_types); - diffusion_model = std::make_shared(backend, model_loader.tensor_storages_types); + cond_stage_model = std::make_shared(clip_backend, + offload_params_to_cpu, + model_loader.tensor_storages_types); + diffusion_model = std::make_shared(backend, + offload_params_to_cpu, + model_loader.tensor_storages_types); } else if (sd_version_is_flux(version)) { bool is_chroma = false; for (auto pair : model_loader.tensor_storages_types) { @@ -339,43 +345,52 @@ public: } if (is_chroma) { cond_stage_model = std::make_shared(clip_backend, + offload_params_to_cpu, model_loader.tensor_storages_types, -1, sd_ctx_params->chroma_use_t5_mask, sd_ctx_params->chroma_t5_mask_pad); } else { - cond_stage_model = std::make_shared(clip_backend, model_loader.tensor_storages_types); + cond_stage_model = std::make_shared(clip_backend, + offload_params_to_cpu, + model_loader.tensor_storages_types); } diffusion_model = std::make_shared(backend, + offload_params_to_cpu, model_loader.tensor_storages_types, version, sd_ctx_params->diffusion_flash_attn, sd_ctx_params->chroma_use_dit_mask); } else if (sd_version_is_wan(version)) { cond_stage_model = std::make_shared(clip_backend, + offload_params_to_cpu, model_loader.tensor_storages_types, -1, true, 1, true); diffusion_model = std::make_shared(backend, + offload_params_to_cpu, model_loader.tensor_storages_types, version, sd_ctx_params->diffusion_flash_attn); } else { // SD1.x SD2.x SDXL if (strstr(SAFE_STR(sd_ctx_params->stacked_id_embed_dir), "v2")) { cond_stage_model = std::make_shared(clip_backend, + offload_params_to_cpu, model_loader.tensor_storages_types, SAFE_STR(sd_ctx_params->embedding_dir), version, PM_VERSION_2); } else { cond_stage_model = std::make_shared(clip_backend, + offload_params_to_cpu, model_loader.tensor_storages_types, SAFE_STR(sd_ctx_params->embedding_dir), version); } diffusion_model = std::make_shared(backend, + offload_params_to_cpu, model_loader.tensor_storages_types, version, sd_ctx_params->diffusion_flash_attn); @@ -396,6 +411,7 @@ public: if (sd_version_is_wan(version)) { first_stage_model = std::make_shared(vae_backend, + offload_params_to_cpu, model_loader.tensor_storages_types, "first_stage_model", vae_decode_only); @@ -403,6 +419,7 @@ public: first_stage_model->get_param_tensors(tensors, "first_stage_model"); } else if (!use_tiny_autoencoder) { first_stage_model = std::make_shared(vae_backend, + offload_params_to_cpu, model_loader.tensor_storages_types, "first_stage_model", 
vae_decode_only, @@ -412,6 +429,7 @@ public: first_stage_model->get_param_tensors(tensors, "first_stage_model"); } else { tae_first_stage = std::make_shared(vae_backend, + offload_params_to_cpu, model_loader.tensor_storages_types, "decoder.layers", vae_decode_only, @@ -427,14 +445,26 @@ public: } else { controlnet_backend = backend; } - control_net = std::make_shared(controlnet_backend, model_loader.tensor_storages_types, version); + control_net = std::make_shared(controlnet_backend, + offload_params_to_cpu, + model_loader.tensor_storages_types, + version); } if (strstr(SAFE_STR(sd_ctx_params->stacked_id_embed_dir), "v2")) { - pmid_model = std::make_shared(backend, model_loader.tensor_storages_types, "pmid", version, PM_VERSION_2); + pmid_model = std::make_shared(backend, + offload_params_to_cpu, + model_loader.tensor_storages_types, + "pmid", + version, + PM_VERSION_2); LOG_INFO("using PhotoMaker Version 2"); } else { - pmid_model = std::make_shared(backend, model_loader.tensor_storages_types, "pmid", version); + pmid_model = std::make_shared(backend, + offload_params_to_cpu, + model_loader.tensor_storages_types, + "pmid", + version); } if (strlen(SAFE_STR(sd_ctx_params->stacked_id_embed_dir)) > 0) { pmid_lora = std::make_shared(backend, sd_ctx_params->stacked_id_embed_dir, ""); @@ -489,7 +519,7 @@ public: if (version == VERSION_SVD) { ignore_tensors.insert("conditioner.embedders.3"); } - bool success = model_loader.load_tensors(tensors, backend, ignore_tensors); + bool success = model_loader.load_tensors(tensors, ignore_tensors); if (!success) { LOG_ERROR("load tensors from model loader failed"); ggml_free(ctx); @@ -1354,6 +1384,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->wtype = SD_TYPE_COUNT; sd_ctx_params->rng_type = CUDA_RNG; sd_ctx_params->schedule = DEFAULT; + sd_ctx_params->offload_params_to_cpu = false; sd_ctx_params->keep_clip_on_cpu = false; sd_ctx_params->keep_control_net_on_cpu = false; sd_ctx_params->keep_vae_on_cpu = false; @@ -1388,6 +1419,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "wtype: %s\n" "rng_type: %s\n" "schedule: %s\n" + "offload_params_to_cpu: %s\n" "keep_clip_on_cpu: %s\n" "keep_control_net_on_cpu: %s\n" "keep_vae_on_cpu: %s\n" @@ -1413,6 +1445,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { sd_type_name(sd_ctx_params->wtype), sd_rng_type_name(sd_ctx_params->rng_type), sd_schedule_name(sd_ctx_params->schedule), + BOOL_STR(sd_ctx_params->offload_params_to_cpu), BOOL_STR(sd_ctx_params->keep_clip_on_cpu), BOOL_STR(sd_ctx_params->keep_control_net_on_cpu), BOOL_STR(sd_ctx_params->keep_vae_on_cpu), diff --git a/stable-diffusion.h b/stable-diffusion.h index 644f930..a6a87dd 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -130,6 +130,7 @@ typedef struct { enum sd_type_t wtype; enum rng_type_t rng_type; enum schedule_t schedule; + bool offload_params_to_cpu; bool keep_clip_on_cpu; bool keep_control_net_on_cpu; bool keep_vae_on_cpu; @@ -236,10 +237,13 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s typedef struct upscaler_ctx_t upscaler_ctx_t; SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path, + bool offload_params_to_cpu, int n_threads); SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx); -SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor); +SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, + sd_image_t input_image, + uint32_t upscale_factor); SD_API bool 
convert(const char* input_path, const char* vae_path, diff --git a/t5.hpp b/t5.hpp index 408c256..f149dad 100644 --- a/t5.hpp +++ b/t5.hpp @@ -756,10 +756,11 @@ struct T5Runner : public GGMLRunner { std::vector relative_position_bucket_vec; T5Runner(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types, const std::string prefix, bool is_umt5 = false) - : GGMLRunner(backend) { + : GGMLRunner(backend, offload_params_to_cpu) { if (is_umt5) { params.vocab_size = 256384; params.relative_attention = false; @@ -900,10 +901,11 @@ struct T5Embedder { T5Runner model; T5Embedder(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}, const std::string prefix = "", bool is_umt5 = false) - : model(backend, tensor_types, prefix, is_umt5), tokenizer(is_umt5) { + : model(backend, offload_params_to_cpu, tensor_types, prefix, is_umt5), tokenizer(is_umt5) { } void get_param_tensors(std::map& tensors, const std::string prefix) { @@ -1012,13 +1014,13 @@ struct T5Embedder { } } - std::shared_ptr t5 = std::shared_ptr(new T5Embedder(backend, tensor_types, "", true)); + std::shared_ptr t5 = std::shared_ptr(new T5Embedder(backend, false, tensor_types, "", true)); t5->alloc_params_buffer(); std::map tensors; t5->get_param_tensors(tensors, ""); - bool success = model_loader.load_tensors(tensors, backend); + bool success = model_loader.load_tensors(tensors); if (!success) { LOG_ERROR("load tensors from model loader failed"); diff --git a/tae.hpp b/tae.hpp index 51fb94f..da5aa56 100644 --- a/tae.hpp +++ b/tae.hpp @@ -196,13 +196,14 @@ struct TinyAutoEncoder : public GGMLRunner { bool decode_only = false; TinyAutoEncoder(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types, const std::string prefix, bool decoder_only = true, SDVersion version = VERSION_SD1) : decode_only(decoder_only), taesd(decoder_only, version), - GGMLRunner(backend) { + GGMLRunner(backend, offload_params_to_cpu) { taesd.init(params_ctx, tensor_types, prefix); } @@ -226,7 +227,7 @@ struct TinyAutoEncoder : public GGMLRunner { return false; } - bool success = model_loader.load_tensors(taesd_tensors, backend, ignore_tensors); + bool success = model_loader.load_tensors(taesd_tensors, ignore_tensors); if (!success) { LOG_ERROR("load tae tensors from model loader failed"); diff --git a/unet.hpp b/unet.hpp index 7ab4934..847911d 100644 --- a/unet.hpp +++ b/unet.hpp @@ -538,11 +538,12 @@ struct UNetModelRunner : public GGMLRunner { UnetModelBlock unet; UNetModelRunner(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types, const std::string prefix, SDVersion version = VERSION_SD1, bool flash_attn = false) - : GGMLRunner(backend), unet(version, tensor_types, flash_attn) { + : GGMLRunner(backend, offload_params_to_cpu), unet(version, tensor_types, flash_attn) { unet.init(params_ctx, tensor_types, prefix); } diff --git a/upscaler.cpp b/upscaler.cpp index 1372134..c7fb305 100644 --- a/upscaler.cpp +++ b/upscaler.cpp @@ -14,7 +14,8 @@ struct UpscalerGGML { : n_threads(n_threads) { } - bool load_from_file(const std::string& esrgan_path) { + bool load_from_file(const std::string& esrgan_path, + bool offload_params_to_cpu) { #ifdef SD_USE_CUDA LOG_DEBUG("Using CUDA backend"); backend = ggml_backend_cuda_init(0); @@ -46,7 +47,7 @@ struct UpscalerGGML { backend = ggml_backend_cpu_init(); } LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type)); - esrgan_upscaler = std::make_shared(backend, 
model_loader.tensor_storages_types); + esrgan_upscaler = std::make_shared(backend, offload_params_to_cpu, model_loader.tensor_storages_types); if (!esrgan_upscaler->load_from_file(esrgan_path)) { return false; } @@ -104,6 +105,7 @@ struct upscaler_ctx_t { }; upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str, + bool offload_params_to_cpu, int n_threads) { upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t)); if (upscaler_ctx == NULL) { @@ -116,7 +118,7 @@ upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str, return NULL; } - if (!upscaler_ctx->upscaler->load_from_file(esrgan_path)) { + if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu)) { delete upscaler_ctx->upscaler; upscaler_ctx->upscaler = NULL; free(upscaler_ctx); diff --git a/vae.hpp b/vae.hpp index fcbe091..dc44dde 100644 --- a/vae.hpp +++ b/vae.hpp @@ -521,8 +521,8 @@ public: }; struct VAE : public GGMLRunner { - VAE(ggml_backend_t backend) - : GGMLRunner(backend) {} + VAE(ggml_backend_t backend, bool offload_params_to_cpu) + : GGMLRunner(backend, offload_params_to_cpu) {} virtual void compute(const int n_threads, struct ggml_tensor* z, bool decode_graph, @@ -536,12 +536,13 @@ struct AutoEncoderKL : public VAE { AutoencodingEngine ae; AutoEncoderKL(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types, const std::string prefix, bool decode_only = false, bool use_video_decoder = false, SDVersion version = VERSION_SD1) - : decode_only(decode_only), ae(decode_only, use_video_decoder, version), VAE(backend) { + : decode_only(decode_only), ae(decode_only, use_video_decoder, version), VAE(backend, offload_params_to_cpu) { ae.init(params_ctx, tensor_types, prefix); } diff --git a/wan.hpp b/wan.hpp index 25f2c17..f6dbaad 100644 --- a/wan.hpp +++ b/wan.hpp @@ -767,10 +767,11 @@ namespace WAN { std::vector _feat_vec_map; WanVAERunner(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}, const std::string prefix = "", bool decode_only = false) - : decode_only(decode_only), ae(decode_only), VAE(backend) { + : decode_only(decode_only), ae(decode_only), VAE(backend, offload_params_to_cpu) { ae.init(params_ctx, tensor_types, prefix); rest_feat_vec_map(); } @@ -857,7 +858,7 @@ namespace WAN { feat_cache_vec.is_rep = true; _feat_vec_map[feat_idx] = feat_cache_vec; } else if (feat_cache != NULL) { - _feat_vec_map[feat_idx] = FeatCache(backend, feat_cache); + _feat_vec_map[feat_idx] = FeatCache(runtime_backend, feat_cache); } } GGMLRunner::free_compute_buffer(); @@ -897,7 +898,7 @@ namespace WAN { feat_cache_vec.is_rep = true; _feat_vec_map[feat_idx] = feat_cache_vec; } else if (feat_cache != NULL) { - _feat_vec_map[feat_idx] = FeatCache(backend, feat_cache); + _feat_vec_map[feat_idx] = FeatCache(runtime_backend, feat_cache); } } @@ -943,7 +944,7 @@ namespace WAN { ggml_backend_t backend = ggml_backend_cuda_init(0); // ggml_backend_t backend = ggml_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_F16; - std::shared_ptr vae = std::shared_ptr(new WanVAERunner(backend)); + std::shared_ptr vae = std::shared_ptr(new WanVAERunner(backend, false)); { LOG_INFO("loading from '%s'", file_path.c_str()); @@ -957,7 +958,7 @@ namespace WAN { return; } - bool success = model_loader.load_tensors(tensors, backend); + bool success = model_loader.load_tensors(tensors); if (!success) { LOG_ERROR("load tensors from model loader failed"); @@ -1564,11 +1565,12 @@ namespace WAN { SDVersion version; 
WanRunner(ggml_backend_t backend, + bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}, const std::string prefix = "", SDVersion version = VERSION_WAN2, bool flash_attn = false) - : GGMLRunner(backend) { + : GGMLRunner(backend, offload_params_to_cpu) { wan_params.flash_attn = flash_attn; wan_params.num_layers = 0; for (auto pair : tensor_types) { @@ -1747,6 +1749,7 @@ namespace WAN { } std::shared_ptr wan = std::shared_ptr(new WanRunner(backend, + false, tensor_types, "model.diffusion_model")); @@ -1754,7 +1757,7 @@ namespace WAN { std::map tensors; wan->get_param_tensors(tensors, "model.diffusion_model"); - bool success = model_loader.load_tensors(tensors, backend); + bool success = model_loader.load_tensors(tensors); if (!success) { LOG_ERROR("load tensors from model loader failed");
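
[Editor's note, not part of the patch] Hedged usage sketch of the new public API surface declared in stable-diffusion.h after this change; the file paths are placeholders, and the remaining sd_ctx_params_t fields are assumed to match the existing header:

    // C++ API: the new offload_params_to_cpu field and new_upscaler_ctx() parameter.
    sd_ctx_params_t params;
    sd_ctx_params_init(&params);
    params.model_path = "/path/to/model.safetensors";  // placeholder
    params.offload_params_to_cpu = true;  // keep weights in RAM, copy to the runtime backend per compute
    sd_ctx_t* sd_ctx = new_sd_ctx(&params);

    upscaler_ctx_t* up_ctx = new_upscaler_ctx("/path/to/esrgan.pth",           // placeholder
                                              /*offload_params_to_cpu=*/true,  // new parameter
                                              /*n_threads=*/8);

On the CLI side (examples/cli/main.cpp), the same behavior is toggled with the new --offload-to-cpu flag.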