add offload params to cpu support

leejet 2025-08-17 03:13:16 +08:00
parent b0833eb4d8
commit 9b29de27a8
21 changed files with 283 additions and 100 deletions

View File

@@ -868,12 +868,13 @@ struct CLIPTextModelRunner : public GGMLRunner {
     CLIPTextModel model;
     CLIPTextModelRunner(ggml_backend_t backend,
+                        bool offload_params_to_cpu,
                         const String2GGMLType& tensor_types,
                         const std::string prefix,
                         CLIPVersion version = OPENAI_CLIP_VIT_L_14,
                         bool with_final_ln  = true,
                         int clip_skip_value = -1)
-        : GGMLRunner(backend), model(version, with_final_ln, clip_skip_value) {
+        : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, clip_skip_value) {
         model.init(params_ctx, tensor_types, prefix);
     }
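
The same pattern repeats across every GGMLRunner subclass touched by this commit: a `bool offload_params_to_cpu` parameter is inserted right after the backend argument and forwarded unchanged to the `GGMLRunner` base constructor. A condensed, hedged sketch of the pattern (`MyRunner` is a placeholder, not a type from the repository):

```cpp
// Hypothetical GGMLRunner subclass showing how the new flag is threaded through.
struct MyRunner : public GGMLRunner {
    MyRunner(ggml_backend_t backend,
             bool offload_params_to_cpu,              // new argument, added after `backend` everywhere
             const String2GGMLType& tensor_types,
             const std::string& prefix)
        : GGMLRunner(backend, offload_params_to_cpu)  // the flag is only consumed by the base class
    {
        // model.init(params_ctx, tensor_types, prefix);  // subclass-specific setup is unchanged
    }

    std::string get_desc() override { return "my_runner"; }
};
```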

View File

@@ -57,6 +57,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
     std::vector<std::string> readed_embeddings;
     FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
+                                      bool offload_params_to_cpu,
                                       const String2GGMLType& tensor_types,
                                       const std::string& embd_dir,
                                       SDVersion version = VERSION_SD1,
@@ -64,12 +65,12 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       int clip_skip = -1)
         : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
         if (sd_version_is_sd1(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14);
         } else if (sd_version_is_sd2(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14);
         } else if (sd_version_is_sdxl(version)) {
-            text_model  = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
-            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
+            text_model  = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
+            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
         }
         set_clip_skip(clip_skip);
     }
@@ -154,7 +155,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             }
             return true;
         };
-        model_loader.load_tensors(on_load, NULL);
+        model_loader.load_tensors(on_load);
         readed_embeddings.push_back(embd_name);
         if (embd) {
             int64_t hidden_size = text_model->model.hidden_size;
@@ -618,8 +619,10 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
 struct FrozenCLIPVisionEmbedder : public GGMLRunner {
     CLIPVisionModelProjection vision_model;
-    FrozenCLIPVisionEmbedder(ggml_backend_t backend, const String2GGMLType& tensor_types = {})
-        : vision_model(OPEN_CLIP_VIT_H_14, true), GGMLRunner(backend) {
+    FrozenCLIPVisionEmbedder(ggml_backend_t backend,
+                             bool offload_params_to_cpu,
+                             const String2GGMLType& tensor_types = {})
+        : vision_model(OPEN_CLIP_VIT_H_14, true), GGMLRunner(backend, offload_params_to_cpu) {
         vision_model.init(params_ctx, tensor_types, "cond_stage_model.transformer");
     }
@@ -663,12 +666,13 @@ struct SD3CLIPEmbedder : public Conditioner {
     std::shared_ptr<T5Runner> t5;
     SD3CLIPEmbedder(ggml_backend_t backend,
+                    bool offload_params_to_cpu,
                     const String2GGMLType& tensor_types = {},
                     int clip_skip = -1)
         : clip_g_tokenizer(0) {
-        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
-        clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
-        t5     = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
+        clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
+        clip_g = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
+        t5     = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer");
         set_clip_skip(clip_skip);
     }
@@ -1010,10 +1014,11 @@ struct FluxCLIPEmbedder : public Conditioner {
     size_t chunk_len = 256;
     FluxCLIPEmbedder(ggml_backend_t backend,
+                     bool offload_params_to_cpu,
                      const String2GGMLType& tensor_types = {},
                      int clip_skip = -1) {
-        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
-        t5     = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
+        clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
+        t5     = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer");
         set_clip_skip(clip_skip);
     }
@@ -1232,13 +1237,14 @@ struct T5CLIPEmbedder : public Conditioner {
     bool is_umt5 = false;
     T5CLIPEmbedder(ggml_backend_t backend,
+                   bool offload_params_to_cpu,
                    const String2GGMLType& tensor_types = {},
                    int clip_skip = -1,
                    bool use_mask = false,
                    int mask_pad  = 1,
                    bool is_umt5  = false)
         : use_mask(use_mask), mask_pad(mask_pad), t5_tokenizer(is_umt5) {
-        t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer", is_umt5);
+        t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer", is_umt5);
     }
     void set_clip_skip(int clip_skip) {

View File

@@ -317,9 +317,10 @@ struct ControlNet : public GGMLRunner {
     bool guided_hint_cached = false;
     ControlNet(ggml_backend_t backend,
+               bool offload_params_to_cpu,
               const String2GGMLType& tensor_types = {},
               SDVersion version = VERSION_SD1)
-        : GGMLRunner(backend), control_net(version) {
+        : GGMLRunner(backend, offload_params_to_cpu), control_net(version) {
         control_net.init(params_ctx, tensor_types, "");
     }
@@ -346,7 +347,7 @@ struct ControlNet : public GGMLRunner {
             control_buffer_size += ggml_nbytes(controls[i]);
         }
-        control_buffer = ggml_backend_alloc_ctx_tensors(control_ctx, backend);
+        control_buffer = ggml_backend_alloc_ctx_tensors(control_ctx, runtime_backend);
         LOG_DEBUG("control buffer size %.2fMB", control_buffer_size * 1.f / 1024.f / 1024.f);
     }
@@ -443,7 +444,7 @@ struct ControlNet : public GGMLRunner {
             return false;
         }
-        bool success = model_loader.load_tensors(tensors, backend, ignore_tensors);
+        bool success = model_loader.load_tensors(tensors, ignore_tensors);
         if (!success) {
             LOG_ERROR("load control net tensors from model loader failed");

View File

@@ -33,10 +33,11 @@ struct UNetModel : public DiffusionModel {
     UNetModelRunner unet;
     UNetModel(ggml_backend_t backend,
+              bool offload_params_to_cpu,
              const String2GGMLType& tensor_types = {},
              SDVersion version = VERSION_SD1,
              bool flash_attn   = false)
-        : unet(backend, tensor_types, "model.diffusion_model", version, flash_attn) {
+        : unet(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model", version, flash_attn) {
     }
     void alloc_params_buffer() {
@@ -86,8 +87,9 @@ struct MMDiTModel : public DiffusionModel {
     MMDiTRunner mmdit;
     MMDiTModel(ggml_backend_t backend,
+               bool offload_params_to_cpu,
               const String2GGMLType& tensor_types = {})
-        : mmdit(backend, tensor_types, "model.diffusion_model") {
+        : mmdit(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model") {
     }
     void alloc_params_buffer() {
@@ -136,11 +138,12 @@ struct FluxModel : public DiffusionModel {
     Flux::FluxRunner flux;
     FluxModel(ggml_backend_t backend,
+              bool offload_params_to_cpu,
              const String2GGMLType& tensor_types = {},
              SDVersion version = VERSION_FLUX,
              bool flash_attn   = false,
              bool use_mask     = false)
-        : flux(backend, tensor_types, "model.diffusion_model", version, flash_attn, use_mask) {
+        : flux(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model", version, flash_attn, use_mask) {
     }
     void alloc_params_buffer() {
@@ -189,10 +192,11 @@ struct WanModel : public DiffusionModel {
     WAN::WanRunner wan;
     WanModel(ggml_backend_t backend,
+             bool offload_params_to_cpu,
             const String2GGMLType& tensor_types = {},
             SDVersion version = VERSION_FLUX,
             bool flash_attn   = false)
-        : wan(backend, tensor_types, "model.diffusion_model", version, flash_attn) {
+        : wan(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model", version, flash_attn) {
     }
     void alloc_params_buffer() {

View File

@@ -142,8 +142,10 @@ struct ESRGAN : public GGMLRunner {
     int scale     = 4;
     int tile_size = 128;  // avoid cuda OOM for 4gb VRAM
-    ESRGAN(ggml_backend_t backend, const String2GGMLType& tensor_types = {})
-        : GGMLRunner(backend) {
+    ESRGAN(ggml_backend_t backend,
+           bool offload_params_to_cpu,
+           const String2GGMLType& tensor_types = {})
+        : GGMLRunner(backend, offload_params_to_cpu) {
         rrdb_net.init(params_ctx, tensor_types, "");
     }
@@ -164,7 +166,7 @@ struct ESRGAN : public GGMLRunner {
             return false;
         }
-        bool success = model_loader.load_tensors(esrgan_tensors, backend);
+        bool success = model_loader.load_tensors(esrgan_tensors);
         if (!success) {
             LOG_ERROR("load esrgan tensors from model loader failed");

View File

@@ -95,6 +95,7 @@ struct SDParams {
     int64_t seed             = 42;
     bool verbose             = false;
     bool vae_tiling          = false;
+    bool offload_params_to_cpu = false;
     bool control_net_cpu     = false;
     bool normalize_input     = false;
     bool clip_on_cpu         = false;
@@ -141,8 +142,9 @@ void print_params(SDParams params) {
     for (auto& path : params.ref_image_paths) {
         printf(" %s\n", path.c_str());
     };
-    printf(" clip on cpu: %s\n", params.clip_on_cpu ? "true" : "false");
-    printf(" controlnet cpu: %s\n", params.control_net_cpu ? "true" : "false");
+    printf(" offload_params_to_cpu: %s\n", params.offload_params_to_cpu ? "true" : "false");
+    printf(" clip_on_cpu: %s\n", params.clip_on_cpu ? "true" : "false");
+    printf(" control_net_cpu: %s\n", params.control_net_cpu ? "true" : "false");
     printf(" vae decoder on cpu:%s\n", params.vae_on_cpu ? "true" : "false");
     printf(" diffusion flash attention:%s\n", params.diffusion_flash_attn ? "true" : "false");
     printf(" strength(control): %.2f\n", params.control_strength);
@@ -461,6 +463,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
     options.bool_options = {
         {"", "--vae-tiling", "", true, &params.vae_tiling},
+        {"", "--offload-to-cpu", "", true, &params.offload_params_to_cpu},
         {"", "--control-net-cpu", "", true, &params.control_net_cpu},
         {"", "--normalize-input", "", true, &params.normalize_input},
         {"", "--clip-on-cpu", "", true, &params.clip_on_cpu},
@@ -943,6 +946,7 @@ int main(int argc, const char* argv[]) {
                 params.wtype,
                 params.rng_type,
                 params.schedule,
+                params.offload_params_to_cpu,
                 params.clip_on_cpu,
                 params.control_net_cpu,
                 params.vae_on_cpu,
@@ -1058,6 +1062,7 @@ int main(int argc, const char* argv[]) {
     int upscale_factor = 4;  // unused for RealESRGAN_x4plus_anime_6B.pth
     if (params.esrgan_path.size() > 0 && params.upscale_repeats > 0) {
         upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(),
+                                                        params.offload_params_to_cpu,
                                                         params.n_threads);
         if (upscaler_ctx == NULL) {

View File

@@ -881,12 +881,13 @@ namespace Flux {
         bool use_mask = false;
         FluxRunner(ggml_backend_t backend,
+                   bool offload_params_to_cpu,
                   const String2GGMLType& tensor_types = {},
                   const std::string prefix = "",
                   SDVersion version = VERSION_FLUX,
                   bool flash_attn   = false,
                   bool use_mask     = false)
-            : GGMLRunner(backend), use_mask(use_mask) {
+            : GGMLRunner(backend, offload_params_to_cpu), use_mask(use_mask) {
             flux_params.flash_attn     = flash_attn;
             flux_params.guidance_embed = false;
             flux_params.depth          = 0;
@@ -1085,7 +1086,7 @@ namespace Flux {
         // ggml_backend_t backend = ggml_backend_cuda_init(0);
         ggml_backend_t backend         = ggml_backend_cpu_init();
         ggml_type model_data_type      = GGML_TYPE_Q8_0;
-        std::shared_ptr<FluxRunner> flux = std::shared_ptr<FluxRunner>(new FluxRunner(backend));
+        std::shared_ptr<FluxRunner> flux = std::shared_ptr<FluxRunner>(new FluxRunner(backend, false));
         {
             LOG_INFO("loading from '%s'", file_path.c_str());
@@ -1099,7 +1100,7 @@ namespace Flux {
                 return;
             }
-            bool success = model_loader.load_tensors(tensors, backend);
+            bool success = model_loader.load_tensors(tensors);
             if (!success) {
                 LOG_ERROR("load tensors from model loader failed");

View File

@@ -1230,16 +1230,20 @@ struct GGMLRunner {
 protected:
     typedef std::function<struct ggml_cgraph*()> get_graph_cb_t;
-    struct ggml_context* params_ctx     = NULL;
-    ggml_backend_buffer_t params_buffer = NULL;
+    ggml_backend_t params_backend               = NULL;
+    ggml_backend_t runtime_backend              = NULL;
+    struct ggml_context* params_ctx             = NULL;
+    ggml_backend_buffer_t params_buffer         = NULL;
+    struct ggml_context* offload_ctx            = NULL;
+    ggml_backend_buffer_t runtime_params_buffer = NULL;
+    bool params_on_runtime_backend              = false;
     struct ggml_context* compute_ctx    = NULL;
     struct ggml_gallocr* compute_allocr = NULL;
     std::map<struct ggml_tensor*, const void*> backend_tensor_data_map;
-    ggml_backend_t backend = NULL;
     void alloc_params_ctx() {
         struct ggml_init_params params;
         params.mem_size = static_cast<size_t>(MAX_PARAMS_TENSOR_NUM * ggml_tensor_overhead());
@@ -1248,6 +1252,10 @@ protected:
         params_ctx = ggml_init(params);
         GGML_ASSERT(params_ctx != NULL);
+        if (params_backend != runtime_backend) {
+            offload_ctx = ggml_init(params);
+            GGML_ASSERT(offload_ctx != NULL);
+        }
     }
     void free_params_ctx() {
@@ -1255,6 +1263,10 @@ protected:
             ggml_free(params_ctx);
             params_ctx = NULL;
         }
+        if (offload_ctx != NULL) {
+            ggml_free(offload_ctx);
+            offload_ctx = NULL;
+        }
     }
     void alloc_compute_ctx() {
@@ -1281,7 +1293,7 @@ protected:
         reset_compute_ctx();
         struct ggml_cgraph* gf = get_graph();
         backend_tensor_data_map.clear();
-        compute_allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+        compute_allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(runtime_backend));
         if (!ggml_gallocr_reserve(compute_allocr, gf)) {
             // failed to allocate the compute buffer
@@ -1295,7 +1307,7 @@ protected:
         LOG_DEBUG("%s compute buffer size: %.2f MB(%s)",
                   get_desc().c_str(),
                   compute_buffer_size / 1024.0 / 1024.0,
-                  ggml_backend_is_cpu(backend) ? "RAM" : "VRAM");
+                  ggml_backend_is_cpu(runtime_backend) ? "RAM" : "VRAM");
         return true;
     }
@@ -1310,12 +1322,96 @@ protected:
         backend_tensor_data_map.clear();
     }
+    bool offload_params_to_runtime_backend() {
+        if (params_backend == runtime_backend) {
+            return true;
+        }
+        if (params_on_runtime_backend) {
+            return true;
+        }
+        GGML_ASSERT(runtime_params_buffer == NULL);
+        int64_t t0         = ggml_time_ms();
+        size_t num_tensors = ggml_tensor_num(offload_ctx);
+        if (num_tensors == 0) {
+            for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != NULL; t = ggml_get_next_tensor(params_ctx, t)) {
+                GGML_ASSERT(t->view_src == NULL);
+                ggml_dup_tensor(offload_ctx, t);
+            }
+        }
+        num_tensors = ggml_tensor_num(offload_ctx);
+        GGML_ASSERT(num_tensors == ggml_tensor_num(params_ctx));
+        runtime_params_buffer = ggml_backend_alloc_ctx_tensors(offload_ctx, runtime_backend);
+        if (runtime_params_buffer == NULL) {
+            LOG_ERROR("%s alloc runtime params backend buffer failed, num_tensors = %i",
+                      get_desc().c_str(),
+                      num_tensors);
+            return false;
+        }
+        ggml_tensor* t         = ggml_get_first_tensor(params_ctx);
+        ggml_tensor* offload_t = ggml_get_first_tensor(offload_ctx);
+        while (t != NULL && offload_t != NULL) {
+            ggml_backend_tensor_copy(t, offload_t);
+            std::swap(t->buffer, offload_t->buffer);
+            std::swap(t->data, offload_t->data);
+            t         = ggml_get_next_tensor(params_ctx, t);
+            offload_t = ggml_get_next_tensor(offload_ctx, offload_t);
+        }
+        int64_t t1                = ggml_time_ms();
+        size_t params_buffer_size = ggml_backend_buffer_get_size(runtime_params_buffer);
+        LOG_INFO("%s offload params (%6.2f MB, %i tensors) to runtime backend (%s), taking %.2fs",
+                 get_desc().c_str(),
+                 params_buffer_size / (1024.f * 1024.f),
+                 num_tensors,
+                 ggml_backend_name(runtime_backend),
+                 (t1 - t0) * 1.0f / 1000);
+        params_on_runtime_backend = true;
+        return true;
+    }
+    void offload_params_to_params_backend() {
+        if (!params_on_runtime_backend) {
+            return;
+        }
+        ggml_tensor* t         = ggml_get_first_tensor(params_ctx);
+        ggml_tensor* offload_t = ggml_get_first_tensor(offload_ctx);
+        while (t != NULL && offload_t != NULL) {
+            t->buffer         = offload_t->buffer;
+            t->data           = offload_t->data;
+            offload_t->buffer = NULL;
+            offload_t->data   = NULL;
+            t         = ggml_get_next_tensor(params_ctx, t);
+            offload_t = ggml_get_next_tensor(offload_ctx, offload_t);
+        }
+        if (runtime_params_buffer != NULL) {
+            ggml_backend_buffer_free(runtime_params_buffer);
+            runtime_params_buffer = NULL;
+        }
+        params_on_runtime_backend = false;
+    }
 public:
     virtual std::string get_desc() = 0;
-    GGMLRunner(ggml_backend_t backend)
-        : backend(backend) {
+    GGMLRunner(ggml_backend_t backend, bool offload_params_to_cpu = false)
+        : runtime_backend(backend) {
         alloc_params_ctx();
+        if (!ggml_backend_is_cpu(runtime_backend) && offload_params_to_cpu) {
+            params_backend = ggml_backend_cpu_init();
+        } else {
+            params_backend = runtime_backend;
+        }
     }
     virtual ~GGMLRunner() {
@@ -1323,6 +1419,9 @@ public:
         free_compute_buffer();
         free_params_ctx();
         free_compute_ctx();
+        if (params_backend != runtime_backend) {
+            ggml_backend_free(params_backend);
+        }
     }
     void reset_compute_ctx() {
@@ -1332,7 +1431,7 @@ public:
     bool alloc_params_buffer() {
         size_t num_tensors = ggml_tensor_num(params_ctx);
-        params_buffer      = ggml_backend_alloc_ctx_tensors(params_ctx, backend);
+        params_buffer      = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend);
         if (params_buffer == NULL) {
             LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i",
                       get_desc().c_str(),
@@ -1342,14 +1441,9 @@ public:
         size_t params_buffer_size = ggml_backend_buffer_get_size(params_buffer);
         LOG_DEBUG("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)",
                   get_desc().c_str(),
-                  params_buffer_size / (1024.0 * 1024.0),
-                  ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
+                  params_buffer_size / (1024.f * 1024.f),
+                  ggml_backend_is_cpu(params_backend) ? "RAM" : "VRAM",
                   num_tensors);
-        // printf("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)\n",
-        //        get_desc().c_str(),
-        //        params_buffer_size / (1024.0 * 1024.0),
-        //        ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
-        //        num_tensors);
         return true;
     }
@@ -1372,6 +1466,7 @@ public:
             ggml_gallocr_free(compute_allocr);
             compute_allocr = NULL;
         }
+        offload_params_to_params_backend();
     }
     // do copy after alloc graph
@@ -1385,7 +1480,7 @@ public:
             return NULL;
         }
         // it's performing a compute, check if backend isn't cpu
-        if (!ggml_backend_is_cpu(backend) && (tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer))) {
+        if (!ggml_backend_is_cpu(runtime_backend) && (tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer))) {
             // pass input tensors to gpu memory
             auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);
@@ -1401,16 +1496,20 @@ public:
                  bool free_compute_buffer_immediately = true,
                  struct ggml_tensor** output          = NULL,
                  struct ggml_context* output_ctx      = NULL) {
+        if (!offload_params_to_runtime_backend()) {
+            LOG_ERROR("%s offload params to runtime backend failed", get_desc().c_str());
+            return;
+        }
         alloc_compute_buffer(get_graph);
         reset_compute_ctx();
         struct ggml_cgraph* gf = get_graph();
         GGML_ASSERT(ggml_gallocr_alloc_graph(compute_allocr, gf));
         cpy_data_to_backend_tensor();
-        if (ggml_backend_is_cpu(backend)) {
-            ggml_backend_cpu_set_n_threads(backend, n_threads);
+        if (ggml_backend_is_cpu(runtime_backend)) {
+            ggml_backend_cpu_set_n_threads(runtime_backend, n_threads);
         }
-        ggml_backend_graph_compute(backend, gf);
+        ggml_backend_graph_compute(runtime_backend, gf);
 #ifdef GGML_PERF
         ggml_graph_print(gf);
 #endif
@@ -1420,7 +1519,7 @@ public:
             *output = ggml_dup_tensor(output_ctx, result);
         }
         if (*output != NULL) {
-            ggml_backend_tensor_get_and_sync(backend, result, (*output)->data, 0, ggml_nbytes(*output));
+            ggml_backend_tensor_get_and_sync(runtime_backend, result, (*output)->data, 0, ggml_nbytes(*output));
         }
     }
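
Taken together, the GGMLRunner changes implement the offload scheme: when `offload_params_to_cpu` is set and the runtime backend is not the CPU, parameters are allocated on a separate CPU `params_backend`; right before a compute, `offload_params_to_runtime_backend()` duplicates the parameter tensors into `offload_ctx`, allocates them on the runtime backend, copies the weights over and swaps the `buffer`/`data` pointers so the graph transparently sees device-resident weights; `free_compute_buffer()` then calls `offload_params_to_params_backend()` to swap the pointers back and release the device copy. A minimal standalone sketch of the pointer-swap idea, using only public ggml APIs (two CPU backends stand in for the CPU params backend and the GPU runtime backend; the variable names are illustrative, not from the repository):

```cpp
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include <utility>

int main() {
    ggml_backend_t params_backend  = ggml_backend_cpu_init();  // where weights normally live
    ggml_backend_t runtime_backend = ggml_backend_cpu_init();  // stand-in for a GPU backend

    ggml_init_params ip = {ggml_tensor_overhead() * 8, nullptr, /*no_alloc=*/true};
    ggml_context* params_ctx  = ggml_init(ip);  // holds the "real" weight tensors
    ggml_context* offload_ctx = ggml_init(ip);  // holds the temporary runtime-side copies

    ggml_tensor* w = ggml_new_tensor_1d(params_ctx, GGML_TYPE_F32, 16);
    ggml_backend_buffer_t params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend);

    // 1. duplicate the weight into the offload context and allocate it on the runtime backend
    ggml_tensor* w_dev = ggml_dup_tensor(offload_ctx, w);
    ggml_backend_buffer_t runtime_buffer = ggml_backend_alloc_ctx_tensors(offload_ctx, runtime_backend);

    // 2. copy the data over and swap buffer/data pointers, so any code holding `w`
    //    now computes against the runtime-backend copy without being rewired
    ggml_backend_tensor_copy(w, w_dev);
    std::swap(w->buffer, w_dev->buffer);
    std::swap(w->data, w_dev->data);

    // ... build and run graphs that reference `w` here ...

    // 3. swap back and free the runtime copy (what offload_params_to_params_backend() does)
    std::swap(w->buffer, w_dev->buffer);
    std::swap(w->data, w_dev->data);
    ggml_backend_buffer_free(runtime_buffer);

    ggml_backend_buffer_free(params_buffer);
    ggml_free(offload_ctx);
    ggml_free(params_ctx);
    ggml_backend_free(runtime_backend);
    ggml_backend_free(params_backend);
    return 0;
}
```

The upshot is that peak VRAM only has to hold one runner's parameters at a time, traded against a host-to-device copy before each runner's compute.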

View File

@@ -92,6 +92,7 @@ struct LoraModel : public GGMLRunner {
     float multiplier = 1.0f;
     std::map<std::string, struct ggml_tensor*> lora_tensors;
+    std::map<ggml_tensor*, ggml_tensor*> original_weight_to_final_weight;
     std::string file_path;
     ModelLoader model_loader;
     bool load_failed = false;
@@ -103,7 +104,7 @@ struct LoraModel : public GGMLRunner {
     LoraModel(ggml_backend_t backend,
               const std::string& file_path = "",
               const std::string prefix     = "")
-        : file_path(file_path), GGMLRunner(backend) {
+        : file_path(file_path), GGMLRunner(backend, false) {
         if (!model_loader.init_from_file(file_path, prefix)) {
             load_failed = true;
         }
@@ -151,11 +152,11 @@ struct LoraModel : public GGMLRunner {
             return true;
         };
-        model_loader.load_tensors(on_new_tensor_cb, backend);
+        model_loader.load_tensors(on_new_tensor_cb);
         alloc_params_buffer();
         // exit(0);
         dry_run = false;
-        model_loader.load_tensors(on_new_tensor_cb, backend);
+        model_loader.load_tensors(on_new_tensor_cb);
         LOG_DEBUG("lora type: \"%s\"/\"%s\"", lora_downs[type].c_str(), lora_ups[type].c_str());
@@ -790,6 +791,11 @@ struct LoraModel : public GGMLRunner {
                     updown = ggml_merge_lora(compute_ctx, lora_down, lora_up, lora_mid);
                 }
                 scale_value *= multiplier;
+                ggml_tensor* original_weight = weight;
+                if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(weight->buffer)) {
+                    weight = ggml_dup_tensor(compute_ctx, weight);
+                    set_backend_tensor_data(weight, original_weight->data);
+                }
                 updown = ggml_reshape(compute_ctx, updown, weight);
                 GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight));
                 updown = ggml_scale_inplace(compute_ctx, updown, scale_value);
@@ -805,6 +811,9 @@ struct LoraModel : public GGMLRunner {
                 }
                 // final_weight = ggml_add_inplace(compute_ctx, weight, updown);  // apply directly
                 ggml_build_forward_expand(gf, final_weight);
+                if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_weight->buffer)) {
+                    original_weight_to_final_weight[original_weight] = final_weight;
+                }
                 break;
             }
         }
@@ -839,7 +848,14 @@ struct LoraModel : public GGMLRunner {
         auto get_graph = [&]() -> struct ggml_cgraph* {
             return build_lora_graph(model_tensors, version);
         };
-        GGMLRunner::compute(get_graph, n_threads, true);
+        GGMLRunner::compute(get_graph, n_threads, false);
+        for (auto item : original_weight_to_final_weight) {
+            ggml_tensor* original_weight = item.first;
+            ggml_tensor* final_weight    = item.second;
+            ggml_backend_tensor_copy(final_weight, original_weight);
+        }
+        GGMLRunner::free_compute_buffer();
     }
 };
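
The LoRA changes handle the new situation where a model weight may live in a host (CPU) buffer while the merge graph runs on a GPU runtime backend: each host-resident weight is duplicated into the compute context and fed in through `set_backend_tensor_data`, the merged result is remembered in `original_weight_to_final_weight`, and `GGMLRunner::compute` is called with `free_compute_buffer_immediately = false` so the results stay alive for a copy-back. A short annotated sketch of that copy-back step (it mirrors the loop added above; the comments describe where each tensor lives):

```cpp
// After the merge graph has run on the runtime backend, write each merged weight
// back into the host-resident original before releasing the compute buffer.
for (auto& item : original_weight_to_final_weight) {
    ggml_tensor* original_weight = item.first;   // lives in the CPU params buffer
    ggml_tensor* final_weight    = item.second;  // lives in the runtime (GPU) compute buffer
    ggml_backend_tensor_copy(final_weight, original_weight);
}
GGMLRunner::free_compute_buffer();               // only now is the compute buffer released
```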

View File

@@ -846,9 +846,10 @@ struct MMDiTRunner : public GGMLRunner {
     MMDiT mmdit;
     MMDiTRunner(ggml_backend_t backend,
+                bool offload_params_to_cpu,
                const String2GGMLType& tensor_types = {},
                const std::string prefix = "")
-        : GGMLRunner(backend), mmdit(tensor_types) {
+        : GGMLRunner(backend, offload_params_to_cpu), mmdit(tensor_types) {
         mmdit.init(params_ctx, tensor_types, prefix);
     }
@@ -946,7 +947,7 @@ struct MMDiTRunner : public GGMLRunner {
     // ggml_backend_t backend = ggml_backend_cuda_init(0);
     ggml_backend_t backend    = ggml_backend_cpu_init();
     ggml_type model_data_type = GGML_TYPE_F16;
-    std::shared_ptr<MMDiTRunner> mmdit = std::shared_ptr<MMDiTRunner>(new MMDiTRunner(backend));
+    std::shared_ptr<MMDiTRunner> mmdit = std::shared_ptr<MMDiTRunner>(new MMDiTRunner(backend, false));
     {
         LOG_INFO("loading from '%s'", file_path.c_str());
@@ -960,7 +961,7 @@ struct MMDiTRunner : public GGMLRunner {
             return;
         }
-        bool success = model_loader.load_tensors(tensors, backend);
+        bool success = model_loader.load_tensors(tensors);
         if (!success) {
             LOG_ERROR("load tensors from model loader failed");

View File

@@ -1048,12 +1048,12 @@ bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::s
             }
         }
         for (int i = GGML_MAX_DIMS; i < n_dims; i++) {
             shape->ne[GGML_MAX_DIMS - 1] *= ne[i];  // stack to last dim;
         }
         return true;
     };
     ctx_gguf_ = gguf_init_from_file_ext(file_path.c_str(), {true, &ctx_meta_}, on_tensor_shape_read);
     if (!ctx_gguf_) {
         LOG_ERROR("failed to open '%s'", file_path.c_str());
         return false;
@@ -1917,7 +1917,7 @@ std::vector<TensorStorage> remove_duplicates(const std::vector<TensorStorage>& v
     return res;
 }
-bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend) {
+bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) {
     std::vector<TensorStorage> processed_tensor_storages;
     for (auto& tensor_storage : tensor_storages) {
         // LOG_DEBUG("%s", name.c_str());
@@ -2115,7 +2115,6 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
 }
 bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
-                               ggml_backend_t backend,
                                std::set<std::string> ignore_tensors) {
     std::set<std::string> tensor_names_in_file;
     auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
@@ -2155,7 +2154,7 @@ bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tenso
         return true;
     };
-    bool success = load_tensors(on_new_tensor_cb, backend);
+    bool success = load_tensors(on_new_tensor_cb);
     if (!success) {
         LOG_ERROR("load tensors from file failed");
         return false;
@@ -2299,7 +2298,7 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type
         return true;
     };
-    bool success = load_tensors(on_new_tensor_cb, backend);
+    bool success = load_tensors(on_new_tensor_cb);
     ggml_backend_free(backend);
     LOG_INFO("load tensors done");
     LOG_INFO("trying to save tensors to %s", file_path.c_str());

View File

@@ -245,9 +245,8 @@ public:
     ggml_type get_diffusion_model_wtype();
     ggml_type get_vae_wtype();
     void set_wtype_override(ggml_type wtype, std::string prefix = "");
-    bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend);
+    bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb);
     bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
-                      ggml_backend_t backend,
                       std::set<std::string> ignore_tensors = {});
     bool save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules);
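
With placement now decided by where each destination tensor was allocated (the params buffer of its runner, CPU or device), `ModelLoader::load_tensors` no longer needs a backend argument; callers pass just the tensor map and an optional ignore set. A hedged sketch of the updated call shape (the `runner` variable and its prefix are illustrative):

```cpp
// Assumed usage after this change: the loader writes into whatever buffer each
// destination tensor already lives in, so no backend handle is needed.
std::map<std::string, ggml_tensor*> tensors;
runner->get_param_tensors(tensors, "model.diffusion_model");  // collect params from a GGMLRunner subclass

std::set<std::string> ignore_tensors = {"conditioner.embedders.3"};  // entry taken from this diff
if (!model_loader.load_tensors(tensors, ignore_tensors)) {
    LOG_ERROR("load tensors from model loader failed");
}
```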

View File

@@ -624,12 +624,13 @@ public:
 public:
     PhotoMakerIDEncoder(ggml_backend_t backend,
+                        bool offload_params_to_cpu,
                         const String2GGMLType& tensor_types,
                         const std::string prefix,
                         SDVersion version = VERSION_SDXL,
                         PMVersion pm_v    = PM_VERSION_1,
                         float sty         = 20.f)
-        : GGMLRunner(backend),
+        : GGMLRunner(backend, offload_params_to_cpu),
           version(version),
           pm_version(pm_v),
           style_strength(sty) {
@@ -785,10 +786,11 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
     bool applied = false;
     PhotoMakerIDEmbed(ggml_backend_t backend,
+                      bool offload_params_to_cpu,
                       ModelLoader* ml,
                       const std::string& file_path = "",
                       const std::string& prefix    = "")
-        : file_path(file_path), GGMLRunner(backend), model_loader(ml) {
+        : file_path(file_path), GGMLRunner(backend, offload_params_to_cpu), model_loader(ml) {
         if (!model_loader->init_from_file(file_path, prefix)) {
             load_failed = true;
         }
@@ -828,11 +830,11 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
             return true;
         };
-        model_loader->load_tensors(on_new_tensor_cb, backend);
+        model_loader->load_tensors(on_new_tensor_cb);
         alloc_params_buffer();
         dry_run = false;
-        model_loader->load_tensors(on_new_tensor_cb, backend);
+        model_loader->load_tensors(on_new_tensor_cb);
         LOG_DEBUG("finished loading PhotoMaker ID Embeds ");
         return true;

View File

@@ -104,9 +104,10 @@ public:
     std::shared_ptr<PhotoMakerIDEmbed> pmid_id_embeds;
     std::string taesd_path;
     bool use_tiny_autoencoder = false;
     bool vae_tiling           = false;
+    bool offload_params_to_cpu = false;
     bool stacked_id           = false;
     bool is_using_v_parameterization     = false;
     bool is_using_edm_v_parameterization = false;
@@ -180,6 +181,7 @@ public:
         taesd_path           = SAFE_STR(sd_ctx_params->taesd_path);
         use_tiny_autoencoder = taesd_path.size() > 0;
         vae_tiling           = sd_ctx_params->vae_tiling;
+        offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu;
         if (sd_ctx_params->rng_type == STD_DEFAULT_RNG) {
             rng = std::make_shared<STDDefaultRNG>();
@@ -327,8 +329,12 @@ public:
             if (sd_ctx_params->diffusion_flash_attn) {
                 LOG_WARN("flash attention in this diffusion model is currently unsupported!");
             }
-            cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
-            diffusion_model  = std::make_shared<MMDiTModel>(backend, model_loader.tensor_storages_types);
+            cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend,
+                                                                 offload_params_to_cpu,
+                                                                 model_loader.tensor_storages_types);
+            diffusion_model  = std::make_shared<MMDiTModel>(backend,
+                                                            offload_params_to_cpu,
+                                                            model_loader.tensor_storages_types);
         } else if (sd_version_is_flux(version)) {
             bool is_chroma = false;
             for (auto pair : model_loader.tensor_storages_types) {
@@ -339,43 +345,52 @@ public:
             }
             if (is_chroma) {
                 cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
+                                                                    offload_params_to_cpu,
                                                                     model_loader.tensor_storages_types,
                                                                     -1,
                                                                     sd_ctx_params->chroma_use_t5_mask,
                                                                     sd_ctx_params->chroma_t5_mask_pad);
             } else {
-                cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
+                cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend,
+                                                                      offload_params_to_cpu,
+                                                                      model_loader.tensor_storages_types);
             }
             diffusion_model = std::make_shared<FluxModel>(backend,
+                                                          offload_params_to_cpu,
                                                           model_loader.tensor_storages_types,
                                                           version,
                                                           sd_ctx_params->diffusion_flash_attn,
                                                           sd_ctx_params->chroma_use_dit_mask);
         } else if (sd_version_is_wan(version)) {
             cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
+                                                                offload_params_to_cpu,
                                                                 model_loader.tensor_storages_types,
                                                                 -1,
                                                                 true,
                                                                 1,
                                                                 true);
             diffusion_model = std::make_shared<WanModel>(backend,
+                                                         offload_params_to_cpu,
                                                          model_loader.tensor_storages_types,
                                                          version,
                                                          sd_ctx_params->diffusion_flash_attn);
         } else {  // SD1.x SD2.x SDXL
             if (strstr(SAFE_STR(sd_ctx_params->stacked_id_embed_dir), "v2")) {
                 cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend,
+                                                                                       offload_params_to_cpu,
                                                                                        model_loader.tensor_storages_types,
                                                                                        SAFE_STR(sd_ctx_params->embedding_dir),
                                                                                        version,
                                                                                        PM_VERSION_2);
             } else {
                 cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend,
+                                                                                       offload_params_to_cpu,
                                                                                        model_loader.tensor_storages_types,
                                                                                        SAFE_STR(sd_ctx_params->embedding_dir),
                                                                                        version);
             }
             diffusion_model = std::make_shared<UNetModel>(backend,
+                                                          offload_params_to_cpu,
                                                           model_loader.tensor_storages_types,
                                                           version,
                                                           sd_ctx_params->diffusion_flash_attn);
@@ -396,6 +411,7 @@ public:
         if (sd_version_is_wan(version)) {
             first_stage_model = std::make_shared<WAN::WanVAERunner>(vae_backend,
+                                                                    offload_params_to_cpu,
                                                                     model_loader.tensor_storages_types,
                                                                     "first_stage_model",
                                                                     vae_decode_only);
@@ -403,6 +419,7 @@ public:
             first_stage_model->get_param_tensors(tensors, "first_stage_model");
         } else if (!use_tiny_autoencoder) {
             first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend,
+                                                                offload_params_to_cpu,
                                                                 model_loader.tensor_storages_types,
                                                                 "first_stage_model",
                                                                 vae_decode_only,
@@ -412,6 +429,7 @@ public:
             first_stage_model->get_param_tensors(tensors, "first_stage_model");
         } else {
             tae_first_stage = std::make_shared<TinyAutoEncoder>(vae_backend,
+                                                                offload_params_to_cpu,
                                                                 model_loader.tensor_storages_types,
                                                                 "decoder.layers",
                                                                 vae_decode_only,
@@ -427,14 +445,26 @@ public:
             } else {
                 controlnet_backend = backend;
             }
-            control_net = std::make_shared<ControlNet>(controlnet_backend, model_loader.tensor_storages_types, version);
+            control_net = std::make_shared<ControlNet>(controlnet_backend,
+                                                       offload_params_to_cpu,
+                                                       model_loader.tensor_storages_types,
+                                                       version);
         }
         if (strstr(SAFE_STR(sd_ctx_params->stacked_id_embed_dir), "v2")) {
-            pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend, model_loader.tensor_storages_types, "pmid", version, PM_VERSION_2);
+            pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend,
+                                                               offload_params_to_cpu,
+                                                               model_loader.tensor_storages_types,
+                                                               "pmid",
+                                                               version,
+                                                               PM_VERSION_2);
             LOG_INFO("using PhotoMaker Version 2");
         } else {
-            pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend, model_loader.tensor_storages_types, "pmid", version);
+            pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend,
+                                                               offload_params_to_cpu,
+                                                               model_loader.tensor_storages_types,
+                                                               "pmid",
+                                                               version);
         }
         if (strlen(SAFE_STR(sd_ctx_params->stacked_id_embed_dir)) > 0) {
             pmid_lora = std::make_shared<LoraModel>(backend, sd_ctx_params->stacked_id_embed_dir, "");
@@ -489,7 +519,7 @@ public:
         if (version == VERSION_SVD) {
             ignore_tensors.insert("conditioner.embedders.3");
         }
-        bool success = model_loader.load_tensors(tensors, backend, ignore_tensors);
+        bool success = model_loader.load_tensors(tensors, ignore_tensors);
         if (!success) {
             LOG_ERROR("load tensors from model loader failed");
             ggml_free(ctx);
@@ -1354,6 +1384,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
     sd_ctx_params->wtype    = SD_TYPE_COUNT;
     sd_ctx_params->rng_type = CUDA_RNG;
     sd_ctx_params->schedule = DEFAULT;
+    sd_ctx_params->offload_params_to_cpu   = false;
     sd_ctx_params->keep_clip_on_cpu        = false;
     sd_ctx_params->keep_control_net_on_cpu = false;
     sd_ctx_params->keep_vae_on_cpu         = false;
@@ -1388,6 +1419,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
                      "wtype: %s\n"
                      "rng_type: %s\n"
                      "schedule: %s\n"
+                     "offload_params_to_cpu: %s\n"
                      "keep_clip_on_cpu: %s\n"
                      "keep_control_net_on_cpu: %s\n"
                      "keep_vae_on_cpu: %s\n"
@@ -1413,6 +1445,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
                      sd_type_name(sd_ctx_params->wtype),
                      sd_rng_type_name(sd_ctx_params->rng_type),
                      sd_schedule_name(sd_ctx_params->schedule),
+                     BOOL_STR(sd_ctx_params->offload_params_to_cpu),
                      BOOL_STR(sd_ctx_params->keep_clip_on_cpu),
                      BOOL_STR(sd_ctx_params->keep_control_net_on_cpu),
                      BOOL_STR(sd_ctx_params->keep_vae_on_cpu),
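
For library users, the behaviour is opt-in through the public context parameters: `sd_ctx_params_t` gains an `offload_params_to_cpu` field, initialised to false by `sd_ctx_params_init` and reported by `sd_ctx_params_to_str`. A hedged usage sketch (the `model_path` field and the `new_sd_ctx(&params)` entry point are assumed from the existing API, not part of this diff):

```cpp
#include <cstdio>
#include "stable-diffusion.h"

void create_context_with_offloading() {
    sd_ctx_params_t params;
    sd_ctx_params_init(&params);                 // offload_params_to_cpu defaults to false
    params.model_path            = "sd_xl_base_1.0.safetensors";  // illustrative path, assumed field
    params.offload_params_to_cpu = true;         // keep weights in RAM, stream them to VRAM per module

    printf("%s\n", sd_ctx_params_to_str(&params));  // now includes "offload_params_to_cpu: true"

    sd_ctx_t* ctx = new_sd_ctx(&params);         // assumed entry point; offloading applies to all runners
    // ... generate images, then free_sd_ctx(ctx) ...
}
```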

View File

@@ -130,6 +130,7 @@ typedef struct {
     enum sd_type_t wtype;
     enum rng_type_t rng_type;
     enum schedule_t schedule;
+    bool offload_params_to_cpu;
     bool keep_clip_on_cpu;
     bool keep_control_net_on_cpu;
     bool keep_vae_on_cpu;
@@ -236,10 +237,13 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
 typedef struct upscaler_ctx_t upscaler_ctx_t;
 SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
+                                        bool offload_params_to_cpu,
                                         int n_threads);
 SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
-SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor);
+SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx,
+                          sd_image_t input_image,
+                          uint32_t upscale_factor);
 SD_API bool convert(const char* input_path,
                     const char* vae_path,
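
The upscaler path gets the same switch: `new_upscaler_ctx` now takes the flag between the model path and the thread count, an API-breaking change for existing callers. A minimal sketch against the prototypes above (`input_image` is an `sd_image_t` assumed to be prepared elsewhere):

```cpp
// With offloading enabled, the ESRGAN weights stay in RAM and are copied to the
// runtime backend only while the upscale graph actually runs.
upscaler_ctx_t* up = new_upscaler_ctx("RealESRGAN_x4plus_anime_6B.pth",
                                      /*offload_params_to_cpu=*/true,
                                      /*n_threads=*/8);
if (up != NULL) {
    sd_image_t out = upscale(up, input_image, 4);
    // ... use out.data ...
    free_upscaler_ctx(up);
}
```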

t5.hpp
View File

@@ -756,10 +756,11 @@ struct T5Runner : public GGMLRunner {
     std::vector<int> relative_position_bucket_vec;
     T5Runner(ggml_backend_t backend,
+             bool offload_params_to_cpu,
             const String2GGMLType& tensor_types,
             const std::string prefix,
             bool is_umt5 = false)
-        : GGMLRunner(backend) {
+        : GGMLRunner(backend, offload_params_to_cpu) {
         if (is_umt5) {
             params.vocab_size        = 256384;
             params.relative_attention = false;
@@ -900,10 +901,11 @@ struct T5Embedder {
     T5Runner model;
     T5Embedder(ggml_backend_t backend,
+               bool offload_params_to_cpu,
               const String2GGMLType& tensor_types = {},
              const std::string prefix = "",
              bool is_umt5 = false)
-        : model(backend, tensor_types, prefix, is_umt5), tokenizer(is_umt5) {
+        : model(backend, offload_params_to_cpu, tensor_types, prefix, is_umt5), tokenizer(is_umt5) {
     }
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
@@ -1012,13 +1014,13 @@ struct T5Embedder {
         }
     }
-    std::shared_ptr<T5Embedder> t5 = std::shared_ptr<T5Embedder>(new T5Embedder(backend, tensor_types, "", true));
+    std::shared_ptr<T5Embedder> t5 = std::shared_ptr<T5Embedder>(new T5Embedder(backend, false, tensor_types, "", true));
     t5->alloc_params_buffer();
     std::map<std::string, ggml_tensor*> tensors;
     t5->get_param_tensors(tensors, "");
-    bool success = model_loader.load_tensors(tensors, backend);
+    bool success = model_loader.load_tensors(tensors);
     if (!success) {
         LOG_ERROR("load tensors from model loader failed");

View File

@@ -196,13 +196,14 @@ struct TinyAutoEncoder : public GGMLRunner {
     bool decode_only = false;
     TinyAutoEncoder(ggml_backend_t backend,
+                    bool offload_params_to_cpu,
                     const String2GGMLType& tensor_types,
                     const std::string prefix,
                     bool decoder_only = true,
                     SDVersion version = VERSION_SD1)
         : decode_only(decoder_only),
           taesd(decoder_only, version),
-          GGMLRunner(backend) {
+          GGMLRunner(backend, offload_params_to_cpu) {
         taesd.init(params_ctx, tensor_types, prefix);
     }
@@ -226,7 +227,7 @@ struct TinyAutoEncoder : public GGMLRunner {
             return false;
         }
-        bool success = model_loader.load_tensors(taesd_tensors, backend, ignore_tensors);
+        bool success = model_loader.load_tensors(taesd_tensors, ignore_tensors);
         if (!success) {
             LOG_ERROR("load tae tensors from model loader failed");

View File

@@ -538,11 +538,12 @@ struct UNetModelRunner : public GGMLRunner {
     UnetModelBlock unet;
     UNetModelRunner(ggml_backend_t backend,
+                    bool offload_params_to_cpu,
                     const String2GGMLType& tensor_types,
                     const std::string prefix,
                     SDVersion version = VERSION_SD1,
                     bool flash_attn   = false)
-        : GGMLRunner(backend), unet(version, tensor_types, flash_attn) {
+        : GGMLRunner(backend, offload_params_to_cpu), unet(version, tensor_types, flash_attn) {
         unet.init(params_ctx, tensor_types, prefix);
     }

View File

@@ -14,7 +14,8 @@ struct UpscalerGGML {
         : n_threads(n_threads) {
     }
-    bool load_from_file(const std::string& esrgan_path) {
+    bool load_from_file(const std::string& esrgan_path,
+                        bool offload_params_to_cpu) {
 #ifdef SD_USE_CUDA
         LOG_DEBUG("Using CUDA backend");
         backend = ggml_backend_cuda_init(0);
@@ -46,7 +47,7 @@ struct UpscalerGGML {
             backend = ggml_backend_cpu_init();
         }
         LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
-        esrgan_upscaler = std::make_shared<ESRGAN>(backend, model_loader.tensor_storages_types);
+        esrgan_upscaler = std::make_shared<ESRGAN>(backend, offload_params_to_cpu, model_loader.tensor_storages_types);
         if (!esrgan_upscaler->load_from_file(esrgan_path)) {
             return false;
         }
@@ -104,6 +105,7 @@ struct upscaler_ctx_t {
 };
 upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
+                                 bool offload_params_to_cpu,
                                  int n_threads) {
     upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t));
     if (upscaler_ctx == NULL) {
@@ -116,7 +118,7 @@ upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
         return NULL;
     }
-    if (!upscaler_ctx->upscaler->load_from_file(esrgan_path)) {
+    if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu)) {
         delete upscaler_ctx->upscaler;
         upscaler_ctx->upscaler = NULL;
         free(upscaler_ctx);

View File

@@ -521,8 +521,8 @@ public:
 };
 struct VAE : public GGMLRunner {
-    VAE(ggml_backend_t backend)
-        : GGMLRunner(backend) {}
+    VAE(ggml_backend_t backend, bool offload_params_to_cpu)
+        : GGMLRunner(backend, offload_params_to_cpu) {}
     virtual void compute(const int n_threads,
                          struct ggml_tensor* z,
                          bool decode_graph,
@@ -536,12 +536,13 @@ struct AutoEncoderKL : public VAE {
     AutoencodingEngine ae;
     AutoEncoderKL(ggml_backend_t backend,
+                  bool offload_params_to_cpu,
                   const String2GGMLType& tensor_types,
                   const std::string prefix,
                   bool decode_only       = false,
                   bool use_video_decoder = false,
                   SDVersion version      = VERSION_SD1)
-        : decode_only(decode_only), ae(decode_only, use_video_decoder, version), VAE(backend) {
+        : decode_only(decode_only), ae(decode_only, use_video_decoder, version), VAE(backend, offload_params_to_cpu) {
         ae.init(params_ctx, tensor_types, prefix);
     }

wan.hpp
View File

@@ -767,10 +767,11 @@ namespace WAN {
         std::vector<FeatCache> _feat_vec_map;
         WanVAERunner(ggml_backend_t backend,
+                     bool offload_params_to_cpu,
                      const String2GGMLType& tensor_types = {},
                      const std::string prefix = "",
                      bool decode_only = false)
-            : decode_only(decode_only), ae(decode_only), VAE(backend) {
+            : decode_only(decode_only), ae(decode_only), VAE(backend, offload_params_to_cpu) {
             ae.init(params_ctx, tensor_types, prefix);
             rest_feat_vec_map();
         }
@@ -857,7 +858,7 @@ namespace WAN {
                     feat_cache_vec.is_rep   = true;
                     _feat_vec_map[feat_idx] = feat_cache_vec;
                 } else if (feat_cache != NULL) {
-                    _feat_vec_map[feat_idx] = FeatCache(backend, feat_cache);
+                    _feat_vec_map[feat_idx] = FeatCache(runtime_backend, feat_cache);
                 }
             }
             GGMLRunner::free_compute_buffer();
@@ -897,7 +898,7 @@ namespace WAN {
                     feat_cache_vec.is_rep   = true;
                     _feat_vec_map[feat_idx] = feat_cache_vec;
                 } else if (feat_cache != NULL) {
-                    _feat_vec_map[feat_idx] = FeatCache(backend, feat_cache);
+                    _feat_vec_map[feat_idx] = FeatCache(runtime_backend, feat_cache);
                 }
             }
@@ -943,7 +944,7 @@ namespace WAN {
         ggml_backend_t backend = ggml_backend_cuda_init(0);
         // ggml_backend_t backend = ggml_backend_cpu_init();
         ggml_type model_data_type = GGML_TYPE_F16;
-        std::shared_ptr<WanVAERunner> vae = std::shared_ptr<WanVAERunner>(new WanVAERunner(backend));
+        std::shared_ptr<WanVAERunner> vae = std::shared_ptr<WanVAERunner>(new WanVAERunner(backend, false));
         {
             LOG_INFO("loading from '%s'", file_path.c_str());
@@ -957,7 +958,7 @@ namespace WAN {
                 return;
             }
-            bool success = model_loader.load_tensors(tensors, backend);
+            bool success = model_loader.load_tensors(tensors);
             if (!success) {
                 LOG_ERROR("load tensors from model loader failed");
@@ -1564,11 +1565,12 @@ namespace WAN {
         SDVersion version;
         WanRunner(ggml_backend_t backend,
+                  bool offload_params_to_cpu,
                   const String2GGMLType& tensor_types = {},
                   const std::string prefix = "",
                   SDVersion version = VERSION_WAN2,
                   bool flash_attn   = false)
-            : GGMLRunner(backend) {
+            : GGMLRunner(backend, offload_params_to_cpu) {
             wan_params.flash_attn = flash_attn;
             wan_params.num_layers = 0;
             for (auto pair : tensor_types) {
@@ -1747,6 +1749,7 @@ namespace WAN {
         }
         std::shared_ptr<WanRunner> wan = std::shared_ptr<WanRunner>(new WanRunner(backend,
+                                                                                  false,
                                                                                   tensor_types,
                                                                                   "model.diffusion_model"));
@@ -1754,7 +1757,7 @@ namespace WAN {
         std::map<std::string, ggml_tensor*> tensors;
         wan->get_param_tensors(tensors, "model.diffusion_model");
-        bool success = model_loader.load_tensors(tensors, backend);
+        bool success = model_loader.load_tensors(tensors);
         if (!success) {
             LOG_ERROR("load tensors from model loader failed");