mirror of https://github.com/leejet/stable-diffusion.cpp.git
synced 2025-12-13 05:48:56 +00:00

add offload params to cpu support

This commit is contained in:
parent b0833eb4d8
commit 9b29de27a8
clip.hpp

@@ -868,12 +868,13 @@ struct CLIPTextModelRunner : public GGMLRunner {
     CLIPTextModel model;

     CLIPTextModelRunner(ggml_backend_t backend,
+                        bool offload_params_to_cpu,
                         const String2GGMLType& tensor_types,
                         const std::string prefix,
                         CLIPVersion version = OPENAI_CLIP_VIT_L_14,
                         bool with_final_ln  = true,
                         int clip_skip_value = -1)
-        : GGMLRunner(backend), model(version, with_final_ln, clip_skip_value) {
+        : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, clip_skip_value) {
         model.init(params_ctx, tensor_types, prefix);
     }

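The CLIPTextModelRunner change above is the pattern this commit repeats across every GGMLRunner subclass: the constructor gains a bool offload_params_to_cpu right after the backend handle and forwards it to the GGMLRunner base constructor. A minimal construction sketch (not part of the diff; `backend` and `tensor_types` are assumed to exist in the caller, and the prefix matches the one used by the SD1 conditioner below):

    // sketch only: build a CLIP text encoder whose weights stay in RAM until compute time
    CLIPTextModelRunner clip_l(backend,
                               /*offload_params_to_cpu=*/true,
                               tensor_types,
                               "cond_stage_model.transformer.text_model");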
conditioner.hpp

@@ -57,6 +57,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
     std::vector<std::string> readed_embeddings;

     FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
+                                      bool offload_params_to_cpu,
                                       const String2GGMLType& tensor_types,
                                       const std::string& embd_dir,
                                       SDVersion version = VERSION_SD1,
@@ -64,12 +65,12 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       int clip_skip = -1)
         : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
         if (sd_version_is_sd1(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14);
         } else if (sd_version_is_sd2(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14);
         } else if (sd_version_is_sdxl(version)) {
-            text_model  = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
-            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
+            text_model  = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
+            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
         }
         set_clip_skip(clip_skip);
     }
@@ -154,7 +155,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             }
             return true;
         };
-        model_loader.load_tensors(on_load, NULL);
+        model_loader.load_tensors(on_load);
         readed_embeddings.push_back(embd_name);
         if (embd) {
             int64_t hidden_size = text_model->model.hidden_size;
@@ -618,8 +619,10 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
 struct FrozenCLIPVisionEmbedder : public GGMLRunner {
     CLIPVisionModelProjection vision_model;

-    FrozenCLIPVisionEmbedder(ggml_backend_t backend, const String2GGMLType& tensor_types = {})
-        : vision_model(OPEN_CLIP_VIT_H_14, true), GGMLRunner(backend) {
+    FrozenCLIPVisionEmbedder(ggml_backend_t backend,
+                             bool offload_params_to_cpu,
+                             const String2GGMLType& tensor_types = {})
+        : vision_model(OPEN_CLIP_VIT_H_14, true), GGMLRunner(backend, offload_params_to_cpu) {
         vision_model.init(params_ctx, tensor_types, "cond_stage_model.transformer");
     }

@@ -663,12 +666,13 @@ struct SD3CLIPEmbedder : public Conditioner {
     std::shared_ptr<T5Runner> t5;

     SD3CLIPEmbedder(ggml_backend_t backend,
+                    bool offload_params_to_cpu,
                     const String2GGMLType& tensor_types = {},
                     int clip_skip = -1)
         : clip_g_tokenizer(0) {
-        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
-        clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
-        t5     = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
+        clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
+        clip_g = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
+        t5     = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer");
         set_clip_skip(clip_skip);
     }

@@ -1010,10 +1014,11 @@ struct FluxCLIPEmbedder : public Conditioner {
     size_t chunk_len = 256;

     FluxCLIPEmbedder(ggml_backend_t backend,
+                     bool offload_params_to_cpu,
                      const String2GGMLType& tensor_types = {},
                      int clip_skip = -1) {
-        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
-        t5     = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
+        clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
+        t5     = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer");
         set_clip_skip(clip_skip);
     }

@@ -1232,13 +1237,14 @@ struct T5CLIPEmbedder : public Conditioner {
     bool is_umt5 = false;

     T5CLIPEmbedder(ggml_backend_t backend,
+                   bool offload_params_to_cpu,
                    const String2GGMLType& tensor_types = {},
                    int clip_skip = -1,
                    bool use_mask = false,
                    int mask_pad  = 1,
                    bool is_umt5  = false)
         : use_mask(use_mask), mask_pad(mask_pad), t5_tokenizer(is_umt5) {
-        t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer", is_umt5);
+        t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer", is_umt5);
     }

     void set_clip_skip(int clip_skip) {
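Every conditioner embedder (FrozenCLIPEmbedderWithCustomWords, FrozenCLIPVisionEmbedder, SD3CLIPEmbedder, FluxCLIPEmbedder, T5CLIPEmbedder) now accepts the flag and passes it into each text-encoder runner it owns, so the CLIP-L/CLIP-G/T5 weights all follow the same offload policy. A hedged construction sketch (clip_backend and tensor_types assumed to come from the caller; remaining parameters keep their defaults):

    auto cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend,
                                                              /*offload_params_to_cpu=*/true,
                                                              tensor_types);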
control.hpp

@@ -317,9 +317,10 @@ struct ControlNet : public GGMLRunner {
     bool guided_hint_cached = false;

     ControlNet(ggml_backend_t backend,
+               bool offload_params_to_cpu,
               const String2GGMLType& tensor_types = {},
               SDVersion version = VERSION_SD1)
-        : GGMLRunner(backend), control_net(version) {
+        : GGMLRunner(backend, offload_params_to_cpu), control_net(version) {
         control_net.init(params_ctx, tensor_types, "");
     }

@@ -346,7 +347,7 @@ struct ControlNet : public GGMLRunner {
             control_buffer_size += ggml_nbytes(controls[i]);
         }

-        control_buffer = ggml_backend_alloc_ctx_tensors(control_ctx, backend);
+        control_buffer = ggml_backend_alloc_ctx_tensors(control_ctx, runtime_backend);

         LOG_DEBUG("control buffer size %.2fMB", control_buffer_size * 1.f / 1024.f / 1024.f);
     }
@@ -443,7 +444,7 @@ struct ControlNet : public GGMLRunner {
             return false;
         }

-        bool success = model_loader.load_tensors(tensors, backend, ignore_tensors);
+        bool success = model_loader.load_tensors(tensors, ignore_tensors);

         if (!success) {
             LOG_ERROR("load control net tensors from model loader failed");
diffusion_model.hpp

@@ -33,10 +33,11 @@ struct UNetModel : public DiffusionModel {
     UNetModelRunner unet;

     UNetModel(ggml_backend_t backend,
+              bool offload_params_to_cpu,
              const String2GGMLType& tensor_types = {},
              SDVersion version = VERSION_SD1,
              bool flash_attn   = false)
-        : unet(backend, tensor_types, "model.diffusion_model", version, flash_attn) {
+        : unet(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model", version, flash_attn) {
     }

     void alloc_params_buffer() {
@@ -86,8 +87,9 @@ struct MMDiTModel : public DiffusionModel {
     MMDiTRunner mmdit;

     MMDiTModel(ggml_backend_t backend,
+               bool offload_params_to_cpu,
               const String2GGMLType& tensor_types = {})
-        : mmdit(backend, tensor_types, "model.diffusion_model") {
+        : mmdit(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model") {
     }

     void alloc_params_buffer() {
@@ -136,11 +138,12 @@ struct FluxModel : public DiffusionModel {
     Flux::FluxRunner flux;

     FluxModel(ggml_backend_t backend,
+              bool offload_params_to_cpu,
              const String2GGMLType& tensor_types = {},
              SDVersion version = VERSION_FLUX,
              bool flash_attn   = false,
              bool use_mask     = false)
-        : flux(backend, tensor_types, "model.diffusion_model", version, flash_attn, use_mask) {
+        : flux(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model", version, flash_attn, use_mask) {
     }

     void alloc_params_buffer() {
@@ -189,10 +192,11 @@ struct WanModel : public DiffusionModel {
     WAN::WanRunner wan;

     WanModel(ggml_backend_t backend,
+             bool offload_params_to_cpu,
             const String2GGMLType& tensor_types = {},
             SDVersion version = VERSION_FLUX,
             bool flash_attn   = false)
-        : wan(backend, tensor_types, "model.diffusion_model", version, flash_attn) {
+        : wan(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model", version, flash_attn) {
     }

     void alloc_params_buffer() {
esrgan.hpp

@@ -142,8 +142,10 @@ struct ESRGAN : public GGMLRunner {
     int scale     = 4;
     int tile_size = 128;  // avoid cuda OOM for 4gb VRAM

-    ESRGAN(ggml_backend_t backend, const String2GGMLType& tensor_types = {})
-        : GGMLRunner(backend) {
+    ESRGAN(ggml_backend_t backend,
+           bool offload_params_to_cpu,
+           const String2GGMLType& tensor_types = {})
+        : GGMLRunner(backend, offload_params_to_cpu) {
         rrdb_net.init(params_ctx, tensor_types, "");
     }

@@ -164,7 +166,7 @@ struct ESRGAN : public GGMLRunner {
             return false;
         }

-        bool success = model_loader.load_tensors(esrgan_tensors, backend);
+        bool success = model_loader.load_tensors(esrgan_tensors);

         if (!success) {
             LOG_ERROR("load esrgan tensors from model loader failed");
examples/cli/main.cpp

@@ -95,6 +95,7 @@ struct SDParams {
     int64_t seed               = 42;
     bool verbose               = false;
     bool vae_tiling            = false;
+    bool offload_params_to_cpu = false;
     bool control_net_cpu       = false;
     bool normalize_input       = false;
     bool clip_on_cpu           = false;
@@ -141,8 +142,9 @@ void print_params(SDParams params) {
     for (auto& path : params.ref_image_paths) {
         printf("    %s\n", path.c_str());
     };
-    printf("    clip on cpu: %s\n", params.clip_on_cpu ? "true" : "false");
-    printf("    controlnet cpu: %s\n", params.control_net_cpu ? "true" : "false");
+    printf("    offload_params_to_cpu: %s\n", params.offload_params_to_cpu ? "true" : "false");
+    printf("    clip_on_cpu: %s\n", params.clip_on_cpu ? "true" : "false");
+    printf("    control_net_cpu: %s\n", params.control_net_cpu ? "true" : "false");
     printf("    vae decoder on cpu:%s\n", params.vae_on_cpu ? "true" : "false");
     printf("    diffusion flash attention:%s\n", params.diffusion_flash_attn ? "true" : "false");
     printf("    strength(control): %.2f\n", params.control_strength);
@@ -461,6 +463,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {

     options.bool_options = {
         {"", "--vae-tiling", "", true, &params.vae_tiling},
+        {"", "--offload-to-cpu", "", true, &params.offload_params_to_cpu},
         {"", "--control-net-cpu", "", true, &params.control_net_cpu},
         {"", "--normalize-input", "", true, &params.normalize_input},
         {"", "--clip-on-cpu", "", true, &params.clip_on_cpu},
@@ -943,6 +946,7 @@ int main(int argc, const char* argv[]) {
                                       params.wtype,
                                       params.rng_type,
                                       params.schedule,
+                                      params.offload_params_to_cpu,
                                       params.clip_on_cpu,
                                       params.control_net_cpu,
                                       params.vae_on_cpu,
@@ -1058,6 +1062,7 @@ int main(int argc, const char* argv[]) {
         int upscale_factor = 4;  // unused for RealESRGAN_x4plus_anime_6B.pth
         if (params.esrgan_path.size() > 0 && params.upscale_repeats > 0) {
             upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(),
+                                                            params.offload_params_to_cpu,
                                                             params.n_threads);

             if (upscaler_ctx == NULL) {
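In the CLI the feature is exposed as a --offload-to-cpu boolean flag: parse_args() maps it onto SDParams::offload_params_to_cpu through options.bool_options, print_params() reports it, and main() forwards it both into the generation-context parameters and into new_upscaler_ctx(). A condensed sketch of that flow (simplified from the hunks above; surrounding argument lists are elided):

    SDParams params;                      // offload_params_to_cpu defaults to false
    params.offload_params_to_cpu = true;  // what "--offload-to-cpu" toggles
    // later in main():
    //   ... params.schedule, params.offload_params_to_cpu, params.clip_on_cpu, ...
    //   new_upscaler_ctx(params.esrgan_path.c_str(), params.offload_params_to_cpu, params.n_threads);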
flux.hpp

@@ -881,12 +881,13 @@ namespace Flux {
        bool use_mask = false;

        FluxRunner(ggml_backend_t backend,
+                  bool offload_params_to_cpu,
                   const String2GGMLType& tensor_types = {},
                   const std::string prefix = "",
                   SDVersion version        = VERSION_FLUX,
                   bool flash_attn          = false,
                   bool use_mask            = false)
-           : GGMLRunner(backend), use_mask(use_mask) {
+           : GGMLRunner(backend, offload_params_to_cpu), use_mask(use_mask) {
            flux_params.flash_attn     = flash_attn;
            flux_params.guidance_embed = false;
            flux_params.depth          = 0;
@@ -1085,7 +1086,7 @@ namespace Flux {
        // ggml_backend_t backend = ggml_backend_cuda_init(0);
        ggml_backend_t backend        = ggml_backend_cpu_init();
        ggml_type model_data_type     = GGML_TYPE_Q8_0;
-       std::shared_ptr<FluxRunner> flux = std::shared_ptr<FluxRunner>(new FluxRunner(backend));
+       std::shared_ptr<FluxRunner> flux = std::shared_ptr<FluxRunner>(new FluxRunner(backend, false));
        {
            LOG_INFO("loading from '%s'", file_path.c_str());

@@ -1099,7 +1100,7 @@ namespace Flux {
                return;
            }

-           bool success = model_loader.load_tensors(tensors, backend);
+           bool success = model_loader.load_tensors(tensors);

            if (!success) {
                LOG_ERROR("load tensors from model loader failed");
ggml_extend.hpp

@@ -1230,16 +1230,20 @@ struct GGMLRunner {
 protected:
     typedef std::function<struct ggml_cgraph*()> get_graph_cb_t;

-    struct ggml_context* params_ctx     = NULL;
-    ggml_backend_buffer_t params_buffer = NULL;
+    ggml_backend_t params_backend  = NULL;
+    ggml_backend_t runtime_backend = NULL;
+
+    struct ggml_context* params_ctx             = NULL;
+    ggml_backend_buffer_t params_buffer         = NULL;
+    struct ggml_context* offload_ctx            = NULL;
+    ggml_backend_buffer_t runtime_params_buffer = NULL;
+    bool params_on_runtime_backend              = false;

     struct ggml_context* compute_ctx    = NULL;
     struct ggml_gallocr* compute_allocr = NULL;

     std::map<struct ggml_tensor*, const void*> backend_tensor_data_map;

-    ggml_backend_t backend = NULL;
-
     void alloc_params_ctx() {
         struct ggml_init_params params;
         params.mem_size = static_cast<size_t>(MAX_PARAMS_TENSOR_NUM * ggml_tensor_overhead());
@@ -1248,6 +1252,10 @@ protected:

         params_ctx = ggml_init(params);
         GGML_ASSERT(params_ctx != NULL);
+        if (params_backend != runtime_backend) {
+            offload_ctx = ggml_init(params);
+            GGML_ASSERT(offload_ctx != NULL);
+        }
     }

     void free_params_ctx() {
@@ -1255,6 +1263,10 @@ protected:
             ggml_free(params_ctx);
             params_ctx = NULL;
         }
+        if (offload_ctx != NULL) {
+            ggml_free(offload_ctx);
+            offload_ctx = NULL;
+        }
     }

     void alloc_compute_ctx() {
@@ -1281,7 +1293,7 @@ protected:
         reset_compute_ctx();
         struct ggml_cgraph* gf = get_graph();
         backend_tensor_data_map.clear();
-        compute_allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+        compute_allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(runtime_backend));

         if (!ggml_gallocr_reserve(compute_allocr, gf)) {
             // failed to allocate the compute buffer
@@ -1295,7 +1307,7 @@ protected:
         LOG_DEBUG("%s compute buffer size: %.2f MB(%s)",
                   get_desc().c_str(),
                   compute_buffer_size / 1024.0 / 1024.0,
-                  ggml_backend_is_cpu(backend) ? "RAM" : "VRAM");
+                  ggml_backend_is_cpu(runtime_backend) ? "RAM" : "VRAM");
         return true;
     }

@@ -1310,12 +1322,96 @@ protected:
         backend_tensor_data_map.clear();
     }

+    bool offload_params_to_runtime_backend() {
+        if (params_backend == runtime_backend) {
+            return true;
+        }
+        if (params_on_runtime_backend) {
+            return true;
+        }
+        GGML_ASSERT(runtime_params_buffer == NULL);
+        int64_t t0         = ggml_time_ms();
+        size_t num_tensors = ggml_tensor_num(offload_ctx);
+        if (num_tensors == 0) {
+            for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != NULL; t = ggml_get_next_tensor(params_ctx, t)) {
+                GGML_ASSERT(t->view_src == NULL);
+                ggml_dup_tensor(offload_ctx, t);
+            }
+        }
+        num_tensors = ggml_tensor_num(offload_ctx);
+        GGML_ASSERT(num_tensors == ggml_tensor_num(params_ctx));
+
+        runtime_params_buffer = ggml_backend_alloc_ctx_tensors(offload_ctx, runtime_backend);
+
+        if (runtime_params_buffer == NULL) {
+            LOG_ERROR("%s alloc runtime params backend buffer failed, num_tensors = %i",
+                      get_desc().c_str(),
+                      num_tensors);
+            return false;
+        }
+
+        ggml_tensor* t         = ggml_get_first_tensor(params_ctx);
+        ggml_tensor* offload_t = ggml_get_first_tensor(offload_ctx);
+
+        while (t != NULL && offload_t != NULL) {
+            ggml_backend_tensor_copy(t, offload_t);
+            std::swap(t->buffer, offload_t->buffer);
+            std::swap(t->data, offload_t->data);
+
+            t         = ggml_get_next_tensor(params_ctx, t);
+            offload_t = ggml_get_next_tensor(offload_ctx, offload_t);
+        }
+
+        int64_t t1 = ggml_time_ms();
+
+        size_t params_buffer_size = ggml_backend_buffer_get_size(runtime_params_buffer);
+        LOG_INFO("%s offload params (%6.2f MB, %i tensors) to runtime backend (%s), taking %.2fs",
+                 get_desc().c_str(),
+                 params_buffer_size / (1024.f * 1024.f),
+                 num_tensors,
+                 ggml_backend_name(runtime_backend),
+                 (t1 - t0) * 1.0f / 1000);
+
+        params_on_runtime_backend = true;
+
+        return true;
+    }
+
+    void offload_params_to_params_backend() {
+        if (!params_on_runtime_backend) {
+            return;
+        }
+        ggml_tensor* t         = ggml_get_first_tensor(params_ctx);
+        ggml_tensor* offload_t = ggml_get_first_tensor(offload_ctx);
+
+        while (t != NULL && offload_t != NULL) {
+            t->buffer         = offload_t->buffer;
+            t->data           = offload_t->data;
+            offload_t->buffer = NULL;
+            offload_t->data   = NULL;
+
+            t         = ggml_get_next_tensor(params_ctx, t);
+            offload_t = ggml_get_next_tensor(offload_ctx, offload_t);
+        }
+
+        if (runtime_params_buffer != NULL) {
+            ggml_backend_buffer_free(runtime_params_buffer);
+            runtime_params_buffer = NULL;
+        }
+        params_on_runtime_backend = false;
+    }
+
 public:
     virtual std::string get_desc() = 0;

-    GGMLRunner(ggml_backend_t backend)
-        : backend(backend) {
+    GGMLRunner(ggml_backend_t backend, bool offload_params_to_cpu = false)
+        : runtime_backend(backend) {
         alloc_params_ctx();
+        if (!ggml_backend_is_cpu(runtime_backend) && offload_params_to_cpu) {
+            params_backend = ggml_backend_cpu_init();
+        } else {
+            params_backend = runtime_backend;
+        }
     }

     virtual ~GGMLRunner() {
@@ -1323,6 +1419,9 @@ public:
         free_compute_buffer();
         free_params_ctx();
         free_compute_ctx();
+        if (params_backend != runtime_backend) {
+            ggml_backend_free(params_backend);
+        }
     }

     void reset_compute_ctx() {
@@ -1332,7 +1431,7 @@ public:

     bool alloc_params_buffer() {
         size_t num_tensors = ggml_tensor_num(params_ctx);
-        params_buffer      = ggml_backend_alloc_ctx_tensors(params_ctx, backend);
+        params_buffer      = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend);
         if (params_buffer == NULL) {
             LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i",
                       get_desc().c_str(),
@@ -1342,14 +1441,9 @@ public:
         size_t params_buffer_size = ggml_backend_buffer_get_size(params_buffer);
         LOG_DEBUG("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)",
                   get_desc().c_str(),
-                  params_buffer_size / (1024.0 * 1024.0),
-                  ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
+                  params_buffer_size / (1024.f * 1024.f),
+                  ggml_backend_is_cpu(params_backend) ? "RAM" : "VRAM",
                   num_tensors);
-        // printf("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)\n",
-        //        get_desc().c_str(),
-        //        params_buffer_size / (1024.0 * 1024.0),
-        //        ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
-        //        num_tensors);
         return true;
     }

@@ -1372,6 +1466,7 @@ public:
             ggml_gallocr_free(compute_allocr);
             compute_allocr = NULL;
         }
+        offload_params_to_params_backend();
     }

     // do copy after alloc graph
@@ -1385,7 +1480,7 @@ public:
             return NULL;
         }
         // it's performing a compute, check if backend isn't cpu
-        if (!ggml_backend_is_cpu(backend) && (tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer))) {
+        if (!ggml_backend_is_cpu(runtime_backend) && (tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer))) {
             // pass input tensors to gpu memory
             auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);

@@ -1401,16 +1496,20 @@ public:
                  bool free_compute_buffer_immediately = true,
                  struct ggml_tensor** output          = NULL,
                  struct ggml_context* output_ctx      = NULL) {
+        if (!offload_params_to_runtime_backend()) {
+            LOG_ERROR("%s offload params to runtime backend failed", get_desc().c_str());
+            return;
+        }
         alloc_compute_buffer(get_graph);
         reset_compute_ctx();
         struct ggml_cgraph* gf = get_graph();
         GGML_ASSERT(ggml_gallocr_alloc_graph(compute_allocr, gf));
         cpy_data_to_backend_tensor();
-        if (ggml_backend_is_cpu(backend)) {
-            ggml_backend_cpu_set_n_threads(backend, n_threads);
+        if (ggml_backend_is_cpu(runtime_backend)) {
+            ggml_backend_cpu_set_n_threads(runtime_backend, n_threads);
         }

-        ggml_backend_graph_compute(backend, gf);
+        ggml_backend_graph_compute(runtime_backend, gf);
 #ifdef GGML_PERF
         ggml_graph_print(gf);
 #endif
@@ -1420,7 +1519,7 @@ public:
                 *output = ggml_dup_tensor(output_ctx, result);
             }
             if (*output != NULL) {
-                ggml_backend_tensor_get_and_sync(backend, result, (*output)->data, 0, ggml_nbytes(*output));
+                ggml_backend_tensor_get_and_sync(runtime_backend, result, (*output)->data, 0, ggml_nbytes(*output));
             }
         }
     }
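ggml_extend.hpp carries the actual mechanism. GGMLRunner now distinguishes params_backend (where weights live between runs) from runtime_backend (where graphs execute). With offload_params_to_cpu set and a non-CPU runtime backend, params_backend becomes a CPU backend, so alloc_params_buffer() places the weights in RAM. compute() first calls offload_params_to_runtime_backend(), which mirrors every params_ctx tensor into offload_ctx, allocates that mirror on the runtime backend, copies the data over and swaps the buffer/data pointers so the graph reads device memory; free_compute_buffer() then calls offload_params_to_params_backend() to restore the pointers and release the device copy. A minimal subclass sketch under those assumptions (MyRunner is hypothetical; the member and method names are the ones added above):

    struct MyRunner : public GGMLRunner {
        MyRunner(ggml_backend_t backend, bool offload_params_to_cpu)
            : GGMLRunner(backend, offload_params_to_cpu) {}  // runtime_backend = backend;
                                                             // params_backend = CPU when offloading
        std::string get_desc() { return "my_runner"; }
    };

    // Lifecycle with offloading enabled:
    //   alloc_params_buffer()  -> weights allocated in RAM (params_backend)
    //   compute(...)           -> offload_params_to_runtime_backend() stages them to VRAM first
    //   free_compute_buffer()  -> offload_params_to_params_backend() frees the VRAM copy again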
lora.hpp

@@ -92,6 +92,7 @@ struct LoraModel : public GGMLRunner {

     float multiplier = 1.0f;
     std::map<std::string, struct ggml_tensor*> lora_tensors;
+    std::map<ggml_tensor*, ggml_tensor*> original_weight_to_final_weight;
     std::string file_path;
     ModelLoader model_loader;
     bool load_failed = false;
@@ -103,7 +104,7 @@ struct LoraModel : public GGMLRunner {
     LoraModel(ggml_backend_t backend,
               const std::string& file_path = "",
               const std::string prefix     = "")
-        : file_path(file_path), GGMLRunner(backend) {
+        : file_path(file_path), GGMLRunner(backend, false) {
         if (!model_loader.init_from_file(file_path, prefix)) {
             load_failed = true;
         }
@@ -151,11 +152,11 @@ struct LoraModel : public GGMLRunner {
             return true;
         };

-        model_loader.load_tensors(on_new_tensor_cb, backend);
+        model_loader.load_tensors(on_new_tensor_cb);
         alloc_params_buffer();
         // exit(0);
         dry_run = false;
-        model_loader.load_tensors(on_new_tensor_cb, backend);
+        model_loader.load_tensors(on_new_tensor_cb);

         LOG_DEBUG("lora type: \"%s\"/\"%s\"", lora_downs[type].c_str(), lora_ups[type].c_str());

@@ -790,6 +791,11 @@ struct LoraModel : public GGMLRunner {
                 updown = ggml_merge_lora(compute_ctx, lora_down, lora_up, lora_mid);
             }
             scale_value *= multiplier;
+            ggml_tensor* original_weight = weight;
+            if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(weight->buffer)) {
+                weight = ggml_dup_tensor(compute_ctx, weight);
+                set_backend_tensor_data(weight, original_weight->data);
+            }
             updown = ggml_reshape(compute_ctx, updown, weight);
             GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight));
             updown = ggml_scale_inplace(compute_ctx, updown, scale_value);
@@ -805,6 +811,9 @@ struct LoraModel : public GGMLRunner {
             }
             // final_weight = ggml_add_inplace(compute_ctx, weight, updown);  // apply directly
             ggml_build_forward_expand(gf, final_weight);
+            if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_weight->buffer)) {
+                original_weight_to_final_weight[original_weight] = final_weight;
+            }
             break;
         }
     }
@@ -839,7 +848,14 @@ struct LoraModel : public GGMLRunner {
         auto get_graph = [&]() -> struct ggml_cgraph* {
             return build_lora_graph(model_tensors, version);
         };
-        GGMLRunner::compute(get_graph, n_threads, true);
+        GGMLRunner::compute(get_graph, n_threads, false);
+        for (auto item : original_weight_to_final_weight) {
+            ggml_tensor* original_weight = item.first;
+            ggml_tensor* final_weight    = item.second;
+
+            ggml_backend_tensor_copy(final_weight, original_weight);
+        }
+        GGMLRunner::free_compute_buffer();
     }
 };

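LoraModel needs special handling because, with offloading enabled, the weights it patches can live in host buffers while the LoRA graph runs on the device: such a weight is duplicated into the compute context with its host data attached as a graph input, the original tensor is recorded in original_weight_to_final_weight, and after the graph has run each computed result is copied back into the original host tensor before the compute buffer is released. Condensed from the hunks above:

    GGMLRunner::compute(get_graph, n_threads, /*free_compute_buffer_immediately=*/false);
    for (auto item : original_weight_to_final_weight) {
        ggml_backend_tensor_copy(item.second, item.first);  // computed final weight -> original weight
    }
    GGMLRunner::free_compute_buffer();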
mmdit.hpp

@@ -846,9 +846,10 @@ struct MMDiTRunner : public GGMLRunner {
     MMDiT mmdit;

     MMDiTRunner(ggml_backend_t backend,
+                bool offload_params_to_cpu,
                 const String2GGMLType& tensor_types = {},
                 const std::string prefix = "")
-        : GGMLRunner(backend), mmdit(tensor_types) {
+        : GGMLRunner(backend, offload_params_to_cpu), mmdit(tensor_types) {
         mmdit.init(params_ctx, tensor_types, prefix);
     }

@@ -946,7 +947,7 @@ struct MMDiTRunner : public GGMLRunner {
        // ggml_backend_t backend = ggml_backend_cuda_init(0);
        ggml_backend_t backend    = ggml_backend_cpu_init();
        ggml_type model_data_type = GGML_TYPE_F16;
-       std::shared_ptr<MMDiTRunner> mmdit = std::shared_ptr<MMDiTRunner>(new MMDiTRunner(backend));
+       std::shared_ptr<MMDiTRunner> mmdit = std::shared_ptr<MMDiTRunner>(new MMDiTRunner(backend, false));
        {
            LOG_INFO("loading from '%s'", file_path.c_str());

@@ -960,7 +961,7 @@ struct MMDiTRunner : public GGMLRunner {
                return;
            }

-           bool success = model_loader.load_tensors(tensors, backend);
+           bool success = model_loader.load_tensors(tensors);

            if (!success) {
                LOG_ERROR("load tensors from model loader failed");
model.cpp

@@ -1048,12 +1048,12 @@ bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::s
             }
         }
         for (int i = GGML_MAX_DIMS; i < n_dims; i++) {
             shape->ne[GGML_MAX_DIMS - 1] *= ne[i];  // stack to last dim;
         }
         return true;
     };

     ctx_gguf_ = gguf_init_from_file_ext(file_path.c_str(), {true, &ctx_meta_}, on_tensor_shape_read);
     if (!ctx_gguf_) {
         LOG_ERROR("failed to open '%s'", file_path.c_str());
         return false;
@@ -1917,7 +1917,7 @@ std::vector<TensorStorage> remove_duplicates(const std::vector<TensorStorage>& v
     return res;
 }

-bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend) {
+bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) {
     std::vector<TensorStorage> processed_tensor_storages;
     for (auto& tensor_storage : tensor_storages) {
         // LOG_DEBUG("%s", name.c_str());
@@ -2115,7 +2115,6 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
 }

 bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
-                               ggml_backend_t backend,
                                std::set<std::string> ignore_tensors) {
     std::set<std::string> tensor_names_in_file;
     auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
@@ -2155,7 +2154,7 @@ bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tenso
         return true;
     };

-    bool success = load_tensors(on_new_tensor_cb, backend);
+    bool success = load_tensors(on_new_tensor_cb);
     if (!success) {
         LOG_ERROR("load tensors from file failed");
         return false;
@@ -2299,7 +2298,7 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type
         return true;
     };

-    bool success = load_tensors(on_new_tensor_cb, backend);
+    bool success = load_tensors(on_new_tensor_cb);
     ggml_backend_free(backend);
     LOG_INFO("load tensors done");
     LOG_INFO("trying to save tensors to %s", file_path.c_str());
model.h

@@ -245,9 +245,8 @@ public:
     ggml_type get_diffusion_model_wtype();
     ggml_type get_vae_wtype();
     void set_wtype_override(ggml_type wtype, std::string prefix = "");
-    bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend);
+    bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb);
     bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
-                      ggml_backend_t backend,
                       std::set<std::string> ignore_tensors = {});

     bool save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules);
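ModelLoader::load_tensors() drops its ggml_backend_t parameter, presumably because the destination tensors' own buffers already determine where the data must go, and the placement decision now belongs to each runner (params_backend vs. runtime_backend). Call sites change from load_tensors(tensors, backend, ignore_tensors) to the shape shown above, e.g.:

    bool success = model_loader.load_tensors(tensors, ignore_tensors);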
pmid.hpp

@@ -624,12 +624,13 @@ public:

 public:
     PhotoMakerIDEncoder(ggml_backend_t backend,
+                        bool offload_params_to_cpu,
                         const String2GGMLType& tensor_types,
                         const std::string prefix,
                         SDVersion version = VERSION_SDXL,
                         PMVersion pm_v    = PM_VERSION_1,
                         float sty         = 20.f)
-        : GGMLRunner(backend),
+        : GGMLRunner(backend, offload_params_to_cpu),
           version(version),
           pm_version(pm_v),
           style_strength(sty) {
@@ -785,10 +786,11 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
     bool applied = false;

     PhotoMakerIDEmbed(ggml_backend_t backend,
+                      bool offload_params_to_cpu,
                       ModelLoader* ml,
                       const std::string& file_path = "",
                       const std::string& prefix    = "")
-        : file_path(file_path), GGMLRunner(backend), model_loader(ml) {
+        : file_path(file_path), GGMLRunner(backend, offload_params_to_cpu), model_loader(ml) {
         if (!model_loader->init_from_file(file_path, prefix)) {
             load_failed = true;
         }
@@ -828,11 +830,11 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
             return true;
         };

-        model_loader->load_tensors(on_new_tensor_cb, backend);
+        model_loader->load_tensors(on_new_tensor_cb);
         alloc_params_buffer();

         dry_run = false;
-        model_loader->load_tensors(on_new_tensor_cb, backend);
+        model_loader->load_tensors(on_new_tensor_cb);

         LOG_DEBUG("finished loading PhotoMaker ID Embeds ");
         return true;
@ -104,9 +104,10 @@ public:
|
|||||||
std::shared_ptr<PhotoMakerIDEmbed> pmid_id_embeds;
|
std::shared_ptr<PhotoMakerIDEmbed> pmid_id_embeds;
|
||||||
|
|
||||||
std::string taesd_path;
|
std::string taesd_path;
|
||||||
bool use_tiny_autoencoder = false;
|
bool use_tiny_autoencoder = false;
|
||||||
bool vae_tiling = false;
|
bool vae_tiling = false;
|
||||||
bool stacked_id = false;
|
bool offload_params_to_cpu = false;
|
||||||
|
bool stacked_id = false;
|
||||||
|
|
||||||
bool is_using_v_parameterization = false;
|
bool is_using_v_parameterization = false;
|
||||||
bool is_using_edm_v_parameterization = false;
|
bool is_using_edm_v_parameterization = false;
|
||||||
@ -180,6 +181,7 @@ public:
|
|||||||
taesd_path = SAFE_STR(sd_ctx_params->taesd_path);
|
taesd_path = SAFE_STR(sd_ctx_params->taesd_path);
|
||||||
use_tiny_autoencoder = taesd_path.size() > 0;
|
use_tiny_autoencoder = taesd_path.size() > 0;
|
||||||
vae_tiling = sd_ctx_params->vae_tiling;
|
vae_tiling = sd_ctx_params->vae_tiling;
|
||||||
|
offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu;
|
||||||
|
|
||||||
if (sd_ctx_params->rng_type == STD_DEFAULT_RNG) {
|
if (sd_ctx_params->rng_type == STD_DEFAULT_RNG) {
|
||||||
rng = std::make_shared<STDDefaultRNG>();
|
rng = std::make_shared<STDDefaultRNG>();
|
||||||
@ -327,8 +329,12 @@ public:
|
|||||||
if (sd_ctx_params->diffusion_flash_attn) {
|
if (sd_ctx_params->diffusion_flash_attn) {
|
||||||
LOG_WARN("flash attention in this diffusion model is currently unsupported!");
|
LOG_WARN("flash attention in this diffusion model is currently unsupported!");
|
||||||
}
|
}
|
||||||
cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
|
cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend,
|
||||||
diffusion_model = std::make_shared<MMDiTModel>(backend, model_loader.tensor_storages_types);
|
offload_params_to_cpu,
|
||||||
|
model_loader.tensor_storages_types);
|
||||||
|
diffusion_model = std::make_shared<MMDiTModel>(backend,
|
||||||
|
offload_params_to_cpu,
|
||||||
|
model_loader.tensor_storages_types);
|
||||||
} else if (sd_version_is_flux(version)) {
|
} else if (sd_version_is_flux(version)) {
|
||||||
bool is_chroma = false;
|
bool is_chroma = false;
|
||||||
for (auto pair : model_loader.tensor_storages_types) {
|
for (auto pair : model_loader.tensor_storages_types) {
|
||||||
@ -339,43 +345,52 @@ public:
|
|||||||
}
|
}
|
||||||
if (is_chroma) {
|
if (is_chroma) {
|
||||||
cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
|
cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
|
||||||
|
offload_params_to_cpu,
|
||||||
model_loader.tensor_storages_types,
|
model_loader.tensor_storages_types,
|
||||||
-1,
|
-1,
|
||||||
sd_ctx_params->chroma_use_t5_mask,
|
sd_ctx_params->chroma_use_t5_mask,
|
||||||
sd_ctx_params->chroma_t5_mask_pad);
|
sd_ctx_params->chroma_t5_mask_pad);
|
||||||
} else {
|
} else {
|
||||||
cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
|
cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend,
|
||||||
|
offload_params_to_cpu,
|
||||||
|
model_loader.tensor_storages_types);
|
||||||
}
|
}
|
||||||
diffusion_model = std::make_shared<FluxModel>(backend,
|
diffusion_model = std::make_shared<FluxModel>(backend,
|
||||||
|
offload_params_to_cpu,
|
||||||
model_loader.tensor_storages_types,
|
model_loader.tensor_storages_types,
|
||||||
version,
|
version,
|
||||||
sd_ctx_params->diffusion_flash_attn,
|
sd_ctx_params->diffusion_flash_attn,
|
||||||
sd_ctx_params->chroma_use_dit_mask);
|
sd_ctx_params->chroma_use_dit_mask);
|
||||||
} else if (sd_version_is_wan(version)) {
|
} else if (sd_version_is_wan(version)) {
|
||||||
cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
|
cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
|
||||||
|
offload_params_to_cpu,
|
||||||
model_loader.tensor_storages_types,
|
model_loader.tensor_storages_types,
|
||||||
-1,
|
-1,
|
||||||
true,
|
true,
|
||||||
1,
|
1,
|
||||||
true);
|
true);
|
||||||
diffusion_model = std::make_shared<WanModel>(backend,
|
diffusion_model = std::make_shared<WanModel>(backend,
|
||||||
|
offload_params_to_cpu,
|
||||||
model_loader.tensor_storages_types,
|
model_loader.tensor_storages_types,
|
||||||
version,
|
version,
|
||||||
sd_ctx_params->diffusion_flash_attn);
|
sd_ctx_params->diffusion_flash_attn);
|
||||||
} else { // SD1.x SD2.x SDXL
|
} else { // SD1.x SD2.x SDXL
|
||||||
if (strstr(SAFE_STR(sd_ctx_params->stacked_id_embed_dir), "v2")) {
|
if (strstr(SAFE_STR(sd_ctx_params->stacked_id_embed_dir), "v2")) {
|
||||||
cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend,
|
cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend,
|
||||||
|
offload_params_to_cpu,
|
||||||
model_loader.tensor_storages_types,
|
model_loader.tensor_storages_types,
|
||||||
SAFE_STR(sd_ctx_params->embedding_dir),
|
SAFE_STR(sd_ctx_params->embedding_dir),
|
||||||
version,
|
version,
|
||||||
PM_VERSION_2);
|
PM_VERSION_2);
|
||||||
} else {
|
} else {
|
||||||
cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend,
|
cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend,
|
||||||
|
offload_params_to_cpu,
|
||||||
model_loader.tensor_storages_types,
|
model_loader.tensor_storages_types,
|
||||||
SAFE_STR(sd_ctx_params->embedding_dir),
|
SAFE_STR(sd_ctx_params->embedding_dir),
|
||||||
version);
|
version);
|
||||||
}
|
}
|
||||||
diffusion_model = std::make_shared<UNetModel>(backend,
|
diffusion_model = std::make_shared<UNetModel>(backend,
|
||||||
|
offload_params_to_cpu,
|
||||||
model_loader.tensor_storages_types,
|
model_loader.tensor_storages_types,
|
||||||
version,
|
version,
|
||||||
sd_ctx_params->diffusion_flash_attn);
|
sd_ctx_params->diffusion_flash_attn);
|
||||||
@ -396,6 +411,7 @@ public:
|
|||||||
|
|
||||||
if (sd_version_is_wan(version)) {
|
if (sd_version_is_wan(version)) {
|
||||||
first_stage_model = std::make_shared<WAN::WanVAERunner>(vae_backend,
|
first_stage_model = std::make_shared<WAN::WanVAERunner>(vae_backend,
|
||||||
|
offload_params_to_cpu,
|
||||||
model_loader.tensor_storages_types,
|
model_loader.tensor_storages_types,
|
||||||
"first_stage_model",
|
"first_stage_model",
|
||||||
vae_decode_only);
|
vae_decode_only);
|
||||||
@ -403,6 +419,7 @@ public:
|
|||||||
first_stage_model->get_param_tensors(tensors, "first_stage_model");
|
first_stage_model->get_param_tensors(tensors, "first_stage_model");
|
||||||
} else if (!use_tiny_autoencoder) {
|
} else if (!use_tiny_autoencoder) {
|
||||||
first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend,
|
first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend,
|
||||||
|
offload_params_to_cpu,
|
||||||
model_loader.tensor_storages_types,
|
model_loader.tensor_storages_types,
|
||||||
"first_stage_model",
|
"first_stage_model",
|
||||||
vae_decode_only,
|
vae_decode_only,
|
||||||
@ -412,6 +429,7 @@ public:
|
|||||||
first_stage_model->get_param_tensors(tensors, "first_stage_model");
|
first_stage_model->get_param_tensors(tensors, "first_stage_model");
|
||||||
} else {
|
} else {
|
||||||
tae_first_stage = std::make_shared<TinyAutoEncoder>(vae_backend,
|
tae_first_stage = std::make_shared<TinyAutoEncoder>(vae_backend,
|
||||||
|
offload_params_to_cpu,
|
||||||
model_loader.tensor_storages_types,
|
model_loader.tensor_storages_types,
|
||||||
"decoder.layers",
|
"decoder.layers",
|
||||||
vae_decode_only,
|
vae_decode_only,
|
||||||
@ -427,14 +445,26 @@ public:
|
|||||||
} else {
|
} else {
|
||||||
controlnet_backend = backend;
|
controlnet_backend = backend;
|
||||||
}
|
}
|
||||||
control_net = std::make_shared<ControlNet>(controlnet_backend, model_loader.tensor_storages_types, version);
|
control_net = std::make_shared<ControlNet>(controlnet_backend,
|
||||||
|
offload_params_to_cpu,
|
||||||
|
model_loader.tensor_storages_types,
|
||||||
|
version);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (strstr(SAFE_STR(sd_ctx_params->stacked_id_embed_dir), "v2")) {
|
         if (strstr(SAFE_STR(sd_ctx_params->stacked_id_embed_dir), "v2")) {
-            pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend, model_loader.tensor_storages_types, "pmid", version, PM_VERSION_2);
+            pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend,
+                                                               offload_params_to_cpu,
+                                                               model_loader.tensor_storages_types,
+                                                               "pmid",
+                                                               version,
+                                                               PM_VERSION_2);
             LOG_INFO("using PhotoMaker Version 2");
         } else {
-            pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend, model_loader.tensor_storages_types, "pmid", version);
+            pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend,
+                                                               offload_params_to_cpu,
+                                                               model_loader.tensor_storages_types,
+                                                               "pmid",
+                                                               version);
         }
         if (strlen(SAFE_STR(sd_ctx_params->stacked_id_embed_dir)) > 0) {
             pmid_lora = std::make_shared<LoraModel>(backend, sd_ctx_params->stacked_id_embed_dir, "");
@ -489,7 +519,7 @@ public:
         if (version == VERSION_SVD) {
             ignore_tensors.insert("conditioner.embedders.3");
         }
-        bool success = model_loader.load_tensors(tensors, backend, ignore_tensors);
+        bool success = model_loader.load_tensors(tensors, ignore_tensors);
         if (!success) {
             LOG_ERROR("load tensors from model loader failed");
             ggml_free(ctx);
@ -1354,6 +1384,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
     sd_ctx_params->wtype = SD_TYPE_COUNT;
     sd_ctx_params->rng_type = CUDA_RNG;
     sd_ctx_params->schedule = DEFAULT;
+    sd_ctx_params->offload_params_to_cpu = false;
     sd_ctx_params->keep_clip_on_cpu = false;
     sd_ctx_params->keep_control_net_on_cpu = false;
     sd_ctx_params->keep_vae_on_cpu = false;
@ -1388,6 +1419,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
                      "wtype: %s\n"
                      "rng_type: %s\n"
                      "schedule: %s\n"
+                     "offload_params_to_cpu: %s\n"
                      "keep_clip_on_cpu: %s\n"
                      "keep_control_net_on_cpu: %s\n"
                      "keep_vae_on_cpu: %s\n"
@ -1413,6 +1445,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
                      sd_type_name(sd_ctx_params->wtype),
                      sd_rng_type_name(sd_ctx_params->rng_type),
                      sd_schedule_name(sd_ctx_params->schedule),
+                     BOOL_STR(sd_ctx_params->offload_params_to_cpu),
                      BOOL_STR(sd_ctx_params->keep_clip_on_cpu),
                      BOOL_STR(sd_ctx_params->keep_control_net_on_cpu),
                      BOOL_STR(sd_ctx_params->keep_vae_on_cpu),
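For reference, a caller enables the new behavior through the field introduced above. A minimal sketch, using only sd_ctx_params_init, offload_params_to_cpu, and sd_ctx_params_to_str from this commit, and assuming the public stable-diffusion.h header; the printf call and freeing the returned string are assumptions, not shown in the diff:

    #include <cstdio>
    #include <cstdlib>
    #include "stable-diffusion.h"

    void dump_offload_setting() {
        sd_ctx_params_t params;
        sd_ctx_params_init(&params);          // defaults, including offload_params_to_cpu = false
        params.offload_params_to_cpu = true;  // ask for model weights to stay in host RAM

        char* desc = sd_ctx_params_to_str(&params);
        std::printf("%s\n", desc);            // should now list "offload_params_to_cpu: true"
        std::free(desc);                      // assumption: the caller owns the returned string
    }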
@ -130,6 +130,7 @@ typedef struct {
     enum sd_type_t wtype;
     enum rng_type_t rng_type;
     enum schedule_t schedule;
+    bool offload_params_to_cpu;
     bool keep_clip_on_cpu;
     bool keep_control_net_on_cpu;
     bool keep_vae_on_cpu;
@ -236,10 +237,13 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
 typedef struct upscaler_ctx_t upscaler_ctx_t;
 
 SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
+                                        bool offload_params_to_cpu,
                                         int n_threads);
 SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
 
-SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor);
+SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx,
+                          sd_image_t input_image,
+                          uint32_t upscale_factor);
 
 SD_API bool convert(const char* input_path,
                     const char* vae_path,
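A rough usage sketch against the updated upscaler declarations above; the ESRGAN path, thread count, and 4x factor are illustrative, and input_image is assumed to have been filled elsewhere (for example from a decoded PNG):

    sd_image_t upscale_with_offload(sd_image_t input_image, const char* esrgan_path) {
        sd_image_t result = {};
        upscaler_ctx_t* up = new_upscaler_ctx(esrgan_path,
                                              true /* offload_params_to_cpu */,
                                              8 /* n_threads */);
        if (up == NULL) {
            return result;  // new_upscaler_ctx returns NULL on failure
        }
        result = upscale(up, input_image, 4 /* upscale_factor */);
        free_upscaler_ctx(up);
        return result;
    }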
10 t5.hpp
@ -756,10 +756,11 @@ struct T5Runner : public GGMLRunner {
     std::vector<int> relative_position_bucket_vec;
 
     T5Runner(ggml_backend_t backend,
+             bool offload_params_to_cpu,
              const String2GGMLType& tensor_types,
              const std::string prefix,
              bool is_umt5 = false)
-        : GGMLRunner(backend) {
+        : GGMLRunner(backend, offload_params_to_cpu) {
         if (is_umt5) {
             params.vocab_size = 256384;
             params.relative_attention = false;
@ -900,10 +901,11 @@ struct T5Embedder {
     T5Runner model;
 
     T5Embedder(ggml_backend_t backend,
+               bool offload_params_to_cpu,
                const String2GGMLType& tensor_types = {},
                const std::string prefix = "",
                bool is_umt5 = false)
-        : model(backend, tensor_types, prefix, is_umt5), tokenizer(is_umt5) {
+        : model(backend, offload_params_to_cpu, tensor_types, prefix, is_umt5), tokenizer(is_umt5) {
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
@ -1012,13 +1014,13 @@ struct T5Embedder {
             }
         }
 
-        std::shared_ptr<T5Embedder> t5 = std::shared_ptr<T5Embedder>(new T5Embedder(backend, tensor_types, "", true));
+        std::shared_ptr<T5Embedder> t5 = std::shared_ptr<T5Embedder>(new T5Embedder(backend, false, tensor_types, "", true));
 
         t5->alloc_params_buffer();
         std::map<std::string, ggml_tensor*> tensors;
        t5->get_param_tensors(tensors, "");
 
-        bool success = model_loader.load_tensors(tensors, backend);
+        bool success = model_loader.load_tensors(tensors);
 
         if (!success) {
             LOG_ERROR("load tensors from model loader failed");
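The hunk above is t5.hpp's internal test path, which passes false for the new parameter. Enabling offloading only changes that one argument; a sketch reusing the same calls, where backend, tensor_types, and model_loader are assumed to be set up as in that test:

    // Sketch: UMT5 embedder whose parameters are kept in host RAM.
    auto t5 = std::make_shared<T5Embedder>(backend,
                                           true /* offload_params_to_cpu */,
                                           tensor_types,
                                           "",
                                           true /* is_umt5 */);
    t5->alloc_params_buffer();

    std::map<std::string, ggml_tensor*> tensors;
    t5->get_param_tensors(tensors, "");
    if (!model_loader.load_tensors(tensors)) {
        LOG_ERROR("load tensors from model loader failed");
    }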
5 tae.hpp
@ -196,13 +196,14 @@ struct TinyAutoEncoder : public GGMLRunner {
     bool decode_only = false;
 
     TinyAutoEncoder(ggml_backend_t backend,
+                    bool offload_params_to_cpu,
                     const String2GGMLType& tensor_types,
                     const std::string prefix,
                     bool decoder_only = true,
                     SDVersion version = VERSION_SD1)
         : decode_only(decoder_only),
           taesd(decoder_only, version),
-          GGMLRunner(backend) {
+          GGMLRunner(backend, offload_params_to_cpu) {
         taesd.init(params_ctx, tensor_types, prefix);
     }
 
@ -226,7 +227,7 @@ struct TinyAutoEncoder : public GGMLRunner {
             return false;
         }
 
-        bool success = model_loader.load_tensors(taesd_tensors, backend, ignore_tensors);
+        bool success = model_loader.load_tensors(taesd_tensors, ignore_tensors);
 
         if (!success) {
             LOG_ERROR("load tae tensors from model loader failed");
3 unet.hpp
@ -538,11 +538,12 @@ struct UNetModelRunner : public GGMLRunner {
     UnetModelBlock unet;
 
     UNetModelRunner(ggml_backend_t backend,
+                    bool offload_params_to_cpu,
                     const String2GGMLType& tensor_types,
                     const std::string prefix,
                     SDVersion version = VERSION_SD1,
                     bool flash_attn = false)
-        : GGMLRunner(backend), unet(version, tensor_types, flash_attn) {
+        : GGMLRunner(backend, offload_params_to_cpu), unet(version, tensor_types, flash_attn) {
         unet.init(params_ctx, tensor_types, prefix);
     }
 
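The constructor change repeats the same pattern in every runner touched by this commit: the new bool is taken right after the backend and forwarded to the GGMLRunner base, which can then place the parameter buffer on the CPU when requested. A hypothetical runner written on top of this patch would follow the same shape (MyRunner and MyBlock are made-up names; only the GGMLRunner(backend, offload_params_to_cpu) forwarding and the init(params_ctx, ...) call mirror the diff):

    // Hypothetical example of the constructor-forwarding pattern.
    struct MyRunner : public GGMLRunner {
        MyBlock block;  // placeholder for any GGMLBlock-style module

        MyRunner(ggml_backend_t backend,
                 bool offload_params_to_cpu,              // new second parameter
                 const String2GGMLType& tensor_types,
                 const std::string prefix)
            : GGMLRunner(backend, offload_params_to_cpu)  // base class decides where params live
        {
            block.init(params_ctx, tensor_types, prefix);
        }
    };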
@ -14,7 +14,8 @@ struct UpscalerGGML {
         : n_threads(n_threads) {
     }
 
-    bool load_from_file(const std::string& esrgan_path) {
+    bool load_from_file(const std::string& esrgan_path,
+                        bool offload_params_to_cpu) {
 #ifdef SD_USE_CUDA
         LOG_DEBUG("Using CUDA backend");
         backend = ggml_backend_cuda_init(0);
@ -46,7 +47,7 @@ struct UpscalerGGML {
             backend = ggml_backend_cpu_init();
         }
         LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
-        esrgan_upscaler = std::make_shared<ESRGAN>(backend, model_loader.tensor_storages_types);
+        esrgan_upscaler = std::make_shared<ESRGAN>(backend, offload_params_to_cpu, model_loader.tensor_storages_types);
         if (!esrgan_upscaler->load_from_file(esrgan_path)) {
             return false;
         }
@ -104,6 +105,7 @@ struct upscaler_ctx_t {
 };
 
 upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
+                                 bool offload_params_to_cpu,
                                  int n_threads) {
     upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t));
     if (upscaler_ctx == NULL) {
@ -116,7 +118,7 @@ upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
         return NULL;
     }
 
-    if (!upscaler_ctx->upscaler->load_from_file(esrgan_path)) {
+    if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu)) {
         delete upscaler_ctx->upscaler;
         upscaler_ctx->upscaler = NULL;
         free(upscaler_ctx);
7 vae.hpp
@ -521,8 +521,8 @@ public:
 };
 
 struct VAE : public GGMLRunner {
-    VAE(ggml_backend_t backend)
-        : GGMLRunner(backend) {}
+    VAE(ggml_backend_t backend, bool offload_params_to_cpu)
+        : GGMLRunner(backend, offload_params_to_cpu) {}
     virtual void compute(const int n_threads,
                          struct ggml_tensor* z,
                          bool decode_graph,
@ -536,12 +536,13 @@ struct AutoEncoderKL : public VAE {
     AutoencodingEngine ae;
 
     AutoEncoderKL(ggml_backend_t backend,
+                  bool offload_params_to_cpu,
                   const String2GGMLType& tensor_types,
                   const std::string prefix,
                   bool decode_only = false,
                   bool use_video_decoder = false,
                   SDVersion version = VERSION_SD1)
-        : decode_only(decode_only), ae(decode_only, use_video_decoder, version), VAE(backend) {
+        : decode_only(decode_only), ae(decode_only, use_video_decoder, version), VAE(backend, offload_params_to_cpu) {
         ae.init(params_ctx, tensor_types, prefix);
     }
 
17 wan.hpp
@ -767,10 +767,11 @@ namespace WAN {
         std::vector<FeatCache> _feat_vec_map;
 
         WanVAERunner(ggml_backend_t backend,
+                     bool offload_params_to_cpu,
                      const String2GGMLType& tensor_types = {},
                      const std::string prefix = "",
                      bool decode_only = false)
-            : decode_only(decode_only), ae(decode_only), VAE(backend) {
+            : decode_only(decode_only), ae(decode_only), VAE(backend, offload_params_to_cpu) {
             ae.init(params_ctx, tensor_types, prefix);
             rest_feat_vec_map();
         }
@ -857,7 +858,7 @@ namespace WAN {
                     feat_cache_vec.is_rep = true;
                     _feat_vec_map[feat_idx] = feat_cache_vec;
                 } else if (feat_cache != NULL) {
-                    _feat_vec_map[feat_idx] = FeatCache(backend, feat_cache);
+                    _feat_vec_map[feat_idx] = FeatCache(runtime_backend, feat_cache);
                 }
             }
             GGMLRunner::free_compute_buffer();
@ -897,7 +898,7 @@ namespace WAN {
                     feat_cache_vec.is_rep = true;
                     _feat_vec_map[feat_idx] = feat_cache_vec;
                 } else if (feat_cache != NULL) {
-                    _feat_vec_map[feat_idx] = FeatCache(backend, feat_cache);
+                    _feat_vec_map[feat_idx] = FeatCache(runtime_backend, feat_cache);
                 }
             }
 
@ -943,7 +944,7 @@ namespace WAN {
         ggml_backend_t backend = ggml_backend_cuda_init(0);
         // ggml_backend_t backend = ggml_backend_cpu_init();
         ggml_type model_data_type = GGML_TYPE_F16;
-        std::shared_ptr<WanVAERunner> vae = std::shared_ptr<WanVAERunner>(new WanVAERunner(backend));
+        std::shared_ptr<WanVAERunner> vae = std::shared_ptr<WanVAERunner>(new WanVAERunner(backend, false));
         {
             LOG_INFO("loading from '%s'", file_path.c_str());
@ -957,7 +958,7 @@ namespace WAN {
             return;
         }
 
-        bool success = model_loader.load_tensors(tensors, backend);
+        bool success = model_loader.load_tensors(tensors);
 
         if (!success) {
             LOG_ERROR("load tensors from model loader failed");
@ -1564,11 +1565,12 @@ namespace WAN {
         SDVersion version;
 
         WanRunner(ggml_backend_t backend,
+                  bool offload_params_to_cpu,
                   const String2GGMLType& tensor_types = {},
                   const std::string prefix = "",
                   SDVersion version = VERSION_WAN2,
                   bool flash_attn = false)
-            : GGMLRunner(backend) {
+            : GGMLRunner(backend, offload_params_to_cpu) {
             wan_params.flash_attn = flash_attn;
             wan_params.num_layers = 0;
             for (auto pair : tensor_types) {
@ -1747,6 +1749,7 @@ namespace WAN {
         }
 
         std::shared_ptr<WanRunner> wan = std::shared_ptr<WanRunner>(new WanRunner(backend,
+                                                                                   false,
                                                                                    tensor_types,
                                                                                    "model.diffusion_model"));
 
@ -1754,7 +1757,7 @@ namespace WAN {
         std::map<std::string, ggml_tensor*> tensors;
         wan->get_param_tensors(tensors, "model.diffusion_model");
 
-        bool success = model_loader.load_tensors(tensors, backend);
+        bool success = model_loader.load_tensors(tensors);
 
         if (!success) {
             LOG_ERROR("load tensors from model loader failed");