diff --git a/diffusion_model.hpp b/diffusion_model.hpp
index c67587c..312266e 100644
--- a/diffusion_model.hpp
+++ b/diffusion_model.hpp
@@ -202,14 +202,16 @@ struct FluxModel : public DiffusionModel {
 };
 
 struct WanModel : public DiffusionModel {
+    std::string prefix;
     WAN::WanRunner wan;
 
     WanModel(ggml_backend_t backend,
              bool offload_params_to_cpu,
              const String2GGMLType& tensor_types = {},
-             SDVersion version = VERSION_FLUX,
+             const std::string prefix = "model.diffusion_model",
+             SDVersion version = VERSION_WAN2,
              bool flash_attn = false)
-        : wan(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model", version, flash_attn) {
+        : prefix(prefix), wan(backend, offload_params_to_cpu, tensor_types, prefix, version, flash_attn) {
     }
 
     std::string get_desc() {
@@ -229,7 +231,7 @@ struct WanModel : public DiffusionModel {
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
-        wan.get_param_tensors(tensors, "model.diffusion_model");
+        wan.get_param_tensors(tensors, prefix);
     }
 
     size_t get_params_buffer_size() {
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index bbbc8b1..4a96400 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -56,6 +56,7 @@ struct SDParams {
     std::string clip_vision_path;
     std::string t5xxl_path;
     std::string diffusion_model_path;
+    std::string high_noise_diffusion_model_path;
     std::string vae_path;
     std::string taesd_path;
     std::string esrgan_path;
@@ -74,49 +75,50 @@ struct SDParams {
     std::string prompt;
     std::string negative_prompt;
 
-    float cfg_scale = 7.0f;
-    float img_cfg_scale = INFINITY;
-    float guidance = 3.5f;
-    float eta = 0.f;
-    float style_ratio = 20.f;
-    int clip_skip = -1;  // <= 0 represents unspecified
-    int width = 512;
-    int height = 512;
-    int batch_count = 1;
+    float style_ratio = 20.f;
+    int clip_skip = -1;  // <= 0 represents unspecified
+    int width = 512;
+    int height = 512;
+    int batch_count = 1;
+
+    std::vector<int> skip_layers = {7, 8, 9};
+    sd_sample_params_t sample_params;
+
+    std::vector<int> high_noise_skip_layers = {7, 8, 9};
+    sd_sample_params_t high_noise_sample_params;
 
     int video_frames = 1;
     int fps = 16;
 
-    sample_method_t sample_method = EULER_A;
-    scheduler_t scheduler = DEFAULT;
-    int sample_steps = 20;
-    float strength = 0.75f;
-    float control_strength = 0.9f;
-    rng_type_t rng_type = CUDA_RNG;
-    int64_t seed = 42;
-    bool verbose = false;
-    bool vae_tiling = false;
-    bool offload_params_to_cpu = false;
-    bool control_net_cpu = false;
-    bool normalize_input = false;
-    bool clip_on_cpu = false;
-    bool vae_on_cpu = false;
-    bool diffusion_flash_attn = false;
-    bool canny_preprocess = false;
-    bool color = false;
-    int upscale_repeats = 1;
-
-    std::vector<int> skip_layers = {7, 8, 9};
-    float slg_scale = 0.f;
-    float skip_layer_start = 0.01f;
-    float skip_layer_end = 0.2f;
+    float strength = 0.75f;
+    float control_strength = 0.9f;
+    rng_type_t rng_type = CUDA_RNG;
+    int64_t seed = 42;
+    bool verbose = false;
+    bool vae_tiling = false;
+    bool offload_params_to_cpu = false;
+    bool control_net_cpu = false;
+    bool normalize_input = false;
+    bool clip_on_cpu = false;
+    bool vae_on_cpu = false;
+    bool diffusion_flash_attn = false;
+    bool canny_preprocess = false;
+    bool color = false;
+    int upscale_repeats = 1;
 
     bool chroma_use_dit_mask = true;
     bool chroma_use_t5_mask = false;
     int chroma_t5_mask_pad = 1;
+
+    SDParams() {
+        sd_sample_params_init(&sample_params);
+        sd_sample_params_init(&high_noise_sample_params);
+    }
 };
 
 void print_params(SDParams params) {
+    char* sample_params_str = sd_sample_params_to_str(&params.sample_params);
+    char* high_noise_sample_params_str = sd_sample_params_to_str(&params.high_noise_sample_params);
     printf("Option: \n");
     printf(" n_threads: %d\n", params.n_threads);
     printf(" mode: %s\n", modes_str[params.mode]);
@@ -127,6 +129,7 @@ void print_params(SDParams params) {
     printf(" clip_vision_path: %s\n", params.clip_vision_path.c_str());
     printf(" t5xxl_path: %s\n", params.t5xxl_path.c_str());
     printf(" diffusion_model_path: %s\n", params.diffusion_model_path.c_str());
+    printf(" high_noise_diffusion_model_path: %s\n", params.high_noise_diffusion_model_path.c_str());
     printf(" vae_path: %s\n", params.vae_path.c_str());
     printf(" taesd_path: %s\n", params.taesd_path.c_str());
     printf(" esrgan_path: %s\n", params.esrgan_path.c_str());
@@ -152,17 +155,11 @@ void print_params(SDParams params) {
     printf(" strength(control): %.2f\n", params.control_strength);
     printf(" prompt: %s\n", params.prompt.c_str());
     printf(" negative_prompt: %s\n", params.negative_prompt.c_str());
-    printf(" cfg_scale: %.2f\n", params.cfg_scale);
-    printf(" img_cfg_scale: %.2f\n", params.img_cfg_scale);
-    printf(" slg_scale: %.2f\n", params.slg_scale);
-    printf(" guidance: %.2f\n", params.guidance);
-    printf(" eta: %.2f\n", params.eta);
     printf(" clip_skip: %d\n", params.clip_skip);
     printf(" width: %d\n", params.width);
     printf(" height: %d\n", params.height);
-    printf(" sample_method: %s\n", sd_sample_method_name(params.sample_method));
-    printf(" scheduler: %s\n", sd_schedule_name(params.scheduler));
-    printf(" sample_steps: %d\n", params.sample_steps);
+    printf(" sample_params: %s\n", SAFE_STR(sample_params_str));
+    printf(" high_noise_sample_params: %s\n", SAFE_STR(high_noise_sample_params_str));
     printf(" strength(img2img): %.2f\n", params.strength);
     printf(" rng: %s\n", sd_rng_type_name(params.rng_type));
     printf(" seed: %ld\n", params.seed);
@@ -174,6 +171,8 @@ void print_params(SDParams params) {
     printf(" chroma_t5_mask_pad: %d\n", params.chroma_t5_mask_pad);
     printf(" video_frames: %d\n", params.video_frames);
     printf(" fps: %d\n", params.fps);
+    free(sample_params_str);
+    free(high_noise_sample_params_str);
 }
 
 void print_usage(int argc, const char* argv[]) {
@@ -186,6 +185,7 @@ void print_usage(int argc, const char* argv[]) {
     printf(" If threads <= 0, then threads will be set to the number of CPU physical cores\n");
     printf(" -m, --model [MODEL] path to full model\n");
     printf(" --diffusion-model path to the standalone diffusion model\n");
+    printf(" --high-noise-diffusion-model path to the standalone high noise diffusion model\n");
     printf(" --clip_l path to the clip-l text encoder\n");
     printf(" --clip_g path to the clip-g text encoder\n");
     printf(" --clip_vision path to the clip-vision encoder\n");
@@ -219,6 +219,23 @@ void print_usage(int argc, const char* argv[]) {
     printf(" --skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9])\n");
     printf(" --skip-layer-start START SLG enabling point: (default: 0.01)\n");
     printf(" --skip-layer-end END SLG disabling point: (default: 0.2)\n");
+    printf(" --scheduler {discrete, karras, exponential, ays, gits} Denoiser sigma scheduler (default: discrete)\n");
+    printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n");
+    printf(" sampling method (default: \"euler_a\")\n");
+    printf(" --steps STEPS number of sample steps (default: 20)\n");
+    printf(" --high-noise-cfg-scale SCALE (high noise) unconditional guidance scale: (default: 7.0)\n");
+    printf(" --high-noise-img-cfg-scale SCALE (high noise) image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)\n");
+    printf(" --high-noise-guidance SCALE (high noise) distilled guidance scale for models with guidance input (default: 3.5)\n");
+    printf(" --high-noise-slg-scale SCALE (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n");
+    printf(" 0 means disabled, a value of 2.5 is nice for sd3.5 medium\n");
+    printf(" --high-noise-eta SCALE (high noise) eta in DDIM, only for DDIM and TCD: (default: 0)\n");
+    printf(" --high-noise-skip-layers LAYERS (high noise) Layers to skip for SLG steps: (default: [7,8,9])\n");
+    printf(" --high-noise-skip-layer-start (high noise) SLG enabling point: (default: 0.01)\n");
+    printf(" --high-noise-skip-layer-end END (high noise) SLG disabling point: (default: 0.2)\n");
+    printf(" --high-noise-scheduler {discrete, karras, exponential, ays, gits} Denoiser sigma scheduler (default: discrete)\n");
+    printf(" --high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n");
+    printf(" (high noise) sampling method (default: \"euler_a\")\n");
+    printf(" --high-noise-steps STEPS (high noise) number of sample steps (default: 20)\n");
     printf(" SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])\n");
     printf(" --strength STRENGTH strength for noising/unnoising (default: 0.75)\n");
     printf(" --style-ratio STYLE-RATIO strength for keeping input identity (default: 20)\n");
@@ -226,13 +243,9 @@ void print_usage(int argc, const char* argv[]) {
     printf(" 1.0 corresponds to full destruction of information in init image\n");
     printf(" -H, --height H image height, in pixel space (default: 512)\n");
     printf(" -W, --width W image width, in pixel space (default: 512)\n");
-    printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n");
-    printf(" sampling method (default: \"euler_a\")\n");
-    printf(" --steps STEPS number of sample steps (default: 20)\n");
     printf(" --rng {std_default, cuda} RNG (default: cuda)\n");
     printf(" -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)\n");
     printf(" -b, --batch-count COUNT number of images to generate\n");
-    printf(" --scheduler {discrete, karras, exponential, ays, gits} Denoiser sigma scheduler (default: discrete)\n");
     printf(" --clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n");
     printf(" <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x\n");
     printf(" --vae-tiling process vae in tiles to reduce memory usage\n");
@@ -420,6 +433,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         {"", "--clip_vision", "", &params.clip_vision_path},
         {"", "--t5xxl", "", &params.t5xxl_path},
         {"", "--diffusion-model", "", &params.diffusion_model_path},
+        {"", "--high-noise-diffusion-model", "", &params.high_noise_diffusion_model_path},
         {"", "--vae", "", &params.vae_path},
         {"", "--taesd", "", &params.taesd_path},
        {"", "--control-net", "", &params.control_net_path},
@@ -443,7 +457,8 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         {"", "--upscale-repeats", "", &params.upscale_repeats},
         {"-H", "--height", "", &params.height},
         {"-W", "--width", "", &params.width},
-        {"", "--steps", "", &params.sample_steps},
+        {"", "--steps", "", &params.sample_params.sample_steps},
+        {"", "--high-noise-steps", "", &params.high_noise_sample_params.sample_steps},
         {"", "--clip-skip", "", &params.clip_skip},
         {"-b", "--batch-count", "", &params.batch_count},
         {"", "--chroma-t5-mask-pad", "", &params.chroma_t5_mask_pad},
@@ -452,17 +467,23 @@ void parse_args(int argc, const char** argv, SDParams& params) {
     };
 
     options.float_options = {
-        {"", "--cfg-scale", "", &params.cfg_scale},
-        {"", "--img-cfg-scale", "", &params.img_cfg_scale},
-        {"", "--guidance", "", &params.guidance},
-        {"", "--eta", "", &params.eta},
+        {"", "--cfg-scale", "", &params.sample_params.guidance.txt_cfg},
+        {"", "--img-cfg-scale", "", &params.sample_params.guidance.img_cfg},
+        {"", "--guidance", "", &params.sample_params.guidance.distilled_guidance},
+        {"", "--slg-scale", "", &params.sample_params.guidance.slg.scale},
+        {"", "--skip-layer-start", "", &params.sample_params.guidance.slg.layer_start},
+        {"", "--skip-layer-end", "", &params.sample_params.guidance.slg.layer_end},
+        {"", "--eta", "", &params.sample_params.eta},
+        {"", "--high-noise-cfg-scale", "", &params.high_noise_sample_params.guidance.txt_cfg},
+        {"", "--high-noise-img-cfg-scale", "", &params.high_noise_sample_params.guidance.img_cfg},
+        {"", "--high-noise-guidance", "", &params.high_noise_sample_params.guidance.distilled_guidance},
+        {"", "--high-noise-slg-scale", "", &params.high_noise_sample_params.guidance.slg.scale},
+        {"", "--high-noise-skip-layer-start", "", &params.high_noise_sample_params.guidance.slg.layer_start},
+        {"", "--high-noise-skip-layer-end", "", &params.high_noise_sample_params.guidance.slg.layer_end},
+        {"", "--high-noise-eta", "", &params.high_noise_sample_params.eta},
         {"", "--strength", "", &params.strength},
         {"", "--style-ratio", "", &params.style_ratio},
         {"", "--control-strength", "", &params.control_strength},
-        {"", "--slg-scale", "", &params.slg_scale},
-        {"", "--skip-layer-start", "", &params.skip_layer_start},
-        {"", "--skip-layer-end", "", &params.skip_layer_end},
-
     };
 
     options.bool_options = {
@@ -535,9 +556,9 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         if (++index >= argc) {
             return -1;
         }
-        const char* arg = argv[index];
-        params.scheduler = str_to_schedule(arg);
-        if (params.scheduler == SCHEDULE_COUNT) {
+        const char* arg = argv[index];
+        params.sample_params.scheduler = str_to_schedule(arg);
+        if (params.sample_params.scheduler == SCHEDULE_COUNT) {
            fprintf(stderr, "error: invalid scheduler %s\n",
                    arg);
            return -1;
@@ -545,13 +566,27 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         return 1;
     };
 
+    auto on_high_noise_schedule_arg = [&](int argc, const char** argv, int index) {
+        if (++index >= argc) {
+            return -1;
+        }
+        const char* arg = argv[index];
+        params.high_noise_sample_params.scheduler = str_to_schedule(arg);
+        if (params.high_noise_sample_params.scheduler == SCHEDULE_COUNT) {
+            fprintf(stderr, "error: invalid high noise scheduler %s\n",
+                    arg);
+            return -1;
+        }
+        return 1;
+    };
+
     auto on_sample_method_arg = [&](int argc, const char** argv, int index) {
         if (++index >= argc) {
             return -1;
         }
-        const char* arg = argv[index];
-        params.sample_method = str_to_sample_method(arg);
-        if (params.sample_method == SAMPLE_METHOD_COUNT) {
+        const char* arg = argv[index];
+        params.sample_params.sample_method = str_to_sample_method(arg);
+        if (params.sample_params.sample_method == SAMPLE_METHOD_COUNT) {
            fprintf(stderr, "error: invalid sample method %s\n",
                    arg);
            return -1;
@@ -559,6 +594,20 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         return 1;
     };
 
+    auto on_high_noise_sample_method_arg = [&](int argc, const char** argv, int index) {
+        if (++index >= argc) {
+            return -1;
+        }
+        const char* arg = argv[index];
+        params.high_noise_sample_params.sample_method = str_to_sample_method(arg);
+        if (params.high_noise_sample_params.sample_method == SAMPLE_METHOD_COUNT) {
+            fprintf(stderr, "error: invalid high noise sample method %s\n",
+                    arg);
+            return -1;
+        }
+        return 1;
+    };
+
     auto on_seed_arg = [&](int argc, const char** argv, int index) {
         if (++index >= argc) {
             return -1;
@@ -600,6 +649,33 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         return 1;
     };
 
+    auto on_high_noise_skip_layers_arg = [&](int argc, const char** argv, int index) {
+        if (++index >= argc) {
+            return -1;
+        }
+        std::string layers_str = argv[index];
+        if (layers_str[0] != '[' || layers_str[layers_str.size() - 1] != ']') {
+            return -1;
+        }
+
+        layers_str = layers_str.substr(1, layers_str.size() - 2);
+
+        std::regex regex("[, ]+");
+        std::sregex_token_iterator iter(layers_str.begin(), layers_str.end(), regex, -1);
+        std::sregex_token_iterator end;
+        std::vector<std::string> tokens(iter, end);
+        std::vector<int> layers;
+        for (const auto& token : tokens) {
+            try {
+                layers.push_back(std::stoi(token));
+            } catch (const std::invalid_argument& e) {
+                return -1;
+            }
+        }
+        params.high_noise_skip_layers = layers;
+        return 1;
+    };
+
     auto on_ref_image_arg = [&](int argc, const char** argv, int index) {
         if (++index >= argc) {
             return -1;
@@ -616,6 +692,9 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         {"", "--sampling-method", "", on_sample_method_arg},
         {"", "--scheduler", "", on_schedule_arg},
         {"", "--skip-layers", "", on_skip_layers_arg},
+        {"", "--high-noise-sampling-method", "", on_high_noise_sample_method_arg},
+        {"", "--high-noise-scheduler", "", on_high_noise_schedule_arg},
+        {"", "--high-noise-skip-layers", "", on_high_noise_skip_layers_arg},
         {"-r", "--ref-image", "", on_ref_image_arg},
         {"-h", "--help", "", on_help_arg},
     };
@@ -657,11 +736,16 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         exit(1);
     }
 
-    if (params.sample_steps <= 0) {
+    if (params.sample_params.sample_steps <= 0) {
         fprintf(stderr, "error: the sample_steps must be greater than 0\n");
         exit(1);
     }
 
+    if (params.high_noise_sample_params.sample_steps <= 0) {
+        fprintf(stderr, "error: the high_noise_sample_steps must be greater than 0\n");
+        exit(1);
+    }
+
     if (params.strength < 0.f || params.strength > 1.f) {
         fprintf(stderr, "error: can only work with strength in [0.0, 1.0]\n");
         exit(1);
@@ -697,8 +781,12 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         }
     }
 
-    if (!isfinite(params.img_cfg_scale)) {
-        params.img_cfg_scale = params.cfg_scale;
+    if (!isfinite(params.sample_params.guidance.img_cfg)) {
+        params.sample_params.guidance.img_cfg = params.sample_params.guidance.txt_cfg;
+    }
+
+    if (!isfinite(params.high_noise_sample_params.guidance.img_cfg)) {
+        params.high_noise_sample_params.guidance.img_cfg = params.high_noise_sample_params.guidance.txt_cfg;
     }
 }
 
@@ -719,27 +807,27 @@ std::string get_image_params(SDParams params, int64_t seed) {
     if (params.negative_prompt.size() != 0) {
         parameter_string += "Negative prompt: " + params.negative_prompt + "\n";
     }
-    parameter_string += "Steps: " + std::to_string(params.sample_steps) + ", ";
-    parameter_string += "CFG scale: " + std::to_string(params.cfg_scale) + ", ";
-    if (params.slg_scale != 0 && params.skip_layers.size() != 0) {
-        parameter_string += "SLG scale: " + std::to_string(params.cfg_scale) + ", ";
+    parameter_string += "Steps: " + std::to_string(params.sample_params.sample_steps) + ", ";
+    parameter_string += "CFG scale: " + std::to_string(params.sample_params.guidance.txt_cfg) + ", ";
+    if (params.sample_params.guidance.slg.scale != 0 && params.skip_layers.size() != 0) {
+        parameter_string += "SLG scale: " + std::to_string(params.sample_params.guidance.txt_cfg) + ", ";
         parameter_string += "Skip layers: [";
         for (const auto& layer : params.skip_layers) {
             parameter_string += std::to_string(layer) + ", ";
         }
         parameter_string += "], ";
-        parameter_string += "Skip layer start: " + std::to_string(params.skip_layer_start) + ", ";
-        parameter_string += "Skip layer end: " + std::to_string(params.skip_layer_end) + ", ";
+        parameter_string += "Skip layer start: " + std::to_string(params.sample_params.guidance.slg.layer_start) + ", ";
+        parameter_string += "Skip layer end: " + std::to_string(params.sample_params.guidance.slg.layer_end) + ", ";
     }
-    parameter_string += "Guidance: " + std::to_string(params.guidance) + ", ";
-    parameter_string += "Eta: " + std::to_string(params.eta) + ", ";
+    parameter_string += "Guidance: " + std::to_string(params.sample_params.guidance.distilled_guidance) + ", ";
+    parameter_string += "Eta: " + std::to_string(params.sample_params.eta) + ", ";
     parameter_string += "Seed: " + std::to_string(seed) + ", ";
     parameter_string += "Size: " + std::to_string(params.width) + "x" + std::to_string(params.height) + ", ";
     parameter_string += "Model: " + sd_basename(params.model_path) + ", ";
     parameter_string += "RNG: " + std::string(sd_rng_type_name(params.rng_type)) + ", ";
-    parameter_string += "Sampler: " + std::string(sd_sample_method_name(params.sample_method));
-    if (params.scheduler != DEFAULT) {
-        parameter_string += " " + std::string(sd_schedule_name(params.scheduler));
+    parameter_string += "Sampler: " + std::string(sd_sample_method_name(params.sample_params.sample_method));
+    if (params.sample_params.scheduler != DEFAULT) {
+        parameter_string += " " + std::string(sd_schedule_name(params.sample_params.scheduler));
     }
     parameter_string += ", ";
     for (const auto& te : {params.clip_l_path, params.clip_g_path, params.t5xxl_path}) {
@@ -806,23 +894,10 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) {
 int main(int argc, const char* argv[]) {
     SDParams params;
     parse_args(argc, argv, params);
-    sd_guidance_params_t guidance_params = {params.cfg_scale,
-                                            params.img_cfg_scale,
-                                            params.guidance,
-                                            {
-                                                params.skip_layers.data(),
-                                                params.skip_layers.size(),
-                                                params.skip_layer_start,
-                                                params.skip_layer_end,
-                                                params.slg_scale,
-                                            }};
-    sd_sample_params_t sample_params = {
-        guidance_params,
-        params.scheduler,
-        params.sample_method,
-        params.sample_steps,
-        params.eta,
-    };
+    params.sample_params.guidance.slg.layers = params.skip_layers.data();
+    params.sample_params.guidance.slg.layer_count = params.skip_layers.size();
+    params.high_noise_sample_params.guidance.slg.layers = params.high_noise_skip_layers.data();
+    params.high_noise_sample_params.guidance.slg.layer_count = params.high_noise_skip_layers.size();
 
     sd_set_log_callback(sd_log_cb, (void*)&params);
 
@@ -983,6 +1058,7 @@ int main(int argc, const char* argv[]) {
         params.clip_vision_path.c_str(),
         params.t5xxl_path.c_str(),
         params.diffusion_model_path.c_str(),
+        params.high_noise_diffusion_model_path.c_str(),
         params.vae_path.c_str(),
         params.taesd_path.c_str(),
         params.control_net_path.c_str(),
@@ -1066,7 +1142,7 @@ int main(int argc, const char* argv[]) {
                                       mask_image,
                                       params.width,
                                       params.height,
-                                      sample_params,
+                                      params.sample_params,
                                       params.strength,
                                       params.seed,
                                       params.batch_count,
@@ -1087,7 +1163,8 @@ int main(int argc, const char* argv[]) {
                                      input_image,
                                      params.width,
                                      params.height,
-                                     sample_params,
+                                     params.sample_params,
+                                     params.high_noise_sample_params,
                                      params.strength,
                                      params.seed,
                                      params.video_frames,
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 64b57f7..c89f243 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -96,6 +96,7 @@ public:
     std::shared_ptr<Conditioner> cond_stage_model;
     std::shared_ptr<FrozenCLIPVisionEmbedder> clip_vision;  // for svd or wan2.1 i2v
     std::shared_ptr<DiffusionModel> diffusion_model;
+    std::shared_ptr<DiffusionModel> high_noise_diffusion_model;
     std::shared_ptr<AutoEncoderKL> first_stage_model;
     std::shared_ptr<TinyAutoEncoder> tae_first_stage;
     std::shared_ptr<ControlNet> control_net;
@@ -207,6 +208,13 @@ public:
             }
         }
 
+        if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) {
+            LOG_INFO("loading high noise diffusion model from '%s'", sd_ctx_params->high_noise_diffusion_model_path);
+            if (!model_loader.init_from_file(sd_ctx_params->high_noise_diffusion_model_path, "model.high_noise_diffusion_model.")) {
+                LOG_WARN("loading diffusion model from '%s' failed", sd_ctx_params->high_noise_diffusion_model_path);
+            }
+        }
+
         bool is_unet = model_loader.model_is_unet();
 
         if (strlen(SAFE_STR(sd_ctx_params->clip_l_path)) > 0) {
@@ -380,8 +388,17 @@ public:
             diffusion_model = std::make_shared<WanModel>(backend,
                                                          offload_params_to_cpu,
                                                          model_loader.tensor_storages_types,
+                                                         "model.diffusion_model",
                                                          version,
                                                          sd_ctx_params->diffusion_flash_attn);
+            if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) {
+                high_noise_diffusion_model = std::make_shared<WanModel>(backend,
+                                                                        offload_params_to_cpu,
+                                                                        model_loader.tensor_storages_types,
+                                                                        "model.high_noise_diffusion_model",
+                                                                        version,
+                                                                        sd_ctx_params->diffusion_flash_attn);
+            }
             if (diffusion_model->get_desc() == "Wan2.1-I2V-14B") {
                 clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(backend,
                                                                          offload_params_to_cpu,
@@ -417,6 +434,11 @@ public:
         diffusion_model->alloc_params_buffer();
         diffusion_model->get_param_tensors(tensors);
 
+        if (high_noise_diffusion_model) {
+            high_noise_diffusion_model->alloc_params_buffer();
+            high_noise_diffusion_model->get_param_tensors(tensors);
+        }
+
         if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu(backend)) {
             LOG_INFO("VAE Autoencoder: Using CPU backend");
             vae_backend = ggml_backend_cpu_init();
@@ -546,7 +568,10 @@ public:
         {
             size_t clip_params_mem_size = cond_stage_model->get_params_buffer_size();
             size_t unet_params_mem_size = diffusion_model->get_params_buffer_size();
-            size_t vae_params_mem_size = 0;
+            if (high_noise_diffusion_model) {
+                unet_params_mem_size += high_noise_diffusion_model->get_params_buffer_size();
+            }
+            size_t vae_params_mem_size = 0;
             if (!use_tiny_autoencoder) {
                 vae_params_mem_size = first_stage_model->get_params_buffer_size();
             } else {
@@ -923,6 +948,8 @@ public:
     }
 
     ggml_tensor* sample(ggml_context* work_ctx,
+                        std::shared_ptr<DiffusionModel> work_diffusion_model,
+                        bool inverse_noise_scaling,
                         ggml_tensor* init_latent,
                         ggml_tensor* noise,
                         SDCondition cond,
@@ -952,7 +979,10 @@ public:
         size_t steps = sigmas.size() - 1;
         struct ggml_tensor* x = ggml_dup_tensor(work_ctx, init_latent);
         copy_ggml_tensor(x, init_latent);
-        x = denoiser->noise_scaling(sigmas[0], noise, x);
+
+        if (noise) {
+            x = denoiser->noise_scaling(sigmas[0], noise, x);
+        }
 
         struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, x);
 
@@ -1015,31 +1045,31 @@
 
             if (start_merge_step == -1 || step <= start_merge_step) {
                 // cond
-                diffusion_model->compute(n_threads,
-                                         noised_input,
-                                         timesteps,
-                                         cond.c_crossattn,
-                                         cond.c_concat,
-                                         cond.c_vector,
-                                         guidance_tensor,
-                                         ref_latents,
-                                         -1,
-                                         controls,
-                                         control_strength,
-                                         &out_cond);
+                work_diffusion_model->compute(n_threads,
+                                              noised_input,
+                                              timesteps,
+                                              cond.c_crossattn,
+                                              cond.c_concat,
+                                              cond.c_vector,
+                                              guidance_tensor,
+                                              ref_latents,
+                                              -1,
+                                              controls,
+                                              control_strength,
+                                              &out_cond);
             } else {
-                diffusion_model->compute(n_threads,
-                                         noised_input,
-                                         timesteps,
-                                         id_cond.c_crossattn,
-                                         cond.c_concat,
-                                         id_cond.c_vector,
-                                         guidance_tensor,
-                                         ref_latents,
-                                         -1,
-                                         controls,
-                                         control_strength,
-                                         &out_cond);
+                work_diffusion_model->compute(n_threads,
+                                              noised_input,
+                                              timesteps,
+                                              id_cond.c_crossattn,
+                                              cond.c_concat,
+                                              id_cond.c_vector,
+                                              guidance_tensor,
+                                              ref_latents,
+                                              -1,
+                                              controls,
+                                              control_strength,
+                                              &out_cond);
             }
 
             float* negative_data = NULL;
@@ -1049,35 +1079,35 @@
                     control_net->compute(n_threads, noised_input, control_hint, timesteps, uncond.c_crossattn, uncond.c_vector);
                     controls = control_net->controls;
                 }
-                diffusion_model->compute(n_threads,
-                                         noised_input,
-                                         timesteps,
-                                         uncond.c_crossattn,
-                                         uncond.c_concat,
-                                         uncond.c_vector,
-                                         guidance_tensor,
-                                         ref_latents,
-                                         -1,
-                                         controls,
-                                         control_strength,
-                                         &out_uncond);
+                work_diffusion_model->compute(n_threads,
+                                              noised_input,
+                                              timesteps,
+                                              uncond.c_crossattn,
+                                              uncond.c_concat,
+                                              uncond.c_vector,
+                                              guidance_tensor,
+                                              ref_latents,
+                                              -1,
+                                              controls,
+                                              control_strength,
+                                              &out_uncond);
                 negative_data = (float*)out_uncond->data;
             }
 
             float* img_cond_data = NULL;
             if (has_img_cond) {
-                diffusion_model->compute(n_threads,
-                                         noised_input,
-                                         timesteps,
-                                         img_cond.c_crossattn,
-                                         img_cond.c_concat,
-                                         img_cond.c_vector,
-                                         guidance_tensor,
-                                         ref_latents,
-                                         -1,
-                                         controls,
-                                         control_strength,
-                                         &out_img_cond);
+                work_diffusion_model->compute(n_threads,
+                                              noised_input,
+                                              timesteps,
+                                              img_cond.c_crossattn,
+                                              img_cond.c_concat,
+                                              img_cond.c_vector,
+                                              guidance_tensor,
+                                              ref_latents,
+                                              -1,
+                                              controls,
+                                              control_strength,
+                                              &out_img_cond);
                 img_cond_data = (float*)out_img_cond->data;
             }
@@ -1087,20 +1117,20 @@ public:
             if (is_skiplayer_step) {
                 LOG_DEBUG("Skipping layers at step %d\n", step);
                 // skip layer (same as conditionned)
-                diffusion_model->compute(n_threads,
-                                         noised_input,
-                                         timesteps,
-                                         cond.c_crossattn,
-                                         cond.c_concat,
-                                         cond.c_vector,
-                                         guidance_tensor,
-                                         ref_latents,
-                                         -1,
-                                         controls,
-                                         control_strength,
-                                         &out_skip,
-                                         NULL,
-                                         skip_layers);
+                work_diffusion_model->compute(n_threads,
+                                              noised_input,
+                                              timesteps,
+                                              cond.c_crossattn,
+                                              cond.c_concat,
+                                              cond.c_vector,
+                                              guidance_tensor,
+                                              ref_latents,
+                                              -1,
+                                              controls,
+                                              control_strength,
+                                              &out_skip,
+                                              NULL,
+                                              skip_layers);
                 skip_layer_data = (float*)out_skip->data;
             }
             float* vec_denoised = (float*)denoised->data;
@@ -1152,13 +1182,15 @@
 
         sample_k_diffusion(method, denoise, work_ctx, x, sigmas, rng, eta);
 
-        x = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x);
+        if (inverse_noise_scaling) {
+            x = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x);
+        }
 
         if (control_net) {
            control_net->free_control_ctx();
            control_net->free_compute_buffer();
        }
-        diffusion_model->free_compute_buffer();
+        work_diffusion_model->free_compute_buffer();
        return x;
    }
@@ -1446,6 +1478,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
                      "clip_vision_path: %s\n"
                      "t5xxl_path: %s\n"
                      "diffusion_model_path: %s\n"
+                     "high_noise_diffusion_model_path: %s\n"
                      "vae_path: %s\n"
                      "taesd_path: %s\n"
                      "control_net_path: %s\n"
@@ -1472,6 +1505,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
                      SAFE_STR(sd_ctx_params->clip_vision_path),
                      SAFE_STR(sd_ctx_params->t5xxl_path),
                      SAFE_STR(sd_ctx_params->diffusion_model_path),
+                     SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path),
                      SAFE_STR(sd_ctx_params->vae_path),
                      SAFE_STR(sd_ctx_params->taesd_path),
                      SAFE_STR(sd_ctx_params->control_net_path),
@@ -1602,6 +1636,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
 void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) {
     memset((void*)sd_vid_gen_params, 0, sizeof(sd_vid_gen_params_t));
     sd_sample_params_init(&sd_vid_gen_params->sample_params);
+    sd_sample_params_init(&sd_vid_gen_params->high_noise_sample_params);
     sd_vid_gen_params->width = 512;
     sd_vid_gen_params->height = 512;
     sd_vid_gen_params->strength = 0.75f;
@@ -1902,6 +1937,8 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
         }
 
         struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx,
+                                                     sd_ctx->sd->diffusion_model,
+                                                     true,
                                                      x_t,
                                                      noise,
                                                      cond,
@@ -2243,7 +2280,13 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params) {
 
     sd_ctx->sd->init_scheduler(sd_vid_gen_params->sample_params.scheduler);
 
-    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps);
+    int high_noise_sample_steps = 0;
+    if (sd_ctx->sd->high_noise_diffusion_model) {
+        sd_ctx->sd->init_scheduler(sd_vid_gen_params->high_noise_sample_params.scheduler);
+        high_noise_sample_steps = sd_vid_gen_params->high_noise_sample_params.sample_steps;
+    }
+
+    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps + high_noise_sample_steps);
 
     struct ggml_init_params params;
     params.mem_size = static_cast<size_t>(100 * 1024) * 1024;  // 100 MB
@@ -2331,7 +2374,6 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params) {
     }
 
     ggml_tensor* init_latent = generate_init_latent(sd_ctx, work_ctx, width, height, frames, true);
-    sample_steps = sigmas.size() - 1;
 
     // Get learned condition
     bool zero_out_masked = true;
@@ -2347,7 +2389,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params) {
     cond.c_concat = concat_latent;
     cond.c_vector = clip_vision_output;
     SDCondition uncond;
-    if (sd_vid_gen_params->sample_params.guidance.txt_cfg != 1.0) {
+    if (sd_vid_gen_params->sample_params.guidance.txt_cfg != 1.0 || sd_vid_gen_params->high_noise_sample_params.guidance.txt_cfg != 1.0) {
         uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
                                                                      sd_ctx->sd->n_threads,
                                                                      negative_prompt,
@@ -2372,15 +2414,50 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params) {
     int C = 16;
 
     struct ggml_tensor* final_latent;
+    struct ggml_tensor* x_t = init_latent;
+    struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, T, C);
+    ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng);
+    // High Noise Sample
+    if (high_noise_sample_steps > 0) {
+        LOG_DEBUG("sample(high noise) %dx%dx%d", W, H, T);
+        int64_t sampling_start = ggml_time_ms();
+
+        std::vector<float> high_noise_sigmas = std::vector<float>(sigmas.begin(), sigmas.begin() + high_noise_sample_steps + 1);
+        sigmas = std::vector<float>(sigmas.begin() + high_noise_sample_steps, sigmas.end());
+
+        x_t = sd_ctx->sd->sample(work_ctx,
+                                 sd_ctx->sd->high_noise_diffusion_model,
+                                 false,
+                                 x_t,
+                                 noise,
+                                 cond,
+                                 uncond,
+                                 {},
+                                 NULL,
+                                 0,
+                                 sd_vid_gen_params->high_noise_sample_params.guidance,
+                                 sd_vid_gen_params->high_noise_sample_params.eta,
+                                 sd_vid_gen_params->high_noise_sample_params.sample_method,
+                                 high_noise_sigmas,
+                                 -1,
+                                 {});
+
+        int64_t sampling_end = ggml_time_ms();
+        LOG_INFO("sampling(high noise) completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
+        if (sd_ctx->sd->free_params_immediately) {
+            sd_ctx->sd->high_noise_diffusion_model->free_params_buffer();
+        }
+        noise = NULL;
+    }
+
     // Sample
     {
         LOG_DEBUG("sample %dx%dx%d", W, H, T);
-        int64_t sampling_start = ggml_time_ms();
-        struct ggml_tensor* x_t = init_latent;
-        struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, T, C);
-        ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng);
+        int64_t sampling_start = ggml_time_ms();
 
         final_latent = sd_ctx->sd->sample(work_ctx,
+                                          sd_ctx->sd->diffusion_model,
+                                          true,
                                           x_t,
                                           noise,
                                           cond,
@@ -2397,10 +2474,9 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params) {
 
         int64_t sampling_end = ggml_time_ms();
         LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
-    }
-
-    if (sd_ctx->sd->free_params_immediately) {
-        sd_ctx->sd->diffusion_model->free_params_buffer();
+        if (sd_ctx->sd->free_params_immediately) {
+            sd_ctx->sd->diffusion_model->free_params_buffer();
+        }
     }
 
     int64_t t4 = ggml_time_ms();
diff --git a/stable-diffusion.h b/stable-diffusion.h
index 63c5265..32c0a94 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -118,6 +118,7 @@ typedef struct {
     const char* clip_vision_path;
     const char* t5xxl_path;
     const char* diffusion_model_path;
+    const char* high_noise_diffusion_model_path;
     const char* vae_path;
     const char* taesd_path;
     const char* control_net_path;
@@ -199,6 +200,7 @@ typedef struct {
     int width;
     int height;
     sd_sample_params_t sample_params;
+    sd_sample_params_t high_noise_sample_params;
     float strength;
     int64_t seed;
     int video_frames;
@@ -229,6 +231,9 @@ SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);
 SD_API sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params);
 SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
 
+SD_API void sd_sample_params_init(sd_sample_params_t* sample_params);
+SD_API char* sd_sample_params_to_str(const sd_sample_params_t* sample_params);
+
 SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params);
 SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params);
 SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params);
diff --git a/wan.hpp b/wan.hpp
index 40d7cbc..f198396 100644
--- a/wan.hpp
+++ b/wan.hpp
@@ -1579,7 +1579,7 @@ namespace WAN {
             wan_params.num_layers = 0;
             for (auto pair : tensor_types) {
                 std::string tensor_name = pair.first;
-                if (tensor_name.find("model.diffusion_model.") == std::string::npos)
+                if (tensor_name.find(prefix) == std::string::npos)
                     continue;
                 size_t pos = tensor_name.find("blocks.");
                 if (pos != std::string::npos) {