diff --git a/denoiser.hpp b/denoiser.hpp index 12ba8a7..3b6be75 100644 --- a/denoiser.hpp +++ b/denoiser.hpp @@ -356,7 +356,7 @@ struct Denoiser { virtual ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) = 0; virtual ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) = 0; - virtual std::vector get_sigmas(uint32_t n, scheduler_t scheduler_type, SDVersion version) { + virtual std::vector get_sigmas(uint32_t n, int /*image_seq_len*/, scheduler_t scheduler_type, SDVersion version) { auto bound_t_to_sigma = std::bind(&Denoiser::t_to_sigma, this, std::placeholders::_1); std::shared_ptr scheduler; switch (scheduler_type) { @@ -582,10 +582,14 @@ struct FluxFlowDenoiser : public Denoiser { set_parameters(shift); } - void set_parameters(float shift = 1.15f) { + void set_shift(float shift) { /* Stores the new shift only; sigmas[] is not touched here. */ this->shift = shift; - for (int i = 1; i < TIMESTEPS + 1; i++) { - sigmas[i - 1] = t_to_sigma(i / TIMESTEPS * TIMESTEPS); + } + + void set_parameters(float shift) { /* Stores shift, then fills sigmas[0..TIMESTEPS-1] with t_to_sigma(i). NOTE(review): the former default argument (1.15f) is dropped; confirm no caller invokes set_parameters() without an argument. */ + set_shift(shift); + for (int i = 0; i < TIMESTEPS; i++) { + sigmas[i] = t_to_sigma(i); } } @@ -627,6 +631,38 @@ struct FluxFlowDenoiser : public Denoiser { } }; +struct Flux2FlowDenoiser : public FluxFlowDenoiser { /* FluxFlowDenoiser variant whose get_sigmas() recomputes the shift from the step count and the image sequence length on every call. */ + Flux2FlowDenoiser() = default; + + float compute_empirical_mu(uint32_t n, int image_seq_len) { /* Returns the shift value mu. For image_seq_len > 4300 it is a2 * image_seq_len + b2, independent of n; otherwise it is the line in n passing through (10, m_10) and (200, m_200). */ + const float a1 = 8.73809524e-05f; + const float b1 = 1.89833333f; + const float a2 = 0.00016927f; + const float b2 = 0.45666666f; /* (a1, b1) and (a2, b2) are the slope and intercept of the two linear functions of image_seq_len evaluated below as m_10 and m_200. NOTE(review): the origin of these constants is not shown here; confirm against the reference implementation. */ + + if (image_seq_len > 4300) { + float mu = a2 * image_seq_len + b2; + return mu; + } + + float m_200 = a2 * image_seq_len + b2; + float m_10 = a1 * image_seq_len + b1; + + float a = (m_200 - m_10) / 190.0f; + float b = m_200 - 200.0f * a; + float mu = a * n + b; + + return mu; + } + + std::vector get_sigmas(uint32_t n, int image_seq_len, scheduler_t scheduler_type, SDVersion version) override { /* Side effect: overwrites the stored shift with mu before delegating to Denoiser::get_sigmas. NOTE(review): set_shift() does not rebuild sigmas[]; confirm nothing reads that table after this call. */ + float mu = compute_empirical_mu(n, image_seq_len); + LOG_DEBUG("Flux2FlowDenoiser: set shift to %.3f", mu); + set_shift(mu); + return 
Denoiser::get_sigmas(n, image_seq_len, scheduler_type, version); + } +}; + typedef std::function denoise_cb_t; // k diffusion reverse ODE: dx = (x - D(x;\sigma)) / \sigma dt; \sigma(t) = t diff --git a/examples/cli/README.md b/examples/cli/README.md index d0062cf..f6490ea 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -101,7 +101,7 @@ Options: -s, --seed RNG seed (default: 42, use random seed for < 0) --sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise) - --prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow] + --prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow] --lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights contain any quantized parameters, the at_runtime mode will be used; otherwise, immediately will be used.The immediately mode may have precision and diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index cb4f868..28548b4 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -1193,7 +1193,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { on_sample_method_arg}, {"", "--prediction", - "prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow]", + "prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]", on_prediction_arg}, {"", "--lora-apply-mode", diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 92b719f..a7ba034 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -802,6 +802,11 @@ public: denoiser = std::make_shared(shift); break; } + case FLUX2_FLOW_PRED: { + LOG_INFO("running in Flux2 FLOW mode"); + denoiser = std::make_shared(); + break; + } default: { LOG_ERROR("Unknown parametrization %i", sd_ctx_params->prediction); return false; @@ 
-834,7 +839,7 @@ public: shift = 3.0; } denoiser = std::make_shared(shift); - } else if (sd_version_is_flux(version) || sd_version_is_flux2(version)) { + } else if (sd_version_is_flux(version)) { LOG_INFO("running in Flux FLOW mode"); float shift = sd_ctx_params->flow_shift; if (shift == INFINITY) { @@ -844,11 +849,11 @@ public: shift = 1.15f; } } - if (sd_version_is_flux2(version)) { - shift = 2.05f; - } } denoiser = std::make_shared(shift); + } else if (sd_version_is_flux2(version)) { /* Flux2 now gets its own default-constructed denoiser instead of the fixed 2.05f shift removed above. */ + LOG_INFO("running in Flux2 FLOW mode"); + denoiser = std::make_shared(); } else if (sd_version_is_wan(version)) { LOG_INFO("running in FLOW mode"); float shift = sd_ctx_params->flow_shift; @@ -1869,6 +1874,11 @@ public: return latent_channel; } + int get_image_seq_len(int h, int w) { /* Returns (h / vae_scale_factor) * (w / vae_scale_factor) using integer division. */ + int vae_scale_factor = get_vae_scale_factor(); + return (h / vae_scale_factor) * (w / vae_scale_factor); + } + ggml_tensor* generate_init_latent(ggml_context* work_ctx, int width, int height, @@ -2361,6 +2371,7 @@ const char* prediction_to_str[] = { "edm_v", "sd3_flow", "flux_flow", + "flux2_flow", }; const char* sd_prediction_name(enum prediction_t prediction) { @@ -3131,7 +3142,10 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); int sample_steps = sd_img_gen_params->sample_params.sample_steps; - std::vector sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps, sd_img_gen_params->sample_params.scheduler, sd_ctx->sd->version); + std::vector sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps, + sd_ctx->sd->get_image_seq_len(height, width), + sd_img_gen_params->sample_params.scheduler, + sd_ctx->sd->version); ggml_tensor* init_latent = nullptr; ggml_tensor* concat_latent = nullptr; @@ -3384,7 +3398,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s if (high_noise_sample_steps > 0) { total_steps += high_noise_sample_steps; } - std::vector sigmas = 
sd_ctx->sd->denoiser->get_sigmas(total_steps, sd_vid_gen_params->sample_params.scheduler, sd_ctx->sd->version); + std::vector sigmas = sd_ctx->sd->denoiser->get_sigmas(total_steps, /* image_seq_len */ 0, sd_vid_gen_params->sample_params.scheduler, sd_ctx->sd->version); if (high_noise_sample_steps < 0) { // timesteps ∝ sigmas for Flow models (like wan2.2 a14b) diff --git a/stable-diffusion.h b/stable-diffusion.h index 505bb3c..4e3f8ea 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -71,6 +71,7 @@ enum prediction_t { EDM_V_PRED, SD3_FLOW_PRED, FLUX_FLOW_PRED, + FLUX2_FLOW_PRED, PREDICTION_COUNT };