2026-05-09 08:48:51 +00:00
8 changed files with 81 additions and 51 deletions
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@ -44,6 +44,7 @@ Context Options:
                                           CPU physical cores
  --chroma-t5-mask-pad <int>               t5 mask pad size of chroma
  --vae-tile-overlap <float>               tile overlap for vae tiling, in fraction of tile size (default: 0.5)
  --flow-shift <float>                     shift value for Flow models like SD3.x or WAN (default: auto)
  --vae-tiling                             process vae in tiles to reduce memory usage
  --force-sdxl-vae-conv-scale              force use of conv scale on sdxl vae
  --offload-to-cpu                         place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
@ -108,7 +109,6 @@ Generation Options:
  --skip-layer-start <float>               SLG enabling point (default: 0.01)
  --skip-layer-end <float>                 SLG disabling point (default: 0.2)
  --eta <float>                            eta in DDIM, only for DDIM/TCD/res_multistep/res_2s (default: 0)
  --flow-shift <float>                     shift value for Flow models like SD3.x or WAN (default: auto)
  --high-noise-cfg-scale <float>           (high noise) unconditional guidance scale: (default: 7.0)
  --high-noise-img-cfg-scale <float>       (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
  --high-noise-guidance <float>            (high noise) distilled guidance scale for models with guidance input (default: 3.5)
--- a/examples/common/common.hpp
+++ b/examples/common/common.hpp
@ -581,6 +581,10 @@ struct SDContextParams {
             "--vae-tile-overlap",
             "tile overlap for vae tiling, in fraction of tile size (default: 0.5)",
             &vae_tiling_params.target_overlap},
            {"",
             "--flow-shift",
             "shift value for Flow models like SD3.x or WAN (default: auto)",
             &flow_shift},
        };
        options.bool_options = {
@ -899,6 +903,7 @@ struct SDContextParams {
            << "  photo_maker_path: \"" << photo_maker_path << "\",\n"
            << "  rng_type: " << sd_rng_type_name(rng_type) << ",\n"
            << "  sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n"
            << "  flow_shift: " << (std::isinf(flow_shift) ? "INF" : std::to_string(flow_shift)) << "\n"
            << "  offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n"
            << "  enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n"
            << "  control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n"
@ -981,6 +986,7 @@ struct SDContextParams {
            chroma_use_t5_mask,
            chroma_t5_mask_pad,
            qwen_image_zero_cond_t,
            flow_shift,
        };
        return sd_ctx_params;
    }
@ -1200,10 +1206,6 @@ struct SDGenerationParams {
             "--eta",
             "eta in DDIM, only for DDIM and TCD (default: 0)",
             &sample_params.eta},
            {"",
             "--flow-shift",
             "shift value for Flow models like SD3.x or WAN (default: auto)",
             &sample_params.flow_shift},
            {"",
             "--high-noise-cfg-scale",
             "(high noise) unconditional guidance scale: (default: 7.0)",
@ -1604,7 +1606,6 @@ struct SDGenerationParams {
        load_if_exists("cfg_scale", sample_params.guidance.txt_cfg);
        load_if_exists("img_cfg_scale", sample_params.guidance.img_cfg);
        load_if_exists("guidance", sample_params.guidance.distilled_guidance);
        load_if_exists("flow_shift", sample_params.flow_shift);
        auto load_sampler_if_exists = [&](const char* key, enum sample_method_t& out) {
            if (j.contains(key) && j[key].is_string()) {
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -36,6 +36,7 @@ Context Options:
                                           CPU physical cores
  --chroma-t5-mask-pad <int>               t5 mask pad size of chroma
  --vae-tile-overlap <float>               tile overlap for vae tiling, in fraction of tile size (default: 0.5)
  --flow-shift <float>                     shift value for Flow models like SD3.x or WAN (default: auto)
  --vae-tiling                             process vae in tiles to reduce memory usage
  --force-sdxl-vae-conv-scale              force use of conv scale on sdxl vae
  --offload-to-cpu                         place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
@ -100,7 +101,6 @@ Default Generation Options:
  --skip-layer-start <float>               SLG enabling point (default: 0.01)
  --skip-layer-end <float>                 SLG disabling point (default: 0.2)
  --eta <float>                            eta in DDIM, only for DDIM/TCD/res_multistep/res_2s (default: 0)
  --flow-shift <float>                     shift value for Flow models like SD3.x or WAN (default: auto)
  --high-noise-cfg-scale <float>           (high noise) unconditional guidance scale: (default: 7.0)
  --high-noise-img-cfg-scale <float>       (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
  --high-noise-guidance <float>            (high noise) distilled guidance scale for models with guidance input (default: 3.5)
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@ -201,6 +201,7 @@ typedef struct {
    bool chroma_use_t5_mask;
    int chroma_t5_mask_pad;
    bool qwen_image_zero_cond_t;
    float flow_shift;
 } sd_ctx_params_t;
 typedef struct {
@ -234,7 +235,6 @@ typedef struct {
    int shifted_timestep;
    float* custom_sigmas;
    int custom_sigmas_count;
    float flow_shift;
 } sd_sample_params_t;
 typedef struct {
--- a/src/denoiser.hpp
+++ b/src/denoiser.hpp
@ -657,8 +657,9 @@ struct DiscreteFlowDenoiser : public Denoiser {
    float sigma_data = 1.0f;
-    DiscreteFlowDenoiser(float shift = 3.0f) {
+    DiscreteFlowDenoiser(float shift = 3.0f)
-        set_shift(shift);
+        : shift(shift) {
        set_parameters();
    }
    void set_parameters() {
@ -667,11 +668,6 @@ struct DiscreteFlowDenoiser : public Denoiser {
        }
    }
    // Replaces the stored shift with `shift` and then re-runs set_parameters().
    void set_shift(float shift) {
        this->shift = shift;  // `this->` is needed: the parameter shadows the member
        set_parameters();
    }
    // Smallest sigma: the first entry of the `sigmas` table.
    float sigma_min() override {
        return sigmas[0];
    }
@ -714,8 +710,34 @@ float flux_time_shift(float mu, float sigma, float t) {
    return ::expf(mu) / (::expf(mu) + ::powf((1.0f / t - 1.0f), sigma));
 }
-struct FluxFlowDenoiser : public DiscreteFlowDenoiser {
+struct FluxFlowDenoiser : public Denoiser {
-    FluxFlowDenoiser() = default;
+    float sigmas[TIMESTEPS];
    float shift = 1.15f;
    float sigma_data = 1.0f;
    // Constructs the denoiser for the given shift (1.15 when omitted) by
    // delegating to set_parameters(), which stores the shift and fills `sigmas`.
    // NOTE(review): set_parameters() calls t_to_sigma() while the object is still
    // being constructed; if a subclass overrides t_to_sigma(), that override is
    // not the one used here — confirm this is intended for derived denoisers.
    FluxFlowDenoiser(float shift = 1.15f) {
        set_parameters(shift);
    }
    // Stores the shift only. The `sigmas` table is NOT recomputed here;
    // call set_parameters() when the table must reflect the new shift.
    void set_shift(float shift) {
        this->shift = shift;  // `this->` is needed: the parameter shadows the member
    }
    // Stores `shift` through set_shift() and then refills the whole `sigmas`
    // table: entry k holds t_to_sigma(k) for every k in [0, TIMESTEPS).
    void set_parameters(float shift) {
        set_shift(shift);

        for (int step = 0; step < TIMESTEPS; ++step) {
            const float t = static_cast<float>(step);
            sigmas[step]  = t_to_sigma(t);
        }
    }
    // Smallest sigma: the first entry of the `sigmas` table.
    float sigma_min() override {
        return sigmas[0];
    }
    // Largest sigma: the last entry of the `sigmas` table (index TIMESTEPS - 1).
    float sigma_max() override {
        return sigmas[TIMESTEPS - 1];
    }
    float sigma_to_t(float sigma) override {
        return sigma;
@ -725,6 +747,26 @@ struct FluxFlowDenoiser : public DiscreteFlowDenoiser {
        t = t + 1;
        return flux_time_shift(shift, 1.0f, t / TIMESTEPS);
    }
    // Returns the scaling triple in the order {c_skip, c_out, c_in}:
    // c_skip and c_in are fixed at 1, c_out is the negated sigma.
    std::vector<float> get_scalings(float sigma) override {
        return {1.0f, -sigma, 1.0f};
    }
    // Mixes `noise` into `latent` and returns `latent`. Both tensors are
    // modified: `noise` is scaled by sigma, `latent` is scaled by (1 - sigma)
    // and then has the scaled noise added to it.
    // (Assumes the ggml_ext_tensor_*_inplace helpers behave as their names
    // suggest — confirm against their definitions.)
    ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) override {
        const float latent_weight = 1.0f - sigma;

        ggml_ext_tensor_scale_inplace(noise, sigma);
        ggml_ext_tensor_scale_inplace(latent, latent_weight);
        ggml_ext_tensor_add_inplace(latent, noise);

        return latent;
    }
    // Rescales `latent` in place by 1 / (1 - sigma) and returns it.
    // NOTE(review): sigma == 1 makes the factor a division by zero (infinite
    // under IEEE float arithmetic); presumably callers never pass it — confirm.
    ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) override {
        const float rescale = 1.0f / (1.0f - sigma);
        ggml_ext_tensor_scale_inplace(latent, rescale);
        return latent;
    }
 };
 struct Flux2FlowDenoiser : public FluxFlowDenoiser {
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@ -115,7 +115,6 @@ public:
    int n_threads                    = -1;
    float scale_factor               = 0.18215f;
    float shift_factor               = 0.f;
    float default_flow_shift         = INFINITY;
    std::shared_ptr<Conditioner> cond_stage_model;
    std::shared_ptr<FrozenCLIPVisionEmbedder> clip_vision;  // for svd or wan2.1 i2v
@ -882,6 +881,7 @@ public:
        // init denoiser
        {
            prediction_t pred_type = sd_ctx_params->prediction;
            float flow_shift       = sd_ctx_params->flow_shift;
            if (pred_type == PREDICTION_COUNT) {
                if (sd_version_is_sd2(version)) {
@ -906,19 +906,22 @@ public:
                           sd_version_is_qwen_image(version) ||
                           sd_version_is_z_image(version)) {
                    pred_type = FLOW_PRED;
-                    if (sd_version_is_wan(version)) {
+                    if (flow_shift == INFINITY) {
-                        default_flow_shift = 5.f;
+                        if (sd_version_is_wan(version)) {
-                    } else {
+                            flow_shift = 5.f;
-                        default_flow_shift = 3.f;
+                        } else {
                            flow_shift = 3.f;
                        }
                    }
                } else if (sd_version_is_flux(version)) {
                    pred_type = FLUX_FLOW_PRED;
-                    default_flow_shift = 1.0f;  // TODO: validate
+                    if (flow_shift == INFINITY) {
-                    for (const auto& [name, tensor_storage] : tensor_storage_map) {
+                        flow_shift = 1.0f;  // TODO: validate
-                        if (starts_with(name, "model.diffusion_model.guidance_in.in_layer.weight")) {
+                        for (const auto& [name, tensor_storage] : tensor_storage_map) {
-                            default_flow_shift = 1.15f;
+                            if (starts_with(name, "model.diffusion_model.guidance_in.in_layer.weight")) {
-                            break;
+                                flow_shift = 1.15f;
                            }
                        }
                    }
                } else if (sd_version_is_flux2(version)) {
@ -942,12 +945,12 @@ public:
                    break;
                case FLOW_PRED: {
                    LOG_INFO("running in FLOW mode");
-                    denoiser = std::make_shared<DiscreteFlowDenoiser>();
+                    denoiser = std::make_shared<DiscreteFlowDenoiser>(flow_shift);
                    break;
                }
                case FLUX_FLOW_PRED: {
                    LOG_INFO("running in Flux FLOW mode");
-                    denoiser = std::make_shared<FluxFlowDenoiser>();
+                    denoiser = std::make_shared<FluxFlowDenoiser>(flow_shift);
                    break;
                }
                case FLUX2_FLOW_PRED: {
@ -2708,16 +2711,6 @@ public:
        ggml_ext_tensor_clamp_inplace(result, 0.0f, 1.0f);
        return result;
    }
    // Applies a flow shift to the active denoiser, but only when that denoiser
    // is a DiscreteFlowDenoiser (or derives from it); any other denoiser is
    // left untouched. Passing INFINITY (the default argument) selects
    // `default_flow_shift` instead of an explicit value.
    void set_flow_shift(float flow_shift = INFINITY) {
        const auto discrete_flow = std::dynamic_pointer_cast<DiscreteFlowDenoiser>(denoiser);
        if (!discrete_flow) {
            return;
        }
        const float effective_shift = (flow_shift == INFINITY) ? default_flow_shift : flow_shift;
        discrete_flow->set_shift(effective_shift);
    }
 };
 /*================================================= SD API ==================================================*/
@ -2938,6 +2931,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
    sd_ctx_params->chroma_use_dit_mask     = true;
    sd_ctx_params->chroma_use_t5_mask      = false;
    sd_ctx_params->chroma_t5_mask_pad      = 1;
    sd_ctx_params->flow_shift              = INFINITY;
 }
 char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
@ -3029,7 +3023,6 @@ void sd_sample_params_init(sd_sample_params_t* sample_params) {
    sample_params->sample_steps                = 20;
    sample_params->custom_sigmas               = nullptr;
    sample_params->custom_sigmas_count         = 0;
    sample_params->flow_shift                  = INFINITY;
 }
 char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) {
@ -3050,8 +3043,7 @@ char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) {
             "sample_method: %s, "
             "sample_steps: %d, "
             "eta: %.2f, "
-             "shifted_timestep: %d, "
+             "shifted_timestep: %d)",
             "flow_shift: %.2f)",
             sample_params->guidance.txt_cfg,
             std::isfinite(sample_params->guidance.img_cfg)
                 ? sample_params->guidance.img_cfg
@ -3065,8 +3057,7 @@ char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) {
             sd_sample_method_name(sample_params->sample_method),
             sample_params->sample_steps,
             sample_params->eta,
-             sample_params->shifted_timestep,
+             sample_params->shifted_timestep);
             sample_params->flow_shift);
    return buf;
 }
@ -3537,8 +3528,6 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
    size_t t0 = ggml_time_ms();
    sd_ctx->sd->set_flow_shift(sd_img_gen_params->sample_params.flow_shift);
    // Apply lora
    sd_ctx->sd->apply_loras(sd_img_gen_params->loras, sd_img_gen_params->lora_count);
@ -3814,8 +3803,6 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
    }
    LOG_INFO("generate_video %dx%dx%d", width, height, frames);
    sd_ctx->sd->set_flow_shift(sd_vid_gen_params->sample_params.flow_shift);
    enum sample_method_t sample_method = sd_vid_gen_params->sample_params.sample_method;
    if (sample_method == SAMPLE_METHOD_COUNT) {
        sample_method = sd_get_default_sample_method(sd_ctx);
--- a/src/vae.hpp
+++ b/src/vae.hpp
@ -141,7 +141,7 @@ public:
            v = ggml_reshape_3d(ctx->ggml_ctx, v, c, h * w, n);                        // [N, h * w, in_channels]
        }
-        h_ = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, 1, nullptr, false, ctx->flash_attn_enabled);
+        h_ = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, 1, nullptr, true, ctx->flash_attn_enabled);
        if (use_linear) {
            h_ = proj_out->forward(ctx, h_);  // [N, h * w, in_channels]
--- a/src/wan.hpp
+++ b/src/wan.hpp
@ -572,8 +572,8 @@ namespace WAN {
            auto v = qkv_vec[2];
            v      = ggml_reshape_3d(ctx->ggml_ctx, v, h * w, c, n);  // [t, c, h * w]
-            v = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, v, 1, 0, 2, 3));                            // [t, h * w, c]
+            v = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, v, 1, 0, 2, 3));                           // [t, h * w, c]
-            x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, 1, nullptr, false, ctx->flash_attn_enabled);  // [t, h * w, c]
+            x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, 1, nullptr, true, ctx->flash_attn_enabled);  // [t, h * w, c]
            x = ggml_ext_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3));  // [t, c, h * w]
            x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, c, n);                             // [t, c, h, w]