fix: zero Wan2.2 TI2V timesteps for fixed frames (#1604 )

perf: keep chunk-K residency engaged with runtime LoRA (#1598 )
refactor: img-cond->img_uncond (#1594 )
2026-06-24 23:26:43 +00:00 · 2026-06-03 23:32:31 +08:00 · 2026-06-03 23:12:00 +08:00 · 2026-06-03 22:57:42 +08:00
4 changed files with 187 additions and 149 deletions
--- a/src/ggml_extend.hpp
+++ b/src/ggml_extend.hpp
@ -2432,12 +2432,17 @@ protected:
        GGML_ASSERT(gf != nullptr);

        // Keep the plan and resident params under the same live-VRAM cap.
+        // Add back our own resident buffer so we don't see chunk-K's
+        // allocation as "taken" VRAM and shrink the budget on every step.
        size_t effective_budget = max_graph_vram_bytes;
        if (stream_layers_enabled && max_graph_vram_bytes > 0 && runtime_backend != nullptr) {
            ggml_backend_dev_t dev = ggml_backend_get_device(runtime_backend);
            if (dev != nullptr && ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
                size_t free_vram = 0, total_vram = 0;
                ggml_backend_dev_memory(dev, &free_vram, &total_vram);
+                if (resident_runtime_params_buffer != nullptr) {
+                    free_vram += ggml_backend_buffer_get_size(resident_runtime_params_buffer);
+                }
                constexpr size_t safety_margin = 512ull * 1024 * 1024;
                size_t free_clamp              = (free_vram > safety_margin) ? (free_vram - safety_margin) : 0;
                if (free_clamp < effective_budget) {
@ -2815,39 +2820,54 @@ public:
                                                            bool no_return = false) {
        GGML_ASSERT(gf != nullptr);

-        // Runtime LoRA mutates CPU weights between calls, so resident GPU
-        // copies would go stale.
-        if (weight_adapter != nullptr) {
-            restore_resident_params();
-        } else {
-            sd::ggml_graph_cut::Plan& base_plan = graph_cut_plan_cache_.graph_cut_plan;
-            if (base_plan.available) {
-                sd::ggml_graph_cut::annotate_residency(base_plan, residency_budget_bytes);
+        // Runtime LoRA composes `weight + diff` in the compute graph via
+        // ggml_add; the resident weight tensor's data is never mutated, so
+        // chunk-K residency stays valid across sampling steps.
+        // Reserve room for the worst merged segment so chunk-K can't grow
+        // large enough to starve later partial-param allocations.
+        size_t worst_merged_segment_footprint = 0;
+        for (const auto& seg : plan.segments) {
+            const size_t fp = seg.input_param_bytes +
+                              seg.compute_buffer_size +
+                              seg.output_bytes +
+                              seg.input_previous_cut_bytes +
+                              seg.input_external_bytes;
+            if (fp > worst_merged_segment_footprint) {
+                worst_merged_segment_footprint = fp;
+            }
+        }
+        const size_t residency_budget_for_annotate =
+            residency_budget_bytes > worst_merged_segment_footprint
+                ? residency_budget_bytes - worst_merged_segment_footprint
+                : 0;

-                std::vector<ggml_tensor*> resident_params;
-                uint64_t token = 0;
-                for (const auto& segment : base_plan.segments) {
-                    if (segment.residency != sd::ggml_graph_cut::SegmentResidency::RESIDENT) {
-                        continue;
-                    }
-                    auto seg_params = sd::ggml_graph_cut::param_tensors(gf, segment);
-                    for (ggml_tensor* t : seg_params) {
-                        if (t == nullptr)
-                            continue;
-                        resident_params.push_back(t);
-                        token ^= reinterpret_cast<uintptr_t>(t) * 0x9E3779B97F4A7C15ull;
-                    }
+        sd::ggml_graph_cut::Plan& base_plan = graph_cut_plan_cache_.graph_cut_plan;
+        if (base_plan.available) {
+            sd::ggml_graph_cut::annotate_residency(base_plan, residency_budget_for_annotate);
+
+            std::vector<ggml_tensor*> resident_params;
+            uint64_t token = 0;
+            for (const auto& segment : base_plan.segments) {
+                if (segment.residency != sd::ggml_graph_cut::SegmentResidency::RESIDENT) {
+                    continue;
                }
-                if (token != resident_state_token) {
-                    restore_resident_params();
-                    if (!resident_params.empty()) {
-                        if (offload_resident_params(resident_params)) {
-                            resident_state_token = token;
-                        } else {
-                            LOG_ERROR("%s chunk-K: resident offload failed; continuing with per-segment streaming",
-                                      get_desc().c_str());
-                            restore_resident_params();
-                        }
+                auto seg_params = sd::ggml_graph_cut::param_tensors(gf, segment);
+                for (ggml_tensor* t : seg_params) {
+                    if (t == nullptr)
+                        continue;
+                    resident_params.push_back(t);
+                    token ^= reinterpret_cast<uintptr_t>(t) * 0x9E3779B97F4A7C15ull;
+                }
+            }
+            if (token != resident_state_token) {
+                restore_resident_params();
+                if (!resident_params.empty()) {
+                    if (offload_resident_params(resident_params)) {
+                        resident_state_token = token;
+                    } else {
+                        LOG_ERROR("%s chunk-K: resident offload failed; continuing with per-segment streaming",
+                                  get_desc().c_str());
+                        restore_resident_params();
                    }
                }
            }
--- a/src/guidance.cpp
+++ b/src/guidance.cpp
@ -82,17 +82,18 @@ namespace sd::guidance {
        output.pred                        = pred_cond;
        if (has_tensor(input.pred_uncond)) {
            const sd::Tensor<float>& pred_uncond = *input.pred_uncond;
-            if (has_tensor(input.pred_img_cond)) {
-                const sd::Tensor<float>& pred_img_cond = *input.pred_img_cond;
-                output.pred                            = pred_uncond +
-                              image_guidance_scale_ * (pred_img_cond - pred_uncond) +
-                              guidance_scale_ * (pred_cond - pred_img_cond);
+            if (has_tensor(input.pred_img_uncond)) {
+                const sd::Tensor<float>& pred_img_uncond = *input.pred_img_uncond;
+                output.pred                              = pred_img_uncond +
+                              image_guidance_scale_ * (pred_uncond - pred_img_uncond) +
+                              guidance_scale_ * (pred_cond - pred_uncond);
+
            } else {
                output.pred = pred_uncond + guidance_scale_ * (pred_cond - pred_uncond);
            }
-        } else if (has_tensor(input.pred_img_cond)) {
-            const sd::Tensor<float>& pred_img_cond = *input.pred_img_cond;
-            output.pred                            = pred_img_cond + guidance_scale_ * (pred_cond - pred_img_cond);
+        } else if (has_tensor(input.pred_img_uncond)) {
+            const sd::Tensor<float>& pred_img_uncond = *input.pred_img_uncond;
+            output.pred                              = pred_img_uncond + guidance_scale_ * (pred_cond - pred_img_uncond);
        }

        return output;
@ -108,20 +109,20 @@ namespace sd::guidance {

    static sd::Tensor<float> calculate_guidance_delta(const sd::Tensor<float>& pred_cond,
                                                      const sd::Tensor<float>* pred_uncond,
-                                                      const sd::Tensor<float>* pred_img_cond,
+                                                      const sd::Tensor<float>* pred_img_uncond,
                                                      float guidance_scale,
                                                      float image_guidance_scale) {
-        if (pred_img_cond != nullptr) {
+        if (pred_img_uncond != nullptr) {
            if (pred_uncond != nullptr && guidance_scale == 1.0f) {
-                return *pred_img_cond - *pred_uncond;
+                return *pred_uncond - *pred_img_uncond;
            }
            if (pred_uncond != nullptr) {
                return pred_cond +
-                       (*pred_uncond * (1.0f - image_guidance_scale) +
-                        *pred_img_cond * (image_guidance_scale - guidance_scale)) /
+                       (*pred_uncond * (image_guidance_scale - guidance_scale) +
+                        *pred_img_uncond * (1.0f - image_guidance_scale)) /
                           (guidance_scale - 1.0f);
            }
-            return pred_cond - *pred_img_cond;
+            return pred_cond - *pred_img_uncond;
        }
        return pred_cond - *pred_uncond;
    }
@ -139,28 +140,28 @@ namespace sd::guidance {
        output.pred                        = pred_cond;
        if (has_tensor(input.pred_uncond)) {
            const sd::Tensor<float>& pred_uncond = *input.pred_uncond;
-            if (has_tensor(input.pred_img_cond)) {
-                const sd::Tensor<float>& pred_img_cond = *input.pred_img_cond;
-                output.pred                            = pred_uncond +
-                              image_guidance_scale_ * (pred_img_cond - pred_uncond) +
-                              guidance_scale_ * (pred_cond - pred_img_cond);
+            if (has_tensor(input.pred_img_uncond)) {
+                const sd::Tensor<float>& pred_img_uncond = *input.pred_img_uncond;
+                output.pred                              = pred_img_uncond +
+                              image_guidance_scale_ * (pred_uncond - pred_img_uncond) +
+                              guidance_scale_ * (pred_cond - pred_uncond);
            } else {
                output.pred = pred_uncond + guidance_scale_ * (pred_cond - pred_uncond);
            }
-        } else if (has_tensor(input.pred_img_cond)) {
-            const sd::Tensor<float>& pred_img_cond = *input.pred_img_cond;
-            output.pred                            = pred_img_cond + guidance_scale_ * (pred_cond - pred_img_cond);
+        } else if (has_tensor(input.pred_img_uncond)) {
+            const sd::Tensor<float>& pred_img_uncond = *input.pred_img_uncond;
+            output.pred                              = pred_img_uncond + guidance_scale_ * (pred_cond - pred_img_uncond);
        }
-        if (!has_tensor(input.pred_uncond) && !has_tensor(input.pred_img_cond)) {
+        if (!has_tensor(input.pred_uncond) && !has_tensor(input.pred_img_uncond)) {
            return output;
        }

-        const sd::Tensor<float>* pred_uncond   = input.pred_uncond;
-        const sd::Tensor<float>* pred_img_cond = input.pred_img_cond;
+        const sd::Tensor<float>* pred_uncond     = input.pred_uncond;
+        const sd::Tensor<float>* pred_img_uncond = input.pred_img_uncond;

        sd::Tensor<float> deltas = calculate_guidance_delta(pred_cond,
                                                            pred_uncond,
-                                                            pred_img_cond,
+                                                            pred_img_uncond,
                                                            guidance_scale_,
                                                            image_guidance_scale_);
        if (params_.momentum != 0.0f) {
@ -202,11 +203,11 @@ namespace sd::guidance {
        if (pred_uncond != nullptr) {
            if (guidance_scale_ != 1.0f) {
                output.pred = pred_cond + (guidance_scale_ - 1.0f) * deltas;
-            } else if (pred_img_cond != nullptr) {
+            } else if (pred_img_uncond != nullptr) {
                output.pred = pred_cond + (image_guidance_scale_ - 1.0f) * deltas;
            }
-        } else if (pred_img_cond != nullptr) {
-            output.pred = *pred_img_cond + guidance_scale_ * deltas;
+        } else if (pred_img_uncond != nullptr) {
+            output.pred = *pred_img_uncond + guidance_scale_ * deltas;
        }

        return output;
--- a/src/guidance.h
+++ b/src/guidance.h
@ -29,11 +29,11 @@ namespace sd::guidance {
    bool parse_skip_layer_guidance_uncond_arg(const char* extra_sample_args);

    struct GuidanceInput {
-        int step                               = 0;
-        size_t schedule_size                   = 0;
-        const sd::Tensor<float>* pred_cond     = nullptr;
-        const sd::Tensor<float>* pred_uncond   = nullptr;
-        const sd::Tensor<float>* pred_img_cond = nullptr;
+        int step                                 = 0;
+        size_t schedule_size                     = 0;
+        const sd::Tensor<float>* pred_cond       = nullptr;
+        const sd::Tensor<float>* pred_uncond     = nullptr;
+        const sd::Tensor<float>* pred_img_uncond = nullptr;

        std::function<sd::Tensor<float>()> predict_skip_layer;
    };
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@ -1680,12 +1680,15 @@ public:
                                         const sd::Tensor<float>& init_latent,
                                         const sd::Tensor<float>& denoise_mask) {
        if (diffusion_model->get_desc() == "Wan2.2-TI2V-5B") {
-            auto new_timesteps = std::vector<float>(static_cast<size_t>(init_latent.shape()[2]), timesteps[0]);
+            int64_t frame_count = init_latent.shape()[2];
+            auto new_timesteps  = std::vector<float>(static_cast<size_t>(frame_count), timesteps[0]);

-            if (!denoise_mask.empty()) {
-                float value = denoise_mask.dim() == 5 ? denoise_mask.index(0, 0, 0, 0, 0) : denoise_mask.index(0, 0, 0, 0);
-                if (value == 0.f) {
-                    new_timesteps[0] = 0.f;
+            if (!denoise_mask.empty() && denoise_mask.dim() >= 4 && denoise_mask.shape()[2] == frame_count) {
+                for (int64_t frame = 0; frame < frame_count; ++frame) {
+                    float value = denoise_mask.dim() == 5 ? denoise_mask.index(0, 0, frame, 0, 0) : denoise_mask.index(0, 0, frame, 0);
+                    if (value == 0.f) {
+                        new_timesteps[static_cast<size_t>(frame)] = 0.f;
+                    }
                }
            }
            return new_timesteps;
@ -1945,7 +1948,7 @@ public:
                             sd::Tensor<float> noise,
                             const SDCondition& cond,
                             const SDCondition& uncond,
-                             const SDCondition& img_cond,
+                             const SDCondition& img_uncond,
                             const SDCondition& id_cond,
                             const sd::Tensor<float>& control_image,
                             float control_strength,
@ -2070,7 +2073,7 @@ public:

            sd::Tensor<float> cond_out;
            sd::Tensor<float> uncond_out;
-            sd::Tensor<float> img_cond_out;
+            sd::Tensor<float> img_uncond_out;
            sd_sample::SampleStepCacheDispatcher step_cache(cache_runtime, step, sigma);
            std::vector<sd::Tensor<float>> controls;
            DiffusionParams diffusion_params;
@ -2089,7 +2092,7 @@ public:
                                    &controls);

            static const std::vector<sd::Tensor<float>> empty_ref_latents;
-            bool uncond_without_ref_latents = !img_cond.empty() &&
+            bool uncond_without_ref_latents = !img_uncond.empty() &&
                                              !ref_latents.empty() &&
                                              sd_version_supports_ref_latent_img_cfg(version);

@ -2176,26 +2179,27 @@ public:
                    uncond_skip_layers = &skip_layer_guidance.layers();
                }
                uncond_out = run_condition(uncond,
-                                           nullptr,
-                                           uncond_skip_layers,
-                                           uncond_without_ref_latents ? &empty_ref_latents : nullptr);
+                                           uncond.c_concat.empty() ? nullptr : &uncond.c_concat,
+                                           uncond_skip_layers);
                if (uncond_out.empty()) {
                    return {};
                }
            }
-            if (!img_cond.empty()) {
-                img_cond_out = run_condition(img_cond,
-                                             cond.c_concat.empty() ? nullptr : &cond.c_concat);
-                if (img_cond_out.empty()) {
+            if (!img_uncond.empty()) {
+                img_uncond_out = run_condition(img_uncond,
+                                               img_uncond.c_concat.empty() ? nullptr : &img_uncond.c_concat,
+                                               nullptr,
+                                               uncond_without_ref_latents ? &empty_ref_latents : nullptr);
+                if (img_uncond_out.empty()) {
                    return {};
                }
            }
            sd::guidance::GuidanceInput guidance_input;
-            guidance_input.step          = step;
-            guidance_input.schedule_size = sigmas.size();
-            guidance_input.pred_cond     = &cond_out;
-            guidance_input.pred_uncond   = uncond_out.empty() ? nullptr : &uncond_out;
-            guidance_input.pred_img_cond = img_cond_out.empty() ? nullptr : &img_cond_out;
+            guidance_input.step            = step;
+            guidance_input.schedule_size   = sigmas.size();
+            guidance_input.pred_cond       = &cond_out;
+            guidance_input.pred_uncond     = uncond_out.empty() ? nullptr : &uncond_out;
+            guidance_input.pred_img_uncond = img_uncond_out.empty() ? nullptr : &img_uncond_out;

            sd::guidance::GuiderOutput guided = primary_guidance.forward(guidance_input, {});
            if (guided.pred.empty()) {
@ -2222,7 +2226,9 @@ public:
            sd::guidance::GuiderOutput output;
            output.pred = denoised;
            if (needs_uncond_denoised) {
-                const sd::Tensor<float>& base_uncond = !uncond_out.empty() ? uncond_out : cond_out;
+                const sd::Tensor<float>& base_uncond = !img_uncond_out.empty()
+                                                           ? img_uncond_out
+                                                           : (!uncond_out.empty() ? uncond_out : cond_out);
                output.pred_uncond                   = base_uncond * c_out + x * c_skip;
            }
            if (cache_runtime.spectrum_enabled) {
@ -3196,9 +3202,9 @@ struct GenerationRequest {
    int diffusion_model_down_factor          = -1;
    int64_t seed                             = -1;
    bool use_uncond                          = false;
-    bool use_img_cond                        = false;
+    bool use_img_uncond                      = false;
    bool use_high_noise_uncond               = false;
-    bool use_high_noise_img_cond             = false;
+    bool use_high_noise_img_uncond           = false;
    bool has_ref_images                      = false;
    const sd_cache_params_t* cache_params    = nullptr;
    int batch_count                          = 1;
@ -3356,33 +3362,36 @@ struct GenerationRequest {
    static void resolve_guidance(sd_ctx_t* sd_ctx,
                                 sd_guidance_params_t* guidance,
                                 bool* use_uncond,
-                                 bool* use_img_cond,
+                                 bool* use_img_uncond,
                                 bool has_ref_images,
                                 const char* stage_name = nullptr) {
        GGML_ASSERT(guidance != nullptr);
        GGML_ASSERT(use_uncond != nullptr);
-        GGML_ASSERT(use_img_cond != nullptr);
-        // out_uncond + text_cfg_scale * (out_cond - out_img_cond) + image_cfg_scale * (out_img_cond - out_uncond)
-        // img_cfg == txt_cfg means that img_cfg is not used
-        bool img_cfg_was_unset = !std::isfinite(guidance->img_cfg);
-        if (!std::isfinite(guidance->img_cfg)) {
-            guidance->img_cfg = guidance->txt_cfg;
+        GGML_ASSERT(use_img_uncond != nullptr);
+        // out_img_uncond + text_cfg_scale * (out_cond - out_uncond) + image_cfg_scale * (out_uncond - out_img_uncond)
+        // -> text_cfg_scale * out_cond + (image_cfg_scale - text_cfg_scale) * out_uncond + (1 - image_cfg_scale) * out_img_uncond
+        // out_cond       : prompt, image latent
+        // out_uncond     : negative prompt, image latent
+        // out_img_uncond : negative prompt, zero image latent
+        // image_cfg_scale == 1 reduces 3-cond CFG to 2-cond CFG.
+        bool img_cfg_was_set = std::isfinite(guidance->img_cfg);
+        if (!img_cfg_was_set) {
+            guidance->img_cfg = 1.f;
        }

        if (!sd_version_supports_img_cfg(sd_ctx->sd->version, has_ref_images)) {
-            if (!img_cfg_was_unset && guidance->img_cfg != guidance->txt_cfg) {
-                LOG_WARN("2-conditioning CFG is not supported with this model, disabling it for better performance");
+            if (img_cfg_was_set && guidance->img_cfg != 1.f) {
+                LOG_WARN("3-conditioning CFG is not supported with this model, disabling it for better performance");
            }
-            guidance->img_cfg = guidance->txt_cfg;
-        }
-
-        if (guidance->txt_cfg != 1.f) {
-            *use_uncond = true;
+            guidance->img_cfg = 1.f;
        }

        if (guidance->img_cfg != guidance->txt_cfg) {
-            *use_img_cond = true;
-            *use_uncond   = true;
+            *use_uncond = true;
+        }
+
+        if (guidance->img_cfg != 1.f) {
+            *use_img_uncond = true;
        }

        if (guidance->txt_cfg < 1.f) {
@ -3401,12 +3410,12 @@ struct GenerationRequest {
        resolve_hires();
        seed = resolve_seed(seed);

-        resolve_guidance(sd_ctx, &guidance, &use_uncond, &use_img_cond, has_ref_images);
+        resolve_guidance(sd_ctx, &guidance, &use_uncond, &use_img_uncond, has_ref_images);
        if (sd_ctx->sd->high_noise_diffusion_model) {
            resolve_guidance(sd_ctx,
                             &high_noise_guidance,
                             &use_high_noise_uncond,
-                             &use_high_noise_img_cond,
+                             &use_high_noise_img_uncond,
                             has_ref_images,
                             "high noise: ");
        }
@ -3525,7 +3534,7 @@ struct SamplePlan {
 struct ImageGenerationLatents {
    sd::Tensor<float> init_latent;
    sd::Tensor<float> concat_latent;
-    sd::Tensor<float> uncond_concat_latent;
+    sd::Tensor<float> img_uncond_concat_latent;
    sd::Tensor<float> audio_latent;
    sd::Tensor<float> video_positions;
    sd::Tensor<float> control_image;
@ -3848,7 +3857,7 @@ static int get_ltxav_num_audio_latents(int frames, int fps) {
 struct ImageGenerationEmbeds {
    SDCondition cond;
    SDCondition uncond;
-    SDCondition img_cond;
+    SDCondition img_uncond;
    SDCondition id_cond;
 };

@ -4007,7 +4016,7 @@ static std::optional<ImageGenerationLatents> prepare_image_generation_latents(sd
        LOG_WARN("This model needs at least one reference image; using an empty reference");
        ref_images.push_back(sd::zeros<float>({request->width, request->height, 3, 1}));
        request->guidance.img_cfg = request->guidance.txt_cfg;
-        request->use_img_cond     = false;
+        request->use_img_uncond   = false;
    }

    if (!ref_images.empty()) {
@ -4060,7 +4069,7 @@ static std::optional<ImageGenerationLatents> prepare_image_generation_latents(sd
    }

    sd::Tensor<float> concat_latent;
-    sd::Tensor<float> uncond_concat_latent;
+    sd::Tensor<float> img_uncond_concat_latent;
    if (sd_version_is_inpaint(sd_ctx->sd->version)) {
        sd::Tensor<float> masked_init_latent;

@ -4088,8 +4097,8 @@ static std::optional<ImageGenerationLatents> prepare_image_generation_latents(sd
                                                   request->height / request->vae_scale_factor});
            mask      = mask.permute({1, 3, 0, 2}).reshape({request->width / request->vae_scale_factor, request->height / request->vae_scale_factor, request->vae_scale_factor * request->vae_scale_factor, 1});

-            concat_latent        = sd::ops::concat(masked_init_latent, mask, 2);
-            uncond_concat_latent = sd::ops::concat(uncond_masked_init_latent, mask, 2);
+            concat_latent            = sd::ops::concat(masked_init_latent, mask, 2);
+            img_uncond_concat_latent = sd::ops::concat(uncond_masked_init_latent, mask, 2);
        } else if (sd_ctx->sd->version == VERSION_FLEX_2) {
            concat_latent = sd::ops::concat(masked_init_latent, latent_mask, 2);
            if (!control_latent.empty()) {
@ -4098,16 +4107,16 @@ static std::optional<ImageGenerationLatents> prepare_image_generation_latents(sd
                concat_latent = sd::ops::concat(concat_latent, sd::Tensor<float>::zeros_like(masked_init_latent), 2);
            }

-            uncond_concat_latent = sd::ops::concat(uncond_masked_init_latent, latent_mask, 2);
-            uncond_concat_latent = sd::ops::concat(uncond_concat_latent, sd::Tensor<float>::zeros_like(masked_init_latent), 2);
+            img_uncond_concat_latent = sd::ops::concat(uncond_masked_init_latent, latent_mask, 2);
+            img_uncond_concat_latent = sd::ops::concat(img_uncond_concat_latent, sd::Tensor<float>::zeros_like(masked_init_latent), 2);
        } else {  // SD1.x SD2.x SDXL inpaint
-            concat_latent        = sd::ops::concat(latent_mask, masked_init_latent, 2);
-            uncond_concat_latent = sd::ops::concat(latent_mask, uncond_masked_init_latent, 2);
+            concat_latent            = sd::ops::concat(latent_mask, masked_init_latent, 2);
+            img_uncond_concat_latent = sd::ops::concat(latent_mask, uncond_masked_init_latent, 2);
        }
    }
    if (sd_version_is_unet_edit(sd_ctx->sd->version)) {
-        concat_latent        = sd::ops::interpolate<float>(ref_latents[0], init_latent.shape());
-        uncond_concat_latent = sd::Tensor<float>::zeros_like(concat_latent);
+        concat_latent            = sd::ops::interpolate<float>(ref_latents[0], init_latent.shape());
+        img_uncond_concat_latent = sd::Tensor<float>::zeros_like(concat_latent);
    }
    if (sd_ctx->sd->version == VERSION_FLUX_CONTROLS) {
        if (!control_latent.empty()) {
@ -4115,7 +4124,7 @@ static std::optional<ImageGenerationLatents> prepare_image_generation_latents(sd
        } else {
            concat_latent = sd::Tensor<float>::zeros_like(init_latent);
        }
-        uncond_concat_latent = sd::Tensor<float>::zeros_like(concat_latent);
+        img_uncond_concat_latent = sd::Tensor<float>::zeros_like(concat_latent);
    }

    if (sd_img_gen_params->init_image.data != nullptr || sd_img_gen_params->ref_images_count > 0) {
@ -4124,12 +4133,12 @@ static std::optional<ImageGenerationLatents> prepare_image_generation_latents(sd
    }

    ImageGenerationLatents latents;
-    latents.init_latent          = std::move(init_latent);
-    latents.concat_latent        = std::move(concat_latent);
-    latents.uncond_concat_latent = std::move(uncond_concat_latent);
-    latents.control_image        = std::move(control_image_tensor);
-    latents.ref_images           = std::move(ref_images);
-    latents.ref_latents          = std::move(ref_latents);
+    latents.init_latent              = std::move(init_latent);
+    latents.concat_latent            = std::move(concat_latent);
+    latents.img_uncond_concat_latent = std::move(img_uncond_concat_latent);
+    latents.control_image            = std::move(control_image_tensor);
+    latents.ref_images               = std::move(ref_images);
+    latents.ref_latents              = std::move(ref_latents);

    if (sd_version_is_inpaint(sd_ctx->sd->version)) {
        latent_mask = sd::ops::max_pool_2d(latent_mask,
@ -4163,7 +4172,7 @@ static std::optional<ImageGenerationEmbeds> prepare_image_generation_embeds(sd_c
        cond.c_concat = latents->concat_latent;  // TODO: optimize
    }

-    bool use_ref_latent_img_cfg = request->use_img_cond &&
+    bool use_ref_latent_img_cfg = request->use_img_uncond &&
                                  !latents->ref_images.empty() &&
                                  sd_version_supports_ref_latent_img_cfg(sd_ctx->sd->version);

@ -4180,24 +4189,32 @@ static std::optional<ImageGenerationEmbeds> prepare_image_generation_embeds(sd_c
        uncond                           = sd_ctx->sd->cond_stage_model->get_learned_condition(sd_ctx->sd->n_threads,
                                                                                               condition_params);
        if (uncond.c_concat.empty()) {
-            uncond.c_concat = latents->uncond_concat_latent;  // TODO: optimize
+            uncond.c_concat = latents->concat_latent;  // TODO: optimize
        }
    }

-    SDCondition img_cond;
-    if (request->use_img_cond) {
-        if (use_ref_latent_img_cfg) {
-            img_cond = uncond;
-
-            std::vector<sd::Tensor<float>> empty_ref_images;
-            condition_params.ref_images = &empty_ref_images;
-            uncond                      = sd_ctx->sd->cond_stage_model->get_learned_condition(sd_ctx->sd->n_threads,
-                                                                                              condition_params);
-            if (uncond.c_concat.empty()) {
-                uncond.c_concat = latents->uncond_concat_latent;  // TODO: optimize
-            }
+    SDCondition img_uncond;
+    if (request->use_img_uncond) {
+        if ((request->use_uncond || request->use_high_noise_uncond) && (latents->ref_images.empty() || !use_ref_latent_img_cfg)) {
+            img_uncond = SDCondition(uncond.c_crossattn, uncond.c_vector, latents->img_uncond_concat_latent);
        } else {
-            img_cond = SDCondition(uncond.c_crossattn, uncond.c_vector, cond.c_concat);
+            bool zero_out_masked = false;
+            if (sd_version_is_sdxl(sd_ctx->sd->version) &&
+                request->negative_prompt.empty() &&
+                !sd_ctx->sd->is_using_edm_v_parameterization) {
+                zero_out_masked = true;
+            }
+            condition_params.text            = request->negative_prompt;
+            condition_params.zero_out_masked = zero_out_masked;
+            if (use_ref_latent_img_cfg) {
+                std::vector<sd::Tensor<float>> empty_ref_images;
+                condition_params.ref_images = &empty_ref_images;
+            }
+            img_uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(sd_ctx->sd->n_threads,
+                                                                             condition_params);
+            if (img_uncond.c_concat.empty()) {
+                img_uncond.c_concat = latents->img_uncond_concat_latent;  // TODO: optimize
+            }
        }
    }

@ -4209,10 +4226,10 @@ static std::optional<ImageGenerationEmbeds> prepare_image_generation_embeds(sd_c
    }

    ImageGenerationEmbeds embeds;
-    embeds.img_cond = std::move(img_cond);
-    embeds.cond     = std::move(cond);
-    embeds.uncond   = std::move(uncond);
-    embeds.id_cond  = std::move(id_cond);
+    embeds.img_uncond = std::move(img_uncond);
+    embeds.cond       = std::move(cond);
+    embeds.uncond     = std::move(uncond);
+    embeds.id_cond    = std::move(id_cond);

    return embeds;
 }
@ -4492,7 +4509,7 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
                                                   std::move(noise),
                                                   embeds.cond,
                                                   embeds.uncond,
-                                                   embeds.img_cond,
+                                                   embeds.img_uncond,
                                                   embeds.id_cond,
                                                   latents.control_image,
                                                   request.control_strength,
@ -4612,7 +4629,7 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
                                                            std::move(noise),
                                                            embeds.cond,
                                                            embeds.uncond,
-                                                            embeds.img_cond,
+                                                            embeds.img_uncond,
                                                            embeds.id_cond,
                                                            latents.control_image,
                                                            request.control_strength,
@ -5327,7 +5344,7 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
                                                           std::move(noise),
                                                           embeds.cond,
                                                           request.use_high_noise_uncond ? embeds.uncond : SDCondition(),
-                                                           embeds.img_cond,
+                                                           embeds.img_uncond,
                                                           embeds.id_cond,
                                                           sd::Tensor<float>(),
                                                           0.f,
@ -5373,7 +5390,7 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
                                                        std::move(noise),
                                                        embeds.cond,
                                                        request.use_uncond ? embeds.uncond : SDCondition(),
-                                                        embeds.img_cond,
+                                                        embeds.img_uncond,
                                                        embeds.id_cond,
                                                        sd::Tensor<float>(),
                                                        0.f,
@ -5517,7 +5534,7 @@ SD_API bool generate_video(sd_ctx_t* sd_ctx,
                                            std::move(noise),
                                            embeds.cond,
                                          hires_request.use_uncond ? embeds.uncond : SDCondition(),
-                                            embeds.img_cond,
+                                            embeds.img_uncond,
                                            embeds.id_cond,
                                            sd::Tensor<float>(),
                                            0.f,
Author	SHA1	Message	Date
leejet	1f9ee88e09	fix: zero Wan2.2 TI2V timesteps for fixed frames (#1604 )	2026-06-03 23:32:31 +08:00
fszontagh	a7f2e03da4	perf: keep chunk-K residency engaged with runtime LoRA (#1598 )	2026-06-03 23:12:00 +08:00
stduhpf	4513e3fda9	refactor: img-cond->img_uncond (#1594 ) * refactor: img-cond->img_uncond * align APG and CFG++ with img-uncond CFG * set default img_cfg to 1.f --------- Co-authored-by: leejet <leejet714@gmail.com>	2026-06-03 22:57:42 +08:00