feat: auto-detect max VRAM budget with --max-vram -1 (#1498 )

refactor: unify Euler, Euler Ancestral and DDIM implementations (#1474 )
fix: Fix broken GCC 16 build (enforce C11/C++17 compile ) (#1478 )
2026-06-25 07:36:38 +00:00 · 2026-05-16 16:14:25 +08:00 · 2026-05-16 16:13:28 +08:00 · 2026-05-16 16:10:16 +08:00 · 2026-05-16 16:08:31 +08:00
10 changed files with 103 additions and 97 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -69,6 +69,12 @@ option(SD_BUILD_SHARED_GGML_LIB      "sd: build ggml as a separate shared lib" O
 option(SD_USE_SYSTEM_GGML            "sd: use system-installed GGML library" OFF)
 #option(SD_BUILD_SERVER               "sd: build server example"                           ON)
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED true)
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED true)
 if(SD_CUDA)
    message("-- Use CUDA as backend stable-diffusion")
    set(GGML_CUDA ON)
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@ -55,7 +55,7 @@ Context Options:
                                           then threads will be set to the number of CPU physical cores
  --chroma-t5-mask-pad <int>               t5 mask pad size of chroma
  --max-vram <float>                       maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
-                                           graph splitting
+                                           graph splitting; -1 auto-detects free VRAM minus 1 GiB
  --force-sdxl-vae-conv-scale              force use of conv scale on sdxl vae
  --offload-to-cpu                         place the weights in RAM to save VRAM, and automatically load them into VRAM
                                           when needed
--- a/examples/common/common.cpp
+++ b/examples/common/common.cpp
@ -397,7 +397,7 @@ ArgOptions SDContextParams::get_options() {
    options.float_options = {
        {"",
         "--max-vram",
-         "maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting",
+         "maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting; -1 auto-detects free VRAM minus 1 GiB",
         &max_vram},
    };
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -157,7 +157,7 @@ Context Options:
                                           then threads will be set to the number of CPU physical cores
  --chroma-t5-mask-pad <int>               t5 mask pad size of chroma
  --max-vram <float>                       maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
-                                           graph splitting
+                                           graph splitting; -1 auto-detects free VRAM minus 1 GiB
  --force-sdxl-vae-conv-scale              force use of conv scale on sdxl vae
  --offload-to-cpu                         place the weights in RAM to save VRAM, and automatically load them into VRAM
                                           when needed
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@ -205,7 +205,7 @@ typedef struct {
    bool chroma_use_t5_mask;
    int chroma_t5_mask_pad;
    bool qwen_image_zero_cond_t;
-    float max_vram;
+    float max_vram;  // GiB budget for graph-cut segmented param offload (0 = disabled, -1 = auto free VRAM minus 1 GiB)
 } sd_ctx_params_t;
 typedef struct {
--- a/src/denoiser.hpp
+++ b/src/denoiser.hpp
@ -824,45 +824,33 @@ static std::tuple<float, float, float> get_ancestral_step(float sigma_from,
 static sd::Tensor<float> sample_euler_ancestral(denoise_cb_t model,
                                                sd::Tensor<float> x,
                                                const std::vector<float>& sigmas,
-                                                std::shared_ptr<RNG> rng,
+                                                std::shared_ptr<RNG> rng = nullptr,
-                                                float eta) {
+                                                bool is_flow_denoiser    = false,
                                                float eta                = 0.f) {
    int steps = static_cast<int>(sigmas.size()) - 1;
    for (int i = 0; i < steps; i++) {
        float sigma       = sigmas[i];
        float sigma_to    = sigmas[i + 1];
        auto denoised_opt = model(x, sigma, i + 1, nullptr);
        if (denoised_opt.empty()) {
            return {};
        }
-        sd::Tensor<float> denoised  = std::move(denoised_opt);
+        sd::Tensor<float> denoised = std::move(denoised_opt);
-        sd::Tensor<float> d         = (x - denoised) / sigma;
+        if (sigma_to == 0.f) {
-        auto [sigma_down, sigma_up] = get_ancestral_step(sigmas[i], sigmas[i + 1], eta);
+            x = denoised;
-        x += d * (sigma_down - sigmas[i]);
+        } else if (eta == 0.f) {
-        if (sigmas[i + 1] > 0) {
+            float sigma_ratio = sigma_to / sigma;
-            x += sd::Tensor<float>::randn_like(x, rng) * sigma_up;
+            x                 = sigma_ratio * x + (1.0 - sigma_ratio) * denoised;
-        }
+        } else {
-    }
+            auto [sigma_down, sigma_up, alpha_scale] = get_ancestral_step(sigma, sigma_to, eta, is_flow_denoiser);
-    return x;
+            float sigma_ratio                        = sigma_down / sigma;
-}
+            x                                        = sigma_ratio * x + (1.0f - sigma_ratio) * denoised;
-
+            if (sigma_up > 0.f) {
-static sd::Tensor<float> sample_euler_flow(denoise_cb_t model,
+                if (is_flow_denoiser) {
-                                           sd::Tensor<float> x,
+                    x *= alpha_scale;
-                                           const std::vector<float>& sigmas,
+                }
-                                           std::shared_ptr<RNG> rng,
+                x += sd::Tensor<float>::randn_like(x, rng) * sigma_up;
-                                           float eta) {
+            }
    int steps = static_cast<int>(sigmas.size()) - 1;
    for (int i = 0; i < steps; i++) {
        float sigma       = sigmas[i];
        auto denoised_opt = model(x, sigma, i + 1, nullptr);
        if (denoised_opt.empty()) {
            return {};
        }
        sd::Tensor<float> denoised               = std::move(denoised_opt);
        auto [sigma_down, sigma_up, alpha_scale] = get_ancestral_step_flow(sigma, sigmas[i + 1], eta);
        float sigma_ratio                        = sigma_down / sigma;
        x                                        = sigma_ratio * x + (1.0f - sigma_ratio) * denoised;
        if (sigma_up > 0.0f) {
            x = alpha_scale * x + sd::Tensor<float>::randn_like(x, rng) * sigma_up;
        }
    }
    return x;
@ -1633,46 +1621,6 @@ static sd::Tensor<float> sample_er_sde(denoise_cb_t model,
    return x;
 }
 // Sampler dispatched for DDIM_TRAILING_SAMPLE_METHOD, driven by a sigma schedule.
 //
 // For each adjacent pair (sigmas[i], sigmas[i + 1]) the model callback is invoked on the
 // current x, its result is rewritten as (x - result) / sigma, and x is advanced using
 // alpha_prod = 1 / (sigma^2 + 1) terms. When eta > 0, noise drawn through rng is added,
 // scaled by eta * sqrt(variance).
 //
 // Returns the final x, or a default-constructed tensor (`{}`) as soon as the model
 // callback reports an empty result.
 //
 // NOTE(review): the loop divides by sigma and by beta_prod_t (which is 0 when sigma is 0),
 // so it presumably expects sigmas[i] > 0 for every i < steps — confirm against callers.
 static sd::Tensor<float> sample_ddim_trailing(denoise_cb_t model,
                                              sd::Tensor<float> x,
                                              const std::vector<float>& sigmas,
                                              std::shared_ptr<RNG> rng,
                                              float eta) {
    // One step per adjacent pair of sigmas.
    int steps = static_cast<int>(sigmas.size()) - 1;
    for (int i = 0; i < steps; i++) {
        float sigma    = sigmas[i];
        float sigma_to = sigmas[i + 1];
        // The step number handed to the callback is 1-based (i + 1).
        auto model_output_opt = model(x, sigma, i + 1, nullptr);
        if (model_output_opt.empty()) {
            return {};
        }
        sd::Tensor<float> model_output = std::move(model_output_opt);
        // From here on model_output holds (x - callback result) / sigma.
        model_output                   = (x - model_output) * (1.0f / sigma);
        // alpha/beta products are derived directly from the current and next sigma.
        float alpha_prod_t      = 1.0f / (sigma * sigma + 1.0f);
        float alpha_prod_t_prev = 1.0f / (sigma_to * sigma_to + 1.0f);
        float beta_prod_t       = 1.0f - alpha_prod_t;
        sd::Tensor<float> pred_original_sample = ((x / std::sqrt(sigma * sigma + 1)) -
                                                  std::sqrt(beta_prod_t) * model_output) *
                                                 (1.0f / std::sqrt(alpha_prod_t));
        float beta_prod_t_prev = 1.0f - alpha_prod_t_prev;
        float variance         = (beta_prod_t_prev / beta_prod_t) *
                         (1.0f - alpha_prod_t / alpha_prod_t_prev);
        // eta scales the per-step noise standard deviation; eta == 0 makes std_dev_t zero.
        float std_dev_t = eta * std::sqrt(variance);
        x = pred_original_sample +
            std::sqrt((1.0f - alpha_prod_t_prev - std::pow(std_dev_t, 2)) / alpha_prod_t_prev) * model_output;
        // Random noise is only drawn when eta is positive.
        if (eta > 0) {
            x += std_dev_t / std::sqrt(alpha_prod_t_prev) * sd::Tensor<float>::randn_like(x, rng);
        }
    }
    return x;
 }
 static sd::Tensor<float> sample_tcd(denoise_cb_t model,
                                    sd::Tensor<float> x,
                                    const std::vector<float>& sigmas,
@ -1715,12 +1663,12 @@ static sd::Tensor<float> sample_tcd(denoise_cb_t model,
        int timestep_s    = (int)floor((1 - eta) * prev_timestep);
        float sigma       = sigmas[i];
-        auto model_output_opt = model(x, sigma, i + 1, nullptr);
+        auto denoised_opt = model(x, sigma, i + 1, nullptr);
-        if (model_output_opt.empty()) {
+        if (denoised_opt.empty()) {
            return {};
        }
-        sd::Tensor<float> model_output = std::move(model_output_opt);
+        sd::Tensor<float> denoised = std::move(denoised_opt);
-        model_output                   = (x - model_output) * (1.0f / sigma);
+        sd::Tensor<float> d        = (x - denoised) / sigma;
        float alpha_prod_t      = 1.0f / (sigma * sigma + 1.0f);
        float beta_prod_t       = 1.0f - alpha_prod_t;
@ -1728,12 +1676,8 @@ static sd::Tensor<float> sample_tcd(denoise_cb_t model,
        float alpha_prod_s      = static_cast<float>(alphas_cumprod[timestep_s]);
        float beta_prod_s       = 1.0f - alpha_prod_s;
-        sd::Tensor<float> pred_original_sample = ((x / std::sqrt(sigma * sigma + 1)) -
+        x = std::sqrt(alpha_prod_s / alpha_prod_t_prev) * denoised +
-                                                  std::sqrt(beta_prod_t) * model_output) *
+            std::sqrt(beta_prod_s / alpha_prod_t_prev) * d;
                                                 (1.0f / std::sqrt(alpha_prod_t));
        x = std::sqrt(alpha_prod_s / alpha_prod_t_prev) * pred_original_sample +
            std::sqrt(beta_prod_s / alpha_prod_t_prev) * model_output;
        if (eta > 0 && sigma_to > 0.0f) {
            x = std::sqrt(alpha_prod_t_prev / alpha_prod_s) * x +
@ -1804,10 +1748,7 @@ static sd::Tensor<float> sample_k_diffusion(sample_method_t method,
                                            const char* extra_sample_args) {
    switch (method) {
        case EULER_A_SAMPLE_METHOD:
-            if (is_flow_denoiser)
+            return sample_euler_ancestral(model, std::move(x), sigmas, rng, is_flow_denoiser, eta);
                return sample_euler_flow(model, std::move(x), sigmas, rng, eta);
            else
                return sample_euler_ancestral(model, std::move(x), sigmas, rng, eta);
        case EULER_SAMPLE_METHOD:
            return sample_euler(model, std::move(x), sigmas);
        case HEUN_SAMPLE_METHOD:
@ -1836,7 +1777,8 @@ static sd::Tensor<float> sample_k_diffusion(sample_method_t method,
        case ER_SDE_SAMPLE_METHOD:
            return sample_er_sde(model, std::move(x), sigmas, rng, is_flow_denoiser, eta);
        case DDIM_TRAILING_SAMPLE_METHOD:
-            return sample_ddim_trailing(model, std::move(x), sigmas, rng, eta);
+            // DDIM is equivalent to Euler Ancestral with the Simple scheduler
            return sample_euler_ancestral(model, std::move(x), sigmas, rng, is_flow_denoiser, eta);
        case TCD_SAMPLE_METHOD:
            return sample_tcd(model, std::move(x), sigmas, rng, eta);
        case EULER_CFG_PP_SAMPLE_METHOD:
--- a/src/ggml_extend.hpp
+++ b/src/ggml_extend.hpp
@ -2732,6 +2732,9 @@ public:
                rebuild_params_tensor_set();
                return true;
            }
        } else {
            LOG_DEBUG("%s skipping params allocation (no tensors)", get_desc().c_str());
            return true;
        }
        params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend);
        if (params_buffer == nullptr) {
--- a/src/ggml_graph_cut.cpp
+++ b/src/ggml_graph_cut.cpp
@ -16,6 +16,9 @@
 namespace sd::ggml_graph_cut {
    static constexpr double MAX_VRAM_BYTES_PER_GIB      = 1024.0 * 1024.0 * 1024.0;
    static constexpr size_t MAX_VRAM_AUTO_RESERVE_BYTES = 1024ULL * 1024ULL * 1024ULL;
    static std::string graph_cut_tensor_display_name(const ggml_tensor* tensor) {
        if (tensor == nullptr) {
            return "<null>";
@ -79,6 +82,58 @@ namespace sd::ggml_graph_cut {
               segment.output_bytes;
    }
    // Converts a VRAM budget expressed in GiB into bytes.
    //
    // max_vram: budget in GiB. Zero, negative values and NaN all mean "no budget".
    // Returns the budget in bytes, 0 when there is no budget, and the largest size_t
    // value when the requested budget does not fit in size_t (for example +infinity).
    size_t max_vram_gib_to_bytes(float max_vram) {
        // Written as !(x > 0) rather than (x <= 0) so that NaN also takes the early
        // return; converting NaN to an integer type is undefined behavior.
        if (!(max_vram > 0.f)) {
            return 0;
        }
        const double bytes         = static_cast<double>(max_vram) * MAX_VRAM_BYTES_PER_GIB;
        constexpr size_t max_bytes = static_cast<size_t>(-1);
        // A floating-to-integer conversion whose truncated value is out of range is
        // undefined behavior, so saturate instead of casting blindly.
        if (bytes >= static_cast<double>(max_bytes)) {
            return max_bytes;
        }
        return static_cast<size_t>(bytes);
    }
    // Expresses a byte count in GiB. The division is carried out in double precision
    // and the quotient is then narrowed to float.
    static float max_vram_bytes_to_gib(size_t max_vram_bytes) {
        const double gib = static_cast<double>(max_vram_bytes) / MAX_VRAM_BYTES_PER_GIB;
        return static_cast<float>(gib);
    }
    // Computes the graph VRAM budget used when `--max-vram -1` is requested.
    //
    // Queries the device behind `backend` for its free and total memory and returns the
    // free amount minus MAX_VRAM_AUTO_RESERVE_BYTES. Returns 0 (after logging a warning)
    // when there is no backend, no device, the device is a CPU device, or the free
    // memory does not exceed the reserve.
    static size_t resolve_auto_max_vram_bytes(ggml_backend_t backend) {
        if (backend == nullptr) {
            LOG_WARN("--max-vram -1 requested, but no backend is available; disabling graph splitting");
            return 0;
        }

        const ggml_backend_dev_t device = ggml_backend_get_device(backend);
        if (device == nullptr) {
            LOG_WARN("--max-vram -1 requested, but no backend device is available; disabling graph splitting");
            return 0;
        }

        const bool is_cpu_device = ggml_backend_dev_type(device) == GGML_BACKEND_DEVICE_TYPE_CPU;
        if (is_cpu_device) {
            LOG_WARN("--max-vram -1 requested, but the main backend is CPU; disabling graph splitting");
            return 0;
        }

        size_t free_bytes  = 0;
        size_t total_bytes = 0;
        ggml_backend_dev_memory(device, &free_bytes, &total_bytes);

        // Log values are reported in GiB as doubles.
        const auto as_gib = [](size_t bytes) { return bytes / MAX_VRAM_BYTES_PER_GIB; };

        if (free_bytes > MAX_VRAM_AUTO_RESERVE_BYTES) {
            const size_t budget_bytes = free_bytes - MAX_VRAM_AUTO_RESERVE_BYTES;
            LOG_INFO("--max-vram -1 auto-detected %.2f GiB free VRAM (%.2f GiB total), reserving 1.00 GiB; using %.2f GiB",
                     as_gib(free_bytes),
                     as_gib(total_bytes),
                     as_gib(budget_bytes));
            return budget_bytes;
        }

        // Nothing would be left once the reserve is subtracted.
        LOG_WARN("--max-vram -1 requested, but free VRAM is %.2f GiB; reserving 1.00 GiB leaves no graph budget",
                 as_gib(free_bytes));
        return 0;
    }
    // Resolves the configured max_vram value into a concrete GiB budget.
    //
    // Any value other than exactly -1 is returned untouched. For -1 the budget is
    // auto-detected from `backend` and converted from bytes to GiB.
    float resolve_max_vram_gib(float max_vram, ggml_backend_t backend) {
        const bool auto_detect = (max_vram == -1.f);
        if (!auto_detect) {
            return max_vram;
        }
        const size_t detected_bytes = resolve_auto_max_vram_bytes(backend);
        return max_vram_bytes_to_gib(detected_bytes);
    }
    static Segment make_segment_seed(const Plan& plan,
                                     size_t start_segment_index,
                                     size_t end_segment_index) {
--- a/src/ggml_graph_cut.h
+++ b/src/ggml_graph_cut.h
@ -83,6 +83,8 @@ namespace sd::ggml_graph_cut {
                                          ggml_cgraph* gf,
                                          const Segment& segment,
                                          const char* log_desc);
    // Converts a VRAM budget given in GiB to bytes. Values <= 0 yield 0, which callers
    // use to mean that graph splitting is disabled.
    size_t max_vram_gib_to_bytes(float max_vram);
    // Resolves the configured max_vram (in GiB). Any value other than exactly -1 is
    // returned unchanged. For -1 the free memory of the device behind `backend` is
    // queried and the result is that amount minus a 1 GiB reserve, expressed in GiB;
    // 0 is returned when no usable budget can be determined.
    float resolve_max_vram_gib(float max_vram, ggml_backend_t backend);
    Plan build_plan(ggml_backend_t backend,
                    ggml_cgraph* gf,
                    const std::unordered_set<const ggml_tensor*>& params_tensor_set,
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@ -1,4 +1,5 @@
 #include "ggml_extend.hpp"
 #include "ggml_graph_cut.h"
 #include "model.h"
 #include "rng.hpp"
@ -209,6 +210,7 @@ public:
        ggml_log_set(ggml_log_callback_default, nullptr);
        init_backend();
        max_vram = sd::ggml_graph_cut::resolve_max_vram_gib(max_vram, backend);
        ModelLoader model_loader;
@ -426,9 +428,7 @@ public:
        bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu;
-        const size_t max_graph_vram_bytes = max_vram <= 0.f
+        const size_t max_graph_vram_bytes = sd::ggml_graph_cut::max_vram_gib_to_bytes(max_vram);
                                                ? 0
                                                : static_cast<size_t>(static_cast<double>(max_vram) * 1024.0 * 1024.0 * 1024.0);
        {
            clip_backend = backend;
@ -3597,9 +3597,7 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
            hires_upscaler                    = std::make_unique<UpscalerGGML>(sd_ctx->sd->n_threads,
                                                            false,
                                                            request.hires.upscale_tile_size);
-            const size_t max_graph_vram_bytes = sd_ctx->sd->max_vram <= 0.f
+            const size_t max_graph_vram_bytes = sd::ggml_graph_cut::max_vram_gib_to_bytes(sd_ctx->sd->max_vram);
                                                    ? 0
                                                    : static_cast<size_t>(static_cast<double>(sd_ctx->sd->max_vram) * 1024.0 * 1024.0 * 1024.0);
            hires_upscaler->set_max_graph_vram_bytes(max_graph_vram_bytes);
            if (!hires_upscaler->load_from_file(request.hires.model_path,
                                                sd_ctx->sd->offload_params_to_cpu,
Author	SHA1	Message	Date
leejet	38b14adb67	feat: auto-detect max VRAM budget with --max-vram -1 (#1498 )	2026-05-16 16:14:25 +08:00
Wagner Bruna	fd1a2794f3	refactor: unify Euler, Euler Ancestral and DDIM implementations (#1474 )	2026-05-16 16:13:28 +08:00
cphlipot	db08b84607	fix: Fix broken GCC 16 build (enforce C11/C++17 compile ) (#1478 )	2026-05-16 16:10:16 +08:00
Wagner Bruna	686856edca	chore: do not report the fake VAE "allocation" as an error (#1494 )	2026-05-16 16:08:31 +08:00