feat: auto-detect max VRAM budget with --max-vram -1 (#1498)

This commit is contained in:
leejet 2026-05-16 16:14:25 +08:00 committed by GitHub
parent fd1a2794f3
commit 38b14adb67
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 65 additions and 10 deletions

View File

@ -55,7 +55,7 @@ Context Options:
then threads will be set to the number of CPU physical cores then threads will be set to the number of CPU physical cores
--chroma-t5-mask-pad <int> t5 mask pad size of chroma --chroma-t5-mask-pad <int> t5 mask pad size of chroma
--max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables --max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
graph splitting graph splitting; -1 auto-detects free VRAM minus 1 GiB
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
when needed when needed

View File

@ -397,7 +397,7 @@ ArgOptions SDContextParams::get_options() {
options.float_options = { options.float_options = {
{"", {"",
"--max-vram", "--max-vram",
"maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting", "maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting; -1 auto-detects free VRAM minus 1 GiB",
&max_vram}, &max_vram},
}; };

View File

@ -157,7 +157,7 @@ Context Options:
then threads will be set to the number of CPU physical cores then threads will be set to the number of CPU physical cores
--chroma-t5-mask-pad <int> t5 mask pad size of chroma --chroma-t5-mask-pad <int> t5 mask pad size of chroma
--max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables --max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
graph splitting graph splitting; -1 auto-detects free VRAM minus 1 GiB
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
when needed when needed

View File

@ -205,7 +205,7 @@ typedef struct {
bool chroma_use_t5_mask; bool chroma_use_t5_mask;
int chroma_t5_mask_pad; int chroma_t5_mask_pad;
bool qwen_image_zero_cond_t; bool qwen_image_zero_cond_t;
float max_vram; float max_vram; // GiB budget for graph-cut segmented param offload (0 = disabled, -1 = auto free VRAM minus 1 GiB)
} sd_ctx_params_t; } sd_ctx_params_t;
typedef struct { typedef struct {

View File

@ -16,6 +16,9 @@
namespace sd::ggml_graph_cut { namespace sd::ggml_graph_cut {
// Number of bytes in one GiB, kept as a double so it can be used directly in
// the GiB <-> byte conversions below.
static constexpr double MAX_VRAM_BYTES_PER_GIB = 1024.0 * 1024.0 * 1024.0;
// Headroom (1 GiB, in bytes) subtracted from the detected free VRAM when the
// budget is auto-detected via `--max-vram -1`.
static constexpr size_t MAX_VRAM_AUTO_RESERVE_BYTES = 1024ULL * 1024ULL * 1024ULL;
static std::string graph_cut_tensor_display_name(const ggml_tensor* tensor) { static std::string graph_cut_tensor_display_name(const ggml_tensor* tensor) {
if (tensor == nullptr) { if (tensor == nullptr) {
return "<null>"; return "<null>";
@ -79,6 +82,58 @@ namespace sd::ggml_graph_cut {
segment.output_bytes; segment.output_bytes;
} }
// Converts a VRAM budget expressed in GiB into a byte count.
//
// Returns 0 (graph splitting disabled) when `max_vram` is zero, negative or NaN.
// Budgets too large to be represented as a size_t (including +infinity) saturate
// at the largest size_t value: converting an out-of-range floating-point value
// to an integer type is undefined behavior, so it must be clamped first.
size_t max_vram_gib_to_bytes(float max_vram) {
    // Negated `>` instead of `<=`: NaN fails every ordered comparison, so this
    // form rejects it together with the non-positive values.
    if (!(max_vram > 0.f)) {
        return 0;
    }

    const double budget_bytes = static_cast<double>(max_vram) * MAX_VRAM_BYTES_PER_GIB;

    constexpr size_t largest_size = static_cast<size_t>(-1);
    // When the largest size_t is not exactly representable as a double, the cast
    // rounds up to the first value that no longer fits, hence `>=` rather than `>`.
    if (budget_bytes >= static_cast<double>(largest_size)) {
        return largest_size;
    }
    return static_cast<size_t>(budget_bytes);
}
// Expresses a byte count in GiB. The division is carried out in double
// precision and only the final result is narrowed to float.
static float max_vram_bytes_to_gib(size_t max_vram_bytes) {
    const double gib = static_cast<double>(max_vram_bytes) / MAX_VRAM_BYTES_PER_GIB;
    return static_cast<float>(gib);
}
// Computes the graph budget, in bytes, used when `--max-vram -1` is requested.
//
// The budget is the free memory reported for the backend's device minus
// MAX_VRAM_AUTO_RESERVE_BYTES. Returns 0 (graph splitting disabled), after
// logging a warning, when:
//   - `backend` is null,
//   - the backend has no device,
//   - the device is a CPU device,
//   - the reported free memory does not exceed the reserve.
static size_t resolve_auto_max_vram_bytes(ggml_backend_t backend) {
    if (backend == nullptr) {
        LOG_WARN("--max-vram -1 requested, but no backend is available; disabling graph splitting");
        return 0;
    }

    ggml_backend_dev_t device = ggml_backend_get_device(backend);
    if (device == nullptr) {
        LOG_WARN("--max-vram -1 requested, but no backend device is available; disabling graph splitting");
        return 0;
    }

    const bool runs_on_cpu = ggml_backend_dev_type(device) == GGML_BACKEND_DEVICE_TYPE_CPU;
    if (runs_on_cpu) {
        LOG_WARN("--max-vram -1 requested, but the main backend is CPU; disabling graph splitting");
        return 0;
    }

    // Query the device for its current free / total memory.
    size_t free_bytes  = 0;
    size_t total_bytes = 0;
    ggml_backend_dev_memory(device, &free_bytes, &total_bytes);

    const double free_gib = static_cast<double>(free_bytes) / MAX_VRAM_BYTES_PER_GIB;

    if (free_bytes > MAX_VRAM_AUTO_RESERVE_BYTES) {
        const size_t budget_bytes = free_bytes - MAX_VRAM_AUTO_RESERVE_BYTES;
        const double total_gib    = static_cast<double>(total_bytes) / MAX_VRAM_BYTES_PER_GIB;
        const double budget_gib   = static_cast<double>(budget_bytes) / MAX_VRAM_BYTES_PER_GIB;
        LOG_INFO("--max-vram -1 auto-detected %.2f GiB free VRAM (%.2f GiB total), reserving 1.00 GiB; using %.2f GiB",
                 free_gib,
                 total_gib,
                 budget_gib);
        return budget_bytes;
    }

    // Nothing is left once the reserve is taken out.
    LOG_WARN("--max-vram -1 requested, but free VRAM is %.2f GiB; reserving 1.00 GiB leaves no graph budget",
             free_gib);
    return 0;
}
// Resolves the user-supplied `max_vram` setting (in GiB) to the value to use.
//
// The sentinel -1 requests auto-detection: the budget is derived from the
// backend's device memory and converted back to GiB. Every other value is
// returned unchanged.
float resolve_max_vram_gib(float max_vram, ggml_backend_t backend) {
    const bool auto_detect = (max_vram == -1.f);
    if (auto_detect) {
        const size_t detected_bytes = resolve_auto_max_vram_bytes(backend);
        return max_vram_bytes_to_gib(detected_bytes);
    }
    return max_vram;
}
static Segment make_segment_seed(const Plan& plan, static Segment make_segment_seed(const Plan& plan,
size_t start_segment_index, size_t start_segment_index,
size_t end_segment_index) { size_t end_segment_index) {

View File

@ -83,6 +83,8 @@ namespace sd::ggml_graph_cut {
ggml_cgraph* gf, ggml_cgraph* gf,
const Segment& segment, const Segment& segment,
const char* log_desc); const char* log_desc);
// Converts a VRAM budget given in GiB to bytes; zero and negative budgets
// yield 0.
size_t max_vram_gib_to_bytes(float max_vram);
// Resolves a `max_vram` setting in GiB: -1 is replaced by a budget
// auto-detected from `backend`, any other value is returned unchanged.
float resolve_max_vram_gib(float max_vram, ggml_backend_t backend);
Plan build_plan(ggml_backend_t backend, Plan build_plan(ggml_backend_t backend,
ggml_cgraph* gf, ggml_cgraph* gf,
const std::unordered_set<const ggml_tensor*>& params_tensor_set, const std::unordered_set<const ggml_tensor*>& params_tensor_set,

View File

@ -1,4 +1,5 @@
#include "ggml_extend.hpp" #include "ggml_extend.hpp"
#include "ggml_graph_cut.h"
#include "model.h" #include "model.h"
#include "rng.hpp" #include "rng.hpp"
@ -209,6 +210,7 @@ public:
ggml_log_set(ggml_log_callback_default, nullptr); ggml_log_set(ggml_log_callback_default, nullptr);
init_backend(); init_backend();
max_vram = sd::ggml_graph_cut::resolve_max_vram_gib(max_vram, backend);
ModelLoader model_loader; ModelLoader model_loader;
@ -426,9 +428,7 @@ public:
bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu; bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu;
const size_t max_graph_vram_bytes = max_vram <= 0.f const size_t max_graph_vram_bytes = sd::ggml_graph_cut::max_vram_gib_to_bytes(max_vram);
? 0
: static_cast<size_t>(static_cast<double>(max_vram) * 1024.0 * 1024.0 * 1024.0);
{ {
clip_backend = backend; clip_backend = backend;
@ -3597,9 +3597,7 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
hires_upscaler = std::make_unique<UpscalerGGML>(sd_ctx->sd->n_threads, hires_upscaler = std::make_unique<UpscalerGGML>(sd_ctx->sd->n_threads,
false, false,
request.hires.upscale_tile_size); request.hires.upscale_tile_size);
const size_t max_graph_vram_bytes = sd_ctx->sd->max_vram <= 0.f const size_t max_graph_vram_bytes = sd::ggml_graph_cut::max_vram_gib_to_bytes(sd_ctx->sd->max_vram);
? 0
: static_cast<size_t>(static_cast<double>(sd_ctx->sd->max_vram) * 1024.0 * 1024.0 * 1024.0);
hires_upscaler->set_max_graph_vram_bytes(max_graph_vram_bytes); hires_upscaler->set_max_graph_vram_bytes(max_graph_vram_bytes);
if (!hires_upscaler->load_from_file(request.hires.model_path, if (!hires_upscaler->load_from_file(request.hires.model_path,
sd_ctx->sd->offload_params_to_cpu, sd_ctx->sd->offload_params_to_cpu,