feat: make negative max_vram control the amount of spare vram (#1503)

This commit is contained in:
Wagner Bruna 2026-05-18 12:00:06 -03:00 committed by GitHub
parent 21fd4e6788
commit f683c88a28
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 18 additions and 15 deletions

View File

@@ -55,7 +55,8 @@ Context Options:
then threads will be set to the number of CPU physical cores then threads will be set to the number of CPU physical cores
--chroma-t5-mask-pad <int> t5 mask pad size of chroma --chroma-t5-mask-pad <int> t5 mask pad size of chroma
--max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables --max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
graph splitting; -1 auto-detects free VRAM minus 1 GiB graph splitting; a negative value auto-detects free VRAM, sparing the
specified value (e.g. -0.5 will keep at least 0.5 GiB free)
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
when needed when needed

View File

@@ -413,7 +413,7 @@ ArgOptions SDContextParams::get_options() {
options.float_options = { options.float_options = {
{"", {"",
"--max-vram", "--max-vram",
"maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting; -1 auto-detects free VRAM minus 1 GiB", "maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting; a negative value auto-detects free VRAM, sparing the specified value (e.g. -0.5 will keep at least 0.5 GiB free)",
&max_vram}, &max_vram},
}; };

View File

@@ -157,7 +157,8 @@ Context Options:
then threads will be set to the number of CPU physical cores then threads will be set to the number of CPU physical cores
--chroma-t5-mask-pad <int> t5 mask pad size of chroma --chroma-t5-mask-pad <int> t5 mask pad size of chroma
--max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables --max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
graph splitting; -1 auto-detects free VRAM minus 1 GiB graph splitting; a negative value auto-detects free VRAM, sparing the
specified value (e.g. -0.5 will keep at least 0.5 GiB free)
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
when needed when needed

View File

@@ -17,7 +17,6 @@
namespace sd::ggml_graph_cut { namespace sd::ggml_graph_cut {
static constexpr double MAX_VRAM_BYTES_PER_GIB = 1024.0 * 1024.0 * 1024.0; static constexpr double MAX_VRAM_BYTES_PER_GIB = 1024.0 * 1024.0 * 1024.0;
static constexpr size_t MAX_VRAM_AUTO_RESERVE_BYTES = 1024ULL * 1024ULL * 1024ULL;
static std::string graph_cut_tensor_display_name(const ggml_tensor* tensor) { static std::string graph_cut_tensor_display_name(const ggml_tensor* tensor) {
if (tensor == nullptr) { if (tensor == nullptr) {
@@ -93,45 +92,47 @@ namespace sd::ggml_graph_cut {
return static_cast<float>(static_cast<double>(max_vram_bytes) / MAX_VRAM_BYTES_PER_GIB); return static_cast<float>(static_cast<double>(max_vram_bytes) / MAX_VRAM_BYTES_PER_GIB);
} }
static size_t resolve_auto_max_vram_bytes(ggml_backend_t backend) { static size_t resolve_auto_max_vram_bytes(float spare_vram, ggml_backend_t backend) {
if (backend == nullptr) { if (backend == nullptr) {
LOG_WARN("--max-vram -1 requested, but no backend is available; disabling graph splitting"); LOG_WARN("--max-vram < 0 requested, but no backend is available; disabling graph splitting");
return 0; return 0;
} }
ggml_backend_dev_t dev = ggml_backend_get_device(backend); ggml_backend_dev_t dev = ggml_backend_get_device(backend);
if (dev == nullptr) { if (dev == nullptr) {
LOG_WARN("--max-vram -1 requested, but no backend device is available; disabling graph splitting"); LOG_WARN("--max-vram < 0 requested, but no backend device is available; disabling graph splitting");
return 0; return 0;
} }
if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) { if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
LOG_WARN("--max-vram -1 requested, but the main backend is CPU; disabling graph splitting"); LOG_WARN("--max-vram < 0 requested, but the main backend is CPU; disabling graph splitting");
return 0; return 0;
} }
size_t free_vram = 0; size_t free_vram = 0;
size_t total_vram = 0; size_t total_vram = 0;
ggml_backend_dev_memory(dev, &free_vram, &total_vram); ggml_backend_dev_memory(dev, &free_vram, &total_vram);
size_t spare_bytes = static_cast<size_t>(MAX_VRAM_BYTES_PER_GIB * spare_vram);
if (free_vram <= MAX_VRAM_AUTO_RESERVE_BYTES) { if (free_vram <= spare_bytes) {
LOG_WARN("--max-vram -1 requested, but free VRAM is %.2f GiB; reserving 1.00 GiB leaves no graph budget", LOG_WARN("--max-vram < 0 requested, but free VRAM is %.2f GiB; reserving %.2f GiB leaves no graph budget",
free_vram / MAX_VRAM_BYTES_PER_GIB); free_vram / MAX_VRAM_BYTES_PER_GIB, spare_vram);
return 0; return 0;
} }
const size_t max_vram_bytes = free_vram - MAX_VRAM_AUTO_RESERVE_BYTES; const size_t max_vram_bytes = free_vram - spare_bytes;
LOG_INFO("--max-vram -1 auto-detected %.2f GiB free VRAM (%.2f GiB total), reserving 1.00 GiB; using %.2f GiB", LOG_INFO("--max-vram < 0 auto-detected %.2f GiB free VRAM (%.2f GiB total), reserving %.2f GiB; using %.2f GiB",
free_vram / MAX_VRAM_BYTES_PER_GIB, free_vram / MAX_VRAM_BYTES_PER_GIB,
total_vram / MAX_VRAM_BYTES_PER_GIB, total_vram / MAX_VRAM_BYTES_PER_GIB,
spare_vram,
max_vram_bytes / MAX_VRAM_BYTES_PER_GIB); max_vram_bytes / MAX_VRAM_BYTES_PER_GIB);
return max_vram_bytes; return max_vram_bytes;
} }
float resolve_max_vram_gib(float max_vram, ggml_backend_t backend) { float resolve_max_vram_gib(float max_vram, ggml_backend_t backend) {
if (max_vram != -1.f) { if (max_vram >= 0.f) {
return max_vram; return max_vram;
} }
return max_vram_bytes_to_gib(resolve_auto_max_vram_bytes(backend)); return max_vram_bytes_to_gib(resolve_auto_max_vram_bytes(-max_vram, backend));
} }
static Segment make_segment_seed(const Plan& plan, static Segment make_segment_seed(const Plan& plan,