Compare commits

..

4 Commits

10 changed files with 103 additions and 97 deletions

View File

@ -69,6 +69,12 @@ option(SD_BUILD_SHARED_GGML_LIB "sd: build ggml as a separate shared lib" O
option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF) option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF)
#option(SD_BUILD_SERVER "sd: build server example" ON) #option(SD_BUILD_SERVER "sd: build server example" ON)
set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED true)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED true)
if(SD_CUDA) if(SD_CUDA)
message("-- Use CUDA as backend stable-diffusion") message("-- Use CUDA as backend stable-diffusion")
set(GGML_CUDA ON) set(GGML_CUDA ON)

View File

@ -55,7 +55,7 @@ Context Options:
then threads will be set to the number of CPU physical cores then threads will be set to the number of CPU physical cores
--chroma-t5-mask-pad <int> t5 mask pad size of chroma --chroma-t5-mask-pad <int> t5 mask pad size of chroma
--max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables --max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
graph splitting graph splitting; -1 auto-detects free VRAM minus 1 GiB
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
when needed when needed

View File

@ -397,7 +397,7 @@ ArgOptions SDContextParams::get_options() {
options.float_options = { options.float_options = {
{"", {"",
"--max-vram", "--max-vram",
"maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting", "maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting; -1 auto-detects free VRAM minus 1 GiB",
&max_vram}, &max_vram},
}; };

View File

@ -157,7 +157,7 @@ Context Options:
then threads will be set to the number of CPU physical cores then threads will be set to the number of CPU physical cores
--chroma-t5-mask-pad <int> t5 mask pad size of chroma --chroma-t5-mask-pad <int> t5 mask pad size of chroma
--max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables --max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
graph splitting graph splitting; -1 auto-detects free VRAM minus 1 GiB
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
when needed when needed

View File

@ -205,7 +205,7 @@ typedef struct {
bool chroma_use_t5_mask; bool chroma_use_t5_mask;
int chroma_t5_mask_pad; int chroma_t5_mask_pad;
bool qwen_image_zero_cond_t; bool qwen_image_zero_cond_t;
float max_vram; float max_vram; // GiB budget for graph-cut segmented param offload (0 = disabled, -1 = auto free VRAM minus 1 GiB)
} sd_ctx_params_t; } sd_ctx_params_t;
typedef struct { typedef struct {

View File

@ -824,45 +824,33 @@ static std::tuple<float, float, float> get_ancestral_step(float sigma_from,
static sd::Tensor<float> sample_euler_ancestral(denoise_cb_t model, static sd::Tensor<float> sample_euler_ancestral(denoise_cb_t model,
sd::Tensor<float> x, sd::Tensor<float> x,
const std::vector<float>& sigmas, const std::vector<float>& sigmas,
std::shared_ptr<RNG> rng, std::shared_ptr<RNG> rng = nullptr,
float eta) { bool is_flow_denoiser = false,
float eta = 0.f) {
int steps = static_cast<int>(sigmas.size()) - 1; int steps = static_cast<int>(sigmas.size()) - 1;
for (int i = 0; i < steps; i++) { for (int i = 0; i < steps; i++) {
float sigma = sigmas[i]; float sigma = sigmas[i];
float sigma_to = sigmas[i + 1];
auto denoised_opt = model(x, sigma, i + 1, nullptr); auto denoised_opt = model(x, sigma, i + 1, nullptr);
if (denoised_opt.empty()) { if (denoised_opt.empty()) {
return {}; return {};
} }
sd::Tensor<float> denoised = std::move(denoised_opt); sd::Tensor<float> denoised = std::move(denoised_opt);
sd::Tensor<float> d = (x - denoised) / sigma; if (sigma_to == 0.f) {
auto [sigma_down, sigma_up] = get_ancestral_step(sigmas[i], sigmas[i + 1], eta); x = denoised;
x += d * (sigma_down - sigmas[i]); } else if (eta == 0.f) {
if (sigmas[i + 1] > 0) { float sigma_ratio = sigma_to / sigma;
x += sd::Tensor<float>::randn_like(x, rng) * sigma_up; x = sigma_ratio * x + (1.0 - sigma_ratio) * denoised;
} } else {
} auto [sigma_down, sigma_up, alpha_scale] = get_ancestral_step(sigma, sigma_to, eta, is_flow_denoiser);
return x;
}
static sd::Tensor<float> sample_euler_flow(denoise_cb_t model,
sd::Tensor<float> x,
const std::vector<float>& sigmas,
std::shared_ptr<RNG> rng,
float eta) {
int steps = static_cast<int>(sigmas.size()) - 1;
for (int i = 0; i < steps; i++) {
float sigma = sigmas[i];
auto denoised_opt = model(x, sigma, i + 1, nullptr);
if (denoised_opt.empty()) {
return {};
}
sd::Tensor<float> denoised = std::move(denoised_opt);
auto [sigma_down, sigma_up, alpha_scale] = get_ancestral_step_flow(sigma, sigmas[i + 1], eta);
float sigma_ratio = sigma_down / sigma; float sigma_ratio = sigma_down / sigma;
x = sigma_ratio * x + (1.0f - sigma_ratio) * denoised; x = sigma_ratio * x + (1.0f - sigma_ratio) * denoised;
if (sigma_up > 0.f) {
if (sigma_up > 0.0f) { if (is_flow_denoiser) {
x = alpha_scale * x + sd::Tensor<float>::randn_like(x, rng) * sigma_up; x *= alpha_scale;
}
x += sd::Tensor<float>::randn_like(x, rng) * sigma_up;
}
} }
} }
return x; return x;
@ -1633,46 +1621,6 @@ static sd::Tensor<float> sample_er_sde(denoise_cb_t model,
return x; return x;
} }
// DDIM-style sampler driven by a sigma schedule.
//
// model  - denoising callback, invoked once per step as model(x, sigma, i + 1, nullptr)
// x      - initial latent; updated step by step and returned
// sigmas - schedule; sigmas.size() - 1 steps are executed
// rng    - random source used for the noise added when eta > 0
// eta    - scales the per-step standard deviation; no noise is added when eta <= 0
//
// Returns the final x, or an empty tensor ({}) as soon as the callback returns
// an empty result.
static sd::Tensor<float> sample_ddim_trailing(denoise_cb_t model,
sd::Tensor<float> x,
const std::vector<float>& sigmas,
std::shared_ptr<RNG> rng,
float eta) {
int steps = static_cast<int>(sigmas.size()) - 1;
for (int i = 0; i < steps; i++) {
float sigma = sigmas[i];
float sigma_to = sigmas[i + 1];
auto model_output_opt = model(x, sigma, i + 1, nullptr);
if (model_output_opt.empty()) {
return {};
}
sd::Tensor<float> model_output = std::move(model_output_opt);
// Overwrite the callback result with (x - result) / sigma.
// NOTE(review): presumably the callback returns the denoised prediction, which
// would make this the predicted noise direction - confirm against the caller.
model_output = (x - model_output) * (1.0f / sigma);
// Express both sigmas as cumulative alpha products: alpha = 1 / (sigma^2 + 1).
float alpha_prod_t = 1.0f / (sigma * sigma + 1.0f);
float alpha_prod_t_prev = 1.0f / (sigma_to * sigma_to + 1.0f);
float beta_prod_t = 1.0f - alpha_prod_t;
// x is scaled by 1 / sqrt(sigma^2 + 1) before the noise term is removed, and
// the difference is then scaled by 1 / sqrt(alpha_prod_t).
sd::Tensor<float> pred_original_sample = ((x / std::sqrt(sigma * sigma + 1)) -
std::sqrt(beta_prod_t) * model_output) *
(1.0f / std::sqrt(alpha_prod_t));
float beta_prod_t_prev = 1.0f - alpha_prod_t_prev;
float variance = (beta_prod_t_prev / beta_prod_t) *
(1.0f - alpha_prod_t / alpha_prod_t_prev);
// eta scales the standard deviation of the noise added below.
float std_dev_t = eta * std::sqrt(variance);
// Deterministic part of the update: predicted sample plus the direction term,
// whose coefficient is reduced by std_dev_t^2.
x = pred_original_sample +
std::sqrt((1.0f - alpha_prod_t_prev - std::pow(std_dev_t, 2)) / alpha_prod_t_prev) * model_output;
// Stochastic part: only applied for a positive eta.
if (eta > 0) {
x += std_dev_t / std::sqrt(alpha_prod_t_prev) * sd::Tensor<float>::randn_like(x, rng);
}
}
return x;
}
static sd::Tensor<float> sample_tcd(denoise_cb_t model, static sd::Tensor<float> sample_tcd(denoise_cb_t model,
sd::Tensor<float> x, sd::Tensor<float> x,
const std::vector<float>& sigmas, const std::vector<float>& sigmas,
@ -1715,12 +1663,12 @@ static sd::Tensor<float> sample_tcd(denoise_cb_t model,
int timestep_s = (int)floor((1 - eta) * prev_timestep); int timestep_s = (int)floor((1 - eta) * prev_timestep);
float sigma = sigmas[i]; float sigma = sigmas[i];
auto model_output_opt = model(x, sigma, i + 1, nullptr); auto denoised_opt = model(x, sigma, i + 1, nullptr);
if (model_output_opt.empty()) { if (denoised_opt.empty()) {
return {}; return {};
} }
sd::Tensor<float> model_output = std::move(model_output_opt); sd::Tensor<float> denoised = std::move(denoised_opt);
model_output = (x - model_output) * (1.0f / sigma); sd::Tensor<float> d = (x - denoised) / sigma;
float alpha_prod_t = 1.0f / (sigma * sigma + 1.0f); float alpha_prod_t = 1.0f / (sigma * sigma + 1.0f);
float beta_prod_t = 1.0f - alpha_prod_t; float beta_prod_t = 1.0f - alpha_prod_t;
@ -1728,12 +1676,8 @@ static sd::Tensor<float> sample_tcd(denoise_cb_t model,
float alpha_prod_s = static_cast<float>(alphas_cumprod[timestep_s]); float alpha_prod_s = static_cast<float>(alphas_cumprod[timestep_s]);
float beta_prod_s = 1.0f - alpha_prod_s; float beta_prod_s = 1.0f - alpha_prod_s;
sd::Tensor<float> pred_original_sample = ((x / std::sqrt(sigma * sigma + 1)) - x = std::sqrt(alpha_prod_s / alpha_prod_t_prev) * denoised +
std::sqrt(beta_prod_t) * model_output) * std::sqrt(beta_prod_s / alpha_prod_t_prev) * d;
(1.0f / std::sqrt(alpha_prod_t));
x = std::sqrt(alpha_prod_s / alpha_prod_t_prev) * pred_original_sample +
std::sqrt(beta_prod_s / alpha_prod_t_prev) * model_output;
if (eta > 0 && sigma_to > 0.0f) { if (eta > 0 && sigma_to > 0.0f) {
x = std::sqrt(alpha_prod_t_prev / alpha_prod_s) * x + x = std::sqrt(alpha_prod_t_prev / alpha_prod_s) * x +
@ -1804,10 +1748,7 @@ static sd::Tensor<float> sample_k_diffusion(sample_method_t method,
const char* extra_sample_args) { const char* extra_sample_args) {
switch (method) { switch (method) {
case EULER_A_SAMPLE_METHOD: case EULER_A_SAMPLE_METHOD:
if (is_flow_denoiser) return sample_euler_ancestral(model, std::move(x), sigmas, rng, is_flow_denoiser, eta);
return sample_euler_flow(model, std::move(x), sigmas, rng, eta);
else
return sample_euler_ancestral(model, std::move(x), sigmas, rng, eta);
case EULER_SAMPLE_METHOD: case EULER_SAMPLE_METHOD:
return sample_euler(model, std::move(x), sigmas); return sample_euler(model, std::move(x), sigmas);
case HEUN_SAMPLE_METHOD: case HEUN_SAMPLE_METHOD:
@ -1836,7 +1777,8 @@ static sd::Tensor<float> sample_k_diffusion(sample_method_t method,
case ER_SDE_SAMPLE_METHOD: case ER_SDE_SAMPLE_METHOD:
return sample_er_sde(model, std::move(x), sigmas, rng, is_flow_denoiser, eta); return sample_er_sde(model, std::move(x), sigmas, rng, is_flow_denoiser, eta);
case DDIM_TRAILING_SAMPLE_METHOD: case DDIM_TRAILING_SAMPLE_METHOD:
return sample_ddim_trailing(model, std::move(x), sigmas, rng, eta); // DDIM is equivalent to Euler Ancestral with the Simple scheduler
return sample_euler_ancestral(model, std::move(x), sigmas, rng, is_flow_denoiser, eta);
case TCD_SAMPLE_METHOD: case TCD_SAMPLE_METHOD:
return sample_tcd(model, std::move(x), sigmas, rng, eta); return sample_tcd(model, std::move(x), sigmas, rng, eta);
case EULER_CFG_PP_SAMPLE_METHOD: case EULER_CFG_PP_SAMPLE_METHOD:

View File

@ -2732,6 +2732,9 @@ public:
rebuild_params_tensor_set(); rebuild_params_tensor_set();
return true; return true;
} }
} else {
LOG_DEBUG("%s skipping params allocation (no tensors)", get_desc().c_str());
return true;
} }
params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend); params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend);
if (params_buffer == nullptr) { if (params_buffer == nullptr) {

View File

@ -16,6 +16,9 @@
namespace sd::ggml_graph_cut { namespace sd::ggml_graph_cut {
// Number of bytes in one GiB (1024^3), kept as a double so GiB <-> byte
// conversions are carried out in floating point.
static constexpr double MAX_VRAM_BYTES_PER_GIB = 1024.0 * 1024.0 * 1024.0;
// Amount of free VRAM (1 GiB, in bytes) held back when the budget is
// auto-detected for "--max-vram -1".
static constexpr size_t MAX_VRAM_AUTO_RESERVE_BYTES = 1024ULL * 1024ULL * 1024ULL;
static std::string graph_cut_tensor_display_name(const ggml_tensor* tensor) { static std::string graph_cut_tensor_display_name(const ggml_tensor* tensor) {
if (tensor == nullptr) { if (tensor == nullptr) {
return "<null>"; return "<null>";
@ -79,6 +82,58 @@ namespace sd::ggml_graph_cut {
segment.output_bytes; segment.output_bytes;
} }
// Converts a VRAM budget expressed in GiB into a byte count.
//
// max_vram - budget in GiB.
//
// Returns 0 for any budget that is not strictly positive, including NaN.
// Budgets too large to be represented (including +infinity) are clamped to the
// largest size_t value, because converting an out-of-range floating-point
// value to an integer type is undefined behaviour.
size_t max_vram_gib_to_bytes(float max_vram) {
    // Written as !(x > 0) instead of (x <= 0) so that NaN is rejected as well.
    if (!(max_vram > 0.f)) {
        return 0;
    }
    constexpr size_t max_bytes = static_cast<size_t>(-1);
    const double bytes         = static_cast<double>(max_vram) * MAX_VRAM_BYTES_PER_GIB;
    // static_cast<double>(max_bytes) may round up to the next power of two;
    // anything strictly below it is still safely convertible.
    if (bytes >= static_cast<double>(max_bytes)) {
        return max_bytes;
    }
    return static_cast<size_t>(bytes);
}
// Converts a byte count into GiB. The division is carried out in double
// precision and only the final result is narrowed to float.
static float max_vram_bytes_to_gib(size_t max_vram_bytes) {
    const double gib = static_cast<double>(max_vram_bytes) / MAX_VRAM_BYTES_PER_GIB;
    return static_cast<float>(gib);
}
// Computes the automatic "--max-vram -1" budget for `backend`, in bytes.
//
// The budget is the free memory reported for the backend's device minus
// MAX_VRAM_AUTO_RESERVE_BYTES. A warning is logged and 0 is returned when
// there is no backend, no device, the device is of CPU type, or the free
// memory does not exceed the reserve.
static size_t resolve_auto_max_vram_bytes(ggml_backend_t backend) {
    if (backend == nullptr) {
        LOG_WARN("--max-vram -1 requested, but no backend is available; disabling graph splitting");
        return 0;
    }

    ggml_backend_dev_t device = ggml_backend_get_device(backend);
    if (device == nullptr) {
        LOG_WARN("--max-vram -1 requested, but no backend device is available; disabling graph splitting");
        return 0;
    }
    if (ggml_backend_dev_type(device) == GGML_BACKEND_DEVICE_TYPE_CPU) {
        LOG_WARN("--max-vram -1 requested, but the main backend is CPU; disabling graph splitting");
        return 0;
    }

    size_t free_bytes  = 0;
    size_t total_bytes = 0;
    ggml_backend_dev_memory(device, &free_bytes, &total_bytes);

    const double free_gib = free_bytes / MAX_VRAM_BYTES_PER_GIB;
    if (free_bytes <= MAX_VRAM_AUTO_RESERVE_BYTES) {
        LOG_WARN("--max-vram -1 requested, but free VRAM is %.2f GiB; reserving 1.00 GiB leaves no graph budget",
                 free_gib);
        return 0;
    }

    const size_t budget_bytes = free_bytes - MAX_VRAM_AUTO_RESERVE_BYTES;
    const double total_gib    = total_bytes / MAX_VRAM_BYTES_PER_GIB;
    const double budget_gib   = budget_bytes / MAX_VRAM_BYTES_PER_GIB;
    LOG_INFO("--max-vram -1 auto-detected %.2f GiB free VRAM (%.2f GiB total), reserving 1.00 GiB; using %.2f GiB",
             free_gib,
             total_gib,
             budget_gib);
    return budget_bytes;
}
// Turns the user-supplied max_vram setting (GiB) into the effective budget.
//
// The exact value -1 requests auto-detection on `backend`; the detected byte
// budget is converted back to GiB. Every other value is returned unchanged.
float resolve_max_vram_gib(float max_vram, ggml_backend_t backend) {
    const bool auto_detect = (max_vram == -1.f);
    if (!auto_detect) {
        return max_vram;
    }
    const size_t auto_bytes = resolve_auto_max_vram_bytes(backend);
    return max_vram_bytes_to_gib(auto_bytes);
}
static Segment make_segment_seed(const Plan& plan, static Segment make_segment_seed(const Plan& plan,
size_t start_segment_index, size_t start_segment_index,
size_t end_segment_index) { size_t end_segment_index) {

View File

@ -83,6 +83,8 @@ namespace sd::ggml_graph_cut {
ggml_cgraph* gf, ggml_cgraph* gf,
const Segment& segment, const Segment& segment,
const char* log_desc); const char* log_desc);
// Converts a VRAM budget in GiB to bytes; non-positive budgets yield 0.
size_t max_vram_gib_to_bytes(float max_vram);
// Returns max_vram unchanged unless it is exactly -1, in which case the budget
// (in GiB) is derived from the memory reported for `backend`'s device.
float resolve_max_vram_gib(float max_vram, ggml_backend_t backend);
Plan build_plan(ggml_backend_t backend, Plan build_plan(ggml_backend_t backend,
ggml_cgraph* gf, ggml_cgraph* gf,
const std::unordered_set<const ggml_tensor*>& params_tensor_set, const std::unordered_set<const ggml_tensor*>& params_tensor_set,

View File

@ -1,4 +1,5 @@
#include "ggml_extend.hpp" #include "ggml_extend.hpp"
#include "ggml_graph_cut.h"
#include "model.h" #include "model.h"
#include "rng.hpp" #include "rng.hpp"
@ -209,6 +210,7 @@ public:
ggml_log_set(ggml_log_callback_default, nullptr); ggml_log_set(ggml_log_callback_default, nullptr);
init_backend(); init_backend();
max_vram = sd::ggml_graph_cut::resolve_max_vram_gib(max_vram, backend);
ModelLoader model_loader; ModelLoader model_loader;
@ -426,9 +428,7 @@ public:
bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu; bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu;
const size_t max_graph_vram_bytes = max_vram <= 0.f const size_t max_graph_vram_bytes = sd::ggml_graph_cut::max_vram_gib_to_bytes(max_vram);
? 0
: static_cast<size_t>(static_cast<double>(max_vram) * 1024.0 * 1024.0 * 1024.0);
{ {
clip_backend = backend; clip_backend = backend;
@ -3597,9 +3597,7 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
hires_upscaler = std::make_unique<UpscalerGGML>(sd_ctx->sd->n_threads, hires_upscaler = std::make_unique<UpscalerGGML>(sd_ctx->sd->n_threads,
false, false,
request.hires.upscale_tile_size); request.hires.upscale_tile_size);
const size_t max_graph_vram_bytes = sd_ctx->sd->max_vram <= 0.f const size_t max_graph_vram_bytes = sd::ggml_graph_cut::max_vram_gib_to_bytes(sd_ctx->sd->max_vram);
? 0
: static_cast<size_t>(static_cast<double>(sd_ctx->sd->max_vram) * 1024.0 * 1024.0 * 1024.0);
hires_upscaler->set_max_graph_vram_bytes(max_graph_vram_bytes); hires_upscaler->set_max_graph_vram_bytes(max_graph_vram_bytes);
if (!hires_upscaler->load_from_file(request.hires.model_path, if (!hires_upscaler->load_from_file(request.hires.model_path,
sd_ctx->sd->offload_params_to_cpu, sd_ctx->sd->offload_params_to_cpu,