From 6fa7ca9317beca4786b191a636e6f849611b9e53 Mon Sep 17 00:00:00 2001 From: JusteLeo Date: Sun, 15 Mar 2026 09:40:14 +0100 Subject: [PATCH 1/5] docs: add Anima2 gguf download link to anima.md (#1335) --- docs/anima.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/anima.md b/docs/anima.md index 9c94178..debc370 100644 --- a/docs/anima.md +++ b/docs/anima.md @@ -5,6 +5,7 @@ - Download Anima - safetensors: https://huggingface.co/circlestone-labs/Anima/tree/main/split_files/diffusion_models - gguf: https://huggingface.co/Bedovyy/Anima-GGUF/tree/main + - gguf Anima2: https://huggingface.co/JusteLeo/Anima2-GGUF/tree/main - Download vae - safetensors: https://huggingface.co/circlestone-labs/Anima/tree/main/split_files/vae - Download Qwen3-0.6B-Base @@ -17,4 +18,4 @@ .\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\anima-preview.safetensors --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_06b_base.safetensors -p "a lovely cat holding a sign says 'anima.cpp'" --cfg-scale 6.0 --sampling-method euler -v --offload-to-cpu --diffusion-fa ``` -anima image example \ No newline at end of file +anima image example From adfef629009ac68e5cd9316ad40f63a2ab10b174 Mon Sep 17 00:00:00 2001 From: rmatif Date: Sun, 15 Mar 2026 09:41:05 +0100 Subject: [PATCH 2/5] feat: add generic DiT support to spectrum cache (#1336) --- examples/cli/README.md | 2 +- examples/common/common.hpp | 4 ++-- examples/server/README.md | 2 +- src/stable-diffusion.cpp | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/cli/README.md b/examples/cli/README.md index 6c2ef1e..0450be9 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -139,7 +139,7 @@ Generation Options: --high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9]) -r, --ref-image reference image for Flux Kontext models (can be used multiple times) --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), - 'spectrum' (UNET Chebyshev+Taylor forecasting) + 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting) --cache-option named cache params (key=value format, comma-separated). easycache/ucache: threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. Examples: diff --git a/examples/common/common.hpp b/examples/common/common.hpp index 9c50c15..896edc3 100644 --- a/examples/common/common.hpp +++ b/examples/common/common.hpp @@ -1513,11 +1513,11 @@ struct SDGenerationParams { on_ref_image_arg}, {"", "--cache-mode", - "caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level)", + "caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting)", on_cache_mode_arg}, {"", "--cache-option", - "named cache params (key=value format, comma-separated). easycache/ucache: threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples: \"threshold=0.25\" or \"threshold=1.5,reset=0\"", + "named cache params (key=value format, comma-separated). easycache/ucache: threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. Examples: \"threshold=0.25\" or \"threshold=1.5,reset=0\"", on_cache_option_arg}, {"", "--cache-preset", diff --git a/examples/server/README.md b/examples/server/README.md index 7554436..8ed3baa 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -129,7 +129,7 @@ Default Generation Options: --skip-layers layers to skip for SLG steps (default: [7,8,9]) --high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9]) -r, --ref-image reference image for Flux Kontext models (can be used multiple times) - --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level) + --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting) --cache-option named cache params (key=value format, comma-separated). easycache/ucache: threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples: "threshold=0.25" or "threshold=1.5,reset=0" diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 613ebb0..2c80c9e 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -1797,9 +1797,9 @@ public: } } } else if (cache_params->mode == SD_CACHE_SPECTRUM) { - bool spectrum_supported = sd_version_is_unet(version); + bool spectrum_supported = sd_version_is_unet(version) || sd_version_is_dit(version); if (!spectrum_supported) { - LOG_WARN("Spectrum requested but not supported for this model type (only UNET models)"); + LOG_WARN("Spectrum requested but not supported for this model type (only UNET and DiT models)"); } else { SpectrumConfig spectrum_config; spectrum_config.w = cache_params->spectrum_w; From f6968bc58949bfd407a003e7bdac249c3b242cad Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Sun, 15 Mar 2026 05:42:47 -0300 Subject: [PATCH 3/5] chore: remove SD_FAST_SOFTMAX build flag (#1338) --- CMakeLists.txt | 7 ------- 1 file changed, 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b90086e..bad1ba4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,7 +36,6 @@ option(SD_VULKAN "sd: vulkan backend" OFF) option(SD_OPENCL "sd: opencl backend" OFF) option(SD_SYCL "sd: sycl backend" OFF) option(SD_MUSA "sd: musa backend" OFF) -option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF) option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF) option(SD_BUILD_SHARED_GGML_LIB "sd: build ggml as a separate shared lib" OFF) option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF) @@ -70,18 +69,12 @@ if (SD_HIPBLAS) message("-- Use HIPBLAS as backend stable-diffusion") set(GGML_HIP ON) add_definitions(-DSD_USE_CUDA) - if(SD_FAST_SOFTMAX) - set(GGML_CUDA_FAST_SOFTMAX ON) - endif() endif () if(SD_MUSA) message("-- Use MUSA as backend stable-diffusion") set(GGML_MUSA ON) add_definitions(-DSD_USE_CUDA) - if(SD_FAST_SOFTMAX) - set(GGML_CUDA_FAST_SOFTMAX ON) - endif() endif() set(SD_LIB stable-diffusion) From 630ee03f23bd9947f610dd9fe038c56c0ff9c2de Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Sun, 15 Mar 2026 05:43:46 -0300 Subject: [PATCH 4/5] refactor: move all cache parameter defaults to the library (#1327) --- docs/caching.md | 10 +---- examples/cli/README.md | 1 - examples/common/common.hpp | 64 +++--------------------------- examples/server/README.md | 1 - src/cache_dit.hpp | 81 -------------------------------------- src/stable-diffusion.cpp | 21 ++++++++-- 6 files changed, 24 insertions(+), 154 deletions(-) diff --git a/docs/caching.md b/docs/caching.md index 559b26a..cb103ae 100644 --- a/docs/caching.md +++ b/docs/caching.md @@ -80,7 +80,7 @@ Uses Taylor series approximation to predict block outputs: Combines DBCache and TaylorSeer: ```bash ---cache-mode cache-dit --cache-preset fast +--cache-mode cache-dit ``` #### Parameters @@ -92,14 +92,6 @@ Combines DBCache and TaylorSeer: | `threshold` | L1 residual difference threshold | 0.08 | | `warmup` | Steps before caching starts | 8 | -#### Presets - -Available presets: `slow`, `medium`, `fast`, `ultra` (or `s`, `m`, `f`, `u`). - -```bash ---cache-mode cache-dit --cache-preset fast -``` - #### SCM Options Steps Computation Mask controls which steps can be cached: diff --git a/examples/cli/README.md b/examples/cli/README.md index 0450be9..904f3c4 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -144,7 +144,6 @@ Generation Options: threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. Examples: "threshold=0.25" or "threshold=1.5,reset=0" or "w=0.4,window=2" - --cache-preset cache-dit preset: 'slow'/'s', 'medium'/'m', 'fast'/'f', 'ultra'/'u' --scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache --scm-policy SCM policy: 'dynamic' (default) or 'static' ``` diff --git a/examples/common/common.hpp b/examples/common/common.hpp index 896edc3..9389b03 100644 --- a/examples/common/common.hpp +++ b/examples/common/common.hpp @@ -1047,7 +1047,6 @@ struct SDGenerationParams { std::string cache_mode; std::string cache_option; - std::string cache_preset; std::string scm_mask; bool scm_policy_dynamic = true; sd_cache_params_t cache_params{}; @@ -1461,21 +1460,6 @@ struct SDGenerationParams { return 1; }; - auto on_cache_preset_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - cache_preset = argv_to_utf8(index, argv); - if (cache_preset != "slow" && cache_preset != "s" && cache_preset != "S" && - cache_preset != "medium" && cache_preset != "m" && cache_preset != "M" && - cache_preset != "fast" && cache_preset != "f" && cache_preset != "F" && - cache_preset != "ultra" && cache_preset != "u" && cache_preset != "U") { - fprintf(stderr, "error: invalid cache preset '%s', must be 'slow'/'s', 'medium'/'m', 'fast'/'f', or 'ultra'/'u'\n", cache_preset.c_str()); - return -1; - } - return 1; - }; - options.manual_options = { {"-s", "--seed", @@ -1519,10 +1503,6 @@ struct SDGenerationParams { "--cache-option", "named cache params (key=value format, comma-separated). easycache/ucache: threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. Examples: \"threshold=0.25\" or \"threshold=1.5,reset=0\"", on_cache_option_arg}, - {"", - "--cache-preset", - "cache-dit preset: 'slow'/'s', 'medium'/'m', 'fast'/'f', 'ultra'/'u'", - on_cache_preset_arg}, {"", "--scm-mask", "SCM steps mask for cache-dit: comma-separated 0/1 (e.g., \"1,1,1,0,0,1,0,0,1,0\") - 1=compute, 0=can cache", @@ -1575,7 +1555,6 @@ struct SDGenerationParams { load_if_exists("negative_prompt", negative_prompt); load_if_exists("cache_mode", cache_mode); load_if_exists("cache_option", cache_option); - load_if_exists("cache_preset", cache_preset); load_if_exists("scm_mask", scm_mask); load_if_exists("clip_skip", clip_skip); @@ -1810,48 +1789,17 @@ struct SDGenerationParams { if (!cache_mode.empty()) { if (cache_mode == "easycache") { - cache_params.mode = SD_CACHE_EASYCACHE; - cache_params.reuse_threshold = 0.2f; - cache_params.start_percent = 0.15f; - cache_params.end_percent = 0.95f; - cache_params.error_decay_rate = 1.0f; - cache_params.use_relative_threshold = true; - cache_params.reset_error_on_compute = true; + cache_params.mode = SD_CACHE_EASYCACHE; } else if (cache_mode == "ucache") { - cache_params.mode = SD_CACHE_UCACHE; - cache_params.reuse_threshold = 1.0f; - cache_params.start_percent = 0.15f; - cache_params.end_percent = 0.95f; - cache_params.error_decay_rate = 1.0f; - cache_params.use_relative_threshold = true; - cache_params.reset_error_on_compute = true; + cache_params.mode = SD_CACHE_UCACHE; } else if (cache_mode == "dbcache") { - cache_params.mode = SD_CACHE_DBCACHE; - cache_params.Fn_compute_blocks = 8; - cache_params.Bn_compute_blocks = 0; - cache_params.residual_diff_threshold = 0.08f; - cache_params.max_warmup_steps = 8; + cache_params.mode = SD_CACHE_DBCACHE; } else if (cache_mode == "taylorseer") { - cache_params.mode = SD_CACHE_TAYLORSEER; - cache_params.Fn_compute_blocks = 8; - cache_params.Bn_compute_blocks = 0; - cache_params.residual_diff_threshold = 0.08f; - cache_params.max_warmup_steps = 8; + cache_params.mode = SD_CACHE_TAYLORSEER; } else if (cache_mode == "cache-dit") { - cache_params.mode = SD_CACHE_CACHE_DIT; - cache_params.Fn_compute_blocks = 8; - cache_params.Bn_compute_blocks = 0; - cache_params.residual_diff_threshold = 0.08f; - cache_params.max_warmup_steps = 8; + cache_params.mode = SD_CACHE_CACHE_DIT; } else if (cache_mode == "spectrum") { - cache_params.mode = SD_CACHE_SPECTRUM; - cache_params.spectrum_w = 0.40f; - cache_params.spectrum_m = 3; - cache_params.spectrum_lam = 1.0f; - cache_params.spectrum_window_size = 2; - cache_params.spectrum_flex_window = 0.50f; - cache_params.spectrum_warmup_steps = 4; - cache_params.spectrum_stop_percent = 0.9f; + cache_params.mode = SD_CACHE_SPECTRUM; } if (!cache_option.empty()) { diff --git a/examples/server/README.md b/examples/server/README.md index 8ed3baa..38deff6 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -133,7 +133,6 @@ Default Generation Options: --cache-option named cache params (key=value format, comma-separated). easycache/ucache: threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples: "threshold=0.25" or "threshold=1.5,reset=0" - --cache-preset cache-dit preset: 'slow'/'s', 'medium'/'m', 'fast'/'f', 'ultra'/'u' --scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache --scm-policy SCM policy: 'dynamic' (default) or 'static' ``` diff --git a/src/cache_dit.hpp b/src/cache_dit.hpp index 6fe104d..4e3cf69 100644 --- a/src/cache_dit.hpp +++ b/src/cache_dit.hpp @@ -603,87 +603,6 @@ inline std::vector generate_scm_mask( return mask; } -inline std::vector get_scm_preset(const std::string& preset, int total_steps) { - struct Preset { - std::vector compute_bins; - std::vector cache_bins; - }; - - Preset slow = {{8, 3, 3, 2, 1, 1}, {1, 2, 2, 2, 3}}; - Preset medium = {{6, 2, 2, 2, 2, 1}, {1, 3, 3, 3, 3}}; - Preset fast = {{6, 1, 1, 1, 1, 1}, {1, 3, 4, 5, 4}}; - Preset ultra = {{4, 1, 1, 1, 1}, {2, 5, 6, 7}}; - - Preset* p = nullptr; - if (preset == "slow" || preset == "s" || preset == "S") - p = &slow; - else if (preset == "medium" || preset == "m" || preset == "M") - p = &medium; - else if (preset == "fast" || preset == "f" || preset == "F") - p = &fast; - else if (preset == "ultra" || preset == "u" || preset == "U") - p = &ultra; - else - return {}; - - if (total_steps != 28 && total_steps > 0) { - float scale = static_cast(total_steps) / 28.0f; - std::vector scaled_compute, scaled_cache; - - for (int v : p->compute_bins) { - scaled_compute.push_back(std::max(1, static_cast(v * scale + 0.5f))); - } - for (int v : p->cache_bins) { - scaled_cache.push_back(std::max(1, static_cast(v * scale + 0.5f))); - } - - return generate_scm_mask(scaled_compute, scaled_cache, total_steps); - } - - return generate_scm_mask(p->compute_bins, p->cache_bins, total_steps); -} - -inline float get_preset_threshold(const std::string& preset) { - if (preset == "slow" || preset == "s" || preset == "S") - return 0.20f; - if (preset == "medium" || preset == "m" || preset == "M") - return 0.25f; - if (preset == "fast" || preset == "f" || preset == "F") - return 0.30f; - if (preset == "ultra" || preset == "u" || preset == "U") - return 0.34f; - return 0.08f; -} - -inline int get_preset_warmup(const std::string& preset) { - if (preset == "slow" || preset == "s" || preset == "S") - return 8; - if (preset == "medium" || preset == "m" || preset == "M") - return 6; - if (preset == "fast" || preset == "f" || preset == "F") - return 6; - if (preset == "ultra" || preset == "u" || preset == "U") - return 4; - return 8; -} - -inline int get_preset_Fn(const std::string& preset) { - if (preset == "slow" || preset == "s" || preset == "S") - return 8; - if (preset == "medium" || preset == "m" || preset == "M") - return 8; - if (preset == "fast" || preset == "f" || preset == "F") - return 6; - if (preset == "ultra" || preset == "u" || preset == "U") - return 4; - return 8; -} - -inline int get_preset_Bn(const std::string& preset) { - (void)preset; - return 0; -} - inline void parse_dbcache_options(const std::string& opts, DBCacheConfig& cfg) { if (opts.empty()) return; diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 2c80c9e..d4b64ee 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -98,6 +98,19 @@ void suppress_pp(int step, int steps, float time, void* data) { return; } +static float get_cache_reuse_threshold(const sd_cache_params_t& params) { + float reuse_threshold = params.reuse_threshold; + if (reuse_threshold == INFINITY) { + if (params.mode == SD_CACHE_EASYCACHE) { + reuse_threshold = 0.2; + } + else if (params.mode == SD_CACHE_UCACHE) { + reuse_threshold = 1.0; + } + } + return std::max(0.0f, reuse_threshold); +} + /*=============================================== StableDiffusionGGML ================================================*/ class StableDiffusionGGML { @@ -1715,7 +1728,7 @@ public: } else { EasyCacheConfig easycache_config; easycache_config.enabled = true; - easycache_config.reuse_threshold = std::max(0.0f, cache_params->reuse_threshold); + easycache_config.reuse_threshold = get_cache_reuse_threshold(*cache_params); easycache_config.start_percent = cache_params->start_percent; easycache_config.end_percent = cache_params->end_percent; easycache_state.init(easycache_config, denoiser.get()); @@ -1736,7 +1749,7 @@ public: } else { UCacheConfig ucache_config; ucache_config.enabled = true; - ucache_config.reuse_threshold = std::max(0.0f, cache_params->reuse_threshold); + ucache_config.reuse_threshold = get_cache_reuse_threshold(*cache_params); ucache_config.start_percent = cache_params->start_percent; ucache_config.end_percent = cache_params->end_percent; ucache_config.error_decay_rate = std::max(0.0f, std::min(1.0f, cache_params->error_decay_rate)); @@ -2983,7 +2996,7 @@ enum lora_apply_mode_t str_to_lora_apply_mode(const char* str) { void sd_cache_params_init(sd_cache_params_t* cache_params) { *cache_params = {}; cache_params->mode = SD_CACHE_DISABLED; - cache_params->reuse_threshold = 1.0f; + cache_params->reuse_threshold = INFINITY; cache_params->start_percent = 0.15f; cache_params->end_percent = 0.95f; cache_params->error_decay_rate = 1.0f; @@ -3229,7 +3242,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) { snprintf(buf + strlen(buf), 4096 - strlen(buf), "cache: %s (threshold=%.3f, start=%.2f, end=%.2f)\n", cache_mode_str, - sd_img_gen_params->cache.reuse_threshold, + get_cache_reuse_threshold(sd_img_gen_params->cache), sd_img_gen_params->cache.start_percent, sd_img_gen_params->cache.end_percent); free(sample_params_str); From 83eabd7c0123eeb8cf4b96588c059dfacc4883e6 Mon Sep 17 00:00:00 2001 From: Kevin Nause Date: Sun, 15 Mar 2026 04:46:01 -0400 Subject: [PATCH 5/5] ci: add CUDA Dockerfile (#1314) --- .github/workflows/build.yml | 2 +- Dockerfile.cuda | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) create mode 100644 Dockerfile.cuda diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b2da00d..9816e42 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -162,7 +162,7 @@ jobs: strategy: matrix: - variant: [musa, sycl, vulkan] + variant: [musa, sycl, vulkan, cuda] env: REGISTRY: ghcr.io diff --git a/Dockerfile.cuda b/Dockerfile.cuda new file mode 100644 index 0000000..13fef89 --- /dev/null +++ b/Dockerfile.cuda @@ -0,0 +1,25 @@ +ARG CUDA_VERSION=12.6.3 +ARG UBUNTU_VERSION=24.04 + +FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu${UBUNTU_VERSION} AS build + +RUN apt-get update && apt-get install -y --no-install-recommends build-essential git ccache cmake + +WORKDIR /sd.cpp + +COPY . . + +ARG CUDACXX=/usr/local/cuda/bin/nvcc +RUN cmake . -B ./build -DSD_CUDA=ON +RUN cmake --build ./build --config Release --parallel + +FROM nvidia/cuda:${CUDA_VERSION}-cudnn-runtime-ubuntu${UBUNTU_VERSION} AS runtime + +RUN apt-get update && \ + apt-get install --yes --no-install-recommends libgomp1 && \ + apt-get clean + +COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli +COPY --from=build /sd.cpp/build/bin/sd-server /sd-server + +ENTRYPOINT [ "/sd-cli" ]