diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 666887d9..1fbcbf94 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -21,11 +21,13 @@ on:
"**/*.c",
"**/*.cpp",
"**/*.cu",
+ "examples/server/frontend/**",
]
pull_request:
types: [opened, synchronize, reopened]
paths:
[
+ ".github/workflows/**",
"**/CMakeLists.txt",
"**/Makefile",
"**/*.h",
@@ -33,6 +35,7 @@ on:
"**/*.c",
"**/*.cpp",
"**/*.cu",
+ "examples/server/frontend/**",
]
env:
@@ -53,6 +56,16 @@ jobs:
with:
submodules: recursive
+ - name: Setup Node
+ uses: actions/setup-node@v4
+ with:
+ node-version: 22
+
+ - name: Setup pnpm
+ uses: pnpm/action-setup@v4
+ with:
+ version: 10
+
- name: Dependencies
id: depends
run: |
@@ -70,7 +83,7 @@ jobs:
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
- uses: pr-mpt/actions-commit-hash@v2
+ uses: prompt/actions-commit-hash@v2
- name: Fetch system info
id: system-info
@@ -106,6 +119,16 @@ jobs:
with:
submodules: recursive
+ - name: Setup Node
+ uses: actions/setup-node@v4
+ with:
+ node-version: 22
+
+ - name: Setup pnpm
+ uses: pnpm/action-setup@v4
+ with:
+ version: 10
+
- name: Dependencies
id: depends
run: |
@@ -123,7 +146,7 @@ jobs:
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
- uses: pr-mpt/actions-commit-hash@v2
+ uses: prompt/actions-commit-hash@v2
- name: Fetch system info
id: system-info
@@ -162,7 +185,7 @@ jobs:
strategy:
matrix:
- variant: [musa, sycl, vulkan]
+ variant: [musa, sycl, vulkan, cuda]
env:
REGISTRY: ghcr.io
@@ -174,10 +197,20 @@ jobs:
with:
submodules: recursive
+ - name: Setup Node
+ uses: actions/setup-node@v4
+ with:
+ node-version: 22
+
+ - name: Setup pnpm
+ uses: pnpm/action-setup@v4
+ with:
+ version: 10
+
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
- uses: pr-mpt/actions-commit-hash@v2
+ uses: prompt/actions-commit-hash@v2
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
@@ -223,6 +256,16 @@ jobs:
with:
submodules: recursive
+ - name: Setup Node
+ uses: actions/setup-node@v4
+ with:
+ node-version: 22
+
+ - name: Setup pnpm
+ uses: pnpm/action-setup@v4
+ with:
+ version: 10
+
- name: Dependencies
id: depends
run: |
@@ -240,7 +283,7 @@ jobs:
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
- uses: pr-mpt/actions-commit-hash@v2
+ uses: prompt/actions-commit-hash@v2
- name: Fetch system info
id: system-info
@@ -294,6 +337,16 @@ jobs:
with:
submodules: recursive
+ - name: Setup Node
+ uses: actions/setup-node@v4
+ with:
+ node-version: 22
+
+ - name: Setup pnpm
+ uses: pnpm/action-setup@v4
+ with:
+ version: 10
+
- name: Install cuda-toolkit
id: cuda-toolkit
if: ${{ matrix.build == 'cuda12' }}
@@ -340,7 +393,7 @@ jobs:
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
- uses: pr-mpt/actions-commit-hash@v2
+ uses: prompt/actions-commit-hash@v2
- name: Pack artifacts
id: pack_artifacts
@@ -399,6 +452,16 @@ jobs:
with:
submodules: recursive
+ - name: Setup Node
+ uses: actions/setup-node@v4
+ with:
+ node-version: 22
+
+ - name: Setup pnpm
+ uses: pnpm/action-setup@v4
+ with:
+ version: 10
+
- name: Cache ROCm Installation
id: cache-rocm
uses: actions/cache@v4
@@ -463,7 +526,7 @@ jobs:
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
- uses: pr-mpt/actions-commit-hash@v2
+ uses: prompt/actions-commit-hash@v2
- name: Pack artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -502,6 +565,16 @@ jobs:
with:
submodules: recursive
+ - name: Setup Node
+ uses: actions/setup-node@v4
+ with:
+ node-version: 20
+
+ - name: Setup pnpm
+ uses: pnpm/action-setup@v4
+ with:
+ version: 9
+
- name: Free disk space
run: |
# Remove preinstalled SDKs and caches not needed for this job
@@ -581,7 +654,7 @@ jobs:
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
- uses: pr-mpt/actions-commit-hash@v2
+ uses: prompt/actions-commit-hash@v2
- name: Prepare artifacts
id: prepare_artifacts
@@ -660,7 +733,7 @@ jobs:
- name: Get commit hash
id: commit
- uses: pr-mpt/actions-commit-hash@v2
+ uses: prompt/actions-commit-hash@v2
- name: Create release
id: create_release
diff --git a/.gitmodules b/.gitmodules
index 5a785197..5d66c879 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,6 @@
[submodule "ggml"]
path = ggml
url = https://github.com/ggml-org/ggml.git
+[submodule "examples/server/frontend"]
+ path = examples/server/frontend
+ url = https://github.com/leejet/stable-ui.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b90086ea..bad1ba4c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,7 +36,6 @@ option(SD_VULKAN "sd: vulkan backend" OFF)
option(SD_OPENCL "sd: opencl backend" OFF)
option(SD_SYCL "sd: sycl backend" OFF)
option(SD_MUSA "sd: musa backend" OFF)
-option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
option(SD_BUILD_SHARED_GGML_LIB "sd: build ggml as a separate shared lib" OFF)
option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF)
@@ -70,18 +69,12 @@ if (SD_HIPBLAS)
message("-- Use HIPBLAS as backend stable-diffusion")
set(GGML_HIP ON)
add_definitions(-DSD_USE_CUDA)
- if(SD_FAST_SOFTMAX)
- set(GGML_CUDA_FAST_SOFTMAX ON)
- endif()
endif ()
if(SD_MUSA)
message("-- Use MUSA as backend stable-diffusion")
set(GGML_MUSA ON)
add_definitions(-DSD_USE_CUDA)
- if(SD_FAST_SOFTMAX)
- set(GGML_CUDA_FAST_SOFTMAX ON)
- endif()
endif()
set(SD_LIB stable-diffusion)
diff --git a/Dockerfile.cuda b/Dockerfile.cuda
new file mode 100644
index 00000000..4deb7247
--- /dev/null
+++ b/Dockerfile.cuda
@@ -0,0 +1,25 @@
+ARG CUDA_VERSION=12.6.3
+ARG UBUNTU_VERSION=24.04
+
+FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu${UBUNTU_VERSION} AS build
+
+RUN apt-get update && apt-get install -y --no-install-recommends build-essential git ccache cmake
+
+WORKDIR /sd.cpp
+
+COPY . .
+
+ARG CUDACXX=/usr/local/cuda/bin/nvcc
+RUN cmake . -B ./build -DSD_CUDA=ON
+RUN cmake --build ./build --config Release -j$(nproc)
+
+FROM nvidia/cuda:${CUDA_VERSION}-cudnn-runtime-ubuntu${UBUNTU_VERSION} AS runtime
+
+RUN apt-get update && \
+ apt-get install --yes --no-install-recommends libgomp1 && \
+ apt-get clean
+
+COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
+COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
+
+ENTRYPOINT [ "/sd-cli" ]
diff --git a/docs/anima.md b/docs/anima.md
index 9c941785..debc370b 100644
--- a/docs/anima.md
+++ b/docs/anima.md
@@ -5,6 +5,7 @@
- Download Anima
- safetensors: https://huggingface.co/circlestone-labs/Anima/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/Bedovyy/Anima-GGUF/tree/main
+ - gguf Anima2: https://huggingface.co/JusteLeo/Anima2-GGUF/tree/main
- Download vae
- safetensors: https://huggingface.co/circlestone-labs/Anima/tree/main/split_files/vae
- Download Qwen3-0.6B-Base
@@ -17,4 +18,4 @@
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\anima-preview.safetensors --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_06b_base.safetensors -p "a lovely cat holding a sign says 'anima.cpp'" --cfg-scale 6.0 --sampling-method euler -v --offload-to-cpu --diffusion-fa
```
-
\ No newline at end of file
+
diff --git a/docs/caching.md b/docs/caching.md
index 7b4be3ce..b02a541b 100644
--- a/docs/caching.md
+++ b/docs/caching.md
@@ -11,6 +11,7 @@ Caching methods accelerate diffusion inference by reusing intermediate computati
| `dbcache` | DiT models | Block-level L1 residual threshold |
| `taylorseer` | DiT models | Taylor series approximation |
| `cache-dit` | DiT models | Combined DBCache + TaylorSeer |
+| `spectrum` | UNET and DiT models | Chebyshev + Taylor output forecasting |
### UCache (UNET Models)
@@ -79,7 +80,7 @@ Uses Taylor series approximation to predict block outputs:
Combines DBCache and TaylorSeer:
```bash
---cache-mode cache-dit --cache-preset fast
+--cache-mode cache-dit
```
#### Parameters
@@ -91,14 +92,6 @@ Combines DBCache and TaylorSeer:
| `threshold` | L1 residual difference threshold | 0.08 |
| `warmup` | Steps before caching starts | 8 |
-#### Presets
-
-Available presets: `slow`, `medium`, `fast`, `ultra` (or `s`, `m`, `f`, `u`).
-
-```bash
---cache-mode cache-dit --cache-preset fast
-```
-
#### SCM Options
Steps Computation Mask controls which steps can be cached:
@@ -118,6 +111,28 @@ Mask values: `1` = compute, `0` = can cache.
--scm-policy dynamic
```
+### Spectrum (UNET and DiT Models)
+
+Spectrum uses Chebyshev polynomial fitting blended with Taylor extrapolation to predict denoised outputs, skipping entire forward passes. Based on the paper [Spectrum: Adaptive Spectral Feature Forecasting for Efficient Diffusion Sampling](https://github.com/tingyu215/Spectrum).
+
+```bash
+sd-cli -m model.safetensors -p "a cat" --cache-mode spectrum
+```
+
+#### Parameters
+
+| Parameter | Description | Default |
+|-----------|-------------|---------|
+| `w` | Chebyshev vs Taylor blend weight (0=Taylor, 1=Chebyshev) | 0.40 |
+| `m` | Chebyshev polynomial degree | 3 |
+| `lam` | Ridge regression regularization | 1.0 |
+| `window` | Initial window size (compute every N steps) | 2 |
+| `flex` | Window growth per computed step after warmup | 0.50 |
+| `warmup` | Steps to always compute before caching starts | 4 |
+| `stop` | Stop caching at this fraction of total steps | 0.9 |
+
+
+
### Performance Tips
- Start with default thresholds and adjust based on output quality
diff --git a/examples/cli/README.md b/examples/cli/README.md
index 564e5ce0..904f3c44 100644
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@@ -138,11 +138,12 @@ Generation Options:
--skip-layers layers to skip for SLG steps (default: [7,8,9])
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
-r, --ref-image reference image for Flux Kontext models (can be used multiple times)
- --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level)
+ --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level),
+ 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting)
--cache-option named cache params (key=value format, comma-separated). easycache/ucache:
- threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples:
- "threshold=0.25" or "threshold=1.5,reset=0"
- --cache-preset cache-dit preset: 'slow'/'s', 'medium'/'m', 'fast'/'f', 'ultra'/'u'
+ threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=;
+ spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. Examples:
+ "threshold=0.25" or "threshold=1.5,reset=0" or "w=0.4,window=2"
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
--scm-policy SCM policy: 'dynamic' (default) or 'static'
```
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index f9e4928e..ddb88c97 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -601,7 +601,7 @@ int main(int argc, const char* argv[]) {
if (gen_params.end_image_path.size() > 0) {
vae_decode_only = false;
- if (!load_image_and_update_size(gen_params.init_image_path, end_image)) {
+ if (!load_image_and_update_size(gen_params.end_image_path, end_image)) {
return 1;
}
}
diff --git a/examples/common/common.hpp b/examples/common/common.hpp
index 369c1f07..9389b03a 100644
--- a/examples/common/common.hpp
+++ b/examples/common/common.hpp
@@ -1047,7 +1047,6 @@ struct SDGenerationParams {
std::string cache_mode;
std::string cache_option;
- std::string cache_preset;
std::string scm_mask;
bool scm_policy_dynamic = true;
sd_cache_params_t cache_params{};
@@ -1422,8 +1421,8 @@ struct SDGenerationParams {
}
cache_mode = argv_to_utf8(index, argv);
if (cache_mode != "easycache" && cache_mode != "ucache" &&
- cache_mode != "dbcache" && cache_mode != "taylorseer" && cache_mode != "cache-dit") {
- fprintf(stderr, "error: invalid cache mode '%s', must be 'easycache', 'ucache', 'dbcache', 'taylorseer', or 'cache-dit'\n", cache_mode.c_str());
+ cache_mode != "dbcache" && cache_mode != "taylorseer" && cache_mode != "cache-dit" && cache_mode != "spectrum") {
+ fprintf(stderr, "error: invalid cache mode '%s', must be 'easycache', 'ucache', 'dbcache', 'taylorseer', 'cache-dit', or 'spectrum'\n", cache_mode.c_str());
return -1;
}
return 1;
@@ -1461,21 +1460,6 @@ struct SDGenerationParams {
return 1;
};
- auto on_cache_preset_arg = [&](int argc, const char** argv, int index) {
- if (++index >= argc) {
- return -1;
- }
- cache_preset = argv_to_utf8(index, argv);
- if (cache_preset != "slow" && cache_preset != "s" && cache_preset != "S" &&
- cache_preset != "medium" && cache_preset != "m" && cache_preset != "M" &&
- cache_preset != "fast" && cache_preset != "f" && cache_preset != "F" &&
- cache_preset != "ultra" && cache_preset != "u" && cache_preset != "U") {
- fprintf(stderr, "error: invalid cache preset '%s', must be 'slow'/'s', 'medium'/'m', 'fast'/'f', or 'ultra'/'u'\n", cache_preset.c_str());
- return -1;
- }
- return 1;
- };
-
options.manual_options = {
{"-s",
"--seed",
@@ -1513,16 +1497,12 @@ struct SDGenerationParams {
on_ref_image_arg},
{"",
"--cache-mode",
- "caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level)",
+ "caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting)",
on_cache_mode_arg},
{"",
"--cache-option",
- "named cache params (key=value format, comma-separated). easycache/ucache: threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples: \"threshold=0.25\" or \"threshold=1.5,reset=0\"",
+ "named cache params (key=value format, comma-separated). easycache/ucache: threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. Examples: \"threshold=0.25\" or \"threshold=1.5,reset=0\"",
on_cache_option_arg},
- {"",
- "--cache-preset",
- "cache-dit preset: 'slow'/'s', 'medium'/'m', 'fast'/'f', 'ultra'/'u'",
- on_cache_preset_arg},
{"",
"--scm-mask",
"SCM steps mask for cache-dit: comma-separated 0/1 (e.g., \"1,1,1,0,0,1,0,0,1,0\") - 1=compute, 0=can cache",
@@ -1575,7 +1555,6 @@ struct SDGenerationParams {
load_if_exists("negative_prompt", negative_prompt);
load_if_exists("cache_mode", cache_mode);
load_if_exists("cache_option", cache_option);
- load_if_exists("cache_preset", cache_preset);
load_if_exists("scm_mask", scm_mask);
load_if_exists("clip_skip", clip_skip);
@@ -1779,7 +1758,23 @@ struct SDGenerationParams {
} else if (key == "Bn" || key == "bn") {
cache_params.Bn_compute_blocks = std::stoi(val);
} else if (key == "warmup") {
- cache_params.max_warmup_steps = std::stoi(val);
+ if (cache_mode == "spectrum") {
+ cache_params.spectrum_warmup_steps = std::stoi(val);
+ } else {
+ cache_params.max_warmup_steps = std::stoi(val);
+ }
+ } else if (key == "w") {
+ cache_params.spectrum_w = std::stof(val);
+ } else if (key == "m") {
+ cache_params.spectrum_m = std::stoi(val);
+ } else if (key == "lam") {
+ cache_params.spectrum_lam = std::stof(val);
+ } else if (key == "window") {
+ cache_params.spectrum_window_size = std::stoi(val);
+ } else if (key == "flex") {
+ cache_params.spectrum_flex_window = std::stof(val);
+ } else if (key == "stop") {
+ cache_params.spectrum_stop_percent = std::stof(val);
} else {
LOG_ERROR("error: unknown cache parameter '%s'", key.c_str());
return false;
@@ -1794,39 +1789,17 @@ struct SDGenerationParams {
if (!cache_mode.empty()) {
if (cache_mode == "easycache") {
- cache_params.mode = SD_CACHE_EASYCACHE;
- cache_params.reuse_threshold = 0.2f;
- cache_params.start_percent = 0.15f;
- cache_params.end_percent = 0.95f;
- cache_params.error_decay_rate = 1.0f;
- cache_params.use_relative_threshold = true;
- cache_params.reset_error_on_compute = true;
+ cache_params.mode = SD_CACHE_EASYCACHE;
} else if (cache_mode == "ucache") {
- cache_params.mode = SD_CACHE_UCACHE;
- cache_params.reuse_threshold = 1.0f;
- cache_params.start_percent = 0.15f;
- cache_params.end_percent = 0.95f;
- cache_params.error_decay_rate = 1.0f;
- cache_params.use_relative_threshold = true;
- cache_params.reset_error_on_compute = true;
+ cache_params.mode = SD_CACHE_UCACHE;
} else if (cache_mode == "dbcache") {
- cache_params.mode = SD_CACHE_DBCACHE;
- cache_params.Fn_compute_blocks = 8;
- cache_params.Bn_compute_blocks = 0;
- cache_params.residual_diff_threshold = 0.08f;
- cache_params.max_warmup_steps = 8;
+ cache_params.mode = SD_CACHE_DBCACHE;
} else if (cache_mode == "taylorseer") {
- cache_params.mode = SD_CACHE_TAYLORSEER;
- cache_params.Fn_compute_blocks = 8;
- cache_params.Bn_compute_blocks = 0;
- cache_params.residual_diff_threshold = 0.08f;
- cache_params.max_warmup_steps = 8;
+ cache_params.mode = SD_CACHE_TAYLORSEER;
} else if (cache_mode == "cache-dit") {
- cache_params.mode = SD_CACHE_CACHE_DIT;
- cache_params.Fn_compute_blocks = 8;
- cache_params.Bn_compute_blocks = 0;
- cache_params.residual_diff_threshold = 0.08f;
- cache_params.max_warmup_steps = 8;
+ cache_params.mode = SD_CACHE_CACHE_DIT;
+ } else if (cache_mode == "spectrum") {
+ cache_params.mode = SD_CACHE_SPECTRUM;
}
if (!cache_option.empty()) {
diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
index d1912608..8f5beba8 100644
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -1,6 +1,73 @@
set(TARGET sd-server)
+option(SD_SERVER_BUILD_FRONTEND "Build server frontend with pnpm" ON)
+
+set(FRONTEND_DIR "${CMAKE_CURRENT_SOURCE_DIR}/frontend")
+set(GENERATED_HTML_HEADER "${FRONTEND_DIR}/dist/gen_index_html.h")
+
+set(HAVE_FRONTEND_BUILD OFF)
+
+if(SD_SERVER_BUILD_FRONTEND AND EXISTS "${FRONTEND_DIR}")
+ if(WIN32)
+ find_program(PNPM_EXECUTABLE NAMES pnpm.cmd pnpm)
+ else()
+ find_program(PNPM_EXECUTABLE NAMES pnpm)
+ endif()
+
+ if(PNPM_EXECUTABLE)
+ message(STATUS "Frontend dir found: ${FRONTEND_DIR}")
+ message(STATUS "pnpm found: ${PNPM_EXECUTABLE}")
+
+ set(HAVE_FRONTEND_BUILD ON)
+
+ add_custom_target(${TARGET}_frontend_install
+ COMMAND "${PNPM_EXECUTABLE}" -C "${FRONTEND_DIR}" install
+ WORKING_DIRECTORY "${FRONTEND_DIR}"
+ COMMENT "Installing frontend dependencies"
+ VERBATIM
+ )
+
+ add_custom_target(${TARGET}_frontend_build
+ COMMAND "${PNPM_EXECUTABLE}" -C "${FRONTEND_DIR}" run build
+ WORKING_DIRECTORY "${FRONTEND_DIR}"
+ COMMENT "Building frontend"
+ VERBATIM
+ )
+
+ add_custom_target(${TARGET}_frontend_header
+ COMMAND "${PNPM_EXECUTABLE}" -C "${FRONTEND_DIR}" run build:header
+ WORKING_DIRECTORY "${FRONTEND_DIR}"
+ COMMENT "Generating gen_index_html.h"
+ VERBATIM
+ )
+
+ add_dependencies(${TARGET}_frontend_build ${TARGET}_frontend_install)
+ add_dependencies(${TARGET}_frontend_header ${TARGET}_frontend_build)
+
+ add_custom_target(${TARGET}_frontend
+ DEPENDS ${TARGET}_frontend_header
+ )
+
+ set_source_files_properties("${GENERATED_HTML_HEADER}" PROPERTIES GENERATED TRUE)
+ else()
+ message(WARNING "pnpm not found, frontend build disabled")
+ endif()
+else()
+ message(STATUS "Frontend disabled or directory not found: ${FRONTEND_DIR}")
+endif()
+
add_executable(${TARGET} main.cpp)
+
+if(HAVE_FRONTEND_BUILD)
+ add_dependencies(${TARGET} ${TARGET}_frontend)
+ target_sources(${TARGET} PRIVATE "${GENERATED_HTML_HEADER}")
+ target_include_directories(${TARGET} PRIVATE "${FRONTEND_DIR}/dist")
+ target_compile_definitions(${TARGET} PRIVATE HAVE_INDEX_HTML)
+ message(STATUS "HAVE_INDEX_HTML enabled")
+else()
+ message(STATUS "HAVE_INDEX_HTML disabled")
+endif()
+
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE stable-diffusion ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PUBLIC c_std_11 cxx_std_17)
\ No newline at end of file
diff --git a/examples/server/README.md b/examples/server/README.md
index 75544364..8aa2158f 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -1,3 +1,92 @@
+# Frontend
+
+## Build with Frontend
+
+The server can optionally build the web frontend and embed it into the binary as `gen_index_html.h`.
+
+### Requirements
+
+Install the following tools:
+
+* **Node.js** ≥ 22.18
+ https://nodejs.org/
+
+* **pnpm** ≥ 10
+ Install via npm:
+
+```bash
+npm install -g pnpm
+```
+
+Verify installation:
+
+```bash
+node -v
+pnpm -v
+```
+
+### Install frontend dependencies
+
+Go to the frontend directory and install dependencies:
+
+```bash
+cd examples/server/frontend
+pnpm install
+```
+
+### Build the server with CMake
+
+The frontend build is enabled by default (`SD_SERVER_BUILD_FRONTEND=ON`). Configure and build with CMake:
+
+```bash
+cmake -B build -DSD_SERVER_BUILD_FRONTEND=ON
+cmake --build build --config Release
+```
+
+If `pnpm` is available, the build system will automatically run:
+
+```
+pnpm run build
+pnpm run build:header
+```
+
+and embed the generated frontend into the server binary.
+
+## Frontend Repository
+
+The web frontend is maintained in a **separate repository**, https://github.com/leejet/stable-ui.
+
+If you want to modify the UI or frontend logic, please submit pull requests to the **frontend repository**.
+
+This repository (`stable-diffusion.cpp`) only vendors the frontend periodically. Changes from the frontend repo are synchronized:
+
+* approximately **every 1–2 weeks**, or
+* when there are **major frontend updates**
+
+Because of this, frontend changes will **not appear here immediately** after being merged upstream.
+
+## Using an external frontend
+
+By default, the server uses the **embedded frontend** generated during the build (`gen_index_html.h`).
+
+You can also serve a custom frontend file instead of the embedded one by using:
+
+```bash
+--serve-html-path
+```
+
+For example:
+
+```bash
+sd-server --serve-html-path ./index.html
+```
+
+In this case, the server will load and serve the specified `index.html` file instead of the embedded frontend. This is useful when:
+
+* developing or testing frontend changes
+* using a custom UI
+* avoiding rebuilding the binary after frontend modifications
+
# Run
```
@@ -129,11 +218,10 @@ Default Generation Options:
--skip-layers layers to skip for SLG steps (default: [7,8,9])
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
-r, --ref-image reference image for Flux Kontext models (can be used multiple times)
- --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level)
+ --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting)
--cache-option named cache params (key=value format, comma-separated). easycache/ucache:
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples:
"threshold=0.25" or "threshold=1.5,reset=0"
- --cache-preset cache-dit preset: 'slow'/'s', 'medium'/'m', 'fast'/'f', 'ultra'/'u'
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
--scm-policy SCM policy: 'dynamic' (default) or 'static'
```
diff --git a/examples/server/frontend b/examples/server/frontend
new file mode 160000
index 00000000..1a34176c
--- /dev/null
+++ b/examples/server/frontend
@@ -0,0 +1 @@
+Subproject commit 1a34176cd6d39ad3a226b2b69047e71f6797f6bc
diff --git a/examples/server/main.cpp b/examples/server/main.cpp
index cc9e66cc..6e4340a6 100644
--- a/examples/server/main.cpp
+++ b/examples/server/main.cpp
@@ -13,6 +13,10 @@
#include "common/common.hpp"
+#ifdef HAVE_INDEX_HTML
+#include "frontend/dist/gen_index_html.h"
+#endif
+
namespace fs = std::filesystem;
// ----------------------- helpers -----------------------
@@ -380,7 +384,13 @@ int main(int argc, const char** argv) {
return httplib::Server::HandlerResponse::Unhandled;
});
- // root
+ // index html
+ std::string index_html;
+#ifdef HAVE_INDEX_HTML
+ index_html.assign(reinterpret_cast<const char*>(index_html_bytes), index_html_size);
+#else
+ index_html = "Stable Diffusion Server is running";
+#endif
svr.Get("/", [&](const httplib::Request&, httplib::Response& res) {
if (!svr_params.serve_html_path.empty()) {
std::ifstream file(svr_params.serve_html_path);
@@ -392,7 +402,7 @@ int main(int argc, const char** argv) {
res.set_content("Error: Unable to read HTML file", "text/plain");
}
} else {
- res.set_content("Stable Diffusion Server is running", "text/plain");
+ res.set_content(index_html, "text/html");
}
});
diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
index 51b2b329..029c2ab1 100644
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@@ -251,6 +251,7 @@ enum sd_cache_mode_t {
SD_CACHE_DBCACHE,
SD_CACHE_TAYLORSEER,
SD_CACHE_CACHE_DIT,
+ SD_CACHE_SPECTRUM,
};
typedef struct {
@@ -271,6 +272,13 @@ typedef struct {
int taylorseer_skip_interval;
const char* scm_mask;
bool scm_policy_dynamic;
+ float spectrum_w;
+ int spectrum_m;
+ float spectrum_lam;
+ int spectrum_window_size;
+ float spectrum_flex_window;
+ int spectrum_warmup_steps;
+ float spectrum_stop_percent;
} sd_cache_params_t;
typedef struct {
diff --git a/src/anima.hpp b/src/anima.hpp
index 191a096d..5850cc3e 100644
--- a/src/anima.hpp
+++ b/src/anima.hpp
@@ -13,9 +13,9 @@
namespace Anima {
constexpr int ANIMA_GRAPH_SIZE = 65536;
- __STATIC_INLINE__ struct ggml_tensor* apply_gate(struct ggml_context* ctx,
- struct ggml_tensor* x,
- struct ggml_tensor* gate) {
+ __STATIC_INLINE__ ggml_tensor* apply_gate(ggml_context* ctx,
+ ggml_tensor* x,
+ ggml_tensor* gate) {
gate = ggml_reshape_3d(ctx, gate, gate->ne[0], 1, gate->ne[1]); // [N, 1, C]
return ggml_mul(ctx, x, gate);
}
@@ -26,7 +26,7 @@ namespace Anima {
blocks["proj.1"] = std::make_shared(in_dim, out_dim, false);
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
auto proj = std::dynamic_pointer_cast(blocks["proj.1"]);
return proj->forward(ctx, x);
}
@@ -39,7 +39,7 @@ namespace Anima {
blocks["1.linear_2"] = std::make_shared(in_dim, out_dim, false);
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
auto linear_1 = std::dynamic_pointer_cast(blocks["1.linear_1"]);
auto linear_2 = std::dynamic_pointer_cast(blocks["1.linear_2"]);
@@ -62,10 +62,10 @@ namespace Anima {
blocks["2"] = std::make_shared(hidden_features, 3 * in_features, false);
}
- std::pair forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* hidden_states,
- struct ggml_tensor* embedded_timestep,
- struct ggml_tensor* temb = nullptr) {
+ std::pair forward(GGMLRunnerContext* ctx,
+ ggml_tensor* hidden_states,
+ ggml_tensor* embedded_timestep,
+ ggml_tensor* temb = nullptr) {
auto norm = std::dynamic_pointer_cast(blocks["norm"]);
auto linear_1 = std::dynamic_pointer_cast(blocks["1"]);
auto linear_2 = std::dynamic_pointer_cast(blocks["2"]);
@@ -102,10 +102,10 @@ namespace Anima {
blocks["2"] = std::make_shared(hidden_features, 2 * in_features, false);
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* hidden_states,
- struct ggml_tensor* embedded_timestep,
- struct ggml_tensor* temb = nullptr) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* hidden_states,
+ ggml_tensor* embedded_timestep,
+ ggml_tensor* temb = nullptr) {
auto norm = std::dynamic_pointer_cast(blocks["norm"]);
auto linear_1 = std::dynamic_pointer_cast(blocks["1"]);
auto linear_2 = std::dynamic_pointer_cast(blocks["2"]);
@@ -152,11 +152,11 @@ namespace Anima {
blocks[this->out_proj_name] = std::make_shared(inner_dim, query_dim, false);
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* hidden_states,
- struct ggml_tensor* encoder_hidden_states = nullptr,
- struct ggml_tensor* pe_q = nullptr,
- struct ggml_tensor* pe_k = nullptr) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* hidden_states,
+ ggml_tensor* encoder_hidden_states = nullptr,
+ ggml_tensor* pe_q = nullptr,
+ ggml_tensor* pe_k = nullptr) {
if (encoder_hidden_states == nullptr) {
encoder_hidden_states = hidden_states;
}
@@ -183,7 +183,7 @@ namespace Anima {
q4 = q_norm->forward(ctx, q4);
k4 = k_norm->forward(ctx, k4);
- struct ggml_tensor* attn_out = nullptr;
+ ggml_tensor* attn_out = nullptr;
if (pe_q != nullptr || pe_k != nullptr) {
if (pe_q == nullptr) {
pe_q = pe_k;
@@ -227,7 +227,7 @@ namespace Anima {
blocks["layer2"] = std::make_shared(hidden_dim, dim, false);
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
auto layer1 = std::dynamic_pointer_cast(blocks["layer1"]);
auto layer2 = std::dynamic_pointer_cast(blocks["layer2"]);
@@ -245,7 +245,7 @@ namespace Anima {
blocks["2"] = std::make_shared(hidden_dim, dim, true);
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
auto layer0 = std::dynamic_pointer_cast(blocks["0"]);
auto layer2 = std::dynamic_pointer_cast(blocks["2"]);
@@ -267,11 +267,11 @@ namespace Anima {
blocks["mlp"] = std::make_shared(model_dim, model_dim * 4);
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* x,
- struct ggml_tensor* context,
- struct ggml_tensor* target_pe,
- struct ggml_tensor* context_pe) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* x,
+ ggml_tensor* context,
+ ggml_tensor* target_pe,
+ ggml_tensor* context_pe) {
auto norm_self_attn = std::dynamic_pointer_cast(blocks["norm_self_attn"]);
auto self_attn = std::dynamic_pointer_cast(blocks["self_attn"]);
auto norm_cross_attn = std::dynamic_pointer_cast(blocks["norm_cross_attn"]);
@@ -317,11 +317,11 @@ namespace Anima {
blocks["norm"] = std::make_shared(target_dim, 1e-6f);
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* source_hidden_states,
- struct ggml_tensor* target_input_ids,
- struct ggml_tensor* target_pe,
- struct ggml_tensor* source_pe) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* source_hidden_states,
+ ggml_tensor* target_input_ids,
+ ggml_tensor* target_pe,
+ ggml_tensor* source_pe) {
GGML_ASSERT(target_input_ids != nullptr);
if (ggml_n_dims(target_input_ids) == 1) {
target_input_ids = ggml_reshape_2d(ctx->ggml_ctx, target_input_ids, target_input_ids->ne[0], 1);
@@ -360,12 +360,12 @@ namespace Anima {
blocks["mlp"] = std::make_shared(hidden_size, hidden_size * mlp_ratio);
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* hidden_states,
- struct ggml_tensor* encoder_hidden_states,
- struct ggml_tensor* embedded_timestep,
- struct ggml_tensor* temb,
- struct ggml_tensor* image_pe) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* hidden_states,
+ ggml_tensor* encoder_hidden_states,
+ ggml_tensor* embedded_timestep,
+ ggml_tensor* temb,
+ ggml_tensor* image_pe) {
auto norm1 = std::dynamic_pointer_cast(blocks["adaln_modulation_self_attn"]);
auto attn1 = std::dynamic_pointer_cast(blocks["self_attn"]);
auto norm2 = std::dynamic_pointer_cast(blocks["adaln_modulation_cross_attn"]);
@@ -402,10 +402,10 @@ namespace Anima {
blocks["linear"] = std::make_shared(hidden_size, patch_size * patch_size * out_channels, false);
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* hidden_states,
- struct ggml_tensor* embedded_timestep,
- struct ggml_tensor* temb) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* hidden_states,
+ ggml_tensor* embedded_timestep,
+ ggml_tensor* temb) {
auto adaln = std::dynamic_pointer_cast(blocks["adaln_modulation"]);
auto linear = std::dynamic_pointer_cast(blocks["linear"]);
@@ -445,15 +445,15 @@ namespace Anima {
blocks["llm_adapter"] = std::make_shared(1024, 1024, 1024, 6, 16);
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* x,
- struct ggml_tensor* timestep,
- struct ggml_tensor* encoder_hidden_states,
- struct ggml_tensor* image_pe,
- struct ggml_tensor* t5_ids = nullptr,
- struct ggml_tensor* t5_weights = nullptr,
- struct ggml_tensor* adapter_q_pe = nullptr,
- struct ggml_tensor* adapter_k_pe = nullptr) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* x,
+ ggml_tensor* timestep,
+ ggml_tensor* encoder_hidden_states,
+ ggml_tensor* image_pe,
+ ggml_tensor* t5_ids = nullptr,
+ ggml_tensor* t5_weights = nullptr,
+ ggml_tensor* adapter_q_pe = nullptr,
+ ggml_tensor* adapter_k_pe = nullptr) {
GGML_ASSERT(x->ne[3] == 1);
auto x_embedder = std::dynamic_pointer_cast(blocks["x_embedder"]);
@@ -553,7 +553,7 @@ namespace Anima {
return "anima";
}
- void get_param_tensors(std::map& tensors, const std::string prefix) {
+ void get_param_tensors(std::map& tensors, const std::string prefix) {
net.get_param_tensors(tensors, prefix + ".net");
}
@@ -602,19 +602,18 @@ namespace Anima {
return Rope::embed_nd(ids, bs, axis_thetas, axes_dim);
}
- struct ggml_cgraph* build_graph(struct ggml_tensor* x,
- struct ggml_tensor* timesteps,
- struct ggml_tensor* context,
- struct ggml_tensor* t5_ids = nullptr,
- struct ggml_tensor* t5_weights = nullptr) {
+ ggml_cgraph* build_graph(const sd::Tensor& x_tensor,
+ const sd::Tensor& timesteps_tensor,
+ const sd::Tensor& context_tensor = {},
+ const sd::Tensor& t5_ids_tensor = {},
+ const sd::Tensor& t5_weights_tensor = {}) {
+ ggml_tensor* x = make_input(x_tensor);
+ ggml_tensor* timesteps = make_input(timesteps_tensor);
+ ggml_tensor* context = make_optional_input(context_tensor);
+ ggml_tensor* t5_ids = make_optional_input(t5_ids_tensor);
+ ggml_tensor* t5_weights = make_optional_input(t5_weights_tensor);
GGML_ASSERT(x->ne[3] == 1);
- struct ggml_cgraph* gf = new_graph_custom(ANIMA_GRAPH_SIZE);
-
- x = to_backend(x);
- timesteps = to_backend(timesteps);
- context = to_backend(context);
- t5_ids = to_backend(t5_ids);
- t5_weights = to_backend(t5_weights);
+ ggml_cgraph* gf = new_graph_custom(ANIMA_GRAPH_SIZE);
int64_t pad_h = (net.patch_size - x->ne[1] % net.patch_size) % net.patch_size;
int64_t pad_w = (net.patch_size - x->ne[0] % net.patch_size) % net.patch_size;
@@ -667,18 +666,16 @@ namespace Anima {
return gf;
}
- bool compute(int n_threads,
- struct ggml_tensor* x,
- struct ggml_tensor* timesteps,
- struct ggml_tensor* context,
- struct ggml_tensor* t5_ids = nullptr,
- struct ggml_tensor* t5_weights = nullptr,
- struct ggml_tensor** output = nullptr,
- struct ggml_context* output_ctx = nullptr) {
- auto get_graph = [&]() -> struct ggml_cgraph* {
+ sd::Tensor compute(int n_threads,
+ const sd::Tensor& x,
+ const sd::Tensor& timesteps,
+ const sd::Tensor& context = {},
+ const sd::Tensor& t5_ids = {},
+ const sd::Tensor& t5_weights = {}) {
+ auto get_graph = [&]() -> ggml_cgraph* {
return build_graph(x, timesteps, context, t5_ids, t5_weights);
};
- return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+ return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim());
}
};
} // namespace Anima
diff --git a/src/auto_encoder_kl.hpp b/src/auto_encoder_kl.hpp
new file mode 100644
index 00000000..039fb9df
--- /dev/null
+++ b/src/auto_encoder_kl.hpp
@@ -0,0 +1,852 @@
+#ifndef __AUTO_ENCODER_KL_HPP__
+#define __AUTO_ENCODER_KL_HPP__
+
+#include "vae.hpp"
+
+/*================================================== AutoEncoderKL ===================================================*/
+
+#define VAE_GRAPH_SIZE 20480
+
+class ResnetBlock : public UnaryBlock {
+protected:
+ int64_t in_channels;
+ int64_t out_channels;
+
+public:
+ ResnetBlock(int64_t in_channels,
+ int64_t out_channels)
+ : in_channels(in_channels),
+ out_channels(out_channels) {
+ // temb_channels is always 0
+ blocks["norm1"] = std::shared_ptr(new GroupNorm32(in_channels));
+ blocks["conv1"] = std::shared_ptr(new Conv2d(in_channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
+
+ blocks["norm2"] = std::shared_ptr(new GroupNorm32(out_channels));
+ blocks["conv2"] = std::shared_ptr(new Conv2d(out_channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
+
+ if (out_channels != in_channels) {
+ blocks["nin_shortcut"] = std::shared_ptr(new Conv2d(in_channels, out_channels, {1, 1}));
+ }
+ }
+
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
+ // x: [N, in_channels, h, w]
+ // t_emb is always None
+ auto norm1 = std::dynamic_pointer_cast(blocks["norm1"]);
+ auto conv1 = std::dynamic_pointer_cast(blocks["conv1"]);
+ auto norm2 = std::dynamic_pointer_cast(blocks["norm2"]);
+ auto conv2 = std::dynamic_pointer_cast(blocks["conv2"]);
+
+ auto h = x;
+ h = norm1->forward(ctx, h);
+ h = ggml_silu_inplace(ctx->ggml_ctx, h); // swish
+ h = conv1->forward(ctx, h);
+ // return h;
+
+ h = norm2->forward(ctx, h);
+ h = ggml_silu_inplace(ctx->ggml_ctx, h); // swish
+ // dropout, skip for inference
+ h = conv2->forward(ctx, h);
+
+ // skip connection
+ if (out_channels != in_channels) {
+ auto nin_shortcut = std::dynamic_pointer_cast(blocks["nin_shortcut"]);
+
+ x = nin_shortcut->forward(ctx, x); // [N, out_channels, h, w]
+ }
+
+ h = ggml_add(ctx->ggml_ctx, h, x);
+ return h; // [N, out_channels, h, w]
+ }
+};
+
+class AttnBlock : public UnaryBlock {
+protected:
+ int64_t in_channels;
+ bool use_linear;
+
+ void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {
+ auto iter = tensor_storage_map.find(prefix + "proj_out.weight");
+ if (iter != tensor_storage_map.end()) {
+ if (iter->second.n_dims == 4 && use_linear) {
+ use_linear = false;
+ blocks["q"] = std::make_shared(in_channels, in_channels, std::pair{1, 1});
+ blocks["k"] = std::make_shared(in_channels, in_channels, std::pair{1, 1});
+ blocks["v"] = std::make_shared(in_channels, in_channels, std::pair{1, 1});
+ blocks["proj_out"] = std::make_shared(in_channels, in_channels, std::pair{1, 1});
+ } else if (iter->second.n_dims == 2 && !use_linear) {
+ use_linear = true;
+ blocks["q"] = std::make_shared(in_channels, in_channels);
+ blocks["k"] = std::make_shared(in_channels, in_channels);
+ blocks["v"] = std::make_shared(in_channels, in_channels);
+ blocks["proj_out"] = std::make_shared(in_channels, in_channels);
+ }
+ }
+ }
+
+public:
+ AttnBlock(int64_t in_channels, bool use_linear)
+ : in_channels(in_channels), use_linear(use_linear) {
+ blocks["norm"] = std::shared_ptr(new GroupNorm32(in_channels));
+ if (use_linear) {
+ blocks["q"] = std::shared_ptr(new Linear(in_channels, in_channels));
+ blocks["k"] = std::shared_ptr(new Linear(in_channels, in_channels));
+ blocks["v"] = std::shared_ptr(new Linear(in_channels, in_channels));
+ blocks["proj_out"] = std::shared_ptr(new Linear(in_channels, in_channels));
+ } else {
+ blocks["q"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}));
+ blocks["k"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}));
+ blocks["v"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}));
+ blocks["proj_out"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}));
+ }
+ }
+
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
+ // x: [N, in_channels, h, w]
+ auto norm = std::dynamic_pointer_cast(blocks["norm"]);
+ auto q_proj = std::dynamic_pointer_cast(blocks["q"]);
+ auto k_proj = std::dynamic_pointer_cast(blocks["k"]);
+ auto v_proj = std::dynamic_pointer_cast(blocks["v"]);
+ auto proj_out = std::dynamic_pointer_cast(blocks["proj_out"]);
+
+ auto h_ = norm->forward(ctx, x);
+
+ const int64_t n = h_->ne[3];
+ const int64_t c = h_->ne[2];
+ const int64_t h = h_->ne[1];
+ const int64_t w = h_->ne[0];
+
+ ggml_tensor* q;
+ ggml_tensor* k;
+ ggml_tensor* v;
+ if (use_linear) {
+ h_ = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, h_, 1, 2, 0, 3)); // [N, h, w, in_channels]
+ h_ = ggml_reshape_3d(ctx->ggml_ctx, h_, c, h * w, n); // [N, h * w, in_channels]
+
+ q = q_proj->forward(ctx, h_); // [N, h * w, in_channels]
+ k = k_proj->forward(ctx, h_); // [N, h * w, in_channels]
+ v = v_proj->forward(ctx, h_); // [N, h * w, in_channels]
+ } else {
+ q = q_proj->forward(ctx, h_); // [N, in_channels, h, w]
+ q = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, q, 1, 2, 0, 3)); // [N, h, w, in_channels]
+ q = ggml_reshape_3d(ctx->ggml_ctx, q, c, h * w, n); // [N, h * w, in_channels]
+
+ k = k_proj->forward(ctx, h_); // [N, in_channels, h, w]
+ k = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, k, 1, 2, 0, 3)); // [N, h, w, in_channels]
+ k = ggml_reshape_3d(ctx->ggml_ctx, k, c, h * w, n); // [N, h * w, in_channels]
+
+ v = v_proj->forward(ctx, h_); // [N, in_channels, h, w]
+ v = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, v, 1, 2, 0, 3)); // [N, h, w, in_channels]
+ v = ggml_reshape_3d(ctx->ggml_ctx, v, c, h * w, n); // [N, h * w, in_channels]
+ }
+
+ h_ = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, 1, nullptr, false, ctx->flash_attn_enabled);
+
+ if (use_linear) {
+ h_ = proj_out->forward(ctx, h_); // [N, h * w, in_channels]
+
+ h_ = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, h_, 1, 0, 2, 3)); // [N, in_channels, h * w]
+ h_ = ggml_reshape_4d(ctx->ggml_ctx, h_, w, h, c, n); // [N, in_channels, h, w]
+ } else {
+ h_ = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, h_, 1, 0, 2, 3)); // [N, in_channels, h * w]
+ h_ = ggml_reshape_4d(ctx->ggml_ctx, h_, w, h, c, n); // [N, in_channels, h, w]
+
+ h_ = proj_out->forward(ctx, h_); // [N, in_channels, h, w]
+ }
+
+ h_ = ggml_add(ctx->ggml_ctx, h_, x);
+ return h_;
+ }
+};
+
+class AE3DConv : public Conv2d {
+public:
+ AE3DConv(int64_t in_channels,
+ int64_t out_channels,
+ std::pair kernel_size,
+ int video_kernel_size = 3,
+ std::pair stride = {1, 1},
+ std::pair padding = {0, 0},
+ std::pair dilation = {1, 1},
+ bool bias = true)
+ : Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias) {
+ int kernel_padding = video_kernel_size / 2;
+ blocks["time_mix_conv"] = std::shared_ptr(new Conv3d(out_channels,
+ out_channels,
+ {video_kernel_size, 1, 1},
+ {1, 1, 1},
+ {kernel_padding, 0, 0}));
+ }
+
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* x) override {
+ // timesteps always None
+ // skip_video always False
+ // x: [N, IC, IH, IW]
+ // result: [N, OC, OH, OW]
+ auto time_mix_conv = std::dynamic_pointer_cast(blocks["time_mix_conv"]);
+
+ x = Conv2d::forward(ctx, x);
+ // timesteps = x.shape[0]
+ // x = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps)
+ // x = conv3d(x)
+ // return rearrange(x, "b c t h w -> (b t) c h w")
+ int64_t T = x->ne[3];
+ int64_t B = x->ne[3] / T;
+ int64_t C = x->ne[2];
+ int64_t H = x->ne[1];
+ int64_t W = x->ne[0];
+
+ x = ggml_reshape_4d(ctx->ggml_ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w)
+ x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w)
+ x = time_mix_conv->forward(ctx, x); // [B, OC, T, OH * OW]
+ x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w)
+ x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w
+ return x; // [B*T, OC, OH, OW]
+ }
+};
+
+class VideoResnetBlock : public ResnetBlock {
+protected:
+ void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+ enum ggml_type wtype = get_type(prefix + "mix_factor", tensor_storage_map, GGML_TYPE_F32);
+ params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
+ }
+
+ float get_alpha() {
+ float alpha = ggml_ext_backend_tensor_get_f32(params["mix_factor"]);
+ return sigmoid(alpha);
+ }
+
+public:
+ VideoResnetBlock(int64_t in_channels,
+ int64_t out_channels,
+ int video_kernel_size = 3)
+ : ResnetBlock(in_channels, out_channels) {
+ // merge_strategy is always learned
+ blocks["time_stack"] = std::shared_ptr(new ResBlock(out_channels, 0, out_channels, {video_kernel_size, 1}, 3, false, true));
+ }
+
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
+ // x: [N, in_channels, h, w] aka [b*t, in_channels, h, w]
+ // return: [N, out_channels, h, w] aka [b*t, out_channels, h, w]
+ // t_emb is always None
+ // skip_video is always False
+ // timesteps is always None
+ auto time_stack = std::dynamic_pointer_cast(blocks["time_stack"]);
+
+ x = ResnetBlock::forward(ctx, x); // [N, out_channels, h, w]
+ // return x;
+
+ int64_t T = x->ne[3];
+ int64_t B = x->ne[3] / T;
+ int64_t C = x->ne[2];
+ int64_t H = x->ne[1];
+ int64_t W = x->ne[0];
+
+ x = ggml_reshape_4d(ctx->ggml_ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w)
+ x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w)
+ auto x_mix = x;
+
+ x = time_stack->forward(ctx, x); // b t c (h w)
+
+ float alpha = get_alpha();
+ x = ggml_add(ctx->ggml_ctx,
+ ggml_ext_scale(ctx->ggml_ctx, x, alpha),
+ ggml_ext_scale(ctx->ggml_ctx, x_mix, 1.0f - alpha));
+
+ x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w)
+ x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w
+
+ return x;
+ }
+};
+
+// ldm.modules.diffusionmodules.model.Encoder
+class Encoder : public GGMLBlock {
+protected:
+ int ch = 128;
+ std::vector ch_mult = {1, 2, 4, 4};
+ int num_res_blocks = 2;
+ int in_channels = 3;
+ int z_channels = 4;
+ bool double_z = true;
+
+public:
+ Encoder(int ch,
+ std::vector ch_mult,
+ int num_res_blocks,
+ int in_channels,
+ int z_channels,
+ bool double_z = true,
+ bool use_linear_projection = false)
+ : ch(ch),
+ ch_mult(ch_mult),
+ num_res_blocks(num_res_blocks),
+ in_channels(in_channels),
+ z_channels(z_channels),
+ double_z(double_z) {
+ blocks["conv_in"] = std::shared_ptr(new Conv2d(in_channels, ch, {3, 3}, {1, 1}, {1, 1}));
+
+ size_t num_resolutions = ch_mult.size();
+
+ int block_in = 1;
+ for (int i = 0; i < num_resolutions; i++) {
+ if (i == 0) {
+ block_in = ch;
+ } else {
+ block_in = ch * ch_mult[i - 1];
+ }
+ int block_out = ch * ch_mult[i];
+ for (int j = 0; j < num_res_blocks; j++) {
+ std::string name = "down." + std::to_string(i) + ".block." + std::to_string(j);
+ blocks[name] = std::shared_ptr(new ResnetBlock(block_in, block_out));
+ block_in = block_out;
+ }
+ if (i != num_resolutions - 1) {
+ std::string name = "down." + std::to_string(i) + ".downsample";
+ blocks[name] = std::shared_ptr(new DownSampleBlock(block_in, block_in, true));
+ }
+ }
+
+ blocks["mid.block_1"] = std::shared_ptr(new ResnetBlock(block_in, block_in));
+ blocks["mid.attn_1"] = std::shared_ptr(new AttnBlock(block_in, use_linear_projection));
+ blocks["mid.block_2"] = std::shared_ptr(new ResnetBlock(block_in, block_in));
+
+ blocks["norm_out"] = std::shared_ptr(new GroupNorm32(block_in));
+ blocks["conv_out"] = std::shared_ptr(new Conv2d(block_in, double_z ? z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1}));
+ }
+
+ virtual ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
+ // x: [N, in_channels, h, w]
+
+ auto conv_in = std::dynamic_pointer_cast(blocks["conv_in"]);
+ auto mid_block_1 = std::dynamic_pointer_cast(blocks["mid.block_1"]);
+ auto mid_attn_1 = std::dynamic_pointer_cast(blocks["mid.attn_1"]);
+ auto mid_block_2 = std::dynamic_pointer_cast(blocks["mid.block_2"]);
+ auto norm_out = std::dynamic_pointer_cast(blocks["norm_out"]);
+ auto conv_out = std::dynamic_pointer_cast(blocks["conv_out"]);
+
+ auto h = conv_in->forward(ctx, x); // [N, ch, h, w]
+
+ // downsampling
+ size_t num_resolutions = ch_mult.size();
+ for (int i = 0; i < num_resolutions; i++) {
+ for (int j = 0; j < num_res_blocks; j++) {
+ std::string name = "down." + std::to_string(i) + ".block." + std::to_string(j);
+ auto down_block = std::dynamic_pointer_cast(blocks[name]);
+
+ h = down_block->forward(ctx, h);
+ }
+ if (i != num_resolutions - 1) {
+ std::string name = "down." + std::to_string(i) + ".downsample";
+ auto down_sample = std::dynamic_pointer_cast(blocks[name]);
+
+ h = down_sample->forward(ctx, h);
+ }
+ }
+
+ // middle
+ h = mid_block_1->forward(ctx, h);
+ h = mid_attn_1->forward(ctx, h);
+ h = mid_block_2->forward(ctx, h); // [N, block_in, h, w]
+
+ // end
+ h = norm_out->forward(ctx, h);
+ h = ggml_silu_inplace(ctx->ggml_ctx, h); // nonlinearity/swish
+ h = conv_out->forward(ctx, h); // [N, z_channels*2, h, w]
+ return h;
+ }
+};
+
+// ldm.modules.diffusionmodules.model.Decoder
+class Decoder : public GGMLBlock {
+protected:
+ int ch = 128;
+ int out_ch = 3;
+ std::vector ch_mult = {1, 2, 4, 4};
+ int num_res_blocks = 2;
+ int z_channels = 4;
+ bool video_decoder = false;
+ int video_kernel_size = 3;
+
+ virtual std::shared_ptr get_conv_out(int64_t in_channels,
+ int64_t out_channels,
+ std::pair kernel_size,
+ std::pair stride = {1, 1},
+ std::pair padding = {0, 0}) {
+ if (video_decoder) {
+ return std::shared_ptr(new AE3DConv(in_channels, out_channels, kernel_size, video_kernel_size, stride, padding));
+ } else {
+ return std::shared_ptr(new Conv2d(in_channels, out_channels, kernel_size, stride, padding));
+ }
+ }
+
+ virtual std::shared_ptr get_resnet_block(int64_t in_channels,
+ int64_t out_channels) {
+ if (video_decoder) {
+ return std::shared_ptr(new VideoResnetBlock(in_channels, out_channels, video_kernel_size));
+ } else {
+ return std::shared_ptr(new ResnetBlock(in_channels, out_channels));
+ }
+ }
+
+public:
+ Decoder(int ch,
+ int out_ch,
+ std::vector ch_mult,
+ int num_res_blocks,
+ int z_channels,
+ bool use_linear_projection = false,
+ bool video_decoder = false,
+ int video_kernel_size = 3)
+ : ch(ch),
+ out_ch(out_ch),
+ ch_mult(ch_mult),
+ num_res_blocks(num_res_blocks),
+ z_channels(z_channels),
+ video_decoder(video_decoder),
+ video_kernel_size(video_kernel_size) {
+ int num_resolutions = static_cast(ch_mult.size());
+ int block_in = ch * ch_mult[num_resolutions - 1];
+
+ blocks["conv_in"] = std::shared_ptr(new Conv2d(z_channels, block_in, {3, 3}, {1, 1}, {1, 1}));
+
+ blocks["mid.block_1"] = get_resnet_block(block_in, block_in);
+ blocks["mid.attn_1"] = std::shared_ptr(new AttnBlock(block_in, use_linear_projection));
+ blocks["mid.block_2"] = get_resnet_block(block_in, block_in);
+
+ for (int i = num_resolutions - 1; i >= 0; i--) {
+ int mult = ch_mult[i];
+ int block_out = ch * mult;
+ for (int j = 0; j < num_res_blocks + 1; j++) {
+ std::string name = "up." + std::to_string(i) + ".block." + std::to_string(j);
+ blocks[name] = get_resnet_block(block_in, block_out);
+
+ block_in = block_out;
+ }
+ if (i != 0) {
+ std::string name = "up." + std::to_string(i) + ".upsample";
+ blocks[name] = std::shared_ptr(new UpSampleBlock(block_in, block_in));
+ }
+ }
+
+ blocks["norm_out"] = std::shared_ptr(new GroupNorm32(block_in));
+ blocks["conv_out"] = get_conv_out(block_in, out_ch, {3, 3}, {1, 1}, {1, 1});
+ }
+
+ virtual ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* z) {
+ // z: [N, z_channels, h, w]
+ // alpha is always 0
+ // merge_strategy is always learned
+ // time_mode is always conv-only, so we need to replace conv_out_op/resnet_op to AE3DConv/VideoResBlock
+ // AttnVideoBlock will not be used
+ auto conv_in = std::dynamic_pointer_cast(blocks["conv_in"]);
+ auto mid_block_1 = std::dynamic_pointer_cast(blocks["mid.block_1"]);
+ auto mid_attn_1 = std::dynamic_pointer_cast(blocks["mid.attn_1"]);
+ auto mid_block_2 = std::dynamic_pointer_cast(blocks["mid.block_2"]);
+ auto norm_out = std::dynamic_pointer_cast(blocks["norm_out"]);
+ auto conv_out = std::dynamic_pointer_cast(blocks["conv_out"]);
+
+ // conv_in
+ auto h = conv_in->forward(ctx, z); // [N, block_in, h, w]
+
+ // middle
+ h = mid_block_1->forward(ctx, h);
+ // return h;
+
+ h = mid_attn_1->forward(ctx, h);
+ h = mid_block_2->forward(ctx, h); // [N, block_in, h, w]
+
+ // upsampling
+ int num_resolutions = static_cast(ch_mult.size());
+ for (int i = num_resolutions - 1; i >= 0; i--) {
+ for (int j = 0; j < num_res_blocks + 1; j++) {
+ std::string name = "up." + std::to_string(i) + ".block." + std::to_string(j);
+ auto up_block = std::dynamic_pointer_cast(blocks[name]);
+
+ h = up_block->forward(ctx, h);
+ }
+ if (i != 0) {
+ std::string name = "up." + std::to_string(i) + ".upsample";
+ auto up_sample = std::dynamic_pointer_cast(blocks[name]);
+
+ h = up_sample->forward(ctx, h);
+ }
+ }
+
+ h = norm_out->forward(ctx, h);
+ h = ggml_silu_inplace(ctx->ggml_ctx, h); // nonlinearity/swish
+ h = conv_out->forward(ctx, h); // [N, out_ch, h*8, w*8]
+ return h;
+ }
+};
+
+// ldm.models.autoencoder.AutoencoderKL
+class AutoEncoderKLModel : public GGMLBlock {
+protected:
+ SDVersion version;
+ bool decode_only = true;
+ bool use_video_decoder = false;
+ bool use_quant = true;
+ int embed_dim = 4;
+ struct {
+ int z_channels = 4;
+ int resolution = 256;
+ int in_channels = 3;
+ int out_ch = 3;
+ int ch = 128;
+ std::vector ch_mult = {1, 2, 4, 4};
+ int num_res_blocks = 2;
+ bool double_z = true;
+ } dd_config;
+
+public:
+ AutoEncoderKLModel(SDVersion version = VERSION_SD1,
+ bool decode_only = true,
+ bool use_linear_projection = false,
+ bool use_video_decoder = false)
+ : version(version), decode_only(decode_only), use_video_decoder(use_video_decoder) {
+ if (sd_version_is_dit(version)) {
+ if (sd_version_is_flux2(version)) {
+ dd_config.z_channels = 32;
+ embed_dim = 32;
+ } else {
+ use_quant = false;
+ dd_config.z_channels = 16;
+ }
+ }
+ if (use_video_decoder) {
+ use_quant = false;
+ }
+ blocks["decoder"] = std::shared_ptr(new Decoder(dd_config.ch,
+ dd_config.out_ch,
+ dd_config.ch_mult,
+ dd_config.num_res_blocks,
+ dd_config.z_channels,
+ use_linear_projection,
+ use_video_decoder));
+ if (use_quant) {
+ blocks["post_quant_conv"] = std::shared_ptr(new Conv2d(dd_config.z_channels,
+ embed_dim,
+ {1, 1}));
+ }
+ if (!decode_only) {
+ blocks["encoder"] = std::shared_ptr(new Encoder(dd_config.ch,
+ dd_config.ch_mult,
+ dd_config.num_res_blocks,
+ dd_config.in_channels,
+ dd_config.z_channels,
+ dd_config.double_z,
+ use_linear_projection));
+ if (use_quant) {
+ int factor = dd_config.double_z ? 2 : 1;
+
+ blocks["quant_conv"] = std::shared_ptr(new Conv2d(embed_dim * factor,
+ dd_config.z_channels * factor,
+ {1, 1}));
+ }
+ }
+ }
+
+ ggml_tensor* decode(GGMLRunnerContext* ctx, ggml_tensor* z) {
+ // z: [N, z_channels, h, w]
+ if (sd_version_is_flux2(version)) {
+ // [N, C*p*p, h, w] -> [N, C, h*p, w*p]
+ int64_t p = 2;
+
+ int64_t N = z->ne[3];
+ int64_t C = z->ne[2] / p / p;
+ int64_t h = z->ne[1];
+ int64_t w = z->ne[0];
+ int64_t H = h * p;
+ int64_t W = w * p;
+
+ z = ggml_reshape_4d(ctx->ggml_ctx, z, w * h, p * p, C, N); // [N, C, p*p, h*w]
+ z = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, z, 1, 0, 2, 3)); // [N, C, h*w, p*p]
+ z = ggml_reshape_4d(ctx->ggml_ctx, z, p, p, w, h * C * N); // [N*C*h, w, p, p]
+ z = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, z, 0, 2, 1, 3)); // [N*C*h, p, w, p]
+ z = ggml_reshape_4d(ctx->ggml_ctx, z, W, H, C, N); // [N, C, h*p, w*p]
+ }
+
+ if (use_quant) {
+ auto post_quant_conv = std::dynamic_pointer_cast(blocks["post_quant_conv"]);
+ z = post_quant_conv->forward(ctx, z); // [N, z_channels, h, w]
+ }
+ auto decoder = std::dynamic_pointer_cast(blocks["decoder"]);
+
+ ggml_set_name(z, "bench-start");
+ auto h = decoder->forward(ctx, z);
+ ggml_set_name(h, "bench-end");
+ return h;
+ }
+
+ ggml_tensor* encode(GGMLRunnerContext* ctx, ggml_tensor* x) {
+ // x: [N, in_channels, h, w]
+ auto encoder = std::dynamic_pointer_cast(blocks["encoder"]);
+
+ auto z = encoder->forward(ctx, x); // [N, 2*z_channels, h/8, w/8]
+ if (use_quant) {
+ auto quant_conv = std::dynamic_pointer_cast(blocks["quant_conv"]);
+ z = quant_conv->forward(ctx, z); // [N, 2*embed_dim, h/8, w/8]
+ }
+ if (sd_version_is_flux2(version)) {
+ z = ggml_ext_chunk(ctx->ggml_ctx, z, 2, 2)[0];
+
+ // [N, C, H, W] -> [N, C*p*p, H/p, W/p]
+ int64_t p = 2;
+ int64_t N = z->ne[3];
+ int64_t C = z->ne[2];
+ int64_t H = z->ne[1];
+ int64_t W = z->ne[0];
+ int64_t h = H / p;
+ int64_t w = W / p;
+
+ z = ggml_reshape_4d(ctx->ggml_ctx, z, p, w, p, h * C * N); // [N*C*h, p, w, p]
+ z = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, z, 0, 2, 1, 3)); // [N*C*h, w, p, p]
+ z = ggml_reshape_4d(ctx->ggml_ctx, z, p * p, w * h, C, N); // [N, C, h*w, p*p]
+ z = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, z, 1, 0, 2, 3)); // [N, C, p*p, h*w]
+ z = ggml_reshape_4d(ctx->ggml_ctx, z, w, h, p * p * C, N); // [N, C*p*p, h*w]
+ }
+ return z;
+ }
+
+ int get_encoder_output_channels() {
+ int factor = dd_config.double_z ? 2 : 1;
+ if (sd_version_is_flux2(version)) {
+ return dd_config.z_channels * 4;
+ }
+ return dd_config.z_channels * factor;
+ }
+};
+
+struct AutoEncoderKL : public VAE {
+ float scale_factor = 1.f;
+ float shift_factor = 0.f;
+ bool decode_only = true;
+ AutoEncoderKLModel ae;
+
+ AutoEncoderKL(ggml_backend_t backend,
+ bool offload_params_to_cpu,
+ const String2TensorStorage& tensor_storage_map,
+ const std::string prefix,
+ bool decode_only = false,
+ bool use_video_decoder = false,
+ SDVersion version = VERSION_SD1)
+ : decode_only(decode_only), VAE(version, backend, offload_params_to_cpu) {
+ if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) {
+ scale_factor = 0.18215f;
+ shift_factor = 0.f;
+ } else if (sd_version_is_sdxl(version)) {
+ scale_factor = 0.13025f;
+ shift_factor = 0.f;
+ } else if (sd_version_is_sd3(version)) {
+ scale_factor = 1.5305f;
+ shift_factor = 0.0609f;
+ } else if (sd_version_is_flux(version) || sd_version_is_z_image(version)) {
+ scale_factor = 0.3611f;
+ shift_factor = 0.1159f;
+ } else if (sd_version_is_flux2(version)) {
+ scale_factor = 1.0f;
+ shift_factor = 0.f;
+ }
+ bool use_linear_projection = false;
+ for (const auto& [name, tensor_storage] : tensor_storage_map) {
+ if (!starts_with(name, prefix)) {
+ continue;
+ }
+ if (ends_with(name, "attn_1.proj_out.weight")) {
+ if (tensor_storage.n_dims == 2) {
+ use_linear_projection = true;
+ }
+ break;
+ }
+ }
+ ae = AutoEncoderKLModel(version, decode_only, use_linear_projection, use_video_decoder);
+ ae.init(params_ctx, tensor_storage_map, prefix);
+ }
+
+ void set_conv2d_scale(float scale) override {
+ std::vector blocks;
+ ae.get_all_blocks(blocks);
+ for (auto block : blocks) {
+ if (block->get_desc() == "Conv2d") {
+ auto conv_block = (Conv2d*)block;
+ conv_block->set_scale(scale);
+ }
+ }
+ }
+
+ std::string get_desc() override {
+ return "vae";
+ }
+
+ void get_param_tensors(std::map& tensors, const std::string prefix) override {
+ ae.get_param_tensors(tensors, prefix);
+ }
+
+ ggml_cgraph* build_graph(const sd::Tensor& z_tensor, bool decode_graph) {
+ ggml_cgraph* gf = ggml_new_graph(compute_ctx);
+ ggml_tensor* z = make_input(z_tensor);
+
+ auto runner_ctx = get_context();
+
+ ggml_tensor* out = decode_graph ? ae.decode(&runner_ctx, z) : ae.encode(&runner_ctx, z);
+
+ ggml_build_forward_expand(gf, out);
+
+ return gf;
+ }
+
+ sd::Tensor _compute(const int n_threads,
+ const sd::Tensor& z,
+ bool decode_graph) override {
+ GGML_ASSERT(!decode_only || decode_graph);
+ auto get_graph = [&]() -> ggml_cgraph* {
+ return build_graph(z, decode_graph);
+ };
+ return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), z.dim());
+ }
+
+ sd::Tensor gaussian_latent_sample(const sd::Tensor& moments, std::shared_ptr rng) {
+ // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample
+ auto chunks = sd::ops::chunk(moments, 2, 2);
+ const auto& mean = chunks[0];
+ const auto& logvar = chunks[1];
+ sd::Tensor stddev = sd::ops::exp(0.5f * sd::ops::clamp(logvar, -30.0f, 20.0f));
+ sd::Tensor noise = sd::Tensor::randn_like(mean, rng);
+ sd::Tensor latents = mean + stddev * noise;
+ return latents;
+ }
+
+ sd::Tensor vae_output_to_latents(const sd::Tensor& vae_output, std::shared_ptr rng) override {
+ if (sd_version_is_flux2(version)) {
+ return vae_output;
+ } else if (version == VERSION_SD1_PIX2PIX) {
+ return sd::ops::chunk(vae_output, 2, 2)[0];
+ } else {
+ return gaussian_latent_sample(vae_output, rng);
+ }
+ }
+
+ std::pair, sd::Tensor> get_latents_mean_std(const sd::Tensor& latents, int channel_dim) {
+ GGML_ASSERT(channel_dim >= 0 && static_cast(channel_dim) < static_cast(latents.dim()));
+ if (sd_version_is_flux2(version)) {
+ GGML_ASSERT(latents.shape()[channel_dim] == 128);
+ std::vector stats_shape(static_cast(latents.dim()), 1);
+ stats_shape[static_cast(channel_dim)] = latents.shape()[channel_dim];
+
+ auto mean_tensor = sd::Tensor::from_vector({-0.0676f, -0.0715f, -0.0753f, -0.0745f, 0.0223f, 0.0180f, 0.0142f, 0.0184f,
+ -0.0001f, -0.0063f, -0.0002f, -0.0031f, -0.0272f, -0.0281f, -0.0276f, -0.0290f,
+ -0.0769f, -0.0672f, -0.0902f, -0.0892f, 0.0168f, 0.0152f, 0.0079f, 0.0086f,
+ 0.0083f, 0.0015f, 0.0003f, -0.0043f, -0.0439f, -0.0419f, -0.0438f, -0.0431f,
+ -0.0102f, -0.0132f, -0.0066f, -0.0048f, -0.0311f, -0.0306f, -0.0279f, -0.0180f,
+ 0.0030f, 0.0015f, 0.0126f, 0.0145f, 0.0347f, 0.0338f, 0.0337f, 0.0283f,
+ 0.0020f, 0.0047f, 0.0047f, 0.0050f, 0.0123f, 0.0081f, 0.0081f, 0.0146f,
+ 0.0681f, 0.0679f, 0.0767f, 0.0732f, -0.0462f, -0.0474f, -0.0392f, -0.0511f,
+ -0.0528f, -0.0477f, -0.0470f, -0.0517f, -0.0317f, -0.0316f, -0.0345f, -0.0283f,
+ 0.0510f, 0.0445f, 0.0578f, 0.0458f, -0.0412f, -0.0458f, -0.0487f, -0.0467f,
+ -0.0088f, -0.0106f, -0.0088f, -0.0046f, -0.0376f, -0.0432f, -0.0436f, -0.0499f,
+ 0.0118f, 0.0166f, 0.0203f, 0.0279f, 0.0113f, 0.0129f, 0.0016f, 0.0072f,
+ -0.0118f, -0.0018f, -0.0141f, -0.0054f, -0.0091f, -0.0138f, -0.0145f, -0.0187f,
+ 0.0323f, 0.0305f, 0.0259f, 0.0300f, 0.0540f, 0.0614f, 0.0495f, 0.0590f,
+ -0.0511f, -0.0603f, -0.0478f, -0.0524f, -0.0227f, -0.0274f, -0.0154f, -0.0255f,
+ -0.0572f, -0.0565f, -0.0518f, -0.0496f, 0.0116f, 0.0054f, 0.0163f, 0.0104f});
+ mean_tensor.reshape_(stats_shape);
+ auto std_tensor = sd::Tensor::from_vector({1.8029f, 1.7786f, 1.7868f, 1.7837f, 1.7717f, 1.7590f, 1.7610f, 1.7479f,
+ 1.7336f, 1.7373f, 1.7340f, 1.7343f, 1.8626f, 1.8527f, 1.8629f, 1.8589f,
+ 1.7593f, 1.7526f, 1.7556f, 1.7583f, 1.7363f, 1.7400f, 1.7355f, 1.7394f,
+ 1.7342f, 1.7246f, 1.7392f, 1.7304f, 1.7551f, 1.7513f, 1.7559f, 1.7488f,
+ 1.8449f, 1.8454f, 1.8550f, 1.8535f, 1.8240f, 1.7813f, 1.7854f, 1.7945f,
+ 1.8047f, 1.7876f, 1.7695f, 1.7676f, 1.7782f, 1.7667f, 1.7925f, 1.7848f,
+ 1.7579f, 1.7407f, 1.7483f, 1.7368f, 1.7961f, 1.7998f, 1.7920f, 1.7925f,
+ 1.7780f, 1.7747f, 1.7727f, 1.7749f, 1.7526f, 1.7447f, 1.7657f, 1.7495f,
+ 1.7775f, 1.7720f, 1.7813f, 1.7813f, 1.8162f, 1.8013f, 1.8023f, 1.8033f,
+ 1.7527f, 1.7331f, 1.7563f, 1.7482f, 1.7610f, 1.7507f, 1.7681f, 1.7613f,
+ 1.7665f, 1.7545f, 1.7828f, 1.7726f, 1.7896f, 1.7999f, 1.7864f, 1.7760f,
+ 1.7613f, 1.7625f, 1.7560f, 1.7577f, 1.7783f, 1.7671f, 1.7810f, 1.7799f,
+ 1.7201f, 1.7068f, 1.7265f, 1.7091f, 1.7793f, 1.7578f, 1.7502f, 1.7455f,
+ 1.7587f, 1.7500f, 1.7525f, 1.7362f, 1.7616f, 1.7572f, 1.7444f, 1.7430f,
+ 1.7509f, 1.7610f, 1.7634f, 1.7612f, 1.7254f, 1.7135f, 1.7321f, 1.7226f,
+ 1.7664f, 1.7624f, 1.7718f, 1.7664f, 1.7457f, 1.7441f, 1.7569f, 1.7530f});
+ std_tensor.reshape_(stats_shape);
+ return {std::move(mean_tensor), std::move(std_tensor)};
+ } else {
+ GGML_ABORT("unknown version %d", version);
+ }
+ }
+
+ sd::Tensor diffusion_to_vae_latents(const sd::Tensor& latents) override {
+ if (sd_version_is_flux2(version)) {
+ int channel_dim = 2;
+ auto [mean_tensor, std_tensor] = get_latents_mean_std(latents, channel_dim);
+ return (latents * std_tensor) / scale_factor + mean_tensor;
+ }
+ return (latents / scale_factor) + shift_factor;
+ }
+
+ sd::Tensor vae_to_diffusion_latents(const sd::Tensor& latents) override {
+ if (sd_version_is_flux2(version)) {
+ int channel_dim = 2;
+ auto [mean_tensor, std_tensor] = get_latents_mean_std(latents, channel_dim);
+ return ((latents - mean_tensor) * scale_factor) / std_tensor;
+ }
+ return (latents - shift_factor) * scale_factor;
+ }
+
+ int get_encoder_output_channels(int input_channels) {
+ return ae.get_encoder_output_channels();
+ }
+
+ void test() {
+ ggml_init_params params;
+ params.mem_size = static_cast(10 * 1024 * 1024); // 10 MB
+ params.mem_buffer = nullptr;
+ params.no_alloc = false;
+
+ ggml_context* ctx = ggml_init(params);
+ GGML_ASSERT(ctx != nullptr);
+
+ {
+ // CPU, x{1, 3, 64, 64}: Pass
+ // CUDA, x{1, 3, 64, 64}: Pass, but sill get wrong result for some image, may be due to interlnal nan
+ // CPU, x{2, 3, 64, 64}: Wrong result
+ // CUDA, x{2, 3, 64, 64}: Wrong result, and different from CPU result
+ sd::Tensor x({64, 64, 3, 2});
+ x.fill_(0.5f);
+ print_sd_tensor(x);
+ sd::Tensor out;
+
+ int64_t t0 = ggml_time_ms();
+ auto out_opt = _compute(8, x, false);
+ int64_t t1 = ggml_time_ms();
+
+ GGML_ASSERT(!out_opt.empty());
+ out = std::move(out_opt);
+ print_sd_tensor(out);
+ LOG_DEBUG("encode test done in %lldms", t1 - t0);
+ }
+
+ if (false) {
+ // CPU, z{1, 4, 8, 8}: Pass
+ // CUDA, z{1, 4, 8, 8}: Pass
+ // CPU, z{3, 4, 8, 8}: Wrong result
+ // CUDA, z{3, 4, 8, 8}: Wrong result, and different from CPU result
+ sd::Tensor z({8, 8, 4, 1});
+ z.fill_(0.5f);
+ print_sd_tensor(z);
+ sd::Tensor out;
+
+ int64_t t0 = ggml_time_ms();
+ auto out_opt = _compute(8, z, true);
+ int64_t t1 = ggml_time_ms();
+
+ GGML_ASSERT(!out_opt.empty());
+ out = std::move(out_opt);
+ print_sd_tensor(out);
+ LOG_DEBUG("decode test done in %lldms", t1 - t0);
+ }
+ };
+};
+
+#endif // __AUTO_ENCODER_KL_HPP__
diff --git a/src/cache_dit.hpp b/src/cache_dit.hpp
index 6fe104da..dad67d45 100644
--- a/src/cache_dit.hpp
+++ b/src/cache_dit.hpp
@@ -8,7 +8,9 @@
#include
#include
+#include "condition_cache_utils.hpp"
#include "ggml_extend.hpp"
+#include "tensor.hpp"
struct DBCacheConfig {
bool enabled = false;
@@ -603,87 +605,6 @@ inline std::vector generate_scm_mask(
return mask;
}
-inline std::vector get_scm_preset(const std::string& preset, int total_steps) {
- struct Preset {
- std::vector<int> compute_bins;
- std::vector<int> cache_bins;
- };
-
- Preset slow = {{8, 3, 3, 2, 1, 1}, {1, 2, 2, 2, 3}};
- Preset medium = {{6, 2, 2, 2, 2, 1}, {1, 3, 3, 3, 3}};
- Preset fast = {{6, 1, 1, 1, 1, 1}, {1, 3, 4, 5, 4}};
- Preset ultra = {{4, 1, 1, 1, 1}, {2, 5, 6, 7}};
-
- Preset* p = nullptr;
- if (preset == "slow" || preset == "s" || preset == "S")
- p = &slow;
- else if (preset == "medium" || preset == "m" || preset == "M")
- p = &medium;
- else if (preset == "fast" || preset == "f" || preset == "F")
- p = &fast;
- else if (preset == "ultra" || preset == "u" || preset == "U")
- p = &ultra;
- else
- return {};
-
- if (total_steps != 28 && total_steps > 0) {
- float scale = static_cast<float>(total_steps) / 28.0f;
- std::vector<int> scaled_compute, scaled_cache;
-
- for (int v : p->compute_bins) {
- scaled_compute.push_back(std::max(1, static_cast<int>(v * scale + 0.5f)));
- }
- for (int v : p->cache_bins) {
- scaled_cache.push_back(std::max(1, static_cast<int>(v * scale + 0.5f)));
- }
-
- return generate_scm_mask(scaled_compute, scaled_cache, total_steps);
- }
-
- return generate_scm_mask(p->compute_bins, p->cache_bins, total_steps);
-}
-
-inline float get_preset_threshold(const std::string& preset) {
- if (preset == "slow" || preset == "s" || preset == "S")
- return 0.20f;
- if (preset == "medium" || preset == "m" || preset == "M")
- return 0.25f;
- if (preset == "fast" || preset == "f" || preset == "F")
- return 0.30f;
- if (preset == "ultra" || preset == "u" || preset == "U")
- return 0.34f;
- return 0.08f;
-}
-
-inline int get_preset_warmup(const std::string& preset) {
- if (preset == "slow" || preset == "s" || preset == "S")
- return 8;
- if (preset == "medium" || preset == "m" || preset == "M")
- return 6;
- if (preset == "fast" || preset == "f" || preset == "F")
- return 6;
- if (preset == "ultra" || preset == "u" || preset == "U")
- return 4;
- return 8;
-}
-
-inline int get_preset_Fn(const std::string& preset) {
- if (preset == "slow" || preset == "s" || preset == "S")
- return 8;
- if (preset == "medium" || preset == "m" || preset == "M")
- return 8;
- if (preset == "fast" || preset == "f" || preset == "F")
- return 6;
- if (preset == "ultra" || preset == "u" || preset == "U")
- return 4;
- return 8;
-}
-
-inline int get_preset_Bn(const std::string& preset) {
- (void)preset;
- return 0;
-}
-
inline void parse_dbcache_options(const std::string& opts, DBCacheConfig& cfg) {
if (opts.empty())
return;
@@ -852,35 +773,37 @@ struct CacheDitConditionState {
return it != cache_diffs.end() && !it->second.diff.empty();
}
- void update_cache(const void* cond, const float* input, const float* output, size_t size) {
+ void update_cache(const void* cond, const sd::Tensor& input, const sd::Tensor& output) {
CacheEntry& entry = cache_diffs[cond];
- entry.diff.resize(size);
- for (size_t i = 0; i < size; i++) {
- entry.diff[i] = output[i] - input[i];
+ if (!sd::store_condition_cache_diff(&entry.diff, input, output)) {
+ entry.prev_input.clear();
+ entry.prev_output.clear();
+ entry.has_prev = false;
+ return;
}
+ size_t size = static_cast<size_t>(output.numel());
+ const float* input_data = input.data();
+ const float* output_data = output.data();
entry.prev_input.resize(size);
entry.prev_output.resize(size);
for (size_t i = 0; i < size; i++) {
- entry.prev_input[i] = input[i];
- entry.prev_output[i] = output[i];
+ entry.prev_input[i] = input_data[i];
+ entry.prev_output[i] = output_data[i];
}
entry.has_prev = true;
}
- void apply_cache(const void* cond, const float* input, float* output, size_t size) {
+ void apply_cache(const void* cond,
+ const sd::Tensor& input,
+ sd::Tensor* output) {
auto it = cache_diffs.find(cond);
if (it == cache_diffs.end() || it->second.diff.empty())
return;
- if (it->second.diff.size() != size)
- return;
-
- for (size_t i = 0; i < size; i++) {
- output[i] = input[i] + it->second.diff[i];
- }
+ sd::apply_condition_cache_diff(it->second.diff, input, output);
}
- bool before_condition(const void* cond, struct ggml_tensor* input, struct ggml_tensor* output, float sigma, int step_index) {
+ bool before_condition(const void* cond, const sd::Tensor& input, sd::Tensor* output, float sigma, int step_index) {
if (!enabled() || step_index < 0)
return false;
@@ -900,8 +823,7 @@ struct CacheDitConditionState {
if (skip_current_step) {
if (has_cache(cond)) {
- apply_cache(cond, (float*)input->data, (float*)output->data,
- static_cast<size_t>(ggml_nelements(output)));
+ apply_cache(cond, input, output);
return true;
}
return false;
@@ -914,13 +836,13 @@ struct CacheDitConditionState {
if (it == cache_diffs.end() || !it->second.has_prev)
return false;
- size_t ne = static_cast<size_t>(ggml_nelements(input));
+ size_t ne = static_cast<size_t>(input.numel());
if (it->second.prev_input.size() != ne)
return false;
- float* input_data = (float*)input->data;
- float diff = CacheDitState::calculate_residual_diff(
- it->second.prev_input.data(), input_data, ne);
+ const float* input_data = input.data();
+ float diff = CacheDitState::calculate_residual_diff(
+ it->second.prev_input.data(), input_data, ne);
float effective_threshold = config.residual_diff_threshold;
if (config.Fn_compute_blocks > 0) {
@@ -940,7 +862,7 @@ struct CacheDitConditionState {
cached_steps.push_back(current_step_index);
continuous_cached_steps++;
accumulated_residual_diff += diff;
- apply_cache(cond, input_data, (float*)output->data, ne);
+ apply_cache(cond, input, output);
return true;
}
@@ -948,15 +870,14 @@ struct CacheDitConditionState {
return false;
}
- void after_condition(const void* cond, struct ggml_tensor* input, struct ggml_tensor* output) {
+ void after_condition(const void* cond, const sd::Tensor& input, const sd::Tensor& output) {
if (!step_is_active())
return;
- size_t ne = static_cast<size_t>(ggml_nelements(output));
- update_cache(cond, (float*)input->data, (float*)output->data, ne);
+ update_cache(cond, input, output);
if (cond == anchor_condition && taylor_config.enabled) {
- taylor_state.update_derivatives((float*)output->data, ne, current_step_index);
+ taylor_state.update_derivatives(output.data(), static_cast<size_t>(output.numel()), current_step_index);
}
}
diff --git a/src/clip.hpp b/src/clip.hpp
index adecd4d2..8f2ac064 100644
--- a/src/clip.hpp
+++ b/src/clip.hpp
@@ -473,7 +473,7 @@ public:
}
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
// x: [N, n_token, d_model]
auto fc1 = std::dynamic_pointer_cast(blocks["fc1"]);
auto fc2 = std::dynamic_pointer_cast(blocks["fc2"]);
@@ -511,7 +511,7 @@ public:
blocks["mlp"] = std::shared_ptr(new CLIPMLP(d_model, intermediate_size));
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, struct ggml_tensor* mask = nullptr) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* mask = nullptr) {
// x: [N, n_token, d_model]
auto self_attn = std::dynamic_pointer_cast(blocks["self_attn"]);
auto layer_norm1 = std::dynamic_pointer_cast(blocks["layer_norm1"]);
@@ -541,10 +541,10 @@ public:
}
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* x,
- struct ggml_tensor* mask = nullptr,
- int clip_skip = -1) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* x,
+ ggml_tensor* mask = nullptr,
+ int clip_skip = -1) {
// x: [N, n_token, d_model]
int layer_idx = n_layer - 1;
// LOG_DEBUG("clip_skip %d", clip_skip);
@@ -573,7 +573,7 @@ protected:
int64_t num_positions;
bool force_clip_f32;
- void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+ void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
enum ggml_type token_wtype = GGML_TYPE_F32;
if (!force_clip_f32) {
token_wtype = get_type(prefix + "token_embedding.weight", tensor_storage_map, GGML_TYPE_F32);
@@ -597,13 +597,13 @@ public:
force_clip_f32(force_clip_f32) {
}
- struct ggml_tensor* get_token_embed_weight() {
+ ggml_tensor* get_token_embed_weight() {
return params["token_embedding.weight"];
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* input_ids,
- struct ggml_tensor* custom_embed_weight) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* input_ids,
+ ggml_tensor* custom_embed_weight) {
// input_ids: [N, n_token]
auto token_embed_weight = params["token_embedding.weight"];
auto position_embed_weight = params["position_embedding.weight"];
@@ -630,7 +630,7 @@ protected:
int num_patches;
int64_t num_positions;
- void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+ void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
enum ggml_type patch_wtype = GGML_TYPE_F16;
enum ggml_type class_wtype = GGML_TYPE_F32;
enum ggml_type position_wtype = GGML_TYPE_F32;
@@ -653,7 +653,7 @@ public:
num_positions = num_patches + 1;
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* pixel_values) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* pixel_values) {
// pixel_values: [N, num_channels, image_size, image_size]
// return: [N, num_positions, embed_dim]
GGML_ASSERT(pixel_values->ne[0] == image_size && pixel_values->ne[1] == image_size && pixel_values->ne[2] == num_channels);
@@ -663,20 +663,20 @@ public:
auto position_embed_weight = params["position_embedding.weight"];
// concat(patch_embedding, class_embedding) + position_embedding
- struct ggml_tensor* patch_embedding;
+ ggml_tensor* patch_embedding;
int64_t N = pixel_values->ne[3];
patch_embedding = ggml_ext_conv_2d(ctx->ggml_ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size); // [N, embed_dim, image_size // pacht_size, image_size // pacht_size]
patch_embedding = ggml_reshape_3d(ctx->ggml_ctx, patch_embedding, num_patches, embed_dim, N); // [N, embed_dim, num_patches]
patch_embedding = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, patch_embedding, 1, 0, 2, 3)); // [N, num_patches, embed_dim]
patch_embedding = ggml_reshape_4d(ctx->ggml_ctx, patch_embedding, 1, embed_dim, num_patches, N); // [N, num_patches, embed_dim, 1]
- struct ggml_tensor* class_embedding = ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, embed_dim, N);
- class_embedding = ggml_repeat(ctx->ggml_ctx, class_embed_weight, class_embedding); // [N, embed_dim]
- class_embedding = ggml_reshape_4d(ctx->ggml_ctx, class_embedding, 1, embed_dim, 1, N); // [N, 1, embed_dim, 1]
+ ggml_tensor* class_embedding = ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, embed_dim, N);
+ class_embedding = ggml_repeat(ctx->ggml_ctx, class_embed_weight, class_embedding); // [N, embed_dim]
+ class_embedding = ggml_reshape_4d(ctx->ggml_ctx, class_embedding, 1, embed_dim, 1, N); // [N, 1, embed_dim, 1]
- struct ggml_tensor* x = ggml_concat(ctx->ggml_ctx, class_embedding, patch_embedding, 2); // [N, num_positions, embed_dim, 1]
- x = ggml_reshape_3d(ctx->ggml_ctx, x, embed_dim, num_positions, N); // [N, num_positions, embed_dim]
- x = ggml_add(ctx->ggml_ctx, x, position_embed_weight);
+ ggml_tensor* x = ggml_concat(ctx->ggml_ctx, class_embedding, patch_embedding, 2); // [N, num_positions, embed_dim, 1]
+ x = ggml_reshape_3d(ctx->ggml_ctx, x, embed_dim, num_positions, N); // [N, num_positions, embed_dim]
+ x = ggml_add(ctx->ggml_ctx, x, position_embed_weight);
return x; // [N, num_positions, embed_dim]
}
};
@@ -693,7 +693,7 @@ enum CLIPVersion {
class CLIPTextModel : public GGMLBlock {
protected:
- void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+ void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
if (version == OPEN_CLIP_VIT_BIGG_14) {
enum ggml_type wtype = GGML_TYPE_F32;
params["text_projection"] = ggml_new_tensor_2d(ctx, wtype, projection_dim, hidden_size);
@@ -734,18 +734,18 @@ public:
blocks["final_layer_norm"] = std::shared_ptr(new LayerNorm(hidden_size));
}
- struct ggml_tensor* get_token_embed_weight() {
+ ggml_tensor* get_token_embed_weight() {
auto embeddings = std::dynamic_pointer_cast(blocks["embeddings"]);
return embeddings->get_token_embed_weight();
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* input_ids,
- struct ggml_tensor* tkn_embeddings,
- struct ggml_tensor* mask = nullptr,
- size_t max_token_idx = 0,
- bool return_pooled = false,
- int clip_skip = -1) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* input_ids,
+ ggml_tensor* tkn_embeddings,
+ ggml_tensor* mask = nullptr,
+ size_t max_token_idx = 0,
+ bool return_pooled = false,
+ int clip_skip = -1) {
// input_ids: [N, n_token]
auto embeddings = std::dynamic_pointer_cast(blocks["embeddings"]);
auto encoder = std::dynamic_pointer_cast(blocks["encoder"]);
@@ -804,10 +804,10 @@ public:
blocks["post_layernorm"] = std::shared_ptr(new LayerNorm(hidden_size));
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* pixel_values,
- bool return_pooled = true,
- int clip_skip = -1) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* pixel_values,
+ bool return_pooled = true,
+ int clip_skip = -1) {
// pixel_values: [N, num_channels, image_size, image_size]
auto embeddings = std::dynamic_pointer_cast(blocks["embeddings"]);
auto pre_layernorm = std::dynamic_pointer_cast(blocks["pre_layernorm"]);
@@ -839,7 +839,7 @@ protected:
int64_t out_features;
bool transpose_weight;
- void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+ void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32);
if (transpose_weight) {
params["weight"] = ggml_new_tensor_2d(ctx, wtype, out_features, in_features);
@@ -856,8 +856,8 @@ public:
out_features(out_features),
transpose_weight(transpose_weight) {}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
- struct ggml_tensor* w = params["weight"];
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
+ ggml_tensor* w = params["weight"];
if (transpose_weight) {
w = ggml_cont(ctx->ggml_ctx, ggml_transpose(ctx->ggml_ctx, w));
}
@@ -886,10 +886,10 @@ public:
blocks["visual_projection"] = std::shared_ptr(new CLIPProjection(hidden_size, projection_dim, transpose_proj_w));
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* pixel_values,
- bool return_pooled = true,
- int clip_skip = -1) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* pixel_values,
+ bool return_pooled = true,
+ int clip_skip = -1) {
// pixel_values: [N, num_channels, image_size, image_size]
// return: [N, projection_dim] if return_pooled else [N, n_token, hidden_size]
auto vision_model = std::dynamic_pointer_cast(blocks["vision_model"]);
@@ -936,17 +936,17 @@ struct CLIPTextModelRunner : public GGMLRunner {
return "clip";
}
- void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
+ void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
model.get_param_tensors(tensors, prefix);
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* input_ids,
- struct ggml_tensor* embeddings,
- struct ggml_tensor* mask,
- size_t max_token_idx = 0,
- bool return_pooled = false,
- int clip_skip = -1) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* input_ids,
+ ggml_tensor* embeddings,
+ ggml_tensor* mask,
+ size_t max_token_idx = 0,
+ bool return_pooled = false,
+ int clip_skip = -1) {
size_t N = input_ids->ne[1];
size_t n_token = input_ids->ne[0];
if (input_ids->ne[0] > model.n_token) {
@@ -957,17 +957,16 @@ struct CLIPTextModelRunner : public GGMLRunner {
return model.forward(ctx, input_ids, embeddings, mask, max_token_idx, return_pooled, clip_skip);
}
- struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
- int num_custom_embeddings = 0,
- void* custom_embeddings_data = nullptr,
- size_t max_token_idx = 0,
- bool return_pooled = false,
- int clip_skip = -1) {
- struct ggml_cgraph* gf = new_graph_custom(2048);
+ ggml_cgraph* build_graph(const sd::Tensor& input_ids_tensor,
+ int num_custom_embeddings = 0,
+ void* custom_embeddings_data = nullptr,
+ size_t max_token_idx = 0,
+ bool return_pooled = false,
+ int clip_skip = -1) {
+ ggml_cgraph* gf = new_graph_custom(2048);
+ ggml_tensor* input_ids = make_input(input_ids_tensor);
- input_ids = to_backend(input_ids);
-
- struct ggml_tensor* embeddings = nullptr;
+ ggml_tensor* embeddings = nullptr;
if (num_custom_embeddings > 0 && custom_embeddings_data != nullptr) {
auto token_embed_weight = model.get_token_embed_weight();
@@ -997,26 +996,28 @@ struct CLIPTextModelRunner : public GGMLRunner {
auto runner_ctx = get_context();
- struct ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, embeddings, attention_mask, max_token_idx, return_pooled, clip_skip);
+ ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, embeddings, attention_mask, max_token_idx, return_pooled, clip_skip);
ggml_build_forward_expand(gf, hidden_states);
return gf;
}
- bool compute(const int n_threads,
- struct ggml_tensor* input_ids,
- int num_custom_embeddings,
- void* custom_embeddings_data,
- size_t max_token_idx,
- bool return_pooled,
- int clip_skip,
- ggml_tensor** output,
- ggml_context* output_ctx = nullptr) {
- auto get_graph = [&]() -> struct ggml_cgraph* {
+ sd::Tensor compute(const int n_threads,
+ const sd::Tensor& input_ids,
+ int num_custom_embeddings,
+ void* custom_embeddings_data,
+ size_t max_token_idx,
+ bool return_pooled,
+ int clip_skip) {
+ auto get_graph = [&]() -> ggml_cgraph* {
return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip);
};
- return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
+ auto result = GGMLRunner::compute(get_graph, n_threads, true);
+ if (return_pooled) {
+ return take_or_empty(std::move(result));
+ }
+ return restore_trailing_singleton_dims(std::move(result), 3);
}
};
diff --git a/src/common_block.hpp b/src/common_block.hpp
index 435afa4f..2cef389a 100644
--- a/src/common_block.hpp
+++ b/src/common_block.hpp
@@ -23,7 +23,7 @@ public:
}
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
// x: [N, channels, h, w]
if (vae_downsample) {
auto conv = std::dynamic_pointer_cast(blocks["conv"]);
@@ -52,7 +52,7 @@ public:
blocks["conv"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
// x: [N, channels, h, w]
auto conv = std::dynamic_pointer_cast(blocks["conv"]);
@@ -121,7 +121,7 @@ public:
}
}
- virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = nullptr) {
+ virtual ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* emb = nullptr) {
// For dims==3, we reduce dimension from 5d to 4d by merging h and w, in order not to change ggml
// [N, c, t, h, w] => [N, c, t, h * w]
// x: [N, channels, h, w] if dims == 2 else [N, channels, t, h, w]
@@ -188,7 +188,7 @@ public:
blocks["proj"] = std::shared_ptr(new Linear(dim_in, dim_out * 2));
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
// x: [ne3, ne2, ne1, dim_in]
// return: [ne3, ne2, ne1, dim_out]
auto proj = std::dynamic_pointer_cast(blocks["proj"]);
@@ -214,7 +214,7 @@ public:
blocks["proj"] = std::shared_ptr(new Linear(dim_in, dim_out, bias));
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
// x: [ne3, ne2, ne1, dim_in]
// return: [ne3, ne2, ne1, dim_out]
auto proj = std::dynamic_pointer_cast(blocks["proj"]);
@@ -258,7 +258,7 @@ public:
blocks["net.2"] = std::shared_ptr(new Linear(inner_dim, dim_out, true, false, force_prec_f32, scale));
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
// x: [ne3, ne2, ne1, dim]
// return: [ne3, ne2, ne1, dim_out]
@@ -297,9 +297,9 @@ public:
// to_out_1 is nn.Dropout(), skip for inference
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* x,
- struct ggml_tensor* context) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* x,
+ ggml_tensor* context) {
// x: [N, n_token, query_dim]
// context: [N, n_context, context_dim]
// return: [N, n_token, query_dim]
@@ -355,9 +355,9 @@ public:
}
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* x,
- struct ggml_tensor* context) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* x,
+ ggml_tensor* context) {
// x: [N, n_token, query_dim]
// context: [N, n_context, context_dim]
// return: [N, n_token, query_dim]
@@ -406,7 +406,7 @@ protected:
int64_t context_dim = 768; // hidden_size, 1024 for VERSION_SD2
bool use_linear = false;
- void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {
+ void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {
auto iter = tensor_storage_map.find(prefix + "proj_out.weight");
if (iter != tensor_storage_map.end()) {
int64_t inner_dim = n_head * d_head;
@@ -456,9 +456,9 @@ public:
}
}
- virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* x,
- struct ggml_tensor* context) {
+ virtual ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* x,
+ ggml_tensor* context) {
// x: [N, in_channels, h, w]
// context: [N, max_position(aka n_token), hidden_size(aka context_dim)]
auto norm = std::dynamic_pointer_cast(blocks["norm"]);
@@ -510,7 +510,7 @@ public:
class AlphaBlender : public GGMLBlock {
protected:
- void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
+ void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
// Get the type of the "mix_factor" tensor from the input tensors map with the specified prefix
enum ggml_type wtype = GGML_TYPE_F32;
params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
@@ -530,9 +530,9 @@ public:
// since mix_factor.shape is [1,], we don't need rearrange using rearrange_pattern
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* x_spatial,
- struct ggml_tensor* x_temporal) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* x_spatial,
+ ggml_tensor* x_temporal) {
// image_only_indicator is always tensor([0.])
float alpha = get_alpha();
auto x = ggml_add(ctx->ggml_ctx,
@@ -555,10 +555,10 @@ public:
blocks["time_mixer"] = std::shared_ptr(new AlphaBlender());
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx,
- struct ggml_tensor* x,
- struct ggml_tensor* emb,
- int num_video_frames) {
+ ggml_tensor* forward(GGMLRunnerContext* ctx,
+ ggml_tensor* x,
+ ggml_tensor* emb,
+ int num_video_frames) {
// x: [N, channels, h, w] aka [b*t, channels, h, w]
// emb: [N, emb_channels] aka [b*t, emb_channels]
// image_only_indicator is always tensor([0.])
diff --git a/src/common_dit.hpp b/src/common_dit.hpp
index 0e6f0f08..30141d42 100644
--- a/src/common_dit.hpp
+++ b/src/common_dit.hpp
@@ -4,11 +4,11 @@
#include "ggml_extend.hpp"
namespace DiT {
- ggml_tensor* patchify(ggml_context* ctx,
- ggml_tensor* x,
- int pw,
- int ph,
- bool patch_last = true) {
+ inline ggml_tensor* patchify(ggml_context* ctx,
+ ggml_tensor* x,
+ int pw,
+ int ph,
+ bool patch_last = true) {
// x: [N, C, H, W]
// return: [N, h*w, C*ph*pw] if patch_last else [N, h*w, ph*pw*C]
int64_t N = x->ne[3];
@@ -33,13 +33,13 @@ namespace DiT {
return x;
}
- ggml_tensor* unpatchify(ggml_context* ctx,
- ggml_tensor* x,
- int64_t h,
- int64_t w,
- int ph,
- int pw,
- bool patch_last = true) {
+ inline ggml_tensor* unpatchify(ggml_context* ctx,
+ ggml_tensor* x,
+ int64_t h,
+ int64_t w,
+ int ph,
+ int pw,
+ bool patch_last = true) {
// x: [N, h*w, C*ph*pw] if patch_last else [N, h*w, ph*pw*C]
// return: [N, C, H, W]
int64_t N = x->ne[2];
@@ -64,10 +64,10 @@ namespace DiT {
return x;
}
- ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx,
- ggml_tensor* x,
- int ph,
- int pw) {
+ inline ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx,
+ ggml_tensor* x,
+ int ph,
+ int pw) {
int64_t W = x->ne[0];
int64_t H = x->ne[1];
@@ -77,23 +77,23 @@ namespace DiT {
return x;
}
- ggml_tensor* pad_and_patchify(GGMLRunnerContext* ctx,
- ggml_tensor* x,
- int ph,
- int pw,
- bool patch_last = true) {
+ inline ggml_tensor* pad_and_patchify(GGMLRunnerContext* ctx,
+ ggml_tensor* x,
+ int ph,
+ int pw,
+ bool patch_last = true) {
x = pad_to_patch_size(ctx, x, ph, pw);
x = patchify(ctx->ggml_ctx, x, ph, pw, patch_last);
return x;
}
- ggml_tensor* unpatchify_and_crop(ggml_context* ctx,
- ggml_tensor* x,
- int64_t H,
- int64_t W,
- int ph,
- int pw,
- bool patch_last = true) {
+ inline ggml_tensor* unpatchify_and_crop(ggml_context* ctx,
+ ggml_tensor* x,
+ int64_t H,
+ int64_t W,
+ int ph,
+ int pw,
+ bool patch_last = true) {
int pad_h = (ph - H % ph) % ph;
int pad_w = (pw - W % pw) % pw;
int64_t h = ((H + pad_h) / ph);
@@ -105,4 +105,4 @@ namespace DiT {
}
} // namespace DiT
-#endif // __COMMON_DIT_HPP__
\ No newline at end of file
+#endif // __COMMON_DIT_HPP__
diff --git a/src/condition_cache_utils.hpp b/src/condition_cache_utils.hpp
new file mode 100644
index 00000000..903d64e3
--- /dev/null
+++ b/src/condition_cache_utils.hpp
@@ -0,0 +1,64 @@
+#ifndef __CONDITION_CACHE_UTILS_HPP__
+#define __CONDITION_CACHE_UTILS_HPP__
+
+#include <vector>
+
+#include "tensor.hpp"
+
+namespace sd {
+
+ inline bool store_condition_cache_diff(std::vector<float>* diff,
+ const sd::Tensor& input,
+ const sd::Tensor& output) {
+ if (diff == nullptr || input.empty() || output.empty()) {
+ return false;
+ }
+
+ size_t input_size = static_cast<size_t>(input.numel());
+ size_t output_size = static_cast<size_t>(output.numel());
+ if (input_size == 0 || input_size != output_size) {
+ diff->clear();
+ return false;
+ }
+
+ const float* input_data = input.data();
+ const float* output_data = output.data();
+ if (input_data == nullptr || output_data == nullptr) {
+ diff->clear();
+ return false;
+ }
+
+ diff->resize(output_size);
+ for (size_t i = 0; i < output_size; ++i) {
+ (*diff)[i] = output_data[i] - input_data[i];
+ }
+ return true;
+ }
+
+ inline bool apply_condition_cache_diff(const std::vector<float>& diff,
+ const sd::Tensor& input,
+ sd::Tensor* output) {
+ if (output == nullptr || input.empty() || diff.empty()) {
+ return false;
+ }
+
+ size_t input_size = static_cast<size_t>(input.numel());
+ if (input_size == 0 || diff.size() != input_size) {
+ return false;
+ }
+
+ *output = input;
+ float* output_data = output->data();
+ if (output_data == nullptr) {
+ return false;
+ }
+
+ for (size_t i = 0; i < input_size; ++i) {
+ output_data[i] += diff[i];
+ }
+ return true;
+ }
+
+} // namespace sd
+
+#endif // __CONDITION_CACHE_UTILS_HPP__
diff --git a/src/conditioner.hpp b/src/conditioner.hpp
index d4a3146b..05167cfd 100644
--- a/src/conditioner.hpp
+++ b/src/conditioner.hpp
@@ -1,53 +1,97 @@
#ifndef __CONDITIONER_HPP__
#define __CONDITIONER_HPP__
+#include
+
#include "clip.hpp"
#include "llm.hpp"
#include "t5.hpp"
+#include "tensor_ggml.hpp"
struct SDCondition {
- struct ggml_tensor* c_crossattn = nullptr; // aka context
- struct ggml_tensor* c_vector = nullptr; // aka y
- struct ggml_tensor* c_concat = nullptr;
+ sd::Tensor c_crossattn;
+ sd::Tensor c_vector;
+ sd::Tensor c_concat;
+ sd::Tensor c_t5_ids;
+ sd::Tensor c_t5_weights;
- std::vector extra_c_crossattns;
+ std::vector> extra_c_crossattns;
SDCondition() = default;
- SDCondition(struct ggml_tensor* c_crossattn,
- struct ggml_tensor* c_vector,
- struct ggml_tensor* c_concat,
- const std::vector& extra_c_crossattns = {})
- : c_crossattn(c_crossattn), c_vector(c_vector), c_concat(c_concat), extra_c_crossattns(extra_c_crossattns) {}
+
+ SDCondition(sd::Tensor c_crossattn,
+ sd::Tensor c_vector,
+ sd::Tensor