diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 76b1793..9816e42 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -70,7 +70,7 @@ jobs:
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
- uses: pr-mpt/actions-commit-hash@v2
+ uses: prompt/actions-commit-hash@v2
- name: Fetch system info
id: system-info
@@ -123,7 +123,7 @@ jobs:
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
- uses: pr-mpt/actions-commit-hash@v2
+ uses: prompt/actions-commit-hash@v2
- name: Fetch system info
id: system-info
@@ -162,7 +162,7 @@ jobs:
strategy:
matrix:
- variant: [musa, sycl, vulkan]
+ variant: [musa, sycl, vulkan, cuda]
env:
REGISTRY: ghcr.io
@@ -177,7 +177,7 @@ jobs:
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
- uses: pr-mpt/actions-commit-hash@v2
+ uses: prompt/actions-commit-hash@v2
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
@@ -240,7 +240,7 @@ jobs:
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
- uses: pr-mpt/actions-commit-hash@v2
+ uses: prompt/actions-commit-hash@v2
- name: Fetch system info
id: system-info
@@ -340,7 +340,7 @@ jobs:
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
- uses: pr-mpt/actions-commit-hash@v2
+ uses: prompt/actions-commit-hash@v2
- name: Pack artifacts
id: pack_artifacts
@@ -463,7 +463,7 @@ jobs:
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
- uses: pr-mpt/actions-commit-hash@v2
+ uses: prompt/actions-commit-hash@v2
- name: Pack artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -485,6 +485,146 @@ jobs:
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
+ ubuntu-latest-rocm:
+ runs-on: ubuntu-latest
+ container: rocm/dev-ubuntu-24.04:7.2
+
+ env:
+ ROCM_VERSION: "7.2"
+ UBUNTU_VERSION: "24.04"
+ GPU_TARGETS: "gfx1151;gfx1150;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
+
+ steps:
+ - run: apt-get update && apt-get install -y git
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v6
+ with:
+ submodules: recursive
+
+ - name: Free disk space
+ run: |
+ # Remove preinstalled SDKs and caches not needed for this job
+ sudo rm -rf /usr/share/dotnet || true
+ sudo rm -rf /usr/local/lib/android || true
+ sudo rm -rf /opt/ghc || true
+ sudo rm -rf /usr/local/.ghcup || true
+ sudo rm -rf /opt/hostedtoolcache || true
+
+ # Remove old package lists and caches
+ sudo rm -rf /var/lib/apt/lists/* || true
+ sudo apt clean
+
+ - name: Dependencies
+ id: depends
+ run: |
+ sudo apt-get update
+ sudo apt install -y \
+ cmake \
+ hip-dev \
+ hipblas-dev \
+ ninja-build \
+ rocm-dev \
+ zip
+ # Clean apt caches to recover disk space
+ sudo apt clean
+ sudo rm -rf /var/lib/apt/lists/* || true
+
+ - name: Setup ROCm Environment
+ run: |
+ # Add ROCm to PATH for current session
+ echo "/opt/rocm/bin" >> $GITHUB_PATH
+
+ # Build regex pattern from ${{ env.GPU_TARGETS }} (match target as substring)
+ TARGET_REGEX="($(printf '%s' "${{ env.GPU_TARGETS }}" | sed 's/;/|/g'))"
+
+ # Remove library files for architectures we're not building for to save disk space
+ echo "Cleaning up unneeded architecture files..."
+ cd /opt/rocm/lib/rocblas/library
+ # Keep only our target architectures
+ for file in *; do
+ if printf '%s' "$file" | grep -q 'gfx'; then
+ if ! printf '%s' "$file" | grep -Eq "$TARGET_REGEX"; then
+ echo "Removing $file" &&
+ sudo rm -f "$file";
+ fi
+ fi
+ done
+
+ cd /opt/rocm/lib/hipblaslt/library
+ for file in *; do
+ if printf '%s' "$file" | grep -q 'gfx'; then
+ if ! printf '%s' "$file" | grep -Eq "$TARGET_REGEX"; then
+ echo "Removing $file" &&
+ sudo rm -f "$file";
+ fi
+ fi
+ done
+
+ - name: Build
+ id: cmake_build
+ run: |
+ mkdir build
+ cd build
+ cmake .. -G Ninja \
+ -DCMAKE_CXX_COMPILER=amdclang++ \
+ -DCMAKE_C_COMPILER=amdclang \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DSD_HIPBLAS=ON \
+ -DGPU_TARGETS="${{ env.GPU_TARGETS }}" \
+ -DAMDGPU_TARGETS="${{ env.GPU_TARGETS }}" \
+ -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
+ -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+ -DSD_BUILD_SHARED_LIBS=ON
+ cmake --build . --config Release
+
+ - name: Get commit hash
+ id: commit
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ uses: prompt/actions-commit-hash@v2
+
+ - name: Prepare artifacts
+ id: prepare_artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ run: |
+ # Copy licenses
+ cp ggml/LICENSE ./build/bin/ggml.txt
+ cp LICENSE ./build/bin/stable-diffusion.cpp.txt
+
+ # Move ROCm runtime libraries (to avoid double space consumption)
+ sudo mv /opt/rocm/lib/librocsparse.so* ./build/bin/
+ sudo mv /opt/rocm/lib/libhsa-runtime64.so* ./build/bin/
+ sudo mv /opt/rocm/lib/libamdhip64.so* ./build/bin/
+ sudo mv /opt/rocm/lib/libhipblas.so* ./build/bin/
+ sudo mv /opt/rocm/lib/libhipblaslt.so* ./build/bin/
+ sudo mv /opt/rocm/lib/librocblas.so* ./build/bin/
+ sudo mv /opt/rocm/lib/rocblas/ ./build/bin/
+ sudo mv /opt/rocm/lib/hipblaslt/ ./build/bin/
+
+ - name: Fetch system info
+ id: system-info
+ run: |
+ echo "CPU_ARCH=`uname -m`" >> "$GITHUB_OUTPUT"
+ echo "OS_NAME=`lsb_release -s -i`" >> "$GITHUB_OUTPUT"
+ echo "OS_VERSION=`lsb_release -s -r`" >> "$GITHUB_OUTPUT"
+ echo "OS_TYPE=`uname -s`" >> "$GITHUB_OUTPUT"
+
+ - name: Pack artifacts
+ id: pack_artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ run: |
+ cp ggml/LICENSE ./build/bin/ggml.txt
+ cp LICENSE ./build/bin/stable-diffusion.cpp.txt
+ zip -y -r sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip ./build/bin
+
+ - name: Upload artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ uses: actions/upload-artifact@v4
+ with:
+ name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip
+ path: |
+ sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip
+
release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -493,6 +633,7 @@ jobs:
needs:
- ubuntu-latest-cmake
- ubuntu-latest-cmake-vulkan
+ - ubuntu-latest-rocm
- build-and-push-docker-images
- macOS-latest-cmake
- windows-latest-cmake
@@ -519,7 +660,7 @@ jobs:
- name: Get commit hash
id: commit
- uses: pr-mpt/actions-commit-hash@v2
+ uses: prompt/actions-commit-hash@v2
- name: Create release
id: create_release
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e731d95..bad1ba4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,7 +36,6 @@ option(SD_VULKAN "sd: vulkan backend" OFF)
option(SD_OPENCL "sd: opencl backend" OFF)
option(SD_SYCL "sd: sycl backend" OFF)
option(SD_MUSA "sd: musa backend" OFF)
-option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
option(SD_BUILD_SHARED_GGML_LIB "sd: build ggml as a separate shared lib" OFF)
option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF)
@@ -70,26 +69,22 @@ if (SD_HIPBLAS)
message("-- Use HIPBLAS as backend stable-diffusion")
set(GGML_HIP ON)
add_definitions(-DSD_USE_CUDA)
- if(SD_FAST_SOFTMAX)
- set(GGML_CUDA_FAST_SOFTMAX ON)
- endif()
endif ()
if(SD_MUSA)
message("-- Use MUSA as backend stable-diffusion")
set(GGML_MUSA ON)
add_definitions(-DSD_USE_CUDA)
- if(SD_FAST_SOFTMAX)
- set(GGML_CUDA_FAST_SOFTMAX ON)
- endif()
endif()
set(SD_LIB stable-diffusion)
file(GLOB SD_LIB_SOURCES
- "*.h"
- "*.cpp"
- "*.hpp"
+ "src/*.h"
+ "src/*.cpp"
+ "src/*.hpp"
+ "src/vocab/*.h"
+ "src/vocab/*.cpp"
)
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
@@ -119,7 +114,7 @@ endif()
message(STATUS "stable-diffusion.cpp commit ${SDCPP_BUILD_COMMIT}")
set_property(
- SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/version.cpp
+ SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/src/version.cpp
APPEND PROPERTY COMPILE_DEFINITIONS
SDCPP_BUILD_COMMIT=${SDCPP_BUILD_COMMIT} SDCPP_BUILD_VERSION=${SDCPP_BUILD_VERSION}
)
@@ -182,6 +177,7 @@ endif()
add_subdirectory(thirdparty)
target_link_libraries(${SD_LIB} PUBLIC ggml zip)
+target_include_directories(${SD_LIB} PUBLIC . include)
target_include_directories(${SD_LIB} PUBLIC . thirdparty)
target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17)
@@ -190,7 +186,7 @@ if (SD_BUILD_EXAMPLES)
add_subdirectory(examples)
endif()
-set(SD_PUBLIC_HEADERS stable-diffusion.h)
+set(SD_PUBLIC_HEADERS include/stable-diffusion.h)
set_target_properties(${SD_LIB} PROPERTIES PUBLIC_HEADER "${SD_PUBLIC_HEADERS}")
install(TARGETS ${SD_LIB} LIBRARY PUBLIC_HEADER)
diff --git a/Dockerfile.cuda b/Dockerfile.cuda
new file mode 100644
index 0000000..13fef89
--- /dev/null
+++ b/Dockerfile.cuda
@@ -0,0 +1,25 @@
+ARG CUDA_VERSION=12.6.3
+ARG UBUNTU_VERSION=24.04
+
+FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu${UBUNTU_VERSION} AS build
+
+RUN apt-get update && apt-get install -y --no-install-recommends build-essential git ccache cmake
+
+WORKDIR /sd.cpp
+
+COPY . .
+
+ARG CUDACXX=/usr/local/cuda/bin/nvcc
+RUN cmake . -B ./build -DSD_CUDA=ON
+RUN cmake --build ./build --config Release --parallel
+
+FROM nvidia/cuda:${CUDA_VERSION}-cudnn-runtime-ubuntu${UBUNTU_VERSION} AS runtime
+
+RUN apt-get update && \
+ apt-get install --yes --no-install-recommends libgomp1 && \
+ apt-get clean
+
+COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
+COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
+
+ENTRYPOINT [ "/sd-cli" ]
diff --git a/README.md b/README.md
index 89e0b02..b5bb497 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,9 @@ API and command-line option may change frequently.***
## 🔥Important News
+* **2026/01/18** 🚀 stable-diffusion.cpp now supports **FLUX.2-klein**
+ 👉 Details: [PR #1193](https://github.com/leejet/stable-diffusion.cpp/pull/1193)
+
* **2025/12/01** 🚀 stable-diffusion.cpp now supports **Z-Image**
👉 Details: [PR #1020](https://github.com/leejet/stable-diffusion.cpp/pull/1020)
@@ -50,6 +53,7 @@ API and command-line option may change frequently.***
- [Qwen Image](./docs/qwen_image.md)
- [Z-Image](./docs/z_image.md)
- [Ovis-Image](./docs/ovis_image.md)
+ - [Anima](./docs/anima.md)
- Image Edit Models
- [FLUX.1-Kontext-dev](./docs/kontext.md)
- [Qwen Image Edit series](./docs/qwen_image_edit.md)
@@ -136,6 +140,7 @@ If you want to improve performance or reduce VRAM/RAM usage, please refer to [pe
- [🔥Wan2.1/Wan2.2](./docs/wan.md)
- [🔥Z-Image](./docs/z_image.md)
- [Ovis-Image](./docs/ovis_image.md)
+- [Anima](./docs/anima.md)
- [LoRA](./docs/lora.md)
- [LCM/LCM-LoRA](./docs/lcm.md)
- [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)
diff --git a/assets/anima/example.png b/assets/anima/example.png
new file mode 100644
index 0000000..ab91dbf
Binary files /dev/null and b/assets/anima/example.png differ
diff --git a/assets/z_image/base_bf16.png b/assets/z_image/base_bf16.png
new file mode 100644
index 0000000..f2b918c
Binary files /dev/null and b/assets/z_image/base_bf16.png differ
diff --git a/docs/anima.md b/docs/anima.md
new file mode 100644
index 0000000..debc370
--- /dev/null
+++ b/docs/anima.md
@@ -0,0 +1,21 @@
+# How to Use
+
+## Download weights
+
+- Download Anima
+ - safetensors: https://huggingface.co/circlestone-labs/Anima/tree/main/split_files/diffusion_models
+ - gguf: https://huggingface.co/Bedovyy/Anima-GGUF/tree/main
+ - gguf Anima2: https://huggingface.co/JusteLeo/Anima2-GGUF/tree/main
+- Download vae
+ - safetensors: https://huggingface.co/circlestone-labs/Anima/tree/main/split_files/vae
+- Download Qwen3-0.6B-Base
+ - safetensors: https://huggingface.co/circlestone-labs/Anima/tree/main/split_files/text_encoders
+ - gguf: https://huggingface.co/mradermacher/Qwen3-0.6B-Base-GGUF/tree/main
+
+## Examples
+
+```sh
+.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\anima-preview.safetensors --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_06b_base.safetensors -p "a lovely cat holding a sign says 'anima.cpp'" --cfg-scale 6.0 --sampling-method euler -v --offload-to-cpu --diffusion-fa
+```
+
+
diff --git a/docs/caching.md b/docs/caching.md
index 7b4be3c..cb103ae 100644
--- a/docs/caching.md
+++ b/docs/caching.md
@@ -11,6 +11,7 @@ Caching methods accelerate diffusion inference by reusing intermediate computati
| `dbcache` | DiT models | Block-level L1 residual threshold |
| `taylorseer` | DiT models | Taylor series approximation |
| `cache-dit` | DiT models | Combined DBCache + TaylorSeer |
+| `spectrum` | UNET models | Chebyshev + Taylor output forecasting |
### UCache (UNET Models)
@@ -79,7 +80,7 @@ Uses Taylor series approximation to predict block outputs:
Combines DBCache and TaylorSeer:
```bash
---cache-mode cache-dit --cache-preset fast
+--cache-mode cache-dit
```
#### Parameters
@@ -91,14 +92,6 @@ Combines DBCache and TaylorSeer:
| `threshold` | L1 residual difference threshold | 0.08 |
| `warmup` | Steps before caching starts | 8 |
-#### Presets
-
-Available presets: `slow`, `medium`, `fast`, `ultra` (or `s`, `m`, `f`, `u`).
-
-```bash
---cache-mode cache-dit --cache-preset fast
-```
-
#### SCM Options
Steps Computation Mask controls which steps can be cached:
@@ -118,6 +111,28 @@ Mask values: `1` = compute, `0` = can cache.
--scm-policy dynamic
```
+### Spectrum (UNET Models)
+
+Spectrum uses Chebyshev polynomial fitting blended with Taylor extrapolation to predict denoised outputs, skipping entire UNet forward passes. Based on the paper [Spectrum: Adaptive Spectral Feature Forecasting for Efficient Diffusion Sampling](https://github.com/tingyu215/Spectrum).
+
+```bash
+sd-cli -m model.safetensors -p "a cat" --cache-mode spectrum
+```
+
+#### Parameters
+
+| Parameter | Description | Default |
+|-----------|-------------|---------|
+| `w` | Chebyshev vs Taylor blend weight (0=Taylor, 1=Chebyshev) | 0.40 |
+| `m` | Chebyshev polynomial degree | 3 |
+| `lam` | Ridge regression regularization | 1.0 |
+| `window` | Initial window size (compute every N steps) | 2 |
+| `flex` | Window growth per computed step after warmup | 0.50 |
+| `warmup` | Steps to always compute before caching starts | 4 |
+| `stop` | Stop caching at this fraction of total steps | 0.9 |
+
+
+
### Performance Tips
- Start with default thresholds and adjust based on output quality
diff --git a/docs/distilled_sd.md b/docs/distilled_sd.md
index 232c022..3174b18 100644
--- a/docs/distilled_sd.md
+++ b/docs/distilled_sd.md
@@ -1,8 +1,8 @@
-# Running distilled models: SSD1B and SDx.x with tiny U-Nets
+# Running distilled models: SSD1B, Vega and SDx.x with tiny U-Nets
## Preface
-These models feature a reduced U-Net architecture. Unlike standard SDXL models, the SSD-1B U-Net contains only one middle block and fewer attention layers in its up- and down-blocks, resulting in significantly smaller file sizes. Using these models can reduce inference time by more than 33%. For more details, refer to Segmind's paper: https://arxiv.org/abs/2401.02677v1.
+These models feature a reduced U-Net architecture. Unlike standard SDXL models, the SSD-1B and Vega U-Nets contain only one middle block and fewer attention layers in their up- and down-blocks, resulting in significantly smaller file sizes. Using these models can reduce inference time by more than 33%. For more details, refer to Segmind's paper: https://arxiv.org/abs/2401.02677v1.
Similarly, SD1.x- and SD2.x-style models with a tiny U-Net consist of only 6 U-Net blocks, leading to very small files and time savings of up to 50%. For more information, see the paper: https://arxiv.org/pdf/2305.15798.pdf.
## SSD1B
@@ -17,7 +17,17 @@ Useful LoRAs are also available:
* https://huggingface.co/seungminh/lora-swarovski-SSD-1B/resolve/main/pytorch_lora_weights.safetensors
* https://huggingface.co/kylielee505/mylcmlorassd/resolve/main/pytorch_lora_weights.safetensors
-These files can be used out-of-the-box, unlike the models described in the next section.
+## Vega
+
+Segmind's Vega model is available online here:
+
+ * https://huggingface.co/segmind/Segmind-Vega/resolve/main/segmind-vega.safetensors
+
+VegaRT is an example for an LCM-LoRA:
+
+ * https://huggingface.co/segmind/Segmind-VegaRT/resolve/main/pytorch_lora_weights.safetensors
+
+Both files can be used out-of-the-box, unlike the models described in the next sections.
## SD1.x, SD2.x with tiny U-Nets
diff --git a/docs/esrgan.md b/docs/esrgan.md
index 7723172..39a9760 100644
--- a/docs/esrgan.md
+++ b/docs/esrgan.md
@@ -1,6 +1,6 @@
## Using ESRGAN to upscale results
-You can use ESRGAN to upscale the generated images. At the moment, only the [RealESRGAN_x4plus_anime_6B.pth](https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth) model is supported. Support for more models of this architecture will be added soon.
+You can use ESRGAN—such as the model [RealESRGAN_x4plus_anime_6B.pth](https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth)—to upscale the generated images and improve their overall resolution and clarity.
- Specify the model path using the `--upscale-model PATH` parameter. example:
diff --git a/docs/z_image.md b/docs/z_image.md
index 122f1f2..2ea66f9 100644
--- a/docs/z_image.md
+++ b/docs/z_image.md
@@ -7,6 +7,9 @@ You can run Z-Image with stable-diffusion.cpp on GPUs with 4GB of VRAM — or ev
- Download Z-Image-Turbo
- safetensors: https://huggingface.co/Comfy-Org/z_image_turbo/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/leejet/Z-Image-Turbo-GGUF/tree/main
+- Download Z-Image
+ - safetensors: https://huggingface.co/Comfy-Org/z_image/tree/main/split_files/diffusion_models
+ - gguf: https://huggingface.co/unsloth/Z-Image-GGUF/tree/main
- Download vae
- safetensors: https://huggingface.co/black-forest-labs/FLUX.1-schnell/tree/main
- Download Qwen3 4b
@@ -15,12 +18,22 @@ You can run Z-Image with stable-diffusion.cpp on GPUs with 4GB of VRAM — or ev
## Examples
+### Z-Image-Turbo
+
```
.\bin\Release\sd-cli.exe --diffusion-model z_image_turbo-Q3_K.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\Qwen3-4B-Instruct-2507-Q4_K_M.gguf -p "A cinematic, melancholic photograph of a solitary hooded figure walking through a sprawling, rain-slicked metropolis at night. The city lights are a chaotic blur of neon orange and cool blue, reflecting on the wet asphalt. The scene evokes a sense of being a single component in a vast machine. Superimposed over the image in a sleek, modern, slightly glitched font is the philosophical quote: 'THE CITY IS A CIRCUIT BOARD, AND I AM A BROKEN TRANSISTOR.' -- moody, atmospheric, profound, dark academic" --cfg-scale 1.0 -v --offload-to-cpu --diffusion-fa -H 1024 -W 512
```
+### Z-Image-Base
+
+```
+.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\z_image_bf16.safetensors --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\qwen_3_4b.safetensors -p "A cinematic, melancholic photograph of a solitary hooded figure walking through a sprawling, rain-slicked metropolis at night. The city lights are a chaotic blur of neon orange and cool blue, reflecting on the wet asphalt. The scene evokes a sense of being a single component in a vast machine. Superimposed over the image in a sleek, modern, slightly glitched font is the philosophical quote: 'THE CITY IS A CIRCUIT BOARD, AND I AM A BROKEN TRANSISTOR.' -- moody, atmospheric, profound, dark academic" --cfg-scale 5.0 -v --offload-to-cpu --diffusion-fa -H 1024 -W 512
+```
+
+
+
## Comparison of Different Quantization Types
| bf16 | q8_0 | q6_K | q5_0 | q4_K | q4_0 | q3_K | q2_K|
diff --git a/examples/cli/README.md b/examples/cli/README.md
index 84dd5c7..904f3c4 100644
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@@ -4,11 +4,12 @@
usage: ./bin/sd-cli [options]
CLI Options:
- -o, --output path to write result image to. you can use printf-style %d format specifiers for image sequences (default: ./output.png) (eg. output_%03d.png)
- --output-begin-idx starting index for output image sequence, must be non-negative (default 0 if specified %d in output path, 1 otherwise)
+ -o, --output path to write result image to. you can use printf-style %d format specifiers for image sequences (default:
+ ./output.png) (eg. output_%03d.png)
--preview-path path to write preview image to (default: ./preview.png)
--preview-interval interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at
every step)
+ --output-begin-idx starting index for output image sequence, must be non-negative (default 0 if specified %d in output path, 1 otherwise)
--canny apply canny preprocessor (edge detection)
--convert-name convert tensor name (for convert mode)
-v, --verbose print extra info
@@ -44,7 +45,6 @@ Context Options:
CPU physical cores
--chroma-t5-mask-pad t5 mask pad size of chroma
--vae-tile-overlap tile overlap for vae tiling, in fraction of tile size (default: 0.5)
- --flow-shift shift value for Flow models like SD3.x or WAN (default: auto)
--vae-tiling process vae in tiles to reduce memory usage
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
@@ -52,13 +52,15 @@ Context Options:
--control-net-cpu keep controlnet in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram)
--vae-on-cpu keep vae in cpu (for low vram)
- --diffusion-fa use flash attention in the diffusion model
+ --fa use flash attention
+ --diffusion-fa use flash attention in the diffusion model only
--diffusion-conv-direct use ggml_conv2d_direct in the diffusion model
--vae-conv-direct use ggml_conv2d_direct in the vae model
--circular enable circular padding for convolutions
--circularx enable circular RoPE wrapping on x-axis (width) only
--circulary enable circular RoPE wrapping on y-axis (height) only
--chroma-disable-dit-mask disable dit mask for chroma
+ --qwen-image-zero-cond-t enable zero_cond_t for qwen image
--chroma-enable-t5-mask enable t5 mask for chroma
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
type of the weight file
@@ -108,6 +110,7 @@ Generation Options:
--skip-layer-start SLG enabling point (default: 0.01)
--skip-layer-end SLG disabling point (default: 0.2)
--eta eta in DDIM, only for DDIM and TCD (default: 0)
+ --flow-shift shift value for Flow models like SD3.x or WAN (default: auto)
--high-noise-cfg-scale (high noise) unconditional guidance scale: (default: 7.0)
--high-noise-img-cfg-scale (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
--high-noise-guidance (high noise) distilled guidance scale for models with guidance input (default: 3.5)
@@ -124,20 +127,23 @@ Generation Options:
--disable-auto-resize-ref-image disable auto resize of ref images
-s, --seed RNG seed (default: 42, use random seed for < 0)
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
- tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise)
+ tcd, res_multistep, res_2s] (default: euler for Flux/SD3/Wan, euler_a
+ otherwise)
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
- ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise
+ ddim_trailing, tcd, res_multistep, res_2s] default: euler for Flux/SD3/Wan,
+ euler_a otherwise
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple,
- kl_optimal, lcm], default: discrete
+ kl_optimal, lcm, bong_tangent], default: discrete
--sigmas custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0").
--skip-layers layers to skip for SLG steps (default: [7,8,9])
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
-r, --ref-image reference image for Flux Kontext models (can be used multiple times)
- --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level)
+ --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level),
+ 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting)
--cache-option named cache params (key=value format, comma-separated). easycache/ucache:
- threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples:
- "threshold=0.25" or "threshold=1.5,reset=0"
- --cache-preset cache-dit preset: 'slow'/'s', 'medium'/'m', 'fast'/'f', 'ultra'/'u'
+ threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=;
+ spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. Examples:
+ "threshold=0.25" or "threshold=1.5,reset=0" or "w=0.4,window=2"
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
--scm-policy SCM policy: 'dynamic' (default) or 'static'
```
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index ddc2828..f9e4928 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -245,7 +245,7 @@ std::string get_image_params(const SDCliParams& cli_params, const SDContextParam
parameter_string += "Guidance: " + std::to_string(gen_params.sample_params.guidance.distilled_guidance) + ", ";
parameter_string += "Eta: " + std::to_string(gen_params.sample_params.eta) + ", ";
parameter_string += "Seed: " + std::to_string(seed) + ", ";
- parameter_string += "Size: " + std::to_string(gen_params.width) + "x" + std::to_string(gen_params.height) + ", ";
+ parameter_string += "Size: " + std::to_string(gen_params.get_resolved_width()) + "x" + std::to_string(gen_params.get_resolved_height()) + ", ";
parameter_string += "Model: " + sd_basename(ctx_params.model_path) + ", ";
parameter_string += "RNG: " + std::string(sd_rng_type_name(ctx_params.rng_type)) + ", ";
if (ctx_params.sampler_rng_type != RNG_TYPE_COUNT) {
@@ -394,12 +394,15 @@ bool save_results(const SDCliParams& cli_params,
fs::path base_path = out_path;
fs::path ext = out_path.has_extension() ? out_path.extension() : fs::path{};
- if (!ext.empty())
- base_path.replace_extension();
std::string ext_lower = ext.string();
std::transform(ext_lower.begin(), ext_lower.end(), ext_lower.begin(), ::tolower);
bool is_jpg = (ext_lower == ".jpg" || ext_lower == ".jpeg" || ext_lower == ".jpe");
+ if (!ext.empty()) {
+ if (is_jpg || ext_lower == ".png") {
+ base_path.replace_extension();
+ }
+ }
int output_begin_idx = cli_params.output_begin_idx;
if (output_begin_idx < 0) {
@@ -409,7 +412,7 @@ bool save_results(const SDCliParams& cli_params,
auto write_image = [&](const fs::path& path, int idx) {
const sd_image_t& img = results[idx];
if (!img.data)
- return;
+ return false;
std::string params = get_image_params(cli_params, ctx_params, gen_params, gen_params.seed + idx);
int ok = 0;
@@ -419,8 +422,11 @@ bool save_results(const SDCliParams& cli_params,
ok = stbi_write_png(path.string().c_str(), img.width, img.height, img.channel, img.data, 0, params.c_str());
}
LOG_INFO("save result image %d to '%s' (%s)", idx, path.string().c_str(), ok ? "success" : "failure");
+ return ok != 0;
};
+ int sucessful_reults = 0;
+
if (std::regex_search(cli_params.output_path, format_specifier_regex)) {
if (!is_jpg && ext_lower != ".png")
ext = ".png";
@@ -429,9 +435,12 @@ bool save_results(const SDCliParams& cli_params,
for (int i = 0; i < num_results; ++i) {
fs::path img_path = format_frame_idx(pattern.string(), output_begin_idx + i);
- write_image(img_path, i);
+ if (write_image(img_path, i)) {
+ sucessful_reults++;
+ }
}
- return true;
+ LOG_INFO("%d/%d images saved", sucessful_reults, num_results);
+ return sucessful_reults != 0;
}
if (cli_params.mode == VID_GEN && num_results > 1) {
@@ -439,9 +448,13 @@ bool save_results(const SDCliParams& cli_params,
ext = ".avi";
fs::path video_path = base_path;
video_path += ext;
- create_mjpg_avi_from_sd_images(video_path.string().c_str(), results, num_results, gen_params.fps);
- LOG_INFO("save result MJPG AVI video to '%s'", video_path.string().c_str());
- return true;
+ if (create_mjpg_avi_from_sd_images(video_path.string().c_str(), results, num_results, gen_params.fps) == 0) {
+ LOG_INFO("save result MJPG AVI video to '%s'", video_path.string().c_str());
+ return true;
+ } else {
+            LOG_ERROR("Failed to save result MJPG AVI video to '%s'", video_path.string().c_str());
+ return false;
+ }
}
if (!is_jpg && ext_lower != ".png")
@@ -453,10 +466,12 @@ bool save_results(const SDCliParams& cli_params,
img_path += "_" + std::to_string(output_begin_idx + i);
}
img_path += ext;
- write_image(img_path, i);
+ if (write_image(img_path, i)) {
+ sucessful_reults++;
+ }
}
-
- return true;
+ LOG_INFO("%d/%d images saved", sucessful_reults, num_results);
+ return sucessful_reults != 0;
}
int main(int argc, const char* argv[]) {
@@ -526,10 +541,10 @@ int main(int argc, const char* argv[]) {
}
bool vae_decode_only = true;
- sd_image_t init_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr};
- sd_image_t end_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr};
- sd_image_t control_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr};
- sd_image_t mask_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 1, nullptr};
+ sd_image_t init_image = {0, 0, 3, nullptr};
+ sd_image_t end_image = {0, 0, 3, nullptr};
+ sd_image_t control_image = {0, 0, 3, nullptr};
+ sd_image_t mask_image = {0, 0, 1, nullptr};
std::vector ref_images;
std::vector pmid_images;
std::vector control_frames;
@@ -556,57 +571,79 @@ int main(int argc, const char* argv[]) {
control_frames.clear();
};
+ auto load_image_and_update_size = [&](const std::string& path,
+ sd_image_t& image,
+ bool resize_image = true,
+ int expected_channel = 3) -> bool {
+ int expected_width = 0;
+ int expected_height = 0;
+ if (resize_image && gen_params.width_and_height_are_set()) {
+ expected_width = gen_params.width;
+ expected_height = gen_params.height;
+ }
+
+ if (!load_sd_image_from_file(&image, path.c_str(), expected_width, expected_height, expected_channel)) {
+ LOG_ERROR("load image from '%s' failed", path.c_str());
+ release_all_resources();
+ return false;
+ }
+
+ gen_params.set_width_and_height_if_unset(image.width, image.height);
+ return true;
+ };
+
if (gen_params.init_image_path.size() > 0) {
vae_decode_only = false;
-
- int width = 0;
- int height = 0;
- init_image.data = load_image_from_file(gen_params.init_image_path.c_str(), width, height, gen_params.width, gen_params.height);
- if (init_image.data == nullptr) {
- LOG_ERROR("load image from '%s' failed", gen_params.init_image_path.c_str());
- release_all_resources();
+ if (!load_image_and_update_size(gen_params.init_image_path, init_image)) {
return 1;
}
}
if (gen_params.end_image_path.size() > 0) {
vae_decode_only = false;
-
- int width = 0;
- int height = 0;
- end_image.data = load_image_from_file(gen_params.end_image_path.c_str(), width, height, gen_params.width, gen_params.height);
- if (end_image.data == nullptr) {
- LOG_ERROR("load image from '%s' failed", gen_params.end_image_path.c_str());
- release_all_resources();
+ if (!load_image_and_update_size(gen_params.end_image_path, end_image)) {
return 1;
}
}
+ if (gen_params.ref_image_paths.size() > 0) {
+ vae_decode_only = false;
+ for (auto& path : gen_params.ref_image_paths) {
+ sd_image_t ref_image = {0, 0, 3, nullptr};
+ if (!load_image_and_update_size(path, ref_image, false)) {
+ return 1;
+ }
+ ref_images.push_back(ref_image);
+ }
+ }
+
if (gen_params.mask_image_path.size() > 0) {
- int c = 0;
- int width = 0;
- int height = 0;
- mask_image.data = load_image_from_file(gen_params.mask_image_path.c_str(), width, height, gen_params.width, gen_params.height, 1);
- if (mask_image.data == nullptr) {
+ if (!load_sd_image_from_file(&mask_image,
+ gen_params.mask_image_path.c_str(),
+ gen_params.get_resolved_width(),
+ gen_params.get_resolved_height(),
+ 1)) {
LOG_ERROR("load image from '%s' failed", gen_params.mask_image_path.c_str());
release_all_resources();
return 1;
}
} else {
- mask_image.data = (uint8_t*)malloc(gen_params.width * gen_params.height);
+ mask_image.data = (uint8_t*)malloc(gen_params.get_resolved_width() * gen_params.get_resolved_height());
if (mask_image.data == nullptr) {
LOG_ERROR("malloc mask image failed");
release_all_resources();
return 1;
}
- memset(mask_image.data, 255, gen_params.width * gen_params.height);
+ mask_image.width = gen_params.get_resolved_width();
+ mask_image.height = gen_params.get_resolved_height();
+ memset(mask_image.data, 255, gen_params.get_resolved_width() * gen_params.get_resolved_height());
}
if (gen_params.control_image_path.size() > 0) {
- int width = 0;
- int height = 0;
- control_image.data = load_image_from_file(gen_params.control_image_path.c_str(), width, height, gen_params.width, gen_params.height);
- if (control_image.data == nullptr) {
+ if (!load_sd_image_from_file(&control_image,
+ gen_params.control_image_path.c_str(),
+ gen_params.get_resolved_width(),
+ gen_params.get_resolved_height())) {
LOG_ERROR("load image from '%s' failed", gen_params.control_image_path.c_str());
release_all_resources();
return 1;
@@ -621,29 +658,11 @@ int main(int argc, const char* argv[]) {
}
}
- if (gen_params.ref_image_paths.size() > 0) {
- vae_decode_only = false;
- for (auto& path : gen_params.ref_image_paths) {
- int width = 0;
- int height = 0;
- uint8_t* image_buffer = load_image_from_file(path.c_str(), width, height);
- if (image_buffer == nullptr) {
- LOG_ERROR("load image from '%s' failed", path.c_str());
- release_all_resources();
- return 1;
- }
- ref_images.push_back({(uint32_t)width,
- (uint32_t)height,
- 3,
- image_buffer});
- }
- }
-
if (!gen_params.control_video_path.empty()) {
if (!load_images_from_dir(gen_params.control_video_path,
control_frames,
- gen_params.width,
- gen_params.height,
+ gen_params.get_resolved_width(),
+ gen_params.get_resolved_height(),
gen_params.video_frames,
cli_params.verbose)) {
release_all_resources();
@@ -717,8 +736,8 @@ int main(int argc, const char* argv[]) {
gen_params.auto_resize_ref_image,
gen_params.increase_ref_index,
mask_image,
- gen_params.width,
- gen_params.height,
+ gen_params.get_resolved_width(),
+ gen_params.get_resolved_height(),
gen_params.sample_params,
gen_params.strength,
gen_params.seed,
@@ -748,8 +767,8 @@ int main(int argc, const char* argv[]) {
end_image,
control_frames.data(),
(int)control_frames.size(),
- gen_params.width,
- gen_params.height,
+ gen_params.get_resolved_width(),
+ gen_params.get_resolved_height(),
gen_params.sample_params,
gen_params.high_noise_sample_params,
gen_params.moe_boundary,
diff --git a/examples/common/common.hpp b/examples/common/common.hpp
index d299da5..9389b03 100644
--- a/examples/common/common.hpp
+++ b/examples/common/common.hpp
@@ -445,7 +445,7 @@ struct SDContextParams {
std::string photo_maker_path;
sd_type_t wtype = SD_TYPE_COUNT;
std::string tensor_type_rules;
- std::string lora_model_dir;
+ std::string lora_model_dir = ".";
std::map embedding_map;
std::vector embedding_vec;
@@ -457,6 +457,7 @@ struct SDContextParams {
bool control_net_cpu = false;
bool clip_on_cpu = false;
bool vae_on_cpu = false;
+ bool flash_attn = false;
bool diffusion_flash_attn = false;
bool diffusion_conv_direct = false;
bool vae_conv_direct = false;
@@ -580,10 +581,6 @@ struct SDContextParams {
"--vae-tile-overlap",
"tile overlap for vae tiling, in fraction of tile size (default: 0.5)",
&vae_tiling_params.target_overlap},
- {"",
- "--flow-shift",
- "shift value for Flow models like SD3.x or WAN (default: auto)",
- &flow_shift},
};
options.bool_options = {
@@ -615,9 +612,13 @@ struct SDContextParams {
"--vae-on-cpu",
"keep vae in cpu (for low vram)",
true, &vae_on_cpu},
+ {"",
+ "--fa",
+ "use flash attention",
+ true, &flash_attn},
{"",
"--diffusion-fa",
- "use flash attention in the diffusion model",
+ "use flash attention in the diffusion model only",
true, &diffusion_flash_attn},
{"",
"--diffusion-conv-direct",
@@ -898,12 +899,12 @@ struct SDContextParams {
<< " photo_maker_path: \"" << photo_maker_path << "\",\n"
<< " rng_type: " << sd_rng_type_name(rng_type) << ",\n"
<< " sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n"
- << " flow_shift: " << (std::isinf(flow_shift) ? "INF" : std::to_string(flow_shift)) << "\n"
<< " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n"
<< " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n"
<< " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n"
<< " clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n"
<< " vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n"
+ << " flash_attn: " << (flash_attn ? "true" : "false") << ",\n"
<< " diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n"
<< " diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n"
<< " vae_conv_direct: " << (vae_conv_direct ? "true" : "false") << ",\n"
@@ -968,6 +969,7 @@ struct SDContextParams {
clip_on_cpu,
control_net_cpu,
vae_on_cpu,
+ flash_attn,
diffusion_flash_attn,
taesd_preview,
diffusion_conv_direct,
@@ -979,7 +981,6 @@ struct SDContextParams {
chroma_use_t5_mask,
chroma_t5_mask_pad,
qwen_image_zero_cond_t,
- flow_shift,
};
return sd_ctx_params;
}
@@ -1024,8 +1025,8 @@ struct SDGenerationParams {
std::string prompt_with_lora; // for metadata record only
std::string negative_prompt;
int clip_skip = -1; // <= 0 represents unspecified
- int width = 512;
- int height = 512;
+ int width = -1;
+ int height = -1;
int batch_count = 1;
std::string init_image_path;
std::string end_image_path;
@@ -1046,7 +1047,6 @@ struct SDGenerationParams {
std::string cache_mode;
std::string cache_option;
- std::string cache_preset;
std::string scm_mask;
bool scm_policy_dynamic = true;
sd_cache_params_t cache_params{};
@@ -1199,6 +1199,10 @@ struct SDGenerationParams {
"--eta",
"eta in DDIM, only for DDIM and TCD (default: 0)",
&sample_params.eta},
+ {"",
+ "--flow-shift",
+ "shift value for Flow models like SD3.x or WAN (default: auto)",
+ &sample_params.flow_shift},
{"",
"--high-noise-cfg-scale",
"(high noise) unconditional guidance scale: (default: 7.0)",
@@ -1417,8 +1421,8 @@ struct SDGenerationParams {
}
cache_mode = argv_to_utf8(index, argv);
if (cache_mode != "easycache" && cache_mode != "ucache" &&
- cache_mode != "dbcache" && cache_mode != "taylorseer" && cache_mode != "cache-dit") {
- fprintf(stderr, "error: invalid cache mode '%s', must be 'easycache', 'ucache', 'dbcache', 'taylorseer', or 'cache-dit'\n", cache_mode.c_str());
+ cache_mode != "dbcache" && cache_mode != "taylorseer" && cache_mode != "cache-dit" && cache_mode != "spectrum") {
+ fprintf(stderr, "error: invalid cache mode '%s', must be 'easycache', 'ucache', 'dbcache', 'taylorseer', 'cache-dit', or 'spectrum'\n", cache_mode.c_str());
return -1;
}
return 1;
@@ -1456,21 +1460,6 @@ struct SDGenerationParams {
return 1;
};
- auto on_cache_preset_arg = [&](int argc, const char** argv, int index) {
- if (++index >= argc) {
- return -1;
- }
- cache_preset = argv_to_utf8(index, argv);
- if (cache_preset != "slow" && cache_preset != "s" && cache_preset != "S" &&
- cache_preset != "medium" && cache_preset != "m" && cache_preset != "M" &&
- cache_preset != "fast" && cache_preset != "f" && cache_preset != "F" &&
- cache_preset != "ultra" && cache_preset != "u" && cache_preset != "U") {
- fprintf(stderr, "error: invalid cache preset '%s', must be 'slow'/'s', 'medium'/'m', 'fast'/'f', or 'ultra'/'u'\n", cache_preset.c_str());
- return -1;
- }
- return 1;
- };
-
options.manual_options = {
{"-s",
"--seed",
@@ -1478,17 +1467,17 @@ struct SDGenerationParams {
on_seed_arg},
{"",
"--sampling-method",
- "sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd] "
+ "sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s] "
"(default: euler for Flux/SD3/Wan, euler_a otherwise)",
on_sample_method_arg},
{"",
"--high-noise-sampling-method",
- "(high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd]"
+ "(high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s]"
" default: euler for Flux/SD3/Wan, euler_a otherwise",
on_high_noise_sample_method_arg},
{"",
"--scheduler",
- "denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, kl_optimal, lcm], default: discrete",
+ "denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent], default: discrete",
on_scheduler_arg},
{"",
"--sigmas",
@@ -1508,16 +1497,12 @@ struct SDGenerationParams {
on_ref_image_arg},
{"",
"--cache-mode",
- "caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level)",
+ "caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting)",
on_cache_mode_arg},
{"",
"--cache-option",
- "named cache params (key=value format, comma-separated). easycache/ucache: threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples: \"threshold=0.25\" or \"threshold=1.5,reset=0\"",
+ "named cache params (key=value format, comma-separated). easycache/ucache: threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=; spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. Examples: \"threshold=0.25\" or \"threshold=1.5,reset=0\"",
on_cache_option_arg},
- {"",
- "--cache-preset",
- "cache-dit preset: 'slow'/'s', 'medium'/'m', 'fast'/'f', 'ultra'/'u'",
- on_cache_preset_arg},
{"",
"--scm-mask",
"SCM steps mask for cache-dit: comma-separated 0/1 (e.g., \"1,1,1,0,0,1,0,0,1,0\") - 1=compute, 0=can cache",
@@ -1570,7 +1555,6 @@ struct SDGenerationParams {
load_if_exists("negative_prompt", negative_prompt);
load_if_exists("cache_mode", cache_mode);
load_if_exists("cache_option", cache_option);
- load_if_exists("cache_preset", cache_preset);
load_if_exists("scm_mask", scm_mask);
load_if_exists("clip_skip", clip_skip);
@@ -1599,6 +1583,7 @@ struct SDGenerationParams {
load_if_exists("cfg_scale", sample_params.guidance.txt_cfg);
load_if_exists("img_cfg_scale", sample_params.guidance.img_cfg);
load_if_exists("guidance", sample_params.guidance.distilled_guidance);
+ load_if_exists("flow_shift", sample_params.flow_shift);
auto load_sampler_if_exists = [&](const char* key, enum sample_method_t& out) {
if (j.contains(key) && j[key].is_string()) {
@@ -1705,17 +1690,24 @@ struct SDGenerationParams {
}
}
+ bool width_and_height_are_set() const {
+ return width > 0 && height > 0;
+ }
+
+ void set_width_and_height_if_unset(int w, int h) {
+ if (!width_and_height_are_set()) {
+ LOG_INFO("set width x height to %d x %d", w, h);
+ width = w;
+ height = h;
+ }
+ }
+
+ int get_resolved_width() const { return (width > 0) ? width : 512; }
+
+ int get_resolved_height() const { return (height > 0) ? height : 512; }
+
bool process_and_check(SDMode mode, const std::string& lora_model_dir) {
prompt_with_lora = prompt;
- if (width <= 0) {
- LOG_ERROR("error: the width must be greater than 0\n");
- return false;
- }
-
- if (height <= 0) {
- LOG_ERROR("error: the height must be greater than 0\n");
- return false;
- }
if (sample_params.sample_steps <= 0) {
LOG_ERROR("error: the sample_steps must be greater than 0\n");
@@ -1766,7 +1758,23 @@ struct SDGenerationParams {
} else if (key == "Bn" || key == "bn") {
cache_params.Bn_compute_blocks = std::stoi(val);
} else if (key == "warmup") {
- cache_params.max_warmup_steps = std::stoi(val);
+ if (cache_mode == "spectrum") {
+ cache_params.spectrum_warmup_steps = std::stoi(val);
+ } else {
+ cache_params.max_warmup_steps = std::stoi(val);
+ }
+ } else if (key == "w") {
+ cache_params.spectrum_w = std::stof(val);
+ } else if (key == "m") {
+ cache_params.spectrum_m = std::stoi(val);
+ } else if (key == "lam") {
+ cache_params.spectrum_lam = std::stof(val);
+ } else if (key == "window") {
+ cache_params.spectrum_window_size = std::stoi(val);
+ } else if (key == "flex") {
+ cache_params.spectrum_flex_window = std::stof(val);
+ } else if (key == "stop") {
+ cache_params.spectrum_stop_percent = std::stof(val);
} else {
LOG_ERROR("error: unknown cache parameter '%s'", key.c_str());
return false;
@@ -1781,39 +1789,17 @@ struct SDGenerationParams {
if (!cache_mode.empty()) {
if (cache_mode == "easycache") {
- cache_params.mode = SD_CACHE_EASYCACHE;
- cache_params.reuse_threshold = 0.2f;
- cache_params.start_percent = 0.15f;
- cache_params.end_percent = 0.95f;
- cache_params.error_decay_rate = 1.0f;
- cache_params.use_relative_threshold = true;
- cache_params.reset_error_on_compute = true;
+ cache_params.mode = SD_CACHE_EASYCACHE;
} else if (cache_mode == "ucache") {
- cache_params.mode = SD_CACHE_UCACHE;
- cache_params.reuse_threshold = 1.0f;
- cache_params.start_percent = 0.15f;
- cache_params.end_percent = 0.95f;
- cache_params.error_decay_rate = 1.0f;
- cache_params.use_relative_threshold = true;
- cache_params.reset_error_on_compute = true;
+ cache_params.mode = SD_CACHE_UCACHE;
} else if (cache_mode == "dbcache") {
- cache_params.mode = SD_CACHE_DBCACHE;
- cache_params.Fn_compute_blocks = 8;
- cache_params.Bn_compute_blocks = 0;
- cache_params.residual_diff_threshold = 0.08f;
- cache_params.max_warmup_steps = 8;
+ cache_params.mode = SD_CACHE_DBCACHE;
} else if (cache_mode == "taylorseer") {
- cache_params.mode = SD_CACHE_TAYLORSEER;
- cache_params.Fn_compute_blocks = 8;
- cache_params.Bn_compute_blocks = 0;
- cache_params.residual_diff_threshold = 0.08f;
- cache_params.max_warmup_steps = 8;
+ cache_params.mode = SD_CACHE_TAYLORSEER;
} else if (cache_mode == "cache-dit") {
- cache_params.mode = SD_CACHE_CACHE_DIT;
- cache_params.Fn_compute_blocks = 8;
- cache_params.Bn_compute_blocks = 0;
- cache_params.residual_diff_threshold = 0.08f;
- cache_params.max_warmup_steps = 8;
+ cache_params.mode = SD_CACHE_CACHE_DIT;
+ } else if (cache_mode == "spectrum") {
+ cache_params.mode = SD_CACHE_SPECTRUM;
}
if (!cache_option.empty()) {
@@ -2083,6 +2069,22 @@ uint8_t* load_image_from_file(const char* image_path,
return load_image_common(false, image_path, 0, width, height, expected_width, expected_height, expected_channel);
}
+bool load_sd_image_from_file(sd_image_t* image,
+ const char* image_path,
+ int expected_width = 0,
+ int expected_height = 0,
+ int expected_channel = 3) {
+ int width;
+ int height;
+ image->data = load_image_common(false, image_path, 0, width, height, expected_width, expected_height, expected_channel);
+ if (image->data == nullptr) {
+ return false;
+ }
+ image->width = width;
+ image->height = height;
+ return true;
+}
+
uint8_t* load_image_from_memory(const char* image_bytes,
int len,
int& width,
diff --git a/examples/server/README.md b/examples/server/README.md
index 7e66815..38deff6 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -4,12 +4,12 @@
usage: ./bin/sd-server [options]
Svr Options:
- -l, --listen-ip server listen ip (default: 127.0.0.1)
- --listen-port server listen port (default: 1234)
- --serve-html-path path to HTML file to serve at root (optional)
- -v, --verbose print extra info
- --color colors the logging tags according to level
- -h, --help show this help message and exit
+ -l, --listen-ip server listen ip (default: 127.0.0.1)
+ --serve-html-path path to HTML file to serve at root (optional)
+ --listen-port server listen port (default: 1234)
+ -v, --verbose print extra info
+ --color colors the logging tags according to level
+ -h, --help show this help message and exit
Context Options:
-m, --model path to full model
@@ -36,21 +36,22 @@ Context Options:
CPU physical cores
--chroma-t5-mask-pad t5 mask pad size of chroma
--vae-tile-overlap tile overlap for vae tiling, in fraction of tile size (default: 0.5)
- --flow-shift shift value for Flow models like SD3.x or WAN (default: auto)
--vae-tiling process vae in tiles to reduce memory usage
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
+ --mmap whether to memory-map model
--control-net-cpu keep controlnet in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram)
--vae-on-cpu keep vae in cpu (for low vram)
- --mmap whether to memory-map model
- --diffusion-fa use flash attention in the diffusion model
+ --fa use flash attention
+ --diffusion-fa use flash attention in the diffusion model only
--diffusion-conv-direct use ggml_conv2d_direct in the diffusion model
--vae-conv-direct use ggml_conv2d_direct in the vae model
--circular enable circular padding for convolutions
--circularx enable circular RoPE wrapping on x-axis (width) only
--circulary enable circular RoPE wrapping on y-axis (height) only
--chroma-disable-dit-mask disable dit mask for chroma
+ --qwen-image-zero-cond-t enable zero_cond_t for qwen image
--chroma-enable-t5-mask enable t5 mask for chroma
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
type of the weight file
@@ -100,6 +101,7 @@ Default Generation Options:
--skip-layer-start SLG enabling point (default: 0.01)
--skip-layer-end SLG disabling point (default: 0.2)
--eta eta in DDIM, only for DDIM and TCD (default: 0)
+ --flow-shift shift value for Flow models like SD3.x or WAN (default: auto)
--high-noise-cfg-scale (high noise) unconditional guidance scale: (default: 7.0)
--high-noise-img-cfg-scale (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
--high-noise-guidance (high noise) distilled guidance scale for models with guidance input (default: 3.5)
@@ -116,20 +118,21 @@ Default Generation Options:
--disable-auto-resize-ref-image disable auto resize of ref images
-s, --seed RNG seed (default: 42, use random seed for < 0)
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
- tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise)
+ tcd, res_multistep, res_2s] (default: euler for Flux/SD3/Wan, euler_a
+ otherwise)
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
- ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise
+ ddim_trailing, tcd, res_multistep, res_2s] default: euler for Flux/SD3/Wan,
+ euler_a otherwise
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple,
- kl_optimal, lcm], default: discrete
+ kl_optimal, lcm, bong_tangent], default: discrete
--sigmas custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0").
--skip-layers layers to skip for SLG steps (default: [7,8,9])
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
-r, --ref-image reference image for Flux Kontext models (can be used multiple times)
- --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level)
+ --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting)
--cache-option named cache params (key=value format, comma-separated). easycache/ucache:
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples:
"threshold=0.25" or "threshold=1.5,reset=0"
- --cache-preset cache-dit preset: 'slow'/'s', 'medium'/'m', 'fast'/'f', 'ultra'/'u'
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
--scm-policy SCM policy: 'dynamic' (default) or 'static'
```
diff --git a/examples/server/main.cpp b/examples/server/main.cpp
index 2b49655..6e4340a 100644
--- a/examples/server/main.cpp
+++ b/examples/server/main.cpp
@@ -267,6 +267,24 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) {
log_print(level, log, svr_params->verbose, svr_params->color);
}
+struct LoraEntry {
+ std::string name;
+ std::string path;
+ std::string fullpath;
+};
+
+void free_results(sd_image_t* result_images, int num_results) {
+ if (result_images) {
+ for (int i = 0; i < num_results; ++i) {
+ if (result_images[i].data) {
+ stbi_image_free(result_images[i].data);
+ result_images[i].data = nullptr;
+ }
+ }
+ }
+ free(result_images);
+}
+
int main(int argc, const char** argv) {
if (argc > 1 && std::string(argv[1]) == "--version") {
std::cout << version_string() << "\n";
@@ -297,6 +315,56 @@ int main(int argc, const char** argv) {
std::mutex sd_ctx_mutex;
+ std::vector<LoraEntry> lora_cache;
+ std::mutex lora_mutex;
+
+ auto refresh_lora_cache = [&]() {
+ std::vector<LoraEntry> new_cache;
+
+ fs::path lora_dir = ctx_params.lora_model_dir;
+ if (fs::exists(lora_dir) && fs::is_directory(lora_dir)) {
+ auto is_lora_ext = [](const fs::path& p) {
+ auto ext = p.extension().string();
+ std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
+ return ext == ".gguf" || ext == ".pt" || ext == ".pth" || ext == ".safetensors";
+ };
+
+ for (auto& entry : fs::recursive_directory_iterator(lora_dir)) {
+ if (!entry.is_regular_file())
+ continue;
+ const fs::path& p = entry.path();
+ if (!is_lora_ext(p))
+ continue;
+
+ LoraEntry e;
+ e.name = p.stem().u8string();
+ e.fullpath = p.u8string();
+ std::string rel = p.lexically_relative(lora_dir).u8string();
+ std::replace(rel.begin(), rel.end(), '\\', '/');
+ e.path = rel;
+
+ new_cache.push_back(std::move(e));
+ }
+ }
+
+ std::sort(new_cache.begin(), new_cache.end(),
+ [](const LoraEntry& a, const LoraEntry& b) {
+ return a.path < b.path;
+ });
+
+ {
+ std::lock_guard<std::mutex> lock(lora_mutex);
+ lora_cache = std::move(new_cache);
+ }
+ };
+
+ auto get_lora_full_path = [&](const std::string& path) -> std::string {
+ std::lock_guard<std::mutex> lock(lora_mutex);
+ auto it = std::find_if(lora_cache.begin(), lora_cache.end(),
+ [&](const LoraEntry& e) { return e.path == path; });
+ return (it != lora_cache.end()) ? it->fullpath : "";
+ };
+
httplib::Server svr;
svr.set_pre_routing_handler([](const httplib::Request& req, httplib::Response& res) {
@@ -361,8 +429,8 @@ int main(int argc, const char** argv) {
std::string size = j.value("size", "");
std::string output_format = j.value("output_format", "png");
int output_compression = j.value("output_compression", 100);
- int width = 512;
- int height = 512;
+ int width = default_gen_params.width > 0 ? default_gen_params.width : 512;
+ int height = default_gen_params.height > 0 ? default_gen_params.height : 512;
if (!size.empty()) {
auto pos = size.find('x');
if (pos != std::string::npos) {
@@ -491,6 +559,7 @@ int main(int argc, const char** argv) {
item["b64_json"] = b64;
out["data"].push_back(item);
}
+ free_results(results, num_results);
res.set_content(out.dump(), "application/json");
res.status = 200;
@@ -521,8 +590,9 @@ int main(int argc, const char** argv) {
std::string sd_cpp_extra_args_str = extract_and_remove_sd_cpp_extra_args(prompt);
- size_t image_count = req.form.get_file_count("image[]");
- if (image_count == 0) {
+ size_t image_count = req.form.get_file_count("image[]");
+ bool has_legacy_image = req.form.has_file("image");
+ if (image_count == 0 && !has_legacy_image) {
res.status = 400;
res.set_content(R"({"error":"at least one image[] required"})", "application/json");
return;
@@ -533,6 +603,10 @@ int main(int argc, const char** argv) {
auto file = req.form.get_file("image[]", i);
images_bytes.emplace_back(file.content.begin(), file.content.end());
}
+ if (image_count == 0 && has_legacy_image) {
+ auto file = req.form.get_file("image");
+ images_bytes.emplace_back(file.content.begin(), file.content.end());
+ }
std::vector mask_bytes;
if (req.form.has_file("mask")) {
@@ -550,7 +624,7 @@ int main(int argc, const char** argv) {
n = std::clamp(n, 1, 8);
std::string size = req.form.get_field("size");
- int width = 512, height = 512;
+ int width = -1, height = -1;
if (!size.empty()) {
auto pos = size.find('x');
if (pos != std::string::npos) {
@@ -607,15 +681,31 @@ int main(int argc, const char** argv) {
LOG_DEBUG("%s\n", gen_params.to_string().c_str());
- sd_image_t init_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr};
- sd_image_t control_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr};
+ sd_image_t init_image = {0, 0, 3, nullptr};
+ sd_image_t control_image = {0, 0, 3, nullptr};
std::vector pmid_images;
+ auto get_resolved_width = [&gen_params, &default_gen_params]() -> int {
+ if (gen_params.width > 0)
+ return gen_params.width;
+ if (default_gen_params.width > 0)
+ return default_gen_params.width;
+ return 512;
+ };
+ auto get_resolved_height = [&gen_params, &default_gen_params]() -> int {
+ if (gen_params.height > 0)
+ return gen_params.height;
+ if (default_gen_params.height > 0)
+ return default_gen_params.height;
+ return 512;
+ };
+
std::vector<sd_image_t> ref_images;
ref_images.reserve(images_bytes.size());
for (auto& bytes : images_bytes) {
- int img_w = width;
- int img_h = height;
+ int img_w;
+ int img_h;
+
uint8_t* raw_pixels = load_image_from_memory(
reinterpret_cast<const char*>(bytes.data()),
static_cast<int>(bytes.size()),
@@ -627,22 +717,31 @@ int main(int argc, const char** argv) {
}
sd_image_t img{(uint32_t)img_w, (uint32_t)img_h, 3, raw_pixels};
+ gen_params.set_width_and_height_if_unset(img.width, img.height);
ref_images.push_back(img);
}
sd_image_t mask_image = {0};
if (!mask_bytes.empty()) {
- int mask_w = width;
- int mask_h = height;
+ int expected_width = 0;
+ int expected_height = 0;
+ if (gen_params.width_and_height_are_set()) {
+ expected_width = gen_params.width;
+ expected_height = gen_params.height;
+ }
+ int mask_w;
+ int mask_h;
+
uint8_t* mask_raw = load_image_from_memory(
reinterpret_cast<const char*>(mask_bytes.data()),
static_cast<int>(mask_bytes.size()),
mask_w, mask_h,
- width, height, 1);
+ expected_width, expected_height, 1);
mask_image = {(uint32_t)mask_w, (uint32_t)mask_h, 1, mask_raw};
+ gen_params.set_width_and_height_if_unset(mask_image.width, mask_image.height);
} else {
- mask_image.width = width;
- mask_image.height = height;
+ mask_image.width = get_resolved_width();
+ mask_image.height = get_resolved_height();
mask_image.channel = 1;
mask_image.data = nullptr;
}
@@ -659,8 +758,8 @@ int main(int argc, const char** argv) {
gen_params.auto_resize_ref_image,
gen_params.increase_ref_index,
mask_image,
- gen_params.width,
- gen_params.height,
+ get_resolved_width(),
+ get_resolved_height(),
gen_params.sample_params,
gen_params.strength,
gen_params.seed,
@@ -705,6 +804,7 @@ int main(int argc, const char** argv) {
item["b64_json"] = b64;
out["data"].push_back(item);
}
+ free_results(results, num_results);
res.set_content(out.dump(), "application/json");
res.status = 200;
@@ -743,8 +843,8 @@ int main(int argc, const char** argv) {
std::string negative_prompt = j.value("negative_prompt", "");
int width = j.value("width", 512);
int height = j.value("height", 512);
- int steps = j.value("steps", -1);
- float cfg_scale = j.value("cfg_scale", 7.f);
+ int steps = j.value("steps", default_gen_params.sample_params.sample_steps);
+ float cfg_scale = j.value("cfg_scale", default_gen_params.sample_params.guidance.txt_cfg);
int64_t seed = j.value("seed", -1);
int batch_size = j.value("batch_size", 1);
int clip_skip = j.value("clip_skip", -1);
@@ -777,6 +877,38 @@ int main(int argc, const char** argv) {
return bad("prompt required");
}
+ std::vector<sd_lora_t> sd_loras;
+ std::vector<std::string> lora_path_storage;
+
+ if (j.contains("lora") && j["lora"].is_array()) {
+ for (const auto& item : j["lora"]) {
+ if (!item.is_object()) {
+ continue;
+ }
+
+ std::string path = item.value("path", "");
+ float multiplier = item.value("multiplier", 1.0f);
+ bool is_high_noise = item.value("is_high_noise", false);
+
+ if (path.empty()) {
+ return bad("lora.path required");
+ }
+
+ std::string fullpath = get_lora_full_path(path);
+ if (fullpath.empty()) {
+ return bad("invalid lora path: " + path);
+ }
+
+ lora_path_storage.push_back(fullpath);
+ sd_lora_t l;
+ l.is_high_noise = is_high_noise;
+ l.multiplier = multiplier;
+ l.path = lora_path_storage.back().c_str();
+
+ sd_loras.push_back(l);
+ }
+ }
+
auto get_sample_method = [](std::string name) -> enum sample_method_t {
enum sample_method_t result = str_to_sample_method(name.c_str());
if (result != SAMPLE_METHOD_COUNT) return result;
@@ -795,7 +927,11 @@ int main(int argc, const char** argv) {
{"lcm", LCM_SAMPLE_METHOD},
{"ddim", DDIM_TRAILING_SAMPLE_METHOD},
{"dpm++ 2m", DPMPP2M_SAMPLE_METHOD},
- {"k_dpmpp_2m", DPMPP2M_SAMPLE_METHOD}};
+ {"k_dpmpp_2m", DPMPP2M_SAMPLE_METHOD},
+ {"res multistep", RES_MULTISTEP_SAMPLE_METHOD},
+ {"k_res_multistep", RES_MULTISTEP_SAMPLE_METHOD},
+ {"res 2s", RES_2S_SAMPLE_METHOD},
+ {"k_res_2s", RES_2S_SAMPLE_METHOD}};
auto it = hardcoded.find(name);
if (it != hardcoded.end()) return it->second;
return SAMPLE_METHOD_COUNT;
@@ -805,16 +941,13 @@ int main(int argc, const char** argv) {
enum scheduler_t scheduler = str_to_scheduler(scheduler_name.c_str());
- // avoid excessive resource usage
-
- SDGenerationParams gen_params = default_gen_params;
- gen_params.prompt = prompt;
- gen_params.negative_prompt = negative_prompt;
- gen_params.width = width;
- gen_params.height = height;
- gen_params.seed = seed;
- gen_params.sample_params.sample_steps = steps;
- gen_params.batch_count = batch_size;
+ SDGenerationParams gen_params = default_gen_params;
+ gen_params.prompt = prompt;
+ gen_params.negative_prompt = negative_prompt;
+ gen_params.seed = seed;
+ gen_params.sample_params.sample_steps = steps;
+ gen_params.batch_count = batch_size;
+ gen_params.sample_params.guidance.txt_cfg = cfg_scale;
if (clip_skip > 0) {
gen_params.clip_skip = clip_skip;
@@ -828,38 +961,66 @@ int main(int argc, const char** argv) {
gen_params.sample_params.scheduler = scheduler;
}
+ // re-read to avoid applying 512 as default before the provided
+ // images and/or server command-line
+ gen_params.width = j.value("width", -1);
+ gen_params.height = j.value("height", -1);
+
LOG_DEBUG("%s\n", gen_params.to_string().c_str());
- sd_image_t init_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr};
- sd_image_t control_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr};
- sd_image_t mask_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 1, nullptr};
+ sd_image_t init_image = {0, 0, 3, nullptr};
+ sd_image_t control_image = {0, 0, 3, nullptr};
+ sd_image_t mask_image = {0, 0, 1, nullptr};
std::vector<uint8_t> mask_data;
std::vector<sd_image_t> pmid_images;
std::vector<sd_image_t> ref_images;
- if (img2img) {
- auto decode_image = [](sd_image_t& image, std::string encoded) -> bool {
- // remove data URI prefix if present ("data:image/png;base64,")
- auto comma_pos = encoded.find(',');
- if (comma_pos != std::string::npos) {
- encoded = encoded.substr(comma_pos + 1);
- }
- std::vector<uint8_t> img_data = base64_decode(encoded);
- if (!img_data.empty()) {
- int img_w = image.width;
- int img_h = image.height;
- uint8_t* raw_data = load_image_from_memory(
- (const char*)img_data.data(), (int)img_data.size(),
- img_w, img_h,
- image.width, image.height, image.channel);
- if (raw_data) {
- image = {(uint32_t)img_w, (uint32_t)img_h, image.channel, raw_data};
- return true;
- }
- }
- return false;
- };
+ auto get_resolved_width = [&gen_params, &default_gen_params]() -> int {
+ if (gen_params.width > 0)
+ return gen_params.width;
+ if (default_gen_params.width > 0)
+ return default_gen_params.width;
+ return 512;
+ };
+ auto get_resolved_height = [&gen_params, &default_gen_params]() -> int {
+ if (gen_params.height > 0)
+ return gen_params.height;
+ if (default_gen_params.height > 0)
+ return default_gen_params.height;
+ return 512;
+ };
+ auto decode_image = [&gen_params](sd_image_t& image, std::string encoded) -> bool {
+ // remove data URI prefix if present ("data:image/png;base64,")
+ auto comma_pos = encoded.find(',');
+ if (comma_pos != std::string::npos) {
+ encoded = encoded.substr(comma_pos + 1);
+ }
+ std::vector<uint8_t> img_data = base64_decode(encoded);
+ if (!img_data.empty()) {
+ int expected_width = 0;
+ int expected_height = 0;
+ if (gen_params.width_and_height_are_set()) {
+ expected_width = gen_params.width;
+ expected_height = gen_params.height;
+ }
+ int img_w;
+ int img_h;
+
+ uint8_t* raw_data = load_image_from_memory(
+ (const char*)img_data.data(), (int)img_data.size(),
+ img_w, img_h,
+ expected_width, expected_height, image.channel);
+ if (raw_data) {
+ image = {(uint32_t)img_w, (uint32_t)img_h, image.channel, raw_data};
+ gen_params.set_width_and_height_if_unset(image.width, image.height);
+ return true;
+ }
+ }
+ return false;
+ };
+
+ if (img2img) {
if (j.contains("init_images") && j["init_images"].is_array() && !j["init_images"].empty()) {
std::string encoded = j["init_images"][0].get<std::string>();
decode_image(init_image, encoded);
@@ -875,23 +1036,15 @@ int main(int argc, const char** argv) {
}
}
} else {
- mask_data = std::vector<uint8_t>(width * height, 255);
- mask_image.width = width;
- mask_image.height = height;
+ int m_width = get_resolved_width();
+ int m_height = get_resolved_height();
+ mask_data = std::vector<uint8_t>(m_width * m_height, 255);
+ mask_image.width = m_width;
+ mask_image.height = m_height;
mask_image.channel = 1;
mask_image.data = mask_data.data();
}
- if (j.contains("extra_images") && j["extra_images"].is_array()) {
- for (auto extra_image : j["extra_images"]) {
- std::string encoded = extra_image.get<std::string>();
- sd_image_t tmp_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr};
- if (decode_image(tmp_image, encoded)) {
- ref_images.push_back(tmp_image);
- }
- }
- }
-
float denoising_strength = j.value("denoising_strength", -1.f);
if (denoising_strength >= 0.f) {
denoising_strength = std::min(denoising_strength, 1.0f);
@@ -899,9 +1052,19 @@ int main(int argc, const char** argv) {
}
}
+ if (j.contains("extra_images") && j["extra_images"].is_array()) {
+ for (auto extra_image : j["extra_images"]) {
+ std::string encoded = extra_image.get<std::string>();
+ sd_image_t tmp_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr};
+ if (decode_image(tmp_image, encoded)) {
+ ref_images.push_back(tmp_image);
+ }
+ }
+ }
+
sd_img_gen_params_t img_gen_params = {
- gen_params.lora_vec.data(),
- static_cast<int>(gen_params.lora_vec.size()),
+ sd_loras.data(),
+ static_cast<int>(sd_loras.size()),
gen_params.prompt.c_str(),
gen_params.negative_prompt.c_str(),
gen_params.clip_skip,
@@ -911,8 +1074,8 @@ int main(int argc, const char** argv) {
gen_params.auto_resize_ref_image,
gen_params.increase_ref_index,
mask_image,
- gen_params.width,
- gen_params.height,
+ get_resolved_width(),
+ get_resolved_height(),
gen_params.sample_params,
gen_params.strength,
gen_params.seed,
@@ -962,6 +1125,7 @@ int main(int argc, const char** argv) {
std::string b64 = base64_encode(image_bytes);
out["images"].push_back(b64);
}
+ free_results(results, num_results);
res.set_content(out.dump(), "application/json");
res.status = 200;
@@ -993,6 +1157,23 @@ int main(int argc, const char** argv) {
sdapi_any2img(req, res, true);
});
+ svr.Get("/sdapi/v1/loras", [&](const httplib::Request&, httplib::Response& res) {
+ refresh_lora_cache();
+
+ json result = json::array();
+ {
+ std::lock_guard<std::mutex> lock(lora_mutex);
+ for (const auto& e : lora_cache) {
+ json item;
+ item["name"] = e.name;
+ item["path"] = e.path;
+ result.push_back(item);
+ }
+ }
+
+ res.set_content(result.dump(), "application/json");
+ });
+
svr.Get("/sdapi/v1/samplers", [&](const httplib::Request&, httplib::Response& res) {
std::vector<std::string> sampler_names;
sampler_names.push_back("default");
diff --git a/format-code.sh b/format-code.sh
index d2a75bd..ac5fd34 100644
--- a/format-code.sh
+++ b/format-code.sh
@@ -1,4 +1,4 @@
-for f in *.cpp *.h *.hpp examples/cli/*.cpp examples/common/*.hpp examples/cli/*.h examples/server/*.cpp; do
+for f in src/*.cpp src/*.h src/*.hpp src/vocab/*.h src/vocab/*.cpp examples/cli/*.cpp examples/common/*.hpp examples/cli/*.h examples/server/*.cpp; do
[[ "$f" == vocab* ]] && continue
echo "formatting '$f'"
# if [ "$f" != "stable-diffusion.h" ]; then
diff --git a/ggml b/ggml
index 8891ab6..a8db410 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit 8891ab6fc742ac1198736d3da3b73c730e42af84
+Subproject commit a8db410a252c8c8f2d120c6f2e7133ebe032f35d
diff --git a/stable-diffusion.h b/include/stable-diffusion.h
similarity index 97%
rename from stable-diffusion.h
rename to include/stable-diffusion.h
index 8f040d2..029c2ab 100644
--- a/stable-diffusion.h
+++ b/include/stable-diffusion.h
@@ -48,6 +48,8 @@ enum sample_method_t {
LCM_SAMPLE_METHOD,
DDIM_TRAILING_SAMPLE_METHOD,
TCD_SAMPLE_METHOD,
+ RES_MULTISTEP_SAMPLE_METHOD,
+ RES_2S_SAMPLE_METHOD,
SAMPLE_METHOD_COUNT
};
@@ -62,6 +64,7 @@ enum scheduler_t {
SMOOTHSTEP_SCHEDULER,
KL_OPTIMAL_SCHEDULER,
LCM_SCHEDULER,
+ BONG_TANGENT_SCHEDULER,
SCHEDULER_COUNT
};
@@ -186,6 +189,7 @@ typedef struct {
bool keep_clip_on_cpu;
bool keep_control_net_on_cpu;
bool keep_vae_on_cpu;
+ bool flash_attn;
bool diffusion_flash_attn;
bool tae_preview_only;
bool diffusion_conv_direct;
@@ -197,7 +201,6 @@ typedef struct {
bool chroma_use_t5_mask;
int chroma_t5_mask_pad;
bool qwen_image_zero_cond_t;
- float flow_shift;
} sd_ctx_params_t;
typedef struct {
@@ -231,6 +234,7 @@ typedef struct {
int shifted_timestep;
float* custom_sigmas;
int custom_sigmas_count;
+ float flow_shift;
} sd_sample_params_t;
typedef struct {
@@ -247,6 +251,7 @@ enum sd_cache_mode_t {
SD_CACHE_DBCACHE,
SD_CACHE_TAYLORSEER,
SD_CACHE_CACHE_DIT,
+ SD_CACHE_SPECTRUM,
};
typedef struct {
@@ -267,6 +272,13 @@ typedef struct {
int taylorseer_skip_interval;
const char* scm_mask;
bool scm_policy_dynamic;
+ float spectrum_w;
+ int spectrum_m;
+ float spectrum_lam;
+ int spectrum_window_size;
+ float spectrum_flex_window;
+ int spectrum_warmup_steps;
+ float spectrum_stop_percent;
} sd_cache_params_t;
typedef struct {
diff --git a/face_detect.py b/script/face_detect.py
similarity index 97%
rename from face_detect.py
rename to script/face_detect.py
index 7131af3..e7a3eae 100644
--- a/face_detect.py
+++ b/script/face_detect.py
@@ -1,88 +1,88 @@
-import os
-import sys
-
-import numpy as np
-import torch
-from diffusers.utils import load_image
-# pip install insightface==0.7.3
-from insightface.app import FaceAnalysis
-from insightface.data import get_image as ins_get_image
-from safetensors.torch import save_file
-
-###
-# https://github.com/cubiq/ComfyUI_IPAdapter_plus/issues/165#issue-2055829543
-###
-class FaceAnalysis2(FaceAnalysis):
- # NOTE: allows setting det_size for each detection call.
- # the model allows it but the wrapping code from insightface
- # doesn't show it, and people end up loading duplicate models
- # for different sizes where there is absolutely no need to
- def get(self, img, max_num=0, det_size=(640, 640)):
- if det_size is not None:
- self.det_model.input_size = det_size
-
- return super().get(img, max_num)
-
-def analyze_faces(face_analysis: FaceAnalysis, img_data: np.ndarray, det_size=(640, 640)):
- # NOTE: try detect faces, if no faces detected, lower det_size until it does
- detection_sizes = [None] + [(size, size) for size in range(640, 256, -64)] + [(256, 256)]
-
- for size in detection_sizes:
- faces = face_analysis.get(img_data, det_size=size)
- if len(faces) > 0:
- return faces
-
- return []
-
-if __name__ == "__main__":
- #face_detector = FaceAnalysis2(providers=['CUDAExecutionProvider'], allowed_modules=['detection', 'recognition'])
- face_detector = FaceAnalysis2(providers=['CPUExecutionProvider'], allowed_modules=['detection', 'recognition'])
- face_detector.prepare(ctx_id=0, det_size=(640, 640))
- #input_folder_name = './scarletthead_woman'
- input_folder_name = sys.argv[1]
- image_basename_list = os.listdir(input_folder_name)
- image_path_list = sorted([os.path.join(input_folder_name, basename) for basename in image_basename_list])
-
- input_id_images = []
- for image_path in image_path_list:
- input_id_images.append(load_image(image_path))
-
- id_embed_list = []
-
- for img in input_id_images:
- img = np.array(img)
- img = img[:, :, ::-1]
- faces = analyze_faces(face_detector, img)
- if len(faces) > 0:
- id_embed_list.append(torch.from_numpy((faces[0]['embedding'])))
-
- if len(id_embed_list) == 0:
- raise ValueError(f"No face detected in input image pool")
-
- id_embeds = torch.stack(id_embed_list)
-
- # for r in id_embeds:
- # print(r)
- # #torch.save(id_embeds, input_folder_name+'/id_embeds.pt');
- # weights = dict()
- # weights["id_embeds"] = id_embeds
- # save_file(weights, input_folder_name+'/id_embeds.safetensors')
-
- binary_data = id_embeds.numpy().tobytes()
- two = 4
- zero = 0
- one = 1
- tensor_name = "id_embeds"
-# Write binary data to a file
- with open(input_folder_name+'/id_embeds.bin', "wb") as f:
- f.write(two.to_bytes(4, byteorder='little'))
- f.write((len(tensor_name)).to_bytes(4, byteorder='little'))
- f.write(zero.to_bytes(4, byteorder='little'))
- f.write((id_embeds.shape[1]).to_bytes(4, byteorder='little'))
- f.write((id_embeds.shape[0]).to_bytes(4, byteorder='little'))
- f.write(one.to_bytes(4, byteorder='little'))
- f.write(one.to_bytes(4, byteorder='little'))
- f.write(tensor_name.encode('ascii'))
- f.write(binary_data)
-
+import os
+import sys
+
+import numpy as np
+import torch
+from diffusers.utils import load_image
+# pip install insightface==0.7.3
+from insightface.app import FaceAnalysis
+from insightface.data import get_image as ins_get_image
+from safetensors.torch import save_file
+
+###
+# https://github.com/cubiq/ComfyUI_IPAdapter_plus/issues/165#issue-2055829543
+###
+class FaceAnalysis2(FaceAnalysis):
+ # NOTE: allows setting det_size for each detection call.
+ # the model allows it but the wrapping code from insightface
+ # doesn't show it, and people end up loading duplicate models
+ # for different sizes where there is absolutely no need to
+ def get(self, img, max_num=0, det_size=(640, 640)):
+ if det_size is not None:
+ self.det_model.input_size = det_size
+
+ return super().get(img, max_num)
+
+def analyze_faces(face_analysis: FaceAnalysis, img_data: np.ndarray, det_size=(640, 640)):
+ # NOTE: try detect faces, if no faces detected, lower det_size until it does
+ detection_sizes = [None] + [(size, size) for size in range(640, 256, -64)] + [(256, 256)]
+
+ for size in detection_sizes:
+ faces = face_analysis.get(img_data, det_size=size)
+ if len(faces) > 0:
+ return faces
+
+ return []
+
+if __name__ == "__main__":
+ #face_detector = FaceAnalysis2(providers=['CUDAExecutionProvider'], allowed_modules=['detection', 'recognition'])
+ face_detector = FaceAnalysis2(providers=['CPUExecutionProvider'], allowed_modules=['detection', 'recognition'])
+ face_detector.prepare(ctx_id=0, det_size=(640, 640))
+ #input_folder_name = './scarletthead_woman'
+ input_folder_name = sys.argv[1]
+ image_basename_list = os.listdir(input_folder_name)
+ image_path_list = sorted([os.path.join(input_folder_name, basename) for basename in image_basename_list])
+
+ input_id_images = []
+ for image_path in image_path_list:
+ input_id_images.append(load_image(image_path))
+
+ id_embed_list = []
+
+ for img in input_id_images:
+ img = np.array(img)
+ img = img[:, :, ::-1]
+ faces = analyze_faces(face_detector, img)
+ if len(faces) > 0:
+ id_embed_list.append(torch.from_numpy((faces[0]['embedding'])))
+
+ if len(id_embed_list) == 0:
+ raise ValueError(f"No face detected in input image pool")
+
+ id_embeds = torch.stack(id_embed_list)
+
+ # for r in id_embeds:
+ # print(r)
+ # #torch.save(id_embeds, input_folder_name+'/id_embeds.pt');
+ # weights = dict()
+ # weights["id_embeds"] = id_embeds
+ # save_file(weights, input_folder_name+'/id_embeds.safetensors')
+
+ binary_data = id_embeds.numpy().tobytes()
+ two = 4
+ zero = 0
+ one = 1
+ tensor_name = "id_embeds"
+# Write binary data to a file
+ with open(input_folder_name+'/id_embeds.bin', "wb") as f:
+ f.write(two.to_bytes(4, byteorder='little'))
+ f.write((len(tensor_name)).to_bytes(4, byteorder='little'))
+ f.write(zero.to_bytes(4, byteorder='little'))
+ f.write((id_embeds.shape[1]).to_bytes(4, byteorder='little'))
+ f.write((id_embeds.shape[0]).to_bytes(4, byteorder='little'))
+ f.write(one.to_bytes(4, byteorder='little'))
+ f.write(one.to_bytes(4, byteorder='little'))
+ f.write(tensor_name.encode('ascii'))
+ f.write(binary_data)
+
\ No newline at end of file
diff --git a/src/anima.hpp b/src/anima.hpp
new file mode 100644
index 0000000..191a096
--- /dev/null
+++ b/src/anima.hpp
@@ -0,0 +1,686 @@
+#ifndef __ANIMA_HPP__
+#define __ANIMA_HPP__
+
+#include
+#include
+#include
+#include
+
+#include "common_block.hpp"
+#include "flux.hpp"
+#include "rope.hpp"
+
+namespace Anima {
+ constexpr int ANIMA_GRAPH_SIZE = 65536;
+
+ __STATIC_INLINE__ struct ggml_tensor* apply_gate(struct ggml_context* ctx,
+ struct ggml_tensor* x,
+ struct ggml_tensor* gate) {
+ gate = ggml_reshape_3d(ctx, gate, gate->ne[0], 1, gate->ne[1]); // [N, 1, C]
+ return ggml_mul(ctx, x, gate);
+ }
+
+ struct XEmbedder : public GGMLBlock {
+ public:
+ XEmbedder(int64_t in_dim, int64_t out_dim) {
+ blocks["proj.1"] = std::make_shared(in_dim, out_dim, false);
+ }
+
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+ auto proj = std::dynamic_pointer_cast(blocks["proj.1"]);
+ return proj->forward(ctx, x);
+ }
+ };
+
+ struct TimestepEmbedder : public GGMLBlock {
+ public:
+ TimestepEmbedder(int64_t in_dim, int64_t out_dim) {
+ blocks["1.linear_1"] = std::make_shared(in_dim, in_dim, false);
+ blocks["1.linear_2"] = std::make_shared(in_dim, out_dim, false);
+ }
+
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+ auto linear_1 = std::dynamic_pointer_cast(blocks["1.linear_1"]);
+ auto linear_2 = std::dynamic_pointer_cast(blocks["1.linear_2"]);
+
+ x = linear_1->forward(ctx, x);
+ x = ggml_silu_inplace(ctx->ggml_ctx, x);
+ x = linear_2->forward(ctx, x);
+ return x;
+ }
+ };
+
+ struct AdaLayerNormZero : public GGMLBlock {
+ protected:
+ int64_t in_features;
+
+ public:
+ AdaLayerNormZero(int64_t in_features, int64_t hidden_features = 256)
+ : in_features(in_features) {
+ blocks["norm"] = std::make_shared(in_features, 1e-6f, false, false);
+ blocks["1"] = std::make_shared(in_features, hidden_features, false);
+ blocks["2"] = std::make_shared(hidden_features, 3 * in_features, false);
+ }
+
+ std::pair forward(GGMLRunnerContext* ctx,
+ struct ggml_tensor* hidden_states,
+ struct ggml_tensor* embedded_timestep,
+ struct ggml_tensor* temb = nullptr) {
+ auto norm = std::dynamic_pointer_cast(blocks["norm"]);
+ auto linear_1 = std::dynamic_pointer_cast(blocks["1"]);
+ auto linear_2 = std::dynamic_pointer_cast(blocks["2"]);
+
+ auto emb = ggml_silu(ctx->ggml_ctx, embedded_timestep);
+ emb = linear_1->forward(ctx, emb);
+ emb = linear_2->forward(ctx, emb); // [N, 3*C]
+
+ if (temb != nullptr) {
+ emb = ggml_add(ctx->ggml_ctx, emb, temb);
+ }
+
+ auto emb_chunks = ggml_ext_chunk(ctx->ggml_ctx, emb, 3, 0);
+ auto shift = emb_chunks[0];
+ auto scale = emb_chunks[1];
+ auto gate = emb_chunks[2];
+
+ auto x = norm->forward(ctx, hidden_states);
+ x = Flux::modulate(ctx->ggml_ctx, x, shift, scale);
+
+ return {x, gate};
+ }
+ };
+
+ struct AdaLayerNorm : public GGMLBlock {
+ protected:
+ int64_t embedding_dim;
+
+ public:
+ AdaLayerNorm(int64_t in_features, int64_t hidden_features = 256)
+ : embedding_dim(in_features) {
+ blocks["norm"] = std::make_shared(in_features, 1e-6f, false, false);
+ blocks["1"] = std::make_shared(in_features, hidden_features, false);
+ blocks["2"] = std::make_shared(hidden_features, 2 * in_features, false);
+ }
+
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+ struct ggml_tensor* hidden_states,
+ struct ggml_tensor* embedded_timestep,
+ struct ggml_tensor* temb = nullptr) {
+ auto norm = std::dynamic_pointer_cast(blocks["norm"]);
+ auto linear_1 = std::dynamic_pointer_cast(blocks["1"]);
+ auto linear_2 = std::dynamic_pointer_cast(blocks["2"]);
+
+ auto emb = ggml_silu(ctx->ggml_ctx, embedded_timestep);
+ emb = linear_1->forward(ctx, emb);
+ emb = linear_2->forward(ctx, emb); // [N, 2*C]
+
+ if (temb != nullptr) {
+ auto temb_2c = ggml_view_2d(ctx->ggml_ctx, temb, 2 * embedding_dim, temb->ne[1], temb->nb[1], 0);
+ emb = ggml_add(ctx->ggml_ctx, emb, temb_2c);
+ }
+
+ auto emb_chunks = ggml_ext_chunk(ctx->ggml_ctx, emb, 2, 0);
+ auto shift = emb_chunks[0];
+ auto scale = emb_chunks[1];
+
+ auto x = norm->forward(ctx, hidden_states);
+ x = Flux::modulate(ctx->ggml_ctx, x, shift, scale);
+ return x;
+ }
+ };
+
+ struct AnimaAttention : public GGMLBlock {
+ protected:
+ int64_t num_heads;
+ int64_t head_dim;
+ std::string out_proj_name;
+
+ public:
+ AnimaAttention(int64_t query_dim,
+ int64_t context_dim,
+ int64_t num_heads,
+ int64_t head_dim,
+ const std::string& out_proj_name = "output_proj")
+ : num_heads(num_heads), head_dim(head_dim), out_proj_name(out_proj_name) {
+ int64_t inner_dim = num_heads * head_dim;
+
+ blocks["q_proj"] = std::make_shared(query_dim, inner_dim, false);
+ blocks["k_proj"] = std::make_shared(context_dim, inner_dim, false);
+ blocks["v_proj"] = std::make_shared(context_dim, inner_dim, false);
+ blocks["q_norm"] = std::make_shared(head_dim, 1e-6f);
+ blocks["k_norm"] = std::make_shared(head_dim, 1e-6f);
+ blocks[this->out_proj_name] = std::make_shared(inner_dim, query_dim, false);
+ }
+
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+ struct ggml_tensor* hidden_states,
+ struct ggml_tensor* encoder_hidden_states = nullptr,
+ struct ggml_tensor* pe_q = nullptr,
+ struct ggml_tensor* pe_k = nullptr) {
+ if (encoder_hidden_states == nullptr) {
+ encoder_hidden_states = hidden_states;
+ }
+
+ auto q_proj = std::dynamic_pointer_cast(blocks["q_proj"]);
+ auto k_proj = std::dynamic_pointer_cast(blocks["k_proj"]);
+ auto v_proj = std::dynamic_pointer_cast(blocks["v_proj"]);
+ auto q_norm = std::dynamic_pointer_cast(blocks["q_norm"]);
+ auto k_norm = std::dynamic_pointer_cast(blocks["k_norm"]);
+ auto out_proj = std::dynamic_pointer_cast(blocks[out_proj_name]);
+
+ auto q = q_proj->forward(ctx, hidden_states);
+ auto k = k_proj->forward(ctx, encoder_hidden_states);
+ auto v = v_proj->forward(ctx, encoder_hidden_states);
+
+ int64_t N = q->ne[2];
+ int64_t L_q = q->ne[1];
+ int64_t L_k = k->ne[1];
+
+ auto q4 = ggml_reshape_4d(ctx->ggml_ctx, q, head_dim, num_heads, L_q, N); // [N, L_q, H, D]
+ auto k4 = ggml_reshape_4d(ctx->ggml_ctx, k, head_dim, num_heads, L_k, N); // [N, L_k, H, D]
+ auto v4 = ggml_reshape_4d(ctx->ggml_ctx, v, head_dim, num_heads, L_k, N); // [N, L_k, H, D]
+
+ q4 = q_norm->forward(ctx, q4);
+ k4 = k_norm->forward(ctx, k4);
+
+ struct ggml_tensor* attn_out = nullptr;
+ if (pe_q != nullptr || pe_k != nullptr) {
+ if (pe_q == nullptr) {
+ pe_q = pe_k;
+ }
+ if (pe_k == nullptr) {
+ pe_k = pe_q;
+ }
+ auto q_rope = Rope::apply_rope(ctx->ggml_ctx, q4, pe_q, false);
+ auto k_rope = Rope::apply_rope(ctx->ggml_ctx, k4, pe_k, false);
+ attn_out = ggml_ext_attention_ext(ctx->ggml_ctx,
+ ctx->backend,
+ q_rope,
+ k_rope,
+ v4,
+ num_heads,
+ nullptr,
+ true,
+ ctx->flash_attn_enabled);
+ } else {
+ auto q_flat = ggml_reshape_3d(ctx->ggml_ctx, q4, head_dim * num_heads, L_q, N);
+ auto k_flat = ggml_reshape_3d(ctx->ggml_ctx, k4, head_dim * num_heads, L_k, N);
+ attn_out = ggml_ext_attention_ext(ctx->ggml_ctx,
+ ctx->backend,
+ q_flat,
+ k_flat,
+ v,
+ num_heads,
+ nullptr,
+ false,
+ ctx->flash_attn_enabled);
+ }
+
+ return out_proj->forward(ctx, attn_out);
+ }
+ };
+
+ struct AnimaMLP : public GGMLBlock {
+ public:
+ AnimaMLP(int64_t dim, int64_t hidden_dim) {
+ blocks["layer1"] = std::make_shared(dim, hidden_dim, false);
+ blocks["layer2"] = std::make_shared(hidden_dim, dim, false);
+ }
+
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+ auto layer1 = std::dynamic_pointer_cast(blocks["layer1"]);
+ auto layer2 = std::dynamic_pointer_cast(blocks["layer2"]);
+
+ x = layer1->forward(ctx, x);
+ x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
+ x = layer2->forward(ctx, x);
+ return x;
+ }
+ };
+
+ struct AdapterMLP : public GGMLBlock {
+ public:
+ AdapterMLP(int64_t dim, int64_t hidden_dim) {
+ blocks["0"] = std::make_shared(dim, hidden_dim, true);
+ blocks["2"] = std::make_shared(hidden_dim, dim, true);
+ }
+
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+ auto layer0 = std::dynamic_pointer_cast(blocks["0"]);
+ auto layer2 = std::dynamic_pointer_cast(blocks["2"]);
+
+ x = layer0->forward(ctx, x);
+ x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
+ x = layer2->forward(ctx, x);
+ return x;
+ }
+ };
+
+ struct LLMAdapterBlock : public GGMLBlock {
+ public:
+ LLMAdapterBlock(int64_t model_dim = 1024, int64_t source_dim = 1024, int64_t num_heads = 16, int64_t head_dim = 64) {
+ blocks["norm_self_attn"] = std::make_shared(model_dim, 1e-6f);
+ blocks["self_attn"] = std::make_shared(model_dim, model_dim, num_heads, head_dim, "o_proj");
+ blocks["norm_cross_attn"] = std::make_shared(model_dim, 1e-6f);
+ blocks["cross_attn"] = std::make_shared(model_dim, source_dim, num_heads, head_dim, "o_proj");
+ blocks["norm_mlp"] = std::make_shared(model_dim, 1e-6f);
+ blocks["mlp"] = std::make_shared(model_dim, model_dim * 4);
+ }
+
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+ struct ggml_tensor* x,
+ struct ggml_tensor* context,
+ struct ggml_tensor* target_pe,
+ struct ggml_tensor* context_pe) {
+ auto norm_self_attn = std::dynamic_pointer_cast(blocks["norm_self_attn"]);
+ auto self_attn = std::dynamic_pointer_cast(blocks["self_attn"]);
+ auto norm_cross_attn = std::dynamic_pointer_cast(blocks["norm_cross_attn"]);
+ auto cross_attn = std::dynamic_pointer_cast(blocks["cross_attn"]);
+ auto norm_mlp = std::dynamic_pointer_cast(blocks["norm_mlp"]);
+ auto mlp = std::dynamic_pointer_cast(blocks["mlp"]);
+
+ auto h = norm_self_attn->forward(ctx, x);
+ h = self_attn->forward(ctx, h, nullptr, target_pe, target_pe);
+ x = ggml_add(ctx->ggml_ctx, x, h);
+
+ h = norm_cross_attn->forward(ctx, x);
+ h = cross_attn->forward(ctx, h, context, target_pe, context_pe);
+ x = ggml_add(ctx->ggml_ctx, x, h);
+
+ h = norm_mlp->forward(ctx, x);
+ h = mlp->forward(ctx, h);
+ x = ggml_add(ctx->ggml_ctx, x, h);
+
+ return x;
+ }
+ };
+
+ struct LLMAdapter : public GGMLBlock {
+ protected:
+ int num_layers;
+
+ public:
+ LLMAdapter(int64_t source_dim = 1024,
+ int64_t target_dim = 1024,
+ int64_t model_dim = 1024,
+ int num_layers = 6,
+ int num_heads = 16)
+ : num_layers(num_layers) {
+ int64_t head_dim = model_dim / num_heads;
+
+ blocks["embed"] = std::make_shared(32128, target_dim);
+ for (int i = 0; i < num_layers; i++) {
+ blocks["blocks." + std::to_string(i)] =
+ std::make_shared(model_dim, source_dim, num_heads, head_dim);
+ }
+ blocks["out_proj"] = std::make_shared(model_dim, target_dim, true);
+ blocks["norm"] = std::make_shared(target_dim, 1e-6f);
+ }
+
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+ struct ggml_tensor* source_hidden_states,
+ struct ggml_tensor* target_input_ids,
+ struct ggml_tensor* target_pe,
+ struct ggml_tensor* source_pe) {
+ GGML_ASSERT(target_input_ids != nullptr);
+ if (ggml_n_dims(target_input_ids) == 1) {
+ target_input_ids = ggml_reshape_2d(ctx->ggml_ctx, target_input_ids, target_input_ids->ne[0], 1);
+ }
+
+ auto embed = std::dynamic_pointer_cast(blocks["embed"]);
+ auto out_proj = std::dynamic_pointer_cast(blocks["out_proj"]);
+ auto norm = std::dynamic_pointer_cast(blocks["norm"]);
+
+ auto x = embed->forward(ctx, target_input_ids); // [N, target_len, target_dim]
+
+ for (int i = 0; i < num_layers; i++) {
+ auto block = std::dynamic_pointer_cast(blocks["blocks." + std::to_string(i)]);
+ x = block->forward(ctx, x, source_hidden_states, target_pe, source_pe);
+ }
+
+ x = out_proj->forward(ctx, x);
+ x = norm->forward(ctx, x);
+ return x;
+ }
+ };
+
+ struct TransformerBlock : public GGMLBlock {
+ public:
+ TransformerBlock(int64_t hidden_size,
+ int64_t text_embed_dim,
+ int64_t num_heads,
+ int64_t head_dim,
+ int64_t mlp_ratio = 4,
+ int64_t adaln_lora_dim = 256) {
+ blocks["adaln_modulation_self_attn"] = std::make_shared(hidden_size, adaln_lora_dim);
+ blocks["self_attn"] = std::make_shared(hidden_size, hidden_size, num_heads, head_dim);
+ blocks["adaln_modulation_cross_attn"] = std::make_shared(hidden_size, adaln_lora_dim);
+ blocks["cross_attn"] = std::make_shared(hidden_size, text_embed_dim, num_heads, head_dim);
+ blocks["adaln_modulation_mlp"] = std::make_shared(hidden_size, adaln_lora_dim);
+ blocks["mlp"] = std::make_shared(hidden_size, hidden_size * mlp_ratio);
+ }
+
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+ struct ggml_tensor* hidden_states,
+ struct ggml_tensor* encoder_hidden_states,
+ struct ggml_tensor* embedded_timestep,
+ struct ggml_tensor* temb,
+ struct ggml_tensor* image_pe) {
+ auto norm1 = std::dynamic_pointer_cast(blocks["adaln_modulation_self_attn"]);
+ auto attn1 = std::dynamic_pointer_cast(blocks["self_attn"]);
+ auto norm2 = std::dynamic_pointer_cast(blocks["adaln_modulation_cross_attn"]);
+ auto attn2 = std::dynamic_pointer_cast(blocks["cross_attn"]);
+ auto norm3 = std::dynamic_pointer_cast(blocks["adaln_modulation_mlp"]);
+ auto mlp = std::dynamic_pointer_cast(blocks["mlp"]);
+
+ auto [normed1, gate1] = norm1->forward(ctx, hidden_states, embedded_timestep, temb);
+ auto h = attn1->forward(ctx, normed1, nullptr, image_pe, image_pe);
+ hidden_states = ggml_add(ctx->ggml_ctx, hidden_states, apply_gate(ctx->ggml_ctx, h, gate1));
+
+ auto [normed2, gate2] = norm2->forward(ctx, hidden_states, embedded_timestep, temb);
+ h = attn2->forward(ctx, normed2, encoder_hidden_states, nullptr, nullptr);
+ hidden_states = ggml_add(ctx->ggml_ctx, hidden_states, apply_gate(ctx->ggml_ctx, h, gate2));
+
+ auto [normed3, gate3] = norm3->forward(ctx, hidden_states, embedded_timestep, temb);
+ h = mlp->forward(ctx, normed3);
+ hidden_states = ggml_add(ctx->ggml_ctx, hidden_states, apply_gate(ctx->ggml_ctx, h, gate3));
+
+ return hidden_states;
+ }
+ };
+
+ struct FinalLayer : public GGMLBlock {
+ protected:
+ int64_t hidden_size;
+ int64_t patch_size;
+ int64_t out_channels;
+
+ public:
+ FinalLayer(int64_t hidden_size, int64_t patch_size, int64_t out_channels)
+ : hidden_size(hidden_size), patch_size(patch_size), out_channels(out_channels) {
+ blocks["adaln_modulation"] = std::make_shared(hidden_size, 256);
+ blocks["linear"] = std::make_shared(hidden_size, patch_size * patch_size * out_channels, false);
+ }
+
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+ struct ggml_tensor* hidden_states,
+ struct ggml_tensor* embedded_timestep,
+ struct ggml_tensor* temb) {
+ auto adaln = std::dynamic_pointer_cast(blocks["adaln_modulation"]);
+ auto linear = std::dynamic_pointer_cast(blocks["linear"]);
+
+ hidden_states = adaln->forward(ctx, hidden_states, embedded_timestep, temb);
+ hidden_states = linear->forward(ctx, hidden_states);
+ return hidden_states;
+ }
+ };
+
+ struct AnimaNet : public GGMLBlock {
+ public:
+ int64_t in_channels = 16;
+ int64_t out_channels = 16;
+ int64_t hidden_size = 2048;
+ int64_t text_embed_dim = 1024;
+ int64_t num_heads = 16;
+ int64_t head_dim = 128;
+ int patch_size = 2;
+ int64_t num_layers = 28;
+ std::vector axes_dim = {44, 42, 42};
+ int theta = 10000;
+
+ public:
+ AnimaNet() = default;
+ explicit AnimaNet(int64_t num_layers)
+ : num_layers(num_layers) {
+ blocks["x_embedder"] = std::make_shared((in_channels + 1) * patch_size * patch_size, hidden_size);
+ blocks["t_embedder"] = std::make_shared(hidden_size, hidden_size * 3);
+ blocks["t_embedding_norm"] = std::make_shared(hidden_size, 1e-6f);
+ for (int i = 0; i < num_layers; i++) {
+ blocks["blocks." + std::to_string(i)] = std::make_shared(hidden_size,
+ text_embed_dim,
+ num_heads,
+ head_dim);
+ }
+ blocks["final_layer"] = std::make_shared(hidden_size, patch_size, out_channels);
+ blocks["llm_adapter"] = std::make_shared(1024, 1024, 1024, 6, 16);
+ }
+
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+ struct ggml_tensor* x,
+ struct ggml_tensor* timestep,
+ struct ggml_tensor* encoder_hidden_states,
+ struct ggml_tensor* image_pe,
+ struct ggml_tensor* t5_ids = nullptr,
+ struct ggml_tensor* t5_weights = nullptr,
+ struct ggml_tensor* adapter_q_pe = nullptr,
+ struct ggml_tensor* adapter_k_pe = nullptr) {
+ GGML_ASSERT(x->ne[3] == 1);
+
+ auto x_embedder = std::dynamic_pointer_cast(blocks["x_embedder"]);
+ auto t_embedder = std::dynamic_pointer_cast(blocks["t_embedder"]);
+ auto t_embedding_norm = std::dynamic_pointer_cast(blocks["t_embedding_norm"]);
+ auto final_layer = std::dynamic_pointer_cast(blocks["final_layer"]);
+ auto llm_adapter = std::dynamic_pointer_cast(blocks["llm_adapter"]);
+
+ int64_t W = x->ne[0];
+ int64_t H = x->ne[1];
+
+ auto padding_mask = ggml_ext_zeros(ctx->ggml_ctx, x->ne[0], x->ne[1], 1, x->ne[3]);
+ x = ggml_concat(ctx->ggml_ctx, x, padding_mask, 2); // [N, C + 1, H, W]
+
+ x = DiT::pad_and_patchify(ctx, x, patch_size, patch_size); // [N, h*w, (C+1)*ph*pw]
+
+ x = x_embedder->forward(ctx, x);
+
+ auto timestep_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep, static_cast<int>(hidden_size));
+ auto temb = t_embedder->forward(ctx, timestep_proj);
+ auto embedded_timestep = t_embedding_norm->forward(ctx, timestep_proj);
+
+ if (t5_ids != nullptr) {
+ auto adapted_context = llm_adapter->forward(ctx, encoder_hidden_states, t5_ids, adapter_q_pe, adapter_k_pe);
+ if (t5_weights != nullptr) {
+ auto w = t5_weights;
+ if (ggml_n_dims(w) == 1) {
+ w = ggml_reshape_3d(ctx->ggml_ctx, w, 1, w->ne[0], 1);
+ }
+ w = ggml_repeat_4d(ctx->ggml_ctx, w, adapted_context->ne[0], adapted_context->ne[1], adapted_context->ne[2], 1);
+ adapted_context = ggml_mul(ctx->ggml_ctx, adapted_context, w);
+ }
+ if (adapted_context->ne[1] < 512) {
+ auto pad_ctx = ggml_ext_zeros(ctx->ggml_ctx,
+ adapted_context->ne[0],
+ 512 - adapted_context->ne[1],
+ adapted_context->ne[2],
+ 1);
+ adapted_context = ggml_concat(ctx->ggml_ctx, adapted_context, pad_ctx, 1);
+ } else if (adapted_context->ne[1] > 512) {
+ adapted_context = ggml_ext_slice(ctx->ggml_ctx, adapted_context, 1, 0, 512);
+ }
+ encoder_hidden_states = adapted_context;
+ }
+
+ for (int i = 0; i < num_layers; i++) {
+ auto block = std::dynamic_pointer_cast(blocks["blocks." + std::to_string(i)]);
+ x = block->forward(ctx, x, encoder_hidden_states, embedded_timestep, temb, image_pe);
+ }
+
+ x = final_layer->forward(ctx, x, embedded_timestep, temb); // [N, h*w, ph*pw*C]
+
+ x = DiT::unpatchify_and_crop(ctx->ggml_ctx, x, H, W, patch_size, patch_size, false); // [N, C, H, W]
+
+ return x;
+ }
+ };
+
+ struct AnimaRunner : public GGMLRunner {
+ public:
+ std::vector<float> image_pe_vec;
+ std::vector<float> adapter_q_pe_vec;
+ std::vector<float> adapter_k_pe_vec;
+ AnimaNet net;
+
+ AnimaRunner(ggml_backend_t backend,
+ bool offload_params_to_cpu,
+ const String2TensorStorage& tensor_storage_map = {},
+ const std::string prefix = "model.diffusion_model")
+ : GGMLRunner(backend, offload_params_to_cpu) {
+ int64_t num_layers = 0;
+ std::string layer_tag = prefix + ".net.blocks.";
+ for (const auto& kv : tensor_storage_map) {
+ const std::string& tensor_name = kv.first;
+ size_t pos = tensor_name.find(layer_tag);
+ if (pos == std::string::npos) {
+ continue;
+ }
+ size_t start = pos + layer_tag.size();
+ size_t end = tensor_name.find('.', start);
+ if (end == std::string::npos) {
+ continue;
+ }
+ int64_t layer_id = atoll(tensor_name.substr(start, end - start).c_str());
+ num_layers = std::max(num_layers, layer_id + 1);
+ }
+ if (num_layers <= 0) {
+ num_layers = 28;
+ }
+ LOG_INFO("anima net layers: %" PRId64, num_layers);
+
+ net = AnimaNet(num_layers);
+ net.init(params_ctx, tensor_storage_map, prefix + ".net");
+ }
+
+ std::string get_desc() override {
+ return "anima";
+ }
+
+ void get_param_tensors(std::map& tensors, const std::string prefix) {
+ net.get_param_tensors(tensors, prefix + ".net");
+ }
+
+ static std::vector<float> gen_1d_rope_pe_vec(int64_t seq_len, int dim, float theta = 10000.f) {
+ std::vector<float> pos(seq_len);
+ for (int64_t i = 0; i < seq_len; i++) {
+ pos[i] = static_cast<float>(i);
+ }
+ auto rope_emb = Rope::rope(pos, dim, theta);
+ return Rope::flatten(rope_emb);
+ }
+
+ static float calc_ntk_factor(float extrapolation_ratio, int axis_dim) {
+ if (extrapolation_ratio == 1.0f || axis_dim <= 2) {
+ return 1.0f;
+ }
+ return std::pow(extrapolation_ratio, static_cast<float>(axis_dim) / static_cast<float>(axis_dim - 2));
+ }
+
+ static std::vector<float> gen_anima_image_pe_vec(int bs,
+ int h,
+ int w,
+ int patch_size,
+ int theta,
+ const std::vector<int>& axes_dim,
+ float h_extrapolation_ratio,
+ float w_extrapolation_ratio,
+ float t_extrapolation_ratio) {
+ static const std::vector empty_ref_latents;
+ auto ids = Rope::gen_flux_ids(h,
+ w,
+ patch_size,
+ bs,
+ static_cast<int>(axes_dim.size()),
+ 0,
+ {},
+ empty_ref_latents,
+ false,
+ 1.0f);
+
+ std::vector<float> axis_thetas = {
+ static_cast<float>(theta) * calc_ntk_factor(t_extrapolation_ratio, axes_dim[0]),
+ static_cast<float>(theta) * calc_ntk_factor(h_extrapolation_ratio, axes_dim[1]),
+ static_cast<float>(theta) * calc_ntk_factor(w_extrapolation_ratio, axes_dim[2]),
+ };
+ return Rope::embed_nd(ids, bs, axis_thetas, axes_dim);
+ }
+
+ struct ggml_cgraph* build_graph(struct ggml_tensor* x,
+ struct ggml_tensor* timesteps,
+ struct ggml_tensor* context,
+ struct ggml_tensor* t5_ids = nullptr,
+ struct ggml_tensor* t5_weights = nullptr) {
+ GGML_ASSERT(x->ne[3] == 1);
+ struct ggml_cgraph* gf = new_graph_custom(ANIMA_GRAPH_SIZE);
+
+ x = to_backend(x);
+ timesteps = to_backend(timesteps);
+ context = to_backend(context);
+ t5_ids = to_backend(t5_ids);
+ t5_weights = to_backend(t5_weights);
+
+ int64_t pad_h = (net.patch_size - x->ne[1] % net.patch_size) % net.patch_size;
+ int64_t pad_w = (net.patch_size - x->ne[0] % net.patch_size) % net.patch_size;
+ int64_t h_pad = x->ne[1] + pad_h;
+ int64_t w_pad = x->ne[0] + pad_w;
+
+ image_pe_vec = gen_anima_image_pe_vec(1,
+ static_cast<int>(h_pad),
+ static_cast<int>(w_pad),
+ static_cast<int>(net.patch_size),
+ net.theta,
+ net.axes_dim,
+ 4.0f,
+ 4.0f,
+ 1.0f);
+ int64_t image_pos_len = static_cast(image_pe_vec.size()) / (2 * 2 * (net.head_dim / 2));
+ auto image_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, net.head_dim / 2, image_pos_len);
+ set_backend_tensor_data(image_pe, image_pe_vec.data());
+
+ ggml_tensor* adapter_q_pe = nullptr;
+ ggml_tensor* adapter_k_pe = nullptr;
+ if (t5_ids != nullptr) {
+ int64_t target_len = t5_ids->ne[0];
+ int64_t source_len = context->ne[1];
+
+ adapter_q_pe_vec = gen_1d_rope_pe_vec(target_len, 64, 10000.f);
+ adapter_k_pe_vec = gen_1d_rope_pe_vec(source_len, 64, 10000.f);
+
+ int64_t target_pos_len = static_cast(adapter_q_pe_vec.size()) / (2 * 2 * 32);
+ int64_t source_pos_len = static_cast(adapter_k_pe_vec.size()) / (2 * 2 * 32);
+
+ adapter_q_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, 32, target_pos_len);
+ adapter_k_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, 32, source_pos_len);
+ set_backend_tensor_data(adapter_q_pe, adapter_q_pe_vec.data());
+ set_backend_tensor_data(adapter_k_pe, adapter_k_pe_vec.data());
+ }
+
+ auto runner_ctx = get_context();
+ auto out = net.forward(&runner_ctx,
+ x,
+ timesteps,
+ context,
+ image_pe,
+ t5_ids,
+ t5_weights,
+ adapter_q_pe,
+ adapter_k_pe);
+
+ ggml_build_forward_expand(gf, out);
+ return gf;
+ }
+
+ bool compute(int n_threads,
+ struct ggml_tensor* x,
+ struct ggml_tensor* timesteps,
+ struct ggml_tensor* context,
+ struct ggml_tensor* t5_ids = nullptr,
+ struct ggml_tensor* t5_weights = nullptr,
+ struct ggml_tensor** output = nullptr,
+ struct ggml_context* output_ctx = nullptr) {
+ auto get_graph = [&]() -> struct ggml_cgraph* {
+ return build_graph(x, timesteps, context, t5_ids, t5_weights);
+ };
+ return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+ }
+ };
+} // namespace Anima
+
+#endif // __ANIMA_HPP__
diff --git a/cache_dit.hpp b/src/cache_dit.hpp
similarity index 91%
rename from cache_dit.hpp
rename to src/cache_dit.hpp
index 6fe104d..4e3cf69 100644
--- a/cache_dit.hpp
+++ b/src/cache_dit.hpp
@@ -603,87 +603,6 @@ inline std::vector generate_scm_mask(
return mask;
}
-inline std::vector<int> get_scm_preset(const std::string& preset, int total_steps) {
- struct Preset {
- std::vector<int> compute_bins;
- std::vector<int> cache_bins;
- };
-
- Preset slow = {{8, 3, 3, 2, 1, 1}, {1, 2, 2, 2, 3}};
- Preset medium = {{6, 2, 2, 2, 2, 1}, {1, 3, 3, 3, 3}};
- Preset fast = {{6, 1, 1, 1, 1, 1}, {1, 3, 4, 5, 4}};
- Preset ultra = {{4, 1, 1, 1, 1}, {2, 5, 6, 7}};
-
- Preset* p = nullptr;
- if (preset == "slow" || preset == "s" || preset == "S")
- p = &slow;
- else if (preset == "medium" || preset == "m" || preset == "M")
- p = &medium;
- else if (preset == "fast" || preset == "f" || preset == "F")
- p = &fast;
- else if (preset == "ultra" || preset == "u" || preset == "U")
- p = &ultra;
- else
- return {};
-
- if (total_steps != 28 && total_steps > 0) {
- float scale = static_cast<float>(total_steps) / 28.0f;
- std::vector<int> scaled_compute, scaled_cache;
-
- for (int v : p->compute_bins) {
- scaled_compute.push_back(std::max(1, static_cast<int>(v * scale + 0.5f)));
- }
- for (int v : p->cache_bins) {
- scaled_cache.push_back(std::max(1, static_cast<int>(v * scale + 0.5f)));
- }
-
- return generate_scm_mask(scaled_compute, scaled_cache, total_steps);
- }
-
- return generate_scm_mask(p->compute_bins, p->cache_bins, total_steps);
-}
-
-inline float get_preset_threshold(const std::string& preset) {
- if (preset == "slow" || preset == "s" || preset == "S")
- return 0.20f;
- if (preset == "medium" || preset == "m" || preset == "M")
- return 0.25f;
- if (preset == "fast" || preset == "f" || preset == "F")
- return 0.30f;
- if (preset == "ultra" || preset == "u" || preset == "U")
- return 0.34f;
- return 0.08f;
-}
-
-inline int get_preset_warmup(const std::string& preset) {
- if (preset == "slow" || preset == "s" || preset == "S")
- return 8;
- if (preset == "medium" || preset == "m" || preset == "M")
- return 6;
- if (preset == "fast" || preset == "f" || preset == "F")
- return 6;
- if (preset == "ultra" || preset == "u" || preset == "U")
- return 4;
- return 8;
-}
-
-inline int get_preset_Fn(const std::string& preset) {
- if (preset == "slow" || preset == "s" || preset == "S")
- return 8;
- if (preset == "medium" || preset == "m" || preset == "M")
- return 8;
- if (preset == "fast" || preset == "f" || preset == "F")
- return 6;
- if (preset == "ultra" || preset == "u" || preset == "U")
- return 4;
- return 8;
-}
-
-inline int get_preset_Bn(const std::string& preset) {
- (void)preset;
- return 0;
-}
-
inline void parse_dbcache_options(const std::string& opts, DBCacheConfig& cfg) {
if (opts.empty())
return;
diff --git a/clip.hpp b/src/clip.hpp
similarity index 95%
rename from clip.hpp
rename to src/clip.hpp
index 7a6ebe9..adecd4d 100644
--- a/clip.hpp
+++ b/src/clip.hpp
@@ -4,6 +4,7 @@
#include "ggml_extend.hpp"
#include "model.h"
#include "tokenize_util.h"
+#include "vocab/vocab.h"
/*================================================== CLIPTokenizer ===================================================*/
@@ -110,7 +111,7 @@ public:
if (merges_utf8_str.size() > 0) {
load_from_merges(merges_utf8_str);
} else {
- load_from_merges(ModelLoader::load_merges());
+ load_from_merges(load_clip_merges());
}
add_special_token("<|startoftext|>");
add_special_token("<|endoftext|>");
@@ -479,9 +480,9 @@ public:
x = fc1->forward(ctx, x);
if (use_gelu) {
- x = ggml_gelu_inplace(ctx->ggml_ctx, x);
+ x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
} else {
- x = ggml_gelu_quick_inplace(ctx->ggml_ctx, x);
+ x = ggml_ext_gelu_quick(ctx->ggml_ctx, x, true);
}
x = fc2->forward(ctx, x);
return x;
@@ -510,7 +511,7 @@ public:
blocks["mlp"] = std::shared_ptr<CLIPMLP>(new CLIPMLP(d_model, intermediate_size));
}
- struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, bool mask = true) {
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, struct ggml_tensor* mask = nullptr) {
// x: [N, n_token, d_model]
auto self_attn = std::dynamic_pointer_cast(blocks["self_attn"]);
auto layer_norm1 = std::dynamic_pointer_cast(blocks["layer_norm1"]);
@@ -542,8 +543,8 @@ public:
struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x,
- int clip_skip = -1,
- bool mask = true) {
+ struct ggml_tensor* mask = nullptr,
+ int clip_skip = -1) {
// x: [N, n_token, d_model]
int layer_idx = n_layer - 1;
// LOG_DEBUG("clip_skip %d", clip_skip);
@@ -741,16 +742,17 @@ public:
struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* input_ids,
struct ggml_tensor* tkn_embeddings,
- size_t max_token_idx = 0,
- bool return_pooled = false,
- int clip_skip = -1) {
+ struct ggml_tensor* mask = nullptr,
+ size_t max_token_idx = 0,
+ bool return_pooled = false,
+ int clip_skip = -1) {
// input_ids: [N, n_token]
auto embeddings = std::dynamic_pointer_cast(blocks["embeddings"]);
auto encoder = std::dynamic_pointer_cast(blocks["encoder"]);
auto final_layer_norm = std::dynamic_pointer_cast(blocks["final_layer_norm"]);
auto x = embeddings->forward(ctx, input_ids, tkn_embeddings); // [N, n_token, hidden_size]
- x = encoder->forward(ctx, x, return_pooled ? -1 : clip_skip, true);
+ x = encoder->forward(ctx, x, mask, return_pooled ? -1 : clip_skip);
if (return_pooled || with_final_ln) {
x = final_layer_norm->forward(ctx, x);
}
@@ -814,10 +816,11 @@ public:
auto x = embeddings->forward(ctx, pixel_values); // [N, num_positions, embed_dim]
x = pre_layernorm->forward(ctx, x);
- x = encoder->forward(ctx, x, clip_skip, false);
- // print_ggml_tensor(x, true, "ClipVisionModel x: ");
+ x = encoder->forward(ctx, x, nullptr, clip_skip);
+
auto last_hidden_state = x;
- x = post_layernorm->forward(ctx, x); // [N, n_token, hidden_size]
+
+ x = post_layernorm->forward(ctx, x); // [N, n_token, hidden_size]
GGML_ASSERT(x->ne[3] == 1);
if (return_pooled) {
@@ -905,6 +908,8 @@ public:
struct CLIPTextModelRunner : public GGMLRunner {
CLIPTextModel model;
+ std::vector<float> attention_mask_vec;
+
CLIPTextModelRunner(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map,
@@ -938,6 +943,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* input_ids,
struct ggml_tensor* embeddings,
+ struct ggml_tensor* mask,
size_t max_token_idx = 0,
bool return_pooled = false,
int clip_skip = -1) {
@@ -948,7 +954,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
input_ids = ggml_reshape_2d(ctx->ggml_ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
}
- return model.forward(ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
+ return model.forward(ctx, input_ids, embeddings, mask, max_token_idx, return_pooled, clip_skip);
}
struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
@@ -975,9 +981,23 @@ struct CLIPTextModelRunner : public GGMLRunner {
embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
}
+ int n_tokens = static_cast<int>(input_ids->ne[0]);
+ attention_mask_vec.resize(n_tokens * n_tokens);
+ for (int i0 = 0; i0 < n_tokens; i0++) {
+ for (int i1 = 0; i1 < n_tokens; i1++) {
+ float value = 0.f;
+ if (i0 > i1) {
+ value = -INFINITY;
+ }
+ attention_mask_vec[i1 * n_tokens + i0] = value;
+ }
+ }
+ auto attention_mask = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_F32, n_tokens, n_tokens);
+ set_backend_tensor_data(attention_mask, attention_mask_vec.data());
+
auto runner_ctx = get_context();
- struct ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
+ struct ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, embeddings, attention_mask, max_token_idx, return_pooled, clip_skip);
ggml_build_forward_expand(gf, hidden_states);
diff --git a/common.hpp b/src/common_block.hpp
similarity index 98%
rename from common.hpp
rename to src/common_block.hpp
index 13ab103..435afa4 100644
--- a/common.hpp
+++ b/src/common_block.hpp
@@ -1,5 +1,5 @@
-#ifndef __COMMON_HPP__
-#define __COMMON_HPP__
+#ifndef __COMMON_BLOCK_HPP__
+#define __COMMON_BLOCK_HPP__
#include "ggml_extend.hpp"
@@ -200,7 +200,7 @@ public:
gate = ggml_cont(ctx->ggml_ctx, gate);
- gate = ggml_gelu_inplace(ctx->ggml_ctx, gate);
+ gate = ggml_ext_gelu(ctx->ggml_ctx, gate, true);
x = ggml_mul(ctx->ggml_ctx, x, gate); // [ne3, ne2, ne1, dim_out]
@@ -220,7 +220,7 @@ public:
auto proj = std::dynamic_pointer_cast(blocks["proj"]);
x = proj->forward(ctx, x);
- x = ggml_gelu_inplace(ctx->ggml_ctx, x);
+ x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
return x;
}
};
@@ -317,7 +317,7 @@ public:
auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]
- x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, false, ctx->flash_attn_enabled); // [N, n_token, inner_dim]
+ x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, inner_dim]
x = to_out_0->forward(ctx, x); // [N, n_token, query_dim]
return x;
@@ -536,8 +536,8 @@ public:
// image_only_indicator is always tensor([0.])
float alpha = get_alpha();
auto x = ggml_add(ctx->ggml_ctx,
- ggml_scale(ctx->ggml_ctx, x_spatial, alpha),
- ggml_scale(ctx->ggml_ctx, x_temporal, 1.0f - alpha));
+ ggml_ext_scale(ctx->ggml_ctx, x_spatial, alpha),
+ ggml_ext_scale(ctx->ggml_ctx, x_temporal, 1.0f - alpha));
return x;
}
};
@@ -590,4 +590,4 @@ public:
}
};
-#endif // __COMMON_HPP__
+#endif // __COMMON_BLOCK_HPP__
diff --git a/src/common_dit.hpp b/src/common_dit.hpp
new file mode 100644
index 0000000..0e6f0f0
--- /dev/null
+++ b/src/common_dit.hpp
@@ -0,0 +1,108 @@
+#ifndef __COMMON_DIT_HPP__
+#define __COMMON_DIT_HPP__
+
+#include "ggml_extend.hpp"
+
+namespace DiT {
+ ggml_tensor* patchify(ggml_context* ctx,
+ ggml_tensor* x,
+ int pw,
+ int ph,
+ bool patch_last = true) {
+ // x: [N, C, H, W]
+ // return: [N, h*w, C*ph*pw] if patch_last else [N, h*w, ph*pw*C]
+ int64_t N = x->ne[3];
+ int64_t C = x->ne[2];
+ int64_t H = x->ne[1];
+ int64_t W = x->ne[0];
+ int64_t h = H / ph;
+ int64_t w = W / pw;
+
+ GGML_ASSERT(h * ph == H && w * pw == W);
+
+ x = ggml_reshape_4d(ctx, x, pw, w, ph, h * C * N); // [N*C*h, ph, w, pw]
+ x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*h, w, ph, pw]
+ x = ggml_reshape_4d(ctx, x, pw * ph, w * h, C, N); // [N, C, h*w, ph*pw]
+ if (patch_last) {
+ x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N, h*w, C, ph*pw]
+ x = ggml_reshape_3d(ctx, x, pw * ph * C, w * h, N); // [N, h*w, C*ph*pw]
+ } else {
+ x = ggml_cont(ctx, ggml_ext_torch_permute(ctx, x, 2, 0, 1, 3)); // [N, h*w, C, ph*pw]
+ x = ggml_reshape_3d(ctx, x, C * pw * ph, w * h, N); // [N, h*w, ph*pw*C]
+ }
+ return x;
+ }
+
+ ggml_tensor* unpatchify(ggml_context* ctx,
+ ggml_tensor* x,
+ int64_t h,
+ int64_t w,
+ int ph,
+ int pw,
+ bool patch_last = true) {
+ // x: [N, h*w, C*ph*pw] if patch_last else [N, h*w, ph*pw*C]
+ // return: [N, C, H, W]
+ int64_t N = x->ne[2];
+ int64_t C = x->ne[0] / ph / pw;
+ int64_t H = h * ph;
+ int64_t W = w * pw;
+
+ GGML_ASSERT(C * ph * pw == x->ne[0]);
+
+ if (patch_last) {
+ x = ggml_reshape_4d(ctx, x, pw * ph, C, w * h, N); // [N, h*w, C, ph*pw]
+ x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N, C, h*w, ph*pw]
+ } else {
+ x = ggml_reshape_4d(ctx, x, C, pw * ph, w * h, N); // [N, h*w, ph*pw, C]
+ x = ggml_cont(ctx, ggml_permute(ctx, x, 2, 0, 1, 3)); // [N, C, h*w, ph*pw]
+ }
+
+ x = ggml_reshape_4d(ctx, x, pw, ph, w, h * C * N); // [N*C*h, w, ph, pw]
+ x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*h, ph, w, pw]
+ x = ggml_reshape_4d(ctx, x, W, H, C, N); // [N, C, h*ph, w*pw]
+
+ return x;
+ }
+
+ ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx,
+ ggml_tensor* x,
+ int ph,
+ int pw) {
+ int64_t W = x->ne[0];
+ int64_t H = x->ne[1];
+
+ int pad_h = (ph - H % ph) % ph;
+ int pad_w = (pw - W % pw) % pw;
+ x = ggml_ext_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
+ return x;
+ }
+
+ ggml_tensor* pad_and_patchify(GGMLRunnerContext* ctx,
+ ggml_tensor* x,
+ int ph,
+ int pw,
+ bool patch_last = true) {
+ x = pad_to_patch_size(ctx, x, ph, pw);
+ x = patchify(ctx->ggml_ctx, x, ph, pw, patch_last);
+ return x;
+ }
+
+ ggml_tensor* unpatchify_and_crop(ggml_context* ctx,
+ ggml_tensor* x,
+ int64_t H,
+ int64_t W,
+ int ph,
+ int pw,
+ bool patch_last = true) {
+ int pad_h = (ph - H % ph) % ph;
+ int pad_w = (pw - W % pw) % pw;
+ int64_t h = ((H + pad_h) / ph);
+ int64_t w = ((W + pad_w) / pw);
+ x = unpatchify(ctx, x, h, w, ph, pw, patch_last); // [N, C, H + pad_h, W + pad_w]
+ x = ggml_ext_slice(ctx, x, 1, 0, H); // [N, C, H, W + pad_w]
+ x = ggml_ext_slice(ctx, x, 0, 0, W); // [N, C, H, W]
+ return x;
+ }
+} // namespace DiT
+
+#endif // __COMMON_DIT_HPP__
\ No newline at end of file
diff --git a/conditioner.hpp b/src/conditioner.hpp
similarity index 83%
rename from conditioner.hpp
rename to src/conditioner.hpp
index a4e84aa..d4a3146 100644
--- a/conditioner.hpp
+++ b/src/conditioner.hpp
@@ -10,9 +10,14 @@ struct SDCondition {
struct ggml_tensor* c_vector = nullptr; // aka y
struct ggml_tensor* c_concat = nullptr;
+ std::vector<struct ggml_tensor*> extra_c_crossattns;
+
SDCondition() = default;
- SDCondition(struct ggml_tensor* c_crossattn, struct ggml_tensor* c_vector, struct ggml_tensor* c_concat)
- : c_crossattn(c_crossattn), c_vector(c_vector), c_concat(c_concat) {}
+ SDCondition(struct ggml_tensor* c_crossattn,
+ struct ggml_tensor* c_vector,
+ struct ggml_tensor* c_concat,
+ const std::vector<struct ggml_tensor*>& extra_c_crossattns = {})
+ : c_crossattn(c_crossattn), c_vector(c_vector), c_concat(c_concat), extra_c_crossattns(extra_c_crossattns) {}
};
struct ConditionerParams {
@@ -34,6 +39,7 @@ struct Conditioner {
virtual void free_params_buffer() = 0;
virtual void get_param_tensors(std::map& tensors) = 0;
virtual size_t get_params_buffer_size() = 0;
+ virtual void set_flash_attention_enabled(bool enabled) = 0;
virtual void set_weight_adapter(const std::shared_ptr& adapter) {}
virtual std::tuple> get_learned_condition_with_trigger(ggml_context* work_ctx,
int n_threads,
@@ -115,6 +121,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
return buffer_size;
}
+ void set_flash_attention_enabled(bool enabled) override {
+ text_model->set_flash_attention_enabled(enabled);
+ if (sd_version_is_sdxl(version)) {
+ text_model2->set_flash_attention_enabled(enabled);
+ }
+ }
+
void set_weight_adapter(const std::shared_ptr& adapter) override {
text_model->set_weight_adapter(adapter);
if (sd_version_is_sdxl(version)) {
@@ -783,6 +796,18 @@ struct SD3CLIPEmbedder : public Conditioner {
return buffer_size;
}
+ void set_flash_attention_enabled(bool enabled) override {
+ if (clip_l) {
+ clip_l->set_flash_attention_enabled(enabled);
+ }
+ if (clip_g) {
+ clip_g->set_flash_attention_enabled(enabled);
+ }
+ if (t5) {
+ t5->set_flash_attention_enabled(enabled);
+ }
+ }
+
void set_weight_adapter(const std::shared_ptr& adapter) override {
if (clip_l) {
clip_l->set_weight_adapter(adapter);
@@ -1191,6 +1216,15 @@ struct FluxCLIPEmbedder : public Conditioner {
return buffer_size;
}
+ void set_flash_attention_enabled(bool enabled) override {
+ if (clip_l) {
+ clip_l->set_flash_attention_enabled(enabled);
+ }
+ if (t5) {
+ t5->set_flash_attention_enabled(enabled);
+ }
+ }
+
void set_weight_adapter(const std::shared_ptr& adapter) {
if (clip_l) {
clip_l->set_weight_adapter(adapter);
@@ -1440,6 +1474,12 @@ struct T5CLIPEmbedder : public Conditioner {
return buffer_size;
}
+ void set_flash_attention_enabled(bool enabled) override {
+ if (t5) {
+ t5->set_flash_attention_enabled(enabled);
+ }
+ }
+
void set_weight_adapter(const std::shared_ptr& adapter) override {
if (t5) {
t5->set_weight_adapter(adapter);
@@ -1601,6 +1641,142 @@ struct T5CLIPEmbedder : public Conditioner {
}
};
+struct AnimaConditioner : public Conditioner {
+ std::shared_ptr qwen_tokenizer;
+ T5UniGramTokenizer t5_tokenizer;
+ std::shared_ptr llm;
+
+ AnimaConditioner(ggml_backend_t backend,
+ bool offload_params_to_cpu,
+ const String2TensorStorage& tensor_storage_map = {}) {
+ qwen_tokenizer = std::make_shared();
+ llm = std::make_shared(LLM::LLMArch::QWEN3,
+ backend,
+ offload_params_to_cpu,
+ tensor_storage_map,
+ "text_encoders.llm",
+ false);
+ }
+
+ void get_param_tensors(std::map& tensors) override {
+ llm->get_param_tensors(tensors, "text_encoders.llm");
+ }
+
+ void alloc_params_buffer() override {
+ llm->alloc_params_buffer();
+ }
+
+ void free_params_buffer() override {
+ llm->free_params_buffer();
+ }
+
+ size_t get_params_buffer_size() override {
+ return llm->get_params_buffer_size();
+ }
+
+ void set_flash_attention_enabled(bool enabled) override {
+ llm->set_flash_attention_enabled(enabled);
+ }
+
+ void set_weight_adapter(const std::shared_ptr& adapter) override {
+ llm->set_weight_adapter(adapter);
+ }
+
+ std::tuple<std::vector<int>, std::vector<float>, std::vector<int>, std::vector<float>> tokenize(std::string text) {
+ auto parsed_attention = parse_prompt_attention(text);
+
+ {
+ std::stringstream ss;
+ ss << "[";
+ for (const auto& item : parsed_attention) {
+ ss << "['" << item.first << "', " << item.second << "], ";
+ }
+ ss << "]";
+ LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
+ }
+
+ std::vector<int> qwen_tokens;
+ std::vector<float> qwen_weights;
+ std::vector<int> t5_tokens;
+ std::vector<float> t5_weights;
+
+ for (const auto& item : parsed_attention) {
+ const std::string& curr_text = item.first;
+ std::vector<int> curr_tokens = qwen_tokenizer->tokenize(curr_text, nullptr);
+ qwen_tokens.insert(qwen_tokens.end(), curr_tokens.begin(), curr_tokens.end());
+ // Anima uses uniform Qwen token weights.
+ qwen_weights.insert(qwen_weights.end(), curr_tokens.size(), 1.f);
+ }
+ if (qwen_tokens.empty()) {
+ qwen_tokens.push_back(151643); // qwen3 pad token
+ qwen_weights.push_back(1.f);
+ }
+
+ for (const auto& item : parsed_attention) {
+ const std::string& curr_text = item.first;
+ float curr_weight = item.second;
+ std::vector<int> curr_tokens = t5_tokenizer.Encode(curr_text, true);
+ t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end());
+ t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight);
+ }
+
+ return {qwen_tokens, qwen_weights, t5_tokens, t5_weights};
+ }
+
+ SDCondition get_learned_condition(ggml_context* work_ctx,
+ int n_threads,
+ const ConditionerParams& conditioner_params) override {
+ int64_t t0 = ggml_time_ms();
+
+ auto tokenized = tokenize(conditioner_params.text);
+ auto& qwen_tokens = std::get<0>(tokenized);
+ auto& qwen_weights = std::get<1>(tokenized);
+ auto& t5_tokens = std::get<2>(tokenized);
+ auto& t5_weights = std::get<3>(tokenized);
+
+ auto input_ids = vector_to_ggml_tensor_i32(work_ctx, qwen_tokens);
+
+ struct ggml_tensor* hidden_states = nullptr; // [N, n_token, 1024]
+ llm->compute(n_threads,
+ input_ids,
+ nullptr,
+ {},
+ {},
+ &hidden_states,
+ work_ctx);
+
+ {
+ auto tensor = hidden_states;
+ float original_mean = ggml_ext_tensor_mean(tensor);
+ for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
+ for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
+ for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
+ float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
+ value *= qwen_weights[i1];
+ ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
+ }
+ }
+ }
+ float new_mean = ggml_ext_tensor_mean(tensor);
+ if (new_mean != 0.f) {
+ ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
+ }
+ }
+
+ struct ggml_tensor* t5_ids_tensor = nullptr;
+ struct ggml_tensor* t5_weight_tensor = nullptr;
+ if (!t5_tokens.empty()) {
+ t5_ids_tensor = vector_to_ggml_tensor_i32(work_ctx, t5_tokens);
+ t5_weight_tensor = vector_to_ggml_tensor(work_ctx, t5_weights);
+ }
+
+ int64_t t1 = ggml_time_ms();
+ LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
+
+ return {hidden_states, t5_weight_tensor, t5_ids_tensor};
+ }
+};
+
struct LLMEmbedder : public Conditioner {
SDVersion version;
std::shared_ptr tokenizer;
@@ -1650,6 +1826,10 @@ struct LLMEmbedder : public Conditioner {
return buffer_size;
}
+ void set_flash_attention_enabled(bool enabled) override {
+ llm->set_flash_attention_enabled(enabled);
+ }
+
void set_weight_adapter(const std::shared_ptr& adapter) override {
if (llm) {
llm->set_weight_adapter(adapter);
@@ -1657,18 +1837,23 @@ struct LLMEmbedder : public Conditioner {
}
std::tuple<std::vector<int>, std::vector<float>> tokenize(std::string text,
- std::pair attn_range,
+ const std::pair& attn_range,
size_t max_length = 0,
bool padding = false) {
std::vector<std::pair<std::string, float>> parsed_attention;
- parsed_attention.emplace_back(text.substr(0, attn_range.first), 1.f);
- if (attn_range.second - attn_range.first > 0) {
- auto new_parsed_attention = parse_prompt_attention(text.substr(attn_range.first, attn_range.second - attn_range.first));
- parsed_attention.insert(parsed_attention.end(),
- new_parsed_attention.begin(),
- new_parsed_attention.end());
+ if (attn_range.first >= 0 && attn_range.second > 0) {
+ parsed_attention.emplace_back(text.substr(0, attn_range.first), 1.f);
+ if (attn_range.second - attn_range.first > 0) {
+ auto new_parsed_attention = parse_prompt_attention(text.substr(attn_range.first, attn_range.second - attn_range.first));
+ parsed_attention.insert(parsed_attention.end(),
+ new_parsed_attention.begin(),
+ new_parsed_attention.end());
+ }
+ parsed_attention.emplace_back(text.substr(attn_range.second), 1.f);
+ } else {
+ parsed_attention.emplace_back(text, 1.f);
}
- parsed_attention.emplace_back(text.substr(attn_range.second), 1.f);
+
{
std::stringstream ss;
ss << "[";
@@ -1699,156 +1884,27 @@ struct LLMEmbedder : public Conditioner {
return {tokens, weights};
}
- SDCondition get_learned_condition(ggml_context* work_ctx,
- int n_threads,
- const ConditionerParams& conditioner_params) override {
- std::string prompt;
- std::vector> image_embeds;
- std::pair prompt_attn_range;
- int prompt_template_encode_start_idx = 34;
- int max_length = 0;
- std::set out_layers;
- std::vector<int> tokens;
- std::vector<float> weights;
+ ggml_tensor* encode_prompt(ggml_context* work_ctx,
+ int n_threads,
+ const std::string prompt,
+ const std::pair& prompt_attn_range,
+ int max_length,
+ int min_length,
+ std::vector> image_embeds,
+ const std::set& out_layers,
+ int prompt_template_encode_start_idx) {
+ auto tokens_and_weights = tokenize(prompt, prompt_attn_range);
+ auto& tokens = std::get<0>(tokens_and_weights);
+ auto& weights = std::get<1>(tokens_and_weights);
std::vector mask;
- if (llm->enable_vision && conditioner_params.ref_images.size() > 0) {
- LOG_INFO("QwenImageEditPlusPipeline");
- prompt_template_encode_start_idx = 64;
- int image_embed_idx = 64 + 6;
-
- int min_pixels = 384 * 384;
- int max_pixels = 560 * 560;
- std::string placeholder = "<|image_pad|>";
- std::string img_prompt;
-
- for (int i = 0; i < conditioner_params.ref_images.size(); i++) {
- sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(*conditioner_params.ref_images[i]);
- double factor = llm->params.vision.patch_size * llm->params.vision.spatial_merge_size;
- int height = image.height;
- int width = image.width;
- int h_bar = static_cast(std::round(height / factor) * factor);
- int w_bar = static_cast(std::round(width / factor) * factor);
-
- if (static_cast(h_bar) * w_bar > max_pixels) {
- double beta = std::sqrt((height * width) / static_cast(max_pixels));
- h_bar = std::max(static_cast(factor),
- static_cast(std::floor(height / beta / factor)) * static_cast(factor));
- w_bar = std::max(static_cast(factor),
- static_cast(std::floor(width / beta / factor)) * static_cast(factor));
- } else if (static_cast(h_bar) * w_bar < min_pixels) {
- double beta = std::sqrt(static_cast(min_pixels) / (height * width));
- h_bar = static_cast(std::ceil(height * beta / factor)) * static_cast(factor);
- w_bar = static_cast(std::ceil(width * beta / factor)) * static_cast