diff --git a/.dockerignore b/.dockerignore
index 64a58a78..4627a217 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,4 +1,5 @@
build*/
+docs/
test/
.cache/
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 7f78c354..666887d9 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -38,6 +38,10 @@ on:
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+concurrency:
+ group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+ cancel-in-progress: true
+
jobs:
ubuntu-latest-cmake:
runs-on: ubuntu-latest
@@ -92,6 +96,123 @@ jobs:
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
+ ubuntu-latest-cmake-vulkan:
+ runs-on: ubuntu-latest
+
+ steps:
+ - name: Clone
+ id: checkout
+        uses: actions/checkout@v6
+ with:
+ submodules: recursive
+
+ - name: Dependencies
+ id: depends
+ run: |
+ sudo apt-get update
+          sudo apt-get install -y build-essential libvulkan-dev glslc
+
+ - name: Build
+ id: cmake_build
+ run: |
+ mkdir build
+ cd build
+ cmake .. -DSD_BUILD_SHARED_LIBS=ON -DSD_VULKAN=ON
+ cmake --build . --config Release
+
+ - name: Get commit hash
+ id: commit
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ uses: pr-mpt/actions-commit-hash@v2
+
+ - name: Fetch system info
+ id: system-info
+ run: |
+ echo "CPU_ARCH=`uname -m`" >> "$GITHUB_OUTPUT"
+ echo "OS_NAME=`lsb_release -s -i`" >> "$GITHUB_OUTPUT"
+ echo "OS_VERSION=`lsb_release -s -r`" >> "$GITHUB_OUTPUT"
+ echo "OS_TYPE=`uname -s`" >> "$GITHUB_OUTPUT"
+
+ - name: Pack artifacts
+ id: pack_artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ run: |
+ cp ggml/LICENSE ./build/bin/ggml.txt
+ cp LICENSE ./build/bin/stable-diffusion.cpp.txt
+ zip -j sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-vulkan.zip ./build/bin/*
+
+ - name: Upload artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ uses: actions/upload-artifact@v4
+ with:
+ name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-vulkan.zip
+ path: |
+ sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-vulkan.zip
+
+ build-and-push-docker-images:
+ name: Build and push container images
+ runs-on: ubuntu-latest
+
+ permissions:
+ contents: read
+ packages: write
+ id-token: write
+ attestations: write
+      # NOTE: 'artifact-metadata' is not a valid permissions scope; 'attestations: write' above covers artifact attestation metadata
+
+ strategy:
+ matrix:
+ variant: [musa, sycl, vulkan]
+
+ env:
+ REGISTRY: ghcr.io
+ IMAGE_NAME: ${{ github.repository }}
+
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v6
+ with:
+ submodules: recursive
+
+ - name: Get commit hash
+ id: commit
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ uses: pr-mpt/actions-commit-hash@v2
+
+ - name: Set up Docker Buildx
+ uses: docker/setup-buildx-action@v3
+
+ - name: Log in to the container registry
+ uses: docker/login-action@v3
+ with:
+ registry: ${{ env.REGISTRY }}
+ username: ${{ github.actor }}
+ password: ${{ secrets.GITHUB_TOKEN }}
+
+ - name: Extract metadata for Docker
+ id: meta
+ uses: docker/metadata-action@v5
+ with:
+ images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+
+ - name: Free Disk Space (Ubuntu)
+ uses: jlumbroso/free-disk-space@v1.3.1
+ with:
+ # this might remove tools that are actually needed,
+ # if set to "true" but frees about 6 GB
+ tool-cache: false
+
+ - name: Build and push Docker image
+ id: build-push
+ uses: docker/build-push-action@v6
+ with:
+ platforms: linux/amd64
+ push: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ file: Dockerfile.${{ matrix.variant }}
+ tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.BRANCH_NAME }}-${{ matrix.variant }}
+ labels: ${{ steps.meta.outputs.labels }}
+ annotations: ${{ steps.meta.outputs.annotations }}
+
macOS-latest-cmake:
runs-on: macos-latest
@@ -146,7 +267,7 @@ jobs:
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
windows-latest-cmake:
- runs-on: windows-2025
+ runs-on: windows-2022
env:
VULKAN_VERSION: 1.4.328.1
@@ -163,8 +284,8 @@ jobs:
- build: "avx512"
defines: "-DGGML_NATIVE=OFF -DGGML_AVX512=ON -DGGML_AVX=ON -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
- build: "cuda12"
- defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES='61;70;75;80;86;89;90;100;120'"
- - build: 'vulkan'
+ defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES='61;70;75;80;86;89;90;100;120' -DCMAKE_CUDA_FLAGS='-Xcudafe \"--diag_suppress=177\" -Xcudafe \"--diag_suppress=550\"'"
+ - build: "vulkan"
defines: "-DSD_VULKAN=ON -DSD_BUILD_SHARED_LIBS=ON"
steps:
- name: Clone
@@ -191,13 +312,17 @@ jobs:
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
+ - name: Activate MSVC environment
+ id: msvc_dev_cmd
+ uses: ilammy/msvc-dev-cmd@v1
+
- name: Build
id: cmake_build
run: |
mkdir build
cd build
- cmake .. ${{ matrix.defines }}
- cmake --build . --config Release
+ cmake .. -DCMAKE_CXX_FLAGS='/bigobj' -G Ninja -DCMAKE_C_COMPILER=cl.exe -DCMAKE_CXX_COMPILER=cl.exe -DCMAKE_BUILD_TYPE=Release ${{ matrix.defines }}
+ cmake --build .
- name: Check AVX512F support
id: check_avx512f
@@ -360,6 +485,146 @@ jobs:
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
+ ubuntu-latest-rocm:
+ runs-on: ubuntu-latest
+ container: rocm/dev-ubuntu-24.04:7.2
+
+ env:
+ ROCM_VERSION: "7.2"
+ UBUNTU_VERSION: "24.04"
+ GPU_TARGETS: "gfx1151;gfx1150;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
+
+ steps:
+      - run: apt-get update && apt-get install -y git sudo
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v6
+ with:
+ submodules: recursive
+
+ - name: Free disk space
+ run: |
+ # Remove preinstalled SDKs and caches not needed for this job
+ sudo rm -rf /usr/share/dotnet || true
+ sudo rm -rf /usr/local/lib/android || true
+ sudo rm -rf /opt/ghc || true
+ sudo rm -rf /usr/local/.ghcup || true
+ sudo rm -rf /opt/hostedtoolcache || true
+
+ # Remove old package lists and caches
+ sudo rm -rf /var/lib/apt/lists/* || true
+ sudo apt clean
+
+ - name: Dependencies
+ id: depends
+ run: |
+ sudo apt-get update
+ sudo apt install -y \
+ cmake \
+ hip-dev \
+ hipblas-dev \
+ ninja-build \
+ rocm-dev \
+ zip
+ # Clean apt caches to recover disk space
+ sudo apt clean
+ sudo rm -rf /var/lib/apt/lists/* || true
+
+ - name: Setup ROCm Environment
+ run: |
+ # Add ROCm to PATH for current session
+ echo "/opt/rocm/bin" >> $GITHUB_PATH
+
+ # Build regex pattern from ${{ env.GPU_TARGETS }} (match target as substring)
+ TARGET_REGEX="($(printf '%s' "${{ env.GPU_TARGETS }}" | sed 's/;/|/g'))"
+
+ # Remove library files for architectures we're not building for to save disk space
+ echo "Cleaning up unneeded architecture files..."
+ cd /opt/rocm/lib/rocblas/library
+ # Keep only our target architectures
+ for file in *; do
+ if printf '%s' "$file" | grep -q 'gfx'; then
+ if ! printf '%s' "$file" | grep -Eq "$TARGET_REGEX"; then
+ echo "Removing $file" &&
+ sudo rm -f "$file";
+ fi
+ fi
+ done
+
+ cd /opt/rocm/lib/hipblaslt/library
+ for file in *; do
+ if printf '%s' "$file" | grep -q 'gfx'; then
+ if ! printf '%s' "$file" | grep -Eq "$TARGET_REGEX"; then
+ echo "Removing $file" &&
+ sudo rm -f "$file";
+ fi
+ fi
+ done
+
+ - name: Build
+ id: cmake_build
+ run: |
+ mkdir build
+ cd build
+ cmake .. -G Ninja \
+ -DCMAKE_CXX_COMPILER=amdclang++ \
+ -DCMAKE_C_COMPILER=amdclang \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DSD_HIPBLAS=ON \
+ -DGPU_TARGETS="${{ env.GPU_TARGETS }}" \
+ -DAMDGPU_TARGETS="${{ env.GPU_TARGETS }}" \
+ -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
+ -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+ -DSD_BUILD_SHARED_LIBS=ON
+ cmake --build . --config Release
+
+ - name: Get commit hash
+ id: commit
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ uses: pr-mpt/actions-commit-hash@v2
+
+ - name: Prepare artifacts
+ id: prepare_artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ run: |
+ # Copy licenses
+ cp ggml/LICENSE ./build/bin/ggml.txt
+ cp LICENSE ./build/bin/stable-diffusion.cpp.txt
+
+ # Move ROCm runtime libraries (to avoid double space consumption)
+ sudo mv /opt/rocm/lib/librocsparse.so* ./build/bin/
+ sudo mv /opt/rocm/lib/libhsa-runtime64.so* ./build/bin/
+ sudo mv /opt/rocm/lib/libamdhip64.so* ./build/bin/
+ sudo mv /opt/rocm/lib/libhipblas.so* ./build/bin/
+ sudo mv /opt/rocm/lib/libhipblaslt.so* ./build/bin/
+ sudo mv /opt/rocm/lib/librocblas.so* ./build/bin/
+ sudo mv /opt/rocm/lib/rocblas/ ./build/bin/
+ sudo mv /opt/rocm/lib/hipblaslt/ ./build/bin/
+
+ - name: Fetch system info
+ id: system-info
+ run: |
+ echo "CPU_ARCH=`uname -m`" >> "$GITHUB_OUTPUT"
+ echo "OS_NAME=`lsb_release -s -i`" >> "$GITHUB_OUTPUT"
+ echo "OS_VERSION=`lsb_release -s -r`" >> "$GITHUB_OUTPUT"
+ echo "OS_TYPE=`uname -s`" >> "$GITHUB_OUTPUT"
+
+ - name: Pack artifacts
+ id: pack_artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ run: |
+ cp ggml/LICENSE ./build/bin/ggml.txt
+ cp LICENSE ./build/bin/stable-diffusion.cpp.txt
+ zip -y -r sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip ./build/bin
+
+ - name: Upload artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ uses: actions/upload-artifact@v4
+ with:
+ name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip
+ path: |
+ sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip
+
release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -367,6 +632,9 @@ jobs:
needs:
- ubuntu-latest-cmake
+ - ubuntu-latest-cmake-vulkan
+ - ubuntu-latest-rocm
+ - build-and-push-docker-images
- macOS-latest-cmake
- windows-latest-cmake
- windows-latest-cmake-hip
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8ea1c47b..b90086ea 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,6 +8,11 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif()
+if (MSVC)
+ add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
+ add_compile_definitions(_SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING)
+endif()
+
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
@@ -82,9 +87,11 @@ endif()
set(SD_LIB stable-diffusion)
file(GLOB SD_LIB_SOURCES
- "*.h"
- "*.cpp"
- "*.hpp"
+ "src/*.h"
+ "src/*.cpp"
+ "src/*.hpp"
+ "src/vocab/*.h"
+ "src/vocab/*.cpp"
)
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
@@ -114,7 +121,7 @@ endif()
message(STATUS "stable-diffusion.cpp commit ${SDCPP_BUILD_COMMIT}")
set_property(
- SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/version.cpp
+ SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/src/version.cpp
APPEND PROPERTY COMPILE_DEFINITIONS
SDCPP_BUILD_COMMIT=${SDCPP_BUILD_COMMIT} SDCPP_BUILD_VERSION=${SDCPP_BUILD_VERSION}
)
@@ -177,6 +184,7 @@ endif()
add_subdirectory(thirdparty)
target_link_libraries(${SD_LIB} PUBLIC ggml zip)
+target_include_directories(${SD_LIB} PUBLIC include)
target_include_directories(${SD_LIB} PUBLIC . thirdparty)
target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17)
@@ -185,7 +193,7 @@ if (SD_BUILD_EXAMPLES)
add_subdirectory(examples)
endif()
-set(SD_PUBLIC_HEADERS stable-diffusion.h)
+set(SD_PUBLIC_HEADERS include/stable-diffusion.h)
set_target_properties(${SD_LIB} PROPERTIES PUBLIC_HEADER "${SD_PUBLIC_HEADERS}")
install(TARGETS ${SD_LIB} LIBRARY PUBLIC_HEADER)
diff --git a/Dockerfile b/Dockerfile
index da73021c..26a8f41c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-ARG UBUNTU_VERSION=22.04
+ARG UBUNTU_VERSION=24.04
FROM ubuntu:$UBUNTU_VERSION AS build
@@ -18,5 +18,6 @@ RUN apt-get update && \
apt-get clean
COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
+COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
ENTRYPOINT [ "/sd-cli" ]
\ No newline at end of file
diff --git a/Dockerfile.musa b/Dockerfile.musa
index 0eac3d7f..2d95f817 100644
--- a/Dockerfile.musa
+++ b/Dockerfile.musa
@@ -19,5 +19,6 @@ RUN mkdir build && cd build && \
FROM mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64 as runtime
COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
+COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
ENTRYPOINT [ "/sd-cli" ]
\ No newline at end of file
diff --git a/Dockerfile.sycl b/Dockerfile.sycl
index 6bcb91da..466d5517 100644
--- a/Dockerfile.sycl
+++ b/Dockerfile.sycl
@@ -15,5 +15,6 @@ RUN mkdir build && cd build && \
FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS runtime
COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
+COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
ENTRYPOINT [ "/sd-cli" ]
diff --git a/Dockerfile.vulkan b/Dockerfile.vulkan
new file mode 100644
index 00000000..5ba6cb05
--- /dev/null
+++ b/Dockerfile.vulkan
@@ -0,0 +1,23 @@
+ARG UBUNTU_VERSION=24.04
+
+FROM ubuntu:$UBUNTU_VERSION AS build
+
+RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake libvulkan-dev glslc
+
+WORKDIR /sd.cpp
+
+COPY . .
+
+RUN cmake . -B ./build -DSD_VULKAN=ON
+RUN cmake --build ./build --config Release --parallel
+
+FROM ubuntu:$UBUNTU_VERSION AS runtime
+
+RUN apt-get update && \
+ apt-get install --yes --no-install-recommends libgomp1 libvulkan1 mesa-vulkan-drivers && \
+ apt-get clean
+
+COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
+COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
+
+ENTRYPOINT [ "/sd-cli" ]
diff --git a/README.md b/README.md
index aa29f849..b5bb4975 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,9 @@ API and command-line option may change frequently.***
## 🔥Important News
+* **2026/01/18** 🚀 stable-diffusion.cpp now supports **FLUX.2-klein**
+ 👉 Details: [PR #1193](https://github.com/leejet/stable-diffusion.cpp/pull/1193)
+
* **2025/12/01** 🚀 stable-diffusion.cpp now supports **Z-Image**
👉 Details: [PR #1020](https://github.com/leejet/stable-diffusion.cpp/pull/1020)
@@ -43,16 +46,17 @@ API and command-line option may change frequently.***
- SDXL, [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo)
- [Some SD1.x and SDXL distilled models](./docs/distilled_sd.md)
- [SD3/SD3.5](./docs/sd3.md)
- - [FlUX.1-dev/FlUX.1-schnell](./docs/flux.md)
- - [FLUX.2-dev](./docs/flux2.md)
+ - [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
+ - [FLUX.2-dev/FLUX.2-klein](./docs/flux2.md)
- [Chroma](./docs/chroma.md)
- [Chroma1-Radiance](./docs/chroma_radiance.md)
- [Qwen Image](./docs/qwen_image.md)
- [Z-Image](./docs/z_image.md)
- [Ovis-Image](./docs/ovis_image.md)
+ - [Anima](./docs/anima.md)
- Image Edit Models
- [FLUX.1-Kontext-dev](./docs/kontext.md)
- - [Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md)
+ - [Qwen Image Edit series](./docs/qwen_image_edit.md)
- Video Models
- [Wan2.1/Wan2.2](./docs/wan.md)
- [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
@@ -70,7 +74,7 @@ API and command-line option may change frequently.***
- SYCL
- Supported weight formats
- Pytorch checkpoint (`.ckpt` or `.pth`)
- - Safetensors (`./safetensors`)
+ - Safetensors (`.safetensors`)
- GGUF (`.gguf`)
- Supported platforms
- Linux
@@ -127,15 +131,16 @@ If you want to improve performance or reduce VRAM/RAM usage, please refer to [pe
- [SD1.x/SD2.x/SDXL](./docs/sd.md)
- [SD3/SD3.5](./docs/sd3.md)
-- [FlUX.1-dev/FlUX.1-schnell](./docs/flux.md)
-- [FLUX.2-dev](./docs/flux2.md)
+- [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
+- [FLUX.2-dev/FLUX.2-klein](./docs/flux2.md)
- [FLUX.1-Kontext-dev](./docs/kontext.md)
- [Chroma](./docs/chroma.md)
- [🔥Qwen Image](./docs/qwen_image.md)
-- [🔥Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md)
+- [🔥Qwen Image Edit series](./docs/qwen_image_edit.md)
- [🔥Wan2.1/Wan2.2](./docs/wan.md)
- [🔥Z-Image](./docs/z_image.md)
- [Ovis-Image](./docs/ovis_image.md)
+- [Anima](./docs/anima.md)
- [LoRA](./docs/lora.md)
- [LCM/LCM-LoRA](./docs/lcm.md)
- [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)
@@ -143,6 +148,7 @@ If you want to improve performance or reduce VRAM/RAM usage, please refer to [pe
- [Using TAESD to faster decoding](./docs/taesd.md)
- [Docker](./docs/docker.md)
- [Quantization and GGUF](./docs/quantization_and_gguf.md)
+- [Inference acceleration via caching](./docs/caching.md)
## Bindings
diff --git a/assets/anima/example.png b/assets/anima/example.png
new file mode 100644
index 00000000..ab91dbf2
Binary files /dev/null and b/assets/anima/example.png differ
diff --git a/assets/flux2/flux2-klein-4b-edit.png b/assets/flux2/flux2-klein-4b-edit.png
new file mode 100644
index 00000000..481a0a6f
Binary files /dev/null and b/assets/flux2/flux2-klein-4b-edit.png differ
diff --git a/assets/flux2/flux2-klein-4b.png b/assets/flux2/flux2-klein-4b.png
new file mode 100644
index 00000000..2809752c
Binary files /dev/null and b/assets/flux2/flux2-klein-4b.png differ
diff --git a/assets/flux2/flux2-klein-9b-edit.png b/assets/flux2/flux2-klein-9b-edit.png
new file mode 100644
index 00000000..41228f1d
Binary files /dev/null and b/assets/flux2/flux2-klein-9b-edit.png differ
diff --git a/assets/flux2/flux2-klein-9b.png b/assets/flux2/flux2-klein-9b.png
new file mode 100644
index 00000000..48adea2a
Binary files /dev/null and b/assets/flux2/flux2-klein-9b.png differ
diff --git a/assets/flux2/flux2-klein-base-4b.png b/assets/flux2/flux2-klein-base-4b.png
new file mode 100644
index 00000000..f29a123d
Binary files /dev/null and b/assets/flux2/flux2-klein-base-4b.png differ
diff --git a/assets/flux2/flux2-klein-base-9b.png b/assets/flux2/flux2-klein-base-9b.png
new file mode 100644
index 00000000..6241f425
Binary files /dev/null and b/assets/flux2/flux2-klein-base-9b.png differ
diff --git a/assets/qwen/qwen_image_edit_2511.png b/assets/qwen/qwen_image_edit_2511.png
new file mode 100644
index 00000000..18a26dac
Binary files /dev/null and b/assets/qwen/qwen_image_edit_2511.png differ
diff --git a/assets/z_image/base_bf16.png b/assets/z_image/base_bf16.png
new file mode 100644
index 00000000..f2b918c0
Binary files /dev/null and b/assets/z_image/base_bf16.png differ
diff --git a/docs/anima.md b/docs/anima.md
new file mode 100644
index 00000000..9c941785
--- /dev/null
+++ b/docs/anima.md
@@ -0,0 +1,20 @@
+# How to Use
+
+## Download weights
+
+- Download Anima
+ - safetensors: https://huggingface.co/circlestone-labs/Anima/tree/main/split_files/diffusion_models
+ - gguf: https://huggingface.co/Bedovyy/Anima-GGUF/tree/main
+- Download vae
+ - safetensors: https://huggingface.co/circlestone-labs/Anima/tree/main/split_files/vae
+- Download Qwen3-0.6B-Base
+ - safetensors: https://huggingface.co/circlestone-labs/Anima/tree/main/split_files/text_encoders
+ - gguf: https://huggingface.co/mradermacher/Qwen3-0.6B-Base-GGUF/tree/main
+
+## Examples
+
+```sh
+.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\anima-preview.safetensors --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_06b_base.safetensors -p "a lovely cat holding a sign says 'anima.cpp'" --cfg-scale 6.0 --sampling-method euler -v --offload-to-cpu --diffusion-fa
+```
+
+
\ No newline at end of file
diff --git a/docs/caching.md b/docs/caching.md
new file mode 100644
index 00000000..7b4be3ce
--- /dev/null
+++ b/docs/caching.md
@@ -0,0 +1,126 @@
+## Caching
+
+Caching methods accelerate diffusion inference by reusing intermediate computations when changes between steps are small.
+
+### Cache Modes
+
+| Mode | Target | Description |
+|------|--------|-------------|
+| `ucache` | UNET models | Condition-level caching with error tracking |
+| `easycache` | DiT models | Condition-level cache |
+| `dbcache` | DiT models | Block-level L1 residual threshold |
+| `taylorseer` | DiT models | Taylor series approximation |
+| `cache-dit` | DiT models | Combined DBCache + TaylorSeer |
+
+### UCache (UNET Models)
+
+UCache caches the residual difference (output - input) and reuses it when input changes are below threshold.
+
+```bash
+sd-cli -m model.safetensors -p "a cat" --cache-mode ucache --cache-option "threshold=1.5"
+```
+
+#### Parameters
+
+| Parameter | Description | Default |
+|-----------|-------------|---------|
+| `threshold` | Error threshold for reuse decision | 1.0 |
+| `start` | Start caching at this percent of steps | 0.15 |
+| `end` | Stop caching at this percent of steps | 0.95 |
+| `decay` | Error decay rate (0-1) | 1.0 |
+| `relative` | Scale threshold by output norm (0/1) | 1 |
+| `reset` | Reset error after computing (0/1) | 1 |
+
+#### Reset Parameter
+
+The `reset` parameter controls error accumulation behavior:
+
+- `reset=1` (default): Resets accumulated error after each computed step. More aggressive caching, works well with most samplers.
+- `reset=0`: Keeps error accumulated. More conservative, recommended for `euler_a` sampler.
+
+### EasyCache (DiT Models)
+
+Condition-level caching for DiT models. Caches and reuses outputs when input changes are below threshold.
+
+```bash
+--cache-mode easycache --cache-option "threshold=0.3"
+```
+
+#### Parameters
+
+| Parameter | Description | Default |
+|-----------|-------------|---------|
+| `threshold` | Input change threshold for reuse | 0.2 |
+| `start` | Start caching at this percent of steps | 0.15 |
+| `end` | Stop caching at this percent of steps | 0.95 |
+
+### Cache-DIT (DiT Models)
+
+For DiT models like FLUX and QWEN, use block-level caching modes.
+
+#### DBCache
+
+Caches blocks based on L1 residual difference threshold:
+
+```bash
+--cache-mode dbcache --cache-option "threshold=0.25,warmup=4"
+```
+
+#### TaylorSeer
+
+Uses Taylor series approximation to predict block outputs:
+
+```bash
+--cache-mode taylorseer
+```
+
+#### Cache-DIT (Combined)
+
+Combines DBCache and TaylorSeer:
+
+```bash
+--cache-mode cache-dit --cache-preset fast
+```
+
+#### Parameters
+
+| Parameter | Description | Default |
+|-----------|-------------|---------|
+| `Fn` | Front blocks to always compute | 8 |
+| `Bn` | Back blocks to always compute | 0 |
+| `threshold` | L1 residual difference threshold | 0.08 |
+| `warmup` | Steps before caching starts | 8 |
+
+#### Presets
+
+Available presets: `slow`, `medium`, `fast`, `ultra` (or `s`, `m`, `f`, `u`).
+
+```bash
+--cache-mode cache-dit --cache-preset fast
+```
+
+#### SCM Options
+
+Steps Computation Mask controls which steps can be cached:
+
+```bash
+--scm-mask "1,1,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,1"
+```
+
+Mask values: `1` = compute, `0` = can cache.
+
+| Policy | Description |
+|--------|-------------|
+| `dynamic` | Check threshold before caching |
+| `static` | Always cache on cacheable steps |
+
+```bash
+--scm-policy dynamic
+```
+
+### Performance Tips
+
+- Start with default thresholds and adjust based on output quality
+- Lower threshold = better quality, less speedup
+- Higher threshold = more speedup, potential quality loss
+- More steps generally means more caching opportunities
diff --git a/docs/distilled_sd.md b/docs/distilled_sd.md
index 478305f2..3174b18f 100644
--- a/docs/distilled_sd.md
+++ b/docs/distilled_sd.md
@@ -1,8 +1,8 @@
-# Running distilled models: SSD1B and SDx.x with tiny U-Nets
+# Running distilled models: SSD1B, Vega and SDx.x with tiny U-Nets
## Preface
-These models feature a reduced U-Net architecture. Unlike standard SDXL models, the SSD-1B U-Net contains only one middle block and fewer attention layers in its up- and down-blocks, resulting in significantly smaller file sizes. Using these models can reduce inference time by more than 33%. For more details, refer to Segmind's paper: https://arxiv.org/abs/2401.02677v1.
+These models feature a reduced U-Net architecture. Unlike standard SDXL models, the SSD-1B and Vega U-Nets contain only one middle block and fewer attention layers in their up- and down-blocks, resulting in significantly smaller file sizes. Using these models can reduce inference time by more than 33%. For more details, refer to Segmind's paper: https://arxiv.org/abs/2401.02677v1.
Similarly, SD1.x- and SD2.x-style models with a tiny U-Net consist of only 6 U-Net blocks, leading to very small files and time savings of up to 50%. For more information, see the paper: https://arxiv.org/pdf/2305.15798.pdf.
## SSD1B
@@ -17,7 +17,17 @@ Useful LoRAs are also available:
* https://huggingface.co/seungminh/lora-swarovski-SSD-1B/resolve/main/pytorch_lora_weights.safetensors
* https://huggingface.co/kylielee505/mylcmlorassd/resolve/main/pytorch_lora_weights.safetensors
-These files can be used out-of-the-box, unlike the models described in the next section.
+## Vega
+
+Segmind's Vega model is available online here:
+
+ * https://huggingface.co/segmind/Segmind-Vega/resolve/main/segmind-vega.safetensors
+
+VegaRT is an example for an LCM-LoRA:
+
+ * https://huggingface.co/segmind/Segmind-VegaRT/resolve/main/pytorch_lora_weights.safetensors
+
+Both files can be used out-of-the-box, unlike the models described in the next sections.
## SD1.x, SD2.x with tiny U-Nets
@@ -83,7 +93,7 @@ python convert_diffusers_to_original_stable_diffusion.py \
The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
-### Another available .ckpt file:
+##### Another available .ckpt file:
* https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt
@@ -97,3 +107,31 @@ for key, value in ckpt['state_dict'].items():
ckpt['state_dict'][key] = value.contiguous()
torch.save(ckpt, "tinySDdistilled_fixed.ckpt")
```
+
+
+### SDXS-512
+
+Another very tiny and **incredibly fast** model is SDXS by IDKiro et al. The authors refer to it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details read the paper: https://arxiv.org/pdf/2403.16627 . Once again the authors removed some more blocks of the U-Net, and unlike other SD1 models they use an adjusted _AutoEncoderTiny_ instead of the default _AutoEncoderKL_ for the VAE part.
+
+##### 1. Download the diffusers model from Hugging Face using Python:
+
+```python
+from diffusers import StableDiffusionPipeline
+pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-dreamshaper")
+pipe.save_pretrained(save_directory="sdxs")
+```
+##### 2. Create a safetensors file
+
+```bash
+python convert_diffusers_to_original_stable_diffusion.py \
+ --model_path sdxs --checkpoint_path sdxs.safetensors --half --use_safetensors
+```
+
+##### 3. Run the model as follows:
+
+```bash
+~/stable-diffusion.cpp/build/bin/sd-cli -m sdxs.safetensors -p "portrait of a lovely cat" \
+ --cfg-scale 1 --steps 1
+```
+
+Both options, `--cfg-scale 1` and `--steps 1`, are mandatory here.
diff --git a/docs/docker.md b/docs/docker.md
index 26a5f714..660ed257 100644
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -1,15 +1,39 @@
-## Docker
+# Docker
-### Building using Docker
+## Run CLI
+
+```shell
+docker run --rm -v /path/to/models:/models -v /path/to/output/:/output ghcr.io/leejet/stable-diffusion.cpp:master [args...]
+# For example
+# docker run --rm -v ./models:/models -v ./build:/output ghcr.io/leejet/stable-diffusion.cpp:master -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
+```
+
+## Run server
+
+```shell
+docker run --rm --init -v /path/to/models:/models -v /path/to/output/:/output -p "1234:1234" --entrypoint "/sd-server" ghcr.io/leejet/stable-diffusion.cpp:master [args...]
+# For example
+# docker run --rm --init -v ./models:/models -v ./build:/output -p "1234:1234" --entrypoint "/sd-server" ghcr.io/leejet/stable-diffusion.cpp:master -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
+```
+
+## Building using Docker
```shell
docker build -t sd .
```
-### Run
+## Building variants using Docker
+
+Vulkan:
```shell
-docker run -v /path/to/models:/models -v /path/to/output/:/output sd-cli [args...]
+docker build -f Dockerfile.vulkan -t sd .
+```
+
+## Run locally built image's CLI
+
+```shell
+docker run --rm -v /path/to/models:/models -v /path/to/output/:/output sd [args...]
# For example
-# docker run -v ./models:/models -v ./build:/output sd-cli -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
-```
\ No newline at end of file
+# docker run --rm -v ./models:/models -v ./build:/output sd -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
+```
diff --git a/docs/esrgan.md b/docs/esrgan.md
index 77231726..39a97605 100644
--- a/docs/esrgan.md
+++ b/docs/esrgan.md
@@ -1,6 +1,6 @@
## Using ESRGAN to upscale results
-You can use ESRGAN to upscale the generated images. At the moment, only the [RealESRGAN_x4plus_anime_6B.pth](https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth) model is supported. Support for more models of this architecture will be added soon.
+You can use ESRGAN—such as the model [RealESRGAN_x4plus_anime_6B.pth](https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth)—to upscale the generated images and improve their overall resolution and clarity.
- Specify the model path using the `--upscale-model PATH` parameter. example:
diff --git a/docs/flux2.md b/docs/flux2.md
index 0c2c6d2b..1524478c 100644
--- a/docs/flux2.md
+++ b/docs/flux2.md
@@ -1,6 +1,8 @@
# How to Use
-## Download weights
+## Flux.2-dev
+
+### Download weights
- Download FLUX.2-dev
- gguf: https://huggingface.co/city96/FLUX.2-dev-gguf/tree/main
@@ -9,7 +11,7 @@
- Download Mistral-Small-3.2-24B-Instruct-2506-GGUF
- gguf: https://huggingface.co/unsloth/Mistral-Small-3.2-24B-Instruct-2506-GGUF/tree/main
-## Examples
+### Examples
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux2-dev-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Mistral-Small-3.2-24B-Instruct-2506-Q4_K_M.gguf -r .\kontext_input.png -p "change 'flux.cpp' to 'flux2-dev.cpp'" --cfg-scale 1.0 --sampling-method euler -v --diffusion-fa --offload-to-cpu
@@ -17,5 +19,74 @@
+## Flux.2 klein 4B / Flux.2 klein base 4B
+### Download weights
+- Download FLUX.2-klein-4B
+ - safetensors: https://huggingface.co/black-forest-labs/FLUX.2-klein-4B
+ - gguf: https://huggingface.co/leejet/FLUX.2-klein-4B-GGUF/tree/main
+- Download FLUX.2-klein-base-4B
+ - safetensors: https://huggingface.co/black-forest-labs/FLUX.2-klein-base-4B
+ - gguf: https://huggingface.co/leejet/FLUX.2-klein-base-4B-GGUF/tree/main
+- Download vae
+ - safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
+- Download Qwen3 4b
+ - safetensors: https://huggingface.co/Comfy-Org/flux2-klein-4B/tree/main/split_files/text_encoders
+ - gguf: https://huggingface.co/unsloth/Qwen3-4B-GGUF/tree/main
+
+### Examples
+
+```
+.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-4b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_4b.safetensors -p "a lovely cat" --cfg-scale 1.0 --steps 4 -v --offload-to-cpu --diffusion-fa
+```
+
+
+
+```
+.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-4b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_4b.safetensors -r .\kontext_input.png -p "change 'flux.cpp' to 'klein.cpp'" --cfg-scale 1.0 --sampling-method euler -v --diffusion-fa --offload-to-cpu --steps 4
+```
+
+
+
+```
+.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-base-4b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_4b.safetensors -p "a lovely cat" --cfg-scale 4.0 --steps 20 -v --offload-to-cpu --diffusion-fa
+```
+
+
+
+## Flux.2 klein 9B / Flux.2 klein base 9B
+
+### Download weights
+
+- Download FLUX.2-klein-9B
+ - safetensors: https://huggingface.co/black-forest-labs/FLUX.2-klein-9B
+ - gguf: https://huggingface.co/leejet/FLUX.2-klein-9B-GGUF/tree/main
+- Download FLUX.2-klein-base-9B
+ - safetensors: https://huggingface.co/black-forest-labs/FLUX.2-klein-base-9B
+ - gguf: https://huggingface.co/leejet/FLUX.2-klein-base-9B-GGUF/tree/main
+- Download vae
+ - safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
+- Download Qwen3 8B
+ - safetensors: https://huggingface.co/Comfy-Org/flux2-klein-9B/tree/main/split_files/text_encoders
+ - gguf: https://huggingface.co/unsloth/Qwen3-8B-GGUF/tree/main
+
+### Examples
+
+```
+.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-9b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_8b.safetensors -p "a lovely cat" --cfg-scale 1.0 --steps 4 -v --offload-to-cpu --diffusion-fa
+```
+
+
+
+```
+.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-9b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_8b.safetensors -r .\kontext_input.png -p "change 'flux.cpp' to 'klein.cpp'" --cfg-scale 1.0 --sampling-method euler -v --diffusion-fa --offload-to-cpu --steps 4
+```
+
+
+
+```
+.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-base-9b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_8b.safetensors -p "a lovely cat" --cfg-scale 4.0 --steps 20 -v --offload-to-cpu --diffusion-fa
+```
+
+
\ No newline at end of file
diff --git a/docs/qwen_image_edit.md b/docs/qwen_image_edit.md
index d376a283..4a8b0172 100644
--- a/docs/qwen_image_edit.md
+++ b/docs/qwen_image_edit.md
@@ -9,6 +9,9 @@
- Qwen Image Edit 2509
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image-Edit_ComfyUI/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/QuantStack/Qwen-Image-Edit-2509-GGUF/tree/main
+ - Qwen Image Edit 2511
+ - safetensors: https://huggingface.co/Comfy-Org/Qwen-Image-Edit_ComfyUI/tree/main/split_files/diffusion_models
+ - gguf: https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/tree/main
- Download vae
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/vae
- Download qwen_2.5_vl 7b
@@ -32,4 +35,14 @@
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen-Image-Edit-2509-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf --llm_vision ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct.mmproj-Q8_0.gguf --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'Qwen Image Edit 2509'"
```
-
\ No newline at end of file
+
+
+### Qwen Image Edit 2511
+
+To use the new Qwen Image Edit 2511 mode, the `--qwen-image-zero-cond-t` flag must be enabled; otherwise, image editing quality will degrade significantly.
+
+```
+.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\qwen-image-edit-2511-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'" --qwen-image-zero-cond-t
+```
+
+
\ No newline at end of file
diff --git a/docs/taesd.md b/docs/taesd.md
index 5160b793..a41c64d4 100644
--- a/docs/taesd.md
+++ b/docs/taesd.md
@@ -14,4 +14,26 @@ curl -L -O https://huggingface.co/madebyollin/taesd/resolve/main/diffusion_pytor
```bash
sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --taesd ../models/diffusion_pytorch_model.safetensors
-```
\ No newline at end of file
+```
+
+### Qwen-Image and wan (TAEHV)
+
+sd.cpp also supports [TAEHV](https://github.com/madebyollin/taehv) (#937), which can be used for Qwen-Image and wan.
+
+- For **Qwen-Image and wan2.1 and wan2.2-A14B**, download the wan2.1 tae [safetensors weights](https://github.com/madebyollin/taehv/blob/main/safetensors/taew2_1.safetensors)
+
+ Or curl
+
+ ```bash
+ curl -L -O https://github.com/madebyollin/taehv/raw/refs/heads/main/safetensors/taew2_1.safetensors
+ ```
+
+- For **wan2.2-TI2V-5B**, use the wan2.2 tae [safetensors weights](https://github.com/madebyollin/taehv/blob/main/safetensors/taew2_2.safetensors)
+
+ Or curl
+
+ ```bash
+ curl -L -O https://github.com/madebyollin/taehv/raw/refs/heads/main/safetensors/taew2_2.safetensors
+ ```
+
+Then simply replace `--vae xxx.safetensors` with `--tae xxx.safetensors` in the commands. If it still runs out of VRAM, add `--vae-conv-direct` to your command, though it might be slower.
diff --git a/docs/wan.md b/docs/wan.md
index ce15ba58..6f5749c8 100644
--- a/docs/wan.md
+++ b/docs/wan.md
@@ -39,6 +39,9 @@
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/vae/wan_2.1_vae.safetensors
- wan_2.2_vae (for Wan2.2 TI2V 5B only)
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/blob/main/split_files/vae/wan2.2_vae.safetensors
+
+ > The Wan models' VAE requires a lot of VRAM! If you do not have enough VRAM, please try the TAE instead, though the results may be poorer. For TAE usage, please refer to [taesd](taesd.md)
+
- Download umt5_xxl
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/text_encoders/umt5_xxl_fp16.safetensors
- gguf: https://huggingface.co/city96/umt5-xxl-encoder-gguf/tree/main
diff --git a/docs/z_image.md b/docs/z_image.md
index 122f1f20..2ea66f9b 100644
--- a/docs/z_image.md
+++ b/docs/z_image.md
@@ -7,6 +7,9 @@ You can run Z-Image with stable-diffusion.cpp on GPUs with 4GB of VRAM — or ev
- Download Z-Image-Turbo
- safetensors: https://huggingface.co/Comfy-Org/z_image_turbo/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/leejet/Z-Image-Turbo-GGUF/tree/main
+- Download Z-Image
+ - safetensors: https://huggingface.co/Comfy-Org/z_image/tree/main/split_files/diffusion_models
+ - gguf: https://huggingface.co/unsloth/Z-Image-GGUF/tree/main
- Download vae
- safetensors: https://huggingface.co/black-forest-labs/FLUX.1-schnell/tree/main
- Download Qwen3 4b
@@ -15,12 +18,22 @@ You can run Z-Image with stable-diffusion.cpp on GPUs with 4GB of VRAM — or ev
## Examples
+### Z-Image-Turbo
+
```
.\bin\Release\sd-cli.exe --diffusion-model z_image_turbo-Q3_K.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\Qwen3-4B-Instruct-2507-Q4_K_M.gguf -p "A cinematic, melancholic photograph of a solitary hooded figure walking through a sprawling, rain-slicked metropolis at night. The city lights are a chaotic blur of neon orange and cool blue, reflecting on the wet asphalt. The scene evokes a sense of being a single component in a vast machine. Superimposed over the image in a sleek, modern, slightly glitched font is the philosophical quote: 'THE CITY IS A CIRCUIT BOARD, AND I AM A BROKEN TRANSISTOR.' -- moody, atmospheric, profound, dark academic" --cfg-scale 1.0 -v --offload-to-cpu --diffusion-fa -H 1024 -W 512
```
+### Z-Image-Base
+
+```
+.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\z_image_bf16.safetensors --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\qwen_3_4b.safetensors -p "A cinematic, melancholic photograph of a solitary hooded figure walking through a sprawling, rain-slicked metropolis at night. The city lights are a chaotic blur of neon orange and cool blue, reflecting on the wet asphalt. The scene evokes a sense of being a single component in a vast machine. Superimposed over the image in a sleek, modern, slightly glitched font is the philosophical quote: 'THE CITY IS A CIRCUIT BOARD, AND I AM A BROKEN TRANSISTOR.' -- moody, atmospheric, profound, dark academic" --cfg-scale 5.0 -v --offload-to-cpu --diffusion-fa -H 1024 -W 512
+```
+
+
+
## Comparison of Different Quantization Types
| bf16 | q8_0 | q6_K | q5_0 | q4_K | q4_0 | q3_K | q2_K|
diff --git a/examples/cli/README.md b/examples/cli/README.md
index 8531b2ae..564e5ce0 100644
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@@ -4,11 +4,14 @@
usage: ./bin/sd-cli [options]
CLI Options:
- -o, --output path to write result image to (default: ./output.png)
+ -o, --output path to write result image to. you can use printf-style %d format specifiers for image sequences (default:
+ ./output.png) (eg. output_%03d.png)
--preview-path path to write preview image to (default: ./preview.png)
--preview-interval interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at
every step)
+ --output-begin-idx starting index for output image sequence, must be non-negative (default 0 if specified %d in output path, 1 otherwise)
--canny apply canny preprocessor (edge detection)
+ --convert-name convert tensor name (for convert mode)
-v, --verbose print extra info
--color colors the logging tags according to level
--taesd-preview-only prevents usage of taesd for decoding the final image. (for use with --preview tae)
@@ -42,17 +45,22 @@ Context Options:
CPU physical cores
--chroma-t5-mask-pad t5 mask pad size of chroma
--vae-tile-overlap tile overlap for vae tiling, in fraction of tile size (default: 0.5)
- --flow-shift shift value for Flow models like SD3.x or WAN (default: auto)
--vae-tiling process vae in tiles to reduce memory usage
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
+ --mmap whether to memory-map model
--control-net-cpu keep controlnet in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram)
--vae-on-cpu keep vae in cpu (for low vram)
- --diffusion-fa use flash attention in the diffusion model
+ --fa use flash attention
+ --diffusion-fa use flash attention in the diffusion model only
--diffusion-conv-direct use ggml_conv2d_direct in the diffusion model
--vae-conv-direct use ggml_conv2d_direct in the vae model
+ --circular enable circular padding for convolutions
+ --circularx enable circular RoPE wrapping on x-axis (width) only
+ --circulary enable circular RoPE wrapping on y-axis (height) only
--chroma-disable-dit-mask disable dit mask for chroma
+ --qwen-image-zero-cond-t enable zero_cond_t for qwen image
--chroma-enable-t5-mask enable t5 mask for chroma
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
type of the weight file
@@ -93,6 +101,7 @@ Generation Options:
--timestep-shift shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for
NitroSD-Vibrant
--upscale-repeats Run the ESRGAN upscaler this many times (default: 1)
+ --upscale-tile-size tile size for ESRGAN upscaling (default: 128)
--cfg-scale unconditional guidance scale: (default: 7.0)
--img-cfg-scale image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
--guidance distilled guidance scale for models with guidance input (default: 3.5)
@@ -101,6 +110,7 @@ Generation Options:
--skip-layer-start SLG enabling point (default: 0.01)
--skip-layer-end SLG disabling point (default: 0.2)
--eta eta in DDIM, only for DDIM and TCD (default: 0)
+ --flow-shift shift value for Flow models like SD3.x or WAN (default: auto)
--high-noise-cfg-scale (high noise) unconditional guidance scale: (default: 7.0)
--high-noise-img-cfg-scale (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
--high-noise-guidance (high noise) distilled guidance scale for models with guidance input (default: 3.5)
@@ -117,14 +127,22 @@ Generation Options:
--disable-auto-resize-ref-image disable auto resize of ref images
-s, --seed RNG seed (default: 42, use random seed for < 0)
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
- tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise)
+ tcd, res_multistep, res_2s] (default: euler for Flux/SD3/Wan, euler_a
+ otherwise)
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
- ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise
- --scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, lcm],
- default: discrete
+ ddim_trailing, tcd, res_multistep, res_2s] default: euler for Flux/SD3/Wan,
+ euler_a otherwise
+ --scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple,
+ kl_optimal, lcm, bong_tangent], default: discrete
--sigmas custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0").
--skip-layers layers to skip for SLG steps (default: [7,8,9])
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
-r, --ref-image reference image for Flux Kontext models (can be used multiple times)
- --easycache enable EasyCache for DiT models with optional "threshold,start_percent,end_percent" (default: 0.2,0.15,0.95)
+ --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level)
+ --cache-option named cache params (key=value format, comma-separated). easycache/ucache:
+ threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples:
+ "threshold=0.25" or "threshold=1.5,reset=0"
+ --cache-preset cache-dit preset: 'slow'/'s', 'medium'/'m', 'fast'/'f', 'ultra'/'u'
+ --scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
+ --scm-policy SCM policy: 'dynamic' (default) or 'static'
```
diff --git a/examples/cli/avi_writer.h b/examples/cli/avi_writer.h
index 84b204af..53b4749c 100644
--- a/examples/cli/avi_writer.h
+++ b/examples/cli/avi_writer.h
@@ -172,9 +172,9 @@ int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int
// Write '00dc' chunk (video frame)
fwrite("00dc", 4, 1, f);
- write_u32_le(f, jpeg_data.size);
+ write_u32_le(f, (uint32_t)jpeg_data.size);
index[i].offset = ftell(f) - 8;
- index[i].size = jpeg_data.size;
+ index[i].size = (uint32_t)jpeg_data.size;
fwrite(jpeg_data.buf, 1, jpeg_data.size, f);
// Align to even byte size
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index 42b909e4..f9e4928e 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -26,12 +26,16 @@ const char* previews_str[] = {
"vae",
};
+std::regex format_specifier_regex("(?:[^%]|^)(?:%%)*(%\\d{0,3}d)");
+
struct SDCliParams {
SDMode mode = IMG_GEN;
std::string output_path = "output.png";
+ int output_begin_idx = -1;
bool verbose = false;
bool canny_preprocess = false;
+ bool convert_name = false;
preview_t preview_method = PREVIEW_NONE;
int preview_interval = 1;
@@ -49,7 +53,7 @@ struct SDCliParams {
options.string_options = {
{"-o",
"--output",
- "path to write result image to (default: ./output.png)",
+ "path to write result image to. you can use printf-style %d format specifiers for image sequences (default: ./output.png) (eg. output_%03d.png)",
&output_path},
{"",
"--preview-path",
@@ -62,6 +66,10 @@ struct SDCliParams {
"--preview-interval",
"interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at every step)",
&preview_interval},
+ {"",
+ "--output-begin-idx",
+ "starting index for output image sequence, must be non-negative (default 0 if specified %d in output path, 1 otherwise)",
+ &output_begin_idx},
};
options.bool_options = {
@@ -69,6 +77,10 @@ struct SDCliParams {
"--canny",
"apply canny preprocessor (edge detection)",
true, &canny_preprocess},
+ {"",
+ "--convert-name",
+ "convert tensor name (for convert mode)",
+ true, &convert_name},
{"-v",
"--verbose",
"print extra info",
@@ -174,6 +186,7 @@ struct SDCliParams {
<< " verbose: " << (verbose ? "true" : "false") << ",\n"
<< " color: " << (color ? "true" : "false") << ",\n"
<< " canny_preprocess: " << (canny_preprocess ? "true" : "false") << ",\n"
+ << " convert_name: " << (convert_name ? "true" : "false") << ",\n"
<< " preview_method: " << previews_str[preview_method] << ",\n"
<< " preview_interval: " << preview_interval << ",\n"
<< " preview_path: \"" << preview_path << "\",\n"
@@ -232,7 +245,7 @@ std::string get_image_params(const SDCliParams& cli_params, const SDContextParam
parameter_string += "Guidance: " + std::to_string(gen_params.sample_params.guidance.distilled_guidance) + ", ";
parameter_string += "Eta: " + std::to_string(gen_params.sample_params.eta) + ", ";
parameter_string += "Seed: " + std::to_string(seed) + ", ";
- parameter_string += "Size: " + std::to_string(gen_params.width) + "x" + std::to_string(gen_params.height) + ", ";
+ parameter_string += "Size: " + std::to_string(gen_params.get_resolved_width()) + "x" + std::to_string(gen_params.get_resolved_height()) + ", ";
parameter_string += "Model: " + sd_basename(ctx_params.model_path) + ", ";
parameter_string += "RNG: " + std::string(sd_rng_type_name(ctx_params.rng_type)) + ", ";
if (ctx_params.sampler_rng_type != RNG_TYPE_COUNT) {
@@ -338,6 +351,129 @@ void step_callback(int step, int frame_count, sd_image_t* image, bool is_noisy,
}
}
+std::string format_frame_idx(std::string pattern, int frame_idx) {
+ std::smatch match;
+ std::string result = pattern;
+ while (std::regex_search(result, match, format_specifier_regex)) {
+ std::string specifier = match.str(1);
+ char buffer[32];
+ snprintf(buffer, sizeof(buffer), specifier.c_str(), frame_idx);
+ result.replace(match.position(1), match.length(1), buffer);
+ }
+
+ // Then replace all '%%' with '%'
+ size_t pos = 0;
+ while ((pos = result.find("%%", pos)) != std::string::npos) {
+ result.replace(pos, 2, "%");
+ pos += 1;
+ }
+ return result;
+}
+
+bool save_results(const SDCliParams& cli_params,
+ const SDContextParams& ctx_params,
+ const SDGenerationParams& gen_params,
+ sd_image_t* results,
+ int num_results) {
+ if (results == nullptr || num_results <= 0) {
+ return false;
+ }
+
+ namespace fs = std::filesystem;
+ fs::path out_path = cli_params.output_path;
+
+ if (!out_path.parent_path().empty()) {
+ std::error_code ec;
+ fs::create_directories(out_path.parent_path(), ec);
+ if (ec) {
+ LOG_ERROR("failed to create directory '%s': %s",
+ out_path.parent_path().string().c_str(), ec.message().c_str());
+ return false;
+ }
+ }
+
+ fs::path base_path = out_path;
+ fs::path ext = out_path.has_extension() ? out_path.extension() : fs::path{};
+
+ std::string ext_lower = ext.string();
+ std::transform(ext_lower.begin(), ext_lower.end(), ext_lower.begin(), ::tolower);
+ bool is_jpg = (ext_lower == ".jpg" || ext_lower == ".jpeg" || ext_lower == ".jpe");
+ if (!ext.empty()) {
+ if (is_jpg || ext_lower == ".png") {
+ base_path.replace_extension();
+ }
+ }
+
+ int output_begin_idx = cli_params.output_begin_idx;
+ if (output_begin_idx < 0) {
+ output_begin_idx = 0;
+ }
+
+ auto write_image = [&](const fs::path& path, int idx) {
+ const sd_image_t& img = results[idx];
+ if (!img.data)
+ return false;
+
+ std::string params = get_image_params(cli_params, ctx_params, gen_params, gen_params.seed + idx);
+ int ok = 0;
+ if (is_jpg) {
+ ok = stbi_write_jpg(path.string().c_str(), img.width, img.height, img.channel, img.data, 90, params.c_str());
+ } else {
+ ok = stbi_write_png(path.string().c_str(), img.width, img.height, img.channel, img.data, 0, params.c_str());
+ }
+ LOG_INFO("save result image %d to '%s' (%s)", idx, path.string().c_str(), ok ? "success" : "failure");
+ return ok != 0;
+ };
+
+ int sucessful_reults = 0;
+
+ if (std::regex_search(cli_params.output_path, format_specifier_regex)) {
+ if (!is_jpg && ext_lower != ".png")
+ ext = ".png";
+ fs::path pattern = base_path;
+ pattern += ext;
+
+ for (int i = 0; i < num_results; ++i) {
+ fs::path img_path = format_frame_idx(pattern.string(), output_begin_idx + i);
+ if (write_image(img_path, i)) {
+ sucessful_reults++;
+ }
+ }
+ LOG_INFO("%d/%d images saved", sucessful_reults, num_results);
+ return sucessful_reults != 0;
+ }
+
+ if (cli_params.mode == VID_GEN && num_results > 1) {
+ if (ext_lower != ".avi")
+ ext = ".avi";
+ fs::path video_path = base_path;
+ video_path += ext;
+ if (create_mjpg_avi_from_sd_images(video_path.string().c_str(), results, num_results, gen_params.fps) == 0) {
+ LOG_INFO("save result MJPG AVI video to '%s'", video_path.string().c_str());
+ return true;
+ } else {
+ LOG_ERROR("Failed to save result MPG AVI video to '%s'", video_path.string().c_str());
+ return false;
+ }
+ }
+
+ if (!is_jpg && ext_lower != ".png")
+ ext = ".png";
+
+ for (int i = 0; i < num_results; ++i) {
+ fs::path img_path = base_path;
+ if (num_results > 1) {
+ img_path += "_" + std::to_string(output_begin_idx + i);
+ }
+ img_path += ext;
+ if (write_image(img_path, i)) {
+ sucessful_reults++;
+ }
+ }
+ LOG_INFO("%d/%d images saved", sucessful_reults, num_results);
+ return sucessful_reults != 0;
+}
+
int main(int argc, const char* argv[]) {
if (argc > 1 && std::string(argv[1]) == "--version") {
std::cout << version_string() << "\n";
@@ -387,7 +523,8 @@ int main(int argc, const char* argv[]) {
ctx_params.vae_path.c_str(),
cli_params.output_path.c_str(),
ctx_params.wtype,
- ctx_params.tensor_type_rules.c_str());
+ ctx_params.tensor_type_rules.c_str(),
+ cli_params.convert_name);
if (!success) {
LOG_ERROR("convert '%s'/'%s' to '%s' failed",
ctx_params.model_path.c_str(),
@@ -404,10 +541,10 @@ int main(int argc, const char* argv[]) {
}
bool vae_decode_only = true;
- sd_image_t init_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr};
- sd_image_t end_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr};
- sd_image_t control_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr};
- sd_image_t mask_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 1, nullptr};
+ sd_image_t init_image = {0, 0, 3, nullptr};
+ sd_image_t end_image = {0, 0, 3, nullptr};
+ sd_image_t control_image = {0, 0, 3, nullptr};
+ sd_image_t mask_image = {0, 0, 1, nullptr};
std::vector ref_images;
std::vector pmid_images;
std::vector control_frames;
@@ -434,57 +571,79 @@ int main(int argc, const char* argv[]) {
control_frames.clear();
};
+ auto load_image_and_update_size = [&](const std::string& path,
+ sd_image_t& image,
+ bool resize_image = true,
+ int expected_channel = 3) -> bool {
+ int expected_width = 0;
+ int expected_height = 0;
+ if (resize_image && gen_params.width_and_height_are_set()) {
+ expected_width = gen_params.width;
+ expected_height = gen_params.height;
+ }
+
+ if (!load_sd_image_from_file(&image, path.c_str(), expected_width, expected_height, expected_channel)) {
+ LOG_ERROR("load image from '%s' failed", path.c_str());
+ release_all_resources();
+ return false;
+ }
+
+ gen_params.set_width_and_height_if_unset(image.width, image.height);
+ return true;
+ };
+
if (gen_params.init_image_path.size() > 0) {
vae_decode_only = false;
-
- int width = 0;
- int height = 0;
- init_image.data = load_image_from_file(gen_params.init_image_path.c_str(), width, height, gen_params.width, gen_params.height);
- if (init_image.data == nullptr) {
- LOG_ERROR("load image from '%s' failed", gen_params.init_image_path.c_str());
- release_all_resources();
+ if (!load_image_and_update_size(gen_params.init_image_path, init_image)) {
return 1;
}
}
if (gen_params.end_image_path.size() > 0) {
vae_decode_only = false;
-
- int width = 0;
- int height = 0;
- end_image.data = load_image_from_file(gen_params.end_image_path.c_str(), width, height, gen_params.width, gen_params.height);
- if (end_image.data == nullptr) {
- LOG_ERROR("load image from '%s' failed", gen_params.end_image_path.c_str());
- release_all_resources();
+ if (!load_image_and_update_size(gen_params.init_image_path, end_image)) {
return 1;
}
}
+ if (gen_params.ref_image_paths.size() > 0) {
+ vae_decode_only = false;
+ for (auto& path : gen_params.ref_image_paths) {
+ sd_image_t ref_image = {0, 0, 3, nullptr};
+ if (!load_image_and_update_size(path, ref_image, false)) {
+ return 1;
+ }
+ ref_images.push_back(ref_image);
+ }
+ }
+
if (gen_params.mask_image_path.size() > 0) {
- int c = 0;
- int width = 0;
- int height = 0;
- mask_image.data = load_image_from_file(gen_params.mask_image_path.c_str(), width, height, gen_params.width, gen_params.height, 1);
- if (mask_image.data == nullptr) {
+ if (!load_sd_image_from_file(&mask_image,
+ gen_params.mask_image_path.c_str(),
+ gen_params.get_resolved_width(),
+ gen_params.get_resolved_height(),
+ 1)) {
LOG_ERROR("load image from '%s' failed", gen_params.mask_image_path.c_str());
release_all_resources();
return 1;
}
} else {
- mask_image.data = (uint8_t*)malloc(gen_params.width * gen_params.height);
- memset(mask_image.data, 255, gen_params.width * gen_params.height);
+ mask_image.data = (uint8_t*)malloc(gen_params.get_resolved_width() * gen_params.get_resolved_height());
if (mask_image.data == nullptr) {
LOG_ERROR("malloc mask image failed");
release_all_resources();
return 1;
}
+ mask_image.width = gen_params.get_resolved_width();
+ mask_image.height = gen_params.get_resolved_height();
+ memset(mask_image.data, 255, gen_params.get_resolved_width() * gen_params.get_resolved_height());
}
if (gen_params.control_image_path.size() > 0) {
- int width = 0;
- int height = 0;
- control_image.data = load_image_from_file(gen_params.control_image_path.c_str(), width, height, gen_params.width, gen_params.height);
- if (control_image.data == nullptr) {
+ if (!load_sd_image_from_file(&control_image,
+ gen_params.control_image_path.c_str(),
+ gen_params.get_resolved_width(),
+ gen_params.get_resolved_height())) {
LOG_ERROR("load image from '%s' failed", gen_params.control_image_path.c_str());
release_all_resources();
return 1;
@@ -499,29 +658,11 @@ int main(int argc, const char* argv[]) {
}
}
- if (gen_params.ref_image_paths.size() > 0) {
- vae_decode_only = false;
- for (auto& path : gen_params.ref_image_paths) {
- int width = 0;
- int height = 0;
- uint8_t* image_buffer = load_image_from_file(path.c_str(), width, height);
- if (image_buffer == nullptr) {
- LOG_ERROR("load image from '%s' failed", path.c_str());
- release_all_resources();
- return 1;
- }
- ref_images.push_back({(uint32_t)width,
- (uint32_t)height,
- 3,
- image_buffer});
- }
- }
-
if (!gen_params.control_video_path.empty()) {
if (!load_images_from_dir(gen_params.control_video_path,
control_frames,
- gen_params.width,
- gen_params.height,
+ gen_params.get_resolved_width(),
+ gen_params.get_resolved_height(),
gen_params.video_frames,
cli_params.verbose)) {
release_all_resources();
@@ -579,7 +720,7 @@ int main(int argc, const char* argv[]) {
}
if (gen_params.sample_params.scheduler == SCHEDULER_COUNT) {
- gen_params.sample_params.scheduler = sd_get_default_scheduler(sd_ctx);
+ gen_params.sample_params.scheduler = sd_get_default_scheduler(sd_ctx, gen_params.sample_params.sample_method);
}
if (cli_params.mode == IMG_GEN) {
@@ -595,8 +736,8 @@ int main(int argc, const char* argv[]) {
gen_params.auto_resize_ref_image,
gen_params.increase_ref_index,
mask_image,
- gen_params.width,
- gen_params.height,
+ gen_params.get_resolved_width(),
+ gen_params.get_resolved_height(),
gen_params.sample_params,
gen_params.strength,
gen_params.seed,
@@ -610,7 +751,7 @@ int main(int argc, const char* argv[]) {
gen_params.pm_style_strength,
}, // pm_params
ctx_params.vae_tiling_params,
- gen_params.easycache_params,
+ gen_params.cache_params,
};
results = generate_image(sd_ctx, &img_gen_params);
@@ -626,8 +767,8 @@ int main(int argc, const char* argv[]) {
end_image,
control_frames.data(),
(int)control_frames.size(),
- gen_params.width,
- gen_params.height,
+ gen_params.get_resolved_width(),
+ gen_params.get_resolved_height(),
gen_params.sample_params,
gen_params.high_noise_sample_params,
gen_params.moe_boundary,
@@ -635,7 +776,8 @@ int main(int argc, const char* argv[]) {
gen_params.seed,
gen_params.video_frames,
gen_params.vace_strength,
- gen_params.easycache_params,
+ ctx_params.vae_tiling_params,
+ gen_params.cache_params,
};
results = generate_video(sd_ctx, &vid_gen_params, &num_results);
@@ -680,67 +822,8 @@ int main(int argc, const char* argv[]) {
}
}
- // create directory if not exists
- {
- const fs::path out_path = cli_params.output_path;
- if (const fs::path out_dir = out_path.parent_path(); !out_dir.empty()) {
- std::error_code ec;
- fs::create_directories(out_dir, ec); // OK if already exists
- if (ec) {
- LOG_ERROR("failed to create directory '%s': %s",
- out_dir.string().c_str(), ec.message().c_str());
- return 1;
- }
- }
- }
-
- std::string base_path;
- std::string file_ext;
- std::string file_ext_lower;
- bool is_jpg;
- size_t last_dot_pos = cli_params.output_path.find_last_of(".");
- size_t last_slash_pos = std::min(cli_params.output_path.find_last_of("/"),
- cli_params.output_path.find_last_of("\\"));
- if (last_dot_pos != std::string::npos && (last_slash_pos == std::string::npos || last_dot_pos > last_slash_pos)) { // filename has extension
- base_path = cli_params.output_path.substr(0, last_dot_pos);
- file_ext = file_ext_lower = cli_params.output_path.substr(last_dot_pos);
- std::transform(file_ext.begin(), file_ext.end(), file_ext_lower.begin(), ::tolower);
- is_jpg = (file_ext_lower == ".jpg" || file_ext_lower == ".jpeg" || file_ext_lower == ".jpe");
- } else {
- base_path = cli_params.output_path;
- file_ext = file_ext_lower = "";
- is_jpg = false;
- }
-
- if (cli_params.mode == VID_GEN && num_results > 1) {
- std::string vid_output_path = cli_params.output_path;
- if (file_ext_lower == ".png") {
- vid_output_path = base_path + ".avi";
- }
- create_mjpg_avi_from_sd_images(vid_output_path.c_str(), results, num_results, gen_params.fps);
- LOG_INFO("save result MJPG AVI video to '%s'\n", vid_output_path.c_str());
- } else {
- // appending ".png" to absent or unknown extension
- if (!is_jpg && file_ext_lower != ".png") {
- base_path += file_ext;
- file_ext = ".png";
- }
- for (int i = 0; i < num_results; i++) {
- if (results[i].data == nullptr) {
- continue;
- }
- int write_ok;
- std::string final_image_path = i > 0 ? base_path + "_" + std::to_string(i + 1) + file_ext : base_path + file_ext;
- if (is_jpg) {
- write_ok = stbi_write_jpg(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel,
- results[i].data, 90, get_image_params(cli_params, ctx_params, gen_params, gen_params.seed + i).c_str());
- LOG_INFO("save result JPEG image to '%s' (%s)", final_image_path.c_str(), write_ok == 0 ? "failure" : "success");
- } else {
- write_ok = stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel,
- results[i].data, 0, get_image_params(cli_params, ctx_params, gen_params, gen_params.seed + i).c_str());
- LOG_INFO("save result PNG image to '%s' (%s)", final_image_path.c_str(), write_ok == 0 ? "failure" : "success");
- }
- }
+ if (!save_results(cli_params, ctx_params, gen_params, results, num_results)) {
+ return 1;
}
for (int i = 0; i < num_results; i++) {
@@ -752,4 +835,4 @@ int main(int argc, const char* argv[]) {
release_all_resources();
return 0;
-}
\ No newline at end of file
+}
diff --git a/examples/common/common.hpp b/examples/common/common.hpp
index f3a56136..369c1f07 100644
--- a/examples/common/common.hpp
+++ b/examples/common/common.hpp
@@ -95,17 +95,28 @@ static void print_utf8(FILE* stream, const char* utf8) {
? GetStdHandle(STD_ERROR_HANDLE)
: GetStdHandle(STD_OUTPUT_HANDLE);
- int wlen = MultiByteToWideChar(CP_UTF8, 0, utf8, -1, NULL, 0);
- if (wlen <= 0)
- return;
+ DWORD mode;
+ BOOL is_console = GetConsoleMode(h, &mode);
- wchar_t* wbuf = (wchar_t*)malloc(wlen * sizeof(wchar_t));
- MultiByteToWideChar(CP_UTF8, 0, utf8, -1, wbuf, wlen);
+ if (is_console) {
+ int wlen = MultiByteToWideChar(CP_UTF8, 0, utf8, -1, NULL, 0);
+ if (wlen <= 0)
+ return;
- DWORD written;
- WriteConsoleW(h, wbuf, wlen - 1, &written, NULL);
+ wchar_t* wbuf = (wchar_t*)malloc(wlen * sizeof(wchar_t));
+ if (!wbuf)
+ return;
- free(wbuf);
+ MultiByteToWideChar(CP_UTF8, 0, utf8, -1, wbuf, wlen);
+
+ DWORD written;
+ WriteConsoleW(h, wbuf, wlen - 1, &written, NULL);
+
+ free(wbuf);
+ } else {
+ DWORD written;
+ WriteFile(h, utf8, (DWORD)strlen(utf8), &written, NULL);
+ }
#else
fputs(utf8, stream);
#endif
@@ -434,7 +445,7 @@ struct SDContextParams {
std::string photo_maker_path;
sd_type_t wtype = SD_TYPE_COUNT;
std::string tensor_type_rules;
- std::string lora_model_dir;
+ std::string lora_model_dir = ".";
std::map embedding_map;
std::vector embedding_vec;
@@ -442,17 +453,25 @@ struct SDContextParams {
rng_type_t rng_type = CUDA_RNG;
rng_type_t sampler_rng_type = RNG_TYPE_COUNT;
bool offload_params_to_cpu = false;
+ bool enable_mmap = false;
bool control_net_cpu = false;
bool clip_on_cpu = false;
bool vae_on_cpu = false;
+ bool flash_attn = false;
bool diffusion_flash_attn = false;
bool diffusion_conv_direct = false;
bool vae_conv_direct = false;
+ bool circular = false;
+ bool circular_x = false;
+ bool circular_y = false;
+
bool chroma_use_dit_mask = true;
bool chroma_use_t5_mask = false;
int chroma_t5_mask_pad = 1;
+ bool qwen_image_zero_cond_t = false;
+
prediction_t prediction = PREDICTION_COUNT;
lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO;
@@ -562,10 +581,6 @@ struct SDContextParams {
"--vae-tile-overlap",
"tile overlap for vae tiling, in fraction of tile size (default: 0.5)",
&vae_tiling_params.target_overlap},
- {"",
- "--flow-shift",
- "shift value for Flow models like SD3.x or WAN (default: auto)",
- &flow_shift},
};
options.bool_options = {
@@ -581,6 +596,10 @@ struct SDContextParams {
"--offload-to-cpu",
"place the weights in RAM to save VRAM, and automatically load them into VRAM when needed",
true, &offload_params_to_cpu},
+ {"",
+ "--mmap",
+ "whether to memory-map model",
+ true, &enable_mmap},
{"",
"--control-net-cpu",
"keep controlnet in cpu (for low vram)",
@@ -593,9 +612,13 @@ struct SDContextParams {
"--vae-on-cpu",
"keep vae in cpu (for low vram)",
true, &vae_on_cpu},
+ {"",
+ "--fa",
+ "use flash attention",
+ true, &flash_attn},
{"",
"--diffusion-fa",
- "use flash attention in the diffusion model",
+ "use flash attention in the diffusion model only",
true, &diffusion_flash_attn},
{"",
"--diffusion-conv-direct",
@@ -605,10 +628,26 @@ struct SDContextParams {
"--vae-conv-direct",
"use ggml_conv2d_direct in the vae model",
true, &vae_conv_direct},
+ {"",
+ "--circular",
+ "enable circular padding for convolutions",
+ true, &circular},
+ {"",
+ "--circularx",
+ "enable circular RoPE wrapping on x-axis (width) only",
+ true, &circular_x},
+ {"",
+ "--circulary",
+ "enable circular RoPE wrapping on y-axis (height) only",
+ true, &circular_y},
{"",
"--chroma-disable-dit-mask",
"disable dit mask for chroma",
false, &chroma_use_dit_mask},
+ {"",
+ "--qwen-image-zero-cond-t",
+ "enable zero_cond_t for qwen image",
+ true, &qwen_image_zero_cond_t},
{"",
"--chroma-enable-t5-mask",
"enable t5 mask for chroma",
@@ -771,7 +810,7 @@ struct SDContextParams {
}
void build_embedding_map() {
- static const std::vector valid_ext = {".pt", ".safetensors", ".gguf"};
+ static const std::vector valid_ext = {".gguf", ".safetensors", ".pt"};
if (!fs::exists(embedding_dir) || !fs::is_directory(embedding_dir)) {
return;
@@ -860,15 +899,20 @@ struct SDContextParams {
<< " photo_maker_path: \"" << photo_maker_path << "\",\n"
<< " rng_type: " << sd_rng_type_name(rng_type) << ",\n"
<< " sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n"
- << " flow_shift: " << (std::isinf(flow_shift) ? "INF" : std::to_string(flow_shift)) << "\n"
<< " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n"
+ << " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n"
<< " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n"
<< " clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n"
<< " vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n"
+ << " flash_attn: " << (flash_attn ? "true" : "false") << ",\n"
<< " diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n"
<< " diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n"
<< " vae_conv_direct: " << (vae_conv_direct ? "true" : "false") << ",\n"
+ << " circular: " << (circular ? "true" : "false") << ",\n"
+ << " circular_x: " << (circular_x ? "true" : "false") << ",\n"
+ << " circular_y: " << (circular_y ? "true" : "false") << ",\n"
<< " chroma_use_dit_mask: " << (chroma_use_dit_mask ? "true" : "false") << ",\n"
+ << " qwen_image_zero_cond_t: " << (qwen_image_zero_cond_t ? "true" : "false") << ",\n"
<< " chroma_use_t5_mask: " << (chroma_use_t5_mask ? "true" : "false") << ",\n"
<< " chroma_t5_mask_pad: " << chroma_t5_mask_pad << ",\n"
<< " prediction: " << sd_prediction_name(prediction) << ",\n"
@@ -921,18 +965,22 @@ struct SDContextParams {
prediction,
lora_apply_mode,
offload_params_to_cpu,
+ enable_mmap,
clip_on_cpu,
control_net_cpu,
vae_on_cpu,
+ flash_attn,
diffusion_flash_attn,
taesd_preview,
diffusion_conv_direct,
vae_conv_direct,
+ circular || circular_x,
+ circular || circular_y,
force_sdxl_vae_conv_scale,
chroma_use_dit_mask,
chroma_use_t5_mask,
chroma_t5_mask_pad,
- flow_shift,
+ qwen_image_zero_cond_t,
};
return sd_ctx_params;
}
@@ -977,8 +1025,8 @@ struct SDGenerationParams {
std::string prompt_with_lora; // for metadata record only
std::string negative_prompt;
int clip_skip = -1; // <= 0 represents unspecified
- int width = 512;
- int height = 512;
+ int width = -1;
+ int height = -1;
int batch_count = 1;
std::string init_image_path;
std::string end_image_path;
@@ -997,8 +1045,12 @@ struct SDGenerationParams {
std::vector custom_sigmas;
- std::string easycache_option;
- sd_easycache_params_t easycache_params;
+ std::string cache_mode;
+ std::string cache_option;
+ std::string cache_preset;
+ std::string scm_mask;
+ bool scm_policy_dynamic = true;
+ sd_cache_params_t cache_params{};
float moe_boundary = 0.875f;
int video_frames = 1;
@@ -1148,6 +1200,10 @@ struct SDGenerationParams {
"--eta",
"eta in DDIM, only for DDIM and TCD (default: 0)",
&sample_params.eta},
+ {"",
+ "--flow-shift",
+ "shift value for Flow models like SD3.x or WAN (default: auto)",
+ &sample_params.flow_shift},
{"",
"--high-noise-cfg-scale",
"(high noise) unconditional guidance scale: (default: 7.0)",
@@ -1335,10 +1391,10 @@ struct SDGenerationParams {
if (!item.empty()) {
try {
custom_sigmas.push_back(std::stof(item));
- } catch (const std::invalid_argument& e) {
+ } catch (const std::invalid_argument&) {
LOG_ERROR("error: invalid float value '%s' in --sigmas", item.c_str());
return -1;
- } catch (const std::out_of_range& e) {
+ } catch (const std::out_of_range&) {
LOG_ERROR("error: float value '%s' out of range in --sigmas", item.c_str());
return -1;
}
@@ -1360,36 +1416,64 @@ struct SDGenerationParams {
return 1;
};
- auto on_easycache_arg = [&](int argc, const char** argv, int index) {
- const std::string default_values = "0.2,0.15,0.95";
- auto looks_like_value = [](const std::string& token) {
- if (token.empty()) {
- return false;
- }
- if (token[0] != '-') {
- return true;
- }
- if (token.size() == 1) {
- return false;
- }
- unsigned char next = static_cast(token[1]);
- return std::isdigit(next) || token[1] == '.';
- };
+ auto on_cache_mode_arg = [&](int argc, const char** argv, int index) {
+ if (++index >= argc) {
+ return -1;
+ }
+ cache_mode = argv_to_utf8(index, argv);
+ if (cache_mode != "easycache" && cache_mode != "ucache" &&
+ cache_mode != "dbcache" && cache_mode != "taylorseer" && cache_mode != "cache-dit") {
+ fprintf(stderr, "error: invalid cache mode '%s', must be 'easycache', 'ucache', 'dbcache', 'taylorseer', or 'cache-dit'\n", cache_mode.c_str());
+ return -1;
+ }
+ return 1;
+ };
- std::string option_value;
- int consumed = 0;
- if (index + 1 < argc) {
- std::string next_arg = argv[index + 1];
- if (looks_like_value(next_arg)) {
- option_value = argv_to_utf8(index + 1, argv);
- consumed = 1;
- }
+ auto on_cache_option_arg = [&](int argc, const char** argv, int index) {
+ if (++index >= argc) {
+ return -1;
}
- if (option_value.empty()) {
- option_value = default_values;
+ cache_option = argv_to_utf8(index, argv);
+ return 1;
+ };
+
+ auto on_scm_mask_arg = [&](int argc, const char** argv, int index) {
+ if (++index >= argc) {
+ return -1;
}
- easycache_option = option_value;
- return consumed;
+ scm_mask = argv_to_utf8(index, argv);
+ return 1;
+ };
+
+ auto on_scm_policy_arg = [&](int argc, const char** argv, int index) {
+ if (++index >= argc) {
+ return -1;
+ }
+ std::string policy = argv_to_utf8(index, argv);
+ if (policy == "dynamic") {
+ scm_policy_dynamic = true;
+ } else if (policy == "static") {
+ scm_policy_dynamic = false;
+ } else {
+ fprintf(stderr, "error: invalid scm policy '%s', must be 'dynamic' or 'static'\n", policy.c_str());
+ return -1;
+ }
+ return 1;
+ };
+
+ auto on_cache_preset_arg = [&](int argc, const char** argv, int index) {
+ if (++index >= argc) {
+ return -1;
+ }
+ cache_preset = argv_to_utf8(index, argv);
+ if (cache_preset != "slow" && cache_preset != "s" && cache_preset != "S" &&
+ cache_preset != "medium" && cache_preset != "m" && cache_preset != "M" &&
+ cache_preset != "fast" && cache_preset != "f" && cache_preset != "F" &&
+ cache_preset != "ultra" && cache_preset != "u" && cache_preset != "U") {
+ fprintf(stderr, "error: invalid cache preset '%s', must be 'slow'/'s', 'medium'/'m', 'fast'/'f', or 'ultra'/'u'\n", cache_preset.c_str());
+ return -1;
+ }
+ return 1;
};
options.manual_options = {
@@ -1399,17 +1483,17 @@ struct SDGenerationParams {
on_seed_arg},
{"",
"--sampling-method",
- "sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd] "
+ "sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s] "
"(default: euler for Flux/SD3/Wan, euler_a otherwise)",
on_sample_method_arg},
{"",
"--high-noise-sampling-method",
- "(high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd]"
+ "(high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s]"
" default: euler for Flux/SD3/Wan, euler_a otherwise",
on_high_noise_sample_method_arg},
{"",
"--scheduler",
- "denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, lcm], default: discrete",
+ "denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, kl_optimal, lcm, bong_tangent], default: discrete",
on_scheduler_arg},
{"",
"--sigmas",
@@ -1428,9 +1512,25 @@ struct SDGenerationParams {
"reference image for Flux Kontext models (can be used multiple times)",
on_ref_image_arg},
{"",
- "--easycache",
- "enable EasyCache for DiT models with optional \"threshold,start_percent,end_percent\" (default: 0.2,0.15,0.95)",
- on_easycache_arg},
+ "--cache-mode",
+ "caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level)",
+ on_cache_mode_arg},
+ {"",
+ "--cache-option",
+ "named cache params (key=value format, comma-separated). easycache/ucache: threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples: \"threshold=0.25\" or \"threshold=1.5,reset=0\"",
+ on_cache_option_arg},
+ {"",
+ "--cache-preset",
+ "cache-dit preset: 'slow'/'s', 'medium'/'m', 'fast'/'f', 'ultra'/'u'",
+ on_cache_preset_arg},
+ {"",
+ "--scm-mask",
+ "SCM steps mask for cache-dit: comma-separated 0/1 (e.g., \"1,1,1,0,0,1,0,0,1,0\") - 1=compute, 0=can cache",
+ on_scm_mask_arg},
+ {"",
+ "--scm-policy",
+ "SCM policy: 'dynamic' (default) or 'static'",
+ on_scm_policy_arg},
};
@@ -1473,7 +1573,10 @@ struct SDGenerationParams {
load_if_exists("prompt", prompt);
load_if_exists("negative_prompt", negative_prompt);
- load_if_exists("easycache_option", easycache_option);
+ load_if_exists("cache_mode", cache_mode);
+ load_if_exists("cache_option", cache_option);
+ load_if_exists("cache_preset", cache_preset);
+ load_if_exists("scm_mask", scm_mask);
load_if_exists("clip_skip", clip_skip);
load_if_exists("width", width);
@@ -1496,9 +1599,30 @@ struct SDGenerationParams {
load_if_exists("skip_layers", skip_layers);
load_if_exists("high_noise_skip_layers", high_noise_skip_layers);
+ load_if_exists("steps", sample_params.sample_steps);
+ load_if_exists("high_noise_steps", high_noise_sample_params.sample_steps);
load_if_exists("cfg_scale", sample_params.guidance.txt_cfg);
load_if_exists("img_cfg_scale", sample_params.guidance.img_cfg);
load_if_exists("guidance", sample_params.guidance.distilled_guidance);
+ load_if_exists("flow_shift", sample_params.flow_shift);
+
+ auto load_sampler_if_exists = [&](const char* key, enum sample_method_t& out) {
+ if (j.contains(key) && j[key].is_string()) {
+ enum sample_method_t tmp = str_to_sample_method(j[key].get().c_str());
+ if (tmp != SAMPLE_METHOD_COUNT) {
+ out = tmp;
+ }
+ }
+ };
+ load_sampler_if_exists("sample_method", sample_params.sample_method);
+ load_sampler_if_exists("high_noise_sample_method", high_noise_sample_params.sample_method);
+
+ if (j.contains("scheduler") && j["scheduler"].is_string()) {
+ enum scheduler_t tmp = str_to_scheduler(j["scheduler"].get().c_str());
+ if (tmp != SCHEDULER_COUNT) {
+ sample_params.scheduler = tmp;
+ }
+ }
return true;
}
@@ -1508,7 +1632,7 @@ struct SDGenerationParams {
return;
}
static const std::regex re(R"(]+):([^>]+)>)");
- static const std::vector valid_ext = {".pt", ".safetensors", ".gguf"};
+ static const std::vector valid_ext = {".gguf", ".safetensors", ".pt"};
std::smatch m;
std::string tmp = prompt;
@@ -1587,17 +1711,24 @@ struct SDGenerationParams {
}
}
+ bool width_and_height_are_set() const {
+ return width > 0 && height > 0;
+ }
+
+ void set_width_and_height_if_unset(int w, int h) {
+ if (!width_and_height_are_set()) {
+ LOG_INFO("set width x height to %d x %d", w, h);
+ width = w;
+ height = h;
+ }
+ }
+
+ int get_resolved_width() const { return (width > 0) ? width : 512; }
+
+ int get_resolved_height() const { return (height > 0) ? height : 512; }
+
bool process_and_check(SDMode mode, const std::string& lora_model_dir) {
prompt_with_lora = prompt;
- if (width <= 0) {
- LOG_ERROR("error: the width must be greater than 0\n");
- return false;
- }
-
- if (height <= 0) {
- LOG_ERROR("error: the height must be greater than 0\n");
- return false;
- }
if (sample_params.sample_steps <= 0) {
LOG_ERROR("error: the sample_steps must be greater than 0\n");
@@ -1613,57 +1744,118 @@ struct SDGenerationParams {
return false;
}
- if (!easycache_option.empty()) {
- float values[3] = {0.0f, 0.0f, 0.0f};
- std::stringstream ss(easycache_option);
+ sd_cache_params_init(&cache_params);
+
+ auto parse_named_params = [&](const std::string& opt_str) -> bool {
+ std::stringstream ss(opt_str);
std::string token;
- int idx = 0;
while (std::getline(ss, token, ',')) {
- auto trim = [](std::string& s) {
- const char* whitespace = " \t\r\n";
- auto start = s.find_first_not_of(whitespace);
- if (start == std::string::npos) {
- s.clear();
- return;
- }
- auto end = s.find_last_not_of(whitespace);
- s = s.substr(start, end - start + 1);
- };
- trim(token);
- if (token.empty()) {
- LOG_ERROR("error: invalid easycache option '%s'", easycache_option.c_str());
- return false;
- }
- if (idx >= 3) {
- LOG_ERROR("error: easycache expects exactly 3 comma-separated values (threshold,start,end)\n");
+ size_t eq_pos = token.find('=');
+ if (eq_pos == std::string::npos) {
+ LOG_ERROR("error: cache option '%s' missing '=' separator", token.c_str());
return false;
}
+ std::string key = token.substr(0, eq_pos);
+ std::string val = token.substr(eq_pos + 1);
try {
- values[idx] = std::stof(token);
+ if (key == "threshold") {
+ if (cache_mode == "easycache" || cache_mode == "ucache") {
+ cache_params.reuse_threshold = std::stof(val);
+ } else {
+ cache_params.residual_diff_threshold = std::stof(val);
+ }
+ } else if (key == "start") {
+ cache_params.start_percent = std::stof(val);
+ } else if (key == "end") {
+ cache_params.end_percent = std::stof(val);
+ } else if (key == "decay") {
+ cache_params.error_decay_rate = std::stof(val);
+ } else if (key == "relative") {
+ cache_params.use_relative_threshold = (std::stof(val) != 0.0f);
+ } else if (key == "reset") {
+ cache_params.reset_error_on_compute = (std::stof(val) != 0.0f);
+ } else if (key == "Fn" || key == "fn") {
+ cache_params.Fn_compute_blocks = std::stoi(val);
+ } else if (key == "Bn" || key == "bn") {
+ cache_params.Bn_compute_blocks = std::stoi(val);
+ } else if (key == "warmup") {
+ cache_params.max_warmup_steps = std::stoi(val);
+ } else {
+ LOG_ERROR("error: unknown cache parameter '%s'", key.c_str());
+ return false;
+ }
} catch (const std::exception&) {
- LOG_ERROR("error: invalid easycache value '%s'", token.c_str());
+ LOG_ERROR("error: invalid value '%s' for parameter '%s'", val.c_str(), key.c_str());
return false;
}
- idx++;
}
- if (idx != 3) {
- LOG_ERROR("error: easycache expects exactly 3 comma-separated values (threshold,start,end)\n");
- return false;
+ return true;
+ };
+
+ if (!cache_mode.empty()) {
+ if (cache_mode == "easycache") {
+ cache_params.mode = SD_CACHE_EASYCACHE;
+ cache_params.reuse_threshold = 0.2f;
+ cache_params.start_percent = 0.15f;
+ cache_params.end_percent = 0.95f;
+ cache_params.error_decay_rate = 1.0f;
+ cache_params.use_relative_threshold = true;
+ cache_params.reset_error_on_compute = true;
+ } else if (cache_mode == "ucache") {
+ cache_params.mode = SD_CACHE_UCACHE;
+ cache_params.reuse_threshold = 1.0f;
+ cache_params.start_percent = 0.15f;
+ cache_params.end_percent = 0.95f;
+ cache_params.error_decay_rate = 1.0f;
+ cache_params.use_relative_threshold = true;
+ cache_params.reset_error_on_compute = true;
+ } else if (cache_mode == "dbcache") {
+ cache_params.mode = SD_CACHE_DBCACHE;
+ cache_params.Fn_compute_blocks = 8;
+ cache_params.Bn_compute_blocks = 0;
+ cache_params.residual_diff_threshold = 0.08f;
+ cache_params.max_warmup_steps = 8;
+ } else if (cache_mode == "taylorseer") {
+ cache_params.mode = SD_CACHE_TAYLORSEER;
+ cache_params.Fn_compute_blocks = 8;
+ cache_params.Bn_compute_blocks = 0;
+ cache_params.residual_diff_threshold = 0.08f;
+ cache_params.max_warmup_steps = 8;
+ } else if (cache_mode == "cache-dit") {
+ cache_params.mode = SD_CACHE_CACHE_DIT;
+ cache_params.Fn_compute_blocks = 8;
+ cache_params.Bn_compute_blocks = 0;
+ cache_params.residual_diff_threshold = 0.08f;
+ cache_params.max_warmup_steps = 8;
}
- if (values[0] < 0.0f) {
- LOG_ERROR("error: easycache threshold must be non-negative\n");
- return false;
+
+ if (!cache_option.empty()) {
+ if (!parse_named_params(cache_option)) {
+ return false;
+ }
}
- if (values[1] < 0.0f || values[1] >= 1.0f || values[2] <= 0.0f || values[2] > 1.0f || values[1] >= values[2]) {
- LOG_ERROR("error: easycache start/end percents must satisfy 0.0 <= start < end <= 1.0\n");
- return false;
+
+ if (cache_mode == "easycache" || cache_mode == "ucache") {
+ if (cache_params.reuse_threshold < 0.0f) {
+ LOG_ERROR("error: cache threshold must be non-negative");
+ return false;
+ }
+ if (cache_params.start_percent < 0.0f || cache_params.start_percent >= 1.0f ||
+ cache_params.end_percent <= 0.0f || cache_params.end_percent > 1.0f ||
+ cache_params.start_percent >= cache_params.end_percent) {
+ LOG_ERROR("error: cache start/end percents must satisfy 0.0 <= start < end <= 1.0");
+ return false;
+ }
}
- easycache_params.enabled = true;
- easycache_params.reuse_threshold = values[0];
- easycache_params.start_percent = values[1];
- easycache_params.end_percent = values[2];
- } else {
- easycache_params.enabled = false;
+ }
+
+ if (cache_params.mode == SD_CACHE_DBCACHE ||
+ cache_params.mode == SD_CACHE_TAYLORSEER ||
+ cache_params.mode == SD_CACHE_CACHE_DIT) {
+ if (!scm_mask.empty()) {
+ cache_params.scm_mask = scm_mask.c_str();
+ }
+ cache_params.scm_policy_dynamic = scm_policy_dynamic;
}
sample_params.guidance.slg.layers = skip_layers.data();
@@ -1765,12 +1957,13 @@ struct SDGenerationParams {
<< " high_noise_skip_layers: " << vec_to_string(high_noise_skip_layers) << ",\n"
<< " high_noise_sample_params: " << high_noise_sample_params_str << ",\n"
<< " custom_sigmas: " << vec_to_string(custom_sigmas) << ",\n"
- << " easycache_option: \"" << easycache_option << "\",\n"
- << " easycache: "
- << (easycache_params.enabled ? "enabled" : "disabled")
- << " (threshold=" << easycache_params.reuse_threshold
- << ", start=" << easycache_params.start_percent
- << ", end=" << easycache_params.end_percent << "),\n"
+ << " cache_mode: \"" << cache_mode << "\",\n"
+ << " cache_option: \"" << cache_option << "\",\n"
+ << " cache: "
+ << (cache_params.mode != SD_CACHE_DISABLED ? "enabled" : "disabled")
+ << " (threshold=" << cache_params.reuse_threshold
+ << ", start=" << cache_params.start_percent
+ << ", end=" << cache_params.end_percent << "),\n"
<< " moe_boundary: " << moe_boundary << ",\n"
<< " video_frames: " << video_frames << ",\n"
<< " fps: " << fps << ",\n"
@@ -1903,6 +2096,22 @@ uint8_t* load_image_from_file(const char* image_path,
return load_image_common(false, image_path, 0, width, height, expected_width, expected_height, expected_channel);
}
+bool load_sd_image_from_file(sd_image_t* image,
+ const char* image_path,
+ int expected_width = 0,
+ int expected_height = 0,
+ int expected_channel = 3) {
+ int width;
+ int height;
+ image->data = load_image_common(false, image_path, 0, width, height, expected_width, expected_height, expected_channel);
+ if (image->data == nullptr) {
+ return false;
+ }
+ image->width = width;
+ image->height = height;
+ return true;
+}
+
uint8_t* load_image_from_memory(const char* image_bytes,
int len,
int& width,
@@ -1911,4 +2120,4 @@ uint8_t* load_image_from_memory(const char* image_bytes,
int expected_height = 0,
int expected_channel = 3) {
return load_image_common(true, image_bytes, len, width, height, expected_width, expected_height, expected_channel);
-}
\ No newline at end of file
+}
diff --git a/examples/server/README.md b/examples/server/README.md
index a475856f..75544364 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -4,11 +4,12 @@
usage: ./bin/sd-server [options]
Svr Options:
- -l, --listen-ip server listen ip (default: 127.0.0.1)
- --listen-port server listen port (default: 1234)
- -v, --verbose print extra info
- --color colors the logging tags according to level
- -h, --help show this help message and exit
+ -l, --listen-ip server listen ip (default: 127.0.0.1)
+ --serve-html-path path to HTML file to serve at root (optional)
+ --listen-port server listen port (default: 1234)
+ -v, --verbose print extra info
+ --color colors the logging tags according to level
+ -h, --help show this help message and exit
Context Options:
-m, --model path to full model
@@ -35,17 +36,22 @@ Context Options:
CPU physical cores
--chroma-t5-mask-pad t5 mask pad size of chroma
--vae-tile-overlap tile overlap for vae tiling, in fraction of tile size (default: 0.5)
- --flow-shift shift value for Flow models like SD3.x or WAN (default: auto)
--vae-tiling process vae in tiles to reduce memory usage
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
+ --mmap whether to memory-map model
--control-net-cpu keep controlnet in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram)
--vae-on-cpu keep vae in cpu (for low vram)
- --diffusion-fa use flash attention in the diffusion model
+ --fa use flash attention
+ --diffusion-fa use flash attention in the diffusion model only
--diffusion-conv-direct use ggml_conv2d_direct in the diffusion model
--vae-conv-direct use ggml_conv2d_direct in the vae model
+ --circular enable circular padding for convolutions
+ --circularx enable circular RoPE wrapping on x-axis (width) only
+ --circulary enable circular RoPE wrapping on y-axis (height) only
--chroma-disable-dit-mask disable dit mask for chroma
+ --qwen-image-zero-cond-t enable zero_cond_t for qwen image
--chroma-enable-t5-mask enable t5 mask for chroma
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
type of the weight file
@@ -95,6 +101,7 @@ Default Generation Options:
--skip-layer-start SLG enabling point (default: 0.01)
--skip-layer-end SLG disabling point (default: 0.2)
--eta eta in DDIM, only for DDIM and TCD (default: 0)
+ --flow-shift shift value for Flow models like SD3.x or WAN (default: auto)
--high-noise-cfg-scale (high noise) unconditional guidance scale: (default: 7.0)
--high-noise-img-cfg-scale (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
--high-noise-guidance (high noise) distilled guidance scale for models with guidance input (default: 3.5)
@@ -111,14 +118,22 @@ Default Generation Options:
--disable-auto-resize-ref-image disable auto resize of ref images
-s, --seed RNG seed (default: 42, use random seed for < 0)
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
- tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise)
+ tcd, res_multistep, res_2s] (default: euler for Flux/SD3/Wan, euler_a
+ otherwise)
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
- ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise
- --scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, lcm],
- default: discrete
+ ddim_trailing, tcd, res_multistep, res_2s] default: euler for Flux/SD3/Wan,
+ euler_a otherwise
+ --scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple,
+ kl_optimal, lcm, bong_tangent], default: discrete
--sigmas custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0").
--skip-layers layers to skip for SLG steps (default: [7,8,9])
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
-r, --ref-image reference image for Flux Kontext models (can be used multiple times)
- --easycache enable EasyCache for DiT models with optional "threshold,start_percent,end_percent" (default: 0.2,0.15,0.95)
-```
\ No newline at end of file
+ --cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level)
+ --cache-option named cache params (key=value format, comma-separated). easycache/ucache:
+ threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples:
+ "threshold=0.25" or "threshold=1.5,reset=0"
+ --cache-preset cache-dit preset: 'slow'/'s', 'medium'/'m', 'fast'/'f', 'ultra'/'u'
+ --scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
+ --scm-policy SCM policy: 'dynamic' (default) or 'static'
+```
diff --git a/examples/server/main.cpp b/examples/server/main.cpp
index 39359fbb..cc9e66cc 100644
--- a/examples/server/main.cpp
+++ b/examples/server/main.cpp
@@ -44,7 +44,7 @@ inline bool is_base64(unsigned char c) {
}
std::vector base64_decode(const std::string& encoded_string) {
- int in_len = encoded_string.size();
+ int in_len = static_cast(encoded_string.size());
int i = 0;
int j = 0;
int in_ = 0;
@@ -86,27 +86,13 @@ std::vector base64_decode(const std::string& encoded_string) {
return ret;
}
-std::string iso_timestamp_now() {
- using namespace std::chrono;
- auto now = system_clock::now();
- std::time_t t = system_clock::to_time_t(now);
- std::tm tm{};
-#ifdef _MSC_VER
- gmtime_s(&tm, &t);
-#else
- gmtime_r(&t, &tm);
-#endif
- std::ostringstream oss;
- oss << std::put_time(&tm, "%Y-%m-%dT%H:%M:%SZ");
- return oss.str();
-}
-
struct SDSvrParams {
std::string listen_ip = "127.0.0.1";
int listen_port = 1234;
- bool normal_exit = false;
- bool verbose = false;
- bool color = false;
+ std::string serve_html_path;
+ bool normal_exit = false;
+ bool verbose = false;
+ bool color = false;
ArgOptions get_options() {
ArgOptions options;
@@ -115,7 +101,11 @@ struct SDSvrParams {
{"-l",
"--listen-ip",
"server listen ip (default: 127.0.0.1)",
- &listen_ip}};
+ &listen_ip},
+ {"",
+ "--serve-html-path",
+ "path to HTML file to serve at root (optional)",
+ &serve_html_path}};
options.int_options = {
{"",
@@ -159,6 +149,11 @@ struct SDSvrParams {
LOG_ERROR("error: listen_port should be in the range [0, 65535]");
return false;
}
+
+ if (!serve_html_path.empty() && !fs::exists(serve_html_path)) {
+ LOG_ERROR("error: serve_html_path file does not exist: %s", serve_html_path.c_str());
+ return false;
+ }
return true;
}
@@ -167,6 +162,7 @@ struct SDSvrParams {
oss << "SDSvrParams {\n"
<< " listen_ip: " << listen_ip << ",\n"
<< " listen_port: \"" << listen_port << "\",\n"
+ << " serve_html_path: \"" << serve_html_path << "\",\n"
<< "}";
return oss.str();
}
@@ -191,12 +187,18 @@ void parse_args(int argc, const char** argv, SDSvrParams& svr_params, SDContextP
exit(svr_params.normal_exit ? 0 : 1);
}
+ const bool random_seed_requested = default_gen_params.seed < 0;
+
if (!svr_params.process_and_check() ||
!ctx_params.process_and_check(IMG_GEN) ||
!default_gen_params.process_and_check(IMG_GEN, ctx_params.lora_model_dir)) {
print_usage(argc, argv, options_vec);
exit(1);
}
+
+ if (random_seed_requested) {
+ default_gen_params.seed = -1;
+ }
}
std::string extract_and_remove_sd_cpp_extra_args(std::string& text) {
@@ -261,6 +263,24 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) {
log_print(level, log, svr_params->verbose, svr_params->color);
}
+struct LoraEntry {
+ std::string name;
+ std::string path;
+ std::string fullpath;
+};
+
+void free_results(sd_image_t* result_images, int num_results) {
+ if (result_images) {
+ for (int i = 0; i < num_results; ++i) {
+ if (result_images[i].data) {
+ stbi_image_free(result_images[i].data);
+ result_images[i].data = nullptr;
+ }
+ }
+ }
+ free(result_images);
+}
+
int main(int argc, const char** argv) {
if (argc > 1 && std::string(argv[1]) == "--version") {
std::cout << version_string() << "\n";
@@ -291,6 +311,56 @@ int main(int argc, const char** argv) {
std::mutex sd_ctx_mutex;
+ std::vector lora_cache;
+ std::mutex lora_mutex;
+
+ auto refresh_lora_cache = [&]() {
+ std::vector new_cache;
+
+ fs::path lora_dir = ctx_params.lora_model_dir;
+ if (fs::exists(lora_dir) && fs::is_directory(lora_dir)) {
+ auto is_lora_ext = [](const fs::path& p) {
+ auto ext = p.extension().string();
+ std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
+ return ext == ".gguf" || ext == ".pt" || ext == ".pth" || ext == ".safetensors";
+ };
+
+ for (auto& entry : fs::recursive_directory_iterator(lora_dir)) {
+ if (!entry.is_regular_file())
+ continue;
+ const fs::path& p = entry.path();
+ if (!is_lora_ext(p))
+ continue;
+
+ LoraEntry e;
+ e.name = p.stem().u8string();
+ e.fullpath = p.u8string();
+ std::string rel = p.lexically_relative(lora_dir).u8string();
+ std::replace(rel.begin(), rel.end(), '\\', '/');
+ e.path = rel;
+
+ new_cache.push_back(std::move(e));
+ }
+ }
+
+ std::sort(new_cache.begin(), new_cache.end(),
+ [](const LoraEntry& a, const LoraEntry& b) {
+ return a.path < b.path;
+ });
+
+ {
+ std::lock_guard lock(lora_mutex);
+ lora_cache = std::move(new_cache);
+ }
+ };
+
+ auto get_lora_full_path = [&](const std::string& path) -> std::string {
+ std::lock_guard lock(lora_mutex);
+ auto it = std::find_if(lora_cache.begin(), lora_cache.end(),
+ [&](const LoraEntry& e) { return e.path == path; });
+ return (it != lora_cache.end()) ? it->fullpath : "";
+ };
+
httplib::Server svr;
svr.set_pre_routing_handler([](const httplib::Request& req, httplib::Response& res) {
@@ -310,9 +380,20 @@ int main(int argc, const char** argv) {
return httplib::Server::HandlerResponse::Unhandled;
});
- // health
+ // root
svr.Get("/", [&](const httplib::Request&, httplib::Response& res) {
- res.set_content(R"({"ok":true,"service":"sd-cpp-http"})", "application/json");
+ if (!svr_params.serve_html_path.empty()) {
+ std::ifstream file(svr_params.serve_html_path);
+ if (file) {
+ std::string content((std::istreambuf_iterator(file)), std::istreambuf_iterator());
+ res.set_content(content, "text/html");
+ } else {
+ res.status = 500;
+ res.set_content("Error: Unable to read HTML file", "text/plain");
+ }
+ } else {
+ res.set_content("Stable Diffusion Server is running", "text/plain");
+ }
});
// models endpoint (minimal)
@@ -338,8 +419,8 @@ int main(int argc, const char** argv) {
std::string size = j.value("size", "");
std::string output_format = j.value("output_format", "png");
int output_compression = j.value("output_compression", 100);
- int width = 512;
- int height = 512;
+ int width = default_gen_params.width > 0 ? default_gen_params.width : 512;
+ int height = default_gen_params.width > 0 ? default_gen_params.height : 512;
if (!size.empty()) {
auto pos = size.find('x');
if (pos != std::string::npos) {
@@ -376,7 +457,7 @@ int main(int argc, const char** argv) {
}
json out;
- out["created"] = iso_timestamp_now();
+ out["created"] = static_cast(std::time(nullptr));
out["data"] = json::array();
out["output_format"] = output_format;
@@ -392,6 +473,9 @@ int main(int argc, const char** argv) {
return;
}
+ if (gen_params.sample_params.sample_steps > 100)
+ gen_params.sample_params.sample_steps = 100;
+
if (!gen_params.process_and_check(IMG_GEN, "")) {
res.status = 400;
res.set_content(R"({"error":"invalid params"})", "application/json");
@@ -432,7 +516,7 @@ int main(int argc, const char** argv) {
gen_params.pm_style_strength,
}, // pm_params
ctx_params.vae_tiling_params,
- gen_params.easycache_params,
+ gen_params.cache_params,
};
sd_image_t* results = nullptr;
@@ -465,6 +549,7 @@ int main(int argc, const char** argv) {
item["b64_json"] = b64;
out["data"].push_back(item);
}
+ free_results(results, num_results);
res.set_content(out.dump(), "application/json");
res.status = 200;
@@ -495,8 +580,9 @@ int main(int argc, const char** argv) {
std::string sd_cpp_extra_args_str = extract_and_remove_sd_cpp_extra_args(prompt);
- size_t image_count = req.form.get_file_count("image[]");
- if (image_count == 0) {
+ size_t image_count = req.form.get_file_count("image[]");
+ bool has_legacy_image = req.form.has_file("image");
+ if (image_count == 0 && !has_legacy_image) {
res.status = 400;
res.set_content(R"({"error":"at least one image[] required"})", "application/json");
return;
@@ -507,9 +593,13 @@ int main(int argc, const char** argv) {
auto file = req.form.get_file("image[]", i);
images_bytes.emplace_back(file.content.begin(), file.content.end());
}
+ if (image_count == 0 && has_legacy_image) {
+ auto file = req.form.get_file("image");
+ images_bytes.emplace_back(file.content.begin(), file.content.end());
+ }
std::vector mask_bytes;
- if (req.form.has_field("mask")) {
+ if (req.form.has_file("mask")) {
auto file = req.form.get_file("mask");
mask_bytes.assign(file.content.begin(), file.content.end());
}
@@ -524,7 +614,7 @@ int main(int argc, const char** argv) {
n = std::clamp(n, 1, 8);
std::string size = req.form.get_field("size");
- int width = 512, height = 512;
+ int width = -1, height = -1;
if (!size.empty()) {
auto pos = size.find('x');
if (pos != std::string::npos) {
@@ -570,6 +660,9 @@ int main(int argc, const char** argv) {
return;
}
+ if (gen_params.sample_params.sample_steps > 100)
+ gen_params.sample_params.sample_steps = 100;
+
if (!gen_params.process_and_check(IMG_GEN, "")) {
res.status = 400;
res.set_content(R"({"error":"invalid params"})", "application/json");
@@ -578,18 +671,34 @@ int main(int argc, const char** argv) {
LOG_DEBUG("%s\n", gen_params.to_string().c_str());
- sd_image_t init_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr};
- sd_image_t control_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr};
+ sd_image_t init_image = {0, 0, 3, nullptr};
+ sd_image_t control_image = {0, 0, 3, nullptr};
std::vector pmid_images;
+ auto get_resolved_width = [&gen_params, &default_gen_params]() -> int {
+ if (gen_params.width > 0)
+ return gen_params.width;
+ if (default_gen_params.width > 0)
+ return default_gen_params.width;
+ return 512;
+ };
+ auto get_resolved_height = [&gen_params, &default_gen_params]() -> int {
+ if (gen_params.height > 0)
+ return gen_params.height;
+ if (default_gen_params.height > 0)
+ return default_gen_params.height;
+ return 512;
+ };
+
std::vector ref_images;
ref_images.reserve(images_bytes.size());
for (auto& bytes : images_bytes) {
- int img_w = width;
- int img_h = height;
+ int img_w;
+ int img_h;
+
uint8_t* raw_pixels = load_image_from_memory(
reinterpret_cast(bytes.data()),
- bytes.size(),
+ static_cast(bytes.size()),
img_w, img_h,
width, height, 3);
@@ -598,22 +707,31 @@ int main(int argc, const char** argv) {
}
sd_image_t img{(uint32_t)img_w, (uint32_t)img_h, 3, raw_pixels};
+ gen_params.set_width_and_height_if_unset(img.width, img.height);
ref_images.push_back(img);
}
sd_image_t mask_image = {0};
if (!mask_bytes.empty()) {
- int mask_w = width;
- int mask_h = height;
+ int expected_width = 0;
+ int expected_height = 0;
+ if (gen_params.width_and_height_are_set()) {
+ expected_width = gen_params.width;
+ expected_height = gen_params.height;
+ }
+ int mask_w;
+ int mask_h;
+
uint8_t* mask_raw = load_image_from_memory(
reinterpret_cast(mask_bytes.data()),
- mask_bytes.size(),
+ static_cast(mask_bytes.size()),
mask_w, mask_h,
- width, height, 1);
+ expected_width, expected_height, 1);
mask_image = {(uint32_t)mask_w, (uint32_t)mask_h, 1, mask_raw};
+ gen_params.set_width_and_height_if_unset(mask_image.width, mask_image.height);
} else {
- mask_image.width = width;
- mask_image.height = height;
+ mask_image.width = get_resolved_width();
+ mask_image.height = get_resolved_height();
mask_image.channel = 1;
mask_image.data = nullptr;
}
@@ -630,8 +748,8 @@ int main(int argc, const char** argv) {
gen_params.auto_resize_ref_image,
gen_params.increase_ref_index,
mask_image,
- gen_params.width,
- gen_params.height,
+ get_resolved_width(),
+ get_resolved_height(),
gen_params.sample_params,
gen_params.strength,
gen_params.seed,
@@ -645,7 +763,7 @@ int main(int argc, const char** argv) {
gen_params.pm_style_strength,
}, // pm_params
ctx_params.vae_tiling_params,
- gen_params.easycache_params,
+ gen_params.cache_params,
};
sd_image_t* results = nullptr;
@@ -658,7 +776,7 @@ int main(int argc, const char** argv) {
}
json out;
- out["created"] = iso_timestamp_now();
+ out["created"] = static_cast(std::time(nullptr));
out["data"] = json::array();
out["output_format"] = output_format;
@@ -676,6 +794,7 @@ int main(int argc, const char** argv) {
item["b64_json"] = b64;
out["data"].push_back(item);
}
+ free_results(results, num_results);
res.set_content(out.dump(), "application/json");
res.status = 200;
@@ -698,6 +817,408 @@ int main(int argc, const char** argv) {
}
});
+ // sdapi endpoints (AUTOMATIC1111 / Forge)
+
+ auto sdapi_any2img = [&](const httplib::Request& req, httplib::Response& res, bool img2img) {
+ try {
+ if (req.body.empty()) {
+ res.status = 400;
+ res.set_content(R"({"error":"empty body"})", "application/json");
+ return;
+ }
+
+ json j = json::parse(req.body);
+
+ std::string prompt = j.value("prompt", "");
+ std::string negative_prompt = j.value("negative_prompt", "");
+ int width = j.value("width", 512);
+ int height = j.value("height", 512);
+ int steps = j.value("steps", default_gen_params.sample_params.sample_steps);
+ float cfg_scale = j.value("cfg_scale", default_gen_params.sample_params.guidance.txt_cfg);
+ int64_t seed = j.value("seed", -1);
+ int batch_size = j.value("batch_size", 1);
+ int clip_skip = j.value("clip_skip", -1);
+ std::string sampler_name = j.value("sampler_name", "");
+ std::string scheduler_name = j.value("scheduler", "");
+
+ auto bad = [&](const std::string& msg) {
+ res.status = 400;
+ res.set_content("{\"error\":\"" + msg + "\"}", "application/json");
+ return;
+ };
+
+ if (width <= 0 || height <= 0) {
+ return bad("width and height must be positive");
+ }
+
+ if (steps < 1 || steps > 150) {
+ return bad("steps must be in range [1, 150]");
+ }
+
+ if (batch_size < 1 || batch_size > 8) {
+ return bad("batch_size must be in range [1, 8]");
+ }
+
+ if (cfg_scale < 0.f) {
+ return bad("cfg_scale must be positive");
+ }
+
+ if (prompt.empty()) {
+ return bad("prompt required");
+ }
+
+ std::vector sd_loras;
+ std::vector lora_path_storage;
+
+ if (j.contains("lora") && j["lora"].is_array()) {
+ for (const auto& item : j["lora"]) {
+ if (!item.is_object()) {
+ continue;
+ }
+
+ std::string path = item.value("path", "");
+ float multiplier = item.value("multiplier", 1.0f);
+ bool is_high_noise = item.value("is_high_noise", false);
+
+ if (path.empty()) {
+ return bad("lora.path required");
+ }
+
+ std::string fullpath = get_lora_full_path(path);
+ if (fullpath.empty()) {
+ return bad("invalid lora path: " + path);
+ }
+
+ lora_path_storage.push_back(fullpath);
+ sd_lora_t l;
+ l.is_high_noise = is_high_noise;
+ l.multiplier = multiplier;
+ l.path = lora_path_storage.back().c_str();
+
+ sd_loras.push_back(l);
+ }
+ }
+
+ auto get_sample_method = [](std::string name) -> enum sample_method_t {
+ enum sample_method_t result = str_to_sample_method(name.c_str());
+ if (result != SAMPLE_METHOD_COUNT) return result;
+ // some applications use a hardcoded sampler list
+ std::transform(name.begin(), name.end(), name.begin(),
+ [](unsigned char c) { return std::tolower(c); });
+ static const std::unordered_map hardcoded{
+ {"euler a", EULER_A_SAMPLE_METHOD},
+ {"k_euler_a", EULER_A_SAMPLE_METHOD},
+ {"euler", EULER_SAMPLE_METHOD},
+ {"k_euler", EULER_SAMPLE_METHOD},
+ {"heun", HEUN_SAMPLE_METHOD},
+ {"k_heun", HEUN_SAMPLE_METHOD},
+ {"dpm2", DPM2_SAMPLE_METHOD},
+ {"k_dpm_2", DPM2_SAMPLE_METHOD},
+ {"lcm", LCM_SAMPLE_METHOD},
+ {"ddim", DDIM_TRAILING_SAMPLE_METHOD},
+ {"dpm++ 2m", DPMPP2M_SAMPLE_METHOD},
+ {"k_dpmpp_2m", DPMPP2M_SAMPLE_METHOD},
+ {"res multistep", RES_MULTISTEP_SAMPLE_METHOD},
+ {"k_res_multistep", RES_MULTISTEP_SAMPLE_METHOD},
+ {"res 2s", RES_2S_SAMPLE_METHOD},
+ {"k_res_2s", RES_2S_SAMPLE_METHOD}};
+ auto it = hardcoded.find(name);
+ if (it != hardcoded.end()) return it->second;
+ return SAMPLE_METHOD_COUNT;
+ };
+
+ enum sample_method_t sample_method = get_sample_method(sampler_name);
+
+ enum scheduler_t scheduler = str_to_scheduler(scheduler_name.c_str());
+
+ SDGenerationParams gen_params = default_gen_params;
+ gen_params.prompt = prompt;
+ gen_params.negative_prompt = negative_prompt;
+ gen_params.seed = seed;
+ gen_params.sample_params.sample_steps = steps;
+ gen_params.batch_count = batch_size;
+ gen_params.sample_params.guidance.txt_cfg = cfg_scale;
+
+ if (clip_skip > 0) {
+ gen_params.clip_skip = clip_skip;
+ }
+
+ if (sample_method != SAMPLE_METHOD_COUNT) {
+ gen_params.sample_params.sample_method = sample_method;
+ }
+
+ if (scheduler != SCHEDULER_COUNT) {
+ gen_params.sample_params.scheduler = scheduler;
+ }
+
+ // re-read to avoid applying 512 as default before the provided
+ // images and/or server command-line
+ gen_params.width = j.value("width", -1);
+ gen_params.height = j.value("height", -1);
+
+ LOG_DEBUG("%s\n", gen_params.to_string().c_str());
+
+ sd_image_t init_image = {0, 0, 3, nullptr};
+ sd_image_t control_image = {0, 0, 3, nullptr};
+ sd_image_t mask_image = {0, 0, 1, nullptr};
+ std::vector mask_data;
+ std::vector pmid_images;
+ std::vector ref_images;
+
+ auto get_resolved_width = [&gen_params, &default_gen_params]() -> int {
+ if (gen_params.width > 0)
+ return gen_params.width;
+ if (default_gen_params.width > 0)
+ return default_gen_params.width;
+ return 512;
+ };
+ auto get_resolved_height = [&gen_params, &default_gen_params]() -> int {
+ if (gen_params.height > 0)
+ return gen_params.height;
+ if (default_gen_params.height > 0)
+ return default_gen_params.height;
+ return 512;
+ };
+
+ auto decode_image = [&gen_params](sd_image_t& image, std::string encoded) -> bool {
+ // remove data URI prefix if present ("data:image/png;base64,")
+ auto comma_pos = encoded.find(',');
+ if (comma_pos != std::string::npos) {
+ encoded = encoded.substr(comma_pos + 1);
+ }
+ std::vector img_data = base64_decode(encoded);
+ if (!img_data.empty()) {
+ int expected_width = 0;
+ int expected_height = 0;
+ if (gen_params.width_and_height_are_set()) {
+ expected_width = gen_params.width;
+ expected_height = gen_params.height;
+ }
+ int img_w;
+ int img_h;
+
+ uint8_t* raw_data = load_image_from_memory(
+ (const char*)img_data.data(), (int)img_data.size(),
+ img_w, img_h,
+ expected_width, expected_height, image.channel);
+ if (raw_data) {
+ image = {(uint32_t)img_w, (uint32_t)img_h, image.channel, raw_data};
+ gen_params.set_width_and_height_if_unset(image.width, image.height);
+ return true;
+ }
+ }
+ return false;
+ };
+
+ if (img2img) {
+ if (j.contains("init_images") && j["init_images"].is_array() && !j["init_images"].empty()) {
+ std::string encoded = j["init_images"][0].get();
+ decode_image(init_image, encoded);
+ }
+
+ if (j.contains("mask") && j["mask"].is_string()) {
+ std::string encoded = j["mask"].get();
+ decode_image(mask_image, encoded);
+ bool inpainting_mask_invert = j.value("inpainting_mask_invert", 0) != 0;
+ if (inpainting_mask_invert && mask_image.data != nullptr) {
+ for (uint32_t i = 0; i < mask_image.width * mask_image.height; i++) {
+ mask_image.data[i] = 255 - mask_image.data[i];
+ }
+ }
+ } else {
+ int m_width = get_resolved_width();
+ int m_height = get_resolved_height();
+ mask_data = std::vector(m_width * m_height, 255);
+ mask_image.width = m_width;
+ mask_image.height = m_height;
+ mask_image.channel = 1;
+ mask_image.data = mask_data.data();
+ }
+
+ float denoising_strength = j.value("denoising_strength", -1.f);
+ if (denoising_strength >= 0.f) {
+ denoising_strength = std::min(denoising_strength, 1.0f);
+ gen_params.strength = denoising_strength;
+ }
+ }
+
+ if (j.contains("extra_images") && j["extra_images"].is_array()) {
+ for (auto extra_image : j["extra_images"]) {
+ std::string encoded = extra_image.get();
+ sd_image_t tmp_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr};
+ if (decode_image(tmp_image, encoded)) {
+ ref_images.push_back(tmp_image);
+ }
+ }
+ }
+
+ sd_img_gen_params_t img_gen_params = {
+ sd_loras.data(),
+ static_cast(sd_loras.size()),
+ gen_params.prompt.c_str(),
+ gen_params.negative_prompt.c_str(),
+ gen_params.clip_skip,
+ init_image,
+ ref_images.data(),
+ (int)ref_images.size(),
+ gen_params.auto_resize_ref_image,
+ gen_params.increase_ref_index,
+ mask_image,
+ get_resolved_width(),
+ get_resolved_height(),
+ gen_params.sample_params,
+ gen_params.strength,
+ gen_params.seed,
+ gen_params.batch_count,
+ control_image,
+ gen_params.control_strength,
+ {
+ pmid_images.data(),
+ (int)pmid_images.size(),
+ gen_params.pm_id_embed_path.c_str(),
+ gen_params.pm_style_strength,
+ }, // pm_params
+ ctx_params.vae_tiling_params,
+ gen_params.cache_params,
+ };
+
+ sd_image_t* results = nullptr;
+ int num_results = 0;
+
+ {
+ std::lock_guard lock(sd_ctx_mutex);
+ results = generate_image(sd_ctx, &img_gen_params);
+ num_results = gen_params.batch_count;
+ }
+
+ json out;
+ out["images"] = json::array();
+ out["parameters"] = j; // TODO should return changed defaults
+ out["info"] = "";
+
+ for (int i = 0; i < num_results; i++) {
+ if (results[i].data == nullptr) {
+ continue;
+ }
+
+ auto image_bytes = write_image_to_vector(ImageFormat::PNG,
+ results[i].data,
+ results[i].width,
+ results[i].height,
+ results[i].channel);
+
+ if (image_bytes.empty()) {
+ LOG_ERROR("write image to mem failed");
+ continue;
+ }
+
+ std::string b64 = base64_encode(image_bytes);
+ out["images"].push_back(b64);
+ }
+ free_results(results, num_results);
+
+ res.set_content(out.dump(), "application/json");
+ res.status = 200;
+
+ if (init_image.data) {
+ stbi_image_free(init_image.data);
+ }
+ if (mask_image.data && mask_data.empty()) {
+ stbi_image_free(mask_image.data);
+ }
+ for (auto ref_image : ref_images) {
+ stbi_image_free(ref_image.data);
+ }
+
+ } catch (const std::exception& e) {
+ res.status = 500;
+ json err;
+ err["error"] = "server_error";
+ err["message"] = e.what();
+ res.set_content(err.dump(), "application/json");
+ }
+ };
+
+ svr.Post("/sdapi/v1/txt2img", [&](const httplib::Request& req, httplib::Response& res) {
+ sdapi_any2img(req, res, false);
+ });
+
+ svr.Post("/sdapi/v1/img2img", [&](const httplib::Request& req, httplib::Response& res) {
+ sdapi_any2img(req, res, true);
+ });
+
+ svr.Get("/sdapi/v1/loras", [&](const httplib::Request&, httplib::Response& res) {
+ refresh_lora_cache();
+
+ json result = json::array();
+ {
+ std::lock_guard lock(lora_mutex);
+ for (const auto& e : lora_cache) {
+ json item;
+ item["name"] = e.name;
+ item["path"] = e.path;
+ result.push_back(item);
+ }
+ }
+
+ res.set_content(result.dump(), "application/json");
+ });
+
+ svr.Get("/sdapi/v1/samplers", [&](const httplib::Request&, httplib::Response& res) {
+ std::vector sampler_names;
+ sampler_names.push_back("default");
+ for (int i = 0; i < SAMPLE_METHOD_COUNT; i++) {
+ sampler_names.push_back(sd_sample_method_name((sample_method_t)i));
+ }
+ json r = json::array();
+ for (auto name : sampler_names) {
+ json entry;
+ entry["name"] = name;
+ entry["aliases"] = json::array({name});
+ entry["options"] = json::object();
+ r.push_back(entry);
+ }
+ res.set_content(r.dump(), "application/json");
+ });
+
+ svr.Get("/sdapi/v1/schedulers", [&](const httplib::Request&, httplib::Response& res) {
+ std::vector scheduler_names;
+ scheduler_names.push_back("default");
+ for (int i = 0; i < SCHEDULER_COUNT; i++) {
+ scheduler_names.push_back(sd_scheduler_name((scheduler_t)i));
+ }
+ json r = json::array();
+ for (auto name : scheduler_names) {
+ json entry;
+ entry["name"] = name;
+ entry["label"] = name;
+ r.push_back(entry);
+ }
+ res.set_content(r.dump(), "application/json");
+ });
+
+ svr.Get("/sdapi/v1/sd-models", [&](const httplib::Request&, httplib::Response& res) {
+ fs::path model_path = ctx_params.model_path;
+ json entry;
+ entry["title"] = model_path.stem();
+ entry["model_name"] = model_path.stem();
+ entry["filename"] = model_path.filename();
+ entry["hash"] = "8888888888";
+ entry["sha256"] = "8888888888888888888888888888888888888888888888888888888888888888";
+ entry["config"] = nullptr;
+ json r = json::array();
+ r.push_back(entry);
+ res.set_content(r.dump(), "application/json");
+ });
+
+ svr.Get("/sdapi/v1/options", [&](const httplib::Request&, httplib::Response& res) {
+ fs::path model_path = ctx_params.model_path;
+ json r;
+ r["samples_format"] = "png";
+ r["sd_model_checkpoint"] = model_path.stem();
+ res.set_content(r.dump(), "application/json");
+ });
+
LOG_INFO("listening on: %s:%d\n", svr_params.listen_ip.c_str(), svr_params.listen_port);
svr.listen(svr_params.listen_ip, svr_params.listen_port);
diff --git a/format-code.sh b/format-code.sh
index d2a75bdc..ac5fd340 100644
--- a/format-code.sh
+++ b/format-code.sh
@@ -1,4 +1,4 @@
-for f in *.cpp *.h *.hpp examples/cli/*.cpp examples/common/*.hpp examples/cli/*.h examples/server/*.cpp; do
+for f in src/*.cpp src/*.h src/*.hpp src/vocab/*.h src/vocab/*.cpp examples/cli/*.cpp examples/common/*.hpp examples/cli/*.h examples/server/*.cpp; do
[[ "$f" == vocab* ]] && continue
echo "formatting '$f'"
# if [ "$f" != "stable-diffusion.h" ]; then
diff --git a/ggml b/ggml
index f5425c0e..a8db410a 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit f5425c0ee5e582a7d64411f06139870bff3e52e0
+Subproject commit a8db410a252c8c8f2d120c6f2e7133ebe032f35d
diff --git a/stable-diffusion.h b/include/stable-diffusion.h
similarity index 89%
rename from stable-diffusion.h
rename to include/stable-diffusion.h
index 9266ba43..51b2b329 100644
--- a/stable-diffusion.h
+++ b/include/stable-diffusion.h
@@ -48,6 +48,8 @@ enum sample_method_t {
LCM_SAMPLE_METHOD,
DDIM_TRAILING_SAMPLE_METHOD,
TCD_SAMPLE_METHOD,
+ RES_MULTISTEP_SAMPLE_METHOD,
+ RES_2S_SAMPLE_METHOD,
SAMPLE_METHOD_COUNT
};
@@ -60,7 +62,9 @@ enum scheduler_t {
SGM_UNIFORM_SCHEDULER,
SIMPLE_SCHEDULER,
SMOOTHSTEP_SCHEDULER,
+ KL_OPTIMAL_SCHEDULER,
LCM_SCHEDULER,
+ BONG_TANGENT_SCHEDULER,
SCHEDULER_COUNT
};
@@ -181,18 +185,22 @@ typedef struct {
enum prediction_t prediction;
enum lora_apply_mode_t lora_apply_mode;
bool offload_params_to_cpu;
+ bool enable_mmap;
bool keep_clip_on_cpu;
bool keep_control_net_on_cpu;
bool keep_vae_on_cpu;
+ bool flash_attn;
bool diffusion_flash_attn;
bool tae_preview_only;
bool diffusion_conv_direct;
bool vae_conv_direct;
+ bool circular_x;
+ bool circular_y;
bool force_sdxl_vae_conv_scale;
bool chroma_use_dit_mask;
bool chroma_use_t5_mask;
int chroma_t5_mask_pad;
- float flow_shift;
+ bool qwen_image_zero_cond_t;
} sd_ctx_params_t;
typedef struct {
@@ -226,6 +234,7 @@ typedef struct {
int shifted_timestep;
float* custom_sigmas;
int custom_sigmas_count;
+ float flow_shift;
} sd_sample_params_t;
typedef struct {
@@ -235,12 +244,34 @@ typedef struct {
float style_strength;
} sd_pm_params_t; // photo maker
+enum sd_cache_mode_t {
+ SD_CACHE_DISABLED = 0,
+ SD_CACHE_EASYCACHE,
+ SD_CACHE_UCACHE,
+ SD_CACHE_DBCACHE,
+ SD_CACHE_TAYLORSEER,
+ SD_CACHE_CACHE_DIT,
+};
+
typedef struct {
- bool enabled;
+ enum sd_cache_mode_t mode;
float reuse_threshold;
float start_percent;
float end_percent;
-} sd_easycache_params_t;
+ float error_decay_rate;
+ bool use_relative_threshold;
+ bool reset_error_on_compute;
+ int Fn_compute_blocks;
+ int Bn_compute_blocks;
+ float residual_diff_threshold;
+ int max_warmup_steps;
+ int max_cached_steps;
+ int max_continuous_cached_steps;
+ int taylorseer_n_derivatives;
+ int taylorseer_skip_interval;
+ const char* scm_mask;
+ bool scm_policy_dynamic;
+} sd_cache_params_t;
typedef struct {
bool is_high_noise;
@@ -270,7 +301,7 @@ typedef struct {
float control_strength;
sd_pm_params_t pm_params;
sd_tiling_params_t vae_tiling_params;
- sd_easycache_params_t easycache;
+ sd_cache_params_t cache;
} sd_img_gen_params_t;
typedef struct {
@@ -292,7 +323,8 @@ typedef struct {
int64_t seed;
int video_frames;
float vace_strength;
- sd_easycache_params_t easycache;
+ sd_tiling_params_t vae_tiling_params;
+ sd_cache_params_t cache;
} sd_vid_gen_params_t;
typedef struct sd_ctx_t sd_ctx_t;
@@ -322,7 +354,7 @@ SD_API enum preview_t str_to_preview(const char* str);
SD_API const char* sd_lora_apply_mode_name(enum lora_apply_mode_t mode);
SD_API enum lora_apply_mode_t str_to_lora_apply_mode(const char* str);
-SD_API void sd_easycache_params_init(sd_easycache_params_t* easycache_params);
+SD_API void sd_cache_params_init(sd_cache_params_t* cache_params);
SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params);
SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);
@@ -334,7 +366,7 @@ SD_API void sd_sample_params_init(sd_sample_params_t* sample_params);
SD_API char* sd_sample_params_to_str(const sd_sample_params_t* sample_params);
SD_API enum sample_method_t sd_get_default_sample_method(const sd_ctx_t* sd_ctx);
-SD_API enum scheduler_t sd_get_default_scheduler(const sd_ctx_t* sd_ctx);
+SD_API enum scheduler_t sd_get_default_scheduler(const sd_ctx_t* sd_ctx, enum sample_method_t sample_method);
SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params);
SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params);
@@ -362,7 +394,8 @@ SD_API bool convert(const char* input_path,
const char* vae_path,
const char* output_path,
enum sd_type_t output_type,
- const char* tensor_type_rules);
+ const char* tensor_type_rules,
+ bool convert_name);
SD_API bool preprocess_canny(sd_image_t image,
float high_threshold,
diff --git a/face_detect.py b/script/face_detect.py
similarity index 97%
rename from face_detect.py
rename to script/face_detect.py
index 7131af31..e7a3eae1 100644
--- a/face_detect.py
+++ b/script/face_detect.py
@@ -1,88 +1,88 @@
-import os
-import sys
-
-import numpy as np
-import torch
-from diffusers.utils import load_image
-# pip install insightface==0.7.3
-from insightface.app import FaceAnalysis
-from insightface.data import get_image as ins_get_image
-from safetensors.torch import save_file
-
-###
-# https://github.com/cubiq/ComfyUI_IPAdapter_plus/issues/165#issue-2055829543
-###
-class FaceAnalysis2(FaceAnalysis):
- # NOTE: allows setting det_size for each detection call.
- # the model allows it but the wrapping code from insightface
- # doesn't show it, and people end up loading duplicate models
- # for different sizes where there is absolutely no need to
- def get(self, img, max_num=0, det_size=(640, 640)):
- if det_size is not None:
- self.det_model.input_size = det_size
-
- return super().get(img, max_num)
-
-def analyze_faces(face_analysis: FaceAnalysis, img_data: np.ndarray, det_size=(640, 640)):
- # NOTE: try detect faces, if no faces detected, lower det_size until it does
- detection_sizes = [None] + [(size, size) for size in range(640, 256, -64)] + [(256, 256)]
-
- for size in detection_sizes:
- faces = face_analysis.get(img_data, det_size=size)
- if len(faces) > 0:
- return faces
-
- return []
-
-if __name__ == "__main__":
- #face_detector = FaceAnalysis2(providers=['CUDAExecutionProvider'], allowed_modules=['detection', 'recognition'])
- face_detector = FaceAnalysis2(providers=['CPUExecutionProvider'], allowed_modules=['detection', 'recognition'])
- face_detector.prepare(ctx_id=0, det_size=(640, 640))
- #input_folder_name = './scarletthead_woman'
- input_folder_name = sys.argv[1]
- image_basename_list = os.listdir(input_folder_name)
- image_path_list = sorted([os.path.join(input_folder_name, basename) for basename in image_basename_list])
-
- input_id_images = []
- for image_path in image_path_list:
- input_id_images.append(load_image(image_path))
-
- id_embed_list = []
-
- for img in input_id_images:
- img = np.array(img)
- img = img[:, :, ::-1]
- faces = analyze_faces(face_detector, img)
- if len(faces) > 0:
- id_embed_list.append(torch.from_numpy((faces[0]['embedding'])))
-
- if len(id_embed_list) == 0:
- raise ValueError(f"No face detected in input image pool")
-
- id_embeds = torch.stack(id_embed_list)
-
- # for r in id_embeds:
- # print(r)
- # #torch.save(id_embeds, input_folder_name+'/id_embeds.pt');
- # weights = dict()
- # weights["id_embeds"] = id_embeds
- # save_file(weights, input_folder_name+'/id_embeds.safetensors')
-
- binary_data = id_embeds.numpy().tobytes()
- two = 4
- zero = 0
- one = 1
- tensor_name = "id_embeds"
-# Write binary data to a file
- with open(input_folder_name+'/id_embeds.bin', "wb") as f:
- f.write(two.to_bytes(4, byteorder='little'))
- f.write((len(tensor_name)).to_bytes(4, byteorder='little'))
- f.write(zero.to_bytes(4, byteorder='little'))
- f.write((id_embeds.shape[1]).to_bytes(4, byteorder='little'))
- f.write((id_embeds.shape[0]).to_bytes(4, byteorder='little'))
- f.write(one.to_bytes(4, byteorder='little'))
- f.write(one.to_bytes(4, byteorder='little'))
- f.write(tensor_name.encode('ascii'))
- f.write(binary_data)
-
+import os
+import sys
+
+import numpy as np
+import torch
+from diffusers.utils import load_image
+# pip install insightface==0.7.3
+from insightface.app import FaceAnalysis
+from insightface.data import get_image as ins_get_image
+from safetensors.torch import save_file
+
+###
+# https://github.com/cubiq/ComfyUI_IPAdapter_plus/issues/165#issue-2055829543
+###
+class FaceAnalysis2(FaceAnalysis):
+ # NOTE: allows setting det_size for each detection call.
+ # the model allows it but the wrapping code from insightface
+ # doesn't show it, and people end up loading duplicate models
+ # for different sizes where there is absolutely no need to
+ def get(self, img, max_num=0, det_size=(640, 640)):
+ if det_size is not None:
+ self.det_model.input_size = det_size
+
+ return super().get(img, max_num)
+
+def analyze_faces(face_analysis: FaceAnalysis, img_data: np.ndarray, det_size=(640, 640)):
+ # NOTE: try detect faces, if no faces detected, lower det_size until it does
+ detection_sizes = [None] + [(size, size) for size in range(640, 256, -64)] + [(256, 256)]
+
+ for size in detection_sizes:
+ faces = face_analysis.get(img_data, det_size=size)
+ if len(faces) > 0:
+ return faces
+
+ return []
+
+if __name__ == "__main__":
+ #face_detector = FaceAnalysis2(providers=['CUDAExecutionProvider'], allowed_modules=['detection', 'recognition'])
+ face_detector = FaceAnalysis2(providers=['CPUExecutionProvider'], allowed_modules=['detection', 'recognition'])
+ face_detector.prepare(ctx_id=0, det_size=(640, 640))
+ #input_folder_name = './scarletthead_woman'
+ input_folder_name = sys.argv[1]
+ image_basename_list = os.listdir(input_folder_name)
+ image_path_list = sorted([os.path.join(input_folder_name, basename) for basename in image_basename_list])
+
+ input_id_images = []
+ for image_path in image_path_list:
+ input_id_images.append(load_image(image_path))
+
+ id_embed_list = []
+
+ for img in input_id_images:
+ img = np.array(img)
+ img = img[:, :, ::-1]
+ faces = analyze_faces(face_detector, img)
+ if len(faces) > 0:
+ id_embed_list.append(torch.from_numpy((faces[0]['embedding'])))
+
+ if len(id_embed_list) == 0:
+ raise ValueError(f"No face detected in input image pool")
+
+ id_embeds = torch.stack(id_embed_list)
+
+ # for r in id_embeds:
+ # print(r)
+ # #torch.save(id_embeds, input_folder_name+'/id_embeds.pt');
+ # weights = dict()
+ # weights["id_embeds"] = id_embeds
+ # save_file(weights, input_folder_name+'/id_embeds.safetensors')
+
+ binary_data = id_embeds.numpy().tobytes()
+ two = 4
+ zero = 0
+ one = 1
+ tensor_name = "id_embeds"
+# Write binary data to a file
+ with open(input_folder_name+'/id_embeds.bin', "wb") as f:
+ f.write(two.to_bytes(4, byteorder='little'))
+ f.write((len(tensor_name)).to_bytes(4, byteorder='little'))
+ f.write(zero.to_bytes(4, byteorder='little'))
+ f.write((id_embeds.shape[1]).to_bytes(4, byteorder='little'))
+ f.write((id_embeds.shape[0]).to_bytes(4, byteorder='little'))
+ f.write(one.to_bytes(4, byteorder='little'))
+ f.write(one.to_bytes(4, byteorder='little'))
+ f.write(tensor_name.encode('ascii'))
+ f.write(binary_data)
+
\ No newline at end of file
diff --git a/src/anima.hpp b/src/anima.hpp
new file mode 100644
index 00000000..191a096d
--- /dev/null
+++ b/src/anima.hpp
@@ -0,0 +1,686 @@
+#ifndef __ANIMA_HPP__
+#define __ANIMA_HPP__
+
#include <cinttypes>
#include <cmath>
#include <string>
#include <vector>
+
+#include "common_block.hpp"
+#include "flux.hpp"
+#include "rope.hpp"
+
+namespace Anima {
+ constexpr int ANIMA_GRAPH_SIZE = 65536;
+
+ __STATIC_INLINE__ struct ggml_tensor* apply_gate(struct ggml_context* ctx,
+ struct ggml_tensor* x,
+ struct ggml_tensor* gate) {
+ gate = ggml_reshape_3d(ctx, gate, gate->ne[0], 1, gate->ne[1]); // [N, 1, C]
+ return ggml_mul(ctx, x, gate);
+ }
+
+ struct XEmbedder : public GGMLBlock {
+ public:
+ XEmbedder(int64_t in_dim, int64_t out_dim) {
+ blocks["proj.1"] = std::make_shared(in_dim, out_dim, false);
+ }
+
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+ auto proj = std::dynamic_pointer_cast(blocks["proj.1"]);
+ return proj->forward(ctx, x);
+ }
+ };
+
+ struct TimestepEmbedder : public GGMLBlock {
+ public:
+ TimestepEmbedder(int64_t in_dim, int64_t out_dim) {
+ blocks["1.linear_1"] = std::make_shared(in_dim, in_dim, false);
+ blocks["1.linear_2"] = std::make_shared(in_dim, out_dim, false);
+ }
+
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+ auto linear_1 = std::dynamic_pointer_cast(blocks["1.linear_1"]);
+ auto linear_2 = std::dynamic_pointer_cast(blocks["1.linear_2"]);
+
+ x = linear_1->forward(ctx, x);
+ x = ggml_silu_inplace(ctx->ggml_ctx, x);
+ x = linear_2->forward(ctx, x);
+ return x;
+ }
+ };
+
+ struct AdaLayerNormZero : public GGMLBlock {
+ protected:
+ int64_t in_features;
+
+ public:
+ AdaLayerNormZero(int64_t in_features, int64_t hidden_features = 256)
+ : in_features(in_features) {
+ blocks["norm"] = std::make_shared(in_features, 1e-6f, false, false);
+ blocks["1"] = std::make_shared(in_features, hidden_features, false);
+ blocks["2"] = std::make_shared(hidden_features, 3 * in_features, false);
+ }
+
+ std::pair forward(GGMLRunnerContext* ctx,
+ struct ggml_tensor* hidden_states,
+ struct ggml_tensor* embedded_timestep,
+ struct ggml_tensor* temb = nullptr) {
+ auto norm = std::dynamic_pointer_cast(blocks["norm"]);
+ auto linear_1 = std::dynamic_pointer_cast(blocks["1"]);
+ auto linear_2 = std::dynamic_pointer_cast(blocks["2"]);
+
+ auto emb = ggml_silu(ctx->ggml_ctx, embedded_timestep);
+ emb = linear_1->forward(ctx, emb);
+ emb = linear_2->forward(ctx, emb); // [N, 3*C]
+
+ if (temb != nullptr) {
+ emb = ggml_add(ctx->ggml_ctx, emb, temb);
+ }
+
+ auto emb_chunks = ggml_ext_chunk(ctx->ggml_ctx, emb, 3, 0);
+ auto shift = emb_chunks[0];
+ auto scale = emb_chunks[1];
+ auto gate = emb_chunks[2];
+
+ auto x = norm->forward(ctx, hidden_states);
+ x = Flux::modulate(ctx->ggml_ctx, x, shift, scale);
+
+ return {x, gate};
+ }
+ };
+
+ struct AdaLayerNorm : public GGMLBlock {
+ protected:
+ int64_t embedding_dim;
+
+ public:
+ AdaLayerNorm(int64_t in_features, int64_t hidden_features = 256)
+ : embedding_dim(in_features) {
+ blocks["norm"] = std::make_shared(in_features, 1e-6f, false, false);
+ blocks["1"] = std::make_shared(in_features, hidden_features, false);
+ blocks["2"] = std::make_shared(hidden_features, 2 * in_features, false);
+ }
+
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+ struct ggml_tensor* hidden_states,
+ struct ggml_tensor* embedded_timestep,
+ struct ggml_tensor* temb = nullptr) {
+ auto norm = std::dynamic_pointer_cast(blocks["norm"]);
+ auto linear_1 = std::dynamic_pointer_cast(blocks["1"]);
+ auto linear_2 = std::dynamic_pointer_cast(blocks["2"]);
+
+ auto emb = ggml_silu(ctx->ggml_ctx, embedded_timestep);
+ emb = linear_1->forward(ctx, emb);
+ emb = linear_2->forward(ctx, emb); // [N, 2*C]
+
+ if (temb != nullptr) {
+ auto temb_2c = ggml_view_2d(ctx->ggml_ctx, temb, 2 * embedding_dim, temb->ne[1], temb->nb[1], 0);
+ emb = ggml_add(ctx->ggml_ctx, emb, temb_2c);
+ }
+
+ auto emb_chunks = ggml_ext_chunk(ctx->ggml_ctx, emb, 2, 0);
+ auto shift = emb_chunks[0];
+ auto scale = emb_chunks[1];
+
+ auto x = norm->forward(ctx, hidden_states);
+ x = Flux::modulate(ctx->ggml_ctx, x, shift, scale);
+ return x;
+ }
+ };
+
+ struct AnimaAttention : public GGMLBlock {
+ protected:
+ int64_t num_heads;
+ int64_t head_dim;
+ std::string out_proj_name;
+
+ public:
+ AnimaAttention(int64_t query_dim,
+ int64_t context_dim,
+ int64_t num_heads,
+ int64_t head_dim,
+ const std::string& out_proj_name = "output_proj")
+ : num_heads(num_heads), head_dim(head_dim), out_proj_name(out_proj_name) {
+ int64_t inner_dim = num_heads * head_dim;
+
+ blocks["q_proj"] = std::make_shared(query_dim, inner_dim, false);
+ blocks["k_proj"] = std::make_shared(context_dim, inner_dim, false);
+ blocks["v_proj"] = std::make_shared(context_dim, inner_dim, false);
+ blocks["q_norm"] = std::make_shared(head_dim, 1e-6f);
+ blocks["k_norm"] = std::make_shared(head_dim, 1e-6f);
+ blocks[this->out_proj_name] = std::make_shared(inner_dim, query_dim, false);
+ }
+
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+ struct ggml_tensor* hidden_states,
+ struct ggml_tensor* encoder_hidden_states = nullptr,
+ struct ggml_tensor* pe_q = nullptr,
+ struct ggml_tensor* pe_k = nullptr) {
+ if (encoder_hidden_states == nullptr) {
+ encoder_hidden_states = hidden_states;
+ }
+
+ auto q_proj = std::dynamic_pointer_cast(blocks["q_proj"]);
+ auto k_proj = std::dynamic_pointer_cast(blocks["k_proj"]);
+ auto v_proj = std::dynamic_pointer_cast(blocks["v_proj"]);
+ auto q_norm = std::dynamic_pointer_cast(blocks["q_norm"]);
+ auto k_norm = std::dynamic_pointer_cast(blocks["k_norm"]);
+ auto out_proj = std::dynamic_pointer_cast(blocks[out_proj_name]);
+
+ auto q = q_proj->forward(ctx, hidden_states);
+ auto k = k_proj->forward(ctx, encoder_hidden_states);
+ auto v = v_proj->forward(ctx, encoder_hidden_states);
+
+ int64_t N = q->ne[2];
+ int64_t L_q = q->ne[1];
+ int64_t L_k = k->ne[1];
+
+ auto q4 = ggml_reshape_4d(ctx->ggml_ctx, q, head_dim, num_heads, L_q, N); // [N, L_q, H, D]
+ auto k4 = ggml_reshape_4d(ctx->ggml_ctx, k, head_dim, num_heads, L_k, N); // [N, L_k, H, D]
+ auto v4 = ggml_reshape_4d(ctx->ggml_ctx, v, head_dim, num_heads, L_k, N); // [N, L_k, H, D]
+
+ q4 = q_norm->forward(ctx, q4);
+ k4 = k_norm->forward(ctx, k4);
+
+ struct ggml_tensor* attn_out = nullptr;
+ if (pe_q != nullptr || pe_k != nullptr) {
+ if (pe_q == nullptr) {
+ pe_q = pe_k;
+ }
+ if (pe_k == nullptr) {
+ pe_k = pe_q;
+ }
+ auto q_rope = Rope::apply_rope(ctx->ggml_ctx, q4, pe_q, false);
+ auto k_rope = Rope::apply_rope(ctx->ggml_ctx, k4, pe_k, false);
+ attn_out = ggml_ext_attention_ext(ctx->ggml_ctx,
+ ctx->backend,
+ q_rope,
+ k_rope,
+ v4,
+ num_heads,
+ nullptr,
+ true,
+ ctx->flash_attn_enabled);
+ } else {
+ auto q_flat = ggml_reshape_3d(ctx->ggml_ctx, q4, head_dim * num_heads, L_q, N);
+ auto k_flat = ggml_reshape_3d(ctx->ggml_ctx, k4, head_dim * num_heads, L_k, N);
+ attn_out = ggml_ext_attention_ext(ctx->ggml_ctx,
+ ctx->backend,
+ q_flat,
+ k_flat,
+ v,
+ num_heads,
+ nullptr,
+ false,
+ ctx->flash_attn_enabled);
+ }
+
+ return out_proj->forward(ctx, attn_out);
+ }
+ };
+
+ struct AnimaMLP : public GGMLBlock {
+ public:
+ AnimaMLP(int64_t dim, int64_t hidden_dim) {
+ blocks["layer1"] = std::make_shared(dim, hidden_dim, false);
+ blocks["layer2"] = std::make_shared(hidden_dim, dim, false);
+ }
+
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+ auto layer1 = std::dynamic_pointer_cast(blocks["layer1"]);
+ auto layer2 = std::dynamic_pointer_cast(blocks["layer2"]);
+
+ x = layer1->forward(ctx, x);
+ x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
+ x = layer2->forward(ctx, x);
+ return x;
+ }
+ };
+
+ struct AdapterMLP : public GGMLBlock {
+ public:
+ AdapterMLP(int64_t dim, int64_t hidden_dim) {
+ blocks["0"] = std::make_shared(dim, hidden_dim, true);
+ blocks["2"] = std::make_shared(hidden_dim, dim, true);
+ }
+
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+ auto layer0 = std::dynamic_pointer_cast(blocks["0"]);
+ auto layer2 = std::dynamic_pointer_cast(blocks["2"]);
+
+ x = layer0->forward(ctx, x);
+ x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
+ x = layer2->forward(ctx, x);
+ return x;
+ }
+ };
+
+ struct LLMAdapterBlock : public GGMLBlock {
+ public:
+ LLMAdapterBlock(int64_t model_dim = 1024, int64_t source_dim = 1024, int64_t num_heads = 16, int64_t head_dim = 64) {
+ blocks["norm_self_attn"] = std::make_shared(model_dim, 1e-6f);
+ blocks["self_attn"] = std::make_shared(model_dim, model_dim, num_heads, head_dim, "o_proj");
+ blocks["norm_cross_attn"] = std::make_shared(model_dim, 1e-6f);
+ blocks["cross_attn"] = std::make_shared(model_dim, source_dim, num_heads, head_dim, "o_proj");
+ blocks["norm_mlp"] = std::make_shared(model_dim, 1e-6f);
+ blocks["mlp"] = std::make_shared(model_dim, model_dim * 4);
+ }
+
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+ struct ggml_tensor* x,
+ struct ggml_tensor* context,
+ struct ggml_tensor* target_pe,
+ struct ggml_tensor* context_pe) {
+ auto norm_self_attn = std::dynamic_pointer_cast(blocks["norm_self_attn"]);
+ auto self_attn = std::dynamic_pointer_cast(blocks["self_attn"]);
+ auto norm_cross_attn = std::dynamic_pointer_cast(blocks["norm_cross_attn"]);
+ auto cross_attn = std::dynamic_pointer_cast(blocks["cross_attn"]);
+ auto norm_mlp = std::dynamic_pointer_cast(blocks["norm_mlp"]);
+ auto mlp = std::dynamic_pointer_cast(blocks["mlp"]);
+
+ auto h = norm_self_attn->forward(ctx, x);
+ h = self_attn->forward(ctx, h, nullptr, target_pe, target_pe);
+ x = ggml_add(ctx->ggml_ctx, x, h);
+
+ h = norm_cross_attn->forward(ctx, x);
+ h = cross_attn->forward(ctx, h, context, target_pe, context_pe);
+ x = ggml_add(ctx->ggml_ctx, x, h);
+
+ h = norm_mlp->forward(ctx, x);
+ h = mlp->forward(ctx, h);
+ x = ggml_add(ctx->ggml_ctx, x, h);
+
+ return x;
+ }
+ };
+
+ struct LLMAdapter : public GGMLBlock {
+ protected:
+ int num_layers;
+
+ public:
+ LLMAdapter(int64_t source_dim = 1024,
+ int64_t target_dim = 1024,
+ int64_t model_dim = 1024,
+ int num_layers = 6,
+ int num_heads = 16)
+ : num_layers(num_layers) {
+ int64_t head_dim = model_dim / num_heads;
+
+ blocks["embed"] = std::make_shared(32128, target_dim);
+ for (int i = 0; i < num_layers; i++) {
+ blocks["blocks." + std::to_string(i)] =
+ std::make_shared(model_dim, source_dim, num_heads, head_dim);
+ }
+ blocks["out_proj"] = std::make_shared(model_dim, target_dim, true);
+ blocks["norm"] = std::make_shared(target_dim, 1e-6f);
+ }
+
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+ struct ggml_tensor* source_hidden_states,
+ struct ggml_tensor* target_input_ids,
+ struct ggml_tensor* target_pe,
+ struct ggml_tensor* source_pe) {
+ GGML_ASSERT(target_input_ids != nullptr);
+ if (ggml_n_dims(target_input_ids) == 1) {
+ target_input_ids = ggml_reshape_2d(ctx->ggml_ctx, target_input_ids, target_input_ids->ne[0], 1);
+ }
+
+ auto embed = std::dynamic_pointer_cast(blocks["embed"]);
+ auto out_proj = std::dynamic_pointer_cast(blocks["out_proj"]);
+ auto norm = std::dynamic_pointer_cast(blocks["norm"]);
+
+ auto x = embed->forward(ctx, target_input_ids); // [N, target_len, target_dim]
+
+ for (int i = 0; i < num_layers; i++) {
+ auto block = std::dynamic_pointer_cast(blocks["blocks." + std::to_string(i)]);
+ x = block->forward(ctx, x, source_hidden_states, target_pe, source_pe);
+ }
+
+ x = out_proj->forward(ctx, x);
+ x = norm->forward(ctx, x);
+ return x;
+ }
+ };
+
+ struct TransformerBlock : public GGMLBlock {
+ public:
+ TransformerBlock(int64_t hidden_size,
+ int64_t text_embed_dim,
+ int64_t num_heads,
+ int64_t head_dim,
+ int64_t mlp_ratio = 4,
+ int64_t adaln_lora_dim = 256) {
+ blocks["adaln_modulation_self_attn"] = std::make_shared(hidden_size, adaln_lora_dim);
+ blocks["self_attn"] = std::make_shared(hidden_size, hidden_size, num_heads, head_dim);
+ blocks["adaln_modulation_cross_attn"] = std::make_shared(hidden_size, adaln_lora_dim);
+ blocks["cross_attn"] = std::make_shared(hidden_size, text_embed_dim, num_heads, head_dim);
+ blocks["adaln_modulation_mlp"] = std::make_shared(hidden_size, adaln_lora_dim);
+ blocks["mlp"] = std::make_shared(hidden_size, hidden_size * mlp_ratio);
+ }
+
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+ struct ggml_tensor* hidden_states,
+ struct ggml_tensor* encoder_hidden_states,
+ struct ggml_tensor* embedded_timestep,
+ struct ggml_tensor* temb,
+ struct ggml_tensor* image_pe) {
+ auto norm1 = std::dynamic_pointer_cast(blocks["adaln_modulation_self_attn"]);
+ auto attn1 = std::dynamic_pointer_cast(blocks["self_attn"]);
+ auto norm2 = std::dynamic_pointer_cast(blocks["adaln_modulation_cross_attn"]);
+ auto attn2 = std::dynamic_pointer_cast(blocks["cross_attn"]);
+ auto norm3 = std::dynamic_pointer_cast(blocks["adaln_modulation_mlp"]);
+ auto mlp = std::dynamic_pointer_cast(blocks["mlp"]);
+
+ auto [normed1, gate1] = norm1->forward(ctx, hidden_states, embedded_timestep, temb);
+ auto h = attn1->forward(ctx, normed1, nullptr, image_pe, image_pe);
+ hidden_states = ggml_add(ctx->ggml_ctx, hidden_states, apply_gate(ctx->ggml_ctx, h, gate1));
+
+ auto [normed2, gate2] = norm2->forward(ctx, hidden_states, embedded_timestep, temb);
+ h = attn2->forward(ctx, normed2, encoder_hidden_states, nullptr, nullptr);
+ hidden_states = ggml_add(ctx->ggml_ctx, hidden_states, apply_gate(ctx->ggml_ctx, h, gate2));
+
+ auto [normed3, gate3] = norm3->forward(ctx, hidden_states, embedded_timestep, temb);
+ h = mlp->forward(ctx, normed3);
+ hidden_states = ggml_add(ctx->ggml_ctx, hidden_states, apply_gate(ctx->ggml_ctx, h, gate3));
+
+ return hidden_states;
+ }
+ };
+
+ struct FinalLayer : public GGMLBlock {
+ protected:
+ int64_t hidden_size;
+ int64_t patch_size;
+ int64_t out_channels;
+
+ public:
+ FinalLayer(int64_t hidden_size, int64_t patch_size, int64_t out_channels)
+ : hidden_size(hidden_size), patch_size(patch_size), out_channels(out_channels) {
+ blocks["adaln_modulation"] = std::make_shared(hidden_size, 256);
+ blocks["linear"] = std::make_shared(hidden_size, patch_size * patch_size * out_channels, false);
+ }
+
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+ struct ggml_tensor* hidden_states,
+ struct ggml_tensor* embedded_timestep,
+ struct ggml_tensor* temb) {
+ auto adaln = std::dynamic_pointer_cast(blocks["adaln_modulation"]);
+ auto linear = std::dynamic_pointer_cast(blocks["linear"]);
+
+ hidden_states = adaln->forward(ctx, hidden_states, embedded_timestep, temb);
+ hidden_states = linear->forward(ctx, hidden_states);
+ return hidden_states;
+ }
+ };
+
+ struct AnimaNet : public GGMLBlock {
+ public:
+ int64_t in_channels = 16;
+ int64_t out_channels = 16;
+ int64_t hidden_size = 2048;
+ int64_t text_embed_dim = 1024;
+ int64_t num_heads = 16;
+ int64_t head_dim = 128;
+ int patch_size = 2;
+ int64_t num_layers = 28;
+ std::vector axes_dim = {44, 42, 42};
+ int theta = 10000;
+
+ public:
+ AnimaNet() = default;
+ explicit AnimaNet(int64_t num_layers)
+ : num_layers(num_layers) {
+ blocks["x_embedder"] = std::make_shared((in_channels + 1) * patch_size * patch_size, hidden_size);
+ blocks["t_embedder"] = std::make_shared(hidden_size, hidden_size * 3);
+ blocks["t_embedding_norm"] = std::make_shared(hidden_size, 1e-6f);
+ for (int i = 0; i < num_layers; i++) {
+ blocks["blocks." + std::to_string(i)] = std::make_shared(hidden_size,
+ text_embed_dim,
+ num_heads,
+ head_dim);
+ }
+ blocks["final_layer"] = std::make_shared(hidden_size, patch_size, out_channels);
+ blocks["llm_adapter"] = std::make_shared(1024, 1024, 1024, 6, 16);
+ }
+
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+ struct ggml_tensor* x,
+ struct ggml_tensor* timestep,
+ struct ggml_tensor* encoder_hidden_states,
+ struct ggml_tensor* image_pe,
+ struct ggml_tensor* t5_ids = nullptr,
+ struct ggml_tensor* t5_weights = nullptr,
+ struct ggml_tensor* adapter_q_pe = nullptr,
+ struct ggml_tensor* adapter_k_pe = nullptr) {
+ GGML_ASSERT(x->ne[3] == 1);
+
+ auto x_embedder = std::dynamic_pointer_cast(blocks["x_embedder"]);
+ auto t_embedder = std::dynamic_pointer_cast(blocks["t_embedder"]);
+ auto t_embedding_norm = std::dynamic_pointer_cast(blocks["t_embedding_norm"]);
+ auto final_layer = std::dynamic_pointer_cast(blocks["final_layer"]);
+ auto llm_adapter = std::dynamic_pointer_cast(blocks["llm_adapter"]);
+
+ int64_t W = x->ne[0];
+ int64_t H = x->ne[1];
+
+ auto padding_mask = ggml_ext_zeros(ctx->ggml_ctx, x->ne[0], x->ne[1], 1, x->ne[3]);
+ x = ggml_concat(ctx->ggml_ctx, x, padding_mask, 2); // [N, C + 1, H, W]
+
+ x = DiT::pad_and_patchify(ctx, x, patch_size, patch_size); // [N, h*w, (C+1)*ph*pw]
+
+ x = x_embedder->forward(ctx, x);
+
+ auto timestep_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep, static_cast(hidden_size));
+ auto temb = t_embedder->forward(ctx, timestep_proj);
+ auto embedded_timestep = t_embedding_norm->forward(ctx, timestep_proj);
+
+ if (t5_ids != nullptr) {
+ auto adapted_context = llm_adapter->forward(ctx, encoder_hidden_states, t5_ids, adapter_q_pe, adapter_k_pe);
+ if (t5_weights != nullptr) {
+ auto w = t5_weights;
+ if (ggml_n_dims(w) == 1) {
+ w = ggml_reshape_3d(ctx->ggml_ctx, w, 1, w->ne[0], 1);
+ }
+ w = ggml_repeat_4d(ctx->ggml_ctx, w, adapted_context->ne[0], adapted_context->ne[1], adapted_context->ne[2], 1);
+ adapted_context = ggml_mul(ctx->ggml_ctx, adapted_context, w);
+ }
+ if (adapted_context->ne[1] < 512) {
+ auto pad_ctx = ggml_ext_zeros(ctx->ggml_ctx,
+ adapted_context->ne[0],
+ 512 - adapted_context->ne[1],
+ adapted_context->ne[2],
+ 1);
+ adapted_context = ggml_concat(ctx->ggml_ctx, adapted_context, pad_ctx, 1);
+ } else if (adapted_context->ne[1] > 512) {
+ adapted_context = ggml_ext_slice(ctx->ggml_ctx, adapted_context, 1, 0, 512);
+ }
+ encoder_hidden_states = adapted_context;
+ }
+
+ for (int i = 0; i < num_layers; i++) {
+ auto block = std::dynamic_pointer_cast(blocks["blocks." + std::to_string(i)]);
+ x = block->forward(ctx, x, encoder_hidden_states, embedded_timestep, temb, image_pe);
+ }
+
+ x = final_layer->forward(ctx, x, embedded_timestep, temb); // [N, h*w, ph*pw*C]
+
+ x = DiT::unpatchify_and_crop(ctx->ggml_ctx, x, H, W, patch_size, patch_size, false); // [N, C, H, W]
+
+ return x;
+ }
+ };
+
+ struct AnimaRunner : public GGMLRunner {
+ public:
+ std::vector image_pe_vec;
+ std::vector adapter_q_pe_vec;
+ std::vector adapter_k_pe_vec;
+ AnimaNet net;
+
+ AnimaRunner(ggml_backend_t backend,
+ bool offload_params_to_cpu,
+ const String2TensorStorage& tensor_storage_map = {},
+ const std::string prefix = "model.diffusion_model")
+ : GGMLRunner(backend, offload_params_to_cpu) {
+ int64_t num_layers = 0;
+ std::string layer_tag = prefix + ".net.blocks.";
+ for (const auto& kv : tensor_storage_map) {
+ const std::string& tensor_name = kv.first;
+ size_t pos = tensor_name.find(layer_tag);
+ if (pos == std::string::npos) {
+ continue;
+ }
+ size_t start = pos + layer_tag.size();
+ size_t end = tensor_name.find('.', start);
+ if (end == std::string::npos) {
+ continue;
+ }
+ int64_t layer_id = atoll(tensor_name.substr(start, end - start).c_str());
+ num_layers = std::max(num_layers, layer_id + 1);
+ }
+ if (num_layers <= 0) {
+ num_layers = 28;
+ }
+ LOG_INFO("anima net layers: %" PRId64, num_layers);
+
+ net = AnimaNet(num_layers);
+ net.init(params_ctx, tensor_storage_map, prefix + ".net");
+ }
+
+ std::string get_desc() override {
+ return "anima";
+ }
+
+ void get_param_tensors(std::map& tensors, const std::string prefix) {
+ net.get_param_tensors(tensors, prefix + ".net");
+ }
+
+ static std::vector gen_1d_rope_pe_vec(int64_t seq_len, int dim, float theta = 10000.f) {
+ std::vector pos(seq_len);
+ for (int64_t i = 0; i < seq_len; i++) {
+ pos[i] = static_cast(i);
+ }
+ auto rope_emb = Rope::rope(pos, dim, theta);
+ return Rope::flatten(rope_emb);
+ }
+
+ static float calc_ntk_factor(float extrapolation_ratio, int axis_dim) {
+ if (extrapolation_ratio == 1.0f || axis_dim <= 2) {
+ return 1.0f;
+ }
+ return std::pow(extrapolation_ratio, static_cast(axis_dim) / static_cast(axis_dim - 2));
+ }
+
+ static std::vector gen_anima_image_pe_vec(int bs,
+ int h,
+ int w,
+ int patch_size,
+ int theta,
+ const std::vector& axes_dim,
+ float h_extrapolation_ratio,
+ float w_extrapolation_ratio,
+ float t_extrapolation_ratio) {
+ static const std::vector empty_ref_latents;
+ auto ids = Rope::gen_flux_ids(h,
+ w,
+ patch_size,
+ bs,
+ static_cast(axes_dim.size()),
+ 0,
+ {},
+ empty_ref_latents,
+ false,
+ 1.0f);
+
+ std::vector axis_thetas = {
+ static_cast(theta) * calc_ntk_factor(t_extrapolation_ratio, axes_dim[0]),
+ static_cast(theta) * calc_ntk_factor(h_extrapolation_ratio, axes_dim[1]),
+ static_cast(theta) * calc_ntk_factor(w_extrapolation_ratio, axes_dim[2]),
+ };
+ return Rope::embed_nd(ids, bs, axis_thetas, axes_dim);
+ }
+
+ struct ggml_cgraph* build_graph(struct ggml_tensor* x,
+ struct ggml_tensor* timesteps,
+ struct ggml_tensor* context,
+ struct ggml_tensor* t5_ids = nullptr,
+ struct ggml_tensor* t5_weights = nullptr) {
+ GGML_ASSERT(x->ne[3] == 1);
+ struct ggml_cgraph* gf = new_graph_custom(ANIMA_GRAPH_SIZE);
+
+ x = to_backend(x);
+ timesteps = to_backend(timesteps);
+ context = to_backend(context);
+ t5_ids = to_backend(t5_ids);
+ t5_weights = to_backend(t5_weights);
+
+ int64_t pad_h = (net.patch_size - x->ne[1] % net.patch_size) % net.patch_size;
+ int64_t pad_w = (net.patch_size - x->ne[0] % net.patch_size) % net.patch_size;
+ int64_t h_pad = x->ne[1] + pad_h;
+ int64_t w_pad = x->ne[0] + pad_w;
+
+ image_pe_vec = gen_anima_image_pe_vec(1,
+ static_cast(h_pad),
+ static_cast(w_pad),
+ static_cast(net.patch_size),
+ net.theta,
+ net.axes_dim,
+ 4.0f,
+ 4.0f,
+ 1.0f);
+ int64_t image_pos_len = static_cast(image_pe_vec.size()) / (2 * 2 * (net.head_dim / 2));
+ auto image_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, net.head_dim / 2, image_pos_len);
+ set_backend_tensor_data(image_pe, image_pe_vec.data());
+
+ ggml_tensor* adapter_q_pe = nullptr;
+ ggml_tensor* adapter_k_pe = nullptr;
+ if (t5_ids != nullptr) {
+ int64_t target_len = t5_ids->ne[0];
+ int64_t source_len = context->ne[1];
+
+ adapter_q_pe_vec = gen_1d_rope_pe_vec(target_len, 64, 10000.f);
+ adapter_k_pe_vec = gen_1d_rope_pe_vec(source_len, 64, 10000.f);
+
+ int64_t target_pos_len = static_cast(adapter_q_pe_vec.size()) / (2 * 2 * 32);
+ int64_t source_pos_len = static_cast(adapter_k_pe_vec.size()) / (2 * 2 * 32);
+
+ adapter_q_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, 32, target_pos_len);
+ adapter_k_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, 32, source_pos_len);
+ set_backend_tensor_data(adapter_q_pe, adapter_q_pe_vec.data());
+ set_backend_tensor_data(adapter_k_pe, adapter_k_pe_vec.data());
+ }
+
+ auto runner_ctx = get_context();
+ auto out = net.forward(&runner_ctx,
+ x,
+ timesteps,
+ context,
+ image_pe,
+ t5_ids,
+ t5_weights,
+ adapter_q_pe,
+ adapter_k_pe);
+
+ ggml_build_forward_expand(gf, out);
+ return gf;
+ }
+
+ bool compute(int n_threads,
+ struct ggml_tensor* x,
+ struct ggml_tensor* timesteps,
+ struct ggml_tensor* context,
+ struct ggml_tensor* t5_ids = nullptr,
+ struct ggml_tensor* t5_weights = nullptr,
+ struct ggml_tensor** output = nullptr,
+ struct ggml_context* output_ctx = nullptr) {
+ auto get_graph = [&]() -> struct ggml_cgraph* {
+ return build_graph(x, timesteps, context, t5_ids, t5_weights);
+ };
+ return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+ }
+ };
+} // namespace Anima
+
+#endif // __ANIMA_HPP__
diff --git a/src/cache_dit.hpp b/src/cache_dit.hpp
new file mode 100644
index 00000000..6fe104da
--- /dev/null
+++ b/src/cache_dit.hpp
@@ -0,0 +1,975 @@
+#ifndef __CACHE_DIT_HPP__
+#define __CACHE_DIT_HPP__
+
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>
+
+#include "ggml_extend.hpp"
+
// Dual-Block cache (DBCache) settings: compute the first Fn and last Bn
// blocks every step, and reuse cached residuals for the middle blocks when
// the front-block residual diff stays under threshold.
// NOTE(review): `<int>` restored on steps_computation_mask after extraction.
struct DBCacheConfig {
    bool enabled                         = false;
    int Fn_compute_blocks                = 8;     // always-computed leading blocks
    int Bn_compute_blocks                = 0;     // always-computed trailing blocks
    float residual_diff_threshold        = 0.08f; // relative L1 diff below which a step may be cached
    int max_warmup_steps                 = 8;     // steps always computed before caching can start
    int max_cached_steps                 = -1;    // total cached-step budget; negative = unlimited
    int max_continuous_cached_steps      = -1;    // consecutive cached-step cap; negative = unlimited
    float max_accumulated_residual_diff  = -1.0f; // accumulated-diff cap; negative = unlimited
    std::vector<int> steps_computation_mask;      // per-step override; 0 appears to permit caching
    bool scm_policy_dynamic              = true;  // mask steps still subject to the diff check
};
+
// TaylorSeer settings: approximate skipped steps via a Taylor expansion of
// the model output over recent computed steps.
struct TaylorSeerConfig {
    bool enabled            = false;
    int n_derivatives       = 1;  // Taylor expansion order (1 = first derivative)
    int max_warmup_steps    = 2;  // steps always computed before approximation
    int skip_interval_steps = 1;  // cadence of skipped steps
};
+
// Top-level cache configuration: DBCache + TaylorSeer, with optional
// per-block-type Fn/Bn overrides (negative = fall back to dbcache values).
struct CacheDitConfig {
    DBCacheConfig dbcache;
    TaylorSeerConfig taylorseer;
    int double_Fn_blocks = -1;  // override for double-stream blocks
    int double_Bn_blocks = -1;
    int single_Fn_blocks = -1;  // override for single-stream blocks
    int single_Bn_blocks = -1;
};
+
// Finite-difference Taylor state: dY_*[0] holds the latest output, dY_*[d]
// the d-th finite-difference derivative. NOTE(review): the nested
// `std::vector<std::vector<float>>` element types were restored after
// extraction stripped the template arguments.
struct TaylorSeerState {
    int n_derivatives     = 1;
    int current_step      = -1;
    int last_computed_step = -1;
    std::vector<std::vector<float>> dY_prev;     // snapshot before the latest update
    std::vector<std::vector<float>> dY_current;  // latest output + derivatives

    // Allocate (n_deriv + 1) empty derivative slots and reset counters.
    // hidden_size is currently unused; buffers grow on first update.
    void init(int n_deriv, size_t hidden_size) {
        n_derivatives = n_deriv;
        int order     = n_derivatives + 1;
        dY_prev.resize(order);
        dY_current.resize(order);
        for (int i = 0; i < order; i++) {
            dY_prev[i].clear();
            dY_current[i].clear();
        }
        current_step       = -1;
        last_computed_step = -1;
    }

    void reset() {
        for (auto& v : dY_prev)
            v.clear();
        for (auto& v : dY_current)
            v.clear();
        current_step       = -1;
        last_computed_step = -1;
    }

    // True once enough computed steps exist to extrapolate.
    bool can_approximate() const {
        return last_computed_step >= n_derivatives && !dY_prev.empty() && !dY_prev[0].empty();
    }

    // Record a freshly computed output Y at `step` and update the
    // finite-difference derivatives against the previous snapshot.
    void update_derivatives(const float* Y, size_t size, int step) {
        dY_prev = dY_current;
        dY_current[0].assign(Y, Y + size);

        // Steps elapsed since the last computed output (>= 1).
        int window = step - last_computed_step;
        if (window <= 0)
            window = 1;

        for (int d = 0; d < n_derivatives; d++) {
            if (!dY_prev[d].empty() && dY_prev[d].size() == size) {
                dY_current[d + 1].resize(size);
                for (size_t i = 0; i < size; i++) {
                    dY_current[d + 1][i] = (dY_current[d][i] - dY_prev[d][i]) / static_cast<float>(window);
                }
            } else {
                // Not enough history yet for this derivative order.
                dY_current[d + 1].clear();
            }
        }

        current_step       = step;
        last_computed_step = step;
    }

    // Write the Taylor extrapolation for `target_step` into output.
    // NOTE(review): expands around dY_prev (the next-to-last snapshot), not
    // dY_current — preserved as written; confirm against the reference impl.
    void approximate(float* output, size_t size, int target_step) const {
        if (!can_approximate() || dY_prev[0].size() != size) {
            return;  // silently leave output untouched when unusable
        }

        int elapsed = target_step - last_computed_step;
        if (elapsed <= 0)
            elapsed = 1;

        std::fill(output, output + size, 0.0f);
        float factorial = 1.0f;
        int order       = static_cast<int>(dY_prev.size());

        // output = sum_o dY[o] * elapsed^o / o!
        for (int o = 0; o < order; o++) {
            if (dY_prev[o].empty() || dY_prev[o].size() != size)
                continue;
            if (o > 0)
                factorial *= static_cast<float>(o);
            float coeff = ::powf(static_cast<float>(elapsed), static_cast<float>(o)) / factorial;
            for (size_t i = 0; i < size; i++) {
                output[i] += coeff * dY_prev[o][i];
            }
        }
    }
};
+
// Per-block cached tensors: _img/_txt pairs serve double-stream blocks, the
// plain residual/prev_output serve single-stream blocks.
// NOTE(review): `<float>` element types restored after extraction.
struct BlockCacheEntry {
    std::vector<float> residual_img;
    std::vector<float> residual_txt;
    std::vector<float> residual;
    std::vector<float> prev_img;
    std::vector<float> prev_txt;
    std::vector<float> prev_output;
    bool has_prev = false;  // whether prev_* buffers hold valid data
};
+
+struct CacheDitState {
+ CacheDitConfig config;
+ bool initialized = false;
+
+ int total_double_blocks = 0;
+ int total_single_blocks = 0;
+ size_t hidden_size = 0;
+
+ int current_step = -1;
+ int total_steps = 0;
+ int warmup_remaining = 0;
+ std::vector cached_steps;
+ int continuous_cached_steps = 0;
+ float accumulated_residual_diff = 0.0f;
+
+ std::vector double_block_cache;
+ std::vector single_block_cache;
+
+ std::vector Fn_residual_img;
+ std::vector Fn_residual_txt;
+ std::vector prev_Fn_residual_img;
+ std::vector prev_Fn_residual_txt;
+ bool has_prev_Fn_residual = false;
+
+ std::vector Bn_buffer_img;
+ std::vector Bn_buffer_txt;
+ std::vector Bn_buffer;
+ bool has_Bn_buffer = false;
+
+ TaylorSeerState taylor_state;
+
+ bool can_cache_this_step = false;
+ bool is_caching_this_step = false;
+
+ int total_blocks_computed = 0;
+ int total_blocks_cached = 0;
+
    // Configure for a model with the given block counts; enabled only when
    // at least one of DBCache / TaylorSeer is turned on in cfg.
    void init(const CacheDitConfig& cfg, int num_double_blocks, int num_single_blocks, size_t h_size) {
        config              = cfg;
        total_double_blocks = num_double_blocks;
        total_single_blocks = num_single_blocks;
        hidden_size         = h_size;

        initialized = cfg.dbcache.enabled || cfg.taylorseer.enabled;

        // Nothing to allocate when caching is fully disabled.
        if (!initialized)
            return;

        warmup_remaining = cfg.dbcache.max_warmup_steps;
        double_block_cache.resize(total_double_blocks);
        single_block_cache.resize(total_single_blocks);

        if (cfg.taylorseer.enabled) {
            taylor_state.init(cfg.taylorseer.n_derivatives, h_size);
        }

        reset_runtime();
    }
+
    // Clear all per-run state (call between sampling runs); keeps the
    // configuration and allocated cache-entry slots.
    void reset_runtime() {
        current_step     = -1;
        total_steps      = 0;
        warmup_remaining = config.dbcache.max_warmup_steps;
        cached_steps.clear();
        continuous_cached_steps   = 0;
        accumulated_residual_diff = 0.0f;

        for (auto& entry : double_block_cache) {
            entry.residual_img.clear();
            entry.residual_txt.clear();
            entry.prev_img.clear();
            entry.prev_txt.clear();
            entry.has_prev = false;
        }

        for (auto& entry : single_block_cache) {
            entry.residual.clear();
            entry.prev_output.clear();
            entry.has_prev = false;
        }

        Fn_residual_img.clear();
        Fn_residual_txt.clear();
        prev_Fn_residual_img.clear();
        prev_Fn_residual_txt.clear();
        has_prev_Fn_residual = false;

        Bn_buffer_img.clear();
        Bn_buffer_txt.clear();
        Bn_buffer.clear();
        has_Bn_buffer = false;

        taylor_state.reset();

        can_cache_this_step   = false;
        is_caching_this_step  = false;

        total_blocks_computed = 0;
        total_blocks_cached   = 0;
    }
+
    bool enabled() const {
        return initialized && (config.dbcache.enabled || config.taylorseer.enabled);
    }

    // Evaluate the caching policy for a new sampler step and set
    // can_cache_this_step. Idempotent for repeated calls with the same index
    // (e.g. cond/uncond passes of one step). sigma is currently unused.
    void begin_step(int step_index, float sigma = 0.0f) {
        if (!enabled())
            return;
        if (step_index == current_step)
            return;

        current_step = step_index;
        total_steps++;

        // Warmup steps are always fully computed.
        bool in_warmup = warmup_remaining > 0;
        if (in_warmup) {
            warmup_remaining--;
        }

        bool scm_allows_cache = true;
        if (!config.dbcache.steps_computation_mask.empty()) {
            if (step_index < static_cast<int>(config.dbcache.steps_computation_mask.size())) {
                // NOTE(review): a mask value of 0 appears to mark steps where
                // caching is permitted — confirm against the mask producer.
                scm_allows_cache = (config.dbcache.steps_computation_mask[step_index] == 0);
                if (!config.dbcache.scm_policy_dynamic && scm_allows_cache) {
                    // Static policy: mask alone decides; skip the budget checks.
                    can_cache_this_step  = true;
                    is_caching_this_step = false;
                    return;
                }
            }
        }

        // Budget checks; negative limits mean "unlimited".
        bool max_cached_ok = (config.dbcache.max_cached_steps < 0) ||
                             (static_cast<int>(cached_steps.size()) < config.dbcache.max_cached_steps);

        bool max_cont_ok = (config.dbcache.max_continuous_cached_steps < 0) ||
                           (continuous_cached_steps < config.dbcache.max_continuous_cached_steps);

        bool accum_ok = (config.dbcache.max_accumulated_residual_diff < 0.0f) ||
                        (accumulated_residual_diff < config.dbcache.max_accumulated_residual_diff);

        can_cache_this_step  = !in_warmup && scm_allows_cache && max_cached_ok && max_cont_ok && accum_ok && has_prev_Fn_residual;
        is_caching_this_step = false;
    }

    // Record whether the finished step was served from cache.
    void end_step(bool was_cached) {
        if (was_cached) {
            cached_steps.push_back(current_step);
            continuous_cached_steps++;
        } else {
            continuous_cached_steps = 0;
        }
    }
+
+ static float calculate_residual_diff(const float* prev, const float* curr, size_t size) {
+ if (size == 0)
+ return 0.0f;
+
+ float sum_diff = 0.0f;
+ float sum_abs = 0.0f;
+
+ for (size_t i = 0; i < size; i++) {
+ sum_diff += std::fabs(prev[i] - curr[i]);
+ sum_abs += std::fabs(prev[i]);
+ }
+
+ return sum_diff / (sum_abs + 1e-6f);
+ }
+
+ static float calculate_residual_diff(const std::vector& prev, const std::vector& curr) {
+ if (prev.size() != curr.size() || prev.empty())
+ return 1.0f;
+ return calculate_residual_diff(prev.data(), curr.data(), prev.size());
+ }
+
    // Effective Fn/Bn counts: per-block-type overrides (>= 0) win over the
    // shared DBCache defaults.
    int get_double_Fn_blocks() const {
        return (config.double_Fn_blocks >= 0) ? config.double_Fn_blocks : config.dbcache.Fn_compute_blocks;
    }

    int get_double_Bn_blocks() const {
        return (config.double_Bn_blocks >= 0) ? config.double_Bn_blocks : config.dbcache.Bn_compute_blocks;
    }

    int get_single_Fn_blocks() const {
        return (config.single_Fn_blocks >= 0) ? config.single_Fn_blocks : config.dbcache.Fn_compute_blocks;
    }

    int get_single_Bn_blocks() const {
        return (config.single_Bn_blocks >= 0) ? config.single_Bn_blocks : config.dbcache.Bn_compute_blocks;
    }

    // Block classification: Fn = leading always-computed blocks, Bn =
    // trailing always-computed blocks, Mn = cacheable middle blocks.
    bool is_Fn_double_block(int block_idx) const {
        return block_idx < get_double_Fn_blocks();
    }

    bool is_Bn_double_block(int block_idx) const {
        int Bn = get_double_Bn_blocks();
        return Bn > 0 && block_idx >= (total_double_blocks - Bn);
    }

    bool is_Mn_double_block(int block_idx) const {
        return !is_Fn_double_block(block_idx) && !is_Bn_double_block(block_idx);
    }

    bool is_Fn_single_block(int block_idx) const {
        return block_idx < get_single_Fn_blocks();
    }

    bool is_Bn_single_block(int block_idx) const {
        int Bn = get_single_Bn_blocks();
        return Bn > 0 && block_idx >= (total_single_blocks - Bn);
    }

    bool is_Mn_single_block(int block_idx) const {
        return !is_Fn_single_block(block_idx) && !is_Bn_single_block(block_idx);
    }
+
+ // Capture the residual produced by the leading Fn blocks for this step:
+ // residual = Fn output - Fn input, kept separately for the img and txt
+ // streams so check_cache_decision() can compare it against last step's.
+ void store_Fn_residual(const float* img, const float* txt, size_t img_size, size_t txt_size, const float* input_img, const float* input_txt) {
+ Fn_residual_img.resize(img_size);
+ for (size_t i = 0; i < img_size; i++)
+ Fn_residual_img[i] = img[i] - input_img[i];
+
+ Fn_residual_txt.resize(txt_size);
+ for (size_t i = 0; i < txt_size; i++)
+ Fn_residual_txt[i] = txt[i] - input_txt[i];
+ }
+
+ // Decide whether the remainder of this step can be served from cache.
+ // Compares the Fn residual just computed against the one stored on the
+ // previous step: a small relative difference means the deeper blocks'
+ // outputs are likely unchanged and their cached residuals may be reused.
+ // Sets is_caching_this_step and returns the same decision.
+ bool check_cache_decision() {
+ // Step-level gates (warmup, SCM mask, caching budgets) were evaluated
+ // earlier into can_cache_this_step; bail out if any of them said no.
+ if (!can_cache_this_step) {
+ is_caching_this_step = false;
+ return false;
+ }
+
+ // No previous-step residual to compare against (e.g. first computed
+ // step): must compute.
+ if (!has_prev_Fn_residual || prev_Fn_residual_img.empty()) {
+ is_caching_this_step = false;
+ return false;
+ }
+
+ float diff_img = calculate_residual_diff(prev_Fn_residual_img, Fn_residual_img);
+ float diff_txt = calculate_residual_diff(prev_Fn_residual_txt, Fn_residual_txt);
+ // Average the two streams' relative differences into a single score.
+ float diff = (diff_img + diff_txt) / 2.0f;
+
+ if (diff < config.dbcache.residual_diff_threshold) {
+ is_caching_this_step = true;
+ // Track total drift admitted while caching; it is checked against
+ // max_accumulated_residual_diff when later steps are gated.
+ accumulated_residual_diff += diff;
+ return true;
+ }
+
+ is_caching_this_step = false;
+ return false;
+ }
+
+ // Roll the current step's Fn residuals into the "previous step" slots
+ // that check_cache_decision() compares against on the next step.
+ void update_prev_Fn_residual() {
+ prev_Fn_residual_img.assign(Fn_residual_img.begin(), Fn_residual_img.end());
+ prev_Fn_residual_txt.assign(Fn_residual_txt.begin(), Fn_residual_txt.end());
+ has_prev_Fn_residual = !prev_Fn_residual_img.empty();
+ }
+
+ // Cache per-block state for double-stream block `block_idx`: the residual
+ // this block added (output - previous block's output) plus a copy of the
+ // output itself, so the block can later be replayed from cache.
+ // Out-of-range indices are silently ignored.
+ // Fix: restored the missing <int> template argument on static_cast,
+ // without which this does not compile.
+ void store_double_block_residual(int block_idx, const float* img, const float* txt, size_t img_size, size_t txt_size, const float* prev_img, const float* prev_txt) {
+ if (block_idx < 0 || block_idx >= static_cast<int>(double_block_cache.size()))
+ return;
+
+ BlockCacheEntry& entry = double_block_cache[block_idx];
+
+ entry.residual_img.resize(img_size);
+ entry.residual_txt.resize(txt_size);
+ for (size_t i = 0; i < img_size; i++) {
+ entry.residual_img[i] = img[i] - prev_img[i];
+ }
+ for (size_t i = 0; i < txt_size; i++) {
+ entry.residual_txt[i] = txt[i] - prev_txt[i];
+ }
+
+ // std::vector::assign replaces the original manual element-by-element
+ // copy loops; semantics are identical.
+ entry.prev_img.assign(img, img + img_size);
+ entry.prev_txt.assign(txt, txt + txt_size);
+ entry.has_prev = true;
+ }
+
+ // Replay the cached residual of double block `block_idx` onto the current
+ // img/txt state and count it as a cache hit. Out-of-range indices and
+ // size mismatches (e.g. the activation shape changed between steps) are
+ // silently ignored.
+ // Fix: restored the missing <int> template argument on static_cast.
+ void apply_double_block_cache(int block_idx, float* img, float* txt, size_t img_size, size_t txt_size) {
+ if (block_idx < 0 || block_idx >= static_cast<int>(double_block_cache.size()))
+ return;
+
+ const BlockCacheEntry& entry = double_block_cache[block_idx];
+ if (entry.residual_img.size() != img_size || entry.residual_txt.size() != txt_size)
+ return;
+
+ for (size_t i = 0; i < img_size; i++) {
+ img[i] += entry.residual_img[i];
+ }
+ for (size_t i = 0; i < txt_size; i++) {
+ txt[i] += entry.residual_txt[i];
+ }
+
+ total_blocks_cached++;
+ }
+
+ // Cache per-block state for single-stream block `block_idx`: the residual
+ // (output - input) plus a copy of the output. Out-of-range indices are
+ // silently ignored.
+ // Fix: restored the missing <int> template argument on static_cast.
+ void store_single_block_residual(int block_idx, const float* output, size_t size, const float* input) {
+ if (block_idx < 0 || block_idx >= static_cast<int>(single_block_cache.size()))
+ return;
+
+ BlockCacheEntry& entry = single_block_cache[block_idx];
+
+ entry.residual.resize(size);
+ for (size_t i = 0; i < size; i++) {
+ entry.residual[i] = output[i] - input[i];
+ }
+
+ // std::vector::assign replaces the original manual copy loop.
+ entry.prev_output.assign(output, output + size);
+ entry.has_prev = true;
+ }
+
+ // Replay the cached residual of single block `block_idx` onto `output`
+ // and count it as a cache hit. Out-of-range indices and size mismatches
+ // are silently ignored.
+ // Fix: restored the missing <int> template argument on static_cast.
+ void apply_single_block_cache(int block_idx, float* output, size_t size) {
+ if (block_idx < 0 || block_idx >= static_cast<int>(single_block_cache.size()))
+ return;
+
+ const BlockCacheEntry& entry = single_block_cache[block_idx];
+ if (entry.residual.size() != size)
+ return;
+
+ for (size_t i = 0; i < size; i++) {
+ output[i] += entry.residual[i];
+ }
+
+ total_blocks_cached++;
+ }
+
+ // Record the residual accumulated across the entire Bn window:
+ // buffer = state after the stack - state entering the Bn window,
+ // per stream, so the whole window can be replayed in one shot.
+ void store_Bn_buffer(const float* img, const float* txt, size_t img_size, size_t txt_size, const float* Bn_start_img, const float* Bn_start_txt) {
+ Bn_buffer_img.resize(img_size);
+ for (size_t i = 0; i < img_size; i++)
+ Bn_buffer_img[i] = img[i] - Bn_start_img[i];
+
+ Bn_buffer_txt.resize(txt_size);
+ for (size_t i = 0; i < txt_size; i++)
+ Bn_buffer_txt[i] = txt[i] - Bn_start_txt[i];
+
+ has_Bn_buffer = true;
+ }
+
+ // Replay the stored Bn-window residual onto the current img/txt state.
+ // Does nothing when no buffer has been stored yet or the sizes disagree.
+ void apply_Bn_buffer(float* img, float* txt, size_t img_size, size_t txt_size) {
+ if (!has_Bn_buffer)
+ return;
+ const bool sizes_match = (Bn_buffer_img.size() == img_size) && (Bn_buffer_txt.size() == txt_size);
+ if (!sizes_match)
+ return;
+
+ for (size_t i = 0; i < img_size; i++)
+ img[i] += Bn_buffer_img[i];
+ for (size_t i = 0; i < txt_size; i++)
+ txt[i] += Bn_buffer_txt[i];
+ }
+
+ // Feed the current hidden state into the TaylorSeer derivative tracker.
+ // No-op when TaylorSeer is disabled.
+ void taylor_update(const float* hidden_state, size_t size) {
+ if (!config.taylorseer.enabled)
+ return;
+ taylor_state.update_derivatives(hidden_state, size, current_step);
+ }
+
+ // True when TaylorSeer is enabled and its state reports it can produce
+ // an approximation (enough history gathered).
+ bool taylor_can_approximate() const {
+ return config.taylorseer.enabled && taylor_state.can_approximate();
+ }
+
+ // Write the Taylor-extrapolated hidden state for the current step into
+ // `output`. No-op when TaylorSeer is disabled.
+ void taylor_approximate(float* output, size_t size) {
+ if (!config.taylorseer.enabled)
+ return;
+ taylor_state.approximate(output, size, current_step);
+ }
+
+ // Decide whether this step's output should come from the TaylorSeer
+ // approximation instead of a full forward pass. With skip interval k
+ // (clamped to at least 1), every (k+1)-th step is fully computed
+ // (current_step % (k+1) == 0) and the steps in between are approximated,
+ // once the warmup phase is over.
+ bool should_use_taylor_this_step() const {
+ if (!config.taylorseer.enabled || current_step < config.taylorseer.max_warmup_steps)
+ return false;
+
+ int interval = config.taylorseer.skip_interval_steps;
+ interval = (interval <= 0) ? 1 : interval;
+
+ return (current_step % (interval + 1)) != 0;
+ }
+
+ // Emit a one-line summary of cache effectiveness: step-level and
+ // block-level hit ratios plus the accumulated residual drift.
+ // Fix: restored the missing <float> template arguments on both
+ // static_casts, without which this does not compile; the casts force
+ // floating-point division for the percentage computations.
+ void log_metrics() const {
+ if (!enabled())
+ return;
+
+ int total_blocks = total_blocks_computed + total_blocks_cached;
+ float cache_ratio = (total_blocks > 0) ? (static_cast<float>(total_blocks_cached) / total_blocks * 100.0f) : 0.0f;
+
+ float step_cache_ratio = (total_steps > 0) ? (static_cast<float>(cached_steps.size()) / total_steps * 100.0f) : 0.0f;
+
+ LOG_INFO("CacheDIT: steps_cached=%zu/%d (%.1f%%), blocks_cached=%d/%d (%.1f%%), accum_diff=%.4f",
+ cached_steps.size(), total_steps, step_cache_ratio,
+ total_blocks_cached, total_blocks, cache_ratio,
+ accumulated_residual_diff);
+ }
+
+ // Build a short human-readable summary string (threshold, cached-step
+ // count, cached-block count). 256 bytes is ample for the fixed format;
+ // snprintf truncates safely if it ever were not.
+ std::string get_summary() const {
+ char buf[256];
+ snprintf(buf, sizeof(buf),
+ "CacheDIT[thresh=%.2f]: cached %zu/%d steps, %d/%d blocks",
+ config.dbcache.residual_diff_threshold,
+ cached_steps.size(), total_steps,
+ total_blocks_cached, total_blocks_computed + total_blocks_cached);
+ return std::string(buf);
+ }
+};
+
+// Parse a comma-separated list of integers (e.g. "1,0,0,1") into a
+// step-compute-mask. An empty string yields an empty mask.
+// Fix: restored the missing <int> template arguments on the return type
+// and the local vector, without which this does not compile.
+// NOTE(review): std::stoi throws std::invalid_argument/std::out_of_range
+// on malformed tokens — callers are presumably passing validated strings;
+// confirm at the call sites.
+inline std::vector<int> parse_scm_mask(const std::string& mask_str) {
+ std::vector<int> mask;
+ if (mask_str.empty())
+ return mask;
+
+ size_t pos = 0;
+ size_t start = 0;
+ while ((pos = mask_str.find(',', start)) != std::string::npos) {
+ std::string token = mask_str.substr(start, pos - start);
+ mask.push_back(std::stoi(token));
+ start = pos + 1;
+ }
+ // Trailing token after the last comma (if the string doesn't end in ',').
+ if (start < mask_str.length()) {
+ mask.push_back(std::stoi(mask_str.substr(start)));
+ }
+
+ return mask;
+}
+
+// Expand alternating compute/cache run-length bins into a per-step mask
+// (1 = compute, 0 = reuse cache), truncated to total_steps entries.
+// Bins are consumed alternately (compute run, then cache run); if both
+// lists run out before total_steps the mask may be shorter than requested.
+// The last generated entry is always forced to 1 so the final masked step
+// is fully computed.
+// Fix: restored the missing <int> template arguments on the return type,
+// both parameters and the local vector, and on every static_cast —
+// without them this does not compile.
+inline std::vector<int> generate_scm_mask(
+ const std::vector<int>& compute_bins,
+ const std::vector<int>& cache_bins,
+ int total_steps) {
+ std::vector<int> mask;
+ size_t c_idx = 0, cache_idx = 0;
+
+ while (static_cast<int>(mask.size()) < total_steps) {
+ if (c_idx < compute_bins.size()) {
+ for (int i = 0; i < compute_bins[c_idx] && static_cast<int>(mask.size()) < total_steps; i++) {
+ mask.push_back(1);
+ }
+ c_idx++;
+ }
+ if (cache_idx < cache_bins.size()) {
+ for (int i = 0; i < cache_bins[cache_idx] && static_cast<int>(mask.size()) < total_steps; i++) {
+ mask.push_back(0);
+ }
+ cache_idx++;
+ }
+ // Both bin lists exhausted: stop even if the mask is still short.
+ if (c_idx >= compute_bins.size() && cache_idx >= cache_bins.size())
+ break;
+ }
+
+ if (!mask.empty()) {
+ mask.back() = 1;
+ }
+
+ return mask;
+}
+
+inline std::vector get_scm_preset(const std::string& preset, int total_steps) {
+ struct Preset {
+ std::vector compute_bins;
+ std::vector cache_bins;
+ };
+
+ Preset slow = {{8, 3, 3, 2, 1, 1}, {1, 2, 2, 2, 3}};
+ Preset medium = {{6, 2, 2, 2, 2, 1}, {1, 3, 3, 3, 3}};
+ Preset fast = {{6, 1, 1, 1, 1, 1}, {1, 3, 4, 5, 4}};
+ Preset ultra = {{4, 1, 1, 1, 1}, {2, 5, 6, 7}};
+
+ Preset* p = nullptr;
+ if (preset == "slow" || preset == "s" || preset == "S")
+ p = &slow;
+ else if (preset == "medium" || preset == "m" || preset == "M")
+ p = &medium;
+ else if (preset == "fast" || preset == "f" || preset == "F")
+ p = &fast;
+ else if (preset == "ultra" || preset == "u" || preset == "U")
+ p = &ultra;
+ else
+ return {};
+
+ if (total_steps != 28 && total_steps > 0) {
+ float scale = static_cast(total_steps) / 28.0f;
+ std::vector