fix: avoid the issue of NaN for qwen-image on certain devices (#1249)

feat: add --fa option (#1242)
sync: update ggml
2026-02-05 03:16:05 +00:00 · 2026-02-04 23:49:05 +08:00 · 2026-02-01 21:44:54 +08:00 · 2026-02-01 20:54:23 +08:00 · 2026-02-01 20:05:27 +08:00 · 2026-02-01 20:00:16 +08:00
178 changed files with 3871877 additions and 6199 deletions
--- a/.clang-format
+++ b/.clang-format
@ -3,7 +3,6 @@ UseTab: Never
 IndentWidth: 4
 TabWidth: 4
 AllowShortIfStatementsOnASingleLine: false
 IndentCaseLabels: false
 ColumnLimit: 0
 AccessModifierOffset: -4
 NamespaceIndentation: All
--- a/.clang-tidy
+++ b/.clang-tidy
@ -0,0 +1,10 @@
 # clang-tidy configuration. The folded scalar below is a comma-separated list
 # that names six `modernize-*` checks and nothing else.
 Checks: >
  modernize-make-shared,
  modernize-use-nullptr,
  modernize-use-override,
  modernize-pass-by-value,
  modernize-return-braced-init-list,
  modernize-deprecated-headers,
 # The pattern '^$' can only match an empty string, so it selects no header path.
 HeaderFilterRegex: '^$'
 # Empty value: no check is listed here for promotion from warning to error.
 WarningsAsErrors: ''
 # Formatting style for applied fixes is set to `none`.
 FormatStyle: none
--- a/.dockerignore
+++ b/.dockerignore
@ -1,4 +1,5 @@
 build*/
 docs/
 test/
 .cache/
--- a/.github/ISSUE_TEMPLATE/bug_report.yml
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@ -0,0 +1,73 @@
 # GitHub issue form for bug reports. Each entry under `body` is one form
 # element; fields carrying `validations.required: true` must be filled in.
 name: 🐞 Bug Report
 description: Report a bug or unexpected behavior
 title: "[Bug] "
 labels: ["bug"]
 body:
  # Introductory text shown at the top of the form (not an input field).
  - type: markdown
    attributes:
      value: |
        Please use this template and include as many details as possible to help us reproduce and fix the issue.
  # Revision the reporter is building; the placeholder shows the command and a sample hash.
  - type: textarea
    id: commit
    attributes:
      label: Git commit
      description: Which commit are you trying to compile?
      placeholder: |
        $ git rev-parse HEAD
        40a6a8710ec15b1b5db6b5a098409f6bc8f654a4
    validations:
      required: true
  - type: input
    id: os
    attributes:
      label: Operating System & Version
      placeholder: e.g. “Ubuntu 22.04”, “Windows 11 23H2”, “macOS 14.3”
    validations:
      required: true
  # Multi-select list of backends; `multiple: true` allows more than one choice.
  - type: dropdown
    id: backends
    attributes:
      label: GGML backends
      description: Which GGML backends do you know to be affected?
      options: [CPU, CUDA, HIP, Metal, Musa, SYCL, Vulkan, OpenCL]
      multiple: true
    validations:
      required: true
  - type: input
    id: cmd_arguments
    attributes:
      label: Command-line arguments used
      placeholder: The full command line you ran (with all flags)
    validations:
      required: true
  - type: textarea
    id: steps_to_reproduce
    attributes:
      label: Steps to reproduce
      placeholder: A step-by-step list of what you did
    validations:
      required: true
  - type: textarea
    id: expected_behavior
    attributes:
      label: What you expected to happen
      placeholder: Describe the expected behavior or result
    validations:
      required: true
  - type: textarea
    id: actual_behavior
    attributes:
      label: What actually happened
      placeholder: Describe what you saw instead (errors, logs, crash, etc.)
    validations:
      required: true
  # The two fields below have no `validations` block.
  - type: textarea
    id: logs_and_errors
    attributes:
      label: Logs / error messages / stack trace
      placeholder: Paste complete logs or error output
  - type: textarea
    id: additional_info
    attributes:
      label: Additional context / environment details
      placeholder: e.g. CPU model, GPU, RAM, model file versions, quantization type, etc.
--- a/.github/ISSUE_TEMPLATE/feature_request.yml
+++ b/.github/ISSUE_TEMPLATE/feature_request.yml
@ -0,0 +1,33 @@
 # GitHub issue form for feature requests. Each entry under `body` is one form
 # element; fields carrying `validations.required: true` must be filled in.
 name: 💡 Feature Request
 description: Suggest a new feature or improvement
 title: "[Feature] "
 labels: ["enhancement"]
 body:
  # Introductory text shown at the top of the form (not an input field).
  - type: markdown
    attributes:
      value: |
        Thank you for suggesting an improvement! Please fill in the fields below.
  # Single-line summary of the request.
  - type: input
    id: summary
    attributes:
      label: Feature Summary
      placeholder: A one-line summary of the feature you’d like
    validations:
      required: true
  # Free-form description of the problem and the expected behavior.
  - type: textarea
    id: description
    attributes:
      label: Detailed Description
      placeholder: What problem does this solve? How do you expect it to work?
    validations:
      required: true
  # The two fields below have no `validations` block.
  - type: textarea
    id: alternatives
    attributes:
      label: Alternatives you considered
      placeholder: Any alternative designs or workarounds you tried
  - type: textarea
    id: additional_context
    attributes:
      label: Additional context
      placeholder: Any extra information (use cases, related functionalities, constraints)
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -4,21 +4,44 @@ on:
  workflow_dispatch: # allows manual triggering
    inputs:
      create_release:
-        description: 'Create new release'
+        description: "Create new release"
        required: true
        type: boolean
  push:
    branches:
      - master
      - ci
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
+    paths:
      [
        ".github/workflows/**",
        "**/CMakeLists.txt",
        "**/Makefile",
        "**/*.h",
        "**/*.hpp",
        "**/*.c",
        "**/*.cpp",
        "**/*.cu",
      ]
  pull_request:
    types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
+    paths:
      [
        "**/CMakeLists.txt",
        "**/Makefile",
        "**/*.h",
        "**/*.hpp",
        "**/*.c",
        "**/*.cpp",
        "**/*.cu",
      ]
 env:
  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true
 jobs:
  ubuntu-latest-cmake:
    runs-on: ubuntu-latest
@ -30,7 +53,6 @@ jobs:
        with:
          submodules: recursive
      - name: Dependencies
        id: depends
        run: |
@ -42,14 +64,154 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake ..
+          cmake .. -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON
          cmake --build . --config Release
-      #- name: Test
+      - name: Get commit hash
-        #id: cmake_test
+        id: commit
-        #run: |
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-          #cd build
+        uses: pr-mpt/actions-commit-hash@v2
-          #ctest --verbose --timeout 900
+
      - name: Fetch system info
        id: system-info
        run: |
          echo "CPU_ARCH=`uname -m`" >> "$GITHUB_OUTPUT"
          echo "OS_NAME=`lsb_release -s -i`" >> "$GITHUB_OUTPUT"
          echo "OS_VERSION=`lsb_release -s -r`" >> "$GITHUB_OUTPUT"
          echo "OS_TYPE=`uname -s`" >> "$GITHUB_OUTPUT"
      - name: Pack artifacts
        id: pack_artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
          cp ggml/LICENSE ./build/bin/ggml.txt
          cp LICENSE ./build/bin/stable-diffusion.cpp.txt
          zip -j sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip ./build/bin/*
      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        uses: actions/upload-artifact@v4
        with:
          name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
          path: |
            sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
  # Linux build with the Vulkan backend enabled (-DSD_VULKAN=ON). The build runs
  # on every trigger; hashing, packing and uploading only run for pushes to
  # master or when `create_release` is set on a manual dispatch.
  ubuntu-latest-cmake-vulkan:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
        id: checkout
        # Same major version as the container-image job in this workflow.
        uses: actions/checkout@v6
        with:
          submodules: recursive
      - name: Dependencies
        id: depends
        # `-y` keeps the install non-interactive: the job has no terminal to
        # answer apt's confirmation prompt.
        run: |
          sudo apt-get update
          sudo apt-get install -y build-essential libvulkan-dev glslc
      - name: Build
        id: cmake_build
        run: |
          mkdir build
          cd build
          cmake .. -DSD_BUILD_SHARED_LIBS=ON -DSD_VULKAN=ON
          cmake --build . --config Release
      - name: Get commit hash
        id: commit
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        uses: pr-mpt/actions-commit-hash@v2
      # Publishes CPU_ARCH / OS_NAME / OS_VERSION / OS_TYPE as step outputs; they
      # are used below to build the artifact file name.
      - name: Fetch system info
        id: system-info
        run: |
          echo "CPU_ARCH=$(uname -m)" >> "$GITHUB_OUTPUT"
          echo "OS_NAME=$(lsb_release -s -i)" >> "$GITHUB_OUTPUT"
          echo "OS_VERSION=$(lsb_release -s -r)" >> "$GITHUB_OUTPUT"
          echo "OS_TYPE=$(uname -s)" >> "$GITHUB_OUTPUT"
      # Copies both license files next to the binaries, then zips build/bin
      # without directory components (`zip -j`).
      - name: Pack artifacts
        id: pack_artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
          cp ggml/LICENSE ./build/bin/ggml.txt
          cp LICENSE ./build/bin/stable-diffusion.cpp.txt
          zip -j sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-vulkan.zip ./build/bin/*
      # The artifact name and path must match the zip file name produced above.
      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        uses: actions/upload-artifact@v4
        with:
          name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-vulkan.zip
          path: |
            sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-vulkan.zip
  # Builds one container image per matrix variant from `Dockerfile.<variant>`.
  # The image is pushed to the registry only when the `push:` expression in the
  # final step is true (push to master, or `create_release` on a manual run).
  build-and-push-docker-images:
    name: Build and push container images
    runs-on: ubuntu-latest
    # Token scopes requested for this job.
    permissions:
      contents: read
      packages: write
      id-token: write
      attestations: write
      artifact-metadata: write
    strategy:
      matrix:
        # Each value selects a Dockerfile and becomes the image tag suffix.
        variant: [musa, sycl, vulkan]
    env:
      REGISTRY: ghcr.io
      IMAGE_NAME: ${{ github.repository }}
    steps:
      - name: Checkout
        uses: actions/checkout@v6
        with:
          submodules: recursive
      # NOTE(review): no later step in this job reads `steps.commit` — confirm
      # whether this step is still needed here.
      - name: Get commit hash
        id: commit
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        uses: pr-mpt/actions-commit-hash@v2
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      # Authenticates against REGISTRY with the workflow token.
      - name: Log in to the container registry
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      # Produces the `labels` and `annotations` outputs consumed by the build step.
      - name: Extract metadata for Docker
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
      - name: Free Disk Space (Ubuntu)
        uses: jlumbroso/free-disk-space@v1.3.1
        with:
          # this might remove tools that are actually needed,
          # if set to "true" but frees about 6 GB
          tool-cache: false
      # The tag has the form <registry>/<repository>:<branch>-<variant>.
      - name: Build and push Docker image
        id: build-push
        uses: docker/build-push-action@v6
        with:
          platforms: linux/amd64
          push: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
          file: Dockerfile.${{ matrix.variant }}
          tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.BRANCH_NAME }}-${{ matrix.variant }}
          labels: ${{ steps.meta.outputs.labels }}
          annotations: ${{ steps.meta.outputs.annotations }}
  macOS-latest-cmake:
    runs-on: macos-latest
@ -63,9 +225,8 @@ jobs:
      - name: Dependencies
        id: depends
        continue-on-error: true
        run: |
-          brew update
+          brew install zip
      - name: Build
        id: cmake_build
@ -73,30 +234,59 @@ jobs:
          sysctl -a
          mkdir build
          cd build
-          cmake ..
+          cmake .. -DGGML_AVX2=ON -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" -DSD_BUILD_SHARED_LIBS=ON
          cmake --build . --config Release
-      #- name: Test
+      - name: Get commit hash
-        #id: cmake_test
+        id: commit
-        #run: |
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-          #cd build
+        uses: pr-mpt/actions-commit-hash@v2
-          #ctest --verbose --timeout 900
+
      - name: Fetch system info
        id: system-info
        run: |
          echo "CPU_ARCH=`uname -m`" >> "$GITHUB_OUTPUT"
          echo "OS_NAME=`sw_vers -productName`" >> "$GITHUB_OUTPUT"
          echo "OS_VERSION=`sw_vers -productVersion`" >> "$GITHUB_OUTPUT"
          echo "OS_TYPE=`uname -s`" >> "$GITHUB_OUTPUT"
      - name: Pack artifacts
        id: pack_artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
          cp ggml/LICENSE ./build/bin/ggml.txt
          cp LICENSE ./build/bin/stable-diffusion.cpp.txt
          zip -j sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip ./build/bin/*
      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        uses: actions/upload-artifact@v4
        with:
          name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
          path: |
            sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
  windows-latest-cmake:
-    runs-on: windows-latest
+    runs-on: windows-2022
    env:
      VULKAN_VERSION: 1.4.328.1
    strategy:
      matrix:
        include:
-          - build: 'noavx'
+          - build: "noavx"
-            defines: '-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
+            defines: "-DGGML_NATIVE=OFF -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DSD_BUILD_SHARED_LIBS=ON"
-          - build: 'avx2'
+          - build: "avx2"
-            defines: '-DGGML_AVX2=ON'
+            defines: "-DGGML_NATIVE=OFF -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
-          - build: 'avx'
+          - build: "avx"
-            defines: '-DGGML_AVX2=OFF'
+            defines: "-DGGML_NATIVE=OFF -DGGML_AVX=ON -DGGML_AVX2=OFF -DSD_BUILD_SHARED_LIBS=ON"
-          - build: 'avx512'
+          - build: "avx512"
-            defines: '-DGGML_AVX512=ON'
+            defines: "-DGGML_NATIVE=OFF -DGGML_AVX512=ON -DGGML_AVX=ON -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
-
+          - build: "cuda12"
            defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES='61;70;75;80;86;89;90;100;120' -DCMAKE_CUDA_FLAGS='-Xcudafe \"--diag_suppress=177\" -Xcudafe \"--diag_suppress=550\"'"
          - build: "vulkan"
            defines: "-DSD_VULKAN=ON -DSD_BUILD_SHARED_LIBS=ON"
    steps:
      - name: Clone
        id: checkout
@ -104,13 +294,35 @@ jobs:
        with:
          submodules: recursive
      - name: Install cuda-toolkit
        id: cuda-toolkit
        if: ${{ matrix.build == 'cuda12' }}
        uses: Jimver/cuda-toolkit@v0.2.22
        with:
          cuda: "12.8.1"
          method: "network"
          sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
      - name: Install Vulkan SDK
        id: get_vulkan
        if: ${{ matrix.build == 'vulkan' }}
        run: |
          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
      - name: Activate MSVC environment
        id: msvc_dev_cmd
        uses: ilammy/msvc-dev-cmd@v1
      - name: Build
        id: cmake_build
        run: |
          mkdir build
          cd build
-          cmake .. ${{ matrix.defines }}
+          cmake .. -DCMAKE_CXX_FLAGS='/bigobj' -G Ninja -DCMAKE_C_COMPILER=cl.exe -DCMAKE_CXX_COMPILER=cl.exe -DCMAKE_BUILD_TYPE=Release ${{ matrix.defines }}
-          cmake --build . --config Release
+          cmake --build .
      - name: Check AVX512F support
        id: check_avx512f
@ -125,12 +337,6 @@ jobs:
          & $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main
          .\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO"
      #- name: Test
        #id: cmake_test
        #run: |
          #cd build
          #ctest -C Release --verbose --timeout 900
      - name: Get commit hash
        id: commit
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@ -140,17 +346,145 @@ jobs:
        id: pack_artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
          $filePath = ".\build\bin\Release\*"
          if (Test-Path $filePath) {
            echo "Exists at path $filePath"
            Copy-Item ggml/LICENSE .\build\bin\Release\ggml.txt
            Copy-Item LICENSE .\build\bin\Release\stable-diffusion.cpp.txt
-          7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
+          } elseif (Test-Path ".\build\bin\stable-diffusion.dll") {
          $filePath = ".\build\bin\*"
            echo "Exists at path $filePath"
            Copy-Item ggml/LICENSE .\build\bin\ggml.txt
            Copy-Item LICENSE .\build\bin\stable-diffusion.cpp.txt
          } else {
            ls .\build\bin
            throw "Can't find stable-diffusion.dll"
          }
          7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip $filePath
      - name: Copy and pack Cuda runtime
        id: pack_cuda_runtime
        if: ${{ matrix.build == 'cuda12' && (github.event_name == 'push' && github.ref == 'refs/heads/master' || github.event.inputs.create_release == 'true') }}
        run: |
          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
          $dst='.\build\bin\cudart\'
          robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
          7z a cudart-sd-bin-win-cu12-x64.zip $dst\*
      - name: Upload Cuda runtime
        if: ${{ matrix.build == 'cuda12' && (github.event_name == 'push' && github.ref == 'refs/heads/master' || github.event.inputs.create_release == 'true') }}
        uses: actions/upload-artifact@v4
        with:
          name: sd-cudart-sd-bin-win-cu12-x64.zip
          path: |
            cudart-sd-bin-win-cu12-x64.zip
      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
        with:
          name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
          path: |
            sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
  # Windows build against the AMD HIP SDK (-DSD_HIPBLAS=ON). The ROCm install
  # directory is cached per installer version; hashing, packing and uploading
  # only run for pushes to master or when `create_release` is set.
  windows-latest-cmake-hip:
    runs-on: windows-2022
    env:
      # Installer release; used in the download URL and in both cache keys.
      HIPSDK_INSTALLER_VERSION: "25.Q3"
      # Passed to CMake as -DGPU_TARGETS in the Build step.
      GPU_TARGETS: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: recursive
      # Restores C:\Program Files\AMD\ROCm when the key matches; the install
      # step below is skipped on a cache hit.
      - name: Cache ROCm Installation
        id: cache-rocm
        uses: actions/cache@v4
        with:
          path: C:\Program Files\AMD\ROCm
          key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: windows-latest-cmake-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-x64
          evict-old-files: 1d
      # Downloads the installer and runs it with `-install`. The step fails if
      # the process has not exited within 600000 ms or exits with a non-zero code.
      - name: Install ROCm
        if: steps.cache-rocm.outputs.cache-hit != 'true'
        run: |
          $ErrorActionPreference = "Stop"
          write-host "Downloading AMD HIP SDK Installer"
          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
          write-host "Installing AMD HIP SDK"
          $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
          $completed = $proc.WaitForExit(600000)
          if (-not $completed) {
              Write-Error "ROCm installation timed out after 10 minutes. Killing the process"
              $proc.Kill()
              exit 1
          }
          if ($proc.ExitCode -ne 0) {
              Write-Error "ROCm installation failed with exit code $($proc.ExitCode)"
              exit 1
          }
          write-host "Completed AMD HIP SDK installation"
      # Runs on every build (cached or fresh): checks that clang.exe exists under
      # the ROCm directory and exports HIP_PATH (two levels above clang.exe) for
      # the following steps.
      - name: Verify ROCm
        run: |
          # Find and test ROCm installation
          $clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
          if (-not $clangPath) {
            Write-Error "ROCm installation not found"
            exit 1
          }
          & $clangPath.FullName --version
          # Set HIP_PATH environment variable for later steps
          echo "HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)" >> $env:GITHUB_ENV
      # Configures with the "Unix Makefiles" generator and clang/clang++, with
      # CMAKE_PREFIX_PATH pointing at HIP_PATH.
      - name: Build
        run: |
          mkdir build
          cd build
          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
          cmake .. `
            -G "Unix Makefiles" `
            -DSD_HIPBLAS=ON `
            -DSD_BUILD_SHARED_LIBS=ON `
            -DGGML_NATIVE=OFF `
            -DCMAKE_C_COMPILER=clang `
            -DCMAKE_CXX_COMPILER=clang++ `
            -DCMAKE_BUILD_TYPE=Release `
            -DGPU_TARGETS="${{ env.GPU_TARGETS }}"
          cmake --build . --config Release --parallel ${env:NUMBER_OF_PROCESSORS}
      - name: Get commit hash
        id: commit
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        uses: pr-mpt/actions-commit-hash@v2
      # Copies the hipblas / hipblaslt / rocblas DLLs and their `library`
      # directories from HIP_PATH into build\bin, then archives build\bin with 7z.
      - name: Pack artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
          md "build\bin\rocblas\library\"
          md "build\bin\hipblaslt\library"
          cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
          cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\"
          cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
          cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
          cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\"
          7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip .\build\bin\*
      # The artifact name and path must match the archive name produced above.
      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        uses: actions/upload-artifact@v4
        with:
          name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
          path: |
            sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
  release:
    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@ -158,13 +492,30 @@ jobs:
    needs:
      - ubuntu-latest-cmake
      - ubuntu-latest-cmake-vulkan
      - build-and-push-docker-images
      - macOS-latest-cmake
      - windows-latest-cmake
      - windows-latest-cmake-hip
    steps:
      - name: Clone
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Download artifacts
        id: download-artifact
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
        with:
          path: ./artifact
          pattern: sd-*
          merge-multiple: true
      - name: Get commit count
        id: commit_count
        run: |
          echo "count=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
      - name: Get commit hash
        id: commit
@ -172,14 +523,16 @@ jobs:
      - name: Create release
        id: create_release
        if: ${{ github.event_name == 'workflow_dispatch' || github.ref_name == 'master' }}
        uses: anzz1/action-create-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
-          tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}
+          tag_name: ${{ format('{0}-{1}-{2}', env.BRANCH_NAME, steps.commit_count.outputs.count, steps.commit.outputs.short) }}
      - name: Upload release
        id: upload_release
        if: ${{ github.event_name == 'workflow_dispatch' || github.ref_name == 'master' }}
        uses: actions/github-script@v3
        with:
          github-token: ${{secrets.GITHUB_TOKEN}}
--- a/.gitignore
+++ b/.gitignore
@ -1,14 +1,15 @@
 build*/
 cmake-build-*/
 test/
 .vscode/
 .idea/
 .cache/
 *.swp
 .vscode/
 *.bat
 *.bin
 *.exe
 *.gguf
 output*.png
 models*
 !taesd-model.gguf
 *.log
 preview.png
--- a/.gitmodules
+++ b/.gitmodules
@ -1,3 +1,3 @@
 [submodule "ggml"]
    path = ggml
-	url = https://github.com/leejet/ggml.git
+	url = https://github.com/ggml-org/ggml.git
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -8,6 +8,11 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
 endif()
 if (MSVC)
    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
    add_compile_definitions(_SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING)
 endif()
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
@ -24,50 +29,168 @@ endif()
 # general
 #option(SD_BUILD_TESTS                "sd: build tests"    ${SD_STANDALONE})
 option(SD_BUILD_EXAMPLES             "sd: build examples" ${SD_STANDALONE})
-option(SD_CUBLAS                     "sd: cuda backend" OFF)
+option(SD_CUDA                       "sd: cuda backend" OFF)
 option(SD_HIPBLAS                    "sd: rocm backend" OFF)
 option(SD_METAL                      "sd: metal backend" OFF)
-option(SD_FLASH_ATTN                 "sd: use flash attention for x4 less memory usage" OFF)
+option(SD_VULKAN                     "sd: vulkan backend" OFF)
 option(SD_OPENCL                     "sd: opencl backend" OFF)
 option(SD_SYCL                       "sd: sycl backend" OFF)
 option(SD_MUSA                       "sd: musa backend" OFF)
 option(SD_FAST_SOFTMAX               "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
-option(BUILD_SHARED_LIBS             "sd: build shared libs" OFF)
+option(SD_BUILD_SHARED_LIBS          "sd: build shared libs" OFF)
 option(SD_BUILD_SHARED_GGML_LIB      "sd: build ggml as a separate shared lib" OFF)
 option(SD_USE_SYSTEM_GGML            "sd: use system-installed GGML library" OFF)
 #option(SD_BUILD_SERVER               "sd: build server example"                           ON)
-if(SD_CUBLAS)
+if(SD_CUDA)
-	message("Use CUBLAS as backend stable-diffusion")
+    message("-- Use CUDA as backend stable-diffusion")
-    set(GGML_CUBLAS ON)
+    set(GGML_CUDA ON)
-    add_definitions(-DSD_USE_CUBLAS)
+    add_definitions(-DSD_USE_CUDA)
 endif()
 if(SD_METAL)
    message("-- Use Metal as backend stable-diffusion")
    set(GGML_METAL ON)
    add_definitions(-DSD_USE_METAL)
 endif()
 if (SD_VULKAN)
    message("-- Use Vulkan as backend stable-diffusion")
    set(GGML_VULKAN ON)
    add_definitions(-DSD_USE_VULKAN)
 endif ()
 if (SD_OPENCL)
    message("-- Use OpenCL as backend stable-diffusion")
    set(GGML_OPENCL ON)
    add_definitions(-DSD_USE_OPENCL)
 endif ()
 if (SD_HIPBLAS)
    message("-- Use HIPBLAS as backend stable-diffusion")
    set(GGML_HIP ON)
    add_definitions(-DSD_USE_CUDA)
    if(SD_FAST_SOFTMAX)
        set(GGML_CUDA_FAST_SOFTMAX ON)
    endif()
 endif ()
-if(SD_METAL)
+if(SD_MUSA)
-	message("Use Metal as backend stable-diffusion")
+    message("-- Use MUSA as backend stable-diffusion")
-    set(GGML_METAL ON)
+    set(GGML_MUSA ON)
-    add_definitions(-DSD_USE_METAL)
+    add_definitions(-DSD_USE_CUDA)
    if(SD_FAST_SOFTMAX)
        set(GGML_CUDA_FAST_SOFTMAX ON)
    endif()
 if(SD_FLASH_ATTN)
    message("Use Flash Attention for memory optimization")
    add_definitions(-DSD_USE_FLASH_ATTENTION)
 endif()
 set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
 # deps
 add_subdirectory(ggml)
 add_subdirectory(thirdparty)
 set(SD_LIB stable-diffusion)
-add_library(${SD_LIB} stable-diffusion.h stable-diffusion.cpp model.h model.cpp util.h util.cpp upscaler.cpp
+file(GLOB SD_LIB_SOURCES
-             ggml_extend.hpp clip.hpp common.hpp unet.hpp tae.hpp esrgan.hpp lora.hpp denoiser.hpp rng.hpp rng_philox.hpp)
+    "*.h"
    "*.cpp"
    "*.hpp"
 )
 find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
 if(GIT_EXE)
    execute_process(COMMAND ${GIT_EXE} describe --tags --abbrev=7 --dirty=+
        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
        OUTPUT_VARIABLE SDCPP_BUILD_VERSION
        OUTPUT_STRIP_TRAILING_WHITESPACE
        ERROR_QUIET
    )
    execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
        OUTPUT_VARIABLE SDCPP_BUILD_COMMIT
        OUTPUT_STRIP_TRAILING_WHITESPACE
        ERROR_QUIET
    )
 endif()
 if(NOT SDCPP_BUILD_VERSION)
    set(SDCPP_BUILD_VERSION unknown)
 endif()
 message(STATUS "stable-diffusion.cpp version ${SDCPP_BUILD_VERSION}")
 if(NOT SDCPP_BUILD_COMMIT)
    set(SDCPP_BUILD_COMMIT unknown)
 endif()
 message(STATUS "stable-diffusion.cpp commit ${SDCPP_BUILD_COMMIT}")
 set_property(
  SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/version.cpp
  APPEND PROPERTY COMPILE_DEFINITIONS
  SDCPP_BUILD_COMMIT=${SDCPP_BUILD_COMMIT} SDCPP_BUILD_VERSION=${SDCPP_BUILD_VERSION}
 )
 if(SD_BUILD_SHARED_LIBS)
    message("-- Build shared library")
    message(${SD_LIB_SOURCES})
    if(NOT SD_BUILD_SHARED_GGML_LIB)
        set(BUILD_SHARED_LIBS OFF)
    endif()
    add_library(${SD_LIB} SHARED ${SD_LIB_SOURCES})
    add_definitions(-DSD_BUILD_SHARED_LIB)
    target_compile_definitions(${SD_LIB} PRIVATE -DSD_BUILD_DLL)
    set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 else()
    message("-- Build static library")
    if(NOT SD_BUILD_SHARED_GGML_LIB)
        set(BUILD_SHARED_LIBS OFF)
    endif()
    add_library(${SD_LIB} STATIC ${SD_LIB_SOURCES})
 endif()
 if(SD_SYCL)
    message("-- Use SYCL as backend stable-diffusion")
    set(GGML_SYCL ON)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
    add_definitions(-DSD_USE_SYCL)
    # disable fast-math on host, see:
    # https://www.intel.com/content/www/us/en/docs/cpp-compiler/developer-guide-reference/2021-10/fp-model-fp.html
    if (WIN32)
        set(SYCL_COMPILE_OPTIONS /fp:precise)
    else()
        set(SYCL_COMPILE_OPTIONS -fp-model=precise)
    endif()
    message("-- Turn off fast-math for host in SYCL backend")
    target_compile_options(${SD_LIB} PRIVATE ${SYCL_COMPILE_OPTIONS})
 endif()
 set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
 if (NOT SD_USE_SYSTEM_GGML)
    # see https://github.com/ggerganov/ggml/pull/682
    add_definitions(-DGGML_MAX_NAME=128)
 endif()
 # deps
 # Only add ggml if it hasn't been added yet
 if (NOT TARGET ggml)
    if (SD_USE_SYSTEM_GGML)
        find_package(ggml REQUIRED)
        if (NOT ggml_FOUND)
            message(FATAL_ERROR "System-installed GGML library not found.")
        endif()
        add_library(ggml ALIAS ggml::ggml)
    else()
        add_subdirectory(ggml)
    endif()
 endif()
 add_subdirectory(thirdparty)
 target_link_libraries(${SD_LIB} PUBLIC ggml zip)
 target_include_directories(${SD_LIB} PUBLIC . thirdparty)
-target_compile_features(${SD_LIB} PUBLIC cxx_std_11)
+target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17)
 if (SD_BUILD_EXAMPLES)
    add_subdirectory(examples)
 endif()
 set(SD_PUBLIC_HEADERS stable-diffusion.h)
 set_target_properties(${SD_LIB} PROPERTIES PUBLIC_HEADER "${SD_PUBLIC_HEADERS}")
 install(TARGETS ${SD_LIB} LIBRARY PUBLIC_HEADER)
--- a/Dockerfile
+++ b/Dockerfile
@ -1,17 +1,23 @@
-ARG UBUNTU_VERSION=22.04
+ARG UBUNTU_VERSION=24.04
-FROM ubuntu:$UBUNTU_VERSION as build
+FROM ubuntu:$UBUNTU_VERSION AS build
-RUN apt-get update && apt-get install -y build-essential git cmake
+RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake
 WORKDIR /sd.cpp
 COPY . .
-RUN mkdir build && cd build && cmake .. && cmake --build . --config Release
+RUN cmake . -B ./build
 RUN cmake --build ./build --config Release --parallel
-FROM ubuntu:$UBUNTU_VERSION as runtime
+FROM ubuntu:$UBUNTU_VERSION AS runtime
-COPY --from=build /sd.cpp/build/bin/sd /sd
+RUN apt-get update && \
    apt-get install --yes --no-install-recommends libgomp1 && \
    apt-get clean
-ENTRYPOINT [ "/sd" ]
+COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
 COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
 ENTRYPOINT [ "/sd-cli" ]
--- a/Dockerfile.musa
+++ b/Dockerfile.musa
@ -0,0 +1,24 @@
 # Base-image tag components; both can be overridden with --build-arg.
 # They are declared before the first FROM so that they can be used in the
 # FROM lines of both stages.
 ARG MUSA_VERSION=rc4.2.0
 ARG UBUNTU_VERSION=22.04
 # Build stage: compile with clang/clang++ in Release mode with -DSD_MUSA=ON.
 # The stage alias keyword is written as uppercase AS to match FROM
 # (avoids Docker's FromAsCasing build-check warning).
 FROM mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64 AS build
 RUN apt-get update && apt-get install -y ccache cmake git
 WORKDIR /sd.cpp
 COPY . .
 # The C and C++ flags enable OpenMP and add the LLVM 14 clang include and
 # library directories explicitly.
 RUN mkdir build && cd build && \
    cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ \
        -DCMAKE_C_FLAGS="${CMAKE_C_FLAGS} -fopenmp -I/usr/lib/llvm-14/lib/clang/14.0.0/include -L/usr/lib/llvm-14/lib" \
        -DCMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fopenmp -I/usr/lib/llvm-14/lib/clang/14.0.0/include -L/usr/lib/llvm-14/lib" \
        -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release && \
    cmake --build . --config Release
 # Runtime stage: only the two built executables are copied out of the build
 # stage; sd-cli is the container entrypoint.
 FROM mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64 AS runtime
 COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
 COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
 ENTRYPOINT [ "/sd-cli" ]
--- a/Dockerfile.sycl
+++ b/Dockerfile.sycl
@ -0,0 +1,20 @@
 # oneAPI base-kit image version; can be overridden with --build-arg.
 # Declared before the first FROM so that both stages can use it.
 ARG SYCL_VERSION=2025.1.0-0
 # Build stage: configure with the icx/icpx compilers and -DSD_SYCL=ON,
 # then build in Release mode using all available cores.
 FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS build
 RUN apt-get update && apt-get install -y cmake
 WORKDIR /sd.cpp
 COPY . .
 RUN mkdir build && cd build && \
    cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DSD_SYCL=ON -DCMAKE_BUILD_TYPE=Release && \
    cmake --build . --config Release -j$(nproc)
 # Runtime stage: starts from the same devel image as the build stage
 # (NOTE(review): presumably to keep the oneAPI runtime libraries available —
 # confirm) and copies in only the two built executables.
 FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS runtime
 COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
 COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
 ENTRYPOINT [ "/sd-cli" ]
--- a/Dockerfile.vulkan
+++ b/Dockerfile.vulkan
@ -0,0 +1,23 @@
 # Ubuntu release used for both stages; can be overridden with --build-arg.
 ARG UBUNTU_VERSION=24.04
 # Build stage: install the toolchain plus the Vulkan development package and
 # the glslc shader compiler, then configure with -DSD_VULKAN=ON and build.
 FROM ubuntu:$UBUNTU_VERSION AS build
 RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake libvulkan-dev glslc
 WORKDIR /sd.cpp
 COPY . .
 RUN cmake . -B ./build -DSD_VULKAN=ON
 RUN cmake --build ./build --config Release --parallel
 # Runtime stage: a fresh Ubuntu image with only the shared libraries
 # installed below (libgomp1, libvulkan1, mesa-vulkan-drivers) and the two
 # executables copied from the build stage.
 FROM ubuntu:$UBUNTU_VERSION AS runtime
 RUN apt-get update && \
    apt-get install --yes --no-install-recommends libgomp1 libvulkan1 mesa-vulkan-drivers && \
    apt-get clean
 COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
 COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
 ENTRYPOINT [ "/sd-cli" ]
--- a/README.md
+++ b/README.md
@ -1,35 +1,88 @@
 <p align="center">
-  <img src="./assets/a%20lovely%20cat.png" width="256x">
+  <img src="./assets/logo.png" width="360x">
 </p>
 # stable-diffusion.cpp
-Inference of [Stable Diffusion](https://github.com/CompVis/stable-diffusion) in pure C/C++
+<div align="center">
 <a href="https://trendshift.io/repositories/9714" target="_blank"><img src="https://trendshift.io/api/badge/repositories/9714" alt="leejet%2Fstable-diffusion.cpp | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
 </div>
 Diffusion model(SD,Flux,Wan,...) inference in pure C/C++
 ***Note that this project is under active development. \
 The API and command-line options may change frequently.***
 ## 🔥Important News
 * **2026/01/18** 🚀 stable-diffusion.cpp now supports **FLUX.2-klein**  
  👉 Details: [PR #1193](https://github.com/leejet/stable-diffusion.cpp/pull/1193)
 * **2025/12/01** 🚀 stable-diffusion.cpp now supports **Z-Image**  
  👉 Details: [PR #1020](https://github.com/leejet/stable-diffusion.cpp/pull/1020)
 * **2025/11/30** 🚀 stable-diffusion.cpp now supports **FLUX.2-dev**  
  👉 Details: [PR #1016](https://github.com/leejet/stable-diffusion.cpp/pull/1016)
 * **2025/10/13** 🚀 stable-diffusion.cpp now supports **Qwen-Image-Edit / Qwen-Image-Edit 2509**  
  👉 Details: [PR #877](https://github.com/leejet/stable-diffusion.cpp/pull/877)
 * **2025/10/12** 🚀 stable-diffusion.cpp now supports **Qwen-Image**  
  👉 Details: [PR #851](https://github.com/leejet/stable-diffusion.cpp/pull/851)
 * **2025/09/14** 🚀 stable-diffusion.cpp now supports **Wan2.1 Vace**  
  👉 Details: [PR #819](https://github.com/leejet/stable-diffusion.cpp/pull/819)
 * **2025/09/06** 🚀 stable-diffusion.cpp now supports **Wan2.1 / Wan2.2**  
  👉 Details: [PR #778](https://github.com/leejet/stable-diffusion.cpp/pull/778)
 ## Features
- Plain C/C++ implementation based on [ggml](https://github.com/ggerganov/ggml), working in the same way as [llama.cpp](https://github.com/ggerganov/llama.cpp)
+- Plain C/C++ implementation based on [ggml](https://github.com/ggml-org/ggml), working in the same way as [llama.cpp](https://github.com/ggml-org/llama.cpp)
 - Super lightweight and without external dependencies
- SD1.x, SD2.x and SDXL support
+- Supported models
-    - !!!The VAE in SDXL encounters NaN issues under FP16, but unfortunately, the ggml_conv_2d only operates under FP16. Hence, a parameter is needed to specify the VAE that has fixed the FP16 NaN issue. You can find it here: [SDXL VAE FP16 Fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors).
+  - Image Models
-
+    - SD1.x, SD2.x, [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo)
- [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo) and [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo) support
+    - SDXL, [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo)
- 16-bit, 32-bit float support
+    - [Some SD1.x and SDXL distilled models](./docs/distilled_sd.md)
- 4-bit, 5-bit and 8-bit integer quantization support
+    - [SD3/SD3.5](./docs/sd3.md)
- Accelerated memory-efficient CPU inference
+    - [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
-    - Only requires ~2.3GB when using txt2img with fp16 precision to generate a 512x512 image, enabling Flash Attention just requires ~1.8GB.
+    - [FLUX.2-dev/FLUX.2-klein](./docs/flux2.md)
- AVX, AVX2 and AVX512 support for x86 architectures
+    - [Chroma](./docs/chroma.md)
- Full CUDA and Metal backend for GPU acceleration.
+    - [Chroma1-Radiance](./docs/chroma_radiance.md)
- Can load ckpt, safetensors and diffusers models/checkpoints. Standalone VAEs models
+    - [Qwen Image](./docs/qwen_image.md)
-    - No need to convert to `.ggml` or `.gguf` anymore!
+    - [Z-Image](./docs/z_image.md)
- Flash Attention for memory usage optimization (only cpu for now)
+    - [Ovis-Image](./docs/ovis_image.md)
- Original `txt2img` and `img2img` mode
+  - Image Edit Models
- Negative prompt
+    - [FLUX.1-Kontext-dev](./docs/kontext.md)
- [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) style tokenizer (not all the features, only token weighting for now)
+    - [Qwen Image Edit series](./docs/qwen_image_edit.md)
  - Video Models
    - [Wan2.1/Wan2.2](./docs/wan.md)
  - [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
  - Control Net support with SD 1.5
  - LoRA support, same as [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora)
  - Latent Consistency Models support (LCM/LCM-LoRA)
  - Faster and memory efficient latent decoding with [TAESD](https://github.com/madebyollin/taesd)
  - Upscale images generated with [ESRGAN](https://github.com/xinntao/Real-ESRGAN)
 - Supported backends
  - CPU (AVX, AVX2 and AVX512 support for x86 architectures)
  - CUDA
  - Vulkan
  - Metal
  - OpenCL
  - SYCL
 - Supported weight formats
  - Pytorch checkpoint (`.ckpt` or `.pth`)
  - Safetensors (`.safetensors`)
  - GGUF (`.gguf`)
 - Supported platforms
    - Linux
    - Mac OS
    - Windows
    - Android (via Termux, [Local Diffusion](https://github.com/rmatif/Local-Diffusion))
 - Flash Attention for memory usage optimization
 - Negative prompt
 - [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) style tokenizer (not all the features, only token weighting for now)
 - VAE tiling processing to reduce memory usage
 - Sampling method
    - `Euler A`
@ -40,256 +93,84 @@ Inference of [Stable Diffusion](https://github.com/CompVis/stable-diffusion) in
    - [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457)
    - `DPM++ 2S a`
    - [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952)
- Cross-platform reproducibility (`--rng cuda`, consistent with the `stable-diffusion-webui GPU RNG`)
+- Cross-platform reproducibility
    - `--rng cuda`, default, consistent with the `stable-diffusion-webui GPU RNG`
    - `--rng cpu`, consistent with the `comfyui RNG`
 - Embeds generation parameters into PNG output as a webui-compatible text string
 - Supported platforms
    - Linux
    - Mac OS
    - Windows
    - Android (via Termux)
-### TODO
+## Quick Start
- [ ] More sampling methods
+### Get the sd executable
 - [ ] Make inference faster
    - The current implementation of ggml_conv_2d is slow and has high memory usage
    - Implement Winograd Convolution 2D for 3x3 kernel filtering
 - [ ] Continuing to reduce memory usage (quantizing the weights of ggml_conv_2d)
 - [ ] Implement Textual Inversion (embeddings)
 - [ ] Implement Inpainting support
 - [ ] k-quants support
-## Usage
+- Download pre-built binaries from the [releases page](https://github.com/leejet/stable-diffusion.cpp/releases)
 - Or build from source by following the [build guide](./docs/build.md)
-### Get the Code
+### Download model weights
-```
+- Download weights (.ckpt, .safetensors, or .gguf). For example:
-git clone --recursive https://github.com/leejet/stable-diffusion.cpp
+    - Stable Diffusion v1.5 from https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5 
 cd stable-diffusion.cpp
 ```
 - If you have already cloned the repository, you can use the following command to update the repository to the latest code.
 ```
 cd stable-diffusion.cpp
 git pull origin master
 git submodule init
 git submodule update
 ```
 ### Download weights
 - download original weights(.ckpt or .safetensors). For example
    - Stable Diffusion v1.4 from https://huggingface.co/CompVis/stable-diffusion-v-1-4-original
    - Stable Diffusion v1.5 from https://huggingface.co/runwayml/stable-diffusion-v1-5
    - Stable Diffusion v2.1 from https://huggingface.co/stabilityai/stable-diffusion-2-1
    ```shell
    curl -L -O https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt
    # curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
    # curl -L -O https://huggingface.co/stabilityai/stable-diffusion-2-1/resolve/main/v2-1_768-nonema-pruned.safetensors
    ```
 ### Build
 #### Build from scratch
 ```shell
 mkdir build
 cd build
 cmake ..
 cmake --build . --config Release
 ```
 ##### Using OpenBLAS
 ```
 cmake .. -DGGML_OPENBLAS=ON
 cmake --build . --config Release
 ```
 ##### Using CUBLAS
 This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). Recommended to have at least 4 GB of VRAM.
 ```
 cmake .. -DSD_CUBLAS=ON
 cmake --build . --config Release
 ```
 ##### Using Metal
 Using Metal makes the computation run on the GPU. Currently, there are some issues with Metal when performing operations on very large matrices, making it highly inefficient at the moment. Performance improvements are expected in the near future.
 ```
 cmake .. -DSD_METAL=ON
 cmake --build . --config Release
 ```
 ### Using Flash Attention
 Enabling flash attention reduces memory usage by at least 400 MB. At the moment, it is not supported when CUBLAS is enabled because the kernel implementation is missing.
 ```
 cmake .. -DSD_FLASH_ATTN=ON
 cmake --build . --config Release
 ```
 ### Run
 ```
 usage: ./bin/sd [arguments]
 arguments:
  -h, --help                         show this help message and exit
  -M, --mode [txt2img or img2img]    generation mode (default: txt2img)
  -t, --threads N                    number of threads to use during computation (default: -1).
                                     If threads <= 0, then threads will be set to the number of CPU physical cores
  -m, --model [MODEL]                path to model
  --vae [VAE]                        path to vae
  --taesd [TAESD_PATH]               path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
  --upscale-model [ESRGAN_PATH]      path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now.
  --type [TYPE]                      weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0)
                                     If not specified, the default is the type of the weight file.
  --lora-model-dir [DIR]             lora model directory
  -i, --init-img [IMAGE]             path to the input image, required by img2img
  -o, --output OUTPUT                path to write result image to (default: ./output.png)
  -p, --prompt [PROMPT]              the prompt to render
  -n, --negative-prompt PROMPT       the negative prompt (default: "")
  --cfg-scale SCALE                  unconditional guidance scale: (default: 7.0)
  --strength STRENGTH                strength for noising/unnoising (default: 0.75)
                                     1.0 corresponds to full destruction of information in init image
  -H, --height H                     image height, in pixel space (default: 512)
  -W, --width W                      image width, in pixel space (default: 512)
  --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, lcm}
                                     sampling method (default: "euler_a")
  --steps  STEPS                     number of sample steps (default: 20)
  --rng {std_default, cuda}          RNG (default: cuda)
  -s SEED, --seed SEED               RNG seed (default: 42, use random seed for < 0)
  -b, --batch-count COUNT            number of images to generate.
  --schedule {discrete, karras}      Denoiser sigma schedule (default: discrete)
  --clip-skip N                      number of layers to skip of clip model (default: 0)
  --vae-tiling                       process vae in tiles to reduce memory usage
  -v, --verbose                      print extra info
 ```
 #### Quantization
 You can specify the model weight type using the `--type` parameter. The weights are automatically converted when loading the model.
 - `f16` for 16-bit floating-point
 - `f32` for 32-bit floating-point
 - `q8_0` for 8-bit integer quantization
 - `q5_0` or `q5_1` for 5-bit integer quantization
 - `q4_0` or `q4_1` for 4-bit integer quantization
 #### txt2img example
    ```sh
-./bin/sd -m ../models/sd-v1-4.ckpt -p "a lovely cat"
+    curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
 # ./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
 # ./bin/sd -m ../models/sd_xl_base_1.0.safetensors --vae ../models/sdxl_vae-fp16-fix.safetensors -H 1024 -W 1024 -p "a lovely cat" -v
    ```
-Using formats of different precisions will yield results of varying quality.
+### Generate an image with just one command
-| f32  | f16  |q8_0  |q5_0  |q5_1  |q4_0  |q4_1  |
+```sh
-| ----  |----  |----  |----  |----  |----  |----  |
+./bin/sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
 | ![](./assets/f32.png) |![](./assets/f16.png) |![](./assets/q8_0.png) |![](./assets/q5_0.png) |![](./assets/q5_1.png) |![](./assets/q4_0.png) |![](./assets/q4_1.png) |
 #### img2img example
 - `./output.png` is the image generated from the above txt2img pipeline
 ```
 ./bin/sd --mode img2img -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
 ```
-<p align="center">
+***For detailed command-line arguments, check out [cli doc](./examples/cli/README.md).***
  <img src="./assets/img2img_output.png" width="256x">
 </p>
-#### with LoRA
+## Performance
- You can specify the directory where the lora weights are stored via `--lora-model-dir`. If not specified, the default is the current working directory.
+If you want to improve performance or reduce VRAM/RAM usage, please refer to [performance guide](./docs/performance.md).
- LoRA is specified via prompt, just like [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora).
+## More Guides
-Here's a simple example:
+- [SD1.x/SD2.x/SDXL](./docs/sd.md)
 - [SD3/SD3.5](./docs/sd3.md)
 - [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
 - [FLUX.2-dev/FLUX.2-klein](./docs/flux2.md)
 - [FLUX.1-Kontext-dev](./docs/kontext.md)
 - [Chroma](./docs/chroma.md)
 - [🔥Qwen Image](./docs/qwen_image.md)
 - [🔥Qwen Image Edit series](./docs/qwen_image_edit.md)
 - [🔥Wan2.1/Wan2.2](./docs/wan.md)
 - [🔥Z-Image](./docs/z_image.md)
 - [Ovis-Image](./docs/ovis_image.md)
 - [LoRA](./docs/lora.md)
 - [LCM/LCM-LoRA](./docs/lcm.md)
 - [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)
 - [Using ESRGAN to upscale results](./docs/esrgan.md)
 - [Using TAESD for faster decoding](./docs/taesd.md)
 - [Docker](./docs/docker.md)
 - [Quantization and GGUF](./docs/quantization_and_gguf.md)
 - [Inference acceleration via caching](./docs/caching.md)
-```
+## Bindings
 ./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:marblesh:1>" --lora-model-dir ../models
 ```
-`../models/marblesh.safetensors` or `../models/marblesh.ckpt` will be applied to the model
+These projects wrap `stable-diffusion.cpp` for easier use in other languages/frameworks.
-#### LCM/LCM-LoRA
+* Golang (non-cgo): [seasonjs/stable-diffusion](https://github.com/seasonjs/stable-diffusion)
 * Golang (cgo): [Binozo/GoStableDiffusion](https://github.com/Binozo/GoStableDiffusion)
 * C#: [DarthAffe/StableDiffusion.NET](https://github.com/DarthAffe/StableDiffusion.NET)
 * Python: [william-murray1204/stable-diffusion-cpp-python](https://github.com/william-murray1204/stable-diffusion-cpp-python)
 * Rust: [newfla/diffusion-rs](https://github.com/newfla/diffusion-rs)
 * Flutter/Dart: [rmatif/Local-Diffusion](https://github.com/rmatif/Local-Diffusion)
- Download LCM-LoRA form https://huggingface.co/latent-consistency/lcm-lora-sdv1-5
+## UIs
 - Specify LCM-LoRA by adding `<lora:lcm-lora-sdv1-5:1>` to prompt
 - It's advisable to set `--cfg-scale` to `1.0` instead of the default `7.0`. For `--steps`, a range of `2-8` steps is recommended. For `--sampling-method`, `lcm`/`euler_a` is recommended.
-Here's a simple example:
+These projects use `stable-diffusion.cpp` as a backend for their image generation.
-```
+- [Jellybox](https://jellybox.com)
-./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:lcm-lora-sdv1-5:1>" --steps 4 --lora-model-dir ../models -v --cfg-scale 1
+- [Stable Diffusion GUI](https://github.com/fszontagh/sd.cpp.gui.wx)
-```
+- [Stable Diffusion CLI-GUI](https://github.com/piallai/stable-diffusion.cpp)
-
+- [Local Diffusion](https://github.com/rmatif/Local-Diffusion)
-| without LCM-LoRA (--cfg-scale 7)  | with LCM-LoRA (--cfg-scale 1)  |
+- [sd.cpp-webui](https://github.com/daniandtheweb/sd.cpp-webui)
-| ----  |----    |
+- [LocalAI](https://github.com/mudler/LocalAI)
-| ![](./assets/without_lcm.png) |![](./assets/with_lcm.png)  |
+- [Neural-Pixel](https://github.com/Luiz-Alcantara/Neural-Pixel)
-
+- [KoboldCpp](https://github.com/LostRuins/koboldcpp)
 ## Using TAESD for faster decoding
 You can use TAESD to accelerate the decoding of latent images by following these steps:
 - Download the model [weights](https://huggingface.co/madebyollin/taesd/blob/main/diffusion_pytorch_model.safetensors).
 Or curl
 ```bash
 curl -L -O https://huggingface.co/madebyollin/taesd/blob/main/diffusion_pytorch_model.safetensors
 ```
 - Specify the model path using the `--taesd PATH` parameter. example:
 ```bash
 sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --taesd ../models/diffusion_pytorch_model.safetensors
 ```
 ## Using ESRGAN to upscale results
 You can use ESRGAN to upscale the generated images. At the moment, only the [RealESRGAN_x4plus_anime_6B.pth](https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth) model is supported. Support for more models of this architecture will be added soon.
 - Specify the model path using the `--upscale-model PATH` parameter. example:
 ```bash
 sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --upscale-model ../models/RealESRGAN_x4plus_anime_6B.pth
 ```
 ### Docker
 #### Building using Docker
 ```shell
 docker build -t sd .
 ```
 #### Run
 ```shell
 docker run -v /path/to/models:/models -v /path/to/output/:/output sd [args...]
 # For example
 # docker run -v ./models:/models -v ./build:/output sd -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
 ```
 ## Memory Requirements
 | precision | f32  | f16  |q8_0  |q5_0  |q5_1  |q4_0  |q4_1  |
 | ----         | ----  |----  |----  |----  |----  |----  |----  |
 |  **Memory** (txt2img - 512 x 512) | ~2.8G | ~2.3G | ~2.1G | ~2.0G | ~2.0G | ~2.0G | ~2.0G |
 |  **Memory** (txt2img - 512 x 512) *with Flash Attention* | ~2.4G | ~1.9G | ~1.6G | ~1.5G | ~1.5G | ~1.5G | ~1.5G |
 ## Contributors
@ -297,12 +178,22 @@ Thank you to all the people who have already contributed to stable-diffusion.cpp
 [![Contributors](https://contrib.rocks/image?repo=leejet/stable-diffusion.cpp)](https://github.com/leejet/stable-diffusion.cpp/graphs/contributors)
 ## Star History
 [![Star History Chart](https://api.star-history.com/svg?repos=leejet/stable-diffusion.cpp&type=Date)](https://star-history.com/#leejet/stable-diffusion.cpp&Date)
 ## References
- [ggml](https://github.com/ggerganov/ggml)
+- [ggml](https://github.com/ggml-org/ggml)
 - [diffusers](https://github.com/huggingface/diffusers)
 - [stable-diffusion](https://github.com/CompVis/stable-diffusion)
 - [sd3-ref](https://github.com/Stability-AI/sd3-ref)
 - [stable-diffusion-stability-ai](https://github.com/Stability-AI/stablediffusion)
 - [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui)
 - [ComfyUI](https://github.com/comfyanonymous/ComfyUI)
 - [k-diffusion](https://github.com/crowsonkb/k-diffusion)
 - [latent-consistency-model](https://github.com/luosiallen/latent-consistency-model)
 - [generative-models](https://github.com/Stability-AI/generative-models/)
 - [PhotoMaker](https://github.com/TencentARC/PhotoMaker)
 - [Wan2.1](https://github.com/Wan-Video/Wan2.1)
 - [Wan2.2](https://github.com/Wan-Video/Wan2.2)
--- a/assets/cat_with_sd_cpp_20184.png
+++ b/assets/cat_with_sd_cpp_20184.png
--- a/assets/cat_with_sd_cpp_42.png
+++ b/assets/cat_with_sd_cpp_42.png
--- a/assets/control.png
+++ b/assets/control.png
--- a/assets/control_2.png
+++ b/assets/control_2.png
--- a/assets/control_3.png
+++ b/assets/control_3.png
--- a/assets/flux/chroma1-radiance.png
+++ b/assets/flux/chroma1-radiance.png
--- a/assets/flux/chroma_v40.png
+++ b/assets/flux/chroma_v40.png
--- a/assets/flux/flux1-dev-q2_k.png
+++ b/assets/flux/flux1-dev-q2_k.png
--- a/assets/flux/flux1-dev-q3_k.png
+++ b/assets/flux/flux1-dev-q3_k.png
--- a/assets/flux/flux1-dev-q4_0.png
+++ b/assets/flux/flux1-dev-q4_0.png
--- a/assets/flux/flux1-dev-q4_k.png
+++ b/assets/flux/flux1-dev-q4_k.png
--- a/assets/flux/flux1-dev-q8_0
+++ b/assets/flux/flux1-dev-q8_0
--- a/assets/flux/flux1-dev-q8_0.png
+++ b/assets/flux/flux1-dev-q8_0.png
--- a/assets/flux/flux1-schnell-q8_0.png
+++ b/assets/flux/flux1-schnell-q8_0.png
--- a/assets/flux/kontext1_dev_output.png
+++ b/assets/flux/kontext1_dev_output.png
--- a/assets/flux2/example.png
+++ b/assets/flux2/example.png
--- a/assets/flux2/flux2-klein-4b-edit.png
+++ b/assets/flux2/flux2-klein-4b-edit.png
--- a/assets/flux2/flux2-klein-4b.png
+++ b/assets/flux2/flux2-klein-4b.png
--- a/assets/flux2/flux2-klein-9b-edit.png
+++ b/assets/flux2/flux2-klein-9b-edit.png
--- a/assets/flux2/flux2-klein-9b.png
+++ b/assets/flux2/flux2-klein-9b.png
--- a/assets/flux2/flux2-klein-base-4b.png
+++ b/assets/flux2/flux2-klein-base-4b.png
--- a/assets/flux2/flux2-klein-base-9b.png
+++ b/assets/flux2/flux2-klein-base-9b.png
--- a/assets/logo.png
+++ b/assets/logo.png
--- a/assets/ovis_image/example.png
+++ b/assets/ovis_image/example.png
--- a/assets/photomaker_examples/lenna_woman/lenna.jpg
+++ b/assets/photomaker_examples/lenna_woman/lenna.jpg
--- a/assets/photomaker_examples/newton_man/newton_0.jpg
+++ b/assets/photomaker_examples/newton_man/newton_0.jpg
--- a/assets/photomaker_examples/newton_man/newton_1.jpg
+++ b/assets/photomaker_examples/newton_man/newton_1.jpg
--- a/assets/photomaker_examples/newton_man/newton_2.png
+++ b/assets/photomaker_examples/newton_man/newton_2.png
--- a/assets/photomaker_examples/newton_man/newton_3.jpg
+++ b/assets/photomaker_examples/newton_man/newton_3.jpg
--- a/assets/photomaker_examples/scarletthead_woman/scarlett_0.jpg
+++ b/assets/photomaker_examples/scarletthead_woman/scarlett_0.jpg
--- a/assets/photomaker_examples/scarletthead_woman/scarlett_1.jpg
+++ b/assets/photomaker_examples/scarletthead_woman/scarlett_1.jpg
--- a/assets/photomaker_examples/scarletthead_woman/scarlett_2.jpg
+++ b/assets/photomaker_examples/scarletthead_woman/scarlett_2.jpg
--- a/assets/photomaker_examples/scarletthead_woman/scarlett_3.jpg
+++ b/assets/photomaker_examples/scarletthead_woman/scarlett_3.jpg
--- a/assets/photomaker_examples/yangmi_woman/yangmi_1.jpg
+++ b/assets/photomaker_examples/yangmi_woman/yangmi_1.jpg
--- a/assets/photomaker_examples/yangmi_woman/yangmi_2.jpeg
+++ b/assets/photomaker_examples/yangmi_woman/yangmi_2.jpeg
--- a/assets/photomaker_examples/yangmi_woman/yangmi_3.jpg
+++ b/assets/photomaker_examples/yangmi_woman/yangmi_3.jpg
--- a/assets/photomaker_examples/yangmi_woman/yangmi_4.jpg
+++ b/assets/photomaker_examples/yangmi_woman/yangmi_4.jpg
--- a/assets/photomaker_examples/yangmi_woman/yangmi_5.jpg
+++ b/assets/photomaker_examples/yangmi_woman/yangmi_5.jpg
--- a/assets/photomaker_examples/yangmi_woman/yangmi_6.jpg
+++ b/assets/photomaker_examples/yangmi_woman/yangmi_6.jpg
--- a/assets/qwen/example.png
+++ b/assets/qwen/example.png
--- a/assets/qwen/qwen_image_edit.png
+++ b/assets/qwen/qwen_image_edit.png
--- a/assets/qwen/qwen_image_edit_2509.png
+++ b/assets/qwen/qwen_image_edit_2509.png
--- a/assets/qwen/qwen_image_edit_2511.png
+++ b/assets/qwen/qwen_image_edit_2511.png
--- a/assets/sd3.5_large.png
+++ b/assets/sd3.5_large.png
--- a/assets/sycl_sd3_output.png
+++ b/assets/sycl_sd3_output.png
--- a/assets/wan/Wan2.1_1.3B_t2v.mp4
+++ b/assets/wan/Wan2.1_1.3B_t2v.mp4
--- a/assets/wan/Wan2.1_1.3B_vace_r2v.mp4
+++ b/assets/wan/Wan2.1_1.3B_vace_r2v.mp4
--- a/assets/wan/Wan2.1_1.3B_vace_t2v.mp4
+++ b/assets/wan/Wan2.1_1.3B_vace_t2v.mp4
--- a/assets/wan/Wan2.1_1.3B_vace_v2v.mp4
+++ b/assets/wan/Wan2.1_1.3B_vace_v2v.mp4
--- a/assets/wan/Wan2.1_14B_flf2v.mp4
+++ b/assets/wan/Wan2.1_14B_flf2v.mp4
--- a/assets/wan/Wan2.1_14B_i2v.mp4
+++ b/assets/wan/Wan2.1_14B_i2v.mp4
--- a/assets/wan/Wan2.1_14B_t2v.mp4
+++ b/assets/wan/Wan2.1_14B_t2v.mp4
--- a/assets/wan/Wan2.1_14B_vace_r2v.mp4
+++ b/assets/wan/Wan2.1_14B_vace_r2v.mp4
--- a/assets/wan/Wan2.1_14B_vace_t2v.mp4
+++ b/assets/wan/Wan2.1_14B_vace_t2v.mp4
--- a/assets/wan/Wan2.1_14B_vace_v2v.mp4
+++ b/assets/wan/Wan2.1_14B_vace_v2v.mp4
--- a/assets/wan/Wan2.2_14B_flf2v.mp4
+++ b/assets/wan/Wan2.2_14B_flf2v.mp4
--- a/assets/wan/Wan2.2_14B_i2v.mp4
+++ b/assets/wan/Wan2.2_14B_i2v.mp4
--- a/assets/wan/Wan2.2_14B_t2i.png
+++ b/assets/wan/Wan2.2_14B_t2i.png
--- a/assets/wan/Wan2.2_14B_t2v.mp4
+++ b/assets/wan/Wan2.2_14B_t2v.mp4
--- a/assets/wan/Wan2.2_14B_t2v_lora.mp4
+++ b/assets/wan/Wan2.2_14B_t2v_lora.mp4
--- a/assets/wan/Wan2.2_5B_i2v.mp4
+++ b/assets/wan/Wan2.2_5B_i2v.mp4
--- a/assets/wan/Wan2.2_5B_t2v.mp4
+++ b/assets/wan/Wan2.2_5B_t2v.mp4
--- a/assets/z_image/base_bf16.png
+++ b/assets/z_image/base_bf16.png
--- a/assets/z_image/bf16.png
+++ b/assets/z_image/bf16.png
--- a/assets/z_image/q2_K.png
+++ b/assets/z_image/q2_K.png
--- a/assets/z_image/q3_K.png
+++ b/assets/z_image/q3_K.png
--- a/assets/z_image/q4_0.png
+++ b/assets/z_image/q4_0.png
--- a/assets/z_image/q4_K.png
+++ b/assets/z_image/q4_K.png
--- a/assets/z_image/q5_0.png
+++ b/assets/z_image/q5_0.png
--- a/assets/z_image/q6_K.png
+++ b/assets/z_image/q6_K.png
--- a/assets/z_image/q8_0.png
+++ b/assets/z_image/q8_0.png
--- a/cache_dit.hpp
+++ b/cache_dit.hpp
@ -0,0 +1,975 @@
 #ifndef __CACHE_DIT_HPP__
 #define __CACHE_DIT_HPP__
 #include <algorithm>
 #include <cmath>
 #include <limits>
 #include <string>
 #include <unordered_map>
 #include <vector>
 #include "ggml_extend.hpp"
 // Settings for the dual-block cache (DBCache). The values are consumed by
 // CacheDitState and CacheDitConditionState further down in this header.
 struct DBCacheConfig {
    // Master switch; the cache states only become active when this (or the
    // TaylorSeer switch) is set.
    bool enabled{false};
    // Blocks with index < Fn_compute_blocks are classified as "Fn" blocks.
    int Fn_compute_blocks{8};
    // The last Bn_compute_blocks blocks are classified as "Bn" blocks; 0 means none.
    int Bn_compute_blocks{0};
    // A step may be served from cache when the relative residual difference is
    // strictly below this value.
    float residual_diff_threshold{0.08f};
    // Initial value of the warm-up countdown kept by the cache states.
    int max_warmup_steps{8};
    // Upper bound on the number of cached steps; negative disables the limit.
    int max_cached_steps{-1};
    // Upper bound on consecutive cached steps; negative disables the limit.
    int max_continuous_cached_steps{-1};
    // Upper bound on the summed residual difference; negative disables the limit.
    float max_accumulated_residual_diff{-1.0f};
    // Optional per-step mask indexed by step number. An entry of 0 marks the
    // step as cacheable; CacheDitState treats every other value as "compute".
    std::vector<int> steps_computation_mask{};
    // When false, CacheDitState::begin_step marks a mask-approved step as
    // cacheable without evaluating the other limits.
    bool scm_policy_dynamic{true};
 };
 // Settings for the TaylorSeer extrapolation path.
 struct TaylorSeerConfig {
    // Master switch for TaylorSeer.
    bool enabled{false};
    // Highest derivative order; forwarded to TaylorSeerState::init.
    int n_derivatives{1};
    // CacheDitState::should_use_taylor_this_step returns false while the
    // current step index is below this value.
    int max_warmup_steps{2};
    // Interval used by should_use_taylor_this_step; values <= 0 are treated as 1 there.
    int skip_interval_steps{1};
 };
 // Combined configuration handed to CacheDitState::init.
 struct CacheDitConfig {
    DBCacheConfig dbcache;
    TaylorSeerConfig taylorseer;
    // Per-block-kind overrides for the Fn/Bn counts. A value >= 0 takes
    // precedence; a negative value falls back to the matching dbcache field
    // (see CacheDitState::get_*_blocks).
    int double_Fn_blocks{-1};
    int double_Bn_blocks{-1};
    int single_Fn_blocks{-1};
    int single_Bn_blocks{-1};
 };
 // Finite-difference history used to extrapolate a tensor with a truncated
 // Taylor expansion.
 //
 // dY_current[k] is the k-th order difference quotient produced by the latest
 // update_derivatives() call (index 0 is the value itself). dY_prev is the same
 // history as it stood one update earlier.
 struct TaylorSeerState {
    int n_derivatives      = 1;   // highest derivative order tracked
    int current_step       = -1;  // step passed to the latest update
    int last_computed_step = -1;  // step of the latest update_derivatives() call
    std::vector<std::vector<float>> dY_prev;
    std::vector<std::vector<float>> dY_current;

    // Sizes both histories for `n_deriv` derivative orders and clears them.
    // `hidden_size` is accepted for interface compatibility and is not used.
    // A negative `n_deriv` is clamped to 0 so the resize count stays valid.
    void init(int n_deriv, size_t hidden_size) {
        (void)hidden_size;
        n_derivatives     = std::max(0, n_deriv);
        const size_t rows = static_cast<size_t>(n_derivatives) + 1;
        dY_prev.assign(rows, std::vector<float>());
        dY_current.assign(rows, std::vector<float>());
        current_step       = -1;
        last_computed_step = -1;
    }

    // Drops all stored values but keeps the number of derivative orders.
    void reset() {
        for (auto& v : dY_prev)
            v.clear();
        for (auto& v : dY_current)
            v.clear();
        current_step       = -1;
        last_computed_step = -1;
    }

    // True once enough updates have been recorded and dY_prev holds a value.
    bool can_approximate() const {
        return last_computed_step >= n_derivatives && !dY_prev.empty() && !dY_prev[0].empty();
    }

    // Records a freshly computed tensor `Y` (with `size` elements) for `step`
    // and rebuilds the difference quotients against the previous history.
    void update_derivatives(const float* Y, size_t size, int step) {
        // Make sure the histories are large enough even if init() was never
        // called; indexing row 0 of an empty vector would be out of bounds.
        const size_t rows = static_cast<size_t>(std::max(0, n_derivatives)) + 1;
        if (dY_current.size() < rows)
            dY_current.resize(rows);

        dY_prev = dY_current;
        dY_current[0].assign(Y, Y + size);

        // Distance to the previous update; never allowed to be non-positive.
        int window = step - last_computed_step;
        if (window <= 0)
            window = 1;

        for (int d = 0; d < n_derivatives; d++) {
            if (!dY_prev[d].empty() && dY_prev[d].size() == size) {
                dY_current[d + 1].resize(size);
                for (size_t i = 0; i < size; i++) {
                    dY_current[d + 1][i] = (dY_current[d][i] - dY_prev[d][i]) / static_cast<float>(window);
                }
            } else {
                // No comparable history for this order yet.
                dY_current[d + 1].clear();
            }
        }
        current_step       = step;
        last_computed_step = step;
    }

    // Writes sum_o dY_prev[o] * elapsed^o / o! into `output`, where elapsed is
    // the distance from the last update to `target_step` (at least 1).
    // Leaves `output` untouched when no approximation is possible or the
    // stored size differs from `size`.
    // NOTE(review): the expansion reads dY_prev while `elapsed` is measured
    // from last_computed_step, which is when dY_current was filled - confirm
    // that dY_prev (rather than dY_current) is the intended expansion point.
    void approximate(float* output, size_t size, int target_step) const {
        if (!can_approximate() || dY_prev[0].size() != size) {
            return;
        }
        int elapsed = target_step - last_computed_step;
        if (elapsed <= 0)
            elapsed = 1;
        std::fill(output, output + size, 0.0f);
        float factorial = 1.0f;
        int order       = static_cast<int>(dY_prev.size());
        for (int o = 0; o < order; o++) {
            // Advance o! before any skip so that later orders keep the correct
            // coefficient even when an intermediate order is unavailable.
            if (o > 0)
                factorial *= static_cast<float>(o);
            if (dY_prev[o].empty() || dY_prev[o].size() != size)
                continue;
            float coeff = ::powf(static_cast<float>(elapsed), static_cast<float>(o)) / factorial;
            for (size_t i = 0; i < size; i++) {
                output[i] += coeff * dY_prev[o][i];
            }
        }
    }
 };
 // Cached data for one transformer block. CacheDitState fills the img/txt
 // members for double blocks and the un-suffixed members for single blocks.
 struct BlockCacheEntry {
    // Output-minus-input residuals recorded for a double block.
    std::vector<float> residual_img{};
    std::vector<float> residual_txt{};
    // Output-minus-input residual recorded for a single block.
    std::vector<float> residual{};
    // Copies of the most recently stored block outputs.
    std::vector<float> prev_img{};
    std::vector<float> prev_txt{};
    std::vector<float> prev_output{};
    // Set once a residual has been stored for this block.
    bool has_prev{false};
 };
 // Block-level cache state: decides per step whether blocks may be served from
 // cached residuals and stores those residuals.
 //
 // Blocks are split by index into a leading "Fn" group, a trailing "Bn" group
 // and the remaining "Mn" group (see the is_*_block helpers).
 struct CacheDitState {
    CacheDitConfig config;
    bool initialized = false;

    // Model geometry recorded by init().
    int total_double_blocks = 0;
    int total_single_blocks = 0;
    size_t hidden_size      = 0;

    // Step bookkeeping.
    int current_step     = -1;
    int total_steps      = 0;
    int warmup_remaining = 0;
    std::vector<int> cached_steps;
    int continuous_cached_steps     = 0;
    float accumulated_residual_diff = 0.0f;

    // One entry per block.
    std::vector<BlockCacheEntry> double_block_cache;
    std::vector<BlockCacheEntry> single_block_cache;

    // Residual of the Fn group for the current and the previous reference step.
    std::vector<float> Fn_residual_img;
    std::vector<float> Fn_residual_txt;
    std::vector<float> prev_Fn_residual_img;
    std::vector<float> prev_Fn_residual_txt;
    bool has_prev_Fn_residual = false;

    // Residual contributed by the Bn group.
    std::vector<float> Bn_buffer_img;
    std::vector<float> Bn_buffer_txt;
    std::vector<float> Bn_buffer;
    bool has_Bn_buffer = false;

    TaylorSeerState taylor_state;

    // Per-step decision flags.
    bool can_cache_this_step  = false;
    bool is_caching_this_step = false;

    // Statistics reported by log_metrics()/get_summary().
    int total_blocks_computed = 0;
    int total_blocks_cached   = 0;

    // dst[i] = a[i] - b[i] for i in [0, n); dst is resized to n.
    static void subtract_into(std::vector<float>& dst, const float* a, const float* b, size_t n) {
        dst.resize(n);
        for (size_t i = 0; i < n; i++) {
            dst[i] = a[i] - b[i];
        }
    }

    // dst[i] += src[i] for every element of src.
    static void accumulate_into(float* dst, const std::vector<float>& src) {
        for (size_t i = 0; i < src.size(); i++) {
            dst[i] += src[i];
        }
    }

    // Stores the configuration and geometry, sizes the per-block caches and
    // resets all runtime state. Does nothing beyond recording the arguments
    // when neither DBCache nor TaylorSeer is enabled.
    void init(const CacheDitConfig& cfg, int num_double_blocks, int num_single_blocks, size_t h_size) {
        config              = cfg;
        total_double_blocks = num_double_blocks;
        total_single_blocks = num_single_blocks;
        hidden_size         = h_size;
        initialized         = cfg.dbcache.enabled || cfg.taylorseer.enabled;
        if (!initialized)
            return;
        warmup_remaining = cfg.dbcache.max_warmup_steps;
        // Clamp so that a negative count cannot turn into a huge resize request.
        double_block_cache.resize(static_cast<size_t>(std::max(total_double_blocks, 0)));
        single_block_cache.resize(static_cast<size_t>(std::max(total_single_blocks, 0)));
        if (cfg.taylorseer.enabled) {
            taylor_state.init(cfg.taylorseer.n_derivatives, h_size);
        }
        reset_runtime();
    }

    // Clears every per-run value while keeping configuration and geometry.
    void reset_runtime() {
        current_step     = -1;
        total_steps      = 0;
        warmup_remaining = config.dbcache.max_warmup_steps;
        cached_steps.clear();
        continuous_cached_steps   = 0;
        accumulated_residual_diff = 0.0f;
        for (auto& entry : double_block_cache) {
            entry.residual_img.clear();
            entry.residual_txt.clear();
            entry.prev_img.clear();
            entry.prev_txt.clear();
            entry.has_prev = false;
        }
        for (auto& entry : single_block_cache) {
            entry.residual.clear();
            entry.prev_output.clear();
            entry.has_prev = false;
        }
        Fn_residual_img.clear();
        Fn_residual_txt.clear();
        prev_Fn_residual_img.clear();
        prev_Fn_residual_txt.clear();
        has_prev_Fn_residual = false;
        Bn_buffer_img.clear();
        Bn_buffer_txt.clear();
        Bn_buffer.clear();
        has_Bn_buffer = false;
        taylor_state.reset();
        can_cache_this_step   = false;
        is_caching_this_step  = false;
        total_blocks_computed = 0;
        total_blocks_cached   = 0;
    }

    bool enabled() const {
        return initialized && (config.dbcache.enabled || config.taylorseer.enabled);
    }

    // Starts step `step_index` and computes can_cache_this_step from the
    // warm-up countdown, the computation mask and the configured limits.
    // Repeated calls with the same index are ignored. `sigma` is unused.
    void begin_step(int step_index, float sigma = 0.0f) {
        (void)sigma;
        if (!enabled())
            return;
        if (step_index == current_step)
            return;
        current_step = step_index;
        total_steps++;

        const bool in_warmup = warmup_remaining > 0;
        if (in_warmup) {
            warmup_remaining--;
        }

        bool scm_allows_cache = true;
        const std::vector<int>& scm = config.dbcache.steps_computation_mask;
        // Steps beyond the end of the mask are not restricted by it.
        if (!scm.empty() && step_index < static_cast<int>(scm.size())) {
            scm_allows_cache = (scm[step_index] == 0);
            if (!config.dbcache.scm_policy_dynamic && scm_allows_cache) {
                // Static policy: the mask alone decides, remaining limits are skipped.
                can_cache_this_step  = true;
                is_caching_this_step = false;
                return;
            }
        }

        const bool max_cached_ok = (config.dbcache.max_cached_steps < 0) ||
                                   (static_cast<int>(cached_steps.size()) < config.dbcache.max_cached_steps);
        const bool max_cont_ok = (config.dbcache.max_continuous_cached_steps < 0) ||
                                 (continuous_cached_steps < config.dbcache.max_continuous_cached_steps);
        const bool accum_ok = (config.dbcache.max_accumulated_residual_diff < 0.0f) ||
                              (accumulated_residual_diff < config.dbcache.max_accumulated_residual_diff);

        can_cache_this_step  = !in_warmup && scm_allows_cache && max_cached_ok && max_cont_ok && accum_ok && has_prev_Fn_residual;
        is_caching_this_step = false;
    }

    // Records whether the step that just finished was served from cache.
    void end_step(bool was_cached) {
        if (was_cached) {
            cached_steps.push_back(current_step);
            continuous_cached_steps++;
        } else {
            continuous_cached_steps = 0;
        }
    }

    // Relative L1 difference sum|prev - curr| / (sum|prev| + 1e-6).
    // Returns 0 for size == 0. The sums are accumulated in double precision:
    // with float accumulators the running total stops absorbing small terms
    // once it is ~2^24 times larger than them, which distorts the ratio for
    // tensors with many millions of elements.
    static float calculate_residual_diff(const float* prev, const float* curr, size_t size) {
        if (size == 0)
            return 0.0f;
        double sum_diff = 0.0;
        double sum_abs  = 0.0;
        for (size_t i = 0; i < size; i++) {
            sum_diff += std::fabs(prev[i] - curr[i]);
            sum_abs += std::fabs(prev[i]);
        }
        return static_cast<float>(sum_diff / (sum_abs + 1e-6));
    }

    // Vector overload; returns 1.0 when the sizes differ or `prev` is empty.
    static float calculate_residual_diff(const std::vector<float>& prev, const std::vector<float>& curr) {
        if (prev.size() != curr.size() || prev.empty())
            return 1.0f;
        return calculate_residual_diff(prev.data(), curr.data(), prev.size());
    }

    // Effective Fn/Bn counts: a non-negative per-kind override wins, otherwise
    // the shared DBCache value is used.
    int get_double_Fn_blocks() const {
        return (config.double_Fn_blocks >= 0) ? config.double_Fn_blocks : config.dbcache.Fn_compute_blocks;
    }
    int get_double_Bn_blocks() const {
        return (config.double_Bn_blocks >= 0) ? config.double_Bn_blocks : config.dbcache.Bn_compute_blocks;
    }
    int get_single_Fn_blocks() const {
        return (config.single_Fn_blocks >= 0) ? config.single_Fn_blocks : config.dbcache.Fn_compute_blocks;
    }
    int get_single_Bn_blocks() const {
        return (config.single_Bn_blocks >= 0) ? config.single_Bn_blocks : config.dbcache.Bn_compute_blocks;
    }

    // Block classification by index.
    bool is_Fn_double_block(int block_idx) const {
        return block_idx < get_double_Fn_blocks();
    }
    bool is_Bn_double_block(int block_idx) const {
        int Bn = get_double_Bn_blocks();
        return Bn > 0 && block_idx >= (total_double_blocks - Bn);
    }
    bool is_Mn_double_block(int block_idx) const {
        return !is_Fn_double_block(block_idx) && !is_Bn_double_block(block_idx);
    }
    bool is_Fn_single_block(int block_idx) const {
        return block_idx < get_single_Fn_blocks();
    }
    bool is_Bn_single_block(int block_idx) const {
        int Bn = get_single_Bn_blocks();
        return Bn > 0 && block_idx >= (total_single_blocks - Bn);
    }
    bool is_Mn_single_block(int block_idx) const {
        return !is_Fn_single_block(block_idx) && !is_Bn_single_block(block_idx);
    }

    // Stores (img - input_img) and (txt - input_txt) as the current Fn residual.
    void store_Fn_residual(const float* img, const float* txt, size_t img_size, size_t txt_size, const float* input_img, const float* input_txt) {
        subtract_into(Fn_residual_img, img, input_img, img_size);
        subtract_into(Fn_residual_txt, txt, input_txt, txt_size);
    }

    // Compares the current Fn residual with the previous one and decides
    // whether this step is served from cache. On a positive decision the
    // measured difference is added to accumulated_residual_diff.
    bool check_cache_decision() {
        is_caching_this_step = false;
        if (!can_cache_this_step) {
            return false;
        }
        if (!has_prev_Fn_residual || prev_Fn_residual_img.empty()) {
            return false;
        }
        const float diff_img = calculate_residual_diff(prev_Fn_residual_img, Fn_residual_img);
        const float diff_txt = calculate_residual_diff(prev_Fn_residual_txt, Fn_residual_txt);
        const float diff     = (diff_img + diff_txt) / 2.0f;
        if (diff < config.dbcache.residual_diff_threshold) {
            is_caching_this_step = true;
            accumulated_residual_diff += diff;
            return true;
        }
        return false;
    }

    // Makes the current Fn residual the reference for the next comparison.
    void update_prev_Fn_residual() {
        prev_Fn_residual_img = Fn_residual_img;
        prev_Fn_residual_txt = Fn_residual_txt;
        has_prev_Fn_residual = !prev_Fn_residual_img.empty();
    }

    // Stores residual (output - prev_*) and a copy of the output for a double
    // block. Out-of-range indices are ignored.
    void store_double_block_residual(int block_idx, const float* img, const float* txt, size_t img_size, size_t txt_size, const float* prev_img, const float* prev_txt) {
        if (block_idx < 0 || block_idx >= static_cast<int>(double_block_cache.size()))
            return;
        BlockCacheEntry& entry = double_block_cache[block_idx];
        subtract_into(entry.residual_img, img, prev_img, img_size);
        subtract_into(entry.residual_txt, txt, prev_txt, txt_size);
        entry.prev_img.assign(img, img + img_size);
        entry.prev_txt.assign(txt, txt + txt_size);
        entry.has_prev = true;
    }

    // Adds the stored double-block residual onto img/txt. Ignored when the
    // index is out of range or the stored sizes do not match.
    void apply_double_block_cache(int block_idx, float* img, float* txt, size_t img_size, size_t txt_size) {
        if (block_idx < 0 || block_idx >= static_cast<int>(double_block_cache.size()))
            return;
        const BlockCacheEntry& entry = double_block_cache[block_idx];
        if (entry.residual_img.size() != img_size || entry.residual_txt.size() != txt_size)
            return;
        accumulate_into(img, entry.residual_img);
        accumulate_into(txt, entry.residual_txt);
        total_blocks_cached++;
    }

    // Single-block counterpart of store_double_block_residual.
    void store_single_block_residual(int block_idx, const float* output, size_t size, const float* input) {
        if (block_idx < 0 || block_idx >= static_cast<int>(single_block_cache.size()))
            return;
        BlockCacheEntry& entry = single_block_cache[block_idx];
        subtract_into(entry.residual, output, input, size);
        entry.prev_output.assign(output, output + size);
        entry.has_prev = true;
    }

    // Single-block counterpart of apply_double_block_cache.
    void apply_single_block_cache(int block_idx, float* output, size_t size) {
        if (block_idx < 0 || block_idx >= static_cast<int>(single_block_cache.size()))
            return;
        const BlockCacheEntry& entry = single_block_cache[block_idx];
        if (entry.residual.size() != size)
            return;
        accumulate_into(output, entry.residual);
        total_blocks_cached++;
    }

    // Stores the residual accumulated across the Bn group.
    void store_Bn_buffer(const float* img, const float* txt, size_t img_size, size_t txt_size, const float* Bn_start_img, const float* Bn_start_txt) {
        subtract_into(Bn_buffer_img, img, Bn_start_img, img_size);
        subtract_into(Bn_buffer_txt, txt, Bn_start_txt, txt_size);
        has_Bn_buffer = true;
    }

    // Adds the stored Bn residual onto img/txt when one exists with matching sizes.
    void apply_Bn_buffer(float* img, float* txt, size_t img_size, size_t txt_size) {
        if (!has_Bn_buffer)
            return;
        if (Bn_buffer_img.size() != img_size || Bn_buffer_txt.size() != txt_size)
            return;
        accumulate_into(img, Bn_buffer_img);
        accumulate_into(txt, Bn_buffer_txt);
    }

    // TaylorSeer pass-throughs; all of them are no-ops when TaylorSeer is disabled.
    void taylor_update(const float* hidden_state, size_t size) {
        if (!config.taylorseer.enabled)
            return;
        taylor_state.update_derivatives(hidden_state, size, current_step);
    }
    bool taylor_can_approximate() const {
        return config.taylorseer.enabled && taylor_state.can_approximate();
    }
    void taylor_approximate(float* output, size_t size) {
        if (!config.taylorseer.enabled)
            return;
        taylor_state.approximate(output, size, current_step);
    }

    // True for steps past the TaylorSeer warm-up whose index is not a multiple
    // of (skip_interval_steps + 1); non-positive intervals count as 1.
    bool should_use_taylor_this_step() const {
        if (!config.taylorseer.enabled)
            return false;
        if (current_step < config.taylorseer.max_warmup_steps)
            return false;
        int interval = config.taylorseer.skip_interval_steps;
        if (interval <= 0)
            interval = 1;
        return (current_step % (interval + 1)) != 0;
    }

    // Logs step and block cache ratios.
    void log_metrics() const {
        if (!enabled())
            return;
        int total_blocks       = total_blocks_computed + total_blocks_cached;
        float cache_ratio      = (total_blocks > 0) ? (static_cast<float>(total_blocks_cached) / total_blocks * 100.0f) : 0.0f;
        float step_cache_ratio = (total_steps > 0) ? (static_cast<float>(cached_steps.size()) / total_steps * 100.0f) : 0.0f;
        LOG_INFO("CacheDIT: steps_cached=%zu/%d (%.1f%%), blocks_cached=%d/%d (%.1f%%), accum_diff=%.4f",
                 cached_steps.size(), total_steps, step_cache_ratio,
                 total_blocks_cached, total_blocks, cache_ratio,
                 accumulated_residual_diff);
    }

    // One-line human-readable summary of the same counters.
    std::string get_summary() const {
        char buf[256];
        snprintf(buf, sizeof(buf),
                 "CacheDIT[thresh=%.2f]: cached %zu/%d steps, %d/%d blocks",
                 config.dbcache.residual_diff_threshold,
                 cached_steps.size(), total_steps,
                 total_blocks_cached, total_blocks_computed + total_blocks_cached);
        return std::string(buf);
    }
 };
 // Parses a comma-separated list of integers (e.g. "1,0,0,1") into a vector.
 // An empty string yields an empty vector and a trailing comma is ignored.
 // Each field is converted with std::stoi, so a field that is not a number
 // makes std::stoi throw.
 inline std::vector<int> parse_scm_mask(const std::string& mask_str) {
    std::vector<int> values;
    size_t field_begin = 0;
    // Consume every field that is terminated by a comma.
    for (size_t comma = mask_str.find(',', field_begin);
         comma != std::string::npos;
         comma = mask_str.find(',', field_begin)) {
        values.push_back(std::stoi(mask_str.substr(field_begin, comma - field_begin)));
        field_begin = comma + 1;
    }
    // Whatever follows the last comma (or the whole string) is the final field.
    const bool has_tail = field_begin < mask_str.length();
    if (has_tail) {
        values.push_back(std::stoi(mask_str.substr(field_begin)));
    }
    return values;
 }
 // Builds a step mask by alternating runs of 1 (lengths from compute_bins)
 // and runs of 0 (lengths from cache_bins) until total_steps entries exist or
 // both bin lists are used up. The final entry of a non-empty mask is forced
 // to 1.
 inline std::vector<int> generate_scm_mask(
    const std::vector<int>& compute_bins,
    const std::vector<int>& cache_bins,
    int total_steps) {
    std::vector<int> mask;
    const auto is_full = [&mask, total_steps]() {
        return static_cast<int>(mask.size()) >= total_steps;
    };
    // Appends up to `count` copies of `flag`, stopping once the mask is full.
    const auto append_run = [&mask, &is_full](int count, int flag) {
        for (int k = 0; k < count && !is_full(); ++k) {
            mask.push_back(flag);
        }
    };
    size_t next_compute = 0;
    size_t next_cache   = 0;
    while (!is_full()) {
        if (next_compute < compute_bins.size()) {
            append_run(compute_bins[next_compute], 1);
            ++next_compute;
        }
        if (next_cache < cache_bins.size()) {
            append_run(cache_bins[next_cache], 0);
            ++next_cache;
        }
        const bool bins_exhausted = next_compute >= compute_bins.size() && next_cache >= cache_bins.size();
        if (bins_exhausted) {
            break;
        }
    }
    if (!mask.empty()) {
        mask.back() = 1;
    }
    return mask;
 }
 // Returns the step mask for a named preset ("slow"/"s"/"S", "medium"/"m"/"M",
 // "fast"/"f"/"F", "ultra"/"u"/"U"); an unknown name yields an empty mask.
 // The bin lengths are used as listed when total_steps is 28 (or not
 // positive); for any other positive step count each bin is scaled by
 // total_steps / 28, rounded and kept at a minimum of 1.
 inline std::vector<int> get_scm_preset(const std::string& preset, int total_steps) {
    struct Bins {
        std::vector<int> compute;
        std::vector<int> cache;
    };
    const auto named = [&preset](const char* full, const char* lower, const char* upper) {
        return preset == full || preset == lower || preset == upper;
    };

    Bins chosen;
    if (named("slow", "s", "S")) {
        chosen = Bins{{8, 3, 3, 2, 1, 1}, {1, 2, 2, 2, 3}};
    } else if (named("medium", "m", "M")) {
        chosen = Bins{{6, 2, 2, 2, 2, 1}, {1, 3, 3, 3, 3}};
    } else if (named("fast", "f", "F")) {
        chosen = Bins{{6, 1, 1, 1, 1, 1}, {1, 3, 4, 5, 4}};
    } else if (named("ultra", "u", "U")) {
        chosen = Bins{{4, 1, 1, 1, 1}, {2, 5, 6, 7}};
    } else {
        return {};
    }

    const bool needs_rescale = total_steps > 0 && total_steps != 28;
    if (needs_rescale) {
        const float scale  = static_cast<float>(total_steps) / 28.0f;
        const auto rescale = [scale](std::vector<int>& bins) {
            for (int& len : bins) {
                len = std::max(1, static_cast<int>(len * scale + 0.5f));
            }
        };
        rescale(chosen.compute);
        rescale(chosen.cache);
    }
    return generate_scm_mask(chosen.compute, chosen.cache, total_steps);
 }
 // Residual-difference threshold associated with a preset name; unknown names
 // get 0.08.
 inline float get_preset_threshold(const std::string& preset) {
    const auto named = [&preset](const char* full, const char* lower, const char* upper) {
        return preset == full || preset == lower || preset == upper;
    };
    float threshold = 0.08f;
    if (named("slow", "s", "S")) {
        threshold = 0.20f;
    } else if (named("medium", "m", "M")) {
        threshold = 0.25f;
    } else if (named("fast", "f", "F")) {
        threshold = 0.30f;
    } else if (named("ultra", "u", "U")) {
        threshold = 0.34f;
    }
    return threshold;
 }
 // Warm-up step count associated with a preset name; unknown names get 8.
 inline int get_preset_warmup(const std::string& preset) {
    struct Row {
        const char* full;
        const char* lower;
        const char* upper;
        int warmup;
    };
    static const Row table[] = {
        {"slow", "s", "S", 8},
        {"medium", "m", "M", 6},
        {"fast", "f", "F", 6},
        {"ultra", "u", "U", 4},
    };
    for (const Row& row : table) {
        if (preset == row.full || preset == row.lower || preset == row.upper) {
            return row.warmup;
        }
    }
    return 8;
 }
 // Fn block count associated with a preset name; unknown names get 8.
 inline int get_preset_Fn(const std::string& preset) {
    struct Row {
        const char* full;
        const char* lower;
        const char* upper;
        int fn_blocks;
    };
    static const Row table[] = {
        {"slow", "s", "S", 8},
        {"medium", "m", "M", 8},
        {"fast", "f", "F", 6},
        {"ultra", "u", "U", 4},
    };
    for (const Row& row : table) {
        if (preset == row.full || preset == row.lower || preset == row.upper) {
            return row.fn_blocks;
        }
    }
    return 8;
 }
 // Bn block count for a preset; every preset currently maps to 0, so the name
 // is not inspected.
 inline int get_preset_Bn(const std::string& preset) {
    static_cast<void>(preset);
    const int bn_blocks = 0;
    return bn_blocks;
 }
 // Fills `cfg` from a string of the form
 // "Fn,Bn,threshold,warmup,max_cached,max_continuous". An empty string leaves
 // `cfg` untouched. Fields missing from a shorter string take the literal
 // defaults below, not the values already stored in `cfg`.
 // NOTE(review): the sscanf result is not inspected, so a malformed string
 // silently falls back to those defaults - confirm this is intended.
 inline void parse_dbcache_options(const std::string& opts, DBCacheConfig& cfg) {
    if (!opts.empty()) {
        int fn_blocks      = 8;
        int bn_blocks      = 0;
        float threshold    = 0.08f;
        int warmup_steps   = 8;
        int cached_limit   = -1;
        int continuous_cap = -1;
        sscanf(opts.c_str(), "%d,%d,%f,%d,%d,%d",
               &fn_blocks, &bn_blocks, &threshold, &warmup_steps, &cached_limit, &continuous_cap);
        cfg.Fn_compute_blocks           = fn_blocks;
        cfg.Bn_compute_blocks           = bn_blocks;
        cfg.residual_diff_threshold     = threshold;
        cfg.max_warmup_steps            = warmup_steps;
        cfg.max_cached_steps            = cached_limit;
        cfg.max_continuous_cached_steps = continuous_cap;
    }
 }
 // Fills `cfg` from a string of the form "n_derivatives,warmup,interval".
 // An empty string leaves `cfg` untouched. Fields missing from a shorter
 // string take the literal defaults below.
 inline void parse_taylorseer_options(const std::string& opts, TaylorSeerConfig& cfg) {
    if (!opts.empty()) {
        int derivative_order = 1;
        int warmup_steps     = 2;
        int skip_interval    = 1;
        sscanf(opts.c_str(), "%d,%d,%d", &derivative_order, &warmup_steps, &skip_interval);
        cfg.n_derivatives       = derivative_order;
        cfg.max_warmup_steps    = warmup_steps;
        cfg.skip_interval_steps = skip_interval;
    }
 }
 // Whole-step cache keyed by condition pointer.
 //
 // The first condition seen in an active step becomes the "anchor". Before the
 // anchor is evaluated its input is compared with the input of the previous
 // computed step; when the relative difference is below the threshold the step
 // is skipped and every condition's output is rebuilt as input + cached diff.
 struct CacheDitConditionState {
    DBCacheConfig config;
    TaylorSeerConfig taylor_config;
    bool initialized = false;

    // Per-step state.
    int current_step_index = -1;
    bool step_active       = false;  // step lies inside the sigma window
    bool skip_current_step = false;  // anchor decided to skip this step
    bool initial_step      = true;   // no anchor chosen yet
    int warmup_remaining   = 0;

    // Statistics and limits bookkeeping.
    std::vector<int> cached_steps;
    int continuous_cached_steps     = 0;
    float accumulated_residual_diff = 0.0f;
    int total_steps_skipped         = 0;

    const void* anchor_condition = nullptr;

    struct CacheEntry {
        std::vector<float> diff;         // output - input of the last computed step
        std::vector<float> prev_input;   // input of the last computed step
        std::vector<float> prev_output;  // output of the last computed step
        bool has_prev = false;
    };
    std::unordered_map<const void*, CacheEntry> cache_diffs;
    TaylorSeerState taylor_state;

    // Sigma window (start_sigma >= end_sigma) in which caching is considered.
    float start_sigma = std::numeric_limits<float>::max();
    float end_sigma   = 0.0f;

    // Set by begin_step() when warm-up, the computation mask and the step
    // limits all permit skipping the current step.
    bool can_skip_this_step = false;

    // Clears all per-run state; configuration and sigma window are kept.
    void reset_runtime() {
        current_step_index = -1;
        step_active        = false;
        skip_current_step  = false;
        can_skip_this_step = false;
        initial_step       = true;
        warmup_remaining   = config.max_warmup_steps;
        cached_steps.clear();
        continuous_cached_steps   = 0;
        accumulated_residual_diff = 0.0f;
        total_steps_skipped       = 0;
        anchor_condition          = nullptr;
        cache_diffs.clear();
        taylor_state.reset();
    }

    // Stores both configurations and resets the runtime state.
    void init(const DBCacheConfig& dbcfg, const TaylorSeerConfig& tcfg) {
        config        = dbcfg;
        taylor_config = tcfg;
        initialized   = dbcfg.enabled || tcfg.enabled;
        reset_runtime();
        if (taylor_config.enabled) {
            taylor_state.init(taylor_config.n_derivatives, 0);
        }
    }

    // Derives the sigma window from the schedule: it spans the sigmas found at
    // 15% and 95% of the step count. Needs at least two sigmas.
    void set_sigmas(const std::vector<float>& sigmas) {
        if (!initialized || sigmas.size() < 2)
            return;
        float start_percent = 0.15f;
        float end_percent   = 0.95f;
        size_t n_steps      = sigmas.size() - 1;
        size_t start_step   = static_cast<size_t>(start_percent * n_steps);
        size_t end_step     = static_cast<size_t>(end_percent * n_steps);
        if (start_step >= n_steps)
            start_step = n_steps - 1;
        if (end_step >= n_steps)
            end_step = n_steps - 1;
        start_sigma = sigmas[start_step];
        end_sigma   = sigmas[end_step];
        if (start_sigma < end_sigma) {
            std::swap(start_sigma, end_sigma);
        }
    }

    bool enabled() const {
        return initialized && (config.enabled || taylor_config.enabled);
    }

    // Starts step `step_index`. The step becomes active when `sigma` lies in
    // (end_sigma, start_sigma]. For an active step, skipping is permitted only
    // when warm-up is over, the computation mask does not force computation
    // and neither step limit has been reached; the verdict is stored in
    // can_skip_this_step. A step that must be computed for one of these
    // reasons also ends the current run of consecutive cached steps, otherwise
    // the max_continuous_cached_steps limit could never be released.
    void begin_step(int step_index, float sigma) {
        if (!enabled())
            return;
        if (step_index == current_step_index)
            return;
        current_step_index = step_index;
        skip_current_step  = false;
        step_active        = false;
        can_skip_this_step = false;
        if (sigma > start_sigma)
            return;
        if (!(sigma > end_sigma))
            return;
        step_active = true;

        bool must_compute = false;
        if (warmup_remaining > 0) {
            warmup_remaining--;
            must_compute = true;
        } else if (!config.steps_computation_mask.empty() &&
                   step_index < static_cast<int>(config.steps_computation_mask.size()) &&
                   config.steps_computation_mask[step_index] == 1) {
            must_compute = true;
        } else if (config.max_cached_steps >= 0 &&
                   static_cast<int>(cached_steps.size()) >= config.max_cached_steps) {
            must_compute = true;
        } else if (config.max_continuous_cached_steps >= 0 &&
                   continuous_cached_steps >= config.max_continuous_cached_steps) {
            must_compute = true;
        }

        if (must_compute) {
            continuous_cached_steps = 0;
            return;
        }
        can_skip_this_step = true;
    }

    bool step_is_active() const {
        return enabled() && step_active;
    }

    bool is_step_skipped() const {
        return enabled() && step_active && skip_current_step;
    }

    bool has_cache(const void* cond) const {
        auto it = cache_diffs.find(cond);
        return it != cache_diffs.end() && !it->second.diff.empty();
    }

    // Records diff = output - input plus copies of both tensors for `cond`.
    void update_cache(const void* cond, const float* input, const float* output, size_t size) {
        CacheEntry& entry = cache_diffs[cond];
        entry.diff.resize(size);
        for (size_t i = 0; i < size; i++) {
            entry.diff[i] = output[i] - input[i];
        }
        entry.prev_input.assign(input, input + size);
        entry.prev_output.assign(output, output + size);
        entry.has_prev = true;
    }

    // Writes output = input + cached diff. Does nothing when no diff of the
    // requested size is stored for `cond`.
    void apply_cache(const void* cond, const float* input, float* output, size_t size) {
        auto it = cache_diffs.find(cond);
        if (it == cache_diffs.end() || it->second.diff.empty())
            return;
        if (it->second.diff.size() != size)
            return;
        for (size_t i = 0; i < size; i++) {
            output[i] = input[i] + it->second.diff[i];
        }
    }

    // Called before evaluating `cond`. Returns true when `output` has been
    // filled from the cache and the model evaluation can be skipped.
    // NOTE(review): tensor data is read through float*, which assumes F32
    // tensors with host-accessible data - confirm against the callers.
    bool before_condition(const void* cond, struct ggml_tensor* input, struct ggml_tensor* output, float sigma, int step_index) {
        if (!enabled() || step_index < 0)
            return false;
        if (step_index != current_step_index) {
            begin_step(step_index, sigma);
        }
        if (!step_active)
            return false;
        if (initial_step) {
            anchor_condition = cond;
            initial_step     = false;
        }
        bool is_anchor = (cond == anchor_condition);

        // The anchor already decided to skip: follow it for every condition
        // that has a cached diff.
        if (skip_current_step) {
            if (has_cache(cond)) {
                apply_cache(cond, (float*)input->data, (float*)output->data,
                            static_cast<size_t>(ggml_nelements(output)));
                return true;
            }
            return false;
        }

        // Only the anchor may take the skip decision, and only when
        // begin_step() permitted skipping for this step.
        if (!is_anchor)
            return false;
        if (!can_skip_this_step)
            return false;

        auto it = cache_diffs.find(cond);
        if (it == cache_diffs.end() || !it->second.has_prev)
            return false;
        size_t ne = static_cast<size_t>(ggml_nelements(input));
        if (it->second.prev_input.size() != ne)
            return false;

        float* input_data = (float*)input->data;
        float diff        = CacheDitState::calculate_residual_diff(
            it->second.prev_input.data(), input_data, ne);

        // Scale the threshold by the configured Fn/Bn block counts.
        float effective_threshold = config.residual_diff_threshold;
        if (config.Fn_compute_blocks > 0) {
            float fn_confidence = 1.0f + 0.02f * (config.Fn_compute_blocks - 8);
            fn_confidence       = std::max(0.5f, std::min(2.0f, fn_confidence));
            effective_threshold *= fn_confidence;
        }
        if (config.Bn_compute_blocks > 0) {
            float bn_quality = 1.0f - 0.03f * config.Bn_compute_blocks;
            bn_quality       = std::max(0.5f, std::min(1.0f, bn_quality));
            effective_threshold *= bn_quality;
        }

        if (diff < effective_threshold) {
            skip_current_step = true;
            total_steps_skipped++;
            cached_steps.push_back(current_step_index);
            continuous_cached_steps++;
            accumulated_residual_diff += diff;
            apply_cache(cond, input_data, (float*)output->data, ne);
            return true;
        }
        continuous_cached_steps = 0;
        return false;
    }

    // Called after `cond` has been evaluated: refreshes its cache entry and,
    // for the anchor, the TaylorSeer history.
    void after_condition(const void* cond, struct ggml_tensor* input, struct ggml_tensor* output) {
        if (!step_is_active())
            return;
        size_t ne = static_cast<size_t>(ggml_nelements(output));
        update_cache(cond, (float*)input->data, (float*)output->data, ne);
        if (cond == anchor_condition && taylor_config.enabled) {
            taylor_state.update_derivatives((float*)output->data, ne, current_step_index);
        }
    }

    // Logs how many steps were skipped out of the steps seen so far.
    void log_metrics() const {
        if (!enabled())
            return;
        // current_step_index is zero-based, so index 0 already means one step;
        // only the initial -1 must be kept away from the division.
        LOG_INFO("CacheDIT: steps_skipped=%d/%d (%.1f%%), accum_residual_diff=%.4f",
                 total_steps_skipped,
                 current_step_index + 1,
                 (current_step_index >= 0) ? (100.0f * total_steps_skipped / (current_step_index + 1)) : 0.0f,
                 accumulated_residual_diff);
    }
 };
 #endif
--- a/clip.hpp
+++ b/clip.hpp
--- a/common.hpp
+++ b/common.hpp
@ -3,82 +3,589 @@
 #include "ggml_extend.hpp"
-struct DownSample {
+class DownSampleBlock : public GGMLBlock {
-    // hparams
+protected:
    int channels;
    int out_channels;
    bool vae_downsample;
-    // conv2d params
+public:
-    struct ggml_tensor* op_w;  // [out_channels, channels, 3, 3]
+    DownSampleBlock(int channels,
-    struct ggml_tensor* op_b;  // [out_channels,]
+                    int out_channels,
-
+                    bool vae_downsample = false)
-    bool vae_downsample = false;
+        : channels(channels),
-
+          out_channels(out_channels),
-    size_t calculate_mem_size(ggml_type wtype) {
+          vae_downsample(vae_downsample) {
        double mem_size = 0;
        mem_size += out_channels * channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16);  // op_w
        mem_size += out_channels * ggml_type_sizef(GGML_TYPE_F32);                     // op_b
        return static_cast<size_t>(mem_size);
    }
    void init_params(struct ggml_context* ctx, ggml_type wtype) {
        op_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, out_channels);
        op_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
    }
    void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
        if (vae_downsample) {
-            tensors[prefix + "conv.weight"] = op_w;
+            blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {0, 0}));
            tensors[prefix + "conv.bias"]   = op_b;
        } else {
-            tensors[prefix + "op.weight"] = op_w;
+            blocks["op"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {1, 1}));
            tensors[prefix + "op.bias"]   = op_b;
        }
    }
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [N, channels, h, w]
        struct ggml_tensor* c = NULL;
        if (vae_downsample) {
-            c = ggml_pad(ctx, x, 1, 1, 0, 0);
+            auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
-            c = ggml_nn_conv_2d(ctx, c, op_w, op_b, 2, 2, 0, 0);
+
            x = ggml_ext_pad(ctx->ggml_ctx, x, 1, 1, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
            x = conv->forward(ctx, x);
        } else {
-            c = ggml_nn_conv_2d(ctx, x, op_w, op_b, 2, 2, 1, 1);
+            auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["op"]);
            x = conv->forward(ctx, x);
        }
-        return c;  // [N, out_channels, h/2, w/2]
+        return x;  // [N, out_channels, h/2, w/2]
    }
 };
-struct UpSample {
+class UpSampleBlock : public GGMLBlock {
-    // hparams
+protected:
    int channels;
    int out_channels;
-    // conv2d params
+public:
-    struct ggml_tensor* conv_w;  // [out_channels, channels, 3, 3]
+    UpSampleBlock(int channels,
-    struct ggml_tensor* conv_b;  // [out_channels,]
+                  int out_channels)
-
+        : channels(channels),
-    size_t calculate_mem_size(ggml_type wtype) {
+          out_channels(out_channels) {
-        double mem_size = 0;
+        blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
        mem_size += out_channels * channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16);  // op_w
        mem_size += out_channels * ggml_type_sizef(GGML_TYPE_F32);                     // op_b
        return static_cast<size_t>(mem_size);
    }
-    void init_params(struct ggml_context* ctx, ggml_type wtype) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        conv_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, out_channels);
        conv_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
    }
    void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
        tensors[prefix + "conv.weight"] = conv_w;
        tensors[prefix + "conv.bias"]   = conv_b;
    }
    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
        // x: [N, channels, h, w]
-        x = ggml_upscale(ctx, x, 2);                              // [N, channels, h*2, w*2]
+        auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
-        x = ggml_nn_conv_2d(ctx, x, conv_w, conv_b, 1, 1, 1, 1);  // [N, out_channels, h*2, w*2]
+
        x = ggml_upscale(ctx->ggml_ctx, x, 2, GGML_SCALE_MODE_NEAREST);  // [N, channels, h*2, w*2]
        x = conv->forward(ctx, x);                                       // [N, out_channels, h*2, w*2]
        return x;
    }
 };
 // Residual block: GroupNorm32 -> SiLU -> conv, optional timestep-embedding
 // injection, GroupNorm32 -> SiLU -> conv, and finally a skip connection that
 // is added to the (possibly projected) input. The convolutions are Conv2d for
 // dims == 2 and Conv3d for dims == 3 (see conv_nd()).
 class ResBlock : public GGMLBlock {
 protected:
    // network hparams
    int64_t channels;      // model_channels * (1, 1, 1, 2, 2, 4, 4, 4)
    int64_t emb_channels;  // time_embed_dim
    int64_t out_channels;  // mult * model_channels
    std::pair<int, int> kernel_size;
    int dims;                 // 2 or 3; enforced by conv_nd()
    bool skip_t_emb;          // when true, no "emb_layers.1" block is created and emb may be nullptr
    bool exchange_temb_dims;  // dims == 3 only: permute the embedding from [N, t, c, 1] to [N, c, t, 1]
    // Builds the convolution used by this block.
    // dims == 3: Conv3d whose kernel is {kernel_size.first, 1, 1} and whose padding is
    //            {padding.first, 0, 0}; only the first kernel/padding component is used.
    // dims == 2: Conv2d with the given kernel_size and padding.
    // The middle {1, 1}/{1, 1, 1} argument is presumably the stride - confirm against Conv2d/Conv3d.
    // Asserts on any other value of dims.
    std::shared_ptr<GGMLBlock> conv_nd(int dims,
                                       int64_t in_channels,
                                       int64_t out_channels,
                                       std::pair<int, int> kernel_size,
                                       std::pair<int, int> padding) {
        GGML_ASSERT(dims == 2 || dims == 3);
        if (dims == 3) {
            return std::shared_ptr<GGMLBlock>(new Conv3d(in_channels, out_channels, {kernel_size.first, 1, 1}, {1, 1, 1}, {padding.first, 0, 0}));
        } else {
            return std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, kernel_size, {1, 1}, padding));
        }
    }
 public:
    // Registers the sub-blocks under the names "in_layers.0", "in_layers.2",
    // "emb_layers.1" (only when !skip_t_emb), "out_layers.0", "out_layers.3" and
    // "skip_connection" (only when out_channels != channels).
    ResBlock(int64_t channels,
             int64_t emb_channels,
             int64_t out_channels,
             std::pair<int, int> kernel_size = {3, 3},
             int dims                        = 2,
             bool exchange_temb_dims         = false,
             bool skip_t_emb                 = false)
        : channels(channels),
          emb_channels(emb_channels),
          out_channels(out_channels),
          kernel_size(kernel_size),
          dims(dims),
          skip_t_emb(skip_t_emb),
          exchange_temb_dims(exchange_temb_dims) {
        // padding of half the kernel size (integer division) on each axis
        std::pair<int, int> padding = {kernel_size.first / 2, kernel_size.second / 2};
        blocks["in_layers.0"]       = std::shared_ptr<GGMLBlock>(new GroupNorm32(channels));
        // in_layer_1 is nn.SILU()
        blocks["in_layers.2"] = conv_nd(dims, channels, out_channels, kernel_size, padding);
        if (!skip_t_emb) {
            // emb_layer_0 is nn.SILU()
            blocks["emb_layers.1"] = std::shared_ptr<GGMLBlock>(new Linear(emb_channels, out_channels));
        }
        blocks["out_layers.0"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(out_channels));
        // out_layer_1 is nn.SILU()
        // out_layer_2 is nn.Dropout(), skip for inference
        blocks["out_layers.3"] = conv_nd(dims, out_channels, out_channels, kernel_size, padding);
        if (out_channels != channels) {
            // 1x1 projection so the input can be added to the block output
            blocks["skip_connection"] = conv_nd(dims, channels, out_channels, {1, 1}, {0, 0});
        }
    }
    // Builds the forward graph of the block and returns the output tensor.
    // emb may be nullptr only when the block was constructed with skip_t_emb == true
    // (asserted below).
    virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = nullptr) {
        // For dims==3, we reduce dimension from 5d to 4d by merging h and w, in order not to change ggml
        // [N, c, t, h, w] => [N, c, t, h * w]
        // x: [N, channels, h, w] if dims == 2 else [N, channels, t, h, w]
        // emb: [N, emb_channels] if dims == 2 else [N, t, emb_channels]
        auto in_layers_0  = std::dynamic_pointer_cast<GroupNorm32>(blocks["in_layers.0"]);
        auto in_layers_2  = std::dynamic_pointer_cast<UnaryBlock>(blocks["in_layers.2"]);
        auto out_layers_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["out_layers.0"]);
        auto out_layers_3 = std::dynamic_pointer_cast<UnaryBlock>(blocks["out_layers.3"]);
        if (emb == nullptr) {
            GGML_ASSERT(skip_t_emb);
        }
        // in_layers
        auto h = in_layers_0->forward(ctx, x);
        h      = ggml_silu_inplace(ctx->ggml_ctx, h);
        h      = in_layers_2->forward(ctx, h);  // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
        // emb_layers
        if (!skip_t_emb) {
            auto emb_layer_1 = std::dynamic_pointer_cast<Linear>(blocks["emb_layers.1"]);
            auto emb_out = ggml_silu(ctx->ggml_ctx, emb);
            emb_out      = emb_layer_1->forward(ctx, emb_out);  // [N, out_channels] if dims == 2 else [N, t, out_channels]
            // Insert singleton dimensions so the embedding can be added to h below.
            if (dims == 2) {
                emb_out = ggml_reshape_4d(ctx->ggml_ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]);  // [N, out_channels, 1, 1]
            } else {
                emb_out = ggml_reshape_4d(ctx->ggml_ctx, emb_out, 1, emb_out->ne[0], emb_out->ne[1], emb_out->ne[2]);  // [N, t, out_channels, 1]
                if (exchange_temb_dims) {
                    // emb_out = rearrange(emb_out, "b t c ... -> b c t ...")
                    emb_out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, emb_out, 0, 2, 1, 3));  // [N, out_channels, t, 1]
                }
            }
            h = ggml_add(ctx->ggml_ctx, h, emb_out);  // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
        }
        // out_layers
        h = out_layers_0->forward(ctx, h);
        h = ggml_silu_inplace(ctx->ggml_ctx, h);
        // dropout, skip for inference
        h = out_layers_3->forward(ctx, h);
        // skip connection
        if (out_channels != channels) {
            // project the input to out_channels; otherwise x is added unchanged
            auto skip_connection = std::dynamic_pointer_cast<UnaryBlock>(blocks["skip_connection"]);
            x                    = skip_connection->forward(ctx, x);  // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
        }
        h = ggml_add(ctx->ggml_ctx, h, x);
        return h;  // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
    }
 };
 class GEGLU : public UnaryBlock {
 protected:
    int64_t dim_in;
    int64_t dim_out;
 public:
    GEGLU(int64_t dim_in, int64_t dim_out)
        : dim_in(dim_in), dim_out(dim_out) {
        blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out * 2));
    }
    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
        // x: [ne3, ne2, ne1, dim_in]
        // return: [ne3, ne2, ne1, dim_out]
        auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
        x          = proj->forward(ctx, x);  // [ne3, ne2, ne1, dim_out*2]
        auto x_vec = ggml_ext_chunk(ctx->ggml_ctx, x, 2, 0, false);
        x          = x_vec[0];  // [ne3, ne2, ne1, dim_out]
        auto gate  = x_vec[1];  // [ne3, ne2, ne1, dim_out]
        gate = ggml_cont(ctx->ggml_ctx, gate);
        gate = ggml_ext_gelu(ctx->ggml_ctx, gate, true);
        x = ggml_mul(ctx->ggml_ctx, x, gate);  // [ne3, ne2, ne1, dim_out]
        return x;
    }
 };
 class GELU : public UnaryBlock {
 public:
    GELU(int64_t dim_in, int64_t dim_out, bool bias = true) {
        blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out, bias));
    }
    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
        // x: [ne3, ne2, ne1, dim_in]
        // return: [ne3, ne2, ne1, dim_out]
        auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
        x = proj->forward(ctx, x);
        x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
        return x;
    }
 };
 class FeedForward : public GGMLBlock {
 public:
    enum class Activation {
        GEGLU,
        GELU
    };
    FeedForward(int64_t dim,
                int64_t dim_out,
                int64_t mult          = 4,
                Activation activation = Activation::GEGLU,
                bool precision_fix    = false) {
        int64_t inner_dim = dim * mult;
        if (activation == Activation::GELU) {
            blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GELU(dim, inner_dim));
        } else {
            blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GEGLU(dim, inner_dim));
        }
        // net_1 is nn.Dropout(), skip for inference
        bool force_prec_f32 = false;
        float scale         = 1.f;
        if (precision_fix) {
            scale = 1.f / 128.f;
 #ifdef SD_USE_VULKAN
            force_prec_f32 = true;
 #endif
        }
        // The purpose of the scale here is to prevent NaN issues in certain situations.
        // For example, when using Vulkan without enabling force_prec_f32,
        // or when using CUDA but the weights are k-quants.
        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, force_prec_f32, scale));
    }
    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [ne3, ne2, ne1, dim]
        // return: [ne3, ne2, ne1, dim_out]
        auto net_0 = std::dynamic_pointer_cast<UnaryBlock>(blocks["net.0"]);
        auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);
        x = net_0->forward(ctx, x);  // [ne3, ne2, ne1, inner_dim]
        x = net_2->forward(ctx, x);  // [ne3, ne2, ne1, dim_out]
        return x;
    }
 };
 // Multi-head attention with separate bias-free query/key/value projections
 // and an output projection. Queries are computed from x, keys and values from
 // context; passing the same tensor for both gives self-attention.
 class CrossAttention : public GGMLBlock {
 protected:
    int64_t query_dim;
    int64_t context_dim;
    int64_t n_head;
    int64_t d_head;
 public:
    // Registers "to_q", "to_k", "to_v" (no bias) and "to_out.0" Linear blocks.
    // The attention width is d_head * n_head.
    CrossAttention(int64_t query_dim,
                   int64_t context_dim,
                   int64_t n_head,
                   int64_t d_head)
        // Initializers are listed in member declaration order, which is the order
        // in which they actually run (avoids -Wreorder).
        : query_dim(query_dim),
          context_dim(context_dim),
          n_head(n_head),
          d_head(d_head) {
        int64_t inner_dim = d_head * n_head;
        blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false));
        blocks["to_k"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
        blocks["to_v"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
        blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, query_dim));
        // to_out_1 is nn.Dropout(), skip for inference
    }
    // x: [N, n_token, query_dim]
    // context: [N, n_context, context_dim]
    // return: [N, n_token, query_dim]
    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x,
                                struct ggml_tensor* context) {
        auto to_q     = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
        auto to_k     = std::dynamic_pointer_cast<Linear>(blocks["to_k"]);
        auto to_v     = std::dynamic_pointer_cast<Linear>(blocks["to_v"]);
        auto to_out_0 = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
        auto q = to_q->forward(ctx, x);        // [N, n_token, inner_dim]
        auto k = to_k->forward(ctx, context);  // [N, n_context, inner_dim]
        auto v = to_v->forward(ctx, context);  // [N, n_context, inner_dim]
        // no attention mask is passed (nullptr); flash attention follows the runner context setting
        x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, ctx->flash_attn_enabled);  // [N, n_token, inner_dim]
        x = to_out_0->forward(ctx, x);  // [N, n_token, query_dim]
        return x;
    }
 };
 class BasicTransformerBlock : public GGMLBlock {
 protected:
    int64_t n_head;
    int64_t d_head;
    bool ff_in;
 public:
    BasicTransformerBlock(int64_t dim,
                          int64_t n_head,
                          int64_t d_head,
                          int64_t context_dim,
                          bool ff_in = false)
        : n_head(n_head), d_head(d_head), ff_in(ff_in) {
        // disable_self_attn is always False
        // disable_temporal_crossattention is always False
        // switch_temporal_ca_to_sa is always False
        // inner_dim is always None or equal to dim
        // gated_ff is always True
        blocks["attn1"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, dim, n_head, d_head));
        blocks["attn2"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, context_dim, n_head, d_head));
        blocks["ff"]    = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim));
        blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
        blocks["norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
        blocks["norm3"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
        if (ff_in) {
            blocks["norm_in"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
            blocks["ff_in"]   = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim));
        }
    }
    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x,
                                struct ggml_tensor* context) {
        // x: [N, n_token, query_dim]
        // context: [N, n_context, context_dim]
        // return: [N, n_token, query_dim]
        auto attn1 = std::dynamic_pointer_cast<CrossAttention>(blocks["attn1"]);
        auto attn2 = std::dynamic_pointer_cast<CrossAttention>(blocks["attn2"]);
        auto ff    = std::dynamic_pointer_cast<FeedForward>(blocks["ff"]);
        auto norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm1"]);
        auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
        auto norm3 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm3"]);
        if (ff_in) {
            auto norm_in = std::dynamic_pointer_cast<LayerNorm>(blocks["norm_in"]);
            auto ff_in   = std::dynamic_pointer_cast<FeedForward>(blocks["ff_in"]);
            auto x_skip = x;
            x           = norm_in->forward(ctx, x);
            x           = ff_in->forward(ctx, x);
            // self.is_res is always True
            x = ggml_add(ctx->ggml_ctx, x, x_skip);
        }
        auto r = x;
        x      = norm1->forward(ctx, x);
        x      = attn1->forward(ctx, x, x);  // self-attention
        x      = ggml_add(ctx->ggml_ctx, x, r);
        r      = x;
        x      = norm2->forward(ctx, x);
        x      = attn2->forward(ctx, x, context);  // cross-attention
        x      = ggml_add(ctx->ggml_ctx, x, r);
        r      = x;
        x      = norm3->forward(ctx, x);
        x      = ff->forward(ctx, x);
        x      = ggml_add(ctx->ggml_ctx, x, r);
        return x;
    }
 };
 class SpatialTransformer : public GGMLBlock {
 protected:
    int64_t in_channels;  // mult * model_channels
    int64_t n_head;
    int64_t d_head;
    int64_t depth       = 1;    // 1
    int64_t context_dim = 768;  // hidden_size, 1024 for VERSION_SD2
    bool use_linear     = false;
    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {
        auto iter = tensor_storage_map.find(prefix + "proj_out.weight");
        if (iter != tensor_storage_map.end()) {
            int64_t inner_dim = n_head * d_head;
            if (iter->second.n_dims == 4 && use_linear) {
                use_linear         = false;
                blocks["proj_in"]  = std::make_shared<Conv2d>(in_channels, inner_dim, std::pair{1, 1});
                blocks["proj_out"] = std::make_shared<Conv2d>(inner_dim, in_channels, std::pair{1, 1});
            } else if (iter->second.n_dims == 2 && !use_linear) {
                use_linear         = true;
                blocks["proj_in"]  = std::make_shared<Linear>(in_channels, inner_dim);
                blocks["proj_out"] = std::make_shared<Linear>(inner_dim, in_channels);
            }
        }
    }
 public:
    SpatialTransformer(int64_t in_channels,
                       int64_t n_head,
                       int64_t d_head,
                       int64_t depth,
                       int64_t context_dim,
                       bool use_linear)
        : in_channels(in_channels),
          n_head(n_head),
          d_head(d_head),
          depth(depth),
          context_dim(context_dim),
          use_linear(use_linear) {
        // disable_self_attn is always False
        int64_t inner_dim = n_head * d_head;  // in_channels
        blocks["norm"]    = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
        if (use_linear) {
            blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, inner_dim));
        } else {
            blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}));
        }
        for (int i = 0; i < depth; i++) {
            std::string name = "transformer_blocks." + std::to_string(i);
            blocks[name]     = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false));
        }
        if (use_linear) {
            blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, in_channels));
        } else {
            blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}));
        }
    }
    virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                        struct ggml_tensor* x,
                                        struct ggml_tensor* context) {
        // x: [N, in_channels, h, w]
        // context: [N, max_position(aka n_token), hidden_size(aka context_dim)]
        auto norm     = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
        auto proj_in  = std::dynamic_pointer_cast<UnaryBlock>(blocks["proj_in"]);
        auto proj_out = std::dynamic_pointer_cast<UnaryBlock>(blocks["proj_out"]);
        auto x_in         = x;
        int64_t n         = x->ne[3];
        int64_t h         = x->ne[1];
        int64_t w         = x->ne[0];
        int64_t inner_dim = n_head * d_head;
        x = norm->forward(ctx, x);
        if (use_linear) {
            x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3));  // [N, h, w, inner_dim]
            x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n);                // [N, h * w, inner_dim]
            x = proj_in->forward(ctx, x);                                              // [N, inner_dim, h, w]
        } else {
            x = proj_in->forward(ctx, x);                                              // [N, inner_dim, h, w]
            x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3));  // [N, h, w, inner_dim]
            x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n);                // [N, h * w, inner_dim]
        }
        for (int i = 0; i < depth; i++) {
            std::string name       = "transformer_blocks." + std::to_string(i);
            auto transformer_block = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[name]);
            x = transformer_block->forward(ctx, x, context);
        }
        if (use_linear) {
            // proj_out
            x = proj_out->forward(ctx, x);  // [N, in_channels, h, w]
            x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3));  // [N, inner_dim, h * w]
            x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n);                 // [N, inner_dim, h, w]
        } else {
            x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3));  // [N, inner_dim, h * w]
            x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n);                 // [N, inner_dim, h, w]
            // proj_out
            x = proj_out->forward(ctx, x);  // [N, in_channels, h, w]
        }
        x = ggml_add(ctx->ggml_ctx, x, x_in);
        return x;
    }
 };
 // Blends a spatial and a temporal tensor using one learned scalar:
 // result = alpha * x_spatial + (1 - alpha) * x_temporal,
 // where alpha = sigmoid(mix_factor).
 class AlphaBlender : public GGMLBlock {
 protected:
    // Allocates the single-element "mix_factor" parameter; it is always F32.
    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
        params["mix_factor"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
    }
    // Reads mix_factor back from the backend tensor and squashes it with sigmoid.
    float get_alpha() {
        // image_only_indicator is always tensor([0.]) and since mix_factor.shape is [1,]
        // so learned_with_images is same as learned
        const float mix_factor = ggml_ext_backend_tensor_get_f32(params["mix_factor"]);
        return sigmoid(mix_factor);
    }
 public:
    AlphaBlender() {
        // merge_strategy is always learned_with_images
        // for inference, we don't need to set alpha
        // since mix_factor.shape is [1,], we don't need rearrange using  rearrange_pattern
    }
    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x_spatial,
                                struct ggml_tensor* x_temporal) {
        // image_only_indicator is always tensor([0.])
        const float spatial_weight = get_alpha();
        auto weighted_spatial      = ggml_ext_scale(ctx->ggml_ctx, x_spatial, spatial_weight);
        auto weighted_temporal     = ggml_ext_scale(ctx->ggml_ctx, x_temporal, 1.0f - spatial_weight);
        return ggml_add(ctx->ggml_ctx, weighted_spatial, weighted_temporal);
    }
 };
 // ResBlock for video: runs the regular (spatial) ResBlock on every frame,
 // then a second ResBlock over the time axis ("time_stack") and blends the
 // two results with a learned AlphaBlender ("time_mixer").
 class VideoResBlock : public ResBlock {
 public:
    // NOTE(review): video_kernel_size is accepted but never used in this
    // constructor; "time_stack" is built with kernel_size instead - confirm
    // this is intended.
    VideoResBlock(int64_t channels,
                  int64_t emb_channels,
                  int64_t out_channels,
                  std::pair<int, int> kernel_size = {3, 3},
                  int64_t video_kernel_size       = 3,
                  int dims                        = 2)  // always 2
        : ResBlock(channels, emb_channels, out_channels, kernel_size, dims) {
        // temporal ResBlock: dims = 3, exchange_temb_dims = true
        blocks["time_stack"] = std::shared_ptr<GGMLBlock>(new ResBlock(out_channels, emb_channels, out_channels, kernel_size, 3, true));
        blocks["time_mixer"] = std::shared_ptr<GGMLBlock>(new AlphaBlender());
    }
    // Builds the forward graph. num_video_frames is the number of frames T
    // packed into the batch axis; it is used as a divisor below, so it must be
    // non-zero.
    // Returns a tensor shaped [b*t, channels, h, w], like the spatial ResBlock output.
    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x,
                                struct ggml_tensor* emb,
                                int num_video_frames) {
        // x: [N, channels, h, w] aka [b*t, channels, h, w]
        // emb: [N, emb_channels] aka [b*t, emb_channels]
        // image_only_indicator is always tensor([0.])
        auto time_stack = std::dynamic_pointer_cast<ResBlock>(blocks["time_stack"]);
        auto time_mixer = std::dynamic_pointer_cast<AlphaBlender>(blocks["time_mixer"]);
        // spatial pass, frame by frame
        x = ResBlock::forward(ctx, x, emb);
        int64_t T = num_video_frames;
        int64_t B = x->ne[3] / T;
        int64_t C = x->ne[2];
        int64_t H = x->ne[1];
        int64_t W = x->ne[0];
        x          = ggml_reshape_4d(ctx->ggml_ctx, x, W * H, C, T, B);                     // (b t) c h w -> b t c (h w)
        x          = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3));  // b t c (h w) -> b c t (h w)
        // keep the spatial result for blending with the temporal result
        auto x_mix = x;
        emb = ggml_reshape_4d(ctx->ggml_ctx, emb, emb->ne[0], T, B, emb->ne[3]);  // (b t) ... -> b t ...
        x = time_stack->forward(ctx, x, emb);  // b c t (h w)
        x = time_mixer->forward(ctx, x_mix, x);  // b c t (h w)
        x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3));  // b c t (h w) -> b t c (h w)
        x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B);                     // b t c (h w) -> (b t) c h w
        return x;
    }
 };
--- a/conditioner.hpp
+++ b/conditioner.hpp
--- a/control.hpp
+++ b/control.hpp
@ -0,0 +1,466 @@
 #ifndef __CONTROL_HPP__
 #define __CONTROL_HPP__
 #include "common.hpp"
 #include "ggml_extend.hpp"
 #include "model.h"
 #define CONTROL_NET_GRAPH_SIZE 1536
 /*
    =================================== ControlNet ===================================
    Reference: https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/cldm/cldm.py
 */
 // ControlNet encoder: time/label embedding, hint encoder ("input_hint_block.*"),
 // the input and middle blocks, and one 1x1 conv ("zero_convs.*" /
 // "middle_block_out.0") per stage. Sub-blocks are registered in `blocks` under
 // the string keys built in the constructor; forward() looks them up by the
 // same keys, so the two loops must stay in sync.
 class ControlNetBlock : public GGMLBlock {
 protected:
    SDVersion version = VERSION_SD1;
    // network hparams
    int in_channels                        = 4;
    int out_channels                       = 4;
    int hint_channels                      = 3;
    int num_res_blocks                     = 2;
    std::vector<int> attention_resolutions = {4, 2, 1};
    std::vector<int> channel_mult          = {1, 2, 4, 4};
    std::vector<int> transformer_depth     = {1, 1, 1, 1};
    int time_embed_dim                     = 1280;  // model_channels*4
    int num_heads                          = 8;
    int num_head_channels                  = -1;   // channels // num_heads
    int context_dim                        = 768;  // 1024 for VERSION_SD2, 2048 for VERSION_SDXL
    bool use_linear_projection             = false;
 public:
    int model_channels  = 320;
    int adm_in_channels = 2816;  // input width of label_emb; used for VERSION_SDXL and VERSION_SVD (768)

    // Selects the hyper-parameters for `version` and registers every sub-block.
    ControlNetBlock(SDVersion version = VERSION_SD1)
        : version(version) {
        if (sd_version_is_sd2(version)) {
            context_dim       = 1024;
            num_head_channels = 64;
            num_heads         = -1;
        } else if (sd_version_is_sdxl(version)) {
            context_dim           = 2048;
            attention_resolutions = {4, 2};
            channel_mult          = {1, 2, 4};
            transformer_depth     = {1, 2, 10};
            num_head_channels     = 64;
            num_heads             = -1;
        } else if (version == VERSION_SVD) {
            in_channels       = 8;
            out_channels      = 4;
            context_dim       = 1024;
            adm_in_channels   = 768;
            num_head_channels = 64;
            num_heads         = -1;
        }
        blocks["time_embed.0"] = std::shared_ptr<GGMLBlock>(new Linear(model_channels, time_embed_dim));
        // time_embed_1 is nn.SiLU()
        blocks["time_embed.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
        if (sd_version_is_sdxl(version) || version == VERSION_SVD) {
            blocks["label_emb.0.0"] = std::shared_ptr<GGMLBlock>(new Linear(adm_in_channels, time_embed_dim));
            // label_emb_1 is nn.SiLU()
            blocks["label_emb.0.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
        }
        // input_blocks
        blocks["input_blocks.0.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1}));
        int ch              = model_channels;
        int input_block_idx = 0;
        int ds              = 1;
        // Factory helpers. Parameter names are chosen so they do not shadow the
        // hyper-parameter members of the same meaning.
        auto get_resblock = [](int64_t channels, int64_t emb_channels, int64_t out_ch) -> ResBlock* {
            return new ResBlock(channels, emb_channels, out_ch);
        };
        auto get_attention_layer = [this](int64_t channels,
                                          int64_t n_head,
                                          int64_t d_head,
                                          int64_t depth,
                                          int64_t ctx_dim) -> SpatialTransformer* {
            return new SpatialTransformer(channels, n_head, d_head, depth, ctx_dim, use_linear_projection);
        };
        auto make_zero_conv = [](int64_t channels) {
            return new Conv2d(channels, channels, {1, 1});
        };
        blocks["zero_convs.0.0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(model_channels));
        // Hint encoder: convs live at even indices, the odd indices are nn.SiLU()
        blocks["input_hint_block.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(hint_channels, 16, {3, 3}, {1, 1}, {1, 1}));
        // nn.SiLU()
        blocks["input_hint_block.2"] = std::shared_ptr<GGMLBlock>(new Conv2d(16, 16, {3, 3}, {1, 1}, {1, 1}));
        // nn.SiLU()
        blocks["input_hint_block.4"] = std::shared_ptr<GGMLBlock>(new Conv2d(16, 32, {3, 3}, {2, 2}, {1, 1}));
        // nn.SiLU()
        blocks["input_hint_block.6"] = std::shared_ptr<GGMLBlock>(new Conv2d(32, 32, {3, 3}, {1, 1}, {1, 1}));
        // nn.SiLU()
        blocks["input_hint_block.8"] = std::shared_ptr<GGMLBlock>(new Conv2d(32, 96, {3, 3}, {2, 2}, {1, 1}));
        // nn.SiLU()
        blocks["input_hint_block.10"] = std::shared_ptr<GGMLBlock>(new Conv2d(96, 96, {3, 3}, {1, 1}, {1, 1}));
        // nn.SiLU()
        blocks["input_hint_block.12"] = std::shared_ptr<GGMLBlock>(new Conv2d(96, 256, {3, 3}, {2, 2}, {1, 1}));
        // nn.SiLU()
        blocks["input_hint_block.14"] = std::shared_ptr<GGMLBlock>(new Conv2d(256, model_channels, {3, 3}, {1, 1}, {1, 1}));
        const size_t len_mults = channel_mult.size();
        for (size_t i = 0; i < len_mults; i++) {
            int mult = channel_mult[i];
            for (int j = 0; j < num_res_blocks; j++) {
                input_block_idx += 1;
                std::string res_name = "input_blocks." + std::to_string(input_block_idx) + ".0";
                blocks[res_name]     = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, mult * model_channels));
                ch = mult * model_channels;
                if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
                    int n_head = num_heads;
                    int d_head = ch / num_heads;
                    if (num_head_channels != -1) {
                        // A fixed head width takes precedence over a fixed head count.
                        d_head = num_head_channels;
                        n_head = ch / d_head;
                    }
                    std::string attn_name = "input_blocks." + std::to_string(input_block_idx) + ".1";
                    blocks[attn_name]     = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
                                                                                           n_head,
                                                                                           d_head,
                                                                                           transformer_depth[i],
                                                                                           context_dim));
                }
                blocks["zero_convs." + std::to_string(input_block_idx) + ".0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(ch));
            }
            if (i != len_mults - 1) {
                // Every level except the last ends with a downsample stage.
                input_block_idx += 1;
                std::string down_name = "input_blocks." + std::to_string(input_block_idx) + ".0";
                blocks[down_name]     = std::shared_ptr<GGMLBlock>(new DownSampleBlock(ch, ch));
                blocks["zero_convs." + std::to_string(input_block_idx) + ".0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(ch));
                ds *= 2;
            }
        }
        // middle blocks
        int n_head = num_heads;
        int d_head = ch / num_heads;
        if (num_head_channels != -1) {
            d_head = num_head_channels;
            n_head = ch / d_head;
        }
        blocks["middle_block.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
        blocks["middle_block.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
                                                                                  n_head,
                                                                                  d_head,
                                                                                  transformer_depth[transformer_depth.size() - 1],
                                                                                  context_dim));
        blocks["middle_block.2"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
        // middle_block_out
        blocks["middle_block_out.0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(ch));
    }

    // Runs the ResBlock registered under `name` on (x, emb).
    struct ggml_tensor* resblock_forward(std::string name,
                                         GGMLRunnerContext* ctx,
                                         struct ggml_tensor* x,
                                         struct ggml_tensor* emb) {
        auto block = std::dynamic_pointer_cast<ResBlock>(blocks[name]);
        return block->forward(ctx, x, emb);
    }

    // Runs the SpatialTransformer registered under `name` on (x, context).
    struct ggml_tensor* attention_layer_forward(std::string name,
                                                GGMLRunnerContext* ctx,
                                                struct ggml_tensor* x,
                                                struct ggml_tensor* context) {
        auto block = std::dynamic_pointer_cast<SpatialTransformer>(blocks[name]);
        return block->forward(ctx, x, context);
    }

    // Encodes the raw hint image: conv at even indices 0..14, SiLU in between.
    // `emb` and `context` are accepted but not read here.
    struct ggml_tensor* input_hint_block_forward(GGMLRunnerContext* ctx,
                                                 struct ggml_tensor* hint,
                                                 struct ggml_tensor* emb,
                                                 struct ggml_tensor* context) {
        int num_input_blocks = 15;
        auto h               = hint;
        for (int i = 0; i < num_input_blocks; i++) {
            if (i % 2 == 0) {
                auto block = std::dynamic_pointer_cast<Conv2d>(blocks["input_hint_block." + std::to_string(i)]);
                h = block->forward(ctx, h);
            } else {
                h = ggml_silu_inplace(ctx->ggml_ctx, h);
            }
        }
        return h;
    }

    // Builds the ControlNet graph.
    // Returns, in order: the guided hint, the zero-conv output of input block 0,
    // one zero-conv output per following input block, and the
    // "middle_block_out.0" output.
    // When `guided_hint` is non-null it is used as-is and `hint` is not read.
    std::vector<struct ggml_tensor*> forward(GGMLRunnerContext* ctx,
                                             struct ggml_tensor* x,
                                             struct ggml_tensor* hint,
                                             struct ggml_tensor* guided_hint,
                                             struct ggml_tensor* timesteps,
                                             struct ggml_tensor* context,
                                             struct ggml_tensor* y = nullptr) {
        // x: [N, in_channels, h, w] or [N, in_channels/2, h, w]
        // timesteps: [N,]
        // context: [N, max_position, hidden_size] or [1, max_position, hidden_size]. for example, [N, 77, 768]
        // y: [N, adm_in_channels] or [1, adm_in_channels]
        // Broadcast context / y along the batch axis when they do not match x.
        if (context != nullptr) {
            if (context->ne[2] != x->ne[3]) {
                context = ggml_repeat(ctx->ggml_ctx, context, ggml_new_tensor_3d(ctx->ggml_ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
            }
        }
        if (y != nullptr) {
            if (y->ne[1] != x->ne[3]) {
                y = ggml_repeat(ctx->ggml_ctx, y, ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, y->ne[0], x->ne[3]));
            }
        }
        auto time_embed_0     = std::dynamic_pointer_cast<Linear>(blocks["time_embed.0"]);
        auto time_embed_2     = std::dynamic_pointer_cast<Linear>(blocks["time_embed.2"]);
        auto input_blocks_0_0 = std::dynamic_pointer_cast<Conv2d>(blocks["input_blocks.0.0"]);
        auto zero_convs_0     = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs.0.0"]);
        auto middle_block_out = std::dynamic_pointer_cast<Conv2d>(blocks["middle_block_out.0"]);
        auto t_emb = ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, model_channels);  // [N, model_channels]
        auto emb = time_embed_0->forward(ctx, t_emb);
        emb      = ggml_silu_inplace(ctx->ggml_ctx, emb);
        emb      = time_embed_2->forward(ctx, emb);  // [N, time_embed_dim]
        // SDXL/SVD
        // NOTE(review): "label_emb.*" is only registered for SDXL/SVD in the
        // constructor, so callers must pass y == nullptr for other versions.
        if (y != nullptr) {
            auto label_embed_0 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.0"]);
            auto label_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.2"]);
            auto label_emb = label_embed_0->forward(ctx, y);
            label_emb      = ggml_silu_inplace(ctx->ggml_ctx, label_emb);
            label_emb      = label_embed_2->forward(ctx, label_emb);  // [N, time_embed_dim]
            emb = ggml_add(ctx->ggml_ctx, emb, label_emb);  // [N, time_embed_dim]
        }
        std::vector<struct ggml_tensor*> outs;
        if (guided_hint == nullptr) {
            guided_hint = input_hint_block_forward(ctx, hint, emb, context);
        }
        outs.push_back(guided_hint);
        // input_blocks
        // input block 0
        auto h = input_blocks_0_0->forward(ctx, x);
        h      = ggml_add(ctx->ggml_ctx, h, guided_hint);
        outs.push_back(zero_convs_0->forward(ctx, h));
        // input block 1-11
        const size_t len_mults = channel_mult.size();
        int input_block_idx    = 0;
        int ds                 = 1;
        for (size_t i = 0; i < len_mults; i++) {
            for (int j = 0; j < num_res_blocks; j++) {
                input_block_idx += 1;
                std::string res_name = "input_blocks." + std::to_string(input_block_idx) + ".0";
                h                    = resblock_forward(res_name, ctx, h, emb);  // [N, mult*model_channels, h, w]
                if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
                    std::string attn_name = "input_blocks." + std::to_string(input_block_idx) + ".1";
                    h                     = attention_layer_forward(attn_name, ctx, h, context);  // [N, mult*model_channels, h, w]
                }
                auto zero_conv = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs." + std::to_string(input_block_idx) + ".0"]);
                outs.push_back(zero_conv->forward(ctx, h));
            }
            if (i != len_mults - 1) {
                ds *= 2;
                input_block_idx += 1;
                std::string down_name = "input_blocks." + std::to_string(input_block_idx) + ".0";
                auto block            = std::dynamic_pointer_cast<DownSampleBlock>(blocks[down_name]);
                h = block->forward(ctx, h);  // [N, mult*model_channels, h/(2^(i+1)), w/(2^(i+1))]
                auto zero_conv = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs." + std::to_string(input_block_idx) + ".0"]);
                outs.push_back(zero_conv->forward(ctx, h));
            }
        }
        // [N, 4*model_channels, h/8, w/8]
        // middle_block
        h = resblock_forward("middle_block.0", ctx, h, emb);             // [N, 4*model_channels, h/8, w/8]
        h = attention_layer_forward("middle_block.1", ctx, h, context);  // [N, 4*model_channels, h/8, w/8]
        h = resblock_forward("middle_block.2", ctx, h, emb);             // [N, 4*model_channels, h/8, w/8]
        // out
        outs.push_back(middle_block_out->forward(ctx, h));
        return outs;
    }
 };
 struct ControlNet : public GGMLRunner {
    SDVersion version = VERSION_SD1;
    ControlNetBlock control_net;
    ggml_backend_buffer_t control_buffer = nullptr;  // keep control output tensors in backend memory
    ggml_context* control_ctx            = nullptr;
    std::vector<struct ggml_tensor*> controls;  // (12 input block outputs, 1 middle block output) SD 1.5
    struct ggml_tensor* guided_hint = nullptr;  // guided_hint cache, for faster inference
    bool guided_hint_cached         = false;
    ControlNet(ggml_backend_t backend,
               bool offload_params_to_cpu,
               const String2TensorStorage& tensor_storage_map = {},
               SDVersion version                              = VERSION_SD1)
        : GGMLRunner(backend, offload_params_to_cpu), control_net(version) {
        control_net.init(params_ctx, tensor_storage_map, "");
    }
    ~ControlNet() override {
        free_control_ctx();
    }
    void alloc_control_ctx(std::vector<struct ggml_tensor*> outs) {
        struct ggml_init_params params;
        params.mem_size   = static_cast<size_t>(outs.size() * ggml_tensor_overhead()) + 1024 * 1024;
        params.mem_buffer = nullptr;
        params.no_alloc   = true;
        control_ctx       = ggml_init(params);
        controls.resize(outs.size() - 1);
        size_t control_buffer_size = 0;
        guided_hint = ggml_dup_tensor(control_ctx, outs[0]);
        control_buffer_size += ggml_nbytes(guided_hint);
        for (int i = 0; i < outs.size() - 1; i++) {
            controls[i] = ggml_dup_tensor(control_ctx, outs[i + 1]);
            control_buffer_size += ggml_nbytes(controls[i]);
        }
        control_buffer = ggml_backend_alloc_ctx_tensors(control_ctx, runtime_backend);
        LOG_DEBUG("control buffer size %.2fMB", control_buffer_size * 1.f / 1024.f / 1024.f);
    }
    void free_control_ctx() {
        if (control_buffer != nullptr) {
            ggml_backend_buffer_free(control_buffer);
            control_buffer = nullptr;
        }
        if (control_ctx != nullptr) {
            ggml_free(control_ctx);
            control_ctx = nullptr;
        }
        guided_hint        = nullptr;
        guided_hint_cached = false;
        controls.clear();
    }
    std::string get_desc() override {
        return "control_net";
    }
    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
        control_net.get_param_tensors(tensors, prefix);
    }
    struct ggml_cgraph* build_graph(struct ggml_tensor* x,
                                    struct ggml_tensor* hint,
                                    struct ggml_tensor* timesteps,
                                    struct ggml_tensor* context,
                                    struct ggml_tensor* y = nullptr) {
        struct ggml_cgraph* gf = new_graph_custom(CONTROL_NET_GRAPH_SIZE);
        x = to_backend(x);
        if (guided_hint_cached) {
            hint = nullptr;
        } else {
            hint = to_backend(hint);
        }
        context   = to_backend(context);
        y         = to_backend(y);
        timesteps = to_backend(timesteps);
        auto runner_ctx = get_context();
        auto outs = control_net.forward(&runner_ctx,
                                        x,
                                        hint,
                                        guided_hint_cached ? guided_hint : nullptr,
                                        timesteps,
                                        context,
                                        y);
        if (control_ctx == nullptr) {
            alloc_control_ctx(outs);
        }
        ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[0], guided_hint));
        for (int i = 0; i < outs.size() - 1; i++) {
            ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[i + 1], controls[i]));
        }
        return gf;
    }
    bool compute(int n_threads,
                 struct ggml_tensor* x,
                 struct ggml_tensor* hint,
                 struct ggml_tensor* timesteps,
                 struct ggml_tensor* context,
                 struct ggml_tensor* y,
                 struct ggml_tensor** output     = nullptr,
                 struct ggml_context* output_ctx = nullptr) {
        // x: [N, in_channels, h, w]
        // timesteps: [N, ]
        // context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size]
        // y: [N, adm_in_channels] or [1, adm_in_channels]
        auto get_graph = [&]() -> struct ggml_cgraph* {
            return build_graph(x, hint, timesteps, context, y);
        };
        bool res = GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
        if (res) {
            // cache guided_hint
            guided_hint_cached = true;
        }
        return res;
    }
    bool load_from_file(const std::string& file_path, int n_threads) {
        LOG_INFO("loading control net from '%s'", file_path.c_str());
        alloc_params_buffer();
        std::map<std::string, ggml_tensor*> tensors;
        control_net.get_param_tensors(tensors);
        std::set<std::string> ignore_tensors;
        ModelLoader model_loader;
        if (!model_loader.init_from_file_and_convert_name(file_path)) {
            LOG_ERROR("init control net model loader from file failed: '%s'", file_path.c_str());
            return false;
        }
        bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads);
        if (!success) {
            LOG_ERROR("load control net tensors from model loader failed");
            return false;
        }
        LOG_INFO("control net model loaded");
        return success;
    }
 };
 #endif  // __CONTROL_HPP__
--- a/denoiser.hpp
+++ b/denoiser.hpp
--- a/diffusion_model.hpp
+++ b/diffusion_model.hpp
@ -0,0 +1,450 @@
 #ifndef __DIFFUSION_MODEL_H__
 #define __DIFFUSION_MODEL_H__
 #include "flux.hpp"
 #include "mmdit.hpp"
 #include "qwen_image.hpp"
 #include "unet.hpp"
 #include "wan.hpp"
 #include "z_image.hpp"
 // Bundle of inputs handed to DiffusionModel::compute(). Each model wrapper in
 // this file forwards only the subset of fields its runner takes; the rest are
 // left at their defaults.
 struct DiffusionParams {
    // Forwarded by every wrapper visible in this file.
    struct ggml_tensor* x                     = nullptr;
    struct ggml_tensor* timesteps             = nullptr;
    struct ggml_tensor* context               = nullptr;
    // Forwarded by the UNet, Flux and Wan wrappers.
    struct ggml_tensor* c_concat              = nullptr;
    // Forwarded by the UNet, MMDiT, Flux and Wan wrappers.
    struct ggml_tensor* y                     = nullptr;
    // Forwarded by the Flux wrapper only.
    struct ggml_tensor* guidance              = nullptr;
    // Forwarded by the Flux and Qwen-Image wrappers.
    std::vector<ggml_tensor*> ref_latents     = {};
    // Forwarded by the Flux wrapper; the Qwen-Image wrapper passes a literal `true` instead.
    bool increase_ref_index                   = false;
    // Forwarded by the UNet wrapper only.
    int num_video_frames                      = -1;
    std::vector<struct ggml_tensor*> controls = {};
    float control_strength                    = 0.f;
    // Forwarded by the Wan wrapper only.
    struct ggml_tensor* vace_context          = nullptr;
    float vace_strength                       = 1.f;
    // Forwarded by the MMDiT and Flux wrappers.
    std::vector<int> skip_layers              = {};
 };
 // Common interface over the concrete diffusion backbones (UNet, MMDiT, Flux,
 // Wan, Qwen-Image, ...). Each implementation wraps one runner and forwards
 // these calls to it.
 struct DiffusionModel {
    // Polymorphic base: a virtual destructor keeps destruction through a
    // DiffusionModel pointer well-defined.
    virtual ~DiffusionModel() = default;

    virtual std::string get_desc()                                                      = 0;
    // Runs the model on `diffusion_params`; `output` / `output_ctx` are passed
    // through to the underlying runner. Returns the runner's success flag.
    virtual bool compute(int n_threads,
                         DiffusionParams diffusion_params,
                         struct ggml_tensor** output     = nullptr,
                         struct ggml_context* output_ctx = nullptr)                     = 0;
    virtual void alloc_params_buffer()                                                  = 0;
    virtual void free_params_buffer()                                                   = 0;
    virtual void free_compute_buffer()                                                  = 0;
    virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0;
    virtual size_t get_params_buffer_size()                                             = 0;
    // Optional hook; the default implementation ignores the adapter.
    virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {}
    virtual int64_t get_adm_in_channels()                            = 0;
    virtual void set_flash_attention_enabled(bool enabled)           = 0;
    virtual void set_circular_axes(bool circular_x, bool circular_y) = 0;
 };
 struct UNetModel : public DiffusionModel {
    UNetModelRunner unet;
    UNetModel(ggml_backend_t backend,
              bool offload_params_to_cpu,
              const String2TensorStorage& tensor_storage_map = {},
              SDVersion version                              = VERSION_SD1)
        : unet(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version) {
    }
    std::string get_desc() override {
        return unet.get_desc();
    }
    void alloc_params_buffer() override {
        unet.alloc_params_buffer();
    }
    void free_params_buffer() override {
        unet.free_params_buffer();
    }
    void free_compute_buffer() override {
        unet.free_compute_buffer();
    }
    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
        unet.get_param_tensors(tensors, "model.diffusion_model");
    }
    size_t get_params_buffer_size() override {
        return unet.get_params_buffer_size();
    }
    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
        unet.set_weight_adapter(adapter);
    }
    int64_t get_adm_in_channels() override {
        return unet.unet.adm_in_channels;
    }
    void set_flash_attention_enabled(bool enabled) {
        unet.set_flash_attention_enabled(enabled);
    }
    void set_circular_axes(bool circular_x, bool circular_y) override {
        unet.set_circular_axes(circular_x, circular_y);
    }
    bool compute(int n_threads,
                 DiffusionParams diffusion_params,
                 struct ggml_tensor** output     = nullptr,
                 struct ggml_context* output_ctx = nullptr) override {
        return unet.compute(n_threads,
                            diffusion_params.x,
                            diffusion_params.timesteps,
                            diffusion_params.context,
                            diffusion_params.c_concat,
                            diffusion_params.y,
                            diffusion_params.num_video_frames,
                            diffusion_params.controls,
                            diffusion_params.control_strength, output, output_ctx);
    }
 };
 struct MMDiTModel : public DiffusionModel {
    MMDiTRunner mmdit;
    MMDiTModel(ggml_backend_t backend,
               bool offload_params_to_cpu,
               const String2TensorStorage& tensor_storage_map = {})
        : mmdit(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model") {
    }
    std::string get_desc() override {
        return mmdit.get_desc();
    }
    void alloc_params_buffer() override {
        mmdit.alloc_params_buffer();
    }
    void free_params_buffer() override {
        mmdit.free_params_buffer();
    }
    void free_compute_buffer() override {
        mmdit.free_compute_buffer();
    }
    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
        mmdit.get_param_tensors(tensors, "model.diffusion_model");
    }
    size_t get_params_buffer_size() override {
        return mmdit.get_params_buffer_size();
    }
    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
        mmdit.set_weight_adapter(adapter);
    }
    int64_t get_adm_in_channels() override {
        return 768 + 1280;
    }
    void set_flash_attention_enabled(bool enabled) {
        mmdit.set_flash_attention_enabled(enabled);
    }
    void set_circular_axes(bool circular_x, bool circular_y) override {
        mmdit.set_circular_axes(circular_x, circular_y);
    }
    bool compute(int n_threads,
                 DiffusionParams diffusion_params,
                 struct ggml_tensor** output     = nullptr,
                 struct ggml_context* output_ctx = nullptr) override {
        return mmdit.compute(n_threads,
                             diffusion_params.x,
                             diffusion_params.timesteps,
                             diffusion_params.context,
                             diffusion_params.y,
                             output,
                             output_ctx,
                             diffusion_params.skip_layers);
    }
 };
 struct FluxModel : public DiffusionModel {
    Flux::FluxRunner flux;
    FluxModel(ggml_backend_t backend,
              bool offload_params_to_cpu,
              const String2TensorStorage& tensor_storage_map = {},
              SDVersion version                              = VERSION_FLUX,
              bool use_mask                                  = false)
        : flux(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version, use_mask) {
    }
    std::string get_desc() override {
        return flux.get_desc();
    }
    void alloc_params_buffer() override {
        flux.alloc_params_buffer();
    }
    void free_params_buffer() override {
        flux.free_params_buffer();
    }
    void free_compute_buffer() override {
        flux.free_compute_buffer();
    }
    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
        flux.get_param_tensors(tensors, "model.diffusion_model");
    }
    size_t get_params_buffer_size() override {
        return flux.get_params_buffer_size();
    }
    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
        flux.set_weight_adapter(adapter);
    }
    int64_t get_adm_in_channels() override {
        return 768;
    }
    void set_flash_attention_enabled(bool enabled) {
        flux.set_flash_attention_enabled(enabled);
    }
    void set_circular_axes(bool circular_x, bool circular_y) override {
        flux.set_circular_axes(circular_x, circular_y);
    }
    bool compute(int n_threads,
                 DiffusionParams diffusion_params,
                 struct ggml_tensor** output     = nullptr,
                 struct ggml_context* output_ctx = nullptr) override {
        return flux.compute(n_threads,
                            diffusion_params.x,
                            diffusion_params.timesteps,
                            diffusion_params.context,
                            diffusion_params.c_concat,
                            diffusion_params.y,
                            diffusion_params.guidance,
                            diffusion_params.ref_latents,
                            diffusion_params.increase_ref_index,
                            output,
                            output_ctx,
                            diffusion_params.skip_layers);
    }
 };
 struct WanModel : public DiffusionModel {
    std::string prefix;
    WAN::WanRunner wan;
    WanModel(ggml_backend_t backend,
             bool offload_params_to_cpu,
             const String2TensorStorage& tensor_storage_map = {},
             const std::string prefix                       = "model.diffusion_model",
             SDVersion version                              = VERSION_WAN2)
        : prefix(prefix), wan(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) {
    }
    std::string get_desc() override {
        return wan.get_desc();
    }
    void alloc_params_buffer() override {
        wan.alloc_params_buffer();
    }
    void free_params_buffer() override {
        wan.free_params_buffer();
    }
    void free_compute_buffer() override {
        wan.free_compute_buffer();
    }
    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
        wan.get_param_tensors(tensors, prefix);
    }
    size_t get_params_buffer_size() override {
        return wan.get_params_buffer_size();
    }
    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
        wan.set_weight_adapter(adapter);
    }
    int64_t get_adm_in_channels() override {
        return 768;
    }
    void set_flash_attention_enabled(bool enabled) {
        wan.set_flash_attention_enabled(enabled);
    }
    void set_circular_axes(bool circular_x, bool circular_y) override {
        wan.set_circular_axes(circular_x, circular_y);
    }
    bool compute(int n_threads,
                 DiffusionParams diffusion_params,
                 struct ggml_tensor** output     = nullptr,
                 struct ggml_context* output_ctx = nullptr) override {
        return wan.compute(n_threads,
                           diffusion_params.x,
                           diffusion_params.timesteps,
                           diffusion_params.context,
                           diffusion_params.y,
                           diffusion_params.c_concat,
                           nullptr,
                           diffusion_params.vace_context,
                           diffusion_params.vace_strength,
                           output,
                           output_ctx);
    }
 };
 struct QwenImageModel : public DiffusionModel {
    std::string prefix;
    Qwen::QwenImageRunner qwen_image;
    QwenImageModel(ggml_backend_t backend,
                   bool offload_params_to_cpu,
                   const String2TensorStorage& tensor_storage_map = {},
                   const std::string prefix                       = "model.diffusion_model",
                   SDVersion version                              = VERSION_QWEN_IMAGE,
                   bool zero_cond_t                               = false)
        : prefix(prefix), qwen_image(backend, offload_params_to_cpu, tensor_storage_map, prefix, version, zero_cond_t) {
    }
    std::string get_desc() override {
        return qwen_image.get_desc();
    }
    void alloc_params_buffer() override {
        qwen_image.alloc_params_buffer();
    }
    void free_params_buffer() override {
        qwen_image.free_params_buffer();
    }
    void free_compute_buffer() override {
        qwen_image.free_compute_buffer();
    }
    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
        qwen_image.get_param_tensors(tensors, prefix);
    }
    size_t get_params_buffer_size() override {
        return qwen_image.get_params_buffer_size();
    }
    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
        qwen_image.set_weight_adapter(adapter);
    }
    int64_t get_adm_in_channels() override {
        return 768;
    }
    void set_flash_attention_enabled(bool enabled) {
        qwen_image.set_flash_attention_enabled(enabled);
    }
    void set_circular_axes(bool circular_x, bool circular_y) override {
        qwen_image.set_circular_axes(circular_x, circular_y);
    }
    bool compute(int n_threads,
                 DiffusionParams diffusion_params,
                 struct ggml_tensor** output     = nullptr,
                 struct ggml_context* output_ctx = nullptr) override {
        return qwen_image.compute(n_threads,
                                  diffusion_params.x,
                                  diffusion_params.timesteps,
                                  diffusion_params.context,
                                  diffusion_params.ref_latents,
                                  true,  // increase_ref_index
                                  output,
                                  output_ctx);
    }
 };
 struct ZImageModel : public DiffusionModel {
    std::string prefix;
    ZImage::ZImageRunner z_image;
    ZImageModel(ggml_backend_t backend,
                bool offload_params_to_cpu,
                const String2TensorStorage& tensor_storage_map = {},
                const std::string prefix                       = "model.diffusion_model",
                SDVersion version                              = VERSION_Z_IMAGE)
        : prefix(prefix), z_image(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) {
    }
    std::string get_desc() override {
        return z_image.get_desc();
    }
    void alloc_params_buffer() override {
        z_image.alloc_params_buffer();
    }
    void free_params_buffer() override {
        z_image.free_params_buffer();
    }
    void free_compute_buffer() override {
        z_image.free_compute_buffer();
    }
    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
        z_image.get_param_tensors(tensors, prefix);
    }
    size_t get_params_buffer_size() override {
        return z_image.get_params_buffer_size();
    }
    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
        z_image.set_weight_adapter(adapter);
    }
    int64_t get_adm_in_channels() override {
        return 768;
    }
    void set_flash_attention_enabled(bool enabled) {
        z_image.set_flash_attention_enabled(enabled);
    }
    void set_circular_axes(bool circular_x, bool circular_y) override {
        z_image.set_circular_axes(circular_x, circular_y);
    }
    bool compute(int n_threads,
                 DiffusionParams diffusion_params,
                 struct ggml_tensor** output     = nullptr,
                 struct ggml_context* output_ctx = nullptr) override {
        return z_image.compute(n_threads,
                               diffusion_params.x,
                               diffusion_params.timesteps,
                               diffusion_params.context,
                               diffusion_params.ref_latents,
                               true,  // increase_ref_index
                               output,
                               output_ctx);
    }
 };
 #endif
--- a/docs/build.md
+++ b/docs/build.md
@ -0,0 +1,173 @@
 # Build from scratch
 ## Get the Code
 ```
 git clone --recursive https://github.com/leejet/stable-diffusion.cpp
 cd stable-diffusion.cpp
 ```
 - If you have already cloned the repository, you can use the following command to update the repository to the latest code.
 ```
 cd stable-diffusion.cpp
 git pull origin master
 git submodule init
 git submodule update
 ```
 ## Build (CPU only)
 If you don't have a GPU or CUDA installed, you can build a CPU-only version.
 ```shell
 mkdir build && cd build
 cmake ..
 cmake --build . --config Release
 ```
 ## Build with OpenBLAS
 ```shell
 mkdir build && cd build
 cmake .. -DGGML_OPENBLAS=ON
 cmake --build . --config Release
 ```
 ## Build with CUDA
 This provides GPU acceleration using NVIDIA GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). Recommended to have at least 4 GB of VRAM.
 ```shell
 mkdir build && cd build
 cmake .. -DSD_CUDA=ON
 cmake --build . --config Release
 ```
 ## Build with HipBLAS
 This provides GPU acceleration using AMD GPU. Make sure to have the ROCm toolkit installed.
 To build for another GPU architecture than installed in your system, set `$GFX_NAME` manually to the desired architecture (replace first command). This is also necessary if your GPU is not officially supported by ROCm, for example you have to set `$GFX_NAME` manually to `gfx1030` for consumer RDNA2 cards.
 Windows users: refer to [docs/hipBLAS_on_Windows.md](hipBLAS_on_Windows.md) for a comprehensive guide.
 ```shell
 mkdir build && cd build
 if command -v rocminfo; then export GFX_NAME=$(rocminfo | awk '/ *Name: +gfx[1-9]/ {print $2; exit}'); else echo "rocminfo missing!"; fi
 if [ -z "${GFX_NAME}" ]; then echo "Error: Couldn't detect GPU!"; else echo "Building for GPU: ${GFX_NAME}"; fi
 cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DAMDGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
 cmake --build . --config Release
 ```
 ## Build with MUSA
 This provides GPU acceleration using Moore Threads GPU. Make sure to have the MUSA toolkit installed.
 ```shell
 mkdir build && cd build
 cmake .. -DCMAKE_C_COMPILER=/usr/local/musa/bin/clang -DCMAKE_CXX_COMPILER=/usr/local/musa/bin/clang++ -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release
 cmake --build . --config Release
 ```
 ## Build with Metal
 Using Metal makes the computation run on the GPU. Currently, there are some issues with Metal when performing operations on very large matrices, making it highly inefficient at the moment. Performance improvements are expected in the near future.
 ```shell
 mkdir build && cd build
 cmake .. -DSD_METAL=ON
 cmake --build . --config Release
 ```
 ## Build with Vulkan
 Install Vulkan SDK from https://www.lunarg.com/vulkan-sdk/.
 ```shell
 mkdir build && cd build
 cmake .. -DSD_VULKAN=ON
 cmake --build . --config Release
 ```
 ## Build with OpenCL (for Adreno GPU)
 Currently, it supports only Adreno GPUs and is primarily optimized for the Q4_0 type.
 To build for Windows ARM, please refer to [Windows 11 Arm64](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/OPENCL.md#windows-11-arm64).
 Building for Android:
  Android NDK:
       Download and install the Android NDK from the [official Android developer site](https://developer.android.com/ndk/downloads).
 Setup OpenCL Dependencies for NDK:
 You need to provide OpenCL headers and the ICD loader library to your NDK sysroot.
 *   OpenCL Headers:
    ```bash
    # In a temporary working directory
    git clone https://github.com/KhronosGroup/OpenCL-Headers
    cd OpenCL-Headers
    # Replace <YOUR_NDK_PATH> with your actual NDK installation path
    # e.g., cp -r CL /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
    sudo cp -r CL <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
    cd ..
    ```
 *   OpenCL ICD Loader:
    ```shell
    # In the same temporary working directory
    git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
    cd OpenCL-ICD-Loader
    mkdir build_ndk && cd build_ndk
    # Replace <YOUR_NDK_PATH> in the CMAKE_TOOLCHAIN_FILE and OPENCL_ICD_LOADER_HEADERS_DIR
    cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
      -DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
      -DOPENCL_ICD_LOADER_HEADERS_DIR=<YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
      -DANDROID_ABI=arm64-v8a \
      -DANDROID_PLATFORM=24 \
      -DANDROID_STL=c++_shared
    ninja
    # Replace <YOUR_NDK_PATH>
    # e.g., cp libOpenCL.so /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
    sudo cp libOpenCL.so <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
    cd ../..
    ```
 Build `stable-diffusion.cpp` for Android with OpenCL:
 ```shell
 mkdir build-android && cd build-android
 # Replace <YOUR_NDK_PATH> with your actual NDK installation path
 # e.g., -DCMAKE_TOOLCHAIN_FILE=/path/to/android-ndk-r26c/build/cmake/android.toolchain.cmake
 cmake .. -G Ninja \
  -DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
  -DANDROID_ABI=arm64-v8a \
  -DANDROID_PLATFORM=android-28 \
  -DGGML_OPENMP=OFF \
  -DSD_OPENCL=ON
 ninja
 ```
 *(Note: Don't forget to include `LD_LIBRARY_PATH=/vendor/lib64` in your command line before running the binary)*
 ## Build with SYCL
 Using SYCL makes the computation run on the Intel GPU. Please make sure you have installed the related driver and the [Intel® oneAPI Base toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) before you start. For more details and steps, refer to the [llama.cpp SYCL backend](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/SYCL.md#linux) documentation.
 ```shell
 # Export relevant ENV variables
 source /opt/intel/oneapi/setvars.sh
 # Option 1: Use FP32 (recommended for better performance in most cases)
 cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 # Option 2: Use FP16
 cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
 cmake --build . --config Release
 ```
--- a/docs/caching.md
+++ b/docs/caching.md
@ -0,0 +1,126 @@
 ## Caching
 Caching methods accelerate diffusion inference by reusing intermediate computations when changes between steps are small.
 ### Cache Modes
 | Mode | Target | Description |
 |------|--------|-------------|
 | `ucache` | UNET models | Condition-level caching with error tracking |
 | `easycache` | DiT models | Condition-level cache |
 | `dbcache` | DiT models | Block-level L1 residual threshold |
 | `taylorseer` | DiT models | Taylor series approximation |
 | `cache-dit` | DiT models | Combined DBCache + TaylorSeer |
 ### UCache (UNET Models)
 UCache caches the residual difference (output - input) and reuses it when input changes are below threshold.
 ```bash
 sd-cli -m model.safetensors -p "a cat" --cache-mode ucache --cache-option "threshold=1.5"
 ```
 #### Parameters
 | Parameter | Description | Default |
 |-----------|-------------|---------|
 | `threshold` | Error threshold for reuse decision | 1.0 |
 | `start` | Start caching at this percent of steps | 0.15 |
 | `end` | Stop caching at this percent of steps | 0.95 |
 | `decay` | Error decay rate (0-1) | 1.0 |
 | `relative` | Scale threshold by output norm (0/1) | 1 |
 | `reset` | Reset error after computing (0/1) | 1 |
 #### Reset Parameter
 The `reset` parameter controls error accumulation behavior:
 - `reset=1` (default): Resets accumulated error after each computed step. More aggressive caching, works well with most samplers.
 - `reset=0`: Keeps error accumulated. More conservative, recommended for `euler_a` sampler.
 ### EasyCache (DiT Models)
 Condition-level caching for DiT models. Caches and reuses outputs when input changes are below threshold.
 ```bash
 --cache-mode easycache --cache-option "threshold=0.3"
 ```
 #### Parameters
 | Parameter | Description | Default |
 |-----------|-------------|---------|
 | `threshold` | Input change threshold for reuse | 0.2 |
 | `start` | Start caching at this percent of steps | 0.15 |
 | `end` | Stop caching at this percent of steps | 0.95 |
 ### Cache-DIT (DiT Models)
 For DiT models like FLUX and QWEN, use block-level caching modes.
 #### DBCache
 Caches blocks based on L1 residual difference threshold:
 ```bash
 --cache-mode dbcache --cache-option "threshold=0.25,warmup=4"
 ```
 #### TaylorSeer
 Uses Taylor series approximation to predict block outputs:
 ```bash
 --cache-mode taylorseer
 ```
 #### Cache-DIT (Combined)
 Combines DBCache and TaylorSeer:
 ```bash
 --cache-mode cache-dit --cache-preset fast
 ```
 #### Parameters
 | Parameter | Description | Default |
 |-----------|-------------|---------|
 | `Fn` | Front blocks to always compute | 8 |
 | `Bn` | Back blocks to always compute | 0 |
 | `threshold` | L1 residual difference threshold | 0.08 |
 | `warmup` | Steps before caching starts | 8 |
 #### Presets
 Available presets: `slow`, `medium`, `fast`, `ultra` (or `s`, `m`, `f`, `u`).
 ```bash
 --cache-mode cache-dit --cache-preset fast
 ```
 #### SCM Options
 Steps Computation Mask controls which steps can be cached:
 ```bash
 --scm-mask "1,1,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,1"
 ```
 Mask values: `1` = compute, `0` = can cache.
 | Policy | Description |
 |--------|-------------|
 | `dynamic` | Check threshold before caching |
 | `static` | Always cache on cacheable steps |
 ```bash
 --scm-policy dynamic
 ```
 ### Performance Tips
 - Start with default thresholds and adjust based on output quality
 - Lower threshold = better quality, less speedup
 - Higher threshold = more speedup, potential quality loss
 - More steps generally means more caching opportunities
--- a/docs/chroma.md
+++ b/docs/chroma.md
@ -0,0 +1,33 @@
 # How to Use
 You can run Chroma using stable-diffusion.cpp with a GPU that has 6GB or even 4GB of VRAM, without needing to offload to RAM.
 ## Download weights
 - Download Chroma
    - If you don't want to do the conversion yourself, download the preconverted gguf model from [silveroxides/Chroma-GGUF](https://huggingface.co/silveroxides/Chroma-GGUF)
    - Otherwise, download chroma's safetensors from [lodestones/Chroma](https://huggingface.co/lodestones/Chroma)
 - Download vae from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
 - Download t5xxl from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
 ## Convert Chroma weights
 You can download the preconverted gguf weights from [silveroxides/Chroma-GGUF](https://huggingface.co/silveroxides/Chroma-GGUF), this way you don't have to do the conversion yourself.
 ```
 .\bin\Release\sd-cli.exe -M convert -m ..\..\ComfyUI\models\unet\chroma-unlocked-v40.safetensors -o ..\models\chroma-unlocked-v40-q8_0.gguf -v --type q8_0
 ```
 ## Run
 ### Example
 For example:
 ```
 .\bin\Release\sd-cli.exe --diffusion-model  ..\models\chroma-unlocked-v40-q8_0.gguf --vae ..\models\ae.sft --t5xxl ..\models\t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'chroma.cpp'" --cfg-scale 4.0 --sampling-method euler -v --chroma-disable-dit-mask --clip-on-cpu
 ```
 ![](../assets/flux/chroma_v40.png)
--- a/docs/chroma_radiance.md
+++ b/docs/chroma_radiance.md
@ -0,0 +1,21 @@
 # How to Use
 ## Download weights
 - Download Chroma1-Radiance
    - safetensors: https://huggingface.co/lodestones/Chroma1-Radiance/tree/main
    - gguf: https://huggingface.co/silveroxides/Chroma1-Radiance-GGUF/tree/main
 - Download t5xxl
    - safetensors: https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
 ## Examples
 ```
 .\bin\Release\sd-cli.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Chroma1-Radiance-v0.4-Q8_0.gguf --t5xxl ..\..\ComfyUI\models\clip\t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'chroma  radiance cpp'" --cfg-scale 4.0 --sampling-method euler -v
 ```
 <img alt="Chroma1-Radiance" src="../assets/flux/chroma1-radiance.png" />
--- a/docs/distilled_sd.md
+++ b/docs/distilled_sd.md
@ -0,0 +1,137 @@
 # Running distilled models: SSD1B, Vega and SDx.x with tiny U-Nets
 ## Preface 
 These models feature a reduced U-Net architecture. Unlike standard SDXL models, the SSD-1B and Vega U-Net contains only one middle block and fewer attention layers in its up- and down-blocks, resulting in significantly smaller file sizes. Using these models can reduce inference time by more than 33%. For more details, refer to Segmind's paper: https://arxiv.org/abs/2401.02677v1.
 Similarly, SD1.x- and SD2.x-style models with a tiny U-Net consist of only 6 U-Net blocks, leading to very small files and time savings of up to 50%. For more information, see the paper: https://arxiv.org/pdf/2305.15798.pdf.
 ## SSD1B
 Note that not all of these models follow the standard parameter naming conventions. However, several useful SSD-1B models are available online, such as:
 * https://huggingface.co/segmind/SSD-1B/resolve/main/SSD-1B-A1111.safetensors
 * https://huggingface.co/hassenhamdi/SSD-1B-fp8_e4m3fn/resolve/main/SSD-1B_fp8_e4m3fn.safetensors
 Useful LoRAs are also available:
 * https://huggingface.co/seungminh/lora-swarovski-SSD-1B/resolve/main/pytorch_lora_weights.safetensors
 * https://huggingface.co/kylielee505/mylcmlorassd/resolve/main/pytorch_lora_weights.safetensors
 ## Vega
 Segmind's Vega model is available online here:
 * https://huggingface.co/segmind/Segmind-Vega/resolve/main/segmind-vega.safetensors
 VegaRT is an example for an LCM-LoRA:
 * https://huggingface.co/segmind/Segmind-VegaRT/resolve/main/pytorch_lora_weights.safetensors
 Both files can be used out-of-the-box, unlike the models described in the next sections.
 ## SD1.x, SD2.x with tiny U-Nets
 These models require conversion before use. You will need a Python script provided by the diffusers team, available on GitHub:
 * https://raw.githubusercontent.com/huggingface/diffusers/refs/heads/main/scripts/convert_diffusers_to_original_stable_diffusion.py
 ### SD2.x
 NotaAI provides the following model online:
 * https://huggingface.co/nota-ai/bk-sdm-v2-tiny
 Creating a .safetensors file involves two steps. First, run this short Python script to download the model from Hugging Face:
 ```python
 from diffusers import StableDiffusionPipeline
 pipe = StableDiffusionPipeline.from_pretrained("nota-ai/bk-sdm-v2-tiny",cache_dir="./")
 ```
 Second, create the .safetensors file by running:
 ```bash
 python convert_diffusers_to_original_stable_diffusion.py \
      --model_path  models--nota-ai--bk-sdm-v2-tiny/snapshots/68277af553777858cd47e133f92e4db47321bc74 \
      --checkpoint_path bk-sdm-v2-tiny.safetensors --half --use_safetensors
 ```
 This will generate the file **bk-sdm-v2-tiny.safetensors**, which is now ready for use with sd.cpp.
 ### SD1.x
 Several Tiny SD 1.x models are available online, such as:
 * https://huggingface.co/segmind/tiny-sd
 * https://huggingface.co/segmind/portrait-finetuned
 * https://huggingface.co/nota-ai/bk-sdm-tiny
 These models also require conversion, partly because some tensors are stored in a non-contiguous manner. To create a usable checkpoint file, follow these simple steps:
 Download and prepare the model using Python: 
 ##### Download the model using Python on your computer, for example this way:
 ```python
 import torch
 from diffusers import StableDiffusionPipeline
 pipe = StableDiffusionPipeline.from_pretrained("segmind/tiny-sd")
 unet=pipe.unet
 for param in unet.parameters():
    param.data = param.data.contiguous()     # <- important here
 pipe.save_pretrained("segmindtiny-sd", safe_serialization=True)
 ```
 ##### Run the conversion script:
 ```bash
 python convert_diffusers_to_original_stable_diffusion.py \
      --model_path  ./segmindtiny-sd \
      --checkpoint_path ./segmind_tiny-sd.ckpt --half
 ```
 The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
 ##### Another available .ckpt file:
 * https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt
 To use this file, you must first adjust its non-contiguous tensors:
 ```python
 import torch
 ckpt = torch.load("tinySDdistilled.ckpt", map_location=torch.device('cpu'))
 for key, value in ckpt['state_dict'].items():
    if isinstance(value, torch.Tensor):
        ckpt['state_dict'][key] = value.contiguous()
 torch.save(ckpt, "tinySDdistilled_fixed.ckpt")
 ```
 ### SDXS-512
 Another very tiny and **incredibly fast** model is SDXS by IDKiro et al. The authors refer to it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details, read the paper: https://arxiv.org/pdf/2403.16627. Once again, the authors removed further blocks from the U-Net, and unlike other SD1 models, they use an adjusted _AutoEncoderTiny_ instead of the default _AutoEncoderKL_ for the VAE part.
 ##### 1. Download the diffusers model from  Hugging Face using Python:
 ```python
 from diffusers import StableDiffusionPipeline
 pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-dreamshaper")
 pipe.save_pretrained(save_directory="sdxs")
 ```
 ##### 2. Create a safetensors file
 ```bash
 python convert_diffusers_to_original_stable_diffusion.py \
    --model_path  sdxs  --checkpoint_path sdxs.safetensors --half --use_safetensors
 ```
 ##### 3. Run the model as follows:
 ```bash
 ~/stable-diffusion.cpp/build/bin/sd-cli -m sdxs.safetensors -p "portrait of a lovely cat" \
  --cfg-scale 1 --steps 1
 ```
 Both options, `--cfg-scale 1` and `--steps 1`, are mandatory here.
--- a/docs/docker.md
+++ b/docs/docker.md
@ -0,0 +1,39 @@
 # Docker
 ## Run CLI
 ```shell
 docker run --rm -v /path/to/models:/models -v /path/to/output/:/output ghcr.io/leejet/stable-diffusion.cpp:master [args...]
 # For example
 # docker run --rm -v ./models:/models -v ./build:/output ghcr.io/leejet/stable-diffusion.cpp:master -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
 ```
 ## Run server
 ```shell
 docker run --rm --init -v /path/to/models:/models -v /path/to/output/:/output -p "1234:1234" --entrypoint "/sd-server" ghcr.io/leejet/stable-diffusion.cpp:master [args...]
 # For example
 # docker run --rm --init -v ./models:/models -v ./build:/output -p "1234:1234" --entrypoint "/sd-server" ghcr.io/leejet/stable-diffusion.cpp:master -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
 ```
 ## Building using Docker
 ```shell
 docker build -t sd .
 ```
 ## Building variants using Docker
 Vulkan:
 ```shell
 docker build -f Dockerfile.vulkan -t sd .
 ```
 ## Run locally built image's CLI
 ```shell
 docker run --rm -v /path/to/models:/models -v /path/to/output/:/output sd [args...]
 # For example
 # docker run --rm -v ./models:/models -v ./build:/output sd -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
 ```
--- a/docs/esrgan.md
+++ b/docs/esrgan.md
@ -0,0 +1,9 @@
 ## Using ESRGAN to upscale results
 You can use ESRGAN—such as the model [RealESRGAN_x4plus_anime_6B.pth](https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth)—to upscale the generated images and improve their overall resolution and clarity.
 - Specify the model path using the `--upscale-model PATH` parameter. For example:
 ```bash
 sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --upscale-model ../models/RealESRGAN_x4plus_anime_6B.pth
 ```
--- a/docs/flux.md
+++ b/docs/flux.md
@ -0,0 +1,66 @@
 # How to Use
 You can run Flux using stable-diffusion.cpp with a GPU that has 6GB or even 4GB of VRAM, without needing to offload to RAM.
 ## Download weights
 - Download flux
    - If you don't want to do the conversion yourself, download the preconverted gguf model from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell](https://huggingface.co/leejet/FLUX.1-schnell-gguf)
    - Otherwise, download flux-dev from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/flux1-dev.safetensors or flux-schnell from https://huggingface.co/black-forest-labs/FLUX.1-schnell/blob/main/flux1-schnell.safetensors
 - Download vae from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
 - Download clip_l from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/clip_l.safetensors
 - Download t5xxl from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
 ## Convert flux weights
 You can download the preconverted gguf weights from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell](https://huggingface.co/leejet/FLUX.1-schnell-gguf), this way you don't have to do the conversion yourself.
 For example:
 ```
 .\bin\Release\sd-cli.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-dev.sft -o ..\models\flux1-dev-q8_0.gguf -v --type q8_0
 ```
 ## Run
 - `--cfg-scale` is recommended to be set to 1. 
 ### Flux-dev
 For example:
 ```
 .\bin\Release\sd-cli.exe --diffusion-model  ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
 ```
 Using formats of different precisions will yield results of varying quality.
 | Type | q8_0  | q4_0  | q4_k  | q3_k  | q2_k |
 |---- | ----  |----  |----  |----  |----  |
 | **Memory** | 12068.09 MB  | 6394.53 MB | 6395.17 MB | 4888.16 MB  | 3735.73 MB |
 | **Result** | ![](../assets/flux/flux1-dev-q8_0.png) |![](../assets/flux/flux1-dev-q4_0.png) |![](../assets/flux/flux1-dev-q4_k.png) |![](../assets/flux/flux1-dev-q3_k.png) |![](../assets/flux/flux1-dev-q2_k.png)|
 ### Flux-schnell
 ```
 .\bin\Release\sd-cli.exe --diffusion-model  ..\models\flux1-schnell-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --steps 4 --clip-on-cpu
 ```
 | q8_0  |
 | ----  |
 |![](../assets/flux/flux1-schnell-q8_0.png) |
 ## Run with LoRA
 Since many flux LoRA training libraries have used various LoRA naming formats, it is possible that not all flux LoRA naming formats are supported. It is recommended to use LoRA with naming formats compatible with ComfyUI.
 ### Flux-dev q8_0 with LoRA
 - LoRA model from https://huggingface.co/XLabs-AI/flux-lora-collection/tree/main (using comfy converted version!!!)
 ```
 .\bin\Release\sd-cli.exe --diffusion-model  ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'flux.cpp'<lora:realism_lora_comfy_converted:1>" --cfg-scale 1.0 --sampling-method euler -v --lora-model-dir ../models --clip-on-cpu
 ```
 ![output](../assets/flux/flux1-dev-q8_0%20with%20lora.png)
--- a/Show More
+++ b/Show More