refactor: simplify sample cache flow (#1350)

perf(z-image): switch to fused SwiGLU kernel (#1302)
style: remove redundant struct qualifiers for consistent C/C++ type usage (#1349)
2026-03-24 10:18:51 +00:00 · 2026-03-17 00:28:03 +08:00 · 2026-03-17 00:27:46 +08:00 · 2026-03-16 22:17:22 +08:00 · 2026-03-16 22:16:43 +08:00 · 2026-03-16 00:26:57 +08:00
199 changed files with 4400320 additions and 534430 deletions
--- a/.clang-tidy
+++ b/.clang-tidy
@ -0,0 +1,10 @@
+Checks: >  # folded scalar: the lines below join into one comma-separated check list
+  modernize-make-shared,
+  modernize-use-nullptr,
+  modernize-use-override,
+  modernize-pass-by-value,
+  modernize-return-braced-init-list,
+  modernize-deprecated-headers,
+HeaderFilterRegex: '^$'  # regex matches only the empty string, so no header path is selected
+WarningsAsErrors: ''  # empty list: no check is promoted to an error
+FormatStyle: none  # do not reformat code around applied fix-its
--- a/.dockerignore
+++ b/.dockerignore
@ -1,4 +1,5 @@
 build*/
+docs/
 test/

 .cache/
--- a/.github/ISSUE_TEMPLATE/bug_report.yml
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@ -0,0 +1,73 @@
+name: 🐞 Bug Report
+description: Report a bug or unexpected behavior
+title: "[Bug] "
+labels: ["bug"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Please use this template and include as many details as possible to help us reproduce and fix the issue.
+  - type: textarea
+    id: commit
+    attributes:
+      label: Git commit
+      description: Which commit are you trying to compile?
+      placeholder: |
+        $git rev-parse HEAD
+        40a6a8710ec15b1b5db6b5a098409f6bc8f654a4
+    validations:
+      required: true
+  - type: input
+    id: os
+    attributes:
+      label: Operating System & Version
+      placeholder: e.g. “Ubuntu 22.04”, “Windows 11 23H2”, “macOS 14.3”
+    validations:
+      required: true
+  - type: dropdown
+    id: backends
+    attributes:
+      label: GGML backends
+      description: Which GGML backends do you know to be affected?
+      options: [CPU, CUDA, HIP, Metal, Musa, SYCL, Vulkan, OpenCL]
+      multiple: true  # a bug may affect several backends, so allow more than one selection
+    validations:
+      required: true
+  - type: input
+    id: cmd_arguments
+    attributes:
+      label: Command-line arguments used
+      placeholder: The full command line you ran (with all flags)
+    validations:
+      required: true
+  - type: textarea
+    id: steps_to_reproduce
+    attributes:
+      label: Steps to reproduce
+      placeholder: A step-by-step list of what you did
+    validations:
+      required: true
+  - type: textarea
+    id: expected_behavior
+    attributes:
+      label: What you expected to happen
+      placeholder: Describe the expected behavior or result
+    validations:
+      required: true
+  - type: textarea
+    id: actual_behavior
+    attributes:
+      label: What actually happened
+      placeholder: Describe what you saw instead (errors, logs, crash, etc.)
+    validations:
+      required: true
+  - type: textarea
+    id: logs_and_errors
+    attributes:
+      label: Logs / error messages / stack trace
+      placeholder: Paste complete logs or error output
+  - type: textarea
+    id: additional_info
+    attributes:
+      label: Additional context / environment details
+      placeholder: e.g. CPU model, GPU, RAM, model file versions, quantization type, etc.
--- a/.github/ISSUE_TEMPLATE/feature_request.yml
+++ b/.github/ISSUE_TEMPLATE/feature_request.yml
@ -0,0 +1,33 @@
+name: 💡 Feature Request
+description: Suggest a new feature or improvement
+title: "[Feature] "
+labels: ["enhancement"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Thank you for suggesting an improvement! Please fill in the fields below.
+  - type: input
+    id: summary
+    attributes:
+      label: Feature Summary
+      placeholder: A one-line summary of the feature you’d like
+    validations:
+      required: true
+  - type: textarea
+    id: description
+    attributes:
+      label: Detailed Description
+      placeholder: What problem does this solve? How do you expect it to work?
+    validations:
+      required: true
+  - type: textarea
+    id: alternatives
+    attributes:
+      label: Alternatives you considered
+      placeholder: Any alternative designs or workarounds you tried
+  - type: textarea
+    id: additional_context
+    attributes:
+      label: Additional context
+      placeholder: Any extra information (use cases, related functionalities, constraints)
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -4,21 +4,47 @@ on:
  workflow_dispatch: # allows manual triggering
    inputs:
      create_release:
-        description: 'Create new release'
+        description: "Create new release"
        required: true
        type: boolean
  push:
    branches:
      - master
      - ci
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
+    paths:
+      [
+        ".github/workflows/**",
+        "**/CMakeLists.txt",
+        "**/Makefile",
+        "**/*.h",
+        "**/*.hpp",
+        "**/*.c",
+        "**/*.cpp",
+        "**/*.cu",
+        "examples/server/frontend/**",
+      ]
  pull_request:
    types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
+    paths:
+      [
+        ".github/workflows/**",
+        "**/CMakeLists.txt",
+        "**/Makefile",
+        "**/*.h",
+        "**/*.hpp",
+        "**/*.c",
+        "**/*.cpp",
+        "**/*.cu",
+        "examples/server/frontend/**",
+      ]

 env:
  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}

+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
 jobs:
  ubuntu-latest-cmake:
    runs-on: ubuntu-latest
@ -30,6 +56,16 @@ jobs:
        with:
          submodules: recursive

+      - name: Setup Node
+        uses: actions/setup-node@v4
+        with:
+          node-version: 20
+
+      - name: Setup pnpm
+        uses: pnpm/action-setup@v4
+        with:
+          version: 9
+
      - name: Dependencies
        id: depends
        run: |
@ -46,8 +82,8 @@ jobs:

      - name: Get commit hash
        id: commit
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }}
-        uses: pr-mpt/actions-commit-hash@v2
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: prompt/actions-commit-hash@v2

      - name: Fetch system info
        id: system-info
@ -67,11 +103,148 @@ jobs:

      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
        with:
+          name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
          path: |
            sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip

+  ubuntu-latest-cmake-vulkan:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: recursive
+
+      - name: Setup Node
+        uses: actions/setup-node@v4
+        with:
+          node-version: 20
+
+      - name: Setup pnpm
+        uses: pnpm/action-setup@v4
+        with:
+          version: 9
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential libvulkan-dev glslc  # -y: never wait on a confirmation prompt
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake .. -DSD_BUILD_SHARED_LIBS=ON -DSD_VULKAN=ON
+          cmake --build . --config Release
+
+      - name: Get commit hash
+        id: commit
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: prompt/actions-commit-hash@v2
+
+      - name: Fetch system info
+        id: system-info
+        run: |
+          echo "CPU_ARCH=`uname -m`" >> "$GITHUB_OUTPUT"
+          echo "OS_NAME=`lsb_release -s -i`" >> "$GITHUB_OUTPUT"
+          echo "OS_VERSION=`lsb_release -s -r`" >> "$GITHUB_OUTPUT"
+          echo "OS_TYPE=`uname -s`" >> "$GITHUB_OUTPUT"
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          cp ggml/LICENSE ./build/bin/ggml.txt
+          cp LICENSE ./build/bin/stable-diffusion.cpp.txt
+          zip -j sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-vulkan.zip ./build/bin/*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-vulkan.zip
+          path: |
+            sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-vulkan.zip
+
+  build-and-push-docker-images:
+    name: Build and push container images
+    runs-on: ubuntu-latest
+
+    permissions:
+      contents: read
+      packages: write
+      id-token: write
+      attestations: write
+      artifact-metadata: write
+
+    strategy:
+      matrix:
+        variant: [musa, sycl, vulkan, cuda]
+
+    env:
+      REGISTRY: ghcr.io
+      IMAGE_NAME: ${{ github.repository }}
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+        with:
+          submodules: recursive
+
+      - name: Setup Node
+        uses: actions/setup-node@v4
+        with:
+          node-version: 20
+
+      - name: Setup pnpm
+        uses: pnpm/action-setup@v4
+        with:
+          version: 9
+
+      - name: Get commit hash
+        id: commit
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: prompt/actions-commit-hash@v2
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Log in to the container registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract metadata for Docker
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@v1.3.1
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: false
+
+      - name: Build and push Docker image
+        id: build-push
+        uses: docker/build-push-action@v6
+        with:
+          platforms: linux/amd64
+          push: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+          file: Dockerfile.${{ matrix.variant }}
+          tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.BRANCH_NAME }}-${{ matrix.variant }}
+          labels: ${{ steps.meta.outputs.labels }}
+          annotations: ${{ steps.meta.outputs.annotations }}

  macOS-latest-cmake:
    runs-on: macos-latest
@ -83,6 +256,16 @@ jobs:
        with:
          submodules: recursive

+      - name: Setup Node
+        uses: actions/setup-node@v4
+        with:
+          node-version: 20
+
+      - name: Setup pnpm
+        uses: pnpm/action-setup@v4
+        with:
+          version: 9
+
      - name: Dependencies
        id: depends
        run: |
@ -99,8 +282,8 @@ jobs:

      - name: Get commit hash
        id: commit
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }}
-        uses: pr-mpt/actions-commit-hash@v2
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: prompt/actions-commit-hash@v2

      - name: Fetch system info
        id: system-info
@ -120,30 +303,33 @@ jobs:

      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
        with:
+          name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
          path: |
            sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip

-
  windows-latest-cmake:
-    runs-on: windows-latest
+    runs-on: windows-2022
+
+    env:
+      VULKAN_VERSION: 1.4.328.1

    strategy:
      matrix:
        include:
-          - build: 'noavx'
-            defines: '-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DSD_BUILD_SHARED_LIBS=ON'
-          - build: 'avx2'
-            defines: '-DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON'
-          - build: 'avx'
-            defines: '-DGGML_AVX2=OFF -DSD_BUILD_SHARED_LIBS=ON'
-          - build: 'avx512'
-            defines: '-DGGML_AVX512=ON -DSD_BUILD_SHARED_LIBS=ON'
-          - build: 'cuda12'
-            defines: '-DSD_CUBLAS=ON -DSD_BUILD_SHARED_LIBS=ON'
-          - build: 'rocm5.5'
-            defines: '-G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1100;gfx1102;gfx1030" -DSD_BUILD_SHARED_LIBS=ON'
+          - build: "noavx"
+            defines: "-DGGML_NATIVE=OFF -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DSD_BUILD_SHARED_LIBS=ON"
+          - build: "avx2"
+            defines: "-DGGML_NATIVE=OFF -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
+          - build: "avx"
+            defines: "-DGGML_NATIVE=OFF -DGGML_AVX=ON -DGGML_AVX2=OFF -DSD_BUILD_SHARED_LIBS=ON"
+          - build: "avx512"
+            defines: "-DGGML_NATIVE=OFF -DGGML_AVX512=ON -DGGML_AVX=ON -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
+          - build: "cuda12"
+            defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES='61;70;75;80;86;89;90;100;120' -DCMAKE_CUDA_FLAGS='-Xcudafe \"--diag_suppress=177\" -Xcudafe \"--diag_suppress=550\"'"
+          - build: "vulkan"
+            defines: "-DSD_VULKAN=ON -DSD_BUILD_SHARED_LIBS=ON"
    steps:
      - name: Clone
        id: checkout
@ -151,36 +337,45 @@ jobs:
        with:
          submodules: recursive

+      - name: Setup Node
+        uses: actions/setup-node@v4
+        with:
+          node-version: 20
+
+      - name: Setup pnpm
+        uses: pnpm/action-setup@v4
+        with:
+          version: 9
+
      - name: Install cuda-toolkit
        id: cuda-toolkit
        if: ${{ matrix.build == 'cuda12' }}
-        uses: Jimver/cuda-toolkit@v0.2.11
+        uses: Jimver/cuda-toolkit@v0.2.22
        with:
-          cuda: '12.2.0'
-          method: 'network'
+          cuda: "12.8.1"
+          method: "network"
          sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'

-      - name: Install rocm-toolkit
-        id: rocm-toolkit
-        if: ${{ matrix.build == 'rocm5.5' }}
-        uses: Cyberhan123/rocm-toolkit@v0.1.0
-        with:
-          rocm: '5.5.0'
+      - name: Install Vulkan SDK
+        id: get_vulkan
+        if: ${{ matrix.build == 'vulkan' }}
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
+          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
+          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
+          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"

-      - name: Install Ninja
-        id: install-ninja
-        if: ${{ matrix.build == 'rocm5.5' }}
-        uses: urkle/action-get-ninja@v1
-        with:
-          version: 1.11.1
+      - name: Activate MSVC environment
+        id: msvc_dev_cmd
+        uses: ilammy/msvc-dev-cmd@v1

      - name: Build
        id: cmake_build
        run: |
          mkdir build
          cd build
-          cmake .. ${{ matrix.defines }}
-          cmake --build . --config Release
+          cmake .. -DCMAKE_CXX_FLAGS='/bigobj' -G Ninja -DCMAKE_C_COMPILER=cl.exe -DCMAKE_CXX_COMPILER=cl.exe -DCMAKE_BUILD_TYPE=Release ${{ matrix.defines }}
+          cmake --build .

      - name: Check AVX512F support
        id: check_avx512f
@ -198,7 +393,7 @@ jobs:
      - name: Get commit hash
        id: commit
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: pr-mpt/actions-commit-hash@v2
+        uses: prompt/actions-commit-hash@v2

      - name: Pack artifacts
        id: pack_artifacts
@ -222,7 +417,7 @@ jobs:

      - name: Copy and pack Cuda runtime
        id: pack_cuda_runtime
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' && matrix.build == 'cuda12' ) || github.event.inputs.create_release == 'true' }}
+        if: ${{ matrix.build == 'cuda12' && (github.event_name == 'push' && github.ref == 'refs/heads/master' || github.event.inputs.create_release == 'true') }}
        run: |
          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
          $dst='.\build\bin\cudart\'
@ -230,19 +425,279 @@ jobs:
          7z a cudart-sd-bin-win-cu12-x64.zip $dst\*

      - name: Upload Cuda runtime
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' && matrix.build == 'cuda12' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v3
+        if: ${{ matrix.build == 'cuda12' && (github.event_name == 'push' && github.ref == 'refs/heads/master' || github.event.inputs.create_release == 'true') }}
+        uses: actions/upload-artifact@v4
        with:
+          name: sd-cudart-sd-bin-win-cu12-x64.zip
          path: |
            cudart-sd-bin-win-cu12-x64.zip

      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
        with:
+          name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
          path: |
            sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip

+  windows-latest-cmake-hip:
+    runs-on: windows-2022
+
+    env:
+      HIPSDK_INSTALLER_VERSION: "25.Q3"
+      GPU_TARGETS: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
+
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: recursive
+
+      - name: Setup Node
+        uses: actions/setup-node@v4
+        with:
+          node-version: 20
+
+      - name: Setup pnpm
+        uses: pnpm/action-setup@v4
+        with:
+          version: 9
+
+      - name: Cache ROCm Installation
+        id: cache-rocm
+        uses: actions/cache@v4
+        with:
+          path: C:\Program Files\AMD\ROCm
+          key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: windows-latest-cmake-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-x64
+          evict-old-files: 1d
+
+      - name: Install ROCm
+        if: steps.cache-rocm.outputs.cache-hit != 'true'
+        run: |
+          $ErrorActionPreference = "Stop"
+          write-host "Downloading AMD HIP SDK Installer"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          write-host "Installing AMD HIP SDK"
+          $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
+          $completed = $proc.WaitForExit(600000)
+          if (-not $completed) {
+              Write-Error "ROCm installation timed out after 10 minutes. Killing the process"
+              $proc.Kill()
+              exit 1
+          }
+          if ($proc.ExitCode -ne 0) {
+              Write-Error "ROCm installation failed with exit code $($proc.ExitCode)"
+              exit 1
+          }
+          write-host "Completed AMD HIP SDK installation"
+
+      - name: Verify ROCm
+        run: |
+          # Find and test ROCm installation
+          $clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
+          if (-not $clangPath) {
+            Write-Error "ROCm installation not found"
+            exit 1
+          }
+          & $clangPath.FullName --version
+          # Set HIP_PATH environment variable for later steps
+          echo "HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)" >> $env:GITHUB_ENV
+
+      - name: Build
+        run: |
+          mkdir build
+          cd build
+          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
+          cmake .. `
+            -G "Unix Makefiles" `
+            -DSD_HIPBLAS=ON `
+            -DSD_BUILD_SHARED_LIBS=ON `
+            -DGGML_NATIVE=OFF `
+            -DCMAKE_C_COMPILER=clang `
+            -DCMAKE_CXX_COMPILER=clang++ `
+            -DCMAKE_BUILD_TYPE=Release `
+            -DGPU_TARGETS="${{ env.GPU_TARGETS }}"
+          cmake --build . --config Release --parallel ${env:NUMBER_OF_PROCESSORS}
+
+      - name: Get commit hash
+        id: commit
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: prompt/actions-commit-hash@v2
+
+      - name: Pack artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          md "build\bin\rocblas\library\"
+          md "build\bin\hipblaslt\library"
+          cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
+          cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\"
+          7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip .\build\bin\*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
+          path: |
+            sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
+
+  ubuntu-latest-rocm:
+    runs-on: ubuntu-latest
+    container: rocm/dev-ubuntu-24.04:7.2
+
+    env:
+      ROCM_VERSION: "7.2"
+      UBUNTU_VERSION: "24.04"
+      GPU_TARGETS: "gfx1151;gfx1150;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
+
+    steps:
+      - run: apt-get update && apt-get install -y git
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+        with:
+          submodules: recursive
+
+      - name: Setup Node
+        uses: actions/setup-node@v4
+        with:
+          node-version: 20
+
+      - name: Setup pnpm
+        uses: pnpm/action-setup@v4
+        with:
+          version: 9
+
+      - name: Free disk space
+        run: |
+          # Remove preinstalled SDKs and caches not needed for this job
+          sudo rm -rf /usr/share/dotnet || true
+          sudo rm -rf /usr/local/lib/android || true
+          sudo rm -rf /opt/ghc || true
+          sudo rm -rf /usr/local/.ghcup || true
+          sudo rm -rf /opt/hostedtoolcache || true
+
+          # Remove old package lists and caches
+          sudo rm -rf /var/lib/apt/lists/* || true
+          sudo apt clean
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt install -y \
+            cmake \
+            hip-dev \
+            hipblas-dev \
+            ninja-build \
+            rocm-dev \
+            zip
+          # Clean apt caches to recover disk space
+          sudo apt clean
+          sudo rm -rf /var/lib/apt/lists/* || true
+
+      - name: Setup ROCm Environment
+        run: |
+          # Add ROCm to PATH for current session
+          echo "/opt/rocm/bin" >> $GITHUB_PATH
+
+          # Build regex pattern from ${{ env.GPU_TARGETS }} (match target as substring)
+          TARGET_REGEX="($(printf '%s' "${{ env.GPU_TARGETS }}" | sed 's/;/|/g'))"
+
+          # Remove library files for architectures we're not building for to save disk space
+          echo "Cleaning up unneeded architecture files..."
+          cd /opt/rocm/lib/rocblas/library
+          # Keep only our target architectures
+          for file in *; do
+            if printf '%s' "$file" | grep -q 'gfx'; then
+              if ! printf '%s' "$file" | grep -Eq "$TARGET_REGEX"; then
+                echo "Removing $file" &&
+                sudo rm -f "$file";
+              fi
+            fi
+          done
+
+          cd /opt/rocm/lib/hipblaslt/library
+          for file in *; do
+            if printf '%s' "$file" | grep -q 'gfx'; then
+              if ! printf '%s' "$file" | grep -Eq "$TARGET_REGEX"; then
+                echo "Removing $file" &&
+                sudo rm -f "$file";
+              fi
+            fi
+          done
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake .. -G Ninja \
+            -DCMAKE_CXX_COMPILER=amdclang++ \
+            -DCMAKE_C_COMPILER=amdclang \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DSD_HIPBLAS=ON \
+            -DGPU_TARGETS="${{ env.GPU_TARGETS }}" \
+            -DAMDGPU_TARGETS="${{ env.GPU_TARGETS }}" \
+            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
+            -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+            -DSD_BUILD_SHARED_LIBS=ON
+          cmake --build . --config Release
+
+      - name: Get commit hash
+        id: commit
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: prompt/actions-commit-hash@v2
+
+      - name: Prepare artifacts
+        id: prepare_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          # Copy licenses
+          cp ggml/LICENSE ./build/bin/ggml.txt
+          cp LICENSE ./build/bin/stable-diffusion.cpp.txt
+
+          # Move ROCm runtime libraries (to avoid double space consumption)
+          sudo mv /opt/rocm/lib/librocsparse.so* ./build/bin/
+          sudo mv /opt/rocm/lib/libhsa-runtime64.so* ./build/bin/
+          sudo mv /opt/rocm/lib/libamdhip64.so* ./build/bin/
+          sudo mv /opt/rocm/lib/libhipblas.so* ./build/bin/
+          sudo mv /opt/rocm/lib/libhipblaslt.so* ./build/bin/
+          sudo mv /opt/rocm/lib/librocblas.so* ./build/bin/
+          sudo mv /opt/rocm/lib/rocblas/ ./build/bin/
+          sudo mv /opt/rocm/lib/hipblaslt/ ./build/bin/
+
+      - name: Fetch system info
+        id: system-info
+        run: |
+          echo "CPU_ARCH=`uname -m`" >> "$GITHUB_OUTPUT"
+          echo "OS_NAME=`lsb_release -s -i`" >> "$GITHUB_OUTPUT"
+          echo "OS_VERSION=`lsb_release -s -r`" >> "$GITHUB_OUTPUT"
+          echo "OS_TYPE=`uname -s`" >> "$GITHUB_OUTPUT"
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          # License files are already copied into ./build/bin by the "Prepare artifacts" step,
+          # which runs under the same condition, so only the archive is created here.
+          # -y stores symlinks as links; -r recurses into ./build/bin.
+          zip -y -r sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip ./build/bin
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip
+          path: |
+            sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip
+
  release:
    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

@ -250,28 +705,48 @@ jobs:

    needs:
      - ubuntu-latest-cmake
+      - ubuntu-latest-cmake-vulkan
+      - ubuntu-latest-rocm
+      - build-and-push-docker-images
      - macOS-latest-cmake
      - windows-latest-cmake
+      - windows-latest-cmake-hip

    steps:
+      - name: Clone
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
      - name: Download artifacts
        id: download-artifact
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
+        with:
+          path: ./artifact
+          pattern: sd-*
+          merge-multiple: true
+
+      - name: Get commit count
+        id: commit_count
+        run: |
+          echo "count=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT

      - name: Get commit hash
        id: commit
-        uses: pr-mpt/actions-commit-hash@v2
+        uses: prompt/actions-commit-hash@v2

      - name: Create release
        id: create_release
+        if: ${{ github.event_name == 'workflow_dispatch' || github.ref_name == 'master' }}
        uses: anzz1/action-create-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
-          tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}
+          tag_name: ${{ format('{0}-{1}-{2}', env.BRANCH_NAME, steps.commit_count.outputs.count, steps.commit.outputs.short) }}

      - name: Upload release
        id: upload_release
+        if: ${{ github.event_name == 'workflow_dispatch' || github.ref_name == 'master' }}
        uses: actions/github-script@v3
        with:
          github-token: ${{secrets.GITHUB_TOKEN}}
--- a/.gitignore
+++ b/.gitignore
@ -1,9 +1,10 @@
 build*/
+cmake-build-*/
 test/
 .vscode/
+.idea/
 .cache/
 *.swp
-.vscode/
 *.bat
 *.bin
 *.exe
@ -11,3 +12,4 @@ test/
 output*.png
 models*
 *.log
+preview.png
--- a/.gitmodules
+++ b/.gitmodules
@ -1,3 +1,6 @@
 [submodule "ggml"]
    path = ggml
-	url = https://github.com/ggerganov/ggml.git
+	url = https://github.com/ggml-org/ggml.git
+[submodule "examples/server/frontend"]
+	path = examples/server/frontend
+	url = https://github.com/leejet/stable-ui.git
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -8,6 +8,11 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
 endif()

+if (MSVC)
+    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
+    add_compile_definitions(_SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING)
+endif()
+
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

@ -24,79 +29,164 @@ endif()
 # general
 #option(SD_BUILD_TESTS                "sd: build tests"    ${SD_STANDALONE})
 option(SD_BUILD_EXAMPLES             "sd: build examples" ${SD_STANDALONE})
-option(SD_CUBLAS                     "sd: cuda backend" OFF)
+option(SD_CUDA                       "sd: cuda backend" OFF)
 option(SD_HIPBLAS                    "sd: rocm backend" OFF)
 option(SD_METAL                      "sd: metal backend" OFF)
-option(SD_FLASH_ATTN                 "sd: use flash attention for x4 less memory usage" OFF)
-option(SD_FAST_SOFTMAX               "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
+option(SD_VULKAN                     "sd: vulkan backend" OFF)
+option(SD_OPENCL                     "sd: opencl backend" OFF)
+option(SD_SYCL                       "sd: sycl backend" OFF)
+option(SD_MUSA                       "sd: musa backend" OFF)
 option(SD_BUILD_SHARED_LIBS          "sd: build shared libs" OFF)
+option(SD_BUILD_SHARED_GGML_LIB      "sd: build ggml as a separate shared lib" OFF)
+option(SD_USE_SYSTEM_GGML            "sd: use system-installed GGML library" OFF)
 #option(SD_BUILD_SERVER               "sd: build server example"                           ON)

-if(SD_CUBLAS)
-	message("Use CUBLAS as backend stable-diffusion")
-    set(GGML_CUBLAS ON)
-    add_definitions(-DSD_USE_CUBLAS)
+if(SD_CUDA)
+    message("-- Use CUDA as backend stable-diffusion")
+    set(GGML_CUDA ON)
+    add_definitions(-DSD_USE_CUDA)
 endif()

 if(SD_METAL)
-	message("Use Metal as backend stable-diffusion")
+    message("-- Use Metal as backend stable-diffusion")
    set(GGML_METAL ON)
    add_definitions(-DSD_USE_METAL)
 endif()

-if (SD_HIPBLAS)
-    message("Use HIPBLAS as backend stable-diffusion")
-    set(GGML_HIPBLAS ON)
-    add_definitions(-DSD_USE_CUBLAS)
-    if(SD_FAST_SOFTMAX)
-        set(GGML_CUDA_FAST_SOFTMAX ON)
-    endif()
+if (SD_VULKAN)
+    message("-- Use Vulkan as backend stable-diffusion")
+    set(GGML_VULKAN ON)
+    add_definitions(-DSD_USE_VULKAN)
 endif ()

-if(SD_FLASH_ATTN)
-    message("Use Flash Attention for memory optimization")
-    add_definitions(-DSD_USE_FLASH_ATTENTION)
+if (SD_OPENCL)
+    message("-- Use OpenCL as backend stable-diffusion")
+    set(GGML_OPENCL ON)
+    add_definitions(-DSD_USE_OPENCL)
+endif ()
+
+if (SD_HIPBLAS)
+    message("-- Use HIPBLAS as backend stable-diffusion")
+    set(GGML_HIP ON)
+    add_definitions(-DSD_USE_CUDA)
+endif ()
+
+if(SD_MUSA)
+    message("-- Use MUSA as backend stable-diffusion")
+    set(GGML_MUSA ON)
+    add_definitions(-DSD_USE_CUDA)
 endif()

 set(SD_LIB stable-diffusion)

 file(GLOB SD_LIB_SOURCES
-    "*.h"
-    "*.cpp"
-    "*.hpp"
+    "src/*.h"
+    "src/*.cpp"
+    "src/*.hpp"
+    "src/vocab/*.h"
+    "src/vocab/*.cpp"
+)
+
+find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
+if(GIT_EXE)
+    execute_process(COMMAND ${GIT_EXE} describe --tags --abbrev=7 --dirty=+
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        OUTPUT_VARIABLE SDCPP_BUILD_VERSION
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        ERROR_QUIET
+    )
+    execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        OUTPUT_VARIABLE SDCPP_BUILD_COMMIT
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        ERROR_QUIET
+    )
+endif()
+
+if(NOT SDCPP_BUILD_VERSION)
+    set(SDCPP_BUILD_VERSION unknown)
+endif()
+message(STATUS "stable-diffusion.cpp version ${SDCPP_BUILD_VERSION}")
+
+if(NOT SDCPP_BUILD_COMMIT)
+    set(SDCPP_BUILD_COMMIT unknown)
+endif()
+message(STATUS "stable-diffusion.cpp commit ${SDCPP_BUILD_COMMIT}")
+
+set_property(
+  SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/src/version.cpp
+  APPEND PROPERTY COMPILE_DEFINITIONS
+  SDCPP_BUILD_COMMIT=${SDCPP_BUILD_COMMIT} SDCPP_BUILD_VERSION=${SDCPP_BUILD_VERSION}
 )

-# we can get only one share lib
 if(SD_BUILD_SHARED_LIBS)
-    message("Build shared library")
-    set(BUILD_SHARED_LIBS OFF)
+    message("-- Build shared library")
    message(${SD_LIB_SOURCES})
+    if(NOT SD_BUILD_SHARED_GGML_LIB)
+        set(BUILD_SHARED_LIBS OFF)
+    endif()
    add_library(${SD_LIB} SHARED ${SD_LIB_SOURCES})
    add_definitions(-DSD_BUILD_SHARED_LIB)
    target_compile_definitions(${SD_LIB} PRIVATE -DSD_BUILD_DLL)
    set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 else()
-    message("Build static library")
+    message("-- Build static library")
+    if(NOT SD_BUILD_SHARED_GGML_LIB)
+        set(BUILD_SHARED_LIBS OFF)
+    endif()
    add_library(${SD_LIB} STATIC ${SD_LIB_SOURCES})
 endif()

+if(SD_SYCL)
+    message("-- Use SYCL as backend stable-diffusion")
+    set(GGML_SYCL ON)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
+    add_definitions(-DSD_USE_SYCL)
+    # disable fast-math on host, see:
+    # https://www.intel.com/content/www/us/en/docs/cpp-compiler/developer-guide-reference/2021-10/fp-model-fp.html
+    if (WIN32)
+        set(SYCL_COMPILE_OPTIONS /fp:precise)
+    else()
+        set(SYCL_COMPILE_OPTIONS -fp-model=precise)
+    endif()
+    message("-- Turn off fast-math for host in SYCL backend")
+    target_compile_options(${SD_LIB} PRIVATE ${SYCL_COMPILE_OPTIONS})
+endif()

 set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)

-# see https://github.com/ggerganov/ggml/pull/682
-add_definitions(-DGGML_MAX_NAME=128)
+if (NOT SD_USE_SYSTEM_GGML)
+    # see https://github.com/ggerganov/ggml/pull/682
+    add_definitions(-DGGML_MAX_NAME=128)
+endif()

 # deps
-add_subdirectory(ggml)
+# Only add ggml if it hasn't been added yet
+if (NOT TARGET ggml)
+    if (SD_USE_SYSTEM_GGML)
+        find_package(ggml REQUIRED)
+        if (NOT ggml_FOUND)
+            message(FATAL_ERROR "System-installed GGML library not found.")
+        endif()
+        add_library(ggml ALIAS ggml::ggml)
+    else()
+        add_subdirectory(ggml)
+    endif()
+endif()

 add_subdirectory(thirdparty)

 target_link_libraries(${SD_LIB} PUBLIC ggml zip)
+target_include_directories(${SD_LIB} PUBLIC . include)
 target_include_directories(${SD_LIB} PUBLIC . thirdparty)
-target_compile_features(${SD_LIB} PUBLIC cxx_std_11)
+target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17)


 if (SD_BUILD_EXAMPLES)
    add_subdirectory(examples)
 endif()

+set(SD_PUBLIC_HEADERS include/stable-diffusion.h)
+set_target_properties(${SD_LIB} PROPERTIES PUBLIC_HEADER "${SD_PUBLIC_HEADERS}")
+
+install(TARGETS ${SD_LIB} LIBRARY PUBLIC_HEADER)
--- a/20
+++ b/20
@ -1,17 +1,23 @@
-ARG UBUNTU_VERSION=22.04
+ARG UBUNTU_VERSION=24.04

-FROM ubuntu:$UBUNTU_VERSION as build
+FROM ubuntu:$UBUNTU_VERSION AS build

-RUN apt-get update && apt-get install -y build-essential git cmake
+RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake

 WORKDIR /sd.cpp

 COPY . .

-RUN mkdir build && cd build && cmake .. && cmake --build . --config Release
+RUN cmake . -B ./build
+RUN cmake --build ./build --config Release --parallel

-FROM ubuntu:$UBUNTU_VERSION as runtime
+FROM ubuntu:$UBUNTU_VERSION AS runtime

-COPY --from=build /sd.cpp/build/bin/sd /sd
+RUN apt-get update && \
+    apt-get install --yes --no-install-recommends libgomp1 && \
+    apt-get clean

-ENTRYPOINT [ "/sd" ]
+COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
+COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
+
+ENTRYPOINT [ "/sd-cli" ]
--- a/Dockerfile.cuda
+++ b/Dockerfile.cuda
@ -0,0 +1,25 @@
+ARG CUDA_VERSION=12.6.3
+ARG UBUNTU_VERSION=24.04
+
+FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu${UBUNTU_VERSION} AS build
+
+RUN apt-get update && apt-get install -y --no-install-recommends build-essential git ccache cmake
+
+WORKDIR /sd.cpp
+
+COPY . .
+
+ARG CUDACXX=/usr/local/cuda/bin/nvcc
+RUN cmake . -B ./build -DSD_CUDA=ON
+RUN cmake --build ./build --config Release -j$(nproc)
+
+FROM nvidia/cuda:${CUDA_VERSION}-cudnn-runtime-ubuntu${UBUNTU_VERSION} AS runtime
+
+RUN apt-get update && \
+    apt-get install --yes --no-install-recommends libgomp1 && \
+    apt-get clean
+
+COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
+COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
+
+ENTRYPOINT [ "/sd-cli" ]
--- a/Dockerfile.musa
+++ b/Dockerfile.musa
@ -0,0 +1,24 @@
+ARG MUSA_VERSION=rc4.2.0
+ARG UBUNTU_VERSION=22.04
+
+FROM mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64 AS build
+
+RUN apt-get update && apt-get install -y ccache cmake git
+
+WORKDIR /sd.cpp
+
+COPY . .
+
+RUN mkdir build && cd build && \
+    cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ \
+        -DCMAKE_C_FLAGS="${CMAKE_C_FLAGS} -fopenmp -I/usr/lib/llvm-14/lib/clang/14.0.0/include -L/usr/lib/llvm-14/lib" \
+        -DCMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fopenmp -I/usr/lib/llvm-14/lib/clang/14.0.0/include -L/usr/lib/llvm-14/lib" \
+        -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release && \
+    cmake --build . --config Release
+
+FROM mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64 AS runtime
+
+COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
+COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
+
+ENTRYPOINT [ "/sd-cli" ]
--- a/Dockerfile.sycl
+++ b/Dockerfile.sycl
@ -0,0 +1,20 @@
+ARG SYCL_VERSION=2025.1.0-0
+
+FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS build
+
+RUN apt-get update && apt-get install -y cmake
+
+WORKDIR /sd.cpp
+
+COPY . .
+
+RUN mkdir build && cd build && \
+    cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DSD_SYCL=ON -DCMAKE_BUILD_TYPE=Release && \
+    cmake --build . --config Release -j$(nproc)
+
+FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS runtime
+
+COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
+COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
+
+ENTRYPOINT [ "/sd-cli" ]
--- a/Dockerfile.vulkan
+++ b/Dockerfile.vulkan
@ -0,0 +1,23 @@
+ARG UBUNTU_VERSION=24.04
+
+FROM ubuntu:$UBUNTU_VERSION AS build
+
+RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake libvulkan-dev glslc
+
+WORKDIR /sd.cpp
+
+COPY . .
+
+RUN cmake . -B ./build -DSD_VULKAN=ON
+RUN cmake --build ./build --config Release --parallel
+
+FROM ubuntu:$UBUNTU_VERSION AS runtime
+
+RUN apt-get update && \
+    apt-get install --yes --no-install-recommends libgomp1 libvulkan1 mesa-vulkan-drivers && \
+    apt-get clean
+
+COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
+COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
+
+ENTRYPOINT [ "/sd-cli" ]
--- a/README.md
+++ b/README.md
@ -1,37 +1,90 @@
 <p align="center">
-  <img src="./assets/a%20lovely%20cat.png" width="256x">
+  <img src="./assets/logo.png" width="360x">
 </p>

 # stable-diffusion.cpp

-Inference of [Stable Diffusion](https://github.com/CompVis/stable-diffusion) in pure C/C++
+<div align="center">
+<a href="https://trendshift.io/repositories/9714" target="_blank"><img src="https://trendshift.io/api/badge/repositories/9714" alt="leejet%2Fstable-diffusion.cpp | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
+</div>
+
+Diffusion model (SD, Flux, Wan, ...) inference in pure C/C++
+
+***Note that this project is under active development. \
+The API and command-line options may change frequently.***
+
+## 🔥Important News
+
+* **2026/01/18** 🚀 stable-diffusion.cpp now supports **FLUX.2-klein**  
+  👉 Details: [PR #1193](https://github.com/leejet/stable-diffusion.cpp/pull/1193)
+
+* **2025/12/01** 🚀 stable-diffusion.cpp now supports **Z-Image**  
+  👉 Details: [PR #1020](https://github.com/leejet/stable-diffusion.cpp/pull/1020)
+
+* **2025/11/30** 🚀 stable-diffusion.cpp now supports **FLUX.2-dev**  
+  👉 Details: [PR #1016](https://github.com/leejet/stable-diffusion.cpp/pull/1016)
+
+* **2025/10/13** 🚀 stable-diffusion.cpp now supports **Qwen-Image-Edit / Qwen-Image-Edit 2509**  
+  👉 Details: [PR #877](https://github.com/leejet/stable-diffusion.cpp/pull/877)
+
+* **2025/10/12** 🚀 stable-diffusion.cpp now supports **Qwen-Image**  
+  👉 Details: [PR #851](https://github.com/leejet/stable-diffusion.cpp/pull/851)
+
+* **2025/09/14** 🚀 stable-diffusion.cpp now supports **Wan2.1 Vace**  
+  👉 Details: [PR #819](https://github.com/leejet/stable-diffusion.cpp/pull/819)
+
+* **2025/09/06** 🚀 stable-diffusion.cpp now supports **Wan2.1 / Wan2.2**  
+  👉 Details: [PR #778](https://github.com/leejet/stable-diffusion.cpp/pull/778)

 ## Features

- Plain C/C++ implementation based on [ggml](https://github.com/ggerganov/ggml), working in the same way as [llama.cpp](https://github.com/ggerganov/llama.cpp)
+- Plain C/C++ implementation based on [ggml](https://github.com/ggml-org/ggml), working in the same way as [llama.cpp](https://github.com/ggml-org/llama.cpp)
 - Super lightweight and without external dependencies
- SD1.x, SD2.x and SDXL support
-    - !!!The VAE in SDXL encounters NaN issues under FP16, but unfortunately, the ggml_conv_2d only operates under FP16. Hence, a parameter is needed to specify the VAE that has fixed the FP16 NaN issue. You can find it here: [SDXL VAE FP16 Fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors).
-
- [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo) and [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo) support
- 16-bit, 32-bit float support
- 4-bit, 5-bit and 8-bit integer quantization support
- Accelerated memory-efficient CPU inference
-    - Only requires ~2.3GB when using txt2img with fp16 precision to generate a 512x512 image, enabling Flash Attention just requires ~1.8GB.
- AVX, AVX2 and AVX512 support for x86 architectures
- Full CUDA and Metal backend for GPU acceleration.
- Can load ckpt, safetensors and diffusers models/checkpoints. Standalone VAEs models
-    - No need to convert to `.ggml` or `.gguf` anymore!
- Flash Attention for memory usage optimization (only cpu for now)
- Original `txt2img` and `img2img` mode
+- Supported models
+  - Image Models
+    - SD1.x, SD2.x, [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo)
+    - SDXL, [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo)
+    - [Some SD1.x and SDXL distilled models](./docs/distilled_sd.md)
+    - [SD3/SD3.5](./docs/sd3.md)
+    - [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
+    - [FLUX.2-dev/FLUX.2-klein](./docs/flux2.md)
+    - [Chroma](./docs/chroma.md)
+    - [Chroma1-Radiance](./docs/chroma_radiance.md)
+    - [Qwen Image](./docs/qwen_image.md)
+    - [Z-Image](./docs/z_image.md)
+    - [Ovis-Image](./docs/ovis_image.md)
+    - [Anima](./docs/anima.md)
+  - Image Edit Models
+    - [FLUX.1-Kontext-dev](./docs/kontext.md)
+    - [Qwen Image Edit series](./docs/qwen_image_edit.md)
+  - Video Models
+    - [Wan2.1/Wan2.2](./docs/wan.md)
+  - [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
+  - Control Net support with SD 1.5
+  - LoRA support, same as [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora)
+  - Latent Consistency Models support (LCM/LCM-LoRA)
+  - Faster and memory efficient latent decoding with [TAESD](https://github.com/madebyollin/taesd)
+  - Upscale images generated with [ESRGAN](https://github.com/xinntao/Real-ESRGAN)
+- Supported backends
+  - CPU (AVX, AVX2 and AVX512 support for x86 architectures)
+  - CUDA
+  - Vulkan
+  - Metal
+  - OpenCL
+  - SYCL
+- Supported weight formats
+  - Pytorch checkpoint (`.ckpt` or `.pth`)
+  - Safetensors (`.safetensors`)
+  - GGUF (`.gguf`)
+- Supported platforms
+    - Linux
+    - Mac OS
+    - Windows
+    - Android (via Termux, [Local Diffusion](https://github.com/rmatif/Local-Diffusion))
+- Flash Attention for memory usage optimization
 - Negative prompt
 - [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) style tokenizer (not all the features, only token weighting for now)
- LoRA support, same as [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora)
- Latent Consistency Models support (LCM/LCM-LoRA)
- Faster and memory efficient latent decoding with [TAESD](https://github.com/madebyollin/taesd)
- Upscale images generated with [ESRGAN](https://github.com/xinntao/Real-ESRGAN)
 - VAE tiling processing to reduce memory usage
- Control Net support with SD 1.5
 - Sampling method
    - `Euler A`
    - `Euler`
@ -41,283 +94,85 @@ Inference of [Stable Diffusion](https://github.com/CompVis/stable-diffusion) in
    - [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457)
    - `DPM++ 2S a`
    - [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952)
- Cross-platform reproducibility (`--rng cuda`, consistent with the `stable-diffusion-webui GPU RNG`)
+- Cross-platform reproducibility
+    - `--rng cuda`, default, consistent with the `stable-diffusion-webui GPU RNG`
+    - `--rng cpu`, consistent with the `comfyui RNG`
 - Embeds generation parameters into PNG output as a webui-compatible text string
- Supported platforms
-    - Linux
-    - Mac OS
-    - Windows
-    - Android (via Termux)

-### TODO
+## Quick Start

- [ ] More sampling methods
- [ ] Make inference faster
-    - The current implementation of ggml_conv_2d is slow and has high memory usage
- [ ] Continuing to reduce memory usage (quantizing the weights of ggml_conv_2d)
- [ ] Implement Inpainting support
- [ ] k-quants support
+### Get the sd executable

-## Usage
+- Download pre-built binaries from the [releases page](https://github.com/leejet/stable-diffusion.cpp/releases)
+- Or build from source by following the [build guide](./docs/build.md)

-### Get the Code
+### Download model weights

-```
-git clone --recursive https://github.com/leejet/stable-diffusion.cpp
-cd stable-diffusion.cpp
-```
+- Download weights (.ckpt, .safetensors, or .gguf). For example:
+    - Stable Diffusion v1.5 from https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5

- If you have already cloned the repository, you can use the following command to update the repository to the latest code.
-
-```
-cd stable-diffusion.cpp
-git pull origin master
-git submodule init
-git submodule update
-```
-
-### Download weights
-
- download original weights(.ckpt or .safetensors). For example
-    - Stable Diffusion v1.4 from https://huggingface.co/CompVis/stable-diffusion-v-1-4-original
-    - Stable Diffusion v1.5 from https://huggingface.co/runwayml/stable-diffusion-v1-5
-    - Stable Diffuison v2.1 from https://huggingface.co/stabilityai/stable-diffusion-2-1
-
-    ```shell
-    curl -L -O https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt
-    # curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
-    # curl -L -O https://huggingface.co/stabilityai/stable-diffusion-2-1/resolve/main/v2-1_768-nonema-pruned.safetensors
+    ```sh
+    curl -L -O https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
    ```

-### Build
-
-#### Build from scratch
-
-```shell
-mkdir build
-cd build
-cmake ..
-cmake --build . --config Release
-```
-
-##### Using OpenBLAS
-
-```
-cmake .. -DGGML_OPENBLAS=ON
-cmake --build . --config Release
-```
-
-##### Using CUBLAS
-
-This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). Recommended to have at least 4 GB of VRAM.
-
-```
-cmake .. -DSD_CUBLAS=ON
-cmake --build . --config Release
-```
-
-##### Using HipBLAS
-This provides BLAS acceleration using the ROCm cores of your AMD GPU. Make sure to have the ROCm toolkit installed.
-
-Windows User Refer to [docs/hipBLAS_on_Windows.md](docs%2FhipBLAS_on_Windows.md) for a comprehensive guide.
-
-```
-cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=gfx1100
-cmake --build . --config Release
-```
-
-
-##### Using Metal
-
-Using Metal makes the computation run on the GPU. Currently, there are some issues with Metal when performing operations on very large matrices, making it highly inefficient at the moment. Performance improvements are expected in the near future.
-
-```
-cmake .. -DSD_METAL=ON
-cmake --build . --config Release
-```
-
-##### Using Flash Attention
-
-Enabling flash attention reduces memory usage by at least 400 MB. At the moment, it is not supported when CUBLAS is enabled because the kernel implementation is missing.
-
-```
-cmake .. -DSD_FLASH_ATTN=ON
-cmake --build . --config Release
-```
-
-### Run
-
-```
-usage: ./build/bin/sd [arguments]
-
-arguments:
-  -h, --help                         show this help message and exit
-  -M, --mode [MODEL]                 run mode (txt2img or img2img or convert, default: txt2img)
-  -t, --threads N                    number of threads to use during computation (default: -1).
-                                     If threads <= 0, then threads will be set to the number of CPU physical cores
-  -m, --model [MODEL]                path to model
-  --vae [VAE]                        path to vae
-  --taesd [TAESD_PATH]               path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
-  --control-net [CONTROL_PATH]       path to control net model
-  --embd-dir [EMBEDDING_PATH]        path to embeddings.
-  --upscale-model [ESRGAN_PATH]      path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now.
-  --upscale-repeats                  Run the ESRGAN upscaler this many times (default 1)
-  --type [TYPE]                      weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0)
-                                     If not specified, the default is the type of the weight file.
-  --lora-model-dir [DIR]             lora model directory
-  -i, --init-img [IMAGE]             path to the input image, required by img2img
-  --control-image [IMAGE]            path to image condition, control net
-  -o, --output OUTPUT                path to write result image to (default: ./output.png)
-  -p, --prompt [PROMPT]              the prompt to render
-  -n, --negative-prompt PROMPT       the negative prompt (default: "")
-  --cfg-scale SCALE                  unconditional guidance scale: (default: 7.0)
-  --strength STRENGTH                strength for noising/unnoising (default: 0.75)
-  --control-strength STRENGTH        strength to apply Control Net (default: 0.9)
-                                     1.0 corresponds to full destruction of information in init image
-  -H, --height H                     image height, in pixel space (default: 512)
-  -W, --width W                      image width, in pixel space (default: 512)
-  --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, lcm}
-                                     sampling method (default: "euler_a")
-  --steps  STEPS                     number of sample steps (default: 20)
-  --rng {std_default, cuda}          RNG (default: cuda)
-  -s SEED, --seed SEED               RNG seed (default: 42, use random seed for < 0)
-  -b, --batch-count COUNT            number of images to generate.
-  --schedule {discrete, karras}      Denoiser sigma schedule (default: discrete)
-  --clip-skip N                      ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)
-                                     <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
-  --vae-tiling                       process vae in tiles to reduce memory usage
-  --control-net-cpu                  keep controlnet in cpu (for low vram)
-  --canny                            apply canny preprocessor (edge detection)
-  -v, --verbose                      print extra info
-```
-
-#### Quantization
-
-You can specify the model weight type using the `--type` parameter. The weights are automatically converted when loading the model.
-
- `f16` for 16-bit floating-point
- `f32` for 32-bit floating-point
- `q8_0` for 8-bit integer quantization
- `q5_0` or `q5_1` for 5-bit integer quantization
- `q4_0` or `q4_1` for 4-bit integer quantization
-
-#### Convert to GGUF
-
-You can also convert weights in the formats `ckpt/safetensors/diffusers` to gguf and perform quantization in advance, avoiding the need for quantization every time you load them.
-
-For example:
+### Generate an image with just one command

 ```sh
-./bin/sd -M convert -m ../models/v1-5-pruned-emaonly.safetensors -o  ../models/v1-5-pruned-emaonly.q8_0.gguf -v --type q8_0
+./bin/sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
 ```

-#### txt2img example
+***For detailed command-line arguments, check out [cli doc](./examples/cli/README.md).***

-```sh
-./bin/sd -m ../models/sd-v1-4.ckpt -p "a lovely cat"
-# ./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
-# ./bin/sd -m ../models/sd_xl_base_1.0.safetensors --vae ../models/sdxl_vae-fp16-fix.safetensors -H 1024 -W 1024 -p "a lovely cat" -v
-```
+## Performance

-Using formats of different precisions will yield results of varying quality.
+If you want to improve performance or reduce VRAM/RAM usage, please refer to [performance guide](./docs/performance.md).

-| f32  | f16  |q8_0  |q5_0  |q5_1  |q4_0  |q4_1  |
-| ----  |----  |----  |----  |----  |----  |----  |
-| ![](./assets/f32.png) |![](./assets/f16.png) |![](./assets/q8_0.png) |![](./assets/q5_0.png) |![](./assets/q5_1.png) |![](./assets/q4_0.png) |![](./assets/q4_1.png) |
+## More Guides

-#### img2img example
+- [SD1.x/SD2.x/SDXL](./docs/sd.md)
+- [SD3/SD3.5](./docs/sd3.md)
+- [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
+- [FLUX.2-dev/FLUX.2-klein](./docs/flux2.md)
+- [FLUX.1-Kontext-dev](./docs/kontext.md)
+- [Chroma](./docs/chroma.md)
+- [🔥Qwen Image](./docs/qwen_image.md)
+- [🔥Qwen Image Edit series](./docs/qwen_image_edit.md)
+- [🔥Wan2.1/Wan2.2](./docs/wan.md)
+- [🔥Z-Image](./docs/z_image.md)
+- [Ovis-Image](./docs/ovis_image.md)
+- [Anima](./docs/anima.md)
+- [LoRA](./docs/lora.md)
+- [LCM/LCM-LoRA](./docs/lcm.md)
+- [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)
+- [Using ESRGAN to upscale results](./docs/esrgan.md)
+- [Using TAESD for faster decoding](./docs/taesd.md)
+- [Docker](./docs/docker.md)
+- [Quantization and GGUF](./docs/quantization_and_gguf.md)
+- [Inference acceleration via caching](./docs/caching.md)

- `./output.png` is the image generated from the above txt2img pipeline
+## Bindings

+These projects wrap `stable-diffusion.cpp` for easier use in other languages/frameworks.

-```
-./bin/sd --mode img2img -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
-```
+* Golang (non-cgo): [seasonjs/stable-diffusion](https://github.com/seasonjs/stable-diffusion)
+* Golang (cgo): [Binozo/GoStableDiffusion](https://github.com/Binozo/GoStableDiffusion)
+* C#: [DarthAffe/StableDiffusion.NET](https://github.com/DarthAffe/StableDiffusion.NET)
+* Python: [william-murray1204/stable-diffusion-cpp-python](https://github.com/william-murray1204/stable-diffusion-cpp-python)
+* Rust: [newfla/diffusion-rs](https://github.com/newfla/diffusion-rs)
+* Flutter/Dart: [rmatif/Local-Diffusion](https://github.com/rmatif/Local-Diffusion)

-<p align="center">
-  <img src="./assets/img2img_output.png" width="256x">
-</p>
+## UIs

-#### with LoRA
+These projects use `stable-diffusion.cpp` as a backend for their image generation.

- You can specify the directory where the lora weights are stored via `--lora-model-dir`. If not specified, the default is the current working directory.
-
- LoRA is specified via prompt, just like [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora).
-
-Here's a simple example:
-
-```
-./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:marblesh:1>" --lora-model-dir ../models
-```
-
-`../models/marblesh.safetensors` or `../models/marblesh.ckpt` will be applied to the model
-
-#### LCM/LCM-LoRA
-
- Download LCM-LoRA form https://huggingface.co/latent-consistency/lcm-lora-sdv1-5
- Specify LCM-LoRA by adding `<lora:lcm-lora-sdv1-5:1>` to prompt
- It's advisable to set `--cfg-scale` to `1.0` instead of the default `7.0`. For `--steps`, a range of `2-8` steps is recommended. For `--sampling-method`, `lcm`/`euler_a` is recommended.
-
-Here's a simple example:
-
-```
-./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:lcm-lora-sdv1-5:1>" --steps 4 --lora-model-dir ../models -v --cfg-scale 1
-```
-
-| without LCM-LoRA (--cfg-scale 7)  | with LCM-LoRA (--cfg-scale 1)  |
-| ----  |----    |
-| ![](./assets/without_lcm.png) |![](./assets/with_lcm.png)  |
-
-#### Using TAESD to faster decoding
-
-You can use TAESD to accelerate the decoding of latent images by following these steps:
-
- Download the model [weights](https://huggingface.co/madebyollin/taesd/blob/main/diffusion_pytorch_model.safetensors).
-
-Or curl
-
-```bash
-curl -L -O https://huggingface.co/madebyollin/taesd/blob/main/diffusion_pytorch_model.safetensors
-```
-
- Specify the model path using the `--taesd PATH` parameter. example:
-
-```bash
-sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --taesd ../models/diffusion_pytorch_model.safetensors
-```
-
-#### Using ESRGAN to upscale results
-
-You can use ESRGAN to upscale the generated images. At the moment, only the [RealESRGAN_x4plus_anime_6B.pth](https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth) model is supported. Support for more models of this architecture will be added soon.
-
- Specify the model path using the `--upscale-model PATH` parameter. example:
-
-```bash
-sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --upscale-model ../models/RealESRGAN_x4plus_anime_6B.pth
-```
-
-### Docker
-
-#### Building using Docker
-
-```shell
-docker build -t sd .
-```
-
-#### Run
-
-```shell
-docker run -v /path/to/models:/models -v /path/to/output/:/output sd [args...]
-# For example
-# docker run -v ./models:/models -v ./build:/output sd -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
-```
-
-## Memory Requirements
-
-| precision | f32  | f16  |q8_0  |q5_0  |q5_1  |q4_0  |q4_1  |
-| ----         | ----  |----  |----  |----  |----  |----  |----  |
-|  **Memory** (txt2img - 512 x 512) | ~2.8G | ~2.3G | ~2.1G | ~2.0G | ~2.0G | ~2.0G | ~2.0G |
-|  **Memory** (txt2img - 512 x 512) *with Flash Attention* | ~2.4G | ~1.9G | ~1.6G | ~1.5G | ~1.5G | ~1.5G | ~1.5G |
+- [Jellybox](https://jellybox.com)
+- [Stable Diffusion GUI](https://github.com/fszontagh/sd.cpp.gui.wx)
+- [Stable Diffusion CLI-GUI](https://github.com/piallai/stable-diffusion.cpp)
+- [Local Diffusion](https://github.com/rmatif/Local-Diffusion)
+- [sd.cpp-webui](https://github.com/daniandtheweb/sd.cpp-webui)
+- [LocalAI](https://github.com/mudler/LocalAI)
+- [Neural-Pixel](https://github.com/Luiz-Alcantara/Neural-Pixel)
+- [KoboldCpp](https://github.com/LostRuins/koboldcpp)

 ## Contributors

@ -325,13 +180,22 @@ Thank you to all the people who have already contributed to stable-diffusion.cpp

 [![Contributors](https://contrib.rocks/image?repo=leejet/stable-diffusion.cpp)](https://github.com/leejet/stable-diffusion.cpp/graphs/contributors)

+## Star History
+
+[![Star History Chart](https://api.star-history.com/svg?repos=leejet/stable-diffusion.cpp&type=Date)](https://star-history.com/#leejet/stable-diffusion.cpp&Date)
+
 ## References

- [ggml](https://github.com/ggerganov/ggml)
+- [ggml](https://github.com/ggml-org/ggml)
+- [diffusers](https://github.com/huggingface/diffusers)
 - [stable-diffusion](https://github.com/CompVis/stable-diffusion)
+- [sd3-ref](https://github.com/Stability-AI/sd3-ref)
 - [stable-diffusion-stability-ai](https://github.com/Stability-AI/stablediffusion)
 - [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui)
 - [ComfyUI](https://github.com/comfyanonymous/ComfyUI)
 - [k-diffusion](https://github.com/crowsonkb/k-diffusion)
 - [latent-consistency-model](https://github.com/luosiallen/latent-consistency-model)
 - [generative-models](https://github.com/Stability-AI/generative-models/)
+- [PhotoMaker](https://github.com/TencentARC/PhotoMaker)
+- [Wan2.1](https://github.com/Wan-Video/Wan2.1)
+- [Wan2.2](https://github.com/Wan-Video/Wan2.2)
--- a/assets/anima/example.png
+++ b/assets/anima/example.png
--- a/assets/cat_with_sd_cpp_20184.png
+++ b/assets/cat_with_sd_cpp_20184.png
--- a/assets/cat_with_sd_cpp_42.png
+++ b/assets/cat_with_sd_cpp_42.png
--- a/assets/flux/chroma1-radiance.png
+++ b/assets/flux/chroma1-radiance.png
--- a/assets/flux/chroma_v40.png
+++ b/assets/flux/chroma_v40.png
--- a/assets/flux/flux1-dev-q2_k.png
+++ b/assets/flux/flux1-dev-q2_k.png
--- a/assets/flux/flux1-dev-q3_k.png
+++ b/assets/flux/flux1-dev-q3_k.png
--- a/assets/flux/flux1-dev-q4_0.png
+++ b/assets/flux/flux1-dev-q4_0.png
--- a/assets/flux/flux1-dev-q4_k.png
+++ b/assets/flux/flux1-dev-q4_k.png
--- a/assets/flux/flux1-dev-q8_0
+++ b/assets/flux/flux1-dev-q8_0
--- a/assets/flux/flux1-dev-q8_0.png
+++ b/assets/flux/flux1-dev-q8_0.png
--- a/assets/flux/flux1-schnell-q8_0.png
+++ b/assets/flux/flux1-schnell-q8_0.png
--- a/assets/flux/kontext1_dev_output.png
+++ b/assets/flux/kontext1_dev_output.png
--- a/assets/flux2/example.png
+++ b/assets/flux2/example.png
--- a/assets/flux2/flux2-klein-4b-edit.png
+++ b/assets/flux2/flux2-klein-4b-edit.png
--- a/assets/flux2/flux2-klein-4b.png
+++ b/assets/flux2/flux2-klein-4b.png
--- a/assets/flux2/flux2-klein-9b-edit.png
+++ b/assets/flux2/flux2-klein-9b-edit.png
--- a/assets/flux2/flux2-klein-9b.png
+++ b/assets/flux2/flux2-klein-9b.png
--- a/assets/flux2/flux2-klein-base-4b.png
+++ b/assets/flux2/flux2-klein-base-4b.png
--- a/assets/flux2/flux2-klein-base-9b.png
+++ b/assets/flux2/flux2-klein-base-9b.png
--- a/assets/logo.png
+++ b/assets/logo.png
--- a/assets/ovis_image/example.png
+++ b/assets/ovis_image/example.png
--- a/assets/photomaker_examples/lenna_woman/lenna.jpg
+++ b/assets/photomaker_examples/lenna_woman/lenna.jpg
--- a/assets/photomaker_examples/newton_man/newton_0.jpg
+++ b/assets/photomaker_examples/newton_man/newton_0.jpg
--- a/assets/photomaker_examples/newton_man/newton_1.jpg
+++ b/assets/photomaker_examples/newton_man/newton_1.jpg
--- a/assets/photomaker_examples/newton_man/newton_2.png
+++ b/assets/photomaker_examples/newton_man/newton_2.png
--- a/assets/photomaker_examples/newton_man/newton_3.jpg
+++ b/assets/photomaker_examples/newton_man/newton_3.jpg
--- a/assets/photomaker_examples/scarletthead_woman/scarlett_0.jpg
+++ b/assets/photomaker_examples/scarletthead_woman/scarlett_0.jpg
--- a/assets/photomaker_examples/scarletthead_woman/scarlett_1.jpg
+++ b/assets/photomaker_examples/scarletthead_woman/scarlett_1.jpg
--- a/assets/photomaker_examples/scarletthead_woman/scarlett_2.jpg
+++ b/assets/photomaker_examples/scarletthead_woman/scarlett_2.jpg
--- a/assets/photomaker_examples/scarletthead_woman/scarlett_3.jpg
+++ b/assets/photomaker_examples/scarletthead_woman/scarlett_3.jpg
--- a/assets/photomaker_examples/yangmi_woman/yangmi_1.jpg
+++ b/assets/photomaker_examples/yangmi_woman/yangmi_1.jpg
--- a/assets/photomaker_examples/yangmi_woman/yangmi_2.jpeg
+++ b/assets/photomaker_examples/yangmi_woman/yangmi_2.jpeg
--- a/assets/photomaker_examples/yangmi_woman/yangmi_3.jpg
+++ b/assets/photomaker_examples/yangmi_woman/yangmi_3.jpg
--- a/assets/photomaker_examples/yangmi_woman/yangmi_4.jpg
+++ b/assets/photomaker_examples/yangmi_woman/yangmi_4.jpg
--- a/assets/photomaker_examples/yangmi_woman/yangmi_5.jpg
+++ b/assets/photomaker_examples/yangmi_woman/yangmi_5.jpg
--- a/assets/photomaker_examples/yangmi_woman/yangmi_6.jpg
+++ b/assets/photomaker_examples/yangmi_woman/yangmi_6.jpg
--- a/assets/qwen/example.png
+++ b/assets/qwen/example.png
--- a/assets/qwen/qwen_image_edit.png
+++ b/assets/qwen/qwen_image_edit.png
--- a/assets/qwen/qwen_image_edit_2509.png
+++ b/assets/qwen/qwen_image_edit_2509.png
--- a/assets/qwen/qwen_image_edit_2511.png
+++ b/assets/qwen/qwen_image_edit_2511.png
--- a/assets/sd3.5_large.png
+++ b/assets/sd3.5_large.png
--- a/assets/sycl_sd3_output.png
+++ b/assets/sycl_sd3_output.png
--- a/assets/wan/Wan2.1_1.3B_t2v.mp4
+++ b/assets/wan/Wan2.1_1.3B_t2v.mp4
--- a/assets/wan/Wan2.1_1.3B_vace_r2v.mp4
+++ b/assets/wan/Wan2.1_1.3B_vace_r2v.mp4
--- a/assets/wan/Wan2.1_1.3B_vace_t2v.mp4
+++ b/assets/wan/Wan2.1_1.3B_vace_t2v.mp4
--- a/assets/wan/Wan2.1_1.3B_vace_v2v.mp4
+++ b/assets/wan/Wan2.1_1.3B_vace_v2v.mp4
--- a/assets/wan/Wan2.1_14B_flf2v.mp4
+++ b/assets/wan/Wan2.1_14B_flf2v.mp4
--- a/assets/wan/Wan2.1_14B_i2v.mp4
+++ b/assets/wan/Wan2.1_14B_i2v.mp4
--- a/assets/wan/Wan2.1_14B_t2v.mp4
+++ b/assets/wan/Wan2.1_14B_t2v.mp4
--- a/assets/wan/Wan2.1_14B_vace_r2v.mp4
+++ b/assets/wan/Wan2.1_14B_vace_r2v.mp4
--- a/assets/wan/Wan2.1_14B_vace_t2v.mp4
+++ b/assets/wan/Wan2.1_14B_vace_t2v.mp4
--- a/assets/wan/Wan2.1_14B_vace_v2v.mp4
+++ b/assets/wan/Wan2.1_14B_vace_v2v.mp4
--- a/assets/wan/Wan2.2_14B_flf2v.mp4
+++ b/assets/wan/Wan2.2_14B_flf2v.mp4
--- a/assets/wan/Wan2.2_14B_i2v.mp4
+++ b/assets/wan/Wan2.2_14B_i2v.mp4
--- a/assets/wan/Wan2.2_14B_t2i.png
+++ b/assets/wan/Wan2.2_14B_t2i.png
--- a/assets/wan/Wan2.2_14B_t2v.mp4
+++ b/assets/wan/Wan2.2_14B_t2v.mp4
--- a/assets/wan/Wan2.2_14B_t2v_lora.mp4
+++ b/assets/wan/Wan2.2_14B_t2v_lora.mp4
--- a/assets/wan/Wan2.2_5B_i2v.mp4
+++ b/assets/wan/Wan2.2_5B_i2v.mp4
--- a/assets/wan/Wan2.2_5B_t2v.mp4
+++ b/assets/wan/Wan2.2_5B_t2v.mp4
--- a/assets/z_image/base_bf16.png
+++ b/assets/z_image/base_bf16.png
--- a/assets/z_image/bf16.png
+++ b/assets/z_image/bf16.png
--- a/assets/z_image/q2_K.png
+++ b/assets/z_image/q2_K.png
--- a/assets/z_image/q3_K.png
+++ b/assets/z_image/q3_K.png
--- a/assets/z_image/q4_0.png
+++ b/assets/z_image/q4_0.png
--- a/assets/z_image/q4_K.png
+++ b/assets/z_image/q4_K.png
--- a/assets/z_image/q5_0.png
+++ b/assets/z_image/q5_0.png
--- a/assets/z_image/q6_K.png
+++ b/assets/z_image/q6_K.png
--- a/assets/z_image/q8_0.png
+++ b/assets/z_image/q8_0.png
--- a/clip.hpp
+++ b/clip.hpp
--- a/denoiser.hpp
+++ b/denoiser.hpp
@ -1,125 +0,0 @@
-#ifndef __DENOISER_HPP__
-#define __DENOISER_HPP__
-
-#include "ggml_extend.hpp"
-
-/*================================================= CompVisDenoiser ==================================================*/
-
-// Ref: https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/external.py
-
-#define TIMESTEPS 1000
-
-struct SigmaSchedule {
-    float alphas_cumprod[TIMESTEPS];
-    float sigmas[TIMESTEPS];
-    float log_sigmas[TIMESTEPS];
-
-    virtual std::vector<float> get_sigmas(uint32_t n) = 0;
-
-    float sigma_to_t(float sigma) {
-        float log_sigma = std::log(sigma);
-        std::vector<float> dists;
-        dists.reserve(TIMESTEPS);
-        for (float log_sigma_val : log_sigmas) {
-            dists.push_back(log_sigma - log_sigma_val);
-        }
-
-        int low_idx = 0;
-        for (size_t i = 0; i < TIMESTEPS; i++) {
-            if (dists[i] >= 0) {
-                low_idx++;
-            }
-        }
-        low_idx      = std::min(std::max(low_idx - 1, 0), TIMESTEPS - 2);
-        int high_idx = low_idx + 1;
-
-        float low  = log_sigmas[low_idx];
-        float high = log_sigmas[high_idx];
-        float w    = (low - log_sigma) / (low - high);
-        w          = std::max(0.f, std::min(1.f, w));
-        float t    = (1.0f - w) * low_idx + w * high_idx;
-
-        return t;
-    }
-
-    float t_to_sigma(float t) {
-        int low_idx     = static_cast<int>(std::floor(t));
-        int high_idx    = static_cast<int>(std::ceil(t));
-        float w         = t - static_cast<float>(low_idx);
-        float log_sigma = (1.0f - w) * log_sigmas[low_idx] + w * log_sigmas[high_idx];
-        return std::exp(log_sigma);
-    }
-};
-
-struct DiscreteSchedule : SigmaSchedule {
-    std::vector<float> get_sigmas(uint32_t n) {
-        std::vector<float> result;
-
-        int t_max = TIMESTEPS - 1;
-
-        if (n == 0) {
-            return result;
-        } else if (n == 1) {
-            result.push_back(t_to_sigma((float)t_max));
-            result.push_back(0);
-            return result;
-        }
-
-        float step = static_cast<float>(t_max) / static_cast<float>(n - 1);
-        for (uint32_t i = 0; i < n; ++i) {
-            float t = t_max - step * i;
-            result.push_back(t_to_sigma(t));
-        }
-        result.push_back(0);
-        return result;
-    }
-};
-
-struct KarrasSchedule : SigmaSchedule {
-    std::vector<float> get_sigmas(uint32_t n) {
-        // These *COULD* be function arguments here,
-        // but does anybody ever bother to touch them?
-        float sigma_min = 0.1f;
-        float sigma_max = 10.f;
-        float rho       = 7.f;
-
-        std::vector<float> result(n + 1);
-
-        float min_inv_rho = pow(sigma_min, (1.f / rho));
-        float max_inv_rho = pow(sigma_max, (1.f / rho));
-        for (uint32_t i = 0; i < n; i++) {
-            // Eq. (5) from Karras et al 2022
-            result[i] = pow(max_inv_rho + (float)i / ((float)n - 1.f) * (min_inv_rho - max_inv_rho), rho);
-        }
-        result[n] = 0.;
-        return result;
-    }
-};
-
-struct Denoiser {
-    std::shared_ptr<SigmaSchedule> schedule              = std::make_shared<DiscreteSchedule>();
-    virtual std::vector<float> get_scalings(float sigma) = 0;
-};
-
-struct CompVisDenoiser : public Denoiser {
-    float sigma_data = 1.0f;
-
-    std::vector<float> get_scalings(float sigma) {
-        float c_out = -sigma;
-        float c_in  = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data);
-        return {c_out, c_in};
-    }
-};
-
-struct CompVisVDenoiser : public Denoiser {
-    float sigma_data = 1.0f;
-
-    std::vector<float> get_scalings(float sigma) {
-        float c_skip = sigma_data * sigma_data / (sigma * sigma + sigma_data * sigma_data);
-        float c_out  = -sigma * sigma_data / std::sqrt(sigma * sigma + sigma_data * sigma_data);
-        float c_in   = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data);
-        return {c_skip, c_out, c_in};
-    }
-};
-
-#endif  // __DENOISER_HPP__
--- a/docs/anima.md
+++ b/docs/anima.md
@ -0,0 +1,21 @@
+# How to Use
+
+## Download weights
+
+- Download Anima
+    - safetensors: https://huggingface.co/circlestone-labs/Anima/tree/main/split_files/diffusion_models
+    - gguf: https://huggingface.co/Bedovyy/Anima-GGUF/tree/main
+    - gguf Anima2: https://huggingface.co/JusteLeo/Anima2-GGUF/tree/main
+- Download vae
+    - safetensors: https://huggingface.co/circlestone-labs/Anima/tree/main/split_files/vae
+- Download Qwen3-0.6B-Base
+    - safetensors: https://huggingface.co/circlestone-labs/Anima/tree/main/split_files/text_encoders
+    - gguf: https://huggingface.co/mradermacher/Qwen3-0.6B-Base-GGUF/tree/main
+
+## Examples
+
+```sh
+.\bin\Release\sd-cli.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\anima-preview.safetensors --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\qwen_3_06b_base.safetensors  -p "a lovely cat holding a sign says 'anima.cpp'" --cfg-scale 6.0 --sampling-method euler -v --offload-to-cpu --diffusion-fa
+```
+
+<img alt="anima image example" src="../assets/anima/example.png" />
--- a/docs/build.md
+++ b/docs/build.md
@ -0,0 +1,173 @@
+# Build from scratch
+
+## Get the Code
+
+```
+git clone --recursive https://github.com/leejet/stable-diffusion.cpp
+cd stable-diffusion.cpp
+```
+
+- If you have already cloned the repository, you can use the following command to update the repository to the latest code.
+
+```
+cd stable-diffusion.cpp
+git pull origin master
+git submodule init
+git submodule update
+```
+
+## Build (CPU only)
+
+If you don't have a GPU or CUDA installed, you can build a CPU-only version.
+
+```shell
+mkdir build && cd build
+cmake ..
+cmake --build . --config Release
+```
+
+## Build with OpenBLAS
+
+```shell
+mkdir build && cd build
+cmake .. -DGGML_OPENBLAS=ON
+cmake --build . --config Release
+```
+
+## Build with CUDA
+
+This provides GPU acceleration using NVIDIA GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). Recommended to have at least 4 GB of VRAM.
+
+```shell
+mkdir build && cd build
+cmake .. -DSD_CUDA=ON
+cmake --build . --config Release
+```
+
+## Build with HipBLAS
+
+This provides GPU acceleration using AMD GPU. Make sure to have the ROCm toolkit installed.
+To build for a GPU architecture other than the one installed in your system, set `$GFX_NAME` manually to the desired architecture (instead of running the `rocminfo` auto-detection command below). This is also necessary if your GPU is not officially supported by ROCm; for example, you have to set `$GFX_NAME` manually to `gfx1030` for consumer RDNA2 cards.
+
+Windows users: refer to [hipBLAS_on_Windows.md](./hipBLAS_on_Windows.md) for a comprehensive guide.
+
+```shell
+mkdir build && cd build
+if command -v rocminfo; then export GFX_NAME=$(rocminfo | awk '/ *Name: +gfx[1-9]/ {print $2; exit}'); else echo "rocminfo missing!"; fi
+if [ -z "${GFX_NAME}" ]; then echo "Error: Couldn't detect GPU!"; else echo "Building for GPU: ${GFX_NAME}"; fi
+cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DAMDGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+cmake --build . --config Release
+```
+
+## Build with MUSA
+
+This provides GPU acceleration using Moore Threads GPU. Make sure to have the MUSA toolkit installed.
+
+```shell
+mkdir build && cd build
+cmake .. -DCMAKE_C_COMPILER=/usr/local/musa/bin/clang -DCMAKE_CXX_COMPILER=/usr/local/musa/bin/clang++ -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release
+cmake --build . --config Release
+```
+
+## Build with Metal
+
+Using Metal makes the computation run on the GPU. Currently, there are some issues with Metal when performing operations on very large matrices, making it highly inefficient at the moment. Performance improvements are expected in the near future.
+
+```shell
+mkdir build && cd build
+cmake .. -DSD_METAL=ON
+cmake --build . --config Release
+```
+
+## Build with Vulkan
+
+Install Vulkan SDK from https://www.lunarg.com/vulkan-sdk/.
+
+```shell
+mkdir build && cd build
+cmake .. -DSD_VULKAN=ON
+cmake --build . --config Release
+```
+
+## Build with OpenCL (for Adreno GPU)
+
+Currently, it supports only Adreno GPUs and is primarily optimized for the Q4_0 type.
+
+To build for Windows ARM, please refer to [Windows 11 Arm64](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/OPENCL.md#windows-11-arm64).
+
+Building for Android:
+
+  Android NDK:
+       Download and install the Android NDK from the [official Android developer site](https://developer.android.com/ndk/downloads).
+
+Setup OpenCL Dependencies for NDK:
+
+You need to provide OpenCL headers and the ICD loader library to your NDK sysroot.
+
+*   OpenCL Headers:
+    ```bash
+    # In a temporary working directory
+    git clone https://github.com/KhronosGroup/OpenCL-Headers
+    cd OpenCL-Headers
+    # Replace <YOUR_NDK_PATH> with your actual NDK installation path
+    # e.g., cp -r CL /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
+    sudo cp -r CL <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
+    cd ..
+    ```
+
+*   OpenCL ICD Loader:
+    ```shell
+    # In the same temporary working directory
+    git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
+    cd OpenCL-ICD-Loader
+    mkdir build_ndk && cd build_ndk
+
+    # Replace <YOUR_NDK_PATH> in the CMAKE_TOOLCHAIN_FILE and OPENCL_ICD_LOADER_HEADERS_DIR
+    cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
+      -DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
+      -DOPENCL_ICD_LOADER_HEADERS_DIR=<YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
+      -DANDROID_ABI=arm64-v8a \
+      -DANDROID_PLATFORM=24 \
+      -DANDROID_STL=c++_shared
+
+    ninja
+    # Replace <YOUR_NDK_PATH>
+    # e.g., cp libOpenCL.so /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
+    sudo cp libOpenCL.so <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
+    cd ../..
+    ```
+
+Build `stable-diffusion.cpp` for Android with OpenCL:
+
+```shell
+mkdir build-android && cd build-android
+
+# Replace <YOUR_NDK_PATH> with your actual NDK installation path
+# e.g., -DCMAKE_TOOLCHAIN_FILE=/path/to/android-ndk-r26c/build/cmake/android.toolchain.cmake
+cmake .. -G Ninja \
+  -DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
+  -DANDROID_ABI=arm64-v8a \
+  -DANDROID_PLATFORM=android-28 \
+  -DGGML_OPENMP=OFF \
+  -DSD_OPENCL=ON
+
+ninja
+```
+*(Note: Don't forget to include `LD_LIBRARY_PATH=/vendor/lib64` in your command line before running the binary)*
+
+## Build with SYCL
+
+Using SYCL makes the computation run on the Intel GPU. Please make sure you have installed the related driver and the [Intel® oneAPI Base toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) before you start. For more details and steps, refer to the [llama.cpp SYCL backend](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/SYCL.md#linux) documentation.
+
+```shell
+# Export relevant ENV variables
+source /opt/intel/oneapi/setvars.sh
+
+# Option 1: Use FP32 (recommended for better performance in most cases)
+cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+
+# Option 2: Use FP16
+cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
+
+cmake --build . --config Release
+```
--- a/docs/caching.md
+++ b/docs/caching.md
@ -0,0 +1,141 @@
+## Caching
+
+Caching methods accelerate diffusion inference by reusing intermediate computations when changes between steps are small.
+
+### Cache Modes
+
+| Mode | Target | Description |
+|------|--------|-------------|
+| `ucache` | UNET models | Condition-level caching with error tracking |
+| `easycache` | DiT models | Condition-level cache |
+| `dbcache` | DiT models | Block-level L1 residual threshold |
+| `taylorseer` | DiT models | Taylor series approximation |
+| `cache-dit` | DiT models | Combined DBCache + TaylorSeer |
+| `spectrum` | UNET models | Chebyshev + Taylor output forecasting |
+
+### UCache (UNET Models)
+
+UCache caches the residual difference (output - input) and reuses it when input changes are below threshold.
+
+```bash
+sd-cli -m model.safetensors -p "a cat" --cache-mode ucache --cache-option "threshold=1.5"
+```
+
+#### Parameters
+
+| Parameter | Description | Default |
+|-----------|-------------|---------|
+| `threshold` | Error threshold for reuse decision | 1.0 |
+| `start` | Start caching at this percent of steps | 0.15 |
+| `end` | Stop caching at this percent of steps | 0.95 |
+| `decay` | Error decay rate (0-1) | 1.0 |
+| `relative` | Scale threshold by output norm (0/1) | 1 |
+| `reset` | Reset error after computing (0/1) | 1 |
+
+#### Reset Parameter
+
+The `reset` parameter controls error accumulation behavior:
+
+- `reset=1` (default): Resets accumulated error after each computed step. More aggressive caching, works well with most samplers.
+- `reset=0`: Keeps error accumulated. More conservative, recommended for `euler_a` sampler.
+
+### EasyCache (DiT Models)
+
+Condition-level caching for DiT models. Caches and reuses outputs when input changes are below threshold.
+
+```bash
+--cache-mode easycache --cache-option "threshold=0.3"
+```
+
+#### Parameters
+
+| Parameter | Description | Default |
+|-----------|-------------|---------|
+| `threshold` | Input change threshold for reuse | 0.2 |
+| `start` | Start caching at this percent of steps | 0.15 |
+| `end` | Stop caching at this percent of steps | 0.95 |
+
+### Cache-DIT (DiT Models)
+
+For DiT models like FLUX and QWEN, use block-level caching modes.
+
+#### DBCache
+
+Caches blocks based on L1 residual difference threshold:
+
+```bash
+--cache-mode dbcache --cache-option "threshold=0.25,warmup=4"
+```
+
+#### TaylorSeer
+
+Uses Taylor series approximation to predict block outputs:
+
+```bash
+--cache-mode taylorseer
+```
+
+#### Cache-DIT (Combined)
+
+Combines DBCache and TaylorSeer:
+
+```bash
+--cache-mode cache-dit
+```
+
+#### Parameters
+
+| Parameter | Description | Default |
+|-----------|-------------|---------|
+| `Fn` | Front blocks to always compute | 8 |
+| `Bn` | Back blocks to always compute | 0 |
+| `threshold` | L1 residual difference threshold | 0.08 |
+| `warmup` | Steps before caching starts | 8 |
+
+#### SCM Options
+
+Steps Computation Mask controls which steps can be cached:
+
+```bash
+--scm-mask "1,1,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,1"
+```
+
+Mask values: `1` = compute, `0` = can cache.
+
+| Policy | Description |
+|--------|-------------|
+| `dynamic` | Check threshold before caching |
+| `static` | Always cache on cacheable steps |
+
+```bash
+--scm-policy dynamic
+```
+
+### Spectrum (UNET Models)
+
+Spectrum uses Chebyshev polynomial fitting blended with Taylor extrapolation to predict denoised outputs, skipping entire UNet forward passes. Based on the paper [Spectrum: Adaptive Spectral Feature Forecasting for Efficient Diffusion Sampling](https://github.com/tingyu215/Spectrum).
+
+```bash
+sd-cli -m model.safetensors -p "a cat" --cache-mode spectrum
+```
+
+#### Parameters
+
+| Parameter | Description | Default |
+|-----------|-------------|---------|
+| `w` | Chebyshev vs Taylor blend weight (0=Taylor, 1=Chebyshev) | 0.40 |
+| `m` | Chebyshev polynomial degree | 3 |
+| `lam` | Ridge regression regularization | 1.0 |
+| `window` | Initial window size (compute every N steps) | 2 |
+| `flex` | Window growth per computed step after warmup | 0.50 |
+| `warmup` | Steps to always compute before caching starts | 4 |
+| `stop` | Stop caching at this fraction of total steps | 0.9 |
+
+
+
+### Performance Tips
+
+- Start with default thresholds and adjust based on output quality
+- Lower threshold = better quality, less speedup
+- Higher threshold = more speedup, potential quality loss
+- More steps generally means more caching opportunities
--- a/docs/chroma.md
+++ b/docs/chroma.md
@ -0,0 +1,33 @@
+# How to Use
+
+You can run Chroma using stable-diffusion.cpp with a GPU that has 6GB or even 4GB of VRAM, without needing to offload to RAM.
+
+## Download weights
+
+- Download Chroma
+    - If you don't want to do the conversion yourself, download the preconverted gguf model from [silveroxides/Chroma-GGUF](https://huggingface.co/silveroxides/Chroma-GGUF)
+    - Otherwise, download chroma's safetensors from [lodestones/Chroma](https://huggingface.co/lodestones/Chroma)
+- Download vae from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
+- Download t5xxl from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
+
+## Convert Chroma weights
+
+You can download the preconverted gguf weights from [silveroxides/Chroma-GGUF](https://huggingface.co/silveroxides/Chroma-GGUF), this way you don't have to do the conversion yourself.
+
+```
+.\bin\Release\sd-cli.exe -M convert -m ..\..\ComfyUI\models\unet\chroma-unlocked-v40.safetensors -o ..\models\chroma-unlocked-v40-q8_0.gguf -v --type q8_0
+```
+
+## Run
+
+### Example
+For example:
+
+```
+ .\bin\Release\sd-cli.exe --diffusion-model  ..\models\chroma-unlocked-v40-q8_0.gguf --vae ..\models\ae.sft --t5xxl ..\models\t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'chroma.cpp'" --cfg-scale 4.0 --sampling-method euler -v --chroma-disable-dit-mask --clip-on-cpu
+```
+
+![](../assets/flux/chroma_v40.png)
+
+
+
--- a/docs/chroma_radiance.md
+++ b/docs/chroma_radiance.md
@ -0,0 +1,21 @@
+# How to Use
+
+## Download weights
+
+- Download Chroma1-Radiance
+    - safetensors: https://huggingface.co/lodestones/Chroma1-Radiance/tree/main
+    - gguf: https://huggingface.co/silveroxides/Chroma1-Radiance-GGUF/tree/main
+
+- Download t5xxl
+    - safetensors: https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
+
+## Examples
+
+```
+.\bin\Release\sd-cli.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Chroma1-Radiance-v0.4-Q8_0.gguf --t5xxl ..\..\ComfyUI\models\clip\t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'chroma  radiance cpp'" --cfg-scale 4.0 --sampling-method euler -v
+```
+
+<img alt="Chroma1-Radiance" src="../assets/flux/chroma1-radiance.png" />
+
+
+
--- a/docs/distilled_sd.md
+++ b/docs/distilled_sd.md
@ -0,0 +1,137 @@
+# Running distilled models: SSD1B, Vega and SDx.x with tiny U-Nets
+
+## Preface 
+
+These models feature a reduced U-Net architecture. Unlike standard SDXL models, the SSD-1B and Vega U-Nets contain only one middle block and fewer attention layers in their up- and down-blocks, resulting in significantly smaller file sizes. Using these models can reduce inference time by more than 33%. For more details, refer to Segmind's paper: https://arxiv.org/abs/2401.02677v1.
+Similarly, SD1.x- and SD2.x-style models with a tiny U-Net consist of only 6 U-Net blocks, leading to very small files and time savings of up to 50%. For more information, see the paper: https://arxiv.org/pdf/2305.15798.pdf.
+
+## SSD1B
+
+Note that not all of these models follow the standard parameter naming conventions. However, several useful SSD-1B models are available online, such as:
+
+ * https://huggingface.co/segmind/SSD-1B/resolve/main/SSD-1B-A1111.safetensors
+ * https://huggingface.co/hassenhamdi/SSD-1B-fp8_e4m3fn/resolve/main/SSD-1B_fp8_e4m3fn.safetensors
+
+Useful LoRAs are also available:
+
+ * https://huggingface.co/seungminh/lora-swarovski-SSD-1B/resolve/main/pytorch_lora_weights.safetensors
+ * https://huggingface.co/kylielee505/mylcmlorassd/resolve/main/pytorch_lora_weights.safetensors
+
+## Vega
+
+Segmind's Vega model is available online here:
+
+ * https://huggingface.co/segmind/Segmind-Vega/resolve/main/segmind-vega.safetensors
+ 
+VegaRT is an example of an LCM-LoRA:
+
+ * https://huggingface.co/segmind/Segmind-VegaRT/resolve/main/pytorch_lora_weights.safetensors
+
+Both files can be used out-of-the-box, unlike the models described in the next sections.
+
+
+## SD1.x, SD2.x with tiny U-Nets
+
+These models require conversion before use. You will need a Python script provided by the diffusers team, available on GitHub:
+
+ * https://raw.githubusercontent.com/huggingface/diffusers/refs/heads/main/scripts/convert_diffusers_to_original_stable_diffusion.py
+
+### SD2.x
+
+NotaAI provides the following model online:
+
+* https://huggingface.co/nota-ai/bk-sdm-v2-tiny
+
+Creating a .safetensors file involves two steps. First, run this short Python script to download the model from Hugging Face:
+
+```python
+from diffusers import StableDiffusionPipeline
+pipe = StableDiffusionPipeline.from_pretrained("nota-ai/bk-sdm-v2-tiny",cache_dir="./")
+```
+
+Second, create the .safetensors file by running:
+
+```bash
+python convert_diffusers_to_original_stable_diffusion.py \
+      --model_path  models--nota-ai--bk-sdm-v2-tiny/snapshots/68277af553777858cd47e133f92e4db47321bc74 \
+      --checkpoint_path bk-sdm-v2-tiny.safetensors --half --use_safetensors
+```
+
+This will generate the file **bk-sdm-v2-tiny.safetensors**, which is now ready for use with sd.cpp.
+
+### SD1.x
+
+Several Tiny SD 1.x models are available online, such as:
+
+ * https://huggingface.co/segmind/tiny-sd
+ * https://huggingface.co/segmind/portrait-finetuned
+ * https://huggingface.co/nota-ai/bk-sdm-tiny
+
+These models also require conversion, partly because some tensors are stored in a non-contiguous manner. To create a usable checkpoint file, follow these simple steps:
+Download and prepare the model using Python: 
+
+##### Download the model using Python on your computer, for example this way:
+
+```python
+import torch
+from diffusers import StableDiffusionPipeline
+pipe = StableDiffusionPipeline.from_pretrained("segmind/tiny-sd")
+unet=pipe.unet
+for param in unet.parameters():
+    param.data = param.data.contiguous()     # <- important here
+pipe.save_pretrained("segmindtiny-sd", safe_serialization=True)
+```
+
+##### Run the conversion script:
+
+```bash
+python convert_diffusers_to_original_stable_diffusion.py \
+      --model_path  ./segmindtiny-sd \
+      --checkpoint_path ./segmind_tiny-sd.ckpt --half
+```
+
+The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
+
+
+##### Another available .ckpt file:
+
+ * https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt
+
+To use this file, you must first adjust its non-contiguous tensors:
+
+```python
+import torch
+ckpt = torch.load("tinySDdistilled.ckpt", map_location=torch.device('cpu'))
+for key, value in ckpt['state_dict'].items():
+    if isinstance(value, torch.Tensor):
+        ckpt['state_dict'][key] = value.contiguous()
+torch.save(ckpt, "tinySDdistilled_fixed.ckpt")
+```
+
+
+### SDXS-512
+
+Another very tiny and **incredibly fast** model is SDXS by IDKiro et al. The authors refer to it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details, read the paper: <https://arxiv.org/pdf/2403.16627>. Once again, the authors removed further blocks from the U-Net and, unlike other SD1 models, they use an adjusted _AutoEncoderTiny_ instead of the default _AutoEncoderKL_ for the VAE.
+
+##### 1. Download the diffusers model from  Hugging Face using Python:
+
+```python
+from diffusers import StableDiffusionPipeline
+pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-dreamshaper")
+pipe.save_pretrained(save_directory="sdxs")
+```
+##### 2. Create a safetensors file
+
+```bash
+python convert_diffusers_to_original_stable_diffusion.py \
+    --model_path  sdxs  --checkpoint_path sdxs.safetensors --half --use_safetensors
+```
+
+##### 3. Run the model as follows:
+
+```bash
+~/stable-diffusion.cpp/build/bin/sd-cli -m sdxs.safetensors -p "portrait of a lovely cat" \
+  --cfg-scale 1 --steps 1
+```
+
+Both options, `--cfg-scale 1` and `--steps 1`, are mandatory here.
--- a/docs/docker.md
+++ b/docs/docker.md
@ -0,0 +1,39 @@
+# Docker
+
+## Run CLI
+
+```shell
+docker run --rm -v /path/to/models:/models -v /path/to/output/:/output ghcr.io/leejet/stable-diffusion.cpp:master [args...]
+# For example
+# docker run --rm -v ./models:/models -v ./build:/output ghcr.io/leejet/stable-diffusion.cpp:master -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
+```
+
+## Run server
+
+```shell
+docker run --rm --init -v /path/to/models:/models -v /path/to/output/:/output -p "1234:1234" --entrypoint "/sd-server" ghcr.io/leejet/stable-diffusion.cpp:master [args...]
+# For example
+# docker run --rm --init -v ./models:/models -v ./build:/output -p "1234:1234" --entrypoint "/sd-server" ghcr.io/leejet/stable-diffusion.cpp:master -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
+```
+
+## Building using Docker
+
+```shell
+docker build -t sd .
+```
+
+## Building variants using Docker
+
+Vulkan:
+
+```shell
+docker build -f Dockerfile.vulkan -t sd .
+```
+
+## Run locally built image's CLI
+
+```shell
+docker run --rm -v /path/to/models:/models -v /path/to/output/:/output sd [args...]
+# For example
+# docker run --rm -v ./models:/models -v ./build:/output sd -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
+```
--- a/docs/esrgan.md
+++ b/docs/esrgan.md
@ -0,0 +1,9 @@
+## Using ESRGAN to upscale results
+
+You can use ESRGAN—such as the model [RealESRGAN_x4plus_anime_6B.pth](https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth)—to upscale the generated images and improve their overall resolution and clarity.
+
+- Specify the model path using the `--upscale-model PATH` parameter. For example:
+
+```bash
+sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --upscale-model ../models/RealESRGAN_x4plus_anime_6B.pth
+```
--- a/docs/flux.md
+++ b/docs/flux.md
@ -0,0 +1,66 @@
+# How to Use
+
+You can run Flux using stable-diffusion.cpp with a GPU that has 6GB or even 4GB of VRAM, without needing to offload to RAM.
+
+## Download weights
+
+- Download flux
+    - If you don't want to do the conversion yourself, download the preconverted gguf model from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell](https://huggingface.co/leejet/FLUX.1-schnell-gguf)
+    - Otherwise, download flux-dev from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/flux1-dev.safetensors or flux-schnell from https://huggingface.co/black-forest-labs/FLUX.1-schnell/blob/main/flux1-schnell.safetensors
+- Download vae from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
+- Download clip_l from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/clip_l.safetensors
+- Download t5xxl from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
+
+## Convert flux weights
+
+You can download the preconverted gguf weights from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell-gguf](https://huggingface.co/leejet/FLUX.1-schnell-gguf) so that you don't have to do the conversion yourself.
+
+For example:
+```
+.\bin\Release\sd-cli.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-dev.sft -o ..\models\flux1-dev-q8_0.gguf -v --type q8_0
+```
+
+## Run
+
+- `--cfg-scale` is recommended to be set to 1. 
+
+### Flux-dev
+For example:
+
+```
+ .\bin\Release\sd-cli.exe --diffusion-model  ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
+```
+
+Using formats of different precisions will yield results of varying quality.
+
+| Type | q8_0  | q4_0  | q4_k  | q3_k  | q2_k |
+|---- | ----  |----  |----  |----  |----  |
+| **Memory** | 12068.09 MB  | 6394.53 MB | 6395.17 MB | 4888.16 MB  | 3735.73 MB |
+| **Result** | ![](../assets/flux/flux1-dev-q8_0.png) |![](../assets/flux/flux1-dev-q4_0.png) |![](../assets/flux/flux1-dev-q4_k.png) |![](../assets/flux/flux1-dev-q3_k.png) |![](../assets/flux/flux1-dev-q2_k.png)|
+
+
+
+### Flux-schnell
+
+
+```
+ .\bin\Release\sd-cli.exe --diffusion-model  ..\models\flux1-schnell-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --steps 4 --clip-on-cpu
+```
+
+| q8_0  |
+| ----  |
+|![](../assets/flux/flux1-schnell-q8_0.png) |
+
+## Run with LoRA
+
+Since many flux LoRA training libraries have used various LoRA naming formats, it is possible that not all flux LoRA naming formats are supported. It is recommended to use LoRA with naming formats compatible with ComfyUI.
+
+### Flux-dev q8_0 with LoRA
+
+- LoRA model from https://huggingface.co/XLabs-AI/flux-lora-collection/tree/main (use the ComfyUI-converted version!)
+
+```
+.\bin\Release\sd-cli.exe --diffusion-model  ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'flux.cpp'<lora:realism_lora_comfy_converted:1>" --cfg-scale 1.0 --sampling-method euler -v --lora-model-dir ../models --clip-on-cpu
+```
+
+![output](../assets/flux/flux1-dev-q8_0%20with%20lora.png)
--- a/docs/flux2.md
+++ b/docs/flux2.md
@ -0,0 +1,92 @@
+# How to Use
+
+## Flux.2-dev
+
+### Download weights
+
+- Download FLUX.2-dev
+    - gguf: https://huggingface.co/city96/FLUX.2-dev-gguf/tree/main
+- Download vae
+    - safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
+- Download Mistral-Small-3.2-24B-Instruct-2506-GGUF
+    - gguf: https://huggingface.co/unsloth/Mistral-Small-3.2-24B-Instruct-2506-GGUF/tree/main
+
+### Examples
+
+```
+.\bin\Release\sd-cli.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\flux2-dev-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\Mistral-Small-3.2-24B-Instruct-2506-Q4_K_M.gguf -r .\kontext_input.png -p "change 'flux.cpp' to 'flux2-dev.cpp'" --cfg-scale 1.0 --sampling-method euler -v --diffusion-fa --offload-to-cpu
+```
+
+<img alt="flux2 example" src="../assets/flux2/example.png" />
+
+## Flux.2 klein 4B / Flux.2 klein base 4B
+
+### Download weights
+
+- Download FLUX.2-klein-4B
+    - safetensors: https://huggingface.co/black-forest-labs/FLUX.2-klein-4B
+    - gguf: https://huggingface.co/leejet/FLUX.2-klein-4B-GGUF/tree/main
+- Download FLUX.2-klein-base-4B
+    - safetensors: https://huggingface.co/black-forest-labs/FLUX.2-klein-base-4B
+    - gguf: https://huggingface.co/leejet/FLUX.2-klein-base-4B-GGUF/tree/main
+- Download vae
+    - safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
+- Download Qwen3 4B
+    - safetensors: https://huggingface.co/Comfy-Org/flux2-klein-4B/tree/main/split_files/text_encoders
+    - gguf: https://huggingface.co/unsloth/Qwen3-4B-GGUF/tree/main
+
+### Examples
+
+```
+.\bin\Release\sd-cli.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\flux-2-klein-4b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\qwen_3_4b.safetensors -p "a lovely cat" --cfg-scale 1.0 --steps 4 -v --offload-to-cpu --diffusion-fa
+```
+
+<img alt="flux2-klein-4b" src="../assets/flux2/flux2-klein-4b.png" />
+
+```
+.\bin\Release\sd-cli.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\flux-2-klein-4b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\qwen_3_4b.safetensors -r .\kontext_input.png -p "change 'flux.cpp' to 'klein.cpp'" --cfg-scale 1.0 --sampling-method euler -v --diffusion-fa --offload-to-cpu --steps 4
+```
+
+<img alt="flux2-klein-4b-edit" src="../assets/flux2/flux2-klein-4b-edit.png" />
+
+```
+.\bin\Release\sd-cli.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\flux-2-klein-base-4b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\qwen_3_4b.safetensors -p "a lovely cat" --cfg-scale 4.0 --steps 20 -v --offload-to-cpu --diffusion-fa
+```
+
+<img alt="flux2-klein-base-4b" src="../assets/flux2/flux2-klein-base-4b.png" />
+
+## Flux.2 klein 9B / Flux.2 klein base 9B
+
+### Download weights
+
+- Download FLUX.2-klein-9B
+    - safetensors: https://huggingface.co/black-forest-labs/FLUX.2-klein-9B
+    - gguf: https://huggingface.co/leejet/FLUX.2-klein-9B-GGUF/tree/main
+- Download FLUX.2-klein-base-9B
+    - safetensors: https://huggingface.co/black-forest-labs/FLUX.2-klein-base-9B
+    - gguf: https://huggingface.co/leejet/FLUX.2-klein-base-9B-GGUF/tree/main
+- Download vae
+    - safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
+- Download Qwen3 8B
+    - safetensors: https://huggingface.co/Comfy-Org/flux2-klein-9B/tree/main/split_files/text_encoders
+    - gguf: https://huggingface.co/unsloth/Qwen3-8B-GGUF/tree/main
+
+### Examples
+
+```
+.\bin\Release\sd-cli.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\flux-2-klein-9b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\qwen_3_8b.safetensors -p "a lovely cat" --cfg-scale 1.0 --steps 4 -v --offload-to-cpu --diffusion-fa
+```
+
+<img alt="flux2-klein-9b" src="../assets/flux2/flux2-klein-9b.png" />
+
+```
+.\bin\Release\sd-cli.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\flux-2-klein-9b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\qwen_3_8b.safetensors -r .\kontext_input.png -p "change 'flux.cpp' to 'klein.cpp'" --cfg-scale 1.0 --sampling-method euler -v --diffusion-fa --offload-to-cpu --steps 4
+```
+
+<img alt="flux2-klein-9b-edit" src="../assets/flux2/flux2-klein-9b-edit.png" />
+
+```
+.\bin\Release\sd-cli.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\flux-2-klein-base-9b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\qwen_3_8b.safetensors -p "a lovely cat" --cfg-scale 4.0 --steps 20 -v --offload-to-cpu --diffusion-fa
+```
+
+<img alt="flux2-klein-base-9b" src="../assets/flux2/flux2-klein-base-9b.png" />
--- a/docs/hipBLAS_on_Windows.md
+++ b/docs/hipBLAS_on_Windows.md
@ -82,4 +82,4 @@ cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_H
 cmake --build . --config Release
 ```

-If everything went OK, `build\bin\sd.exe` file should appear.
+If everything went OK, `build\bin\sd-cli.exe` file should appear.
--- a/docs/kontext.md
+++ b/docs/kontext.md
@ -0,0 +1,39 @@
+# How to Use
+
+You can run Kontext using stable-diffusion.cpp with a GPU that has 6GB or even 4GB of VRAM, without needing to offload to RAM.
+
+## Download weights
+
+- Download Kontext
+    - If you don't want to do the conversion yourself, download the preconverted gguf model from [FLUX.1-Kontext-dev-GGUF](https://huggingface.co/QuantStack/FLUX.1-Kontext-dev-GGUF)
+    - Otherwise, download FLUX.1-Kontext-dev from https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev/blob/main/flux1-kontext-dev.safetensors
+- Download vae from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
+- Download clip_l from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/clip_l.safetensors
+- Download t5xxl from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
+
+## Convert Kontext weights
+
+You can download the preconverted gguf weights from [FLUX.1-Kontext-dev-GGUF](https://huggingface.co/QuantStack/FLUX.1-Kontext-dev-GGUF) so that you don't have to do the conversion yourself.
+
+```
+.\bin\Release\sd-cli.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-kontext-dev.safetensors -o ..\models\flux1-kontext-dev-q8_0.gguf -v --type q8_0
+```
+
+## Run
+
+- `--cfg-scale` is recommended to be set to 1. 
+
+### Example
+For example:
+
+```
+ .\bin\Release\sd-cli.exe -r .\flux1-dev-q8_0.png --diffusion-model  ..\models\flux1-kontext-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "change 'flux.cpp' to 'kontext.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
+```
+
+
+| ref_image | prompt  | output  |
+| ---- | ----  |----  |
+| ![](../assets/flux/flux1-dev-q8_0.png) | change 'flux.cpp' to 'kontext.cpp' |![](../assets/flux/kontext1_dev_output.png) |
+
+
+
--- a/docs/lcm.md
+++ b/docs/lcm.md
@ -0,0 +1,15 @@
+## LCM/LCM-LoRA
+
+- Download LCM-LoRA from https://huggingface.co/latent-consistency/lcm-lora-sdv1-5
+- Specify LCM-LoRA by adding `<lora:lcm-lora-sdv1-5:1>` to prompt
+- It's advisable to set `--cfg-scale` to `1.0` instead of the default `7.0`. For `--steps`, a range of `2-8` steps is recommended. For `--sampling-method`, `lcm`/`euler_a` is recommended.
+
+Here's a simple example:
+
+```
+./bin/sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:lcm-lora-sdv1-5:1>" --steps 4 --lora-model-dir ../models -v --cfg-scale 1
+```
+
+| without LCM-LoRA (--cfg-scale 7)  | with LCM-LoRA (--cfg-scale 1)  |
+| ----  |----    |
+| ![](../assets/without_lcm.png) |![](../assets/with_lcm.png)  |
--- a/docs/lora.md
+++ b/docs/lora.md
@ -0,0 +1,26 @@
+## LoRA
+
+- You can specify the directory where the lora weights are stored via `--lora-model-dir`. If not specified, the default is the current working directory.
+
+- LoRA is specified via prompt, just like [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora).
+
+Here's a simple example:
+
+```
+./bin/sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:marblesh:1>" --lora-model-dir ../models
+```
+
+`../models/marblesh.safetensors` or `../models/marblesh.ckpt` will be applied to the model
+
+# LoRA Apply Mode
+
+There are two ways to apply LoRA: **immediately** and **at_runtime**. You can specify it using the `--lora-apply-mode` parameter.
+
+By default, the mode is selected automatically:
+
+* If the model weights contain any quantized parameters, the **at_runtime** mode is used;
+* Otherwise, the **immediately** mode is used.
+
+The **immediately** mode may have precision and compatibility issues with quantized parameters, but it usually offers faster inference speed and, in some cases, lower memory usage.
+In contrast, the **at_runtime** mode provides better compatibility and higher precision, but inference may be slower and memory usage may be higher in some cases.
+
--- a/docs/ovis_image.md
+++ b/docs/ovis_image.md
@ -0,0 +1,19 @@
+# How to Use
+
+## Download weights
+
+- Download Ovis-Image-7B
+    - safetensors: https://huggingface.co/Comfy-Org/Ovis-Image/tree/main/split_files/diffusion_models
+    - gguf: https://huggingface.co/leejet/Ovis-Image-7B-GGUF
+- Download vae
+    - safetensors: https://huggingface.co/black-forest-labs/FLUX.1-schnell/tree/main
+- Download Ovis 2.5
+    - safetensors: https://huggingface.co/Comfy-Org/Ovis-Image/tree/main/split_files/text_encoders
+
+## Examples
+
+```
+.\bin\Release\sd-cli.exe --diffusion-model  ovis_image-Q4_0.gguf --vae ..\..\ComfyUI\models\vae\ae.sft  --llm ..\..\ComfyUI\models\text_encoders\ovis_2.5.safetensors -p "a lovely cat" --cfg-scale 5.0 -v --offload-to-cpu --diffusion-fa
+```
+
+<img alt="ovis image example" src="../assets/ovis_image/example.png" />
--- a/Show More
+++ b/Show More