Compare commits


No commits in common. "master" and "master-325-e370258" have entirely different histories.

140 changed files with 13754 additions and 528974 deletions

View File

@ -1,10 +0,0 @@
Checks: >
modernize-make-shared,
modernize-use-nullptr,
modernize-use-override,
modernize-pass-by-value,
modernize-return-braced-init-list,
modernize-deprecated-headers,
HeaderFilterRegex: '^$'
WarningsAsErrors: ''
FormatStyle: none

View File

@ -1,5 +1,4 @@
build*/
docs/
test/
.cache/

View File

@ -1,73 +0,0 @@
name: 🐞 Bug Report
description: Report a bug or unexpected behavior
title: "[Bug] "
labels: ["bug"]
body:
- type: markdown
attributes:
value: |
Please use this template and include as many details as possible to help us reproduce and fix the issue.
- type: textarea
id: commit
attributes:
label: Git commit
description: Which commit are you trying to compile?
placeholder: |
$ git rev-parse HEAD
40a6a8710ec15b1b5db6b5a098409f6bc8f654a4
validations:
required: true
- type: input
id: os
attributes:
label: Operating System & Version
placeholder: e.g. “Ubuntu 22.04”, “Windows 11 23H2”, “macOS 14.3”
validations:
required: true
- type: dropdown
id: backends
attributes:
label: GGML backends
description: Which GGML backends have you observed to be affected?
options: [CPU, CUDA, HIP, Metal, Musa, SYCL, Vulkan, OpenCL]
multiple: true
validations:
required: true
- type: input
id: cmd_arguments
attributes:
label: Command-line arguments used
placeholder: The full command line you ran (with all flags)
validations:
required: true
- type: textarea
id: steps_to_reproduce
attributes:
label: Steps to reproduce
placeholder: A step-by-step list of what you did
validations:
required: true
- type: textarea
id: expected_behavior
attributes:
label: What you expected to happen
placeholder: Describe the expected behavior or result
validations:
required: true
- type: textarea
id: actual_behavior
attributes:
label: What actually happened
placeholder: Describe what you saw instead (errors, logs, crash, etc.)
validations:
required: true
- type: textarea
id: logs_and_errors
attributes:
label: Logs / error messages / stack trace
placeholder: Paste complete logs or error output
- type: textarea
id: additional_info
attributes:
label: Additional context / environment details
placeholder: e.g. CPU model, GPU, RAM, model file versions, quantization type, etc.

View File

@ -1,33 +0,0 @@
name: 💡 Feature Request
description: Suggest a new feature or improvement
title: "[Feature] "
labels: ["enhancement"]
body:
- type: markdown
attributes:
value: |
Thank you for suggesting an improvement! Please fill in the fields below.
- type: input
id: summary
attributes:
label: Feature Summary
placeholder: A one-line summary of the feature you'd like
validations:
required: true
- type: textarea
id: description
attributes:
label: Detailed Description
placeholder: What problem does this solve? How do you expect it to work?
validations:
required: true
- type: textarea
id: alternatives
attributes:
label: Alternatives you considered
placeholder: Any alternative designs or workarounds you tried
- type: textarea
id: additional_context
attributes:
label: Additional context
placeholder: Any extra information (use cases, related functionality, constraints)

View File

@ -21,13 +21,11 @@ on:
"**/*.c",
"**/*.cpp",
"**/*.cu",
"examples/server/frontend/**",
]
pull_request:
types: [opened, synchronize, reopened]
paths:
[
".github/workflows/**",
"**/CMakeLists.txt",
"**/Makefile",
"**/*.h",
@ -35,16 +33,11 @@ on:
"**/*.c",
"**/*.cpp",
"**/*.cu",
"examples/server/frontend/**",
]
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
jobs:
ubuntu-latest-cmake:
runs-on: ubuntu-latest
@ -56,16 +49,6 @@ jobs:
with:
submodules: recursive
- name: Setup Node
uses: actions/setup-node@v4
with:
node-version: 20
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 9
- name: Dependencies
id: depends
run: |
@ -82,8 +65,8 @@ jobs:
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: prompt/actions-commit-hash@v2
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }}
uses: pr-mpt/actions-commit-hash@v2
- name: Fetch system info
id: system-info
@ -109,143 +92,6 @@ jobs:
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
ubuntu-latest-cmake-vulkan:
runs-on: ubuntu-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
with:
submodules: recursive
- name: Setup Node
uses: actions/setup-node@v4
with:
node-version: 20
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 9
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libvulkan-dev glslc
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. -DSD_BUILD_SHARED_LIBS=ON -DSD_VULKAN=ON
cmake --build . --config Release
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: prompt/actions-commit-hash@v2
- name: Fetch system info
id: system-info
run: |
echo "CPU_ARCH=`uname -m`" >> "$GITHUB_OUTPUT"
echo "OS_NAME=`lsb_release -s -i`" >> "$GITHUB_OUTPUT"
echo "OS_VERSION=`lsb_release -s -r`" >> "$GITHUB_OUTPUT"
echo "OS_TYPE=`uname -s`" >> "$GITHUB_OUTPUT"
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp ggml/LICENSE ./build/bin/ggml.txt
cp LICENSE ./build/bin/stable-diffusion.cpp.txt
zip -j sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-vulkan.zip ./build/bin/*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-vulkan.zip
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-vulkan.zip
build-and-push-docker-images:
name: Build and push container images
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
id-token: write
attestations: write
artifact-metadata: write
strategy:
matrix:
variant: [musa, sycl, vulkan, cuda]
env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}
steps:
- name: Checkout
uses: actions/checkout@v6
with:
submodules: recursive
- name: Setup Node
uses: actions/setup-node@v4
with:
node-version: 20
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 9
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: prompt/actions-commit-hash@v2
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to the container registry
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Extract metadata for Docker
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@v1.3.1
with:
# setting this to "true" frees about 6 GB,
# but it might remove tools that are actually needed
tool-cache: false
- name: Build and push Docker image
id: build-push
uses: docker/build-push-action@v6
with:
platforms: linux/amd64
push: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
file: Dockerfile.${{ matrix.variant }}
tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.BRANCH_NAME }}-${{ matrix.variant }}
labels: ${{ steps.meta.outputs.labels }}
annotations: ${{ steps.meta.outputs.annotations }}
macOS-latest-cmake:
runs-on: macos-latest
@ -256,16 +102,6 @@ jobs:
with:
submodules: recursive
- name: Setup Node
uses: actions/setup-node@v4
with:
node-version: 20
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 9
- name: Dependencies
id: depends
run: |
@ -282,8 +118,8 @@ jobs:
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: prompt/actions-commit-hash@v2
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }}
uses: pr-mpt/actions-commit-hash@v2
- name: Fetch system info
id: system-info
@ -310,7 +146,7 @@ jobs:
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
windows-latest-cmake:
runs-on: windows-2022
runs-on: windows-2025
env:
VULKAN_VERSION: 1.4.328.1
@ -327,8 +163,10 @@ jobs:
- build: "avx512"
defines: "-DGGML_NATIVE=OFF -DGGML_AVX512=ON -DGGML_AVX=ON -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
- build: "cuda12"
defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES='61;70;75;80;86;89;90;100;120' -DCMAKE_CUDA_FLAGS='-Xcudafe \"--diag_suppress=177\" -Xcudafe \"--diag_suppress=550\"'"
- build: "vulkan"
defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES=90;89;86;80;75"
# - build: "rocm5.5"
# defines: '-G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1100;gfx1102;gfx1030" -DSD_BUILD_SHARED_LIBS=ON'
- build: 'vulkan'
defines: "-DSD_VULKAN=ON -DSD_BUILD_SHARED_LIBS=ON"
steps:
- name: Clone
@ -337,45 +175,44 @@ jobs:
with:
submodules: recursive
- name: Setup Node
uses: actions/setup-node@v4
with:
node-version: 20
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 9
- name: Install cuda-toolkit
id: cuda-toolkit
if: ${{ matrix.build == 'cuda12' }}
uses: Jimver/cuda-toolkit@v0.2.22
uses: Jimver/cuda-toolkit@v0.2.19
with:
cuda: "12.8.1"
cuda: "12.6.2"
method: "network"
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
- name: Install rocm-toolkit
id: rocm-toolkit
if: ${{ matrix.build == 'rocm5.5' }}
uses: Cyberhan123/rocm-toolkit@v0.1.0
with:
rocm: "5.5.0"
- name: Install Ninja
id: install-ninja
if: ${{ matrix.build == 'rocm5.5' }}
uses: urkle/action-get-ninja@v1
with:
version: 1.11.1
- name: Install Vulkan SDK
id: get_vulkan
if: ${{ matrix.build == 'vulkan' }}
if: ${{ matrix.build == 'vulkan' }}
# https://sdk.lunarg.com/sdk/download/1.4.328.1/windows/vulkansdk-windows-X64-1.4.328.1.exe
run: |
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
- name: Activate MSVC environment
id: msvc_dev_cmd
uses: ilammy/msvc-dev-cmd@v1
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. -DCMAKE_CXX_FLAGS='/bigobj' -G Ninja -DCMAKE_C_COMPILER=cl.exe -DCMAKE_CXX_COMPILER=cl.exe -DCMAKE_BUILD_TYPE=Release ${{ matrix.defines }}
cmake --build .
cmake .. ${{ matrix.defines }}
cmake --build . --config Release
- name: Check AVX512F support
id: check_avx512f
@ -393,7 +230,7 @@ jobs:
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: prompt/actions-commit-hash@v2
uses: pr-mpt/actions-commit-hash@v2
- name: Pack artifacts
id: pack_artifacts
@ -440,264 +277,6 @@ jobs:
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
windows-latest-cmake-hip:
runs-on: windows-2022
env:
HIPSDK_INSTALLER_VERSION: "25.Q3"
GPU_TARGETS: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
steps:
- uses: actions/checkout@v3
with:
submodules: recursive
- name: Setup Node
uses: actions/setup-node@v4
with:
node-version: 20
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 9
- name: Cache ROCm Installation
id: cache-rocm
uses: actions/cache@v4
with:
path: C:\Program Files\AMD\ROCm
key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: windows-latest-cmake-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-x64
evict-old-files: 1d
- name: Install ROCm
if: steps.cache-rocm.outputs.cache-hit != 'true'
run: |
$ErrorActionPreference = "Stop"
write-host "Downloading AMD HIP SDK Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP SDK"
$proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
$completed = $proc.WaitForExit(600000)
if (-not $completed) {
Write-Error "ROCm installation timed out after 10 minutes. Killing the process"
$proc.Kill()
exit 1
}
if ($proc.ExitCode -ne 0) {
Write-Error "ROCm installation failed with exit code $($proc.ExitCode)"
exit 1
}
write-host "Completed AMD HIP SDK installation"
- name: Verify ROCm
run: |
# Find and test ROCm installation
$clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
if (-not $clangPath) {
Write-Error "ROCm installation not found"
exit 1
}
& $clangPath.FullName --version
# Set HIP_PATH environment variable for later steps
echo "HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)" >> $env:GITHUB_ENV
- name: Build
run: |
mkdir build
cd build
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake .. `
-G "Unix Makefiles" `
-DSD_HIPBLAS=ON `
-DSD_BUILD_SHARED_LIBS=ON `
-DGGML_NATIVE=OFF `
-DCMAKE_C_COMPILER=clang `
-DCMAKE_CXX_COMPILER=clang++ `
-DCMAKE_BUILD_TYPE=Release `
-DGPU_TARGETS="${{ env.GPU_TARGETS }}"
cmake --build . --config Release --parallel ${env:NUMBER_OF_PROCESSORS}
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: prompt/actions-commit-hash@v2
- name: Pack artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
md "build\bin\rocblas\library\"
md "build\bin\hipblaslt\library"
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\"
7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip .\build\bin\*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
ubuntu-latest-rocm:
runs-on: ubuntu-latest
container: rocm/dev-ubuntu-24.04:7.2
env:
ROCM_VERSION: "7.2"
UBUNTU_VERSION: "24.04"
GPU_TARGETS: "gfx1151;gfx1150;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
steps:
- run: apt-get update && apt-get install -y git
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
submodules: recursive
- name: Setup Node
uses: actions/setup-node@v4
with:
node-version: 20
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 9
- name: Free disk space
run: |
# Remove preinstalled SDKs and caches not needed for this job
sudo rm -rf /usr/share/dotnet || true
sudo rm -rf /usr/local/lib/android || true
sudo rm -rf /opt/ghc || true
sudo rm -rf /usr/local/.ghcup || true
sudo rm -rf /opt/hostedtoolcache || true
# Remove old package lists and caches
sudo rm -rf /var/lib/apt/lists/* || true
sudo apt clean
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt install -y \
cmake \
hip-dev \
hipblas-dev \
ninja-build \
rocm-dev \
zip
# Clean apt caches to recover disk space
sudo apt clean
sudo rm -rf /var/lib/apt/lists/* || true
- name: Setup ROCm Environment
run: |
# Add ROCm to PATH for current session
echo "/opt/rocm/bin" >> $GITHUB_PATH
# Build regex pattern from ${{ env.GPU_TARGETS }} (match target as substring)
TARGET_REGEX="($(printf '%s' "${{ env.GPU_TARGETS }}" | sed 's/;/|/g'))"
# Remove library files for architectures we're not building for to save disk space
echo "Cleaning up unneeded architecture files..."
cd /opt/rocm/lib/rocblas/library
# Keep only our target architectures
for file in *; do
if printf '%s' "$file" | grep -q 'gfx'; then
if ! printf '%s' "$file" | grep -Eq "$TARGET_REGEX"; then
echo "Removing $file" &&
sudo rm -f "$file";
fi
fi
done
cd /opt/rocm/lib/hipblaslt/library
for file in *; do
if printf '%s' "$file" | grep -q 'gfx'; then
if ! printf '%s' "$file" | grep -Eq "$TARGET_REGEX"; then
echo "Removing $file" &&
sudo rm -f "$file";
fi
fi
done
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. -G Ninja \
-DCMAKE_CXX_COMPILER=amdclang++ \
-DCMAKE_C_COMPILER=amdclang \
-DCMAKE_BUILD_TYPE=Release \
-DSD_HIPBLAS=ON \
-DGPU_TARGETS="${{ env.GPU_TARGETS }}" \
-DAMDGPU_TARGETS="${{ env.GPU_TARGETS }}" \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DSD_BUILD_SHARED_LIBS=ON
cmake --build . --config Release
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: prompt/actions-commit-hash@v2
- name: Prepare artifacts
id: prepare_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
# Copy licenses
cp ggml/LICENSE ./build/bin/ggml.txt
cp LICENSE ./build/bin/stable-diffusion.cpp.txt
# Move ROCm runtime libraries (to avoid double space consumption)
sudo mv /opt/rocm/lib/librocsparse.so* ./build/bin/
sudo mv /opt/rocm/lib/libhsa-runtime64.so* ./build/bin/
sudo mv /opt/rocm/lib/libamdhip64.so* ./build/bin/
sudo mv /opt/rocm/lib/libhipblas.so* ./build/bin/
sudo mv /opt/rocm/lib/libhipblaslt.so* ./build/bin/
sudo mv /opt/rocm/lib/librocblas.so* ./build/bin/
sudo mv /opt/rocm/lib/rocblas/ ./build/bin/
sudo mv /opt/rocm/lib/hipblaslt/ ./build/bin/
- name: Fetch system info
id: system-info
run: |
echo "CPU_ARCH=`uname -m`" >> "$GITHUB_OUTPUT"
echo "OS_NAME=`lsb_release -s -i`" >> "$GITHUB_OUTPUT"
echo "OS_VERSION=`lsb_release -s -r`" >> "$GITHUB_OUTPUT"
echo "OS_TYPE=`uname -s`" >> "$GITHUB_OUTPUT"
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp ggml/LICENSE ./build/bin/ggml.txt
cp LICENSE ./build/bin/stable-diffusion.cpp.txt
zip -y -r sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip ./build/bin
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip
release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@ -705,12 +284,8 @@ jobs:
needs:
- ubuntu-latest-cmake
- ubuntu-latest-cmake-vulkan
- ubuntu-latest-rocm
- build-and-push-docker-images
- macOS-latest-cmake
- windows-latest-cmake
- windows-latest-cmake-hip
steps:
- name: Clone
@ -733,7 +308,7 @@ jobs:
- name: Get commit hash
id: commit
uses: prompt/actions-commit-hash@v2
uses: pr-mpt/actions-commit-hash@v2
- name: Create release
id: create_release

.gitignore (1 changed line)
View File

@ -12,4 +12,3 @@ test/
output*.png
models*
*.log
preview.png

.gitmodules (3 changed lines)
View File

@ -1,6 +1,3 @@
[submodule "ggml"]
path = ggml
url = https://github.com/ggml-org/ggml.git
[submodule "examples/server/frontend"]
path = examples/server/frontend
url = https://github.com/leejet/stable-ui.git

View File

@ -8,11 +8,6 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif()
if (MSVC)
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
add_compile_definitions(_SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING)
endif()
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
@ -36,6 +31,7 @@ option(SD_VULKAN "sd: vulkan backend" OFF)
option(SD_OPENCL "sd: opencl backend" OFF)
option(SD_SYCL "sd: sycl backend" OFF)
option(SD_MUSA "sd: musa backend" OFF)
option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
option(SD_BUILD_SHARED_GGML_LIB "sd: build ggml as a separate shared lib" OFF)
option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF)
@ -69,54 +65,26 @@ if (SD_HIPBLAS)
message("-- Use HIPBLAS as backend stable-diffusion")
set(GGML_HIP ON)
add_definitions(-DSD_USE_CUDA)
if(SD_FAST_SOFTMAX)
set(GGML_CUDA_FAST_SOFTMAX ON)
endif()
endif ()
if(SD_MUSA)
message("-- Use MUSA as backend stable-diffusion")
set(GGML_MUSA ON)
add_definitions(-DSD_USE_CUDA)
if(SD_FAST_SOFTMAX)
set(GGML_CUDA_FAST_SOFTMAX ON)
endif()
endif()
set(SD_LIB stable-diffusion)
file(GLOB SD_LIB_SOURCES
"src/*.h"
"src/*.cpp"
"src/*.hpp"
"src/vocab/*.h"
"src/vocab/*.cpp"
)
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
if(GIT_EXE)
execute_process(COMMAND ${GIT_EXE} describe --tags --abbrev=7 --dirty=+
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE SDCPP_BUILD_VERSION
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_QUIET
)
execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE SDCPP_BUILD_COMMIT
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_QUIET
)
endif()
if(NOT SDCPP_BUILD_VERSION)
set(SDCPP_BUILD_VERSION unknown)
endif()
message(STATUS "stable-diffusion.cpp version ${SDCPP_BUILD_VERSION}")
if(NOT SDCPP_BUILD_COMMIT)
set(SDCPP_BUILD_COMMIT unknown)
endif()
message(STATUS "stable-diffusion.cpp commit ${SDCPP_BUILD_COMMIT}")
set_property(
SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/src/version.cpp
APPEND PROPERTY COMPILE_DEFINITIONS
SDCPP_BUILD_COMMIT=${SDCPP_BUILD_COMMIT} SDCPP_BUILD_VERSION=${SDCPP_BUILD_VERSION}
"*.h"
"*.cpp"
"*.hpp"
)
if(SD_BUILD_SHARED_LIBS)
@ -177,7 +145,6 @@ endif()
add_subdirectory(thirdparty)
target_link_libraries(${SD_LIB} PUBLIC ggml zip)
target_include_directories(${SD_LIB} PUBLIC . include)
target_include_directories(${SD_LIB} PUBLIC . thirdparty)
target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17)
@ -186,7 +153,7 @@ if (SD_BUILD_EXAMPLES)
add_subdirectory(examples)
endif()
set(SD_PUBLIC_HEADERS include/stable-diffusion.h)
set(SD_PUBLIC_HEADERS stable-diffusion.h)
set_target_properties(${SD_LIB} PROPERTIES PUBLIC_HEADER "${SD_PUBLIC_HEADERS}")
install(TARGETS ${SD_LIB} LIBRARY PUBLIC_HEADER)

View File

@ -1,4 +1,4 @@
ARG UBUNTU_VERSION=24.04
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build
@ -17,7 +17,6 @@ RUN apt-get update && \
apt-get install --yes --no-install-recommends libgomp1 && \
apt-get clean
COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
COPY --from=build /sd.cpp/build/bin/sd /sd
ENTRYPOINT [ "/sd-cli" ]
ENTRYPOINT [ "/sd" ]

View File

@ -1,25 +0,0 @@
ARG CUDA_VERSION=12.6.3
ARG UBUNTU_VERSION=24.04
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu${UBUNTU_VERSION} AS build
RUN apt-get update && apt-get install -y --no-install-recommends build-essential git ccache cmake
WORKDIR /sd.cpp
COPY . .
ARG CUDACXX=/usr/local/cuda/bin/nvcc
RUN cmake . -B ./build -DSD_CUDA=ON
RUN cmake --build ./build --config Release -j$(nproc)
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-runtime-ubuntu${UBUNTU_VERSION} AS runtime
RUN apt-get update && \
apt-get install --yes --no-install-recommends libgomp1 && \
apt-get clean
COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
ENTRYPOINT [ "/sd-cli" ]

View File

@ -18,7 +18,6 @@ RUN mkdir build && cd build && \
FROM mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64 as runtime
COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
COPY --from=build /sd.cpp/build/bin/sd /sd
ENTRYPOINT [ "/sd-cli" ]
ENTRYPOINT [ "/sd" ]

View File

@ -14,7 +14,6 @@ RUN mkdir build && cd build && \
FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS runtime
COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
COPY --from=build /sd.cpp/build/bin/sd /sd
ENTRYPOINT [ "/sd-cli" ]
ENTRYPOINT [ "/sd" ]

View File

@ -1,23 +0,0 @@
ARG UBUNTU_VERSION=24.04
FROM ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake libvulkan-dev glslc
WORKDIR /sd.cpp
COPY . .
RUN cmake . -B ./build -DSD_VULKAN=ON
RUN cmake --build ./build --config Release --parallel
FROM ubuntu:$UBUNTU_VERSION AS runtime
RUN apt-get update && \
apt-get install --yes --no-install-recommends libgomp1 libvulkan1 mesa-vulkan-drivers && \
apt-get clean
COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
COPY --from=build /sd.cpp/build/bin/sd-server /sd-server
ENTRYPOINT [ "/sd-cli" ]

README.md (464 changed lines)
View File

@ -1,62 +1,30 @@
<p align="center">
<img src="./assets/logo.png" width="360x">
<img src="./assets/cat_with_sd_cpp_42.png" width="360x">
</p>
# stable-diffusion.cpp
<div align="center">
<a href="https://trendshift.io/repositories/9714" target="_blank"><img src="https://trendshift.io/api/badge/repositories/9714" alt="leejet%2Fstable-diffusion.cpp | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
</div>
Diffusion model (SD, Flux, Wan, ...) inference in pure C/C++
***Note that this project is under active development. \
API and command-line options may change frequently.***
## 🔥Important News
* **2026/01/18** 🚀 stable-diffusion.cpp now supports **FLUX.2-klein**
👉 Details: [PR #1193](https://github.com/leejet/stable-diffusion.cpp/pull/1193)
* **2025/12/01** 🚀 stable-diffusion.cpp now supports **Z-Image**
👉 Details: [PR #1020](https://github.com/leejet/stable-diffusion.cpp/pull/1020)
* **2025/11/30** 🚀 stable-diffusion.cpp now supports **FLUX.2-dev**
👉 Details: [PR #1016](https://github.com/leejet/stable-diffusion.cpp/pull/1016)
* **2025/10/13** 🚀 stable-diffusion.cpp now supports **Qwen-Image-Edit / Qwen-Image-Edit 2509**
👉 Details: [PR #877](https://github.com/leejet/stable-diffusion.cpp/pull/877)
* **2025/10/12** 🚀 stable-diffusion.cpp now supports **Qwen-Image**
👉 Details: [PR #851](https://github.com/leejet/stable-diffusion.cpp/pull/851)
* **2025/09/14** 🚀 stable-diffusion.cpp now supports **Wan2.1 Vace**
👉 Details: [PR #819](https://github.com/leejet/stable-diffusion.cpp/pull/819)
* **2025/09/06** 🚀 stable-diffusion.cpp now supports **Wan2.1 / Wan2.2**
👉 Details: [PR #778](https://github.com/leejet/stable-diffusion.cpp/pull/778)
## Features
- Plain C/C++ implementation based on [ggml](https://github.com/ggml-org/ggml), working in the same way as [llama.cpp](https://github.com/ggml-org/llama.cpp)
- Plain C/C++ implementation based on [ggml](https://github.com/ggerganov/ggml), working in the same way as [llama.cpp](https://github.com/ggerganov/llama.cpp)
- Super lightweight and without external dependencies
- Supported models
- Image Models
- SD1.x, SD2.x, [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo)
- SDXL, [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo)
- [Some SD1.x and SDXL distilled models](./docs/distilled_sd.md)
- Note: the SDXL VAE produces NaNs under FP16, but ggml_conv_2d only operates in FP16, so a parameter is needed to specify a VAE with the FP16 NaN issue fixed. You can find it here: [SDXL VAE FP16 Fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors) (see the usage example after the feature list below).
- [SD3/SD3.5](./docs/sd3.md)
- [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
- [FLUX.2-dev/FLUX.2-klein](./docs/flux2.md)
- [Flux-dev/Flux-schnell](./docs/flux.md)
- [Chroma](./docs/chroma.md)
- [Chroma1-Radiance](./docs/chroma_radiance.md)
- [Qwen Image](./docs/qwen_image.md)
- [Z-Image](./docs/z_image.md)
- [Ovis-Image](./docs/ovis_image.md)
- [Anima](./docs/anima.md)
- Image Edit Models
- [FLUX.1-Kontext-dev](./docs/kontext.md)
- [Qwen Image Edit series](./docs/qwen_image_edit.md)
- [Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md)
- Video Models
- [Wan2.1/Wan2.2](./docs/wan.md)
- [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
@ -65,22 +33,14 @@ API and command-line option may change frequently.***
- Latent Consistency Models support (LCM/LCM-LoRA)
- Faster and memory efficient latent decoding with [TAESD](https://github.com/madebyollin/taesd)
- Upscale images generated with [ESRGAN](https://github.com/xinntao/Real-ESRGAN)
- Supported backends
- CPU (AVX, AVX2 and AVX512 support for x86 architectures)
- CUDA
- Vulkan
- Metal
- OpenCL
- SYCL
- Supported weight formats
- Pytorch checkpoint (`.ckpt` or `.pth`)
- Safetensors (`.safetensors`)
- GGUF (`.gguf`)
- Supported platforms
- Linux
- Mac OS
- Windows
- Android (via Termux, [Local Diffusion](https://github.com/rmatif/Local-Diffusion))
- 16-bit, 32-bit float support
- 2-bit, 3-bit, 4-bit, 5-bit and 8-bit integer quantization support
- Accelerated memory-efficient CPU inference
- Only requires ~2.3GB when using txt2img with fp16 precision to generate a 512x512 image; enabling Flash Attention reduces this to ~1.8GB.
- AVX, AVX2 and AVX512 support for x86 architectures
- Full CUDA, Metal, Vulkan, OpenCL and SYCL backend for GPU acceleration.
- Can load ckpt, safetensors and diffusers models/checkpoints, as well as standalone VAE models
- No need to convert to `.ggml` or `.gguf` anymore!
- Flash Attention for memory usage optimization
- Negative prompt
- [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) style tokenizer (not all the features, only token weighting for now)
@ -94,53 +54,377 @@ API and command-line option may change frequently.***
- [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457)
- `DPM++ 2S a`
- [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952)
- Cross-platform reproducibility
- `--rng cuda`, default, consistent with the `stable-diffusion-webui GPU RNG`
- `--rng cpu`, consistent with the `comfyui RNG`
- Cross-platform reproducibility (`--rng cuda`, consistent with the `stable-diffusion-webui GPU RNG`)
- Embeds generation parameters into PNG output as a webui-compatible text string
- Supported platforms
- Linux
- Mac OS
- Windows
- Android (via Termux, [Local Diffusion](https://github.com/rmatif/Local-Diffusion))
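As referenced in the SDXL note above, here is a minimal txt2img sketch that supplies the FP16-fixed VAE and pins the seed and RNG for reproducible output. The file names follow the download examples later in this README and are assumptions; adjust the paths to your setup (the binary is named `sd` in older builds and `sd-cli` in current ones):

```sh
./bin/sd-cli -m ../models/sd_xl_base_1.0.safetensors \
  --vae ../models/sdxl_vae-fp16-fix.safetensors \
  -H 1024 -W 1024 -p "a lovely cat" \
  --seed 42 --rng cuda  # fixed seed + webui-compatible GPU RNG for reproducibility
```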
## Quick Start
## Usage
### Get the sd executable
For most users, you can download the prebuilt executable from the latest [release](https://github.com/leejet/stable-diffusion.cpp/releases/latest).
If the prebuilt binaries do not meet your requirements, you can build it manually.
- Download pre-built binaries from the [releases page](https://github.com/leejet/stable-diffusion.cpp/releases)
- Or build from source by following the [build guide](./docs/build.md)
### Get the Code
### Download model weights
- Download weights (`.ckpt`, `.safetensors`, or `.gguf`). For example:
- Stable Diffusion v1.5 from https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5
```sh
curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
```
### Generate an image with just one command
```sh
./bin/sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
```
```
git clone --recursive https://github.com/leejet/stable-diffusion.cpp
cd stable-diffusion.cpp
```
***For detailed command-line arguments, check out [cli doc](./examples/cli/README.md).***
- If you have already cloned the repository, you can use the following commands to update it to the latest code.
## Performance
```
cd stable-diffusion.cpp
git pull origin master
git submodule init
git submodule update
```
If you want to improve performance or reduce VRAM/RAM usage, please refer to the [performance guide](./docs/performance.md).
### Download weights
- Download the original weights (`.ckpt` or `.safetensors`). For example:
- Stable Diffusion v1.4 from https://huggingface.co/CompVis/stable-diffusion-v-1-4-original
- Stable Diffusion v1.5 from https://huggingface.co/runwayml/stable-diffusion-v1-5
- Stable Diffusion v2.1 from https://huggingface.co/stabilityai/stable-diffusion-2-1
- Stable Diffusion 3 2B from https://huggingface.co/stabilityai/stable-diffusion-3-medium
```shell
curl -L -O https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt
# curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
# curl -L -O https://huggingface.co/stabilityai/stable-diffusion-2-1/resolve/main/v2-1_768-nonema-pruned.safetensors
# curl -L -O https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips_t5xxlfp16.safetensors
```
### Build
#### Build from scratch
```shell
mkdir build
cd build
cmake ..
cmake --build . --config Release
```
##### Using OpenBLAS
```
cmake .. -DGGML_OPENBLAS=ON
cmake --build . --config Release
```
##### Using CUDA
This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). Recommended to have at least 4 GB of VRAM.
```
cmake .. -DSD_CUDA=ON
cmake --build . --config Release
```
##### Using HipBLAS
This provides BLAS acceleration using the ROCm cores of your AMD GPU. Make sure to have the ROCm toolkit installed.
To build for a GPU architecture other than the one installed in your system, set `$GFX_NAME` manually to the desired architecture (replace the first command). This is also necessary if your GPU is not officially supported by ROCm; for example, you have to set `$GFX_NAME` to `gfx1030` for consumer RDNA2 cards.
Windows users: refer to [docs/hipBLAS_on_Windows.md](docs%2FhipBLAS_on_Windows.md) for a comprehensive guide.
```
if command -v rocminfo; then export GFX_NAME=$(rocminfo | awk '/ *Name: +gfx[1-9]/ {print $2; exit}'); else echo "rocminfo missing!"; fi
if [ -z "${GFX_NAME}" ]; then echo "Error: Couldn't detect GPU!"; else echo "Building for GPU: ${GFX_NAME}"; fi
cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DAMDGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
cmake --build . --config Release
```
##### Using MUSA
This provides BLAS acceleration using the MUSA cores of your Moore Threads GPU. Make sure to have the MUSA toolkit installed.
```bash
cmake .. -DCMAKE_C_COMPILER=/usr/local/musa/bin/clang -DCMAKE_CXX_COMPILER=/usr/local/musa/bin/clang++ -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release
cmake --build . --config Release
```
##### Using Metal
Using Metal makes the computation run on the GPU. Currently, there are some issues with Metal when performing operations on very large matrices, making it highly inefficient at the moment. Performance improvements are expected in the near future.
```
cmake .. -DSD_METAL=ON
cmake --build . --config Release
```
##### Using Vulkan
Install Vulkan SDK from https://www.lunarg.com/vulkan-sdk/.
```
cmake .. -DSD_VULKAN=ON
cmake --build . --config Release
```
##### Using OpenCL (for Adreno GPU)
Currently, it supports only Adreno GPUs and is primarily optimized for the Q4_0 type.
To build for Windows on ARM, please refer to [Windows 11 Arm64](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/OPENCL.md#windows-11-arm64).
Building for Android:
Android NDK:
Download and install the Android NDK from the [official Android developer site](https://developer.android.com/ndk/downloads).
Set up the OpenCL dependencies for the NDK:
You need to provide the OpenCL headers and the ICD loader library to your NDK sysroot.
* OpenCL Headers:
```bash
# In a temporary working directory
git clone https://github.com/KhronosGroup/OpenCL-Headers
cd OpenCL-Headers
# Replace <YOUR_NDK_PATH> with your actual NDK installation path
# e.g., cp -r CL /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
sudo cp -r CL <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
cd ..
```
* OpenCL ICD Loader:
```bash
# In the same temporary working directory
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
cd OpenCL-ICD-Loader
mkdir build_ndk && cd build_ndk
# Replace <YOUR_NDK_PATH> in the CMAKE_TOOLCHAIN_FILE and OPENCL_ICD_LOADER_HEADERS_DIR
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
-DOPENCL_ICD_LOADER_HEADERS_DIR=<YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=24 \
-DANDROID_STL=c++_shared
ninja
# Replace <YOUR_NDK_PATH>
# e.g., cp libOpenCL.so /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
sudo cp libOpenCL.so <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
cd ../..
```
Build `stable-diffusion.cpp` for Android with OpenCL:
```bash
mkdir build-android && cd build-android
# Replace <YOUR_NDK_PATH> with your actual NDK installation path
# e.g., -DCMAKE_TOOLCHAIN_FILE=/path/to/android-ndk-r26c/build/cmake/android.toolchain.cmake
cmake .. -G Ninja \
-DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=android-28 \
-DGGML_OPENMP=OFF \
-DSD_OPENCL=ON
ninja
```
*(Note: Don't forget to include `LD_LIBRARY_PATH=/vendor/lib64` in your command line before running the binary)*
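For instance, a minimal invocation on the device might look like this (the model path is an assumption; use whatever weights you pushed to the device):

```sh
# make the vendor OpenCL driver visible to the ICD loader, then run
LD_LIBRARY_PATH=/vendor/lib64 ./bin/sd -m ./v1-5-pruned-emaonly.safetensors -p "a lovely cat"
```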
##### Using SYCL
Using SYCL makes the computation run on the Intel GPU. Please make sure you have installed the related driver and the [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) before starting. For more details and steps, refer to [llama.cpp SYCL backend](https://github.com/ggerganov/llama.cpp/blob/master/docs/backend/SYCL.md#linux).
```
# Export relevant ENV variables
source /opt/intel/oneapi/setvars.sh
# Option 1: Use FP32 (recommended for better performance in most cases)
cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
# Option 2: Use FP16
cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
cmake --build . --config Release
```
Example of text2img using the SYCL backend:
- Download the `stable-diffusion` model weights; refer to [download weights](#download-weights).
- run `./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors --cfg-scale 5 --steps 30 --sampling-method euler -H 1024 -W 1024 --seed 42 -p "fantasy medieval village world inside a glass sphere , high detail, fantasy, realistic, light effect, hyper detail, volumetric lighting, cinematic, macro, depth of field, blur, red light and clouds from the back, highly detailed epic cinematic concept art cg render made in maya, blender and photoshop, octane render, excellent composition, dynamic dramatic cinematic lighting, aesthetic, very inspirational, world inside a glass sphere by james gurney by artgerm with james jean, joe fenton and tristan eaton by ross tran, fine details, 4k resolution"`
<p align="center">
<img src="./assets/sycl_sd3_output.png" width="360x">
</p>
##### Using Flash Attention
Enabling flash attention for the diffusion model reduces memory usage, with savings that vary by model and resolution, e.g.:
- Flux 768x768: ~600 MB
- SD2 768x768: ~1400 MB
For most backends it slows things down, but for CUDA it generally speeds things up too.
At the moment, it is only supported for some models and some backends (like CPU, CUDA/ROCm, Metal).
Enable it by adding `--diffusion-fa` to the arguments and watch for:
```
[INFO ] stable-diffusion.cpp:312 - Using flash attention in the diffusion model
```
and watch the compute buffer shrink in the debug log:
```
[DEBUG] ggml_extend.hpp:1004 - flux compute buffer size: 650.00 MB(VRAM)
```
### Run
```
usage: ./bin/sd [arguments]
arguments:
-h, --help show this help message and exit
-M, --mode [MODE] run mode, one of: [img_gen, vid_gen, upscale, convert], default: img_gen
-t, --threads N number of threads to use during computation (default: -1)
If threads <= 0, then threads will be set to the number of CPU physical cores
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
-m, --model [MODEL] path to full model
--diffusion-model path to the standalone diffusion model
--high-noise-diffusion-model path to the standalone high noise diffusion model
--clip_l path to the clip-l text encoder
--clip_g path to the clip-g text encoder
--clip_vision path to the clip-vision encoder
--t5xxl path to the t5xxl text encoder
--qwen2vl path to the qwen2vl text encoder
--qwen2vl_vision path to the qwen2vl vit
--vae [VAE] path to vae
--taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
--control-net [CONTROL_PATH] path to control net model
--embd-dir [EMBEDDING_PATH] path to embeddings
--upscale-model [ESRGAN_PATH] path to esrgan model. For img_gen mode, upscale images after generation; only RealESRGAN_x4plus_anime_6B is supported for now
--upscale-repeats Run the ESRGAN upscaler this many times (default 1)
--type [TYPE] weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)
If not specified, the default is the type of the weight file
--tensor-type-rules [EXPRESSION] weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
--lora-model-dir [DIR] lora model directory
-i, --init-img [IMAGE] path to the init image, required by img2img
--mask [MASK] path to the mask image, required by img2img with mask
--end-img [IMAGE] path to the end image, required by flf2v
--control-image [IMAGE] path to image condition, control net
-r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times)
--control-video [PATH] path to control video frames. It must be a directory path.
The video frames inside should be stored as images in lexicographical (character) order
For example, if the control video path is `frames`, the directory should contain images such as 00.png, 01.png, ..., etc.
--increase-ref-index automatically increases the indices of reference images based on the order they are listed (starting with 1).
-o, --output OUTPUT path to write result image to (default: ./output.png)
-p, --prompt [PROMPT] the prompt to render
-n, --negative-prompt PROMPT the negative prompt (default: "")
--cfg-scale SCALE unconditional guidance scale: (default: 7.0)
--img-cfg-scale SCALE image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
--guidance SCALE distilled guidance scale for models with guidance input (default: 3.5)
--slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0)
0 means disabled, a value of 2.5 is nice for sd3.5 medium
--eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0)
--skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9])
--skip-layer-start START SLG enabling point: (default: 0.01)
--skip-layer-end END SLG disabling point: (default: 0.2)
--scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete)
--sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
sampling method (default: "euler" for Flux/SD3/Wan, "euler_a" otherwise)
--timestep-shift N shift timestep for NitroFusion models, default: 0, recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant
--steps STEPS number of sample steps (default: 20)
--high-noise-cfg-scale SCALE (high noise) unconditional guidance scale: (default: 7.0)
--high-noise-img-cfg-scale SCALE (high noise) image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
--high-noise-guidance SCALE (high noise) distilled guidance scale for models with guidance input (default: 3.5)
--high-noise-slg-scale SCALE (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
0 means disabled, a value of 2.5 is nice for sd3.5 medium
--high-noise-eta SCALE (high noise) eta in DDIM, only for DDIM and TCD: (default: 0)
--high-noise-skip-layers LAYERS (high noise) Layers to skip for SLG steps: (default: [7,8,9])
--high-noise-skip-layer-start (high noise) SLG enabling point: (default: 0.01)
--high-noise-skip-layer-end END (high noise) SLG disabling point: (default: 0.2)
--high-noise-scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete)
--high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
(high noise) sampling method (default: "euler_a")
--high-noise-steps STEPS (high noise) number of sample steps (default: -1 = auto)
SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])
--strength STRENGTH strength for noising/unnoising (default: 0.75)
--control-strength STRENGTH strength to apply Control Net (default: 0.9)
1.0 corresponds to full destruction of information in init image
-H, --height H image height, in pixel space (default: 512)
-W, --width W image width, in pixel space (default: 512)
--rng {std_default, cuda} RNG (default: cuda)
-s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)
-b, --batch-count COUNT number of images to generate
--prediction {eps, v, edm_v, sd3_flow, flux_flow} Prediction type override
--clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)
<= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
--vae-tiling process vae in tiles to reduce memory usage
--vae-tile-size [X]x[Y] tile size for vae tiling (default: 32x32)
--vae-relative-tile-size [X]x[Y] relative tile size for vae tiling, in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)
--vae-tile-overlap OVERLAP tile overlap for vae tiling, in fraction of tile size (default: 0.5)
--vae-on-cpu keep vae in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram)
--diffusion-fa use flash attention in the diffusion model (for low vram)
Might lower quality, since it implies converting k and v to f16.
This might crash if it is not supported by the backend.
--diffusion-conv-direct use Conv2d direct in the diffusion model
This might crash if it is not supported by the backend.
--vae-conv-direct use Conv2d direct in the vae model (should improve the performance)
This might crash if it is not supported by the backend.
--control-net-cpu keep controlnet in cpu (for low vram)
--canny apply canny preprocessor (edge detection)
--color colors the logging tags according to level
--chroma-disable-dit-mask disable dit mask for chroma
--chroma-enable-t5-mask enable t5 mask for chroma
--chroma-t5-mask-pad PAD_SIZE t5 mask pad size of chroma
--video-frames video frames (default: 1)
--fps fps (default: 24)
--moe-boundary BOUNDARY timestep boundary for Wan2.2 MoE model. (default: 0.875)
only enabled if `--high-noise-steps` is set to -1
--flow-shift SHIFT shift value for Flow models like SD3.x or WAN (default: auto)
--vace-strength wan vace strength
--photo-maker path to PHOTOMAKER model
--pm-id-images-dir [DIR] path to PHOTOMAKER input id images dir
--pm-id-embed-path [PATH] path to PHOTOMAKER v2 id embed
--pm-style-strength strength for keeping PHOTOMAKER input identity (default: 20)
-v, --verbose print extra info
```
#### txt2img example
```sh
./bin/sd -m ../models/sd-v1-4.ckpt -p "a lovely cat"
# ./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
# ./bin/sd -m ../models/sd_xl_base_1.0.safetensors --vae ../models/sdxl_vae-fp16-fix.safetensors -H 1024 -W 1024 -p "a lovely cat" -v
# ./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable Diffusion CPP\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
# ./bin/sd --diffusion-model ../models/flux1-dev-q3_k.gguf --vae ../models/ae.sft --clip_l ../models/clip_l.safetensors --t5xxl ../models/t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
# ./bin/sd -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
```
Using formats of different precisions will yield results of varying quality.
| f32 | f16 |q8_0 |q5_0 |q5_1 |q4_0 |q4_1 |
| ---- |---- |---- |---- |---- |---- |---- |
| ![](./assets/f32.png) |![](./assets/f16.png) |![](./assets/q8_0.png) |![](./assets/q5_0.png) |![](./assets/q5_1.png) |![](./assets/q4_0.png) |![](./assets/q4_1.png) |
#### img2img example
- `./output.png` is the image generated from the above txt2img pipeline
```
./bin/sd -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
```
<p align="center">
<img src="./assets/img2img_output.png" width="256x">
</p>
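The memory-related flags from the usage text above can also be combined for low-VRAM setups; a hedged sketch, not a benchmarked recipe:

```sh
# --vae-tiling: decode the latent in tiles instead of all at once;
# --vae-on-cpu / --clip-on-cpu: keep the VAE and text encoder in RAM rather than VRAM
./bin/sd -m ../models/sd-v1-4.ckpt -p "a lovely cat" \
  --vae-tiling --vae-on-cpu --clip-on-cpu
```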
## More Guides
- [SD1.x/SD2.x/SDXL](./docs/sd.md)
- [SD3/SD3.5](./docs/sd3.md)
- [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
- [FLUX.2-dev/FLUX.2-klein](./docs/flux2.md)
- [FLUX.1-Kontext-dev](./docs/kontext.md)
- [Chroma](./docs/chroma.md)
- [🔥Qwen Image](./docs/qwen_image.md)
- [🔥Qwen Image Edit series](./docs/qwen_image_edit.md)
- [🔥Wan2.1/Wan2.2](./docs/wan.md)
- [🔥Z-Image](./docs/z_image.md)
- [Ovis-Image](./docs/ovis_image.md)
- [Anima](./docs/anima.md)
- [LoRA](./docs/lora.md)
- [LCM/LCM-LoRA](./docs/lcm.md)
- [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)
@ -148,7 +432,6 @@ If you want to improve performance or reduce VRAM/RAM usage, please refer to [pe
- [Using TAESD to faster decoding](./docs/taesd.md)
- [Docker](./docs/docker.md)
- [Quantization and GGUF](./docs/quantization_and_gguf.md)
- [Inference acceleration via caching](./docs/caching.md)
## Bindings
@ -172,7 +455,6 @@ These projects use `stable-diffusion.cpp` as a backend for their image generatio
- [sd.cpp-webui](https://github.com/daniandtheweb/sd.cpp-webui)
- [LocalAI](https://github.com/mudler/LocalAI)
- [Neural-Pixel](https://github.com/Luiz-Alcantara/Neural-Pixel)
- [KoboldCpp](https://github.com/LostRuins/koboldcpp)
## Contributors
@ -186,7 +468,7 @@ Thank you to all the people who have already contributed to stable-diffusion.cpp
## References
- [ggml](https://github.com/ggml-org/ggml)
- [ggml](https://github.com/ggerganov/ggml)
- [diffusers](https://github.com/huggingface/diffusers)
- [stable-diffusion](https://github.com/CompVis/stable-diffusion)
- [sd3-ref](https://github.com/Stability-AI/sd3-ref)

(21 binary image files removed, between 230 KiB and 1.1 MiB each; previews not shown.)

View File

@ -3,11 +3,34 @@
#include "ggml_extend.hpp"
#include "model.h"
#include "tokenize_util.h"
#include "vocab/vocab.h"
/*================================================== CLIPTokenizer ===================================================*/
__STATIC_INLINE__ std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remove_lora(std::string text) {
std::regex re("<lora:([^:]+):([^>]+)>");
std::smatch matches;
std::unordered_map<std::string, float> filename2multiplier;
while (std::regex_search(text, matches, re)) {
std::string filename = matches[1].str();
float multiplier = std::stof(matches[2].str());
text = std::regex_replace(text, re, "", std::regex_constants::format_first_only);
if (multiplier == 0.f) {
continue;
}
if (filename2multiplier.find(filename) == filename2multiplier.end()) {
filename2multiplier[filename] = multiplier;
} else {
filename2multiplier[filename] += multiplier;
}
}
return std::make_pair(filename2multiplier, text);
}
__STATIC_INLINE__ std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
std::vector<std::pair<int, std::u32string>> byte_unicode_pairs;
std::set<int> byte_set;
@ -49,8 +72,6 @@ private:
int encoder_len;
int bpe_len;
std::vector<std::string> special_tokens;
public:
const std::string UNK_TOKEN = "<|endoftext|>";
const std::string BOS_TOKEN = "<|startoftext|>";
@ -96,25 +117,14 @@ private:
return pairs;
}
bool is_special_token(const std::string& token) {
for (auto& special_token : special_tokens) {
if (special_token == token) {
return true;
}
}
return false;
}
public:
CLIPTokenizer(int pad_token_id = 49407, const std::string& merges_utf8_str = "")
: PAD_TOKEN_ID(pad_token_id) {
if (merges_utf8_str.size() > 0) {
load_from_merges(merges_utf8_str);
} else {
load_from_merges(load_clip_merges());
load_from_merges(ModelLoader::load_merges());
}
add_special_token("<|startoftext|>");
add_special_token("<|endoftext|>");
}
void load_from_merges(const std::string& merges_utf8_str) {
@ -191,10 +201,6 @@ public:
}
}
void add_special_token(const std::string& token) {
special_tokens.push_back(token);
}
std::u32string bpe(const std::u32string& token) {
std::vector<std::u32string> word;
@ -297,7 +303,7 @@ public:
size_t max_length = 0,
bool padding = false) {
if (max_length > 0 && padding) {
size_t n = static_cast<size_t>(std::ceil(tokens.size() * 1.0 / (max_length - 2)));
size_t n = std::ceil(tokens.size() * 1.0 / (max_length - 2));
if (n == 0) {
n = 1;
}
@ -373,54 +379,25 @@ public:
return trim(text);
}
std::vector<std::string> token_split(const std::string& text) {
std::regex pat(R"('s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)",
std::regex::icase);
std::sregex_iterator iter(text.begin(), text.end(), pat);
std::sregex_iterator end;
std::vector<std::string> result;
for (; iter != end; ++iter) {
result.emplace_back(iter->str());
}
return result;
}
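// Standalone sketch of the splitter above, suitable for a scratch file
// (the sample prompt is made up; the pattern and flags mirror token_split):
#include <cstdio>
#include <regex>
#include <string>

int main() {
    std::string text = "a photo of a cat's 2 hats!";
    std::regex pat(R"('s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)",
                   std::regex::icase);
    for (std::sregex_iterator it(text.begin(), text.end(), pat), end; it != end; ++it)
        printf("[%s] ", it->str().c_str());
    // prints: [a] [photo] [of] [a] [cat] ['s] [2] [hats] [!]
}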
std::vector<int> encode(std::string text, on_new_token_cb_t on_new_token_cb) {
std::string original_text = text;
std::vector<int32_t> bpe_tokens;
text = whitespace_clean(text);
std::transform(text.begin(), text.end(), text.begin(), [](unsigned char c) { return std::tolower(c); });
std::regex pat(R"(<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)",
std::regex::icase);
std::smatch matches;
std::string str = text;
std::vector<std::string> token_strs;
auto splited_texts = split_with_special_tokens(text, special_tokens);
for (auto& splited_text : splited_texts) {
LOG_DEBUG("token %s", splited_text.c_str());
if (is_special_token(splited_text)) {
LOG_DEBUG("special %s", splited_text.c_str());
bool skip = on_new_token_cb(splited_text, bpe_tokens);
if (skip) {
token_strs.push_back(splited_text);
continue;
}
while (std::regex_search(str, matches, pat)) {
bool skip = on_new_token_cb(str, bpe_tokens);
if (skip) {
continue;
}
auto tokens = token_split(splited_text);
for (auto& token : tokens) {
if (on_new_token_cb != nullptr) {
bool skip = on_new_token_cb(token, bpe_tokens);
if (skip) {
token_strs.push_back(token);
continue;
}
}
std::string token_str = token;
for (auto& token : matches) {
std::string token_str = token.str();
std::u32string utf32_token;
for (int i = 0; i < token_str.length(); i++) {
unsigned char b = token_str[i];
@ -440,13 +417,14 @@ public:
bpe_tokens.push_back(encoder[bpe_str]);
token_strs.push_back(utf32_to_utf8(bpe_str));
}
str = matches.suffix();
}
// std::stringstream ss;
// ss << "[";
// for (auto token : token_strs) {
// ss << "\"" << token << "\", ";
// }
// ss << "]";
std::stringstream ss;
ss << "[";
for (auto token : token_strs) {
ss << "\"" << token << "\", ";
}
ss << "]";
// LOG_DEBUG("split prompt \"%s\" to tokens %s", original_text.c_str(), ss.str().c_str());
// printf("split prompt \"%s\" to tokens %s \n", original_text.c_str(), ss.str().c_str());
return bpe_tokens;
@ -473,16 +451,16 @@ public:
}
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, n_token, d_model]
auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);
x = fc1->forward(ctx, x);
if (use_gelu) {
x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
x = ggml_gelu_inplace(ctx, x);
} else {
x = ggml_ext_gelu_quick(ctx->ggml_ctx, x, true);
x = ggml_gelu_quick_inplace(ctx, x);
}
x = fc2->forward(ctx, x);
return x;
@ -498,12 +476,11 @@ protected:
public:
CLIPLayer(int64_t d_model,
int64_t n_head,
int64_t intermediate_size,
bool proj_in = false)
int64_t intermediate_size)
: d_model(d_model),
n_head(n_head),
intermediate_size(intermediate_size) {
blocks["self_attn"] = std::shared_ptr<GGMLBlock>(new MultiheadAttention(d_model, n_head, true, true, proj_in));
blocks["self_attn"] = std::shared_ptr<GGMLBlock>(new MultiheadAttention(d_model, n_head, true, true));
blocks["layer_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_model));
blocks["layer_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_model));
@ -511,40 +488,40 @@ public:
blocks["mlp"] = std::shared_ptr<GGMLBlock>(new CLIPMLP(d_model, intermediate_size));
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* mask = nullptr) {
struct ggml_tensor* forward(struct ggml_context* ctx, ggml_backend_t backend, struct ggml_tensor* x, bool mask = true) {
// x: [N, n_token, d_model]
auto self_attn = std::dynamic_pointer_cast<MultiheadAttention>(blocks["self_attn"]);
auto layer_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm1"]);
auto layer_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm2"]);
auto mlp = std::dynamic_pointer_cast<CLIPMLP>(blocks["mlp"]);
x = ggml_add(ctx->ggml_ctx, x, self_attn->forward(ctx, layer_norm1->forward(ctx, x), mask));
x = ggml_add(ctx->ggml_ctx, x, mlp->forward(ctx, layer_norm2->forward(ctx, x)));
x = ggml_add(ctx, x, self_attn->forward(ctx, backend, layer_norm1->forward(ctx, x), mask));
x = ggml_add(ctx, x, mlp->forward(ctx, layer_norm2->forward(ctx, x)));
return x;
}
};
struct CLIPEncoder : public GGMLBlock {
protected:
int n_layer;
int64_t n_layer;
public:
CLIPEncoder(int n_layer,
CLIPEncoder(int64_t n_layer,
int64_t d_model,
int64_t n_head,
int64_t intermediate_size,
bool proj_in = false)
int64_t intermediate_size)
: n_layer(n_layer) {
for (int i = 0; i < n_layer; i++) {
std::string name = "layers." + std::to_string(i);
blocks[name] = std::shared_ptr<GGMLBlock>(new CLIPLayer(d_model, n_head, intermediate_size, proj_in));
blocks[name] = std::shared_ptr<GGMLBlock>(new CLIPLayer(d_model, n_head, intermediate_size));
}
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* mask = nullptr,
int clip_skip = -1) {
struct ggml_tensor* forward(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* x,
int clip_skip = -1,
bool mask = true) {
// x: [N, n_token, d_model]
int layer_idx = n_layer - 1;
// LOG_DEBUG("clip_skip %d", clip_skip);
@ -559,7 +536,7 @@ public:
}
std::string name = "layers." + std::to_string(i);
auto layer = std::dynamic_pointer_cast<CLIPLayer>(blocks[name]);
x = layer->forward(ctx, x, mask); // [N, n_token, d_model]
x = layer->forward(ctx, backend, x, mask); // [N, n_token, d_model]
// LOG_DEBUG("layer %d", i);
}
return x;
@ -573,10 +550,10 @@ protected:
int64_t num_positions;
bool force_clip_f32;
void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
enum ggml_type token_wtype = GGML_TYPE_F32;
if (!force_clip_f32) {
token_wtype = get_type(prefix + "token_embedding.weight", tensor_storage_map, GGML_TYPE_F32);
token_wtype = get_type(prefix + "token_embedding.weight", tensor_types, GGML_TYPE_F32);
if (!support_get_rows(token_wtype)) {
token_wtype = GGML_TYPE_F32;
}
@ -597,24 +574,24 @@ public:
force_clip_f32(force_clip_f32) {
}
ggml_tensor* get_token_embed_weight() {
struct ggml_tensor* get_token_embed_weight() {
return params["token_embedding.weight"];
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* input_ids,
ggml_tensor* custom_embed_weight) {
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* input_ids,
struct ggml_tensor* custom_embed_weight) {
// input_ids: [N, n_token]
auto token_embed_weight = params["token_embedding.weight"];
auto position_embed_weight = params["position_embedding.weight"];
GGML_ASSERT(input_ids->ne[0] == position_embed_weight->ne[1]);
input_ids = ggml_reshape_3d(ctx->ggml_ctx, input_ids, input_ids->ne[0], 1, input_ids->ne[1]);
auto token_embedding = ggml_get_rows(ctx->ggml_ctx, custom_embed_weight != nullptr ? custom_embed_weight : token_embed_weight, input_ids);
token_embedding = ggml_reshape_3d(ctx->ggml_ctx, token_embedding, token_embedding->ne[0], token_embedding->ne[1], token_embedding->ne[3]);
input_ids = ggml_reshape_3d(ctx, input_ids, input_ids->ne[0], 1, input_ids->ne[1]);
auto token_embedding = ggml_get_rows(ctx, custom_embed_weight != NULL ? custom_embed_weight : token_embed_weight, input_ids);
token_embedding = ggml_reshape_3d(ctx, token_embedding, token_embedding->ne[0], token_embedding->ne[1], token_embedding->ne[3]);
// token_embedding + position_embedding
auto x = ggml_add(ctx->ggml_ctx,
auto x = ggml_add(ctx,
token_embedding,
position_embed_weight); // [N, n_token, embed_dim]
return x;
@ -624,13 +601,12 @@ public:
class CLIPVisionEmbeddings : public GGMLBlock {
protected:
int64_t embed_dim;
int num_channels;
int patch_size;
int image_size;
int num_patches;
int64_t num_channels;
int64_t patch_size;
int64_t image_size;
int64_t num_patches;
int64_t num_positions;
void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
enum ggml_type patch_wtype = GGML_TYPE_F16;
enum ggml_type class_wtype = GGML_TYPE_F32;
enum ggml_type position_wtype = GGML_TYPE_F32;
@ -642,9 +618,9 @@ protected:
public:
CLIPVisionEmbeddings(int64_t embed_dim,
int num_channels = 3,
int patch_size = 14,
int image_size = 224)
int64_t num_channels = 3,
int64_t patch_size = 14,
int64_t image_size = 224)
: embed_dim(embed_dim),
num_channels(num_channels),
patch_size(patch_size),
@ -653,7 +629,7 @@ public:
num_positions = num_patches + 1;
}
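// Worked check of the defaults above (assuming num_patches = (image_size / patch_size)^2):
// 224 / 14 = 16 patches per side, so num_patches = 16 * 16 = 256 and
// num_positions = 256 + 1 = 257, the extra slot being the class token that
// forward() prepends.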
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* pixel_values) {
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values) {
// pixel_values: [N, num_channels, image_size, image_size]
// return: [N, num_positions, embed_dim]
GGML_ASSERT(pixel_values->ne[0] == image_size && pixel_values->ne[1] == image_size && pixel_values->ne[2] == num_channels);
@ -663,20 +639,20 @@ public:
auto position_embed_weight = params["position_embedding.weight"];
// concat(patch_embedding, class_embedding) + position_embedding
ggml_tensor* patch_embedding;
struct ggml_tensor* patch_embedding;
int64_t N = pixel_values->ne[3];
patch_embedding = ggml_ext_conv_2d(ctx->ggml_ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size); // [N, embed_dim, image_size // patch_size, image_size // patch_size]
patch_embedding = ggml_reshape_3d(ctx->ggml_ctx, patch_embedding, num_patches, embed_dim, N); // [N, embed_dim, num_patches]
patch_embedding = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, patch_embedding, 1, 0, 2, 3)); // [N, num_patches, embed_dim]
patch_embedding = ggml_reshape_4d(ctx->ggml_ctx, patch_embedding, 1, embed_dim, num_patches, N); // [N, num_patches, embed_dim, 1]
patch_embedding = ggml_nn_conv_2d(ctx, pixel_values, patch_embed_weight, NULL, patch_size, patch_size); // [N, embed_dim, image_size // patch_size, image_size // patch_size]
patch_embedding = ggml_reshape_3d(ctx, patch_embedding, num_patches, embed_dim, N); // [N, embed_dim, num_patches]
patch_embedding = ggml_cont(ctx, ggml_permute(ctx, patch_embedding, 1, 0, 2, 3)); // [N, num_patches, embed_dim]
patch_embedding = ggml_reshape_4d(ctx, patch_embedding, 1, embed_dim, num_patches, N); // [N, num_patches, embed_dim, 1]
ggml_tensor* class_embedding = ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, embed_dim, N);
class_embedding = ggml_repeat(ctx->ggml_ctx, class_embed_weight, class_embedding); // [N, embed_dim]
class_embedding = ggml_reshape_4d(ctx->ggml_ctx, class_embedding, 1, embed_dim, 1, N); // [N, 1, embed_dim, 1]
struct ggml_tensor* class_embedding = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, embed_dim, N);
class_embedding = ggml_repeat(ctx, class_embed_weight, class_embedding); // [N, embed_dim]
class_embedding = ggml_reshape_4d(ctx, class_embedding, 1, embed_dim, 1, N); // [N, 1, embed_dim, 1]
ggml_tensor* x = ggml_concat(ctx->ggml_ctx, class_embedding, patch_embedding, 2); // [N, num_positions, embed_dim, 1]
x = ggml_reshape_3d(ctx->ggml_ctx, x, embed_dim, num_positions, N); // [N, num_positions, embed_dim]
x = ggml_add(ctx->ggml_ctx, x, position_embed_weight);
struct ggml_tensor* x = ggml_concat(ctx, class_embedding, patch_embedding, 2); // [N, num_positions, embed_dim, 1]
x = ggml_reshape_3d(ctx, x, embed_dim, num_positions, N); // [N, num_positions, embed_dim]
x = ggml_add(ctx, x, position_embed_weight);
return x; // [N, num_positions, embed_dim]
}
};
@ -693,7 +669,7 @@ enum CLIPVersion {
class CLIPTextModel : public GGMLBlock {
protected:
void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
if (version == OPEN_CLIP_VIT_BIGG_14) {
enum ggml_type wtype = GGML_TYPE_F32;
params["text_projection"] = ggml_new_tensor_2d(ctx, wtype, projection_dim, hidden_size);
@ -714,8 +690,7 @@ public:
CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
bool with_final_ln = true,
bool force_clip_f32 = false,
bool proj_in = false)
bool force_clip_f32 = false)
: version(version), with_final_ln(with_final_ln) {
if (version == OPEN_CLIP_VIT_H_14) {
hidden_size = 1024;
@ -730,38 +705,38 @@ public:
}
blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token, force_clip_f32));
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size, proj_in));
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
}
ggml_tensor* get_token_embed_weight() {
struct ggml_tensor* get_token_embed_weight() {
auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
return embeddings->get_token_embed_weight();
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* input_ids,
ggml_tensor* tkn_embeddings,
ggml_tensor* mask = nullptr,
size_t max_token_idx = 0,
bool return_pooled = false,
int clip_skip = -1) {
struct ggml_tensor* forward(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* input_ids,
struct ggml_tensor* tkn_embeddings,
size_t max_token_idx = 0,
bool return_pooled = false,
int clip_skip = -1) {
// input_ids: [N, n_token]
auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
auto encoder = std::dynamic_pointer_cast<CLIPEncoder>(blocks["encoder"]);
auto final_layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["final_layer_norm"]);
auto x = embeddings->forward(ctx, input_ids, tkn_embeddings); // [N, n_token, hidden_size]
x = encoder->forward(ctx, x, mask, return_pooled ? -1 : clip_skip);
x = encoder->forward(ctx, backend, x, return_pooled ? -1 : clip_skip, true);
if (return_pooled || with_final_ln) {
x = final_layer_norm->forward(ctx, x);
}
if (return_pooled) {
auto text_projection = params["text_projection"];
ggml_tensor* pooled = ggml_view_1d(ctx->ggml_ctx, x, hidden_size, x->nb[1] * max_token_idx);
if (text_projection != nullptr) {
pooled = ggml_ext_linear(ctx->ggml_ctx, pooled, text_projection, nullptr);
ggml_tensor* pooled = ggml_view_1d(ctx, x, hidden_size, x->nb[1] * max_token_idx);
if (text_projection != NULL) {
pooled = ggml_nn_linear(ctx, pooled, text_projection, NULL);
} else {
LOG_DEBUG("identity projection");
}
@ -785,7 +760,7 @@ public:
int32_t n_layer = 24;
public:
CLIPVisionModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14, bool proj_in = false) {
CLIPVisionModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14) {
if (version == OPEN_CLIP_VIT_H_14) {
hidden_size = 1280;
intermediate_size = 5120;
@ -800,14 +775,15 @@ public:
blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPVisionEmbeddings(hidden_size, num_channels, patch_size, image_size));
blocks["pre_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size, proj_in));
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
blocks["post_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* pixel_values,
bool return_pooled = true,
int clip_skip = -1) {
struct ggml_tensor* forward(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* pixel_values,
bool return_pooled = true,
int clip_skip = -1) {
// pixel_values: [N, num_channels, image_size, image_size]
auto embeddings = std::dynamic_pointer_cast<CLIPVisionEmbeddings>(blocks["embeddings"]);
auto pre_layernorm = std::dynamic_pointer_cast<LayerNorm>(blocks["pre_layernorm"]);
@ -816,15 +792,14 @@ public:
auto x = embeddings->forward(ctx, pixel_values); // [N, num_positions, embed_dim]
x = pre_layernorm->forward(ctx, x);
x = encoder->forward(ctx, x, nullptr, clip_skip);
x = encoder->forward(ctx, backend, x, clip_skip, false);
// print_ggml_tensor(x, true, "ClipVisionModel x: ");
auto last_hidden_state = x;
x = post_layernorm->forward(ctx, x); // [N, n_token, hidden_size]
x = post_layernorm->forward(ctx, x); // [N, n_token, hidden_size]
GGML_ASSERT(x->ne[3] == 1);
if (return_pooled) {
ggml_tensor* pooled = ggml_cont(ctx->ggml_ctx, ggml_view_2d(ctx->ggml_ctx, x, x->ne[0], x->ne[2], x->nb[2], 0));
ggml_tensor* pooled = ggml_cont(ctx, ggml_view_2d(ctx, x, x->ne[0], x->ne[2], x->nb[2], 0));
return pooled; // [N, hidden_size]
} else {
// return x; // [N, n_token, hidden_size]
@ -839,8 +814,8 @@ protected:
int64_t out_features;
bool transpose_weight;
void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32);
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32);
if (transpose_weight) {
params["weight"] = ggml_new_tensor_2d(ctx, wtype, out_features, in_features);
} else {
@ -856,12 +831,12 @@ public:
out_features(out_features),
transpose_weight(transpose_weight) {}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
ggml_tensor* w = params["weight"];
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
struct ggml_tensor* w = params["weight"];
if (transpose_weight) {
w = ggml_cont(ctx->ggml_ctx, ggml_transpose(ctx->ggml_ctx, w));
w = ggml_cont(ctx, ggml_transpose(ctx, w));
}
return ggml_ext_linear(ctx->ggml_ctx, x, w, nullptr);
return ggml_nn_linear(ctx, x, w, NULL);
}
};
@ -873,8 +848,7 @@ public:
public:
CLIPVisionModelProjection(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
bool transpose_proj_w = false,
bool proj_in = false) {
bool transpose_proj_w = false) {
if (version == OPEN_CLIP_VIT_H_14) {
hidden_size = 1280;
projection_dim = 1024;
@ -882,20 +856,21 @@ public:
hidden_size = 1664;
}
blocks["vision_model"] = std::shared_ptr<GGMLBlock>(new CLIPVisionModel(version, proj_in));
blocks["vision_model"] = std::shared_ptr<GGMLBlock>(new CLIPVisionModel(version));
blocks["visual_projection"] = std::shared_ptr<GGMLBlock>(new CLIPProjection(hidden_size, projection_dim, transpose_proj_w));
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* pixel_values,
bool return_pooled = true,
int clip_skip = -1) {
struct ggml_tensor* forward(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* pixel_values,
bool return_pooled = true,
int clip_skip = -1) {
// pixel_values: [N, num_channels, image_size, image_size]
// return: [N, projection_dim] if return_pooled else [N, n_token, hidden_size]
auto vision_model = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]);
auto visual_projection = std::dynamic_pointer_cast<CLIPProjection>(blocks["visual_projection"]);
auto x = vision_model->forward(ctx, pixel_values, return_pooled, clip_skip); // [N, hidden_size] or [N, n_token, hidden_size]
auto x = vision_model->forward(ctx, backend, pixel_values, return_pooled, clip_skip); // [N, hidden_size] or [N, n_token, hidden_size]
if (return_pooled) {
x = visual_projection->forward(ctx, x); // [N, projection_dim]
@ -908,68 +883,55 @@ public:
struct CLIPTextModelRunner : public GGMLRunner {
CLIPTextModel model;
std::vector<float> attention_mask_vec;
CLIPTextModelRunner(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map,
const String2GGMLType& tensor_types,
const std::string prefix,
CLIPVersion version = OPENAI_CLIP_VIT_L_14,
bool with_final_ln = true,
bool force_clip_f32 = false)
: GGMLRunner(backend, offload_params_to_cpu) {
bool proj_in = false;
for (const auto& [name, tensor_storage] : tensor_storage_map) {
if (!starts_with(name, prefix)) {
continue;
}
if (contains(name, "self_attn.in_proj")) {
proj_in = true;
break;
}
}
model = CLIPTextModel(version, with_final_ln, force_clip_f32, proj_in);
model.init(params_ctx, tensor_storage_map, prefix);
: GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, force_clip_f32) {
model.init(params_ctx, tensor_types, prefix);
}
std::string get_desc() override {
std::string get_desc() {
return "clip";
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
model.get_param_tensors(tensors, prefix);
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* input_ids,
ggml_tensor* embeddings,
ggml_tensor* mask,
size_t max_token_idx = 0,
bool return_pooled = false,
int clip_skip = -1) {
struct ggml_tensor* forward(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* input_ids,
struct ggml_tensor* embeddings,
size_t max_token_idx = 0,
bool return_pooled = false,
int clip_skip = -1) {
size_t N = input_ids->ne[1];
size_t n_token = input_ids->ne[0];
if (input_ids->ne[0] > model.n_token) {
GGML_ASSERT(input_ids->ne[0] % model.n_token == 0);
input_ids = ggml_reshape_2d(ctx->ggml_ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
input_ids = ggml_reshape_2d(ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
}
return model.forward(ctx, input_ids, embeddings, mask, max_token_idx, return_pooled, clip_skip);
return model.forward(ctx, backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
}
ggml_cgraph* build_graph(ggml_tensor* input_ids,
int num_custom_embeddings = 0,
void* custom_embeddings_data = nullptr,
size_t max_token_idx = 0,
bool return_pooled = false,
int clip_skip = -1) {
ggml_cgraph* gf = new_graph_custom(2048);
struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
int num_custom_embeddings = 0,
void* custom_embeddings_data = NULL,
size_t max_token_idx = 0,
bool return_pooled = false,
int clip_skip = -1) {
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
input_ids = to_backend(input_ids);
ggml_tensor* embeddings = nullptr;
struct ggml_tensor* embeddings = NULL;
if (num_custom_embeddings > 0 && custom_embeddings_data != nullptr) {
if (num_custom_embeddings > 0 && custom_embeddings_data != NULL) {
auto token_embed_weight = model.get_token_embed_weight();
auto custom_embeddings = ggml_new_tensor_2d(compute_ctx,
token_embed_weight->type,
@ -981,42 +943,26 @@ struct CLIPTextModelRunner : public GGMLRunner {
embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
}
int n_tokens = static_cast<int>(input_ids->ne[0]);
attention_mask_vec.resize(n_tokens * n_tokens);
for (int i0 = 0; i0 < n_tokens; i0++) {
for (int i1 = 0; i1 < n_tokens; i1++) {
float value = 0.f;
if (i0 > i1) {
value = -INFINITY;
}
attention_mask_vec[i1 * n_tokens + i0] = value;
}
}
auto attention_mask = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_F32, n_tokens, n_tokens);
set_backend_tensor_data(attention_mask, attention_mask_vec.data());
auto runner_ctx = get_context();
ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, embeddings, attention_mask, max_token_idx, return_pooled, clip_skip);
struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
ggml_build_forward_expand(gf, hidden_states);
return gf;
}
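// Standalone sketch of the causal mask built in build_graph above, for a
// scratch file (row-major [i1 * n_tokens + i0]; 0 where a token may attend,
// -inf strictly above the diagonal; n = 3 is arbitrary):
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const int n = 3;
    std::vector<float> mask(n * n);
    for (int i0 = 0; i0 < n; i0++)
        for (int i1 = 0; i1 < n; i1++)
            mask[i1 * n + i0] = (i0 > i1) ? -INFINITY : 0.f;
    for (int i1 = 0; i1 < n; i1++) {
        for (int i0 = 0; i0 < n; i0++)
            printf("%7.1f", mask[i1 * n + i0]);
        printf("\n");  // row i1 is 0 up to the diagonal, -inf after it
    }
}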
bool compute(const int n_threads,
ggml_tensor* input_ids,
void compute(const int n_threads,
struct ggml_tensor* input_ids,
int num_custom_embeddings,
void* custom_embeddings_data,
size_t max_token_idx,
bool return_pooled,
int clip_skip,
ggml_tensor** output,
ggml_context* output_ctx = nullptr) {
auto get_graph = [&]() -> ggml_cgraph* {
ggml_context* output_ctx = NULL) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip);
};
return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
}
};

View File

@ -1,5 +1,5 @@
#ifndef __COMMON_BLOCK_HPP__
#define __COMMON_BLOCK_HPP__
#ifndef __COMMON_HPP__
#define __COMMON_HPP__
#include "ggml_extend.hpp"
@ -23,12 +23,12 @@ public:
}
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, channels, h, w]
if (vae_downsample) {
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
x = ggml_ext_pad(ctx->ggml_ctx, x, 1, 1, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
x = ggml_pad(ctx, x, 1, 1, 0, 0);
x = conv->forward(ctx, x);
} else {
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["op"]);
@ -52,12 +52,12 @@ public:
blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, channels, h, w]
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
x = ggml_upscale(ctx->ggml_ctx, x, 2, GGML_SCALE_MODE_NEAREST); // [N, channels, h*2, w*2]
x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2]
x = ggml_upscale(ctx, x, 2, GGML_SCALE_MODE_NEAREST); // [N, channels, h*2, w*2]
x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2]
return x;
}
};
@ -80,7 +80,7 @@ protected:
std::pair<int, int> padding) {
GGML_ASSERT(dims == 2 || dims == 3);
if (dims == 3) {
return std::shared_ptr<GGMLBlock>(new Conv3d(in_channels, out_channels, {kernel_size.first, 1, 1}, {1, 1, 1}, {padding.first, 0, 0}));
return std::shared_ptr<GGMLBlock>(new Conv3dnx1x1(in_channels, out_channels, kernel_size.first, 1, padding.first));
} else {
return std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, kernel_size, {1, 1}, padding));
}
@ -121,7 +121,7 @@ public:
}
}
virtual ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* emb = nullptr) {
virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = NULL) {
// For dims==3, we reduce dimension from 5d to 4d by merging h and w, in order not to change ggml
// [N, c, t, h, w] => [N, c, t, h * w]
// x: [N, channels, h, w] if dims == 2 else [N, channels, t, h, w]
@ -131,38 +131,38 @@ public:
auto out_layers_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["out_layers.0"]);
auto out_layers_3 = std::dynamic_pointer_cast<UnaryBlock>(blocks["out_layers.3"]);
if (emb == nullptr) {
if (emb == NULL) {
GGML_ASSERT(skip_t_emb);
}
// in_layers
auto h = in_layers_0->forward(ctx, x);
h = ggml_silu_inplace(ctx->ggml_ctx, h);
h = ggml_silu_inplace(ctx, h);
h = in_layers_2->forward(ctx, h); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
// emb_layers
if (!skip_t_emb) {
auto emb_layer_1 = std::dynamic_pointer_cast<Linear>(blocks["emb_layers.1"]);
auto emb_out = ggml_silu(ctx->ggml_ctx, emb);
auto emb_out = ggml_silu(ctx, emb);
emb_out = emb_layer_1->forward(ctx, emb_out); // [N, out_channels] if dims == 2 else [N, t, out_channels]
if (dims == 2) {
emb_out = ggml_reshape_4d(ctx->ggml_ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]); // [N, out_channels, 1, 1]
emb_out = ggml_reshape_4d(ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]); // [N, out_channels, 1, 1]
} else {
emb_out = ggml_reshape_4d(ctx->ggml_ctx, emb_out, 1, emb_out->ne[0], emb_out->ne[1], emb_out->ne[2]); // [N, t, out_channels, 1]
emb_out = ggml_reshape_4d(ctx, emb_out, 1, emb_out->ne[0], emb_out->ne[1], emb_out->ne[2]); // [N, t, out_channels, 1]
if (exchange_temb_dims) {
// emb_out = rearrange(emb_out, "b t c ... -> b c t ...")
emb_out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, emb_out, 0, 2, 1, 3)); // [N, out_channels, t, 1]
emb_out = ggml_cont(ctx, ggml_permute(ctx, emb_out, 0, 2, 1, 3)); // [N, out_channels, t, 1]
}
}
h = ggml_add(ctx->ggml_ctx, h, emb_out); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
h = ggml_add(ctx, h, emb_out); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
}
// out_layers
h = out_layers_0->forward(ctx, h);
h = ggml_silu_inplace(ctx->ggml_ctx, h);
h = ggml_silu_inplace(ctx, h);
// dropout, skip for inference
h = out_layers_3->forward(ctx, h);
@ -172,7 +172,7 @@ public:
x = skip_connection->forward(ctx, x); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
}
h = ggml_add(ctx->ggml_ctx, h, x);
h = ggml_add(ctx, h, x);
return h; // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
}
};
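// One worked shape example for the emb broadcast above (dims == 2, using the
// same [N, C, H, W]-style notation as the comments): emb_out leaves
// emb_layers.1 as [N, out_channels], is reshaped to [N, out_channels, 1, 1],
// and ggml_add then broadcasts that one value per channel across every
// h x w position of h.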
@ -182,27 +182,35 @@ protected:
int64_t dim_in;
int64_t dim_out;
public:
GEGLU(int64_t dim_in, int64_t dim_out)
: dim_in(dim_in), dim_out(dim_out) {
blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out * 2));
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") {
enum ggml_type wtype = get_type(prefix + "proj.weight", tensor_types, GGML_TYPE_F32);
enum ggml_type bias_wtype = GGML_TYPE_F32;
params["proj.weight"] = ggml_new_tensor_2d(ctx, wtype, dim_in, dim_out * 2);
params["proj.bias"] = ggml_new_tensor_1d(ctx, bias_wtype, dim_out * 2);
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
public:
GEGLU(int64_t dim_in, int64_t dim_out)
: dim_in(dim_in), dim_out(dim_out) {}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [ne3, ne2, ne1, dim_in]
// return: [ne3, ne2, ne1, dim_out]
auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
struct ggml_tensor* w = params["proj.weight"];
struct ggml_tensor* b = params["proj.bias"];
x = proj->forward(ctx, x); // [ne3, ne2, ne1, dim_out*2]
auto x_vec = ggml_ext_chunk(ctx->ggml_ctx, x, 2, 0, false);
x = x_vec[0]; // [ne3, ne2, ne1, dim_out]
auto gate = x_vec[1]; // [ne3, ne2, ne1, dim_out]
auto x_w = ggml_view_2d(ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], 0); // [dim_out, dim_in]
auto x_b = ggml_view_1d(ctx, b, b->ne[0] / 2, 0); // [dim_out, dim_in]
auto gate_w = ggml_view_2d(ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], w->nb[1] * w->ne[1] / 2); // [dim_out, ]
auto gate_b = ggml_view_1d(ctx, b, b->ne[0] / 2, b->nb[0] * b->ne[0] / 2); // [dim_out, ]
gate = ggml_cont(ctx->ggml_ctx, gate);
auto x_in = x;
x = ggml_nn_linear(ctx, x_in, x_w, x_b); // [ne3, ne2, ne1, dim_out]
auto gate = ggml_nn_linear(ctx, x_in, gate_w, gate_b); // [ne3, ne2, ne1, dim_out]
gate = ggml_ext_gelu(ctx->ggml_ctx, gate, true);
gate = ggml_gelu_inplace(ctx, gate);
x = ggml_mul(ctx->ggml_ctx, x, gate); // [ne3, ne2, ne1, dim_out]
x = ggml_mul(ctx, x, gate); // [ne3, ne2, ne1, dim_out]
return x;
}
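// Element-wise, both GEGLU variants above compute the same thing; a scalar
// sketch of the split-and-gate (assuming ggml's tanh-approximation GELU):
#include <cmath>

// gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
static float gelu_tanh(float x) {
    return 0.5f * x * (1.0f + tanhf(0.7978845608f * (x + 0.044715f * x * x * x)));
}

// the projection yields 2 * dim_out values per position: the first half is
// the value path, the second half the gate path
static float geglu(float value, float gate) {
    return value * gelu_tanh(gate);
}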
@ -214,13 +222,13 @@ public:
blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out, bias));
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [ne3, ne2, ne1, dim_in]
// return: [ne3, ne2, ne1, dim_out]
auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
x = proj->forward(ctx, x);
x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
x = ggml_gelu_inplace(ctx, x);
return x;
}
};
@ -244,21 +252,17 @@ public:
}
// net_1 is nn.Dropout(), skip for inference
bool force_prec_f32 = false;
float scale = 1.f;
float scale = 1.f;
if (precision_fix) {
scale = 1.f / 128.f;
#ifdef SD_USE_VULKAN
force_prec_f32 = true;
#endif
}
// The purpose of the scale here is to prevent NaN issues in certain situations.
// For example, when using Vulkan without enabling force_prec_f32,
// or when using CUDA but the weights are k-quants.
blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, force_prec_f32, scale));
blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, false, scale));
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [ne3, ne2, ne1, dim]
// return: [ne3, ne2, ne1, dim_out]
@ -277,16 +281,19 @@ protected:
int64_t context_dim;
int64_t n_head;
int64_t d_head;
bool flash_attn;
public:
CrossAttention(int64_t query_dim,
int64_t context_dim,
int64_t n_head,
int64_t d_head)
int64_t d_head,
bool flash_attn = false)
: n_head(n_head),
d_head(d_head),
query_dim(query_dim),
context_dim(context_dim) {
context_dim(context_dim),
flash_attn(flash_attn) {
int64_t inner_dim = d_head * n_head;
blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false));
@ -297,9 +304,10 @@ public:
// to_out_1 is nn.Dropout(), skip for inference
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* context) {
struct ggml_tensor* forward(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* x,
struct ggml_tensor* context) {
// x: [N, n_token, query_dim]
// context: [N, n_context, context_dim]
// return: [N, n_token, query_dim]
@ -317,7 +325,7 @@ public:
auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]
x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, inner_dim]
x = ggml_nn_attention_ext(ctx, backend, q, k, v, n_head, NULL, false, false, flash_attn); // [N, n_token, inner_dim]
x = to_out_0->forward(ctx, x); // [N, n_token, query_dim]
return x;
@ -335,15 +343,16 @@ public:
int64_t n_head,
int64_t d_head,
int64_t context_dim,
bool ff_in = false)
bool ff_in = false,
bool flash_attn = false)
: n_head(n_head), d_head(d_head), ff_in(ff_in) {
// disable_self_attn is always False
// disable_temporal_crossattention is always False
// switch_temporal_ca_to_sa is always False
// inner_dim is always None or equal to dim
// gated_ff is always True
blocks["attn1"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, dim, n_head, d_head));
blocks["attn2"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, context_dim, n_head, d_head));
blocks["attn1"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, dim, n_head, d_head, flash_attn));
blocks["attn2"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, context_dim, n_head, d_head, flash_attn));
blocks["ff"] = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim));
blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
blocks["norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
@ -355,9 +364,10 @@ public:
}
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* context) {
struct ggml_tensor* forward(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* x,
struct ggml_tensor* context) {
// x: [N, n_token, query_dim]
// context: [N, n_context, context_dim]
// return: [N, n_token, query_dim]
@ -377,21 +387,21 @@ public:
x = norm_in->forward(ctx, x);
x = ff_in->forward(ctx, x);
// self.is_res is always True
x = ggml_add(ctx->ggml_ctx, x, x_skip);
x = ggml_add(ctx, x, x_skip);
}
auto r = x;
x = norm1->forward(ctx, x);
x = attn1->forward(ctx, x, x); // self-attention
x = ggml_add(ctx->ggml_ctx, x, r);
x = attn1->forward(ctx, backend, x, x); // self-attention
x = ggml_add(ctx, x, r);
r = x;
x = norm2->forward(ctx, x);
x = attn2->forward(ctx, x, context); // cross-attention
x = ggml_add(ctx->ggml_ctx, x, r);
x = attn2->forward(ctx, backend, x, context); // cross-attention
x = ggml_add(ctx, x, r);
r = x;
x = norm3->forward(ctx, x);
x = ff->forward(ctx, x);
x = ggml_add(ctx->ggml_ctx, x, r);
x = ggml_add(ctx, x, r);
return x;
}
@ -404,23 +414,6 @@ protected:
int64_t d_head;
int64_t depth = 1; // 1
int64_t context_dim = 768; // hidden_size, 1024 for VERSION_SD2
bool use_linear = false;
void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {
auto iter = tensor_storage_map.find(prefix + "proj_out.weight");
if (iter != tensor_storage_map.end()) {
int64_t inner_dim = n_head * d_head;
if (iter->second.n_dims == 4 && use_linear) {
use_linear = false;
blocks["proj_in"] = std::make_shared<Conv2d>(in_channels, inner_dim, std::pair{1, 1});
blocks["proj_out"] = std::make_shared<Conv2d>(inner_dim, in_channels, std::pair{1, 1});
} else if (iter->second.n_dims == 2 && !use_linear) {
use_linear = true;
blocks["proj_in"] = std::make_shared<Linear>(in_channels, inner_dim);
blocks["proj_out"] = std::make_shared<Linear>(inner_dim, in_channels);
}
}
}
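// In plain terms, the probe above keys off the stored shape of
// proj_out.weight: a 4-D tensor is a 1x1 conv kernel, so the block is built
// with Conv2d projections; a 2-D tensor is a plain matrix, so Linear is used.
// This way checkpoints exported in either layout load into the same class.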
public:
SpatialTransformer(int64_t in_channels,
@ -428,42 +421,35 @@ public:
int64_t d_head,
int64_t depth,
int64_t context_dim,
bool use_linear)
bool flash_attn = false)
: in_channels(in_channels),
n_head(n_head),
d_head(d_head),
depth(depth),
context_dim(context_dim),
use_linear(use_linear) {
context_dim(context_dim) {
// We will convert unet transformer linear to conv2d 1x1 when loading the weights, so use_linear is always False
// disable_self_attn is always False
int64_t inner_dim = n_head * d_head; // in_channels
blocks["norm"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
if (use_linear) {
blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, inner_dim));
} else {
blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}));
}
blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}));
for (int i = 0; i < depth; i++) {
std::string name = "transformer_blocks." + std::to_string(i);
blocks[name] = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false));
blocks[name] = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false, flash_attn));
}
if (use_linear) {
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, in_channels));
} else {
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}));
}
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}));
}
virtual ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* context) {
virtual struct ggml_tensor* forward(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* x,
struct ggml_tensor* context) {
// x: [N, in_channels, h, w]
// context: [N, max_position(aka n_token), hidden_size(aka context_dim)]
auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
auto proj_in = std::dynamic_pointer_cast<UnaryBlock>(blocks["proj_in"]);
auto proj_out = std::dynamic_pointer_cast<UnaryBlock>(blocks["proj_out"]);
auto proj_in = std::dynamic_pointer_cast<Conv2d>(blocks["proj_in"]);
auto proj_out = std::dynamic_pointer_cast<Conv2d>(blocks["proj_out"]);
auto x_in = x;
int64_t n = x->ne[3];
@ -472,45 +458,32 @@ public:
int64_t inner_dim = n_head * d_head;
x = norm->forward(ctx, x);
if (use_linear) {
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
x = proj_in->forward(ctx, x); // [N, inner_dim, h, w]
} else {
x = proj_in->forward(ctx, x); // [N, inner_dim, h, w]
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
}
x = proj_in->forward(ctx, x); // [N, inner_dim, h, w]
x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
x = ggml_reshape_3d(ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
for (int i = 0; i < depth; i++) {
std::string name = "transformer_blocks." + std::to_string(i);
auto transformer_block = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[name]);
x = transformer_block->forward(ctx, x, context);
x = transformer_block->forward(ctx, backend, x, context);
}
if (use_linear) {
// proj_out
x = proj_out->forward(ctx, x); // [N, in_channels, h, w]
x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
x = ggml_reshape_4d(ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
} else {
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
// proj_out
x = proj_out->forward(ctx, x); // [N, in_channels, h, w]
// proj_out
x = proj_out->forward(ctx, x); // [N, in_channels, h, w]
}
x = ggml_add(ctx->ggml_ctx, x, x_in);
x = ggml_add(ctx, x, x_in);
return x;
}
};
class AlphaBlender : public GGMLBlock {
protected:
void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") {
// Get the type of the "mix_factor" tensor from the input tensors map with the specified prefix
enum ggml_type wtype = GGML_TYPE_F32;
params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
@ -519,7 +492,7 @@ protected:
float get_alpha() {
// image_only_indicator is always tensor([0.]) and since mix_factor.shape is [1,]
// so learned_with_images is same as learned
float alpha = ggml_ext_backend_tensor_get_f32(params["mix_factor"]);
float alpha = ggml_backend_tensor_get_f32(params["mix_factor"]);
return sigmoid(alpha);
}
@ -530,23 +503,23 @@ public:
// since mix_factor.shape is [1,], we don't need rearrange using rearrange_pattern
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x_spatial,
ggml_tensor* x_temporal) {
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x_spatial,
struct ggml_tensor* x_temporal) {
// image_only_indicator is always tensor([0.])
float alpha = get_alpha();
auto x = ggml_add(ctx->ggml_ctx,
ggml_ext_scale(ctx->ggml_ctx, x_spatial, alpha),
ggml_ext_scale(ctx->ggml_ctx, x_temporal, 1.0f - alpha));
auto x = ggml_add(ctx,
ggml_scale(ctx, x_spatial, alpha),
ggml_scale(ctx, x_temporal, 1.0f - alpha));
return x;
}
};
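// Scalar sketch of the blend above (one learned mix_factor squashed through
// a sigmoid; values purely illustrative):
#include <cmath>

static float alpha_blend(float mix_factor, float x_spatial, float x_temporal) {
    float alpha = 1.0f / (1.0f + expf(-mix_factor));  // sigmoid
    return alpha * x_spatial + (1.0f - alpha) * x_temporal;
}
// e.g. mix_factor = 0 gives alpha = 0.5, an even spatial/temporal mix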
class VideoResBlock : public ResBlock {
public:
VideoResBlock(int64_t channels,
int64_t emb_channels,
int64_t out_channels,
VideoResBlock(int channels,
int emb_channels,
int out_channels,
std::pair<int, int> kernel_size = {3, 3},
int64_t video_kernel_size = 3,
int dims = 2) // always 2
@ -555,10 +528,10 @@ public:
blocks["time_mixer"] = std::shared_ptr<GGMLBlock>(new AlphaBlender());
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* emb,
int num_video_frames) {
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* emb,
int num_video_frames) {
// x: [N, channels, h, w] aka [b*t, channels, h, w]
// emb: [N, emb_channels] aka [b*t, emb_channels]
// image_only_indicator is always tensor([0.])
@ -573,21 +546,21 @@ public:
int64_t H = x->ne[1];
int64_t W = x->ne[0];
x = ggml_reshape_4d(ctx->ggml_ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w)
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w)
x = ggml_reshape_4d(ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w)
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w)
auto x_mix = x;
emb = ggml_reshape_4d(ctx->ggml_ctx, emb, emb->ne[0], T, B, emb->ne[3]); // (b t) ... -> b t ...
emb = ggml_reshape_4d(ctx, emb, emb->ne[0], T, B, emb->ne[3]); // (b t) ... -> b t ...
x = time_stack->forward(ctx, x, emb); // b t c (h w)
x = time_mixer->forward(ctx, x_mix, x); // b t c (h w)
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w)
x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w)
x = ggml_reshape_4d(ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w
return x;
}
};
#endif // __COMMON_BLOCK_HPP__
#endif // __COMMON_HPP__

File diff suppressed because it is too large

View File

@ -1,7 +1,8 @@
#ifndef __CONTROL_HPP__
#define __CONTROL_HPP__
#include "common_block.hpp"
#include "common.hpp"
#include "ggml_extend.hpp"
#include "model.h"
#define CONTROL_NET_GRAPH_SIZE 1536
@ -26,7 +27,6 @@ protected:
int num_heads = 8;
int num_head_channels = -1; // channels // num_heads
int context_dim = 768; // 1024 for VERSION_SD2, 2048 for VERSION_SDXL
bool use_linear_projection = false;
public:
int model_channels = 320;
@ -82,7 +82,7 @@ public:
int64_t d_head,
int64_t depth,
int64_t context_dim) -> SpatialTransformer* {
return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim);
};
auto make_zero_conv = [&](int64_t channels) {
@ -164,26 +164,27 @@ public:
blocks["middle_block_out.0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(ch));
}
ggml_tensor* resblock_forward(std::string name,
GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* emb) {
struct ggml_tensor* resblock_forward(std::string name,
struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* emb) {
auto block = std::dynamic_pointer_cast<ResBlock>(blocks[name]);
return block->forward(ctx, x, emb);
}
ggml_tensor* attention_layer_forward(std::string name,
GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* context) {
struct ggml_tensor* attention_layer_forward(std::string name,
struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* x,
struct ggml_tensor* context) {
auto block = std::dynamic_pointer_cast<SpatialTransformer>(blocks[name]);
return block->forward(ctx, x, context);
return block->forward(ctx, backend, x, context);
}
ggml_tensor* input_hint_block_forward(GGMLRunnerContext* ctx,
ggml_tensor* hint,
ggml_tensor* emb,
ggml_tensor* context) {
struct ggml_tensor* input_hint_block_forward(struct ggml_context* ctx,
struct ggml_tensor* hint,
struct ggml_tensor* emb,
struct ggml_tensor* context) {
int num_input_blocks = 15;
auto h = hint;
for (int i = 0; i < num_input_blocks; i++) {
@ -192,32 +193,33 @@ public:
h = block->forward(ctx, h);
} else {
h = ggml_silu_inplace(ctx->ggml_ctx, h);
h = ggml_silu_inplace(ctx, h);
}
}
return h;
}
std::vector<ggml_tensor*> forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* hint,
ggml_tensor* guided_hint,
ggml_tensor* timesteps,
ggml_tensor* context,
ggml_tensor* y = nullptr) {
std::vector<struct ggml_tensor*> forward(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* x,
struct ggml_tensor* hint,
struct ggml_tensor* guided_hint,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* y = NULL) {
// x: [N, in_channels, h, w] or [N, in_channels/2, h, w]
// timesteps: [N,]
// context: [N, max_position, hidden_size] or [1, max_position, hidden_size]. for example, [N, 77, 768]
// y: [N, adm_in_channels] or [1, adm_in_channels]
if (context != nullptr) {
if (context != NULL) {
if (context->ne[2] != x->ne[3]) {
context = ggml_repeat(ctx->ggml_ctx, context, ggml_new_tensor_3d(ctx->ggml_ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
context = ggml_repeat(ctx, context, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
}
}
if (y != nullptr) {
if (y != NULL) {
if (y->ne[1] != x->ne[3]) {
y = ggml_repeat(ctx->ggml_ctx, y, ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, y->ne[0], x->ne[3]));
y = ggml_repeat(ctx, y, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, y->ne[0], x->ne[3]));
}
}
@ -228,27 +230,27 @@ public:
auto middle_block_out = std::dynamic_pointer_cast<Conv2d>(blocks["middle_block_out.0"]);
auto t_emb = ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, model_channels); // [N, model_channels]
auto t_emb = ggml_nn_timestep_embedding(ctx, timesteps, model_channels); // [N, model_channels]
auto emb = time_embed_0->forward(ctx, t_emb);
emb = ggml_silu_inplace(ctx->ggml_ctx, emb);
emb = ggml_silu_inplace(ctx, emb);
emb = time_embed_2->forward(ctx, emb); // [N, time_embed_dim]
// SDXL/SVD
if (y != nullptr) {
if (y != NULL) {
auto label_embed_0 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.0"]);
auto label_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.2"]);
auto label_emb = label_embed_0->forward(ctx, y);
label_emb = ggml_silu_inplace(ctx->ggml_ctx, label_emb);
label_emb = ggml_silu_inplace(ctx, label_emb);
label_emb = label_embed_2->forward(ctx, label_emb); // [N, time_embed_dim]
emb = ggml_add(ctx->ggml_ctx, emb, label_emb); // [N, time_embed_dim]
emb = ggml_add(ctx, emb, label_emb); // [N, time_embed_dim]
}
std::vector<ggml_tensor*> outs;
std::vector<struct ggml_tensor*> outs;
if (guided_hint == nullptr) {
if (guided_hint == NULL) {
guided_hint = input_hint_block_forward(ctx, hint, emb, context);
}
outs.push_back(guided_hint);
@ -257,7 +259,7 @@ public:
// input block 0
auto h = input_blocks_0_0->forward(ctx, x);
h = ggml_add(ctx->ggml_ctx, h, guided_hint);
h = ggml_add(ctx, h, guided_hint);
outs.push_back(zero_convs_0->forward(ctx, h));
// input block 1-11
@ -272,7 +274,7 @@ public:
h = resblock_forward(name, ctx, h, emb); // [N, mult*model_channels, h, w]
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
h = attention_layer_forward(name, ctx, h, context); // [N, mult*model_channels, h, w]
h = attention_layer_forward(name, ctx, backend, h, context); // [N, mult*model_channels, h, w]
}
auto zero_conv = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs." + std::to_string(input_block_idx) + ".0"]);
@ -296,9 +298,9 @@ public:
// [N, 4*model_channels, h/8, w/8]
// middle_block
h = resblock_forward("middle_block.0", ctx, h, emb); // [N, 4*model_channels, h/8, w/8]
h = attention_layer_forward("middle_block.1", ctx, h, context); // [N, 4*model_channels, h/8, w/8]
h = resblock_forward("middle_block.2", ctx, h, emb); // [N, 4*model_channels, h/8, w/8]
h = resblock_forward("middle_block.0", ctx, h, emb); // [N, 4*model_channels, h/8, w/8]
h = attention_layer_forward("middle_block.1", ctx, backend, h, context); // [N, 4*model_channels, h/8, w/8]
h = resblock_forward("middle_block.2", ctx, h, emb); // [N, 4*model_channels, h/8, w/8]
// out
outs.push_back(middle_block_out->forward(ctx, h));
@ -310,28 +312,39 @@ struct ControlNet : public GGMLRunner {
SDVersion version = VERSION_SD1;
ControlNetBlock control_net;
ggml_backend_buffer_t control_buffer = nullptr; // keep control output tensors in backend memory
ggml_context* control_ctx = nullptr;
std::vector<ggml_tensor*> controls; // (12 input block outputs, 1 middle block output) SD 1.5
ggml_tensor* guided_hint = nullptr; // guided_hint cache, for faster inference
bool guided_hint_cached = false;
ggml_backend_buffer_t control_buffer = NULL; // keep control output tensors in backend memory
ggml_context* control_ctx = NULL;
std::vector<struct ggml_tensor*> controls; // (12 input block outputs, 1 middle block output) SD 1.5
struct ggml_tensor* guided_hint = NULL; // guided_hint cache, for faster inference
bool guided_hint_cached = false;
ControlNet(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
SDVersion version = VERSION_SD1)
const String2GGMLType& tensor_types = {},
SDVersion version = VERSION_SD1)
: GGMLRunner(backend, offload_params_to_cpu), control_net(version) {
control_net.init(params_ctx, tensor_storage_map, "");
control_net.init(params_ctx, tensor_types, "");
}
~ControlNet() override {
void enable_conv2d_direct() {
std::vector<GGMLBlock*> blocks;
control_net.get_all_blocks(blocks);
for (auto block : blocks) {
if (block->get_desc() == "Conv2d") {
auto conv_block = (Conv2d*)block;
conv_block->enable_direct();
}
}
}
~ControlNet() {
free_control_ctx();
}
void alloc_control_ctx(std::vector<ggml_tensor*> outs) {
ggml_init_params params;
void alloc_control_ctx(std::vector<struct ggml_tensor*> outs) {
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(outs.size() * ggml_tensor_overhead()) + 1024 * 1024;
params.mem_buffer = nullptr;
params.mem_buffer = NULL;
params.no_alloc = true;
control_ctx = ggml_init(params);
@ -353,37 +366,37 @@ struct ControlNet : public GGMLRunner {
}
void free_control_ctx() {
if (control_buffer != nullptr) {
if (control_buffer != NULL) {
ggml_backend_buffer_free(control_buffer);
control_buffer = nullptr;
control_buffer = NULL;
}
if (control_ctx != nullptr) {
if (control_ctx != NULL) {
ggml_free(control_ctx);
control_ctx = nullptr;
control_ctx = NULL;
}
guided_hint = nullptr;
guided_hint = NULL;
guided_hint_cached = false;
controls.clear();
}
std::string get_desc() override {
std::string get_desc() {
return "control_net";
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
control_net.get_param_tensors(tensors, prefix);
}
ggml_cgraph* build_graph(ggml_tensor* x,
ggml_tensor* hint,
ggml_tensor* timesteps,
ggml_tensor* context,
ggml_tensor* y = nullptr) {
ggml_cgraph* gf = new_graph_custom(CONTROL_NET_GRAPH_SIZE);
struct ggml_cgraph* build_graph(struct ggml_tensor* x,
struct ggml_tensor* hint,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* y = NULL) {
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, CONTROL_NET_GRAPH_SIZE, false);
x = to_backend(x);
if (guided_hint_cached) {
hint = nullptr;
hint = NULL;
} else {
hint = to_backend(hint);
}
@ -391,17 +404,16 @@ struct ControlNet : public GGMLRunner {
y = to_backend(y);
timesteps = to_backend(timesteps);
auto runner_ctx = get_context();
auto outs = control_net.forward(&runner_ctx,
auto outs = control_net.forward(compute_ctx,
runtime_backend,
x,
hint,
guided_hint_cached ? guided_hint : nullptr,
guided_hint_cached ? guided_hint : NULL,
timesteps,
context,
y);
if (control_ctx == nullptr) {
if (control_ctx == NULL) {
alloc_control_ctx(outs);
}
@ -413,28 +425,24 @@ struct ControlNet : public GGMLRunner {
return gf;
}
bool compute(int n_threads,
ggml_tensor* x,
ggml_tensor* hint,
ggml_tensor* timesteps,
ggml_tensor* context,
ggml_tensor* y,
ggml_tensor** output = nullptr,
ggml_context* output_ctx = nullptr) {
void compute(int n_threads,
struct ggml_tensor* x,
struct ggml_tensor* hint,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* y,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL) {
// x: [N, in_channels, h, w]
// timesteps: [N, ]
// context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size]
// y: [N, adm_in_channels] or [1, adm_in_channels]
auto get_graph = [&]() -> ggml_cgraph* {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(x, hint, timesteps, context, y);
};
bool res = GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
if (res) {
// cache guided_hint
guided_hint_cached = true;
}
return res;
GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
guided_hint_cached = true;
}
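// Caching contract implied above: the first compute() call runs
// input_hint_block over the hint image and stores the result in guided_hint
// (kept in backend memory by alloc_control_ctx); once guided_hint_cached is
// set, build_graph() passes hint = NULL and reuses the cached tensor, so the
// remaining denoising steps skip the hint encoder entirely.
// free_control_ctx() resets the cache.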
bool load_from_file(const std::string& file_path, int n_threads) {
@ -445,7 +453,7 @@ struct ControlNet : public GGMLRunner {
std::set<std::string> ignore_tensors;
ModelLoader model_loader;
if (!model_loader.init_from_file_and_convert_name(file_path)) {
if (!model_loader.init_from_file(file_path)) {
LOG_ERROR("init control net model loader from file failed: '%s'", file_path.c_str());
return false;
}

File diff suppressed because it is too large

diffusion_model.hpp (Normal file, 323 lines)
View File

@ -0,0 +1,323 @@
#ifndef __DIFFUSION_MODEL_H__
#define __DIFFUSION_MODEL_H__
#include "flux.hpp"
#include "mmdit.hpp"
#include "qwen_image.hpp"
#include "unet.hpp"
#include "wan.hpp"
struct DiffusionParams {
struct ggml_tensor* x = NULL;
struct ggml_tensor* timesteps = NULL;
struct ggml_tensor* context = NULL;
struct ggml_tensor* c_concat = NULL;
struct ggml_tensor* y = NULL;
struct ggml_tensor* guidance = NULL;
std::vector<ggml_tensor*> ref_latents = {};
bool increase_ref_index = false;
int num_video_frames = -1;
std::vector<struct ggml_tensor*> controls = {};
float control_strength = 0.f;
struct ggml_tensor* vace_context = NULL;
float vace_strength = 1.f;
std::vector<int> skip_layers = {};
};
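// Common interface implemented by the UNet, MMDiT, Flux, Wan and Qwen-Image runners below.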
struct DiffusionModel {
virtual std::string get_desc() = 0;
virtual void compute(int n_threads,
DiffusionParams diffusion_params,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL) = 0;
virtual void alloc_params_buffer() = 0;
virtual void free_params_buffer() = 0;
virtual void free_compute_buffer() = 0;
virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0;
virtual size_t get_params_buffer_size() = 0;
virtual int64_t get_adm_in_channels() = 0;
};
struct UNetModel : public DiffusionModel {
UNetModelRunner unet;
UNetModel(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2GGMLType& tensor_types = {},
SDVersion version = VERSION_SD1,
bool flash_attn = false)
: unet(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model", version, flash_attn) {
}
std::string get_desc() {
return unet.get_desc();
}
void alloc_params_buffer() {
unet.alloc_params_buffer();
}
void free_params_buffer() {
unet.free_params_buffer();
}
void free_compute_buffer() {
unet.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
unet.get_param_tensors(tensors, "model.diffusion_model");
}
size_t get_params_buffer_size() {
return unet.get_params_buffer_size();
}
int64_t get_adm_in_channels() {
return unet.unet.adm_in_channels;
}
void compute(int n_threads,
DiffusionParams diffusion_params,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL) {
return unet.compute(n_threads,
diffusion_params.x,
diffusion_params.timesteps,
diffusion_params.context,
diffusion_params.c_concat,
diffusion_params.y,
diffusion_params.num_video_frames,
diffusion_params.controls,
diffusion_params.control_strength, output, output_ctx);
}
};
struct MMDiTModel : public DiffusionModel {
MMDiTRunner mmdit;
MMDiTModel(ggml_backend_t backend,
bool offload_params_to_cpu,
bool flash_attn = false,
const String2GGMLType& tensor_types = {})
: mmdit(backend, offload_params_to_cpu, flash_attn, tensor_types, "model.diffusion_model") {
}
std::string get_desc() {
return mmdit.get_desc();
}
void alloc_params_buffer() {
mmdit.alloc_params_buffer();
}
void free_params_buffer() {
mmdit.free_params_buffer();
}
void free_compute_buffer() {
mmdit.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
mmdit.get_param_tensors(tensors, "model.diffusion_model");
}
size_t get_params_buffer_size() {
return mmdit.get_params_buffer_size();
}
int64_t get_adm_in_channels() {
return 768 + 1280;  // pooled CLIP-L (768) + CLIP-G (1280) embeddings
}
void compute(int n_threads,
DiffusionParams diffusion_params,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL) {
return mmdit.compute(n_threads,
diffusion_params.x,
diffusion_params.timesteps,
diffusion_params.context,
diffusion_params.y,
output,
output_ctx,
diffusion_params.skip_layers);
}
};
struct FluxModel : public DiffusionModel {
Flux::FluxRunner flux;
FluxModel(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2GGMLType& tensor_types = {},
SDVersion version = VERSION_FLUX,
bool flash_attn = false,
bool use_mask = false)
: flux(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model", version, flash_attn, use_mask) {
}
std::string get_desc() {
return flux.get_desc();
}
void alloc_params_buffer() {
flux.alloc_params_buffer();
}
void free_params_buffer() {
flux.free_params_buffer();
}
void free_compute_buffer() {
flux.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
flux.get_param_tensors(tensors, "model.diffusion_model");
}
size_t get_params_buffer_size() {
return flux.get_params_buffer_size();
}
int64_t get_adm_in_channels() {
return 768;
}
void compute(int n_threads,
DiffusionParams diffusion_params,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL) {
return flux.compute(n_threads,
diffusion_params.x,
diffusion_params.timesteps,
diffusion_params.context,
diffusion_params.c_concat,
diffusion_params.y,
diffusion_params.guidance,
diffusion_params.ref_latents,
diffusion_params.increase_ref_index,
output,
output_ctx,
diffusion_params.skip_layers);
}
};
struct WanModel : public DiffusionModel {
std::string prefix;
WAN::WanRunner wan;
WanModel(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2GGMLType& tensor_types = {},
const std::string prefix = "model.diffusion_model",
SDVersion version = VERSION_WAN2,
bool flash_attn = false)
: prefix(prefix), wan(backend, offload_params_to_cpu, tensor_types, prefix, version, flash_attn) {
}
std::string get_desc() {
return wan.get_desc();
}
void alloc_params_buffer() {
wan.alloc_params_buffer();
}
void free_params_buffer() {
wan.free_params_buffer();
}
void free_compute_buffer() {
wan.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
wan.get_param_tensors(tensors, prefix);
}
size_t get_params_buffer_size() {
return wan.get_params_buffer_size();
}
int64_t get_adm_in_channels() {
return 768;
}
void compute(int n_threads,
DiffusionParams diffusion_params,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL) {
return wan.compute(n_threads,
diffusion_params.x,
diffusion_params.timesteps,
diffusion_params.context,
diffusion_params.y,
diffusion_params.c_concat,
NULL,
diffusion_params.vace_context,
diffusion_params.vace_strength,
output,
output_ctx);
}
};
struct QwenImageModel : public DiffusionModel {
std::string prefix;
Qwen::QwenImageRunner qwen_image;
QwenImageModel(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2GGMLType& tensor_types = {},
const std::string prefix = "model.diffusion_model",
SDVersion version = VERSION_QWEN_IMAGE,
bool flash_attn = false)
: prefix(prefix), qwen_image(backend, offload_params_to_cpu, tensor_types, prefix, version, flash_attn) {
}
std::string get_desc() {
return qwen_image.get_desc();
}
void alloc_params_buffer() {
qwen_image.alloc_params_buffer();
}
void free_params_buffer() {
qwen_image.free_params_buffer();
}
void free_compute_buffer() {
qwen_image.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
qwen_image.get_param_tensors(tensors, prefix);
}
size_t get_params_buffer_size() {
return qwen_image.get_params_buffer_size();
}
int64_t get_adm_in_channels() {
return 768;
}
void compute(int n_threads,
DiffusionParams diffusion_params,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL) {
return qwen_image.compute(n_threads,
diffusion_params.x,
diffusion_params.timesteps,
diffusion_params.context,
diffusion_params.ref_latents,
true, // increase_ref_index
output,
output_ctx);
}
};
#endif

View File

@ -1,21 +0,0 @@
# How to Use
## Download weights
- Download Anima
- safetensors: https://huggingface.co/circlestone-labs/Anima/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/Bedovyy/Anima-GGUF/tree/main
- gguf Anima2: https://huggingface.co/JusteLeo/Anima2-GGUF/tree/main
- Download vae
- safetensors: https://huggingface.co/circlestone-labs/Anima/tree/main/split_files/vae
- Download Qwen3-0.6B-Base
- safetensors: https://huggingface.co/circlestone-labs/Anima/tree/main/split_files/text_encoders
- gguf: https://huggingface.co/mradermacher/Qwen3-0.6B-Base-GGUF/tree/main
## Examples
```sh
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\anima-preview.safetensors --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_06b_base.safetensors -p "a lovely cat holding a sign says 'anima.cpp'" --cfg-scale 6.0 --sampling-method euler -v --offload-to-cpu --diffusion-fa
```
<img alt="anima image example" src="../assets/anima/example.png" />

View File

@ -1,173 +0,0 @@
# Build from scratch
## Get the Code
```
git clone --recursive https://github.com/leejet/stable-diffusion.cpp
cd stable-diffusion.cpp
```
- If you have already cloned the repository, you can use the following commands to update it to the latest code.
```
cd stable-diffusion.cpp
git pull origin master
git submodule init
git submodule update
```
## Build (CPU only)
If you don't have a GPU or CUDA installed, you can build a CPU-only version.
```shell
mkdir build && cd build
cmake ..
cmake --build . --config Release
```
## Build with OpenBLAS
```shell
mkdir build && cd build
cmake .. -DGGML_OPENBLAS=ON
cmake --build . --config Release
```
## Build with CUDA
This provides GPU acceleration using NVIDIA GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). Recommended to have at least 4 GB of VRAM.
```shell
mkdir build && cd build
cmake .. -DSD_CUDA=ON
cmake --build . --config Release
```
## Build with HipBLAS
This provides GPU acceleration using AMD GPU. Make sure to have the ROCm toolkit installed.
To build for a GPU architecture other than the one installed in your system, set `$GFX_NAME` manually to the desired architecture (replacing the first command below). This is also necessary if your GPU is not officially supported by ROCm; for example, you have to set `$GFX_NAME` to `gfx1030` for consumer RDNA2 cards.
Windows users: refer to [docs/hipBLAS_on_Windows.md](docs/hipBLAS_on_Windows.md) for a comprehensive guide.
```shell
mkdir build && cd build
if command -v rocminfo; then export GFX_NAME=$(rocminfo | awk '/ *Name: +gfx[1-9]/ {print $2; exit}'); else echo "rocminfo missing!"; fi
if [ -z "${GFX_NAME}" ]; then echo "Error: Couldn't detect GPU!"; else echo "Building for GPU: ${GFX_NAME}"; fi
cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DAMDGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
cmake --build . --config Release
```
## Build with MUSA
This provides GPU acceleration using Moore Threads GPU. Make sure to have the MUSA toolkit installed.
```shell
mkdir build && cd build
cmake .. -DCMAKE_C_COMPILER=/usr/local/musa/bin/clang -DCMAKE_CXX_COMPILER=/usr/local/musa/bin/clang++ -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release
cmake --build . --config Release
```
## Build with Metal
Using Metal makes the computation run on the GPU. Currently, there are some issues with Metal when performing operations on very large matrices, making it highly inefficient at the moment. Performance improvements are expected in the near future.
```shell
mkdir build && cd build
cmake .. -DSD_METAL=ON
cmake --build . --config Release
```
## Build with Vulkan
Install Vulkan SDK from https://www.lunarg.com/vulkan-sdk/.
```shell
mkdir build && cd build
cmake .. -DSD_VULKAN=ON
cmake --build . --config Release
```
## Build with OpenCL (for Adreno GPU)
Currently, it supports only Adreno GPUs and is primarily optimized for the Q4_0 type.
To build for Windows on ARM, please refer to [Windows 11 Arm64](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/OPENCL.md#windows-11-arm64).
Building for Android:
Android NDK:
Download and install the Android NDK from the [official Android developer site](https://developer.android.com/ndk/downloads).
Setup OpenCL Dependencies for NDK:
You need to provide OpenCL headers and the ICD loader library to your NDK sysroot.
* OpenCL Headers:
```bash
# In a temporary working directory
git clone https://github.com/KhronosGroup/OpenCL-Headers
cd OpenCL-Headers
# Replace <YOUR_NDK_PATH> with your actual NDK installation path
# e.g., cp -r CL /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
sudo cp -r CL <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
cd ..
```
* OpenCL ICD Loader:
```shell
# In the same temporary working directory
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
cd OpenCL-ICD-Loader
mkdir build_ndk && cd build_ndk
# Replace <YOUR_NDK_PATH> in the CMAKE_TOOLCHAIN_FILE and OPENCL_ICD_LOADER_HEADERS_DIR
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
-DOPENCL_ICD_LOADER_HEADERS_DIR=<YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=24 \
-DANDROID_STL=c++_shared
ninja
# Replace <YOUR_NDK_PATH>
# e.g., cp libOpenCL.so /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
sudo cp libOpenCL.so <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
cd ../..
```
Build `stable-diffusion.cpp` for Android with OpenCL:
```shell
mkdir build-android && cd build-android
# Replace <YOUR_NDK_PATH> with your actual NDK installation path
# e.g., -DCMAKE_TOOLCHAIN_FILE=/path/to/android-ndk-r26c/build/cmake/android.toolchain.cmake
cmake .. -G Ninja \
-DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=android-28 \
-DGGML_OPENMP=OFF \
-DSD_OPENCL=ON
ninja
```
*(Note: Don't forget to include `LD_LIBRARY_PATH=/vendor/lib64` in your command line before running the binary)*
## Build with SYCL
Using SYCL makes the computation run on the Intel GPU. Please make sure you have installed the related driver and the [Intel® oneAPI Base toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) before starting. For more details and steps, refer to the [llama.cpp SYCL backend](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/SYCL.md#linux).
```shell
# Export relevant ENV variables
source /opt/intel/oneapi/setvars.sh
# Option 1: Use FP32 (recommended for better performance in most cases)
cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
# Option 2: Use FP16
cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
cmake --build . --config Release
```

View File

@ -1,141 +0,0 @@
## Caching
Caching methods accelerate diffusion inference by reusing intermediate computations when changes between steps are small.
### Cache Modes
| Mode | Target | Description |
|------|--------|-------------|
| `ucache` | UNET models | Condition-level caching with error tracking |
| `easycache` | DiT models | Condition-level cache |
| `dbcache` | DiT models | Block-level L1 residual threshold |
| `taylorseer` | DiT models | Taylor series approximation |
| `cache-dit` | DiT models | Combined DBCache + TaylorSeer |
| `spectrum` | UNET models | Chebyshev + Taylor output forecasting |
### UCache (UNET Models)
UCache caches the residual difference (output - input) and reuses it when input changes are below threshold.
```bash
sd-cli -m model.safetensors -p "a cat" --cache-mode ucache --cache-option "threshold=1.5"
```
#### Parameters
| Parameter | Description | Default |
|-----------|-------------|---------|
| `threshold` | Error threshold for reuse decision | 1.0 |
| `start` | Start caching at this percent of steps | 0.15 |
| `end` | Stop caching at this percent of steps | 0.95 |
| `decay` | Error decay rate (0-1) | 1.0 |
| `relative` | Scale threshold by output norm (0/1) | 1 |
| `reset` | Reset error after computing (0/1) | 1 |
#### Reset Parameter
The `reset` parameter controls error accumulation behavior:
- `reset=1` (default): Resets accumulated error after each computed step. More aggressive caching, works well with most samplers.
- `reset=0`: Keeps error accumulated. More conservative, recommended for `euler_a` sampler.
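To make the mechanics concrete, here is a minimal C++ sketch of condition-level residual caching with error tracking. All names (`UCacheState`, `can_reuse`, `mean_abs_diff`) are illustrative assumptions for this document, not the actual sd.cpp implementation:
```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// Sketch of UCache-style residual caching (illustrative, not sd.cpp's code).
// While reusing, the model output is approximated as input + cached residual;
// an error estimate accumulates until it crosses the threshold, which forces
// a real forward pass again.
struct UCacheState {
    std::vector<float> prev_input;  // input of the last fully computed step
    std::vector<float> residual;    // cached (output - input) of that step
    float accumulated_error = 0.0f;
};

static float mean_abs_diff(const std::vector<float>& a, const std::vector<float>& b) {
    float d = 0.0f;
    for (size_t i = 0; i < a.size(); i++) {
        d += std::fabs(a[i] - b[i]);
    }
    return a.empty() ? 0.0f : d / a.size();
}

// Returns true if the cached residual may be reused for this step.
bool can_reuse(UCacheState& s, const std::vector<float>& input,
               float threshold, float decay, bool reset_on_compute) {
    if (s.prev_input.empty()) {
        return false;  // nothing cached yet: compute this step
    }
    s.accumulated_error = s.accumulated_error * decay + mean_abs_diff(input, s.prev_input);
    if (s.accumulated_error < threshold) {
        return true;  // cheap step: output ≈ input + s.residual
    }
    if (reset_on_compute) {
        s.accumulated_error = 0.0f;  // reset=1: start fresh after recomputing
    }
    return false;  // expensive step: run the UNet, then refresh prev_input/residual
}
```
With `relative=1`, the threshold would additionally be scaled by the output norm before the comparison; EasyCache below follows the same reuse-until-threshold idea at the condition level.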
### EasyCache (DiT Models)
Condition-level caching for DiT models. Caches and reuses outputs when input changes are below threshold.
```bash
--cache-mode easycache --cache-option "threshold=0.3"
```
#### Parameters
| Parameter | Description | Default |
|-----------|-------------|---------|
| `threshold` | Input change threshold for reuse | 0.2 |
| `start` | Start caching at this percent of steps | 0.15 |
| `end` | Stop caching at this percent of steps | 0.95 |
### Cache-DIT (DiT Models)
For DiT models like FLUX and QWEN, use block-level caching modes.
#### DBCache
Caches blocks based on L1 residual difference threshold:
```bash
--cache-mode dbcache --cache-option "threshold=0.25,warmup=4"
```
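As a sketch of the decision rule, in our own notation inferred from the `Fn`/`Bn`/`threshold`/`warmup` parameters listed below (not the paper's exact formulation): the first `Fn` and last `Bn` blocks are always computed, and after `warmup` steps a middle block's cached output is reused at step $t$ whenever its mean L1 residual difference stays small:
```math
\frac{1}{N}\sum_{i=1}^{N}\left|h_i^{(t)} - h_i^{(t-1)}\right| < \text{threshold}
```
where $h^{(t)}$ is the block's residual activation at step $t$ and $N$ is its number of elements.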
#### TaylorSeer
Uses Taylor series approximation to predict block outputs:
```bash
--cache-mode taylorseer
```
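Sketched in our own notation rather than the exact formulation used here: a block output $y$ at step $t$ is extrapolated from the last computed steps $t_0$ and $t_1$ via a truncated Taylor series with a finite-difference derivative:
```math
\hat{y}(t) \approx y(t_0) + \frac{y(t_0) - y(t_1)}{t_0 - t_1}\,(t - t_0)
```
Higher-order variants add curvature terms estimated from more past steps; on cached steps the prediction replaces the block's forward pass.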
#### Cache-DIT (Combined)
Combines DBCache and TaylorSeer:
```bash
--cache-mode cache-dit
```
#### Parameters
| Parameter | Description | Default |
|-----------|-------------|---------|
| `Fn` | Front blocks to always compute | 8 |
| `Bn` | Back blocks to always compute | 0 |
| `threshold` | L1 residual difference threshold | 0.08 |
| `warmup` | Steps before caching starts | 8 |
#### SCM Options
Steps Computation Mask controls which steps can be cached:
```bash
--scm-mask "1,1,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,1"
```
Mask values: `1` = compute, `0` = can cache.
| Policy | Description |
|--------|-------------|
| `dynamic` | Check threshold before caching |
| `static` | Always cache on cacheable steps |
```bash
--scm-policy dynamic
```
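A minimal sketch of how the mask and policy could interact (hypothetical names, not the actual sd.cpp implementation):
```cpp
#include <string>
#include <vector>

// Illustrative gate combining the SCM mask with the caching policy.
bool must_compute(const std::vector<int>& scm_mask, int step,
                  const std::string& policy, bool change_below_threshold) {
    if (step < (int)scm_mask.size() && scm_mask[step] == 1) {
        return true;  // masked-in step: always compute
    }
    if (policy == "static") {
        return false;  // static: always cache on cacheable steps
    }
    return !change_below_threshold;  // dynamic: cache only if the check passes
}
```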
### Spectrum (UNET Models)
Spectrum uses Chebyshev polynomial fitting blended with Taylor extrapolation to predict denoised outputs, skipping entire UNet forward passes. Based on the paper [Spectrum: Adaptive Spectral Feature Forecasting for Efficient Diffusion Sampling](https://github.com/tingyu215/Spectrum).
```bash
sd-cli -m model.safetensors -p "a cat" --cache-mode spectrum
```
#### Parameters
| Parameter | Description | Default |
|-----------|-------------|---------|
| `w` | Chebyshev vs Taylor blend weight (0=Taylor, 1=Chebyshev) | 0.40 |
| `m` | Chebyshev polynomial degree | 3 |
| `lam` | Ridge regression regularization | 1.0 |
| `window` | Initial window size (compute every N steps) | 2 |
| `flex` | Window growth per computed step after warmup | 0.50 |
| `warmup` | Steps to always compute before caching starts | 4 |
| `stop` | Stop caching at this fraction of total steps | 0.9 |
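In our own notation, inferred from the parameter names above rather than the paper's exact formulation, the forecast on a skipped step blends a degree-`m` Chebyshev fit $c(t)$, obtained by ridge regression with strength `lam` over the recent window of computed outputs, with a Taylor extrapolation $\tau(t)$:
```math
\hat{y}(t) = w\,c(t) + (1 - w)\,\tau(t), \qquad
c = \arg\min_{c_0,\dots,c_m} \sum_{k}\Bigl\|y(t_k) - \sum_{j=0}^{m} c_j\,T_j(t_k)\Bigr\|^2 + \lambda \sum_{j=0}^{m} \lVert c_j \rVert^2
```
where $T_j$ are Chebyshev polynomials and $t_k$ ranges over the recently computed steps.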
### Performance Tips
- Start with default thresholds and adjust based on output quality
- Lower threshold = better quality, less speedup
- Higher threshold = more speedup, potential quality loss
- More steps generally means more caching opportunities

View File

@ -15,7 +15,7 @@ You can run Chroma using stable-diffusion.cpp with a GPU that has 6GB or even 4G
You can download the preconverted gguf weights from [silveroxides/Chroma-GGUF](https://huggingface.co/silveroxides/Chroma-GGUF); this way you don't have to do the conversion yourself.
```
.\bin\Release\sd-cli.exe -M convert -m ..\..\ComfyUI\models\unet\chroma-unlocked-v40.safetensors -o ..\models\chroma-unlocked-v40-q8_0.gguf -v --type q8_0
.\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\chroma-unlocked-v40.safetensors -o ..\models\chroma-unlocked-v40-q8_0.gguf -v --type q8_0
```
## Run
@ -24,7 +24,7 @@ You can download the preconverted gguf weights from [silveroxides/Chroma-GGUF](h
For example:
```
.\bin\Release\sd-cli.exe --diffusion-model ..\models\chroma-unlocked-v40-q8_0.gguf --vae ..\models\ae.sft --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'chroma.cpp'" --cfg-scale 4.0 --sampling-method euler -v --chroma-disable-dit-mask --clip-on-cpu
.\bin\Release\sd.exe --diffusion-model ..\models\chroma-unlocked-v40-q8_0.gguf --vae ..\models\ae.sft --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'chroma.cpp'" --cfg-scale 4.0 --sampling-method euler -v --chroma-disable-dit-mask --clip-on-cpu
```
![](../assets/flux/chroma_v40.png)

View File

@ -1,21 +0,0 @@
# How to Use
## Download weights
- Download Chroma1-Radiance
- safetensors: https://huggingface.co/lodestones/Chroma1-Radiance/tree/main
- gguf: https://huggingface.co/silveroxides/Chroma1-Radiance-GGUF/tree/main
- Download t5xxl
- safetensors: https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
## Examples
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Chroma1-Radiance-v0.4-Q8_0.gguf --t5xxl ..\..\ComfyUI\models\clip\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'chroma radiance cpp'" --cfg-scale 4.0 --sampling-method euler -v
```
<img alt="Chroma1-Radiance" src="../assets/flux/chroma1-radiance.png" />

View File

@ -1,137 +0,0 @@
# Running distilled models: SSD1B, Vega and SDx.x with tiny U-Nets
## Preface
These models feature a reduced U-Net architecture. Unlike standard SDXL models, the SSD-1B and Vega U-Nets contain only one middle block and fewer attention layers in their up- and down-blocks, resulting in significantly smaller file sizes. Using these models can reduce inference time by more than 33%. For more details, refer to Segmind's paper: https://arxiv.org/abs/2401.02677v1.
Similarly, SD1.x- and SD2.x-style models with a tiny U-Net consist of only 6 U-Net blocks, leading to very small files and time savings of up to 50%. For more information, see the paper: https://arxiv.org/pdf/2305.15798.pdf.
## SSD1B
Note that not all of these models follow the standard parameter naming conventions. However, several useful SSD-1B models are available online, such as:
* https://huggingface.co/segmind/SSD-1B/resolve/main/SSD-1B-A1111.safetensors
* https://huggingface.co/hassenhamdi/SSD-1B-fp8_e4m3fn/resolve/main/SSD-1B_fp8_e4m3fn.safetensors
Useful LoRAs are also available:
* https://huggingface.co/seungminh/lora-swarovski-SSD-1B/resolve/main/pytorch_lora_weights.safetensors
* https://huggingface.co/kylielee505/mylcmlorassd/resolve/main/pytorch_lora_weights.safetensors
## Vega
Segmind's Vega model is available online here:
* https://huggingface.co/segmind/Segmind-Vega/resolve/main/segmind-vega.safetensors
VegaRT is an example for an LCM-LoRA:
* https://huggingface.co/segmind/Segmind-VegaRT/resolve/main/pytorch_lora_weights.safetensors
Both files can be used out-of-the-box, unlike the models described in the next sections.
## SD1.x, SD2.x with tiny U-Nets
These models require conversion before use. You will need a Python script provided by the diffusers team, available on GitHub:
* https://raw.githubusercontent.com/huggingface/diffusers/refs/heads/main/scripts/convert_diffusers_to_original_stable_diffusion.py
### SD2.x
NotaAI provides the following model online:
* https://huggingface.co/nota-ai/bk-sdm-v2-tiny
Creating a .safetensors file involves two steps. First, run this short Python script to download the model from Hugging Face:
```python
from diffusers import StableDiffusionPipeline
pipe = StableDiffusionPipeline.from_pretrained("nota-ai/bk-sdm-v2-tiny",cache_dir="./")
```
Second, create the .safetensors file by running:
```bash
python convert_diffusers_to_original_stable_diffusion.py \
--model_path models--nota-ai--bk-sdm-v2-tiny/snapshots/68277af553777858cd47e133f92e4db47321bc74 \
--checkpoint_path bk-sdm-v2-tiny.safetensors --half --use_safetensors
```
This will generate the file **bk-sdm-v2-tiny.safetensors**, which is now ready for use with sd.cpp.
### SD1.x
Several Tiny SD 1.x models are available online, such as:
* https://huggingface.co/segmind/tiny-sd
* https://huggingface.co/segmind/portrait-finetuned
* https://huggingface.co/nota-ai/bk-sdm-tiny
These models also require conversion, partly because some tensors are stored in a non-contiguous manner. To create a usable checkpoint file, follow these simple steps:
##### Download and prepare the model using Python on your computer, for example this way:
```python
import torch
from diffusers import StableDiffusionPipeline
pipe = StableDiffusionPipeline.from_pretrained("segmind/tiny-sd")
unet=pipe.unet
for param in unet.parameters():
param.data = param.data.contiguous() # <- important here
pipe.save_pretrained("segmindtiny-sd", safe_serialization=True)
```
##### Run the conversion script:
```bash
python convert_diffusers_to_original_stable_diffusion.py \
--model_path ./segmindtiny-sd \
--checkpoint_path ./segmind_tiny-sd.ckpt --half
```
The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
##### Another available .ckpt file:
* https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt
To use this file, you must first adjust its non-contiguous tensors:
```python
import torch
ckpt = torch.load("tinySDdistilled.ckpt", map_location=torch.device('cpu'))
for key, value in ckpt['state_dict'].items():
if isinstance(value, torch.Tensor):
ckpt['state_dict'][key] = value.contiguous()
torch.save(ckpt, "tinySDdistilled_fixed.ckpt")
```
### SDXS-512
Another very tiny and **incredibly fast** model is SDXS by IDKiro et al. The authors refer to it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*; for details, read the paper: https://arxiv.org/pdf/2403.16627. Once again, the authors removed additional U-Net blocks, and unlike other SD1 models, SDXS uses an adjusted _AutoEncoderTiny_ instead of the default _AutoEncoderKL_ for the VAE part.
##### 1. Download the diffusers model from Hugging Face using Python:
```python
from diffusers import StableDiffusionPipeline
pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-dreamshaper")
pipe.save_pretrained(save_directory="sdxs")
```
##### 2. Create a safetensors file
```bash
python convert_diffusers_to_original_stable_diffusion.py \
--model_path sdxs --checkpoint_path sdxs.safetensors --half --use_safetensors
```
##### 3. Run the model as follows:
```bash
~/stable-diffusion.cpp/build/bin/sd-cli -m sdxs.safetensors -p "portrait of a lovely cat" \
--cfg-scale 1 --steps 1
```
Both options, `--cfg-scale 1` and `--steps 1`, are mandatory here.

View File

@ -1,39 +1,15 @@
# Docker
## Docker
## Run CLI
```shell
docker run --rm -v /path/to/models:/models -v /path/to/output/:/output ghcr.io/leejet/stable-diffusion.cpp:master [args...]
# For example
# docker run --rm -v ./models:/models -v ./build:/output ghcr.io/leejet/stable-diffusion.cpp:master -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
```
## Run server
```shell
docker run --rm --init -v /path/to/models:/models -v /path/to/output/:/output -p "1234:1234" --entrypoint "/sd-server" ghcr.io/leejet/stable-diffusion.cpp:master [args...]
# For example
# docker run --rm --init -v ./models:/models -v ./build:/output -p "1234:1234" --entrypoint "/sd-server" ghcr.io/leejet/stable-diffusion.cpp:master -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
```
## Building using Docker
### Building using Docker
```shell
docker build -t sd .
```
## Building variants using Docker
Vulkan:
### Run
```shell
docker build -f Dockerfile.vulkan -t sd .
```
## Run locally built image's CLI
```shell
docker run --rm -v /path/to/models:/models -v /path/to/output/:/output sd [args...]
docker run -v /path/to/models:/models -v /path/to/output/:/output sd [args...]
# For example
# docker run --rm -v ./models:/models -v ./build:/output sd -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
```
# docker run -v ./models:/models -v ./build:/output sd -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
```

View File

@ -1,9 +1,9 @@
## Using ESRGAN to upscale results
You can use ESRGAN—such as the model [RealESRGAN_x4plus_anime_6B.pth](https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth)—to upscale the generated images and improve their overall resolution and clarity.
You can use ESRGAN to upscale the generated images. At the moment, only the [RealESRGAN_x4plus_anime_6B.pth](https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth) model is supported. Support for more models of this architecture will be added soon.
- Specify the model path using the `--upscale-model PATH` parameter. Example:
```bash
sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --upscale-model ../models/RealESRGAN_x4plus_anime_6B.pth
sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --upscale-model ../models/RealESRGAN_x4plus_anime_6B.pth
```

View File

@ -15,9 +15,9 @@ You can run Flux using stable-diffusion.cpp with a GPU that has 6GB or even 4GB
You can download the preconverted gguf weights from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell](https://huggingface.co/leejet/FLUX.1-schnell-gguf); this way you don't have to do the conversion yourself.
For example:
Using fp16 will lead to overflow, but ggml's support for bf16 is not yet fully developed. Therefore, we need to convert flux to gguf format here, which also saves VRAM. For example:
```
.\bin\Release\sd-cli.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-dev.sft -o ..\models\flux1-dev-q8_0.gguf -v --type q8_0
.\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-dev.sft -o ..\models\flux1-dev-q8_0.gguf -v --type q8_0
```
## Run
@ -28,7 +28,7 @@ For example:
For example:
```
.\bin\Release\sd-cli.exe --diffusion-model ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
.\bin\Release\sd.exe --diffusion-model ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
```
Using formats of different precisions will yield results of varying quality.
@ -44,7 +44,7 @@ Using formats of different precisions will yield results of varying quality.
```
.\bin\Release\sd-cli.exe --diffusion-model ..\models\flux1-schnell-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --steps 4 --clip-on-cpu
.\bin\Release\sd.exe --diffusion-model ..\models\flux1-schnell-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --steps 4 --clip-on-cpu
```
| q8_0 |
@ -60,7 +60,7 @@ Since many flux LoRA training libraries have used various LoRA naming formats, i
- LoRA model from https://huggingface.co/XLabs-AI/flux-lora-collection/tree/main (using comfy converted version!!!)
```
.\bin\Release\sd-cli.exe --diffusion-model ..\models\flux1-dev-q8_0.gguf --vae ...\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'<lora:realism_lora_comfy_converted:1>" --cfg-scale 1.0 --sampling-method euler -v --lora-model-dir ../models --clip-on-cpu
.\bin\Release\sd.exe --diffusion-model ..\models\flux1-dev-q8_0.gguf --vae ...\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'<lora:realism_lora_comfy_converted:1>" --cfg-scale 1.0 --sampling-method euler -v --lora-model-dir ../models --clip-on-cpu
```
![output](../assets/flux/flux1-dev-q8_0%20with%20lora.png)

View File

@ -1,92 +0,0 @@
# How to Use
## Flux.2-dev
### Download weights
- Download FLUX.2-dev
- gguf: https://huggingface.co/city96/FLUX.2-dev-gguf/tree/main
- Download vae
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
- Download Mistral-Small-3.2-24B-Instruct-2506-GGUF
- gguf: https://huggingface.co/unsloth/Mistral-Small-3.2-24B-Instruct-2506-GGUF/tree/main
### Examples
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux2-dev-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Mistral-Small-3.2-24B-Instruct-2506-Q4_K_M.gguf -r .\kontext_input.png -p "change 'flux.cpp' to 'flux2-dev.cpp'" --cfg-scale 1.0 --sampling-method euler -v --diffusion-fa --offload-to-cpu
```
<img alt="flux2 example" src="../assets/flux2/example.png" />
## Flux.2 klein 4B / Flux.2 klein base 4B
### Download weights
- Download FLUX.2-klein-4B
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-klein-4B
- gguf: https://huggingface.co/leejet/FLUX.2-klein-4B-GGUF/tree/main
- Download FLUX.2-klein-base-4B
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-klein-base-4B
- gguf: https://huggingface.co/leejet/FLUX.2-klein-base-4B-GGUF/tree/main
- Download vae
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
- Download Qwen3 4b
- safetensors: https://huggingface.co/Comfy-Org/flux2-klein-4B/tree/main/split_files/text_encoders
- gguf: https://huggingface.co/unsloth/Qwen3-4B-GGUF/tree/main
### Examples
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-4b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_4b.safetensors -p "a lovely cat" --cfg-scale 1.0 --steps 4 -v --offload-to-cpu --diffusion-fa
```
<img alt="flux2-klein-4b" src="../assets/flux2/flux2-klein-4b.png" />
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-4b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_4b.safetensors -r .\kontext_input.png -p "change 'flux.cpp' to 'klein.cpp'" --cfg-scale 1.0 --sampling-method euler -v --diffusion-fa --offload-to-cpu --steps 4
```
<img alt="flux2-klein-4b-edit" src="../assets/flux2/flux2-klein-4b-edit.png" />
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-base-4b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_4b.safetensors -p "a lovely cat" --cfg-scale 4.0 --steps 20 -v --offload-to-cpu --diffusion-fa
```
<img alt="flux2-klein-base-4b" src="../assets/flux2/flux2-klein-base-4b.png" />
## Flux.2 klein 9B / Flux.2 klein base 9B
### Download weights
- Download FLUX.2-klein-9B
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-klein-9B
- gguf: https://huggingface.co/leejet/FLUX.2-klein-9B-GGUF/tree/main
- Download FLUX.2-klein-base-9B
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-klein-base-9B
- gguf: https://huggingface.co/leejet/FLUX.2-klein-base-9B-GGUF/tree/main
- Download vae
- safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
- Download Qwen3 8B
- safetensors: https://huggingface.co/Comfy-Org/flux2-klein-9B/tree/main/split_files/text_encoders
- gguf: https://huggingface.co/unsloth/Qwen3-8B-GGUF/tree/main
### Examples
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-9b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_8b.safetensors -p "a lovely cat" --cfg-scale 1.0 --steps 4 -v --offload-to-cpu --diffusion-fa
```
<img alt="flux2-klein-9b" src="../assets/flux2/flux2-klein-9b.png" />
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-9b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_8b.safetensors -r .\kontext_input.png -p "change 'flux.cpp' to 'klein.cpp'" --cfg-scale 1.0 --sampling-method euler -v --diffusion-fa --offload-to-cpu --steps 4
```
<img alt="flux2-klein-9b-edit" src="../assets/flux2/flux2-klein-9b-edit.png" />
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\flux-2-klein-base-9b.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_3_8b.safetensors -p "a lovely cat" --cfg-scale 4.0 --steps 20 -v --offload-to-cpu --diffusion-fa
```
<img alt="flux2-klein-base-9b" src="../assets/flux2/flux2-klein-base-9b.png" />

View File

@ -82,4 +82,4 @@ cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_H
cmake --build . --config Release
```
If everything went OK, `build\bin\sd-cli.exe` file should appear.
If everything went OK, `build\bin\sd.exe` file should appear.

View File

@ -16,7 +16,7 @@ You can run Kontext using stable-diffusion.cpp with a GPU that has 6GB or even 4
You can download the preconverted gguf weights from [FLUX.1-Kontext-dev-GGUF](https://huggingface.co/QuantStack/FLUX.1-Kontext-dev-GGUF); this way you don't have to do the conversion yourself.
```
.\bin\Release\sd-cli.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-kontext-dev.safetensors -o ..\models\flux1-kontext-dev-q8_0.gguf -v --type q8_0
.\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-kontext-dev.safetensors -o ..\models\flux1-kontext-dev-q8_0.gguf -v --type q8_0
```
## Run
@ -27,7 +27,7 @@ You can download the preconverted gguf weights from [FLUX.1-Kontext-dev-GGUF](ht
For example:
```
.\bin\Release\sd-cli.exe -r .\flux1-dev-q8_0.png --diffusion-model ..\models\flux1-kontext-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "change 'flux.cpp' to 'kontext.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
.\bin\Release\sd.exe -r .\flux1-dev-q8_0.png --diffusion-model ..\models\flux1-kontext-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "change 'flux.cpp' to 'kontext.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
```

View File

@ -7,7 +7,7 @@
Here's a simple example:
```
./bin/sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:lcm-lora-sdv1-5:1>" --steps 4 --lora-model-dir ../models -v --cfg-scale 1
./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:lcm-lora-sdv1-5:1>" --steps 4 --lora-model-dir ../models -v --cfg-scale 1
```
| without LCM-LoRA (--cfg-scale 7) | with LCM-LoRA (--cfg-scale 1) |

View File

@ -7,20 +7,43 @@
Here's a simple example:
```
./bin/sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:marblesh:1>" --lora-model-dir ../models
./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:marblesh:1>" --lora-model-dir ../models
```
`../models/marblesh.safetensors` or `../models/marblesh.ckpt` will be applied to the model
# Lora Apply Mode
# Support matrix
There are two ways to apply LoRA: **immediately** and **at_runtime**. You can select the mode using the `--lora-apply-mode` parameter.
> CUDA `get_rows` support is defined here:
> [ggml-org/ggml/src/ggml-cuda/getrows.cu#L156](https://github.com/ggml-org/ggml/blob/7dee1d6a1e7611f238d09be96738388da97c88ed/src/ggml-cuda/getrows.cu#L156)
> Currently only the basic types + Q4/Q5/Q8 are implemented. K-quants are **not** supported.
By default, the mode is selected automatically:
* If the model weights contain any quantized parameters, the **at_runtime** mode is used;
* Otherwise, the **immediately** mode is used.
The **immediately** mode may have precision and compatibility issues with quantized parameters, but it usually offers faster inference speed and, in some cases, lower memory usage.
In contrast, the **at_runtime** mode provides better compatibility and higher precision, but inference may be slower and memory usage may be higher in some cases.
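As a minimal, self-contained C++ sketch of the difference between the two modes (naive dense matrices for illustration only; this is not the actual sd.cpp implementation):
```cpp
#include <cstddef>
#include <vector>

using Matrix = std::vector<std::vector<float>>;  // row-major dense matrix

// Naive dense matmul, just enough to illustrate the two modes.
Matrix matmul(const Matrix& X, const Matrix& Y) {
    size_t n = X.size();
    size_t k = Y.size();
    size_t m = Y.empty() ? 0 : Y[0].size();
    Matrix Z(n, std::vector<float>(m, 0.0f));
    for (size_t i = 0; i < n; i++)
        for (size_t p = 0; p < k; p++)
            for (size_t j = 0; j < m; j++)
                Z[i][j] += X[i][p] * Y[p][j];
    return Z;
}

// "immediately": merge the low-rank delta into the weights once, up front.
// If W is quantized it must be dequantized first, which can cost precision.
void apply_lora_immediately(Matrix& W, const Matrix& A, const Matrix& B, float scale) {
    Matrix delta = matmul(B, A);             // [out, in]
    for (size_t i = 0; i < W.size(); i++)
        for (size_t j = 0; j < W[i].size(); j++)
            W[i][j] += scale * delta[i][j];  // inference then uses plain W
}

// "at_runtime": leave W untouched and add the low-rank path on every forward
// pass: y = W x + scale * B (A x). Works with quantized W at the cost of a
// little extra compute per step.
Matrix forward_with_lora(const Matrix& W, const Matrix& A, const Matrix& B,
                         float scale, const Matrix& x) {  // x: column matrix [in, 1]
    Matrix y  = matmul(W, x);
    Matrix bx = matmul(B, matmul(A, x));
    for (size_t i = 0; i < y.size(); i++)
        y[i][0] += scale * bx[i][0];
    return y;
}
```
This also illustrates why **at_runtime** tolerates quantized weights: the base matmul can stay in its quantized kernel while only the small low-rank path runs in float.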
NOTE: The other backends may have different support.
| Quant / Type | CUDA | Vulkan |
|--------------|------|--------|
| F32 | ✔️ | ✔️ |
| F16 | ✔️ | ✔️ |
| BF16 | ✔️ | ✔️ |
| I32 | ✔️ | ❌ |
| Q4_0 | ✔️ | ✔️ |
| Q4_1 | ✔️ | ✔️ |
| Q5_0 | ✔️ | ✔️ |
| Q5_1 | ✔️ | ✔️ |
| Q8_0 | ✔️ | ✔️ |
| Q2_K | ❌ | ❌ |
| Q3_K | ❌ | ❌ |
| Q4_K | ❌ | ❌ |
| Q5_K | ❌ | ❌ |
| Q6_K | ❌ | ❌ |
| Q8_K | ❌ | ❌ |
| IQ1_S | ❌ | ✔️ |
| IQ1_M | ❌ | ✔️ |
| IQ2_XXS | ❌ | ✔️ |
| IQ2_XS | ❌ | ✔️ |
| IQ2_S | ❌ | ✔️ |
| IQ3_XXS | ❌ | ✔️ |
| IQ3_S | ❌ | ✔️ |
| IQ4_XS | ❌ | ✔️ |
| IQ4_NL | ❌ | ✔️ |
| MXFP4 | ❌ | ✔️ |

View File

@ -1,19 +0,0 @@
# How to Use
## Download weights
- Download Ovis-Image-7B
- safetensors: https://huggingface.co/Comfy-Org/Ovis-Image/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/leejet/Ovis-Image-7B-GGUF
- Download vae
- safetensors: https://huggingface.co/black-forest-labs/FLUX.1-schnell/tree/main
- Download Ovis 2.5
- safetensors: https://huggingface.co/Comfy-Org/Ovis-Image/tree/main/split_files/text_encoders
## Examples
```
.\bin\Release\sd-cli.exe --diffusion-model ovis_image-Q4_0.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\ovis_2.5.safetensors -p "a lovely cat" --cfg-scale 5.0 -v --offload-to-cpu --diffusion-fa
```
<img alt="ovis image example" src="../assets/ovis_image/example.png" />

View File

@ -1,26 +0,0 @@
## Use Flash Attention to save memory and improve speed.
Enabling flash attention for the diffusion model reduces memory usage; the amount saved varies by model and resolution.
e.g.:
- flux 768x768 ~600mb
- SD2 768x768 ~1400mb
For most backends it slows things down, but for CUDA it generally speeds things up as well.
At the moment, it is only supported for some models and some backends (such as CPU, CUDA/ROCm, and Metal).
Run by adding `--diffusion-fa` to the arguments and watch for:
```
[INFO ] stable-diffusion.cpp:312 - Using flash attention in the diffusion model
```
and the compute buffer shrink in the debug log:
```
[DEBUG] ggml_extend.hpp:1004 - flux compute buffer size: 650.00 MB(VRAM)
```
## Offload weights to the CPU to save VRAM without reducing generation speed.
Using `--offload-to-cpu` allows you to offload weights to the CPU, saving VRAM without reducing generation speed.
## Use quantization to reduce memory usage.
[quantization](./quantization_and_gguf.md)

View File

@ -27,7 +27,7 @@ If on low memory GPUs (<= 8GB), recommend running with ```--vae-on-cpu``` option
Example:
```bash
bin/sd-cli -m ../models/sdxlUnstableDiffusers_v11.safetensors --vae ../models/sdxl_vae.safetensors --photo-maker ../models/photomaker-v1.safetensors --pm-id-images-dir ../assets/photomaker_examples/scarletthead_woman -p "a girl img, retro futurism, retro game art style but extremely beautiful, intricate details, masterpiece, best quality, space-themed, cosmic, celestial, stars, galaxies, nebulas, planets, science fiction, highly detailed" -n "realistic, photo-realistic, worst quality, greyscale, bad anatomy, bad hands, error, text" --cfg-scale 5.0 --sampling-method euler -H 1024 -W 1024 --pm-style-strength 10 --vae-on-cpu --steps 50
bin/sd -m ../models/sdxlUnstableDiffusers_v11.safetensors --vae ../models/sdxl_vae.safetensors --photo-maker ../models/photomaker-v1.safetensors --pm-id-images-dir ../assets/photomaker_examples/scarletthead_woman -p "a girl img, retro futurism, retro game art style but extremely beautiful, intricate details, masterpiece, best quality, space-themed, cosmic, celestial, stars, galaxies, nebulas, planets, science fiction, highly detailed" -n "realistic, photo-realistic, worst quality, greyscale, bad anatomy, bad hands, error, text" --cfg-scale 5.0 --sampling-method euler -H 1024 -W 1024 --pm-style-strength 10 --vae-on-cpu --steps 50
```
## PhotoMaker Version 2
@ -40,7 +40,7 @@ Running PMV2 is now a two-step process:
```
python face_detect.py input_image_dir
```
An ```id_embeds.bin``` file will be generated in ```input_images_dir```
An ```id_embeds.safetensors``` file will be generated in ```input_images_dir```
**Note: this step is only needed to run once; the same ```id_embeds``` can be reused**
@ -48,6 +48,6 @@ An ```id_embeds.bin``` file will be generated in ```input_images_dir```
You can download ```photomaker-v2.safetensors``` from [here](https://huggingface.co/bssrdf/PhotoMakerV2)
- All the command line parameters from Version 1 remain the same for Version 2 plus one extra pointing to a valid ```id_embeds``` file: --pm-id-embed-path [path_to__id_embeds.bin]
- All the command line parameters from Version 1 remain the same for Version 2

View File

@ -23,5 +23,5 @@ You can also convert weights in the formats `ckpt/safetensors/diffusers` to gguf
For example:
```sh
./bin/sd-cli -M convert -m ../models/v1-5-pruned-emaonly.safetensors -o ../models/v1-5-pruned-emaonly.q8_0.gguf -v --type q8_0
./bin/sd -M convert -m ../models/v1-5-pruned-emaonly.safetensors -o ../models/v1-5-pruned-emaonly.q8_0.gguf -v --type q8_0
```

View File

@ -14,7 +14,7 @@
## Examples
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\qwen-image-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf -p '一个穿着"QWEN"标志的T恤的中国美女正拿着黑色的马克笔面相镜头微笑。她身后的玻璃板上手写体写着 “一、Qwen-Image的技术路线 探索视觉生成基础模型的极限开创理解与生成一体化的未来。二、Qwen-Image的模型特色1、复杂文字渲染。支持中英渲染、自动布局 2、精准图像编辑。支持文字编辑、物体增减、风格变换。三、Qwen-Image的未来愿景赋能专业内容创作、助力生成式AI发展。”' --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu -H 1024 -W 1024 --diffusion-fa --flow-shift 3
.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\qwen-image-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --qwen2vl ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf -p '一个穿着"QWEN"标志的T恤的中国美女正拿着黑色的马克笔面相镜头微笑。她身后的玻璃板上手写体写着 “一、Qwen-Image的技术路线 探索视觉生成基础模型的极限开创理解与生成一体化的未来。二、Qwen-Image的模型特色1、复杂文字渲染。支持中英渲染、自动布局 2、精准图像编辑。支持文字编辑、物体增减、风格变换。三、Qwen-Image的未来愿景赋能专业内容创作、助力生成式AI发展。”' --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu -H 1024 -W 1024 --diffusion-fa --flow-shift 3
```
<img alt="qwen example" src="../assets/qwen/example.png" />

View File

@ -9,9 +9,6 @@
- Qwen Image Edit 2509
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image-Edit_ComfyUI/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/QuantStack/Qwen-Image-Edit-2509-GGUF/tree/main
- Qwen Image Edit 2511
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image-Edit_ComfyUI/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/tree/main
- Download vae
- safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/vae
- Download qwen_2.5_vl 7b
@ -23,7 +20,7 @@
### Qwen Image Edit
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen_Image_Edit-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'" --seed 1118877715456453
.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen_Image_Edit-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --qwen2vl ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'" --seed 1118877715456453
```
<img alt="qwen_image_edit" src="../assets/qwen/qwen_image_edit.png" />
@ -32,17 +29,7 @@
### Qwen Image Edit 2509
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen-Image-Edit-2509-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf --llm_vision ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct.mmproj-Q8_0.gguf --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'Qwen Image Edit 2509'"
.\bin\Release\sd.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\Qwen-Image-Edit-2509-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --qwen2vl ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf --qwen2vl_vision ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct.mmproj-Q8_0.gguf --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'Qwen Image Edit 2509'"
```
<img alt="qwen_image_edit_2509" src="../assets/qwen/qwen_image_edit_2509.png" />
### Qwen Image Edit 2511
To use the new Qwen Image Edit 2511 mode, the `--qwen-image-zero-cond-t` flag must be enabled; otherwise, image editing quality will degrade significantly.
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\qwen-image-edit-2511-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'" --qwen-image-zero-cond-t
```
<img alt="qwen_image_edit_2509" src="../assets/qwen/qwen_image_edit_2511.png" />
<img alt="qwen_image_edit_2509" src="../assets/qwen/qwen_image_edit_2509.png" />

View File

@ -1,37 +0,0 @@
## Download weights
- Download original weights (.ckpt or .safetensors). For example:
- Stable Diffusion v1.4 from https://huggingface.co/CompVis/stable-diffusion-v-1-4-original
- Stable Diffusion v1.5 from https://huggingface.co/runwayml/stable-diffusion-v1-5
- Stable Diffusion v2.1 from https://huggingface.co/stabilityai/stable-diffusion-2-1
- Stable Diffusion 3 2B from https://huggingface.co/stabilityai/stable-diffusion-3-medium
### txt2img example
```sh
./bin/sd-cli -m ../models/sd-v1-4.ckpt -p "a lovely cat"
# ./bin/sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
# ./bin/sd-cli -m ../models/sd_xl_base_1.0.safetensors --vae ../models/sdxl_vae-fp16-fix.safetensors -H 1024 -W 1024 -p "a lovely cat" -v
# ./bin/sd-cli -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable Diffusion CPP\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
# ./bin/sd-cli --diffusion-model ../models/flux1-dev-q3_k.gguf --vae ../models/ae.sft --clip_l ../models/clip_l.safetensors --t5xxl ../models/t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
# ./bin/sd-cli -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
```
Using formats of different precisions will yield results of varying quality.
| f32 | f16 |q8_0 |q5_0 |q5_1 |q4_0 |q4_1 |
| ---- |---- |---- |---- |---- |---- |---- |
| ![](../assets/f32.png) |![](../assets/f16.png) |![](../assets/q8_0.png) |![](../assets/q5_0.png) |![](../assets/q5_1.png) |![](../assets/q4_0.png) |![](../assets/q4_1.png) |
### img2img example
- `./output.png` is the image generated from the above txt2img pipeline
```
./bin/sd-cli -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
```
<p align="center">
<img src="../assets/img2img_output.png" width="256x">
</p>

View File

@ -14,7 +14,7 @@
For example:
```
.\bin\Release\sd-cli.exe -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
.\bin\Release\sd.exe -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
```
![](../assets/sd3.5_large.png)

View File

@ -7,33 +7,11 @@ You can use TAESD to accelerate the decoding of latent images by following these
Or curl
```bash
curl -L -O https://huggingface.co/madebyollin/taesd/resolve/main/diffusion_pytorch_model.safetensors
curl -L -O https://huggingface.co/madebyollin/taesd/blob/main/diffusion_pytorch_model.safetensors
```
- Specify the model path using the `--taesd PATH` parameter. Example:
```bash
sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --taesd ../models/diffusion_pytorch_model.safetensors
```
### Qwen-Image and wan (TAEHV)
sd.cpp also supports [TAEHV](https://github.com/madebyollin/taehv) (#937), which can be used for Qwen-Image and wan.
- For **Qwen-Image, wan2.1, and wan2.2-A14B**, download the wan2.1 tae [safetensors weights](https://github.com/madebyollin/taehv/blob/main/safetensors/taew2_1.safetensors)
Or curl
```bash
curl -L -O https://github.com/madebyollin/taehv/raw/refs/heads/main/safetensors/taew2_1.safetensors
```
- For **wan2.2-TI2V-5B**, use the wan2.2 tae [safetensors weights](https://github.com/madebyollin/taehv/blob/main/safetensors/taew2_2.safetensors)
Or curl
```bash
curl -L -O https://github.com/madebyollin/taehv/raw/refs/heads/main/safetensors/taew2_2.safetensors
```
Then simply replace `--vae xxx.safetensors` with `--tae xxx.safetensors` in the commands. If it still runs out of VRAM, add `--vae-conv-direct` to your command, though it might be slower.
sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --taesd ../models/diffusion_pytorch_model.safetensors
```

View File

@ -39,9 +39,6 @@
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/vae/wan_2.1_vae.safetensors
- wan_2.2_vae (for Wan2.2 TI2V 5B only)
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/blob/main/split_files/vae/wan2.2_vae.safetensors
> Wan models vae requires really much VRAM! If you do not have enough VRAM, please try tae instead, though the results may be poorer. For tae usage, please refer to [taesd](taesd.md)
- Download umt5_xxl
- safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/text_encoders/umt5_xxl_fp16.safetensors
- gguf: https://huggingface.co/city96/umt5-xxl-encoder-gguf/tree/main
@ -55,7 +52,7 @@
### Wan2.1 T2V 1.3B
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1_t2v_1.3B_fp16.safetensors --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --video-frames 33 --flow-shift 3.0
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1_t2v_1.3B_fp16.safetensors --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --video-frames 33 --flow-shift 3.0
```
<video src=../assets/wan/Wan2.1_1.3B_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
@ -63,7 +60,7 @@
### Wan2.1 T2V 14B
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-t2v-14b-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --video-frames 33 --flow-shift 3.0
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-t2v-14b-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --video-frames 33 --flow-shift 3.0
```
<video src=../assets/wan/Wan2.1_14B_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
@ -73,7 +70,7 @@
### Wan2.1 I2V 14B
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-i2v-14b-480p-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf --clip_vision ..\..\ComfyUI\models\clip_vision\clip_vision_h.safetensors -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 480 -H 832 --diffusion-fa --video-frames 33 --offload-to-cpu -i ..\assets\cat_with_sd_cpp_42.png --flow-shift 3.0
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-i2v-14b-480p-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf --clip_vision ..\..\ComfyUI\models\clip_vision\clip_vision_h.safetensors -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 480 -H 832 --diffusion-fa --video-frames 33 --offload-to-cpu -i ..\assets\cat_with_sd_cpp_42.png --flow-shift 3.0
```
<video src=../assets/wan/Wan2.1_14B_i2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
@ -81,7 +78,7 @@
### Wan2.2 T2V A14B
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --video-frames 33 --flow-shift 3.0
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --video-frames 33 --flow-shift 3.0
```
<video src=../assets/wan/Wan2.2_14B_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
@ -89,7 +86,7 @@
### Wan2.2 I2V A14B
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --video-frames 33 --offload-to-cpu -i ..\assets\cat_with_sd_cpp_42.png --flow-shift 3.0
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --video-frames 33 --offload-to-cpu -i ..\assets\cat_with_sd_cpp_42.png --flow-shift 3.0
```
<video src=../assets/wan/Wan2.2_14B_i2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
@ -97,7 +94,7 @@
### Wan2.2 T2V A14B T2I
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --flow-shift 3.0
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --flow-shift 3.0
```
<img width="832" height="480" alt="Wan2 2_14B_t2i" src="../assets/wan/Wan2.2_14B_t2i.png" />
@ -105,7 +102,7 @@
### Wan2.2 T2V 14B with Lora
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat<lora:wan2.2_t2v_lightx2v_4steps_lora_v1.1_low_noise:1><lora:|high_noise|wan2.2_t2v_lightx2v_4steps_lora_v1.1_high_noise:1>" --cfg-scale 3.5 --sampling-method euler --steps 4 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 4 -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --lora-model-dir ..\..\ComfyUI\models\loras --video-frames 33 --flow-shift 3.0
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat<lora:wan2.2_t2v_lightx2v_4steps_lora_v1.1_low_noise:1><lora:|high_noise|wan2.2_t2v_lightx2v_4steps_lora_v1.1_high_noise:1>" --cfg-scale 3.5 --sampling-method euler --steps 4 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 4 -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --lora-model-dir ..\..\ComfyUI\models\loras --video-frames 33 --flow-shift 3.0
```
<video src=../assets/wan/Wan2.2_14B_t2v_lora.mp4 controls="controls" muted="muted" type="video/mp4"></video>
@ -117,7 +114,7 @@
#### T2V
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.2_ti2v_5B_fp16.safetensors --vae ..\..\ComfyUI\models\vae\wan2.2_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 480 -H 832 --diffusion-fa --offload-to-cpu --video-frames 33 --flow-shift 3.0
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.2_ti2v_5B_fp16.safetensors --vae ..\..\ComfyUI\models\vae\wan2.2_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 480 -H 832 --diffusion-fa --offload-to-cpu --video-frames 33 --flow-shift 3.0
```
<video src=../assets/wan/Wan2.2_5B_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
@ -125,7 +122,7 @@
#### I2V
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.2_ti2v_5B_fp16.safetensors --vae ..\..\ComfyUI\models\vae\wan2.2_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 480 -H 832 --diffusion-fa --offload-to-cpu --video-frames 33 -i ..\assets\cat_with_sd_cpp_42.png --flow-shift 3.0
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.2_ti2v_5B_fp16.safetensors --vae ..\..\ComfyUI\models\vae\wan2.2_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 480 -H 832 --diffusion-fa --offload-to-cpu --video-frames 33 -i ..\assets\cat_with_sd_cpp_42.png --flow-shift 3.0
```
<video src=../assets/wan/Wan2.2_5B_i2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
@ -133,7 +130,7 @@
### Wan2.1 FLF2V 14B
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-flf2v-14b-720p-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf --clip_vision ..\..\ComfyUI\models\clip_vision\clip_vision_h.safetensors -p "glass flower blossom" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 480 -H 832 --diffusion-fa --video-frames 33 --offload-to-cpu --init-img ..\..\ComfyUI\input\start_image.png --end-img ..\..\ComfyUI\input\end_image.png --flow-shift 3.0
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-flf2v-14b-720p-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf --clip_vision ..\..\ComfyUI\models\clip_vision\clip_vision_h.safetensors -p "glass flower blossom" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 480 -H 832 --diffusion-fa --video-frames 33 --offload-to-cpu --init-img ..\..\ComfyUI\input\start_image.png --end-img ..\..\ComfyUI\input\end_image.png --flow-shift 3.0
```
@ -142,7 +139,7 @@
### Wan2.2 FLF2V 14B
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -p "glass flower blossom" -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 480 -H 832 --diffusion-fa --video-frames 33 --offload-to-cpu --init-img ..\..\ComfyUI\input\start_image.png --end-img ..\..\ComfyUI\input\end_image.png --flow-shift 3.0
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -p "glass flower blossom" -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走" -W 480 -H 832 --diffusion-fa --video-frames 33 --offload-to-cpu --init-img ..\..\ComfyUI\input\start_image.png --end-img ..\..\ComfyUI\input\end_image.png --flow-shift 3.0
```
<video src=../assets/wan/Wan2.2_14B_flf2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
@ -152,7 +149,7 @@
#### T2V
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --video-frames 1 --offload-to-cpu
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --video-frames 1 --offload-to-cpu
```
<video src=../assets/wan/Wan2.1_1.3B_vace_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
@ -161,7 +158,7 @@
#### R2V
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa -i ..\assets\cat_with_sd_cpp_42.png --video-frames 33 --offload-to-cpu
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa -i ..\assets\cat_with_sd_cpp_42.png --video-frames 33 --offload-to-cpu
```
<video src=../assets/wan/Wan2.1_1.3B_vace_r2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
@ -172,7 +169,7 @@
```
mkdir post+depth
ffmpeg -i ..\..\ComfyUI\input\post+depth.mp4 -qscale:v 1 -vf fps=8 post+depth\frame_%04d.jpg
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "The girl is dancing in a sea of flowers, slowly moving her hands. There is a close-up shot of her upper body. The character is surrounded by other transparent glass flowers in the style of Nicoletta Ceccoli, creating a beautiful, surreal, and emotionally expressive movie scene with a white, transparent feel and a dreamy atmosphere." --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 480 -H 832 --diffusion-fa -i ..\..\ComfyUI\input\dance_girl.jpg --control-video ./post+depth --video-frames 33 --offload-to-cpu
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "The girl is dancing in a sea of flowers, slowly moving her hands. There is a close-up shot of her upper body. The character is surrounded by other transparent glass flowers in the style of Nicoletta Ceccoli, creating a beautiful, surreal, and emotionally expressive movie scene with a white, transparent feel and a dreamy atmosphere." --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 480 -H 832 --diffusion-fa -i ..\..\ComfyUI\input\dance_girl.jpg --control-video ./post+depth --video-frames 33 --offload-to-cpu
```
<video src=../assets/wan/Wan2.1_1.3B_vace_v2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
@ -182,7 +179,7 @@ ffmpeg -i ..\..\ComfyUI\input\post+depth.mp4 -qscale:v 1 -vf fps=8 post+depth\fr
#### T2V
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --video-frames 33 --offload-to-cpu
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa --video-frames 33 --offload-to-cpu
```
<video src=../assets/wan/Wan2.1_14B_vace_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
@ -191,7 +188,7 @@ ffmpeg -i ..\..\ComfyUI\input\post+depth.mp4 -qscale:v 1 -vf fps=8 post+depth\fr
#### R2V
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa -i ..\assets\cat_with_sd_cpp_42.png --video-frames 33 --offload-to-cpu
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 832 -H 480 --diffusion-fa -i ..\assets\cat_with_sd_cpp_42.png --video-frames 33 --offload-to-cpu
```
<video src=../assets/wan/Wan2.1_14B_vace_r2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
@ -201,7 +198,7 @@ ffmpeg -i ..\..\ComfyUI\input\post+depth.mp4 -qscale:v 1 -vf fps=8 post+depth\fr
#### V2V
```
.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "The girl is dancing in a sea of flowers, slowly moving her hands. There is a close-up shot of her upper body. The character is surrounded by other transparent glass flowers in the style of Nicoletta Ceccoli, creating a beautiful, surreal, and emotionally expressive movie scene with a white, transparent feel and a dreamy atmosphere." --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 480 -H 832 --diffusion-fa -i ..\..\ComfyUI\input\dance_girl.jpg --control-video ./post+depth --video-frames 33 --offload-to-cpu
.\bin\Release\sd.exe -M vid_gen --diffusion-model ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf -p "The girl is dancing in a sea of flowers, slowly moving her hands. There is a close-up shot of her upper body. The character is surrounded by other transparent glass flowers in the style of Nicoletta Ceccoli, creating a beautiful, surreal, and emotionally expressive movie scene with a white, transparent feel and a dreamy atmosphere." --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部 畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" -W 480 -H 832 --diffusion-fa -i ..\..\ComfyUI\input\dance_girl.jpg --control-video ./post+depth --video-frames 33 --offload-to-cpu
```
<video src=../assets/wan/Wan2.1_14B_vace_v2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>

View File

@ -1,41 +0,0 @@
# How to Use
You can run Z-Image with stable-diffusion.cpp on GPUs with 4GB of VRAM — or even less.
## Download weights
- Download Z-Image-Turbo
- safetensors: https://huggingface.co/Comfy-Org/z_image_turbo/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/leejet/Z-Image-Turbo-GGUF/tree/main
- Download Z-Image
- safetensors: https://huggingface.co/Comfy-Org/z_image/tree/main/split_files/diffusion_models
- gguf: https://huggingface.co/unsloth/Z-Image-GGUF/tree/main
- Download vae
- safetensors: https://huggingface.co/black-forest-labs/FLUX.1-schnell/tree/main
- Download Qwen3 4b
- safetensors: https://huggingface.co/Comfy-Org/z_image_turbo/tree/main/split_files/text_encoders
- gguf: https://huggingface.co/unsloth/Qwen3-4B-Instruct-2507-GGUF/tree/main
## Examples
### Z-Image-Turbo
```
.\bin\Release\sd-cli.exe --diffusion-model z_image_turbo-Q3_K.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\Qwen3-4B-Instruct-2507-Q4_K_M.gguf -p "A cinematic, melancholic photograph of a solitary hooded figure walking through a sprawling, rain-slicked metropolis at night. The city lights are a chaotic blur of neon orange and cool blue, reflecting on the wet asphalt. The scene evokes a sense of being a single component in a vast machine. Superimposed over the image in a sleek, modern, slightly glitched font is the philosophical quote: 'THE CITY IS A CIRCUIT BOARD, AND I AM A BROKEN TRANSISTOR.' -- moody, atmospheric, profound, dark academic" --cfg-scale 1.0 -v --offload-to-cpu --diffusion-fa -H 1024 -W 512
```
<img width="256" alt="z-image example" src="../assets/z_image/q3_K.png" />
### Z-Image-Base
```
.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\z_image_bf16.safetensors --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\qwen_3_4b.safetensors -p "A cinematic, melancholic photograph of a solitary hooded figure walking through a sprawling, rain-slicked metropolis at night. The city lights are a chaotic blur of neon orange and cool blue, reflecting on the wet asphalt. The scene evokes a sense of being a single component in a vast machine. Superimposed over the image in a sleek, modern, slightly glitched font is the philosophical quote: 'THE CITY IS A CIRCUIT BOARD, AND I AM A BROKEN TRANSISTOR.' -- moody, atmospheric, profound, dark academic" --cfg-scale 5.0 -v --offload-to-cpu --diffusion-fa -H 1024 -W 512
```
<img width="256" alt="z-image example" src="../assets/z_image/base_bf16.png" />
## Comparison of Different Quantization Types
| bf16 | q8_0 | q6_K | q5_0 | q4_K | q4_0 | q3_K | q2_K |
|---|---|---|---|---|---|---|---|
| <img width="256" alt="bf16" src="../assets/z_image/bf16.png" /> | <img width="256" alt="q8_0" src="../assets/z_image/q8_0.png" /> | <img width="256" alt="q6_K" src="../assets/z_image/q6_K.png" /> | <img width="256" alt="q5_0" src="../assets/z_image/q5_0.png" /> | <img width="256" alt="q4_K" src="../assets/z_image/q4_K.png" /> | <img width="256" alt="q4_0" src="../assets/z_image/q4_0.png" /> | <img width="256" alt="q3_K" src="../assets/z_image/q3_K.png" /> | <img width="256" alt="q2_K" src="../assets/z_image/q2_K.png" /> |

View File

@ -27,11 +27,11 @@ public:
blocks["conv5"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 4 * num_grow_ch, num_feat, {3, 3}, {1, 1}, {1, 1}));
}
ggml_tensor* lrelu(GGMLRunnerContext* ctx, ggml_tensor* x) {
return ggml_leaky_relu(ctx->ggml_ctx, x, 0.2f, true);
struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) {
return ggml_leaky_relu(ctx, x, 0.2f, true);
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [n, num_feat, h, w]
// return: [n, num_feat, h, w]
@ -42,16 +42,16 @@ public:
auto conv5 = std::dynamic_pointer_cast<Conv2d>(blocks["conv5"]);
auto x1 = lrelu(ctx, conv1->forward(ctx, x));
auto x_cat = ggml_concat(ctx->ggml_ctx, x, x1, 2);
auto x_cat = ggml_concat(ctx, x, x1, 2);
auto x2 = lrelu(ctx, conv2->forward(ctx, x_cat));
x_cat = ggml_concat(ctx->ggml_ctx, x_cat, x2, 2);
x_cat = ggml_concat(ctx, x_cat, x2, 2);
auto x3 = lrelu(ctx, conv3->forward(ctx, x_cat));
x_cat = ggml_concat(ctx->ggml_ctx, x_cat, x3, 2);
x_cat = ggml_concat(ctx, x_cat, x3, 2);
auto x4 = lrelu(ctx, conv4->forward(ctx, x_cat));
x_cat = ggml_concat(ctx->ggml_ctx, x_cat, x4, 2);
x_cat = ggml_concat(ctx, x_cat, x4, 2);
auto x5 = conv5->forward(ctx, x_cat);
x5 = ggml_add(ctx->ggml_ctx, ggml_ext_scale(ctx->ggml_ctx, x5, 0.2f), x);
x5 = ggml_add(ctx, ggml_scale(ctx, x5, 0.2f), x);
return x5;
}
};
@ -64,7 +64,7 @@ public:
blocks["rdb3"] = std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch));
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [n, num_feat, h, w]
// return: [n, num_feat, h, w]
@ -76,7 +76,7 @@ public:
out = rdb2->forward(ctx, out);
out = rdb3->forward(ctx, out);
out = ggml_add(ctx->ggml_ctx, ggml_ext_scale(ctx->ggml_ctx, out, 0.2f), x);
out = ggml_add(ctx, ggml_scale(ctx, out, 0.2f), x);
return out;
}
};
@ -112,11 +112,11 @@ public:
int get_scale() { return scale; }
int get_num_block() { return num_block; }
ggml_tensor* lrelu(GGMLRunnerContext* ctx, ggml_tensor* x) {
return ggml_leaky_relu(ctx->ggml_ctx, x, 0.2f, true);
struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) {
return ggml_leaky_relu(ctx, x, 0.2f, true);
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [n, num_in_ch, h, w]
// return: [n, num_out_ch, h*scale, w*scale]
auto conv_first = std::dynamic_pointer_cast<Conv2d>(blocks["conv_first"]);
@ -133,14 +133,14 @@ public:
body_feat = block->forward(ctx, body_feat);
}
body_feat = conv_body->forward(ctx, body_feat);
feat = ggml_add(ctx->ggml_ctx, feat, body_feat);
feat = ggml_add(ctx, feat, body_feat);
// upsample
if (scale >= 2) {
auto conv_up1 = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up1"]);
feat = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx->ggml_ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
feat = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
if (scale == 4) {
auto conv_up2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up2"]);
feat = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx->ggml_ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
feat = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
}
}
// for all scales
@ -156,13 +156,25 @@ struct ESRGAN : public GGMLRunner {
ESRGAN(ggml_backend_t backend,
bool offload_params_to_cpu,
int tile_size = 128,
const String2TensorStorage& tensor_storage_map = {})
const String2GGMLType& tensor_types = {})
: GGMLRunner(backend, offload_params_to_cpu) {
this->tile_size = tile_size;
// rrdb_net will be created in load_from_file
}
std::string get_desc() override {
void enable_conv2d_direct() {
if (!rrdb_net)
return;
std::vector<GGMLBlock*> blocks;
rrdb_net->get_all_blocks(blocks);
for (auto block : blocks) {
if (block->get_desc() == "Conv2d") {
auto conv_block = (Conv2d*)block;
conv_block->enable_direct();
}
}
}
std::string get_desc() {
return "esrgan";
}
@ -170,7 +182,7 @@ struct ESRGAN : public GGMLRunner {
LOG_INFO("loading esrgan from '%s'", file_path.c_str());
ModelLoader model_loader;
if (!model_loader.init_from_file_and_convert_name(file_path)) {
if (!model_loader.init_from_file(file_path)) {
LOG_ERROR("init esrgan model loader from file failed: '%s'", file_path.c_str());
return false;
}
@ -341,27 +353,25 @@ struct ESRGAN : public GGMLRunner {
return success;
}
ggml_cgraph* build_graph(ggml_tensor* x) {
struct ggml_cgraph* build_graph(struct ggml_tensor* x) {
if (!rrdb_net)
return nullptr;
constexpr int kGraphNodes = 1 << 16; // 65k
ggml_cgraph* gf = new_graph_custom(kGraphNodes);
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, kGraphNodes, /*grads*/ false);
x = to_backend(x);
auto runner_ctx = get_context();
ggml_tensor* out = rrdb_net->forward(&runner_ctx, x);
struct ggml_tensor* out = rrdb_net->forward(compute_ctx, x);
ggml_build_forward_expand(gf, out);
return gf;
}
bool compute(const int n_threads,
ggml_tensor* x,
void compute(const int n_threads,
struct ggml_tensor* x,
ggml_tensor** output,
ggml_context* output_ctx = nullptr) {
auto get_graph = [&]() -> ggml_cgraph* {
ggml_context* output_ctx = NULL) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(x);
};
return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
}
};
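The interleaved old/new lines above differ mainly in how the graph-building context reaches `forward()`: one side passes a raw `ggml_context*`, the other a `GGMLRunnerContext` wrapper that is unwrapped as `ctx->ggml_ctx` at each ggml call. A minimal sketch of that wrapper pattern (the struct layout here is an assumption inferred from those calls, not the project's exact definition):
```cpp
#include "ggml.h"

// Sketch of a runner-context wrapper as implied by the ctx->ggml_ctx calls.
struct GGMLRunnerContext {
    ggml_context* ggml_ctx;  // context used to allocate graph nodes
    // backend handles / feature flags could be carried here as well
};

// Helpers unwrap the context at each ggml call site:
static ggml_tensor* lrelu(GGMLRunnerContext* ctx, ggml_tensor* x) {
    return ggml_leaky_relu(ctx->ggml_ctx, x, 0.2f, /*inplace=*/true);
}
```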

View File

@ -1,4 +1,3 @@
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_subdirectory(cli)
add_subdirectory(server)
add_subdirectory(cli)

View File

@ -1,4 +1,4 @@
set(TARGET sd-cli)
set(TARGET sd)
add_executable(${TARGET} main.cpp)
install(TARGETS ${TARGET} RUNTIME)

View File

@ -1,149 +0,0 @@
# Run
```
usage: ./bin/sd-cli [options]
CLI Options:
-o, --output <string> path to write result image to. you can use printf-style %d format specifiers for image sequences (default:
./output.png) (eg. output_%03d.png)
--preview-path <string> path to write preview image to (default: ./preview.png)
--preview-interval <int> interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at
every step)
--output-begin-idx <int> starting index for output image sequence, must be non-negative (default 0 if specified %d in output path, 1 otherwise)
--canny apply canny preprocessor (edge detection)
--convert-name convert tensor name (for convert mode)
-v, --verbose print extra info
--color colors the logging tags according to level
--taesd-preview-only prevents usage of taesd for decoding the final image. (for use with --preview tae)
--preview-noisy enables previewing noisy inputs of the models rather than the denoised outputs
-M, --mode run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen
--preview preview method. must be one of the following [none, proj, tae, vae] (default is none)
-h, --help show this help message and exit
Context Options:
-m, --model <string> path to full model
--clip_l <string> path to the clip-l text encoder
--clip_g <string> path to the clip-g text encoder
--clip_vision <string> path to the clip-vision encoder
--t5xxl <string> path to the t5xxl text encoder
--llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)
--llm_vision <string> path to the llm vit
--qwen2vl <string> alias of --llm. Deprecated.
--qwen2vl_vision <string> alias of --llm_vision. Deprecated.
--diffusion-model <string> path to the standalone diffusion model
--high-noise-diffusion-model <string> path to the standalone high noise diffusion model
--vae <string> path to standalone vae model
--taesd <string> path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
--tae <string> alias of --taesd
--control-net <string> path to control net model
--embd-dir <string> embeddings directory
--lora-model-dir <string> lora model directory
--tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
--photo-maker <string> path to PHOTOMAKER model
--upscale-model <string> path to esrgan model.
-t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
CPU physical cores
--chroma-t5-mask-pad <int> t5 mask pad size of chroma
--vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
--vae-tiling process vae in tiles to reduce memory usage
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
--mmap whether to memory-map model
--control-net-cpu keep controlnet in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram)
--vae-on-cpu keep vae in cpu (for low vram)
--fa use flash attention
--diffusion-fa use flash attention in the diffusion model only
--diffusion-conv-direct use ggml_conv2d_direct in the diffusion model
--vae-conv-direct use ggml_conv2d_direct in the vae model
--circular enable circular padding for convolutions
--circularx enable circular RoPE wrapping on x-axis (width) only
--circulary enable circular RoPE wrapping on y-axis (height) only
--chroma-disable-dit-mask disable dit mask for chroma
--qwen-image-zero-cond-t enable zero_cond_t for qwen image
--chroma-enable-t5-mask enable t5 mask for chroma
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
type of the weight file
--rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
--sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
--prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]
--lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights
contain any quantized parameters, the at_runtime mode will be used; otherwise,
immediately will be used.The immediately mode may have precision and
compatibility issues with quantized parameters, but it usually offers faster inference
speed and, in some cases, lower memory usage. The at_runtime mode, on the
other hand, is exactly the opposite.
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
(overrides --vae-tile-size)
Generation Options:
-p, --prompt <string> the prompt to render
-n, --negative-prompt <string> the negative prompt (default: "")
-i, --init-img <string> path to the init image
--end-img <string> path to the end image, required by flf2v
--mask <string> path to the mask image
--control-image <string> path to control image, control net
--control-video <string> path to control video frames, It must be a directory path. The video frames inside should be stored as images in
lexicographical (character) order. For example, if the control video path is
`frames`, the directory contain images such as 00.png, 01.png, ... etc.
--pm-id-images-dir <string> path to PHOTOMAKER input id images dir
--pm-id-embed-path <string> path to PHOTOMAKER v2 id embed
-H, --height <int> image height, in pixel space (default: 512)
-W, --width <int> image width, in pixel space (default: 512)
--steps <int> number of sample steps (default: 20)
--high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto)
--clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified,
will be 1 for SD1.x, 2 for SD2.x
-b, --batch-count <int> batch count
--video-frames <int> video frames (default: 1)
--fps <int> fps (default: 24)
--timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for
NitroSD-Vibrant
--upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1)
--upscale-tile-size <int> tile size for ESRGAN upscaling (default: 128)
--cfg-scale <float> unconditional guidance scale: (default: 7.0)
--img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
--guidance <float> distilled guidance scale for models with guidance input (default: 3.5)
--slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5
medium
--skip-layer-start <float> SLG enabling point (default: 0.01)
--skip-layer-end <float> SLG disabling point (default: 0.2)
--eta <float> eta in DDIM, only for DDIM and TCD (default: 0)
--flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto)
--high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0)
--high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
--high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input (default: 3.5)
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
--high-noise-skip-layer-start <float> (high noise) SLG enabling point (default: 0.01)
--high-noise-skip-layer-end <float> (high noise) SLG disabling point (default: 0.2)
--high-noise-eta <float> (high noise) eta in DDIM, only for DDIM and TCD (default: 0)
--strength <float> strength for noising/unnoising (default: 0.75)
--pm-style-strength <float>
--control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1
--vace-strength <float> wan vace strength
--increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).
--disable-auto-resize-ref-image disable auto resize of ref images
-s, --seed RNG seed (default: 42, use random seed for < 0)
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
tcd, res_multistep, res_2s] (default: euler for Flux/SD3/Wan, euler_a
otherwise)
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
ddim_trailing, tcd, res_multistep, res_2s] default: euler for Flux/SD3/Wan,
euler_a otherwise
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple,
kl_optimal, lcm, bong_tangent], default: discrete
--sigmas custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0").
--skip-layers layers to skip for SLG steps (default: [7,8,9])
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
-r, --ref-image reference image for Flux Kontext models (can be used multiple times)
--cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level),
'spectrum' (UNET/DiT Chebyshev+Taylor forecasting)
--cache-option named cache params (key=value format, comma-separated). easycache/ucache:
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=;
spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. Examples:
"threshold=0.25" or "threshold=1.5,reset=0" or "w=0.4,window=2"
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
--scm-policy SCM policy: 'dynamic' (default) or 'static'
```
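A minimal image-generation invocation using a few of the options above (model path and prompt are placeholders):
```
./bin/sd-cli -m ./models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" -W 512 -H 512 --steps 20 -s 42 -o ./output.png
```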

View File

@ -1,10 +1,10 @@
#ifndef __AVI_WRITER_H__
#define __AVI_WRITER_H__
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "stable-diffusion.h"
@ -130,7 +130,7 @@ int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int
write_u32_le(f, 0); // Colors important
// 'movi' LIST (video frames)
// long movi_list_pos = ftell(f);
long movi_list_pos = ftell(f);
fwrite("LIST", 4, 1, f);
long movi_size_pos = ftell(f);
write_u32_le(f, 0); // Placeholder for movi size
@ -149,7 +149,7 @@ int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int
} jpeg_data;
for (int i = 0; i < num_images; i++) {
jpeg_data.buf = nullptr;
jpeg_data.buf = NULL;
jpeg_data.size = 0;
// Callback function to collect JPEG data into memory
@ -172,9 +172,9 @@ int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int
// Write '00dc' chunk (video frame)
fwrite("00dc", 4, 1, f);
write_u32_le(f, (uint32_t)jpeg_data.size);
write_u32_le(f, jpeg_data.size);
index[i].offset = ftell(f) - 8;
index[i].size = (uint32_t)jpeg_data.size;
index[i].size = jpeg_data.size;
fwrite(jpeg_data.buf, 1, jpeg_data.size, f);
// Align to even byte size

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1,73 +0,0 @@
set(TARGET sd-server)
option(SD_SERVER_BUILD_FRONTEND "Build server frontend with pnpm" ON)
set(FRONTEND_DIR "${CMAKE_CURRENT_SOURCE_DIR}/frontend")
set(GENERATED_HTML_HEADER "${FRONTEND_DIR}/dist/gen_index_html.h")
set(HAVE_FRONTEND_BUILD OFF)
if(SD_SERVER_BUILD_FRONTEND AND EXISTS "${FRONTEND_DIR}")
if(WIN32)
find_program(PNPM_EXECUTABLE NAMES pnpm.cmd pnpm)
else()
find_program(PNPM_EXECUTABLE NAMES pnpm)
endif()
if(PNPM_EXECUTABLE)
message(STATUS "Frontend dir found: ${FRONTEND_DIR}")
message(STATUS "pnpm found: ${PNPM_EXECUTABLE}")
set(HAVE_FRONTEND_BUILD ON)
add_custom_target(${TARGET}_frontend_install
COMMAND "${PNPM_EXECUTABLE}" -C "${FRONTEND_DIR}" install
WORKING_DIRECTORY "${FRONTEND_DIR}"
COMMENT "Installing frontend dependencies"
VERBATIM
)
add_custom_target(${TARGET}_frontend_build
COMMAND "${PNPM_EXECUTABLE}" -C "${FRONTEND_DIR}" run build
WORKING_DIRECTORY "${FRONTEND_DIR}"
COMMENT "Building frontend"
VERBATIM
)
add_custom_target(${TARGET}_frontend_header
COMMAND "${PNPM_EXECUTABLE}" -C "${FRONTEND_DIR}" run build:header
WORKING_DIRECTORY "${FRONTEND_DIR}"
COMMENT "Generating gen_index_html.h"
VERBATIM
)
add_dependencies(${TARGET}_frontend_build ${TARGET}_frontend_install)
add_dependencies(${TARGET}_frontend_header ${TARGET}_frontend_build)
add_custom_target(${TARGET}_frontend
DEPENDS ${TARGET}_frontend_header
)
set_source_files_properties("${GENERATED_HTML_HEADER}" PROPERTIES GENERATED TRUE)
else()
message(WARNING "pnpm not found, frontend build disabled")
endif()
else()
message(STATUS "Frontend disabled or directory not found: ${FRONTEND_DIR}")
endif()
add_executable(${TARGET} main.cpp)
if(HAVE_FRONTEND_BUILD)
add_dependencies(${TARGET} ${TARGET}_frontend)
target_sources(${TARGET} PRIVATE "${GENERATED_HTML_HEADER}")
target_include_directories(${TARGET} PRIVATE "${FRONTEND_DIR}/dist")
target_compile_definitions(${TARGET} PRIVATE HAVE_INDEX_HTML)
message(STATUS "HAVE_INDEX_HTML enabled")
else()
message(STATUS "HAVE_INDEX_HTML disabled")
endif()
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE stable-diffusion ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PUBLIC c_std_11 cxx_std_17)
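Given the `SD_SERVER_BUILD_FRONTEND` option above, the frontend targets can also be skipped explicitly at configure time, for example when pnpm is not installed:
```
cmake -B build -DSD_SERVER_BUILD_FRONTEND=OFF
```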

View File

@ -1,227 +0,0 @@
# Frontend
## Build with Frontend
The server can optionally build the web frontend and embed it into the binary as `gen_index_html.h`.
### Requirements
Install the following tools:
* **Node.js** ≥ 22.18
https://nodejs.org/
* **pnpm** ≥ 10
Install via npm:
```bash
npm install -g pnpm
```
Verify installation:
```bash
node -v
pnpm -v
```
### Install frontend dependencies
Go to the frontend directory and install dependencies:
```bash
cd examples/server/frontend
pnpm install
```
### Build the server with CMake
Enable the frontend build option when configuring CMake:
```bash
cmake -B build -DSD_SERVER_BUILD_FRONTEND=ON
cmake --build build --config Release
```
If `pnpm` is available, the build system will automatically run:
```
pnpm run build
pnpm run build:header
```
and embed the generated frontend into the server binary.
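The generated `gen_index_html.h` is essentially the built `index.html` baked into the binary. A hedged sketch of the shape such a header typically takes (illustrative only; the real generator's output may differ):
```cpp
// gen_index_html.h (illustrative shape, not the actual generated file):
// the built page embedded as a string the server can return for GET /
// when HAVE_INDEX_HTML is defined.
static const char index_html[] = "<!doctype html><html>...</html>";
static const unsigned int index_html_len = sizeof(index_html) - 1;
```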
## Frontend Repository
The web frontend is maintained in a **separate repository**, https://github.com/leejet/stable-ui.
If you want to modify the UI or frontend logic, please submit pull requests to the **frontend repository**.
This repository (`stable-diffusion.cpp`) only vendors the frontend periodically. Changes from the frontend repo are synchronized:
* approximately **every 12 weeks**, or
* when there are **major frontend updates**
Because of this, frontend changes will **not appear here immediately** after being merged upstream.
## Using an external frontend
By default, the server uses the **embedded frontend** generated during the build (`gen_index_html.h`).
You can also serve a custom frontend file instead of the embedded one by using:
```bash
--serve-html-path <path-to-index.html>
```
For example:
```bash
sd-server --serve-html-path ./index.html
```
In this case, the server will load and serve the specified `index.html` file instead of the embedded frontend. This is useful when:
* developing or testing frontend changes
* using a custom UI
* avoiding rebuilding the binary after frontend modifications
# Run
```
usage: ./bin/sd-server [options]
Svr Options:
-l, --listen-ip <string> server listen ip (default: 127.0.0.1)
--serve-html-path <string> path to HTML file to serve at root (optional)
--listen-port <int> server listen port (default: 1234)
-v, --verbose print extra info
--color colors the logging tags according to level
-h, --help show this help message and exit
Context Options:
-m, --model <string> path to full model
--clip_l <string> path to the clip-l text encoder
--clip_g <string> path to the clip-g text encoder
--clip_vision <string> path to the clip-vision encoder
--t5xxl <string> path to the t5xxl text encoder
--llm <string> path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)
--llm_vision <string> path to the llm vit
--qwen2vl <string> alias of --llm. Deprecated.
--qwen2vl_vision <string> alias of --llm_vision. Deprecated.
--diffusion-model <string> path to the standalone diffusion model
--high-noise-diffusion-model <string> path to the standalone high noise diffusion model
--vae <string> path to standalone vae model
--taesd <string> path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
--tae <string> alias of --taesd
--control-net <string> path to control net model
--embd-dir <string> embeddings directory
--lora-model-dir <string> lora model directory
--tensor-type-rules <string> weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
--photo-maker <string> path to PHOTOMAKER model
--upscale-model <string> path to esrgan model.
-t, --threads <int> number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
CPU physical cores
--chroma-t5-mask-pad <int> t5 mask pad size of chroma
--vae-tile-overlap <float> tile overlap for vae tiling, in fraction of tile size (default: 0.5)
--vae-tiling process vae in tiles to reduce memory usage
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
--mmap whether to memory-map model
--control-net-cpu keep controlnet in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram)
--vae-on-cpu keep vae in cpu (for low vram)
--fa use flash attention
--diffusion-fa use flash attention in the diffusion model only
--diffusion-conv-direct use ggml_conv2d_direct in the diffusion model
--vae-conv-direct use ggml_conv2d_direct in the vae model
--circular enable circular padding for convolutions
--circularx enable circular RoPE wrapping on x-axis (width) only
--circulary enable circular RoPE wrapping on y-axis (height) only
--chroma-disable-dit-mask disable dit mask for chroma
--qwen-image-zero-cond-t enable zero_cond_t for qwen image
--chroma-enable-t5-mask enable t5 mask for chroma
--type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
type of the weight file
--rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
--sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
--prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]
--lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights
contain any quantized parameters, the at_runtime mode will be used; otherwise,
immediately will be used.The immediately mode may have precision and
compatibility issues with quantized parameters, but it usually offers faster inference
speed and, in some cases, lower memory usage. The at_runtime mode, on the
other hand, is exactly the opposite.
--vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32)
--vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
(overrides --vae-tile-size)
Default Generation Options:
-p, --prompt <string> the prompt to render
-n, --negative-prompt <string> the negative prompt (default: "")
-i, --init-img <string> path to the init image
--end-img <string> path to the end image, required by flf2v
--mask <string> path to the mask image
--control-image <string> path to control image, control net
--control-video <string> path to control video frames, It must be a directory path. The video frames inside should be stored as images in
lexicographical (character) order. For example, if the control video path is
`frames`, the directory contain images such as 00.png, 01.png, ... etc.
--pm-id-images-dir <string> path to PHOTOMAKER input id images dir
--pm-id-embed-path <string> path to PHOTOMAKER v2 id embed
-H, --height <int> image height, in pixel space (default: 512)
-W, --width <int> image width, in pixel space (default: 512)
--steps <int> number of sample steps (default: 20)
--high-noise-steps <int> (high noise) number of sample steps (default: -1 = auto)
--clip-skip <int> ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified,
will be 1 for SD1.x, 2 for SD2.x
-b, --batch-count <int> batch count
--video-frames <int> video frames (default: 1)
--fps <int> fps (default: 24)
--timestep-shift <int> shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for
NitroSD-Vibrant
--upscale-repeats <int> Run the ESRGAN upscaler this many times (default: 1)
--upscale-tile-size <int> tile size for ESRGAN upscaling (default: 128)
--cfg-scale <float> unconditional guidance scale: (default: 7.0)
--img-cfg-scale <float> image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
--guidance <float> distilled guidance scale for models with guidance input (default: 3.5)
--slg-scale <float> skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5
medium
--skip-layer-start <float> SLG enabling point (default: 0.01)
--skip-layer-end <float> SLG disabling point (default: 0.2)
--eta <float> eta in DDIM, only for DDIM and TCD (default: 0)
--flow-shift <float> shift value for Flow models like SD3.x or WAN (default: auto)
--high-noise-cfg-scale <float> (high noise) unconditional guidance scale: (default: 7.0)
--high-noise-img-cfg-scale <float> (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
--high-noise-guidance <float> (high noise) distilled guidance scale for models with guidance input (default: 3.5)
--high-noise-slg-scale <float> (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
--high-noise-skip-layer-start <float> (high noise) SLG enabling point (default: 0.01)
--high-noise-skip-layer-end <float> (high noise) SLG disabling point (default: 0.2)
--high-noise-eta <float> (high noise) eta in DDIM, only for DDIM and TCD (default: 0)
--strength <float> strength for noising/unnoising (default: 0.75)
--pm-style-strength <float>
--control-strength <float> strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image
--moe-boundary <float> timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1
--vace-strength <float> wan vace strength
--increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).
--disable-auto-resize-ref-image disable auto resize of ref images
-s, --seed RNG seed (default: 42, use random seed for < 0)
--sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
tcd, res_multistep, res_2s] (default: euler for Flux/SD3/Wan, euler_a
otherwise)
--high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
ddim_trailing, tcd, res_multistep, res_2s] default: euler for Flux/SD3/Wan,
euler_a otherwise
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple,
kl_optimal, lcm, bong_tangent], default: discrete
--sigmas custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0").
--skip-layers layers to skip for SLG steps (default: [7,8,9])
--high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9])
-r, --ref-image reference image for Flux Kontext models (can be used multiple times)
--cache-mode caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting)
--cache-option named cache params (key=value format, comma-separated). easycache/ucache:
threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples:
"threshold=0.25" or "threshold=1.5,reset=0"
--scm-mask SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
--scm-policy SCM policy: 'dynamic' (default) or 'static'
```
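A minimal way to start the server with the options above (model path is a placeholder):
```
./bin/sd-server -m ./models/v1-5-pruned-emaonly.safetensors --listen-port 1234
```
The UI is then reachable at http://127.0.0.1:1234, the default listen ip and port.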

@ -1 +0,0 @@
Subproject commit 1a34176cd6d39ad3a226b2b69047e71f6797f6bc

File diff suppressed because it is too large

View File

@ -1,88 +1,88 @@
import os
import sys
import numpy as np
import torch
from diffusers.utils import load_image
# pip install insightface==0.7.3
from insightface.app import FaceAnalysis
from insightface.data import get_image as ins_get_image
from safetensors.torch import save_file
###
# https://github.com/cubiq/ComfyUI_IPAdapter_plus/issues/165#issue-2055829543
###
class FaceAnalysis2(FaceAnalysis):
# NOTE: allows setting det_size for each detection call.
# the model allows it but the wrapping code from insightface
# doesn't show it, and people end up loading duplicate models
# for different sizes where there is absolutely no need to
def get(self, img, max_num=0, det_size=(640, 640)):
if det_size is not None:
self.det_model.input_size = det_size
return super().get(img, max_num)
def analyze_faces(face_analysis: FaceAnalysis, img_data: np.ndarray, det_size=(640, 640)):
# NOTE: try detect faces, if no faces detected, lower det_size until it does
detection_sizes = [None] + [(size, size) for size in range(640, 256, -64)] + [(256, 256)]
for size in detection_sizes:
faces = face_analysis.get(img_data, det_size=size)
if len(faces) > 0:
return faces
return []
if __name__ == "__main__":
#face_detector = FaceAnalysis2(providers=['CUDAExecutionProvider'], allowed_modules=['detection', 'recognition'])
face_detector = FaceAnalysis2(providers=['CPUExecutionProvider'], allowed_modules=['detection', 'recognition'])
face_detector.prepare(ctx_id=0, det_size=(640, 640))
#input_folder_name = './scarletthead_woman'
input_folder_name = sys.argv[1]
image_basename_list = os.listdir(input_folder_name)
image_path_list = sorted([os.path.join(input_folder_name, basename) for basename in image_basename_list])
input_id_images = []
for image_path in image_path_list:
input_id_images.append(load_image(image_path))
id_embed_list = []
for img in input_id_images:
img = np.array(img)
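# flip channels RGB -> BGR, the layout insightface's detector expects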
img = img[:, :, ::-1]
faces = analyze_faces(face_detector, img)
if len(faces) > 0:
id_embed_list.append(torch.from_numpy((faces[0]['embedding'])))
if len(id_embed_list) == 0:
raise ValueError("No face detected in input image pool")
id_embeds = torch.stack(id_embed_list)
# for r in id_embeds:
# print(r)
# #torch.save(id_embeds, input_folder_name+'/id_embeds.pt');
# weights = dict()
# weights["id_embeds"] = id_embeds
# save_file(weights, input_folder_name+'/id_embeds.safetensors')
binary_data = id_embeds.numpy().tobytes()
two = 4
zero = 0
one = 1
tensor_name = "id_embeds"
# Write binary data to a file
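# The header is seven little-endian int32 fields followed by the name and the raw
# f32 payload; judging by the values written, they are presumably:
# n_dims(=4), name_len, dtype tag(=0), ne[0]=shape[1], ne[1]=shape[0], ne[2]=1, ne[3]=1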
with open(input_folder_name+'/id_embeds.bin', "wb") as f:
f.write(two.to_bytes(4, byteorder='little'))
f.write((len(tensor_name)).to_bytes(4, byteorder='little'))
f.write(zero.to_bytes(4, byteorder='little'))
f.write((id_embeds.shape[1]).to_bytes(4, byteorder='little'))
f.write((id_embeds.shape[0]).to_bytes(4, byteorder='little'))
f.write(one.to_bytes(4, byteorder='little'))
f.write(one.to_bytes(4, byteorder='little'))
f.write(tensor_name.encode('ascii'))
f.write(binary_data)
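As a sanity check, here is a minimal C++ reader sketch that mirrors the writes above field for field; the semantic names for the header fields are inferred from the values written, not stated by the script, and a little-endian host is assumed (matching `byteorder='little'`):

```
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

int main(int argc, char** argv) {
    if (argc < 2) return 1;
    FILE* f = std::fopen(argv[1], "rb");
    if (!f) return 1;
    int32_t header[7];  // presumably: n_dims(=4), name_len, dtype(=0, f32), ne0, ne1, ne2(=1), ne3(=1)
    if (std::fread(header, sizeof(int32_t), 7, f) != 7) return 1;
    std::string name(header[1], '\0');          // tensor name, e.g. "id_embeds"
    std::fread(&name[0], 1, name.size(), f);
    std::vector<float> data(static_cast<size_t>(header[3]) * header[4]);
    std::fread(data.data(), sizeof(float), data.size(), f);
    std::printf("%s: %d embeddings of size %d\n", name.c_str(), header[4], header[3]);
    std::fclose(f);
    return 0;
}
```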

1110
flux.hpp Normal file

File diff suppressed because it is too large

View File

@@ -1,8 +1,5 @@
for f in src/*.cpp src/*.h src/*.hpp src/vocab/*.h src/vocab/*.cpp examples/cli/*.cpp examples/common/*.hpp examples/cli/*.h examples/server/*.cpp; do
for f in *.cpp *.h *.hpp examples/cli/*.cpp examples/cli/*.h; do
[[ "$f" == vocab* ]] && continue
echo "formatting '$f'"
# if [ "$f" != "stable-diffusion.h" ]; then
# clang-tidy -fix -p build_linux/ "$f"
# fi
clang-format -style=file -i "$f"
done

2
ggml

@@ -1 +1 @@
Subproject commit a8db410a252c8c8f2d120c6f2e7133ebe032f35d
Subproject commit 7bffd79a4bec72e9a3bfbedb582a218b84401c13

File diff suppressed because it is too large

View File

@@ -151,7 +151,7 @@ private:
}
if (n_dims > GGML_MAX_DIMS) {
for (uint32_t i = GGML_MAX_DIMS; i < n_dims; i++) {
for (int i = GGML_MAX_DIMS; i < n_dims; i++) {
info.shape[GGML_MAX_DIMS - 1] *= info.shape[i]; // stack to last dim;
}
info.shape.resize(GGML_MAX_DIMS);

897
lora.hpp Normal file
View File

@@ -0,0 +1,897 @@
#ifndef __LORA_HPP__
#define __LORA_HPP__
#include <mutex>
#include "ggml_extend.hpp"
#define LORA_GRAPH_BASE_SIZE 10240
struct LoraModel : public GGMLRunner {
enum lora_t {
REGULAR = 0,
DIFFUSERS = 1,
DIFFUSERS_2 = 2,
DIFFUSERS_3 = 3,
TRANSFORMERS = 4,
LORA_TYPE_COUNT
};
const std::string lora_ups[LORA_TYPE_COUNT] = {
".lora_up",
"_lora.up",
".lora_B",
".lora.up",
".lora_linear_layer.up",
};
const std::string lora_downs[LORA_TYPE_COUNT] = {
".lora_down",
"_lora.down",
".lora_A",
".lora.down",
".lora_linear_layer.down",
};
const std::string lora_pre[LORA_TYPE_COUNT] = {
"lora.",
"",
"",
"",
"",
};
const std::map<std::string, std::string> alt_names = {
// mmdit
{"final_layer.adaLN_modulation.1", "norm_out.linear"},
{"pos_embed", "pos_embed.proj"},
{"final_layer.linear", "proj_out"},
{"y_embedder.mlp.0", "time_text_embed.text_embedder.linear_1"},
{"y_embedder.mlp.2", "time_text_embed.text_embedder.linear_2"},
{"t_embedder.mlp.0", "time_text_embed.timestep_embedder.linear_1"},
{"t_embedder.mlp.2", "time_text_embed.timestep_embedder.linear_2"},
{"x_block.mlp.fc1", "ff.net.0.proj"},
{"x_block.mlp.fc2", "ff.net.2"},
{"context_block.mlp.fc1", "ff_context.net.0.proj"},
{"context_block.mlp.fc2", "ff_context.net.2"},
{"x_block.adaLN_modulation.1", "norm1.linear"},
{"context_block.adaLN_modulation.1", "norm1_context.linear"},
{"context_block.attn.proj", "attn.to_add_out"},
{"x_block.attn.proj", "attn.to_out.0"},
{"x_block.attn2.proj", "attn2.to_out.0"},
// flux
{"img_in", "x_embedder"},
// singlestream
{"linear2", "proj_out"},
{"modulation.lin", "norm.linear"},
// doublestream
{"txt_attn.proj", "attn.to_add_out"},
{"img_attn.proj", "attn.to_out.0"},
{"txt_mlp.0", "ff_context.net.0.proj"},
{"txt_mlp.2", "ff_context.net.2"},
{"img_mlp.0", "ff.net.0.proj"},
{"img_mlp.2", "ff.net.2"},
{"txt_mod.lin", "norm1_context.linear"},
{"img_mod.lin", "norm1.linear"},
};
const std::map<std::string, std::string> qkv_prefixes = {
// mmdit
{"context_block.attn.qkv", "attn.add_"}, // suffix "_proj"
{"x_block.attn.qkv", "attn.to_"},
{"x_block.attn2.qkv", "attn2.to_"},
// flux
// doublestream
{"txt_attn.qkv", "attn.add_"}, // suffix "_proj"
{"img_attn.qkv", "attn.to_"},
};
const std::map<std::string, std::string> qkvm_prefixes = {
// flux
// singlestream
{"linear1", ""},
};
const std::string* type_fingerprints = lora_ups;
float multiplier = 1.0f;
std::map<std::string, struct ggml_tensor*> lora_tensors;
std::map<ggml_tensor*, ggml_tensor*> original_tensor_to_final_tensor;
std::string file_path;
ModelLoader model_loader;
bool load_failed = false;
bool applied = false;
std::vector<int> zero_index_vec = {0};
ggml_tensor* zero_index = NULL;
enum lora_t type = REGULAR;
LoraModel(ggml_backend_t backend,
const std::string& file_path = "",
const std::string prefix = "")
: file_path(file_path), GGMLRunner(backend, false) {
if (!model_loader.init_from_file(file_path, prefix)) {
load_failed = true;
}
}
std::string get_desc() {
return "lora";
}
bool load_from_file(bool filter_tensor, int n_threads) {
LOG_INFO("loading LoRA from '%s'", file_path.c_str());
if (load_failed) {
LOG_ERROR("init lora model loader from file failed: '%s'", file_path.c_str());
return false;
}
std::unordered_map<std::string, TensorStorage> tensors_to_create;
std::mutex lora_mutex;
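// Two-pass load: the dry-run pass only records tensor metadata so the parameter
// buffer can be sized and allocated; the second pass then copies the actual data in.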
bool dry_run = true;
auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
if (dry_run) {
const std::string& name = tensor_storage.name;
if (filter_tensor && !contains(name, "lora")) {
return true;
}
{
std::lock_guard<std::mutex> lock(lora_mutex);
for (int i = 0; i < LORA_TYPE_COUNT; i++) {
if (name.find(type_fingerprints[i]) != std::string::npos) {
type = (lora_t)i;
break;
}
}
tensors_to_create[name] = tensor_storage;
}
} else {
const std::string& name = tensor_storage.name;
auto iter = lora_tensors.find(name);
if (iter != lora_tensors.end()) {
*dst_tensor = iter->second;
}
}
return true;
};
model_loader.load_tensors(on_new_tensor_cb, n_threads);
for (const auto& pair : tensors_to_create) {
const auto& name = pair.first;
const auto& ts = pair.second;
struct ggml_tensor* real = ggml_new_tensor(params_ctx,
ts.type,
ts.n_dims,
ts.ne);
lora_tensors[name] = real;
}
alloc_params_buffer();
dry_run = false;
model_loader.load_tensors(on_new_tensor_cb, n_threads);
LOG_DEBUG("lora type: \"%s\"/\"%s\"", lora_downs[type].c_str(), lora_ups[type].c_str());
LOG_DEBUG("finished loaded lora");
return true;
}
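// Cast/dequantize workaround: ggml_get_rows always yields an F32 result, so
// flattening the tensor to a single row and fetching row 0 converts it to F32
// without a dedicated cast op (cf. the commented-out ggml_cast below).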
ggml_tensor* to_f32(ggml_context* ctx, ggml_tensor* a) {
auto out = ggml_reshape_1d(ctx, a, ggml_nelements(a));
out = ggml_get_rows(ctx, out, zero_index);
out = ggml_reshape(ctx, out, a);
// auto out = ggml_cast(ctx, a, GGML_TYPE_F32);
return out;
}
std::vector<std::string> to_lora_keys(std::string blk_name, SDVersion version) {
std::vector<std::string> keys;
// if (!sd_version_is_sd3(version) || blk_name != "model.diffusion_model.pos_embed") {
size_t k_pos = blk_name.find(".weight");
if (k_pos == std::string::npos) {
return keys;
}
blk_name = blk_name.substr(0, k_pos);
// }
keys.push_back(blk_name);
keys.push_back("lora." + blk_name);
if (sd_version_is_dit(version)) {
if (blk_name.find("model.diffusion_model") != std::string::npos) {
blk_name.replace(blk_name.find("model.diffusion_model"), sizeof("model.diffusion_model") - 1, "transformer");
}
if (blk_name.find(".single_blocks") != std::string::npos) {
blk_name.replace(blk_name.find(".single_blocks"), sizeof(".single_blocks") - 1, ".single_transformer_blocks");
}
if (blk_name.find(".double_blocks") != std::string::npos) {
blk_name.replace(blk_name.find(".double_blocks"), sizeof(".double_blocks") - 1, ".transformer_blocks");
}
if (blk_name.find(".joint_blocks") != std::string::npos) {
blk_name.replace(blk_name.find(".joint_blocks"), sizeof(".joint_blocks") - 1, ".transformer_blocks");
}
if (blk_name.find("text_encoders.clip_l") != std::string::npos) {
blk_name.replace(blk_name.find("text_encoders.clip_l"), sizeof("text_encoders.clip_l") - 1, "cond_stage_model");
}
for (const auto& item : alt_names) {
size_t match = blk_name.find(item.first);
if (match != std::string::npos) {
blk_name = blk_name.substr(0, match) + item.second;
}
}
for (const auto& prefix : qkv_prefixes) {
size_t match = blk_name.find(prefix.first);
if (match != std::string::npos) {
std::string split_blk = "SPLIT|" + blk_name.substr(0, match) + prefix.second;
keys.push_back(split_blk);
}
}
for (const auto& prefix : qkvm_prefixes) {
size_t match = blk_name.find(prefix.first);
if (match != std::string::npos) {
std::string split_blk = "SPLIT_L|" + blk_name.substr(0, match) + prefix.second;
keys.push_back(split_blk);
}
}
keys.push_back(blk_name);
}
std::vector<std::string> ret;
for (std::string& key : keys) {
ret.push_back(key);
replace_all_chars(key, '.', '_');
// fix for some sdxl lora, like lcm-lora-xl
if (key == "model_diffusion_model_output_blocks_2_2_conv") {
ret.push_back("model_diffusion_model_output_blocks_2_1_conv");
}
ret.push_back(key);
}
return ret;
}
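// Example (hypothetical): a model tensor named
// "model.diffusion_model.double_blocks.0.img_attn.proj.weight" produces candidate keys
// including the raw name, its "lora."-prefixed form, the diffusers-style
// "transformer.transformer_blocks.0.attn.to_out.0", plus underscore-joined variants of
// each, which are then matched against the tensor names found in the LoRA file.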
struct ggml_cgraph* build_lora_graph(std::map<std::string, struct ggml_tensor*> model_tensors, SDVersion version) {
size_t lora_graph_size = LORA_GRAPH_BASE_SIZE + lora_tensors.size() * 10;
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, lora_graph_size, false);
zero_index = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, 1);
set_backend_tensor_data(zero_index, zero_index_vec.data());
ggml_build_forward_expand(gf, zero_index);
original_tensor_to_final_tensor.clear();
std::set<std::string> applied_lora_tensors;
for (auto it : model_tensors) {
std::string model_tensor_name = it.first;
struct ggml_tensor* model_tensor = model_tensors[it.first];
std::vector<std::string> keys = to_lora_keys(model_tensor_name, version);
bool is_bias = ends_with(model_tensor_name, ".bias");
if (keys.size() == 0) {
if (is_bias) {
keys.push_back(model_tensor_name.substr(0, model_tensor_name.size() - 5)); // remove .bias
} else {
continue;
}
}
for (auto& key : keys) {
bool is_qkv_split = starts_with(key, "SPLIT|");
if (is_qkv_split) {
key = key.substr(sizeof("SPLIT|") - 1);
}
bool is_qkvm_split = starts_with(key, "SPLIT_L|");
if (is_qkvm_split) {
key = key.substr(sizeof("SPLIT_L|") - 1);
}
struct ggml_tensor* updown = NULL;
float scale_value = 1.0f;
std::string full_key = lora_pre[type] + key;
if (is_bias) {
if (lora_tensors.find(full_key + ".diff_b") != lora_tensors.end()) {
std::string diff_name = full_key + ".diff_b";
ggml_tensor* diff = lora_tensors[diff_name];
updown = to_f32(compute_ctx, diff);
applied_lora_tensors.insert(diff_name);
} else {
continue;
}
} else if (lora_tensors.find(full_key + ".diff") != lora_tensors.end()) {
std::string diff_name = full_key + ".diff";
ggml_tensor* diff = lora_tensors[diff_name];
updown = to_f32(compute_ctx, diff);
applied_lora_tensors.insert(diff_name);
} else if (lora_tensors.find(full_key + ".hada_w1_a") != lora_tensors.end()) {
// LoHa mode
// TODO: split qkv convention for LoHas (is it ever used?)
if (is_qkv_split || is_qkvm_split) {
LOG_ERROR("Split qkv isn't supported for LoHa models.");
break;
}
std::string alpha_name = "";
ggml_tensor* hada_1_mid = NULL; // tau for tucker decomposition
ggml_tensor* hada_1_up = NULL;
ggml_tensor* hada_1_down = NULL;
ggml_tensor* hada_2_mid = NULL; // tau for tucker decomposition
ggml_tensor* hada_2_up = NULL;
ggml_tensor* hada_2_down = NULL;
std::string hada_1_mid_name = "";
std::string hada_1_down_name = "";
std::string hada_1_up_name = "";
std::string hada_2_mid_name = "";
std::string hada_2_down_name = "";
std::string hada_2_up_name = "";
hada_1_down_name = full_key + ".hada_w1_b";
hada_1_up_name = full_key + ".hada_w1_a";
hada_1_mid_name = full_key + ".hada_t1";
if (lora_tensors.find(hada_1_down_name) != lora_tensors.end()) {
hada_1_down = to_f32(compute_ctx, lora_tensors[hada_1_down_name]);
}
if (lora_tensors.find(hada_1_up_name) != lora_tensors.end()) {
hada_1_up = to_f32(compute_ctx, lora_tensors[hada_1_up_name]);
}
if (lora_tensors.find(hada_1_mid_name) != lora_tensors.end()) {
hada_1_mid = to_f32(compute_ctx, lora_tensors[hada_1_mid_name]);
applied_lora_tensors.insert(hada_1_mid_name);
hada_1_up = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, hada_1_up));
}
hada_2_down_name = full_key + ".hada_w2_b";
hada_2_up_name = full_key + ".hada_w2_a";
hada_2_mid_name = full_key + ".hada_t2";
if (lora_tensors.find(hada_2_down_name) != lora_tensors.end()) {
hada_2_down = to_f32(compute_ctx, lora_tensors[hada_2_down_name]);
}
if (lora_tensors.find(hada_2_up_name) != lora_tensors.end()) {
hada_2_up = to_f32(compute_ctx, lora_tensors[hada_2_up_name]);
}
if (lora_tensors.find(hada_2_mid_name) != lora_tensors.end()) {
hada_2_mid = to_f32(compute_ctx, lora_tensors[hada_2_mid_name]);
applied_lora_tensors.insert(hada_2_mid_name);
hada_2_up = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, hada_2_up));
}
alpha_name = full_key + ".alpha";
applied_lora_tensors.insert(hada_1_down_name);
applied_lora_tensors.insert(hada_1_up_name);
applied_lora_tensors.insert(hada_2_down_name);
applied_lora_tensors.insert(hada_2_up_name);
applied_lora_tensors.insert(alpha_name);
if (hada_1_up == NULL || hada_1_down == NULL || hada_2_up == NULL || hada_2_down == NULL) {
continue;
}
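// LoHa: the weight delta is the Hadamard (element-wise) product of two low-rank
// products, ΔW = (up1 · down1) ⊙ (up2 · down2), optionally with Tucker cores (hada_t1/t2).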
struct ggml_tensor* updown_1 = ggml_merge_lora(compute_ctx, hada_1_down, hada_1_up, hada_1_mid);
struct ggml_tensor* updown_2 = ggml_merge_lora(compute_ctx, hada_2_down, hada_2_up, hada_2_mid);
updown = ggml_mul_inplace(compute_ctx, updown_1, updown_2);
// calc_scale
// TODO: .dora_scale?
int64_t rank = hada_1_down->ne[ggml_n_dims(hada_1_down) - 1];
if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
scale_value = alpha / rank;
}
} else if (lora_tensors.find(full_key + ".lokr_w1") != lora_tensors.end() || lora_tensors.find(full_key + ".lokr_w1_a") != lora_tensors.end()) {
// LoKr mode
// TODO: split qkv convention for LoKrs (is it ever used?)
if (is_qkv_split || is_qkvm_split) {
LOG_ERROR("Split qkv isn't supported for LoKr models.");
break;
}
std::string alpha_name = full_key + ".alpha";
ggml_tensor* lokr_w1 = NULL;
ggml_tensor* lokr_w2 = NULL;
std::string lokr_w1_name = "";
std::string lokr_w2_name = "";
lokr_w1_name = full_key + ".lokr_w1";
lokr_w2_name = full_key + ".lokr_w2";
if (lora_tensors.find(lokr_w1_name) != lora_tensors.end()) {
lokr_w1 = to_f32(compute_ctx, lora_tensors[lokr_w1_name]);
applied_lora_tensors.insert(lokr_w1_name);
} else {
ggml_tensor* down = NULL;
ggml_tensor* up = NULL;
std::string down_name = lokr_w1_name + "_b";
std::string up_name = lokr_w1_name + "_a";
if (lora_tensors.find(down_name) != lora_tensors.end()) {
// w1 should not be low rank normally, sometimes w1 and w2 are swapped
down = to_f32(compute_ctx, lora_tensors[down_name]);
applied_lora_tensors.insert(down_name);
int64_t rank = down->ne[ggml_n_dims(down) - 1];
if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
scale_value = alpha / rank;
}
}
if (lora_tensors.find(up_name) != lora_tensors.end()) {
up = to_f32(compute_ctx, lora_tensors[up_name]);
applied_lora_tensors.insert(up_name);
}
lokr_w1 = ggml_merge_lora(compute_ctx, down, up);
}
if (lora_tensors.find(lokr_w2_name) != lora_tensors.end()) {
lokr_w2 = to_f32(compute_ctx, lora_tensors[lokr_w2_name]);
applied_lora_tensors.insert(lokr_w2_name);
} else {
ggml_tensor* down = NULL;
ggml_tensor* up = NULL;
std::string down_name = lokr_w2_name + "_b";
std::string up_name = lokr_w2_name + "_a";
if (lora_tensors.find(down_name) != lora_tensors.end()) {
down = to_f32(compute_ctx, lora_tensors[down_name]);
applied_lora_tensors.insert(down_name);
int64_t rank = down->ne[ggml_n_dims(down) - 1];
if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
scale_value = alpha / rank;
}
}
if (lora_tensors.find(up_name) != lora_tensors.end()) {
up = to_f32(compute_ctx, lora_tensors[up_name]);
applied_lora_tensors.insert(up_name);
}
lokr_w2 = ggml_merge_lora(compute_ctx, down, up);
}
// Technically it might be unused, but I believe it's the expected behavior
applied_lora_tensors.insert(alpha_name);
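// LoKr: the weight delta is the Kronecker product ΔW = w1 ⊗ w2, where either factor
// may itself be stored low-rank (the *_a/*_b pairs merged above).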
updown = ggml_kronecker(compute_ctx, lokr_w1, lokr_w2);
} else {
// LoRA mode
ggml_tensor* lora_mid = NULL; // tau for tucker decomposition
ggml_tensor* lora_up = NULL;
ggml_tensor* lora_down = NULL;
std::string alpha_name = "";
std::string scale_name = "";
std::string split_q_scale_name = "";
std::string lora_mid_name = "";
std::string lora_down_name = "";
std::string lora_up_name = "";
if (is_qkv_split) {
std::string suffix = "";
auto split_q_d_name = full_key + "q" + suffix + lora_downs[type] + ".weight";
if (lora_tensors.find(split_q_d_name) == lora_tensors.end()) {
suffix = "_proj";
split_q_d_name = full_key + "q" + suffix + lora_downs[type] + ".weight";
}
if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) {
// print_ggml_tensor(it.second, true); //[3072, 21504, 1, 1]
// find qkv and mlp up parts in LoRA model
auto split_k_d_name = full_key + "k" + suffix + lora_downs[type] + ".weight";
auto split_v_d_name = full_key + "v" + suffix + lora_downs[type] + ".weight";
auto split_q_u_name = full_key + "q" + suffix + lora_ups[type] + ".weight";
auto split_k_u_name = full_key + "k" + suffix + lora_ups[type] + ".weight";
auto split_v_u_name = full_key + "v" + suffix + lora_ups[type] + ".weight";
auto split_q_scale_name = full_key + "q" + suffix + ".scale";
auto split_k_scale_name = full_key + "k" + suffix + ".scale";
auto split_v_scale_name = full_key + "v" + suffix + ".scale";
auto split_q_alpha_name = full_key + "q" + suffix + ".alpha";
auto split_k_alpha_name = full_key + "k" + suffix + ".alpha";
auto split_v_alpha_name = full_key + "v" + suffix + ".alpha";
ggml_tensor* lora_q_down = NULL;
ggml_tensor* lora_q_up = NULL;
ggml_tensor* lora_k_down = NULL;
ggml_tensor* lora_k_up = NULL;
ggml_tensor* lora_v_down = NULL;
ggml_tensor* lora_v_up = NULL;
lora_q_down = to_f32(compute_ctx, lora_tensors[split_q_d_name]);
if (lora_tensors.find(split_q_u_name) != lora_tensors.end()) {
lora_q_up = to_f32(compute_ctx, lora_tensors[split_q_u_name]);
}
if (lora_tensors.find(split_k_d_name) != lora_tensors.end()) {
lora_k_down = to_f32(compute_ctx, lora_tensors[split_k_d_name]);
}
if (lora_tensors.find(split_k_u_name) != lora_tensors.end()) {
lora_k_up = to_f32(compute_ctx, lora_tensors[split_k_u_name]);
}
if (lora_tensors.find(split_v_d_name) != lora_tensors.end()) {
lora_v_down = to_f32(compute_ctx, lora_tensors[split_v_d_name]);
}
if (lora_tensors.find(split_v_u_name) != lora_tensors.end()) {
lora_v_up = to_f32(compute_ctx, lora_tensors[split_v_u_name]);
}
float q_rank = lora_q_up->ne[0];
float k_rank = lora_k_up->ne[0];
float v_rank = lora_v_up->ne[0];
float lora_q_scale = 1;
float lora_k_scale = 1;
float lora_v_scale = 1;
if (lora_tensors.find(split_q_scale_name) != lora_tensors.end()) {
lora_q_scale = ggml_backend_tensor_get_f32(lora_tensors[split_q_scale_name]);
applied_lora_tensors.insert(split_q_scale_name);
}
if (lora_tensors.find(split_k_scale_name) != lora_tensors.end()) {
lora_k_scale = ggml_backend_tensor_get_f32(lora_tensors[split_k_scale_name]);
applied_lora_tensors.insert(split_k_scale_name);
}
if (lora_tensors.find(split_v_scale_name) != lora_tensors.end()) {
lora_v_scale = ggml_backend_tensor_get_f32(lora_tensors[split_v_scale_name]);
applied_lora_tensors.insert(split_v_scale_name);
}
if (lora_tensors.find(split_q_alpha_name) != lora_tensors.end()) {
float lora_q_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_q_alpha_name]);
applied_lora_tensors.insert(split_q_alpha_name);
lora_q_scale = lora_q_alpha / q_rank;
}
if (lora_tensors.find(split_k_alpha_name) != lora_tensors.end()) {
float lora_k_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_k_alpha_name]);
applied_lora_tensors.insert(split_k_alpha_name);
lora_k_scale = lora_k_alpha / k_rank;
}
if (lora_tensors.find(split_v_alpha_name) != lora_tensors.end()) {
float lora_v_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_v_alpha_name]);
applied_lora_tensors.insert(split_v_alpha_name);
lora_v_scale = lora_v_alpha / v_rank;
}
// keep the returned nodes: ggml scale ops create new graph nodes, and a
// discarded node never becomes part of the computed graph
lora_q_down = ggml_scale_inplace(compute_ctx, lora_q_down, lora_q_scale);
lora_k_down = ggml_scale_inplace(compute_ctx, lora_k_down, lora_k_scale);
lora_v_down = ggml_scale_inplace(compute_ctx, lora_v_down, lora_v_scale);
// print_ggml_tensor(lora_q_down, true); //[3072, R, 1, 1]
// print_ggml_tensor(lora_k_down, true); //[3072, R, 1, 1]
// print_ggml_tensor(lora_v_down, true); //[3072, R, 1, 1]
// print_ggml_tensor(lora_q_up, true); //[R, 3072, 1, 1]
// print_ggml_tensor(lora_k_up, true); //[R, 3072, 1, 1]
// print_ggml_tensor(lora_v_up, true); //[R, 3072, 1, 1]
// these need to be stitched together this way:
// |q_up,0 ,0 |
// |0 ,k_up,0 |
// |0 ,0 ,v_up|
// (q_down,k_down,v_down) . (q ,k ,v)
// up_concat will be [9216, R*3, 1, 1]
// down_concat will be [R*3, 3072, 1, 1]
ggml_tensor* lora_down_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, lora_q_down, lora_k_down, 1), lora_v_down, 1);
ggml_tensor* z = ggml_dup_tensor(compute_ctx, lora_q_up);
z = ggml_scale(compute_ctx, z, 0);  // zero block; assign the result so the op is part of the graph
ggml_tensor* zz = ggml_concat(compute_ctx, z, z, 1);
ggml_tensor* q_up = ggml_concat(compute_ctx, lora_q_up, zz, 1);
ggml_tensor* k_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, z, lora_k_up, 1), z, 1);
ggml_tensor* v_up = ggml_concat(compute_ctx, zz, lora_v_up, 1);
// print_ggml_tensor(q_up, true); //[R, 9216, 1, 1]
// print_ggml_tensor(k_up, true); //[R, 9216, 1, 1]
// print_ggml_tensor(v_up, true); //[R, 9216, 1, 1]
ggml_tensor* lora_up_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, q_up, k_up, 0), v_up, 0);
// print_ggml_tensor(lora_up_concat, true); //[R*3, 9216, 1, 1]
lora_down = ggml_cont(compute_ctx, lora_down_concat);
lora_up = ggml_cont(compute_ctx, lora_up_concat);
applied_lora_tensors.insert(split_q_u_name);
applied_lora_tensors.insert(split_k_u_name);
applied_lora_tensors.insert(split_v_u_name);
applied_lora_tensors.insert(split_q_d_name);
applied_lora_tensors.insert(split_k_d_name);
applied_lora_tensors.insert(split_v_d_name);
}
} else if (is_qkvm_split) {
auto split_q_d_name = full_key + "attn.to_q" + lora_downs[type] + ".weight";
if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) {
// print_ggml_tensor(it.second, true); //[3072, 21504, 1, 1]
// find qkv and mlp up parts in LoRA model
auto split_k_d_name = full_key + "attn.to_k" + lora_downs[type] + ".weight";
auto split_v_d_name = full_key + "attn.to_v" + lora_downs[type] + ".weight";
auto split_q_u_name = full_key + "attn.to_q" + lora_ups[type] + ".weight";
auto split_k_u_name = full_key + "attn.to_k" + lora_ups[type] + ".weight";
auto split_v_u_name = full_key + "attn.to_v" + lora_ups[type] + ".weight";
auto split_m_d_name = full_key + "proj_mlp" + lora_downs[type] + ".weight";
auto split_m_u_name = full_key + "proj_mlp" + lora_ups[type] + ".weight";
auto split_q_scale_name = full_key + "attn.to_q" + ".scale";
auto split_k_scale_name = full_key + "attn.to_k" + ".scale";
auto split_v_scale_name = full_key + "attn.to_v" + ".scale";
auto split_m_scale_name = full_key + "proj_mlp" + ".scale";
auto split_q_alpha_name = full_key + "attn.to_q" + ".alpha";
auto split_k_alpha_name = full_key + "attn.to_k" + ".alpha";
auto split_v_alpha_name = full_key + "attn.to_v" + ".alpha";
auto split_m_alpha_name = full_key + "proj_mlp" + ".alpha";
ggml_tensor* lora_q_down = NULL;
ggml_tensor* lora_q_up = NULL;
ggml_tensor* lora_k_down = NULL;
ggml_tensor* lora_k_up = NULL;
ggml_tensor* lora_v_down = NULL;
ggml_tensor* lora_v_up = NULL;
ggml_tensor* lora_m_down = NULL;
ggml_tensor* lora_m_up = NULL;
if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) {
lora_q_down = to_f32(compute_ctx, lora_tensors[split_q_d_name]);
}
if (lora_tensors.find(split_q_u_name) != lora_tensors.end()) {
lora_q_up = to_f32(compute_ctx, lora_tensors[split_q_u_name]);
}
if (lora_tensors.find(split_k_d_name) != lora_tensors.end()) {
lora_k_down = to_f32(compute_ctx, lora_tensors[split_k_d_name]);
}
if (lora_tensors.find(split_k_u_name) != lora_tensors.end()) {
lora_k_up = to_f32(compute_ctx, lora_tensors[split_k_u_name]);
}
if (lora_tensors.find(split_v_d_name) != lora_tensors.end()) {
lora_v_down = to_f32(compute_ctx, lora_tensors[split_v_d_name]);
}
if (lora_tensors.find(split_v_u_name) != lora_tensors.end()) {
lora_v_up = to_f32(compute_ctx, lora_tensors[split_v_u_name]);
}
if (lora_tensors.find(split_m_d_name) != lora_tensors.end()) {
lora_m_down = to_f32(compute_ctx, lora_tensors[split_m_d_name]);
}
if (lora_tensors.find(split_m_u_name) != lora_tensors.end()) {
lora_m_up = to_f32(compute_ctx, lora_tensors[split_m_u_name]);
}
float q_rank = lora_q_up->ne[0];
float k_rank = lora_k_up->ne[0];
float v_rank = lora_v_up->ne[0];
float m_rank = lora_m_up->ne[0];
float lora_q_scale = 1;
float lora_k_scale = 1;
float lora_v_scale = 1;
float lora_m_scale = 1;
if (lora_tensors.find(split_q_scale_name) != lora_tensors.end()) {
lora_q_scale = ggml_backend_tensor_get_f32(lora_tensors[split_q_scale_name]);
applied_lora_tensors.insert(split_q_scale_name);
}
if (lora_tensors.find(split_k_scale_name) != lora_tensors.end()) {
lora_k_scale = ggml_backend_tensor_get_f32(lora_tensors[split_k_scale_name]);
applied_lora_tensors.insert(split_k_scale_name);
}
if (lora_tensors.find(split_v_scale_name) != lora_tensors.end()) {
lora_v_scale = ggml_backend_tensor_get_f32(lora_tensors[split_v_scale_name]);
applied_lora_tensors.insert(split_v_scale_name);
}
if (lora_tensors.find(split_m_scale_name) != lora_tensors.end()) {
lora_m_scale = ggml_backend_tensor_get_f32(lora_tensors[split_m_scale_name]);
applied_lora_tensors.insert(split_m_scale_name);
}
if (lora_tensors.find(split_q_alpha_name) != lora_tensors.end()) {
float lora_q_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_q_alpha_name]);
applied_lora_tensors.insert(split_q_alpha_name);
lora_q_scale = lora_q_alpha / q_rank;
}
if (lora_tensors.find(split_k_alpha_name) != lora_tensors.end()) {
float lora_k_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_k_alpha_name]);
applied_lora_tensors.insert(split_k_alpha_name);
lora_k_scale = lora_k_alpha / k_rank;
}
if (lora_tensors.find(split_v_alpha_name) != lora_tensors.end()) {
float lora_v_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_v_alpha_name]);
applied_lora_tensors.insert(split_v_alpha_name);
lora_v_scale = lora_v_alpha / v_rank;
}
if (lora_tensors.find(split_m_alpha_name) != lora_tensors.end()) {
float lora_m_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_m_alpha_name]);
applied_lora_tensors.insert(split_m_alpha_name);
lora_m_scale = lora_m_alpha / m_rank;
}
// keep the returned nodes so the scale ops stay in the graph
lora_q_down = ggml_scale_inplace(compute_ctx, lora_q_down, lora_q_scale);
lora_k_down = ggml_scale_inplace(compute_ctx, lora_k_down, lora_k_scale);
lora_v_down = ggml_scale_inplace(compute_ctx, lora_v_down, lora_v_scale);
lora_m_down = ggml_scale_inplace(compute_ctx, lora_m_down, lora_m_scale);
// print_ggml_tensor(lora_q_down, true); //[3072, R, 1, 1]
// print_ggml_tensor(lora_k_down, true); //[3072, R, 1, 1]
// print_ggml_tensor(lora_v_down, true); //[3072, R, 1, 1]
// print_ggml_tensor(lora_m_down, true); //[3072, R, 1, 1]
// print_ggml_tensor(lora_q_up, true); //[R, 3072, 1, 1]
// print_ggml_tensor(lora_k_up, true); //[R, 3072, 1, 1]
// print_ggml_tensor(lora_v_up, true); //[R, 3072, 1, 1]
// print_ggml_tensor(lora_m_up, true); //[R, 12288, 1, 1]
// these need to be stitched together this way:
// |q_up,0 ,0 ,0 |
// |0 ,k_up,0 ,0 |
// |0 ,0 ,v_up,0 |
// |0 ,0 ,0 ,m_up|
// (q_down,k_down,v_down,m_down) . (q ,k ,v ,m)
// up_concat will be [21504, R*4, 1, 1]
// down_concat will be [R*4, 3072, 1, 1]
ggml_tensor* lora_down_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, lora_q_down, lora_k_down, 1), ggml_concat(compute_ctx, lora_v_down, lora_m_down, 1), 1);
// print_ggml_tensor(lora_down_concat, true); //[3072, R*4, 1, 1]
// this also means that if rank is bigger than 672, it is less memory efficient to do it this way (should be fine)
// print_ggml_tensor(lora_q_up, true); //[3072, R, 1, 1]
ggml_tensor* z = ggml_dup_tensor(compute_ctx, lora_q_up);
ggml_tensor* mlp_z = ggml_dup_tensor(compute_ctx, lora_m_up);
z = ggml_scale(compute_ctx, z, 0);          // zero blocks; assign the results
mlp_z = ggml_scale(compute_ctx, mlp_z, 0);  // so the ops are part of the graph
ggml_tensor* zz = ggml_concat(compute_ctx, z, z, 1);
ggml_tensor* q_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, lora_q_up, zz, 1), mlp_z, 1);
ggml_tensor* k_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, z, lora_k_up, 1), ggml_concat(compute_ctx, z, mlp_z, 1), 1);
ggml_tensor* v_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, zz, lora_v_up, 1), mlp_z, 1);
ggml_tensor* m_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, zz, z, 1), lora_m_up, 1);
// print_ggml_tensor(q_up, true); //[R, 21504, 1, 1]
// print_ggml_tensor(k_up, true); //[R, 21504, 1, 1]
// print_ggml_tensor(v_up, true); //[R, 21504, 1, 1]
// print_ggml_tensor(m_up, true); //[R, 21504, 1, 1]
ggml_tensor* lora_up_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, q_up, k_up, 0), ggml_concat(compute_ctx, v_up, m_up, 0), 0);
// print_ggml_tensor(lora_up_concat, true); //[R*4, 21504, 1, 1]
lora_down = ggml_cont(compute_ctx, lora_down_concat);
lora_up = ggml_cont(compute_ctx, lora_up_concat);
applied_lora_tensors.insert(split_q_u_name);
applied_lora_tensors.insert(split_k_u_name);
applied_lora_tensors.insert(split_v_u_name);
applied_lora_tensors.insert(split_m_u_name);
applied_lora_tensors.insert(split_q_d_name);
applied_lora_tensors.insert(split_k_d_name);
applied_lora_tensors.insert(split_v_d_name);
applied_lora_tensors.insert(split_m_d_name);
}
} else {
lora_up_name = full_key + lora_ups[type] + ".weight";
lora_down_name = full_key + lora_downs[type] + ".weight";
lora_mid_name = full_key + ".lora_mid.weight";
alpha_name = full_key + ".alpha";
scale_name = full_key + ".scale";
if (lora_tensors.find(lora_up_name) != lora_tensors.end()) {
lora_up = to_f32(compute_ctx, lora_tensors[lora_up_name]);
applied_lora_tensors.insert(lora_up_name);
}
if (lora_tensors.find(lora_down_name) != lora_tensors.end()) {
lora_down = to_f32(compute_ctx, lora_tensors[lora_down_name]);
applied_lora_tensors.insert(lora_down_name);
}
if (lora_tensors.find(lora_mid_name) != lora_tensors.end()) {
lora_mid = to_f32(compute_ctx, lora_tensors[lora_mid_name]);
applied_lora_tensors.insert(lora_mid_name);
}
}
if (lora_up == NULL || lora_down == NULL) {
continue;
}
// calc_scale
// TODO: .dora_scale?
int64_t rank = lora_down->ne[ggml_n_dims(lora_down) - 1];
if (lora_tensors.find(scale_name) != lora_tensors.end()) {
scale_value = ggml_backend_tensor_get_f32(lora_tensors[scale_name]);
applied_lora_tensors.insert(scale_name);
} else if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
scale_value = alpha / rank;
// LOG_DEBUG("rank %s %ld %.2f %.2f", alpha_name.c_str(), rank, alpha, scale_value);
applied_lora_tensors.insert(alpha_name);
}
updown = ggml_merge_lora(compute_ctx, lora_down, lora_up, lora_mid);
}
scale_value *= multiplier;
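// Net update per tensor: W' = W + multiplier * scale * ΔW, where scale comes from
// the .scale tensor if present, otherwise alpha / rank, otherwise 1.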
ggml_tensor* original_tensor = model_tensor;
if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer)) {
model_tensor = ggml_dup_tensor(compute_ctx, model_tensor);
set_backend_tensor_data(model_tensor, original_tensor->data);
}
updown = ggml_reshape(compute_ctx, updown, model_tensor);
GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(model_tensor));
updown = ggml_scale_inplace(compute_ctx, updown, scale_value);
ggml_tensor* final_tensor;
if (model_tensor->type != GGML_TYPE_F32 && model_tensor->type != GGML_TYPE_F16) {
final_tensor = to_f32(compute_ctx, model_tensor);
final_tensor = ggml_add_inplace(compute_ctx, final_tensor, updown);
final_tensor = ggml_cpy(compute_ctx, final_tensor, model_tensor);
} else {
final_tensor = ggml_add_inplace(compute_ctx, model_tensor, updown);
}
ggml_build_forward_expand(gf, final_tensor);
if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer)) {
original_tensor_to_final_tensor[original_tensor] = final_tensor;
}
break;
}
}
size_t total_lora_tensors_count = 0;
size_t applied_lora_tensors_count = 0;
for (auto& kv : lora_tensors) {
total_lora_tensors_count++;
if (applied_lora_tensors.find(kv.first) == applied_lora_tensors.end()) {
LOG_WARN("unused lora tensor |%s|", kv.first.c_str());
print_ggml_tensor(kv.second, true);
// exit(0);
} else {
applied_lora_tensors_count++;
}
}
/* Don't worry if this message shows up twice in the logs per LoRA,
* this function is called once to calculate the required buffer size
* and then again to actually generate a graph to be used */
if (applied_lora_tensors_count != total_lora_tensors_count) {
LOG_WARN("Only (%lu / %lu) LoRA tensors will be applied",
applied_lora_tensors_count, total_lora_tensors_count);
} else {
LOG_DEBUG("(%lu / %lu) LoRA tensors will be applied",
applied_lora_tensors_count, total_lora_tensors_count);
}
return gf;
}
void apply(std::map<std::string, struct ggml_tensor*> model_tensors, SDVersion version, int n_threads) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_lora_graph(model_tensors, version);
};
GGMLRunner::compute(get_graph, n_threads, false);
for (auto item : original_tensor_to_final_tensor) {
ggml_tensor* original_tensor = item.first;
ggml_tensor* final_tensor = item.second;
ggml_backend_tensor_copy(final_tensor, original_tensor);
}
original_tensor_to_final_tensor.clear();
GGMLRunner::free_compute_buffer();
}
};
#endif // __LORA_HPP__
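For orientation, a minimal usage sketch of the class above, assuming a valid ggml backend, a populated `model_tensors` map, and a known `version` from the surrounding loader code (none of which are shown here):

```
ggml_backend_t backend = ggml_backend_cpu_init();           // any ggml backend works
LoraModel lora(backend, "/path/to/lora.safetensors");
if (lora.load_from_file(/*filter_tensor=*/true, /*n_threads=*/4)) {
    lora.multiplier = 0.8f;                                 // LoRA strength
    lora.apply(model_tensors, version, /*n_threads=*/4);    // patches weights in place
}
```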

View File

@@ -1,7 +1,8 @@
#ifndef __LTXV_HPP__
#define __LTXV_HPP__
#include "common_block.hpp"
#include "common.hpp"
#include "ggml_extend.hpp"
namespace LTXV {
@@ -12,10 +13,10 @@ namespace LTXV {
public:
CausalConv3d(int64_t in_channels,
int64_t out_channels,
int kernel_size = 3,
std::tuple<int, int, int> stride = {1, 1, 1},
int dilation = 1,
bool bias = true) {
int kernel_size = 3,
std::tuple<int> stride = {1, 1, 1},
int dilation = 1,
bool bias = true) {
time_kernel_size = kernel_size / 2;
blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv3d(in_channels,
out_channels,
@@ -26,9 +27,9 @@
bias));
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
bool causal = true) {
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x,
bool causal = true) {
// x: [N*IC, ID, IH, IW]
// result: [N*OC, OD, OH, OW]
auto conv = std::dynamic_pointer_cast<Conv3d>(blocks["conv"]);

View File

@@ -1,8 +1,6 @@
#ifndef __MMDIT_HPP__
#define __MMDIT_HPP__
#include <memory>
#include "ggml_extend.hpp"
#include "model.h"
@@ -27,13 +25,13 @@ public:
blocks["fc2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_features, out_features, bias));
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, n_token, in_features]
auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);
x = fc1->forward(ctx, x);
x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
x = ggml_gelu_inplace(ctx, x);
x = fc2->forward(ctx, x);
return x;
}
@@ -72,7 +70,7 @@ public:
bias));
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, C, H, W]
// return: [N, H*W, embed_dim]
auto proj = std::dynamic_pointer_cast<Conv2d>(blocks["proj"]);
@@ -82,13 +80,13 @@ public:
int64_t H = x->ne[1];
int pad_h = (patch_size - H % patch_size) % patch_size;
int pad_w = (patch_size - W % patch_size) % patch_size;
x = ggml_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0); // TODO: reflect pad mode
x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // TODO: reflect pad mode
}
x = proj->forward(ctx, x);
if (flatten) {
x = ggml_reshape_3d(ctx->ggml_ctx, x, x->ne[0] * x->ne[1], x->ne[2], x->ne[3]);
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3));
x = ggml_reshape_3d(ctx, x, x->ne[0] * x->ne[1], x->ne[2], x->ne[3]);
x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3));
}
return x;
}
@@ -97,30 +95,26 @@ public:
struct TimestepEmbedder : public GGMLBlock {
// Embeds scalar timesteps into vector representations.
protected:
int frequency_embedding_size;
int64_t frequency_embedding_size;
public:
TimestepEmbedder(int64_t hidden_size,
int frequency_embedding_size = 256,
int64_t out_channels = 0)
int64_t frequency_embedding_size = 256)
: frequency_embedding_size(frequency_embedding_size) {
if (out_channels <= 0) {
out_channels = hidden_size;
}
blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(frequency_embedding_size, hidden_size, true, true));
blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, out_channels, true, true));
blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size, true, true));
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* t) {
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* t) {
// t: [N, ]
// return: [N, hidden_size]
auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]);
auto mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["mlp.2"]);
auto t_freq = ggml_ext_timestep_embedding(ctx->ggml_ctx, t, frequency_embedding_size); // [N, frequency_embedding_size]
auto t_freq = ggml_nn_timestep_embedding(ctx, t, frequency_embedding_size); // [N, frequency_embedding_size]
auto t_emb = mlp_0->forward(ctx, t_freq);
t_emb = ggml_silu_inplace(ctx->ggml_ctx, t_emb);
t_emb = ggml_silu_inplace(ctx, t_emb);
t_emb = mlp_2->forward(ctx, t_emb);
return t_emb;
}
@@ -135,14 +129,14 @@ public:
blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size, true, true));
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, input_dim]
// return: [N, hidden_size]
auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]);
auto mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["mlp.2"]);
x = mlp_0->forward(ctx, x);
x = ggml_silu_inplace(ctx->ggml_ctx, x);
x = ggml_silu_inplace(ctx, x);
x = mlp_2->forward(ctx, x);
return x;
}
@@ -153,37 +147,39 @@ public:
int64_t num_heads;
bool pre_only;
std::string qk_norm;
bool flash_attn;
public:
SelfAttention(int64_t dim,
int64_t num_heads = 8,
std::string qk_norm = "",
bool qkv_bias = false,
bool pre_only = false)
: num_heads(num_heads), pre_only(pre_only), qk_norm(qk_norm) {
bool pre_only = false,
bool flash_attn = false)
: num_heads(num_heads), pre_only(pre_only), qk_norm(qk_norm), flash_attn(flash_attn) {
int64_t d_head = dim / num_heads;
blocks["qkv"] = std::shared_ptr<GGMLBlock>(new Linear(dim, dim * 3, qkv_bias));
if (!pre_only) {
blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim, dim));
}
if (qk_norm == "rms") {
blocks["ln_q"] = std::shared_ptr<GGMLBlock>(new RMSNorm(d_head, 1.0e-6f));
blocks["ln_k"] = std::shared_ptr<GGMLBlock>(new RMSNorm(d_head, 1.0e-6f));
blocks["ln_q"] = std::shared_ptr<GGMLBlock>(new RMSNorm(d_head, 1.0e-6));
blocks["ln_k"] = std::shared_ptr<GGMLBlock>(new RMSNorm(d_head, 1.0e-6));
} else if (qk_norm == "ln") {
blocks["ln_q"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_head, 1.0e-6f));
blocks["ln_k"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_head, 1.0e-6f));
blocks["ln_q"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_head, 1.0e-6));
blocks["ln_k"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_head, 1.0e-6));
}
}
std::vector<ggml_tensor*> pre_attention(GGMLRunnerContext* ctx, ggml_tensor* x) {
std::vector<struct ggml_tensor*> pre_attention(struct ggml_context* ctx, struct ggml_tensor* x) {
auto qkv_proj = std::dynamic_pointer_cast<Linear>(blocks["qkv"]);
auto qkv = qkv_proj->forward(ctx, x);
auto qkv_vec = split_qkv(ctx->ggml_ctx, qkv);
auto qkv_vec = split_qkv(ctx, qkv);
int64_t head_dim = qkv_vec[0]->ne[0] / num_heads;
auto q = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[0], head_dim, num_heads, qkv_vec[0]->ne[1], qkv_vec[0]->ne[2]); // [N, n_token, n_head, d_head]
auto k = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[1], head_dim, num_heads, qkv_vec[1]->ne[1], qkv_vec[1]->ne[2]); // [N, n_token, n_head, d_head]
auto v = qkv_vec[2]; // [N, n_token, n_head*d_head]
auto q = ggml_reshape_4d(ctx, qkv_vec[0], head_dim, num_heads, qkv_vec[0]->ne[1], qkv_vec[0]->ne[2]); // [N, n_token, n_head, d_head]
auto k = ggml_reshape_4d(ctx, qkv_vec[1], head_dim, num_heads, qkv_vec[1]->ne[1], qkv_vec[1]->ne[2]); // [N, n_token, n_head, d_head]
auto v = qkv_vec[2]; // [N, n_token, n_head*d_head]
if (qk_norm == "rms" || qk_norm == "ln") {
auto ln_q = std::dynamic_pointer_cast<UnaryBlock>(blocks["ln_q"]);
@@ -192,13 +188,13 @@ public:
k = ln_k->forward(ctx, k);
}
q = ggml_reshape_3d(ctx->ggml_ctx, q, q->ne[0] * q->ne[1], q->ne[2], q->ne[3]); // [N, n_token, n_head*d_head]
k = ggml_reshape_3d(ctx->ggml_ctx, k, k->ne[0] * k->ne[1], k->ne[2], k->ne[3]); // [N, n_token, n_head*d_head]
q = ggml_reshape_3d(ctx, q, q->ne[0] * q->ne[1], q->ne[2], q->ne[3]); // [N, n_token, n_head*d_head]
k = ggml_reshape_3d(ctx, k, k->ne[0] * k->ne[1], k->ne[2], k->ne[3]); // [N, n_token, n_head*d_head]
return {q, k, v};
}
ggml_tensor* post_attention(GGMLRunnerContext* ctx, ggml_tensor* x) {
struct ggml_tensor* post_attention(struct ggml_context* ctx, struct ggml_tensor* x) {
GGML_ASSERT(!pre_only);
auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
@@ -208,19 +204,20 @@ public:
}
// x: [N, n_token, dim]
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x) {
struct ggml_tensor* forward(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* x) {
auto qkv = pre_attention(ctx, x);
x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, dim]
x = post_attention(ctx, x); // [N, n_token, dim]
x = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, NULL, false, false, true); // [N, n_token, dim]
x = post_attention(ctx, x); // [N, n_token, dim]
return x;
}
};
__STATIC_INLINE__ ggml_tensor* modulate(ggml_context* ctx,
ggml_tensor* x,
ggml_tensor* shift,
ggml_tensor* scale) {
__STATIC_INLINE__ struct ggml_tensor* modulate(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* shift,
struct ggml_tensor* scale) {
// x: [N, L, C]
// scale: [N, C]
// shift: [N, C]
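// i.e. adaLN modulation: x * (1 + scale) + shift, with shift/scale broadcast over the token axis L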
@@ -237,6 +234,7 @@ public:
int64_t num_heads;
bool pre_only;
bool self_attn;
bool flash_attn;
public:
DismantledBlock(int64_t hidden_size,
@@ -245,16 +243,17 @@ public:
std::string qk_norm = "",
bool qkv_bias = false,
bool pre_only = false,
bool self_attn = false)
bool self_attn = false,
bool flash_attn = false)
: num_heads(num_heads), pre_only(pre_only), self_attn(self_attn) {
// rmsnorm is always False
// scale_mod_only is always False
// swiglu is always False
blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-06f, false));
blocks["attn"] = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qk_norm, qkv_bias, pre_only));
blocks["attn"] = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qk_norm, qkv_bias, pre_only, flash_attn));
if (self_attn) {
blocks["attn2"] = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qk_norm, qkv_bias, false));
blocks["attn2"] = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qk_norm, qkv_bias, false, flash_attn));
}
if (!pre_only) {
@@ -273,9 +272,9 @@ public:
blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, n_mods * hidden_size));
}
std::tuple<std::vector<ggml_tensor*>, std::vector<ggml_tensor*>, std::vector<ggml_tensor*>> pre_attention_x(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* c) {
std::tuple<std::vector<struct ggml_tensor*>, std::vector<struct ggml_tensor*>, std::vector<struct ggml_tensor*>> pre_attention_x(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* c) {
GGML_ASSERT(self_attn);
// x: [N, n_token, hidden_size]
// c: [N, hidden_size]
@@ -284,77 +283,83 @@ public:
auto attn2 = std::dynamic_pointer_cast<SelfAttention>(blocks["attn2"]);
auto adaLN_modulation_1 = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]);
int n_mods = 9;
auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx->ggml_ctx, c)); // [N, n_mods * hidden_size]
auto m_vec = ggml_ext_chunk(ctx->ggml_ctx, m, n_mods, 0);
int64_t n_mods = 9;
auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx, c)); // [N, n_mods * hidden_size]
m = ggml_reshape_3d(ctx, m, c->ne[0], n_mods, c->ne[1]); // [N, n_mods, hidden_size]
m = ggml_cont(ctx, ggml_permute(ctx, m, 0, 2, 1, 3)); // [n_mods, N, hidden_size]
auto shift_msa = m_vec[0]; // [N, hidden_size]
auto scale_msa = m_vec[1]; // [N, hidden_size]
auto gate_msa = m_vec[2]; // [N, hidden_size]
auto shift_mlp = m_vec[3]; // [N, hidden_size]
auto scale_mlp = m_vec[4]; // [N, hidden_size]
auto gate_mlp = m_vec[5]; // [N, hidden_size]
auto shift_msa2 = m_vec[6]; // [N, hidden_size]
auto scale_msa2 = m_vec[7]; // [N, hidden_size]
auto gate_msa2 = m_vec[8]; // [N, hidden_size]
int64_t offset = m->nb[1] * m->ne[1];
auto shift_msa = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0); // [N, hidden_size]
auto scale_msa = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1); // [N, hidden_size]
auto gate_msa = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 2); // [N, hidden_size]
auto shift_mlp = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 3); // [N, hidden_size]
auto scale_mlp = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 4); // [N, hidden_size]
auto gate_mlp = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 5); // [N, hidden_size]
auto shift_msa2 = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 6); // [N, hidden_size]
auto scale_msa2 = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 7); // [N, hidden_size]
auto gate_msa2 = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 8); // [N, hidden_size]
auto x_norm = norm1->forward(ctx, x);
auto attn_in = modulate(ctx->ggml_ctx, x_norm, shift_msa, scale_msa);
auto attn_in = modulate(ctx, x_norm, shift_msa, scale_msa);
auto qkv = attn->pre_attention(ctx, attn_in);
auto attn2_in = modulate(ctx->ggml_ctx, x_norm, shift_msa2, scale_msa2);
auto attn2_in = modulate(ctx, x_norm, shift_msa2, scale_msa2);
auto qkv2 = attn2->pre_attention(ctx, attn2_in);
return {qkv, qkv2, {x, gate_msa, shift_mlp, scale_mlp, gate_mlp, gate_msa2}};
}
std::pair<std::vector<ggml_tensor*>, std::vector<ggml_tensor*>> pre_attention(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* c) {
std::pair<std::vector<struct ggml_tensor*>, std::vector<struct ggml_tensor*>> pre_attention(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* c) {
// x: [N, n_token, hidden_size]
// c: [N, hidden_size]
auto norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm1"]);
auto attn = std::dynamic_pointer_cast<SelfAttention>(blocks["attn"]);
auto adaLN_modulation_1 = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]);
int n_mods = 6;
int64_t n_mods = 6;
if (pre_only) {
n_mods = 2;
}
auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx->ggml_ctx, c)); // [N, n_mods * hidden_size]
auto m_vec = ggml_ext_chunk(ctx->ggml_ctx, m, n_mods, 0);
auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx, c)); // [N, n_mods * hidden_size]
m = ggml_reshape_3d(ctx, m, c->ne[0], n_mods, c->ne[1]); // [N, n_mods, hidden_size]
m = ggml_cont(ctx, ggml_permute(ctx, m, 0, 2, 1, 3)); // [n_mods, N, hidden_size]
auto shift_msa = m_vec[0]; // [N, hidden_size]
auto scale_msa = m_vec[1]; // [N, hidden_size]
int64_t offset = m->nb[1] * m->ne[1];
auto shift_msa = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0); // [N, hidden_size]
auto scale_msa = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1); // [N, hidden_size]
if (!pre_only) {
auto gate_msa = m_vec[2]; // [N, hidden_size]
auto shift_mlp = m_vec[3]; // [N, hidden_size]
auto scale_mlp = m_vec[4]; // [N, hidden_size]
auto gate_mlp = m_vec[5]; // [N, hidden_size]
auto gate_msa = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 2); // [N, hidden_size]
auto shift_mlp = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 3); // [N, hidden_size]
auto scale_mlp = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 4); // [N, hidden_size]
auto gate_mlp = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 5); // [N, hidden_size]
auto attn_in = modulate(ctx->ggml_ctx, norm1->forward(ctx, x), shift_msa, scale_msa);
auto attn_in = modulate(ctx, norm1->forward(ctx, x), shift_msa, scale_msa);
auto qkv = attn->pre_attention(ctx, attn_in);
return {qkv, {x, gate_msa, shift_mlp, scale_mlp, gate_mlp}};
} else {
auto attn_in = modulate(ctx->ggml_ctx, norm1->forward(ctx, x), shift_msa, scale_msa);
auto attn_in = modulate(ctx, norm1->forward(ctx, x), shift_msa, scale_msa);
auto qkv = attn->pre_attention(ctx, attn_in);
return {qkv, {nullptr, nullptr, nullptr, nullptr, nullptr}};
return {qkv, {NULL, NULL, NULL, NULL, NULL}};
}
}
ggml_tensor* post_attention_x(GGMLRunnerContext* ctx,
ggml_tensor* attn_out,
ggml_tensor* attn2_out,
ggml_tensor* x,
ggml_tensor* gate_msa,
ggml_tensor* shift_mlp,
ggml_tensor* scale_mlp,
ggml_tensor* gate_mlp,
ggml_tensor* gate_msa2) {
struct ggml_tensor* post_attention_x(struct ggml_context* ctx,
struct ggml_tensor* attn_out,
struct ggml_tensor* attn2_out,
struct ggml_tensor* x,
struct ggml_tensor* gate_msa,
struct ggml_tensor* shift_mlp,
struct ggml_tensor* scale_mlp,
struct ggml_tensor* gate_mlp,
struct ggml_tensor* gate_msa2) {
// attn_out: [N, n_token, hidden_size]
// x: [N, n_token, hidden_size]
// gate_msa: [N, hidden_size]
@@ -369,28 +374,28 @@ public:
auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
auto mlp = std::dynamic_pointer_cast<Mlp>(blocks["mlp"]);
gate_msa = ggml_reshape_3d(ctx->ggml_ctx, gate_msa, gate_msa->ne[0], 1, gate_msa->ne[1]); // [N, 1, hidden_size]
gate_mlp = ggml_reshape_3d(ctx->ggml_ctx, gate_mlp, gate_mlp->ne[0], 1, gate_mlp->ne[1]); // [N, 1, hidden_size]
gate_msa2 = ggml_reshape_3d(ctx->ggml_ctx, gate_msa2, gate_msa2->ne[0], 1, gate_msa2->ne[1]); // [N, 1, hidden_size]
gate_msa = ggml_reshape_3d(ctx, gate_msa, gate_msa->ne[0], 1, gate_msa->ne[1]); // [N, 1, hidden_size]
gate_mlp = ggml_reshape_3d(ctx, gate_mlp, gate_mlp->ne[0], 1, gate_mlp->ne[1]); // [N, 1, hidden_size]
gate_msa2 = ggml_reshape_3d(ctx, gate_msa2, gate_msa2->ne[0], 1, gate_msa2->ne[1]); // [N, 1, hidden_size]
attn_out = attn->post_attention(ctx, attn_out);
attn2_out = attn2->post_attention(ctx, attn2_out);
x = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, attn_out, gate_msa));
x = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, attn2_out, gate_msa2));
auto mlp_out = mlp->forward(ctx, modulate(ctx->ggml_ctx, norm2->forward(ctx, x), shift_mlp, scale_mlp));
x = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, mlp_out, gate_mlp));
x = ggml_add(ctx, x, ggml_mul(ctx, attn_out, gate_msa));
x = ggml_add(ctx, x, ggml_mul(ctx, attn2_out, gate_msa2));
auto mlp_out = mlp->forward(ctx, modulate(ctx, norm2->forward(ctx, x), shift_mlp, scale_mlp));
x = ggml_add(ctx, x, ggml_mul(ctx, mlp_out, gate_mlp));
return x;
}
ggml_tensor* post_attention(GGMLRunnerContext* ctx,
ggml_tensor* attn_out,
ggml_tensor* x,
ggml_tensor* gate_msa,
ggml_tensor* shift_mlp,
ggml_tensor* scale_mlp,
ggml_tensor* gate_mlp) {
struct ggml_tensor* post_attention(struct ggml_context* ctx,
struct ggml_tensor* attn_out,
struct ggml_tensor* x,
struct ggml_tensor* gate_msa,
struct ggml_tensor* shift_mlp,
struct ggml_tensor* scale_mlp,
struct ggml_tensor* gate_mlp) {
// attn_out: [N, n_token, hidden_size]
// x: [N, n_token, hidden_size]
// gate_msa: [N, hidden_size]
@@ -404,21 +409,22 @@ public:
auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
auto mlp = std::dynamic_pointer_cast<Mlp>(blocks["mlp"]);
gate_msa = ggml_reshape_3d(ctx->ggml_ctx, gate_msa, gate_msa->ne[0], 1, gate_msa->ne[1]); // [N, 1, hidden_size]
gate_mlp = ggml_reshape_3d(ctx->ggml_ctx, gate_mlp, gate_mlp->ne[0], 1, gate_mlp->ne[1]); // [N, 1, hidden_size]
gate_msa = ggml_reshape_3d(ctx, gate_msa, gate_msa->ne[0], 1, gate_msa->ne[1]); // [N, 1, hidden_size]
gate_mlp = ggml_reshape_3d(ctx, gate_mlp, gate_mlp->ne[0], 1, gate_mlp->ne[1]); // [N, 1, hidden_size]
attn_out = attn->post_attention(ctx, attn_out);
x = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, attn_out, gate_msa));
auto mlp_out = mlp->forward(ctx, modulate(ctx->ggml_ctx, norm2->forward(ctx, x), shift_mlp, scale_mlp));
x = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, mlp_out, gate_mlp));
x = ggml_add(ctx, x, ggml_mul(ctx, attn_out, gate_msa));
auto mlp_out = mlp->forward(ctx, modulate(ctx, norm2->forward(ctx, x), shift_mlp, scale_mlp));
x = ggml_add(ctx, x, ggml_mul(ctx, mlp_out, gate_mlp));
return x;
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* c) {
struct ggml_tensor* forward(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* x,
struct ggml_tensor* c) {
// x: [N, n_token, hidden_size]
// c: [N, hidden_size]
// return: [N, n_token, hidden_size]
@ -433,8 +439,8 @@ public:
auto qkv2 = std::get<1>(qkv_intermediates);
auto intermediates = std::get<2>(qkv_intermediates);
auto attn_out = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, dim]
auto attn2_out = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv2[0], qkv2[1], qkv2[2], num_heads, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, dim]
auto attn_out = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, NULL, false, false, flash_attn); // [N, n_token, dim]
auto attn2_out = ggml_nn_attention_ext(ctx, backend, qkv2[0], qkv2[1], qkv2[2], num_heads, NULL, false, false, flash_attn); // [N, n_token, dim]
x = post_attention_x(ctx,
attn_out,
attn2_out,
@ -450,7 +456,7 @@ public:
auto qkv = qkv_intermediates.first;
auto intermediates = qkv_intermediates.second;
auto attn_out = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, dim]
auto attn_out = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, NULL, false, false, flash_attn); // [N, n_token, dim]
x = post_attention(ctx,
attn_out,
intermediates[0],
@ -463,11 +469,13 @@ public:
}
};
__STATIC_INLINE__ std::pair<ggml_tensor*, ggml_tensor*>
block_mixing(GGMLRunnerContext* ctx,
ggml_tensor* context,
ggml_tensor* x,
ggml_tensor* c,
__STATIC_INLINE__ std::pair<struct ggml_tensor*, struct ggml_tensor*>
block_mixing(struct ggml_context* ctx,
ggml_backend_t backend,
bool flash_attn,
struct ggml_tensor* context,
struct ggml_tensor* x,
struct ggml_tensor* c,
std::shared_ptr<DismantledBlock> context_block,
std::shared_ptr<DismantledBlock> x_block) {
// context: [N, n_context, hidden_size]
@ -489,29 +497,31 @@ block_mixing(GGMLRunnerContext* ctx,
x_qkv = x_qkv_intermediates.first;
x_intermediates = x_qkv_intermediates.second;
}
std::vector<ggml_tensor*> qkv;
std::vector<struct ggml_tensor*> qkv;
for (int i = 0; i < 3; i++) {
qkv.push_back(ggml_concat(ctx->ggml_ctx, context_qkv[i], x_qkv[i], 1));
qkv.push_back(ggml_concat(ctx, context_qkv[i], x_qkv[i], 1));
}
auto attn = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv[0], qkv[1], qkv[2], x_block->num_heads, nullptr, false, ctx->flash_attn_enabled); // [N, n_context + n_token, hidden_size]
auto context_attn = ggml_view_3d(ctx->ggml_ctx,
auto attn = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], x_block->num_heads, NULL, false, false, flash_attn); // [N, n_context + n_token, hidden_size]
attn = ggml_cont(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3)); // [n_context + n_token, N, hidden_size]
auto context_attn = ggml_view_3d(ctx,
attn,
attn->ne[0],
attn->ne[1],
context->ne[1],
attn->ne[2],
attn->nb[1],
attn->nb[2],
0); // [N, n_context, hidden_size]
auto x_attn = ggml_view_3d(ctx->ggml_ctx,
0); // [n_context, N, hidden_size]
context_attn = ggml_cont(ctx, ggml_permute(ctx, context_attn, 0, 2, 1, 3)); // [N, n_context, hidden_size]
auto x_attn = ggml_view_3d(ctx,
attn,
attn->ne[0],
attn->ne[1],
x->ne[1],
attn->ne[2],
attn->nb[1],
attn->nb[2],
context->ne[1] * attn->nb[1]); // [N, n_token, hidden_size]
attn->nb[2] * context->ne[1]); // [n_token, N, hidden_size]
x_attn = ggml_cont(ctx, ggml_permute(ctx, x_attn, 0, 2, 1, 3)); // [N, n_token, hidden_size]
if (!context_block->pre_only) {
context = context_block->post_attention(ctx,
@ -522,11 +532,11 @@ block_mixing(GGMLRunnerContext* ctx,
context_intermediates[3],
context_intermediates[4]);
} else {
context = nullptr;
context = NULL;
}
if (x_block->self_attn) {
auto attn2 = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, x_qkv2[0], x_qkv2[1], x_qkv2[2], x_block->num_heads, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, hidden_size]
auto attn2 = ggml_nn_attention_ext(ctx, backend, x_qkv2[0], x_qkv2[1], x_qkv2[2], x_block->num_heads); // [N, n_token, hidden_size]
x = x_block->post_attention_x(ctx,
x_attn,
@ -551,6 +561,8 @@ block_mixing(GGMLRunnerContext* ctx,
}
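block_mixing concatenates the context and image q/k/v along the token axis, runs a single joint attention, then slices the result back apart with ggml_view_3d. Since ggml's ne[0] is the fastest-varying dimension, the token axis here is ne[1], and the image half starts at byte offset n_context * nb[1] (or context->ne[1] * nb[2] in the permuted variant). A standalone sketch of the same split, with assumed names (illustrative only):

// Split a [hidden, n_ctx + n_tok, N] attention output back into its two halves (sketch).
static struct ggml_tensor* split_joint_attn(struct ggml_context* ctx,
                                            struct ggml_tensor* attn,
                                            int64_t n_ctx,
                                            bool take_context) {
    int64_t n_rows = take_context ? n_ctx : attn->ne[1] - n_ctx;
    size_t offset  = take_context ? 0 : n_ctx * attn->nb[1];
    struct ggml_tensor* half = ggml_view_3d(ctx, attn,
                                            attn->ne[0], n_rows, attn->ne[2],
                                            attn->nb[1], attn->nb[2], offset);
    return ggml_cont(ctx, half);  // make the view contiguous before further reshapes
}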
struct JointBlock : public GGMLBlock {
bool flash_attn;
public:
JointBlock(int64_t hidden_size,
int64_t num_heads,
@ -558,19 +570,22 @@ public:
std::string qk_norm = "",
bool qkv_bias = false,
bool pre_only = false,
bool self_attn_x = false) {
blocks["context_block"] = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, pre_only, false));
blocks["x_block"] = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, false, self_attn_x));
bool self_attn_x = false,
bool flash_attn = false)
: flash_attn(flash_attn) {
blocks["context_block"] = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, pre_only, false, flash_attn));
blocks["x_block"] = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, false, self_attn_x, flash_attn));
}
std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
ggml_tensor* context,
ggml_tensor* x,
ggml_tensor* c) {
std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* context,
struct ggml_tensor* x,
struct ggml_tensor* c) {
auto context_block = std::dynamic_pointer_cast<DismantledBlock>(blocks["context_block"]);
auto x_block = std::dynamic_pointer_cast<DismantledBlock>(blocks["x_block"]);
return block_mixing(ctx, context, x, c, context_block, x_block);
return block_mixing(ctx, backend, flash_attn, context, x, c, context_block, x_block);
}
};
@ -586,9 +601,9 @@ public:
blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, 2 * hidden_size));
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* c) {
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* c) {
// x: [N, n_token, hidden_size]
// c: [N, hidden_size]
// return: [N, n_token, patch_size * patch_size * out_channels]
@ -596,12 +611,15 @@ public:
auto linear = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
auto adaLN_modulation_1 = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]);
auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx->ggml_ctx, c)); // [N, 2 * hidden_size]
auto m_vec = ggml_ext_chunk(ctx->ggml_ctx, m, 2, 0);
auto shift = m_vec[0]; // [N, hidden_size]
auto scale = m_vec[1]; // [N, hidden_size]
auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx, c)); // [N, 2 * hidden_size]
m = ggml_reshape_3d(ctx, m, c->ne[0], 2, c->ne[1]); // [N, 2, hidden_size]
m = ggml_cont(ctx, ggml_permute(ctx, m, 0, 2, 1, 3)); // [2, N, hidden_size]
x = modulate(ctx->ggml_ctx, norm_final->forward(ctx, x), shift, scale);
int64_t offset = m->nb[1] * m->ne[1];
auto shift = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0); // [N, hidden_size]
auto scale = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1); // [N, hidden_size]
x = modulate(ctx, norm_final->forward(ctx, x), shift, scale);
x = linear->forward(ctx, x);
return x;
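The two chunking strategies above are equivalent: ggml_ext_chunk splits m in one call, while the reshape/permute/view sequence extracts shift and scale as 2-D slices of the contiguous [hidden_size, N, 2] tensor, where one slice spans nb[1] * ne[1] bytes. A generic sketch of the view-based chunk (helper name is assumed):

// Extract chunk k of a contiguous [d, N, K] tensor as a [d, N] view (illustrative).
static struct ggml_tensor* chunk_view_2d(struct ggml_context* ctx, struct ggml_tensor* m, int64_t k) {
    size_t slice_bytes = m->nb[1] * m->ne[1];  // == m->nb[2] when m is contiguous
    return ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], k * slice_bytes);
}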
@ -612,7 +630,7 @@ struct MMDiT : public GGMLBlock {
// Diffusion model with a Transformer backbone.
protected:
int64_t input_size = -1;
int patch_size = 2;
int64_t patch_size = 2;
int64_t in_channels = 16;
int64_t d_self = -1; // >=0 for MMdiT-X
int64_t depth = 24;
@ -625,14 +643,16 @@ protected:
int64_t context_embedder_out_dim = 1536;
int64_t hidden_size;
std::string qk_norm;
bool flash_attn = false;
void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") {
enum ggml_type wtype = GGML_TYPE_F32;
params["pos_embed"] = ggml_new_tensor_3d(ctx, wtype, hidden_size, num_patchs, 1);
}
public:
MMDiT(const String2TensorStorage& tensor_storage_map = {}) {
MMDiT(bool flash_attn = false, const String2GGMLType& tensor_types = {})
: flash_attn(flash_attn) {
// input_size is always None
// learn_sigma is always False
// register_length is always 0
@ -645,7 +665,8 @@ public:
// pos_embed_offset is not used
// context_embedder_config is always {'target': 'torch.nn.Linear', 'params': {'in_features': 4096, 'out_features': 1536}}
for (auto pair : tensor_storage_map) {
// read tensors from tensor_types
for (auto pair : tensor_types) {
std::string tensor_name = pair.first;
if (tensor_name.find("model.diffusion_model.") == std::string::npos)
continue;
@ -699,14 +720,15 @@ public:
qk_norm,
true,
i == depth - 1,
i <= d_self));
i <= d_self,
flash_attn));
}
blocks["final_layer"] = std::shared_ptr<GGMLBlock>(new FinalLayer(hidden_size, patch_size, out_channels));
}
ggml_tensor*
cropped_pos_embed(ggml_context* ctx,
struct ggml_tensor*
cropped_pos_embed(struct ggml_context* ctx,
int64_t h,
int64_t w) {
auto pos_embed = params["pos_embed"];
@ -745,11 +767,34 @@ public:
return spatial_pos_embed;
}
ggml_tensor* forward_core_with_concat(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* c_mod,
ggml_tensor* context,
std::vector<int> skip_layers = std::vector<int>()) {
struct ggml_tensor* unpatchify(struct ggml_context* ctx,
struct ggml_tensor* x,
int64_t h,
int64_t w) {
// x: [N, H*W, patch_size * patch_size * C]
// return: [N, C, H, W]
int64_t n = x->ne[2];
int64_t c = out_channels;
int64_t p = patch_size;
h = (h + 1) / p;
w = (w + 1) / p;
GGML_ASSERT(h * w == x->ne[1]);
x = ggml_reshape_4d(ctx, x, c, p * p, w * h, n); // [N, H*W, P*P, C]
x = ggml_cont(ctx, ggml_permute(ctx, x, 2, 0, 1, 3)); // [N, C, H*W, P*P]
x = ggml_reshape_4d(ctx, x, p, p, w, h * c * n); // [N*C*H, W, P, P]
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*H, P, W, P]
x = ggml_reshape_4d(ctx, x, p * w, p * h, c, n); // [N, C, H*P, W*P]
return x;
}
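unpatchify inverts the patch embedding: each token carries a p x p x C patch, and the reshape/permute pairs above rebuild the spatial grid (for N=1, C=16, p=2 and a 4x4 latent the input is ne = {64, 4, 1} and the output ne = {4, 4, 16, 1}). A plain CPU reference of the same rearrangement, assuming channel-fastest tokens and row-major patch pixels; the exact in-memory order in the ggml graph follows from its reshape/permute sequence, so this is an illustrative sketch only:

#include <cstdint>
#include <vector>
// CPU sketch of unpatchify: tokens [N][gh*gw][p*p*C] -> image [N][C][h][w], with h = gh*p, w = gw*p.
static std::vector<float> unpatchify_ref(const std::vector<float>& in,
                                         int64_t N, int64_t C, int64_t p, int64_t h, int64_t w) {
    std::vector<float> out((size_t)(N * C * h * w), 0.0f);
    int64_t gh = h / p, gw = w / p;
    for (int64_t n = 0; n < N; n++)
        for (int64_t gy = 0; gy < gh; gy++)
            for (int64_t gx = 0; gx < gw; gx++)
                for (int64_t py = 0; py < p; py++)
                    for (int64_t px = 0; px < p; px++)
                        for (int64_t c = 0; c < C; c++) {
                            int64_t tok = gy * gw + gx;  // token index in the flattened grid
                            size_t src = (size_t)(((n * gh * gw + tok) * p * p + py * p + px) * C + c);
                            size_t dst = (size_t)(((n * C + c) * h + gy * p + py) * w + gx * p + px);
                            out[dst] = in[src];
                        }
    return out;
}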
struct ggml_tensor* forward_core_with_concat(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* x,
struct ggml_tensor* c_mod,
struct ggml_tensor* context,
std::vector<int> skip_layers = std::vector<int>()) {
// x: [N, H*W, hidden_size]
// context: [N, n_context, d_context]
// c: [N, hidden_size]
@ -764,7 +809,7 @@ public:
auto block = std::dynamic_pointer_cast<JointBlock>(blocks["joint_blocks." + std::to_string(i)]);
auto context_x = block->forward(ctx, context, x, c_mod);
auto context_x = block->forward(ctx, backend, context, x, c_mod);
context = context_x.first;
x = context_x.second;
}
@ -774,12 +819,13 @@ public:
return x;
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* t,
ggml_tensor* y = nullptr,
ggml_tensor* context = nullptr,
std::vector<int> skip_layers = std::vector<int>()) {
struct ggml_tensor* forward(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* x,
struct ggml_tensor* t,
struct ggml_tensor* y = NULL,
struct ggml_tensor* context = NULL,
std::vector<int> skip_layers = std::vector<int>()) {
// Forward pass of DiT.
// x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
// t: (N,) tensor of diffusion timesteps
@ -789,30 +835,30 @@ public:
auto x_embedder = std::dynamic_pointer_cast<PatchEmbed>(blocks["x_embedder"]);
auto t_embedder = std::dynamic_pointer_cast<TimestepEmbedder>(blocks["t_embedder"]);
int64_t W = x->ne[0];
int64_t H = x->ne[1];
int64_t w = x->ne[0];
int64_t h = x->ne[1];
auto patch_embed = x_embedder->forward(ctx, x); // [N, H*W, hidden_size]
auto pos_embed = cropped_pos_embed(ctx->ggml_ctx, H, W); // [1, H*W, hidden_size]
x = ggml_add(ctx->ggml_ctx, patch_embed, pos_embed); // [N, H*W, hidden_size]
auto patch_embed = x_embedder->forward(ctx, x); // [N, H*W, hidden_size]
auto pos_embed = cropped_pos_embed(ctx, h, w); // [1, H*W, hidden_size]
x = ggml_add(ctx, patch_embed, pos_embed); // [N, H*W, hidden_size]
auto c = t_embedder->forward(ctx, t); // [N, hidden_size]
if (y != nullptr && adm_in_channels != -1) {
if (y != NULL && adm_in_channels != -1) {
auto y_embedder = std::dynamic_pointer_cast<VectorEmbedder>(blocks["y_embedder"]);
y = y_embedder->forward(ctx, y); // [N, hidden_size]
c = ggml_add(ctx->ggml_ctx, c, y);
c = ggml_add(ctx, c, y);
}
if (context != nullptr) {
if (context != NULL) {
auto context_embedder = std::dynamic_pointer_cast<Linear>(blocks["context_embedder"]);
context = context_embedder->forward(ctx, context); // [N, L, D] aka [N, L, 1536]
}
x = forward_core_with_concat(ctx, x, c, context, skip_layers); // (N, H*W, patch_size ** 2 * out_channels)
x = forward_core_with_concat(ctx, backend, x, c, context, skip_layers); // (N, H*W, patch_size ** 2 * out_channels)
x = DiT::unpatchify_and_crop(ctx->ggml_ctx, x, H, W, patch_size, patch_size, /*patch_last*/ false); // [N, C, H, W]
x = unpatchify(ctx, x, h, w); // [N, C, H, W]
return x;
}
@ -822,72 +868,73 @@ struct MMDiTRunner : public GGMLRunner {
MMDiTRunner(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "")
: GGMLRunner(backend, offload_params_to_cpu), mmdit(tensor_storage_map) {
mmdit.init(params_ctx, tensor_storage_map, prefix);
bool flash_attn,
const String2GGMLType& tensor_types = {},
const std::string prefix = "")
: GGMLRunner(backend, offload_params_to_cpu), mmdit(flash_attn, tensor_types) {
mmdit.init(params_ctx, tensor_types, prefix);
}
std::string get_desc() override {
std::string get_desc() {
return "mmdit";
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
mmdit.get_param_tensors(tensors, prefix);
}
ggml_cgraph* build_graph(ggml_tensor* x,
ggml_tensor* timesteps,
ggml_tensor* context,
ggml_tensor* y,
std::vector<int> skip_layers = std::vector<int>()) {
ggml_cgraph* gf = new_graph_custom(MMDIT_GRAPH_SIZE);
struct ggml_cgraph* build_graph(struct ggml_tensor* x,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* y,
std::vector<int> skip_layers = std::vector<int>()) {
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, MMDIT_GRAPH_SIZE, false);
x = to_backend(x);
context = to_backend(context);
y = to_backend(y);
timesteps = to_backend(timesteps);
auto runner_ctx = get_context();
ggml_tensor* out = mmdit.forward(&runner_ctx,
x,
timesteps,
y,
context,
skip_layers);
struct ggml_tensor* out = mmdit.forward(compute_ctx,
runtime_backend,
x,
timesteps,
y,
context,
skip_layers);
ggml_build_forward_expand(gf, out);
return gf;
}
bool compute(int n_threads,
ggml_tensor* x,
ggml_tensor* timesteps,
ggml_tensor* context,
ggml_tensor* y,
ggml_tensor** output = nullptr,
ggml_context* output_ctx = nullptr,
std::vector<int> skip_layers = std::vector<int>()) {
void compute(int n_threads,
struct ggml_tensor* x,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* y,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL,
std::vector<int> skip_layers = std::vector<int>()) {
// x: [N, in_channels, h, w]
// timesteps: [N, ]
// context: [N, max_position, hidden_size]([N, 154, 4096]) or [1, max_position, hidden_size]
// y: [N, adm_in_channels] or [1, adm_in_channels]
auto get_graph = [&]() -> ggml_cgraph* {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(x, timesteps, context, y, skip_layers);
};
return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
}
void test() {
ggml_init_params params;
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB
params.mem_buffer = nullptr;
params.mem_buffer = NULL;
params.no_alloc = false;
ggml_context* work_ctx = ggml_init(params);
GGML_ASSERT(work_ctx != nullptr);
struct ggml_context* work_ctx = ggml_init(params);
GGML_ASSERT(work_ctx != NULL);
{
// cpu f16: pass
@ -908,14 +955,14 @@ struct MMDiTRunner : public GGMLRunner {
ggml_set_f32(y, 0.01f);
// print_ggml_tensor(y);
ggml_tensor* out = nullptr;
struct ggml_tensor* out = NULL;
int64_t t0 = ggml_time_ms();
int t0 = ggml_time_ms();
compute(8, x, timesteps, context, y, &out, work_ctx);
int64_t t1 = ggml_time_ms();
int t1 = ggml_time_ms();
print_ggml_tensor(out);
LOG_DEBUG("mmdit test done in %lldms", t1 - t0);
LOG_DEBUG("mmdit test done in %dms", t1 - t0);
}
}
@ -923,7 +970,7 @@ struct MMDiTRunner : public GGMLRunner {
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_type model_data_type = GGML_TYPE_F16;
std::shared_ptr<MMDiTRunner> mmdit = std::make_shared<MMDiTRunner>(backend, false);
std::shared_ptr<MMDiTRunner> mmdit = std::shared_ptr<MMDiTRunner>(new MMDiTRunner(backend, false, false));
{
LOG_INFO("loading from '%s'", file_path.c_str());
@ -932,7 +979,7 @@ struct MMDiTRunner : public GGMLRunner {
mmdit->get_param_tensors(tensors, "model.diffusion_model");
ModelLoader model_loader;
if (!model_loader.init_from_file_and_convert_name(file_path)) {
if (!model_loader.init_from_file(file_path)) {
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
return;
}
@ -950,4 +997,4 @@ struct MMDiTRunner : public GGMLRunner {
}
};
#endif
#endif

File diff suppressed because it is too large

View File

@ -8,14 +8,12 @@
#include <sstream>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
#include "ggml-backend.h"
#include "ggml.h"
#include "gguf.h"
#include "json.hpp"
#include "ordered_map.hpp"
#include "zip.h"
#define SD_MAX_DIMS 5
@ -24,60 +22,40 @@ enum SDVersion {
VERSION_SD1,
VERSION_SD1_INPAINT,
VERSION_SD1_PIX2PIX,
VERSION_SD1_TINY_UNET,
VERSION_SD2,
VERSION_SD2_INPAINT,
VERSION_SD2_TINY_UNET,
VERSION_SDXS,
VERSION_SDXL,
VERSION_SDXL_INPAINT,
VERSION_SDXL_PIX2PIX,
VERSION_SDXL_VEGA,
VERSION_SDXL_SSD1B,
VERSION_SVD,
VERSION_SD3,
VERSION_FLUX,
VERSION_FLUX_FILL,
VERSION_FLUX_CONTROLS,
VERSION_FLEX_2,
VERSION_CHROMA_RADIANCE,
VERSION_WAN2,
VERSION_WAN2_2_I2V,
VERSION_WAN2_2_TI2V,
VERSION_QWEN_IMAGE,
VERSION_ANIMA,
VERSION_FLUX2,
VERSION_FLUX2_KLEIN,
VERSION_Z_IMAGE,
VERSION_OVIS_IMAGE,
VERSION_COUNT,
};
static inline bool sd_version_is_sd1(SDVersion version) {
if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET || version == VERSION_SDXS) {
if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX) {
return true;
}
return false;
}
static inline bool sd_version_is_sd2(SDVersion version) {
if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET) {
if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT) {
return true;
}
return false;
}
static inline bool sd_version_is_sdxl(SDVersion version) {
if (version == VERSION_SDXL || version == VERSION_SDXL_INPAINT || version == VERSION_SDXL_PIX2PIX || version == VERSION_SDXL_SSD1B || version == VERSION_SDXL_VEGA) {
return true;
}
return false;
}
static inline bool sd_version_is_unet(SDVersion version) {
if (sd_version_is_sd1(version) ||
sd_version_is_sd2(version) ||
sd_version_is_sdxl(version)) {
if (version == VERSION_SDXL || version == VERSION_SDXL_INPAINT || version == VERSION_SDXL_PIX2PIX) {
return true;
}
return false;
@ -91,19 +69,7 @@ static inline bool sd_version_is_sd3(SDVersion version) {
}
static inline bool sd_version_is_flux(SDVersion version) {
if (version == VERSION_FLUX ||
version == VERSION_FLUX_FILL ||
version == VERSION_FLUX_CONTROLS ||
version == VERSION_FLEX_2 ||
version == VERSION_OVIS_IMAGE ||
version == VERSION_CHROMA_RADIANCE) {
return true;
}
return false;
}
static inline bool sd_version_is_flux2(SDVersion version) {
if (version == VERSION_FLUX2 || version == VERSION_FLUX2_KLEIN) {
if (version == VERSION_FLUX || version == VERSION_FLUX_FILL || version == VERSION_FLUX_CONTROLS || version == VERSION_FLEX_2) {
return true;
}
return false;
@ -123,26 +89,8 @@ static inline bool sd_version_is_qwen_image(SDVersion version) {
return false;
}
static inline bool sd_version_is_anima(SDVersion version) {
if (version == VERSION_ANIMA) {
return true;
}
return false;
}
static inline bool sd_version_is_z_image(SDVersion version) {
if (version == VERSION_Z_IMAGE) {
return true;
}
return false;
}
static inline bool sd_version_is_inpaint(SDVersion version) {
if (version == VERSION_SD1_INPAINT ||
version == VERSION_SD2_INPAINT ||
version == VERSION_SDXL_INPAINT ||
version == VERSION_FLUX_FILL ||
version == VERSION_FLEX_2) {
if (version == VERSION_SD1_INPAINT || version == VERSION_SD2_INPAINT || version == VERSION_SDXL_INPAINT || version == VERSION_FLUX_FILL || version == VERSION_FLEX_2) {
return true;
}
return false;
@ -150,12 +98,9 @@ static inline bool sd_version_is_inpaint(SDVersion version) {
static inline bool sd_version_is_dit(SDVersion version) {
if (sd_version_is_flux(version) ||
sd_version_is_flux2(version) ||
sd_version_is_sd3(version) ||
sd_version_is_wan(version) ||
sd_version_is_qwen_image(version) ||
sd_version_is_anima(version) ||
sd_version_is_z_image(version)) {
sd_version_is_qwen_image(version)) {
return true;
}
return false;
@ -181,7 +126,7 @@ enum PMVersion {
struct TensorStorage {
std::string name;
ggml_type type = GGML_TYPE_F32;
ggml_type expected_type = GGML_TYPE_COUNT;
bool is_bf16 = false;
bool is_f8_e4m3 = false;
bool is_f8_e5m2 = false;
bool is_f64 = false;
@ -195,8 +140,8 @@ struct TensorStorage {
TensorStorage() = default;
TensorStorage(std::string name, ggml_type type, const int64_t* ne, int n_dims, size_t file_index, size_t offset = 0)
: name(std::move(name)), type(type), n_dims(n_dims), file_index(file_index), offset(offset) {
TensorStorage(const std::string& name, ggml_type type, const int64_t* ne, int n_dims, size_t file_index, size_t offset = 0)
: name(name), type(type), n_dims(n_dims), file_index(file_index), offset(offset) {
for (int i = 0; i < n_dims; i++) {
this->ne[i] = ne[i];
}
@ -215,7 +160,7 @@ struct TensorStorage {
}
int64_t nbytes_to_read() const {
if (is_f8_e4m3 || is_f8_e5m2) {
if (is_bf16 || is_f8_e4m3 || is_f8_e5m2) {
return nbytes() / 2;
} else if (is_f64 || is_i64) {
return nbytes() * 2;
@ -263,7 +208,9 @@ struct TensorStorage {
std::string to_string() const {
std::stringstream ss;
const char* type_name = ggml_type_name(type);
if (is_f8_e4m3) {
if (is_bf16) {
type_name = "bf16";
} else if (is_f8_e4m3) {
type_name = "f8_e4m3";
} else if (is_f8_e5m2) {
type_name = "f8_e5m2";
@ -287,15 +234,12 @@ struct TensorStorage {
typedef std::function<bool(const TensorStorage&, ggml_tensor**)> on_new_tensor_cb_t;
typedef OrderedMap<std::string, TensorStorage> String2TensorStorage;
typedef std::map<std::string, enum ggml_type> String2GGMLType;
class ModelLoader {
protected:
SDVersion version_ = VERSION_COUNT;
std::vector<std::string> file_paths_;
String2TensorStorage tensor_storage_map;
void add_tensor_storage(const TensorStorage& tensor_storage);
std::vector<TensorStorage> tensor_storages;
bool parse_data_pkl(uint8_t* buffer,
size_t buffer_size,
@ -310,28 +254,25 @@ protected:
bool init_from_diffusers_file(const std::string& file_path, const std::string& prefix = "");
public:
String2GGMLType tensor_storages_types;
bool init_from_file(const std::string& file_path, const std::string& prefix = "");
void convert_tensors_name();
bool init_from_file_and_convert_name(const std::string& file_path,
const std::string& prefix = "",
SDVersion version = VERSION_COUNT);
bool model_is_unet();
SDVersion get_sd_version();
std::map<ggml_type, uint32_t> get_wtype_stat();
std::map<ggml_type, uint32_t> get_conditioner_wtype_stat();
std::map<ggml_type, uint32_t> get_diffusion_model_wtype_stat();
std::map<ggml_type, uint32_t> get_vae_wtype_stat();
String2TensorStorage& get_tensor_storage_map() { return tensor_storage_map; }
void set_wtype_override(ggml_type wtype, std::string tensor_type_rules = "");
bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0, bool use_mmap = false);
bool load_tensors(std::map<std::string, ggml_tensor*>& tensors,
ggml_type get_sd_wtype();
ggml_type get_conditioner_wtype();
ggml_type get_diffusion_model_wtype();
ggml_type get_vae_wtype();
void set_wtype_override(ggml_type wtype, std::string prefix = "");
bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0);
bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
std::set<std::string> ignore_tensors = {},
int n_threads = 0,
bool use_mmap = false);
int n_threads = 0);
std::vector<std::string> get_tensor_names() const {
std::vector<std::string> names;
for (const auto& [name, tensor_storage] : tensor_storage_map) {
names.push_back(name);
for (const auto& ts : tensor_storages) {
names.push_back(ts.name);
}
return names;
}
@ -340,6 +281,11 @@ public:
bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type);
int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT);
~ModelLoader() = default;
static std::string load_merges();
static std::string load_qwen2_merges();
static std::string load_t5_tokenizer_json();
static std::string load_umt5_tokenizer_json();
};
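A hypothetical usage sketch of the loader API declared above (newer-branch names; the callback/overload contract is inferred from the declarations and is an assumption, not a documented guarantee):

#include "model.h"  // the header above
// Hypothetical usage of ModelLoader (error handling trimmed; names from the header above).
static bool load_model_example(const std::string& path, std::map<std::string, ggml_tensor*>& dst) {
    ModelLoader loader;
    if (!loader.init_from_file_and_convert_name(path)) {
        return false;
    }
    SDVersion version = loader.get_sd_version();  // detected from the tensor names
    (void)version;
    // dst maps converted tensor names to pre-allocated ggml tensors
    return loader.load_tensors(dst, /*ignore_tensors*/ {}, /*n_threads*/ 4, /*use_mmap*/ false);
}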
#endif // __MODEL_H__

View File

@ -21,23 +21,23 @@ public:
blocks["layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(in_dim));
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, channels, h, w]
auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);
auto layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["layernorm"]);
ggml_tensor* r = x;
// x = ggml_ext_layer_norm(ctx, x, ln_w, ln_b);
struct ggml_tensor* r = x;
// x = ggml_nn_layer_norm(ctx, x, ln_w, ln_b);
x = layer_norm->forward(ctx, x);
// x = ggml_add(ctx, ggml_mul_mat(ctx, fc1_w, x), fc1_b);
x = fc1->forward(ctx, x);
x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
x = ggml_gelu_inplace(ctx, x);
x = fc2->forward(ctx, x);
// x = ggml_add(ctx, ggml_mul_mat(ctx, fc2_w, x), fc2_b);
if (use_residue)
x = ggml_add(ctx->ggml_ctx, x, r);
x = ggml_add(ctx, x, r);
return x;
}
};
@ -54,8 +54,8 @@ public:
blocks["1"] = std::shared_ptr<GGMLBlock>(new Mlp(dim, inner_dim, dim, false));
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x) {
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x) {
auto norm = std::dynamic_pointer_cast<LayerNorm>(blocks["0"]);
auto ff = std::dynamic_pointer_cast<Mlp>(blocks["1"]);
@ -72,7 +72,7 @@ struct PerceiverAttention : public GGMLBlock {
int heads; // = heads
public:
PerceiverAttention(int dim, int dim_h = 64, int h = 8)
: scale(powf(static_cast<float>(dim_h), -0.5f)), dim_head(dim_h), heads(h) {
: scale(powf(dim_h, -0.5)), dim_head(dim_h), heads(h) {
int inner_dim = dim_head * heads;
blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
blocks["norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
@ -81,9 +81,9 @@ public:
blocks["to_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim, false));
}
ggml_tensor* reshape_tensor(ggml_context* ctx,
ggml_tensor* x,
int heads) {
struct ggml_tensor* reshape_tensor(struct ggml_context* ctx,
struct ggml_tensor* x,
int heads) {
int64_t ne[4];
for (int i = 0; i < 4; ++i)
ne[i] = x->ne[i];
@ -92,17 +92,17 @@ public:
return x;
}
std::vector<ggml_tensor*> chunk_half(ggml_context* ctx,
ggml_tensor* x) {
std::vector<struct ggml_tensor*> chunk_half(struct ggml_context* ctx,
struct ggml_tensor* x) {
auto tlo = ggml_view_4d(ctx, x, x->ne[0] / 2, x->ne[1], x->ne[2], x->ne[3], x->nb[1], x->nb[2], x->nb[3], 0);
auto tli = ggml_view_4d(ctx, x, x->ne[0] / 2, x->ne[1], x->ne[2], x->ne[3], x->nb[1], x->nb[2], x->nb[3], x->nb[0] * x->ne[0] / 2);
return {ggml_cont(ctx, tlo),
ggml_cont(ctx, tli)};
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* latents) {
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* latents) {
// x (torch.Tensor): image features
// shape (b, n1, D)
// latent (torch.Tensor): latent features
@ -118,33 +118,33 @@ public:
auto to_q = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
auto q = to_q->forward(ctx, latents);
auto kv_input = ggml_concat(ctx->ggml_ctx, x, latents, 1);
auto kv_input = ggml_concat(ctx, x, latents, 1);
auto to_kv = std::dynamic_pointer_cast<Linear>(blocks["to_kv"]);
auto kv = to_kv->forward(ctx, kv_input);
auto k = ggml_view_4d(ctx->ggml_ctx, kv, kv->ne[0] / 2, kv->ne[1], kv->ne[2], kv->ne[3], kv->nb[1] / 2, kv->nb[2] / 2, kv->nb[3] / 2, 0);
auto v = ggml_view_4d(ctx->ggml_ctx, kv, kv->ne[0] / 2, kv->ne[1], kv->ne[2], kv->ne[3], kv->nb[1] / 2, kv->nb[2] / 2, kv->nb[3] / 2, kv->nb[0] * (kv->ne[0] / 2));
k = ggml_cont(ctx->ggml_ctx, k);
v = ggml_cont(ctx->ggml_ctx, v);
q = reshape_tensor(ctx->ggml_ctx, q, heads);
k = reshape_tensor(ctx->ggml_ctx, k, heads);
v = reshape_tensor(ctx->ggml_ctx, v, heads);
auto k = ggml_view_4d(ctx, kv, kv->ne[0] / 2, kv->ne[1], kv->ne[2], kv->ne[3], kv->nb[1] / 2, kv->nb[2] / 2, kv->nb[3] / 2, 0);
auto v = ggml_view_4d(ctx, kv, kv->ne[0] / 2, kv->ne[1], kv->ne[2], kv->ne[3], kv->nb[1] / 2, kv->nb[2] / 2, kv->nb[3] / 2, kv->nb[0] * (kv->ne[0] / 2));
k = ggml_cont(ctx, k);
v = ggml_cont(ctx, v);
q = reshape_tensor(ctx, q, heads);
k = reshape_tensor(ctx, k, heads);
v = reshape_tensor(ctx, v, heads);
scale = 1.f / sqrt(sqrt((float)dim_head));
k = ggml_ext_scale(ctx->ggml_ctx, k, scale, true);
q = ggml_ext_scale(ctx->ggml_ctx, q, scale, true);
k = ggml_scale_inplace(ctx, k, scale);
q = ggml_scale_inplace(ctx, q, scale);
// auto weight = ggml_mul_mat(ctx, q, k);
auto weight = ggml_mul_mat(ctx->ggml_ctx, k, q); // NOTE order of mul is opposite to pytorch
auto weight = ggml_mul_mat(ctx, k, q); // NOTE order of mul is opposite to pytorch
// GGML's softmax() is equivalent to pytorch's softmax(x, dim=-1)
// in this case, dimension along which Softmax will be computed is the last dim
// in torch and the first dim in GGML, consistent with the convention that pytorch's
// last dimension (varying most rapidly) corresponds to GGML's first (varying most rapidly).
// weight = ggml_soft_max(ctx, weight);
weight = ggml_soft_max_inplace(ctx->ggml_ctx, weight);
v = ggml_cont(ctx->ggml_ctx, ggml_transpose(ctx->ggml_ctx, v));
weight = ggml_soft_max_inplace(ctx, weight);
v = ggml_cont(ctx, ggml_transpose(ctx, v));
// auto out = ggml_mul_mat(ctx, weight, v);
auto out = ggml_mul_mat(ctx->ggml_ctx, v, weight); // NOTE order of mul is opposite to pytorch
out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, out, 0, 2, 1, 3));
out = ggml_reshape_3d(ctx->ggml_ctx, out, ne[0], ne[1], ggml_nelements(out) / (ne[0] * ne[1]));
auto out = ggml_mul_mat(ctx, v, weight); // NOTE order of mul is opposite to pytorch
out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3));
out = ggml_reshape_3d(ctx, out, ne[0], ne[1], ggml_nelements(out) / (ne[0] * ne[1]));
auto to_out = std::dynamic_pointer_cast<Linear>(blocks["to_out"]);
out = to_out->forward(ctx, out);
return out;
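The NOTE comments capture a recurring ggml convention: ggml_mul_mat(ctx, a, b) dots the ne[0]-contiguous rows of a against the rows of b, giving a result with ne = {a->ne[1], b->ne[1], ...}. Reproducing PyTorch's q @ k.transpose(-1, -2) therefore means passing (k, q), and softmax over ne[0] matches torch's dim=-1. A minimal CPU check of that ordering (illustrative; single-threaded CPU path assumed):

#include "ggml.h"
static void mul_mat_order_demo() {
    struct ggml_init_params p = {16 * 1024 * 1024, NULL, false};
    struct ggml_context* ctx = ggml_init(p);
    // q: 2 rows of dim 4, k: 3 rows of dim 4 (ne[0] = feature dim, ne[1] = rows)
    struct ggml_tensor* q = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 2);
    struct ggml_tensor* k = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);
    ggml_set_f32(q, 1.0f);
    ggml_set_f32(k, 0.5f);
    struct ggml_tensor* w = ggml_mul_mat(ctx, k, q);  // torch: q @ k^T -> [2, 3]; ggml ne = {3, 2}
    struct ggml_cgraph* gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, w);
    ggml_graph_compute_with_ctx(ctx, gf, 1);
    // every entry is dot(q_i, k_j) = 4 * 1.0 * 0.5 = 2.0
    ggml_free(ctx);
}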
@ -176,9 +176,9 @@ public:
}
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* latents,
ggml_tensor* x) {
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* latents,
struct ggml_tensor* x) {
// x: [N, channels, h, w]
auto proj_in = std::dynamic_pointer_cast<Linear>(blocks["proj_in"]);
auto proj_out = std::dynamic_pointer_cast<Linear>(blocks["proj_out"]);
@ -191,9 +191,9 @@ public:
name = "layers." + std::to_string(i) + ".1";
auto ff = std::dynamic_pointer_cast<PMFeedForward>(blocks[name]);
auto t = attn->forward(ctx, x, latents);
latents = ggml_add(ctx->ggml_ctx, t, latents);
latents = ggml_add(ctx, t, latents);
t = ff->forward(ctx, latents);
latents = ggml_add(ctx->ggml_ctx, t, latents);
latents = ggml_add(ctx, t, latents);
}
latents = proj_out->forward(ctx, latents);
latents = norm_out->forward(ctx, latents);
@ -225,21 +225,21 @@ public:
4));
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* last_hidden_state) {
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* last_hidden_state) {
// x: [N, channels, h, w]
auto token_proj = std::dynamic_pointer_cast<Mlp>(blocks["token_proj"]);
auto token_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["token_norm"]);
auto perceiver_resampler = std::dynamic_pointer_cast<FacePerceiverResampler>(blocks["perceiver_resampler"]);
x = token_proj->forward(ctx, x);
int64_t nel = ggml_nelements(x);
x = ggml_reshape_3d(ctx->ggml_ctx, x, cross_attention_dim, num_tokens, nel / (cross_attention_dim * num_tokens));
x = token_norm->forward(ctx, x);
ggml_tensor* out = perceiver_resampler->forward(ctx, x, last_hidden_state);
x = token_proj->forward(ctx, x);
int64_t nel = ggml_nelements(x);
x = ggml_reshape_3d(ctx, x, cross_attention_dim, num_tokens, nel / (cross_attention_dim * num_tokens));
x = token_norm->forward(ctx, x);
struct ggml_tensor* out = perceiver_resampler->forward(ctx, x, last_hidden_state);
if (use_residul)
out = ggml_add(ctx->ggml_ctx, x, out);
out = ggml_add(ctx, x, out);
return out;
}
};
@ -256,55 +256,55 @@ public:
blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(embed_dim));
}
ggml_tensor* fuse_fn(GGMLRunnerContext* ctx,
ggml_tensor* prompt_embeds,
ggml_tensor* id_embeds) {
struct ggml_tensor* fuse_fn(struct ggml_context* ctx,
struct ggml_tensor* prompt_embeds,
struct ggml_tensor* id_embeds) {
auto mlp1 = std::dynamic_pointer_cast<FuseBlock>(blocks["mlp1"]);
auto mlp2 = std::dynamic_pointer_cast<FuseBlock>(blocks["mlp2"]);
auto layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm"]);
auto stacked_id_embeds = ggml_concat(ctx->ggml_ctx, prompt_embeds, id_embeds, 0);
auto stacked_id_embeds = ggml_concat(ctx, prompt_embeds, id_embeds, 0);
stacked_id_embeds = mlp1->forward(ctx, stacked_id_embeds);
stacked_id_embeds = ggml_add(ctx->ggml_ctx, stacked_id_embeds, prompt_embeds);
stacked_id_embeds = ggml_add(ctx, stacked_id_embeds, prompt_embeds);
stacked_id_embeds = mlp2->forward(ctx, stacked_id_embeds);
stacked_id_embeds = layer_norm->forward(ctx, stacked_id_embeds);
return stacked_id_embeds;
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* prompt_embeds,
ggml_tensor* id_embeds,
ggml_tensor* class_tokens_mask,
ggml_tensor* class_tokens_mask_pos,
ggml_tensor* left,
ggml_tensor* right) {
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* prompt_embeds,
struct ggml_tensor* id_embeds,
struct ggml_tensor* class_tokens_mask,
struct ggml_tensor* class_tokens_mask_pos,
struct ggml_tensor* left,
struct ggml_tensor* right) {
// x: [N, channels, h, w]
ggml_tensor* valid_id_embeds = id_embeds;
struct ggml_tensor* valid_id_embeds = id_embeds;
// # slice out the image token embeddings
ggml_set_name(class_tokens_mask_pos, "class_tokens_mask_pos");
ggml_set_name(prompt_embeds, "prompt_embeds");
ggml_tensor* image_token_embeds = ggml_get_rows(ctx->ggml_ctx, prompt_embeds, class_tokens_mask_pos);
struct ggml_tensor* image_token_embeds = ggml_get_rows(ctx, prompt_embeds, class_tokens_mask_pos);
ggml_set_name(image_token_embeds, "image_token_embeds");
valid_id_embeds = ggml_reshape_2d(ctx->ggml_ctx, valid_id_embeds, valid_id_embeds->ne[0],
ggml_nelements(valid_id_embeds) / valid_id_embeds->ne[0]);
ggml_tensor* stacked_id_embeds = fuse_fn(ctx, image_token_embeds, valid_id_embeds);
valid_id_embeds = ggml_reshape_2d(ctx, valid_id_embeds, valid_id_embeds->ne[0],
ggml_nelements(valid_id_embeds) / valid_id_embeds->ne[0]);
struct ggml_tensor* stacked_id_embeds = fuse_fn(ctx, image_token_embeds, valid_id_embeds);
if (left && right) {
stacked_id_embeds = ggml_concat(ctx->ggml_ctx, left, stacked_id_embeds, 1);
stacked_id_embeds = ggml_concat(ctx->ggml_ctx, stacked_id_embeds, right, 1);
stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds, 1);
stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right, 1);
} else if (left) {
stacked_id_embeds = ggml_concat(ctx->ggml_ctx, left, stacked_id_embeds, 1);
stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds, 1);
} else if (right) {
stacked_id_embeds = ggml_concat(ctx->ggml_ctx, stacked_id_embeds, right, 1);
stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right, 1);
}
class_tokens_mask = ggml_cont(ctx->ggml_ctx, ggml_transpose(ctx->ggml_ctx, class_tokens_mask));
class_tokens_mask = ggml_repeat(ctx->ggml_ctx, class_tokens_mask, prompt_embeds);
prompt_embeds = ggml_mul(ctx->ggml_ctx, prompt_embeds, class_tokens_mask);
ggml_tensor* updated_prompt_embeds = ggml_add(ctx->ggml_ctx, prompt_embeds, stacked_id_embeds);
class_tokens_mask = ggml_cont(ctx, ggml_transpose(ctx, class_tokens_mask));
class_tokens_mask = ggml_repeat(ctx, class_tokens_mask, prompt_embeds);
prompt_embeds = ggml_mul(ctx, prompt_embeds, class_tokens_mask);
struct ggml_tensor* updated_prompt_embeds = ggml_add(ctx, prompt_embeds, stacked_id_embeds);
ggml_set_name(updated_prompt_embeds, "updated_prompt_embeds");
return updated_prompt_embeds;
}
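The tail of FuseModule::forward is a masked merge: multiplying by class_tokens_mask zeroes the prompt embeddings at the image-token positions, and stacked_id_embeds (zero-padded on the left/right to the full sequence length) fills those positions back in. A scalar sketch of the same blend (illustrative):

#include <vector>
// updated[i] = prompt[i] * keep_mask[i] + id_fill[i], where id_fill is zero wherever keep_mask is 1.
static std::vector<float> masked_merge_ref(const std::vector<float>& prompt,
                                           const std::vector<float>& id_fill,
                                           const std::vector<float>& keep_mask) {
    std::vector<float> out(prompt.size());
    for (size_t i = 0; i < prompt.size(); i++) {
        out[i] = prompt[i] * keep_mask[i] + id_fill[i];
    }
    return out;
}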
@ -317,35 +317,36 @@ struct PhotoMakerIDEncoderBlock : public CLIPVisionModelProjection {
blocks["fuse_module"] = std::shared_ptr<GGMLBlock>(new FuseModule(2048));
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* id_pixel_values,
ggml_tensor* prompt_embeds,
ggml_tensor* class_tokens_mask,
ggml_tensor* class_tokens_mask_pos,
ggml_tensor* left,
ggml_tensor* right) {
struct ggml_tensor* forward(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* id_pixel_values,
struct ggml_tensor* prompt_embeds,
struct ggml_tensor* class_tokens_mask,
struct ggml_tensor* class_tokens_mask_pos,
struct ggml_tensor* left,
struct ggml_tensor* right) {
// x: [N, channels, h, w]
auto vision_model = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]);
auto visual_projection = std::dynamic_pointer_cast<CLIPProjection>(blocks["visual_projection"]);
auto visual_projection_2 = std::dynamic_pointer_cast<Linear>(blocks["visual_projection_2"]);
auto fuse_module = std::dynamic_pointer_cast<FuseModule>(blocks["fuse_module"]);
ggml_tensor* shared_id_embeds = vision_model->forward(ctx, id_pixel_values); // [N, hidden_size]
ggml_tensor* id_embeds = visual_projection->forward(ctx, shared_id_embeds); // [N, proj_dim(768)]
ggml_tensor* id_embeds_2 = visual_projection_2->forward(ctx, shared_id_embeds); // [N, 1280]
struct ggml_tensor* shared_id_embeds = vision_model->forward(ctx, backend, id_pixel_values); // [N, hidden_size]
struct ggml_tensor* id_embeds = visual_projection->forward(ctx, shared_id_embeds); // [N, proj_dim(768)]
struct ggml_tensor* id_embeds_2 = visual_projection_2->forward(ctx, shared_id_embeds); // [N, 1280]
id_embeds = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, id_embeds, 2, 0, 1, 3));
id_embeds_2 = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, id_embeds_2, 2, 0, 1, 3));
id_embeds = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 2, 0, 1, 3));
id_embeds_2 = ggml_cont(ctx, ggml_permute(ctx, id_embeds_2, 2, 0, 1, 3));
id_embeds = ggml_concat(ctx->ggml_ctx, id_embeds, id_embeds_2, 2); // [batch_size, seq_length, 1, 2048] check whether concat at dim 2 is right
id_embeds = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, id_embeds, 1, 2, 0, 3));
id_embeds = ggml_concat(ctx, id_embeds, id_embeds_2, 2); // [batch_size, seq_length, 1, 2048] check whether concat at dim 2 is right
id_embeds = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 1, 2, 0, 3));
ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx,
prompt_embeds,
id_embeds,
class_tokens_mask,
class_tokens_mask_pos,
left, right);
struct ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx,
prompt_embeds,
id_embeds,
class_tokens_mask,
class_tokens_mask_pos,
left, right);
return updated_prompt_embeds;
}
};
@ -365,29 +366,30 @@ struct PhotoMakerIDEncoder_CLIPInsightfaceExtendtokenBlock : public CLIPVisionMo
num_tokens));
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* id_pixel_values,
ggml_tensor* prompt_embeds,
ggml_tensor* class_tokens_mask,
ggml_tensor* class_tokens_mask_pos,
ggml_tensor* id_embeds,
ggml_tensor* left,
ggml_tensor* right) {
struct ggml_tensor* forward(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* id_pixel_values,
struct ggml_tensor* prompt_embeds,
struct ggml_tensor* class_tokens_mask,
struct ggml_tensor* class_tokens_mask_pos,
struct ggml_tensor* id_embeds,
struct ggml_tensor* left,
struct ggml_tensor* right) {
// x: [N, channels, h, w]
auto vision_model = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]);
auto fuse_module = std::dynamic_pointer_cast<FuseModule>(blocks["fuse_module"]);
auto qformer_perceiver = std::dynamic_pointer_cast<QFormerPerceiver>(blocks["qformer_perceiver"]);
// ggml_tensor* last_hidden_state = vision_model->forward(ctx, id_pixel_values); // [N, hidden_size]
ggml_tensor* last_hidden_state = vision_model->forward(ctx, id_pixel_values, false); // [N, hidden_size]
id_embeds = qformer_perceiver->forward(ctx, id_embeds, last_hidden_state);
// struct ggml_tensor* last_hidden_state = vision_model->forward(ctx, id_pixel_values); // [N, hidden_size]
struct ggml_tensor* last_hidden_state = vision_model->forward(ctx, backend, id_pixel_values, false); // [N, hidden_size]
id_embeds = qformer_perceiver->forward(ctx, id_embeds, last_hidden_state);
ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx,
prompt_embeds,
id_embeds,
class_tokens_mask,
class_tokens_mask_pos,
left, right);
struct ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx,
prompt_embeds,
id_embeds,
class_tokens_mask,
class_tokens_mask_pos,
left, right);
return updated_prompt_embeds;
}
};
@ -412,7 +414,7 @@ public:
public:
PhotoMakerIDEncoder(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map,
const String2GGMLType& tensor_types,
const std::string prefix,
SDVersion version = VERSION_SDXL,
PMVersion pm_v = PM_VERSION_1,
@ -422,9 +424,9 @@ public:
pm_version(pm_v),
style_strength(sty) {
if (pm_version == PM_VERSION_1) {
id_encoder.init(params_ctx, tensor_storage_map, prefix);
id_encoder.init(params_ctx, tensor_types, prefix);
} else if (pm_version == PM_VERSION_2) {
id_encoder2.init(params_ctx, tensor_storage_map, prefix);
id_encoder2.init(params_ctx, tensor_types, prefix);
}
}
@ -436,18 +438,18 @@ public:
return pm_version;
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
if (pm_version == PM_VERSION_1)
id_encoder.get_param_tensors(tensors, prefix);
else if (pm_version == PM_VERSION_2)
id_encoder2.get_param_tensors(tensors, prefix);
}
ggml_cgraph* build_graph( // ggml_allocr* allocr,
ggml_tensor* id_pixel_values,
ggml_tensor* prompt_embeds,
struct ggml_cgraph* build_graph( // struct ggml_allocr* allocr,
struct ggml_tensor* id_pixel_values,
struct ggml_tensor* prompt_embeds,
std::vector<bool>& class_tokens_mask,
ggml_tensor* id_embeds) {
struct ggml_tensor* id_embeds) {
ctm.clear();
ctmf16.clear();
ctmpos.clear();
@ -456,22 +458,22 @@ public:
zeros_right.clear();
zeros_right_16.clear();
auto runner_ctx = get_context();
ggml_context* ctx0 = compute_ctx;
ggml_cgraph* gf = ggml_new_graph(compute_ctx);
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
int64_t hidden_size = prompt_embeds->ne[0];
int64_t seq_length = prompt_embeds->ne[1];
ggml_type type = GGML_TYPE_F32;
ggml_tensor* class_tokens_mask_d = ggml_new_tensor_1d(runner_ctx.ggml_ctx, type, class_tokens_mask.size());
struct ggml_tensor* class_tokens_mask_d = ggml_new_tensor_1d(ctx0, type, class_tokens_mask.size());
ggml_tensor* id_pixel_values_d = to_backend(id_pixel_values);
ggml_tensor* prompt_embeds_d = to_backend(prompt_embeds);
ggml_tensor* id_embeds_d = to_backend(id_embeds);
struct ggml_tensor* id_pixel_values_d = to_backend(id_pixel_values);
struct ggml_tensor* prompt_embeds_d = to_backend(prompt_embeds);
struct ggml_tensor* id_embeds_d = to_backend(id_embeds);
ggml_tensor* left = nullptr;
ggml_tensor* right = nullptr;
struct ggml_tensor* left = NULL;
struct ggml_tensor* right = NULL;
for (int i = 0; i < class_tokens_mask.size(); i++) {
if (class_tokens_mask[i]) {
// printf(" 1,");
@ -486,16 +488,16 @@ public:
}
// printf("\n");
if (ctmpos[0] > 0) {
// left = ggml_new_tensor_3d(runner_ctx.ggml_ctx, type, hidden_size, 1, ctmpos[0]);
left = ggml_new_tensor_3d(runner_ctx.ggml_ctx, type, hidden_size, ctmpos[0], 1);
// left = ggml_new_tensor_3d(ctx0, type, hidden_size, 1, ctmpos[0]);
left = ggml_new_tensor_3d(ctx0, type, hidden_size, ctmpos[0], 1);
}
if (ctmpos[ctmpos.size() - 1] < seq_length - 1) {
// right = ggml_new_tensor_3d(runner_ctx.ggml_ctx, type,
// right = ggml_new_tensor_3d(ctx0, type,
// hidden_size, 1, seq_length - ctmpos[ctmpos.size() - 1] - 1);
right = ggml_new_tensor_3d(runner_ctx.ggml_ctx, type,
right = ggml_new_tensor_3d(ctx0, type,
hidden_size, seq_length - ctmpos[ctmpos.size() - 1] - 1, 1);
}
ggml_tensor* class_tokens_mask_pos = ggml_new_tensor_1d(runner_ctx.ggml_ctx, GGML_TYPE_I32, ctmpos.size());
struct ggml_tensor* class_tokens_mask_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ctmpos.size());
{
if (type == GGML_TYPE_F16)
@ -526,16 +528,18 @@ public:
}
}
}
ggml_tensor* updated_prompt_embeds = nullptr;
struct ggml_tensor* updated_prompt_embeds = NULL;
if (pm_version == PM_VERSION_1)
updated_prompt_embeds = id_encoder.forward(&runner_ctx,
updated_prompt_embeds = id_encoder.forward(ctx0,
runtime_backend,
id_pixel_values_d,
prompt_embeds_d,
class_tokens_mask_d,
class_tokens_mask_pos,
left, right);
else if (pm_version == PM_VERSION_2)
updated_prompt_embeds = id_encoder2.forward(&runner_ctx,
updated_prompt_embeds = id_encoder2.forward(ctx0,
runtime_backend,
id_pixel_values_d,
prompt_embeds_d,
class_tokens_mask_d,
@ -548,25 +552,25 @@ public:
return gf;
}
bool compute(const int n_threads,
ggml_tensor* id_pixel_values,
ggml_tensor* prompt_embeds,
ggml_tensor* id_embeds,
void compute(const int n_threads,
struct ggml_tensor* id_pixel_values,
struct ggml_tensor* prompt_embeds,
struct ggml_tensor* id_embeds,
std::vector<bool>& class_tokens_mask,
ggml_tensor** updated_prompt_embeds,
struct ggml_tensor** updated_prompt_embeds,
ggml_context* output_ctx) {
auto get_graph = [&]() -> ggml_cgraph* {
auto get_graph = [&]() -> struct ggml_cgraph* {
// return build_graph(compute_allocr, id_pixel_values, prompt_embeds, class_tokens_mask);
return build_graph(id_pixel_values, prompt_embeds, class_tokens_mask, id_embeds);
};
// GGMLRunner::compute(get_graph, n_threads, updated_prompt_embeds);
return GGMLRunner::compute(get_graph, n_threads, true, updated_prompt_embeds, output_ctx);
GGMLRunner::compute(get_graph, n_threads, true, updated_prompt_embeds, output_ctx);
}
};
struct PhotoMakerIDEmbed : public GGMLRunner {
std::map<std::string, ggml_tensor*> tensors;
std::map<std::string, struct ggml_tensor*> tensors;
std::string file_path;
ModelLoader* model_loader;
bool load_failed = false;
@ -578,7 +582,7 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
const std::string& file_path = "",
const std::string& prefix = "")
: file_path(file_path), GGMLRunner(backend, offload_params_to_cpu), model_loader(ml) {
if (!model_loader->init_from_file_and_convert_name(file_path, prefix)) {
if (!model_loader->init_from_file(file_path, prefix)) {
load_failed = true;
}
}
@ -606,11 +610,11 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
}
if (dry_run) {
std::lock_guard<std::mutex> lock(tensor_mutex);
ggml_tensor* real = ggml_new_tensor(params_ctx,
tensor_storage.type,
tensor_storage.n_dims,
tensor_storage.ne);
tensors[name] = real;
struct ggml_tensor* real = ggml_new_tensor(params_ctx,
tensor_storage.type,
tensor_storage.n_dims,
tensor_storage.ne);
tensors[name] = real;
} else {
auto real = tensors[name];
*dst_tensor = real;
@ -629,12 +633,12 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
return true;
}
ggml_tensor* get() {
std::map<std::string, ggml_tensor*>::iterator pos;
struct ggml_tensor* get() {
std::map<std::string, struct ggml_tensor*>::iterator pos;
pos = tensors.find("pmid.id_embeds");
if (pos != tensors.end())
return pos->second;
return nullptr;
return NULL;
}
};

preprocessing.hpp (new file, 226 lines added)
View File

@ -0,0 +1,226 @@
#ifndef __PREPROCESSING_HPP__
#define __PREPROCESSING_HPP__
#include "ggml_extend.hpp"
#define M_PI_ 3.14159265358979323846
void convolve(struct ggml_tensor* input, struct ggml_tensor* output, struct ggml_tensor* kernel, int padding) {
struct ggml_init_params params;
params.mem_size = 80 * input->ne[0] * input->ne[1]; // 20M for 512x512
params.mem_buffer = NULL;
params.no_alloc = false;
struct ggml_context* ctx0 = ggml_init(params);
struct ggml_tensor* kernel_fp16 = ggml_new_tensor_4d(ctx0, GGML_TYPE_F16, kernel->ne[0], kernel->ne[1], 1, 1);
ggml_fp32_to_fp16_row((float*)kernel->data, (ggml_fp16_t*)kernel_fp16->data, ggml_nelements(kernel));
ggml_tensor* h = ggml_conv_2d(ctx0, kernel_fp16, input, 1, 1, padding, padding, 1, 1);
ggml_cgraph* gf = ggml_new_graph(ctx0);
ggml_build_forward_expand(gf, ggml_cpy(ctx0, h, output));
ggml_graph_compute_with_ctx(ctx0, gf, 1);
ggml_free(ctx0);
}
void gaussian_kernel(struct ggml_tensor* kernel) {
int ks_mid = kernel->ne[0] / 2;
float sigma = 1.4f;
float normal = 1.f / (2.0f * M_PI_ * powf(sigma, 2.0f));
for (int y = 0; y < kernel->ne[0]; y++) {
float gx = -ks_mid + y;
for (int x = 0; x < kernel->ne[1]; x++) {
float gy = -ks_mid + x;
float k_ = expf(-((gx * gx + gy * gy) / (2.0f * powf(sigma, 2.0f)))) * normal;
ggml_tensor_set_f32(kernel, k_, x, y);
}
}
}
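gaussian_kernel fills a k x k grid with G(x, y) = exp(-(x^2 + y^2) / (2 * sigma^2)) / (2 * pi * sigma^2) centered on the kernel midpoint (sigma = 1.4, k = 5 here). Note the weights are not re-normalized to sum to 1, so the blur slightly darkens the image; a sum-to-one pass would look like this (sketch, not present in the code above):

static void normalize_kernel(struct ggml_tensor* kernel) {
    float* d = (float*)kernel->data;
    int64_t n = ggml_nelements(kernel);
    float sum = 0.0f;
    for (int64_t i = 0; i < n; i++) sum += d[i];
    if (sum > 0.0f) {
        for (int64_t i = 0; i < n; i++) d[i] /= sum;  // kernel weights now sum to 1
    }
}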
void grayscale(struct ggml_tensor* rgb_img, struct ggml_tensor* grayscale) {
for (int iy = 0; iy < rgb_img->ne[1]; iy++) {
for (int ix = 0; ix < rgb_img->ne[0]; ix++) {
float r = ggml_tensor_get_f32(rgb_img, ix, iy);
float g = ggml_tensor_get_f32(rgb_img, ix, iy, 1);
float b = ggml_tensor_get_f32(rgb_img, ix, iy, 2);
float gray = 0.2989f * r + 0.5870f * g + 0.1140f * b;
ggml_tensor_set_f32(grayscale, gray, ix, iy);
}
}
}
void prop_hypot(struct ggml_tensor* x, struct ggml_tensor* y, struct ggml_tensor* h) {
int n_elements = ggml_nelements(h);
float* dx = (float*)x->data;
float* dy = (float*)y->data;
float* dh = (float*)h->data;
for (int i = 0; i < n_elements; i++) {
dh[i] = sqrtf(dx[i] * dx[i] + dy[i] * dy[i]);
}
}
void prop_arctan2(struct ggml_tensor* x, struct ggml_tensor* y, struct ggml_tensor* h) {
int n_elements = ggml_nelements(h);
float* dx = (float*)x->data;
float* dy = (float*)y->data;
float* dh = (float*)h->data;
for (int i = 0; i < n_elements; i++) {
dh[i] = atan2f(dy[i], dx[i]);
}
}
void normalize_tensor(struct ggml_tensor* g) {
int n_elements = ggml_nelements(g);
float* dg = (float*)g->data;
float max = -INFINITY;
for (int i = 0; i < n_elements; i++) {
max = dg[i] > max ? dg[i] : max;
}
max = 1.0f / max;
for (int i = 0; i < n_elements; i++) {
dg[i] *= max;
}
}
void non_max_supression(struct ggml_tensor* result, struct ggml_tensor* G, struct ggml_tensor* D) {
for (int iy = 1; iy < result->ne[1] - 1; iy++) {
for (int ix = 1; ix < result->ne[0] - 1; ix++) {
float angle = ggml_tensor_get_f32(D, ix, iy) * 180.0f / M_PI_;
angle = angle < 0.0f ? angle + 180.0f : angle;
float q = 1.0f;
float r = 1.0f;
// angle 0
if ((0.0f <= angle && angle < 22.5f) || (157.5f <= angle && angle <= 180.0f)) {
q = ggml_tensor_get_f32(G, ix, iy + 1);
r = ggml_tensor_get_f32(G, ix, iy - 1);
}
// angle 45
else if (22.5f <= angle && angle < 67.5f) {
q = ggml_tensor_get_f32(G, ix + 1, iy - 1);
r = ggml_tensor_get_f32(G, ix - 1, iy + 1);
}
// angle 90
else if (67.5f <= angle && angle < 112.5f) {
q = ggml_tensor_get_f32(G, ix + 1, iy);
r = ggml_tensor_get_f32(G, ix - 1, iy);
}
// angle 135
else if (112.5f <= angle && angle < 157.5f) {
q = ggml_tensor_get_f32(G, ix - 1, iy - 1);
r = ggml_tensor_get_f32(G, ix + 1, iy + 1);
}
float cur = ggml_tensor_get_f32(G, ix, iy);
if ((cur >= q) && (cur >= r)) {
ggml_tensor_set_f32(result, cur, ix, iy);
} else {
ggml_tensor_set_f32(result, 0.0f, ix, iy);
}
}
}
}
void threshold_hystersis(struct ggml_tensor* img, float high_threshold, float low_threshold, float weak, float strong) {
int n_elements = ggml_nelements(img);
float* imd = (float*)img->data;
float max = -INFINITY;
for (int i = 0; i < n_elements; i++) {
max = imd[i] > max ? imd[i] : max;
}
float ht = max * high_threshold;
float lt = ht * low_threshold;
for (int i = 0; i < n_elements; i++) {
float img_v = imd[i];
if (img_v >= ht) { // strong pixel
imd[i] = strong;
} else if (img_v <= ht && img_v >= lt) { // weak pixel
imd[i] = weak;
}
}
for (int iy = 0; iy < img->ne[1]; iy++) {
for (int ix = 0; ix < img->ne[0]; ix++) {
if (ix >= 3 && ix <= img->ne[0] - 3 && iy >= 3 && iy <= img->ne[1] - 3) {
ggml_tensor_set_f32(img, ggml_tensor_get_f32(img, ix, iy), ix, iy);
} else {
ggml_tensor_set_f32(img, 0.0f, ix, iy);
}
}
}
// hysteresis
for (int iy = 1; iy < img->ne[1] - 1; iy++) {
for (int ix = 1; ix < img->ne[0] - 1; ix++) {
float imd_v = ggml_tensor_get_f32(img, ix, iy);
if (imd_v == weak) {
if (ggml_tensor_get_f32(img, ix + 1, iy - 1) == strong || ggml_tensor_get_f32(img, ix + 1, iy) == strong ||
ggml_tensor_get_f32(img, ix, iy - 1) == strong || ggml_tensor_get_f32(img, ix, iy + 1) == strong ||
ggml_tensor_get_f32(img, ix - 1, iy - 1) == strong || ggml_tensor_get_f32(img, ix - 1, iy) == strong) {
ggml_tensor_set_f32(img, strong, ix, iy);
} else {
ggml_tensor_set_f32(img, 0.0f, ix, iy);
}
}
}
}
}
bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold, float weak, float strong, bool inverse) {
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(40 * img.width * img.height); // 10MB for 512x512
params.mem_buffer = NULL;
params.no_alloc = false;
struct ggml_context* work_ctx = ggml_init(params);
if (!work_ctx) {
LOG_ERROR("ggml_init() failed");
return false;
}
float kX[9] = {
-1, 0, 1,
-2, 0, 2,
-1, 0, 1};
float kY[9] = {
1, 2, 1,
0, 0, 0,
-1, -2, -1};
// generate kernel
int kernel_size = 5;
struct ggml_tensor* gkernel = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, kernel_size, kernel_size, 1, 1);
struct ggml_tensor* sf_kx = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1);
memcpy(sf_kx->data, kX, ggml_nbytes(sf_kx));
struct ggml_tensor* sf_ky = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1);
memcpy(sf_ky->data, kY, ggml_nbytes(sf_ky));
gaussian_kernel(gkernel);
struct ggml_tensor* image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, img.width, img.height, 3, 1);
struct ggml_tensor* image_gray = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, img.width, img.height, 1, 1);
struct ggml_tensor* iX = ggml_dup_tensor(work_ctx, image_gray);
struct ggml_tensor* iY = ggml_dup_tensor(work_ctx, image_gray);
struct ggml_tensor* G = ggml_dup_tensor(work_ctx, image_gray);
struct ggml_tensor* tetha = ggml_dup_tensor(work_ctx, image_gray);
sd_image_to_tensor(img, image);
grayscale(image, image_gray);
convolve(image_gray, image_gray, gkernel, 2);
convolve(image_gray, iX, sf_kx, 1);
convolve(image_gray, iY, sf_ky, 1);
prop_hypot(iX, iY, G);
normalize_tensor(G);
prop_arctan2(iX, iY, tetha);
non_max_supression(image_gray, G, tetha);
threshold_hystersis(image_gray, high_threshold, low_threshold, weak, strong);
// to RGB channels
for (int iy = 0; iy < img.height; iy++) {
for (int ix = 0; ix < img.width; ix++) {
float gray = ggml_tensor_get_f32(image_gray, ix, iy);
gray = inverse ? 1.0f - gray : gray;
ggml_tensor_set_f32(image, gray, ix, iy);
ggml_tensor_set_f32(image, gray, ix, iy, 1);
ggml_tensor_set_f32(image, gray, ix, iy, 2);
}
}
sd_tensor_to_image(image, img.data);
ggml_free(work_ctx);
return true;
}
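Putting it together, preprocess_canny runs the classic Canny pipeline in place on an sd_image_t (grayscale, 5x5 Gaussian blur, Sobel gradients, gradient magnitude/direction, non-maximum suppression, double-threshold hysteresis) and writes the edge map back to all three RGB channels. A hypothetical call site (threshold values are illustrative, not project defaults):

static bool make_canny_hint(sd_image_t control) {
    // After this call control.data holds the edge map, replicated across the RGB channels,
    // suitable for use as a ControlNet hint image.
    return preprocess_canny(control, /*high_threshold*/ 0.08f, /*low_threshold*/ 0.08f,
                            /*weak*/ 0.8f, /*strong*/ 1.0f, /*inverse*/ false);
}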
#endif // __PREPROCESSING_HPP__

View File

@ -1,10 +1,9 @@
#ifndef __QWEN_IMAGE_HPP__
#define __QWEN_IMAGE_HPP__
#include <memory>
#include "common_block.hpp"
#include "common.hpp"
#include "flux.hpp"
#include "ggml_extend.hpp"
namespace Qwen {
constexpr int QWEN_IMAGE_GRAPH_SIZE = 20480;
@ -26,18 +25,18 @@ namespace Qwen {
blocks["linear_2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, out_dim, sample_proj_bias));
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* sample,
ggml_tensor* condition = nullptr) {
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* sample,
struct ggml_tensor* condition = nullptr) {
if (condition != nullptr) {
auto cond_proj = std::dynamic_pointer_cast<Linear>(blocks["cond_proj"]);
sample = ggml_add(ctx->ggml_ctx, sample, cond_proj->forward(ctx, condition));
sample = ggml_add(ctx, sample, cond_proj->forward(ctx, condition));
}
auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["linear_1"]);
auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["linear_2"]);
sample = linear_1->forward(ctx, sample);
sample = ggml_silu_inplace(ctx->ggml_ctx, sample);
sample = ggml_silu_inplace(ctx, sample);
sample = linear_2->forward(ctx, sample);
return sample;
}
@ -49,13 +48,13 @@ namespace Qwen {
blocks["timestep_embedder"] = std::shared_ptr<GGMLBlock>(new TimestepEmbedding(256, embedding_dim));
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* timesteps) {
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* timesteps) {
// timesteps: [N,]
// return: [N, embedding_dim]
auto timestep_embedder = std::dynamic_pointer_cast<TimestepEmbedding>(blocks["timestep_embedder"]);
auto timesteps_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 256, 10000, 1.f);
auto timesteps_proj = ggml_nn_timestep_embedding(ctx, timesteps, 256, 10000, 1.f);
auto timesteps_emb = timestep_embedder->forward(ctx, timesteps_proj);
return timesteps_emb;
}
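// For intuition: the 256-wide sinusoidal projection used above maps a scalar
// timestep to cos/sin features at geometrically spaced frequencies. A hedged
// standalone sketch (assumes <cmath> and <vector>; the exact cos/sin layout
// inside the ggml helper is an assumption here, not verified):
static std::vector<float> timestep_embedding_sketch(float t, int dim = 256, float max_period = 10000.f) {
    int half = dim / 2;
    std::vector<float> emb(dim);
    for (int i = 0; i < half; i++) {
        float freq = std::exp(-std::log(max_period) * i / half);  // 1.0 down to ~1/max_period
        emb[i]        = std::cos(t * freq);
        emb[half + i] = std::sin(t * freq);
    }
    return emb;
}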
@ -64,6 +63,7 @@ namespace Qwen {
struct QwenImageAttention : public GGMLBlock {
protected:
int64_t dim_head;
bool flash_attn;
public:
QwenImageAttention(int64_t query_dim,
@ -73,8 +73,9 @@ namespace Qwen {
int64_t out_context_dim = 0,
bool bias = true,
bool out_bias = true,
float eps = 1e-6)
: dim_head(dim_head) {
float eps = 1e-6,
bool flash_attn = false)
: dim_head(dim_head), flash_attn(flash_attn) {
int64_t inner_dim = out_dim > 0 ? out_dim : dim_head * num_heads;
out_dim = out_dim > 0 ? out_dim : query_dim;
out_context_dim = out_context_dim > 0 ? out_context_dim : query_dim;
@ -93,24 +94,21 @@ namespace Qwen {
blocks["norm_added_q"] = std::shared_ptr<GGMLBlock>(new RMSNorm(dim_head, eps));
blocks["norm_added_k"] = std::shared_ptr<GGMLBlock>(new RMSNorm(dim_head, eps));
float scale = 1.f / 32.f;
bool force_prec_f32 = false;
#ifdef SD_USE_VULKAN
force_prec_f32 = true;
#endif
float scale = 1.f / 32.f;
// The scale here prevents NaN issues in certain situations,
// e.g. when running on CUDA with k-quant weights (not all prompts trigger it).
blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_dim, out_bias, false, force_prec_f32, scale));
blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_dim, out_bias, false, false, scale));
// to_out.1 is nn.Dropout
blocks["to_add_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_context_dim, out_bias, false, false, scale));
}
std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
ggml_tensor* img,
ggml_tensor* txt,
ggml_tensor* pe,
ggml_tensor* mask = nullptr) {
std::pair<ggml_tensor*, ggml_tensor*> forward(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* img,
struct ggml_tensor* txt,
struct ggml_tensor* pe,
struct ggml_tensor* mask = nullptr) {
// img: [N, n_img_token, hidden_size]
// txt: [N, n_txt_token, hidden_size]
// pe: [n_img_token + n_txt_token, d_head/2, 2, 2]
@ -138,48 +136,49 @@ namespace Qwen {
auto img_q = to_q->forward(ctx, img);
int64_t num_heads = img_q->ne[0] / dim_head;
img_q = ggml_reshape_4d(ctx->ggml_ctx, img_q, dim_head, num_heads, n_img_token, N); // [N, n_img_token, n_head, d_head]
img_q = ggml_reshape_4d(ctx, img_q, dim_head, num_heads, n_img_token, N); // [N, n_img_token, n_head, d_head]
auto img_k = to_k->forward(ctx, img);
img_k = ggml_reshape_4d(ctx->ggml_ctx, img_k, dim_head, num_heads, n_img_token, N); // [N, n_img_token, n_head, d_head]
img_k = ggml_reshape_4d(ctx, img_k, dim_head, num_heads, n_img_token, N); // [N, n_img_token, n_head, d_head]
auto img_v = to_v->forward(ctx, img);
img_v = ggml_reshape_4d(ctx->ggml_ctx, img_v, dim_head, num_heads, n_img_token, N); // [N, n_img_token, n_head, d_head]
img_v = ggml_reshape_4d(ctx, img_v, dim_head, num_heads, n_img_token, N); // [N, n_img_token, n_head, d_head]
img_q = norm_q->forward(ctx, img_q);
img_k = norm_k->forward(ctx, img_k);
auto txt_q = add_q_proj->forward(ctx, txt);
txt_q = ggml_reshape_4d(ctx->ggml_ctx, txt_q, dim_head, num_heads, n_txt_token, N); // [N, n_txt_token, n_head, d_head]
txt_q = ggml_reshape_4d(ctx, txt_q, dim_head, num_heads, n_txt_token, N); // [N, n_txt_token, n_head, d_head]
auto txt_k = add_k_proj->forward(ctx, txt);
txt_k = ggml_reshape_4d(ctx->ggml_ctx, txt_k, dim_head, num_heads, n_txt_token, N); // [N, n_txt_token, n_head, d_head]
txt_k = ggml_reshape_4d(ctx, txt_k, dim_head, num_heads, n_txt_token, N); // [N, n_txt_token, n_head, d_head]
auto txt_v = add_v_proj->forward(ctx, txt);
txt_v = ggml_reshape_4d(ctx->ggml_ctx, txt_v, dim_head, num_heads, n_txt_token, N); // [N, n_txt_token, n_head, d_head]
txt_v = ggml_reshape_4d(ctx, txt_v, dim_head, num_heads, n_txt_token, N); // [N, n_txt_token, n_head, d_head]
txt_q = norm_added_q->forward(ctx, txt_q);
txt_k = norm_added_k->forward(ctx, txt_k);
auto q = ggml_concat(ctx->ggml_ctx, txt_q, img_q, 2); // [N, n_txt_token + n_img_token, n_head, d_head]
auto k = ggml_concat(ctx->ggml_ctx, txt_k, img_k, 2); // [N, n_txt_token + n_img_token, n_head, d_head]
auto v = ggml_concat(ctx->ggml_ctx, txt_v, img_v, 2); // [N, n_txt_token + n_img_token, n_head, d_head]
auto q = ggml_concat(ctx, txt_q, img_q, 2); // [N, n_txt_token + n_img_token, n_head, d_head]
auto k = ggml_concat(ctx, txt_k, img_k, 2); // [N, n_txt_token + n_img_token, n_head, d_head]
auto v = ggml_concat(ctx, txt_v, img_v, 2); // [N, n_txt_token + n_img_token, n_head, d_head]
auto attn = Rope::attention(ctx, q, k, v, pe, mask, (1.0f / 128.f)); // [N, n_txt_token + n_img_token, n_head*d_head]
auto txt_attn_out = ggml_view_3d(ctx->ggml_ctx,
auto attn = Rope::attention(ctx, backend, q, k, v, pe, mask, flash_attn, (1.0f / 128.f)); // [N, n_txt_token + n_img_token, n_head*d_head]
attn = ggml_cont(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3)); // [n_txt_token + n_img_token, N, hidden_size]
auto txt_attn_out = ggml_view_3d(ctx,
attn,
attn->ne[0],
attn->ne[1],
txt->ne[1],
attn->ne[2],
attn->nb[1],
attn->nb[2],
0); // [N, n_txt_token, n_head*d_head]
auto img_attn_out = ggml_view_3d(ctx->ggml_ctx,
0); // [n_txt_token, N, hidden_size]
txt_attn_out = ggml_cont(ctx, ggml_permute(ctx, txt_attn_out, 0, 2, 1, 3)); // [N, n_txt_token, hidden_size]
auto img_attn_out = ggml_view_3d(ctx,
attn,
attn->ne[0],
attn->ne[1],
img->ne[1],
attn->ne[2],
attn->nb[1],
attn->nb[2],
txt->ne[1] * attn->nb[1]); // [N, n_img_token, n_head*d_head]
img_attn_out = ggml_cont(ctx->ggml_ctx, img_attn_out);
txt_attn_out = ggml_cont(ctx->ggml_ctx, txt_attn_out);
attn->nb[2] * txt->ne[1]); // [n_img_token, N, hidden_size]
img_attn_out = ggml_cont(ctx, ggml_permute(ctx, img_attn_out, 0, 2, 1, 3)); // [N, n_img_token, hidden_size]
img_attn_out = to_out_0->forward(ctx, img_attn_out);
txt_attn_out = to_add_out->forward(ctx, txt_attn_out);
@ -189,16 +188,12 @@ namespace Qwen {
};
class QwenImageTransformerBlock : public GGMLBlock {
protected:
bool zero_cond_t;
public:
QwenImageTransformerBlock(int64_t dim,
int64_t num_attention_heads,
int64_t attention_head_dim,
float eps = 1e-6,
bool zero_cond_t = false)
: zero_cond_t(zero_cond_t) {
float eps = 1e-6,
bool flash_attn = false) {
// img_mod.0 is nn.SiLU()
blocks["img_mod.1"] = std::shared_ptr<GGMLBlock>(new Linear(dim, 6 * dim, true));
@ -211,7 +206,7 @@ namespace Qwen {
blocks["txt_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim, eps, false));
blocks["txt_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim, eps, false));
blocks["txt_mlp"] = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim, 4, FeedForward::Activation::GELU, true));
blocks["txt_mlp"] = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim, 4, FeedForward::Activation::GELU));
blocks["attn"] = std::shared_ptr<GGMLBlock>(new QwenImageAttention(dim,
attention_head_dim,
@ -220,40 +215,16 @@ namespace Qwen {
0, // out_context_dim
true, // bias
true, // out_bias
eps));
eps,
flash_attn));
}
std::vector<ggml_tensor*> get_mod_params_vec(ggml_context* ctx, ggml_tensor* mod_params, ggml_tensor* index = nullptr) {
// index: [N, n_img_token]
// mod_params: [N, hidden_size * 12]
if (index == nullptr) {
return ggml_ext_chunk(ctx, mod_params, 6, 0);
}
mod_params = ggml_reshape_1d(ctx, mod_params, ggml_nelements(mod_params));
auto mod_params_vec = ggml_ext_chunk(ctx, mod_params, 12, 0);
index = ggml_reshape_3d(ctx, index, 1, index->ne[0], index->ne[1]); // [N, n_img_token, 1]
index = ggml_repeat_4d(ctx, index, mod_params_vec[0]->ne[0], index->ne[1], index->ne[2], index->ne[3]); // [N, n_img_token, hidden_size]
std::vector<ggml_tensor*> mod_results;
for (int i = 0; i < 6; i++) {
auto mod_0 = mod_params_vec[i];
auto mod_1 = mod_params_vec[i + 6];
// mod_result = torch.where(index == 0, mod_0, mod_1)
// mod_result = (1 - index)*mod_0 + index*mod_1
mod_0 = ggml_sub(ctx, ggml_repeat(ctx, mod_0, index), ggml_mul(ctx, index, mod_0)); // [N, n_img_token, hidden_size]
mod_1 = ggml_mul(ctx, index, mod_1); // [N, n_img_token, hidden_size]
auto mod_result = ggml_add(ctx, mod_0, mod_1);
mod_results.push_back(mod_result);
}
return mod_results;
}
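// Scalar form of the blend above: (1 - index) * mod_0 + index * mod_1
// reproduces torch.where(index == 0, mod_0, mod_1) for index in {0, 1}:
static float select_mod(float index, float mod_0, float mod_1) {
    return (1.0f - index) * mod_0 + index * mod_1;  // index 0 -> mod_0, index 1 -> mod_1
}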
virtual std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
ggml_tensor* img,
ggml_tensor* txt,
ggml_tensor* t_emb,
ggml_tensor* pe,
ggml_tensor* modulate_index = nullptr) {
virtual std::pair<ggml_tensor*, ggml_tensor*> forward(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* img,
struct ggml_tensor* txt,
struct ggml_tensor* t_emb,
struct ggml_tensor* pe) {
// img: [N, n_img_token, hidden_size]
// txt: [N, n_txt_token, hidden_size]
// pe: [n_img_token + n_txt_token, d_head/2, 2, 2]
@ -271,44 +242,40 @@ namespace Qwen {
auto attn = std::dynamic_pointer_cast<QwenImageAttention>(blocks["attn"]);
auto img_mod_params = ggml_silu(ctx->ggml_ctx, t_emb);
auto img_mod_params = ggml_silu(ctx, t_emb);
img_mod_params = img_mod_1->forward(ctx, img_mod_params);
auto img_mod_param_vec = get_mod_params_vec(ctx->ggml_ctx, img_mod_params, modulate_index);
auto img_mod_param_vec = ggml_chunk(ctx, img_mod_params, 6, 0);
if (zero_cond_t) {
t_emb = ggml_ext_chunk(ctx->ggml_ctx, t_emb, 2, 1)[0];
}
auto txt_mod_params = ggml_silu(ctx->ggml_ctx, t_emb);
auto txt_mod_params = ggml_silu(ctx, t_emb);
txt_mod_params = txt_mod_1->forward(ctx, txt_mod_params);
auto txt_mod_param_vec = get_mod_params_vec(ctx->ggml_ctx, txt_mod_params);
auto txt_mod_param_vec = ggml_chunk(ctx, txt_mod_params, 6, 0);
auto img_normed = img_norm1->forward(ctx, img);
auto img_modulated = Flux::modulate(ctx->ggml_ctx, img_normed, img_mod_param_vec[0], img_mod_param_vec[1], modulate_index != nullptr);
auto img_modulated = Flux::modulate(ctx, img_normed, img_mod_param_vec[0], img_mod_param_vec[1]);
auto img_gate1 = img_mod_param_vec[2];
auto txt_normed = txt_norm1->forward(ctx, txt);
auto txt_modulated = Flux::modulate(ctx->ggml_ctx, txt_normed, txt_mod_param_vec[0], txt_mod_param_vec[1]);
auto txt_modulated = Flux::modulate(ctx, txt_normed, txt_mod_param_vec[0], txt_mod_param_vec[1]);
auto txt_gate1 = txt_mod_param_vec[2];
auto [img_attn_output, txt_attn_output] = attn->forward(ctx, img_modulated, txt_modulated, pe);
auto [img_attn_output, txt_attn_output] = attn->forward(ctx, backend, img_modulated, txt_modulated, pe);
img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_attn_output, img_gate1));
txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_attn_output, txt_gate1));
img = ggml_add(ctx, img, ggml_mul(ctx, img_attn_output, img_gate1));
txt = ggml_add(ctx, txt, ggml_mul(ctx, txt_attn_output, txt_gate1));
auto img_normed2 = img_norm2->forward(ctx, img);
auto img_modulated2 = Flux::modulate(ctx->ggml_ctx, img_normed2, img_mod_param_vec[3], img_mod_param_vec[4], modulate_index != nullptr);
auto img_modulated2 = Flux::modulate(ctx, img_normed2, img_mod_param_vec[3], img_mod_param_vec[4]);
auto img_gate2 = img_mod_param_vec[5];
auto txt_normed2 = txt_norm2->forward(ctx, txt);
auto txt_modulated2 = Flux::modulate(ctx->ggml_ctx, txt_normed2, txt_mod_param_vec[3], txt_mod_param_vec[4]);
auto txt_modulated2 = Flux::modulate(ctx, txt_normed2, txt_mod_param_vec[3], txt_mod_param_vec[4]);
auto txt_gate2 = txt_mod_param_vec[5];
auto img_mlp_out = img_mlp->forward(ctx, img_modulated2);
auto txt_mlp_out = txt_mlp->forward(ctx, txt_modulated2);
img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_mlp_out, img_gate2));
txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_mlp_out, txt_gate2));
img = ggml_add(ctx, img, ggml_mul(ctx, img_mlp_out, img_gate2));
txt = ggml_add(ctx, txt, ggml_mul(ctx, txt_mlp_out, txt_gate2));
return {img, txt};
}
@ -325,9 +292,9 @@ namespace Qwen {
blocks["linear"] = std::shared_ptr<GGMLBlock>(new Linear(conditioning_embedding_dim, embedding_dim * 2, bias));
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* c) {
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* c) {
// x: [N, n_token, hidden_size]
// c: [N, hidden_size]
// return: [N, n_token, patch_size * patch_size * out_channels]
@ -335,30 +302,30 @@ namespace Qwen {
auto norm = std::dynamic_pointer_cast<LayerNorm>(blocks["norm"]);
auto linear = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
auto emb = linear->forward(ctx, ggml_silu(ctx->ggml_ctx, c));
auto mods = ggml_ext_chunk(ctx->ggml_ctx, emb, 2, 0);
auto emb = linear->forward(ctx, ggml_silu(ctx, c));
auto mods = ggml_chunk(ctx, emb, 2, 0);
auto scale = mods[0];
auto shift = mods[1];
x = norm->forward(ctx, x);
x = Flux::modulate(ctx->ggml_ctx, x, shift, scale);
x = Flux::modulate(ctx, x, shift, scale);
return x;
}
};
struct QwenImageParams {
int patch_size = 2;
int64_t patch_size = 2;
int64_t in_channels = 64;
int64_t out_channels = 16;
int num_layers = 60;
int64_t num_layers = 60;
int64_t attention_head_dim = 128;
int64_t num_attention_heads = 24;
int64_t joint_attention_dim = 3584;
int theta = 10000;
float theta = 10000;
std::vector<int> axes_dim = {16, 56, 56};
int axes_dim_sum = 128;
bool zero_cond_t = false;
int64_t axes_dim_sum = 128;
bool flash_attn = false;
};
class QwenImageModel : public GGMLBlock {
@ -381,7 +348,7 @@ namespace Qwen {
params.num_attention_heads,
params.attention_head_dim,
1e-6f,
params.zero_cond_t));
params.flash_attn));
blocks["transformer_blocks." + std::to_string(i)] = block;
}
@ -389,12 +356,75 @@ namespace Qwen {
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, params.patch_size * params.patch_size * params.out_channels));
}
ggml_tensor* forward_orig(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* timestep,
ggml_tensor* context,
ggml_tensor* pe,
ggml_tensor* modulate_index = nullptr) {
struct ggml_tensor* pad_to_patch_size(struct ggml_context* ctx,
struct ggml_tensor* x) {
int64_t W = x->ne[0];
int64_t H = x->ne[1];
int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size;
int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size;
x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w]
return x;
}
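// Worked example of the padding arithmetic: with patch_size = 2 and W = 21,
// pad_w = (2 - 21 % 2) % 2 = 1, so the width is padded to 22. An already
// aligned H = 30 gives pad_h = (2 - 30 % 2) % 2 = 0; the outer % keeps
// aligned sizes from being padded by a full patch.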
struct ggml_tensor* patchify(struct ggml_context* ctx,
struct ggml_tensor* x) {
// x: [N, C, H, W]
// return: [N, h*w, C * patch_size * patch_size]
int64_t N = x->ne[3];
int64_t C = x->ne[2];
int64_t H = x->ne[1];
int64_t W = x->ne[0];
int64_t p = params.patch_size;
int64_t h = H / params.patch_size;
int64_t w = W / params.patch_size;
GGML_ASSERT(h * p == H && w * p == W);
x = ggml_reshape_4d(ctx, x, p, w, p, h * C * N); // [N*C*h, p, w, p]
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*h, w, p, p]
x = ggml_reshape_4d(ctx, x, p * p, w * h, C, N); // [N, C, h*w, p*p]
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N, h*w, C, p*p]
x = ggml_reshape_3d(ctx, x, p * p * C, w * h, N); // [N, h*w, C*p*p]
return x;
}
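// A plain-loop reference for the reshape/permute chain above, useful for
// checking the (N, C, H, W) -> (N, h*w, C*p*p) layout. A sketch assuming
// row-major storage with W fastest (matching ggml's ne[0] == W here) and
// <vector> available:
static std::vector<float> patchify_ref(const std::vector<float>& x,
                                       int N, int C, int H, int W, int p) {
    int h = H / p, w = W / p;
    std::vector<float> out((size_t)N * h * w * C * p * p);
    for (int n = 0; n < N; n++)
        for (int c = 0; c < C; c++)
            for (int y = 0; y < H; y++)
                for (int xi = 0; xi < W; xi++) {
                    int py = y / p, iy = y % p;   // patch row, offset within patch
                    int px = xi / p, ix = xi % p; // patch col, offset within patch
                    size_t src = (((size_t)n * C + c) * H + y) * W + xi;
                    size_t dst = (((size_t)n * h * w + (size_t)py * w + px) * C + c) * (size_t)p * p
                               + (size_t)iy * p + ix;
                    out[dst] = x[src];
                }
    return out;
}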
struct ggml_tensor* process_img(struct ggml_context* ctx,
struct ggml_tensor* x) {
x = pad_to_patch_size(ctx, x);
x = patchify(ctx, x);
return x;
}
struct ggml_tensor* unpatchify(struct ggml_context* ctx,
struct ggml_tensor* x,
int64_t h,
int64_t w) {
// x: [N, h*w, C*patch_size*patch_size]
// return: [N, C, H, W]
int64_t N = x->ne[2];
int64_t C = x->ne[0] / params.patch_size / params.patch_size;
int64_t H = h * params.patch_size;
int64_t W = w * params.patch_size;
int64_t p = params.patch_size;
GGML_ASSERT(C * p * p == x->ne[0]);
x = ggml_reshape_4d(ctx, x, p * p, C, w * h, N); // [N, h*w, C, p*p]
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N, C, h*w, p*p]
x = ggml_reshape_4d(ctx, x, p, p, w, h * C * N); // [N*C*h, w, p, p]
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*h, p, w, p]
x = ggml_reshape_4d(ctx, x, W, H, C, N); // [N, C, h*p, w*p]
return x;
}
struct ggml_tensor* forward_orig(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* x,
struct ggml_tensor* timestep,
struct ggml_tensor* context,
struct ggml_tensor* pe) {
auto time_text_embed = std::dynamic_pointer_cast<QwenTimestepProjEmbeddings>(blocks["time_text_embed"]);
auto txt_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm"]);
auto img_in = std::dynamic_pointer_cast<Linear>(blocks["img_in"]);
@ -403,39 +433,31 @@ namespace Qwen {
auto proj_out = std::dynamic_pointer_cast<Linear>(blocks["proj_out"]);
auto t_emb = time_text_embed->forward(ctx, timestep);
if (params.zero_cond_t) {
auto t_emb_0 = time_text_embed->forward(ctx, ggml_ext_zeros_like(ctx->ggml_ctx, timestep));
t_emb = ggml_concat(ctx->ggml_ctx, t_emb, t_emb_0, 1);
}
auto img = img_in->forward(ctx, x);
auto txt = txt_norm->forward(ctx, context);
txt = txt_in->forward(ctx, txt);
auto img = img_in->forward(ctx, x);
auto txt = txt_norm->forward(ctx, context);
txt = txt_in->forward(ctx, txt);
for (int i = 0; i < params.num_layers; i++) {
auto block = std::dynamic_pointer_cast<QwenImageTransformerBlock>(blocks["transformer_blocks." + std::to_string(i)]);
auto result = block->forward(ctx, img, txt, t_emb, pe, modulate_index);
auto result = block->forward(ctx, backend, img, txt, t_emb, pe);
img = result.first;
txt = result.second;
}
if (params.zero_cond_t) {
t_emb = ggml_ext_chunk(ctx->ggml_ctx, t_emb, 2, 1)[0];
}
img = norm_out->forward(ctx, img, t_emb);
img = proj_out->forward(ctx, img);
return img;
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* timestep,
ggml_tensor* context,
ggml_tensor* pe,
std::vector<ggml_tensor*> ref_latents = {},
ggml_tensor* modulate_index = nullptr) {
struct ggml_tensor* forward(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* x,
struct ggml_tensor* timestep,
struct ggml_tensor* context,
struct ggml_tensor* pe,
std::vector<ggml_tensor*> ref_latents = {}) {
// Forward pass of DiT.
// x: [N, C, H, W]
// timestep: [N,]
@ -448,25 +470,32 @@ namespace Qwen {
int64_t C = x->ne[2];
int64_t N = x->ne[3];
auto img = DiT::pad_and_patchify(ctx, x, params.patch_size, params.patch_size);
int64_t img_tokens = img->ne[1];
auto img = process_img(ctx, x);
uint64_t img_tokens = img->ne[1];
if (ref_latents.size() > 0) {
for (ggml_tensor* ref : ref_latents) {
ref = DiT::pad_and_patchify(ctx, ref, params.patch_size, params.patch_size);
img = ggml_concat(ctx->ggml_ctx, img, ref, 1);
ref = process_img(ctx, ref);
img = ggml_concat(ctx, img, ref, 1);
}
}
auto out = forward_orig(ctx, img, timestep, context, pe, modulate_index); // [N, h_len*w_len, ph*pw*C]
int64_t h_len = ((H + (params.patch_size / 2)) / params.patch_size);
int64_t w_len = ((W + (params.patch_size / 2)) / params.patch_size);
auto out = forward_orig(ctx, backend, img, timestep, context, pe); // [N, h_len*w_len, ph*pw*C]
if (out->ne[1] > img_tokens) {
out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, out, 0, 2, 1, 3)); // [num_tokens, N, C * patch_size * patch_size]
out = ggml_view_3d(ctx->ggml_ctx, out, out->ne[0], out->ne[1], img_tokens, out->nb[1], out->nb[2], 0);
out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, out, 0, 2, 1, 3)); // [N, h*w, C * patch_size * patch_size]
out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3)); // [num_tokens, N, C * patch_size * patch_size]
out = ggml_view_3d(ctx, out, out->ne[0], out->ne[1], img_tokens, out->nb[1], out->nb[2], 0);
out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3)); // [N, h*w, C * patch_size * patch_size]
}
out = DiT::unpatchify_and_crop(ctx->ggml_ctx, out, H, W, params.patch_size, params.patch_size); // [N, C, H, W]
out = unpatchify(ctx, out, h_len, w_len); // [N, C, H + pad_h, W + pad_w]
// slice
out = ggml_slice(ctx, out, 1, 0, H); // [N, C, H, W + pad_w]
out = ggml_slice(ctx, out, 0, 0, W); // [N, C, H, W]
return out;
}
@ -477,25 +506,21 @@ namespace Qwen {
QwenImageParams qwen_image_params;
QwenImageModel qwen_image;
std::vector<float> pe_vec;
std::vector<float> modulate_index_vec;
SDVersion version;
QwenImageRunner(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "",
SDVersion version = VERSION_QWEN_IMAGE,
bool zero_cond_t = false)
const String2GGMLType& tensor_types = {},
const std::string prefix = "",
SDVersion version = VERSION_QWEN_IMAGE,
bool flash_attn = false)
: GGMLRunner(backend, offload_params_to_cpu) {
qwen_image_params.num_layers = 0;
qwen_image_params.zero_cond_t = zero_cond_t;
for (auto pair : tensor_storage_map) {
qwen_image_params.flash_attn = flash_attn;
qwen_image_params.num_layers = 0;
for (auto pair : tensor_types) {
std::string tensor_name = pair.first;
if (tensor_name.find(prefix) == std::string::npos)
continue;
if (tensor_name.find("__index_timestep_zero__") != std::string::npos) {
qwen_image_params.zero_cond_t = true;
}
size_t pos = tensor_name.find("transformer_blocks.");
if (pos != std::string::npos) {
tensor_name = tensor_name.substr(pos); // remove prefix
@ -509,29 +534,26 @@ namespace Qwen {
continue;
}
}
LOG_INFO("qwen_image_params.num_layers: %ld", qwen_image_params.num_layers);
if (qwen_image_params.zero_cond_t) {
LOG_INFO("use zero_cond_t");
}
qwen_image = QwenImageModel(qwen_image_params);
qwen_image.init(params_ctx, tensor_storage_map, prefix);
LOG_ERROR("qwen_image_params.num_layers: %ld", qwen_image_params.num_layers);
qwen_image = QwenImageModel(qwen_image_params);
qwen_image.init(params_ctx, tensor_types, prefix);
}
std::string get_desc() override {
std::string get_desc() {
return "qwen_image";
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
qwen_image.get_param_tensors(tensors, prefix);
}
ggml_cgraph* build_graph(ggml_tensor* x,
ggml_tensor* timesteps,
ggml_tensor* context,
std::vector<ggml_tensor*> ref_latents = {},
bool increase_ref_index = false) {
struct ggml_cgraph* build_graph(struct ggml_tensor* x,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
std::vector<ggml_tensor*> ref_latents = {},
bool increase_ref_index = false) {
GGML_ASSERT(x->ne[3] == 1);
ggml_cgraph* gf = new_graph_custom(QWEN_IMAGE_GRAPH_SIZE);
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, QWEN_IMAGE_GRAPH_SIZE, false);
x = to_backend(x);
context = to_backend(context);
@ -541,91 +563,62 @@ namespace Qwen {
ref_latents[i] = to_backend(ref_latents[i]);
}
pe_vec = Rope::gen_qwen_image_pe(static_cast<int>(x->ne[1]),
static_cast<int>(x->ne[0]),
pe_vec = Rope::gen_qwen_image_pe(x->ne[1],
x->ne[0],
qwen_image_params.patch_size,
static_cast<int>(x->ne[3]),
static_cast<int>(context->ne[1]),
x->ne[3],
context->ne[1],
ref_latents,
increase_ref_index,
qwen_image_params.theta,
circular_y_enabled,
circular_x_enabled,
qwen_image_params.axes_dim);
int pos_len = static_cast<int>(pe_vec.size() / qwen_image_params.axes_dim_sum / 2);
int pos_len = pe_vec.size() / qwen_image_params.axes_dim_sum / 2;
// LOG_DEBUG("pos_len %d", pos_len);
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, qwen_image_params.axes_dim_sum / 2, pos_len);
// pe->data = pe_vec.data();
// print_ggml_tensor(pe, true, "pe");
// pe->data = nullptr;
// pe->data = NULL;
set_backend_tensor_data(pe, pe_vec.data());
ggml_tensor* modulate_index = nullptr;
if (qwen_image_params.zero_cond_t) {
modulate_index_vec.clear();
int64_t h_len = ((x->ne[1] + (qwen_image_params.patch_size / 2)) / qwen_image_params.patch_size);
int64_t w_len = ((x->ne[0] + (qwen_image_params.patch_size / 2)) / qwen_image_params.patch_size);
int64_t num_img_tokens = h_len * w_len;
modulate_index_vec.insert(modulate_index_vec.end(), num_img_tokens, 0.f);
int64_t num_ref_img_tokens = 0;
for (ggml_tensor* ref : ref_latents) {
int64_t h_len = ((ref->ne[1] + (qwen_image_params.patch_size / 2)) / qwen_image_params.patch_size);
int64_t w_len = ((ref->ne[0] + (qwen_image_params.patch_size / 2)) / qwen_image_params.patch_size);
num_ref_img_tokens += h_len * w_len;
}
if (num_ref_img_tokens > 0) {
modulate_index_vec.insert(modulate_index_vec.end(), num_ref_img_tokens, 1.f);
}
modulate_index = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_F32, modulate_index_vec.size());
set_backend_tensor_data(modulate_index, modulate_index_vec.data());
}
auto runner_ctx = get_context();
ggml_tensor* out = qwen_image.forward(&runner_ctx,
x,
timesteps,
context,
pe,
ref_latents,
modulate_index);
struct ggml_tensor* out = qwen_image.forward(compute_ctx,
runtime_backend,
x,
timesteps,
context,
pe,
ref_latents);
ggml_build_forward_expand(gf, out);
return gf;
}
bool compute(int n_threads,
ggml_tensor* x,
ggml_tensor* timesteps,
ggml_tensor* context,
void compute(int n_threads,
struct ggml_tensor* x,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
std::vector<ggml_tensor*> ref_latents = {},
bool increase_ref_index = false,
ggml_tensor** output = nullptr,
ggml_context* output_ctx = nullptr) {
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL) {
// x: [N, in_channels, h, w]
// timesteps: [N, ]
// context: [N, max_position, hidden_size]
auto get_graph = [&]() -> ggml_cgraph* {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(x, timesteps, context, ref_latents, increase_ref_index);
};
return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
}
void test() {
ggml_init_params params;
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(1024 * 1024) * 1024; // 1GB
params.mem_buffer = nullptr;
params.mem_buffer = NULL;
params.no_alloc = false;
ggml_context* work_ctx = ggml_init(params);
GGML_ASSERT(work_ctx != nullptr);
struct ggml_context* work_ctx = ggml_init(params);
GGML_ASSERT(work_ctx != NULL);
{
// auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 16, 16, 16, 1);
@ -641,42 +634,44 @@ namespace Qwen {
auto context = load_tensor_from_file(work_ctx, "./qwen_image_context.bin");
print_ggml_tensor(context);
ggml_tensor* out = nullptr;
struct ggml_tensor* out = NULL;
int64_t t0 = ggml_time_ms();
int t0 = ggml_time_ms();
compute(8, x, timesteps, context, {}, false, &out, work_ctx);
int64_t t1 = ggml_time_ms();
int t1 = ggml_time_ms();
print_ggml_tensor(out);
LOG_DEBUG("qwen_image test done in %lldms", t1 - t0);
LOG_DEBUG("qwen_image test done in %dms", t1 - t0);
}
}
static void load_from_file_and_test(const std::string& file_path) {
// cuda q8: pass
// cuda q8 fa: pass
// cuda q8 fa: nan
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_type model_data_type = GGML_TYPE_Q8_0;
ModelLoader model_loader;
if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) {
if (!model_loader.init_from_file(file_path, "model.diffusion_model.")) {
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
return;
}
auto& tensor_storage_map = model_loader.get_tensor_storage_map();
for (auto& [name, tensor_storage] : tensor_storage_map) {
if (ends_with(name, "weight")) {
tensor_storage.expected_type = model_data_type;
auto tensor_types = model_loader.tensor_storages_types;
for (auto& item : tensor_types) {
// LOG_DEBUG("%s %u", item.first.c_str(), item.second);
if (ends_with(item.first, "weight")) {
item.second = model_data_type;
}
}
std::shared_ptr<QwenImageRunner> qwen_image = std::make_shared<QwenImageRunner>(backend,
false,
tensor_storage_map,
"model.diffusion_model",
VERSION_QWEN_IMAGE);
std::shared_ptr<QwenImageRunner> qwen_image = std::shared_ptr<QwenImageRunner>(new QwenImageRunner(backend,
false,
tensor_types,
"model.diffusion_model",
VERSION_QWEN_IMAGE,
true));
qwen_image->alloc_params_buffer();
std::map<std::string, ggml_tensor*> tensors;
@ -696,4 +691,4 @@ namespace Qwen {
} // namespace Qwen
#endif // __QWEN_IMAGE_HPP__
#endif // __QWEN_IMAGE_HPP__

File diff suppressed because it is too large.

View File

@ -15,11 +15,11 @@ private:
std::default_random_engine generator;
public:
void manual_seed(uint64_t seed) override {
void manual_seed(uint64_t seed) {
generator.seed((unsigned int)seed);
}
std::vector<float> randn(uint32_t n) override {
std::vector<float> randn(uint32_t n) {
std::vector<float> result;
float mean = 0.0;
float stddev = 1.0;

View File

@ -93,12 +93,12 @@ public:
this->offset = 0;
}
void manual_seed(uint64_t seed) override {
void manual_seed(uint64_t seed) {
this->seed = seed;
this->offset = 0;
}
std::vector<float> randn(uint32_t n) override {
std::vector<float> randn(uint32_t n) {
std::vector<std::vector<uint32_t>> counter(4, std::vector<uint32_t>(n, 0));
for (uint32_t i = 0; i < n; i++) {
counter[0][i] = this->offset;

410
rope.hpp Normal file
View File

@ -0,0 +1,410 @@
#ifndef __ROPE_HPP__
#define __ROPE_HPP__
#include <vector>
#include "ggml_extend.hpp"
namespace Rope {
template <class T>
__STATIC_INLINE__ std::vector<T> linspace(T start, T end, int num) {
std::vector<T> result(num);
if (num == 1) {
result[0] = start;
return result;
}
T step = (end - start) / (num - 1);
for (int i = 0; i < num; ++i) {
result[i] = start + i * step;
}
return result;
}
__STATIC_INLINE__ std::vector<std::vector<float>> transpose(const std::vector<std::vector<float>>& mat) {
int rows = mat.size();
int cols = mat[0].size();
std::vector<std::vector<float>> transposed(cols, std::vector<float>(rows));
for (int i = 0; i < rows; ++i) {
for (int j = 0; j < cols; ++j) {
transposed[j][i] = mat[i][j];
}
}
return transposed;
}
__STATIC_INLINE__ std::vector<float> flatten(const std::vector<std::vector<float>>& vec) {
std::vector<float> flat_vec;
for (const auto& sub_vec : vec) {
flat_vec.insert(flat_vec.end(), sub_vec.begin(), sub_vec.end());
}
return flat_vec;
}
__STATIC_INLINE__ std::vector<std::vector<float>> rope(const std::vector<float>& pos, int dim, int theta) {
assert(dim % 2 == 0);
int half_dim = dim / 2;
std::vector<float> scale = linspace(0.f, (dim * 1.f - 2) / dim, half_dim);
std::vector<float> omega(half_dim);
for (int i = 0; i < half_dim; ++i) {
omega[i] = 1.0 / std::pow(theta, scale[i]);
}
int pos_size = pos.size();
std::vector<std::vector<float>> out(pos_size, std::vector<float>(half_dim));
for (int i = 0; i < pos_size; ++i) {
for (int j = 0; j < half_dim; ++j) {
out[i][j] = pos[i] * omega[j];
}
}
std::vector<std::vector<float>> result(pos_size, std::vector<float>(half_dim * 4));
for (int i = 0; i < pos_size; ++i) {
for (int j = 0; j < half_dim; ++j) {
result[i][4 * j] = std::cos(out[i][j]);
result[i][4 * j + 1] = -std::sin(out[i][j]);
result[i][4 * j + 2] = std::sin(out[i][j]);
result[i][4 * j + 3] = std::cos(out[i][j]);
}
}
return result;
}
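// Each position/frequency pair packs the four entries of a 2x2 rotation,
// [cos t, -sin t, sin t, cos t] with t = pos * omega[j]. Applying that
// matrix to a feature pair (assumes <cmath>):
static void rotate_pair(float& x0, float& x1, float t) {
    float c = std::cos(t), s = std::sin(t);
    float r0 = c * x0 - s * x1;  // row [cos, -sin]
    float r1 = s * x0 + c * x1;  // row [sin,  cos]
    x0 = r0;
    x1 = r1;
}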
// Generate IDs for image patches and text
__STATIC_INLINE__ std::vector<std::vector<float>> gen_txt_ids(int bs, int context_len) {
return std::vector<std::vector<float>>(bs * context_len, std::vector<float>(3, 0.0));
}
__STATIC_INLINE__ std::vector<std::vector<float>> gen_img_ids(int h, int w, int patch_size, int bs, int index = 0, int h_offset = 0, int w_offset = 0) {
int h_len = (h + (patch_size / 2)) / patch_size;
int w_len = (w + (patch_size / 2)) / patch_size;
std::vector<std::vector<float>> img_ids(h_len * w_len, std::vector<float>(3, 0.0));
std::vector<float> row_ids = linspace<float>(h_offset, h_len - 1 + h_offset, h_len);
std::vector<float> col_ids = linspace<float>(w_offset, w_len - 1 + w_offset, w_len);
for (int i = 0; i < h_len; ++i) {
for (int j = 0; j < w_len; ++j) {
img_ids[i * w_len + j][0] = index;
img_ids[i * w_len + j][1] = row_ids[i];
img_ids[i * w_len + j][2] = col_ids[j];
}
}
std::vector<std::vector<float>> img_ids_repeated(bs * img_ids.size(), std::vector<float>(3));
for (int i = 0; i < bs; ++i) {
for (int j = 0; j < img_ids.size(); ++j) {
img_ids_repeated[i * img_ids.size() + j] = img_ids[j];
}
}
return img_ids_repeated;
}
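// Worked example: h = w = 4 with patch_size = 2 and index = 0 gives
// h_len = w_len = (4 + 1) / 2 = 2, i.e. four ids, one (index, row, col)
// triple per 2x2 patch: (0, 0, 0), (0, 0, 1), (0, 1, 0), (0, 1, 1).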
__STATIC_INLINE__ std::vector<std::vector<float>> concat_ids(const std::vector<std::vector<float>>& a,
const std::vector<std::vector<float>>& b,
int bs) {
size_t a_len = a.size() / bs;
size_t b_len = b.size() / bs;
std::vector<std::vector<float>> ids(a.size() + b.size(), std::vector<float>(3));
for (int i = 0; i < bs; ++i) {
for (int j = 0; j < a_len; ++j) {
ids[i * (a_len + b_len) + j] = a[i * a_len + j];
}
for (int j = 0; j < b_len; ++j) {
ids[i * (a_len + b_len) + a_len + j] = b[i * b_len + j];
}
}
return ids;
}
__STATIC_INLINE__ std::vector<float> embed_nd(const std::vector<std::vector<float>>& ids,
int bs,
int theta,
const std::vector<int>& axes_dim) {
std::vector<std::vector<float>> trans_ids = transpose(ids);
size_t pos_len = ids.size() / bs;
int num_axes = axes_dim.size();
// for (int i = 0; i < pos_len; i++) {
// std::cout << trans_ids[0][i] << " " << trans_ids[1][i] << " " << trans_ids[2][i] << std::endl;
// }
int emb_dim = 0;
for (int d : axes_dim)
emb_dim += d / 2;
std::vector<std::vector<float>> emb(bs * pos_len, std::vector<float>(emb_dim * 2 * 2, 0.0));
int offset = 0;
for (int i = 0; i < num_axes; ++i) {
std::vector<std::vector<float>> rope_emb = rope(trans_ids[i], axes_dim[i], theta); // [bs*pos_len, axes_dim[i]/2 * 2 * 2]
for (int b = 0; b < bs; ++b) {
for (int j = 0; j < pos_len; ++j) {
for (int k = 0; k < rope_emb[0].size(); ++k) {
emb[b * pos_len + j][offset + k] = rope_emb[j][k];
}
}
}
offset += rope_emb[0].size();
}
return flatten(emb);
}
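// Size check for the Qwen-Image settings axes_dim = {16, 56, 56}:
// emb_dim = 8 + 28 + 28 = 64 frequency pairs per position, each stored as a
// 2x2 block, so every position contributes 64 * 2 * 2 = 256 floats. This is
// why callers recover pos_len as pe_vec.size() / axes_dim_sum / 2 with
// axes_dim_sum = 128.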
__STATIC_INLINE__ std::vector<std::vector<float>> gen_refs_ids(int patch_size,
int bs,
const std::vector<ggml_tensor*>& ref_latents,
bool increase_ref_index) {
std::vector<std::vector<float>> ids;
uint64_t curr_h_offset = 0;
uint64_t curr_w_offset = 0;
int index = 1;
for (ggml_tensor* ref : ref_latents) {
uint64_t h_offset = 0;
uint64_t w_offset = 0;
if (!increase_ref_index) {
if (ref->ne[1] + curr_h_offset > ref->ne[0] + curr_w_offset) {
w_offset = curr_w_offset;
} else {
h_offset = curr_h_offset;
}
}
auto ref_ids = gen_img_ids(ref->ne[1], ref->ne[0], patch_size, bs, index, h_offset, w_offset);
ids = concat_ids(ids, ref_ids, bs);
if (increase_ref_index) {
index++;
}
curr_h_offset = std::max(curr_h_offset, ref->ne[1] + h_offset);
curr_w_offset = std::max(curr_w_offset, ref->ne[0] + w_offset);
}
return ids;
}
__STATIC_INLINE__ std::vector<std::vector<float>> gen_flux_ids(int h,
int w,
int patch_size,
int bs,
int context_len,
const std::vector<ggml_tensor*>& ref_latents,
bool increase_ref_index) {
auto txt_ids = gen_txt_ids(bs, context_len);
auto img_ids = gen_img_ids(h, w, patch_size, bs);
auto ids = concat_ids(txt_ids, img_ids, bs);
if (ref_latents.size() > 0) {
auto refs_ids = gen_refs_ids(patch_size, bs, ref_latents, increase_ref_index);
ids = concat_ids(ids, refs_ids, bs);
}
return ids;
}
// Generate flux positional embeddings
__STATIC_INLINE__ std::vector<float> gen_flux_pe(int h,
int w,
int patch_size,
int bs,
int context_len,
const std::vector<ggml_tensor*>& ref_latents,
bool increase_ref_index,
int theta,
const std::vector<int>& axes_dim) {
std::vector<std::vector<float>> ids = gen_flux_ids(h, w, patch_size, bs, context_len, ref_latents, increase_ref_index);
return embed_nd(ids, bs, theta, axes_dim);
}
__STATIC_INLINE__ std::vector<std::vector<float>> gen_qwen_image_ids(int h,
int w,
int patch_size,
int bs,
int context_len,
const std::vector<ggml_tensor*>& ref_latents,
bool increase_ref_index) {
int h_len = (h + (patch_size / 2)) / patch_size;
int w_len = (w + (patch_size / 2)) / patch_size;
int txt_id_start = std::max(h_len, w_len);
auto txt_ids = linspace<float>(txt_id_start, context_len + txt_id_start, context_len);
std::vector<std::vector<float>> txt_ids_repeated(bs * context_len, std::vector<float>(3));
for (int i = 0; i < bs; ++i) {
for (int j = 0; j < txt_ids.size(); ++j) {
txt_ids_repeated[i * txt_ids.size() + j] = {txt_ids[j], txt_ids[j], txt_ids[j]};
}
}
auto img_ids = gen_img_ids(h, w, patch_size, bs);
auto ids = concat_ids(txt_ids_repeated, img_ids, bs);
if (ref_latents.size() > 0) {
auto refs_ids = gen_refs_ids(patch_size, bs, ref_latents, increase_ref_index);
ids = concat_ids(ids, refs_ids, bs);
}
return ids;
}
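// Note: text ids start at max(h_len, w_len) and run along the diagonal
// (t = h = w), so text positions never collide with the image patch grid,
// whose row/col ids stop at h_len - 1 and w_len - 1.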
// Generate qwen_image positional embeddings
__STATIC_INLINE__ std::vector<float> gen_qwen_image_pe(int h,
int w,
int patch_size,
int bs,
int context_len,
const std::vector<ggml_tensor*>& ref_latents,
bool increase_ref_index,
int theta,
const std::vector<int>& axes_dim) {
std::vector<std::vector<float>> ids = gen_qwen_image_ids(h, w, patch_size, bs, context_len, ref_latents, increase_ref_index);
return embed_nd(ids, bs, theta, axes_dim);
}
__STATIC_INLINE__ std::vector<std::vector<float>> gen_vid_ids(int t,
int h,
int w,
int pt,
int ph,
int pw,
int bs,
int t_offset = 0,
int h_offset = 0,
int w_offset = 0) {
int t_len = (t + (pt / 2)) / pt;
int h_len = (h + (ph / 2)) / ph;
int w_len = (w + (pw / 2)) / pw;
std::vector<std::vector<float>> vid_ids(t_len * h_len * w_len, std::vector<float>(3, 0.0));
std::vector<float> t_ids = linspace<float>(t_offset, t_len - 1 + t_offset, t_len);
std::vector<float> h_ids = linspace<float>(h_offset, h_len - 1 + h_offset, h_len);
std::vector<float> w_ids = linspace<float>(w_offset, w_len - 1 + w_offset, w_len);
for (int i = 0; i < t_len; ++i) {
for (int j = 0; j < h_len; ++j) {
for (int k = 0; k < w_len; ++k) {
int idx = i * h_len * w_len + j * w_len + k;
vid_ids[idx][0] = t_ids[i];
vid_ids[idx][1] = h_ids[j];
vid_ids[idx][2] = w_ids[k];
}
}
}
std::vector<std::vector<float>> vid_ids_repeated(bs * vid_ids.size(), std::vector<float>(3));
for (int i = 0; i < bs; ++i) {
for (int j = 0; j < vid_ids.size(); ++j) {
vid_ids_repeated[i * vid_ids.size() + j] = vid_ids[j];
}
}
return vid_ids_repeated;
}
// Generate wan positional embeddings
__STATIC_INLINE__ std::vector<float> gen_wan_pe(int t,
int h,
int w,
int pt,
int ph,
int pw,
int bs,
int theta,
const std::vector<int>& axes_dim) {
std::vector<std::vector<float>> ids = gen_vid_ids(t, h, w, pt, ph, pw, bs);
return embed_nd(ids, bs, theta, axes_dim);
}
__STATIC_INLINE__ std::vector<std::vector<float>> gen_qwen2vl_ids(int grid_h,
int grid_w,
int merge_size,
const std::vector<int>& window_index) {
std::vector<std::vector<float>> ids(grid_h * grid_w, std::vector<float>(2, 0.0));
int index = 0;
for (int ih = 0; ih < grid_h; ih += merge_size) {
for (int iw = 0; iw < grid_w; iw += merge_size) {
for (int iy = 0; iy < merge_size; iy++) {
for (int ix = 0; ix < merge_size; ix++) {
int inverse_index = window_index[index / (merge_size * merge_size)];
int i = inverse_index * (merge_size * merge_size) + index % (merge_size * merge_size);
GGML_ASSERT(i < grid_h * grid_w);
ids[i][0] = ih + iy;
ids[i][1] = iw + ix;
index++;
}
}
}
}
return ids;
}
// Generate qwen2vl positional embeddings
__STATIC_INLINE__ std::vector<float> gen_qwen2vl_pe(int grid_h,
int grid_w,
int merge_size,
const std::vector<int>& window_index,
int theta,
const std::vector<int>& axes_dim) {
std::vector<std::vector<float>> ids = gen_qwen2vl_ids(grid_h, grid_w, merge_size, window_index);
return embed_nd(ids, 1, theta, axes_dim);
}
__STATIC_INLINE__ struct ggml_tensor* apply_rope(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* pe,
bool rope_interleaved = true) {
// x: [N, L, n_head, d_head]
// pe: [L, d_head/2, 2, 2], [[cos, -sin], [sin, cos]]
int64_t d_head = x->ne[0];
int64_t n_head = x->ne[1];
int64_t L = x->ne[2];
int64_t N = x->ne[3];
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N, n_head, L, d_head]
if (rope_interleaved) {
x = ggml_reshape_4d(ctx, x, 2, d_head / 2, L, n_head * N); // [N * n_head, L, d_head/2, 2]
x = ggml_cont(ctx, ggml_permute(ctx, x, 3, 0, 1, 2)); // [2, N * n_head, L, d_head/2]
} else {
x = ggml_reshape_4d(ctx, x, d_head / 2, 2, L, n_head * N); // [N * n_head, L, 2, d_head/2]
x = ggml_cont(ctx, ggml_torch_permute(ctx, x, 0, 2, 3, 1)); // [2, N * n_head, L, d_head/2]
}
int64_t offset = x->nb[2] * x->ne[2];
auto x_0 = ggml_view_3d(ctx, x, x->ne[0], x->ne[1], x->ne[2], x->nb[1], x->nb[2], offset * 0); // [N * n_head, L, d_head/2]
auto x_1 = ggml_view_3d(ctx, x, x->ne[0], x->ne[1], x->ne[2], x->nb[1], x->nb[2], offset * 1); // [N * n_head, L, d_head/2]
x_0 = ggml_reshape_4d(ctx, x_0, 1, x_0->ne[0], x_0->ne[1], x_0->ne[2]); // [N * n_head, L, d_head/2, 1]
x_1 = ggml_reshape_4d(ctx, x_1, 1, x_1->ne[0], x_1->ne[1], x_1->ne[2]); // [N * n_head, L, d_head/2, 1]
auto temp_x = ggml_new_tensor_4d(ctx, x_0->type, 2, x_0->ne[1], x_0->ne[2], x_0->ne[3]);
x_0 = ggml_repeat(ctx, x_0, temp_x); // [N * n_head, L, d_head/2, 2]
x_1 = ggml_repeat(ctx, x_1, temp_x); // [N * n_head, L, d_head/2, 2]
pe = ggml_cont(ctx, ggml_permute(ctx, pe, 3, 0, 1, 2)); // [2, L, d_head/2, 2]
offset = pe->nb[2] * pe->ne[2];
auto pe_0 = ggml_view_3d(ctx, pe, pe->ne[0], pe->ne[1], pe->ne[2], pe->nb[1], pe->nb[2], offset * 0); // [L, d_head/2, 2]
auto pe_1 = ggml_view_3d(ctx, pe, pe->ne[0], pe->ne[1], pe->ne[2], pe->nb[1], pe->nb[2], offset * 1); // [L, d_head/2, 2]
auto x_out = ggml_add_inplace(ctx, ggml_mul(ctx, x_0, pe_0), ggml_mul(ctx, x_1, pe_1)); // [N * n_head, L, d_head/2, 2]
if (!rope_interleaved) {
x_out = ggml_cont(ctx, ggml_permute(ctx, x_out, 1, 0, 2, 3)); // [N * n_head, L, x, d_head/2]
}
x_out = ggml_reshape_3d(ctx, x_out, d_head, L, n_head * N); // [N*n_head, L, d_head]
return x_out;
}
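// Per feature pair this is the standard RoPE rotation: with pe packing
// [[cos t, -sin t], [sin t, cos t]], the output components are
// x_0 * cos t - x_1 * sin t and x_0 * sin t + x_1 * cos t.
// rope_interleaved only selects whether the pair (x_0, x_1) comes from
// adjacent elements of d_head or from its two halves.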
__STATIC_INLINE__ struct ggml_tensor* attention(struct ggml_context* ctx,
ggml_backend_t backend,
struct ggml_tensor* q,
struct ggml_tensor* k,
struct ggml_tensor* v,
struct ggml_tensor* pe,
struct ggml_tensor* mask,
bool flash_attn,
float kv_scale = 1.0f,
bool rope_interleaved = true) {
// q,k,v: [N, L, n_head, d_head]
// pe: [L, d_head/2, 2, 2]
// return: [N, L, n_head*d_head]
q = apply_rope(ctx, q, pe, rope_interleaved); // [N*n_head, L, d_head]
k = apply_rope(ctx, k, pe, rope_interleaved); // [N*n_head, L, d_head]
auto x = ggml_nn_attention_ext(ctx, backend, q, k, v, v->ne[1], mask, false, true, flash_attn, kv_scale); // [N, L, n_head*d_head]
return x;
}
}; // namespace Rope
#endif // __ROPE_HPP__

View File

@ -1,686 +0,0 @@
#ifndef __ANIMA_HPP__
#define __ANIMA_HPP__
#include <cmath>
#include <memory>
#include <utility>
#include <vector>
#include "common_block.hpp"
#include "flux.hpp"
#include "rope.hpp"
namespace Anima {
constexpr int ANIMA_GRAPH_SIZE = 65536;
__STATIC_INLINE__ ggml_tensor* apply_gate(ggml_context* ctx,
ggml_tensor* x,
ggml_tensor* gate) {
gate = ggml_reshape_3d(ctx, gate, gate->ne[0], 1, gate->ne[1]); // [N, 1, C]
return ggml_mul(ctx, x, gate);
}
struct XEmbedder : public GGMLBlock {
public:
XEmbedder(int64_t in_dim, int64_t out_dim) {
blocks["proj.1"] = std::make_shared<Linear>(in_dim, out_dim, false);
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj.1"]);
return proj->forward(ctx, x);
}
};
struct TimestepEmbedder : public GGMLBlock {
public:
TimestepEmbedder(int64_t in_dim, int64_t out_dim) {
blocks["1.linear_1"] = std::make_shared<Linear>(in_dim, in_dim, false);
blocks["1.linear_2"] = std::make_shared<Linear>(in_dim, out_dim, false);
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["1.linear_1"]);
auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["1.linear_2"]);
x = linear_1->forward(ctx, x);
x = ggml_silu_inplace(ctx->ggml_ctx, x);
x = linear_2->forward(ctx, x);
return x;
}
};
struct AdaLayerNormZero : public GGMLBlock {
protected:
int64_t in_features;
public:
AdaLayerNormZero(int64_t in_features, int64_t hidden_features = 256)
: in_features(in_features) {
blocks["norm"] = std::make_shared<LayerNorm>(in_features, 1e-6f, false, false);
blocks["1"] = std::make_shared<Linear>(in_features, hidden_features, false);
blocks["2"] = std::make_shared<Linear>(hidden_features, 3 * in_features, false);
}
std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
ggml_tensor* hidden_states,
ggml_tensor* embedded_timestep,
ggml_tensor* temb = nullptr) {
auto norm = std::dynamic_pointer_cast<LayerNorm>(blocks["norm"]);
auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["1"]);
auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["2"]);
auto emb = ggml_silu(ctx->ggml_ctx, embedded_timestep);
emb = linear_1->forward(ctx, emb);
emb = linear_2->forward(ctx, emb); // [N, 3*C]
if (temb != nullptr) {
emb = ggml_add(ctx->ggml_ctx, emb, temb);
}
auto emb_chunks = ggml_ext_chunk(ctx->ggml_ctx, emb, 3, 0);
auto shift = emb_chunks[0];
auto scale = emb_chunks[1];
auto gate = emb_chunks[2];
auto x = norm->forward(ctx, hidden_states);
x = Flux::modulate(ctx->ggml_ctx, x, shift, scale);
return {x, gate};
}
};
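// Assuming Flux::modulate follows the usual DiT convention,
// modulate(x, shift, scale) = x * (1 + scale) + shift, this block returns
// the modulated LayerNorm output together with a gate that the caller
// applies to the branch result via apply_gate above.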
struct AdaLayerNorm : public GGMLBlock {
protected:
int64_t embedding_dim;
public:
AdaLayerNorm(int64_t in_features, int64_t hidden_features = 256)
: embedding_dim(in_features) {
blocks["norm"] = std::make_shared<LayerNorm>(in_features, 1e-6f, false, false);
blocks["1"] = std::make_shared<Linear>(in_features, hidden_features, false);
blocks["2"] = std::make_shared<Linear>(hidden_features, 2 * in_features, false);
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* hidden_states,
ggml_tensor* embedded_timestep,
ggml_tensor* temb = nullptr) {
auto norm = std::dynamic_pointer_cast<LayerNorm>(blocks["norm"]);
auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["1"]);
auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["2"]);
auto emb = ggml_silu(ctx->ggml_ctx, embedded_timestep);
emb = linear_1->forward(ctx, emb);
emb = linear_2->forward(ctx, emb); // [N, 2*C]
if (temb != nullptr) {
auto temb_2c = ggml_view_2d(ctx->ggml_ctx, temb, 2 * embedding_dim, temb->ne[1], temb->nb[1], 0);
emb = ggml_add(ctx->ggml_ctx, emb, temb_2c);
}
auto emb_chunks = ggml_ext_chunk(ctx->ggml_ctx, emb, 2, 0);
auto shift = emb_chunks[0];
auto scale = emb_chunks[1];
auto x = norm->forward(ctx, hidden_states);
x = Flux::modulate(ctx->ggml_ctx, x, shift, scale);
return x;
}
};
struct AnimaAttention : public GGMLBlock {
protected:
int64_t num_heads;
int64_t head_dim;
std::string out_proj_name;
public:
AnimaAttention(int64_t query_dim,
int64_t context_dim,
int64_t num_heads,
int64_t head_dim,
const std::string& out_proj_name = "output_proj")
: num_heads(num_heads), head_dim(head_dim), out_proj_name(out_proj_name) {
int64_t inner_dim = num_heads * head_dim;
blocks["q_proj"] = std::make_shared<Linear>(query_dim, inner_dim, false);
blocks["k_proj"] = std::make_shared<Linear>(context_dim, inner_dim, false);
blocks["v_proj"] = std::make_shared<Linear>(context_dim, inner_dim, false);
blocks["q_norm"] = std::make_shared<RMSNorm>(head_dim, 1e-6f);
blocks["k_norm"] = std::make_shared<RMSNorm>(head_dim, 1e-6f);
blocks[this->out_proj_name] = std::make_shared<Linear>(inner_dim, query_dim, false);
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* hidden_states,
ggml_tensor* encoder_hidden_states = nullptr,
ggml_tensor* pe_q = nullptr,
ggml_tensor* pe_k = nullptr) {
if (encoder_hidden_states == nullptr) {
encoder_hidden_states = hidden_states;
}
auto q_proj = std::dynamic_pointer_cast<Linear>(blocks["q_proj"]);
auto k_proj = std::dynamic_pointer_cast<Linear>(blocks["k_proj"]);
auto v_proj = std::dynamic_pointer_cast<Linear>(blocks["v_proj"]);
auto q_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["q_norm"]);
auto k_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["k_norm"]);
auto out_proj = std::dynamic_pointer_cast<Linear>(blocks[out_proj_name]);
auto q = q_proj->forward(ctx, hidden_states);
auto k = k_proj->forward(ctx, encoder_hidden_states);
auto v = v_proj->forward(ctx, encoder_hidden_states);
int64_t N = q->ne[2];
int64_t L_q = q->ne[1];
int64_t L_k = k->ne[1];
auto q4 = ggml_reshape_4d(ctx->ggml_ctx, q, head_dim, num_heads, L_q, N); // [N, L_q, H, D]
auto k4 = ggml_reshape_4d(ctx->ggml_ctx, k, head_dim, num_heads, L_k, N); // [N, L_k, H, D]
auto v4 = ggml_reshape_4d(ctx->ggml_ctx, v, head_dim, num_heads, L_k, N); // [N, L_k, H, D]
q4 = q_norm->forward(ctx, q4);
k4 = k_norm->forward(ctx, k4);
ggml_tensor* attn_out = nullptr;
if (pe_q != nullptr || pe_k != nullptr) {
if (pe_q == nullptr) {
pe_q = pe_k;
}
if (pe_k == nullptr) {
pe_k = pe_q;
}
auto q_rope = Rope::apply_rope(ctx->ggml_ctx, q4, pe_q, false);
auto k_rope = Rope::apply_rope(ctx->ggml_ctx, k4, pe_k, false);
attn_out = ggml_ext_attention_ext(ctx->ggml_ctx,
ctx->backend,
q_rope,
k_rope,
v4,
num_heads,
nullptr,
true,
ctx->flash_attn_enabled);
} else {
auto q_flat = ggml_reshape_3d(ctx->ggml_ctx, q4, head_dim * num_heads, L_q, N);
auto k_flat = ggml_reshape_3d(ctx->ggml_ctx, k4, head_dim * num_heads, L_k, N);
attn_out = ggml_ext_attention_ext(ctx->ggml_ctx,
ctx->backend,
q_flat,
k_flat,
v,
num_heads,
nullptr,
false,
ctx->flash_attn_enabled);
}
return out_proj->forward(ctx, attn_out);
}
};
struct AnimaMLP : public GGMLBlock {
public:
AnimaMLP(int64_t dim, int64_t hidden_dim) {
blocks["layer1"] = std::make_shared<Linear>(dim, hidden_dim, false);
blocks["layer2"] = std::make_shared<Linear>(hidden_dim, dim, false);
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
auto layer1 = std::dynamic_pointer_cast<Linear>(blocks["layer1"]);
auto layer2 = std::dynamic_pointer_cast<Linear>(blocks["layer2"]);
x = layer1->forward(ctx, x);
x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
x = layer2->forward(ctx, x);
return x;
}
};
struct AdapterMLP : public GGMLBlock {
public:
AdapterMLP(int64_t dim, int64_t hidden_dim) {
blocks["0"] = std::make_shared<Linear>(dim, hidden_dim, true);
blocks["2"] = std::make_shared<Linear>(hidden_dim, dim, true);
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
auto layer0 = std::dynamic_pointer_cast<Linear>(blocks["0"]);
auto layer2 = std::dynamic_pointer_cast<Linear>(blocks["2"]);
x = layer0->forward(ctx, x);
x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
x = layer2->forward(ctx, x);
return x;
}
};
struct LLMAdapterBlock : public GGMLBlock {
public:
LLMAdapterBlock(int64_t model_dim = 1024, int64_t source_dim = 1024, int64_t num_heads = 16, int64_t head_dim = 64) {
blocks["norm_self_attn"] = std::make_shared<RMSNorm>(model_dim, 1e-6f);
blocks["self_attn"] = std::make_shared<AnimaAttention>(model_dim, model_dim, num_heads, head_dim, "o_proj");
blocks["norm_cross_attn"] = std::make_shared<RMSNorm>(model_dim, 1e-6f);
blocks["cross_attn"] = std::make_shared<AnimaAttention>(model_dim, source_dim, num_heads, head_dim, "o_proj");
blocks["norm_mlp"] = std::make_shared<RMSNorm>(model_dim, 1e-6f);
blocks["mlp"] = std::make_shared<AdapterMLP>(model_dim, model_dim * 4);
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* context,
ggml_tensor* target_pe,
ggml_tensor* context_pe) {
auto norm_self_attn = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_self_attn"]);
auto self_attn = std::dynamic_pointer_cast<AnimaAttention>(blocks["self_attn"]);
auto norm_cross_attn = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_cross_attn"]);
auto cross_attn = std::dynamic_pointer_cast<AnimaAttention>(blocks["cross_attn"]);
auto norm_mlp = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_mlp"]);
auto mlp = std::dynamic_pointer_cast<AdapterMLP>(blocks["mlp"]);
auto h = norm_self_attn->forward(ctx, x);
h = self_attn->forward(ctx, h, nullptr, target_pe, target_pe);
x = ggml_add(ctx->ggml_ctx, x, h);
h = norm_cross_attn->forward(ctx, x);
h = cross_attn->forward(ctx, h, context, target_pe, context_pe);
x = ggml_add(ctx->ggml_ctx, x, h);
h = norm_mlp->forward(ctx, x);
h = mlp->forward(ctx, h);
x = ggml_add(ctx->ggml_ctx, x, h);
return x;
}
};
struct LLMAdapter : public GGMLBlock {
protected:
int num_layers;
public:
LLMAdapter(int64_t source_dim = 1024,
int64_t target_dim = 1024,
int64_t model_dim = 1024,
int num_layers = 6,
int num_heads = 16)
: num_layers(num_layers) {
int64_t head_dim = model_dim / num_heads;
blocks["embed"] = std::make_shared<Embedding>(32128, target_dim);
for (int i = 0; i < num_layers; i++) {
blocks["blocks." + std::to_string(i)] =
std::make_shared<LLMAdapterBlock>(model_dim, source_dim, num_heads, head_dim);
}
blocks["out_proj"] = std::make_shared<Linear>(model_dim, target_dim, true);
blocks["norm"] = std::make_shared<RMSNorm>(target_dim, 1e-6f);
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* source_hidden_states,
ggml_tensor* target_input_ids,
ggml_tensor* target_pe,
ggml_tensor* source_pe) {
GGML_ASSERT(target_input_ids != nullptr);
if (ggml_n_dims(target_input_ids) == 1) {
target_input_ids = ggml_reshape_2d(ctx->ggml_ctx, target_input_ids, target_input_ids->ne[0], 1);
}
auto embed = std::dynamic_pointer_cast<Embedding>(blocks["embed"]);
auto out_proj = std::dynamic_pointer_cast<Linear>(blocks["out_proj"]);
auto norm = std::dynamic_pointer_cast<RMSNorm>(blocks["norm"]);
auto x = embed->forward(ctx, target_input_ids); // [N, target_len, target_dim]
for (int i = 0; i < num_layers; i++) {
auto block = std::dynamic_pointer_cast<LLMAdapterBlock>(blocks["blocks." + std::to_string(i)]);
x = block->forward(ctx, x, source_hidden_states, target_pe, source_pe);
}
x = out_proj->forward(ctx, x);
x = norm->forward(ctx, x);
return x;
}
};
struct TransformerBlock : public GGMLBlock {
public:
TransformerBlock(int64_t hidden_size,
int64_t text_embed_dim,
int64_t num_heads,
int64_t head_dim,
int64_t mlp_ratio = 4,
int64_t adaln_lora_dim = 256) {
blocks["adaln_modulation_self_attn"] = std::make_shared<AdaLayerNormZero>(hidden_size, adaln_lora_dim);
blocks["self_attn"] = std::make_shared<AnimaAttention>(hidden_size, hidden_size, num_heads, head_dim);
blocks["adaln_modulation_cross_attn"] = std::make_shared<AdaLayerNormZero>(hidden_size, adaln_lora_dim);
blocks["cross_attn"] = std::make_shared<AnimaAttention>(hidden_size, text_embed_dim, num_heads, head_dim);
blocks["adaln_modulation_mlp"] = std::make_shared<AdaLayerNormZero>(hidden_size, adaln_lora_dim);
blocks["mlp"] = std::make_shared<AnimaMLP>(hidden_size, hidden_size * mlp_ratio);
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* hidden_states,
ggml_tensor* encoder_hidden_states,
ggml_tensor* embedded_timestep,
ggml_tensor* temb,
ggml_tensor* image_pe) {
auto norm1 = std::dynamic_pointer_cast<AdaLayerNormZero>(blocks["adaln_modulation_self_attn"]);
auto attn1 = std::dynamic_pointer_cast<AnimaAttention>(blocks["self_attn"]);
auto norm2 = std::dynamic_pointer_cast<AdaLayerNormZero>(blocks["adaln_modulation_cross_attn"]);
auto attn2 = std::dynamic_pointer_cast<AnimaAttention>(blocks["cross_attn"]);
auto norm3 = std::dynamic_pointer_cast<AdaLayerNormZero>(blocks["adaln_modulation_mlp"]);
auto mlp = std::dynamic_pointer_cast<AnimaMLP>(blocks["mlp"]);
auto [normed1, gate1] = norm1->forward(ctx, hidden_states, embedded_timestep, temb);
auto h = attn1->forward(ctx, normed1, nullptr, image_pe, image_pe);
hidden_states = ggml_add(ctx->ggml_ctx, hidden_states, apply_gate(ctx->ggml_ctx, h, gate1));
auto [normed2, gate2] = norm2->forward(ctx, hidden_states, embedded_timestep, temb);
h = attn2->forward(ctx, normed2, encoder_hidden_states, nullptr, nullptr);
hidden_states = ggml_add(ctx->ggml_ctx, hidden_states, apply_gate(ctx->ggml_ctx, h, gate2));
auto [normed3, gate3] = norm3->forward(ctx, hidden_states, embedded_timestep, temb);
h = mlp->forward(ctx, normed3);
hidden_states = ggml_add(ctx->ggml_ctx, hidden_states, apply_gate(ctx->ggml_ctx, h, gate3));
return hidden_states;
}
};
struct FinalLayer : public GGMLBlock {
protected:
int64_t hidden_size;
int64_t patch_size;
int64_t out_channels;
public:
FinalLayer(int64_t hidden_size, int64_t patch_size, int64_t out_channels)
: hidden_size(hidden_size), patch_size(patch_size), out_channels(out_channels) {
blocks["adaln_modulation"] = std::make_shared<AdaLayerNorm>(hidden_size, 256);
blocks["linear"] = std::make_shared<Linear>(hidden_size, patch_size * patch_size * out_channels, false);
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* hidden_states,
ggml_tensor* embedded_timestep,
ggml_tensor* temb) {
auto adaln = std::dynamic_pointer_cast<AdaLayerNorm>(blocks["adaln_modulation"]);
auto linear = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
hidden_states = adaln->forward(ctx, hidden_states, embedded_timestep, temb);
hidden_states = linear->forward(ctx, hidden_states);
return hidden_states;
}
};
struct AnimaNet : public GGMLBlock {
public:
int64_t in_channels = 16;
int64_t out_channels = 16;
int64_t hidden_size = 2048;
int64_t text_embed_dim = 1024;
int64_t num_heads = 16;
int64_t head_dim = 128;
int patch_size = 2;
int64_t num_layers = 28;
std::vector<int> axes_dim = {44, 42, 42};
int theta = 10000;
public:
AnimaNet() = default;
explicit AnimaNet(int64_t num_layers)
: num_layers(num_layers) {
blocks["x_embedder"] = std::make_shared<XEmbedder>((in_channels + 1) * patch_size * patch_size, hidden_size);
blocks["t_embedder"] = std::make_shared<TimestepEmbedder>(hidden_size, hidden_size * 3);
blocks["t_embedding_norm"] = std::make_shared<RMSNorm>(hidden_size, 1e-6f);
for (int i = 0; i < num_layers; i++) {
blocks["blocks." + std::to_string(i)] = std::make_shared<TransformerBlock>(hidden_size,
text_embed_dim,
num_heads,
head_dim);
}
blocks["final_layer"] = std::make_shared<FinalLayer>(hidden_size, patch_size, out_channels);
blocks["llm_adapter"] = std::make_shared<LLMAdapter>(1024, 1024, 1024, 6, 16);
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* timestep,
ggml_tensor* encoder_hidden_states,
ggml_tensor* image_pe,
ggml_tensor* t5_ids = nullptr,
ggml_tensor* t5_weights = nullptr,
ggml_tensor* adapter_q_pe = nullptr,
ggml_tensor* adapter_k_pe = nullptr) {
GGML_ASSERT(x->ne[3] == 1);
auto x_embedder = std::dynamic_pointer_cast<XEmbedder>(blocks["x_embedder"]);
auto t_embedder = std::dynamic_pointer_cast<TimestepEmbedder>(blocks["t_embedder"]);
auto t_embedding_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["t_embedding_norm"]);
auto final_layer = std::dynamic_pointer_cast<FinalLayer>(blocks["final_layer"]);
auto llm_adapter = std::dynamic_pointer_cast<LLMAdapter>(blocks["llm_adapter"]);
int64_t W = x->ne[0];
int64_t H = x->ne[1];
auto padding_mask = ggml_ext_zeros(ctx->ggml_ctx, x->ne[0], x->ne[1], 1, x->ne[3]);
x = ggml_concat(ctx->ggml_ctx, x, padding_mask, 2); // [N, C + 1, H, W]
x = DiT::pad_and_patchify(ctx, x, patch_size, patch_size); // [N, h*w, (C+1)*ph*pw]
x = x_embedder->forward(ctx, x);
auto timestep_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep, static_cast<int>(hidden_size));
auto temb = t_embedder->forward(ctx, timestep_proj);
auto embedded_timestep = t_embedding_norm->forward(ctx, timestep_proj);
if (t5_ids != nullptr) {
auto adapted_context = llm_adapter->forward(ctx, encoder_hidden_states, t5_ids, adapter_q_pe, adapter_k_pe);
if (t5_weights != nullptr) {
auto w = t5_weights;
if (ggml_n_dims(w) == 1) {
w = ggml_reshape_3d(ctx->ggml_ctx, w, 1, w->ne[0], 1);
}
w = ggml_repeat_4d(ctx->ggml_ctx, w, adapted_context->ne[0], adapted_context->ne[1], adapted_context->ne[2], 1);
adapted_context = ggml_mul(ctx->ggml_ctx, adapted_context, w);
}
if (adapted_context->ne[1] < 512) {
auto pad_ctx = ggml_ext_zeros(ctx->ggml_ctx,
adapted_context->ne[0],
512 - adapted_context->ne[1],
adapted_context->ne[2],
1);
adapted_context = ggml_concat(ctx->ggml_ctx, adapted_context, pad_ctx, 1);
} else if (adapted_context->ne[1] > 512) {
adapted_context = ggml_ext_slice(ctx->ggml_ctx, adapted_context, 1, 0, 512);
}
encoder_hidden_states = adapted_context;
}
for (int i = 0; i < num_layers; i++) {
auto block = std::dynamic_pointer_cast<TransformerBlock>(blocks["blocks." + std::to_string(i)]);
x = block->forward(ctx, x, encoder_hidden_states, embedded_timestep, temb, image_pe);
}
x = final_layer->forward(ctx, x, embedded_timestep, temb); // [N, h*w, ph*pw*C]
x = DiT::unpatchify_and_crop(ctx->ggml_ctx, x, H, W, patch_size, patch_size, false); // [N, C, H, W]
return x;
}
};
struct AnimaRunner : public GGMLRunner {
public:
std::vector<float> image_pe_vec;
std::vector<float> adapter_q_pe_vec;
std::vector<float> adapter_k_pe_vec;
AnimaNet net;
AnimaRunner(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model")
: GGMLRunner(backend, offload_params_to_cpu) {
int64_t num_layers = 0;
std::string layer_tag = prefix + ".net.blocks.";
for (const auto& kv : tensor_storage_map) {
const std::string& tensor_name = kv.first;
size_t pos = tensor_name.find(layer_tag);
if (pos == std::string::npos) {
continue;
}
size_t start = pos + layer_tag.size();
size_t end = tensor_name.find('.', start);
if (end == std::string::npos) {
continue;
}
int64_t layer_id = atoll(tensor_name.substr(start, end - start).c_str());
num_layers = std::max(num_layers, layer_id + 1);
}
if (num_layers <= 0) {
num_layers = 28;
}
LOG_INFO("anima net layers: %" PRId64, num_layers);
net = AnimaNet(num_layers);
net.init(params_ctx, tensor_storage_map, prefix + ".net");
}
std::string get_desc() override {
return "anima";
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
net.get_param_tensors(tensors, prefix + ".net");
}
static std::vector<float> gen_1d_rope_pe_vec(int64_t seq_len, int dim, float theta = 10000.f) {
std::vector<float> pos(seq_len);
for (int64_t i = 0; i < seq_len; i++) {
pos[i] = static_cast<float>(i);
}
auto rope_emb = Rope::rope(pos, dim, theta);
return Rope::flatten(rope_emb);
}
static float calc_ntk_factor(float extrapolation_ratio, int axis_dim) {
if (extrapolation_ratio == 1.0f || axis_dim <= 2) {
return 1.0f;
}
return std::pow(extrapolation_ratio, static_cast<float>(axis_dim) / static_cast<float>(axis_dim - 2));
}
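// NTK-aware RoPE scaling: for an axis of dimension d and extrapolation ratio r, the
// base theta is scaled by r^(d / (d - 2)), stretching the lowest rotary frequencies
// just enough to cover the extended range. E.g. r = 4, d = 42 gives a factor of
// 4^(42/40) ≈ 4.29, so theta 10000 becomes ≈ 42900.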
static std::vector<float> gen_anima_image_pe_vec(int bs,
int h,
int w,
int patch_size,
int theta,
const std::vector<int>& axes_dim,
float h_extrapolation_ratio,
float w_extrapolation_ratio,
float t_extrapolation_ratio) {
static const std::vector<ggml_tensor*> empty_ref_latents;
auto ids = Rope::gen_flux_ids(h,
w,
patch_size,
bs,
static_cast<int>(axes_dim.size()),
0,
{},
empty_ref_latents,
false,
1.0f);
std::vector<float> axis_thetas = {
static_cast<float>(theta) * calc_ntk_factor(t_extrapolation_ratio, axes_dim[0]),
static_cast<float>(theta) * calc_ntk_factor(h_extrapolation_ratio, axes_dim[1]),
static_cast<float>(theta) * calc_ntk_factor(w_extrapolation_ratio, axes_dim[2]),
};
return Rope::embed_nd(ids, bs, axis_thetas, axes_dim);
}
ggml_cgraph* build_graph(ggml_tensor* x,
ggml_tensor* timesteps,
ggml_tensor* context,
ggml_tensor* t5_ids = nullptr,
ggml_tensor* t5_weights = nullptr) {
GGML_ASSERT(x->ne[3] == 1);
ggml_cgraph* gf = new_graph_custom(ANIMA_GRAPH_SIZE);
x = to_backend(x);
timesteps = to_backend(timesteps);
context = to_backend(context);
t5_ids = to_backend(t5_ids);
t5_weights = to_backend(t5_weights);
int64_t pad_h = (net.patch_size - x->ne[1] % net.patch_size) % net.patch_size;
int64_t pad_w = (net.patch_size - x->ne[0] % net.patch_size) % net.patch_size;
int64_t h_pad = x->ne[1] + pad_h;
int64_t w_pad = x->ne[0] + pad_w;
image_pe_vec = gen_anima_image_pe_vec(1,
static_cast<int>(h_pad),
static_cast<int>(w_pad),
static_cast<int>(net.patch_size),
net.theta,
net.axes_dim,
4.0f,
4.0f,
1.0f);
int64_t image_pos_len = static_cast<int64_t>(image_pe_vec.size()) / (2 * 2 * (net.head_dim / 2));
auto image_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, net.head_dim / 2, image_pos_len);
set_backend_tensor_data(image_pe, image_pe_vec.data());
ggml_tensor* adapter_q_pe = nullptr;
ggml_tensor* adapter_k_pe = nullptr;
if (t5_ids != nullptr) {
int64_t target_len = t5_ids->ne[0];
int64_t source_len = context->ne[1];
adapter_q_pe_vec = gen_1d_rope_pe_vec(target_len, 64, 10000.f);
adapter_k_pe_vec = gen_1d_rope_pe_vec(source_len, 64, 10000.f);
int64_t target_pos_len = static_cast<int64_t>(adapter_q_pe_vec.size()) / (2 * 2 * 32);
int64_t source_pos_len = static_cast<int64_t>(adapter_k_pe_vec.size()) / (2 * 2 * 32);
adapter_q_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, 32, target_pos_len);
adapter_k_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, 32, source_pos_len);
set_backend_tensor_data(adapter_q_pe, adapter_q_pe_vec.data());
set_backend_tensor_data(adapter_k_pe, adapter_k_pe_vec.data());
}
auto runner_ctx = get_context();
auto out = net.forward(&runner_ctx,
x,
timesteps,
context,
image_pe,
t5_ids,
t5_weights,
adapter_q_pe,
adapter_k_pe);
ggml_build_forward_expand(gf, out);
return gf;
}
bool compute(int n_threads,
ggml_tensor* x,
ggml_tensor* timesteps,
ggml_tensor* context,
ggml_tensor* t5_ids = nullptr,
ggml_tensor* t5_weights = nullptr,
ggml_tensor** output = nullptr,
ggml_context* output_ctx = nullptr) {
auto get_graph = [&]() -> ggml_cgraph* {
return build_graph(x, timesteps, context, t5_ids, t5_weights);
};
return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
}
};
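// Hypothetical usage sketch for this runner (everything except the AnimaRunner /
// compute signatures shown above is an assumption):
//
//   AnimaRunner runner(backend, /*offload_params_to_cpu=*/false, tensor_storage_map);
//   ggml_tensor* out = nullptr;
//   runner.compute(n_threads, x, timesteps, context,
//                  /*t5_ids=*/nullptr, /*t5_weights=*/nullptr, &out, work_ctx);
//
// Passing t5_ids/t5_weights as nullptr skips the LLM adapter path entirely, so the
// raw encoder_hidden_states are used as-is.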
} // namespace Anima
#endif // __ANIMA_HPP__

View File

@ -1,933 +0,0 @@
#ifndef __AUTO_ENCODER_KL_HPP__
#define __AUTO_ENCODER_KL_HPP__
#include "vae.hpp"
/*================================================== AutoEncoderKL ===================================================*/
#define VAE_GRAPH_SIZE 20480
class ResnetBlock : public UnaryBlock {
protected:
int64_t in_channels;
int64_t out_channels;
public:
ResnetBlock(int64_t in_channels,
int64_t out_channels)
: in_channels(in_channels),
out_channels(out_channels) {
// temb_channels is always 0
blocks["norm1"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
blocks["conv1"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
blocks["norm2"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(out_channels));
blocks["conv2"] = std::shared_ptr<GGMLBlock>(new Conv2d(out_channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
if (out_channels != in_channels) {
blocks["nin_shortcut"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, {1, 1}));
}
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
// x: [N, in_channels, h, w]
// t_emb is always None
auto norm1 = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm1"]);
auto conv1 = std::dynamic_pointer_cast<Conv2d>(blocks["conv1"]);
auto norm2 = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm2"]);
auto conv2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv2"]);
auto h = x;
h = norm1->forward(ctx, h);
h = ggml_silu_inplace(ctx->ggml_ctx, h); // swish
h = conv1->forward(ctx, h);
// return h;
h = norm2->forward(ctx, h);
h = ggml_silu_inplace(ctx->ggml_ctx, h); // swish
// dropout, skip for inference
h = conv2->forward(ctx, h);
// skip connection
if (out_channels != in_channels) {
auto nin_shortcut = std::dynamic_pointer_cast<Conv2d>(blocks["nin_shortcut"]);
x = nin_shortcut->forward(ctx, x); // [N, out_channels, h, w]
}
h = ggml_add(ctx->ggml_ctx, h, x);
return h; // [N, out_channels, h, w]
}
};
class AttnBlock : public UnaryBlock {
protected:
int64_t in_channels;
bool use_linear;
void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
auto iter = tensor_storage_map.find(prefix + "proj_out.weight");
if (iter != tensor_storage_map.end()) {
if (iter->second.n_dims == 4 && use_linear) {
use_linear = false;
blocks["q"] = std::make_shared<Conv2d>(in_channels, in_channels, std::pair{1, 1});
blocks["k"] = std::make_shared<Conv2d>(in_channels, in_channels, std::pair{1, 1});
blocks["v"] = std::make_shared<Conv2d>(in_channels, in_channels, std::pair{1, 1});
blocks["proj_out"] = std::make_shared<Conv2d>(in_channels, in_channels, std::pair{1, 1});
} else if (iter->second.n_dims == 2 && !use_linear) {
use_linear = true;
blocks["q"] = std::make_shared<Linear>(in_channels, in_channels);
blocks["k"] = std::make_shared<Linear>(in_channels, in_channels);
blocks["v"] = std::make_shared<Linear>(in_channels, in_channels);
blocks["proj_out"] = std::make_shared<Linear>(in_channels, in_channels);
}
}
}
public:
AttnBlock(int64_t in_channels, bool use_linear)
: in_channels(in_channels), use_linear(use_linear) {
blocks["norm"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
if (use_linear) {
blocks["q"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, in_channels));
blocks["k"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, in_channels));
blocks["v"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, in_channels));
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, in_channels));
} else {
blocks["q"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
blocks["k"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
blocks["v"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
}
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
// x: [N, in_channels, h, w]
auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
auto q_proj = std::dynamic_pointer_cast<UnaryBlock>(blocks["q"]);
auto k_proj = std::dynamic_pointer_cast<UnaryBlock>(blocks["k"]);
auto v_proj = std::dynamic_pointer_cast<UnaryBlock>(blocks["v"]);
auto proj_out = std::dynamic_pointer_cast<UnaryBlock>(blocks["proj_out"]);
auto h_ = norm->forward(ctx, x);
const int64_t n = h_->ne[3];
const int64_t c = h_->ne[2];
const int64_t h = h_->ne[1];
const int64_t w = h_->ne[0];
ggml_tensor* q;
ggml_tensor* k;
ggml_tensor* v;
if (use_linear) {
h_ = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, h_, 1, 2, 0, 3)); // [N, h, w, in_channels]
h_ = ggml_reshape_3d(ctx->ggml_ctx, h_, c, h * w, n); // [N, h * w, in_channels]
q = q_proj->forward(ctx, h_); // [N, h * w, in_channels]
k = k_proj->forward(ctx, h_); // [N, h * w, in_channels]
v = v_proj->forward(ctx, h_); // [N, h * w, in_channels]
} else {
q = q_proj->forward(ctx, h_); // [N, in_channels, h, w]
q = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, q, 1, 2, 0, 3)); // [N, h, w, in_channels]
q = ggml_reshape_3d(ctx->ggml_ctx, q, c, h * w, n); // [N, h * w, in_channels]
k = k_proj->forward(ctx, h_); // [N, in_channels, h, w]
k = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, k, 1, 2, 0, 3)); // [N, h, w, in_channels]
k = ggml_reshape_3d(ctx->ggml_ctx, k, c, h * w, n); // [N, h * w, in_channels]
v = v_proj->forward(ctx, h_); // [N, in_channels, h, w]
v = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, v, 1, 2, 0, 3)); // [N, h, w, in_channels]
v = ggml_reshape_3d(ctx->ggml_ctx, v, c, h * w, n); // [N, h * w, in_channels]
}
h_ = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, 1, nullptr, false, ctx->flash_attn_enabled);
if (use_linear) {
h_ = proj_out->forward(ctx, h_); // [N, h * w, in_channels]
h_ = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, h_, 1, 0, 2, 3)); // [N, in_channels, h * w]
h_ = ggml_reshape_4d(ctx->ggml_ctx, h_, w, h, c, n); // [N, in_channels, h, w]
} else {
h_ = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, h_, 1, 0, 2, 3)); // [N, in_channels, h * w]
h_ = ggml_reshape_4d(ctx->ggml_ctx, h_, w, h, c, n); // [N, in_channels, h, w]
h_ = proj_out->forward(ctx, h_); // [N, in_channels, h, w]
}
h_ = ggml_add(ctx->ggml_ctx, h_, x);
return h_;
}
};
class AE3DConv : public Conv2d {
public:
AE3DConv(int64_t in_channels,
int64_t out_channels,
std::pair<int, int> kernel_size,
int video_kernel_size = 3,
std::pair<int, int> stride = {1, 1},
std::pair<int, int> padding = {0, 0},
std::pair<int, int> dilation = {1, 1},
bool bias = true)
: Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias) {
int kernel_padding = video_kernel_size / 2;
blocks["time_mix_conv"] = std::shared_ptr<GGMLBlock>(new Conv3d(out_channels,
out_channels,
{video_kernel_size, 1, 1},
{1, 1, 1},
{kernel_padding, 0, 0}));
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x) override {
// timesteps always None
// skip_video always False
// x: [N, IC, IH, IW]
// result: [N, OC, OH, OW]
auto time_mix_conv = std::dynamic_pointer_cast<Conv3d>(blocks["time_mix_conv"]);
x = Conv2d::forward(ctx, x);
// timesteps = x.shape[0]
// x = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps)
// x = conv3d(x)
// return rearrange(x, "b c t h w -> (b t) c h w")
int64_t T = x->ne[3]; // the leading dim carries (b t) folded together
int64_t B = x->ne[3] / T; // so B is always 1 here: every batch entry is a frame
int64_t C = x->ne[2];
int64_t H = x->ne[1];
int64_t W = x->ne[0];
x = ggml_reshape_4d(ctx->ggml_ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w)
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w)
x = time_mix_conv->forward(ctx, x); // [B, OC, T, OH * OW]
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w)
x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w
return x; // [B*T, OC, OH, OW]
}
};
class VideoResnetBlock : public ResnetBlock {
protected:
void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
enum ggml_type wtype = get_type(prefix + "mix_factor", tensor_storage_map, GGML_TYPE_F32);
params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
}
float get_alpha() {
float alpha = ggml_ext_backend_tensor_get_f32(params["mix_factor"]);
return sigmoid(alpha);
}
public:
VideoResnetBlock(int64_t in_channels,
int64_t out_channels,
int video_kernel_size = 3)
: ResnetBlock(in_channels, out_channels) {
// merge_strategy is always learned
blocks["time_stack"] = std::shared_ptr<GGMLBlock>(new ResBlock(out_channels, 0, out_channels, {video_kernel_size, 1}, 3, false, true));
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override {
// x: [N, in_channels, h, w] aka [b*t, in_channels, h, w]
// return: [N, out_channels, h, w] aka [b*t, out_channels, h, w]
// t_emb is always None
// skip_video is always False
// timesteps is always None
auto time_stack = std::dynamic_pointer_cast<ResBlock>(blocks["time_stack"]);
x = ResnetBlock::forward(ctx, x); // [N, out_channels, h, w]
// return x;
int64_t T = x->ne[3]; // the leading dim carries (b t) folded together
int64_t B = x->ne[3] / T; // so B is always 1 here: every batch entry is a frame
int64_t C = x->ne[2];
int64_t H = x->ne[1];
int64_t W = x->ne[0];
x = ggml_reshape_4d(ctx->ggml_ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w)
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w)
auto x_mix = x;
x = time_stack->forward(ctx, x); // b t c (h w)
float alpha = get_alpha();
x = ggml_add(ctx->ggml_ctx,
ggml_ext_scale(ctx->ggml_ctx, x, alpha),
ggml_ext_scale(ctx->ggml_ctx, x_mix, 1.0f - alpha));
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w)
x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w
return x;
}
};
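// The temporal mix above is a learned lerp between the temporal and spatial paths:
// with a = sigmoid(mix_factor),
//   x = a * time_stack(x) + (1 - a) * x_spatial
// so a mix_factor of 0 corresponds to an even 50/50 blend.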
// ldm.modules.diffusionmodules.model.Encoder
class Encoder : public GGMLBlock {
protected:
int ch = 128;
std::vector<int> ch_mult = {1, 2, 4, 4};
int num_res_blocks = 2;
int in_channels = 3;
int z_channels = 4;
bool double_z = true;
public:
Encoder(int ch,
std::vector<int> ch_mult,
int num_res_blocks,
int in_channels,
int z_channels,
bool double_z = true,
bool use_linear_projection = false)
: ch(ch),
ch_mult(ch_mult),
num_res_blocks(num_res_blocks),
in_channels(in_channels),
z_channels(z_channels),
double_z(double_z) {
blocks["conv_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, ch, {3, 3}, {1, 1}, {1, 1}));
size_t num_resolutions = ch_mult.size();
int block_in = 1;
for (int i = 0; i < num_resolutions; i++) {
if (i == 0) {
block_in = ch;
} else {
block_in = ch * ch_mult[i - 1];
}
int block_out = ch * ch_mult[i];
for (int j = 0; j < num_res_blocks; j++) {
std::string name = "down." + std::to_string(i) + ".block." + std::to_string(j);
blocks[name] = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_out));
block_in = block_out;
}
if (i != num_resolutions - 1) {
std::string name = "down." + std::to_string(i) + ".downsample";
blocks[name] = std::shared_ptr<GGMLBlock>(new DownSampleBlock(block_in, block_in, true));
}
}
blocks["mid.block_1"] = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_in));
blocks["mid.attn_1"] = std::shared_ptr<GGMLBlock>(new AttnBlock(block_in, use_linear_projection));
blocks["mid.block_2"] = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_in));
blocks["norm_out"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(block_in));
blocks["conv_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(block_in, double_z ? z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1}));
}
virtual ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
// x: [N, in_channels, h, w]
auto conv_in = std::dynamic_pointer_cast<Conv2d>(blocks["conv_in"]);
auto mid_block_1 = std::dynamic_pointer_cast<ResnetBlock>(blocks["mid.block_1"]);
auto mid_attn_1 = std::dynamic_pointer_cast<AttnBlock>(blocks["mid.attn_1"]);
auto mid_block_2 = std::dynamic_pointer_cast<ResnetBlock>(blocks["mid.block_2"]);
auto norm_out = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm_out"]);
auto conv_out = std::dynamic_pointer_cast<Conv2d>(blocks["conv_out"]);
auto h = conv_in->forward(ctx, x); // [N, ch, h, w]
// downsampling
size_t num_resolutions = ch_mult.size();
for (int i = 0; i < num_resolutions; i++) {
for (int j = 0; j < num_res_blocks; j++) {
std::string name = "down." + std::to_string(i) + ".block." + std::to_string(j);
auto down_block = std::dynamic_pointer_cast<ResnetBlock>(blocks[name]);
h = down_block->forward(ctx, h);
}
if (i != num_resolutions - 1) {
std::string name = "down." + std::to_string(i) + ".downsample";
auto down_sample = std::dynamic_pointer_cast<DownSampleBlock>(blocks[name]);
h = down_sample->forward(ctx, h);
}
}
// middle
h = mid_block_1->forward(ctx, h);
h = mid_attn_1->forward(ctx, h);
h = mid_block_2->forward(ctx, h); // [N, block_in, h, w]
// end
h = norm_out->forward(ctx, h);
h = ggml_silu_inplace(ctx->ggml_ctx, h); // nonlinearity/swish
h = conv_out->forward(ctx, h); // [N, z_channels*2, h, w]
return h;
}
};
// ldm.modules.diffusionmodules.model.Decoder
class Decoder : public GGMLBlock {
protected:
int ch = 128;
int out_ch = 3;
std::vector<int> ch_mult = {1, 2, 4, 4};
int num_res_blocks = 2;
int z_channels = 4;
bool video_decoder = false;
int video_kernel_size = 3;
virtual std::shared_ptr<GGMLBlock> get_conv_out(int64_t in_channels,
int64_t out_channels,
std::pair<int, int> kernel_size,
std::pair<int, int> stride = {1, 1},
std::pair<int, int> padding = {0, 0}) {
if (video_decoder) {
return std::shared_ptr<GGMLBlock>(new AE3DConv(in_channels, out_channels, kernel_size, video_kernel_size, stride, padding));
} else {
return std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, kernel_size, stride, padding));
}
}
virtual std::shared_ptr<GGMLBlock> get_resnet_block(int64_t in_channels,
int64_t out_channels) {
if (video_decoder) {
return std::shared_ptr<GGMLBlock>(new VideoResnetBlock(in_channels, out_channels, video_kernel_size));
} else {
return std::shared_ptr<GGMLBlock>(new ResnetBlock(in_channels, out_channels));
}
}
public:
Decoder(int ch,
int out_ch,
std::vector<int> ch_mult,
int num_res_blocks,
int z_channels,
bool use_linear_projection = false,
bool video_decoder = false,
int video_kernel_size = 3)
: ch(ch),
out_ch(out_ch),
ch_mult(ch_mult),
num_res_blocks(num_res_blocks),
z_channels(z_channels),
video_decoder(video_decoder),
video_kernel_size(video_kernel_size) {
int num_resolutions = static_cast<int>(ch_mult.size());
int block_in = ch * ch_mult[num_resolutions - 1];
blocks["conv_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(z_channels, block_in, {3, 3}, {1, 1}, {1, 1}));
blocks["mid.block_1"] = get_resnet_block(block_in, block_in);
blocks["mid.attn_1"] = std::shared_ptr<GGMLBlock>(new AttnBlock(block_in, use_linear_projection));
blocks["mid.block_2"] = get_resnet_block(block_in, block_in);
for (int i = num_resolutions - 1; i >= 0; i--) {
int mult = ch_mult[i];
int block_out = ch * mult;
for (int j = 0; j < num_res_blocks + 1; j++) {
std::string name = "up." + std::to_string(i) + ".block." + std::to_string(j);
blocks[name] = get_resnet_block(block_in, block_out);
block_in = block_out;
}
if (i != 0) {
std::string name = "up." + std::to_string(i) + ".upsample";
blocks[name] = std::shared_ptr<GGMLBlock>(new UpSampleBlock(block_in, block_in));
}
}
blocks["norm_out"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(block_in));
blocks["conv_out"] = get_conv_out(block_in, out_ch, {3, 3}, {1, 1}, {1, 1});
}
virtual ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* z) {
// z: [N, z_channels, h, w]
// alpha is always 0
// merge_strategy is always learned
// time_mode is always conv-only, so conv_out_op/resnet_op need to be replaced with AE3DConv/VideoResnetBlock
// AttnVideoBlock will not be used
auto conv_in = std::dynamic_pointer_cast<Conv2d>(blocks["conv_in"]);
auto mid_block_1 = std::dynamic_pointer_cast<ResnetBlock>(blocks["mid.block_1"]);
auto mid_attn_1 = std::dynamic_pointer_cast<AttnBlock>(blocks["mid.attn_1"]);
auto mid_block_2 = std::dynamic_pointer_cast<ResnetBlock>(blocks["mid.block_2"]);
auto norm_out = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm_out"]);
auto conv_out = std::dynamic_pointer_cast<Conv2d>(blocks["conv_out"]);
// conv_in
auto h = conv_in->forward(ctx, z); // [N, block_in, h, w]
// middle
h = mid_block_1->forward(ctx, h);
// return h;
h = mid_attn_1->forward(ctx, h);
h = mid_block_2->forward(ctx, h); // [N, block_in, h, w]
// upsampling
int num_resolutions = static_cast<int>(ch_mult.size());
for (int i = num_resolutions - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
std::string name = "up." + std::to_string(i) + ".block." + std::to_string(j);
auto up_block = std::dynamic_pointer_cast<ResnetBlock>(blocks[name]);
h = up_block->forward(ctx, h);
}
if (i != 0) {
std::string name = "up." + std::to_string(i) + ".upsample";
auto up_sample = std::dynamic_pointer_cast<UpSampleBlock>(blocks[name]);
h = up_sample->forward(ctx, h);
}
}
h = norm_out->forward(ctx, h);
h = ggml_silu_inplace(ctx->ggml_ctx, h); // nonlinearity/swish
h = conv_out->forward(ctx, h); // [N, out_ch, h*8, w*8]
return h;
}
};
// ldm.models.autoencoder.AutoencoderKL
class AutoEncoderKLModel : public GGMLBlock {
protected:
SDVersion version;
bool decode_only = true;
bool use_video_decoder = false;
bool use_quant = true;
int embed_dim = 4;
struct {
int z_channels = 4;
int resolution = 256;
int in_channels = 3;
int out_ch = 3;
int ch = 128;
std::vector<int> ch_mult = {1, 2, 4, 4};
int num_res_blocks = 2;
bool double_z = true;
} dd_config;
public:
AutoEncoderKLModel(SDVersion version = VERSION_SD1,
bool decode_only = true,
bool use_linear_projection = false,
bool use_video_decoder = false)
: version(version), decode_only(decode_only), use_video_decoder(use_video_decoder) {
if (sd_version_is_dit(version)) {
if (sd_version_is_flux2(version)) {
dd_config.z_channels = 32;
embed_dim = 32;
} else {
use_quant = false;
dd_config.z_channels = 16;
}
}
if (use_video_decoder) {
use_quant = false;
}
blocks["decoder"] = std::shared_ptr<GGMLBlock>(new Decoder(dd_config.ch,
dd_config.out_ch,
dd_config.ch_mult,
dd_config.num_res_blocks,
dd_config.z_channels,
use_linear_projection,
use_video_decoder));
if (use_quant) {
blocks["post_quant_conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(dd_config.z_channels,
embed_dim,
{1, 1}));
}
if (!decode_only) {
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new Encoder(dd_config.ch,
dd_config.ch_mult,
dd_config.num_res_blocks,
dd_config.in_channels,
dd_config.z_channels,
dd_config.double_z,
use_linear_projection));
if (use_quant) {
int factor = dd_config.double_z ? 2 : 1;
blocks["quant_conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(embed_dim * factor,
dd_config.z_channels * factor,
{1, 1}));
}
}
}
ggml_tensor* decode(GGMLRunnerContext* ctx, ggml_tensor* z) {
// z: [N, z_channels, h, w]
if (sd_version_is_flux2(version)) {
// [N, C*p*p, h, w] -> [N, C, h*p, w*p]
int64_t p = 2;
int64_t N = z->ne[3];
int64_t C = z->ne[2] / p / p;
int64_t h = z->ne[1];
int64_t w = z->ne[0];
int64_t H = h * p;
int64_t W = w * p;
z = ggml_reshape_4d(ctx->ggml_ctx, z, w * h, p * p, C, N); // [N, C, p*p, h*w]
z = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, z, 1, 0, 2, 3)); // [N, C, h*w, p*p]
z = ggml_reshape_4d(ctx->ggml_ctx, z, p, p, w, h * C * N); // [N*C*h, w, p, p]
z = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, z, 0, 2, 1, 3)); // [N*C*h, p, w, p]
z = ggml_reshape_4d(ctx->ggml_ctx, z, W, H, C, N); // [N, C, h*p, w*p]
}
if (use_quant) {
auto post_quant_conv = std::dynamic_pointer_cast<Conv2d>(blocks["post_quant_conv"]);
z = post_quant_conv->forward(ctx, z); // [N, z_channels, h, w]
}
auto decoder = std::dynamic_pointer_cast<Decoder>(blocks["decoder"]);
ggml_set_name(z, "bench-start");
auto h = decoder->forward(ctx, z);
ggml_set_name(h, "bench-end");
return h;
}
ggml_tensor* encode(GGMLRunnerContext* ctx, ggml_tensor* x) {
// x: [N, in_channels, h, w]
auto encoder = std::dynamic_pointer_cast<Encoder>(blocks["encoder"]);
auto z = encoder->forward(ctx, x); // [N, 2*z_channels, h/8, w/8]
if (use_quant) {
auto quant_conv = std::dynamic_pointer_cast<Conv2d>(blocks["quant_conv"]);
z = quant_conv->forward(ctx, z); // [N, 2*embed_dim, h/8, w/8]
}
if (sd_version_is_flux2(version)) {
z = ggml_ext_chunk(ctx->ggml_ctx, z, 2, 2)[0];
// [N, C, H, W] -> [N, C*p*p, H/p, W/p]
int64_t p = 2;
int64_t N = z->ne[3];
int64_t C = z->ne[2];
int64_t H = z->ne[1];
int64_t W = z->ne[0];
int64_t h = H / p;
int64_t w = W / p;
z = ggml_reshape_4d(ctx->ggml_ctx, z, p, w, p, h * C * N); // [N*C*h, p, w, p]
z = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, z, 0, 2, 1, 3)); // [N*C*h, w, p, p]
z = ggml_reshape_4d(ctx->ggml_ctx, z, p * p, w * h, C, N); // [N, C, h*w, p*p]
z = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, z, 1, 0, 2, 3)); // [N, C, p*p, h*w]
z = ggml_reshape_4d(ctx->ggml_ctx, z, w, h, p * p * C, N); // [N, C*p*p, h, w]
}
return z;
}
int get_encoder_output_channels() {
int factor = dd_config.double_z ? 2 : 1;
if (sd_version_is_flux2(version)) {
return dd_config.z_channels * 4;
}
return dd_config.z_channels * factor;
}
};
struct AutoEncoderKL : public VAE {
float scale_factor = 1.f;
float shift_factor = 0.f;
bool decode_only = true;
AutoEncoderKLModel ae;
AutoEncoderKL(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map,
const std::string prefix,
bool decode_only = false,
bool use_video_decoder = false,
SDVersion version = VERSION_SD1)
: VAE(version, backend, offload_params_to_cpu), decode_only(decode_only) {
if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) {
scale_factor = 0.18215f;
shift_factor = 0.f;
} else if (sd_version_is_sdxl(version)) {
scale_factor = 0.13025f;
shift_factor = 0.f;
} else if (sd_version_is_sd3(version)) {
scale_factor = 1.5305f;
shift_factor = 0.0609f;
} else if (sd_version_is_flux(version) || sd_version_is_z_image(version)) {
scale_factor = 0.3611f;
shift_factor = 0.1159f;
} else if (sd_version_is_flux2(version)) {
scale_factor = 1.0f;
shift_factor = 0.f;
}
bool use_linear_projection = false;
for (const auto& [name, tensor_storage] : tensor_storage_map) {
if (!starts_with(name, prefix)) {
continue;
}
if (ends_with(name, "attn_1.proj_out.weight")) {
if (tensor_storage.n_dims == 2) {
use_linear_projection = true;
}
break;
}
}
ae = AutoEncoderKLModel(version, decode_only, use_linear_projection, use_video_decoder);
ae.init(params_ctx, tensor_storage_map, prefix);
}
void set_conv2d_scale(float scale) override {
std::vector<GGMLBlock*> blocks;
ae.get_all_blocks(blocks);
for (auto block : blocks) {
if (block->get_desc() == "Conv2d") {
auto conv_block = (Conv2d*)block;
conv_block->set_scale(scale);
}
}
}
std::string get_desc() override {
return "vae";
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) override {
ae.get_param_tensors(tensors, prefix);
}
ggml_cgraph* build_graph(ggml_tensor* z, bool decode_graph) {
ggml_cgraph* gf = ggml_new_graph(compute_ctx);
z = to_backend(z);
auto runner_ctx = get_context();
ggml_tensor* out = decode_graph ? ae.decode(&runner_ctx, z) : ae.encode(&runner_ctx, z);
ggml_build_forward_expand(gf, out);
return gf;
}
bool _compute(const int n_threads,
ggml_tensor* z,
bool decode_graph,
ggml_tensor** output,
ggml_context* output_ctx = nullptr) override {
GGML_ASSERT(!decode_only || decode_graph);
auto get_graph = [&]() -> ggml_cgraph* {
return build_graph(z, decode_graph);
};
// ggml_set_f32(z, 0.5f);
// print_ggml_tensor(z);
return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
}
ggml_tensor* gaussian_latent_sample(ggml_context* work_ctx, ggml_tensor* moments, std::shared_ptr<RNG> rng) {
// ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample
ggml_tensor* latents = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]);
ggml_tensor* noise = ggml_dup_tensor(work_ctx, latents);
ggml_ext_im_set_randn_f32(noise, rng);
{
float mean = 0;
float logvar = 0;
float value = 0;
float std_ = 0;
for (int i = 0; i < latents->ne[3]; i++) {
for (int j = 0; j < latents->ne[2]; j++) {
for (int k = 0; k < latents->ne[1]; k++) {
for (int l = 0; l < latents->ne[0]; l++) {
mean = ggml_ext_tensor_get_f32(moments, l, k, j, i);
logvar = ggml_ext_tensor_get_f32(moments, l, k, j + (int)latents->ne[2], i);
logvar = std::max(-30.0f, std::min(logvar, 20.0f));
std_ = std::exp(0.5f * logvar);
value = mean + std_ * ggml_ext_tensor_get_f32(noise, l, k, j, i);
// printf("%d %d %d %d -> %f\n", i, j, k, l, value);
ggml_ext_tensor_set_f32(latents, value, l, k, j, i);
}
}
}
}
}
return latents;
}
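// This is the standard reparameterization trick: `moments` stacks [mean, logvar]
// along the channel dim, and each latent element is sampled as
//   z = mean + exp(0.5 * clamp(logvar, -30, 20)) * eps,   eps ~ N(0, 1)
// matching DiagonalGaussianDistribution.sample in the reference ldm code.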
ggml_tensor* vae_output_to_latents(ggml_context* work_ctx, ggml_tensor* vae_output, std::shared_ptr<RNG> rng) {
if (sd_version_is_flux2(version)) {
return vae_output;
} else if (version == VERSION_SD1_PIX2PIX) {
return ggml_view_3d(work_ctx,
vae_output,
vae_output->ne[0],
vae_output->ne[1],
vae_output->ne[2] / 2,
vae_output->nb[1],
vae_output->nb[2],
0);
} else {
return gaussian_latent_sample(work_ctx, vae_output, rng);
}
}
void get_latents_mean_std_vec(ggml_tensor* latents, int channel_dim, std::vector<float>& latents_mean_vec, std::vector<float>& latents_std_vec) {
// flux2
if (sd_version_is_flux2(version)) {
GGML_ASSERT(latents->ne[channel_dim] == 128);
latents_mean_vec = {-0.0676f, -0.0715f, -0.0753f, -0.0745f, 0.0223f, 0.0180f, 0.0142f, 0.0184f,
-0.0001f, -0.0063f, -0.0002f, -0.0031f, -0.0272f, -0.0281f, -0.0276f, -0.0290f,
-0.0769f, -0.0672f, -0.0902f, -0.0892f, 0.0168f, 0.0152f, 0.0079f, 0.0086f,
0.0083f, 0.0015f, 0.0003f, -0.0043f, -0.0439f, -0.0419f, -0.0438f, -0.0431f,
-0.0102f, -0.0132f, -0.0066f, -0.0048f, -0.0311f, -0.0306f, -0.0279f, -0.0180f,
0.0030f, 0.0015f, 0.0126f, 0.0145f, 0.0347f, 0.0338f, 0.0337f, 0.0283f,
0.0020f, 0.0047f, 0.0047f, 0.0050f, 0.0123f, 0.0081f, 0.0081f, 0.0146f,
0.0681f, 0.0679f, 0.0767f, 0.0732f, -0.0462f, -0.0474f, -0.0392f, -0.0511f,
-0.0528f, -0.0477f, -0.0470f, -0.0517f, -0.0317f, -0.0316f, -0.0345f, -0.0283f,
0.0510f, 0.0445f, 0.0578f, 0.0458f, -0.0412f, -0.0458f, -0.0487f, -0.0467f,
-0.0088f, -0.0106f, -0.0088f, -0.0046f, -0.0376f, -0.0432f, -0.0436f, -0.0499f,
0.0118f, 0.0166f, 0.0203f, 0.0279f, 0.0113f, 0.0129f, 0.0016f, 0.0072f,
-0.0118f, -0.0018f, -0.0141f, -0.0054f, -0.0091f, -0.0138f, -0.0145f, -0.0187f,
0.0323f, 0.0305f, 0.0259f, 0.0300f, 0.0540f, 0.0614f, 0.0495f, 0.0590f,
-0.0511f, -0.0603f, -0.0478f, -0.0524f, -0.0227f, -0.0274f, -0.0154f, -0.0255f,
-0.0572f, -0.0565f, -0.0518f, -0.0496f, 0.0116f, 0.0054f, 0.0163f, 0.0104f};
latents_std_vec = {
1.8029f, 1.7786f, 1.7868f, 1.7837f, 1.7717f, 1.7590f, 1.7610f, 1.7479f,
1.7336f, 1.7373f, 1.7340f, 1.7343f, 1.8626f, 1.8527f, 1.8629f, 1.8589f,
1.7593f, 1.7526f, 1.7556f, 1.7583f, 1.7363f, 1.7400f, 1.7355f, 1.7394f,
1.7342f, 1.7246f, 1.7392f, 1.7304f, 1.7551f, 1.7513f, 1.7559f, 1.7488f,
1.8449f, 1.8454f, 1.8550f, 1.8535f, 1.8240f, 1.7813f, 1.7854f, 1.7945f,
1.8047f, 1.7876f, 1.7695f, 1.7676f, 1.7782f, 1.7667f, 1.7925f, 1.7848f,
1.7579f, 1.7407f, 1.7483f, 1.7368f, 1.7961f, 1.7998f, 1.7920f, 1.7925f,
1.7780f, 1.7747f, 1.7727f, 1.7749f, 1.7526f, 1.7447f, 1.7657f, 1.7495f,
1.7775f, 1.7720f, 1.7813f, 1.7813f, 1.8162f, 1.8013f, 1.8023f, 1.8033f,
1.7527f, 1.7331f, 1.7563f, 1.7482f, 1.7610f, 1.7507f, 1.7681f, 1.7613f,
1.7665f, 1.7545f, 1.7828f, 1.7726f, 1.7896f, 1.7999f, 1.7864f, 1.7760f,
1.7613f, 1.7625f, 1.7560f, 1.7577f, 1.7783f, 1.7671f, 1.7810f, 1.7799f,
1.7201f, 1.7068f, 1.7265f, 1.7091f, 1.7793f, 1.7578f, 1.7502f, 1.7455f,
1.7587f, 1.7500f, 1.7525f, 1.7362f, 1.7616f, 1.7572f, 1.7444f, 1.7430f,
1.7509f, 1.7610f, 1.7634f, 1.7612f, 1.7254f, 1.7135f, 1.7321f, 1.7226f,
1.7664f, 1.7624f, 1.7718f, 1.7664f, 1.7457f, 1.7441f, 1.7569f, 1.7530f};
} else {
GGML_ABORT("unknown version %d", version);
}
}
ggml_tensor* diffusion_to_vae_latents(ggml_context* work_ctx, ggml_tensor* latents) {
ggml_tensor* vae_latents = ggml_dup(work_ctx, latents);
if (sd_version_is_flux2(version)) {
int channel_dim = 2;
std::vector<float> latents_mean_vec;
std::vector<float> latents_std_vec;
get_latents_mean_std_vec(latents, channel_dim, latents_mean_vec, latents_std_vec);
float mean = 0.0f; // zero-initialized; set per-channel in the loops below
float std_ = 0.0f;
for (int i = 0; i < latents->ne[3]; i++) {
if (channel_dim == 3) {
mean = latents_mean_vec[i];
std_ = latents_std_vec[i];
}
for (int j = 0; j < latents->ne[2]; j++) {
if (channel_dim == 2) {
mean = latents_mean_vec[j];
std_ = latents_std_vec[j];
}
for (int k = 0; k < latents->ne[1]; k++) {
for (int l = 0; l < latents->ne[0]; l++) {
float value = ggml_ext_tensor_get_f32(latents, l, k, j, i);
value = value * std_ / scale_factor + mean;
ggml_ext_tensor_set_f32(vae_latents, value, l, k, j, i);
}
}
}
}
} else {
ggml_ext_tensor_iter(latents, [&](ggml_tensor* latents, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
float value = ggml_ext_tensor_get_f32(latents, i0, i1, i2, i3);
value = (value / scale_factor) + shift_factor;
ggml_ext_tensor_set_f32(vae_latents, value, i0, i1, i2, i3);
});
}
return vae_latents;
}
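// Scaling summary: for flux2 the mapping is per-channel,
//   vae_latent = diffusion_latent * std[c] / scale_factor + mean[c]
// while every other version uses a global affine map,
//   vae_latent = diffusion_latent / scale_factor + shift_factor
// vae_to_diffuison_latents below applies the exact inverse of each formula.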
ggml_tensor* vae_to_diffuison_latents(ggml_context* work_ctx, ggml_tensor* latents) {
ggml_tensor* diffusion_latents = ggml_dup(work_ctx, latents);
if (sd_version_is_flux2(version)) {
int channel_dim = 2;
std::vector<float> latents_mean_vec;
std::vector<float> latents_std_vec;
get_latents_mean_std_vec(latents, channel_dim, latents_mean_vec, latents_std_vec);
float mean = 0.0f; // zero-initialized; set per-channel in the loops below
float std_ = 0.0f;
for (int i = 0; i < latents->ne[3]; i++) {
if (channel_dim == 3) {
mean = latents_mean_vec[i];
std_ = latents_std_vec[i];
}
for (int j = 0; j < latents->ne[2]; j++) {
if (channel_dim == 2) {
mean = latents_mean_vec[j];
std_ = latents_std_vec[j];
}
for (int k = 0; k < latents->ne[1]; k++) {
for (int l = 0; l < latents->ne[0]; l++) {
float value = ggml_ext_tensor_get_f32(latents, l, k, j, i);
value = (value - mean) * scale_factor / std_;
ggml_ext_tensor_set_f32(diffusion_latents, value, l, k, j, i);
}
}
}
}
} else {
ggml_ext_tensor_iter(latents, [&](ggml_tensor* latents, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
float value = ggml_ext_tensor_get_f32(latents, i0, i1, i2, i3);
value = (value - shift_factor) * scale_factor;
ggml_ext_tensor_set_f32(diffusion_latents, value, i0, i1, i2, i3);
});
}
return diffusion_latents;
}
int get_encoder_output_channels(int input_channels) {
return ae.get_encoder_output_channels();
}
void test() {
ggml_init_params params;
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB
params.mem_buffer = nullptr;
params.no_alloc = false;
ggml_context* work_ctx = ggml_init(params);
GGML_ASSERT(work_ctx != nullptr);
{
// CPU, x{1, 3, 64, 64}: Pass
// CUDA, x{1, 3, 64, 64}: Pass, but still gets wrong results for some images, possibly due to internal NaN
// CPU, x{2, 3, 64, 64}: Wrong result
// CUDA, x{2, 3, 64, 64}: Wrong result, and different from CPU result
auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 64, 64, 3, 2);
ggml_set_f32(x, 0.5f);
print_ggml_tensor(x);
ggml_tensor* out = nullptr;
int64_t t0 = ggml_time_ms();
_compute(8, x, false, &out, work_ctx);
int64_t t1 = ggml_time_ms();
print_ggml_tensor(out);
LOG_DEBUG("encode test done in %lldms", t1 - t0);
}
if (false) {
// CPU, z{1, 4, 8, 8}: Pass
// CUDA, z{1, 4, 8, 8}: Pass
// CPU, z{3, 4, 8, 8}: Wrong result
// CUDA, z{3, 4, 8, 8}: Wrong result, and different from CPU result
auto z = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1);
ggml_set_f32(z, 0.5f);
print_ggml_tensor(z);
ggml_tensor* out = nullptr;
int64_t t0 = ggml_time_ms();
_compute(8, z, true, &out, work_ctx);
int64_t t1 = ggml_time_ms();
print_ggml_tensor(out);
LOG_DEBUG("decode test done in %lldms", t1 - t0);
}
}
};
#endif // __AUTO_ENCODER_KL_HPP__

View File

@ -1,894 +0,0 @@
#ifndef __CACHE_DIT_HPP__
#define __CACHE_DIT_HPP__
#include <algorithm>
#include <cmath>
#include <limits>
#include <string>
#include <unordered_map>
#include <vector>
#include "ggml_extend.hpp"
struct DBCacheConfig {
bool enabled = false;
int Fn_compute_blocks = 8;
int Bn_compute_blocks = 0;
float residual_diff_threshold = 0.08f;
int max_warmup_steps = 8;
int max_cached_steps = -1;
int max_continuous_cached_steps = -1;
float max_accumulated_residual_diff = -1.0f;
std::vector<int> steps_computation_mask;
bool scm_policy_dynamic = true;
};
struct TaylorSeerConfig {
bool enabled = false;
int n_derivatives = 1;
int max_warmup_steps = 2;
int skip_interval_steps = 1;
};
struct CacheDitConfig {
DBCacheConfig dbcache;
TaylorSeerConfig taylorseer;
int double_Fn_blocks = -1;
int double_Bn_blocks = -1;
int single_Fn_blocks = -1;
int single_Bn_blocks = -1;
};
struct TaylorSeerState {
int n_derivatives = 1;
int current_step = -1;
int last_computed_step = -1;
std::vector<std::vector<float>> dY_prev;
std::vector<std::vector<float>> dY_current;
void init(int n_deriv, size_t hidden_size) {
n_derivatives = n_deriv;
int order = n_derivatives + 1;
dY_prev.resize(order);
dY_current.resize(order);
for (int i = 0; i < order; i++) {
dY_prev[i].clear();
dY_current[i].clear();
}
current_step = -1;
last_computed_step = -1;
}
void reset() {
for (auto& v : dY_prev)
v.clear();
for (auto& v : dY_current)
v.clear();
current_step = -1;
last_computed_step = -1;
}
bool can_approximate() const {
return last_computed_step >= n_derivatives && !dY_prev.empty() && !dY_prev[0].empty();
}
void update_derivatives(const float* Y, size_t size, int step) {
int order = n_derivatives + 1;
dY_prev = dY_current;
dY_current[0].resize(size);
for (size_t i = 0; i < size; i++) {
dY_current[0][i] = Y[i];
}
int window = step - last_computed_step;
if (window <= 0)
window = 1;
for (int d = 0; d < n_derivatives; d++) {
if (!dY_prev[d].empty() && dY_prev[d].size() == size) {
dY_current[d + 1].resize(size);
for (size_t i = 0; i < size; i++) {
dY_current[d + 1][i] = (dY_current[d][i] - dY_prev[d][i]) / static_cast<float>(window);
}
} else {
dY_current[d + 1].clear();
}
}
current_step = step;
last_computed_step = step;
}
void approximate(float* output, size_t size, int target_step) const {
if (!can_approximate() || dY_prev[0].size() != size) {
return;
}
int elapsed = target_step - last_computed_step;
if (elapsed <= 0)
elapsed = 1;
std::fill(output, output + size, 0.0f);
float factorial = 1.0f;
int order = static_cast<int>(dY_prev.size());
for (int o = 0; o < order; o++) {
if (dY_prev[o].empty() || dY_prev[o].size() != size)
continue;
if (o > 0)
factorial *= static_cast<float>(o);
float coeff = ::powf(static_cast<float>(elapsed), static_cast<float>(o)) / factorial;
for (size_t i = 0; i < size; i++) {
output[i] += coeff * dY_prev[o][i];
}
}
}
};
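// TaylorSeer approximates a skipped forward pass by extrapolating cached finite
// differences: with k = target_step - last_computed_step,
//   Y(t + k) ≈ sum_{o=0..n} dY^(o)(t) * k^o / o!
// which is exactly what approximate() accumulates. Hypothetical first-order
// example (n_derivatives = 1, k = 2): Y ≈ dY_prev[0] + 2 * dY_prev[1].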
struct BlockCacheEntry {
std::vector<float> residual_img;
std::vector<float> residual_txt;
std::vector<float> residual;
std::vector<float> prev_img;
std::vector<float> prev_txt;
std::vector<float> prev_output;
bool has_prev = false;
};
struct CacheDitState {
CacheDitConfig config;
bool initialized = false;
int total_double_blocks = 0;
int total_single_blocks = 0;
size_t hidden_size = 0;
int current_step = -1;
int total_steps = 0;
int warmup_remaining = 0;
std::vector<int> cached_steps;
int continuous_cached_steps = 0;
float accumulated_residual_diff = 0.0f;
std::vector<BlockCacheEntry> double_block_cache;
std::vector<BlockCacheEntry> single_block_cache;
std::vector<float> Fn_residual_img;
std::vector<float> Fn_residual_txt;
std::vector<float> prev_Fn_residual_img;
std::vector<float> prev_Fn_residual_txt;
bool has_prev_Fn_residual = false;
std::vector<float> Bn_buffer_img;
std::vector<float> Bn_buffer_txt;
std::vector<float> Bn_buffer;
bool has_Bn_buffer = false;
TaylorSeerState taylor_state;
bool can_cache_this_step = false;
bool is_caching_this_step = false;
int total_blocks_computed = 0;
int total_blocks_cached = 0;
void init(const CacheDitConfig& cfg, int num_double_blocks, int num_single_blocks, size_t h_size) {
config = cfg;
total_double_blocks = num_double_blocks;
total_single_blocks = num_single_blocks;
hidden_size = h_size;
initialized = cfg.dbcache.enabled || cfg.taylorseer.enabled;
if (!initialized)
return;
warmup_remaining = cfg.dbcache.max_warmup_steps;
double_block_cache.resize(total_double_blocks);
single_block_cache.resize(total_single_blocks);
if (cfg.taylorseer.enabled) {
taylor_state.init(cfg.taylorseer.n_derivatives, h_size);
}
reset_runtime();
}
void reset_runtime() {
current_step = -1;
total_steps = 0;
warmup_remaining = config.dbcache.max_warmup_steps;
cached_steps.clear();
continuous_cached_steps = 0;
accumulated_residual_diff = 0.0f;
for (auto& entry : double_block_cache) {
entry.residual_img.clear();
entry.residual_txt.clear();
entry.prev_img.clear();
entry.prev_txt.clear();
entry.has_prev = false;
}
for (auto& entry : single_block_cache) {
entry.residual.clear();
entry.prev_output.clear();
entry.has_prev = false;
}
Fn_residual_img.clear();
Fn_residual_txt.clear();
prev_Fn_residual_img.clear();
prev_Fn_residual_txt.clear();
has_prev_Fn_residual = false;
Bn_buffer_img.clear();
Bn_buffer_txt.clear();
Bn_buffer.clear();
has_Bn_buffer = false;
taylor_state.reset();
can_cache_this_step = false;
is_caching_this_step = false;
total_blocks_computed = 0;
total_blocks_cached = 0;
}
bool enabled() const {
return initialized && (config.dbcache.enabled || config.taylorseer.enabled);
}
void begin_step(int step_index, float sigma = 0.0f) {
if (!enabled())
return;
if (step_index == current_step)
return;
current_step = step_index;
total_steps++;
bool in_warmup = warmup_remaining > 0;
if (in_warmup) {
warmup_remaining--;
}
bool scm_allows_cache = true;
if (!config.dbcache.steps_computation_mask.empty()) {
if (step_index < static_cast<int>(config.dbcache.steps_computation_mask.size())) {
scm_allows_cache = (config.dbcache.steps_computation_mask[step_index] == 0);
if (!config.dbcache.scm_policy_dynamic && scm_allows_cache) {
can_cache_this_step = true;
is_caching_this_step = false;
return;
}
}
}
bool max_cached_ok = (config.dbcache.max_cached_steps < 0) ||
(static_cast<int>(cached_steps.size()) < config.dbcache.max_cached_steps);
bool max_cont_ok = (config.dbcache.max_continuous_cached_steps < 0) ||
(continuous_cached_steps < config.dbcache.max_continuous_cached_steps);
bool accum_ok = (config.dbcache.max_accumulated_residual_diff < 0.0f) ||
(accumulated_residual_diff < config.dbcache.max_accumulated_residual_diff);
can_cache_this_step = !in_warmup && scm_allows_cache && max_cached_ok && max_cont_ok && accum_ok && has_prev_Fn_residual;
is_caching_this_step = false;
}
void end_step(bool was_cached) {
if (was_cached) {
cached_steps.push_back(current_step);
continuous_cached_steps++;
} else {
continuous_cached_steps = 0;
}
}
static float calculate_residual_diff(const float* prev, const float* curr, size_t size) {
if (size == 0)
return 0.0f;
float sum_diff = 0.0f;
float sum_abs = 0.0f;
for (size_t i = 0; i < size; i++) {
sum_diff += std::fabs(prev[i] - curr[i]);
sum_abs += std::fabs(prev[i]);
}
return sum_diff / (sum_abs + 1e-6f);
}
static float calculate_residual_diff(const std::vector<float>& prev, const std::vector<float>& curr) {
if (prev.size() != curr.size() || prev.empty())
return 1.0f;
return calculate_residual_diff(prev.data(), curr.data(), prev.size());
}
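// The residual diff is a relative L1 distance:
//   diff = sum_i |prev_i - curr_i| / (sum_i |prev_i| + 1e-6)
// so residual_diff_threshold = 0.08 means "cache this step if the Fn residual
// moved by less than ~8% of its own magnitude".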
int get_double_Fn_blocks() const {
return (config.double_Fn_blocks >= 0) ? config.double_Fn_blocks : config.dbcache.Fn_compute_blocks;
}
int get_double_Bn_blocks() const {
return (config.double_Bn_blocks >= 0) ? config.double_Bn_blocks : config.dbcache.Bn_compute_blocks;
}
int get_single_Fn_blocks() const {
return (config.single_Fn_blocks >= 0) ? config.single_Fn_blocks : config.dbcache.Fn_compute_blocks;
}
int get_single_Bn_blocks() const {
return (config.single_Bn_blocks >= 0) ? config.single_Bn_blocks : config.dbcache.Bn_compute_blocks;
}
bool is_Fn_double_block(int block_idx) const {
return block_idx < get_double_Fn_blocks();
}
bool is_Bn_double_block(int block_idx) const {
int Bn = get_double_Bn_blocks();
return Bn > 0 && block_idx >= (total_double_blocks - Bn);
}
bool is_Mn_double_block(int block_idx) const {
return !is_Fn_double_block(block_idx) && !is_Bn_double_block(block_idx);
}
bool is_Fn_single_block(int block_idx) const {
return block_idx < get_single_Fn_blocks();
}
bool is_Bn_single_block(int block_idx) const {
int Bn = get_single_Bn_blocks();
return Bn > 0 && block_idx >= (total_single_blocks - Bn);
}
bool is_Mn_single_block(int block_idx) const {
return !is_Fn_single_block(block_idx) && !is_Bn_single_block(block_idx);
}
void store_Fn_residual(const float* img, const float* txt, size_t img_size, size_t txt_size, const float* input_img, const float* input_txt) {
Fn_residual_img.resize(img_size);
Fn_residual_txt.resize(txt_size);
for (size_t i = 0; i < img_size; i++) {
Fn_residual_img[i] = img[i] - input_img[i];
}
for (size_t i = 0; i < txt_size; i++) {
Fn_residual_txt[i] = txt[i] - input_txt[i];
}
}
bool check_cache_decision() {
if (!can_cache_this_step) {
is_caching_this_step = false;
return false;
}
if (!has_prev_Fn_residual || prev_Fn_residual_img.empty()) {
is_caching_this_step = false;
return false;
}
float diff_img = calculate_residual_diff(prev_Fn_residual_img, Fn_residual_img);
float diff_txt = calculate_residual_diff(prev_Fn_residual_txt, Fn_residual_txt);
float diff = (diff_img + diff_txt) / 2.0f;
if (diff < config.dbcache.residual_diff_threshold) {
is_caching_this_step = true;
accumulated_residual_diff += diff;
return true;
}
is_caching_this_step = false;
return false;
}
void update_prev_Fn_residual() {
prev_Fn_residual_img = Fn_residual_img;
prev_Fn_residual_txt = Fn_residual_txt;
has_prev_Fn_residual = !prev_Fn_residual_img.empty();
}
void store_double_block_residual(int block_idx, const float* img, const float* txt, size_t img_size, size_t txt_size, const float* prev_img, const float* prev_txt) {
if (block_idx < 0 || block_idx >= static_cast<int>(double_block_cache.size()))
return;
BlockCacheEntry& entry = double_block_cache[block_idx];
entry.residual_img.resize(img_size);
entry.residual_txt.resize(txt_size);
for (size_t i = 0; i < img_size; i++) {
entry.residual_img[i] = img[i] - prev_img[i];
}
for (size_t i = 0; i < txt_size; i++) {
entry.residual_txt[i] = txt[i] - prev_txt[i];
}
entry.prev_img.resize(img_size);
entry.prev_txt.resize(txt_size);
for (size_t i = 0; i < img_size; i++) {
entry.prev_img[i] = img[i];
}
for (size_t i = 0; i < txt_size; i++) {
entry.prev_txt[i] = txt[i];
}
entry.has_prev = true;
}
void apply_double_block_cache(int block_idx, float* img, float* txt, size_t img_size, size_t txt_size) {
if (block_idx < 0 || block_idx >= static_cast<int>(double_block_cache.size()))
return;
const BlockCacheEntry& entry = double_block_cache[block_idx];
if (entry.residual_img.size() != img_size || entry.residual_txt.size() != txt_size)
return;
for (size_t i = 0; i < img_size; i++) {
img[i] += entry.residual_img[i];
}
for (size_t i = 0; i < txt_size; i++) {
txt[i] += entry.residual_txt[i];
}
total_blocks_cached++;
}
void store_single_block_residual(int block_idx, const float* output, size_t size, const float* input) {
if (block_idx < 0 || block_idx >= static_cast<int>(single_block_cache.size()))
return;
BlockCacheEntry& entry = single_block_cache[block_idx];
entry.residual.resize(size);
for (size_t i = 0; i < size; i++) {
entry.residual[i] = output[i] - input[i];
}
entry.prev_output.resize(size);
for (size_t i = 0; i < size; i++) {
entry.prev_output[i] = output[i];
}
entry.has_prev = true;
}
void apply_single_block_cache(int block_idx, float* output, size_t size) {
if (block_idx < 0 || block_idx >= static_cast<int>(single_block_cache.size()))
return;
const BlockCacheEntry& entry = single_block_cache[block_idx];
if (entry.residual.size() != size)
return;
for (size_t i = 0; i < size; i++) {
output[i] += entry.residual[i];
}
total_blocks_cached++;
}
void store_Bn_buffer(const float* img, const float* txt, size_t img_size, size_t txt_size, const float* Bn_start_img, const float* Bn_start_txt) {
Bn_buffer_img.resize(img_size);
Bn_buffer_txt.resize(txt_size);
for (size_t i = 0; i < img_size; i++) {
Bn_buffer_img[i] = img[i] - Bn_start_img[i];
}
for (size_t i = 0; i < txt_size; i++) {
Bn_buffer_txt[i] = txt[i] - Bn_start_txt[i];
}
has_Bn_buffer = true;
}
void apply_Bn_buffer(float* img, float* txt, size_t img_size, size_t txt_size) {
if (!has_Bn_buffer)
return;
if (Bn_buffer_img.size() != img_size || Bn_buffer_txt.size() != txt_size)
return;
for (size_t i = 0; i < img_size; i++) {
img[i] += Bn_buffer_img[i];
}
for (size_t i = 0; i < txt_size; i++) {
txt[i] += Bn_buffer_txt[i];
}
}
void taylor_update(const float* hidden_state, size_t size) {
if (!config.taylorseer.enabled)
return;
taylor_state.update_derivatives(hidden_state, size, current_step);
}
bool taylor_can_approximate() const {
return config.taylorseer.enabled && taylor_state.can_approximate();
}
void taylor_approximate(float* output, size_t size) {
if (!config.taylorseer.enabled)
return;
taylor_state.approximate(output, size, current_step);
}
bool should_use_taylor_this_step() const {
if (!config.taylorseer.enabled)
return false;
if (current_step < config.taylorseer.max_warmup_steps)
return false;
int interval = config.taylorseer.skip_interval_steps;
if (interval <= 0)
interval = 1;
return (current_step % (interval + 1)) != 0;
}
void log_metrics() const {
if (!enabled())
return;
int total_blocks = total_blocks_computed + total_blocks_cached;
float cache_ratio = (total_blocks > 0) ? (static_cast<float>(total_blocks_cached) / total_blocks * 100.0f) : 0.0f;
float step_cache_ratio = (total_steps > 0) ? (static_cast<float>(cached_steps.size()) / total_steps * 100.0f) : 0.0f;
LOG_INFO("CacheDIT: steps_cached=%zu/%d (%.1f%%), blocks_cached=%d/%d (%.1f%%), accum_diff=%.4f",
cached_steps.size(), total_steps, step_cache_ratio,
total_blocks_cached, total_blocks, cache_ratio,
accumulated_residual_diff);
}
std::string get_summary() const {
char buf[256];
snprintf(buf, sizeof(buf),
"CacheDIT[thresh=%.2f]: cached %zu/%d steps, %d/%d blocks",
config.dbcache.residual_diff_threshold,
cached_steps.size(), total_steps,
total_blocks_cached, total_blocks_computed + total_blocks_cached);
return std::string(buf);
}
};
inline std::vector<int> parse_scm_mask(const std::string& mask_str) {
std::vector<int> mask;
if (mask_str.empty())
return mask;
size_t pos = 0;
size_t start = 0;
while ((pos = mask_str.find(',', start)) != std::string::npos) {
std::string token = mask_str.substr(start, pos - start);
mask.push_back(std::stoi(token));
start = pos + 1;
}
if (start < mask_str.length()) {
mask.push_back(std::stoi(mask_str.substr(start)));
}
return mask;
}
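// Example: parse_scm_mask("1,1,0,0,1") returns {1, 1, 0, 0, 1}. Tokens are fed to
// std::stoi unguarded, so malformed input such as "1,,0" will throw.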
inline std::vector<int> generate_scm_mask(
const std::vector<int>& compute_bins,
const std::vector<int>& cache_bins,
int total_steps) {
std::vector<int> mask;
size_t c_idx = 0, cache_idx = 0;
while (static_cast<int>(mask.size()) < total_steps) {
if (c_idx < compute_bins.size()) {
for (int i = 0; i < compute_bins[c_idx] && static_cast<int>(mask.size()) < total_steps; i++) {
mask.push_back(1);
}
c_idx++;
}
if (cache_idx < cache_bins.size()) {
for (int i = 0; i < cache_bins[cache_idx] && static_cast<int>(mask.size()) < total_steps; i++) {
mask.push_back(0);
}
cache_idx++;
}
if (c_idx >= compute_bins.size() && cache_idx >= cache_bins.size())
break;
}
if (!mask.empty()) {
mask.back() = 1;
}
return mask;
}
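// Example: generate_scm_mask({3}, {2}, 8) yields {1, 1, 1, 0, 1}: each bin is
// consumed once (bins are not cycled), the loop stops early when both run out even
// if total_steps is not reached, and the final entry is always forced to 1 so the
// last emitted step is computed rather than cached.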
inline void parse_dbcache_options(const std::string& opts, DBCacheConfig& cfg) {
if (opts.empty())
return;
int Fn = 8, Bn = 0, warmup = 8, max_cached = -1, max_cont = -1;
float thresh = 0.08f;
sscanf(opts.c_str(), "%d,%d,%f,%d,%d,%d",
&Fn, &Bn, &thresh, &warmup, &max_cached, &max_cont);
cfg.Fn_compute_blocks = Fn;
cfg.Bn_compute_blocks = Bn;
cfg.residual_diff_threshold = thresh;
cfg.max_warmup_steps = warmup;
cfg.max_cached_steps = max_cached;
cfg.max_continuous_cached_steps = max_cont;
}
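// Example: parse_dbcache_options("8,0,0.08,8,-1,-1", cfg) sets Fn_compute_blocks=8,
// Bn_compute_blocks=0, residual_diff_threshold=0.08, max_warmup_steps=8, and leaves
// the cached-step limits unlimited (-1). Because sscanf stops at the first field it
// cannot parse, a shorter string like "4,2" keeps the remaining defaults.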
inline void parse_taylorseer_options(const std::string& opts, TaylorSeerConfig& cfg) {
if (opts.empty())
return;
int n_deriv = 1, warmup = 2, interval = 1;
sscanf(opts.c_str(), "%d,%d,%d", &n_deriv, &warmup, &interval);
cfg.n_derivatives = n_deriv;
cfg.max_warmup_steps = warmup;
cfg.skip_interval_steps = interval;
}
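// Example: parse_taylorseer_options("2,3,1", cfg) sets n_derivatives=2 (second-order
// extrapolation), max_warmup_steps=3, and skip_interval_steps=1.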
struct CacheDitConditionState {
DBCacheConfig config;
TaylorSeerConfig taylor_config;
bool initialized = false;
int current_step_index = -1;
bool step_active = false;
bool skip_current_step = false;
bool initial_step = true;
int warmup_remaining = 0;
std::vector<int> cached_steps;
int continuous_cached_steps = 0;
float accumulated_residual_diff = 0.0f;
int total_steps_skipped = 0;
const void* anchor_condition = nullptr;
struct CacheEntry {
std::vector<float> diff;
std::vector<float> prev_input;
std::vector<float> prev_output;
bool has_prev = false;
};
std::unordered_map<const void*, CacheEntry> cache_diffs;
TaylorSeerState taylor_state;
float start_sigma = std::numeric_limits<float>::max();
float end_sigma = 0.0f;
void reset_runtime() {
current_step_index = -1;
step_active = false;
        skip_current_step = false;
        step_cache_allowed = false;
initial_step = true;
warmup_remaining = config.max_warmup_steps;
cached_steps.clear();
continuous_cached_steps = 0;
accumulated_residual_diff = 0.0f;
total_steps_skipped = 0;
anchor_condition = nullptr;
cache_diffs.clear();
taylor_state.reset();
}
void init(const DBCacheConfig& dbcfg, const TaylorSeerConfig& tcfg) {
config = dbcfg;
taylor_config = tcfg;
initialized = dbcfg.enabled || tcfg.enabled;
reset_runtime();
if (taylor_config.enabled) {
taylor_state.init(taylor_config.n_derivatives, 0);
}
}
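    // Derives the sigma window in which caching may be active. The window is
    // hard-coded to the middle 15%..95% of the schedule: with e.g. 20 steps,
    // caching applies while sigmas[19] < sigma <= sigmas[3] (a descending
    // schedule is assumed; the swap below also covers ascending ones).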
void set_sigmas(const std::vector<float>& sigmas) {
if (!initialized || sigmas.size() < 2)
return;
float start_percent = 0.15f;
float end_percent = 0.95f;
size_t n_steps = sigmas.size() - 1;
size_t start_step = static_cast<size_t>(start_percent * n_steps);
size_t end_step = static_cast<size_t>(end_percent * n_steps);
if (start_step >= n_steps)
start_step = n_steps - 1;
if (end_step >= n_steps)
end_step = n_steps - 1;
start_sigma = sigmas[start_step];
end_sigma = sigmas[end_step];
if (start_sigma < end_sigma) {
std::swap(start_sigma, end_sigma);
}
}
bool enabled() const {
return initialized && (config.enabled || taylor_config.enabled);
}
    void begin_step(int step_index, float sigma) {
        if (!enabled())
            return;
        if (step_index == current_step_index)
            return;
        current_step_index = step_index;
        skip_current_step = false;
        step_active = false;
        step_cache_allowed = false;
        if (sigma > start_sigma)
            return;  // too early in the schedule, sigma is still above the caching window
        if (sigma <= end_sigma)
            return;  // past the caching window
        step_active = true;
        if (warmup_remaining > 0) {
            warmup_remaining--;
            return;
        }
        // An explicit computation mask overrides the dynamic caching decision.
        if (!config.steps_computation_mask.empty() &&
            step_index < static_cast<int>(config.steps_computation_mask.size()) &&
            config.steps_computation_mask[step_index] == 1) {
            return;
        }
        if (config.max_cached_steps >= 0 &&
            static_cast<int>(cached_steps.size()) >= config.max_cached_steps) {
            return;
        }
        if (config.max_continuous_cached_steps >= 0 &&
            continuous_cached_steps >= config.max_continuous_cached_steps) {
            return;
        }
        step_cache_allowed = true;
    }
bool step_is_active() const {
return enabled() && step_active;
}
bool is_step_skipped() const {
return enabled() && step_active && skip_current_step;
}
bool has_cache(const void* cond) const {
auto it = cache_diffs.find(cond);
return it != cache_diffs.end() && !it->second.diff.empty();
}
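    // Stores the per-condition residual (output - input) plus the raw
    // input/output, so a later step can both replay the residual and measure
    // how far its input has drifted from the cached one.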
    void update_cache(const void* cond, const float* input, const float* output, size_t size) {
        CacheEntry& entry = cache_diffs[cond];
        entry.diff.resize(size);
        entry.prev_input.resize(size);
        entry.prev_output.resize(size);
        for (size_t i = 0; i < size; i++) {
            entry.diff[i]        = output[i] - input[i];
            entry.prev_input[i]  = input[i];
            entry.prev_output[i] = output[i];
        }
        entry.has_prev = true;
    }
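    // Replays a cached residual: output = input + cached diff. E.g. with a
    // cached diff of {0.5, -0.2} and a new input of {1.0, 1.0}, the
    // approximated output is {1.5, 0.8}. Silently no-ops on size mismatch.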
void apply_cache(const void* cond, const float* input, float* output, size_t size) {
auto it = cache_diffs.find(cond);
if (it == cache_diffs.end() || it->second.diff.empty())
return;
if (it->second.diff.size() != size)
return;
for (size_t i = 0; i < size; i++) {
output[i] = input[i] + it->second.diff[i];
}
}
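    // Called before evaluating the model for one condition at one step.
    // Returns true when the output has been filled from cache and the real
    // computation can be skipped for this condition.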
bool before_condition(const void* cond, ggml_tensor* input, ggml_tensor* output, float sigma, int step_index) {
if (!enabled() || step_index < 0)
return false;
if (step_index != current_step_index) {
begin_step(step_index, sigma);
}
if (!step_active)
return false;
if (initial_step) {
anchor_condition = cond;
initial_step = false;
}
bool is_anchor = (cond == anchor_condition);
if (skip_current_step) {
if (has_cache(cond)) {
apply_cache(cond, (float*)input->data, (float*)output->data,
static_cast<size_t>(ggml_nelements(output)));
return true;
}
return false;
}
if (!is_anchor)
return false;
auto it = cache_diffs.find(cond);
if (it == cache_diffs.end() || !it->second.has_prev)
return false;
size_t ne = static_cast<size_t>(ggml_nelements(input));
if (it->second.prev_input.size() != ne)
return false;
float* input_data = (float*)input->data;
float diff = CacheDitState::calculate_residual_diff(
it->second.prev_input.data(), input_data, ne);
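        // Heuristic threshold scaling (one plausible reading of the Fn/Bn
        // knobs): more "first" compute blocks (Fn) loosen the threshold on the
        // assumption that the residual estimate is more trustworthy, while
        // more "back" compute blocks (Bn) tighten it to favor quality. Both
        // factors are clamped below.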
float effective_threshold = config.residual_diff_threshold;
if (config.Fn_compute_blocks > 0) {
float fn_confidence = 1.0f + 0.02f * (config.Fn_compute_blocks - 8);
fn_confidence = std::max(0.5f, std::min(2.0f, fn_confidence));
effective_threshold *= fn_confidence;
}
if (config.Bn_compute_blocks > 0) {
float bn_quality = 1.0f - 0.03f * config.Bn_compute_blocks;
bn_quality = std::max(0.5f, std::min(1.0f, bn_quality));
effective_threshold *= bn_quality;
}
        if (step_cache_allowed && diff < effective_threshold) {
skip_current_step = true;
total_steps_skipped++;
cached_steps.push_back(current_step_index);
continuous_cached_steps++;
accumulated_residual_diff += diff;
apply_cache(cond, input_data, (float*)output->data, ne);
return true;
}
continuous_cached_steps = 0;
return false;
}
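    // Called after a condition has actually been computed; refreshes the
    // cached residual and, for the anchor condition, the TaylorSeer state.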
void after_condition(const void* cond, ggml_tensor* input, ggml_tensor* output) {
if (!step_is_active())
return;
size_t ne = static_cast<size_t>(ggml_nelements(output));
update_cache(cond, (float*)input->data, (float*)output->data, ne);
if (cond == anchor_condition && taylor_config.enabled) {
taylor_state.update_derivatives((float*)output->data, ne, current_step_index);
}
}
void log_metrics() const {
if (!enabled())
return;
LOG_INFO("CacheDIT: steps_skipped=%d/%d (%.1f%%), accum_residual_diff=%.4f",
total_steps_skipped,
current_step_index + 1,
(current_step_index > 0) ? (100.0f * total_steps_skipped / (current_step_index + 1)) : 0.0f,
accumulated_residual_diff);
}
};
#endif

View File

@ -1,108 +0,0 @@
#ifndef __COMMON_DIT_HPP__
#define __COMMON_DIT_HPP__
#include "ggml_extend.hpp"
namespace DiT {
ggml_tensor* patchify(ggml_context* ctx,
ggml_tensor* x,
int pw,
int ph,
bool patch_last = true) {
// x: [N, C, H, W]
// return: [N, h*w, C*ph*pw] if patch_last else [N, h*w, ph*pw*C]
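    // Worked example: N=1, C=4, H=W=64, ph=pw=2 -> h=w=32, so the result is
    // [1, 1024, 16] (1024 = 32*32 patches, 16 = 4*2*2 values per patch).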
int64_t N = x->ne[3];
int64_t C = x->ne[2];
int64_t H = x->ne[1];
int64_t W = x->ne[0];
int64_t h = H / ph;
int64_t w = W / pw;
GGML_ASSERT(h * ph == H && w * pw == W);
x = ggml_reshape_4d(ctx, x, pw, w, ph, h * C * N); // [N*C*h, ph, w, pw]
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*h, w, ph, pw]
x = ggml_reshape_4d(ctx, x, pw * ph, w * h, C, N); // [N, C, h*w, ph*pw]
if (patch_last) {
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N, h*w, C, ph*pw]
x = ggml_reshape_3d(ctx, x, pw * ph * C, w * h, N); // [N, h*w, C*ph*pw]
} else {
x = ggml_cont(ctx, ggml_ext_torch_permute(ctx, x, 2, 0, 1, 3)); // [N, h*w, C, ph*pw]
x = ggml_reshape_3d(ctx, x, C * pw * ph, w * h, N); // [N, h*w, ph*pw*C]
}
return x;
}
ggml_tensor* unpatchify(ggml_context* ctx,
ggml_tensor* x,
int64_t h,
int64_t w,
int ph,
int pw,
bool patch_last = true) {
// x: [N, h*w, C*ph*pw] if patch_last else [N, h*w, ph*pw*C]
// return: [N, C, H, W]
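    // Worked example: x = [1, 1024, 16] with h=w=32, ph=pw=2 -> C = 16/(2*2) = 4,
    // so the result is [1, 4, 64, 64] (the inverse of the patchify example).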
int64_t N = x->ne[2];
int64_t C = x->ne[0] / ph / pw;
int64_t H = h * ph;
int64_t W = w * pw;
GGML_ASSERT(C * ph * pw == x->ne[0]);
if (patch_last) {
x = ggml_reshape_4d(ctx, x, pw * ph, C, w * h, N); // [N, h*w, C, ph*pw]
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N, C, h*w, ph*pw]
} else {
x = ggml_reshape_4d(ctx, x, C, pw * ph, w * h, N); // [N, h*w, ph*pw, C]
x = ggml_cont(ctx, ggml_permute(ctx, x, 2, 0, 1, 3)); // [N, C, h*w, ph*pw]
}
x = ggml_reshape_4d(ctx, x, pw, ph, w, h * C * N); // [N*C*h, w, ph, pw]
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*h, ph, w, pw]
x = ggml_reshape_4d(ctx, x, W, H, C, N); // [N, C, h*ph, w*pw]
return x;
}
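// Pads H and W up to the next multiple of the patch size, e.g. a 30x30
// latent with ph=pw=4 gets pad_h=pad_w=2 and comes out 32x32; inputs that
// are already aligned pass through unchanged (pad of 0).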
ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx,
ggml_tensor* x,
int ph,
int pw) {
int64_t W = x->ne[0];
int64_t H = x->ne[1];
int pad_h = (ph - H % ph) % ph;
int pad_w = (pw - W % pw) % pw;
x = ggml_ext_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
return x;
}
ggml_tensor* pad_and_patchify(GGMLRunnerContext* ctx,
ggml_tensor* x,
int ph,
int pw,
bool patch_last = true) {
x = pad_to_patch_size(ctx, x, ph, pw);
x = patchify(ctx->ggml_ctx, x, ph, pw, patch_last);
return x;
}
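// Inverse of pad_and_patchify: unpatchifies at the padded resolution, then
// slices the result back to the original HxW (e.g. 32x32 -> 30x30 when
// ph=pw=4 and the input was padded from 30x30).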
ggml_tensor* unpatchify_and_crop(ggml_context* ctx,
ggml_tensor* x,
int64_t H,
int64_t W,
int ph,
int pw,
bool patch_last = true) {
int pad_h = (ph - H % ph) % ph;
int pad_w = (pw - W % pw) % pw;
int64_t h = ((H + pad_h) / ph);
int64_t w = ((W + pad_w) / pw);
x = unpatchify(ctx, x, h, w, ph, pw, patch_last); // [N, C, H + pad_h, W + pad_w]
x = ggml_ext_slice(ctx, x, 1, 0, H); // [N, C, H, W + pad_w]
x = ggml_ext_slice(ctx, x, 0, 0, W); // [N, C, H, W]
return x;
}
} // namespace DiT
#endif // __COMMON_DIT_HPP__

Some files were not shown because too many files have changed in this diff.