feat: support backend-specific max-vram budgets

sync: update ggml (#1656)
docs: refresh README guide links
2026-06-15 02:37:31 +00:00 · 2026-06-14 22:46:32 +08:00 · 2026-06-14 20:45:05 +08:00 · 2026-06-14 17:58:58 +08:00 · 2026-06-14 17:30:23 +08:00 · 2026-06-14 16:58:37 +08:00
176 changed files with 31788 additions and 4347867 deletions
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@ -0,0 +1,15 @@
+## Summary
+
+<!-- Describe what changed and why. Keep the PR focused on one clear change. -->
+
+## Related Issue / Discussion
+
+<!-- Link related issues, discussions, or previous PRs if applicable. -->
+
+## Additional Information
+
+<!-- Add verification notes, screenshots, sample output, or other context when applicable. -->
+
+## Checklist
+
+- [ ] I have read and confirmed this PR follows the [contribution guidelines](https://github.com/leejet/stable-diffusion.cpp/blob/master/CONTRIBUTING.md).
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -14,6 +14,8 @@ on:
    paths:
      [
        ".github/workflows/**",
+        ".dockerignore",
+        "Dockerfile*",
        "**/CMakeLists.txt",
        "**/Makefile",
        "**/*.h",
@ -29,6 +31,8 @@ on:
    paths:
      [
        ".github/workflows/**",
+        ".dockerignore",
+        "Dockerfile*",
        "**/CMakeLists.txt",
        "**/Makefile",
        "**/*.h",
@ -135,7 +139,7 @@ jobs:
        id: depends
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential libvulkan-dev glslc
+          sudo apt-get install build-essential libvulkan-dev glslc spirv-headers

      - name: Build
        id: cmake_build
@ -176,7 +180,8 @@ jobs:

  build-and-push-docker-images:
    name: Build and push container images
-    runs-on: ubuntu-latest
+    if: ${{ github.event_name != 'pull_request' }}
+    runs-on: ${{ matrix.runner }}

    permissions:
      contents: read
@ -188,6 +193,20 @@ jobs:
    strategy:
      matrix:
        variant: [musa, sycl, vulkan, cuda]
+        platform: [linux/amd64]
+        runner: [ubuntu-latest]
+        build-args: [""]
+        tag-suffix: [""]
+        include:
+          - variant: cuda
+            platform: linux/arm64
+            runner: ubuntu-24.04-arm
+            tag-suffix: "-spark"
+            build-args: |
+              CUDA_VERSION=13.0.0
+              UBUNTU_VERSION=24.04
+              CUDA_ARCHITECTURES=121
+              GGML_CUDA_FA_ALL_QUANTS=ON

    env:
      REGISTRY: ghcr.io
@ -242,12 +261,13 @@ jobs:
        uses: docker/build-push-action@v6
        with:
          context: .
-          platforms: linux/amd64
+          platforms: ${{ matrix.platform }}
          push: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
          file: Dockerfile.${{ matrix.variant }}
-          tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.BRANCH_NAME }}-${{ matrix.variant }}
+          tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.BRANCH_NAME }}-${{ matrix.variant }}${{ matrix.tag-suffix }}
          labels: ${{ steps.meta.outputs.labels }}
          annotations: ${{ steps.meta.outputs.annotations }}
+          build-args: ${{ matrix.build-args }}

  macOS-latest-cmake:
    runs-on: macos-latest
@ -443,12 +463,129 @@ jobs:
          path: |
            sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip

+  windows-latest-rocm:
+    runs-on: windows-2022
+
+    env:
+      ROCM_VERSION: "7.13.0"
+      GPU_TARGETS: "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1152;gfx1200;gfx1201"
+
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: recursive
+
+      - name: Setup Node
+        uses: actions/setup-node@v4
+        with:
+          node-version: 20
+
+      - name: Setup pnpm
+        uses: pnpm/action-setup@v4
+        with:
+          version: 10.15.1
+
+      - name: Cache ROCm Installation
+        id: cache-rocm
+        uses: actions/cache@v4
+        with:
+          path: C:\TheRock\build
+          key: rocm-${{ env.ROCM_VERSION }}-gfx1151-${{ runner.os }}
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: windows-latest-rocm-${{ env.ROCM_VERSION }}-x64
+          evict-old-files: 1d
+
+      - name: Install ROCm
+        if: steps.cache-rocm.outputs.cache-hit != 'true'
+        run: |
+          $ErrorActionPreference = "Stop"
+          write-host "Downloading AMD ROCm ${{ env.ROCM_VERSION }} tarball"
+          Invoke-WebRequest -Uri "https://repo.amd.com/rocm/tarball/therock-dist-windows-gfx1151-${{ env.ROCM_VERSION }}.tar.gz" -OutFile "${env:RUNNER_TEMP}\rocm.tar.gz"
+          write-host "Extracting ROCm tarball"
+          mkdir C:\TheRock\build -Force
+          tar -xzf "${env:RUNNER_TEMP}\rocm.tar.gz" -C C:\TheRock\build --strip-components=1
+          write-host "Completed ROCm extraction"
+
+      - name: Setup ROCm Environment
+        run: |
+          $rocmPath = "C:\TheRock\build"
+          echo "HIP_PATH=$rocmPath" >> $env:GITHUB_ENV
+          echo "HIP_DEVICE_LIB_PATH=$rocmPath\lib\llvm\amdgcn\bitcode" >> $env:GITHUB_ENV
+          echo "HIP_PLATFORM=amd" >> $env:GITHUB_ENV
+          echo "LLVM_PATH=$rocmPath\lib\llvm" >> $env:GITHUB_ENV
+          echo "$rocmPath\bin" >> $env:GITHUB_PATH
+          echo "$rocmPath\lib\llvm\bin" >> $env:GITHUB_PATH
+
+      - name: Build
+        run: |
+          mkdir build
+          cd build
+          cmake .. `
+            -G "Unix Makefiles" `
+            -DCMAKE_PREFIX_PATH="${env:HIP_PATH}" `
+            -DSD_HIPBLAS=ON `
+            -DSD_BUILD_SHARED_LIBS=ON `
+            -DGGML_NATIVE=OFF `
+            -DCMAKE_C_COMPILER="${env:HIP_PATH}\lib\llvm\bin\clang.exe" `
+            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\lib\llvm\bin\clang++.exe" `
+            -DCMAKE_HIP_COMPILER="${env:HIP_PATH}\lib\llvm\bin\clang.exe" `
+            -DHIP_PATH="${env:HIP_PATH}" `
+            -DCMAKE_BUILD_TYPE=Release `
+            -DGPU_TARGETS="${{ env.GPU_TARGETS }}"
+          cmake --build . --config Release --parallel ${env:NUMBER_OF_PROCESSORS}
+
+      - name: Get commit hash
+        id: commit
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: pr-mpt/actions-commit-hash@v2
+
+      - name: Pack artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          $ErrorActionPreference = "Stop"
+          $dst = "build\bin"
+          $rocmBin = Join-Path "${env:HIP_PATH}" "bin"
+          $requiredRocmPaths = @(
+            (Join-Path $rocmBin "rocblas.dll"),
+            (Join-Path $rocmBin "rocblas\library")
+          )
+          foreach ($path in $requiredRocmPaths) {
+            if (!(Test-Path $path)) {
+              throw "Missing ROCm runtime dependency: $path"
+            }
+          }
+
+          foreach ($pattern in @("rocblas*.dll", "hipblas*.dll", "libhipblas*.dll")) {
+            Copy-Item -Path (Join-Path $rocmBin $pattern) -Destination $dst -Force -ErrorAction SilentlyContinue
+          }
+
+          foreach ($dir in @("rocblas", "hipblaslt")) {
+            $src = Join-Path $rocmBin $dir
+            if (Test-Path $src) {
+              Copy-Item -Path $src -Destination $dst -Recurse -Force
+            }
+          }
+
+          7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-${{ env.ROCM_VERSION }}-x64.zip .\build\bin\*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-${{ env.ROCM_VERSION }}-x64.zip
+          path: |
+            sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-${{ env.ROCM_VERSION }}-x64.zip
+
  windows-latest-cmake-hip:
    runs-on: windows-2022

    env:
-      HIPSDK_INSTALLER_VERSION: "25.Q3"
-      GPU_TARGETS: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
+      HIPSDK_INSTALLER_VERSION: "26.Q1"
+      ROCM_VERSION: "7.1.1"
+      GPU_TARGETS: "gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"

    steps:
      - uses: actions/checkout@v3
@ -483,7 +620,7 @@ jobs:
        run: |
          $ErrorActionPreference = "Stop"
          write-host "Downloading AMD HIP SDK Installer"
-          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
          write-host "Installing AMD HIP SDK"
          $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
          $completed = $proc.WaitForExit(600000)
@ -536,47 +673,75 @@ jobs:
        run: |
          md "build\bin\rocblas\library\"
          md "build\bin\hipblaslt\library"
-          cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
-          cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\libhipblas.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\libhipblaslt.dll" "build\bin\"
          cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
          cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
          cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\"
-          7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip .\build\bin\*
+          7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-${{ env.ROCM_VERSION }}-x64.zip .\build\bin\*

      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        uses: actions/upload-artifact@v4
        with:
-          name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
+          name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-${{ env.ROCM_VERSION }}-x64.zip
          path: |
-            sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
+            sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-${{ env.ROCM_VERSION }}-x64.zip

  ubuntu-latest-rocm:
-    runs-on: ubuntu-latest
-    container: rocm/dev-ubuntu-24.04:7.2
+    runs-on: ubuntu-24.04

    env:
-      ROCM_VERSION: "7.2"
      UBUNTU_VERSION: "24.04"
-      GPU_TARGETS: "gfx1151;gfx1150;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
+
+    strategy:
+      matrix:
+        include:
+          # Served by the "Setup Legacy ROCm" step (apt repository install).
+          - ROCM_VERSION: "7.2.1"
+            gpu_targets: "gfx908;gfx90a;gfx942;gfx1030;gfx1031;gfx1032;gfx1100;gfx1101;gfx1102;gfx1151;gfx1150;gfx1200;gfx1201"
+            build: "x64"
+          # Served by the "Setup TheRock" step (tarball install).
+          - ROCM_VERSION: "7.13.0"
+            gpu_targets: "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1010;gfx1011;gfx1012;gfx1030;gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1152;gfx1200;gfx1201"
+            build: "x64"

    steps:
-      - run: apt-get update && apt-get install -y git
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
        with:
          submodules: recursive

-      - name: Setup Node
-        uses: actions/setup-node@v4
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
        with:
-          node-version: 20
+          key: ubuntu-rocm-cmake-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}
+          evict-old-files: 1d

-      - name: Setup pnpm
-        uses: pnpm/action-setup@v4
-        with:
-          version: 10.15.1
+      - name: Dependencies
+        id: depends
+        run: |
+          # Refresh the package indexes first: the runner image's cached lists
+          # can be stale, which makes the install fail with 404s.
+          sudo apt-get update
+          sudo apt-get install -y build-essential cmake wget zip ninja-build
+
+      - name: Setup Legacy ROCm
+        if: matrix.ROCM_VERSION == '7.2.1'
+        id: legacy_env
+        run: |
+          sudo mkdir --parents --mode=0755 /etc/apt/keyrings
+          wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \
+            gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
+
+          sudo tee /etc/apt/sources.list.d/rocm.list << EOF
+          deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${{ matrix.ROCM_VERSION }} noble main
+          EOF
+
+          sudo tee /etc/apt/preferences.d/rocm-pin-600 << EOF
+          Package: *
+          Pin: release o=repo.radeon.com
+          Pin-Priority: 600
+          EOF
+
+          sudo apt update
+          sudo apt-get install -y libssl-dev rocm-hip-sdk

      - name: Free disk space
        run: |
@ -591,51 +756,29 @@ jobs:
          sudo rm -rf /var/lib/apt/lists/* || true
          sudo apt clean

-      - name: Dependencies
-        id: depends
+      - name: Setup TheRock
+        if: matrix.ROCM_VERSION != '7.2.1'
+        id: therock_env
        run: |
-          sudo apt-get update
-          sudo apt install -y \
-            cmake \
-            hip-dev \
-            hipblas-dev \
-            ninja-build \
-            rocm-dev \
-            zip
-          # Clean apt caches to recover disk space
-          sudo apt clean
-          sudo rm -rf /var/lib/apt/lists/* || true
+          wget https://repo.amd.com/rocm/tarball/therock-dist-linux-gfx1151-${{ matrix.ROCM_VERSION }}.tar.gz
+          mkdir install
+          tar -xf *.tar.gz -C install
+          export ROCM_PATH=$(pwd)/install
+          echo ROCM_PATH=$ROCM_PATH >> $GITHUB_ENV
+          echo PATH=$PATH:$ROCM_PATH/bin >> $GITHUB_ENV
+          echo LD_LIBRARY_PATH=$ROCM_PATH/lib:$ROCM_PATH/llvm/lib:$ROCM_PATH/lib/rocprofiler-systems >> $GITHUB_ENV

-      - name: Setup ROCm Environment
-        run: |
-          # Add ROCm to PATH for current session
-          echo "/opt/rocm/bin" >> $GITHUB_PATH
+      # setup-node installs into /opt/hostedtoolcache, which is removed above.
+      # Keep Node/pnpm setup after disk cleanup so the server frontend can be embedded.
+      - name: Setup Node
+        uses: actions/setup-node@v4
+        with:
+          node-version: 20

-          # Build regex pattern from ${{ env.GPU_TARGETS }} (match target as substring)
-          TARGET_REGEX="($(printf '%s' "${{ env.GPU_TARGETS }}" | sed 's/;/|/g'))"
-
-          # Remove library files for architectures we're not building for to save disk space
-          echo "Cleaning up unneeded architecture files..."
-          cd /opt/rocm/lib/rocblas/library
-          # Keep only our target architectures
-          for file in *; do
-            if printf '%s' "$file" | grep -q 'gfx'; then
-              if ! printf '%s' "$file" | grep -Eq "$TARGET_REGEX"; then
-                echo "Removing $file" &&
-                sudo rm -f "$file";
-              fi
-            fi
-          done
-
-          cd /opt/rocm/lib/hipblaslt/library
-          for file in *; do
-            if printf '%s' "$file" | grep -q 'gfx'; then
-              if ! printf '%s' "$file" | grep -Eq "$TARGET_REGEX"; then
-                echo "Removing $file" &&
-                sudo rm -f "$file";
-              fi
-            fi
-          done
+      - name: Setup pnpm
+        uses: pnpm/action-setup@v4
+        with:
+          version: 10.15.1

      - name: Build
        id: cmake_build
@ -643,12 +786,12 @@ jobs:
          mkdir build
          cd build
          cmake .. -G Ninja \
-            -DCMAKE_CXX_COMPILER=amdclang++ \
-            -DCMAKE_C_COMPILER=amdclang \
+            -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
+            -DCMAKE_HIP_FLAGS="-mllvm --amdgpu-unroll-threshold-local=600" \
            -DCMAKE_BUILD_TYPE=Release \
            -DSD_HIPBLAS=ON \
-            -DGPU_TARGETS="${{ env.GPU_TARGETS }}" \
-            -DAMDGPU_TARGETS="${{ env.GPU_TARGETS }}" \
+            -DHIP_PLATFORM=amd \
+            -DGPU_TARGETS="${{ matrix.gpu_targets }}" \
            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
            -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
            -DSD_BUILD_SHARED_LIBS=ON
@ -667,16 +810,6 @@ jobs:
          cp ggml/LICENSE ./build/bin/ggml.txt
          cp LICENSE ./build/bin/stable-diffusion.cpp.txt

-          # Move ROCm runtime libraries (to avoid double space consumption)
-          sudo mv /opt/rocm/lib/librocsparse.so* ./build/bin/
-          sudo mv /opt/rocm/lib/libhsa-runtime64.so* ./build/bin/
-          sudo mv /opt/rocm/lib/libamdhip64.so* ./build/bin/
-          sudo mv /opt/rocm/lib/libhipblas.so* ./build/bin/
-          sudo mv /opt/rocm/lib/libhipblaslt.so* ./build/bin/
-          sudo mv /opt/rocm/lib/librocblas.so* ./build/bin/
-          sudo mv /opt/rocm/lib/rocblas/ ./build/bin/
-          sudo mv /opt/rocm/lib/hipblaslt/ ./build/bin/
-
      - name: Fetch system info
        id: system-info
        run: |
@ -691,15 +824,15 @@ jobs:
        run: |
          cp ggml/LICENSE ./build/bin/ggml.txt
          cp LICENSE ./build/bin/stable-diffusion.cpp.txt
-          zip -y -r sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip ./build/bin
+          zip -y -r sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm-${{ matrix.ROCM_VERSION }}.zip ./build/bin

      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        uses: actions/upload-artifact@v4
        with:
-          name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip
+          name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm-${{ matrix.ROCM_VERSION }}.zip
          path: |
-            sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip
+            sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm-${{ matrix.ROCM_VERSION }}.zip

  release:
    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@ -714,6 +847,7 @@ jobs:
      - macOS-latest-cmake
      - windows-latest-cmake
      - windows-latest-cmake-hip
+      - windows-latest-rocm

    steps:
      - name: Clone
--- a/.github/workflows/stale-prs.yml
+++ b/.github/workflows/stale-prs.yml
@ -0,0 +1,55 @@
+name: Close inactive PRs
+
+on:
+  schedule:
+    # Run daily. GitHub cron schedules use UTC.
+    - cron: "30 1 * * *"
+  workflow_dispatch:
+    inputs:
+      debug_only:
+        description: "Dry run: log intended actions without changing PRs"
+        required: false
+        default: false
+        type: boolean
+
+permissions:
+  issues: write
+  pull-requests: write
+
+concurrency:
+  group: ${{ github.workflow }}
+  cancel-in-progress: false
+
+jobs:
+  stale-prs:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Mark and close inactive PRs
+        uses: actions/stale@v10
+        with:
+          days-before-issue-stale: -1
+          days-before-issue-close: -1
+
+          days-before-pr-stale: 365
+          days-before-pr-close: 7
+
+          stale-pr-label: pr:inactive
+          close-pr-label: pr:auto-closed
+          exempt-pr-labels: pr:keep-open
+
+          stale-pr-message: >
+            This PR has been inactive for 365 days. If there is no new activity
+            within 7 days, it will be closed automatically. Comment, push new
+            commits, or remove the pr:inactive label to keep it open. Add
+            pr:keep-open to exempt it from future inactive PR cleanup.
+
+          close-pr-message: >
+            Closing this PR because it has had no activity for 7 days after
+            being marked inactive. If this is still useful or ready to move
+            forward, feel free to reopen it with fresh context or updated
+            details. Sorry for any inconvenience.
+
+          remove-pr-stale-when-updated: true
+          delete-branch: false
+          operations-per-run: 100
+          debug-only: ${{ github.event_name == 'workflow_dispatch' && inputs.debug_only || false }}
--- a/.gitmodules
+++ b/.gitmodules
@ -1,6 +1,6 @@
 [submodule "ggml"]
    path = ggml
-	url = https://github.com/ggml-org/ggml.git
+	url = https://github.com/leejet/ggml.git
 [submodule "examples/server/frontend"]
 	path = examples/server/frontend
 	url = https://github.com/leejet/sdcpp-webui.git
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -11,11 +11,42 @@ endif()
 if (MSVC)
    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
    add_compile_definitions(_SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING)
+    add_compile_options(
+        $<$<COMPILE_LANGUAGE:C>:/MP>
+        $<$<COMPILE_LANGUAGE:C>:/utf-8>
+        $<$<COMPILE_LANGUAGE:CXX>:/MP>
+        $<$<COMPILE_LANGUAGE:CXX>:/utf-8>
+    )
 endif()

 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

+if(APPLE)
+    # sd_set_macos_rpaths(<target>)
+    #
+    # Gives <target> relative runtime search paths on macOS:
+    #   - EXECUTABLE:                      @executable_path, @executable_path/../lib
+    #   - SHARED_LIBRARY / MODULE_LIBRARY: @loader_path, @loader_path/../lib,
+    #     and additionally an "@rpath" install name that is also used in the
+    #     build tree.
+    # Any other target type is left unchanged (early return).
+    # The same path list is applied as both BUILD_RPATH and INSTALL_RPATH.
+    function(sd_set_macos_rpaths target)
+        get_target_property(target_type ${target} TYPE)
+        if(target_type STREQUAL "EXECUTABLE")
+            set(runtime_paths "@executable_path" "@executable_path/../lib")
+        elseif(target_type STREQUAL "SHARED_LIBRARY" OR target_type STREQUAL "MODULE_LIBRARY")
+            set(runtime_paths "@loader_path" "@loader_path/../lib")
+            set_target_properties(${target} PROPERTIES
+                MACOSX_RPATH ON
+                INSTALL_NAME_DIR "@rpath"
+                BUILD_WITH_INSTALL_NAME_DIR ON
+            )
+        else()
+            return()
+        endif()
+
+        # Release artifacts zip the build output directly, so keep macOS rpaths relocatable.
+        set_target_properties(${target} PROPERTIES
+            BUILD_RPATH "${runtime_paths}"
+            INSTALL_RPATH "${runtime_paths}"
+            BUILD_WITH_INSTALL_RPATH ON
+        )
+    endfunction()
+endif()
+
 if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
    set(SD_STANDALONE ON)
 else()
@ -65,40 +96,46 @@ option(SD_BUILD_SHARED_GGML_LIB      "sd: build ggml as a separate shared lib" O
 option(SD_USE_SYSTEM_GGML            "sd: use system-installed GGML library" OFF)
 #option(SD_BUILD_SERVER               "sd: build server example"                           ON)

+set(CMAKE_C_STANDARD 11)
+set(CMAKE_C_STANDARD_REQUIRED true)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED true)
+
 if(SD_CUDA)
    message("-- Use CUDA as backend stable-diffusion")
    set(GGML_CUDA ON)
-    add_definitions(-DSD_USE_CUDA)
 endif()

 if(SD_METAL)
    message("-- Use Metal as backend stable-diffusion")
    set(GGML_METAL ON)
-    add_definitions(-DSD_USE_METAL)
 endif()

 if (SD_VULKAN)
    message("-- Use Vulkan as backend stable-diffusion")
    set(GGML_VULKAN ON)
-    add_definitions(-DSD_USE_VULKAN)
 endif ()

 if (SD_OPENCL)
    message("-- Use OpenCL as backend stable-diffusion")
    set(GGML_OPENCL ON)
-    add_definitions(-DSD_USE_OPENCL)
 endif ()

 if (SD_HIPBLAS)
    message("-- Use HIPBLAS as backend stable-diffusion")
    set(GGML_HIP ON)
-    add_definitions(-DSD_USE_CUDA)
+    # ggml-hip's device-stub objects must be position-independent, or the
+    # default-PIE sd-cli link fails with `relocation R_X86_64_32 ... cannot be
+    # used when making a PIE object` on distros that default to PIE
+    # (Ubuntu 24.04, Fedora 40+, Debian 12+). The shared-library branch below
+    # already sets this; the static build (the HIP default) did not.
+    set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 endif ()

 if(SD_MUSA)
    message("-- Use MUSA as backend stable-diffusion")
    set(GGML_MUSA ON)
-    add_definitions(-DSD_USE_CUDA)
 endif()

 if(SD_WEBP)
@ -108,7 +145,8 @@ if(SD_WEBP)
          "Or link against system library:\n  cmake (...) -DSD_USE_SYSTEM_WEBP=ON")
    endif()
    if(SD_USE_SYSTEM_WEBP)
-        find_package(WebP REQUIRED)
+        find_package(WebP)
+        if(WebP_FOUND)
            add_library(webp ALIAS WebP::webp)
            # libwebp CMake target naming is not consistent across versions/distros.
            # Some export WebP::libwebpmux, others export WebP::webpmux.
@ -122,6 +160,14 @@ if(SD_WEBP)
                    "Expected WebP::libwebpmux or WebP::webpmux."
                )
            endif()
+        else()
+            find_package(PkgConfig REQUIRED)
+            pkg_check_modules(WebP REQUIRED IMPORTED_TARGET GLOBAL libwebp)
+            pkg_check_modules(WebPMux REQUIRED IMPORTED_TARGET GLOBAL libwebpmux)
+            link_libraries(PkgConfig::WebP)
+            link_libraries(PkgConfig::WebPMux)
+            add_library(libwebpmux ALIAS PkgConfig::WebPMux)
+        endif()
    endif()
 endif()

@ -135,6 +181,13 @@ if(SD_WEBM)
          "Or link against system library:\n  cmake (...) -DSD_USE_SYSTEM_WEBM=ON")
    endif()
    if(SD_USE_SYSTEM_WEBM)
+        # Probe for libwebm through pkg-config when it is available.
+        # The lookup is intentionally NOT `REQUIRED`: a missing libwebm.pc must
+        # leave WebM_FOUND false so the manual find_path/find_library fallback
+        # in the else() branch below can still run instead of aborting here.
+        find_package(PkgConfig)
+        if(PkgConfig_FOUND)
+            pkg_check_modules(WebM IMPORTED_TARGET GLOBAL libwebm)
+        endif()
+        if(PkgConfig_FOUND AND WebM_FOUND)
+            link_libraries(PkgConfig::WebM)
+        else()
            find_path(WEBM_INCLUDE_DIR
                NAMES mkvmuxer/mkvmuxer.h mkvparser/mkvparser.h common/webmids.h
                PATH_SUFFIXES webm
@ -149,13 +202,37 @@ if(SD_WEBM)
                INTERFACE_INCLUDE_DIRECTORIES "${WEBM_INCLUDE_DIR}")
        endif()
    endif()
+endif()
+
+if (SD_RPC)
+    message("-- Use RPC as backend stable-diffusion")
+    set(GGML_RPC ON)
+    add_definitions(-DSD_USE_RPC)
+endif ()

 set(SD_LIB stable-diffusion)

-file(GLOB SD_LIB_SOURCES
+file(GLOB SD_LIB_SOURCES CONFIGURE_DEPENDS
    "src/*.h"
    "src/*.cpp"
    "src/*.hpp"
+    "src/conditioning/*.h"
+    "src/conditioning/*.cpp"
+    "src/conditioning/*.hpp"
+    "src/core/*.h"
+    "src/core/*.cpp"
+    "src/core/*.hpp"
+    "src/extensions/*.h"
+    "src/extensions/*.cpp"
+    "src/extensions/*.hpp"
+    "src/model/*/*.h"
+    "src/model/*/*.cpp"
+    "src/model/*/*.hpp"
+    "src/runtime/*.h"
+    "src/runtime/*.cpp"
+    "src/runtime/*.hpp"
+    "src/model_io/*.h"
+    "src/model_io/*.cpp"
    "src/tokenizers/*.h"
    "src/tokenizers/*.cpp"
    "src/tokenizers/vocab/*.h"
@ -212,11 +289,14 @@ else()
    add_library(${SD_LIB} STATIC ${SD_LIB_SOURCES})
 endif()

+if(APPLE)
+    sd_set_macos_rpaths(${SD_LIB})
+endif()
+
 if(SD_SYCL)
    message("-- Use SYCL as backend stable-diffusion")
    set(GGML_SYCL ON)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
-    add_definitions(-DSD_USE_SYCL)
    # disable fast-math on host, see:
    # https://www.intel.com/content/www/us/en/docs/cpp-compiler/developer-guide-reference/2021-10/fp-model-fp.html
    if (WIN32)
@ -253,6 +333,7 @@ add_subdirectory(thirdparty)

 target_link_libraries(${SD_LIB} PUBLIC ggml zip)
 target_include_directories(${SD_LIB} PUBLIC . src include)
+target_include_directories(${SD_LIB} PRIVATE src/core)
 target_include_directories(${SD_LIB} PUBLIC . thirdparty)
 target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17)

--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -0,0 +1,67 @@
+# Contributing
+
+This document collects general contribution conventions for this repository.
+
+## Before You Start
+
+Before opening a PR, please search existing PRs to avoid duplicating ongoing work.
+
+For large-scale refactors or changes with broad impact, please open an issue first to discuss the approach before submitting a PR.
+
+If you want to update a third-party dependency, please open an issue first instead of submitting a direct PR. See [Dependency Updates](#dependency-updates) for details.
+
+## Pull Requests
+
+Keep each PR focused on one clear change. Large or overly complex PRs are harder to review and may not be merged.
+
+Follow Conventional Commit-style subjects seen in history: `feat:`, `fix:`, `refactor:`, `ci:`, `docs:`, `chore:`. Keep subjects imperative and scoped.
+
+PRs should include:
+
+- What changed and why (short problem/solution summary).
+- Verification evidence when applicable (commands and key outputs).
+- Linked issue/PR context when applicable.
+- Screenshots or sample outputs for UI/visual behavior changes.
+
+## Code Style
+
+Format code according to the repository style before submitting changes.
+
+Formatting follows `.clang-format` (Chromium base, 4-space indent, no tabs). Run `format-code.sh` before opening a PR. Keep C++ standard at C++17-compatible patterns used in this repo.
+
+Naming conventions:
+
+- Use `PascalCase` for class/struct/type names.
+- In `PascalCase` names, preserve common abbreviations in uppercase, for example `SD`, `API`, `HTTP`, `JSON`, `RGB`, `VAE`, `TAE`, `LoRA`, and `WebP`.
+- Use `snake_case` for functions, methods, variables, and file names unless an existing API requires a different style.
+- Use a trailing underscore for private data member names, for example `hidden_size_` or `tokenizer_`.
+- Use `.h` for C and C++ header files. Do not introduce new `.hpp` headers.
+- Use macro-based header include guards instead of `#pragma once`.
+- Format header include guards as `__SD_{PATH}__`, where `{PATH}` is the header path relative to `src/` in uppercase snake case, with the `.h` extension written as a trailing `_H`. For example, `src/sample.h` should use `__SD_SAMPLE_H__`.
+- Do not introduce anonymous namespaces in new or modified code; prefer `static` file-local functions/variables or an explicit named namespace when scoping is needed.
+- In `class`/`struct` definitions, place data members before member functions unless an existing type already clearly follows a different pattern.
+- Keep `test_*.cpp` / `test_*.py` naming for tests.
+
+Some older code in the project may not fully follow the current conventions. Please do not submit PRs that only rewrite existing code to match style rules.
+
+When adding or modifying model implementations, follow the model config and weight detection conventions in [docs/model_config.md](docs/model_config.md).
+
+## AI-Assisted Contributions
+
+AI tools may be used to assist development, but contributors are responsible for the quality and correctness of the submitted code.
+
+If any part of a contribution was generated with AI assistance, the contributor must perform a thorough human review before submitting the PR and understand every changed line.
+
+Do not list AI tools as co-authors. The human contributor is the sole responsible author of the submitted code.
+
+Please do not submit AI-generated code that you do not understand, and do not include meaningless experiments, temporary test code, or unrelated generated output in a PR.
+
+## Dependency Updates
+
+Do not submit PRs that update `ggml`. `ggml` updates are performed only after local validation by the maintainer.
+
+Other third-party dependencies are not updated unless necessary. If you want to update a dependency, please open an issue first instead of submitting a direct PR.
+
+## Security & Configuration
+
+Do not commit model weights, secrets, or local absolute paths. Keep large binaries out of git unless intentionally tracked release assets.
--- a/13
+++ b/13
@ -2,7 +2,18 @@ ARG UBUNTU_VERSION=24.04

 FROM ubuntu:$UBUNTU_VERSION AS build

-RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake
+# sd-server embeds the web UI at build time, so the build image needs Node/pnpm.
+RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake ca-certificates curl gnupg && \
+    mkdir -p /etc/apt/keyrings && \
+    curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key -o /tmp/nodesource-repo.gpg.key && \
+    gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg /tmp/nodesource-repo.gpg.key && \
+    rm /tmp/nodesource-repo.gpg.key && \
+    echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" > /etc/apt/sources.list.d/nodesource.list && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends nodejs && \
+    npm install -g pnpm@10.15.1 && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*

 WORKDIR /sd.cpp

--- a/Dockerfile.cuda
+++ b/Dockerfile.cuda
@ -3,14 +3,31 @@ ARG UBUNTU_VERSION=24.04

 FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu${UBUNTU_VERSION} AS build

-RUN apt-get update && apt-get install -y --no-install-recommends build-essential git ccache cmake
+# sd-server embeds the web UI at build time, so the build image needs Node/pnpm.
+RUN apt-get update && apt-get install -y --no-install-recommends build-essential git ccache cmake ca-certificates curl gnupg && \
+    mkdir -p /etc/apt/keyrings && \
+    curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key -o /tmp/nodesource-repo.gpg.key && \
+    gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg /tmp/nodesource-repo.gpg.key && \
+    rm /tmp/nodesource-repo.gpg.key && \
+    echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" > /etc/apt/sources.list.d/nodesource.list && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends nodejs && \
+    npm install -g pnpm@10.15.1 && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*

 WORKDIR /sd.cpp

 COPY . .

 ARG CUDACXX=/usr/local/cuda/bin/nvcc
-RUN cmake . -B ./build -DSD_CUDA=ON
+ARG CUDA_ARCHITECTURES=""
+ARG GGML_CUDA_FA_ALL_QUANTS=""
+
+RUN cmake . -B ./build \
+    -DSD_CUDA=ON \
+    ${CUDA_ARCHITECTURES:+-DCMAKE_CUDA_ARCHITECTURES="${CUDA_ARCHITECTURES}"} \
+    ${GGML_CUDA_FA_ALL_QUANTS:+-DGGML_CUDA_FA_ALL_QUANTS=${GGML_CUDA_FA_ALL_QUANTS}}
 RUN cmake --build ./build --config Release -j$(nproc)

 FROM nvidia/cuda:${CUDA_VERSION}-cudnn-runtime-ubuntu${UBUNTU_VERSION} AS runtime
--- a/Dockerfile.musa
+++ b/Dockerfile.musa
@ -3,7 +3,18 @@ ARG UBUNTU_VERSION=22.04

 FROM mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64 as build

-RUN apt-get update && apt-get install -y ccache cmake git
+# sd-server embeds the web UI at build time, so the build image needs Node/pnpm.
+RUN apt-get update && apt-get install -y --no-install-recommends ccache cmake git ca-certificates curl gnupg && \
+    mkdir -p /etc/apt/keyrings && \
+    curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key -o /tmp/nodesource-repo.gpg.key && \
+    gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg /tmp/nodesource-repo.gpg.key && \
+    rm /tmp/nodesource-repo.gpg.key && \
+    echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" > /etc/apt/sources.list.d/nodesource.list && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends nodejs && \
+    npm install -g pnpm@10.15.1 && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*

 WORKDIR /sd.cpp

--- a/Dockerfile.sycl
+++ b/Dockerfile.sycl
@ -1,8 +1,20 @@
-ARG SYCL_VERSION=2025.1.0-0
+# ggml SYCL hardware detection uses BMG G31/WCL architecture enums added in oneAPI 2025.3.
+ARG SYCL_VERSION=2025.3.2-0

 FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS build

-RUN apt-get update && apt-get install -y cmake
+# sd-server embeds the web UI at build time, so the build image needs Node/pnpm.
+RUN apt-get update && apt-get install -y --no-install-recommends cmake ca-certificates curl gnupg && \
+    mkdir -p /etc/apt/keyrings && \
+    curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key -o /tmp/nodesource-repo.gpg.key && \
+    gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg /tmp/nodesource-repo.gpg.key && \
+    rm /tmp/nodesource-repo.gpg.key && \
+    echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" > /etc/apt/sources.list.d/nodesource.list && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends nodejs && \
+    npm install -g pnpm@10.15.1 && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*

 WORKDIR /sd.cpp

--- a/Dockerfile.vulkan
+++ b/Dockerfile.vulkan
@ -2,7 +2,18 @@ ARG UBUNTU_VERSION=24.04

 FROM ubuntu:$UBUNTU_VERSION AS build

-RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake libvulkan-dev glslc
+# sd-server embeds the web UI at build time, so the build image needs Node/pnpm.
+RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake libvulkan-dev glslc spirv-headers ca-certificates curl gnupg && \
+    mkdir -p /etc/apt/keyrings && \
+    curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key -o /tmp/nodesource-repo.gpg.key && \
+    gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg /tmp/nodesource-repo.gpg.key && \
+    rm /tmp/nodesource-repo.gpg.key && \
+    echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" > /etc/apt/sources.list.d/nodesource.list && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends nodejs && \
+    npm install -g pnpm@10.15.1 && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*

 WORKDIR /sd.cpp

--- a/README.md
+++ b/README.md
@ -15,29 +15,18 @@ API and command-line option may change frequently.***

 ## 🔥Important News

+* **2026/06/04** 🚀 stable-diffusion.cpp now supports **Ideogram4**
+* **2026/05/31** 🚀 stable-diffusion.cpp now supports **PiD**
+* **2026/05/27** 🚀 stable-diffusion.cpp now supports **Lens**
+* **2026/05/17** 🚀 stable-diffusion.cpp now supports **LTX-2.3**
 * **2026/04/11** 🚀 stable-diffusion.cpp now uses a brand-new embedded web UI.  
-  👉 Details: [PR #1408](https://github.com/leejet/stable-diffusion.cpp/pull/1408)
-
 * **2026/01/18** 🚀 stable-diffusion.cpp now supports **FLUX.2-klein**  
-  👉 Details: [PR #1193](https://github.com/leejet/stable-diffusion.cpp/pull/1193)
-
 * **2025/12/01** 🚀 stable-diffusion.cpp now supports **Z-Image**  
-  👉 Details: [PR #1020](https://github.com/leejet/stable-diffusion.cpp/pull/1020)
-
 * **2025/11/30** 🚀 stable-diffusion.cpp now supports **FLUX.2-dev**  
-  👉 Details: [PR #1016](https://github.com/leejet/stable-diffusion.cpp/pull/1016)
-
 * **2025/10/13** 🚀 stable-diffusion.cpp now supports **Qwen-Image-Edit / Qwen-Image-Edit 2509**  
-  👉 Details: [PR #877](https://github.com/leejet/stable-diffusion.cpp/pull/877)
-
 * **2025/10/12** 🚀 stable-diffusion.cpp now supports **Qwen-Image**  
-  👉 Details: [PR #851](https://github.com/leejet/stable-diffusion.cpp/pull/851)
-
 * **2025/09/14** 🚀 stable-diffusion.cpp now supports **Wan2.1 Vace**  
-  👉 Details: [PR #819](https://github.com/leejet/stable-diffusion.cpp/pull/819)
-
 * **2025/09/06** 🚀 stable-diffusion.cpp now supports **Wan2.1 / Wan2.2**  
-  👉 Details: [PR #778](https://github.com/leejet/stable-diffusion.cpp/pull/778)

 ## Features

@ -45,30 +34,37 @@ API and command-line option may change frequently.***
 - Super lightweight and without external dependencies
 - Supported models
  - Image Models
-    - SD1.x, SD2.x, [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo)
-    - SDXL, [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo)
+    - [SD1.x, SD2.x, SD-Turbo](./docs/sd.md)
+    - [SDXL, SDXL-Turbo](./docs/sd.md)
    - [Some SD1.x and SDXL distilled models](./docs/distilled_sd.md)
    - [SD3/SD3.5](./docs/sd3.md)
    - [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
    - [FLUX.2-dev/FLUX.2-klein](./docs/flux2.md)
+    - [Lens](./docs/lens.md)
    - [Chroma](./docs/chroma.md)
    - [Chroma1-Radiance](./docs/chroma_radiance.md)
    - [Qwen Image](./docs/qwen_image.md)
+    - [PiD](./docs/pid.md)
+    - [LongCat Image](./docs/longcat_image.md)
    - [Z-Image](./docs/z_image.md)
    - [Ovis-Image](./docs/ovis_image.md)
    - [Anima](./docs/anima.md)
    - [ERNIE-Image](./docs/ernie_image.md)
+    - [HiDream-O1-Image](./docs/hidream_o1_image.md)
+    - [Ideogram4](./docs/ideogram4.md)
  - Image Edit Models
    - [FLUX.1-Kontext-dev](./docs/kontext.md)
    - [Qwen Image Edit series](./docs/qwen_image_edit.md)
+    - [LongCat Image Edit](./docs/longcat_image.md)
  - Video Models
    - [Wan2.1/Wan2.2](./docs/wan.md)
-  - [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
+    - [LTX-2.3](./docs/ltx2.md)
+  - [PhotoMaker](./docs/photo_maker.md) support.
  - Control Net support with SD 1.5
  - LoRA support, same as [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora)
  - Latent Consistency Models support (LCM/LCM-LoRA)
-  - Faster and memory efficient latent decoding with [TAESD](https://github.com/madebyollin/taesd)
-  - Upscale images generated with [ESRGAN](https://github.com/xinntao/Real-ESRGAN)
+  - Faster and memory efficient latent decoding with [TAESD](./docs/taesd.md)
+  - Upscale images generated with [ESRGAN](./docs/esrgan.md)
 - Supported backends
  - CPU (AVX, AVX2 and AVX512 support for x86 architectures)
  - CUDA
@ -77,9 +73,10 @@ API and command-line option may change frequently.***
  - OpenCL
  - SYCL
 - Supported weight formats
-  - Pytorch checkpoint (`.ckpt` or `.pth`)
+  - PyTorch checkpoint (`.ckpt`, `.pth`, or `.pt`)
  - Safetensors (`.safetensors`)
  - GGUF (`.gguf`)
+- Convert mode supports converting model weights to `.gguf` or `.safetensors`
 - Supported platforms
    - Linux
    - Mac OS
@ -131,27 +128,14 @@ API and command-line option may change frequently.***
 ## Performance

 If you want to improve performance or reduce VRAM/RAM usage, please refer to [performance guide](./docs/performance.md).
+For runtime and parameter backend placement, see the [backend selection guide](./docs/backend.md).

 ## More Guides

- [SD1.x/SD2.x/SDXL](./docs/sd.md)
- [SD3/SD3.5](./docs/sd3.md)
- [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
- [FLUX.2-dev/FLUX.2-klein](./docs/flux2.md)
- [FLUX.1-Kontext-dev](./docs/kontext.md)
- [Chroma](./docs/chroma.md)
- [🔥Qwen Image](./docs/qwen_image.md)
- [🔥Qwen Image Edit series](./docs/qwen_image_edit.md)
- [🔥Wan2.1/Wan2.2](./docs/wan.md)
- [🔥Z-Image](./docs/z_image.md)
- [Ovis-Image](./docs/ovis_image.md)
- [Anima](./docs/anima.md)
- [ERNIE-Image](./docs/ernie_image.md)
+- [Backend selection](./docs/backend.md)
+- [RPC](./docs/rpc.md)
 - [LoRA](./docs/lora.md)
 - [LCM/LCM-LoRA](./docs/lcm.md)
- [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)
- [Using ESRGAN to upscale results](./docs/esrgan.md)
- [Using TAESD to faster decoding](./docs/taesd.md)
 - [Docker](./docs/docker.md)
 - [Quantization and GGUF](./docs/quantization_and_gguf.md)
 - [Inference acceleration via caching](./docs/caching.md)
@ -162,6 +146,7 @@ These projects wrap `stable-diffusion.cpp` for easier use in other languages/fra

 * Golang (non-cgo): [seasonjs/stable-diffusion](https://github.com/seasonjs/stable-diffusion)
 * Golang (cgo): [Binozo/GoStableDiffusion](https://github.com/Binozo/GoStableDiffusion)
+* Golang (non-cgo): [l8bloom/gosd](https://github.com/l8bloom/gosd)
 * C#: [DarthAffe/StableDiffusion.NET](https://github.com/DarthAffe/StableDiffusion.NET)
 * Python: [william-murray1204/stable-diffusion-cpp-python](https://github.com/william-murray1204/stable-diffusion-cpp-python)
 * Rust: [newfla/diffusion-rs](https://github.com/newfla/diffusion-rs)
--- a/assets/hidream-o1/dev_example.png
+++ b/assets/hidream-o1/dev_example.png
--- a/assets/ideogram4/example.png
+++ b/assets/ideogram4/example.png
--- a/assets/lens/example.png
+++ b/assets/lens/example.png
--- a/assets/lens/turbo_example.png
+++ b/assets/lens/turbo_example.png
--- a/assets/longcat/example.png
+++ b/assets/longcat/example.png
--- a/assets/ltx2/flf2v.webm
+++ b/assets/ltx2/flf2v.webm
--- a/assets/ltx2/hires_i2v.webm
+++ b/assets/ltx2/hires_i2v.webm
--- a/assets/ltx2/i2v.webm
+++ b/assets/ltx2/i2v.webm
--- a/assets/ltx2/t2v.webm
+++ b/assets/ltx2/t2v.webm
--- a/assets/pid/example.png
+++ b/assets/pid/example.png
--- a/docs/backend.md
+++ b/docs/backend.md
@ -0,0 +1,152 @@
+# Backend selection
+
+`stable-diffusion.cpp` has two backend assignments:
+
+- `--backend` selects the runtime backend used to execute model graphs.
+- `--params-backend` selects where model parameters are kept.
+
+If `--params-backend` is not set, parameters use the same backend as their module runtime backend.
+
+## Syntax
+
+A backend assignment can be a single backend name:
+
+```shell
+sd-cli -m model.safetensors -p "a cat" --backend cpu
+```
+
+This applies to every module that does not have a more specific assignment.
+
+Assignments can also target individual modules:
+
+```shell
+sd-cli -m model.safetensors -p "a cat" --backend te=cpu,vae=cuda0,diffusion=vulkan0
+```
+
+The same syntax is used for parameter placement:
+
+```shell
+sd-cli -m model.safetensors -p "a cat" --backend cuda0 --params-backend te=cpu,vae=cpu
+```
+
+`--params-backend` also accepts the special value `disk`:
+
+```shell
+sd-cli -m model.safetensors -p "a cat" --backend cuda0 --params-backend disk
+```
+
+`--max-vram` can target resolved backend/device names:
+
+```shell
+sd-cli -m model.safetensors -p "a cat" --backend diffusion=cuda0,vae=vulkan0 --max-vram cuda0=6,vulkan0=2
+```
+
+The budget applies to every module running on that backend.
+
+Module names are case-insensitive. Hyphens and underscores in module names are ignored, so `clip_vision`, `clip-vision`, and `clipvision` are equivalent.
+
+`all=`, `default=`, and `*=` can be used to set the default backend inside a mixed assignment:
+
+```shell
+sd-cli -m model.safetensors -p "a cat" --backend all=cuda0,te=cpu
+```
+
+## Modules
+
+| Module | Purpose | Accepted names |
+| --- | --- | --- |
+| `diffusion` | UNet, DiT, MMDiT, Flux, Wan, Qwen Image, and other diffusion models | `diffusion`, `model`, `unet`, `dit` |
+| `te` | Text encoders and conditioners | `te`, `clip`, `text`, `textencoder`, `textencoders`, `conditioner`, `cond`, `llm`, `t5`, `t5xxl` |
+| `clip_vision` | CLIP vision encoder | `clip_vision`, `clipvision`, `clip-vision`, `vision` |
+| `vae` | VAE and TAE | `vae`, `firststage`, `autoencoder`, `tae` |
+| `controlnet` | ControlNet | `controlnet`, `control` |
+| `photomaker` | PhotoMaker ID encoder and PhotoMaker LoRA | `photomaker`, `photomakerid`, `pmid`, `photo` |
+| `upscaler` | ESRGAN upscaler | `upscaler`, `esrgan`, `hires` |
+
+`te` is the preferred module name for text encoders. `clip` is kept as an accepted alias because many existing commands and model names use CLIP terminology.
+
+## Backend names
+
+Backend names are resolved against the GGML backend device list. Matching is case-insensitive and accepts exact names or unique prefixes, so common values include names such as:
+
+- `cpu`
+- `cuda0`
+- `vulkan0`
+- `metal`
+
+The special values `auto`, `default`, and an empty backend name select the default backend. The default preference is GPU, then integrated GPU, then CPU.
+
+The special value `gpu` selects the first GPU backend, falling back to the first integrated GPU backend.
+
+The special value `disk` is accepted only by `--params-backend`. `--backend disk` is invalid because `disk` is a parameter residency mode, not a runtime compute backend.
+
+## Runtime backend vs. parameter backend
+
+The runtime backend controls where graph execution runs. The parameter backend controls where model weights are allocated or whether they are reloaded from disk on demand.
+
+For example:
+
+```shell
+sd-cli -m model.safetensors -p "a cat" --backend cuda0 --params-backend cpu
+```
+
+This runs all modules on `cuda0`, but stores parameters in CPU RAM. During execution, parameters are moved to the runtime backend as needed.
+
+Similarly, to reload parameters from disk instead of keeping them in CPU RAM:
+
+```shell
+sd-cli -m model.safetensors -p "a cat" --backend cuda0 --params-backend disk
+```
+
+This runs all modules on `cuda0`, reloads parameters from the model file as needed, and releases those parameter buffers after use.
+
+`disk` is never selected implicitly. If `--params-backend` is not set, parameters use the runtime backend.
+
+Per-module assignments can be mixed:
+
+```shell
+sd-cli -m model.safetensors -p "a cat" --backend diffusion=cuda0,te=cpu,vae=cpu --params-backend diffusion=cuda0,te=cpu,vae=cpu
+```
+
+This keeps text encoding and VAE execution on CPU while the diffusion model runs on GPU.
+
+## Backend sharing and lifetime
+
+Backends are managed by `SDBackendManager`.
+
+Within one manager, backend instances are cached by resolved backend device name. If multiple modules request the same backend, they share the same `ggml_backend_t`.
+
+For example:
+
+```shell
+--backend te=cpu,vae=cpu
+```
+
+uses one shared CPU backend for both `te` and `vae` runtime execution.
+
+Runtime and parameter assignments also share the same backend cache. If `--backend diffusion=cuda0` and `--params-backend diffusion=cuda0` resolve to the same device, both use the same backend instance.
+
+`--params-backend disk` does not create a separate backend instance. Parameters are loaded lazily using the module runtime backend.
+
+`SDBackendManager` owns the backend instances and frees them when the context or upscaler is destroyed. Model runners receive non-owning runtime and parameter backend pointers and do not free them.
+
+## Compatibility flags
+
+The example CLI/server still accepts these older CPU placement flags as compatibility aliases:
+
+- `--clip-on-cpu`
+- `--vae-on-cpu`
+- `--control-net-cpu`
+- `--offload-to-cpu`
+
+`--clip-on-cpu`, `--vae-on-cpu`, and `--control-net-cpu` are deprecated. The example argument layer prepends `te=cpu`, `vae=cpu`, and `controlnet=cpu` to `--backend` before creating the context.
+
+`--offload-to-cpu` prepends a CPU default to the parameter assignment in the caller before creating the context:
+
+```shell
+--params-backend '*=cpu'
+```
+
+Because this default is inserted first, later explicit `--params-backend` entries can still override it. For example, `--offload-to-cpu --params-backend te=disk` keeps non-TE parameters on CPU and reloads TE parameters from disk.
+
+Library callers should set `backend` and `params_backend` directly. The old CPU/offload fields are no longer part of the C API. Explicit `--backend` and `--params-backend` assignments are preferred for new commands.
--- a/docs/build.md
+++ b/docs/build.md
@ -102,6 +102,11 @@ cmake --build . --config Release
 ## Build with Vulkan

 Install Vulkan SDK from https://www.lunarg.com/vulkan-sdk/.
+On Ubuntu, install the Vulkan development packages and SPIR-V headers:
+
+```shell
+sudo apt-get install build-essential libvulkan-dev glslc spirv-headers
+```

 ```shell
 mkdir build && cd build
--- a/docs/caching.md
+++ b/docs/caching.md
@ -131,8 +131,6 @@ sd-cli -m model.safetensors -p "a cat" --cache-mode spectrum
 | `warmup` | Steps to always compute before caching starts | 4 |
 | `stop` | Stop caching at this fraction of total steps | 0.9 |

-```
-
 ### Performance Tips

 - Start with default thresholds and adjust based on output quality
--- a/docs/hidream_o1_image.md
+++ b/docs/hidream_o1_image.md
@ -0,0 +1,20 @@
+# How to Use
+
+## Download weights
+
+- Download HiDream-O1-Image-Dev
+    - safetensors: https://huggingface.co/Comfy-Org/HiDream-O1-Image/tree/main/checkpoints
+- Download HiDream-O1-Image
+    - safetensors: https://huggingface.co/Comfy-Org/HiDream-O1-Image/tree/main/checkpoints
+
+## Examples
+
+### HiDream-O1-Image-Dev
+
+```
+.\bin\Release\sd-cli.exe -m  ..\..\ComfyUI\models\diffusion_models\hidream_o1_image_dev_bf16.safetensors -p "a lovely cat holding a sign says 
+'hidream o1 cpp'" --cfg-scale 1.0  -v -H 1024 -W 1024
+```
+
+<img width="256" alt="HiDream-O1-Image-Dev example" src="../assets/hidream-o1/dev_example.png" />
+
--- a/docs/hipBLAS_on_Windows.md
+++ b/docs/hipBLAS_on_Windows.md
@ -26,12 +26,12 @@ Fortunately, `AMD` provides complete help documentation, you can use the help do

 Then we must set `ROCM` as environment variables before running cmake.

-Usually if you install according to the official tutorial and do not modify the ROCM path, then there is a high probability that it is here `C:\Program Files\AMD\ROCm\5.5\bin`
+If you installed ROCm by following the official tutorial and did not change the install path, it is most likely located at `C:\Program Files\AMD\ROCm\7.1.1\bin`

 This is what I use to set the clang:
 ```Commandline
-set CC=C:\Program Files\AMD\ROCm\5.5\bin\clang.exe
-set CXX=C:\Program Files\AMD\ROCm\5.5\bin\clang++.exe
+set CC=C:\Program Files\AMD\ROCm\7.1.1\bin\clang.exe
+set CXX=C:\Program Files\AMD\ROCm\7.1.1\bin\clang++.exe
 ```

 ## Ninja
@ -46,7 +46,7 @@ set ninja=C:\Program Files\ninja\ninja.exe
 ## Building stable-diffusion.cpp

 The thing different from the regular CPU build is `-DSD_HIPBLAS=ON` ,
-`-G "Ninja"`, `-DCMAKE_C_COMPILER=clang`, `-DCMAKE_CXX_COMPILER=clang++`, `-DAMDGPU_TARGETS=gfx1100`
+`-G "Ninja"`, `-DCMAKE_C_COMPILER=clang`, `-DCMAKE_CXX_COMPILER=clang++`, `-DAMDGPU_TARGETS=gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032`

 >**Notice**: check the `clang` and `clang++` information:
 ```Commandline
@ -59,26 +59,29 @@ If you see like this, we can continue:
 clang version 17.0.0 (git@github.amd.com:Compute-Mirrors/llvm-project e3201662d21c48894f2156d302276eb1cf47c7be)
 Target: x86_64-pc-windows-msvc
 Thread model: posix
-InstalledDir: C:\Program Files\AMD\ROCm\5.5\bin
+InstalledDir: C:\Program Files\AMD\ROCm\7.1.1\bin
 ```

 ```
 clang version 17.0.0 (git@github.amd.com:Compute-Mirrors/llvm-project e3201662d21c48894f2156d302276eb1cf47c7be)
 Target: x86_64-pc-windows-msvc
 Thread model: posix
-InstalledDir: C:\Program Files\AMD\ROCm\5.5\bin
+InstalledDir: C:\Program Files\AMD\ROCm\7.1.1\bin
 ```

->**Notice** that the `gfx1100` is the GPU architecture of my GPU, you can change it to your GPU architecture. Click here to see your architecture [LLVM Target](https://rocm.docs.amd.com/en/latest/release/windows_support.html#windows-supported-gpus)
+>**Notice** that the `AMDGPU_TARGETS` list above covers multiple GPU architectures (the ROCm 7.1.1 targets). You can narrow it to only the architecture of your own GPU. To find your architecture, see [LLVM Target](https://rocm.docs.amd.com/en/latest/release/windows_support.html#windows-supported-gpus)

-My GPU is AMD Radeon™ RX 7900 XTX Graphics, so I set it to `gfx1100`.
+Examples:
+- AMD Radeon™ RX 7900 XTX / RX 7900 XT / RX 7900 GRE Graphics: `gfx1100`
+- AMD Radeon™ RX 7800 XT / RX 7700 XT Graphics: `gfx1101`
+- AMD Radeon™ RX 7600 Graphics: `gfx1102`

 option:

 ```commandline
 mkdir build
 cd build
-cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=gfx1100
+cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
 cmake --build . --config Release
 ```

--- a/docs/ideogram4.md
+++ b/docs/ideogram4.md
@ -0,0 +1,40 @@
+# How to Use
+
+## Download weights
+
+- Download Ideogram4
+    - safetensors: https://huggingface.co/ideogram-ai/ideogram-4-fp8/tree/main/transformer
+- Download Ideogram4 uncond
+    - safetensors: https://huggingface.co/ideogram-ai/ideogram-4-fp8/tree/main/unconditional_transformer
+- Download vae
+    - safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
+- Download Qwen3-VL-8B-Instruct
+    - gguf: https://huggingface.co/unsloth/Qwen3-VL-8B-Instruct-GGUF/tree/main
+
+## Convert weights
+
+fp8 scale -> bf16
+
+```
+python .\convert_fp8_scale_to_bf16.py --input .\ideogram4_fp8.safetensors --output ideogram4_bf16.safetensors
+python .\convert_fp8_scale_to_bf16.py --input .\ideogram4_uncond_fp8.safetensors --output ideogram4_uncond_bf16.safetensors
+```
+
+bf16 -> q8
+
+```
+.\bin\Release\sd-cli.exe -M convert -m ideogram4_bf16.safetensors -o ideogram4-Q8_0.gguf --tensor-type-rules "^layers.*adaln_modulation.*weight=q8_0,layers.*attention.o.*weight=q8_0,layers.*attention.qkv.*weight=q8_0,layers.*feed_forward.*weight=q8_0" -v
+
+.\bin\Release\sd-cli.exe -M convert -m ideogram4_uncond_bf16.safetensors -o ideogram4_uncond-Q8_0.gguf --tensor-type-rules "^layers.*adaln_modulation.*weight=q8_0,layers.*attention.o.*weight=q8_0,layers.*attention.qkv.*weight=q8_0,layers.*feed_forward.*weight=q8_0" -v
+```
+
+If you want lower VRAM usage, you can replace `q8_0` in the rules above with a lower-precision quantization type, such as `q4_0`.
+
+
+## Examples
+
+```sh
+.\bin\Release\sd-cli.exe --diffusion-model ideogram4-Q8_0.gguf --uncond-diffusion-model ideogram4_uncond-Q8_0.gguf --llm ..\..\llm\Qwen3VL-8B-Instruct-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors -p '{"high_level_description":"A square 1024 x 1024 luxury fashion magazine cover featuring exactly one short chubby fluffy cat as the main model. The cat sits on a soft ivory studio floor, facing the viewer with a stylish calm expression, wearing tiny black sunglasses, a red silk scarf, and a small gold collar charm. In front of the cat on the floor is a wide horizontal luxury nameplate that clearly reads ideogram4.cpp. The whole design feels premium, fashionable, clean, and editorial.","style_description":{"aesthetics":"luxury fashion magazine cover, high-end pet couture campaign, minimalist editorial design, elegant studio photography, soft paper texture, refined typography, fashionable and polished","lighting":"Soft diffused studio lighting, gentle spotlight on the cat, subtle floor shadow, warm ivory highlights, clean separation between subject and background","photo":"high-resolution fashion editorial photography look, front-facing cat portrait, crisp fur details, glossy sunglasses, clear readable nameplate text, shallow depth of field","medium":"mixed media fashion photography and premium editorial graphic design","color_palette":["#F4EFE7","#111111","#D8B56D","#B73A3A","#FFFFFF","#8A7A6A"]},"compositional_deconstruction":{"canvas":"Square 1024 x 1024 canvas with a normal upright orientation. Do not rotate the poster or any text. Use a clean fashion magazine cover layout.","background":"Warm ivory studio backdrop with subtle paper grain, a soft spotlight gradient, faint floor shadow, and a few minimal gold editorial lines. The background is spacious, premium, and uncluttered.","layout":"Top center has a small elegant headline. Center area features one cat as the main fashion model. 
Lower foreground has a wide horizontal luxury nameplate placed on the floor in front of the cat. Bottom center has a small footer. All text is horizontal, upright, and readable left to right.","elements":[{"type":"text","desc":"Top center headline reading LOOK WHAT I FOUND in a refined high-fashion serif font. The headline is horizontal, centered, elegant, and secondary to the nameplate text."},{"type":"obj","desc":"Exactly one short chubby fluffy cat sitting in the center like a luxury fashion model. The cat has a large round head, compact body, short legs, soft detailed fur, expressive eyes, and a calm confident pose. The cat is cute and rounded, not tall, not stretched, not duplicated."},{"type":"obj","desc":"Tiny glossy black sunglasses worn naturally by the cat, slightly oversized but still showing the cat face clearly. The sunglasses add a chic fashion-editorial attitude."},{"type":"obj","desc":"A red silk scarf tied neatly around the cat neck, with soft folds and a couture feeling. The scarf must not cover the cat face or the nameplate."},{"type":"obj","desc":"A small gold collar charm or fashion accessory under the scarf, subtle and premium, adding a luxury campaign detail."},{"type":"obj","desc":"In the lower foreground, place a wide horizontal luxury nameplate on the floor in front of the cat. The nameplate is low, flat, landscape-oriented, much wider than tall, like a fashion show seat card or premium display plaque. It is centered, front-facing, level, and fully visible. It must not become vertical, tall, standing, rotated, or side-facing."},{"type":"text","desc":"Print the exact text ideogram4.cpp only on the wide horizontal nameplate. Use clean bold black lettering, perfectly spelled, lowercase, with the number 4 and .cpp extension. 
The text must fit completely inside the nameplate, stay horizontal, and be readable from left to right."},{"type":"obj","desc":"Add sparse premium editorial accents around the edges: thin gold lines, small code brackets, tiny cursor marks, subtle dots, and minimal geometric details. No extra cats, no stickers, no animal faces, no busy decorations."},{"type":"text","desc":"Bottom center footer reading tiny paws, big compile energy in a small refined monospace or editorial font. The footer is horizontal, centered, understated, and much smaller than the nameplate text."}]}}'  --diffusion-fa -v --offload-to-cpu -H 1024 -W 1024
+```
+
+<img alt="ideogram4 image example" src="../assets/ideogram4/example.png" />
--- a/docs/lens.md
+++ b/docs/lens.md
@ -0,0 +1,32 @@
+# How to Use
+
+Lens uses a Lens diffusion transformer, the FLUX.2 VAE, and GPT-OSS-20B as the LLM text encoder.
+
+## Download weights
+
+- Download Lens
+    - safetensors: https://huggingface.co/Comfy-Org/Lens/tree/main/diffusion_models
+- Download Lens Turbo
+    - safetensors: https://huggingface.co/Comfy-Org/Lens/tree/main/diffusion_models
+- Download vae
+    - safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
+- Download GPT-OSS-20B
+    - gguf: https://huggingface.co/unsloth/gpt-oss-20b-GGUF/tree/main
+
+## Examples
+
+### Lens
+
+```
+.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\lens_bf16.safetensors --llm "..\..\llm\gpt-oss-20b-UD-Q8_K_XL.gguf" --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --cfg-scale 5.0  -p "A crystal dragon soaring through an aurora borealis sky, its entire body made of transparent faceted crystal refracting the green and purple aurora light into rainbow spectra, ice particles trailing from its wings, high fantasy digital art" --diffusion-fa -v
+```
+
+<img width="256" alt="Lens example" src="../assets/lens/example.png" />
+
+### Lens Turbo
+
+```
+.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\lens_turbo_bf16.safetensors --llm "..\..\llm\gpt-oss-20b-UD-Q8_K_XL.gguf" --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors --cfg-scale 1.0  -p "A crystal dragon soaring through an aurora borealis sky, its entire body made of transparent faceted crystal refracting the green and purple aurora light into rainbow spectra, ice particles trailing from its wings, high fantasy digital art" --diffusion-fa -v --steps 4
+```
+
+<img width="256" alt="Lens Turbo example" src="../assets/lens/turbo_example.png" />
--- a/docs/longcat_image.md
+++ b/docs/longcat_image.md
@ -0,0 +1,30 @@
+# How to Use
+
+LongCat-Image uses a LongCat diffusion transformer, the FLUX VAE, and Qwen2.5-VL as the LLM text encoder.
+
+## Download weights
+
+- Download LongCat Image
+    - safetensors: https://huggingface.co/Comfy-Org/LongCat-Image/tree/main/split_files/diffusion_models
+    - gguf: https://huggingface.co/vantagewithai/LongCat-Image-GGUF/tree/main/comfy
+- Download LongCat Image Edit
+    - LongCat Image Edit Turbo: https://huggingface.co/meituan-longcat/LongCat-Image-Edit-Turbo
+    - gguf: https://huggingface.co/vantagewithai/LongCat-Image-Edit-GGUF/tree/main
+- Download vae
+    - safetensors: https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
+- Download qwen_2.5_vl 7b
+    - safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/text_encoders
+    - gguf: https://huggingface.co/mradermacher/Qwen2.5-VL-7B-Instruct-GGUF/tree/main
+    - For image editing with GGUF text encoders, also download the matching mmproj file and pass it with `--llm_vision`.
+
+## Run
+
+LongCat uses quoted text for character-level text rendering. Put target text inside single quotes, double quotes, or Chinese quotes.
+
+### LongCat Image
+
+```
+.\bin\Release\sd-cli.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\LongCat-Image-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf -p "a lovely cat holding a sign says 'longcat.cpp'" --cfg-scale 5.0 --sampling-method euler --flow-shift 3 -v --offload-to-cpu --diffusion-fa
+```
+
+<img alt="longcat example" src="../assets/longcat/example.png" />
--- a/docs/ltx2.md
+++ b/docs/ltx2.md
@ -0,0 +1,77 @@
+# How to Use
+
+## Download weights
+
+- Download LTX-2.3
+    - safetensors: https://huggingface.co/Kijai/LTX2.3_comfy/tree/main/diffusion_models
+    - gguf: https://huggingface.co/unsloth/LTX-2.3-GGUF/tree/main
+- Download gemma-3-12b-it
+    - gguf: https://huggingface.co/unsloth/gemma-3-12b-it-GGUF/tree/main
+- Download embeddings connectors
+    - safetensors: https://huggingface.co/unsloth/LTX-2.3-GGUF/tree/main/text_encoders
+- Download vae
+    - safetensors: https://huggingface.co/unsloth/LTX-2.3-GGUF/tree/main/vae
+- Download audio vae
+    - safetensors: https://huggingface.co/unsloth/LTX-2.3-GGUF/tree/main/vae
+- Download LTX spatial latent upscaler
+    - safetensors: https://huggingface.co/Lightricks/LTX-2.3/resolve/main/ltx-2.3-spatial-upscaler-x2-1.1.safetensors
+
+## Examples
+
+### LTX-2.3 dev T2V
+
+```
+.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\ltx-2.3-22b-dev-UD-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_video_vae.safetensors --audio-vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_audio_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\gemma-3-12b-it-qat-UD-Q4_K_XL.gguf --embeddings-connectors ..\..\ComfyUI\models\text_encoders\ltx-2.3-22b-dev_embeddings_connectors.safetensors  -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "worst quality, low quality, blurry, distorted, artifacts" -W 1280 -H 720 --diffusion-fa --offload-to-cpu --video-frames 33 --fps 24 -o t2v.webm
+```
+
+<video
+  src="../assets/ltx2/t2v.webm"
+  controls
+  muted
+  style="max-width: 100%; height: auto;"></video>
+
+### LTX-2.3 dev I2V
+
+```
+.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\ltx-2.3-22b-dev-UD-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_video_vae.safetensors --audio-vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_audio_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\gemma-3-12b-it-qat-UD-Q4_K_XL.gguf --embeddings-connectors ..\..\ComfyUI\models\text_encoders\ltx-2.3-22b-dev_embeddings_connectors.safetensors  -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v  -W 1280 -H 720 --diffusion-fa --offload-to-cpu --video-frames 33 -i ..\assets\ernie_image\turbo_example.png -o i2v.webm
+```
+
+<video
+  src="../assets/ltx2/i2v.webm"
+  controls
+  muted
+  style="max-width: 100%; height: auto;"></video>
+
+### LTX-2.3 dev FLF2V
+
+```
+.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\ltx-2.3-22b-dev-UD-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_video_vae.safetensors --audio-vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_audio_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\gemma-3-12b-it-qat-UD-Q4_K_XL.gguf --embeddings-connectors ..\..\ComfyUI\models\text_encoders\ltx-2.3-22b-dev_embeddings_connectors.safetensors  -p "glass flower blossom" --cfg-scale 6.0 --sampling-method euler -v  -W 1280 -H 720 --diffusion-fa --offload-to-cpu --video-frames 33 --init-img ..\..\ComfyUI\input\start_image.png --end-img ..\..\ComfyUI\input\end_image.png -o flf2v.webm
+```
+
+<video
+  src="../assets/ltx2/flf2v.webm"
+  controls
+  muted
+  style="max-width: 100%; height: auto;"></video>
+
+### LTX-2.3 spatial latent upscale
+
+LTX spatial latent upscale runs a model-backed x2 latent upsampler between the low-resolution video pass and the high-resolution refine pass. `-W` and `-H` are the pre-upscale generation size; the spatial upsampler produces x2 latent dimensions.
+
+Put `ltx-2.3-spatial-upscaler-x2-1.1.safetensors` under the directory passed to `--hires-upscalers-dir`, then use the model name without path or extension in `--hires-upscaler`.
+
+```
+.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\ltx-2.3-22b-dev-UD-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_video_vae.safetensors --audio-vae ..\..\ComfyUI\models\vae\ltx-2.3-22b-dev_audio_vae.safetensors --llm ..\..\ComfyUI\models\text_encoders\gemma-3-12b-it-qat-UD-Q4_K_XL.gguf --embeddings-connectors ..\..\ComfyUI\models\text_encoders\ltx-2.3-22b-dev_embeddings_connectors.safetensors --hires-upscalers-dir ..\..\ComfyUI\models\latent_upscale_models --hires-upscaler ltx-2.3-spatial-upscaler-x2-1.1 --hires --hires-steps 4 -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v  -W 640 -H 360 --diffusion-fa --offload-to-cpu --video-frames 33 -i ..\assets\ernie_image\turbo_example.png -o hires_i2v.webm
+```
+
+By default, the hires refine pass uses the main sampler and scheduler, then trims the second-pass sigma schedule by `--hires-denoising-strength` (`0.7` by default). To reproduce a ComfyUI-style explicit refine schedule, pass custom hires sigmas:
+
+```
+--hires-sigmas "0.85,0.725,0.421875,0.0"
+```
+
+<video
+  src="../assets/ltx2/hires_i2v.webm"
+  controls
+  muted
+  style="max-width: 100%; height: auto;"></video>
--- a/docs/model_config.md
+++ b/docs/model_config.md
@ -0,0 +1,118 @@
+# Model Configuration Conventions
+
+This document describes the conventions for model configuration structs and
+weight-based configuration detection.
+
+## Config Types
+
+Model configuration should live in a model-specific `*Config` struct.
+
+Examples:
+
+- `ZImageConfig`
+- `UNetConfig`
+- `MMDiTConfig`
+- `LLMConfig`
+
+Preserve established acronym casing in type names, such as `UNet`, `MMDiT`,
+`LLM`, `VAE`, and `T5`.
+
+Place the config struct near the top of the model header, before the main model
+blocks and runner types that consume it.
+
+## Config Variables
+
+Variables and members that hold a config should be named `config`.
+
+Examples:
+
+```cpp
+UNetConfig config;
+UnetModelBlock unet;
+
+MMDiTRunner(...)
+    : DiffusionModelRunner(backend, params_backend, prefix),
+      config(MMDiTConfig::detect_from_weights(tensor_storage_map, prefix)),
+      mmdit(config) {
+}
+```
+
+Avoid alternate names such as `params`, `params_cfg`, `model_params`, or
+model-specific aliases unless an existing public API requires them.
+
+## Weight Detection
+
+If a model can derive configuration from loaded weight metadata, expose that
+logic as a static method on the config type:
+
+```cpp
+static XxxConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
+                                     const std::string& prefix);
+```
+
+Additional selector arguments are allowed when required by an existing model
+family, for example `SDVersion version` or an architecture enum:
+
+```cpp
+static UNetConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
+                                      const std::string& prefix,
+                                      SDVersion version = VERSION_SD1);
+```
+
+Use `TensorStorage` metadata, especially `n_dims` and `ne`, to infer shapes.
+Do not load or parse tensor data for config detection.
+
+Detection should respect `prefix`. For nested weights, construct full names from
+`prefix + "." + suffix` or filter entries with `starts_with(name, prefix)`.
+
+Do not add persistent config fields such as `inferred_from_weights` only to
+record whether detection happened. If the function needs to decide whether to
+print a debug line, keep that as local control flow inside `detect_from_weights`.
+
+## Logging
+
+When config values are inferred from weights, print one `LOG_DEBUG` line at the
+end of `detect_from_weights`.
+
+Example:
+
+```cpp
+LOG_DEBUG("llm: num_layers = %" PRId64 ", vocab_size = %" PRId64 ", hidden_size = %" PRId64 ", intermediate_size = %" PRId64,
+          config.num_layers,
+          config.vocab_size,
+          config.hidden_size,
+          config.intermediate_size);
+```
+
+Only print the config detection log when the function actually inferred values
+from weights. Do not duplicate the same config summary in runner constructors or
+model loading code.
+
+Use the correct format specifiers for field types, such as `%" PRId64 "` for
+`int64_t` and `%d` for `int`.
+
+## Runner And Model Responsibilities
+
+Runners should detect the config once and pass it into the model block:
+
+```cpp
+struct XxxRunner : public DiffusionModelRunner {
+    XxxConfig config;
+    XxxModel model;
+
+    XxxRunner(..., const String2TensorStorage& tensor_storage_map, const std::string prefix)
+        : DiffusionModelRunner(backend, params_backend, prefix),
+          config(XxxConfig::detect_from_weights(tensor_storage_map, prefix)),
+          model(config) {
+        model.init(params_ctx, tensor_storage_map, prefix);
+    }
+};
+```
+
+Model blocks should consume `config` directly instead of re-scanning weights in
+their constructors. Keep config-derived behavior centralized in the config
+struct.
+
+If a model has no weight-derived config today, it may still provide
+`detect_from_weights` for API consistency, but it should not print a config
+detection log unless it actually derives values from weights.
--- a/docs/performance.md
+++ b/docs/performance.md
@ -21,6 +21,38 @@ and the compute buffer shrink in the debug log:

 Using `--offload-to-cpu` allows you to offload weights to the CPU, saving VRAM without reducing generation speed.

+## Use params backend to reduce VRAM or RAM usage.
+
+`--params-backend` controls where model parameters are kept. If it is not set, parameters use the same backend as `--backend`, so a GPU runtime backend also keeps parameters in VRAM.
+
+Use CPU params to reduce VRAM usage:
+
+```shell
+--backend cuda0 --params-backend cpu
+```
+
+This keeps model weights in system RAM and moves them to the runtime backend when needed. In the example CLI/server, `--offload-to-cpu` is a compatibility shortcut that prepends `*=cpu` to `--params-backend` before creating the context, so explicit module assignments can still override it:
+
+```shell
+--offload-to-cpu --params-backend te=disk
+```
+
+Use disk params to reduce both VRAM and RAM usage:
+
+```shell
+--backend cuda0 --params-backend disk
+```
+
+This reloads parameters from the model file on demand and releases them after use. It has the lowest memory residency, but can be slower because weights must be read again. `disk` is never selected implicitly; set it explicitly when RAM usage matters more than reload cost.
+
+Per-module assignments can target only the largest modules:
+
+```shell
+--backend cuda0 --params-backend diffusion=disk,te=cpu,vae=cpu
+```
+
+See [backend selection](./backend.md) for full syntax.
+
 ## Use quantization to reduce memory usage.

 [quantization](./quantization_and_gguf.md)
--- a/docs/pid.md
+++ b/docs/pid.md
@ -0,0 +1,39 @@
+# How to Use
+
+PiD is NVIDIA's Pixel Diffusion Decoder. It replaces the usual VAE decode or decode-then-upscale path with a pixel-space diffusion decoder conditioned on a
+source latent and text prompt.
+
+In stable-diffusion.cpp, PiD currently runs as an image edit pipeline: provide a reference image with `-r`/`--ref-image`, encode that image with a matching VAE, then let the PiD diffusion model decode/upscale directly to RGB.
+
+## Download weights
+
+- Download PiD
+    - safetensors: https://huggingface.co/Comfy-Org/PixelDiT/tree/main/diffusion_models
+- Download Gemma 2 2B
+    - safetensors: https://huggingface.co/Comfy-Org/PixelDiT/tree/main/text_encoders
+- Download the VAE that matches the PiD checkpoint backbone
+    - safetensors: https://huggingface.co/nvidia/PiD/tree/main/checkpoints
+    - Flux / Z-Image PiD: use the Flux VAE and pass `--vae-format flux`
+    - SD3 PiD: use the SD3 VAE and pass `--vae-format sd3`
+    - Flux.2 PiD: use the Flux.2 VAE and pass `--vae-format flux2`
+
+The official PiD model card should be checked before use. At the time of the initial PiD release, the official weights are under the NSCLv1 non-commercial license.
+
+## Examples
+
+```
+.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\pid_flux1_512_to_2048_4step_bf16.safetensors --llm "..\..\ComfyUI\models\text_encoders\gemma_2_2b_it_elm_bf16.safetensors" --vae ..\..\ComfyUI\models\vae\ae.sft --vae-format flux --cfg-scale 1.0  -p "a lovely cat" -r ..\assets\ernie_image\turbo_example.png --diffusion-fa -v --steps 4 -H 2048 -W 2048 --rng cpu
+```
+
+Before:
+
+<img width="256" alt="ERNIE-Image Turbo example" src="../assets/ernie_image/turbo_example.png" />
+
+After:
+<img width="1024" alt="PiD example" src="../assets/pid/example.png" />
+
+## Notes
+
+- `-r`/`--ref-image` is required. PiD uses the first reference image as the source latent condition.
+- `--vae-format` should match the VAE latent layout used by the PiD checkpoint. This is important when using standalone VAE files because the PiD diffusion
+  checkpoint alone does not identify the VAE format.
--- a/docs/rpc.md
+++ b/docs/rpc.md
@ -0,0 +1,220 @@
+# Building and Using the RPC Server with `stable-diffusion.cpp`
+
+This guide covers how to build a version of [the RPC server from `llama.cpp`](https://github.com/ggml-org/llama.cpp/blob/master/tools/rpc/README.md) that is compatible with your version of `stable-diffusion.cpp` to manage multi-backend setups. RPC allows you to offload specific model components to a remote server.
+
+> **Note on Model Location:** The model files (e.g., `.safetensors` or `.gguf`) remain on the **Client** machine. The client parses the file and transmits the necessary tensor data and computational graphs to the server. The server does not need to store the model files locally.
+
+## 1. Building `stable-diffusion.cpp` with RPC client
+
+First, you should build the client application from source. It requires `SD_RPC=ON` to include the RPC backend in your client.
+
+```bash
+mkdir build
+cd build
+cmake .. \
+    -DSD_RPC=ON \
+    # Add other build flags here (e.g., -DSD_VULKAN=ON)
+cmake --build . --config Release -j $(nproc)
+```
+
+> **Note:** Ensure you add the other flags you would normally use (e.g., `-DSD_VULKAN=ON`, `-DSD_CUDA=ON`, `-DSD_HIPBLAS=ON`, or `-DGGML_METAL=ON`). For more information about building `stable-diffusion.cpp` from source, please refer to the [build.md](build.md) documentation.
+
+## 2. Ensure `llama.cpp` is at the correct commit
+
+`stable-diffusion.cpp`'s RPC client is designed to work with a specific version of `llama.cpp` (compatible with the `ggml` submodule) to ensure API compatibility. The commit hash for `llama.cpp` is stored in `ggml/scripts/sync-llama.last`.
+
+> **Start from Root:** Perform these steps from the root of your `stable-diffusion.cpp` directory.
+
+1.  Read the target commit hash from the submodule tracker:
+
+    ```bash
+    # Linux / WSL / MacOS
+    HASH=$(cat ggml/scripts/sync-llama.last)
+
+    # Windows (PowerShell)
+    $HASH = Get-Content -Path "ggml\scripts\sync-llama.last"
+    ```
+
+2.  Clone `llama.cpp` at the target commit.
+    ```bash
+    git clone https://github.com/ggml-org/llama.cpp.git
+    cd llama.cpp
+    git checkout $HASH
+    ```
+    To save on download time and storage, you can use a shallow clone to download only the target commit:
+    ```bash
+    mkdir -p llama.cpp
+    cd llama.cpp
+    git init
+    git remote add origin https://github.com/ggml-org/llama.cpp.git
+    git fetch --depth 1 origin $HASH
+    git checkout FETCH_HEAD
+    ```
+
+## 3. Build `llama.cpp` (RPC Server)
+
+The RPC server acts as the worker. You must explicitly enable the **backend** (the hardware interface, such as CUDA for Nvidia, Metal for Apple Silicon, or Vulkan) when building, otherwise the server will default to using only the CPU.
+
+To find the correct flags for your system, refer to the official documentation for the [`llama.cpp`](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md) repository.
+
+> **Crucial:** You must include the compiler flags required to satisfy the API compatibility with `stable-diffusion.cpp` (`-DGGML_MAX_NAME=128`). Without this flag, `GGML_MAX_NAME` will default to `64` for the server, and data transfers between the client and server will fail. Of course, `-DGGML_RPC` must also be enabled.
+>
+> I recommend disabling the `LLAMA_CURL` flag to avoid unnecessary dependencies, and disabling shared library builds to avoid potential conflicts.
+
+> **Build Target:** We are specifically building the `rpc-server` target. This prevents the build system from compiling the entire `llama.cpp` suite (like `llama-server`), making the build significantly faster.
+
+### Linux / WSL (Vulkan)
+
+```bash
+mkdir build
+cd build
+# Ensure the backend flag (here Vulkan) is enabled
+cmake .. -DGGML_RPC=ON \
+    -DGGML_VULKAN=ON \
+    -DGGML_BUILD_SHARED_LIBS=OFF \
+    -DLLAMA_CURL=OFF \
+    -DCMAKE_C_FLAGS=-DGGML_MAX_NAME=128 \
+    -DCMAKE_CXX_FLAGS=-DGGML_MAX_NAME=128
+cmake --build . --config Release --target rpc-server -j $(nproc)
+```
+
+### macOS (Metal)
+
+```bash
+mkdir build
+cd build
+cmake .. -DGGML_RPC=ON \
+    -DGGML_METAL=ON \
+    -DGGML_BUILD_SHARED_LIBS=OFF \
+    -DLLAMA_CURL=OFF \
+    -DCMAKE_C_FLAGS=-DGGML_MAX_NAME=128 \
+    -DCMAKE_CXX_FLAGS=-DGGML_MAX_NAME=128
+cmake --build . --config Release --target rpc-server
+```
+
+### Windows (Visual Studio 2022, Vulkan)
+
+```powershell
+mkdir build
+cd build
+cmake .. -G "Visual Studio 17 2022" -A x64 `
+    -DGGML_RPC=ON `
+    -DGGML_VULKAN=ON `
+    -DGGML_BUILD_SHARED_LIBS=OFF `
+    -DLLAMA_CURL=OFF `
+    -DCMAKE_C_FLAGS=-DGGML_MAX_NAME=128 `
+    -DCMAKE_CXX_FLAGS=-DGGML_MAX_NAME=128
+cmake --build . --config Release --target rpc-server
+```
+
+## 4. Usage
+
+Once both applications are built, you can run the server and the client to manage your GPU allocation.
+
+### Step A: Run the RPC Server
+
+Start the server. It listens for connections on the default address (usually `localhost:50052`). If your server is on a different machine, ensure the server binds to the correct interface and your firewall allows the connection.
+
+**On the Server:**
+If running on the same machine, you can use the default address:
+
+```bash
+./rpc-server
+```
+
+If you want to allow connections from other machines on the network:
+
+```bash
+./rpc-server --host 0.0.0.0
+```
+
+> **Security Warning:** The RPC server does not currently support authentication or encryption. **Only run the server on trusted local networks**. Never expose the RPC server directly to the open internet.
+
+> **Drivers & Hardware:** Ensure the Server machine has the necessary drivers installed and functional (e.g., Nvidia Drivers for CUDA, Vulkan SDK, or Metal). If no devices are found, the server will simply fall back to the CPU.
+
+<!-- ### Step B: Check if the client is able to connect to the server and see the available devices
+
+We're assuming the server is running on your local machine, and listening on the default port `50052`. If it's running on a different machine, you can replace `localhost` with the IP address of the server.
+
+**On the Client:**
+
+```bash
+./sd-cli --rpc-servers localhost:50052 --list-devices
+```
+
+If the server is running and the client is able to connect, you should see `RPC0    localhost:50052` in the list of devices.
+
+Example output:
+(Client built without GPU acceleration, two GPUs available on the server)
+
+```
+List of available GGML devices:
+Name    Description
+-------------------
+CPU     AMD Ryzen 9 5900X 12-Core Processor
+RPC0    localhost:50052
+RPC1    localhost:50052
+``` -->
+
+### Step B: Run with RPC device
+
+If everything is working correctly, you can now run the client while offloading some or all of the work to the RPC server.
+
+Example: Setting the main backend to the RPC0 device for doing all the work on the server.
+
+```bash
+./sd-cli -m models/sd1.5.safetensors -p "A cat" --rpc-servers localhost:50052  --backend RPC0
+```
+
+---
+
+## 5. Scaling: Multiple RPC Servers
+
+You can connect the client to multiple RPC servers simultaneously to scale out your hardware usage.
+
+Example: A main machine (192.168.1.10) with 3 GPUs, with one GPU running CUDA and the other two running Vulkan, and a second machine (192.168.1.11) with only one GPU.
+
+**On the first machine (Running two server instances):**
+
+**Terminal 1 (CUDA):**
+
+```bash
+# Linux / WSL
+export CUDA_VISIBLE_DEVICES=0
+cd ./build_cuda/bin/Release
+./rpc-server --host 0.0.0.0
+
+# Windows PowerShell
+$env:CUDA_VISIBLE_DEVICES="0"
+cd .\build_cuda\bin\Release
+./rpc-server --host 0.0.0.0
+```
+
+**Terminal 2 (Vulkan):**
+
+```bash
+cd ./build_vulkan/bin/Release
+# ignore the first GPU (used by CUDA server)
+./rpc-server --host 0.0.0.0 --port 50053 -d Vulkan1,Vulkan2
+```
+
+**On the second machine:**
+
+```bash
+cd ./build/bin/Release
+./rpc-server --host 0.0.0.0
+```
+
+**On the Client:**
+Pass multiple server addresses separated by commas.
+
+```bash
+./sd-cli --rpc-servers 192.168.1.10:50052,192.168.1.10:50053,192.168.1.11:50052 [...]
+```
+
+The client will map these servers to sequential device IDs (e.g., RPC0 from the first server, RPC1 and RPC2 from the second, and RPC3 from the third). With this setup, you could for example use RPC0 for the main backend, RPC1 and RPC2 for the text encoders, and RPC3 for the VAE.
+
+---
+
+## 6. Performance Considerations
+
+RPC performance is heavily dependent on network bandwidth, as large weights and activations must be transferred back and forth over the network, especially for large models, or when using high resolutions. For best results, ensure your network connection is stable and has sufficient bandwidth (>1Gbps recommended). This should not be a concern if you are running the server and client on the same machine, as the data transfer will happen over the loopback interface.
--- a/docs/z_image.md
+++ b/docs/z_image.md
@ -21,7 +21,7 @@ You can run Z-Image with stable-diffusion.cpp on GPUs with 4GB of VRAM — or ev
 ### Z-Image-Turbo

 ```
-.\bin\Release\sd-cli.exe --diffusion-model  z_image_turbo-Q3_K.gguf --vae ..\..\ComfyUI\models\vae\ae.sft  --llm ..\..\ComfyUI\models\text_encoders\Qwen3-4B-Instruct-2507-Q4_K_M.gguf -p "A cinematic, melancholic photograph of a solitary hooded figure walking through a sprawling, rain-slicked metropolis at night. The city lights are a chaotic blur of neon orange and cool blue, reflecting on the wet asphalt. The scene evokes a sense of being a single component in a vast machine. Superimposed over the image in a sleek, modern, slightly glitched font is the philosophical quote: 'THE CITY IS A CIRCUIT BOARD, AND I AM A BROKEN TRANSISTOR.' -- moody, atmospheric, profound, dark academic" --cfg-scale 1.0 -v --offload-to-cpu --diffusion-fa -H 1024 -W 512
+.\bin\Release\sd-cli.exe --diffusion-model  z_image_turbo-Q3_K.gguf --vae ..\..\ComfyUI\models\vae\ae.sft  --llm ..\..\ComfyUI\models\text_encoders\Qwen3-4B-Instruct-2507-Q4_K_M.gguf -p "A cinematic, melancholic photograph of a solitary hooded figure walking through a sprawling, rain-slicked metropolis at night. The city lights are a chaotic blur of neon orange and cool blue, reflecting on the wet asphalt. The scene evokes a sense of being a single component in a vast machine. Superimposed over the image in a sleek, modern, slightly glitched font is the philosophical quote: 'THE CITY IS A CIRCUIT BOARD, AND I AM A BROKEN TRANSISTOR.' -- moody, atmospheric, profound, dark academic" --cfg-scale 1.0 -v --offload-to-cpu --diffusion-fa -H 1024 -W 512 --steps 8
 ```

 <img width="256" alt="z-image example" src="../assets/z_image/q3_K.png" />
--- a/examples/cli/CMakeLists.txt
+++ b/examples/cli/CMakeLists.txt
@ -7,6 +7,13 @@ add_executable(${TARGET}
    image_metadata.cpp
    main.cpp
 )
+if(APPLE)
+    sd_set_macos_rpaths(${TARGET})
+endif()
+target_include_directories(${TARGET} PRIVATE
+    "${CMAKE_CURRENT_SOURCE_DIR}/.."
+    "${PROJECT_SOURCE_DIR}/src"
+)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE stable-diffusion zip ${CMAKE_THREAD_LIBS_INIT})
 if(SD_WEBP)
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@ -1,157 +1,9 @@
-# Run
+# Usage

-```
-usage: ./bin/sd-cli  [options]
+For detailed command-line arguments, run:

-CLI Options:
-  -o, --output <string>       path to write result image to. you can use printf-style %d format specifiers for image sequences (default:
-                              ./output.png) (eg. output_%03d.png). For video generation, single-file outputs support .avi, .webm, and animated .webp
-  --preview-path <string>     path to write preview image to (default: ./preview.png). Multi-frame previews support .avi, .webm, and animated .webp
-  --preview-interval <int>    interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at
-                              every step)
-  --output-begin-idx <int>    starting index for output image sequence, must be non-negative (default 0 if specified %d in output path, 1 otherwise)
-  --image <string>            path to the image to inspect (for metadata mode)
-  --metadata-format <string>  metadata output format, one of [text, json] (default: text)
-  --canny                     apply canny preprocessor (edge detection)
-  --convert-name              convert tensor name (for convert mode)
-  -v, --verbose               print extra info
-  --color                     colors the logging tags according to level
-  --taesd-preview-only        prevents usage of taesd for decoding the final image. (for use with --preview tae)
-  --preview-noisy             enables previewing noisy inputs of the models rather than the denoised outputs
-  --metadata-raw              include raw hex previews for unparsed metadata payloads
-  --metadata-brief            truncate long metadata text values in text output
-  --metadata-all              include structural/container entries such as IHDR, IDAT, and non-metadata JPEG segments
-  -M, --mode                  run mode, one of [img_gen, vid_gen, upscale, convert, metadata], default: img_gen
-  --preview                   preview method. must be one of the following [none, proj, tae, vae] (default is none)
-  -h, --help                  show this help message and exit
-
-Context Options:
-  -m, --model <string>                     path to full model
-  --clip_l <string>                        path to the clip-l text encoder
-  --clip_g <string>                        path to the clip-g text encoder
-  --clip_vision <string>                   path to the clip-vision encoder
-  --t5xxl <string>                         path to the t5xxl text encoder
-  --llm <string>                           path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)
-  --llm_vision <string>                    path to the llm vit
-  --qwen2vl <string>                       alias of --llm. Deprecated.
-  --qwen2vl_vision <string>                alias of --llm_vision. Deprecated.
-  --diffusion-model <string>               path to the standalone diffusion model
-  --high-noise-diffusion-model <string>    path to the standalone high noise diffusion model
-  --vae <string>                           path to standalone vae model
-  --taesd <string>                         path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
-  --tae <string>                           alias of --taesd
-  --control-net <string>                   path to control net model
-  --embd-dir <string>                      embeddings directory
-  --lora-model-dir <string>                lora model directory
-  --tensor-type-rules <string>             weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
-  --photo-maker <string>                   path to PHOTOMAKER model
-  --upscale-model <string>                 path to esrgan model.
-  -t, --threads <int>                      number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
-                                           CPU physical cores
-  --chroma-t5-mask-pad <int>               t5 mask pad size of chroma
-  --vae-tile-overlap <float>               tile overlap for vae tiling, in fraction of tile size (default: 0.5)
-  --vae-tiling                             process vae in tiles to reduce memory usage
-  --force-sdxl-vae-conv-scale              force use of conv scale on sdxl vae
-  --offload-to-cpu                         place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
-  --mmap                                   whether to memory-map model
-  --control-net-cpu                        keep controlnet in cpu (for low vram)
-  --clip-on-cpu                            keep clip in cpu (for low vram)
-  --vae-on-cpu                             keep vae in cpu (for low vram)
-  --fa                                     use flash attention
-  --diffusion-fa                           use flash attention in the diffusion model only
-  --diffusion-conv-direct                  use ggml_conv2d_direct in the diffusion model
-  --vae-conv-direct                        use ggml_conv2d_direct in the vae model
-  --circular                               enable circular padding for convolutions
-  --circularx                              enable circular RoPE wrapping on x-axis (width) only
-  --circulary                              enable circular RoPE wrapping on y-axis (height) only
-  --chroma-disable-dit-mask                disable dit mask for chroma
-  --qwen-image-zero-cond-t                 enable zero_cond_t for qwen image
-  --chroma-enable-t5-mask                  enable t5 mask for chroma
-  --type                                   weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
-                                           type of the weight file
-  --rng                                    RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
-  --sampler-rng                            sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
-  --prediction                             prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]
-  --lora-apply-mode                        the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights
-                                           contain any quantized parameters, the at_runtime mode will be used; otherwise,
-                                           immediately will be used.The immediately mode may have precision and
-                                           compatibility issues with quantized parameters, but it usually offers faster inference
-                                           speed and, in some cases, lower memory usage. The at_runtime mode, on the
-                                           other hand, is exactly the opposite.
-  --vae-tile-size                          tile size for vae tiling, format [X]x[Y] (default: 32x32)
-  --vae-relative-tile-size                 relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
-                                           (overrides --vae-tile-size)
-
-Generation Options:
-  -p, --prompt <string>                    the prompt to render
-  -n, --negative-prompt <string>           the negative prompt (default: "")
-  -i, --init-img <string>                  path to the init image
-  --end-img <string>                       path to the end image, required by flf2v
-  --mask <string>                          path to the mask image
-  --control-image <string>                 path to control image, control net
-  --control-video <string>                 path to control video frames, It must be a directory path. The video frames inside should be stored as images in
-                                           lexicographical (character) order. For example, if the control video path is
-                                           `frames`, the directory contain images such as 00.png, 01.png, ... etc.
-  --pm-id-images-dir <string>              path to PHOTOMAKER input id images dir
-  --pm-id-embed-path <string>              path to PHOTOMAKER v2 id embed
-  -H, --height <int>                       image height, in pixel space (default: 512)
-  -W, --width <int>                        image width, in pixel space (default: 512)
-  --steps <int>                            number of sample steps (default: 20)
-  --high-noise-steps <int>                 (high noise) number of sample steps (default: -1 = auto)
-  --clip-skip <int>                        ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified,
-                                           will be 1 for SD1.x, 2 for SD2.x
-  -b, --batch-count <int>                  batch count
-  --video-frames <int>                     video frames (default: 1)
-  --fps <int>                              fps (default: 24)
-  --timestep-shift <int>                   shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for
-                                           NitroSD-Vibrant
-  --upscale-repeats <int>                  Run the ESRGAN upscaler this many times (default: 1)
-  --upscale-tile-size <int>                tile size for ESRGAN upscaling (default: 128)
-  --cfg-scale <float>                      unconditional guidance scale: (default: 7.0)
-  --img-cfg-scale <float>                  image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
-  --guidance <float>                       distilled guidance scale for models with guidance input (default: 3.5)
-  --slg-scale <float>                      skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5
-                                           medium
-  --skip-layer-start <float>               SLG enabling point (default: 0.01)
-  --skip-layer-end <float>                 SLG disabling point (default: 0.2)
-  --eta <float>                            noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
-  --flow-shift <float>                     shift value for Flow models like SD3.x or WAN (default: auto)
-  --high-noise-cfg-scale <float>           (high noise) unconditional guidance scale: (default: 7.0)
-  --high-noise-img-cfg-scale <float>       (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
-  --high-noise-guidance <float>            (high noise) distilled guidance scale for models with guidance input (default: 3.5)
-  --high-noise-slg-scale <float>           (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
-  --high-noise-skip-layer-start <float>    (high noise) SLG enabling point (default: 0.01)
-  --high-noise-skip-layer-end <float>      (high noise) SLG disabling point (default: 0.2)
-  --high-noise-eta <float>                 (high noise) noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
-  --strength <float>                       strength for noising/unnoising (default: 0.75)
-  --pm-style-strength <float>
-  --control-strength <float>               strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image
-  --moe-boundary <float>                   timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1
-  --vace-strength <float>                  wan vace strength
-  --increase-ref-index                     automatically increase the indices of references images based on the order they are listed (starting with 1).
-  --disable-auto-resize-ref-image          disable auto resize of ref images
-  --disable-image-metadata                 do not embed generation metadata on image files
-  -s, --seed                               RNG seed (default: 42, use random seed for < 0)
-  --sampling-method                        sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
-                                           tcd, res_multistep, res_2s, er_sde] (default: euler for Flux/SD3/Wan, euler_a
-                                           otherwise)
-  --high-noise-sampling-method             (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
-                                           ddim_trailing, tcd, res_multistep, res_2s, er_sde] default: euler for Flux/SD3/Wan,
-                                           euler_a otherwise
-  --scheduler                              denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple,
-                                           kl_optimal, lcm, bong_tangent], default: discrete
-  --sigmas                                 custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0").
-  --skip-layers                            layers to skip for SLG steps (default: [7,8,9])
-  --high-noise-skip-layers                 (high noise) layers to skip for SLG steps (default: [7,8,9])
-  -r, --ref-image                          reference image for Flux Kontext models (can be used multiple times)
-  --cache-mode                             caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level),
-                                           'spectrum' (UNET/DiT Chebyshev+Taylor forecasting)
-  --cache-option                           named cache params (key=value format, comma-separated). easycache/ucache:
-                                           threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=;
-                                           spectrum: w=,m=,lam=,window=,flex=,warmup=,stop=. Examples:
-                                           "threshold=0.25" or "threshold=1.5,reset=0" or "w=0.4,window=2"
-  --scm-mask                               SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
-  --scm-policy                             SCM policy: 'dynamic' (default) or 'static'
+```bash
+./bin/sd-cli -h
 ```

 Metadata mode inspects PNG/JPEG container metadata without loading any model:
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@ -169,8 +169,9 @@ struct SDCliParams {
            return 1;
        };

-        auto on_help_arg = [&](int argc, const char** argv, int index) {
+        auto on_help_arg = [&](int argc, const char** argv, int index, bool& valid) {
            normal_exit = true;
+            valid       = true;
            return -1;
        };

@ -278,7 +279,9 @@ void parse_args(int argc, const char** argv, SDCliParams& cli_params, SDContextP
    bool valid = cli_params.resolve_and_validate();
    if (valid && cli_params.mode != METADATA) {
        valid = ctx_params.resolve_and_validate(cli_params.mode) &&
-                gen_params.resolve_and_validate(cli_params.mode, ctx_params.lora_model_dir);
+                gen_params.resolve_and_validate(cli_params.mode,
+                                                ctx_params.lora_model_dir,
+                                                ctx_params.hires_upscalers_dir);
    }

    if (!valid) {
@ -383,11 +386,32 @@ std::string format_frame_idx(std::string pattern, int frame_idx) {
    return result;
 }

+// Builds the path of the ".wav" sidecar written alongside cli_params.output_path.
+// A trailing extension is removed only when encoded_image_format_from_path() reports
+// JPEG/PNG/WEBP for the path or the extension is ".avi"/".webm" (compared
+// case-insensitively); any other extension is kept. ".wav" is then appended.
+static fs::path get_video_audio_sidecar_path(const SDCliParams& cli_params) {
+    const fs::path out_path = cli_params.output_path;
+    const fs::path ext      = out_path.has_extension() ? out_path.extension() : fs::path{};
+
+    std::string ext_lower = ext.string();
+    // ::tolower has undefined behaviour for negative char values (bytes >= 0x80 where
+    // char is signed), so each character is passed through unsigned char first.
+    std::transform(ext_lower.begin(), ext_lower.end(), ext_lower.begin(),
+                   [](unsigned char c) { return static_cast<char>(::tolower(c)); });
+
+    const EncodedImageFormat output_format = encoded_image_format_from_path(out_path.string());
+    const bool known_media_ext             = output_format == EncodedImageFormat::JPEG ||
+                                             output_format == EncodedImageFormat::PNG ||
+                                             output_format == EncodedImageFormat::WEBP ||
+                                             ext_lower == ".avi" ||
+                                             ext_lower == ".webm";
+
+    fs::path base_path = out_path;
+    if (!ext.empty() && known_media_ext) {
+        base_path.replace_extension();
+    }
+    base_path += ".wav";
+    return base_path;
+}
+
 bool save_results(const SDCliParams& cli_params,
                  const SDContextParams& ctx_params,
                  const SDGenerationParams& gen_params,
                  sd_image_t* results,
-                  int num_results) {
+                  int num_results,
+                  const sd_audio_t* generated_audio = nullptr) {
    if (results == nullptr || num_results <= 0) {
        return false;
    }
@ -431,14 +455,30 @@ bool save_results(const SDCliParams& cli_params,
        if (!img.data)
            return false;

+        const int64_t metadata_seed = cli_params.mode == VID_GEN ? gen_params.seed : gen_params.seed + idx;
        std::string params          = gen_params.embed_image_metadata
-                                 ? get_image_params(ctx_params, gen_params, gen_params.seed + idx)
+                                          ? get_image_params(ctx_params, gen_params, metadata_seed, cli_params.mode)
                                          : "";
        const bool ok               = write_image_to_file(path.string(), img.data, img.width, img.height, img.channel, params, 90);
        LOG_INFO("save result image %d to '%s' (%s)", idx, path.string().c_str(), ok ? "success" : "failure");
        return ok;
    };

+    auto write_audio_sidecar = [&](const fs::path& wav_path) {
+        if (generated_audio == nullptr) {
+            return;
+        }
+        if (write_wav_to_file(wav_path.string(),
+                              generated_audio->data,
+                              generated_audio->sample_count,
+                              generated_audio->channels,
+                              generated_audio->sample_rate)) {
+            LOG_INFO("save result audio to '%s'", wav_path.string().c_str());
+        } else {
+            LOG_WARN("failed to save result audio to '%s'", wav_path.string().c_str());
+        }
+    };
+
    int sucessful_reults = 0;

    if (std::regex_search(cli_params.output_path, format_specifier_regex)) {
@ -462,8 +502,16 @@ bool save_results(const SDCliParams& cli_params,
            ext = ".avi";
        fs::path video_path = base_path;
        video_path += ext;
-        if (create_video_from_sd_images(video_path.string().c_str(), results, num_results, gen_params.fps) == 0) {
+        std::string final_ext_lower = ext.string();
+        std::transform(final_ext_lower.begin(), final_ext_lower.end(), final_ext_lower.begin(), ::tolower);
+        const bool mux_audio = generated_audio != nullptr && (final_ext_lower == ".avi" || final_ext_lower == ".webm");
+        if (create_video_from_sd_images(video_path.string().c_str(), results, num_results, gen_params.fps, 90, mux_audio ? generated_audio : nullptr) == 0) {
            LOG_INFO("save result video to '%s'", video_path.string().c_str());
+            if (generated_audio != nullptr && !mux_audio) {
+                fs::path wav_path = video_path;
+                wav_path.replace_extension(".wav");
+                write_audio_sidecar(wav_path);
+            }
            return true;
        } else {
            LOG_ERROR("Failed to save result video to '%s'", video_path.string().c_str());
@ -485,6 +533,9 @@ bool save_results(const SDCliParams& cli_params,
        }
    }
    LOG_INFO("%d/%d images saved", sucessful_reults, num_results);
+    if (generated_audio != nullptr) {
+        write_audio_sidecar(get_video_audio_sidecar_path(cli_params));
+    }
    return sucessful_reults != 0;
 }

@ -572,8 +623,6 @@ int main(int argc, const char* argv[]) {
        }
    }

-    bool vae_decode_only = true;
-
    auto load_image_and_update_size = [&](const std::string& path,
                                          SDImageOwner& image,
                                          bool resize_image    = true,
@ -595,21 +644,18 @@ int main(int argc, const char* argv[]) {
    };

    if (gen_params.init_image_path.size() > 0) {
-        vae_decode_only = false;
        if (!load_image_and_update_size(gen_params.init_image_path, gen_params.init_image)) {
            return 1;
        }
    }

    if (gen_params.end_image_path.size() > 0) {
-        vae_decode_only = false;
        if (!load_image_and_update_size(gen_params.end_image_path, gen_params.end_image)) {
            return 1;
        }
    }

    if (gen_params.ref_image_paths.size() > 0) {
-        vae_decode_only = false;
        gen_params.ref_images.clear();
        for (auto& path : gen_params.ref_image_paths) {
            SDImageOwner ref_image({0, 0, 3, nullptr});
@ -684,14 +730,11 @@ int main(int argc, const char* argv[]) {
        }
    }

-    if (cli_params.mode == VID_GEN) {
-        vae_decode_only = false;
-    }
-
-    sd_ctx_params_t sd_ctx_params = ctx_params.to_sd_ctx_params_t(vae_decode_only, true, cli_params.taesd_preview);
+    sd_ctx_params_t sd_ctx_params = ctx_params.to_sd_ctx_params_t(cli_params.taesd_preview);

    SDImageVec results;
    int num_results             = 0;
+    sd_audio_t* generated_audio = nullptr;

    if (cli_params.mode == UPSCALE) {
        num_results = 1;
@ -723,7 +766,10 @@ int main(int argc, const char* argv[]) {
            results.adopt(generate_image(sd_ctx.get(), &img_gen_params), num_results);
        } else if (cli_params.mode == VID_GEN) {
            sd_vid_gen_params_t vid_gen_params = gen_params.to_sd_vid_gen_params_t();
-            sd_image_t* generated_video        = generate_video(sd_ctx.get(), &vid_gen_params, &num_results);
+            sd_image_t* generated_video        = nullptr;
+            if (!generate_video(sd_ctx.get(), &vid_gen_params, &generated_video, &num_results, &generated_audio)) {
+                generated_video = nullptr;
+            }
            results.adopt(generated_video, num_results);
        }

@ -736,10 +782,11 @@ int main(int argc, const char* argv[]) {
    int upscale_factor = 4;  // unused for RealESRGAN_x4plus_anime_6B.pth
    if (ctx_params.esrgan_path.size() > 0 && gen_params.upscale_repeats > 0) {
        UpscalerCtxPtr upscaler_ctx(new_upscaler_ctx(ctx_params.esrgan_path.c_str(),
-                                                     ctx_params.offload_params_to_cpu,
                                                     ctx_params.diffusion_conv_direct,
                                                     ctx_params.n_threads,
-                                                     gen_params.upscale_tile_size));
+                                                     gen_params.upscale_tile_size,
+                                                     sd_ctx_params.backend,
+                                                     sd_ctx_params.params_backend));

        if (upscaler_ctx == nullptr) {
            LOG_ERROR("new_upscaler_ctx failed");
@ -763,9 +810,12 @@ int main(int argc, const char* argv[]) {
        }
    }

-    if (!save_results(cli_params, ctx_params, gen_params, results.data(), num_results)) {
+    if (!save_results(cli_params, ctx_params, gen_params, results.data(), num_results, generated_audio)) {
+        free_sd_audio(generated_audio);
        return 1;
    }

+    free_sd_audio(generated_audio);
+
    return 0;
 }
--- a/examples/common/common.cpp
+++ b/examples/common/common.cpp
--- a/examples/common/common.h
+++ b/examples/common/common.h
@ -56,11 +56,42 @@ struct BoolOption {
    bool* target;
 };

+// Callback holder for manually parsed options. The stored callable always has the
+// four-argument shape (argc, argv, index, valid); a callable that only accepts
+// (argc, argv, index) is wrapped in an adapter that ignores `valid`.
+struct ManualFunction {
+    std::function<int(int, const char**, int, bool&)> _func;
+
+    ManualFunction() = default;
+
+    // Takes an already type-erased four-argument callback.
+    ManualFunction(std::function<int(int, const char**, int, bool&)> func)
+        : _func(std::move(func)) {
+    }
+
+    // Takes any other callable; make_function() decides whether it needs adapting.
+    template <typename F>
+    ManualFunction(F func)
+        : _func(make_function(func)) {
+    }
+
+    // Forwards the call to the stored callback and returns its result.
+    int operator()(int argc, const char** argv, int index, bool& valid) const {
+        return _func(argc, argv, index, valid);
+    }
+
+private:
+    // Returns `func` unchanged when it is invocable with the four-argument form;
+    // otherwise returns a wrapper that calls it with (argc, argv, index) only.
+    template <typename F>
+    static std::function<int(int, const char**, int, bool&)> make_function(F func) {
+        if constexpr (!std::is_invocable_v<F, int, const char**, int, bool&>) {
+            return [func](int argc, const char** argv, int index, bool&) {
+                return func(argc, argv, index);
+            };
+        } else {
+            return func;
+        }
+    }
+};
+
 struct ManualOption {
    std::string short_name;
    std::string long_name;
    std::string desc;
-    std::function<int(int argc, const char** argv, int index)> cb;
+    ManualFunction cb;
 };

 struct ArgOptions {
@ -92,7 +123,11 @@ struct SDContextParams {
    std::string llm_vision_path;
    std::string diffusion_model_path;
    std::string high_noise_diffusion_model_path;
+    std::string uncond_diffusion_model_path;
+    std::string embeddings_connectors_path;
    std::string vae_path;
+    std::string vae_format = "auto";
+    std::string audio_vae_path;
    std::string taesd_path;
    std::string esrgan_path;
    std::string control_net_path;
@ -101,6 +136,7 @@ struct SDContextParams {
    sd_type_t wtype = SD_TYPE_COUNT;
    std::string tensor_type_rules;
    std::string lora_model_dir = ".";
+    std::string hires_upscalers_dir;

    std::map<std::string, std::string> embedding_map;
    std::vector<sd_embedding_t> embedding_vec;
@ -108,6 +144,13 @@ struct SDContextParams {
    rng_type_t rng_type         = CUDA_RNG;
    rng_type_t sampler_rng_type = RNG_TYPE_COUNT;
    bool offload_params_to_cpu  = false;
+    std::string max_vram        = "0";
+    bool stream_layers          = false;
+    std::string backend;
+    std::string params_backend;
+    std::string rpc_servers;
+    std::string effective_backend;
+    std::string effective_params_backend;
    bool enable_mmap           = false;
    bool control_net_cpu       = false;
    bool clip_on_cpu           = false;
@ -135,11 +178,12 @@ struct SDContextParams {
    float flow_shift = INFINITY;
    ArgOptions get_options();
    void build_embedding_map();
+    void prepare_backend_assignments();
    bool resolve(SDMode mode);
    bool validate(SDMode mode);
    bool resolve_and_validate(SDMode mode);
    std::string to_string() const;
-    sd_ctx_params_t to_sd_ctx_params_t(bool vae_decode_only, bool free_params_immediately, bool taesd_preview);
+    sd_ctx_params_t to_sd_ctx_params_t(bool taesd_preview);
 };

 struct SDGenerationParams {
@ -166,6 +210,8 @@ struct SDGenerationParams {

    sd_sample_params_t sample_params;
    sd_sample_params_t high_noise_sample_params;
+    std::string extra_sample_args;
+    std::string high_noise_extra_sample_args;
    std::vector<int> skip_layers            = {7, 8, 9};
    std::vector<int> high_noise_skip_layers = {7, 8, 9};

@ -181,7 +227,8 @@ struct SDGenerationParams {
    int video_frames                     = 1;
    int fps                              = 16;
    float vace_strength                  = 1.f;
-    sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f};
+    sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr};
+    std::string extra_tiling_args;

    std::string pm_id_images_dir;
    std::string pm_id_embed_path;
@ -190,12 +237,24 @@ struct SDGenerationParams {
    int upscale_repeats   = 1;
    int upscale_tile_size = 128;

+    bool hires_enabled         = false;
+    std::string hires_upscaler = "Latent";
+    std::string hires_upscaler_model_path;
+    float hires_scale              = 2.f;
+    int hires_width                = 0;
+    int hires_height               = 0;
+    int hires_steps                = 0;
+    float hires_denoising_strength = 0.7f;
+    int hires_upscale_tile_size    = 128;
+    std::vector<float> hires_custom_sigmas;
+
    std::map<std::string, float> lora_map;
    std::map<std::string, float> high_noise_lora_map;

    // Derived and normalized fields.
    std::string prompt_with_lora;  // for metadata record only
    std::vector<sd_lora_t> lora_vec;
+    sd_hires_upscaler_t resolved_hires_upscaler;

    // Owned execution payload.
    SDImageOwner init_image;
@ -225,15 +284,25 @@ struct SDGenerationParams {
    void set_width_and_height_if_unset(int w, int h);
    int get_resolved_width() const;
    int get_resolved_height() const;
-    bool resolve(const std::string& lora_model_dir, bool strict = false);
+    bool resolve(const std::string& lora_model_dir, const std::string& hires_upscalers_dir, bool strict = false);
    bool validate(SDMode mode);
-    bool resolve_and_validate(SDMode mode, const std::string& lora_model_dir, bool strict = false);
+    bool resolve_and_validate(SDMode mode,
+                              const std::string& lora_model_dir,
+                              const std::string& hires_upscalers_dir,
+                              bool strict = false);
    sd_img_gen_params_t to_sd_img_gen_params_t();
    sd_vid_gen_params_t to_sd_vid_gen_params_t();
    std::string to_string() const;
 };

 std::string version_string();
-std::string get_image_params(const SDContextParams& ctx_params, const SDGenerationParams& gen_params, int64_t seed);
+std::string build_sdcpp_image_metadata_json(const SDContextParams& ctx_params,
+                                            const SDGenerationParams& gen_params,
+                                            int64_t seed,
+                                            SDMode mode = IMG_GEN);
+std::string get_image_params(const SDContextParams& ctx_params,
+                             const SDGenerationParams& gen_params,
+                             int64_t seed,
+                             SDMode mode = IMG_GEN);

 #endif  // __EXAMPLES_COMMON_COMMON_H__
--- a/examples/common/media_io.cpp
+++ b/examples/common/media_io.cpp
@ -95,6 +95,57 @@ using WebPMuxPtr         = std::unique_ptr<WebPMux, WebPMuxDeleter>;
 using WebPAnimEncoderPtr = std::unique_ptr<WebPAnimEncoder, WebPAnimEncoderDeleter>;
 #endif

+#ifdef SD_USE_WEBM
+// mkvmuxer::IMkvWriter implementation that collects the written bytes in a growable
+// in-memory buffer and supports repositioning the write cursor.
+class MemoryMkvWriter : public mkvmuxer::IMkvWriter {
+public:
+    // Copies `len` bytes from `buf` to the current cursor, enlarging the buffer when the
+    // write extends past its end, then advances the cursor.
+    // Returns 0 on success, -1 when `buf` is null but `len` is non-zero.
+    mkvmuxer::int32 Write(const void* buf, mkvmuxer::uint32 len) override {
+        if (len > 0 && buf == nullptr) {
+            return -1;
+        }
+        const size_t write_end = position_ + static_cast<size_t>(len);
+        if (data_.size() < write_end) {
+            data_.resize(write_end);
+        }
+        if (len != 0) {
+            memcpy(&data_[position_], buf, len);
+        }
+        position_ = write_end;
+        return 0;
+    }
+
+    // Current cursor offset from the start of the buffer.
+    mkvmuxer::int64 Position() const override {
+        return static_cast<mkvmuxer::int64>(position_);
+    }
+
+    // Moves the cursor to `position`, growing the buffer if the target lies beyond its
+    // end. Returns 0 on success, -1 for a negative position.
+    mkvmuxer::int32 Position(mkvmuxer::int64 position) override {
+        if (position < 0) {
+            return -1;
+        }
+        const size_t new_pos = static_cast<size_t>(position);
+        if (data_.size() < new_pos) {
+            data_.resize(new_pos);
+        }
+        position_ = new_pos;
+        return 0;
+    }
+
+    // The cursor can always be repositioned.
+    bool Seekable() const override {
+        return true;
+    }
+
+    // Element-start notifications are ignored.
+    void ElementStartNotify(mkvmuxer::uint64, mkvmuxer::int64) override {
+    }
+
+    // Read-only access to everything written so far.
+    const std::vector<uint8_t>& data() const {
+        return data_;
+    }
+
+private:
+    std::vector<uint8_t> data_;
+    size_t position_ = 0;
+};
+#endif
+
 bool read_binary_file_bytes(const char* path, std::vector<uint8_t>& data) {
    std::ifstream fin(fs::path(path), std::ios::binary);
    if (!fin) {
@ -562,6 +613,13 @@ typedef struct {
    uint32_t size;
 } avi_index_entry;

+// One idx1 index record for a chunk stored in the AVI "movi" list. Each record
+// is serialized as 16 bytes: fourcc, then flags, offset and size as u32 LE.
+typedef struct {
+    char fourcc[4];   // chunk id such as "00dc" (video) or "01wb" (audio); exactly 4 chars, no NUL
+    uint32_t flags;   // idx1 flags word: 0x10 is stored for video frames, 0 for the audio chunk
+    uint32_t offset;  // byte position of the chunk header inside the AVI buffer
+    uint32_t size;    // payload size in bytes, excluding the 8-byte chunk header and any pad byte
+} avi_chunk_index_entry;
+
 void write_u32_le(FILE* f, uint32_t val) {
    fwrite(&val, 4, 1, f);
 }
@ -570,6 +628,59 @@ void write_u16_le(FILE* f, uint16_t val) {
    fwrite(&val, 2, 1, f);
 }

+// Appends `val` to `data` as four bytes, least-significant byte first.
+void write_u32_le(std::vector<uint8_t>& data, uint32_t val) {
+    for (unsigned shift = 0; shift < 32; shift += 8) {
+        data.push_back(static_cast<uint8_t>((val >> shift) & 0xFF));
+    }
+}
+
+// Appends `val` to `data` as two bytes, least-significant byte first.
+void write_u16_le(std::vector<uint8_t>& data, uint16_t val) {
+    const uint8_t low_byte  = static_cast<uint8_t>(val & 0xFF);
+    const uint8_t high_byte = static_cast<uint8_t>((val >> 8) & 0xFF);
+    data.insert(data.end(), {low_byte, high_byte});
+}
+
+// Overwrites the four bytes at `offset` in `data` with `val` in little-endian
+// order. Does nothing when the four bytes do not fit entirely inside `data`.
+void patch_u32_le(std::vector<uint8_t>& data, size_t offset, uint32_t val) {
+    // Compare against size() - 4 instead of computing offset + 4: the sum
+    // wraps around for offsets near SIZE_MAX and would let the write through.
+    if (data.size() < 4 || offset > data.size() - 4) {
+        return;
+    }
+    data[offset + 0] = static_cast<uint8_t>(val & 0xFF);
+    data[offset + 1] = static_cast<uint8_t>((val >> 8) & 0xFF);
+    data[offset + 2] = static_cast<uint8_t>((val >> 16) & 0xFF);
+    data[offset + 3] = static_cast<uint8_t>((val >> 24) & 0xFF);
+}
+
+// Appends the first four characters of `fourcc` to `data`; no terminator is
+// written and `fourcc` must point to at least four readable bytes.
+void write_fourcc(std::vector<uint8_t>& data, const char* fourcc) {
+    for (size_t i = 0; i < 4; ++i) {
+        data.push_back(static_cast<uint8_t>(fourcc[i]));
+    }
+}
+
+// Converts the interleaved float samples in `audio` to signed 16-bit PCM and
+// returns them as raw bytes, two per sample, least-significant byte first.
+// Returns an empty vector when `audio` or its data pointer is null, or when
+// the sample count, channel count or sample rate is zero.
+static std::vector<uint8_t> audio_to_pcm16_bytes(const sd_audio_t* audio) {
+    if (audio == nullptr || audio->data == nullptr || audio->sample_count == 0 || audio->channels == 0 || audio->sample_rate == 0) {
+        return {};
+    }
+
+    // sample_count is per channel, so the interleaved buffer holds
+    // sample_count * channels values.
+    const size_t pcm_samples = static_cast<size_t>(audio->sample_count) * static_cast<size_t>(audio->channels);
+    std::vector<uint8_t> bytes(pcm_samples * sizeof(int16_t));
+    for (size_t i = 0; i < pcm_samples; ++i) {
+        // std::clamp passes NaN through and std::lrint(NaN) has no defined
+        // result, so NaN samples are written as silence.
+        const float raw    = audio->data[i];
+        const float sample = std::isnan(raw) ? 0.0f : std::clamp(raw, -1.0f, 1.0f);
+        const auto value   = static_cast<uint16_t>(static_cast<int16_t>(std::lrint(sample * 32767.0f)));
+        // Store the bytes explicitly instead of writing through an int16_t*
+        // alias: the containers consuming this data expect little-endian PCM
+        // regardless of the host byte order.
+        bytes[2 * i + 0] = static_cast<uint8_t>(value & 0xFF);
+        bytes[2 * i + 1] = static_cast<uint8_t>(value >> 8);
+    }
+    return bytes;
+}
+
+// Maps video frame `frame_idx` (0-based, out of `num_frames`) to the half-open
+// range [first, second) of per-channel audio sample indices that belong to it.
+// The samples are split proportionally across the frames and the last frame
+// always ends at sample_count, so the ranges cover every sample exactly once.
+// Returns {0, 0} when `audio` is null, `fps` or `num_frames` is not positive,
+// or `frame_idx` is outside [0, num_frames).
+static std::pair<uint64_t, uint64_t> audio_sample_range_for_video_frame(const sd_audio_t* audio, int frame_idx, int num_frames, int fps) {
+    // `fps` is only validated; it does not enter the arithmetic below.
+    if (audio == nullptr || fps <= 0 || num_frames <= 0) {
+        return {0, 0};
+    }
+    // A negative index would be converted from a negative floating-point value
+    // to uint64_t (undefined), and an index past the last frame would produce
+    // a range beyond sample_count; both are treated as "no samples".
+    if (frame_idx < 0 || frame_idx >= num_frames) {
+        return {0, 0};
+    }
+    const uint64_t total = audio->sample_count;
+    // First sample index of a given frame; long double keeps the product exact
+    // for large sample counts.
+    const auto boundary = [total, num_frames](int frame) {
+        return static_cast<uint64_t>((static_cast<long double>(frame) * total) / num_frames);
+    };
+    const uint64_t start = boundary(frame_idx);
+    const uint64_t end   = frame_idx + 1 == num_frames ? total : boundary(frame_idx + 1);
+    return {start, std::max(start, end)};
+}
+
 EncodedImageFormat encoded_image_format_from_path(const std::string& path) {
    std::string ext = fs::path(path).extension().string();
    std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
@ -699,97 +810,144 @@ uint8_t* load_image_from_memory(const char* image_bytes,
    return load_image_common(true, image_bytes, len, width, height, expected_width, expected_height, expected_channel);
 }

-int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) {
+std::vector<uint8_t> create_mjpg_avi_from_sd_images_to_vector(sd_image_t* images, int num_images, int fps, int quality, const sd_audio_t* audio) {
    if (num_images == 0) {
        fprintf(stderr, "Error: Image array is empty.\n");
-        return -1;
+        return {};
    }

-    FilePtr file(fopen(filename, "wb"));
-    if (!file) {
-        perror("Error opening file for writing");
-        return -1;
-    }
-    FILE* f = file.get();
-
    uint32_t width    = images[0].width;
    uint32_t height   = images[0].height;
    uint32_t channels = images[0].channel;
    if (channels != 3 && channels != 4) {
        fprintf(stderr, "Error: Unsupported channel count: %u\n", channels);
-        return -1;
+        return {};
    }

-    fwrite("RIFF", 4, 1, f);
-    long riff_size_pos = ftell(f);
-    write_u32_le(f, 0);
-    fwrite("AVI ", 4, 1, f);
+    // stb_image_write changes JPEG sampling behavior above quality 90.
+    // MJPG AVI playback is more compatible when we keep the encoder on the
+    // <= 90 path.
+    const int mjpg_quality               = std::clamp(quality, 1, 90);
+    const bool has_audio                 = audio != nullptr && audio->data != nullptr && audio->sample_count > 0 && audio->channels > 0 && audio->sample_rate > 0;
+    const std::vector<uint8_t> audio_pcm = audio_to_pcm16_bytes(audio);
+    const uint16_t audio_bits_per_sample = 16;
+    const uint16_t audio_block_align     = has_audio ? static_cast<uint16_t>(audio->channels * (audio_bits_per_sample / 8)) : 0;
+    const uint32_t audio_byte_rate       = has_audio ? static_cast<uint32_t>(audio->sample_rate * audio_block_align) : 0;
+    const uint32_t audio_data_size       = has_audio ? static_cast<uint32_t>(audio_pcm.size()) : 0;

-    fwrite("LIST", 4, 1, f);
-    write_u32_le(f, 4 + 8 + 56 + 8 + 4 + 8 + 56 + 8 + 40);
-    fwrite("hdrl", 4, 1, f);
+    std::vector<uint8_t> avi_data;
+    avi_data.reserve(static_cast<size_t>(num_images) * 1024);

-    fwrite("avih", 4, 1, f);
-    write_u32_le(f, 56);
-    write_u32_le(f, 1000000 / fps);
-    write_u32_le(f, 0);
-    write_u32_le(f, 0);
-    write_u32_le(f, 0x110);
-    write_u32_le(f, num_images);
-    write_u32_le(f, 0);
-    write_u32_le(f, 1);
-    write_u32_le(f, width * height * 3);
-    write_u32_le(f, width);
-    write_u32_le(f, height);
-    write_u32_le(f, 0);
-    write_u32_le(f, 0);
-    write_u32_le(f, 0);
-    write_u32_le(f, 0);
+    write_fourcc(avi_data, "RIFF");
+    const size_t riff_size_pos = avi_data.size();
+    write_u32_le(avi_data, 0);
+    write_fourcc(avi_data, "AVI ");

-    fwrite("LIST", 4, 1, f);
-    write_u32_le(f, 4 + 8 + 56 + 8 + 40);
-    fwrite("strl", 4, 1, f);
+    write_fourcc(avi_data, "LIST");
+    uint32_t hdrl_size = 4 + 8 + 56 + 8 + 4 + 8 + 56 + 8 + 40;
+    if (has_audio) {
+        hdrl_size += 8 + (4 + 8 + 56 + 8 + 16);
+    }
+    write_u32_le(avi_data, hdrl_size);
+    write_fourcc(avi_data, "hdrl");

-    fwrite("strh", 4, 1, f);
-    write_u32_le(f, 56);
-    fwrite("vids", 4, 1, f);
-    fwrite("MJPG", 4, 1, f);
-    write_u32_le(f, 0);
-    write_u16_le(f, 0);
-    write_u16_le(f, 0);
-    write_u32_le(f, 0);
-    write_u32_le(f, 1);
-    write_u32_le(f, fps);
-    write_u32_le(f, 0);
-    write_u32_le(f, num_images);
-    write_u32_le(f, width * height * 3);
-    write_u32_le(f, (uint32_t)-1);
-    write_u32_le(f, 0);
-    write_u16_le(f, 0);
-    write_u16_le(f, 0);
-    write_u16_le(f, 0);
-    write_u16_le(f, 0);
+    write_fourcc(avi_data, "avih");
+    write_u32_le(avi_data, 56);
+    write_u32_le(avi_data, 1000000 / fps);
+    write_u32_le(avi_data, 0);
+    write_u32_le(avi_data, 0);
+    write_u32_le(avi_data, 0x110);
+    write_u32_le(avi_data, num_images);
+    write_u32_le(avi_data, 0);
+    write_u32_le(avi_data, has_audio ? 2 : 1);
+    write_u32_le(avi_data, width * height * 3);
+    write_u32_le(avi_data, width);
+    write_u32_le(avi_data, height);
+    write_u32_le(avi_data, 0);
+    write_u32_le(avi_data, 0);
+    write_u32_le(avi_data, 0);
+    write_u32_le(avi_data, 0);

-    fwrite("strf", 4, 1, f);
-    write_u32_le(f, 40);
-    write_u32_le(f, 40);
-    write_u32_le(f, width);
-    write_u32_le(f, height);
-    write_u16_le(f, 1);
-    write_u16_le(f, 24);
-    fwrite("MJPG", 4, 1, f);
-    write_u32_le(f, width * height * 3);
-    write_u32_le(f, 0);
-    write_u32_le(f, 0);
-    write_u32_le(f, 0);
-    write_u32_le(f, 0);
+    write_fourcc(avi_data, "LIST");
+    write_u32_le(avi_data, 4 + 8 + 56 + 8 + 40);
+    write_fourcc(avi_data, "strl");

-    fwrite("LIST", 4, 1, f);
-    long movi_size_pos = ftell(f);
-    write_u32_le(f, 0);
-    fwrite("movi", 4, 1, f);
+    write_fourcc(avi_data, "strh");
+    write_u32_le(avi_data, 56);
+    write_fourcc(avi_data, "vids");
+    write_fourcc(avi_data, "MJPG");
+    write_u32_le(avi_data, 0);
+    write_u16_le(avi_data, 0);
+    write_u16_le(avi_data, 0);
+    write_u32_le(avi_data, 0);
+    write_u32_le(avi_data, 1);
+    write_u32_le(avi_data, fps);
+    write_u32_le(avi_data, 0);
+    write_u32_le(avi_data, num_images);
+    write_u32_le(avi_data, width * height * 3);
+    write_u32_le(avi_data, static_cast<uint32_t>(-1));
+    write_u32_le(avi_data, 0);
+    write_u16_le(avi_data, 0);
+    write_u16_le(avi_data, 0);
+    write_u16_le(avi_data, 0);
+    write_u16_le(avi_data, 0);

-    std::vector<avi_index_entry> index(static_cast<size_t>(num_images));
+    write_fourcc(avi_data, "strf");
+    write_u32_le(avi_data, 40);
+    write_u32_le(avi_data, 40);
+    write_u32_le(avi_data, width);
+    write_u32_le(avi_data, height);
+    write_u16_le(avi_data, 1);
+    write_u16_le(avi_data, 24);
+    write_fourcc(avi_data, "MJPG");
+    write_u32_le(avi_data, width * height * 3);
+    write_u32_le(avi_data, 0);
+    write_u32_le(avi_data, 0);
+    write_u32_le(avi_data, 0);
+    write_u32_le(avi_data, 0);
+
+    if (has_audio) {
+        write_fourcc(avi_data, "LIST");
+        write_u32_le(avi_data, 4 + 8 + 56 + 8 + 16);
+        write_fourcc(avi_data, "strl");
+
+        write_fourcc(avi_data, "strh");
+        write_u32_le(avi_data, 56);
+        write_fourcc(avi_data, "auds");
+        write_u32_le(avi_data, 0);
+        write_u32_le(avi_data, 0);
+        write_u16_le(avi_data, 0);
+        write_u16_le(avi_data, 0);
+        write_u32_le(avi_data, 0);
+        write_u32_le(avi_data, audio_block_align);
+        write_u32_le(avi_data, audio_byte_rate);
+        write_u32_le(avi_data, 0);
+        write_u32_le(avi_data, static_cast<uint32_t>(audio->sample_count));
+        write_u32_le(avi_data, audio_data_size);
+        write_u32_le(avi_data, static_cast<uint32_t>(-1));
+        write_u32_le(avi_data, audio_block_align);
+        write_u16_le(avi_data, 0);
+        write_u16_le(avi_data, 0);
+        write_u16_le(avi_data, 0);
+        write_u16_le(avi_data, 0);
+
+        write_fourcc(avi_data, "strf");
+        write_u32_le(avi_data, 16);
+        write_u16_le(avi_data, 1);
+        write_u16_le(avi_data, static_cast<uint16_t>(audio->channels));
+        write_u32_le(avi_data, audio->sample_rate);
+        write_u32_le(avi_data, audio_byte_rate);
+        write_u16_le(avi_data, audio_block_align);
+        write_u16_le(avi_data, audio_bits_per_sample);
+    }
+
+    write_fourcc(avi_data, "LIST");
+    const size_t movi_size_pos = avi_data.size();
+    write_u32_le(avi_data, 0);
+    write_fourcc(avi_data, "movi");
+
+    std::vector<avi_chunk_index_entry> index;
+    index.reserve(static_cast<size_t>(num_images) + (has_audio ? 1 : 0));
    std::vector<uint8_t> jpeg_data;

    for (int i = 0; i < num_images; i++) {
@ -801,55 +959,80 @@ int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int
            buffer->insert(buffer->end(), src, src + size);
        };

-        if (!stbi_write_jpg_to_func(write_to_buf, &jpeg_data, images[i].width, images[i].height, channels, images[i].data, quality)) {
+        if (!stbi_write_jpg_to_func(write_to_buf, &jpeg_data, images[i].width, images[i].height, channels, images[i].data, mjpg_quality)) {
            fprintf(stderr, "Error: Failed to encode JPEG frame.\n");
-            return -1;
+            return {};
        }

-        fwrite("00dc", 4, 1, f);
-        write_u32_le(f, (uint32_t)jpeg_data.size());
-        index[i].offset = ftell(f) - 8;
-        index[i].size   = (uint32_t)jpeg_data.size();
-        fwrite(jpeg_data.data(), 1, jpeg_data.size(), f);
+        avi_chunk_index_entry video_entry = {};
+        memcpy(video_entry.fourcc, "00dc", 4);
+        video_entry.flags  = 0x10;
+        video_entry.offset = static_cast<uint32_t>(avi_data.size());
+        write_fourcc(avi_data, "00dc");
+        write_u32_le(avi_data, static_cast<uint32_t>(jpeg_data.size()));
+        video_entry.size = static_cast<uint32_t>(jpeg_data.size());
+        avi_data.insert(avi_data.end(), jpeg_data.begin(), jpeg_data.end());
+        index.push_back(video_entry);

        if (jpeg_data.size() % 2) {
-            fputc(0, f);
+            avi_data.push_back(0);
        }
    }

-    long cur_pos   = ftell(f);
-    long movi_size = cur_pos - movi_size_pos - 4;
-    fseek(f, movi_size_pos, SEEK_SET);
-    write_u32_le(f, movi_size);
-    fseek(f, cur_pos, SEEK_SET);
-
-    fwrite("idx1", 4, 1, f);
-    write_u32_le(f, num_images * 16);
-    for (int i = 0; i < num_images; i++) {
-        fwrite("00dc", 4, 1, f);
-        write_u32_le(f, 0x10);
-        write_u32_le(f, index[i].offset);
-        write_u32_le(f, index[i].size);
+    if (has_audio && !audio_pcm.empty()) {
+        avi_chunk_index_entry audio_entry = {};
+        memcpy(audio_entry.fourcc, "01wb", 4);
+        audio_entry.flags  = 0;
+        audio_entry.offset = static_cast<uint32_t>(avi_data.size());
+        audio_entry.size   = static_cast<uint32_t>(audio_pcm.size());
+        write_fourcc(avi_data, "01wb");
+        write_u32_le(avi_data, static_cast<uint32_t>(audio_pcm.size()));
+        avi_data.insert(avi_data.end(), audio_pcm.begin(), audio_pcm.end());
+        index.push_back(audio_entry);
+        if (audio_pcm.size() % 2 != 0) {
+            avi_data.push_back(0);
+        }
    }

-    cur_pos        = ftell(f);
-    long file_size = cur_pos - riff_size_pos - 4;
-    fseek(f, riff_size_pos, SEEK_SET);
-    write_u32_le(f, file_size);
-    fseek(f, cur_pos, SEEK_SET);
+    const size_t movi_size = avi_data.size() - movi_size_pos - 4;
+    patch_u32_le(avi_data, movi_size_pos, static_cast<uint32_t>(movi_size));

+    write_fourcc(avi_data, "idx1");
+    write_u32_le(avi_data, static_cast<uint32_t>(index.size() * 16));
+    for (const auto& entry : index) {
+        write_fourcc(avi_data, entry.fourcc);
+        write_u32_le(avi_data, entry.flags);
+        write_u32_le(avi_data, entry.offset);
+        write_u32_le(avi_data, entry.size);
+    }
+
+    const size_t file_size = avi_data.size() - riff_size_pos - 4;
+    patch_u32_le(avi_data, riff_size_pos, static_cast<uint32_t>(file_size));
+
+    return avi_data;
+}
+
+// File-writing front end for create_mjpg_avi_from_sd_images_to_vector():
+// encodes the frames (and the optional audio) in memory, then stores the bytes
+// at `filename` with write_binary_file_bytes(). Returns 0 on success and -1
+// when the encoder returns an empty buffer or the write fails.
+int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality, const sd_audio_t* audio) {
+    std::vector<uint8_t> avi_data = create_mjpg_avi_from_sd_images_to_vector(images, num_images, fps, quality, audio);
+    if (avi_data.empty()) {
+        return -1;
+    }
+    if (!write_binary_file_bytes(filename, avi_data)) {
+        perror("Error opening file for writing");
+        return -1;
+    }
    return 0;
 }

 #ifdef SD_USE_WEBP
-int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) {
+std::vector<uint8_t> create_animated_webp_from_sd_images_to_vector(sd_image_t* images, int num_images, int fps, int quality) {
    if (num_images == 0) {
        fprintf(stderr, "Error: Image array is empty.\n");
-        return -1;
+        return {};
    }
    if (fps <= 0) {
        fprintf(stderr, "Error: FPS must be positive.\n");
-        return -1;
+        return {};
    }

    const int width    = static_cast<int>(images[0].width);
@ -857,14 +1040,14 @@ int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images
    const int channels = static_cast<int>(images[0].channel);
    if (channels != 1 && channels != 3 && channels != 4) {
        fprintf(stderr, "Error: Unsupported channel count: %d\n", channels);
-        return -1;
+        return {};
    }

    WebPAnimEncoderOptions anim_options;
    WebPConfig config;
    if (!WebPAnimEncoderOptionsInit(&anim_options) || !WebPConfigInit(&config)) {
        fprintf(stderr, "Error: Failed to initialize WebP animation encoder.\n");
-        return -1;
+        return {};
    }

    config.quality      = static_cast<float>(quality);
@ -875,13 +1058,13 @@ int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images
    }
    if (!WebPValidateConfig(&config)) {
        fprintf(stderr, "Error: Invalid WebP encoder configuration.\n");
-        return -1;
+        return {};
    }

    WebPAnimEncoderPtr enc(WebPAnimEncoderNew(width, height, &anim_options));
    if (enc == nullptr) {
        fprintf(stderr, "Error: Could not create WebPAnimEncoder object.\n");
-        return -1;
+        return {};
    }

    const int frame_duration_ms = std::max(1, static_cast<int>(std::lround(1000.0 / static_cast<double>(fps))));
@ -891,13 +1074,13 @@ int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images
        const sd_image_t& image = images[i];
        if (static_cast<int>(image.width) != width || static_cast<int>(image.height) != height) {
            fprintf(stderr, "Error: Frame dimensions do not match.\n");
-            return -1;
+            return {};
        }

        WebPPictureGuard picture;
        if (!picture.initialized) {
            fprintf(stderr, "Error: Failed to initialize WebPPicture.\n");
-            return -1;
+            return {};
        }
        picture.picture.use_argb = 1;
        picture.picture.width    = width;
@ -921,12 +1104,12 @@ int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images

        if (!picture_ok) {
            fprintf(stderr, "Error: Failed to import frame into WebPPicture.\n");
-            return -1;
+            return {};
        }

        if (!WebPAnimEncoderAdd(enc.get(), &picture.picture, timestamp_ms, &config)) {
            fprintf(stderr, "Error: Failed to add frame to animated WebP: %s\n", WebPAnimEncoderGetError(enc.get()));
-            return -1;
+            return {};
        }

        timestamp_ms += frame_duration_ms;
@ -934,52 +1117,50 @@ int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images

    if (!WebPAnimEncoderAdd(enc.get(), nullptr, timestamp_ms, nullptr)) {
        fprintf(stderr, "Error: Failed to finalize animated WebP frames: %s\n", WebPAnimEncoderGetError(enc.get()));
-        return -1;
+        return {};
    }

    WebPDataGuard webp_data;
    if (!WebPAnimEncoderAssemble(enc.get(), &webp_data.data)) {
        fprintf(stderr, "Error: Failed to assemble animated WebP: %s\n", WebPAnimEncoderGetError(enc.get()));
-        return -1;
+        return {};
    }

-    FilePtr f(fopen(filename, "wb"));
-    if (!f) {
+    return std::vector<uint8_t>(webp_data.data.bytes, webp_data.data.bytes + webp_data.data.size);
+}
+
+// File-writing front end for create_animated_webp_from_sd_images_to_vector():
+// encodes the frames in memory, then stores the bytes at `filename` with
+// write_binary_file_bytes(). Returns 0 on success and -1 when the encoder
+// returns an empty buffer or the write fails.
+int create_animated_webp_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) {
+    std::vector<uint8_t> webp_data = create_animated_webp_from_sd_images_to_vector(images, num_images, fps, quality);
+    if (webp_data.empty()) {
+        return -1;
+    }
+    if (!write_binary_file_bytes(filename, webp_data)) {
        perror("Error opening file for writing");
        return -1;
    }
-    if (webp_data.data.size > 0 && fwrite(webp_data.data.bytes, 1, webp_data.data.size, f.get()) != webp_data.data.size) {
-        fprintf(stderr, "Error: Failed to write animated WebP file.\n");
-        return -1;
-    }
-
    return 0;
 }
 #endif

 #ifdef SD_USE_WEBM
-int create_webm_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) {
+std::vector<uint8_t> create_webm_from_sd_images_to_vector(sd_image_t* images, int num_images, int fps, int quality, const sd_audio_t* audio) {
    if (num_images == 0) {
        fprintf(stderr, "Error: Image array is empty.\n");
-        return -1;
+        return {};
    }
    if (fps <= 0) {
        fprintf(stderr, "Error: FPS must be positive.\n");
-        return -1;
+        return {};
    }

    const int width  = static_cast<int>(images[0].width);
    const int height = static_cast<int>(images[0].height);
    if (width <= 0 || height <= 0) {
        fprintf(stderr, "Error: Invalid frame dimensions.\n");
-        return -1;
+        return {};
    }

-    mkvmuxer::MkvWriter writer;
-    if (!writer.Open(filename)) {
-        fprintf(stderr, "Error: Could not open WebM file for writing.\n");
-        return -1;
-    }
+    MemoryMkvWriter writer;

    const int ret = [&]() -> int {
        mkvmuxer::Segment segment;
@ -1007,6 +1188,25 @@ int create_webm_from_sd_images(const char* filename, sd_image_t* images, int num
            video_track->set_display_height(static_cast<uint64_t>(height));
            video_track->set_frame_rate(static_cast<double>(fps));
        }
+
+        uint64_t audio_track_number    = 0;
+        std::vector<uint8_t> audio_pcm = audio_to_pcm16_bytes(audio);
+        if (audio != nullptr && !audio_pcm.empty()) {
+            audio_track_number = segment.AddAudioTrack(static_cast<int32_t>(audio->sample_rate), static_cast<int32_t>(audio->channels), 0);
+            if (audio_track_number == 0) {
+                fprintf(stderr, "Error: Failed to add audio track.\n");
+                return -1;
+            }
+            auto* audio_track = static_cast<mkvmuxer::AudioTrack*>(segment.GetTrackByNumber(audio_track_number));
+            if (audio_track == nullptr) {
+                fprintf(stderr, "Error: Failed to get audio track.\n");
+                return -1;
+            }
+            audio_track->set_codec_id("A_PCM/INT/LIT");
+            audio_track->set_bit_depth(16);
+            audio_track->set_sample_rate(static_cast<double>(audio->sample_rate));
+            audio_track->set_channels(audio->channels);
+        }
        segment.GetSegmentInfo()->set_writing_app("stable-diffusion.cpp");
        segment.GetSegmentInfo()->set_muxing_app("stable-diffusion.cpp");

@ -1036,6 +1236,23 @@ int create_webm_from_sd_images(const char* filename, sd_image_t* images, int num
                return -1;
            }

+            if (audio_track_number != 0) {
+                auto [audio_begin, audio_end] = audio_sample_range_for_video_frame(audio, i, num_images, fps);
+                const uint64_t frame_samples  = audio_end - audio_begin;
+                if (frame_samples > 0) {
+                    const uint64_t frame_bytes = frame_samples * audio->channels * sizeof(int16_t);
+                    const uint8_t* frame_ptr   = audio_pcm.data() + audio_begin * audio->channels * sizeof(int16_t);
+                    if (!segment.AddFrame(frame_ptr,
+                                          frame_bytes,
+                                          audio_track_number,
+                                          timestamp_ns,
+                                          true)) {
+                        fprintf(stderr, "Error: Failed to mux audio chunk %d into WebM.\n", i);
+                        return -1;
+                    }
+                }
+            }
+
            timestamp_ns += frame_duration_ns;
        }

@ -1045,30 +1262,115 @@ int create_webm_from_sd_images(const char* filename, sd_image_t* images, int num
        }
        return 0;
    }();
-    writer.Close();
-    return ret;
+    if (ret != 0) {
+        return {};
+    }
+    return writer.data();
+}
+
+// File-writing front end for create_webm_from_sd_images_to_vector(): muxes the
+// frames (and the optional audio) in memory, then stores the bytes at
+// `filename` with write_binary_file_bytes(). Returns 0 on success and -1 when
+// the muxer returns an empty buffer or the write fails.
+int create_webm_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality, const sd_audio_t* audio) {
+    std::vector<uint8_t> webm_data = create_webm_from_sd_images_to_vector(images, num_images, fps, quality, audio);
+    if (webm_data.empty()) {
+        return -1;
+    }
+    if (!write_binary_file_bytes(filename, webm_data)) {
+        perror("Error opening file for writing");
+        return -1;
+    }
+    return 0;
 }
 #endif

-int create_video_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality) {
-    std::string path = filename ? filename : "";
-    auto pos         = path.find_last_of('.');
-    std::string ext  = pos == std::string::npos ? "" : path.substr(pos);
-    for (char& ch : ext) {
-        ch = static_cast<char>(tolower(static_cast<unsigned char>(ch)));
+// Encodes `images` into an in-memory video container chosen by
+// `output_format`. The format name is matched case-insensitively and may carry
+// one leading '.', so both "webm" and ".WEBM" are accepted:
+//   "webm" -> WebM (only when built with SD_USE_WEBM)
+//   "webp" -> animated WebP (only when built with SD_USE_WEBP; `audio` is not
+//             passed to this encoder)
+//   anything else, including an empty string -> MJPG AVI
+// Returns the encoded bytes as produced by the selected encoder; the encoders
+// return an empty vector on failure.
+std::vector<uint8_t> create_video_from_sd_images_to_vector(const std::string& output_format,
+                                                           sd_image_t* images,
+                                                           int num_images,
+                                                           int fps,
+                                                           int quality,
+                                                           const sd_audio_t* audio) {
+    // Normalize on a copy: lower-case it and drop a single leading dot.
+    std::string format = output_format;
+    std::transform(format.begin(), format.end(), format.begin(),
+                   [](unsigned char c) { return static_cast<char>(tolower(c)); });
+    if (!format.empty() && format[0] == '.') {
+        format.erase(format.begin());
    }

 #ifdef SD_USE_WEBM
-    if (ext == ".webm") {
-        return create_webm_from_sd_images(filename, images, num_images, fps, quality);
+    if (format == "webm") {
+        return create_webm_from_sd_images_to_vector(images, num_images, fps, quality, audio);
    }
 #endif

 #ifdef SD_USE_WEBP
-    if (ext == ".webp") {
-        return create_animated_webp_from_sd_images(filename, images, num_images, fps, quality);
+    if (format == "webp") {
+        return create_animated_webp_from_sd_images_to_vector(images, num_images, fps, quality);
    }
 #endif

-    return create_mjpg_avi_from_sd_images(filename, images, num_images, fps, quality);
+    // Fallback for every other format name.
+    return create_mjpg_avi_from_sd_images_to_vector(images, num_images, fps, quality, audio);
+}
+
+// Encodes `images` (and the optional `audio`) into the container selected by
+// the extension of `filename` and writes the result to that file. The
+// extension is everything from the last '.' of the path; see
+// create_video_from_sd_images_to_vector() for how it is mapped to a format.
+// Returns 0 on success and -1 when `filename` is null or empty, when encoding
+// produces no data, or when the file cannot be written.
+int create_video_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality, const sd_audio_t* audio) {
+    // Reject a missing name up front: a null pointer must not reach
+    // write_binary_file_bytes(), and there is no point encoding a video that
+    // cannot be stored anywhere.
+    if (filename == nullptr || filename[0] == '\0') {
+        fprintf(stderr, "Error: Output filename is empty.\n");
+        return -1;
+    }
+
+    const std::string path = filename;
+    const auto pos         = path.find_last_of('.');
+    const std::string ext  = pos == std::string::npos ? "" : path.substr(pos);
+
+    std::vector<uint8_t> video_data = create_video_from_sd_images_to_vector(ext, images, num_images, fps, quality, audio);
+    if (video_data.empty()) {
+        return -1;
+    }
+    if (!write_binary_file_bytes(filename, video_data)) {
+        perror("Error opening file for writing");
+        return -1;
+    }
+    return 0;
+}
+
+// Writes interleaved float samples to `path` as a 16-bit PCM RIFF/WAVE file.
+//
+//   interleaved_samples  sample_count * channels floats, clamped to [-1, 1];
+//                        NaN samples are written as silence
+//   sample_count         number of samples per channel
+//   channels             number of interleaved channels
+//   sample_rate          samples per second per channel
+//
+// Returns false when the sample pointer is null, when any of the counts or the
+// rate is zero, when the audio does not fit the 16/32-bit fields of the WAV
+// header, or when the file cannot be opened or written. All header fields and
+// samples are stored little-endian regardless of the host byte order.
+bool write_wav_to_file(const std::string& path,
+                       const float* interleaved_samples,
+                       uint64_t sample_count,
+                       uint32_t channels,
+                       uint32_t sample_rate) {
+    if (interleaved_samples == nullptr || sample_count == 0 || channels == 0 || sample_rate == 0) {
+        return false;
+    }
+
+    constexpr uint32_t bits_per_sample  = 16;
+    constexpr uint32_t bytes_per_sample = bits_per_sample / 8;
+    constexpr uint32_t header_size      = 36;  // bytes between the RIFF size field and the PCM payload
+
+    // The fmt chunk stores the channel count and block alignment in 16 bits
+    // and the byte rate and data size in 32 bits. Refuse audio that does not
+    // fit instead of silently truncating it into a corrupt header.
+    if (channels > 0xFFFFu / bytes_per_sample) {
+        return false;
+    }
+    const uint32_t block_align = channels * bytes_per_sample;
+    if (sample_rate > 0xFFFFFFFFu / block_align) {
+        return false;
+    }
+    if (sample_count > (0xFFFFFFFFu - header_size) / block_align) {
+        return false;
+    }
+    const uint32_t byte_rate = sample_rate * block_align;
+    const uint32_t data_size = static_cast<uint32_t>(sample_count * block_align);
+    const uint32_t riff_size = header_size + data_size;
+
+    // Assemble the file in memory with explicit byte order, then write it in
+    // one call.
+    std::vector<uint8_t> wav;
+    wav.reserve(static_cast<size_t>(data_size) + header_size + 8);
+    const auto put_tag = [&wav](const char* tag) {
+        wav.insert(wav.end(), tag, tag + 4);
+    };
+    const auto put_u16 = [&wav](uint16_t value) {
+        wav.push_back(static_cast<uint8_t>(value & 0xFF));
+        wav.push_back(static_cast<uint8_t>(value >> 8));
+    };
+    const auto put_u32 = [&wav](uint32_t value) {
+        for (unsigned shift = 0; shift < 32; shift += 8) {
+            wav.push_back(static_cast<uint8_t>((value >> shift) & 0xFF));
+        }
+    };
+
+    put_tag("RIFF");
+    put_u32(riff_size);
+    put_tag("WAVE");
+
+    put_tag("fmt ");
+    put_u32(16);  // fmt chunk size
+    put_u16(1);   // audio format 1 = PCM
+    put_u16(static_cast<uint16_t>(channels));
+    put_u32(sample_rate);
+    put_u32(byte_rate);
+    put_u16(static_cast<uint16_t>(block_align));
+    put_u16(static_cast<uint16_t>(bits_per_sample));
+
+    put_tag("data");
+    put_u32(data_size);
+
+    const size_t total_samples = static_cast<size_t>(sample_count) * channels;
+    for (size_t i = 0; i < total_samples; ++i) {
+        const float raw    = interleaved_samples[i];
+        const float sample = std::isnan(raw) ? 0.0f : std::max(-1.0f, std::min(1.0f, raw));
+        put_u16(static_cast<uint16_t>(static_cast<int16_t>(std::lrint(sample * 32767.0f))));
+    }
+
+    std::ofstream file(path, std::ios::binary);
+    if (!file.is_open()) {
+        return false;
+    }
+    file.write(reinterpret_cast<const char*>(wav.data()), static_cast<std::streamsize>(wav.size()));
+    // Flush before checking the state so a failed write-back is reported.
+    file.flush();
+    return file.good();
 }
--- a/examples/common/media_io.h
+++ b/examples/common/media_io.h
@ -57,7 +57,13 @@ int create_mjpg_avi_from_sd_images(const char* filename,
                                   sd_image_t* images,
                                   int num_images,
                                   int fps,
-                                   int quality = 90);
+                                   int quality             = 90,
+                                   const sd_audio_t* audio = nullptr);
+// Encodes `images` as an MJPG AVI held in memory. `quality` is clamped to
+// [1, 90] before JPEG encoding; when `audio` carries samples it is added as a
+// 16-bit PCM stream. Returns an empty vector on failure.
+std::vector<uint8_t> create_mjpg_avi_from_sd_images_to_vector(sd_image_t* images,
+                                                              int num_images,
+                                                              int fps,
+                                                              int quality             = 90,
+                                                              const sd_audio_t* audio = nullptr);

 #ifdef SD_USE_WEBP
 int create_animated_webp_from_sd_images(const char* filename,
@ -65,6 +71,10 @@ int create_animated_webp_from_sd_images(const char* filename,
                                        int num_images,
                                        int fps,
                                        int quality = 90);
+// Encodes `images` as an animated WebP held in memory. `fps` must be positive.
+// Returns an empty vector on failure.
+std::vector<uint8_t> create_animated_webp_from_sd_images_to_vector(sd_image_t* images,
+                                                                   int num_images,
+                                                                   int fps,
+                                                                   int quality = 90);
 #endif

 #ifdef SD_USE_WEBM
@ -72,13 +82,32 @@ int create_webm_from_sd_images(const char* filename,
                               sd_image_t* images,
                               int num_images,
                               int fps,
-                               int quality = 90);
+                               int quality             = 90,
+                               const sd_audio_t* audio = nullptr);
+// Muxes `images` into a WebM stream held in memory. `fps` must be positive.
+// When `audio` carries samples it is added as a 16-bit PCM track with codec id
+// "A_PCM/INT/LIT". Returns an empty vector on failure.
+std::vector<uint8_t> create_webm_from_sd_images_to_vector(sd_image_t* images,
+                                                          int num_images,
+                                                          int fps,
+                                                          int quality             = 90,
+                                                          const sd_audio_t* audio = nullptr);
 #endif

 int create_video_from_sd_images(const char* filename,
                                sd_image_t* images,
                                int num_images,
                                int fps,
-                                int quality = 90);
+                                int quality             = 90,
+                                const sd_audio_t* audio = nullptr);
+// Encodes `images` into the in-memory container named by `output_format`
+// (case-insensitive, optional leading '.'): "webm" and "webp" when the build
+// enables SD_USE_WEBM / SD_USE_WEBP, MJPG AVI for any other value. `audio` is
+// forwarded to the WebM and AVI encoders only. Returns an empty vector on
+// failure.
+std::vector<uint8_t> create_video_from_sd_images_to_vector(const std::string& output_format,
+                                                           sd_image_t* images,
+                                                           int num_images,
+                                                           int fps,
+                                                           int quality             = 90,
+                                                           const sd_audio_t* audio = nullptr);
+
+// Writes `sample_count * channels` interleaved float samples to `path` as a
+// 16-bit PCM WAV file; samples are clamped to [-1, 1]. `sample_count` is the
+// number of samples per channel. Returns false when the sample pointer is
+// null, when any count or the rate is zero, or when the file cannot be opened
+// or written.
+bool write_wav_to_file(const std::string& path,
+                       const float* interleaved_samples,
+                       uint64_t sample_count,
+                       uint32_t channels,
+                       uint32_t sample_rate);

 #endif  // __MEDIA_IO_H__
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@ -74,6 +74,9 @@ add_executable(${TARGET}
    routes_sdapi.cpp
    routes_sdcpp.cpp
 )
+if(APPLE)
+    sd_set_macos_rpaths(${TARGET})
+endif()

 if(HAVE_FRONTEND_BUILD)
    add_dependencies(${TARGET} ${TARGET}_frontend)
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -117,142 +117,10 @@ In this case, the server will load and serve the specified `index.html` file ins
 * using a custom UI
 * avoiding rebuilding the binary after frontend modifications

-# Run
+# Usage

-```
-usage: ./bin/sd-server  [options]
-
-Svr Options:
-  -l, --listen-ip <string>      server listen ip (default: 127.0.0.1)        
-  --serve-html-path <string>    path to HTML file to serve at root (optional)
-  --listen-port <int>           server listen port (default: 1234)
-  -v, --verbose                 print extra info
-  --color                       colors the logging tags according to level   
-  -h, --help                    show this help message and exit
-
-Context Options:
-  -m, --model <string>                     path to full model
-  --clip_l <string>                        path to the clip-l text encoder
-  --clip_g <string>                        path to the clip-g text encoder
-  --clip_vision <string>                   path to the clip-vision encoder
-  --t5xxl <string>                         path to the t5xxl text encoder
-  --llm <string>                           path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)
-  --llm_vision <string>                    path to the llm vit
-  --qwen2vl <string>                       alias of --llm. Deprecated.
-  --qwen2vl_vision <string>                alias of --llm_vision. Deprecated.
-  --diffusion-model <string>               path to the standalone diffusion model
-  --high-noise-diffusion-model <string>    path to the standalone high noise diffusion model
-  --vae <string>                           path to standalone vae model
-  --taesd <string>                         path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
-  --tae <string>                           alias of --taesd
-  --control-net <string>                   path to control net model
-  --embd-dir <string>                      embeddings directory
-  --lora-model-dir <string>                lora model directory
-  --tensor-type-rules <string>             weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
-  --photo-maker <string>                   path to PHOTOMAKER model
-  --upscale-model <string>                 path to esrgan model.
-  -t, --threads <int>                      number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
-                                           CPU physical cores
-  --chroma-t5-mask-pad <int>               t5 mask pad size of chroma
-  --vae-tile-overlap <float>               tile overlap for vae tiling, in fraction of tile size (default: 0.5)
-  --vae-tiling                             process vae in tiles to reduce memory usage
-  --force-sdxl-vae-conv-scale              force use of conv scale on sdxl vae
-  --offload-to-cpu                         place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
-  --mmap                                   whether to memory-map model
-  --control-net-cpu                        keep controlnet in cpu (for low vram)
-  --clip-on-cpu                            keep clip in cpu (for low vram)
-  --vae-on-cpu                             keep vae in cpu (for low vram)
-  --fa                                     use flash attention
-  --diffusion-fa                           use flash attention in the diffusion model only
-  --diffusion-conv-direct                  use ggml_conv2d_direct in the diffusion model
-  --vae-conv-direct                        use ggml_conv2d_direct in the vae model
-  --circular                               enable circular padding for convolutions
-  --circularx                              enable circular RoPE wrapping on x-axis (width) only
-  --circulary                              enable circular RoPE wrapping on y-axis (height) only
-  --chroma-disable-dit-mask                disable dit mask for chroma
-  --qwen-image-zero-cond-t                 enable zero_cond_t for qwen image
-  --chroma-enable-t5-mask                  enable t5 mask for chroma
-  --type                                   weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
-                                           type of the weight file
-  --rng                                    RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
-  --sampler-rng                            sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
-  --prediction                             prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]
-  --lora-apply-mode                        the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights
-                                           contain any quantized parameters, the at_runtime mode will be used; otherwise,
-                                           immediately will be used.The immediately mode may have precision and
-                                           compatibility issues with quantized parameters, but it usually offers faster inference
-                                           speed and, in some cases, lower memory usage. The at_runtime mode, on the
-                                           other hand, is exactly the opposite.
-  --vae-tile-size                          tile size for vae tiling, format [X]x[Y] (default: 32x32)
-  --vae-relative-tile-size                 relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
-                                           (overrides --vae-tile-size)
-
-Default Generation Options:
-  -p, --prompt <string>                    the prompt to render
-  -n, --negative-prompt <string>           the negative prompt (default: "")
-  -i, --init-img <string>                  path to the init image
-  --end-img <string>                       path to the end image, required by flf2v
-  --mask <string>                          path to the mask image
-  --control-image <string>                 path to control image, control net
-  --control-video <string>                 path to control video frames, It must be a directory path. The video frames inside should be stored as images in
-                                           lexicographical (character) order. For example, if the control video path is
-                                           `frames`, the directory contain images such as 00.png, 01.png, ... etc.
-  --pm-id-images-dir <string>              path to PHOTOMAKER input id images dir
-  --pm-id-embed-path <string>              path to PHOTOMAKER v2 id embed
-  -H, --height <int>                       image height, in pixel space (default: 512)
-  -W, --width <int>                        image width, in pixel space (default: 512)
-  --steps <int>                            number of sample steps (default: 20)
-  --high-noise-steps <int>                 (high noise) number of sample steps (default: -1 = auto)
-  --clip-skip <int>                        ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified,
-                                           will be 1 for SD1.x, 2 for SD2.x
-  -b, --batch-count <int>                  batch count
-  --video-frames <int>                     video frames (default: 1)
-  --fps <int>                              fps (default: 24)
-  --timestep-shift <int>                   shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for
-                                           NitroSD-Vibrant
-  --upscale-repeats <int>                  Run the ESRGAN upscaler this many times (default: 1)
-  --upscale-tile-size <int>                tile size for ESRGAN upscaling (default: 128)
-  --cfg-scale <float>                      unconditional guidance scale: (default: 7.0)
-  --img-cfg-scale <float>                  image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
-  --guidance <float>                       distilled guidance scale for models with guidance input (default: 3.5)
-  --slg-scale <float>                      skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5
-                                           medium
-  --skip-layer-start <float>               SLG enabling point (default: 0.01)
-  --skip-layer-end <float>                 SLG disabling point (default: 0.2)
-  --eta <float>                            noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
-  --flow-shift <float>                     shift value for Flow models like SD3.x or WAN (default: auto)
-  --high-noise-cfg-scale <float>           (high noise) unconditional guidance scale: (default: 7.0)
-  --high-noise-img-cfg-scale <float>       (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
-  --high-noise-guidance <float>            (high noise) distilled guidance scale for models with guidance input (default: 3.5)
-  --high-noise-slg-scale <float>           (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
-  --high-noise-skip-layer-start <float>    (high noise) SLG enabling point (default: 0.01)
-  --high-noise-skip-layer-end <float>      (high noise) SLG disabling point (default: 0.2)
-  --high-noise-eta <float>                 (high noise) noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
-  --strength <float>                       strength for noising/unnoising (default: 0.75)
-  --pm-style-strength <float>
-  --control-strength <float>               strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image
-  --moe-boundary <float>                   timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1
-  --vace-strength <float>                  wan vace strength
-  --increase-ref-index                     automatically increase the indices of references images based on the order they are listed (starting with 1).
-  --disable-auto-resize-ref-image          disable auto resize of ref images
-  --disable-image-metadata                 do not embed generation metadata on image files
-  -s, --seed                               RNG seed (default: 42, use random seed for < 0)
-  --sampling-method                        sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
-                                           tcd, res_multistep, res_2s, er_sde] (default: euler for Flux/SD3/Wan, euler_a
-                                           otherwise)
-  --high-noise-sampling-method             (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
-                                           ddim_trailing, tcd, res_multistep, res_2s, er_sde] default: euler for Flux/SD3/Wan,
-                                           euler_a otherwise
-  --scheduler                              denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple,
-                                           kl_optimal, lcm, bong_tangent], default: discrete
-  --sigmas                                 custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0").
-  --skip-layers                            layers to skip for SLG steps (default: [7,8,9])
-  --high-noise-skip-layers                 (high noise) layers to skip for SLG steps (default: [7,8,9])
-  -r, --ref-image                          reference image for Flux Kontext models (can be used multiple times)
-  --cache-mode                             caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level), 'spectrum' (UNET/DiT Chebyshev+Taylor forecasting)
-  --cache-option                           named cache params (key=value format, comma-separated). easycache/ucache:
-                                           threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples:
-                                           "threshold=0.25" or "threshold=1.5,reset=0"
-  --scm-mask                               SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
-  --scm-policy                             SCM policy: 'dynamic' (default) or 'static'
+For detailed command-line arguments, run:
+
+```bash
+./bin/sd-server -h
 ```
--- a/examples/server/api.md
+++ b/examples/server/api.md
@ -9,7 +9,7 @@ The server currently exposes three API families:
 - `sdcpp API` under `/sdcpp/v1/...`

 The `sdcpp API` is the native API surface.
-Its request schema is also the canonical schema for `sd_cpp_extra_args`.
+Its request schema is the same schema used by `sd_cpp_extra_args`.

 Global LoRA rule:

@ -38,6 +38,8 @@ Current generation-related endpoints include:
 - `POST /sdapi/v1/txt2img`
 - `POST /sdapi/v1/img2img`
 - `GET /sdapi/v1/loras`
+- `GET /sdapi/v1/upscalers`
+- `GET /sdapi/v1/latent-upscale-modes`
 - `GET /sdapi/v1/samplers`
 - `GET /sdapi/v1/schedulers`
 - `GET /sdapi/v1/sd-models`
@ -55,8 +57,6 @@ Current endpoints include:
 - `POST /sdcpp/v1/jobs/{id}/cancel`
 - `POST /sdcpp/v1/vid_gen`

-`POST /sdcpp/v1/vid_gen` is currently exposed but returns `501 Not Implemented`.
-
 ## `sd_cpp_extra_args`

 `sd_cpp_extra_args` is an extension mechanism for the compatibility APIs.
@ -79,12 +79,12 @@ Behavior:
 - The JSON block is parsed using the same field rules as the `sdcpp API`.
 - The block is removed from the final prompt before generation.

-Intended use:
+Supported use:

 - extend `OpenAI API` requests with native `stable-diffusion.cpp` controls
 - extend `sdapi` requests with native `stable-diffusion.cpp` controls

-Not intended use:
+Unsupported use:

 - do not use `sd_cpp_extra_args` with `/sdcpp/v1/*`

@ -218,6 +218,13 @@ Currently supported request fields:
 | `scheduler` | `string` | Scheduler name |
 | `lora` | `array<object>` | Structured LoRA list |
 | `extra_images` | `array<string>` | Base64 or data URL images |
+| `enable_hr` | `boolean` | Enable highres fix for `txt2img` |
+| `hr_upscaler` | `string` | `Lanczos`, `Nearest`, a latent mode such as `Latent (nearest-exact)`, or an upscaler model name from `/sdapi/v1/upscalers` |
+| `hr_scale` | `number` | Highres scale when resize target is not set |
+| `hr_resize_x` | `integer` | Highres target width; `0` derives the width from `hr_scale` |
+| `hr_resize_y` | `integer` | Highres target height; `0` derives the height from `hr_scale` |
+| `hr_steps` | `integer` | Highres second-pass sample steps, `0` to reuse `steps` |
+| `denoising_strength` | `number` | Highres denoising strength for `txt2img` |

 Native extension fields:

@ -243,6 +250,8 @@ Currently supported request fields:
 | `inpainting_mask_invert` | `integer` or `boolean` | Treated as invert flag |
 | `denoising_strength` | `number` | Clamped to `0.0..1.0` |

+Highres fix fields are currently handled for `txt2img`; `img2img` uses `denoising_strength` as image-to-image strength.
+
 Native extension fields:

 - any `sdcpp API` fields embedded through `sd_cpp_extra_args` inside `prompt`
@ -260,6 +269,8 @@ Response fields:
 Currently exposed:

 - `GET /sdapi/v1/loras`
+- `GET /sdapi/v1/upscalers`
+- `GET /sdapi/v1/latent-upscale-modes`
 - `GET /sdapi/v1/samplers`
 - `GET /sdapi/v1/schedulers`
 - `GET /sdapi/v1/sd-models`
@ -274,6 +285,26 @@ Response fields:
 | `[].name` | `string` | Display name derived from file stem |
 | `[].path` | `string` | Relative path under the configured LoRA directory |

+`GET /sdapi/v1/upscalers`
+
+| Field | Type | Notes |
+| --- | --- | --- |
+| `[].name` | `string` | Built-in name or model stem |
+| `[].model_name` | `string \| null` | Model family label for model-backed upscalers |
+| `[].model_path` | `string \| null` | Absolute model path for model-backed upscalers |
+| `[].model_url` | `string \| null` | Currently always null |
+| `[].scale` | `integer` | Currently `4` |
+
+Built-in entries include `None`, `Lanczos`, and `Nearest`. Model-backed entries are scanned from the top level of `--hires-upscalers-dir`; subdirectories are not scanned.
+
+`GET /sdapi/v1/latent-upscale-modes`
+
+| Field | Type | Notes |
+| --- | --- | --- |
+| `[].name` | `string` | WebUI-compatible latent upscale mode name |
+
+Built-in latent modes include `Latent`, `Latent (nearest)`, `Latent (nearest-exact)`, `Latent (antialiased)`, `Latent (bicubic)`, and `Latent (bicubic antialiased)`.
+
 `GET /sdapi/v1/samplers`

 | Field | Type | Notes |
@ -372,20 +403,26 @@ Field types:

 Returns frontend-friendly capability metadata.

-Typical contents:
+The mode-aware fields are the primary interface. The top-level compatibility fields are deprecated mirrors kept for older clients.

-| Field | Type |
-| --- | --- |
-| `model` | `object` |
-| `defaults` | `object` |
-| `loras` | `array<object>` |
-| `samplers` | `array<string>` |
-| `schedulers` | `array<string>` |
-| `output_formats` | `array<string>` |
-| `limits` | `object` |
-| `features` | `object` |
+Top-level fields:

-Nested fields currently returned:
+| Field | Type | Notes |
+| --- | --- | --- |
+| `model` | `object` | Loaded model metadata |
+| `current_mode` | `string` | The native generation mode mirrored by top-level compatibility fields |
+| `supported_modes` | `array<string>` | Supported native modes such as `img_gen` or `vid_gen` |
+| `defaults` | `object` | Deprecated compatibility mirror of `defaults_by_mode[current_mode]` |
+| `output_formats` | `array<string>` | Deprecated compatibility mirror of `output_formats_by_mode[current_mode]` |
+| `features` | `object` | Deprecated compatibility mirror of `features_by_mode[current_mode]` |
+| `defaults_by_mode` | `object` | Explicit defaults for each supported mode |
+| `output_formats_by_mode` | `object` | Explicit output formats for each supported mode |
+| `features_by_mode` | `object` | Explicit feature flags for each supported mode |
+| `samplers` | `array<string>` | Available sampling methods |
+| `schedulers` | `array<string>` | Available schedulers |
+| `loras` | `array<object>` | Available LoRA entries |
+| `upscalers` | `array<object>` | Available highres upscalers (built-in and model-backed) |
+| `limits` | `object` | Shared queue and size limits |

 `model`

@ -395,50 +432,24 @@ Nested fields currently returned:
 | `model.stem` | `string` |
 | `model.path` | `string` |

-`defaults`
+Compatibility rules:
+
+- `defaults`, `output_formats`, and `features` are deprecated compatibility mirrors
+- those three top-level fields always mirror `current_mode`
+- `supported_modes`, `defaults_by_mode`, `output_formats_by_mode`, and `features_by_mode` are the mode-aware fields
+
+Mode-aware objects:

 | Field | Type |
 | --- | --- |
-| `defaults.prompt` | `string` |
-| `defaults.negative_prompt` | `string` |
-| `defaults.clip_skip` | `integer` |
-| `defaults.width` | `integer` |
-| `defaults.height` | `integer` |
-| `defaults.strength` | `number` |
-| `defaults.seed` | `integer` |
-| `defaults.batch_count` | `integer` |
-| `defaults.auto_resize_ref_image` | `boolean` |
-| `defaults.increase_ref_index` | `boolean` |
-| `defaults.control_strength` | `number` |
-| `defaults.sample_params` | `object` |
-| `defaults.sample_params.scheduler` | `string` |
-| `defaults.sample_params.sample_method` | `string` |
-| `defaults.sample_params.sample_steps` | `integer` |
-| `defaults.sample_params.eta` | `number \| null` |
-| `defaults.sample_params.shifted_timestep` | `integer` |
-| `defaults.sample_params.flow_shift` | `number \| null` |
-| `defaults.sample_params.guidance` | `object` |
-| `defaults.sample_params.guidance.txt_cfg` | `number` |
-| `defaults.sample_params.guidance.img_cfg` | `number \| null` |
-| `defaults.sample_params.guidance.distilled_guidance` | `number` |
-| `defaults.sample_params.guidance.slg` | `object` |
-| `defaults.sample_params.guidance.slg.layers` | `array<integer>` |
-| `defaults.sample_params.guidance.slg.layer_start` | `number` |
-| `defaults.sample_params.guidance.slg.layer_end` | `number` |
-| `defaults.sample_params.guidance.slg.scale` | `number` |
-| `defaults.vae_tiling_params` | `object` |
-| `defaults.vae_tiling_params.enabled` | `boolean` |
-| `defaults.vae_tiling_params.tile_size_x` | `integer` |
-| `defaults.vae_tiling_params.tile_size_y` | `integer` |
-| `defaults.vae_tiling_params.target_overlap` | `number` |
-| `defaults.vae_tiling_params.rel_size_x` | `number` |
-| `defaults.vae_tiling_params.rel_size_y` | `number` |
-| `defaults.cache_mode` | `string` |
-| `defaults.cache_option` | `string` |
-| `defaults.scm_mask` | `string` |
-| `defaults.scm_policy_dynamic` | `boolean` |
-| `defaults.output_format` | `string` |
-| `defaults.output_compression` | `integer` |
+| `defaults_by_mode.img_gen` | `object` |
+| `defaults_by_mode.vid_gen` | `object` |
+| `output_formats_by_mode.img_gen` | `array<string>` |
+| `output_formats_by_mode.vid_gen` | `array<string>` |
+| `features_by_mode.img_gen` | `object` |
+| `features_by_mode.vid_gen` | `object` |
+
+Shared nested fields:

 `loras`

@ -447,6 +458,14 @@ Nested fields currently returned:
 | `loras[].name` | `string` |
 | `loras[].path` | `string` |

+`upscalers`
+
+| Field | Type | Notes |
+| --- | --- | --- |
+| `upscalers[].name` | `string` | Built-in name or model stem; use this value in `hires.upscaler` |
+
+Built-in entries include `None`, `Lanczos`, `Nearest`, `Latent`, `Latent (nearest)`, `Latent (nearest-exact)`, `Latent (antialiased)`, `Latent (bicubic)`, and `Latent (bicubic antialiased)`. Model-backed entries are scanned from the top level of `--hires-upscalers-dir`; subdirectories are not scanned.
+
 `limits`

 | Field | Type |
@ -458,19 +477,115 @@ Nested fields currently returned:
 | `limits.max_batch_count` | `integer` |
 | `limits.max_queue_size` | `integer` |

-`features`
+Shared default fields used by both `img_gen` and `vid_gen`:

 | Field | Type |
 | --- | --- |
-| `features.init_image` | `boolean` |
-| `features.mask_image` | `boolean` |
-| `features.control_image` | `boolean` |
-| `features.ref_images` | `boolean` |
-| `features.lora` | `boolean` |
-| `features.vae_tiling` | `boolean` |
-| `features.cache` | `boolean` |
-| `features.cancel_queued` | `boolean` |
-| `features.cancel_generating` | `boolean` |
+| `prompt` | `string` |
+| `negative_prompt` | `string` |
+| `clip_skip` | `integer` |
+| `width` | `integer` |
+| `height` | `integer` |
+| `strength` | `number` |
+| `seed` | `integer` |
+| `sample_params` | `object` |
+| `sample_params.scheduler` | `string` |
+| `sample_params.sample_method` | `string` |
+| `sample_params.sample_steps` | `integer` |
+| `sample_params.eta` | `number \| null` |
+| `sample_params.shifted_timestep` | `integer` |
+| `sample_params.flow_shift` | `number \| null` |
+| `sample_params.guidance.txt_cfg` | `number` |
+| `sample_params.guidance.img_cfg` | `number \| null` |
+| `sample_params.guidance.distilled_guidance` | `number` |
+| `sample_params.guidance.slg.layers` | `array<integer>` |
+| `sample_params.guidance.slg.layer_start` | `number` |
+| `sample_params.guidance.slg.layer_end` | `number` |
+| `sample_params.guidance.slg.scale` | `number` |
+| `vae_tiling_params` | `object` |
+| `vae_tiling_params.enabled` | `boolean` |
+| `vae_tiling_params.temporal_tiling` | `boolean` |
+| `vae_tiling_params.tile_size_x` | `integer` |
+| `vae_tiling_params.tile_size_y` | `integer` |
+| `vae_tiling_params.target_overlap` | `number` |
+| `vae_tiling_params.rel_size_x` | `number` |
+| `vae_tiling_params.rel_size_y` | `number` |
+| `vae_tiling_params.extra_tiling_args` | `string` |
+| `cache_mode` | `string` |
+| `cache_option` | `string` |
+| `scm_mask` | `string` |
+| `scm_policy_dynamic` | `boolean` |
+| `output_format` | `string` |
+| `output_compression` | `integer` |
+
+`vae_tiling_params.extra_tiling_args` accepts a key=value list. For LTX video VAE temporal tiling, `temporal_tile_frames` defaults to `4` and `temporal_tile_overlap` defaults to `1`.
+
+`img_gen`-specific default fields:
+
+| Field | Type |
+| --- | --- |
+| `batch_count` | `integer` |
+| `auto_resize_ref_image` | `boolean` |
+| `increase_ref_index` | `boolean` |
+| `control_strength` | `number` |
+| `hires` | `object` |
+| `hires.enabled` | `boolean` |
+| `hires.upscaler` | `string` |
+| `hires.scale` | `number` |
+| `hires.target_width` | `integer` |
+| `hires.target_height` | `integer` |
+| `hires.steps` | `integer` |
+| `hires.denoising_strength` | `number` |
+| `hires.custom_sigmas` | `array<number>` |
+| `hires.upscale_tile_size` | `integer` |
+
+`vid_gen`-specific default fields:
+
+| Field | Type |
+| --- | --- |
+| `video_frames` | `integer` |
+| `fps` | `integer` |
+| `moe_boundary` | `number` |
+| `vace_strength` | `number` |
+| `high_noise_sample_params` | `object` |
+| `high_noise_sample_params.scheduler` | `string` |
+| `high_noise_sample_params.sample_method` | `string` |
+| `high_noise_sample_params.sample_steps` | `integer` |
+| `high_noise_sample_params.eta` | `number \| null` |
+| `high_noise_sample_params.shifted_timestep` | `integer` |
+| `high_noise_sample_params.flow_shift` | `number \| null` |
+| `high_noise_sample_params.guidance.txt_cfg` | `number` |
+| `high_noise_sample_params.guidance.img_cfg` | `number \| null` |
+| `high_noise_sample_params.guidance.distilled_guidance` | `number` |
+| `high_noise_sample_params.guidance.slg.layers` | `array<integer>` |
+| `high_noise_sample_params.guidance.slg.layer_start` | `number` |
+| `high_noise_sample_params.guidance.slg.layer_end` | `number` |
+| `high_noise_sample_params.guidance.slg.scale` | `number` |
+
+Fields returned in `features_by_mode.img_gen`:
+
+- `init_image`
+- `mask_image`
+- `control_image`
+- `ref_images`
+- `lora`
+- `vae_tiling`
+- `hires`
+- `cache`
+- `cancel_queued`
+- `cancel_generating`
+
+Fields returned in `features_by_mode.vid_gen`:
+
+- `init_image`
+- `end_image`
+- `control_frames`
+- `high_noise_sample_params`
+- `lora`
+- `vae_tiling`
+- `cache`
+- `cancel_queued`
+- `cancel_generating`

 #### `POST /sdcpp/v1/img_gen`

@ -521,9 +636,7 @@ Typical status codes:
 - `409 Conflict`
 - `410 Gone`

-### Canonical Request Schema
-
-The `sdcpp API` request body is the canonical native schema.
+### Request Body

 Example:

@ -569,14 +682,27 @@ Example:
  },

  "lora": [],
+  "hires": {
+    "enabled": false,
+    "upscaler": "Latent",
+    "scale": 2.0,
+    "target_width": 0,
+    "target_height": 0,
+    "steps": 0,
+    "denoising_strength": 0.7,
+    "custom_sigmas": [],
+    "upscale_tile_size": 128
+  },

  "vae_tiling_params": {
    "enabled": false,
+    "temporal_tiling": false,
    "tile_size_x": 0,
    "tile_size_y": 0,
    "target_overlap": 0.5,
    "rel_size_x": 0.0,
-    "rel_size_y": 0.0
+    "rel_size_y": 0.0,
+    "extra_tiling_args": ""
  },

  "cache_mode": "disabled",
@ -612,7 +738,7 @@ Channel expectations:
 If omitted or null:

 - single-image fields map to an empty `sd_image_t`
- array fields map to `nullptr + count = 0`
+- array fields map to an empty C-style array, represented as `pointer = nullptr` and `count = 0`

 ### Field Mapping Summary

@ -673,12 +799,32 @@ Other native fields:

 | Field | Type |
 | --- | --- |
+| `hires` | `object` |
+| `hires.enabled` | `boolean` |
+| `hires.upscaler` | `string` |
+| `hires.scale` | `number` |
+| `hires.target_width` | `integer` |
+| `hires.target_height` | `integer` |
+| `hires.steps` | `integer` |
+| `hires.denoising_strength` | `number` |
+| `hires.custom_sigmas` | `array<number>` |
+| `hires.upscale_tile_size` | `integer` |
 | `vae_tiling_params` | `object` |
+| `vae_tiling_params.enabled` | `boolean` |
+| `vae_tiling_params.temporal_tiling` | `boolean` |
+| `vae_tiling_params.tile_size_x` | `integer` |
+| `vae_tiling_params.tile_size_y` | `integer` |
+| `vae_tiling_params.target_overlap` | `number` |
+| `vae_tiling_params.rel_size_x` | `number` |
+| `vae_tiling_params.rel_size_y` | `number` |
+| `vae_tiling_params.extra_tiling_args` | `string` |
 | `cache_mode` | `string` |
 | `cache_option` | `string` |
 | `scm_mask` | `string` |
 | `scm_policy_dynamic` | `boolean` |

+For `hires.upscaler`, use `Lanczos`, `Nearest`, `Latent`, `Latent (nearest)`, `Latent (nearest-exact)`, `Latent (antialiased)`, `Latent (bicubic)`, `Latent (bicubic antialiased)`, or an `upscalers[].name` value from `GET /sdcpp/v1/capabilities`. Model-backed upscalers are resolved to the file `<name><ext>` located directly inside `--hires-upscalers-dir`; subdirectories are not searched. `hires.custom_sigmas`, when present, overrides the generated second-pass hires sigma schedule; otherwise the hires schedule is trimmed by `hires.denoising_strength`.
+
 HTTP-only output fields:

 | Field | Type |
@ -686,11 +832,11 @@ HTTP-only output fields:
 | `output_format` | `string` |
 | `output_compression` | `integer` |

-### Optional Field Semantics
+### Optional Field Handling

-Clients should preserve unset semantics for optional sampling fields.
+Optional sampling fields may be omitted.

-If a user has not explicitly provided one of these fields, the client should omit it instead of injecting a guessed fallback:
+When omitted, backend defaults apply to these fields:

 - `sample_params.scheduler`
 - `sample_params.sample_method`
@ -766,29 +912,404 @@ Example cancelled job:
 }
 ```

-### Validation and Retention
+### Submission Errors

-Recommended behavior:
+`POST /sdcpp/v1/img_gen` may return:

- malformed JSON returns `400`
- invalid image payloads return `400`
- invalid parameter structure returns `400`
- queue full returns `429` or `503`
- accepted runtime failures transition the job to `failed`
- unsupported in-progress cancellation may return `409`
+- `202 Accepted` when the job is created
+- `400 Bad Request` for an empty body, unsupported model mode, invalid JSON, or invalid generation parameters
+- `429 Too Many Requests` when the job queue is full
+- `500 Internal Server Error` for unexpected server exceptions during submission

-Recommended retention controls:
+### `vid_gen`

- pending job limit
- completed job TTL
- failed job TTL
+The following section documents the native async contract for video generation.

-### Future `vid_gen`
+#### `POST /sdcpp/v1/vid_gen`

-Future `vid_gen` should reuse the same async job model:
+Submits an async video generation job.

- `POST /sdcpp/v1/vid_gen`
- `GET /sdcpp/v1/jobs/{id}`
- `POST /sdcpp/v1/jobs/{id}/cancel`
+Successful submission returns `202 Accepted`.

-Its request body should mirror `sd_vid_gen_params_t` in the same way that `img_gen` mirrors `sd_img_gen_params_t`.
+Example response:
+
+```json
+{
+  "id": "job_01HTXYZVID",
+  "kind": "vid_gen",
+  "status": "queued",
+  "created": 1775401200,
+  "poll_url": "/sdcpp/v1/jobs/job_01HTXYZVID"
+}
+```
+
+Response fields:
+
+| Field | Type |
+| --- | --- |
+| `id` | `string` |
+| `kind` | `string` |
+| `status` | `string` |
+| `created` | `integer` |
+| `poll_url` | `string` |
+
+### Request Body
+
+Compared with `img_gen`, the `vid_gen` request body differs as follows:
+
+- `vid_gen` is a single video sequence job, so `batch_count` is not part of the request schema
+- `ref_images`, `mask_image`, `control_image`, `control_strength`, and `embed_image_metadata` are not part of the request schema
+- `vid_gen` adds `end_image`, `control_frames`, `high_noise_sample_params`, `video_frames`, `fps`, `moe_boundary`, and `vace_strength`
+
+Example:
+
+```json
+{
+  "prompt": "a cat walking through a rainy alley",
+  "negative_prompt": "",
+  "clip_skip": -1,
+  "width": 832,
+  "height": 480,
+  "strength": 0.75,
+  "seed": -1,
+  "video_frames": 33,
+  "fps": 16,
+  "moe_boundary": 0.875,
+  "vace_strength": 1.0,
+
+  "init_image": null,
+  "end_image": null,
+  "control_frames": [],
+
+  "sample_params": {
+    "scheduler": "discrete",
+    "sample_method": "euler",
+    "sample_steps": 28,
+    "eta": 1.0,
+    "shifted_timestep": 0,
+    "custom_sigmas": [],
+    "flow_shift": 0.0,
+    "guidance": {
+      "txt_cfg": 7.0,
+      "img_cfg": 7.0,
+      "distilled_guidance": 3.5,
+      "slg": {
+        "layers": [7, 8, 9],
+        "layer_start": 0.01,
+        "layer_end": 0.2,
+        "scale": 0.0
+      }
+    }
+  },
+
+  "high_noise_sample_params": {
+    "scheduler": "discrete",
+    "sample_method": "euler",
+    "sample_steps": -1,
+    "eta": 1.0,
+    "shifted_timestep": 0,
+    "flow_shift": 0.0,
+    "guidance": {
+      "txt_cfg": 7.0,
+      "img_cfg": 7.0,
+      "distilled_guidance": 3.5,
+      "slg": {
+        "layers": [7, 8, 9],
+        "layer_start": 0.01,
+        "layer_end": 0.2,
+        "scale": 0.0
+      }
+    }
+  },
+
+  "lora": [],
+
+  "vae_tiling_params": {
+    "enabled": false,
+    "temporal_tiling": false,
+    "tile_size_x": 0,
+    "tile_size_y": 0,
+    "target_overlap": 0.5,
+    "rel_size_x": 0.0,
+    "rel_size_y": 0.0,
+    "extra_tiling_args": ""
+  },
+
+  "cache_mode": "disabled",
+  "cache_option": "",
+  "scm_mask": "",
+  "scm_policy_dynamic": true,
+
+  "output_format": "webm",
+  "output_compression": 100
+}
+```
+
+### LoRA Rules
+
+- The server only accepts explicit LoRA entries from the `lora` field.
+- Prompt-embedded `<lora:...>` tags are intentionally unsupported.
+- `lora[].is_high_noise` controls whether a LoRA applies only to the high-noise stage.
+
+### Image and Frame Encoding Rules
+
+Any image field accepts:
+
+- a raw base64 string, or
+- a data URL such as `data:image/png;base64,...`
+
+Channel expectations:
+
+- `init_image`: 3 channels
+- `end_image`: 3 channels
+- `control_frames[]`: 3 channels
+
+Frame ordering rules:
+
+- `control_frames[]` order is the conditioning frame order
+- `control_frames[]` is preserved in request order
+
+If omitted or null:
+
+- single-image fields map to an empty `sd_image_t`
+- array fields map to an empty C-style array, represented as `pointer = nullptr` and `count = 0`
+
+### Field Mapping Summary
+
+Top-level scalar fields:
+
+| Field | Type |
+| --- | --- |
+| `prompt` | `string` |
+| `negative_prompt` | `string` |
+| `clip_skip` | `integer` |
+| `width` | `integer` |
+| `height` | `integer` |
+| `strength` | `number` |
+| `seed` | `integer` |
+| `video_frames` | `integer` |
+| `fps` | `integer` |
+| `moe_boundary` | `number` |
+| `vace_strength` | `number` |
+
+Image and frame fields:
+
+| Field | Type |
+| --- | --- |
+| `init_image` | `string \| null` |
+| `end_image` | `string \| null` |
+| `control_frames` | `array<string>` |
+
+LoRA fields:
+
+| Field | Type |
+| --- | --- |
+| `lora[].path` | `string` |
+| `lora[].multiplier` | `number` |
+| `lora[].is_high_noise` | `boolean` |
+
+Sampling fields:
+
+| Field | Type |
+| --- | --- |
+| `sample_params.scheduler` | `string` |
+| `sample_params.sample_method` | `string` |
+| `sample_params.sample_steps` | `integer` |
+| `sample_params.eta` | `number` |
+| `sample_params.shifted_timestep` | `integer` |
+| `sample_params.custom_sigmas` | `array<number>` |
+| `sample_params.flow_shift` | `number` |
+| `sample_params.guidance.txt_cfg` | `number` |
+| `sample_params.guidance.img_cfg` | `number` |
+| `sample_params.guidance.distilled_guidance` | `number` |
+| `sample_params.guidance.slg.layers` | `array<integer>` |
+| `sample_params.guidance.slg.layer_start` | `number` |
+| `sample_params.guidance.slg.layer_end` | `number` |
+| `sample_params.guidance.slg.scale` | `number` |
+
+High-noise sampling fields:
+
+| Field | Type |
+| --- | --- |
+| `high_noise_sample_params.scheduler` | `string` |
+| `high_noise_sample_params.sample_method` | `string` |
+| `high_noise_sample_params.sample_steps` | `integer` |
+| `high_noise_sample_params.eta` | `number` |
+| `high_noise_sample_params.shifted_timestep` | `integer` |
+| `high_noise_sample_params.flow_shift` | `number` |
+| `high_noise_sample_params.guidance.txt_cfg` | `number` |
+| `high_noise_sample_params.guidance.img_cfg` | `number` |
+| `high_noise_sample_params.guidance.distilled_guidance` | `number` |
+| `high_noise_sample_params.guidance.slg.layers` | `array<integer>` |
+| `high_noise_sample_params.guidance.slg.layer_start` | `number` |
+| `high_noise_sample_params.guidance.slg.layer_end` | `number` |
+| `high_noise_sample_params.guidance.slg.scale` | `number` |
+
+Other native fields:
+
+| Field | Type |
+| --- | --- |
+| `vae_tiling_params` | `object` |
+| `vae_tiling_params.enabled` | `boolean` |
+| `vae_tiling_params.temporal_tiling` | `boolean` |
+| `vae_tiling_params.tile_size_x` | `integer` |
+| `vae_tiling_params.tile_size_y` | `integer` |
+| `vae_tiling_params.target_overlap` | `number` |
+| `vae_tiling_params.rel_size_x` | `number` |
+| `vae_tiling_params.rel_size_y` | `number` |
+| `vae_tiling_params.extra_tiling_args` | `string` |
+| `cache_mode` | `string` |
+| `cache_option` | `string` |
+| `scm_mask` | `string` |
+| `scm_policy_dynamic` | `boolean` |
+
+HTTP-only output fields:
+
+| Field | Type |
+| --- | --- |
+| `output_format` | `string` |
+| `output_compression` | `integer` |
+
+For `vid_gen`, `output_format` and `output_compression` control container encoding.
+`fps` is request metadata for the generated sequence and is echoed in the completed job result.
+
+Allowed `output_format` values:
+
+- `webm`
+- `webp`
+- `avi`
+
+Output format behavior:
+
+- `output_format` defaults to `webm`
+- `webp` means animated WebP
+- `avi` means MJPG AVI
+- `webm` requires the server to be built with WebM support; otherwise the request returns `400`
+
+### Result Payload
+
+Completed jobs return one encoded container payload, not a list of per-frame images.
+
+Result fields:
+
+- `result.b64_json` contains the whole encoded container file as base64
+- `result.mime_type` identifies the media type
+- `result.output_format` echoes the selected container format
+- `result.fps` echoes the effective playback FPS
+- `result.frame_count` reports the actual decoded frame count used to build the container
+
+Expected MIME types:
+
+| `output_format` | `mime_type` |
+| --- | --- |
+| `webm` | `video/webm` |
+| `webp` | `image/webp` |
+| `avi` | `video/x-msvideo` |
+
+### Optional Field Handling
+
+Optional sampling fields may be omitted.
+
+When omitted, backend defaults apply to these fields:
+
+- `sample_params.scheduler`
+- `sample_params.sample_method`
+- `sample_params.eta`
+- `sample_params.flow_shift`
+- `sample_params.guidance.img_cfg`
+- `high_noise_sample_params.scheduler`
+- `high_noise_sample_params.sample_method`
+- `high_noise_sample_params.eta`
+- `high_noise_sample_params.flow_shift`
+- `high_noise_sample_params.guidance.img_cfg`
+
+`high_noise_sample_params` may also be omitted entirely.
+
+### Frame Count Semantics
+
+`video_frames` is the requested target length, but the current core video path internally normalizes the effective frame count to the largest `4n + 1` value that does not exceed the requested count.
+
+Examples:
+
+- `video_frames = 33` stays `33`
+- `video_frames = 34` becomes `33`
+- `video_frames = 32` becomes `29`
+
+The completed job payload includes the actual decoded `frame_count`.
+
+### Completion Result
+
+Example completed job:
+
+```json
+{
+  "id": "job_01HTXYZVID",
+  "kind": "vid_gen",
+  "status": "completed",
+  "created": 1775401200,
+  "started": 1775401203,
+  "completed": 1775401215,
+  "queue_position": 0,
+  "result": {
+    "output_format": "webm",
+    "mime_type": "video/webm",
+    "fps": 16,
+    "frame_count": 33,
+    "b64_json": "GkXfo59ChoEBQveBAULygQRC84EIQo..."
+  },
+  "error": null
+}
+```
+
+The response returns the encoded `.webm`, animated `.webp`, or `.avi` container payload directly.
+
+### Failure Result
+
+Example failed job:
+
+```json
+{
+  "id": "job_01HTXYZVID",
+  "kind": "vid_gen",
+  "status": "failed",
+  "created": 1775401200,
+  "started": 1775401203,
+  "completed": 1775401204,
+  "queue_position": 0,
+  "result": null,
+  "error": {
+    "code": "generation_failed",
+    "message": "generate_video returned no results"
+  }
+}
+```
+
+### Cancelled Result
+
+Example cancelled job:
+
+```json
+{
+  "id": "job_01HTXYZVID",
+  "kind": "vid_gen",
+  "status": "cancelled",
+  "created": 1775401200,
+  "started": null,
+  "completed": 1775401202,
+  "queue_position": 0,
+  "result": null,
+  "error": {
+    "code": "cancelled",
+    "message": "job cancelled by client"
+  }
+}
+```
+
+### Submission Errors
+
+`POST /sdcpp/v1/vid_gen` may return:
+
+- `202 Accepted` when the job is created
+- `400 Bad Request` for an empty body, unsupported model mode, invalid JSON, invalid generation parameters, or an unsupported output format
+- `429 Too Many Requests` when the job queue is full
+- `500 Internal Server Error` for unexpected server exceptions during submission
--- a/examples/server/async_jobs.cpp
+++ b/examples/server/async_jobs.cpp
@ -95,6 +95,10 @@ bool cancel_queued_job(AsyncJobManager& manager, AsyncGenerationJob& job) {
    job.status       = AsyncJobStatus::Cancelled;
    job.completed_at = unix_timestamp_now();
    job.result_images_b64.clear();
+    job.result_media_b64.clear();
+    job.result_media_mime_type.clear();
+    job.result_frame_count = 0;
+    job.result_fps         = 0;
    job.error_code         = "cancelled";
    job.error_message      = "job cancelled by client";
    return true;
@ -122,6 +126,15 @@ json make_async_job_json(const AsyncJobManager& manager, const AsyncGenerationJo
    }

    if (job.status == AsyncJobStatus::Completed) {
+        if (job.kind == AsyncJobKind::VidGen) {
+            result["result"] = {
+                {"output_format", job.vid_gen.output_format},
+                {"mime_type", job.result_media_mime_type},
+                {"fps", job.result_fps},
+                {"frame_count", job.result_frame_count},
+                {"b64_json", job.result_media_b64},
+            };
+        } else {
            json images = json::array();
            for (size_t i = 0; i < job.result_images_b64.size(); ++i) {
                images.push_back({{"index", i}, {"b64_json", job.result_images_b64[i]}});
@ -130,6 +143,7 @@ json make_async_job_json(const AsyncJobManager& manager, const AsyncGenerationJo
                {"output_format", job.img_gen.output_format},
                {"images", images},
            };
+        }
        result["error"] = nullptr;
    } else if (job.status == AsyncJobStatus::Failed ||
               job.status == AsyncJobStatus::Cancelled) {
@ -156,16 +170,15 @@ bool execute_img_gen_job(ServerRuntime& runtime,
    sd_img_gen_params_t params = job.img_gen.to_sd_img_gen_params_t();

    SDImageVec results;
-    int num_results = 0;

    {
        std::lock_guard<std::mutex> lock(*runtime.sd_ctx_mutex);
        sd_image_t* raw_results = generate_image(runtime.sd_ctx, &params);
-        num_results             = params.batch_count;
-        results.adopt(raw_results, num_results);
+        results.adopt(raw_results, params.batch_count);
    }

-    if (results.empty() || num_results <= 0) {
+    const int num_results = results.count();
+    if (num_results <= 0) {
        error_message = "generate_image returned no results";
        return false;
    }
@ -208,6 +221,54 @@ bool execute_img_gen_job(ServerRuntime& runtime,
    return true;
 }

+bool execute_vid_gen_job(ServerRuntime& runtime,  // Generates and encodes the video for one vid_gen job; returns false with error_message set on failure.
+                         AsyncGenerationJob& job,  // request is read from job.vid_gen
+                         std::string& output_media_b64,  // out: base64 of the encoded container bytes (written only on success)
+                         std::string& output_media_mime_type,  // out: video_mime_type() of the requested output format (written only on success)
+                         int& output_frame_count,  // out: number of frames adopted from generate_video (written only on success)
+                         int& output_fps,  // out: copy of job.vid_gen.gen_params.fps (written only on success)
+                         std::string& error_message) {  // out: set on the two failure returns below
+    sd_vid_gen_params_t params = job.vid_gen.to_sd_vid_gen_params_t();
+
+    SDImageVec results;
+    int num_results             = 0;
+    sd_audio_t* generated_audio = nullptr;
+
+    {
+        std::lock_guard<std::mutex> lock(*runtime.sd_ctx_mutex);  // held across generate_video and adopt, released before encoding
+        sd_image_t* raw_results = nullptr;
+        if (!generate_video(runtime.sd_ctx, &params, &raw_results, &num_results, &generated_audio)) {
+            raw_results = nullptr;  // on a false return nothing is adopted; NOTE(review): if generate_video can allocate frames before failing, this drops them unfreed - confirm its contract
+        }
+        results.adopt(raw_results, num_results);
+    }
+
+    num_results = results.count();  // re-read the count from the container instead of trusting the out-parameter
+    if (num_results <= 0) {
+        free_sd_audio(generated_audio);  // may still be nullptr here; NOTE(review): assumes free_sd_audio accepts nullptr - confirm
+        error_message = "generate_video returned no results";
+        return false;
+    }
+
+    std::vector<uint8_t> video_bytes = create_video_from_sd_images_to_vector(job.vid_gen.output_format,
+                                                                             results.data(),
+                                                                             num_results,
+                                                                             job.vid_gen.gen_params.fps,
+                                                                             job.vid_gen.output_compression,
+                                                                             generated_audio);
+    free_sd_audio(generated_audio);  // released right after the encode call, before the empty-output check, so both outcomes free it
+    if (video_bytes.empty()) {
+        error_message = "failed to encode generated video container";
+        return false;
+    }
+
+    output_media_b64       = base64_encode(video_bytes);
+    output_media_mime_type = video_mime_type(job.vid_gen.output_format);
+    output_frame_count     = num_results;  // the adopted frame count, which is what the job result reports as frame_count
+    output_fps             = job.vid_gen.gen_params.fps;
+    return true;
+}
+
 void async_job_worker(ServerRuntime& runtime) {
    AsyncJobManager& manager = *runtime.async_job_manager;

@ -240,11 +301,23 @@ void async_job_worker(ServerRuntime& runtime) {
        }

        std::vector<std::string> output_images;
+        std::string output_media_b64;
+        std::string output_media_mime_type;
+        int output_frame_count = 0;
+        int output_fps         = 0;
        std::string error_message;
        bool ok = false;

        if (job->kind == AsyncJobKind::ImgGen) {
            ok = execute_img_gen_job(runtime, *job, output_images, error_message);
+        } else if (job->kind == AsyncJobKind::VidGen) {
+            ok = execute_vid_gen_job(runtime,
+                                     *job,
+                                     output_media_b64,
+                                     output_media_mime_type,
+                                     output_frame_count,
+                                     output_fps,
+                                     error_message);
        } else {
            error_message = "unsupported job kind";
        }
@ -260,6 +333,10 @@ void async_job_worker(ServerRuntime& runtime) {
            if (ok) {
                job->status                 = AsyncJobStatus::Completed;
                job->result_images_b64      = std::move(output_images);
+                job->result_media_b64       = std::move(output_media_b64);
+                job->result_media_mime_type = std::move(output_media_mime_type);
+                job->result_frame_count     = output_frame_count;
+                job->result_fps             = output_fps;
                job->error_code.clear();
                job->error_message.clear();
            } else {
@ -267,6 +344,10 @@ void async_job_worker(ServerRuntime& runtime) {
                job->error_code    = "generation_failed";
                job->error_message = error_message.empty() ? "unknown generation error" : error_message;
                job->result_images_b64.clear();
+                job->result_media_b64.clear();
+                job->result_media_mime_type.clear();
+                job->result_frame_count = 0;
+                job->result_fps         = 0;
            }

            purge_expired_jobs(manager);
--- a/examples/server/async_jobs.h
+++ b/examples/server/async_jobs.h
@ -36,7 +36,12 @@ struct AsyncGenerationJob {
    int64_t started_at    = 0;
    int64_t completed_at  = 0;
    ImgGenJobRequest img_gen;
+    VidGenJobRequest vid_gen;
    std::vector<std::string> result_images_b64;
+    std::string result_media_b64;
+    std::string result_media_mime_type;
+    int result_frame_count = 0;
+    int result_fps         = 0;
    std::string error_code;
    std::string error_message;
 };
@ -63,4 +68,11 @@ bool execute_img_gen_job(ServerRuntime& runtime,
                         AsyncGenerationJob& job,
                         std::vector<std::string>& output_images,
                         std::string& error_message);
+bool execute_vid_gen_job(ServerRuntime& runtime,
+                         AsyncGenerationJob& job,
+                         std::string& output_media_b64,
+                         std::string& output_media_mime_type,
+                         int& output_frame_count,
+                         int& output_fps,
+                         std::string& error_message);
 void async_job_worker(ServerRuntime& runtime);
--- a/examples/server/frontend
+++ b/examples/server/frontend
@ -1 +1 @@
-Subproject commit 740475a7a6794dc07fb23e8ec5dc56e7e80aa8c1
+Subproject commit 797ccf80825cc035508ba9b599b2a21953e7f835
--- a/examples/server/main.cpp
+++ b/examples/server/main.cpp
@ -48,7 +48,9 @@ static void parse_args(int argc,

    if (!svr_params.resolve_and_validate() ||
        !ctx_params.resolve_and_validate(IMG_GEN) ||
-        !default_gen_params.resolve_and_validate(IMG_GEN, ctx_params.lora_model_dir)) {
+        !default_gen_params.resolve_and_validate(IMG_GEN,
+                                                 ctx_params.lora_model_dir,
+                                                 ctx_params.hires_upscalers_dir)) {
        print_usage(argv[0], options_vec);
        exit(1);
    }
@ -83,7 +85,7 @@ int main(int argc, const char** argv) {
    LOG_DEBUG("%s", ctx_params.to_string().c_str());
    LOG_DEBUG("%s", default_gen_params.to_string().c_str());

-    sd_ctx_params_t sd_ctx_params = ctx_params.to_sd_ctx_params_t(false, false, false);
+    sd_ctx_params_t sd_ctx_params = ctx_params.to_sd_ctx_params_t(false);
    SDCtxPtr sd_ctx(new_sd_ctx(&sd_ctx_params));

    if (sd_ctx == nullptr) {
@ -95,6 +97,8 @@ int main(int argc, const char** argv) {

    std::vector<LoraEntry> lora_cache;
    std::mutex lora_mutex;
+    std::vector<UpscalerEntry> upscaler_cache;
+    std::mutex upscaler_mutex;
    AsyncJobManager async_job_manager;
    ServerRuntime runtime = {
        sd_ctx.get(),
@ -104,6 +108,8 @@ int main(int argc, const char** argv) {
        &default_gen_params,
        &lora_cache,
        &lora_mutex,
+        &upscaler_cache,
+        &upscaler_mutex,
        &async_job_manager,
    };

@ -139,7 +145,7 @@ int main(int argc, const char** argv) {
    register_sdapi_endpoints(svr, runtime);
    register_sdcpp_api_endpoints(svr, runtime);

-    LOG_INFO("listening on: %s:%d\n", svr_params.listen_ip.c_str(), svr_params.listen_port);
+    LOG_INFO("listening on: http://%s:%d\n", svr_params.listen_ip.c_str(), svr_params.listen_port);
    svr.listen(svr_params.listen_ip, svr_params.listen_port);

    {
--- a/examples/server/routes_openai.cpp
+++ b/examples/server/routes_openai.cpp
@ -70,7 +70,7 @@ static bool build_openai_generation_request(const httplib::Request& req,
    }

    // Intentionally disable prompt-embedded LoRA tag parsing for server APIs.
-    if (!request.gen_params.resolve_and_validate(IMG_GEN, "", true)) {
+    if (!request.gen_params.resolve_and_validate(IMG_GEN, "", runtime.ctx_params->hires_upscalers_dir, true)) {
        error_message = "invalid params";
        return false;
    }
@ -212,7 +212,7 @@ static bool build_openai_edit_request(const httplib::Request& req,
    }

    // Intentionally disable prompt-embedded LoRA tag parsing for server APIs.
-    if (!request.gen_params.resolve_and_validate(IMG_GEN, "", true)) {
+    if (!request.gen_params.resolve_and_validate(IMG_GEN, "", runtime.ctx_params->hires_upscalers_dir, true)) {
        error_message = "invalid params";
        return false;
    }
@ -253,6 +253,12 @@ void register_openai_api_endpoints(httplib::Server& svr, ServerRuntime& rt) {

    svr.Post("/v1/images/generations", [runtime](const httplib::Request& req, httplib::Response& res) {
        try {
+            if (!runtime_supports_generation_mode(*runtime, IMG_GEN)) {
+                res.status = 400;
+                res.set_content(json({{"error", unsupported_generation_mode_error(IMG_GEN)}}).dump(), "application/json");
+                return;
+            }
+
            ImgGenJobRequest request;
            std::string error_message;
            if (!build_openai_generation_request(req, *runtime, request, error_message)) {
@ -319,6 +325,12 @@ void register_openai_api_endpoints(httplib::Server& svr, ServerRuntime& rt) {

    svr.Post("/v1/images/edits", [runtime](const httplib::Request& req, httplib::Response& res) {
        try {
+            if (!runtime_supports_generation_mode(*runtime, IMG_GEN)) {
+                res.status = 400;
+                res.set_content(json({{"error", unsupported_generation_mode_error(IMG_GEN)}}).dump(), "application/json");
+                return;
+            }
+
            ImgGenJobRequest request;
            std::string error_message;
            if (!build_openai_edit_request(req, *runtime, request, error_message)) {
--- a/examples/server/routes_sdapi.cpp
+++ b/examples/server/routes_sdapi.cpp
@ -1,6 +1,7 @@
 #include "routes.h"

 #include <algorithm>
+#include <cctype>
 #include <cstring>
 #include <regex>
 #include <string_view>
@ -35,14 +36,20 @@ static fs::path resolve_display_model_path(const ServerRuntime& runtime) {
    return {};
 }

+static std::string lower_ascii(std::string value) {  // Returns a copy of `value` with every char passed through std::tolower.
+    std::transform(value.begin(), value.end(), value.begin(), [](unsigned char c) {  // unsigned char parameter: std::tolower is undefined for negative char values
+        return static_cast<char>(std::tolower(c));  // NOTE(review): std::tolower follows the current C locale, so "ascii" holds only under the default "C" locale
+    });
+    return value;
+}
+
 static enum sample_method_t get_sdapi_sample_method(std::string name) {
    enum sample_method_t result = str_to_sample_method(name.c_str());
    if (result != SAMPLE_METHOD_COUNT) {
        return result;
    }

-    std::transform(name.begin(), name.end(), name.begin(),
-                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+    name = lower_ascii(name);
    static const std::unordered_map<std::string_view, sample_method_t> hardcoded{
        {"euler a", EULER_A_SAMPLE_METHOD},
        {"k_euler_a", EULER_A_SAMPLE_METHOD},
@ -60,6 +67,10 @@ static enum sample_method_t get_sdapi_sample_method(std::string name) {
        {"k_res_multistep", RES_MULTISTEP_SAMPLE_METHOD},
        {"res 2s", RES_2S_SAMPLE_METHOD},
        {"k_res_2s", RES_2S_SAMPLE_METHOD},
+        {"euler_cfg_pp", EULER_CFG_PP_SAMPLE_METHOD},
+        {"k_euler_cfg_pp", EULER_CFG_PP_SAMPLE_METHOD},
+        {"euler_a_cfg_pp", EULER_CFG_PP_SAMPLE_METHOD},
+        {"k_euler_a_cfg_pp", EULER_CFG_PP_SAMPLE_METHOD},
    };
    auto it = hardcoded.find(name);
    return it != hardcoded.end() ? it->second : SAMPLE_METHOD_COUNT;
@ -114,6 +125,18 @@ static bool build_sdapi_img_gen_request(const json& j,
    request.gen_params.width                          = j.value("width", -1);
    request.gen_params.height                         = j.value("height", -1);

+    if (!img2img && j.value("enable_hr", false)) {
+        request.gen_params.hires_enabled = true;
+        request.gen_params.hires_scale   = j.value("hr_scale", request.gen_params.hires_scale);
+        request.gen_params.hires_width   = j.value("hr_resize_x", request.gen_params.hires_width);
+        request.gen_params.hires_height  = j.value("hr_resize_y", request.gen_params.hires_height);
+        request.gen_params.hires_steps   = j.value("hr_steps", request.gen_params.hires_steps);
+        request.gen_params.hires_denoising_strength =
+            j.value("denoising_strength", request.gen_params.hires_denoising_strength);
+
+        request.gen_params.hires_upscaler = j.value("hr_upscaler", request.gen_params.hires_upscaler);
+    }
+
    std::string sd_cpp_extra_args_str = extract_and_remove_sd_cpp_extra_args(request.gen_params.prompt);
    if (!sd_cpp_extra_args_str.empty() && !request.gen_params.from_json_str(sd_cpp_extra_args_str)) {
        error_message = "invalid sd_cpp_extra_args";
@ -228,7 +251,7 @@ static bool build_sdapi_img_gen_request(const json& j,
    }

    // Intentionally disable prompt-embedded LoRA tag parsing for server APIs.
-    if (!request.gen_params.resolve_and_validate(IMG_GEN, "", true)) {
+    if (!request.gen_params.resolve_and_validate(IMG_GEN, "", runtime.ctx_params->hires_upscalers_dir, true)) {
        error_message = "invalid params";
        return false;
    }
@ -246,6 +269,11 @@ void register_sdapi_endpoints(httplib::Server& svr, ServerRuntime& rt) {
                res.set_content(R"({"error":"empty body"})", "application/json");
                return;
            }
+            if (!runtime_supports_generation_mode(*runtime, IMG_GEN)) {
+                res.status = 400;
+                res.set_content(json({{"error", unsupported_generation_mode_error(IMG_GEN)}}).dump(), "application/json");
+                return;
+            }

            json j = json::parse(req.body);
            ImgGenJobRequest request;
@ -342,6 +370,52 @@ void register_sdapi_endpoints(httplib::Server& svr, ServerRuntime& rt) {
        res.set_content(result.dump(), "application/json");
    });

+    svr.Get("/sdapi/v1/upscalers", [runtime](const httplib::Request&, httplib::Response& res) {  // Lists three built-in upscalers followed by every entry in the upscaler cache.
+        refresh_upscaler_cache(*runtime);  // called before upscaler_mutex is taken below; presumably it locks internally - verify
+
+        auto make_builtin = [](const char* name) {  // built-in entries carry no model name, path or URL
+            json item;
+            item["name"]       = name;
+            item["model_name"] = nullptr;
+            item["model_path"] = nullptr;
+            item["model_url"]  = nullptr;
+            item["scale"]      = 4;  // NOTE(review): every built-in reports scale 4, including "None" - confirm this is intended
+            return item;
+        };
+
+        json result = json::array();
+        result.push_back(make_builtin("None"));
+        result.push_back(make_builtin("Lanczos"));
+        result.push_back(make_builtin("Nearest"));
+
+        {
+            std::lock_guard<std::mutex> lock(*runtime->upscaler_mutex);  // guards the cache only while it is copied into the response
+            for (const auto& e : *runtime->upscaler_cache) {
+                json item;
+                item["name"]       = e.name;
+                item["model_name"] = e.model_name;
+                item["model_path"] = e.fullpath;
+                item["model_url"]  = nullptr;
+                item["scale"]      = e.scale;
+                result.push_back(item);
+            }
+        }
+
+        res.set_content(result.dump(), "application/json");  // serialized after the lock scope has ended
+    });
+
+    svr.Get("/sdapi/v1/latent-upscale-modes", [](const httplib::Request&, httplib::Response& res) {  // Returns a fixed list of latent upscale mode names; captures no runtime state.
+        json result = json::array({  // the same six names are listed as accepted `hires.upscaler` values in the API docs
+            {{"name", "Latent"}},
+            {{"name", "Latent (nearest)"}},
+            {{"name", "Latent (nearest-exact)"}},
+            {{"name", "Latent (antialiased)"}},
+            {{"name", "Latent (bicubic)"}},
+            {{"name", "Latent (bicubic antialiased)"}},
+        });
+        res.set_content(result.dump(), "application/json");
+    });
+
    svr.Get("/sdapi/v1/samplers", [runtime](const httplib::Request&, httplib::Response& res) {
        std::vector<std::string> sampler_names;
        sampler_names.push_back("default");
--- a/examples/server/routes_sdcpp.cpp
+++ b/examples/server/routes_sdcpp.cpp
@ -56,11 +56,13 @@ static const char* capability_sample_method_name(enum sample_method_t sample_met
 static json make_vae_tiling_json(const sd_tiling_params_t& params) {
    return {
        {"enabled", params.enabled},
+        {"temporal_tiling", params.temporal_tiling},
        {"tile_size_x", params.tile_size_x},
        {"tile_size_y", params.tile_size_y},
        {"target_overlap", params.target_overlap},
        {"rel_size_x", params.rel_size_x},
        {"rel_size_y", params.rel_size_y},
+        {"extra_tiling_args", params.extra_tiling_args ? params.extra_tiling_args : ""},
    };
 }

@ -75,61 +77,9 @@ static fs::path resolve_display_model_path(const ServerRuntime& runtime) {
    return {};
 }

-static json make_capabilities_json(ServerRuntime& runtime) {
-    refresh_lora_cache(runtime);
-
-    AsyncJobManager& manager  = *runtime.async_job_manager;
-    const auto& defaults      = *runtime.default_gen_params;
-    const auto& sample_params = defaults.sample_params;
+static json make_sample_params_json(const sd_sample_params_t& sample_params, const std::vector<int>& skip_layers) {
    const auto& guidance = sample_params.guidance;
-    const fs::path model_path = resolve_display_model_path(runtime);
-    json samplers             = json::array();
-    json schedulers           = json::array();
-    json output_formats       = json::array({"png", "jpeg"});
-    json available_loras      = json::array();
-
-    for (int i = 0; i < SAMPLE_METHOD_COUNT; ++i) {
-        samplers.push_back(sd_sample_method_name((sample_method_t)i));
-    }
-
-    for (int i = 0; i < SCHEDULER_COUNT; ++i) {
-        schedulers.push_back(sd_scheduler_name((scheduler_t)i));
-    }
-
-#ifdef SD_USE_WEBP
-    output_formats.push_back("webp");
-#endif
-
-    {
-        std::lock_guard<std::mutex> lock(*runtime.lora_mutex);
-        for (const auto& entry : *runtime.lora_cache) {
-            available_loras.push_back({
-                {"name", entry.name},
-                {"path", entry.path},
-            });
-        }
-    }
-
-    json result;
-    result["model"] = {
-        {"name", model_path.filename().u8string()},
-        {"stem", model_path.stem().u8string()},
-        {"path", model_path.u8string()},
-    };
-    result["defaults"] = {
-        {"prompt", defaults.prompt},
-        {"negative_prompt", defaults.negative_prompt},
-        {"clip_skip", defaults.clip_skip},
-        {"width", defaults.width > 0 ? defaults.width : 512},
-        {"height", defaults.height > 0 ? defaults.height : 512},
-        {"strength", defaults.strength},
-        {"seed", defaults.seed},
-        {"batch_count", defaults.batch_count},
-        {"auto_resize_ref_image", defaults.auto_resize_ref_image},
-        {"increase_ref_index", defaults.increase_ref_index},
-        {"control_strength", defaults.control_strength},
-        {"sample_params",
-         {
+    return {
        {"scheduler", capability_scheduler_name(sample_params.scheduler)},
        {"sample_method", capability_sample_method_name(sample_params.sample_method)},
        {"sample_steps", sample_params.sample_steps},
@ -143,21 +93,239 @@ static json make_capabilities_json(ServerRuntime& runtime) {
             {"distilled_guidance", guidance.distilled_guidance},
             {"slg",
              {
-                       {"layers", defaults.skip_layers},
+                  {"layers", skip_layers},
                  {"layer_start", guidance.slg.layer_start},
                  {"layer_end", guidance.slg.layer_end},
                  {"scale", guidance.slg.scale},
              }},
         }},
-         }},
+    };
+}
+
+// Builds the "hires" JSON object from the hires_* fields of the given
+// generation parameters.
+// Note that the JSON keys "target_width" / "target_height" are filled from
+// the hires_width / hires_height fields (the names differ on purpose or by
+// convention -- the key names are what clients see).
+static json make_hires_json(const SDGenerationParams& defaults) {
+    return {
+        {"enabled", defaults.hires_enabled},
+        {"upscaler", defaults.hires_upscaler},
+        {"scale", defaults.hires_scale},
+        {"target_width", defaults.hires_width},
+        {"target_height", defaults.hires_height},
+        {"steps", defaults.hires_steps},
+        {"denoising_strength", defaults.hires_denoising_strength},
+        {"custom_sigmas", defaults.hires_custom_sigmas},
+        {"upscale_tile_size", defaults.hires_upscale_tile_size},
+    };
+}
+
+static json make_img_gen_defaults_json(const SDGenerationParams& defaults, const std::string& output_format) {
+    return {
+        {"prompt", defaults.prompt},
+        {"negative_prompt", defaults.negative_prompt},
+        {"clip_skip", defaults.clip_skip},
+        {"width", defaults.width > 0 ? defaults.width : 512},
+        {"height", defaults.height > 0 ? defaults.height : 512},
+        {"strength", defaults.strength},
+        {"seed", defaults.seed},
+        {"batch_count", defaults.batch_count},
+        {"auto_resize_ref_image", defaults.auto_resize_ref_image},
+        {"increase_ref_index", defaults.increase_ref_index},
+        {"control_strength", defaults.control_strength},
+        {"sample_params", make_sample_params_json(defaults.sample_params, defaults.skip_layers)},
+        {"hires", make_hires_json(defaults)},
        {"vae_tiling_params", make_vae_tiling_json(defaults.vae_tiling_params)},
        {"cache_mode", defaults.cache_mode},
        {"cache_option", defaults.cache_option},
        {"scm_mask", defaults.scm_mask},
        {"scm_policy_dynamic", defaults.scm_policy_dynamic},
-        {"output_format", "png"},
+        {"output_format", output_format},
        {"output_compression", 100},
    };
+}
+
+// Builds the default-parameter JSON object reported for the "vid_gen" mode.
+//
+// defaults      - generation parameters the values are read from.
+// output_format - value reported under "output_format"; chosen by the caller.
+//
+// Width and height fall back to 512 when the configured value is not
+// positive. "output_compression" is always reported as 100.
+static json make_vid_gen_defaults_json(const SDGenerationParams& defaults, const std::string& output_format) {
+    return {
+        {"prompt", defaults.prompt},
+        {"negative_prompt", defaults.negative_prompt},
+        {"clip_skip", defaults.clip_skip},
+        {"width", defaults.width > 0 ? defaults.width : 512},
+        {"height", defaults.height > 0 ? defaults.height : 512},
+        {"strength", defaults.strength},
+        {"seed", defaults.seed},
+        {"video_frames", defaults.video_frames},
+        {"fps", defaults.fps},
+        {"moe_boundary", defaults.moe_boundary},
+        {"vace_strength", defaults.vace_strength},
+        // Two sampler configurations are reported, each paired with its own
+        // skip-layer list.
+        {"sample_params", make_sample_params_json(defaults.sample_params, defaults.skip_layers)},
+        {"high_noise_sample_params", make_sample_params_json(defaults.high_noise_sample_params, defaults.high_noise_skip_layers)},
+        {"hires", make_hires_json(defaults)},
+        {"vae_tiling_params", make_vae_tiling_json(defaults.vae_tiling_params)},
+        {"cache_mode", defaults.cache_mode},
+        {"cache_option", defaults.cache_option},
+        {"scm_mask", defaults.scm_mask},
+        {"scm_policy_dynamic", defaults.scm_policy_dynamic},
+        {"output_format", output_format},
+        {"output_compression", 100},
+    };
+}
+
+// Returns the fixed feature-flag object advertised for the "img_gen" mode.
+// All values are compile-time constants; nothing is probed at runtime.
+static json make_img_gen_features_json() {
+    return {
+        {"init_image", true},
+        {"mask_image", true},
+        {"control_image", true},
+        {"ref_images", true},
+        {"lora", true},
+        {"vae_tiling", true},
+        {"hires", true},
+        {"cache", true},
+        {"cancel_queued", true},
+        {"cancel_generating", false},
+    };
+}
+
+// Returns the fixed feature-flag object advertised for the "vid_gen" mode.
+// All values are compile-time constants; nothing is probed at runtime.
+// NOTE(review): unlike the img_gen feature set there is no "hires" flag here,
+// although the vid_gen defaults do include a "hires" object -- confirm
+// whether the omission is intentional.
+static json make_vid_gen_features_json() {
+    return {
+        {"init_image", true},
+        {"end_image", true},
+        {"control_frames", true},
+        {"high_noise_sample_params", true},
+        {"lora", true},
+        {"vae_tiling", true},
+        {"cache", true},
+        {"cancel_queued", true},
+        {"cancel_generating", false},
+    };
+}
+
+static json make_capabilities_json(ServerRuntime& runtime) {
+    refresh_lora_cache(runtime);
+    refresh_upscaler_cache(runtime);
+
+    AsyncJobManager& manager  = *runtime.async_job_manager;
+    const auto& defaults      = *runtime.default_gen_params;
+    const fs::path model_path = resolve_display_model_path(runtime);
+    const bool supports_img   = runtime_supports_generation_mode(runtime, IMG_GEN);
+    const bool supports_vid   = runtime_supports_generation_mode(runtime, VID_GEN);
+    json samplers             = json::array();
+    json schedulers           = json::array();
+    json image_output_formats = supported_img_output_formats();
+    json video_output_formats = supported_vid_output_formats();
+    json available_loras      = json::array();
+    json available_upscalers  = json::array();
+    json supported_modes      = json::array();
+
+    for (int i = 0; i < SAMPLE_METHOD_COUNT; ++i) {
+        samplers.push_back(sd_sample_method_name((sample_method_t)i));
+    }
+
+    for (int i = 0; i < SCHEDULER_COUNT; ++i) {
+        schedulers.push_back(sd_scheduler_name((scheduler_t)i));
+    }
+
+    {
+        std::lock_guard<std::mutex> lock(*runtime.lora_mutex);
+        for (const auto& entry : *runtime.lora_cache) {
+            available_loras.push_back({
+                {"name", entry.name},
+                {"path", entry.path},
+            });
+        }
+    }
+
+    available_upscalers.push_back({
+        {"name", "None"},
+    });
+    available_upscalers.push_back({
+        {"name", "Lanczos"},
+    });
+    available_upscalers.push_back({
+        {"name", "Nearest"},
+    });
+    available_upscalers.push_back({
+        {"name", "Latent"},
+    });
+    available_upscalers.push_back({
+        {"name", "Latent (nearest)"},
+    });
+    available_upscalers.push_back({
+        {"name", "Latent (nearest-exact)"},
+    });
+    available_upscalers.push_back({
+        {"name", "Latent (antialiased)"},
+    });
+    available_upscalers.push_back({
+        {"name", "Latent (bicubic)"},
+    });
+    available_upscalers.push_back({
+        {"name", "Latent (bicubic antialiased)"},
+    });
+    {
+        std::lock_guard<std::mutex> lock(*runtime.upscaler_mutex);
+        for (const auto& entry : *runtime.upscaler_cache) {
+            available_upscalers.push_back({
+                {"name", entry.name},
+            });
+        }
+    }
+
+    if (supports_img) {
+        supported_modes.push_back("img_gen");
+    }
+    if (supports_vid) {
+        supported_modes.push_back("vid_gen");
+    }
+
+    std::string default_img_output_format = "png";
+    std::string default_vid_output_format = "avi";
+    if (!image_output_formats.empty()) {
+        default_img_output_format = image_output_formats[0].get<std::string>();
+    }
+    if (!video_output_formats.empty()) {
+        default_vid_output_format = video_output_formats[0].get<std::string>();
+    }
+
+    json defaults_by_mode       = json::object();
+    json output_formats_by_mode = json::object();
+    json features_by_mode       = json::object();
+    if (supports_img) {
+        defaults_by_mode["img_gen"]       = make_img_gen_defaults_json(defaults, default_img_output_format);
+        output_formats_by_mode["img_gen"] = image_output_formats;
+        features_by_mode["img_gen"]       = make_img_gen_features_json();
+    }
+    if (supports_vid) {
+        defaults_by_mode["vid_gen"]       = make_vid_gen_defaults_json(defaults, default_vid_output_format);
+        output_formats_by_mode["vid_gen"] = video_output_formats;
+        features_by_mode["vid_gen"]       = make_vid_gen_features_json();
+    }
+
+    json top_level_defaults       = json::object();
+    json top_level_output_formats = json::array();
+    json top_level_features       = {
+              {"cancel_queued", true},
+              {"cancel_generating", false},
+    };
+    std::string current_mode = "";
+    if (supports_img) {
+        current_mode             = "img_gen";
+        top_level_defaults       = defaults_by_mode["img_gen"];
+        top_level_output_formats = output_formats_by_mode["img_gen"];
+        top_level_features       = features_by_mode["img_gen"];
+    } else if (supports_vid) {
+        current_mode             = "vid_gen";
+        top_level_defaults       = defaults_by_mode["vid_gen"];
+        top_level_output_formats = output_formats_by_mode["vid_gen"];
+        top_level_features       = features_by_mode["vid_gen"];
+    }
+
+    json result;
+    result["model"] = {
+        {"name", model_path.filename().u8string()},
+        {"stem", model_path.stem().u8string()},
+        {"path", model_path.u8string()},
+    };
+    result["current_mode"]     = current_mode;
+    result["supported_modes"]  = supported_modes;
+    result["defaults"]         = top_level_defaults;
+    result["defaults_by_mode"] = defaults_by_mode;
    result["limits"]           = {
                  {"min_width", 64},
                  {"max_width", 4096},
@ -168,19 +336,12 @@ static json make_capabilities_json(ServerRuntime& runtime) {
    };
    result["samplers"]               = samplers;
    result["schedulers"]             = schedulers;
-    result["output_formats"] = output_formats;
-    result["features"]       = {
-              {"init_image", true},
-              {"mask_image", true},
-              {"control_image", true},
-              {"ref_images", true},
-              {"lora", true},
-              {"vae_tiling", true},
-              {"cache", true},
-              {"cancel_queued", true},
-              {"cancel_generating", false},
-    };
+    result["output_formats"]         = top_level_output_formats;
+    result["output_formats_by_mode"] = output_formats_by_mode;
+    result["features"]               = top_level_features;
+    result["features_by_mode"]       = features_by_mode;
    result["loras"]                  = available_loras;
+    result["upscalers"]              = available_upscalers;
    return result;
 }

@ -204,7 +365,34 @@ static bool parse_img_gen_request(const json& body,
        return false;
    }
    // Intentionally disable prompt-embedded LoRA tag parsing for server APIs.
-    if (!request.gen_params.resolve_and_validate(IMG_GEN, "", true)) {
+    if (!request.gen_params.resolve_and_validate(IMG_GEN, "", runtime.ctx_params->hires_upscalers_dir, true)) {
+        error_message = "invalid generation parameters";
+        return false;
+    }
+    return true;
+}
+
+static bool parse_vid_gen_request(const json& body,
+                                  ServerRuntime& runtime,
+                                  VidGenJobRequest& request,
+                                  std::string& error_message) {
+    request.gen_params = *runtime.default_gen_params;
+
+    refresh_lora_cache(runtime);
+    if (!request.gen_params.from_json_str(body.dump(), [&](const std::string& path) {
+            return get_lora_full_path(runtime, path);
+        })) {
+        error_message = "invalid generation parameters";
+        return false;
+    }
+
+    std::string output_format = body.value("output_format", "webm");
+    int output_compression    = body.value("output_compression", 100);
+    if (!assign_output_options(request, output_format, output_compression, error_message)) {
+        return false;
+    }
+    // Intentionally disable prompt-embedded LoRA tag parsing for server APIs.
+    if (!request.gen_params.resolve_and_validate(VID_GEN, "", runtime.ctx_params->hires_upscalers_dir, true)) {
        error_message = "invalid generation parameters";
        return false;
    }
@ -226,6 +414,11 @@ void register_sdcpp_api_endpoints(httplib::Server& svr, ServerRuntime& rt) {
                res.set_content(R"({"error":"empty body"})", "application/json");
                return;
            }
+            if (!runtime_supports_generation_mode(*runtime, IMG_GEN)) {
+                res.status = 400;
+                res.set_content(json({{"error", unsupported_generation_mode_error(IMG_GEN)}}).dump(), "application/json");
+                return;
+            }

            json body = json::parse(req.body);
            ImgGenJobRequest request;
@ -276,9 +469,66 @@ void register_sdcpp_api_endpoints(httplib::Server& svr, ServerRuntime& rt) {
        }
    });

-    svr.Post("/sdcpp/v1/vid_gen", [](const httplib::Request&, httplib::Response& res) {
-        res.status = 501;
-        res.set_content(R"({"error":"vid_gen is reserved and not implemented yet"})", "application/json");
+    svr.Post("/sdcpp/v1/vid_gen", [runtime](const httplib::Request& req, httplib::Response& res) {
+        try {
+            if (req.body.empty()) {
+                res.status = 400;
+                res.set_content(R"({"error":"empty body"})", "application/json");
+                return;
+            }
+            if (!runtime_supports_generation_mode(*runtime, VID_GEN)) {
+                res.status = 400;
+                res.set_content(json({{"error", unsupported_generation_mode_error(VID_GEN)}}).dump(), "application/json");
+                return;
+            }
+
+            json body = json::parse(req.body);
+            VidGenJobRequest request;
+            std::string error_message;
+            if (!parse_vid_gen_request(body, *runtime, request, error_message)) {
+                res.status = 400;
+                res.set_content(json({{"error", error_message}}).dump(), "application/json");
+                return;
+            }
+
+            AsyncJobManager& manager                = *runtime->async_job_manager;
+            std::shared_ptr<AsyncGenerationJob> job = std::make_shared<AsyncGenerationJob>();
+            job->kind                               = AsyncJobKind::VidGen;
+            job->status                             = AsyncJobStatus::Queued;
+            job->created_at                         = unix_timestamp_now();
+            job->vid_gen                            = std::move(request);
+
+            {
+                std::lock_guard<std::mutex> lock(manager.mutex);
+                purge_expired_jobs(manager);
+                if (count_pending_jobs(manager) >= manager.max_pending_jobs) {
+                    res.status = 429;
+                    res.set_content(R"({"error":"job queue is full"})", "application/json");
+                    return;
+                }
+                job->id               = make_async_job_id(manager);
+                manager.jobs[job->id] = job;
+                manager.queue.push_back(job->id);
+            }
+
+            manager.cv.notify_one();
+
+            json out;
+            out["id"]       = job->id;
+            out["kind"]     = async_job_kind_name(job->kind);
+            out["status"]   = async_job_status_name(job->status);
+            out["created"]  = job->created_at;
+            out["poll_url"] = "/sdcpp/v1/jobs/" + job->id;
+
+            res.status = 202;
+            res.set_content(out.dump(), "application/json");
+        } catch (const json::parse_error& e) {
+            res.status = 400;
+            res.set_content(json({{"error", "invalid json"}, {"message", e.what()}}).dump(), "application/json");
+        } catch (const std::exception& e) {
+            res.status = 500;
+            res.set_content(json({{"error", "server_error"}, {"message", e.what()}}).dump(), "application/json");
+        }
    });

    svr.Get(R"(/sdcpp/v1/jobs/([A-Za-z0-9_\-]+))", [runtime](const httplib::Request& req, httplib::Response& res) {
--- a/examples/server/runtime.cpp
+++ b/examples/server/runtime.cpp
@ -1,6 +1,7 @@
 #include "runtime.h"

 #include <algorithm>
+#include <cctype>
 #include <chrono>
 #include <cstdlib>
 #include <filesystem>
@ -13,6 +14,18 @@

 namespace fs = std::filesystem;

+// Returns a copy of `value` with every character passed through
+// std::tolower. Each char is widened via unsigned char first so that
+// negative char values are never handed to std::tolower.
+static std::string lower_ascii(std::string value) {
+    for (char& ch : value) {
+        ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
+    }
+    return value;
+}
+
+// Reports whether the path's extension, compared case-insensitively, is one
+// of the model file extensions this server scans for.
+static bool is_supported_model_ext(const fs::path& p) {
+    static const char* const k_model_exts[] = {".gguf", ".pt", ".pth", ".safetensors"};
+
+    const std::string ext = lower_ascii(p.extension().string());
+    for (const char* candidate : k_model_exts) {
+        if (ext == candidate) {
+            return true;
+        }
+    }
+    return false;
+}
+
 static const std::string k_base64_chars =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
@ -45,6 +58,44 @@ std::string normalize_output_format(std::string output_format) {
    return output_format;
 }

+// Lists the image output formats this build can produce, in preference
+// order. "png" and "jpeg" are always present; "webp" is appended only when
+// the build defines SD_USE_WEBP and the caller passes allow_webp = true.
+std::vector<std::string> supported_img_output_formats(bool allow_webp) {
+    std::vector<std::string> result;
+    result.emplace_back("png");
+    result.emplace_back("jpeg");
+#ifndef SD_USE_WEBP
+    // Parameter is unused in builds without WebP support.
+    (void)allow_webp;
+#else
+    if (allow_webp) {
+        result.emplace_back("webp");
+    }
+#endif
+    return result;
+}
+
+// Lists the video output formats this build can produce, in preference
+// order: "webm" (only with SD_USE_WEBM), "webp" (only with SD_USE_WEBP),
+// then "avi", which is always available and therefore always last.
+std::vector<std::string> supported_vid_output_formats() {
+    return {
+#ifdef SD_USE_WEBM
+        "webm",
+#endif
+#ifdef SD_USE_WEBP
+        "webp",
+#endif
+        "avi",
+    };
+}
+
+// Builds the error text returned for an unsupported video output format,
+// listing every format from supported_vid_output_formats() separated by
+// ", " inside square brackets.
+static std::string valid_vid_output_formats_message() {
+    std::string message = "invalid output_format, must be one of [";
+
+    // Empty before the first entry, ", " before every following one.
+    const char* separator = "";
+    for (const std::string& name : supported_vid_output_formats()) {
+        message += separator;
+        message += name;
+        separator = ", ";
+    }
+
+    message += "]";
+    return message;
+}
+
 bool assign_output_options(ImgGenJobRequest& request,
                           std::string output_format,
                           int output_compression,
@ -53,19 +104,88 @@ bool assign_output_options(ImgGenJobRequest& request,
    request.output_format      = normalize_output_format(std::move(output_format));
    request.output_compression = std::clamp(output_compression, 0, 100);

-    const bool valid_format = request.output_format == "png" ||
-                              request.output_format == "jpeg" ||
-                              (allow_webp && request.output_format == "webp");
+    const std::vector<std::string> valid_formats = supported_img_output_formats(allow_webp);
+    const bool valid_format                      = std::find(valid_formats.begin(),
+                                                             valid_formats.end(),
+                                                             request.output_format) != valid_formats.end();
    if (!valid_format) {
-        error_message = allow_webp
-                            ? "invalid output_format, must be one of [png, jpeg, webp]"
-                            : "invalid output_format, must be one of [png, jpeg]";
+        error_message = "invalid output_format, must be one of [";
+        for (size_t i = 0; i < valid_formats.size(); ++i) {
+            if (i > 0) {
+                error_message += ", ";
+            }
+            error_message += valid_formats[i];
+        }
+        error_message += "]";
        return false;
    }

    return true;
 }

+// Normalizes and validates the output options of a video generation request.
+//
+// request            - receives the normalized format and the compression
+//                      value clamped to [0, 100]; both are written even when
+//                      validation fails.
+// output_format      - format name as supplied by the client.
+// output_compression - compression/quality value as supplied by the client.
+// error_message      - set to a message listing the valid formats when the
+//                      format is rejected; left untouched on success.
+//
+// Returns true when the normalized format is one this build supports.
+//
+// The set of accepted formats comes from supported_vid_output_formats(), the
+// same list that is advertised to clients, so the validation cannot drift
+// from the advertised capabilities (this mirrors the image overload).
+bool assign_output_options(VidGenJobRequest& request,
+                           std::string output_format,
+                           int output_compression,
+                           std::string& error_message) {
+    request.output_format      = normalize_output_format(std::move(output_format));
+    request.output_compression = std::clamp(output_compression, 0, 100);
+
+    const std::vector<std::string> valid_formats = supported_vid_output_formats();
+    const bool valid_format                      = std::find(valid_formats.begin(),
+                                                             valid_formats.end(),
+                                                             request.output_format) != valid_formats.end();
+    if (!valid_format) {
+        error_message = valid_vid_output_formats_message();
+        return false;
+    }
+
+    return true;
+}
+
+// Maps a video output format name to the MIME type used for it:
+// "webm" -> video/webm, "webp" -> image/webp, anything else ->
+// video/x-msvideo.
+std::string video_mime_type(const std::string& output_format) {
+    const bool is_webm = output_format == "webm";
+    const bool is_webp = output_format == "webp";
+    return is_webm   ? "video/webm"
+           : is_webp ? "image/webp"
+                     : "video/x-msvideo";
+}
+
+// Reports whether the loaded context can serve the given mode. VID_GEN and
+// IMG_GEN are answered by the corresponding sd_ctx_supports_* query on
+// runtime.sd_ctx; every other mode is reported as supported.
+bool runtime_supports_generation_mode(const ServerRuntime& runtime, SDMode mode) {
+    switch (mode) {
+        case VID_GEN:
+            return sd_ctx_supports_video_generation(runtime.sd_ctx);
+        case IMG_GEN:
+            return sd_ctx_supports_image_generation(runtime.sd_ctx);
+        default:
+            return true;
+    }
+}
+
+// Returns the error text sent to clients when the loaded model cannot serve
+// the requested mode. Modes other than VID_GEN / IMG_GEN get a generic text.
+std::string unsupported_generation_mode_error(SDMode mode) {
+    switch (mode) {
+        case VID_GEN:
+            return "loaded model does not support vid_gen";
+        case IMG_GEN:
+            return "loaded model does not support img_gen";
+        default:
+            return "loaded model does not support requested mode";
+    }
+}
+
 ArgOptions SDSvrParams::get_options() {
    ArgOptions options;

@ -83,8 +203,9 @@ ArgOptions SDSvrParams::get_options() {
        {"", "--color", "colors the logging tags according to level", true, &color},
    };

-    auto on_help_arg = [&](int, const char**, int) {
+    auto on_help_arg = [&](int, const char**, int, bool& valid) {
        normal_exit = true;
+        valid       = true;
        return -1;
    };

@ -134,20 +255,12 @@ void refresh_lora_cache(ServerRuntime& rt) {

    fs::path lora_dir = rt.ctx_params->lora_model_dir;
    if (fs::exists(lora_dir) && fs::is_directory(lora_dir)) {
-        auto is_lora_ext = [](const fs::path& p) {
-            auto ext = p.extension().string();
-            std::transform(ext.begin(), ext.end(), ext.begin(), [](unsigned char c) {
-                return static_cast<char>(std::tolower(c));
-            });
-            return ext == ".gguf" || ext == ".pt" || ext == ".pth" || ext == ".safetensors";
-        };
-
-        for (auto& entry : fs::recursive_directory_iterator(lora_dir)) {
+        for (auto& entry : fs::recursive_directory_iterator(lora_dir, fs::directory_options::skip_permission_denied)) {
            if (!entry.is_regular_file()) {
                continue;
            }
            const fs::path& p = entry.path();
-            if (!is_lora_ext(p)) {
+            if (!is_supported_model_ext(p)) {
                continue;
            }

@ -179,6 +292,40 @@ std::string get_lora_full_path(ServerRuntime& rt, const std::string& path) {
    return it != rt.lora_cache->end() ? it->fullpath : "";
 }

+// Rescans the configured hires upscaler directory and replaces the runtime's
+// upscaler cache with the result.
+//
+// Only regular files directly inside the directory (no recursion) whose
+// extension passes is_supported_model_ext() are listed. Entries are sorted
+// by name. The new list is built without holding the lock and swapped in
+// under rt.upscaler_mutex, so readers only ever see a complete list.
+// A missing or non-directory path results in an empty cache.
+void refresh_upscaler_cache(ServerRuntime& rt) {
+    std::vector<UpscalerEntry> new_cache;
+
+    fs::path upscaler_dir = rt.ctx_params->hires_upscalers_dir;
+    if (fs::exists(upscaler_dir) && fs::is_directory(upscaler_dir)) {
+        // skip_permission_denied keeps an unreadable directory from throwing
+        // out of the refresh, matching the LoRA directory scan.
+        for (auto& entry : fs::directory_iterator(upscaler_dir, fs::directory_options::skip_permission_denied)) {
+            if (!entry.is_regular_file()) {
+                continue;
+            }
+            const fs::path& p = entry.path();
+            if (!is_supported_model_ext(p)) {
+                continue;
+            }
+
+            UpscalerEntry upscaler_entry;
+            upscaler_entry.name     = p.stem().u8string();
+            upscaler_entry.fullpath = fs::absolute(p).lexically_normal().u8string();
+            // NOTE(review): the model name is a fixed label for every file,
+            // it is not derived from the file contents -- confirm intent.
+            upscaler_entry.model_name = "ESRGAN_4x";
+            upscaler_entry.path       = p.filename().u8string();
+
+            new_cache.push_back(std::move(upscaler_entry));
+        }
+    }
+
+    std::sort(new_cache.begin(), new_cache.end(), [](const UpscalerEntry& a, const UpscalerEntry& b) {
+        return a.name < b.name;
+    });
+
+    {
+        std::lock_guard<std::mutex> lock(*rt.upscaler_mutex);
+        *rt.upscaler_cache = std::move(new_cache);
+    }
+}
+
 int64_t unix_timestamp_now() {
    return std::chrono::duration_cast<std::chrono::seconds>(
               std::chrono::system_clock::now().time_since_epoch())
--- a/examples/server/runtime.h
+++ b/examples/server/runtime.h
@ -37,6 +37,14 @@ struct LoraEntry {
    std::string fullpath;
 };

+// One upscaler model file discovered by refresh_upscaler_cache().
+struct UpscalerEntry {
+    // File name without its extension.
+    std::string name;
+    // File name including its extension.
+    std::string path;
+    // Absolute, lexically normalized path to the file.
+    std::string fullpath;
+    // Model label; the directory scan assigns a fixed value to every entry.
+    std::string model_name;
+    // NOTE(review): presumably the upscale factor of the model; the scan
+    // never overrides this default -- confirm.
+    int scale = 4;
+};
+
 struct ServerRuntime {
    sd_ctx_t* sd_ctx;
    std::mutex* sd_ctx_mutex;
@ -45,6 +53,8 @@ struct ServerRuntime {
    const SDGenerationParams* default_gen_params;
    std::vector<LoraEntry>* lora_cache;
    std::mutex* lora_mutex;
+    std::vector<UpscalerEntry>* upscaler_cache;
+    std::mutex* upscaler_mutex;
    AsyncJobManager* async_job_manager;
 };

@ -58,13 +68,33 @@ struct ImgGenJobRequest {
    }
 };

+// A parsed video generation request: the generation parameters plus the
+// output container options chosen for the result.
+struct VidGenJobRequest {
+    SDGenerationParams gen_params;
+    // Initial values only; assign_output_options() overwrites both fields.
+    // NOTE(review): "webm" is only accepted by builds with SD_USE_WEBM --
+    // confirm this default is never used unvalidated.
+    std::string output_format = "webm";
+    int output_compression    = 100;
+
+    // Forwards to gen_params to obtain the C API parameter struct.
+    sd_vid_gen_params_t to_sd_vid_gen_params_t() {
+        return gen_params.to_sd_vid_gen_params_t();
+    }
+};
+
 std::string base64_encode(const std::vector<uint8_t>& bytes);
 std::string normalize_output_format(std::string output_format);
+std::vector<std::string> supported_img_output_formats(bool allow_webp = true);
+std::vector<std::string> supported_vid_output_formats();
 bool assign_output_options(ImgGenJobRequest& request,
                           std::string output_format,
                           int output_compression,
                           bool allow_webp,
                           std::string& error_message);
+bool assign_output_options(VidGenJobRequest& request,
+                           std::string output_format,
+                           int output_compression,
+                           std::string& error_message);
+std::string video_mime_type(const std::string& output_format);
+bool runtime_supports_generation_mode(const ServerRuntime& runtime, SDMode mode);
+std::string unsupported_generation_mode_error(SDMode mode);
 void refresh_lora_cache(ServerRuntime& rt);
 std::string get_lora_full_path(ServerRuntime& rt, const std::string& path);
+void refresh_upscaler_cache(ServerRuntime& rt);
 int64_t unix_timestamp_now();
--- a/format-code.ps1
+++ b/format-code.ps1
@ -0,0 +1,54 @@
+# Runs clang-format in place over the project's C/C++ sources.
+# Patterns are resolved relative to the current directory, so the script is
+# expected to be started from the repository root.
+$patterns = @(
+    "src/*.cpp"
+    "src/*.h"
+    "src/*.hpp"
+    "src/conditioning/*.cpp"
+    "src/conditioning/*.h"
+    "src/conditioning/*.hpp"
+    "src/core/*.cpp"
+    "src/core/*.h"
+    "src/core/*.hpp"
+    "src/extensions/*.cpp"
+    "src/extensions/*.h"
+    "src/extensions/*.hpp"
+    "src/runtime/*.cpp"
+    "src/runtime/*.h"
+    "src/runtime/*.hpp"
+    "src/model/*/*.cpp"
+    "src/model/*/*.h"
+    "src/model/*/*.hpp"
+    "src/tokenizers/*.h"
+    "src/tokenizers/*.cpp"
+    "src/tokenizers/vocab/*.h"
+    "src/tokenizers/vocab/*.cpp"
+    "src/model_io/*.h"
+    "src/model_io/*.cpp"
+    "examples/cli/*.cpp"
+    "examples/cli/*.h"
+    "examples/server/*.cpp"
+    "examples/common/*.hpp"
+    "examples/common/*.h"
+    "examples/common/*.cpp"
+)
+
+# Used to turn absolute file paths back into repository-relative ones.
+$root = (Get-Location).Path
+
+foreach ($pattern in $patterns) {
+    # Patterns that match nothing are skipped silently; files are processed
+    # in a stable, sorted order.
+    $files = Get-ChildItem -Path $pattern -File -ErrorAction SilentlyContinue | Sort-Object FullName
+
+    foreach ($file in $files) {
+        # Strip the root prefix and normalize separators to forward slashes.
+        $relativePath = $file.FullName.Substring($root.Length).TrimStart('\', '/') -replace '\\', '/'
+
+        # NOTE(review): this only matches paths that START with "vocab";
+        # every pattern above starts with "src/" or "examples/", so the skip
+        # looks unreachable -- confirm whether vocab files should be skipped.
+        if ($relativePath -like "vocab*") {
+            continue
+        }
+
+        Write-Host "formatting '$relativePath'"
+
+        # if ($relativePath -ne "stable-diffusion.h") {
+        #     clang-tidy -fix -p build_linux/ "$relativePath"
+        # }
+
+        # -style=file makes clang-format use the project's .clang-format.
+        & clang-format -style=file -i $relativePath
+    }
+}
--- a/format-code.sh
+++ b/format-code.sh
@ -1,6 +1,13 @@
-for f in src/*.cpp src/*.h src/*.hpp src/tokenizers/*.h src/tokenizers/*.cpp src/tokenizers/vocab/*.h src/tokenizers/vocab/*.cpp \
-         examples/cli/*.cpp examples/cli/*.h examples/server/*.cpp \
+for f in src/*.cpp src/*.h src/*.hpp \
+         src/conditioning/*.cpp src/conditioning/*.h src/conditioning/*.hpp \
+         src/core/*.cpp src/core/*.h src/core/*.hpp \
+         src/extensions/*.cpp src/extensions/*.h src/extensions/*.hpp \
+         src/runtime/*.cpp src/runtime/*.h src/runtime/*.hpp \
+         src/model/*/*.cpp src/model/*/*.h src/model/*/*.hpp \
+         src/tokenizers/*.h src/tokenizers/*.cpp src/tokenizers/vocab/*.h src/tokenizers/vocab/*.cpp \
+         src/model_io/*.h src/model_io/*.cpp examples/cli/*.cpp examples/cli/*.h examples/server/*.cpp \
         examples/common/*.hpp examples/common/*.h examples/common/*.cpp; do
+  [[ -e "$f" ]] || continue
  [[ "$f" == vocab* ]] && continue
  echo "formatting '$f'"
  # if [ "$f" != "stable-diffusion.h" ]; then
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit 404fcb9d7c96989569e68c9e7881ee3465a05c50
+Subproject commit 3af5f5760e19a96427f5f7a93b79cbdf3d4b265b
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@ -51,6 +51,9 @@ enum sample_method_t {
    RES_MULTISTEP_SAMPLE_METHOD,
    RES_2S_SAMPLE_METHOD,
    ER_SDE_SAMPLE_METHOD,
+    EULER_CFG_PP_SAMPLE_METHOD,
+    EULER_A_CFG_PP_SAMPLE_METHOD,
+    EULER_GE_SAMPLE_METHOD,
    SAMPLE_METHOD_COUNT
 };

@ -66,6 +69,7 @@ enum scheduler_t {
    KL_OPTIMAL_SCHEDULER,
    LCM_SCHEDULER,
    BONG_TANGENT_SCHEDULER,
+    LTX2_SCHEDULER,
    SCHEDULER_COUNT
 };

@ -122,7 +126,8 @@ enum sd_type_t {
    // SD_TYPE_IQ4_NL_8_8 = 38,
    SD_TYPE_MXFP4 = 39,  // MXFP4 (1 block)
    SD_TYPE_NVFP4 = 40,  // NVFP4 (4 blocks, E4M3 scale)
-    SD_TYPE_COUNT = 41,
+    SD_TYPE_Q1_0  = 41,
+    SD_TYPE_COUNT = 42,
 };

 enum sd_log_level_t {
@ -149,11 +154,13 @@ enum lora_apply_mode_t {

 typedef struct {
    bool enabled;
+    bool temporal_tiling;
    int tile_size_x;
    int tile_size_y;
    float target_overlap;
    float rel_size_x;
    float rel_size_y;
+    const char* extra_tiling_args;
 } sd_tiling_params_t;

 typedef struct {
@ -161,6 +168,14 @@ typedef struct {
    const char* path;
 } sd_embedding_t;

+// VAE format selector stored in sd_ctx_params_t::vae_format.
+// AUTO is -1, so the named formats start at 0 and SD_VAE_FORMAT_COUNT equals
+// the number of named formats.
+enum sd_vae_format_t {
+    SD_VAE_FORMAT_AUTO = -1,
+    SD_VAE_FORMAT_FLUX,
+    SD_VAE_FORMAT_SD3,
+    SD_VAE_FORMAT_FLUX2,
+    SD_VAE_FORMAT_COUNT,
+};
+
 typedef struct {
    const char* model_path;
    const char* clip_l_path;
@ -171,26 +186,23 @@ typedef struct {
    const char* llm_vision_path;
    const char* diffusion_model_path;
    const char* high_noise_diffusion_model_path;
+    const char* uncond_diffusion_model_path;
+    const char* embeddings_connectors_path;
    const char* vae_path;
+    const char* audio_vae_path;
    const char* taesd_path;
    const char* control_net_path;
    const sd_embedding_t* embeddings;
    uint32_t embedding_count;
    const char* photo_maker_path;
    const char* tensor_type_rules;
-    bool vae_decode_only;
-    bool free_params_immediately;
    int n_threads;
    enum sd_type_t wtype;
    enum rng_type_t rng_type;
    enum rng_type_t sampler_rng_type;
    enum prediction_t prediction;
    enum lora_apply_mode_t lora_apply_mode;
-    bool offload_params_to_cpu;
    bool enable_mmap;
-    bool keep_clip_on_cpu;
-    bool keep_control_net_on_cpu;
-    bool keep_vae_on_cpu;
    bool flash_attn;
    bool diffusion_flash_attn;
    bool tae_preview_only;
@ -203,8 +215,21 @@ typedef struct {
    bool chroma_use_t5_mask;
    int chroma_t5_mask_pad;
    bool qwen_image_zero_cond_t;
+    enum sd_vae_format_t vae_format;
+    const char* max_vram;  // GiB budget or backend assignment spec for graph-cut segmented param offload (0 = disabled, -1 = auto)
+    bool stream_layers;  // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram)
+    const char* backend;
+    const char* params_backend;
+    const char* rpc_servers;
 } sd_ctx_params_t;

+// Audio buffer descriptor.
+// NOTE(review): the layout of `data` (interleaving, whether sample_count is
+// per channel or total) and its ownership are not stated here -- confirm
+// against the implementation before relying on them.
+typedef struct {
+    uint32_t sample_rate;
+    uint32_t channels;
+    uint64_t sample_count;
+    float* data;
+} sd_audio_t;
+
 typedef struct {
    uint32_t width;
    uint32_t height;
@ -237,6 +262,7 @@ typedef struct {
    float* custom_sigmas;
    int custom_sigmas_count;
    float flow_shift;
+    const char* extra_sample_args;
 } sd_sample_params_t;

 typedef struct {
@ -289,6 +315,34 @@ typedef struct {
    const char* path;
 } sd_lora_t;

+// Upscaler choice for the hires pass; stored in sd_hires_params_t::upscaler.
+// sd_hires_upscaler_name() / str_to_sd_hires_upscaler() in this header convert to and from strings.
+// Values are numbered from 0 in declaration order.
+// NOTE(review): the LATENT_* names suggest resizing in latent space and the rest resizing decoded
+// images - confirm against the implementation.
+enum sd_hires_upscaler_t {
+    SD_HIRES_UPSCALER_NONE,
+    SD_HIRES_UPSCALER_LATENT,
+    SD_HIRES_UPSCALER_LATENT_NEAREST,
+    SD_HIRES_UPSCALER_LATENT_NEAREST_EXACT,
+    SD_HIRES_UPSCALER_LATENT_ANTIALIASED,
+    SD_HIRES_UPSCALER_LATENT_BICUBIC,
+    SD_HIRES_UPSCALER_LATENT_BICUBIC_ANTIALIASED,
+    SD_HIRES_UPSCALER_LANCZOS,
+    SD_HIRES_UPSCALER_NEAREST,
+    // Presumably uses the model at sd_hires_params_t::model_path - confirm.
+    SD_HIRES_UPSCALER_MODEL,
+    // Number of upscaler values above (10).
+    SD_HIRES_UPSCALER_COUNT,
+};
+
+// Parameters of the hires pass; embedded as the `hires` member of both sd_img_gen_params_t
+// and sd_vid_gen_params_t. sd_hires_params_init() is declared in this header to fill in defaults.
+// NOTE(review): default values and the precedence between `scale` and
+// `target_width`/`target_height` are not visible here - confirm against the implementation.
+typedef struct {
+    bool enabled;
+    enum sd_hires_upscaler_t upscaler;
+    const char* model_path;
+    float scale;
+    int target_width;
+    int target_height;
+    int steps;
+    float denoising_strength;
+    int upscale_tile_size;
+    // custom_sigmas is paired with custom_sigmas_count (presumably the array length - confirm).
+    float* custom_sigmas;
+    int custom_sigmas_count;
+} sd_hires_params_t;
+
 typedef struct {
    const sd_lora_t* loras;
    uint32_t lora_count;
@ -312,6 +366,7 @@ typedef struct {
    sd_pm_params_t pm_params;
    sd_tiling_params_t vae_tiling_params;
    sd_cache_params_t cache;
+    sd_hires_params_t hires;
 } sd_img_gen_params_t;

 typedef struct {
@ -332,9 +387,11 @@ typedef struct {
    float strength;
    int64_t seed;
    int video_frames;
+    int fps;
    float vace_strength;
    sd_tiling_params_t vae_tiling_params;
    sd_cache_params_t cache;
+    sd_hires_params_t hires;
 } sd_vid_gen_params_t;

 typedef struct sd_ctx_t sd_ctx_t;
@ -348,6 +405,8 @@ SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
 SD_API void sd_set_preview_callback(sd_preview_cb_t cb, enum preview_t mode, int interval, bool denoised, bool noisy, void* data);
 SD_API int32_t sd_get_num_physical_cores();
 SD_API const char* sd_get_system_info();
+SD_API bool sd_ctx_supports_image_generation(const sd_ctx_t* sd_ctx);
+SD_API bool sd_ctx_supports_video_generation(const sd_ctx_t* sd_ctx);

 SD_API const char* sd_type_name(enum sd_type_t type);
 SD_API enum sd_type_t str_to_sd_type(const char* str);
@ -363,14 +422,18 @@ SD_API const char* sd_preview_name(enum preview_t preview);
 SD_API enum preview_t str_to_preview(const char* str);
 SD_API const char* sd_lora_apply_mode_name(enum lora_apply_mode_t mode);
 SD_API enum lora_apply_mode_t str_to_lora_apply_mode(const char* str);
+SD_API const char* sd_hires_upscaler_name(enum sd_hires_upscaler_t upscaler);
+SD_API enum sd_hires_upscaler_t str_to_sd_hires_upscaler(const char* str);

 SD_API void sd_cache_params_init(sd_cache_params_t* cache_params);
+SD_API void sd_hires_params_init(sd_hires_params_t* hires_params);

 SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params);
 SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);

 SD_API sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params);
 SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
+SD_API void free_sd_audio(sd_audio_t* audio);

 SD_API void sd_sample_params_init(sd_sample_params_t* sample_params);
 SD_API char* sd_sample_params_to_str(const sd_sample_params_t* sample_params);
@ -383,15 +446,20 @@ SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_para
 SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params);

 SD_API void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params);
-SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params, int* num_frames_out);
+SD_API bool generate_video(sd_ctx_t* sd_ctx,
+                           const sd_vid_gen_params_t* sd_vid_gen_params,
+                           sd_image_t** frames_out,
+                           int* num_frames_out,
+                           sd_audio_t** audio_out);

 typedef struct upscaler_ctx_t upscaler_ctx_t;

 SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
-                                        bool offload_params_to_cpu,
                                        bool direct,
                                        int n_threads,
-                                        int tile_size);
+                                        int tile_size,
+                                        const char* backend,
+                                        const char* params_backend);
 SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);

 SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx,
@ -417,6 +485,10 @@ SD_API bool preprocess_canny(sd_image_t image,
 SD_API const char* sd_commit(void);
 SD_API const char* sd_version(void);

+// C API callers must release image arrays returned by the library with free_sd_images() instead of freeing them directly.
+// This avoids CRT mismatches on Windows, where memory allocated inside the library may not be safely freed by a caller that uses a different CRT.
+SD_API void free_sd_images(sd_image_t* result_images, int num_images);
+
 #ifdef __cplusplus
 }
 #endif
--- a/script/convert_fp8_scale_to_bf16.py
+++ b/script/convert_fp8_scale_to_bf16.py
@ -0,0 +1,283 @@
+#!/usr/bin/env python
+import argparse
+import json
+import math
+import os
+import struct
+from collections import Counter
+from pathlib import Path
+
+import torch
+from safetensors import safe_open
+
+
+# safetensors dtype tags that build_output_plan treats as floating point and rewrites as BF16.
+FLOAT_DTYPES = {
+    "BF16",
+    "F16",
+    "F32",
+    "F64",
+    "F8_E4M3",
+    "F8_E4M3FN",
+    "F8_E5M2",
+}
+
+# 8-bit float tags. A tensor with one of these dtypes and a matching "<name>_scale" entry
+# is dequantized with that scale instead of being cast directly.
+FP8_DTYPES = {
+    "F8_E4M3",
+    "F8_E4M3FN",
+    "F8_E5M2",
+}
+
+# Size in bytes of one element for each safetensors dtype tag; used to compute output data offsets.
+DTYPE_SIZES = {
+    "BOOL": 1,
+    "U8": 1,
+    "I8": 1,
+    "F8_E4M3": 1,
+    "F8_E4M3FN": 1,
+    "F8_E5M2": 1,
+    "U16": 2,
+    "I16": 2,
+    "F16": 2,
+    "BF16": 2,
+    "U32": 4,
+    "I32": 4,
+    "F32": 4,
+    "U64": 8,
+    "I64": 8,
+    "F64": 8,
+}
+
+
+def read_safetensors_header(path: Path):
+    """Return the JSON header of the safetensors file at ``path`` as a dict.
+
+    The file starts with an 8-byte little-endian unsigned length, followed by
+    that many bytes of UTF-8 JSON. Trailing whitespace is stripped before parsing.
+    """
+    with path.open("rb") as stream:
+        (length,) = struct.unpack("<Q", stream.read(8))
+        raw = stream.read(length)
+    text = raw.decode("utf-8").rstrip()
+    return json.loads(text)
+
+
+def numel(shape):
+    """Return the number of elements described by ``shape``; an empty shape counts as 1."""
+    if not shape:
+        return 1
+    return math.prod(shape)
+
+
+def scale_key_for_weight(name: str):
+    """Return the companion scale tensor name for ``name``, or None.
+
+    Any name ending in "weight" (including ".weight") maps to ``name + "_scale"``;
+    every other name has no companion.
+    """
+    return name + "_scale" if name.endswith("weight") else None
+
+
+def tensor_nbytes(dtype: str, shape):
+    """Return the byte size of a tensor of ``shape`` stored as ``dtype``.
+
+    Raises KeyError when ``dtype`` has no entry in DTYPE_SIZES.
+    """
+    element_count = numel(shape)
+    return element_count * DTYPE_SIZES[dtype]
+
+
+def build_output_plan(header):
+    """Plan the converted file from a parsed safetensors header.
+
+    Every header entry except ``__metadata__`` is visited in header order:
+
+    * an FP8 tensor whose ``<name>_scale`` companion exists is planned as BF16
+      with mode ``fp8_scaled_weight``; the companion scale tensor is omitted
+      from the output;
+    * any other tensor with a dtype in FLOAT_DTYPES is planned as BF16 with
+      mode ``float_to_bf16``;
+    * everything else keeps its dtype with mode ``copy``.
+
+    Returns ``(plan, output_header, data_size)``: the list of plan items, the
+    header dict for the output file (with contiguous ``data_offsets``), and the
+    total size in bytes of the tensor data section.
+    """
+    entries = {k: v for k, v in header.items() if k != "__metadata__"}
+
+    def paired_scale_key(name, info):
+        # Single definition of the pairing rule, shared by both passes below.
+        if info["dtype"] not in FP8_DTYPES:
+            return None
+        key = scale_key_for_weight(name)
+        return key if key in entries else None
+
+    # First pass: collect the scale tensors that are folded into their weights.
+    paired_scale_keys = set()
+    for name, info in entries.items():
+        key = paired_scale_key(name, info)
+        if key is not None:
+            paired_scale_keys.add(key)
+
+    # Second pass: one plan item per tensor that is written to the output.
+    plan = []
+    for name, info in entries.items():
+        if name in paired_scale_keys:
+            continue
+
+        dtype = info["dtype"]
+        item = {
+            "name": name,
+            "source_dtype": dtype,
+            "output_dtype": "BF16",
+            "shape": info["shape"],
+        }
+
+        scale_key = paired_scale_key(name, info)
+        if scale_key is not None:
+            item["mode"] = "fp8_scaled_weight"
+            item["scale_key"] = scale_key
+        elif dtype in FLOAT_DTYPES:
+            item["mode"] = "float_to_bf16"
+        else:
+            item["output_dtype"] = dtype
+            item["mode"] = "copy"
+        plan.append(item)
+
+    metadata = dict(header.get("__metadata__", {}) or {})
+    metadata["format"] = "pt"
+    metadata["conversion"] = "fp8_weight_scale_to_bf16"
+
+    # Lay the tensors out back to back in plan order.
+    output_header = {"__metadata__": metadata}
+    offset = 0
+    for item in plan:
+        size = tensor_nbytes(item["output_dtype"], item["shape"])
+        output_header[item["name"]] = {
+            "dtype": item["output_dtype"],
+            "shape": item["shape"],
+            "data_offsets": [offset, offset + size],
+        }
+        offset += size
+
+    return plan, output_header, offset
+
+
+def write_tensor_bytes(out, tensor):
+    """Append the raw bytes of ``tensor`` to the open binary file ``out``.
+
+    The tensor is detached, moved to CPU and made contiguous first. Empty
+    tensors write nothing. NumPy has no bfloat16/float8 dtypes, so those
+    tensors are reinterpreted as same-width integers before being dumped.
+    """
+    tensor = tensor.detach().cpu().contiguous()
+    if tensor.numel() == 0:
+        return
+    # Resolved with getattr so the function still works on PyTorch builds without float8 dtypes.
+    float8_dtypes = (getattr(torch, "float8_e4m3fn", None), getattr(torch, "float8_e5m2", None))
+    if tensor.dtype == torch.bfloat16:
+        # int16 gives the same bytes as uint16 and does not depend on the
+        # newer unsigned 16-bit dtype being available in this PyTorch build.
+        tensor.view(torch.int16).numpy().tofile(out)
+    elif tensor.dtype in float8_dtypes:
+        tensor.view(torch.uint8).numpy().tofile(out)
+    else:
+        tensor.numpy().tofile(out)
+
+
+def scale_view_for_chunk(scale, chunk, first_dim_start=0, first_dim_end=None):
+    """Return ``scale`` as float32, shaped to broadcast against ``chunk``.
+
+    ``chunk`` is rows ``first_dim_start:first_dim_end`` of a weight tensor.
+
+    * A single-element scale is reshaped to ``chunk.ndim`` singleton dims.
+    * A per-row scale - 1-D, or stored with trailing singleton dims such as
+      ``(rows, 1)`` - is sliced to the chunk's rows when it is long enough and
+      reshaped to ``(chunk_rows, 1, ..., 1)``.
+    * Any other scale is returned unchanged and left to normal broadcasting.
+    """
+    scale = scale.to(torch.float32)
+
+    if scale.numel() == 1:
+        return scale.reshape((1,) * chunk.ndim)
+
+    # A (rows, 1, ...) scale is per-row too. Without flattening it is never
+    # sliced, so a weight with more rows than one chunk fails to broadcast.
+    flattened = chunk.ndim > 0 and scale.ndim > 1 and scale.numel() == scale.shape[0]
+    rows = scale.reshape(-1) if flattened else scale
+
+    if chunk.ndim > 0 and rows.ndim == 1:
+        if first_dim_end is not None and rows.shape[0] >= first_dim_end:
+            rows = rows[first_dim_start:first_dim_end]
+        if rows.shape[0] == chunk.shape[0]:
+            return rows.reshape((rows.shape[0],) + (1,) * (chunk.ndim - 1))
+        if not flattened:
+            # Unchanged behavior for a genuinely 1-D scale that does not line up.
+            return rows
+
+    return scale
+
+
+def write_scaled_fp8_weight(out, weight, scale, chunk_rows):
+    """Write ``weight * scale`` to ``out`` as bfloat16.
+
+    A 0-dim weight is handled in one step; otherwise the weight is processed
+    ``chunk_rows`` rows at a time so only one float32 chunk exists at once.
+    """
+    if weight.ndim == 0:
+        as_f32 = weight.to(torch.float32)
+        scaled = as_f32 * scale_view_for_chunk(scale, weight)
+        write_tensor_bytes(out, scaled.to(torch.bfloat16))
+        return
+
+    total_rows = weight.shape[0]
+    bounds = ((lo, min(lo + chunk_rows, total_rows)) for lo in range(0, total_rows, chunk_rows))
+    for lo, hi in bounds:
+        rows_f32 = weight[lo:hi].to(torch.float32)
+        scaled = rows_f32 * scale_view_for_chunk(scale, rows_f32, lo, hi)
+        write_tensor_bytes(out, scaled.to(torch.bfloat16))
+
+
+def write_float_as_bf16(out, tensor, chunk_rows):
+    """Write ``tensor`` to ``out`` as bfloat16.
+
+    Tensors that are already bfloat16, and 0-dim tensors, are written in one
+    call; everything else is cast and written ``chunk_rows`` rows at a time.
+    """
+    already_bf16 = tensor.dtype == torch.bfloat16
+    if already_bf16 or tensor.ndim == 0:
+        write_tensor_bytes(out, tensor if already_bf16 else tensor.to(torch.bfloat16))
+        return
+
+    total_rows = tensor.shape[0]
+    for lo in range(0, total_rows, chunk_rows):
+        hi = min(lo + chunk_rows, total_rows)
+        piece = tensor[lo:hi]
+        write_tensor_bytes(out, piece.to(torch.bfloat16))
+
+
+def convert(input_path: Path, output_path: Path, chunk_rows: int, dry_run: bool):
+    """Convert the safetensors file at ``input_path`` to a BF16 file at ``output_path``.
+
+    Prints a summary of the plan first. With ``dry_run`` nothing is written.
+    Otherwise the data is written to ``<output>.tmp`` and renamed onto
+    ``output_path`` only after the byte count matches the planned size.
+
+    Raises FileExistsError if the output or the temporary file already exists,
+    and RuntimeError if the written size differs from the planned size. The
+    temporary file is removed on any failure.
+    """
+    header = read_safetensors_header(input_path)
+    plan, output_header, data_size = build_output_plan(header)
+
+    source_counts = Counter(item["source_dtype"] for item in plan)
+    output_counts = Counter(item["output_dtype"] for item in plan)
+    scaled_count = sum(item["mode"] == "fp8_scaled_weight" for item in plan)
+    # Each dequantized weight drops exactly one companion scale tensor.
+    dropped_scales = scaled_count
+    header_bytes = json.dumps(output_header, separators=(",", ":")).encode("utf-8")
+    expected_size = 8 + len(header_bytes) + data_size
+
+    print(f"input:  {input_path}")
+    print(f"output: {output_path}")
+    print(f"tensors written: {len(plan)}")
+    print(f"scaled fp8 weights dequantized: {scaled_count}")
+    print(f"weight_scale tensors dropped: {dropped_scales}")
+    print(f"source dtypes: {dict(sorted(source_counts.items()))}")
+    print(f"output dtypes: {dict(sorted(output_counts.items()))}")
+    print(f"expected output size: {expected_size / (1024 ** 3):.2f} GiB")
+
+    if dry_run:
+        return
+
+    if output_path.exists():
+        raise FileExistsError(f"{output_path} already exists; pass --overwrite to replace it")
+
+    tmp_path = output_path.with_suffix(output_path.suffix + ".tmp")
+    if tmp_path.exists():
+        raise FileExistsError(f"{tmp_path} already exists; remove it or choose another output")
+
+    try:
+        with safe_open(str(input_path), framework="pt", device="cpu") as sf, tmp_path.open("wb") as out:
+            out.write(struct.pack("<Q", len(header_bytes)))
+            out.write(header_bytes)
+
+            for index, item in enumerate(plan, 1):
+                name = item["name"]
+                print(f"[{index:04d}/{len(plan):04d}] {name} -> {item['output_dtype']}")
+
+                tensor = sf.get_tensor(name)
+                if item["mode"] == "fp8_scaled_weight":
+                    scale = sf.get_tensor(item["scale_key"])
+                    write_scaled_fp8_weight(out, tensor, scale, chunk_rows)
+                elif item["mode"] == "float_to_bf16":
+                    write_float_as_bf16(out, tensor, chunk_rows)
+                else:
+                    write_tensor_bytes(out, tensor)
+
+            actual_size = out.tell()
+
+        if actual_size != expected_size:
+            raise RuntimeError(f"wrote {actual_size} bytes, expected {expected_size} bytes")
+
+        tmp_path.replace(output_path)
+    except BaseException:
+        # Also covers Ctrl-C: a leftover partial .tmp file would make the
+        # next run fail with FileExistsError before doing any work.
+        tmp_path.unlink(missing_ok=True)
+        raise
+
+    print("done")
+
+
+def main():
+    """Parse the command line and run the conversion.
+
+    Raises ValueError for a non-positive ``--chunk-rows`` and FileNotFoundError
+    when the input file does not exist.
+    """
+    parser = argparse.ArgumentParser(
+        description="Convert an fp8 safetensors checkpoint with weight_scale tensors to bf16."
+    )
+    parser.add_argument("--input", default="ideogram4_fp8.safetensors", type=Path)
+    parser.add_argument("--output", default="ideogram4_bf16.safetensors", type=Path)
+    parser.add_argument("--chunk-rows", default=1024, type=int)
+    parser.add_argument("--dry-run", action="store_true")
+    parser.add_argument("--overwrite", action="store_true")
+    args = parser.parse_args()
+
+    input_path = args.input.resolve()
+    output_path = args.output.resolve()
+
+    if args.chunk_rows < 1:
+        raise ValueError("--chunk-rows must be >= 1")
+    if not input_path.exists():
+        raise FileNotFoundError(input_path)
+    # A dry run writes nothing, so it must not delete the existing output either.
+    if args.overwrite and not args.dry_run and output_path.exists():
+        output_path.unlink()
+
+    convert(input_path, output_path, args.chunk_rows, args.dry_run)
+
+if __name__ == "__main__":
+    main()
--- a/src/conditioning/conditioner.hpp
+++ b/src/conditioning/conditioner.hpp
--- a/src/convert.cpp
+++ b/src/convert.cpp
@ -0,0 +1,138 @@
+#include <cstring>
+#include <mutex>
+#include <regex>
+#include <vector>
+
+#include "model_io/gguf_io.h"
+#include "model_io/safetensors_io.h"
+#include "model_loader.h"
+#include "util.h"
+
+#include "ggml_extend_backend.h"
+
+// Picks the ggml type a tensor is exported with.
+// The first rule in tensor_type_rules whose regex matches the tensor name overrides the
+// default `type`. The chosen type is used only when the loader reports that the tensor
+// should be converted to it; otherwise the tensor keeps its stored type.
+static ggml_type get_export_tensor_type(ModelLoader& model_loader,
+                                        const TensorStorage& tensor_storage,
+                                        ggml_type type,
+                                        const TensorTypeRules& tensor_type_rules) {
+    ggml_type requested = type;
+    for (const auto& rule : tensor_type_rules) {
+        const std::regex matcher(rule.first);
+        if (std::regex_search(tensor_storage.name, matcher)) {
+            requested = rule.second;
+            break;
+        }
+    }
+
+    const bool convert_tensor = model_loader.tensor_should_be_converted(tensor_storage, requested);
+    return convert_tensor ? requested : tensor_storage.type;
+}
+
+// Loads every tensor known to model_loader into ggml_ctx and appends one TensorWriteInfo per
+// tensor to `tensors`. Each tensor is created with the type chosen by get_export_tensor_type().
+// Returns the result of ModelLoader::load_tensors().
+static bool load_tensors_for_export(ModelLoader& model_loader,
+                                    ggml_context* ggml_ctx,
+                                    ggml_type type,
+                                    const TensorTypeRules& tensor_type_rules,
+                                    std::vector<TensorWriteInfo>& tensors) {
+    // Serializes allocation from ggml_ctx and appends to `tensors`.
+    // NOTE(review): presumably load_tensors may invoke the callback from several threads - confirm.
+    std::mutex tensor_mutex;
+    auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
+        const std::string& name = tensor_storage.name;
+        // The export type is resolved before the lock is taken.
+        ggml_type tensor_type   = get_export_tensor_type(model_loader, tensor_storage, type, tensor_type_rules);
+
+        std::lock_guard<std::mutex> lock(tensor_mutex);
+        ggml_tensor* tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne);
+        if (tensor == nullptr) {
+            LOG_ERROR("ggml_new_tensor failed");
+            return false;
+        }
+        ggml_set_name(tensor, name.c_str());
+
+        if (!tensor->data) {
+            GGML_ASSERT(ggml_nelements(tensor) == 0);
+            // Avoid crashing writers by setting a dummy pointer for zero-sized tensors.
+            LOG_DEBUG("setting dummy pointer for zero-sized tensor %s", name.c_str());
+            tensor->data = ggml_get_mem_buffer(ggml_ctx);
+        }
+
+        // Record the source dimensions alongside the tensor for the file writers.
+        TensorWriteInfo write_info;
+        write_info.tensor = tensor;
+        write_info.n_dims = tensor_storage.n_dims;
+        for (int i = 0; i < tensor_storage.n_dims; ++i) {
+            write_info.ne[i] = tensor_storage.ne[i];
+        }
+
+        // Hand the destination tensor back to the loader through the out-parameter.
+        *dst_tensor = tensor;
+        tensors.push_back(std::move(write_info));
+
+        return true;
+    };
+
+    bool success = model_loader.load_tensors(on_new_tensor_cb);
+    LOG_INFO("load tensors done");
+    return success;
+}
+
+// Converts the model at input_path (optionally merged with a VAE loaded under the "vae."
+// prefix) into output_path.
+//   output_type       default ggml type for converted tensors
+//   tensor_type_rules per-tensor type overrides, parsed by parse_tensor_type_rules()
+//   convert_name      when true, tensor names are converted before export
+// The output is written as safetensors when output_path ends with ".safetensors", and as
+// GGUF otherwise. Returns true on success; failures are logged.
+bool convert(const char* input_path,
+             const char* vae_path,
+             const char* output_path,
+             sd_type_t output_type,
+             const char* tensor_type_rules,
+             bool convert_name) {
+    ModelLoader model_loader;
+
+    if (!model_loader.init_from_file(input_path)) {
+        LOG_ERROR("init model loader from file failed: '%s'", input_path);
+        return false;
+    }
+
+    if (vae_path != nullptr && strlen(vae_path) > 0) {
+        if (!model_loader.init_from_file(vae_path, "vae.")) {
+            LOG_ERROR("init model loader from file failed: '%s'", vae_path);
+            return false;
+        }
+    }
+    if (convert_name) {
+        model_loader.convert_tensors_name();
+    }
+
+    ggml_type type             = (ggml_type)output_type;
+    bool output_is_safetensors = ends_with(output_path, ".safetensors");
+    TensorTypeRules type_rules = parse_tensor_type_rules(tensor_type_rules);
+
+    auto backend = sd_backend_cpu_init();
+    if (backend == nullptr) {
+        // The backend is needed below to size the parameter buffer; fail early
+        // instead of passing a null handle on.
+        LOG_ERROR("failed to initialize the CPU backend for converter");
+        return false;
+    }
+
+    size_t mem_size = 1 * 1024 * 1024;  // for padding
+    mem_size += model_loader.get_tensor_storage_map().size() * ggml_tensor_overhead();
+    mem_size += model_loader.get_params_mem_size(backend, type);
+    LOG_INFO("model tensors mem size: %.2fMB", mem_size / 1024.f / 1024.f);
+    ggml_context* ggml_ctx = ggml_init({mem_size, nullptr, false});
+
+    if (ggml_ctx == nullptr) {
+        LOG_ERROR("ggml_init failed for converter");
+        ggml_backend_free(backend);
+        return false;
+    }
+
+    std::vector<TensorWriteInfo> tensors;
+    bool success = load_tensors_for_export(model_loader, ggml_ctx, type, type_rules, tensors);
+    // The backend is only needed for sizing; release it before writing.
+    ggml_backend_free(backend);
+
+    std::string error;
+    if (success) {
+        if (output_is_safetensors) {
+            success = write_safetensors_file(output_path, tensors, &error);
+        } else {
+            success = write_gguf_file(output_path, tensors, &error);
+        }
+    }
+
+    if (!success && !error.empty()) {
+        LOG_ERROR("%s", error.c_str());
+    }
+
+    ggml_free(ggml_ctx);
+    return success;
+}
--- a/src/core/ggml_extend.hpp
+++ b/src/core/ggml_extend.hpp
--- a/src/core/ggml_extend_backend.cpp
+++ b/src/core/ggml_extend_backend.cpp
@ -0,0 +1,689 @@
+#include "core/ggml_extend_backend.h"
+
+#include <algorithm>
+#include <cctype>
+#include <cstdlib>
+#include <mutex>
+#include <sstream>
+#include <stdexcept>
+#include <vector>
+
+#include "core/util.h"
+#include "stable-diffusion.h"
+
+// Returns a copy of `value` without leading and trailing whitespace (as classified by std::isspace).
+static std::string trim_copy(const std::string& value) {
+    const auto is_space = [](unsigned char ch) { return std::isspace(ch) != 0; };
+
+    const auto first = std::find_if_not(value.begin(), value.end(), is_space);
+    // Scan backwards, but never past the first kept character.
+    const auto last = std::find_if_not(value.rbegin(), std::string::const_reverse_iterator(first), is_space).base();
+    return std::string(first, last);
+}
+
+// Returns `value` with every character passed through std::tolower.
+static std::string lower_copy(std::string value) {
+    for (char& ch : value) {
+        ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
+    }
+    return value;
+}
+
+// Splits `value` at every `delimiter`.
+// Empty input yields no parts; empty fields between delimiters are kept, but a single
+// trailing delimiter does not produce a final empty part.
+static std::vector<std::string> split_copy(const std::string& value, char delimiter) {
+    std::vector<std::string> parts;
+    size_t cursor = 0;
+    while (cursor < value.size()) {
+        const size_t hit = value.find(delimiter, cursor);
+        if (hit == std::string::npos) {
+            parts.push_back(value.substr(cursor));
+            break;
+        }
+        parts.push_back(value.substr(cursor, hit - cursor));
+        cursor = hit + 1;
+    }
+    return parts;
+}
+
+// True when `name`, trimmed and lowercased, is empty, "default" or "auto".
+static bool is_default_backend_token(const std::string& name) {
+    const std::string token = lower_copy(trim_copy(name));
+    if (token.empty()) {
+        return true;
+    }
+    return token == "default" || token == "auto";
+}
+
+// True when `name`, trimmed and lowercased, is exactly "disk".
+static bool is_disk_backend_token(const std::string& name) {
+    const std::string token = lower_copy(trim_copy(name));
+    return token == "disk";
+}
+
+// Maps a user-supplied module name to an SDBackendModule.
+// The name is trimmed, lowercased and stripped of '-' and '_' before it is compared against
+// the alias table. On a match *module is set and true is returned; otherwise *module is left
+// untouched and false is returned.
+static bool parse_backend_module(const std::string& raw_name, SDBackendModule* module) {
+    struct ModuleAlias {
+        const char* text;
+        SDBackendModule target;
+    };
+    static const ModuleAlias aliases[] = {
+        {"diffusion", SDBackendModule::DIFFUSION},
+        {"model", SDBackendModule::DIFFUSION},
+        {"unet", SDBackendModule::DIFFUSION},
+        {"dit", SDBackendModule::DIFFUSION},
+        {"te", SDBackendModule::TE},
+        {"clip", SDBackendModule::TE},
+        {"text", SDBackendModule::TE},
+        {"textencoder", SDBackendModule::TE},
+        {"textencoders", SDBackendModule::TE},
+        {"conditioner", SDBackendModule::TE},
+        {"cond", SDBackendModule::TE},
+        {"llm", SDBackendModule::TE},
+        {"t5", SDBackendModule::TE},
+        {"t5xxl", SDBackendModule::TE},
+        {"clipvision", SDBackendModule::CLIP_VISION},
+        {"vision", SDBackendModule::CLIP_VISION},
+        {"vae", SDBackendModule::VAE},
+        {"firststage", SDBackendModule::VAE},
+        {"autoencoder", SDBackendModule::VAE},
+        {"tae", SDBackendModule::VAE},
+        {"controlnet", SDBackendModule::CONTROL_NET},
+        {"control", SDBackendModule::CONTROL_NET},
+        {"photomaker", SDBackendModule::PHOTOMAKER},
+        {"photomakerid", SDBackendModule::PHOTOMAKER},
+        {"pmid", SDBackendModule::PHOTOMAKER},
+        {"photo", SDBackendModule::PHOTOMAKER},
+        {"upscaler", SDBackendModule::UPSCALER},
+        {"esrgan", SDBackendModule::UPSCALER},
+        {"hires", SDBackendModule::UPSCALER},
+    };
+
+    std::string key = lower_copy(trim_copy(raw_name));
+    key.erase(std::remove_if(key.begin(), key.end(), [](char ch) { return ch == '-' || ch == '_'; }), key.end());
+
+    for (const ModuleAlias& alias : aliases) {
+        if (key == alias.text) {
+            *module = alias.target;
+            return true;
+        }
+    }
+    return false;
+}
+
+// Returns the backend name assigned to `module`, falling back to the assignment's default name.
+static std::string module_assignment_name(const SDBackendAssignment& assignment, SDBackendModule module) {
+    const auto found = assignment.module_names.find(module);
+    if (found == assignment.module_names.end()) {
+        return assignment.default_name;
+    }
+    return found->second;
+}
+
+// Builds a lowercase key for `backend`: its device name when the backend has a device,
+// otherwise the backend name. Returns "" for a null backend or a null backend name.
+static std::string backend_cache_key(ggml_backend_t backend) {
+    if (backend == nullptr) {
+        return "";
+    }
+
+    const ggml_backend_dev_t device = ggml_backend_get_device(backend);
+    if (device != nullptr) {
+        return lower_copy(ggml_backend_dev_name(device));
+    }
+
+    const char* fallback = ggml_backend_name(backend);
+    if (fallback == nullptr) {
+        return "";
+    }
+    return lower_copy(fallback);
+}
+
+// Returns the name of the device ggml reports for `type`, or "" when there is none.
+static std::string resolve_first_device_by_type(enum ggml_backend_dev_type type) {
+    const ggml_backend_dev_t device = ggml_backend_dev_by_type(type);
+    return device != nullptr ? std::string(ggml_backend_dev_name(device)) : std::string();
+}
+
+// Returns the buffer holding the tensor's data: the view source's buffer for a view,
+// the tensor's own buffer otherwise, and nullptr for a null tensor.
+static ggml_backend_buffer_t ggml_backend_tensor_buffer(const struct ggml_tensor* tensor) {
+    if (tensor == nullptr) {
+        return nullptr;
+    }
+
+    const struct ggml_tensor* owner = tensor->view_src ? tensor->view_src : tensor;
+    return owner->buffer;
+}
+
+// True when the tensor has a data pointer and either no backend buffer or a host buffer.
+static bool ggml_backend_tensor_is_host_accessible(const struct ggml_tensor* tensor) {
+    if (tensor == nullptr || tensor->data == nullptr) {
+        return false;
+    }
+
+    const ggml_backend_buffer_t buffer = ggml_backend_tensor_buffer(tensor);
+    if (buffer == nullptr) {
+        return true;
+    }
+    return ggml_backend_buffer_is_host(buffer);
+}
+
+// Byte offset of element (i0, i1, i2, i3): the sum of each index times the tensor's byte stride nb[] for that dimension.
+static size_t ggml_backend_tensor_offset(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
+    const auto along_dim0 = i0 * tensor->nb[0];
+    const auto along_dim1 = i1 * tensor->nb[1];
+    const auto along_dim2 = i2 * tensor->nb[2];
+    const auto along_dim3 = i3 * tensor->nb[3];
+    return static_cast<size_t>(along_dim0 + along_dim1 + along_dim2 + along_dim3);
+}
+
+// Stores one value of type T at element (i0, i1, i2, i3) of `tensor`.
+// Host-accessible tensors are written through the data pointer; all others go through
+// ggml_backend_tensor_set().
+template <typename T>
+static void ggml_backend_tensor_write_scalar(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3, T value) {
+    const size_t byte_offset = ggml_backend_tensor_offset(tensor, i0, i1, i2, i3);
+
+    if (!ggml_backend_tensor_is_host_accessible(tensor)) {
+        ggml_backend_tensor_set(const_cast<struct ggml_tensor*>(tensor), &value, byte_offset, sizeof(T));
+        return;
+    }
+
+    char* base                                = reinterpret_cast<char*>(tensor->data);
+    *reinterpret_cast<T*>(base + byte_offset) = value;
+}
+
+// Writes `value` to element (i0, i1, i2, i3), converted to the tensor's element type.
+// Handles I8, I16, I32, F16, BF16 and F32; any other type aborts.
+static void ggml_set_f32_nd(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3, float value) {
+    switch (tensor->type) {
+        case GGML_TYPE_I8:
+            ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast<int8_t>(value));
+            return;
+        case GGML_TYPE_I16:
+            ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast<int16_t>(value));
+            return;
+        case GGML_TYPE_I32:
+            ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast<int32_t>(value));
+            return;
+        case GGML_TYPE_F16:
+            ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, ggml_fp32_to_fp16(value));
+            return;
+        case GGML_TYPE_BF16:
+            ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, ggml_fp32_to_bf16(value));
+            return;
+        case GGML_TYPE_F32:
+            ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, value);
+            return;
+        default:
+            GGML_ABORT("fatal error");
+    }
+}
+
+// Writes `value` to the element at flat index `i`, converted to the tensor's element type.
+// A contiguous tensor is addressed as (i, 0, 0, 0); otherwise the flat index is first
+// unravelled into four coordinates. Unsupported element types abort.
+void ggml_ext_im_set_f32_1d(const struct ggml_tensor* tensor, int i, float value) {
+    int64_t id[4] = {i, 0, 0, 0};
+    if (!ggml_is_contiguous(tensor)) {
+        ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
+    }
+    // ggml_set_f32_nd performs the same per-type dispatch for both cases.
+    ggml_set_f32_nd(tensor, id[0], id[1], id[2], id[3], value);
+}
+
+// Registers the RPC servers listed in `servers`, a comma-separated list of endpoints.
+// Whitespace around each endpoint is ignored and empty entries are skipped.
+// Returns true when the list is empty or every endpoint was handed to the RPC backend
+// (an endpoint that fails to register is logged and skipped). Returns false when the
+// specification contains no endpoint at all, or when the RPC backend or its
+// "ggml_backend_rpc_add_server" entry point is unavailable.
+bool add_rpc_devices(const std::string& servers) {
+    const std::string in = trim_copy(servers);
+    if (in.empty()) {
+        return true;
+    }
+
+    // Normalize first so "a, b" does not pass " b" on, and ",," does not register "".
+    std::vector<std::string> endpoints;
+    for (const auto& raw : split_copy(in, ',')) {
+        const std::string endpoint = trim_copy(raw);
+        if (!endpoint.empty()) {
+            endpoints.push_back(endpoint);
+        }
+    }
+    if (endpoints.empty()) {
+        LOG_ERROR("invalid RPC servers specification: '%s'", servers.c_str());
+        return false;
+    }
+
+    ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
+    if (!rpc_reg) {
+        LOG_ERROR("RPC backend not found, cannot add RPC servers");
+        return false;
+    }
+    typedef ggml_backend_reg_t (*ggml_backend_rpc_add_server_t)(const char* endpoint);
+    ggml_backend_rpc_add_server_t ggml_backend_rpc_add_server_fn = (ggml_backend_rpc_add_server_t)ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_server");
+    if (!ggml_backend_rpc_add_server_fn) {
+        LOG_ERROR("RPC backend does not have ggml_backend_rpc_add_server function, cannot add RPC servers");
+        return false;
+    }
+
+    for (const auto& endpoint : endpoints) {
+        LOG_INFO("Adding RPC server: %s", endpoint.c_str());
+        auto reg = ggml_backend_rpc_add_server_fn(endpoint.c_str());
+        if (reg == nullptr) {
+            // Best effort: report the endpoint and keep going with the rest.
+            LOG_ERROR("failed to add RPC server: %s", endpoint.c_str());
+            continue;
+        }
+        ggml_backend_register(reg);
+    }
+    return true;
+}
+
+// Makes sure the ggml backend registry is populated, scanning the default paths at most once.
+static void ggml_backend_load_all_once() {
+    // Devices plus a CPU backend in the registry means static registration or host-side
+    // preloading has already happened, so no scan is needed.
+    const auto registry_ready = []() {
+        return ggml_backend_dev_count() > 0 && ggml_backend_reg_by_name("CPU") != nullptr;
+    };
+    if (registry_ready()) {
+        return;
+    }
+
+    // With dynamic backends the modules are discovered at runtime and must be loaded before
+    // the CPU backend or its proc table is requested. If the host preloaded only a subset,
+    // one default-path scan still lets the missing modules be found.
+    static std::once_flag once;
+    std::call_once(once, [&registry_ready]() {
+        if (!registry_ready()) {
+            ggml_backend_load_all();
+        }
+    });
+}
+
+// True when the backend's device name contains `name`, compared case-insensitively.
+// Returns false for a null backend or a backend without a device.
+bool sd_backend_is(ggml_backend_t backend, const std::string& name) {
+    ggml_backend_dev_t dev = backend ? ggml_backend_get_device(backend) : nullptr;
+    if (!dev) {
+        return false;
+    }
+
+    const std::string haystack = lower_copy(ggml_backend_dev_name(dev));
+    const std::string needle   = lower_copy(name);
+    return haystack.find(needle) != std::string::npos;
+}
+
+// Returns the default device name: the first GPU, else the first integrated GPU, else the CPU device.
+static std::string get_default_backend_name() {
+    ggml_backend_load_all_once();
+    // should pick the same backend preference as ggml_backend_init_best
+    const enum ggml_backend_dev_type preferred[] = {
+        GGML_BACKEND_DEVICE_TYPE_GPU,
+        GGML_BACKEND_DEVICE_TYPE_IGPU,
+    };
+    for (const enum ggml_backend_dev_type candidate : preferred) {
+        std::string name = resolve_first_device_by_type(candidate);
+        if (!name.empty()) {
+            return name;
+        }
+    }
+    return resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+}
+
+// Resolves a user-supplied backend name to a registered device name.
+//   - empty / "default" / "auto": the default device (GPU, then iGPU, then CPU)
+//   - "gpu": the first GPU, else the first integrated GPU
+//   - otherwise: a case-insensitive exact match on any device, else the first device whose
+//     name starts with the requested text
+// Returns "" when nothing matches.
+std::string sd_backend_resolve_name(const std::string& name) {
+    ggml_backend_load_all_once();
+    const std::string lower = lower_copy(trim_copy(name));
+
+    if (is_default_backend_token(lower)) {
+        return get_default_backend_name();
+    }
+    if (lower == "gpu") {
+        const std::string discrete = resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
+        return discrete.empty() ? resolve_first_device_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) : discrete;
+    }
+
+    // One pass: an exact match anywhere wins immediately; the earliest prefix match is
+    // remembered and used only when no device matches exactly.
+    std::string first_prefix_match;
+    bool have_prefix_match    = false;
+    const size_t device_count = ggml_backend_dev_count();
+    for (size_t i = 0; i < device_count; ++i) {
+        const std::string dev_name  = ggml_backend_dev_name(ggml_backend_dev_get(i));
+        const std::string dev_lower = lower_copy(dev_name);
+        if (dev_lower == lower) {
+            return dev_name;
+        }
+        if (!have_prefix_match && dev_lower.compare(0, lower.size(), lower) == 0) {
+            first_prefix_match = dev_name;
+            have_prefix_match  = true;
+        }
+    }
+
+    return first_prefix_match;
+}
+
+// True when `name` resolves to a registered device name.
+static bool backend_name_exists(const std::string& name) {
+    const std::string resolved = sd_backend_resolve_name(name);
+    return !resolved.empty();
+}
+
+// Initializes the backend called `name`.
+// A blank name selects ggml's best available backend; a name that cannot be resolved yields nullptr.
+static ggml_backend_t init_named_backend(const std::string& name) {
+    ggml_backend_load_all_once();
+    LOG_DEBUG("Initializing backend: %s", name.c_str());
+
+    const bool unnamed = trim_copy(name).empty();
+    if (unnamed) {
+        return ggml_backend_init_best();
+    }
+
+    const std::string resolved = sd_backend_resolve_name(name);
+    return resolved.empty() ? nullptr : ggml_backend_init_by_name(resolved.c_str(), nullptr);
+}
+
+// True when `backend` is non-null and its device is of CPU type.
+bool sd_backend_is_cpu(ggml_backend_t backend) {
+    if (backend == nullptr) {
+        return false;
+    }
+
+    auto dev = ggml_backend_get_device(backend);
+    if (dev == nullptr) {
+        return false;
+    }
+    return ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU;
+}
+
+// Initialize a backend on the first device of type CPU, as chosen by ggml.
+ggml_backend_t sd_backend_cpu_init() {
+    ggml_backend_load_all_once();
+    ggml_backend_t cpu_backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
+    return cpu_backend;
+}
+
+// Set the worker thread count of a CPU backend.
+// Returns true only when `backend` is a CPU backend whose registry exposes the
+// "ggml_backend_set_n_threads" entry point; otherwise nothing is changed and
+// false is returned.
+bool sd_backend_cpu_set_n_threads(ggml_backend_t backend, int n_threads) {
+    if (backend == nullptr) {
+        return false;
+    }
+
+    auto device = ggml_backend_get_device(backend);
+    if (device == nullptr || ggml_backend_dev_type(device) != GGML_BACKEND_DEVICE_TYPE_CPU) {
+        return false;
+    }
+
+    // The setter is looked up by name through the backend registry.
+    auto registry    = ggml_backend_dev_backend_reg(device);
+    auto set_threads = (ggml_backend_set_n_threads_t)ggml_backend_reg_get_proc_address(registry, "ggml_backend_set_n_threads");
+    if (set_threads == nullptr) {
+        return false;
+    }
+
+    set_threads(backend, n_threads);
+    return true;
+}
+
+// Return a human-readable list of the CPU backend's feature flags.
+// The string is built once, on first call, and cached for the rest of the
+// process; the returned pointer refers to that cached string.
+const char* sd_get_system_info() {
+    static std::string cache_info = []() -> std::string {
+        ggml_backend_load_all_once();
+        std::stringstream out;
+        out << "System Info: \n";
+
+        auto cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        if (cpu_dev == nullptr) {
+            LOG_WARN("unable to get CPU features");
+            return out.str();
+        }
+
+        // The feature query is an optional entry point looked up by name.
+        auto cpu_reg      = ggml_backend_dev_backend_reg(cpu_dev);
+        auto get_features = (ggml_backend_get_features_t)ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_get_features");
+        if (get_features == nullptr) {
+            LOG_WARN("unable to get CPU features");
+            return out.str();
+        }
+
+        // The list ends at the first entry with a null name or value.
+        for (ggml_backend_feature* feat = get_features(cpu_reg); feat->name && feat->value; ++feat) {
+            out << "   " << feat->name << " = " << feat->value << " | ";
+        }
+        return out.str();
+    }();
+    return cache_info.c_str();
+}
+
+// Create the backend used when no explicit backend name is configured.
+// Order of preference:
+//   1. the Vulkan device selected by the SD_VK_DEVICE environment variable,
+//      which must be a plain non-negative decimal index ("Vulkan<index>");
+//   2. the default backend name;
+//   3. the CPU backend.
+// Every call creates a new backend instance; the result is whatever
+// sd_backend_cpu_init() yields when the first two options fail.
+static ggml_backend_t sd_get_default_backend() {
+    ggml_backend_load_all_once();
+    // The device list is logged only on the first call.
+    static std::once_flag once;
+    std::call_once(once, []() {
+        size_t dev_count = ggml_backend_dev_count();
+        if (dev_count == 0) {
+            LOG_ERROR("No devices found!");
+        } else {
+            LOG_DEBUG("Found %zu backend devices:", dev_count);
+            for (size_t i = 0; i < dev_count; ++i) {
+                auto dev = ggml_backend_dev_get(i);
+                LOG_DEBUG("#%zu: %s", i, ggml_backend_dev_name(dev));
+            }
+        }
+    });
+
+    ggml_backend_t backend   = nullptr;
+    const char* SD_VK_DEVICE = getenv("SD_VK_DEVICE");
+    if (SD_VK_DEVICE != nullptr) {
+        // Surrounding whitespace is tolerated, anything else must be digits.
+        const std::string sd_vk_device_str = trim_copy(std::string(SD_VK_DEVICE));
+        try {
+            // std::stoull on its own accepts "1abc" (parsed as 1) and "-1"
+            // (wrapped to ULLONG_MAX), which would silently pick or report the
+            // wrong device, so reject anything that is not a run of digits.
+            if (sd_vk_device_str.find_first_not_of("0123456789") != std::string::npos) {
+                throw std::invalid_argument("SD_VK_DEVICE");
+            }
+            unsigned long long device  = std::stoull(sd_vk_device_str);
+            std::string vk_device_name = "Vulkan" + std::to_string(device);
+            if (backend_name_exists(vk_device_name)) {
+                LOG_INFO("Selecting %s as main device by env var SD_VK_DEVICE", vk_device_name.c_str());
+                backend = init_named_backend(vk_device_name);
+                if (!backend) {
+                    LOG_WARN("Device %s requested by SD_VK_DEVICE failed to init. Falling back to the default device.", vk_device_name.c_str());
+                }
+            } else {
+                LOG_WARN("Device %s requested by SD_VK_DEVICE was not found. Falling back to the default device.", vk_device_name.c_str());
+            }
+        } catch (const std::invalid_argument&) {
+            // Also reached for an empty value: std::stoull("") throws.
+            LOG_WARN("SD_VK_DEVICE environment variable is not a valid integer (%s). Falling back to the default device.", SD_VK_DEVICE);
+        } catch (const std::out_of_range&) {
+            LOG_WARN("SD_VK_DEVICE environment variable value is out of range for `unsigned long long` type (%s). Falling back to the default device.", SD_VK_DEVICE);
+        }
+    }
+
+    if (!backend) {
+        std::string dev_name = get_default_backend_name();
+        backend              = init_named_backend(dev_name);
+        // An empty default name means "let ggml pick", so there is no device
+        // name to report in that case.
+        if (!backend && !dev_name.empty()) {
+            LOG_WARN("device %s failed to init", dev_name.c_str());
+        }
+    }
+
+    if (!backend) {
+        LOG_WARN("loading CPU backend");
+        backend = sd_backend_cpu_init();
+    }
+
+    if (sd_backend_is_cpu(backend)) {
+        LOG_DEBUG("Using CPU backend");
+    }
+
+    return backend;
+}
+
+// Parse a backend assignment spec into `*assignment`.
+//
+// The spec is a comma-separated list of parts; empty parts are skipped:
+//   "name"          sets the default backend name
+//   "key=value"     with key "all", "default" or "*" (case-insensitive) also
+//                   sets the default; any other key must be a module name
+//                   accepted by parse_backend_module()
+// A blank spec succeeds and leaves the assignment empty.
+//
+// Returns false on a malformed part or an unknown module; when `error` is not
+// null it receives a description of the failure.
+static bool sd_parse_backend_assignment(const std::string& spec, SDBackendAssignment* assignment, std::string* error) {
+    if (assignment == nullptr) {
+        // Report the reason instead of failing silently.
+        if (error != nullptr) {
+            *error = "backend assignment output is null";
+        }
+        return false;
+    }
+
+    *assignment          = {};
+    const std::string in = trim_copy(spec);
+    if (in.empty()) {
+        return true;
+    }
+
+    for (const std::string& raw_part : split_copy(in, ',')) {
+        const std::string part = trim_copy(raw_part);
+        if (part.empty()) {
+            continue;
+        }
+
+        // A part without '=' is a bare backend name for the default slot.
+        const size_t eq = part.find('=');
+        if (eq == std::string::npos) {
+            assignment->set_default(part);
+            continue;
+        }
+
+        const std::string key   = trim_copy(part.substr(0, eq));
+        const std::string value = trim_copy(part.substr(eq + 1));
+        if (key.empty() || value.empty()) {
+            if (error != nullptr) {
+                *error = "invalid backend assignment '" + part + "'";
+            }
+            return false;
+        }
+
+        const std::string key_lower = lower_copy(key);
+        if (key_lower == "all" || key_lower == "default" || key_lower == "*") {
+            assignment->set_default(value);
+            continue;
+        }
+
+        SDBackendModule module = SDBackendModule::DIFFUSION;
+        if (!parse_backend_module(key, &module)) {
+            if (error != nullptr) {
+                *error = "unknown backend module '" + key + "'";
+            }
+            return false;
+        }
+        assignment->set_module(module, value);
+    }
+    return true;
+}
+
+// True when neither a default name nor any per-module name has been set.
+bool SDBackendAssignment::empty() const {
+    if (!default_name.empty()) {
+        return false;
+    }
+    return module_names.empty();
+}
+
+// Backend name assigned to `module`; the lookup itself is delegated to
+// module_assignment_name().
+std::string SDBackendAssignment::get(SDBackendModule module) const {
+    return module_assignment_name(*this, module);
+}
+
+// Store `name`, passed through trim_copy(), as the default backend name.
+void SDBackendAssignment::set_default(const std::string& name) {
+    default_name = trim_copy(name);
+}
+
+// Store `name`, passed through trim_copy(), as the backend name for `module`,
+// replacing any name previously stored for that module.
+void SDBackendAssignment::set_module(SDBackendModule module, const std::string& name) {
+    module_names[module] = trim_copy(name);
+}
+
+// Deleter used by SDBackendHandle: releases the backend with ggml_backend_free().
+void SDBackendHandleDeleter::operator()(ggml_backend_t backend) const {
+    ggml_backend_free(backend);
+}
+
+// Drops every cached backend and both assignments via reset().
+SDBackendManager::~SDBackendManager() {
+    reset();
+}
+
+// Return the manager to its freshly constructed state. Clearing the cache
+// destroys the stored handles, and both assignments become empty.
+void SDBackendManager::reset() {
+    backends_.clear();
+    params_assignment_  = SDBackendAssignment{};
+    runtime_assignment_ = SDBackendAssignment{};
+}
+
+// Backend that executes `module`, created on first use and cached afterwards.
+ggml_backend_t SDBackendManager::runtime_backend(SDBackendModule module) {
+    const std::string backend_name = runtime_assignment_.get(module);
+    return init_cached_backend(backend_name);
+}
+
+// Backend that holds the parameters of `module`. Falls back to the module's
+// runtime backend when no params backend is assigned or when the assignment
+// is the disk token.
+ggml_backend_t SDBackendManager::params_backend(SDBackendModule module) {
+    const std::string assigned  = params_assignment_.get(module);
+    const bool use_runtime_side = assigned.empty() || is_disk_backend_token(assigned);
+    return use_runtime_side ? runtime_backend(module) : init_cached_backend(assigned);
+}
+
+// Whether the runtime backend of `module` is a CPU backend.
+bool SDBackendManager::runtime_backend_is_cpu(SDBackendModule module) {
+    ggml_backend_t selected = runtime_backend(module);
+    return sd_backend_is_cpu(selected);
+}
+
+// Whether the params backend of `module` is a CPU backend.
+bool SDBackendManager::params_backend_is_cpu(SDBackendModule module) {
+    ggml_backend_t selected = params_backend(module);
+    return sd_backend_is_cpu(selected);
+}
+
+// Whether the params assignment of `module` is the disk token.
+bool SDBackendManager::params_backend_is_disk(SDBackendModule module) const {
+    const std::string assigned = params_assignment_.get(module);
+    return is_disk_backend_token(assigned);
+}
+
+// Whether the runtime backend of `module` can work on host memory: always
+// true for a CPU backend, otherwise the device's buffer_from_host_ptr
+// capability. False when the backend or its device is unavailable.
+bool SDBackendManager::runtime_backend_supports_host_buffer(SDBackendModule module) {
+    ggml_backend_t selected = runtime_backend(module);
+    if (selected == nullptr) {
+        return false;
+    }
+
+    if (!sd_backend_is_cpu(selected)) {
+        ggml_backend_dev_t device = ggml_backend_get_device(selected);
+        if (device == nullptr) {
+            return false;
+        }
+        ggml_backend_dev_props device_props;
+        ggml_backend_dev_get_props(device, &device_props);
+        return device_props.caps.buffer_from_host_ptr;
+    }
+
+    return true;
+}
+
+// Reset the manager, parse both assignment specs and validate the result.
+// Parsing stops at the first spec that fails; `error`, when not null,
+// receives the failure description.
+bool SDBackendManager::init(const char* backend_spec,
+                            const char* params_backend_spec,
+                            std::string* error) {
+    reset();
+
+    const bool parsed =
+        sd_parse_backend_assignment(SAFE_STR(backend_spec), &runtime_assignment_, error) &&
+        sd_parse_backend_assignment(SAFE_STR(params_backend_spec), &params_assignment_, error);
+
+    return parsed && validate(error);
+}
+
+// Check that every assigned backend name is usable.
+// Runtime names must be a default token or resolve to a device; the disk token
+// is rejected there. Params names follow the same rules, except that the disk
+// token is accepted. Stops at the first invalid name and, when `error` is not
+// null, stores the reason in it.
+bool SDBackendManager::validate(std::string* error) const {
+    const auto fail = [error](const std::string& message) -> bool {
+        if (error != nullptr) {
+            *error = message;
+        }
+        return false;
+    };
+
+    const auto runtime_name_ok = [&](const std::string& name) -> bool {
+        if (is_default_backend_token(name)) {
+            return true;
+        }
+        if (is_disk_backend_token(name)) {
+            return fail("backend 'disk' is only supported by params_backend");
+        }
+        if (sd_backend_resolve_name(name).empty()) {
+            return fail("backend '" + name + "' was not found");
+        }
+        return true;
+    };
+
+    const auto params_name_ok = [&](const std::string& name) -> bool {
+        return is_disk_backend_token(name) || runtime_name_ok(name);
+    };
+
+    // Defaults first (runtime, then params), then the per-module overrides.
+    if (!runtime_name_ok(runtime_assignment_.default_name)) {
+        return false;
+    }
+    if (!params_name_ok(params_assignment_.default_name)) {
+        return false;
+    }
+    for (const auto& entry : runtime_assignment_.module_names) {
+        if (!runtime_name_ok(entry.second)) {
+            return false;
+        }
+    }
+    for (const auto& entry : params_assignment_.module_names) {
+        if (!params_name_ok(entry.second)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Return the backend for `name`, creating and caching it on first use.
+// The returned pointer stays owned by the manager's cache. Returns nullptr
+// (after logging an error) when the name cannot be resolved or the backend
+// fails to initialize.
+ggml_backend_t SDBackendManager::init_cached_backend(const std::string& name) {
+    std::string resolved   = sd_backend_resolve_name(name);
+    std::string key        = lower_copy(resolved);
+    ggml_backend_t backend = nullptr;
+
+    if (!key.empty()) {
+        // Fast path: a backend is already cached under the resolved name.
+        auto it = backends_.find(key);
+        if (it != backends_.end()) {
+            return it->second.get();
+        }
+    } else if (!is_default_backend_token(name)) {
+        // Nothing resolved and the request is not a default token.
+        LOG_ERROR("backend '%s' was not found", name.c_str());
+        return nullptr;
+    }
+
+    // Default tokens go through sd_get_default_backend(), which may honor the
+    // SD_VK_DEVICE override, so the created backend need not match `resolved`.
+    backend = is_default_backend_token(name) ? sd_get_default_backend() : init_named_backend(resolved);
+    if (backend == nullptr) {
+        LOG_ERROR("failed to initialize backend '%s'", name.c_str());
+        return nullptr;
+    }
+
+    // Key the cache on the backend that was actually created.
+    // NOTE(review): backend_cache_key() is defined outside this view; it
+    // presumably derives the key from the backend's device - confirm.
+    std::string actual_key = backend_cache_key(backend);
+    if (actual_key.empty()) {
+        actual_key = !key.empty() ? key : lower_copy(trim_copy(name));
+    }
+
+    // The same backend is already cached under the actual key: free the
+    // duplicate instance created above and hand out the cached one.
+    auto it = backends_.find(actual_key);
+    if (it != backends_.end()) {
+        ggml_backend_free(backend);
+        return it->second.get();
+    }
+
+    // Transfer ownership to the cache; the raw pointer stays valid while the
+    // entry exists.
+    SDBackendHandle handle(backend);
+    backends_.emplace(actual_key, std::move(handle));
+    return backend;
+}
+
+// Lowercase identifier of a backend module; "unknown" for a value that is not
+// one of the declared enumerators.
+const char* sd_backend_module_name(SDBackendModule module) {
+    // No default label, so the compiler can still flag a missing enumerator.
+    switch (module) {
+        case SDBackendModule::DIFFUSION:   return "diffusion";
+        case SDBackendModule::TE:          return "te";
+        case SDBackendModule::CLIP_VISION: return "clip_vision";
+        case SDBackendModule::VAE:         return "vae";
+        case SDBackendModule::CONTROL_NET: return "controlnet";
+        case SDBackendModule::PHOTOMAKER:  return "photomaker";
+        case SDBackendModule::UPSCALER:    return "upscaler";
+    }
+    return "unknown";
+}
--- a/src/core/ggml_extend_backend.h
+++ b/src/core/ggml_extend_backend.h
@ -0,0 +1,78 @@
+#ifndef __SD_CORE_GGML_EXTEND_BACKEND_H__
+#define __SD_CORE_GGML_EXTEND_BACKEND_H__
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "ggml-backend.h"
+#include "ggml.h"
+
+// Model components that can each be assigned their own backend.
+// sd_backend_module_name() maps every enumerator to its string name.
+enum class SDBackendModule {
+    DIFFUSION,
+    TE,
+    CLIP_VISION,
+    VAE,
+    CONTROL_NET,
+    PHOTOMAKER,
+    UPSCALER,
+};
+
+// Backend names chosen by the user: one default plus optional per-module
+// overrides.
+struct SDBackendAssignment {
+    // Name used when a module has no entry of its own (may be empty).
+    std::string default_name;
+    // Per-module backend names.
+    std::unordered_map<SDBackendModule, std::string> module_names;
+
+    // True when neither a default nor any per-module name is set.
+    bool empty() const;
+    // Backend name assigned to `module`.
+    std::string get(SDBackendModule module) const;
+    // Setters store the name after trimming it.
+    void set_default(const std::string& name);
+    void set_module(SDBackendModule module, const std::string& name);
+};
+
+// Deleter that releases a ggml backend with ggml_backend_free().
+struct SDBackendHandleDeleter {
+    void operator()(ggml_backend_t backend) const;
+};
+
+// Owning handle for a ggml backend instance.
+using SDBackendHandle = std::unique_ptr<struct ggml_backend, SDBackendHandleDeleter>;
+
+// Owns the backend instances used by the model modules and maps each module
+// to its runtime (compute) backend and its params (weights) backend.
+// Non-copyable because it owns the backend handles.
+class SDBackendManager {
+private:
+    // Assignment used to pick the backend a module runs on.
+    SDBackendAssignment runtime_assignment_;
+    // Assignment used to pick the backend that holds a module's params.
+    SDBackendAssignment params_assignment_;
+    // Cache of created backends, keyed by a lowercase backend name.
+    std::unordered_map<std::string, SDBackendHandle> backends_;
+
+public:
+    SDBackendManager() = default;
+    ~SDBackendManager();
+
+    SDBackendManager(const SDBackendManager&)            = delete;
+    SDBackendManager& operator=(const SDBackendManager&) = delete;
+
+    // Reset, parse both specs (null is allowed) and validate the names.
+    // On failure returns false and, when `error` is not null, fills it.
+    bool init(const char* backend_spec,
+              const char* params_backend_spec,
+              std::string* error);
+    // Drop all cached backends and clear both assignments.
+    void reset();
+
+    // Backends are created on first request and cached; the returned pointers
+    // remain owned by the manager. nullptr signals a failure.
+    ggml_backend_t runtime_backend(SDBackendModule module);
+    // Falls back to the runtime backend when no params backend is assigned or
+    // the params assignment is the disk token.
+    ggml_backend_t params_backend(SDBackendModule module);
+
+    bool runtime_backend_is_cpu(SDBackendModule module);
+    bool params_backend_is_cpu(SDBackendModule module);
+    // True when the params assignment of `module` is the disk token.
+    bool params_backend_is_disk(SDBackendModule module) const;
+    // True for CPU backends, otherwise the device's buffer_from_host_ptr
+    // capability.
+    bool runtime_backend_supports_host_buffer(SDBackendModule module);
+
+private:
+    bool validate(std::string* error) const;
+    ggml_backend_t init_cached_backend(const std::string& name);
+};
+
+bool sd_backend_is(ggml_backend_t backend, const std::string& name);
+// True when `backend` is non-null and its device is of type CPU.
+bool sd_backend_is_cpu(ggml_backend_t backend);
+// Create a backend on the CPU device type.
+ggml_backend_t sd_backend_cpu_init();
+// Set the thread count of a CPU backend; returns false when `backend_cpu` is
+// null, is not a CPU backend, or the setter entry point is unavailable.
+bool sd_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
+// Map a user-supplied name to a registered device name; an empty result means
+// no match.
+std::string sd_backend_resolve_name(const std::string& name);
+// Lowercase string name of a module ("unknown" for unrecognized values).
+const char* sd_backend_module_name(SDBackendModule module);
+void ggml_ext_im_set_f32_1d(const struct ggml_tensor* tensor, int i, float value);
+bool add_rpc_devices(const std::string& servers);
+#endif  // __SD_CORE_GGML_EXTEND_BACKEND_H__
--- a/src/core/ggml_graph_cut.cpp
+++ b/src/core/ggml_graph_cut.cpp
--- a/src/core/ggml_graph_cut.h
+++ b/src/core/ggml_graph_cut.h
@ -0,0 +1,128 @@
+#ifndef __SD_CORE_GGML_GRAPH_CUT_H__
+#define __SD_CORE_GGML_GRAPH_CUT_H__
+
+#include <array>
+#include <cstdint>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "ggml-backend.h"
+#include "ggml.h"
+
+namespace sd::ggml_graph_cut {
+
+    // Streaming residency for a segment's params.
+    enum class SegmentResidency : uint8_t {
+        STREAMED = 0,
+        RESIDENT = 1,
+    };
+
+    // One piece of a cut compute graph, with its node lists, inputs and size
+    // bookkeeping.
+    // NOTE(review): field meanings below are read off the names and defaults;
+    // the code that fills them is outside this header - confirm.
+    struct Segment {
+        // Where a segment input comes from.
+        enum InputType {
+            INPUT_EXTERNAL = 0,
+            INPUT_PREVIOUS_CUT,
+            INPUT_PARAM,
+        };
+
+        // Reference to a single input tensor of the segment.
+        struct InputRef {
+            InputType type = INPUT_EXTERNAL;
+            std::string display_name;
+            // Both indices default to -1; presumably -1 means "not set".
+            int leaf_index = -1;
+            int node_index = -1;
+        };
+
+        // Size accounting, all zero until filled in.
+        size_t compute_buffer_size      = 0;
+        size_t output_bytes             = 0;
+        size_t input_external_bytes     = 0;
+        size_t input_previous_cut_bytes = 0;
+        size_t input_param_bytes        = 0;
+        std::string group_name;
+        std::vector<int> internal_node_indices;
+        std::vector<int> output_node_indices;
+        std::vector<InputRef> input_refs;
+        // Segments start out as STREAMED.
+        SegmentResidency residency = SegmentResidency::STREAMED;
+    };
+
+    // A graph split into an ordered list of segments, plus the data recorded
+    // about the graph it was built from.
+    // NOTE(review): presumably plan_matches_graph() compares n_nodes, n_leafs
+    // and input_shapes against a graph - confirm in the implementation.
+    struct Plan {
+        // Type and extents recorded for one graph leaf.
+        struct InputShape {
+            int leaf_index                        = -1;
+            ggml_type type                        = GGML_TYPE_COUNT;
+            std::array<int64_t, GGML_MAX_DIMS> ne = {0, 0, 0, 0};
+        };
+
+        // A default-constructed plan is not available, has no cuts, and is
+        // marked valid.
+        bool available = false;
+        bool has_cuts  = false;
+        bool valid     = true;
+        int n_nodes    = 0;
+        int n_leafs    = 0;
+        std::vector<InputShape> input_shapes;
+        std::vector<Segment> segments;
+    };
+
+    // Holds a plan together with a budgeted plan and the byte budget stored
+    // alongside it.
+    // NOTE(review): presumably resolve_plan() reuses the budgeted plan while
+    // the requested budget equals the stored one - confirm.
+    struct PlanCache {
+        Plan graph_cut_plan;
+        Plan budgeted_graph_cut_plan;
+        size_t budgeted_graph_cut_plan_max_vram_bytes = 0;
+    };
+
+    static constexpr const char* GGML_RUNNER_CUT_PREFIX = "ggml_runner_cut:";
+
+    // Max-VRAM budgets: one default value plus optional per-backend values.
+    // NOTE(review): units are taken from the field names (GiB for the
+    // configured values, bytes for the resolved ones); the parsing and
+    // resolution rules live in the implementation file - confirm there.
+    struct MaxVramAssignment {
+        float default_gib = 0.f;
+        // Configured budget per backend name.
+        std::unordered_map<std::string, float> backend_gib;
+        // Byte budgets keyed by backend name.
+        std::unordered_map<std::string, size_t> resolved_backend_bytes;
+
+        void reset(float fallback_gib);
+        // Returns false on failure; `error` presumably receives the reason.
+        bool parse(const std::string& raw_spec, std::string* error);
+        bool canonicalize_backend_keys(std::string* error);
+        size_t bytes_for_backend(ggml_backend_t backend);
+    };
+
+    bool is_graph_cut_tensor(const ggml_tensor* tensor);
+    std::string make_graph_cut_name(const std::string& group, const std::string& output);
+    void mark_graph_cut(ggml_tensor* tensor, const std::string& group, const std::string& output);
+    int leaf_count(ggml_cgraph* gf);
+    ggml_tensor* leaf_tensor(ggml_cgraph* gf, int leaf_index);
+    ggml_backend_buffer_t tensor_buffer(const ggml_tensor* tensor);
+    ggml_tensor* cache_source_tensor(ggml_tensor* tensor);
+    size_t cache_tensor_bytes(const ggml_tensor* tensor);
+    bool plan_matches_graph(ggml_cgraph* gf, const Plan& plan);
+    ggml_tensor* output_tensor(ggml_cgraph* gf, const Segment& segment, size_t output_index);
+    ggml_tensor* input_tensor(ggml_cgraph* gf, const Segment::InputRef& input_ref);
+    std::vector<ggml_tensor*> param_tensors(ggml_cgraph* gf, const Segment& segment);
+    std::unordered_set<std::string> collect_future_input_names(ggml_cgraph* gf,
+                                                               const Plan& plan,
+                                                               size_t current_segment_index);
+    ggml_cgraph* build_segment_graph(ggml_cgraph* gf,
+                                     const Segment& segment,
+                                     ggml_context** graph_ctx_out);
+    size_t measure_segment_compute_buffer(ggml_backend_t backend,
+                                          ggml_cgraph* gf,
+                                          const Segment& segment,
+                                          const char* log_desc);
+    size_t max_vram_gib_to_bytes(float max_vram);
+    float resolve_max_vram_gib(float max_vram, ggml_backend_t backend);
+    Plan build_plan(ggml_backend_t backend,
+                    ggml_cgraph* gf,
+                    const std::unordered_set<const ggml_tensor*>& params_tensor_set,
+                    const char* log_desc);
+    Plan apply_max_vram_budget(ggml_cgraph* gf,
+                               const Plan& base_plan,
+                               size_t max_graph_vram_bytes,
+                               ggml_backend_t backend,
+                               const std::unordered_set<const ggml_tensor*>& params_tensor_set,
+                               const char* log_desc);
+    Plan resolve_plan(ggml_backend_t backend,
+                      ggml_cgraph* gf,
+                      PlanCache* cache,
+                      size_t max_graph_vram_bytes,
+                      const std::unordered_set<const ggml_tensor*>& params_tensor_set,
+                      const char* log_desc);
+
+    // Mark leading segments resident when they fit after streamed-segment headroom.
+    void annotate_residency(Plan& plan, size_t max_graph_vram_bytes);
+}  // namespace sd::ggml_graph_cut
+
+#endif  // __SD_CORE_GGML_GRAPH_CUT_H__
--- a/src/core/ordered_map.hpp
+++ b/src/core/ordered_map.hpp
@ -1,5 +1,5 @@
-#ifndef __ORDERED_MAP_HPP__
-#define __ORDERED_MAP_HPP__
+#ifndef __SD_CORE_ORDERED_MAP_HPP__
+#define __SD_CORE_ORDERED_MAP_HPP__

 #include <iostream>
 #include <list>
@ -174,4 +174,4 @@ public:
    }
 };

-#endif  // __ORDERED_MAP_HPP__
+#endif  // __SD_CORE_ORDERED_MAP_HPP__
--- a/src/core/rng.hpp
+++ b/src/core/rng.hpp
@ -1,5 +1,5 @@
-#ifndef __RNG_H__
-#define __RNG_H__
+#ifndef __SD_CORE_RNG_HPP__
+#define __SD_CORE_RNG_HPP__

 #include <random>
 #include <vector>
@ -32,4 +32,4 @@ public:
    }
 };

-#endif  // __RNG_H__
+#endif  // __SD_CORE_RNG_HPP__
--- a/src/core/rng_mt19937.hpp
+++ b/src/core/rng_mt19937.hpp
@ -1,10 +1,10 @@
-#ifndef __RNG_MT19937_HPP__
-#define __RNG_MT19937_HPP__
+#ifndef __SD_CORE_RNG_MT19937_HPP__
+#define __SD_CORE_RNG_MT19937_HPP__

 #include <cmath>
 #include <vector>

-#include "rng.hpp"
+#include "core/rng.hpp"

 // RNG imitiating torch cpu randn on CPU.
 // Port from pytorch, original license: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/LICENSE
@ -144,4 +144,4 @@ public:
    }
 };

-#endif  // __RNG_MT19937_HPP__
+#endif  // __SD_CORE_RNG_MT19937_HPP__
--- a/src/core/rng_philox.hpp
+++ b/src/core/rng_philox.hpp
@ -1,10 +1,10 @@
-#ifndef __RNG_PHILOX_H__
-#define __RNG_PHILOX_H__
+#ifndef __SD_CORE_RNG_PHILOX_HPP__
+#define __SD_CORE_RNG_PHILOX_HPP__

 #include <cmath>
 #include <vector>

-#include "rng.hpp"
+#include "core/rng.hpp"

 // RNG imitiating torch cuda randn on CPU.
 // Port from: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/5ef669de080814067961f28357256e8fe27544f4/modules/rng_philox.py
@ -122,4 +122,4 @@ public:
    }
 };

-#endif  // __RNG_PHILOX_H__
+#endif  // __SD_CORE_RNG_PHILOX_HPP__
--- a/src/core/tensor.hpp
+++ b/src/core/tensor.hpp
@ -1,5 +1,5 @@
-#ifndef __SD_TENSOR_HPP__
-#define __SD_TENSOR_HPP__
+#ifndef __SD_CORE_TENSOR_HPP__
+#define __SD_CORE_TENSOR_HPP__

 #include <algorithm>
 #include <cmath>
@ -16,7 +16,7 @@
 #include <utility>
 #include <vector>

-#include "rng.hpp"
+#include "core/rng.hpp"

 namespace sd {

@ -235,6 +235,7 @@ namespace sd {

        Tensor& masked_fill_(const Tensor<uint8_t>& mask, const T& value);

+        T sum() const;
        T mean() const;

        static Tensor zeros(std::vector<int64_t> shape) {
@ -327,6 +328,24 @@ namespace sd {
        std::vector<int64_t> shape_;
    };

+    // Sum of all elements, accumulated in T starting from a value-initialized
+    // T. An empty tensor therefore yields T{}.
+    template <typename T>
+    inline T Tensor<T>::sum() const {
+        T accumulated = T{};
+        for (const T& element : data_) {
+            accumulated += element;
+        }
+        return accumulated;
+    }
+
+    // float specialization: accumulate in double to limit rounding error, then
+    // narrow the result back to float.
+    template <>
+    inline float Tensor<float>::sum() const {
+        double wide_total = 0.0;
+        for (const float sample : data_) {
+            wide_total += static_cast<double>(sample);
+        }
+        return static_cast<float>(wide_total);
+    }
+
    template <typename T>
    inline T Tensor<T>::mean() const {
        if (empty()) {
@ -815,11 +834,202 @@ namespace sd {
    namespace ops {
+        // Resampling modes accepted by interpolate(). The Nearest* values are
+        // the ones is_nearest_like_interpolate_mode() accepts; Bilinear,
+        // Bicubic and Lanczos are the ones is_2d_filter_interpolate_mode()
+        // accepts.
        enum class InterpolateMode {
            Nearest,
+            NearestExact,
            NearestMax,
            NearestMin,
            NearestAvg,
+            Bilinear,
+            Bicubic,
+            Lanczos,
        };

+        // True for the nearest-neighbour family of modes.
+        inline bool is_nearest_like_interpolate_mode(InterpolateMode mode) {
+            switch (mode) {
+                case InterpolateMode::Nearest:
+                case InterpolateMode::NearestExact:
+                case InterpolateMode::NearestMax:
+                case InterpolateMode::NearestMin:
+                case InterpolateMode::NearestAvg:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+
+        // True for the modes implemented as a 2D filter
+        // (Bilinear, Bicubic, Lanczos).
+        inline bool is_2d_filter_interpolate_mode(InterpolateMode mode) {
+            switch (mode) {
+                case InterpolateMode::Bilinear:
+                case InterpolateMode::Bicubic:
+                case InterpolateMode::Lanczos:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+
+        // Source index for output sample `output_index` using pixel-centre
+        // alignment: the output centre is mapped into input coordinates,
+        // rounded to the nearest index and clamped to [0, input_size - 1].
+        inline int64_t nearest_exact_interpolate_index(int64_t output_index,
+                                                       int64_t input_size,
+                                                       int64_t output_size) {
+            const double ratio          = static_cast<double>(input_size) / static_cast<double>(output_size);
+            const double source_center  = (static_cast<double>(output_index) + 0.5) * ratio - 0.5;
+            const int64_t rounded       = static_cast<int64_t>(std::floor(source_center + 0.5));
+            const int64_t lower_bounded = std::max<int64_t>(rounded, 0);
+            return std::min(lower_bounded, input_size - 1);
+        }
+
+        // Triangle (tent) kernel: 1 - |x| inside (-1, 1), otherwise 0.
+        inline double linear_interpolate_weight(double x) {
+            const double distance = std::abs(x);
+            if (distance < 1.0) {
+                return 1.0 - distance;
+            }
+            return 0.0;
+        }
+
+        // Cubic convolution kernel with a = -0.75 (the coefficient PyTorch
+        // uses for bicubic interpolation). Non-zero only for |x| < 2.
+        inline double cubic_interpolate_weight(double x) {
+            constexpr double a    = -0.75;
+            const double distance = std::abs(x);
+
+            if (distance <= 1.0) {
+                // Inner lobe, |x| <= 1.
+                return ((a + 2.0) * distance - (a + 3.0)) * distance * distance + 1.0;
+            } else if (distance < 2.0) {
+                // Outer lobe, 1 < |x| < 2.
+                return ((a * distance - 5.0 * a) * distance + 8.0 * a) * distance - 4.0 * a;
+            } else {
+                return 0.0;
+            }
+        }
+
+        // Normalized sinc: sin(pi * x) / (pi * x), with the removable
+        // singularity at x = 0 handled by returning 1 for |x| < 1e-12.
+        inline double sinc(double x) {
+            constexpr double pi  = 3.14159265358979323846;
+            const bool near_zero = std::abs(x) < 1e-12;
+            if (!near_zero) {
+                const double angle = pi * x;
+                return std::sin(angle) / angle;
+            }
+            return 1.0;
+        }
+
+        // Lanczos kernel with radius 3: sinc(x) * sinc(x / 3) for |x| < 3,
+        // otherwise 0.
+        inline double lanczos_interpolate_weight(double x) {
+            constexpr double radius = 3.0;
+            const double distance   = std::abs(x);
+            return distance >= radius ? 0.0 : sinc(distance) * sinc(distance / radius);
+        }
+
+        // One input sample that contributes to an output sample along a
+        // single axis.
+        struct InterpolateContributor {
+            // Input index along the axis; make_interpolate_contributors()
+            // stores it already clamped to [0, input_size - 1].
+            int64_t index;
+            // Filter weight applied to that input sample.
+            double weight;
+        };
+
+        // Build, for every output index along one axis, the list of input
+        // samples and weights that produce it.
+        //
+        // Sample centres are aligned (pixel-centre convention). With
+        // `antialias` the kernel is widened by the downscale factor. Indices
+        // outside the input are clamped to the border. Weights are normalized
+        // when antialiasing or for Lanczos; an output sample with no non-zero
+        // weight falls back to its nearest input sample with weight 1.
+        // Reports an unsupported mode through tensor_throw_invalid_argument()
+        // as soon as the first output sample is processed.
+        inline std::vector<std::vector<InterpolateContributor>> make_interpolate_contributors(
+            int64_t input_size,
+            int64_t output_size,
+            InterpolateMode mode,
+            bool antialias) {
+            // Kernel radius (at filter scale 1) and weight function per mode.
+            double kernel_radius            = 0.0;
+            double (*kernel_weight)(double) = lanczos_interpolate_weight;
+            bool mode_supported             = true;
+            switch (mode) {
+                case InterpolateMode::Bilinear:
+                    kernel_radius = 1.0;
+                    kernel_weight = linear_interpolate_weight;
+                    break;
+                case InterpolateMode::Bicubic:
+                    kernel_radius = 2.0;
+                    kernel_weight = cubic_interpolate_weight;
+                    break;
+                case InterpolateMode::Lanczos:
+                    kernel_radius = 3.0;
+                    kernel_weight = lanczos_interpolate_weight;
+                    break;
+                default:
+                    mode_supported = false;
+                    break;
+            }
+
+            std::vector<std::vector<InterpolateContributor>> result(static_cast<size_t>(output_size));
+            const double ratio   = static_cast<double>(input_size) / static_cast<double>(output_size);
+            const double stretch = antialias ? std::max(1.0, ratio) : 1.0;
+            const double support = kernel_radius * stretch;
+            const bool normalize = antialias || mode == InterpolateMode::Lanczos;
+
+            for (int64_t out = 0; out < output_size; ++out) {
+                if (!mode_supported) {
+                    tensor_throw_invalid_argument("Unsupported 2D filter interpolate mode: mode=" +
+                                                  std::to_string(static_cast<int>(mode)));
+                }
+
+                // Centre of this output sample in input coordinates, and the
+                // window of input indices the kernel can reach from it.
+                const double center = (static_cast<double>(out) + 0.5) * ratio - 0.5;
+                const int64_t first = static_cast<int64_t>(std::ceil(center - support));
+                const int64_t last  = static_cast<int64_t>(std::floor(center + support));
+
+                std::vector<InterpolateContributor>& taps = result[static_cast<size_t>(out)];
+                taps.reserve(static_cast<size_t>(last - first + 1));
+
+                double total = 0.0;
+                for (int64_t src = first; src <= last; ++src) {
+                    const double tap_weight = kernel_weight((center - static_cast<double>(src)) / stretch);
+                    if (tap_weight == 0.0) {
+                        continue;
+                    }
+                    // Out-of-range taps reuse the border sample.
+                    const int64_t border_clamped = std::min(std::max<int64_t>(src, 0), input_size - 1);
+                    taps.push_back({border_clamped, tap_weight});
+                    total += tap_weight;
+                }
+
+                if (normalize && std::abs(total) > 1e-12) {
+                    for (auto& tap : taps) {
+                        tap.weight /= total;
+                    }
+                }
+
+                if (taps.empty()) {
+                    const int64_t rounded = static_cast<int64_t>(std::floor(center + 0.5));
+                    taps.push_back({std::min(std::max<int64_t>(rounded, 0), input_size - 1), 1.0});
+                }
+            }
+
+            return result;
+        }
+
+        // Resize the first two dimensions of `input` (dimension 0 is the
+        // width, dimension 1 the height) with a separable Bilinear, Bicubic or
+        // Lanczos filter. All remaining dimensions are treated as a stack of
+        // independent planes and must be left unchanged by `output_shape`.
+        //
+        // Reports through tensor_throw_invalid_argument() when either shape
+        // has fewer than two dimensions, when `output_shape` names more
+        // dimensions than the input has, when a dimension other than 0 or 1
+        // differs, when the input plane is empty, or when the output cannot
+        // hold every input plane.
+        template <typename T>
+        inline Tensor<T> interpolate_2d_filter(const Tensor<T>& input,
+                                               const std::vector<int64_t>& output_shape,
+                                               InterpolateMode mode,
+                                               bool antialias) {
+            const auto throw_shape_error = [&](const std::string& reason) {
+                tensor_throw_invalid_argument(reason + ": input_shape=" +
+                                              tensor_shape_to_string(input.shape()) + ", output_shape=" +
+                                              tensor_shape_to_string(output_shape));
+            };
+
+            // output_shape[0] and output_shape[1] are read below, so both
+            // shapes need at least two dimensions.
+            if (input.dim() < 2 || output_shape.size() < 2) {
+                throw_shape_error("2D filter interpolate requires rank >= 2");
+            }
+            // The loop below indexes input.shape() with output indices, so the
+            // output may not have more dimensions than the input.
+            if (output_shape.size() > static_cast<size_t>(input.dim())) {
+                throw_shape_error("2D filter interpolate requires output rank <= input rank");
+            }
+            for (size_t i = 2; i < output_shape.size(); ++i) {
+                if (input.shape()[i] != output_shape[i]) {
+                    throw_shape_error("2D filter interpolate only supports resizing dimensions 0 and 1");
+                }
+            }
+
+            const int64_t input_width  = input.shape()[0];
+            const int64_t input_height = input.shape()[1];
+            // An empty plane would divide by zero below and leave no valid
+            // source index to clamp to.
+            if (input_width <= 0 || input_height <= 0) {
+                throw_shape_error("2D filter interpolate requires a non-empty input plane");
+            }
+
+            Tensor<T> output(output_shape);
+            const int64_t output_width  = output_shape[0];
+            const int64_t output_height = output_shape[1];
+            const int64_t input_plane   = input_width * input_height;
+            const int64_t output_plane  = output_width * output_height;
+            const int64_t plane_count   = input.numel() / input_plane;
+
+            // Guards the writes below when output_shape has fewer dimensions
+            // than the input and the dropped dimensions are not all 1.
+            if (static_cast<int64_t>(output.numel()) != plane_count * output_plane) {
+                throw_shape_error("2D filter interpolate output cannot hold all input planes");
+            }
+
+            // The filter is separable: weights are computed once per axis and
+            // combined per output pixel.
+            auto x_contributors = make_interpolate_contributors(input_width, output_width, mode, antialias);
+            auto y_contributors = make_interpolate_contributors(input_height, output_height, mode, antialias);
+
+            for (int64_t plane = 0; plane < plane_count; ++plane) {
+                const int64_t input_plane_offset  = plane * input_plane;
+                const int64_t output_plane_offset = plane * output_plane;
+                for (int64_t y = 0; y < output_height; ++y) {
+                    const auto& y_axis = y_contributors[static_cast<size_t>(y)];
+                    for (int64_t x = 0; x < output_width; ++x) {
+                        const auto& x_axis = x_contributors[static_cast<size_t>(x)];
+                        // Accumulate in double regardless of T.
+                        double value = 0.0;
+                        for (const auto& yc : y_axis) {
+                            const int64_t input_row_offset = input_plane_offset + yc.index * input_width;
+                            for (const auto& xc : x_axis) {
+                                value += static_cast<double>(input.data()[input_row_offset + xc.index]) *
+                                         xc.weight * yc.weight;
+                            }
+                        }
+                        output.data()[output_plane_offset + y * output_width + x] = static_cast<T>(value);
+                    }
+                }
+            }
+
+            return output;
+        }
+
        inline int64_t normalize_slice_bound(int64_t index, int64_t dim_size) {
            if (index < 0) {
                index += dim_size;
@ -1014,17 +1224,20 @@ namespace sd {
        inline Tensor<T> interpolate(const Tensor<T>& input,
                                     std::vector<int64_t> output_shape,
                                     InterpolateMode mode = InterpolateMode::Nearest,
-                                     bool align_corners   = false) {
-            const bool is_nearest_like_mode = (mode == InterpolateMode::Nearest ||
-                                               mode == InterpolateMode::NearestMax ||
-                                               mode == InterpolateMode::NearestMin ||
-                                               mode == InterpolateMode::NearestAvg);
-            if (!is_nearest_like_mode) {
-                tensor_throw_invalid_argument("Only nearest-like interpolate modes are implemented, got mode=" +
+                                     bool align_corners   = false,
+                                     bool antialias       = false) {
+            const bool is_nearest_like_mode = is_nearest_like_interpolate_mode(mode);
+            const bool is_2d_filter_mode    = is_2d_filter_interpolate_mode(mode);
+            if (!is_nearest_like_mode && !is_2d_filter_mode) {
+                tensor_throw_invalid_argument("Unsupported interpolate mode: mode=" +
+                                              std::to_string(static_cast<int>(mode)));
+            }
+            if (antialias && !is_2d_filter_mode) {
+                tensor_throw_invalid_argument("Tensor interpolate antialias requires a 2D filter mode: mode=" +
                                              std::to_string(static_cast<int>(mode)));
            }
            if (align_corners) {
-                tensor_throw_invalid_argument("align_corners is not supported for nearest-like interpolate: input_shape=" +
+                tensor_throw_invalid_argument("align_corners is not supported for tensor interpolate: input_shape=" +
                                              tensor_shape_to_string(input.shape()) + ", output_shape=" +
                                              tensor_shape_to_string(output_shape));
            }
@ -1051,6 +1264,10 @@ namespace sd {
                }
            }

+            if (is_2d_filter_mode) {
+                return interpolate_2d_filter(input, output_shape, mode, antialias);
+            }
+
            bool has_downsampling = false;
            for (int64_t i = 0; i < input.dim(); ++i) {
                if (input.shape()[i] > output_shape[i]) {
@ -1060,13 +1277,21 @@ namespace sd {
            }

            Tensor<T> output(std::move(output_shape));
-            if (mode == InterpolateMode::Nearest || !has_downsampling) {
+            if (mode == InterpolateMode::Nearest ||
+                mode == InterpolateMode::NearestExact ||
+                !has_downsampling) {
                for (int64_t flat = 0; flat < output.numel(); ++flat) {
                    std::vector<int64_t> output_coord = tensor_unravel_index(flat, output.shape());
                    std::vector<int64_t> input_coord(static_cast<size_t>(input.dim()), 0);
                    for (size_t i = 0; i < static_cast<size_t>(input.dim()); ++i) {
+                        if (mode == InterpolateMode::NearestExact) {
+                            input_coord[i] = nearest_exact_interpolate_index(output_coord[i],
+                                                                             input.shape()[i],
+                                                                             output.shape()[i]);
+                        } else {
                            input_coord[i] = output_coord[i] * input.shape()[i] / output.shape()[i];
                        }
+                    }
                    output[flat] = input.index(input_coord);
                }

@ -1083,6 +1308,12 @@ namespace sd {
                        return T(0);
                    case InterpolateMode::Nearest:
                        return T(0);
+                    case InterpolateMode::NearestExact:
+                        return T(0);
+                    case InterpolateMode::Bilinear:
+                    case InterpolateMode::Bicubic:
+                    case InterpolateMode::Lanczos:
+                        break;
                }

                tensor_throw_invalid_argument("Unsupported interpolate mode: mode=" +
@ -1102,6 +1333,12 @@ namespace sd {
                        break;
                    case InterpolateMode::Nearest:
                        break;
+                    case InterpolateMode::NearestExact:
+                        break;
+                    case InterpolateMode::Bilinear:
+                    case InterpolateMode::Bicubic:
+                    case InterpolateMode::Lanczos:
+                        break;
                }
            };

@ -1157,17 +1394,20 @@ namespace sd {
                                     const std::optional<std::vector<int64_t>>& size,
                                     const std::optional<std::vector<double>>& scale_factor,
                                     InterpolateMode mode = InterpolateMode::Nearest,
-                                     bool align_corners   = false) {
-            const bool is_nearest_like_mode = (mode == InterpolateMode::Nearest ||
-                                               mode == InterpolateMode::NearestMax ||
-                                               mode == InterpolateMode::NearestMin ||
-                                               mode == InterpolateMode::NearestAvg);
-            if (!is_nearest_like_mode) {
-                tensor_throw_invalid_argument("Only nearest-like interpolate modes are implemented, got mode=" +
+                                     bool align_corners   = false,
+                                     bool antialias       = false) {
+            const bool is_nearest_like_mode = is_nearest_like_interpolate_mode(mode);
+            const bool is_2d_filter_mode    = is_2d_filter_interpolate_mode(mode);
+            if (!is_nearest_like_mode && !is_2d_filter_mode) {
+                tensor_throw_invalid_argument("Unsupported interpolate mode: mode=" +
+                                              std::to_string(static_cast<int>(mode)));
+            }
+            if (antialias && !is_2d_filter_mode) {
+                tensor_throw_invalid_argument("Tensor interpolate antialias requires a 2D filter mode: mode=" +
                                              std::to_string(static_cast<int>(mode)));
            }
            if (align_corners) {
-                tensor_throw_invalid_argument("align_corners is not supported for nearest-like interpolate: input_shape=" +
+                tensor_throw_invalid_argument("align_corners is not supported for tensor interpolate: input_shape=" +
                                              tensor_shape_to_string(input.shape()));
            }
            if (size.has_value() == scale_factor.has_value()) {
@ -1211,7 +1451,7 @@ namespace sd {
                }
            }

-            return interpolate(input, std::move(output_shape), mode, align_corners);
+            return interpolate(input, std::move(output_shape), mode, align_corners, antialias);
        }

        template <typename T>
@ -1219,12 +1459,14 @@ namespace sd {
                                     const std::optional<std::vector<int64_t>>& size,
                                     double scale_factor,
                                     InterpolateMode mode = InterpolateMode::Nearest,
-                                     bool align_corners   = false) {
+                                     bool align_corners   = false,
+                                     bool antialias       = false) {
            return interpolate(input,
                               size,
                               std::vector<double>(size.has_value() ? size->size() : input.dim(), scale_factor),
                               mode,
-                               align_corners);
+                               align_corners,
+                               antialias);
        }

        template <typename T>
@ -1419,4 +1661,4 @@ namespace sd {

 }  // namespace sd

-#endif
+#endif  // __SD_CORE_TENSOR_HPP__
--- a/src/core/tensor_ggml.hpp
+++ b/src/core/tensor_ggml.hpp
@ -1,5 +1,5 @@
-#ifndef __SD_TENSOR_GGML_HPP__
-#define __SD_TENSOR_GGML_HPP__
+#ifndef __SD_CORE_TENSOR_GGML_HPP__
+#define __SD_CORE_TENSOR_GGML_HPP__

 #include <array>
 #include <cstring>
@ -8,8 +8,8 @@
 #include <string>
 #include <type_traits>

+#include "core/tensor.hpp"
 #include "ggml.h"
-#include "tensor.hpp"

 namespace sd {

@ -104,7 +104,7 @@ namespace sd {
            throw std::invalid_argument("tensor file type does not match requested sd::Tensor type");
        }

-        std::vector<int64_t> shape(4, 1);
+        std::vector<int64_t> shape(n_dims, 1);
        for (int i = 0; i < n_dims; ++i) {
            int32_t dim = 1;
            file.read(reinterpret_cast<char*>(&dim), sizeof(dim));
@ -124,4 +124,4 @@ namespace sd {

 }  // namespace sd

-#endif
+#endif  // __SD_CORE_TENSOR_GGML_HPP__
--- a/src/core/util.cpp
+++ b/src/core/util.cpp
@ -1,8 +1,10 @@
-#include "util.h"
+#include "core/util.h"
 #include <algorithm>
+#include <cctype>
 #include <cmath>
 #include <codecvt>
 #include <cstdarg>
+#include <exception>
 #include <fstream>
 #include <locale>
 #include <regex>
@ -11,7 +13,7 @@
 #include <thread>
 #include <unordered_set>
 #include <vector>
-#include "preprocessing.hpp"
+#include "runtime/preprocessing.hpp"

 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/sysctl.h>
@ -23,7 +25,6 @@
 #include <unistd.h>
 #endif

-#include "ggml-cpu.h"
 #include "ggml.h"
 #include "stable-diffusion.h"

@ -111,7 +112,7 @@ private:
    HANDLE hmapping_;
 };

-std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
+std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename, bool writable) {
    void* mapped_data = nullptr;
    size_t file_size  = 0;

@ -119,10 +120,10 @@ std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
        filename.c_str(),
        GENERIC_READ,
        FILE_SHARE_READ,
-        NULL,
+        nullptr,
        OPEN_EXISTING,
        FILE_ATTRIBUTE_NORMAL,
-        NULL);
+        nullptr);

    if (file_handle == INVALID_HANDLE_VALUE) {
        return nullptr;
@ -136,16 +137,20 @@ std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {

    file_size = static_cast<size_t>(size.QuadPart);

-    HANDLE mapping_handle = CreateFileMapping(file_handle, NULL, PAGE_READONLY, 0, 0, NULL);
+    DWORD page_prot = writable ? PAGE_WRITECOPY : PAGE_READONLY;

-    if (mapping_handle == NULL) {
+    HANDLE mapping_handle = CreateFileMapping(file_handle, nullptr, page_prot, 0, 0, nullptr);
+
+    if (mapping_handle == nullptr) {
        CloseHandle(file_handle);
        return nullptr;
    }

-    mapped_data = MapViewOfFile(mapping_handle, FILE_MAP_READ, 0, 0, file_size);
+    DWORD view_access = writable ? FILE_MAP_COPY : FILE_MAP_READ;

-    if (mapped_data == NULL) {
+    mapped_data = MapViewOfFile(mapping_handle, view_access, 0, 0, file_size);
+
+    if (mapped_data == nullptr) {
        CloseHandle(mapping_handle);
        CloseHandle(file_handle);
        return nullptr;
@ -171,28 +176,85 @@ bool is_directory(const std::string& path) {
    return (stat(path.c_str(), &buffer) == 0 && S_ISDIR(buffer.st_mode));
 }

-class MmapWrapperImpl : public MmapWrapper {
-public:
-    MmapWrapperImpl(void* data, size_t size)
-        : MmapWrapper(data, size) {}
-
-    ~MmapWrapperImpl() override {
-        munmap(data_, size_);
-    }
+// Optional mmap tuning switches selected through the SD_MMAP_FLAGS environment
+// variable (see get_mmap_flags(), which value-initializes every switch to
+// false). The switches are only consulted inside `#ifdef __linux__` sections.
+struct MmapFlags {
+    bool sequential;  // posix_fadvise(POSIX_FADV_SEQUENTIAL) on the file before mapping
+    bool populate;    // add MAP_POPULATE to the mmap() flags
+    bool willneed;    // posix_madvise(POSIX_MADV_WILLNEED) on the new mapping
+    bool dontneed;    // at unmap time: madvise(MADV_DONTNEED) + posix_fadvise(POSIX_FADV_DONTNEED)
 };

-std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
+// Parses the SD_MMAP_FLAGS environment variable into an MmapFlags value.
+//
+// The variable is a comma-separated list of tokens. Each token is trimmed with
+// trim() and compared case-insensitively against "sequential", "populate",
+// "willneed" and "dontneed"; unrecognized tokens are silently ignored. When the
+// variable is unset or empty, every flag is false.
+static MmapFlags get_mmap_flags() {
+    MmapFlags result      = {};
+    const char* env_value = std::getenv("SD_MMAP_FLAGS");
+    if (env_value && *env_value) {
+        std::stringstream ss(env_value);
+        std::string token;
+        while (std::getline(ss, token, ',')) {
+            std::string ntoken = trim(token);
+            // Go through unsigned char: calling tolower() with a negative char
+            // value (any non-ASCII byte in the environment) is undefined
+            // behaviour. Same pattern as parse_strict_bool().
+            std::transform(ntoken.begin(), ntoken.end(), ntoken.begin(), [](unsigned char c) {
+                return static_cast<char>(std::tolower(c));
+            });
+            if (ntoken == "sequential") {
+                result.sequential = true;
+            } else if (ntoken == "populate") {
+                result.populate = true;
+            } else if (ntoken == "willneed") {
+                result.willneed = true;
+            } else if (ntoken == "dontneed") {
+                result.dontneed = true;
+            }
+        }
+    }
+    return result;
+}
+
+// POSIX implementation of MmapWrapper. It owns both the mapping and the file
+// descriptor it was created from: the descriptor is kept open for the lifetime
+// of the object so the destructor can still issue posix_fadvise() on it, and
+// it is closed there after the mapping is released.
+class MmapWrapperImpl : public MmapWrapper {
+public:
+    // data/size describe an existing mapping; fd is the open descriptor of the
+    // mapped file. Ownership of all three passes to this object.
+    MmapWrapperImpl(void* data, size_t size, int fd)
+        : MmapWrapper(data, size), fd_(fd) {}
+
+    ~MmapWrapperImpl() override {
+#ifdef __linux__
+        // SD_MMAP_FLAGS is re-read here rather than cached at creation time.
+        auto cfg_flags = get_mmap_flags();
+
+        // Drop the kernel pagecache pages for this file. madvise(DONTNEED)
+        // alone only unmaps from the process address space; pagecache
+        // entries persist (`free` reports them as buff/cache).
+        // posix_fadvise(POSIX_FADV_DONTNEED) is the documented way to ask
+        // the kernel to evict pagecache for a specific fd's pages.
+        // NOTE(review): the original rationale also stated that these cached
+        // pages count against overcommit and can starve other allocations on
+        // tight-RAM systems; that is not verifiable from this code - confirm.
+        // Both calls are best-effort: their return values are ignored.
+        if (cfg_flags.dontneed) {
+            madvise(data_, size_, MADV_DONTNEED);
+            posix_fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED);
+        }
+#endif
+        munmap(data_, size_);
+        close(fd_);
+    }
+
+private:
+    int fd_;  // descriptor of the mapped file; closed in the destructor
+};
+
+std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename, bool writable) {
    int file_descriptor = open(filename.c_str(), O_RDONLY);
    if (file_descriptor == -1) {
        return nullptr;
    }

+    auto cfg_flags = get_mmap_flags();
+
    int mmap_flags = MAP_PRIVATE;

 #ifdef __linux__
-    // performance flags used by llama.cpp
-    // posix_fadvise(file_descriptor, 0, 0, POSIX_FADV_SEQUENTIAL);
-    // mmap_flags |= MAP_POPULATE;
+    // Sequential access hint helps the kernel read-ahead efficiently and
+    // also encourages eviction of already-read pages (the kernel keeps
+    // a smaller working set when this is set).
+    if (cfg_flags.sequential) {
+        posix_fadvise(file_descriptor, 0, 0, POSIX_FADV_SEQUENTIAL);
+    }
+    if (cfg_flags.populate) {
+        mmap_flags |= MAP_POPULATE;
+    }
 #endif

    struct stat sb;
@ -203,20 +265,27 @@ std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {

    size_t file_size = sb.st_size;

-    void* mapped_data = mmap(NULL, file_size, PROT_READ, mmap_flags, file_descriptor, 0);
-
+    if (file_size == 0) {
        close(file_descriptor);
+        return nullptr;
+    }
+
+    int mmap_prot = PROT_READ | (writable ? PROT_WRITE : 0);
+
+    void* mapped_data = mmap(nullptr, file_size, mmap_prot, mmap_flags, file_descriptor, 0);

    if (mapped_data == MAP_FAILED) {
+        close(file_descriptor);
        return nullptr;
    }

 #ifdef __linux__
-    // performance flags used by llama.cpp
-    // posix_madvise(mapped_data, file_size, POSIX_MADV_WILLNEED);
+    if (cfg_flags.willneed) {
+        posix_madvise(mapped_data, file_size, POSIX_MADV_WILLNEED);
+    }
 #endif

-    return std::make_unique<MmapWrapperImpl>(mapped_data, file_size);
+    return std::make_unique<MmapWrapperImpl>(mapped_data, file_size, file_descriptor);
 }

 #endif
@ -337,7 +406,89 @@ std::vector<std::string> split_string(const std::string& str, char delimiter) {
    return result;
 }

-static std::string build_progress_bar(int step, int steps) {
+// Splits a "key=value" list into (key, value) pairs.
+//
+// Entries are separated by ',' or ';'. Each entry, key and value is trimmed
+// with trim(). Empty entries are skipped; entries without '=' are reported via
+// LOG_WARN (labelled with `context`, or "key=value arg" when context is null)
+// and dropped. Only the first '=' splits an entry, so values may contain '='.
+// A null or empty `args` yields an empty result.
+KeyValueArgs parse_key_value_args(const char* args, const char* context) {
+    KeyValueArgs result;
+    if (args == nullptr || *args == '\0') {
+        return result;
+    }
+
+    const std::string input(args);
+    size_t entry_begin = 0;
+    // `<=` so that the (possibly empty) entry after the last delimiter is
+    // visited exactly once.
+    while (entry_begin <= input.size()) {
+        size_t entry_end = input.find_first_of(",;", entry_begin);
+        if (entry_end == std::string::npos) {
+            entry_end = input.size();
+        }
+
+        const std::string entry = trim(input.substr(entry_begin, entry_end - entry_begin));
+        entry_begin             = entry_end + 1;
+        if (entry.empty()) {
+            continue;
+        }
+
+        const size_t separator = entry.find('=');
+        if (separator == std::string::npos) {
+            const char* log_context = context ? context : "key=value arg";
+            LOG_WARN("ignoring malformed %s '%s'", log_context, entry.c_str());
+            continue;
+        }
+
+        std::string key   = trim(entry.substr(0, separator));
+        std::string value = trim(entry.substr(separator + 1));
+        result.emplace_back(std::move(key), std::move(value));
+    }
+
+    return result;
+}
+
+// Convenience overload: forwards to the C-string version of
+// parse_key_value_args(). Because the text is passed as args.c_str(), parsing
+// stops at the first embedded NUL character, if any.
+KeyValueArgs parse_key_value_args(const std::string& args, const char* context) {
+    return parse_key_value_args(args.c_str(), context);
+}
+
+// Parses `text` as a float using std::stof and stores it in `value`.
+//
+// Returns true only when the number is followed by nothing but characters that
+// trim() removes. On any failure (std::stof throws, or trailing garbage is
+// present) returns false and leaves `value` untouched.
+bool parse_strict_float(const std::string& text, float& value) {
+    try {
+        size_t end_pos     = 0;
+        const float number = std::stof(text, &end_pos);
+        // Reject inputs such as "1.5abc": only trimmable characters may follow.
+        const bool fully_consumed = trim(text.substr(end_pos)).empty();
+        if (fully_consumed) {
+            value = number;
+        }
+        return fully_consumed;
+    } catch (const std::exception&) {
+        return false;
+    }
+}
+
+// Parses `text` as an int using std::stoi and stores it in `value`.
+//
+// Returns true only when the number is followed by nothing but characters that
+// trim() removes. On any failure (std::stoi throws, or trailing garbage is
+// present) returns false and leaves `value` untouched.
+bool parse_strict_int(const std::string& text, int& value) {
+    try {
+        size_t end_pos   = 0;
+        const int number = std::stoi(text, &end_pos);
+        // Reject inputs such as "12px": only trimmable characters may follow.
+        const bool fully_consumed = trim(text.substr(end_pos)).empty();
+        if (fully_consumed) {
+            value = number;
+        }
+        return fully_consumed;
+    } catch (const std::exception&) {
+        return false;
+    }
+}
+
+// Parses a boolean keyword from `text` (trimmed, case-insensitive).
+//
+// Accepts "1", "true", "yes", "on" as true and "0", "false", "no", "off" as
+// false. Returns whether a keyword was recognized; `value` is only written on
+// success.
+bool parse_strict_bool(const std::string& text, bool& value) {
+    std::string normalized = trim(text);
+    for (char& ch : normalized) {
+        // tolower() must be fed an unsigned char value.
+        ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
+    }
+
+    static const char* const true_words[]  = {"1", "true", "yes", "on"};
+    static const char* const false_words[] = {"0", "false", "no", "off"};
+
+    for (const char* word : true_words) {
+        if (normalized == word) {
+            value = true;
+            return true;
+        }
+    }
+    for (const char* word : false_words) {
+        if (normalized == word) {
+            value = false;
+            return true;
+        }
+    }
+    return false;
+}
+
+static std::string build_progress_bar(int step, int steps, char progress_char = '=', bool show_head = true) {
    std::string progress = "  |";
    int max_progress     = 50;
    int32_t current      = 0;
@ -347,21 +498,21 @@ static std::string build_progress_bar(int step, int steps) {
    for (int i = 0; i < 50; i++) {
        if (i > current) {
            progress += " ";
-        } else if (i == current && i != max_progress - 1) {
+        } else if (show_head && i == current && i != max_progress - 1) {
            progress += ">";
        } else {
-            progress += "=";
+            progress += progress_char;
        }
    }
    progress += "|";
    return progress;
 }

-static void print_progress_line(int step, int steps, const std::string& speed_text) {
+static void print_progress_line(int step, int steps, const std::string& speed_text, char progress_char = '=', bool show_head = true) {
    if (step == 0) {
        return;
    }
-    std::string progress = build_progress_bar(step, steps);
+    std::string progress = build_progress_bar(step, steps, progress_char, show_head);
    const char* lf       = (step == steps ? "\n" : "");
    printf("\r%s %i/%i - %s\033[K%s", progress.c_str(), step, steps, speed_text.c_str(), lf);
    fflush(stdout);  // for linux
@ -401,9 +552,9 @@ void pretty_bytes_progress(int step, int steps, uint64_t bytes_processed, float

    double speed_mb = bytes_per_second / (1024.0 * 1024.0);
    if (speed_mb >= 1024.0) {
-        print_progress_line(step, steps, sd_format("%.2fGB/s", speed_mb / 1024.0));
+        print_progress_line(step, steps, sd_format("%.2fGB/s", speed_mb / 1024.0), '#', false);
    } else {
-        print_progress_line(step, steps, sd_format("%.2fMB/s", speed_mb));
+        print_progress_line(step, steps, sd_format("%.2fMB/s", speed_mb), '#', false);
    }
 }

@ -495,26 +646,6 @@ sd_progress_cb_t sd_get_progress_callback() {
 void* sd_get_progress_callback_data() {
    return sd_progress_cb_data;
 }
-const char* sd_get_system_info() {
-    static char buffer[1024];
-    std::stringstream ss;
-    ss << "System Info: \n";
-    ss << "    SSE3 = " << ggml_cpu_has_sse3() << " | ";
-    ss << "    AVX = " << ggml_cpu_has_avx() << " | ";
-    ss << "    AVX2 = " << ggml_cpu_has_avx2() << " | ";
-    ss << "    AVX512 = " << ggml_cpu_has_avx512() << " | ";
-    ss << "    AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << " | ";
-    ss << "    AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << " | ";
-    ss << "    FMA = " << ggml_cpu_has_fma() << " | ";
-    ss << "    NEON = " << ggml_cpu_has_neon() << " | ";
-    ss << "    ARM_FMA = " << ggml_cpu_has_arm_fma() << " | ";
-    ss << "    F16C = " << ggml_cpu_has_f16c() << " | ";
-    ss << "    FP16_VA = " << ggml_cpu_has_fp16_va() << " | ";
-    ss << "    WASM_SIMD = " << ggml_cpu_has_wasm_simd() << " | ";
-    ss << "    VSX = " << ggml_cpu_has_vsx() << " | ";
-    snprintf(buffer, sizeof(buffer), "%s", ss.str().c_str());
-    return buffer;
-}

 sd_image_t tensor_to_sd_image(const sd::Tensor<float>& tensor, int frame_index) {
    const auto& shape = tensor.shape();
@ -524,17 +655,7 @@ sd_image_t tensor_to_sd_image(const sd::Tensor<float>& tensor, int frame_index)
    int channel   = static_cast<int>(shape[shape.size() == 5 ? 3 : 2]);
    uint8_t* data = (uint8_t*)malloc(static_cast<size_t>(width * height * channel));
    GGML_ASSERT(data != nullptr);
-
-    for (int iw = 0; iw < width; ++iw) {
-        for (int ih = 0; ih < height; ++ih) {
-            for (int ic = 0; ic < channel; ++ic) {
-                float value                            = shape.size() == 5 ? tensor.index(iw, ih, frame_index, ic, 0)
-                                                                           : tensor.index(iw, ih, ic, frame_index);
-                value                                  = std::clamp(value, 0.0f, 1.0f);
-                data[(ih * width + iw) * channel + ic] = static_cast<uint8_t>(std::round(value * 255.0f));
-            }
-        }
-    }
+    preprocessing_tensor_frame_to_sd_image(tensor, frame_index, data);
    return {
        static_cast<uint32_t>(width),
        static_cast<uint32_t>(height),
@ -718,3 +839,136 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str

    return res;
 }
+
+// Returns the byte length of the UTF-8 sequence announced by lead byte `c`
+// (1-4). Bytes that are not a valid lead byte (continuation bytes 0x80-0xBF
+// and 0xF8-0xFF) are reported as length 1 so callers always make progress.
+static size_t get_utf8_char_len(char c) {
+    const unsigned char lead = static_cast<unsigned char>(c);
+    if (lead < 0x80) {  // 0xxxxxxx
+        return 1;
+    }
+    if (lead >= 0xC0 && lead <= 0xDF) {  // 110xxxxx
+        return 2;
+    }
+    if (lead >= 0xE0 && lead <= 0xEF) {  // 1110xxxx
+        return 3;
+    }
+    if (lead >= 0xF0 && lead <= 0xF7) {  // 11110xxx
+        return 4;
+    }
+    return 1;
+}
+
+// True when `c` is an ASCII letter (a-z or A-Z); locale-independent.
+static bool is_ascii_alpha(char c) {
+    const bool is_lower = 'a' <= c && c <= 'z';
+    const bool is_upper = 'A' <= c && c <= 'Z';
+    return is_lower || is_upper;
+}
+
+// True when `needle` occurs in `text` starting exactly at byte offset `pos`.
+// Returns false when fewer than needle.size() bytes remain from `pos`.
+static bool starts_with_at(const std::string& text, size_t pos, const std::string& needle) {
+    if (pos + needle.size() > text.size()) {
+        return false;
+    }
+    return text.compare(pos, needle.size(), needle) == 0;
+}
+
+// True when the byte at `pos` has an ASCII letter immediately before and
+// immediately after it, i.e. an apostrophe inside a word such as "don't".
+// The byte at `pos` itself is not inspected.
+static bool is_word_internal_apostrophe(const std::string& text, size_t pos) {
+    const bool has_both_neighbours = pos > 0 && pos + 1 < text.size();
+    if (!has_both_neighbours) {
+        return false;
+    }
+    return is_ascii_alpha(text[pos - 1]) && is_ascii_alpha(text[pos + 1]);
+}
+
+// Splits `text` into consecutive segments, each tagged with whether it is a
+// quoted span. Concatenating the returned strings in order reproduces `text`.
+//
+// Recognized quote pairs: ASCII '...' and "...", plus the UTF-8 encoded
+// U+2018/U+2019 and U+201C/U+201D curly pairs. A quoted segment (flag = true)
+// includes its opening and closing delimiters. Text between quoted spans is
+// returned with flag = false; empty segments are never emitted.
+//
+// An ASCII apostrophe with ASCII letters on both sides (as in "don't") is
+// treated neither as an opening nor as a closing quote. An opening quote with
+// no matching close is left as plain text.
+static std::vector<std::pair<std::string, bool>> split_quotation(const std::string& text) {
+    // {opening delimiter, closing delimiter}; tried in this order at each position.
+    static const std::vector<std::pair<std::string, std::string>> quote_pairs = {
+        {"'", "'"},
+        {"\"", "\""},
+        {"\xE2\x80\x98", "\xE2\x80\x99"},
+        {"\xE2\x80\x9C", "\xE2\x80\x9D"},
+    };
+
+    std::vector<std::pair<std::string, bool>> result;
+    size_t segment_start = 0;  // start of the pending unquoted segment
+    size_t i             = 0;  // current scan position (byte offset)
+
+    // Appends text[begin, end) with the given flag, skipping empty ranges.
+    auto push_segment = [&](size_t begin, size_t end, bool matched) {
+        if (end > begin) {
+            result.emplace_back(text.substr(begin, end - begin), matched);
+        }
+    };
+
+    while (i < text.size()) {
+        bool matched_quote = false;
+        for (const auto& quote_pair : quote_pairs) {
+            const std::string& open_quote  = quote_pair.first;
+            const std::string& close_quote = quote_pair.second;
+            if (!starts_with_at(text, i, open_quote)) {
+                continue;
+            }
+            if (open_quote == "'" && is_word_internal_apostrophe(text, i)) {
+                continue;
+            }
+
+            // Scan forward for the closing delimiter of this pair.
+            size_t search_pos = i + open_quote.size();
+            size_t close_pos  = std::string::npos;
+            bool invalid      = false;
+            while (search_pos < text.size()) {
+                // For asymmetric pairs, a second opening delimiter before the
+                // close disqualifies this candidate.
+                if (open_quote != close_quote && starts_with_at(text, search_pos, open_quote)) {
+                    invalid = true;
+                    break;
+                }
+                if (starts_with_at(text, search_pos, close_quote)) {
+                    // Skip apostrophes inside words; they do not close the quote.
+                    if (close_quote == "'" && is_word_internal_apostrophe(text, search_pos)) {
+                        search_pos += close_quote.size();
+                        continue;
+                    }
+                    close_pos = search_pos;
+                    break;
+                }
+
+                // Advance by one UTF-8 character; fall back to a single byte
+                // when the sequence would run past the end of the text.
+                size_t char_len = get_utf8_char_len(text[search_pos]);
+                if (search_pos + char_len > text.size()) {
+                    char_len = 1;
+                }
+                search_pos += char_len;
+            }
+            if (invalid || close_pos == std::string::npos) {
+                continue;
+            }
+
+            // Flush the unquoted text before the quote, then the quoted span
+            // itself (delimiters included), and resume after the closing quote.
+            size_t quote_start = i;
+            push_segment(segment_start, quote_start, false);
+            i = close_pos + close_quote.size();
+            push_segment(quote_start, i, true);
+            segment_start = i;
+            matched_quote = true;
+            break;
+        }
+        if (!matched_quote) {
+            // No quote starts here: step over one UTF-8 character.
+            size_t char_len = get_utf8_char_len(text[i]);
+            if (i + char_len > text.size()) {
+                char_len = 1;
+            }
+            i += char_len;
+        }
+    }
+
+    // Trailing unquoted text, if any.
+    push_segment(segment_start, text.size(), false);
+    return result;
+}
+
+// Post-processes the output of parse_prompt_attention(): every (text, weight)
+// item is split with split_quotation(). Unquoted segments are passed through
+// as one entry each; quoted segments (delimiters included) are expanded into
+// one entry per UTF-8 character. Every produced entry keeps the weight of the
+// item it came from.
+std::vector<std::pair<std::string, float>> split_quotation_attention(
+    const std::vector<std::pair<std::string, float>>& parsed_attention) {
+    std::vector<std::pair<std::string, float>> expanded;
+    for (const auto& entry : parsed_attention) {
+        const float entry_weight = entry.second;
+        for (const auto& segment : split_quotation(entry.first)) {
+            const std::string& piece = segment.first;
+            const bool is_quoted     = segment.second;
+            if (!is_quoted) {
+                expanded.emplace_back(piece, entry_weight);
+                continue;
+            }
+
+            // Quoted text: emit character by character.
+            for (size_t offset = 0; offset < piece.size();) {
+                size_t len = get_utf8_char_len(piece[offset]);
+                // A truncated multi-byte sequence is consumed one byte at a time.
+                if (offset + len > piece.size()) {
+                    len = 1;
+                }
+                expanded.emplace_back(piece.substr(offset, len), entry_weight);
+                offset += len;
+            }
+        }
+    }
+    return expanded;
+}
--- a/src/core/util.h
+++ b/src/core/util.h
@ -1,13 +1,15 @@
-#ifndef __UTIL_H__
-#define __UTIL_H__
+#ifndef __SD_CORE_UTIL_H__
+#define __SD_CORE_UTIL_H__

 #include <cstdint>
 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>

+#include "core/tensor.hpp"
+#include "ggml-backend.h"
 #include "stable-diffusion.h"
-#include "tensor.hpp"

 #define SAFE_STR(s) ((s) ? (s) : "")
 #define BOOL_STR(b) ((b) ? "true" : "false")
@ -41,7 +43,7 @@ sd::Tensor<float> clip_preprocess(const sd::Tensor<float>& image, int target_wid

 class MmapWrapper {
 public:
-    static std::unique_ptr<MmapWrapper> create(const std::string& filename);
+    static std::unique_ptr<MmapWrapper> create(const std::string& filename, bool writable = false);

    virtual ~MmapWrapper() = default;

@ -51,6 +53,7 @@ public:
    MmapWrapper& operator=(MmapWrapper&&)      = delete;

    const uint8_t* data() const { return static_cast<uint8_t*>(data_); }
+    uint8_t* writable_data() { return static_cast<uint8_t*>(data_); }
    size_t size() const { return size_; }
    bool copy_data(void* buf, size_t n, size_t offset) const;

@ -63,6 +66,15 @@ protected:

 std::string path_join(const std::string& p1, const std::string& p2);
 std::vector<std::string> split_string(const std::string& str, char delimiter);
+
+using KeyValueArgs = std::vector<std::pair<std::string, std::string>>;
+
+KeyValueArgs parse_key_value_args(const char* args, const char* context = "key=value arg");
+KeyValueArgs parse_key_value_args(const std::string& args, const char* context = "key=value arg");
+bool parse_strict_float(const std::string& text, float& value);
+bool parse_strict_int(const std::string& text, int& value);
+bool parse_strict_bool(const std::string& text, bool& value);
+
 void pretty_progress(int step, int steps, float time);
 void pretty_bytes_progress(int step, int steps, uint64_t bytes_processed, float elapsed_seconds);

@ -71,6 +83,8 @@ void log_printf(sd_log_level_t level, const char* file, int line, const char* fo
 std::string trim(const std::string& s);

 std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text);
+std::vector<std::pair<std::string, float>> split_quotation_attention(
+    const std::vector<std::pair<std::string, float>>& parsed_attention);

 sd_progress_cb_t sd_get_progress_callback();
 void* sd_get_progress_callback_data();
@ -82,8 +96,11 @@ int sd_get_preview_interval();
 bool sd_should_preview_denoised();
 bool sd_should_preview_noisy();

+// test if the backend is a specific one, e.g. "CUDA", "ROCm", "Vulkan" etc.
+bool sd_backend_is(ggml_backend_t backend, const std::string& name);
+
 #define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__)
 #define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__)
 #define LOG_WARN(format, ...) log_printf(SD_LOG_WARN, __FILE__, __LINE__, format, ##__VA_ARGS__)
 #define LOG_ERROR(format, ...) log_printf(SD_LOG_ERROR, __FILE__, __LINE__, format, ##__VA_ARGS__)
-#endif  // __UTIL_H__
+#endif  // __SD_CORE_UTIL_H__
--- a/src/diffusion_model.hpp
+++ b/src/diffusion_model.hpp
@ -1,582 +0,0 @@
-#ifndef __DIFFUSION_MODEL_H__
-#define __DIFFUSION_MODEL_H__
-
-#include <optional>
-#include "anima.hpp"
-#include "ernie_image.hpp"
-#include "flux.hpp"
-#include "mmdit.hpp"
-#include "qwen_image.hpp"
-#include "tensor_ggml.hpp"
-#include "unet.hpp"
-#include "wan.hpp"
-#include "z_image.hpp"
-
-struct DiffusionParams {
-    const sd::Tensor<float>* x                        = nullptr;
-    const sd::Tensor<float>* timesteps                = nullptr;
-    const sd::Tensor<float>* context                  = nullptr;
-    const sd::Tensor<float>* c_concat                 = nullptr;
-    const sd::Tensor<float>* y                        = nullptr;
-    const sd::Tensor<int32_t>* t5_ids                 = nullptr;
-    const sd::Tensor<float>* t5_weights               = nullptr;
-    const sd::Tensor<float>* guidance                 = nullptr;
-    const std::vector<sd::Tensor<float>>* ref_latents = nullptr;
-    bool increase_ref_index                           = false;
-    int num_video_frames                              = -1;
-    const std::vector<sd::Tensor<float>>* controls    = nullptr;
-    float control_strength                            = 0.f;
-    const sd::Tensor<float>* vace_context             = nullptr;
-    float vace_strength                               = 1.f;
-    const std::vector<int>* skip_layers               = nullptr;
-};
-
-template <typename T>
-static inline const sd::Tensor<T>& tensor_or_empty(const sd::Tensor<T>* tensor) {
-    static const sd::Tensor<T> kEmpty;
-    return tensor != nullptr ? *tensor : kEmpty;
-}
-
-struct DiffusionModel {
-    virtual std::string get_desc()                                               = 0;
-    virtual sd::Tensor<float> compute(int n_threads,
-                                      const DiffusionParams& diffusion_params)   = 0;
-    virtual void alloc_params_buffer()                                           = 0;
-    virtual void free_params_buffer()                                            = 0;
-    virtual void free_compute_buffer()                                           = 0;
-    virtual void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) = 0;
-    virtual size_t get_params_buffer_size()                                      = 0;
-    virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter){};
-    virtual int64_t get_adm_in_channels()                            = 0;
-    virtual void set_flash_attention_enabled(bool enabled)           = 0;
-    virtual void set_circular_axes(bool circular_x, bool circular_y) = 0;
-};
-
-struct UNetModel : public DiffusionModel {
-    UNetModelRunner unet;
-
-    UNetModel(ggml_backend_t backend,
-              bool offload_params_to_cpu,
-              const String2TensorStorage& tensor_storage_map = {},
-              SDVersion version                              = VERSION_SD1)
-        : unet(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version) {
-    }
-
-    std::string get_desc() override {
-        return unet.get_desc();
-    }
-
-    void alloc_params_buffer() override {
-        unet.alloc_params_buffer();
-    }
-
-    void free_params_buffer() override {
-        unet.free_params_buffer();
-    }
-
-    void free_compute_buffer() override {
-        unet.free_compute_buffer();
-    }
-
-    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
-        unet.get_param_tensors(tensors, "model.diffusion_model");
-    }
-
-    size_t get_params_buffer_size() override {
-        return unet.get_params_buffer_size();
-    }
-
-    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
-        unet.set_weight_adapter(adapter);
-    }
-
-    int64_t get_adm_in_channels() override {
-        return unet.unet.adm_in_channels;
-    }
-
-    void set_flash_attention_enabled(bool enabled) {
-        unet.set_flash_attention_enabled(enabled);
-    }
-
-    void set_circular_axes(bool circular_x, bool circular_y) override {
-        unet.set_circular_axes(circular_x, circular_y);
-    }
-
-    sd::Tensor<float> compute(int n_threads,
-                              const DiffusionParams& diffusion_params) override {
-        GGML_ASSERT(diffusion_params.x != nullptr);
-        GGML_ASSERT(diffusion_params.timesteps != nullptr);
-        static const std::vector<sd::Tensor<float>> empty_controls;
-        return unet.compute(n_threads,
-                            *diffusion_params.x,
-                            *diffusion_params.timesteps,
-                            tensor_or_empty(diffusion_params.context),
-                            tensor_or_empty(diffusion_params.c_concat),
-                            tensor_or_empty(diffusion_params.y),
-                            diffusion_params.num_video_frames,
-                            diffusion_params.controls ? *diffusion_params.controls : empty_controls,
-                            diffusion_params.control_strength);
-    }
-};
-
-struct MMDiTModel : public DiffusionModel {
-    MMDiTRunner mmdit;
-
-    MMDiTModel(ggml_backend_t backend,
-               bool offload_params_to_cpu,
-               const String2TensorStorage& tensor_storage_map = {})
-        : mmdit(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model") {
-    }
-
-    std::string get_desc() override {
-        return mmdit.get_desc();
-    }
-
-    void alloc_params_buffer() override {
-        mmdit.alloc_params_buffer();
-    }
-
-    void free_params_buffer() override {
-        mmdit.free_params_buffer();
-    }
-
-    void free_compute_buffer() override {
-        mmdit.free_compute_buffer();
-    }
-
-    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
-        mmdit.get_param_tensors(tensors, "model.diffusion_model");
-    }
-
-    size_t get_params_buffer_size() override {
-        return mmdit.get_params_buffer_size();
-    }
-
-    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
-        mmdit.set_weight_adapter(adapter);
-    }
-
-    int64_t get_adm_in_channels() override {
-        return 768 + 1280;
-    }
-
-    void set_flash_attention_enabled(bool enabled) {
-        mmdit.set_flash_attention_enabled(enabled);
-    }
-
-    void set_circular_axes(bool circular_x, bool circular_y) override {
-        mmdit.set_circular_axes(circular_x, circular_y);
-    }
-
-    sd::Tensor<float> compute(int n_threads,
-                              const DiffusionParams& diffusion_params) override {
-        GGML_ASSERT(diffusion_params.x != nullptr);
-        GGML_ASSERT(diffusion_params.timesteps != nullptr);
-        static const std::vector<int> empty_skip_layers;
-        return mmdit.compute(n_threads,
-                             *diffusion_params.x,
-                             *diffusion_params.timesteps,
-                             tensor_or_empty(diffusion_params.context),
-                             tensor_or_empty(diffusion_params.y),
-                             diffusion_params.skip_layers ? *diffusion_params.skip_layers : empty_skip_layers);
-    }
-};
-
-struct FluxModel : public DiffusionModel {
-    Flux::FluxRunner flux;
-
-    FluxModel(ggml_backend_t backend,
-              bool offload_params_to_cpu,
-              const String2TensorStorage& tensor_storage_map = {},
-              SDVersion version                              = VERSION_FLUX,
-              bool use_mask                                  = false)
-        : flux(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version, use_mask) {
-    }
-
-    std::string get_desc() override {
-        return flux.get_desc();
-    }
-
-    void alloc_params_buffer() override {
-        flux.alloc_params_buffer();
-    }
-
-    void free_params_buffer() override {
-        flux.free_params_buffer();
-    }
-
-    void free_compute_buffer() override {
-        flux.free_compute_buffer();
-    }
-
-    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
-        flux.get_param_tensors(tensors, "model.diffusion_model");
-    }
-
-    size_t get_params_buffer_size() override {
-        return flux.get_params_buffer_size();
-    }
-
-    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
-        flux.set_weight_adapter(adapter);
-    }
-
-    int64_t get_adm_in_channels() override {
-        return 768;
-    }
-
-    void set_flash_attention_enabled(bool enabled) {
-        flux.set_flash_attention_enabled(enabled);
-    }
-
-    void set_circular_axes(bool circular_x, bool circular_y) override {
-        flux.set_circular_axes(circular_x, circular_y);
-    }
-
-    sd::Tensor<float> compute(int n_threads,
-                              const DiffusionParams& diffusion_params) override {
-        GGML_ASSERT(diffusion_params.x != nullptr);
-        GGML_ASSERT(diffusion_params.timesteps != nullptr);
-        static const std::vector<sd::Tensor<float>> empty_ref_latents;
-        static const std::vector<int> empty_skip_layers;
-        return flux.compute(n_threads,
-                            *diffusion_params.x,
-                            *diffusion_params.timesteps,
-                            tensor_or_empty(diffusion_params.context),
-                            tensor_or_empty(diffusion_params.c_concat),
-                            tensor_or_empty(diffusion_params.y),
-                            tensor_or_empty(diffusion_params.guidance),
-                            diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents,
-                            diffusion_params.increase_ref_index,
-                            diffusion_params.skip_layers ? *diffusion_params.skip_layers : empty_skip_layers);
-    }
-};
-
-struct AnimaModel : public DiffusionModel {
-    std::string prefix;
-    Anima::AnimaRunner anima;
-
-    AnimaModel(ggml_backend_t backend,
-               bool offload_params_to_cpu,
-               const String2TensorStorage& tensor_storage_map = {},
-               const std::string prefix                       = "model.diffusion_model")
-        : prefix(prefix), anima(backend, offload_params_to_cpu, tensor_storage_map, prefix) {
-    }
-
-    std::string get_desc() override {
-        return anima.get_desc();
-    }
-
-    void alloc_params_buffer() override {
-        anima.alloc_params_buffer();
-    }
-
-    void free_params_buffer() override {
-        anima.free_params_buffer();
-    }
-
-    void free_compute_buffer() override {
-        anima.free_compute_buffer();
-    }
-
-    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
-        anima.get_param_tensors(tensors, prefix);
-    }
-
-    size_t get_params_buffer_size() override {
-        return anima.get_params_buffer_size();
-    }
-
-    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
-        anima.set_weight_adapter(adapter);
-    }
-
-    int64_t get_adm_in_channels() override {
-        return 768;
-    }
-
-    void set_flash_attention_enabled(bool enabled) {
-        anima.set_flash_attention_enabled(enabled);
-    }
-
-    void set_circular_axes(bool circular_x, bool circular_y) override {
-        anima.set_circular_axes(circular_x, circular_y);
-    }
-
-    sd::Tensor<float> compute(int n_threads,
-                              const DiffusionParams& diffusion_params) override {
-        GGML_ASSERT(diffusion_params.x != nullptr);
-        GGML_ASSERT(diffusion_params.timesteps != nullptr);
-        return anima.compute(n_threads,
-                             *diffusion_params.x,
-                             *diffusion_params.timesteps,
-                             tensor_or_empty(diffusion_params.context),
-                             tensor_or_empty(diffusion_params.t5_ids),
-                             tensor_or_empty(diffusion_params.t5_weights));
-    }
-};
-
-struct WanModel : public DiffusionModel {
-    std::string prefix;
-    WAN::WanRunner wan;
-
-    WanModel(ggml_backend_t backend,
-             bool offload_params_to_cpu,
-             const String2TensorStorage& tensor_storage_map = {},
-             const std::string prefix                       = "model.diffusion_model",
-             SDVersion version                              = VERSION_WAN2)
-        : prefix(prefix), wan(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) {
-    }
-
-    std::string get_desc() override {
-        return wan.get_desc();
-    }
-
-    void alloc_params_buffer() override {
-        wan.alloc_params_buffer();
-    }
-
-    void free_params_buffer() override {
-        wan.free_params_buffer();
-    }
-
-    void free_compute_buffer() override {
-        wan.free_compute_buffer();
-    }
-
-    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
-        wan.get_param_tensors(tensors, prefix);
-    }
-
-    size_t get_params_buffer_size() override {
-        return wan.get_params_buffer_size();
-    }
-
-    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
-        wan.set_weight_adapter(adapter);
-    }
-
-    int64_t get_adm_in_channels() override {
-        return 768;
-    }
-
-    void set_flash_attention_enabled(bool enabled) {
-        wan.set_flash_attention_enabled(enabled);
-    }
-
-    void set_circular_axes(bool circular_x, bool circular_y) override {
-        wan.set_circular_axes(circular_x, circular_y);
-    }
-
-    sd::Tensor<float> compute(int n_threads,
-                              const DiffusionParams& diffusion_params) override {
-        GGML_ASSERT(diffusion_params.x != nullptr);
-        GGML_ASSERT(diffusion_params.timesteps != nullptr);
-        return wan.compute(n_threads,
-                           *diffusion_params.x,
-                           *diffusion_params.timesteps,
-                           tensor_or_empty(diffusion_params.context),
-                           tensor_or_empty(diffusion_params.y),
-                           tensor_or_empty(diffusion_params.c_concat),
-                           sd::Tensor<float>(),
-                           tensor_or_empty(diffusion_params.vace_context),
-                           diffusion_params.vace_strength);
-    }
-};
-
-struct QwenImageModel : public DiffusionModel {
-    std::string prefix;
-    Qwen::QwenImageRunner qwen_image;
-
-    QwenImageModel(ggml_backend_t backend,
-                   bool offload_params_to_cpu,
-                   const String2TensorStorage& tensor_storage_map = {},
-                   const std::string prefix                       = "model.diffusion_model",
-                   SDVersion version                              = VERSION_QWEN_IMAGE,
-                   bool zero_cond_t                               = false)
-        : prefix(prefix), qwen_image(backend, offload_params_to_cpu, tensor_storage_map, prefix, version, zero_cond_t) {
-    }
-
-    std::string get_desc() override {
-        return qwen_image.get_desc();
-    }
-
-    void alloc_params_buffer() override {
-        qwen_image.alloc_params_buffer();
-    }
-
-    void free_params_buffer() override {
-        qwen_image.free_params_buffer();
-    }
-
-    void free_compute_buffer() override {
-        qwen_image.free_compute_buffer();
-    }
-
-    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
-        qwen_image.get_param_tensors(tensors, prefix);
-    }
-
-    size_t get_params_buffer_size() override {
-        return qwen_image.get_params_buffer_size();
-    }
-
-    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
-        qwen_image.set_weight_adapter(adapter);
-    }
-
-    int64_t get_adm_in_channels() override {
-        return 768;
-    }
-
-    void set_flash_attention_enabled(bool enabled) {
-        qwen_image.set_flash_attention_enabled(enabled);
-    }
-
-    void set_circular_axes(bool circular_x, bool circular_y) override {
-        qwen_image.set_circular_axes(circular_x, circular_y);
-    }
-
-    sd::Tensor<float> compute(int n_threads,
-                              const DiffusionParams& diffusion_params) override {
-        GGML_ASSERT(diffusion_params.x != nullptr);
-        GGML_ASSERT(diffusion_params.timesteps != nullptr);
-        static const std::vector<sd::Tensor<float>> empty_ref_latents;
-        return qwen_image.compute(n_threads,
-                                  *diffusion_params.x,
-                                  *diffusion_params.timesteps,
-                                  tensor_or_empty(diffusion_params.context),
-                                  diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents,
-                                  true);
-    }
-};
-
-struct ZImageModel : public DiffusionModel {
-    std::string prefix;
-    ZImage::ZImageRunner z_image;
-
-    ZImageModel(ggml_backend_t backend,
-                bool offload_params_to_cpu,
-                const String2TensorStorage& tensor_storage_map = {},
-                const std::string prefix                       = "model.diffusion_model",
-                SDVersion version                              = VERSION_Z_IMAGE)
-        : prefix(prefix), z_image(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) {
-    }
-
-    std::string get_desc() override {
-        return z_image.get_desc();
-    }
-
-    void alloc_params_buffer() override {
-        z_image.alloc_params_buffer();
-    }
-
-    void free_params_buffer() override {
-        z_image.free_params_buffer();
-    }
-
-    void free_compute_buffer() override {
-        z_image.free_compute_buffer();
-    }
-
-    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
-        z_image.get_param_tensors(tensors, prefix);
-    }
-
-    size_t get_params_buffer_size() override {
-        return z_image.get_params_buffer_size();
-    }
-
-    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
-        z_image.set_weight_adapter(adapter);
-    }
-
-    int64_t get_adm_in_channels() override {
-        return 768;
-    }
-
-    void set_flash_attention_enabled(bool enabled) {
-        z_image.set_flash_attention_enabled(enabled);
-    }
-
-    void set_circular_axes(bool circular_x, bool circular_y) override {
-        z_image.set_circular_axes(circular_x, circular_y);
-    }
-
-    sd::Tensor<float> compute(int n_threads,
-                              const DiffusionParams& diffusion_params) override {
-        GGML_ASSERT(diffusion_params.x != nullptr);
-        GGML_ASSERT(diffusion_params.timesteps != nullptr);
-        static const std::vector<sd::Tensor<float>> empty_ref_latents;
-        return z_image.compute(n_threads,
-                               *diffusion_params.x,
-                               *diffusion_params.timesteps,
-                               tensor_or_empty(diffusion_params.context),
-                               diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents,
-                               true);
-    }
-};
-
-struct ErnieImageModel : public DiffusionModel {
-    std::string prefix;
-    ErnieImage::ErnieImageRunner ernie_image;
-
-    ErnieImageModel(ggml_backend_t backend,
-                    bool offload_params_to_cpu,
-                    const String2TensorStorage& tensor_storage_map = {},
-                    const std::string prefix                       = "model.diffusion_model")
-        : prefix(prefix), ernie_image(backend, offload_params_to_cpu, tensor_storage_map, prefix) {
-    }
-
-    std::string get_desc() override {
-        return ernie_image.get_desc();
-    }
-
-    void alloc_params_buffer() override {
-        ernie_image.alloc_params_buffer();
-    }
-
-    void free_params_buffer() override {
-        ernie_image.free_params_buffer();
-    }
-
-    void free_compute_buffer() override {
-        ernie_image.free_compute_buffer();
-    }
-
-    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
-        ernie_image.get_param_tensors(tensors, prefix);
-    }
-
-    size_t get_params_buffer_size() override {
-        return ernie_image.get_params_buffer_size();
-    }
-
-    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
-        ernie_image.set_weight_adapter(adapter);
-    }
-
-    int64_t get_adm_in_channels() override {
-        return 768;
-    }
-
-    void set_flash_attention_enabled(bool enabled) {
-        ernie_image.set_flash_attention_enabled(enabled);
-    }
-
-    void set_circular_axes(bool circular_x, bool circular_y) override {
-        ernie_image.set_circular_axes(circular_x, circular_y);
-    }
-
-    sd::Tensor<float> compute(int n_threads,
-                              const DiffusionParams& diffusion_params) override {
-        GGML_ASSERT(diffusion_params.x != nullptr);
-        GGML_ASSERT(diffusion_params.timesteps != nullptr);
-        return ernie_image.compute(n_threads,
-                                   *diffusion_params.x,
-                                   *diffusion_params.timesteps,
-                                   tensor_or_empty(diffusion_params.context));
-    }
-};
-
-#endif
--- a/src/esrgan.hpp
+++ b/src/esrgan.hpp
@ -1,365 +0,0 @@
-#ifndef __ESRGAN_HPP__
-#define __ESRGAN_HPP__
-
-#include "ggml_extend.hpp"
-#include "model.h"
-
-/*
-    ===================================    ESRGAN  ===================================
-    References:
-    https://github.com/xinntao/Real-ESRGAN/blob/master/inference_realesrgan.py
-    https://github.com/XPixelGroup/BasicSR/blob/v1.4.2/basicsr/archs/rrdbnet_arch.py
-
-*/
-
-class ResidualDenseBlock : public GGMLBlock {
-protected:
-    int num_feat;
-    int num_grow_ch;
-
-public:
-    ResidualDenseBlock(int num_feat = 64, int num_grow_ch = 32)
-        : num_feat(num_feat), num_grow_ch(num_grow_ch) {
-        blocks["conv1"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
-        blocks["conv2"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
-        blocks["conv3"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 2 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
-        blocks["conv4"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 3 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
-        blocks["conv5"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 4 * num_grow_ch, num_feat, {3, 3}, {1, 1}, {1, 1}));
-    }
-
-    ggml_tensor* lrelu(GGMLRunnerContext* ctx, ggml_tensor* x) {
-        return ggml_leaky_relu(ctx->ggml_ctx, x, 0.2f, true);
-    }
-
-    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
-        // x: [n, num_feat, h, w]
-        // return: [n, num_feat, h, w]
-
-        auto conv1 = std::dynamic_pointer_cast<Conv2d>(blocks["conv1"]);
-        auto conv2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv2"]);
-        auto conv3 = std::dynamic_pointer_cast<Conv2d>(blocks["conv3"]);
-        auto conv4 = std::dynamic_pointer_cast<Conv2d>(blocks["conv4"]);
-        auto conv5 = std::dynamic_pointer_cast<Conv2d>(blocks["conv5"]);
-
-        auto x1    = lrelu(ctx, conv1->forward(ctx, x));
-        auto x_cat = ggml_concat(ctx->ggml_ctx, x, x1, 2);
-        auto x2    = lrelu(ctx, conv2->forward(ctx, x_cat));
-        x_cat      = ggml_concat(ctx->ggml_ctx, x_cat, x2, 2);
-        auto x3    = lrelu(ctx, conv3->forward(ctx, x_cat));
-        x_cat      = ggml_concat(ctx->ggml_ctx, x_cat, x3, 2);
-        auto x4    = lrelu(ctx, conv4->forward(ctx, x_cat));
-        x_cat      = ggml_concat(ctx->ggml_ctx, x_cat, x4, 2);
-        auto x5    = conv5->forward(ctx, x_cat);
-
-        x5 = ggml_add(ctx->ggml_ctx, ggml_ext_scale(ctx->ggml_ctx, x5, 0.2f), x);
-        return x5;
-    }
-};
-
-class RRDB : public GGMLBlock {
-public:
-    RRDB(int num_feat, int num_grow_ch = 32) {
-        blocks["rdb1"] = std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch));
-        blocks["rdb2"] = std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch));
-        blocks["rdb3"] = std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch));
-    }
-
-    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
-        // x: [n, num_feat, h, w]
-        // return: [n, num_feat, h, w]
-
-        auto rdb1 = std::dynamic_pointer_cast<ResidualDenseBlock>(blocks["rdb1"]);
-        auto rdb2 = std::dynamic_pointer_cast<ResidualDenseBlock>(blocks["rdb2"]);
-        auto rdb3 = std::dynamic_pointer_cast<ResidualDenseBlock>(blocks["rdb3"]);
-
-        auto out = rdb1->forward(ctx, x);
-        out      = rdb2->forward(ctx, out);
-        out      = rdb3->forward(ctx, out);
-
-        out = ggml_add(ctx->ggml_ctx, ggml_ext_scale(ctx->ggml_ctx, out, 0.2f), x);
-        return out;
-    }
-};
-
-class RRDBNet : public GGMLBlock {
-protected:
-    int scale       = 4;
-    int num_block   = 23;
-    int num_in_ch   = 3;
-    int num_out_ch  = 3;
-    int num_feat    = 64;
-    int num_grow_ch = 32;
-
-public:
-    RRDBNet(int scale, int num_block, int num_in_ch, int num_out_ch, int num_feat, int num_grow_ch)
-        : scale(scale), num_block(num_block), num_in_ch(num_in_ch), num_out_ch(num_out_ch), num_feat(num_feat), num_grow_ch(num_grow_ch) {
-        blocks["conv_first"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_in_ch, num_feat, {3, 3}, {1, 1}, {1, 1}));
-        for (int i = 0; i < num_block; i++) {
-            std::string name = "body." + std::to_string(i);
-            blocks[name]     = std::shared_ptr<GGMLBlock>(new RRDB(num_feat, num_grow_ch));
-        }
-        blocks["conv_body"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
-        if (scale >= 2) {
-            blocks["conv_up1"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
-        }
-        if (scale == 4) {
-            blocks["conv_up2"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
-        }
-        blocks["conv_hr"]   = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
-        blocks["conv_last"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_out_ch, {3, 3}, {1, 1}, {1, 1}));
-    }
-
-    int get_scale() { return scale; }
-    int get_num_block() { return num_block; }
-
-    ggml_tensor* lrelu(GGMLRunnerContext* ctx, ggml_tensor* x) {
-        return ggml_leaky_relu(ctx->ggml_ctx, x, 0.2f, true);
-    }
-
-    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
-        // x: [n, num_in_ch, h, w]
-        // return: [n, num_out_ch, h*scale, w*scale]
-        auto conv_first = std::dynamic_pointer_cast<Conv2d>(blocks["conv_first"]);
-        auto conv_body  = std::dynamic_pointer_cast<Conv2d>(blocks["conv_body"]);
-        auto conv_hr    = std::dynamic_pointer_cast<Conv2d>(blocks["conv_hr"]);
-        auto conv_last  = std::dynamic_pointer_cast<Conv2d>(blocks["conv_last"]);
-
-        auto feat      = conv_first->forward(ctx, x);
-        auto body_feat = feat;
-        for (int i = 0; i < num_block; i++) {
-            std::string name = "body." + std::to_string(i);
-            auto block       = std::dynamic_pointer_cast<RRDB>(blocks[name]);
-
-            body_feat = block->forward(ctx, body_feat);
-        }
-        body_feat = conv_body->forward(ctx, body_feat);
-        feat      = ggml_add(ctx->ggml_ctx, feat, body_feat);
-        // upsample
-        if (scale >= 2) {
-            auto conv_up1 = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up1"]);
-            feat          = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx->ggml_ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
-            if (scale == 4) {
-                auto conv_up2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up2"]);
-                feat          = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx->ggml_ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
-            }
-        }
-        // for all scales
-        auto out = conv_last->forward(ctx, lrelu(ctx, conv_hr->forward(ctx, feat)));
-        return out;
-    }
-};
-
-struct ESRGAN : public GGMLRunner {
-    std::unique_ptr<RRDBNet> rrdb_net;
-    int scale     = 4;
-    int tile_size = 128;  // avoid cuda OOM for 4gb VRAM
-
-    ESRGAN(ggml_backend_t backend,
-           bool offload_params_to_cpu,
-           int tile_size                                  = 128,
-           const String2TensorStorage& tensor_storage_map = {})
-        : GGMLRunner(backend, offload_params_to_cpu) {
-        this->tile_size = tile_size;
-    }
-
-    std::string get_desc() override {
-        return "esrgan";
-    }
-
-    bool load_from_file(const std::string& file_path, int n_threads) {
-        LOG_INFO("loading esrgan from '%s'", file_path.c_str());
-
-        ModelLoader model_loader;
-        if (!model_loader.init_from_file_and_convert_name(file_path)) {
-            LOG_ERROR("init esrgan model loader from file failed: '%s'", file_path.c_str());
-            return false;
-        }
-
-        // Get tensor names
-        auto tensor_names = model_loader.get_tensor_names();
-
-        // Detect if it's ESRGAN format
-        bool is_ESRGAN = std::find(tensor_names.begin(), tensor_names.end(), "model.0.weight") != tensor_names.end();
-
-        // Detect parameters from tensor names
-        int detected_num_block = 0;
-        if (is_ESRGAN) {
-            for (const auto& name : tensor_names) {
-                if (name.find("model.1.sub.") == 0) {
-                    size_t first_dot = name.find('.', 12);
-                    if (first_dot != std::string::npos) {
-                        size_t second_dot = name.find('.', first_dot + 1);
-                        if (second_dot != std::string::npos && name.substr(first_dot + 1, 3) == "RDB") {
-                            try {
-                                int idx            = std::stoi(name.substr(12, first_dot - 12));
-                                detected_num_block = std::max(detected_num_block, idx + 1);
-                            } catch (...) {
-                            }
-                        }
-                    }
-                }
-            }
-        } else {
-            // Original format
-            for (const auto& name : tensor_names) {
-                if (name.find("body.") == 0) {
-                    size_t pos = name.find('.', 5);
-                    if (pos != std::string::npos) {
-                        try {
-                            int idx            = std::stoi(name.substr(5, pos - 5));
-                            detected_num_block = std::max(detected_num_block, idx + 1);
-                        } catch (...) {
-                        }
-                    }
-                }
-            }
-        }
-
-        int detected_scale = 4;  // default
-        if (is_ESRGAN) {
-            // For ESRGAN format, detect scale by highest model number
-            int max_model_num = 0;
-            for (const auto& name : tensor_names) {
-                if (name.find("model.") == 0) {
-                    size_t dot_pos = name.find('.', 6);
-                    if (dot_pos != std::string::npos) {
-                        try {
-                            int num       = std::stoi(name.substr(6, dot_pos - 6));
-                            max_model_num = std::max(max_model_num, num);
-                        } catch (...) {
-                        }
-                    }
-                }
-            }
-            if (max_model_num <= 4) {
-                detected_scale = 1;
-            } else if (max_model_num <= 7) {
-                detected_scale = 2;
-            } else {
-                detected_scale = 4;
-            }
-        } else {
-            // Original format
-            bool has_conv_up2 = std::any_of(tensor_names.begin(), tensor_names.end(), [](const std::string& name) {
-                return name == "conv_up2.weight";
-            });
-            bool has_conv_up1 = std::any_of(tensor_names.begin(), tensor_names.end(), [](const std::string& name) {
-                return name == "conv_up1.weight";
-            });
-            if (has_conv_up2) {
-                detected_scale = 4;
-            } else if (has_conv_up1) {
-                detected_scale = 2;
-            } else {
-                detected_scale = 1;
-            }
-        }
-
-        int detected_num_in_ch   = 3;
-        int detected_num_out_ch  = 3;
-        int detected_num_feat    = 64;
-        int detected_num_grow_ch = 32;
-
-        // Create RRDBNet with detected parameters
-        rrdb_net = std::make_unique<RRDBNet>(detected_scale, detected_num_block, detected_num_in_ch, detected_num_out_ch, detected_num_feat, detected_num_grow_ch);
-        rrdb_net->init(params_ctx, {}, "");
-
-        alloc_params_buffer();
-        std::map<std::string, ggml_tensor*> esrgan_tensors;
-        rrdb_net->get_param_tensors(esrgan_tensors);
-
-        bool success;
-        if (is_ESRGAN) {
-            // Build name mapping for ESRGAN format
-            std::map<std::string, std::string> expected_to_model;
-            expected_to_model["conv_first.weight"] = "model.0.weight";
-            expected_to_model["conv_first.bias"]   = "model.0.bias";
-
-            for (int i = 0; i < detected_num_block; i++) {
-                for (int j = 1; j <= 3; j++) {
-                    for (int k = 1; k <= 5; k++) {
-                        std::string expected_weight        = "body." + std::to_string(i) + ".rdb" + std::to_string(j) + ".conv" + std::to_string(k) + ".weight";
-                        std::string model_weight           = "model.1.sub." + std::to_string(i) + ".RDB" + std::to_string(j) + ".conv" + std::to_string(k) + ".0.weight";
-                        expected_to_model[expected_weight] = model_weight;
-
-                        std::string expected_bias        = "body." + std::to_string(i) + ".rdb" + std::to_string(j) + ".conv" + std::to_string(k) + ".bias";
-                        std::string model_bias           = "model.1.sub." + std::to_string(i) + ".RDB" + std::to_string(j) + ".conv" + std::to_string(k) + ".0.bias";
-                        expected_to_model[expected_bias] = model_bias;
-                    }
-                }
-            }
-
-            if (detected_scale == 1) {
-                expected_to_model["conv_body.weight"] = "model.1.sub." + std::to_string(detected_num_block) + ".weight";
-                expected_to_model["conv_body.bias"]   = "model.1.sub." + std::to_string(detected_num_block) + ".bias";
-                expected_to_model["conv_hr.weight"]   = "model.2.weight";
-                expected_to_model["conv_hr.bias"]     = "model.2.bias";
-                expected_to_model["conv_last.weight"] = "model.4.weight";
-                expected_to_model["conv_last.bias"]   = "model.4.bias";
-            } else {
-                expected_to_model["conv_body.weight"] = "model.1.sub." + std::to_string(detected_num_block) + ".weight";
-                expected_to_model["conv_body.bias"]   = "model.1.sub." + std::to_string(detected_num_block) + ".bias";
-                if (detected_scale >= 2) {
-                    expected_to_model["conv_up1.weight"] = "model.3.weight";
-                    expected_to_model["conv_up1.bias"]   = "model.3.bias";
-                }
-                if (detected_scale == 4) {
-                    expected_to_model["conv_up2.weight"]  = "model.6.weight";
-                    expected_to_model["conv_up2.bias"]    = "model.6.bias";
-                    expected_to_model["conv_hr.weight"]   = "model.8.weight";
-                    expected_to_model["conv_hr.bias"]     = "model.8.bias";
-                    expected_to_model["conv_last.weight"] = "model.10.weight";
-                    expected_to_model["conv_last.bias"]   = "model.10.bias";
-                } else if (detected_scale == 2) {
-                    expected_to_model["conv_hr.weight"]   = "model.5.weight";
-                    expected_to_model["conv_hr.bias"]     = "model.5.bias";
-                    expected_to_model["conv_last.weight"] = "model.7.weight";
-                    expected_to_model["conv_last.bias"]   = "model.7.bias";
-                }
-            }
-
-            std::map<std::string, ggml_tensor*> model_tensors;
-            for (auto& p : esrgan_tensors) {
-                auto it = expected_to_model.find(p.first);
-                if (it != expected_to_model.end()) {
-                    model_tensors[it->second] = p.second;
-                }
-            }
-
-            success = model_loader.load_tensors(model_tensors, {}, n_threads);
-        } else {
-            success = model_loader.load_tensors(esrgan_tensors, {}, n_threads);
-        }
-
-        if (!success) {
-            LOG_ERROR("load esrgan tensors from model loader failed");
-            return false;
-        }
-
-        scale = rrdb_net->get_scale();
-        LOG_INFO("esrgan model loaded with scale=%d, num_block=%d", scale, detected_num_block);
-        return success;
-    }
-
-    ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor) {
-        if (!rrdb_net)
-            return nullptr;
-        constexpr int kGraphNodes = 1 << 16;  // 65k
-        ggml_cgraph* gf           = new_graph_custom(kGraphNodes);
-        ggml_tensor* x            = make_input(x_tensor);
-
-        auto runner_ctx  = get_context();
-        ggml_tensor* out = rrdb_net->forward(&runner_ctx, x);
-        ggml_build_forward_expand(gf, out);
-        return gf;
-    }
-
-    sd::Tensor<float> compute(const int n_threads,
-                              const sd::Tensor<float>& x) {
-        auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x); };
-        auto result    = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
-        return result;
-    }
-};
-
-#endif  // __ESRGAN_HPP__
--- a/src/extensions/generation_extension.h
+++ b/src/extensions/generation_extension.h
@ -0,0 +1,63 @@
+#ifndef __SD_EXTENSIONS_GENERATION_EXTENSION_H__
+#define __SD_EXTENSIONS_GENERATION_EXTENSION_H__
+
+#include <functional>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "conditioning/conditioner.hpp"
+#include "core/ggml_extend_backend.h"
+#include "model_loader.h"
+#include "model_manager.h"
+#include "stable-diffusion.h"
+
+// Argument bundle passed (by const reference) to GenerationExtension::init().
+// Several members are references or raw pointers, so the struct does not own
+// the objects it refers to.
+struct GenerationExtensionInitContext {
+    const sd_ctx_params_t* params;  // non-owning pointer to the context parameters
+    SDVersion version;
+    const String2TensorStorage& tensor_storage_map;
+    ModelLoader& model_loader;
+    std::shared_ptr<ModelManager> model_manager;
+    int n_threads;
+    // Callbacks keyed by backend module.
+    // NOTE(review): their exact contract is defined by whoever fills in this
+    // struct; from the signatures, ensure_backend_pair reports success as a bool
+    // and the other two hand back a ggml_backend_t for the requested module.
+    std::function<bool(SDBackendModule)> ensure_backend_pair;
+    std::function<ggml_backend_t(SDBackendModule)> backend_for;
+    std::function<ggml_backend_t(SDBackendModule)> params_backend_for;
+};
+
+// Argument bundle passed (by non-const reference) to
+// GenerationExtension::prepare_condition().
+struct GenerationExtensionConditionContext {
+    Conditioner* conditioner;             // non-owning pointer
+    ConditionerParams& condition_params;  // mutable: an extension may modify these parameters
+    const sd_pm_params_t& pm_params;
+    int n_threads;
+    int total_steps;
+};
+
+// Base interface for generation extensions.
+// Only name() is pure virtual; every other hook has a default implementation
+// that does nothing, so an implementation overrides just the hooks it needs.
+struct GenerationExtension {
+    virtual ~GenerationExtension() = default;
+
+    // Identifier of the extension.
+    virtual const char* name() const = 0;
+    // Default: the extension reports itself as disabled.
+    virtual bool is_enabled() const {
+        return false;
+    }
+    // Default: nothing to initialize, reports success.
+    virtual bool init(const GenerationExtensionInitContext&) {
+        return true;
+    }
+    // The following hooks receive a container to fill; the defaults add nothing.
+    virtual void get_param_tensors(std::map<std::string, ggml_tensor*>&) {}
+    virtual void collect_loras(std::vector<ModelManager::LoraSpec>&) {}
+    virtual void add_ignore_tensors(std::set<std::string>&) const {}
+    // Notification / reset hooks; the defaults do nothing.
+    virtual void runner_done() {}
+    virtual void reset_runtime_condition() {}
+    // Default: returns false.
+    // NOTE(review): presumably "false" means the extension prepared no condition
+    // for this request - confirm against the caller.
+    virtual bool prepare_condition(GenerationExtensionConditionContext&) {
+        return false;
+    }
+    // Lets the extension substitute the condition used at `step`.
+    // Default: returns the incoming condition unchanged.
+    virtual const SDCondition& before_condition(int step,
+                                                const SDCondition& condition) const {
+        return condition;
+    }
+};
+
+std::shared_ptr<GenerationExtension> create_photomaker_extension();
+
+#endif
--- a/src/extensions/photomaker_extension.cpp
+++ b/src/extensions/photomaker_extension.cpp
@ -0,0 +1,292 @@
+#include "extensions/generation_extension.h"
+
+#include <algorithm>
+#include <cstring>
+#include <tuple>
+#include <utility>
+
+#include "core/tensor_ggml.hpp"
+#include "core/util.h"
+#include "model/adapter/pmid.hpp"
+
+// Tokenizes `text` for PhotoMaker and expands the class word that precedes the
+// trigger token.
+//
+// A leading BOS / trailing EOS token returned by the tokenizer is dropped
+// first. Every occurrence of `image_token` is then removed from the sequence,
+// and the token directly before it (the class word) is repeated so that it
+// occupies `trigger_token_count` slots. Finally the sequence is padded with
+// tokenizer.pad_tokens() to the text model's n_token.
+//
+// Returns (tokens, weights, class_token_mask). The mask has one entry per
+// padded token and is true for up to `trigger_token_count` positions starting
+// at the (last) class word. When no class word was found - the trigger token is
+// absent, or it is the very first token of the prompt - the mask is all false,
+// so callers can detect that case by searching the mask for `true`.
+static std::tuple<std::vector<int>, std::vector<float>, std::vector<bool>>
+tokenize_photomaker_trigger(FrozenCLIPEmbedderWithCustomWords& clip_conditioner,
+                            const std::string& text,
+                            int trigger_token_count,
+                            int32_t image_token) {
+    auto tokens_and_weights           = clip_conditioner.tokenize(text);
+    std::vector<int> source_tokens    = std::move(tokens_and_weights.first);
+    std::vector<float> source_weights = std::move(tokens_and_weights.second);
+
+    if (!source_tokens.empty() && source_tokens.front() == clip_conditioner.tokenizer.BOS_TOKEN_ID) {
+        source_tokens.erase(source_tokens.begin());
+        source_weights.erase(source_weights.begin());
+    }
+    if (!source_tokens.empty() && source_tokens.back() == clip_conditioner.tokenizer.EOS_TOKEN_ID) {
+        source_tokens.pop_back();
+        source_weights.pop_back();
+    }
+
+    std::vector<int> tokens;
+    std::vector<float> weights;
+    // Index (in the unpadded output) of the last class word; -1 while none was found.
+    int32_t class_idx = -1;
+    for (size_t i = 0; i < source_tokens.size(); i++) {
+        int token = source_tokens[i];
+        if (token == image_token) {
+            // The trigger token itself is never emitted. A trigger with no
+            // preceding token has no class word and is simply dropped.
+            if (!tokens.empty()) {
+                class_idx          = static_cast<int32_t>(tokens.size()) - 1;
+                int class_token    = tokens.back();
+                float class_weight = weights.back();
+                // One copy is already in place, so add trigger_token_count - 1 more.
+                for (int j = 1; j < trigger_token_count; j++) {
+                    tokens.push_back(class_token);
+                    weights.push_back(class_weight);
+                }
+            }
+            continue;
+        }
+        tokens.push_back(token);
+        weights.push_back(source_weights[i]);
+    }
+
+    clip_conditioner.tokenizer.pad_tokens(tokens,
+                                          &weights,
+                                          nullptr,
+                                          clip_conditioner.text_model->model.n_token,
+                                          clip_conditioner.text_model->model.n_token,
+                                          true);
+
+    // Without the class_idx guard a missing trigger (class_idx == -1) would mark
+    // the first trigger_token_count tokens as class tokens.
+    std::vector<bool> class_token_mask(tokens.size(), false);
+    if (class_idx >= 0 && trigger_token_count > 0) {
+        // Mask positions are shifted by one relative to class_idx.
+        // NOTE(review): this assumes pad_tokens() prepends a single BOS token - confirm.
+        const size_t mask_begin = static_cast<size_t>(class_idx) + 1;
+        const size_t mask_end   = std::min(tokens.size(), mask_begin + static_cast<size_t>(trigger_token_count));
+        for (size_t i = mask_begin; i < mask_end; i++) {
+            class_token_mask[i] = true;
+        }
+    }
+
+    return std::make_tuple(std::move(tokens), std::move(weights), std::move(class_token_mask));
+}
+
+// Computes the CLIP condition for a prompt that carries the PhotoMaker trigger
+// word. The trigger word must map to exactly one token id (asserted). The
+// prompt is tokenized with tokenize_photomaker_trigger() and the resulting
+// tokens/weights are fed to get_learned_condition_common().
+//
+// Returns (condition, class-token mask produced by the tokenization step).
+static std::tuple<SDCondition, std::vector<bool>>
+get_photomaker_condition_with_trigger(FrozenCLIPEmbedderWithCustomWords& clip_conditioner,
+                                      int n_threads,
+                                      const ConditionerParams& conditioner_params,
+                                      const std::string& trigger_word,
+                                      int trigger_token_count) {
+    auto trigger_ids = clip_conditioner.convert_token_to_id(trigger_word);
+    GGML_ASSERT(trigger_ids.size() == 1);
+
+    // tokenized = (tokens, weights, class-token mask)
+    auto tokenized = tokenize_photomaker_trigger(clip_conditioner,
+                                                 conditioner_params.text,
+                                                 trigger_token_count,
+                                                 trigger_ids[0]);
+
+    auto condition = clip_conditioner.get_learned_condition_common(n_threads,
+                                                                   std::get<0>(tokenized),
+                                                                   std::get<1>(tokenized),
+                                                                   conditioner_params.clip_skip,
+                                                                   conditioner_params.width,
+                                                                   conditioner_params.height,
+                                                                   conditioner_params.zero_out_masked);
+
+    return std::make_tuple(std::move(condition), std::get<2>(tokenized));
+}
+
+// Returns `prompt` with the first occurrence of the PhotoMaker trigger token
+// removed. The prompt is tokenized, the trigger token is erased and the
+// remaining tokens are decoded back into text.
+//
+// The trigger word must map to exactly one token id (asserted). If the prompt
+// does not contain that token, a warning is logged and the prompt is returned
+// unchanged: prompt text is user input, so it must not abort the process.
+static std::string remove_photomaker_trigger_from_prompt(FrozenCLIPEmbedderWithCustomWords& clip_conditioner,
+                                                         const std::string& prompt,
+                                                         const std::string& trigger_word) {
+    auto image_tokens = clip_conditioner.convert_token_to_id(trigger_word);
+    GGML_ASSERT(image_tokens.size() == 1);
+    auto tokens_and_weights  = clip_conditioner.tokenize(prompt);
+    std::vector<int>& tokens = tokens_and_weights.first;
+    auto it                  = std::find(tokens.begin(), tokens.end(), image_tokens[0]);
+    if (it == tokens.end()) {
+        LOG_WARN("PhotoMaker trigger word '%s' was not found in prompt", trigger_word.c_str());
+        return prompt;
+    }
+    tokens.erase(it);
+    return clip_conditioner.decode(tokens);
+}
+
+// GenerationExtension implementation named "photomaker". It wraps a
+// PhotoMakerIDEncoder and stays inert (enabled == false) until init() has
+// loaded a model file successfully.
+struct PhotoMakerExtension : public GenerationExtension {
+    std::shared_ptr<PhotoMakerIDEncoder> pmid_model;
+    bool enabled = false;  // set to true only at the end of a successful init()
+    std::string model_path;  // copied from params->photo_maker_path in init()
+    std::string trigger_word = "img";
+    SDCondition id_condition;  // condition produced by the last successful prepare_condition()
+    int start_merge_step = -1;  // -1 means "no ID condition prepared"
+
+    const char* name() const override {
+        return "photomaker";
+    }
+
+    bool is_enabled() const override {
+        return enabled;
+    }
+
+    // Loads the PhotoMaker model named by params->photo_maker_path.
+    // Returns false only when ensure_backend_pair() fails. An empty path or a
+    // model file that cannot be loaded leaves the extension disabled but still
+    // returns true.
+    bool init(const GenerationExtensionInitContext& ctx) override {
+        model_path = SAFE_STR(ctx.params->photo_maker_path);
+        if (model_path.empty()) {
+            return true;
+        }
+
+        if (!ctx.ensure_backend_pair(SDBackendModule::PHOTOMAKER)) {
+            return false;
+        }
+
+        // Version 2 is selected when "v2" appears anywhere in the path string
+        // (directory components included), otherwise version 1.
+        PMVersion pm_version = std::strstr(model_path.c_str(), "v2") != nullptr ? PM_VERSION_2 : PM_VERSION_1;
+        LOG_INFO("loading stacked ID embedding (PHOTOMAKER) model file from '%s'", model_path.c_str());
+        if (!ctx.model_loader.init_from_file_and_convert_name(model_path, "pmid.")) {
+            LOG_WARN("loading stacked ID embedding from '%s' failed", model_path.c_str());
+            return true;
+        }
+
+        // NOTE(review): 20.f is passed positionally; presumably an initial style
+        // strength (prepare_condition() overwrites style_strength per request) -
+        // confirm against PhotoMakerIDEncoder's constructor.
+        pmid_model = std::make_shared<PhotoMakerIDEncoder>(ctx.backend_for(SDBackendModule::PHOTOMAKER),
+                                                           ctx.tensor_storage_map,
+                                                           "pmid",
+                                                           ctx.version,
+                                                           pm_version,
+                                                           20.f,
+                                                           ctx.model_manager);
+        if (pm_version == PM_VERSION_2) {
+            LOG_INFO("using PhotoMaker Version 2");
+        }
+
+        enabled = true;
+        return true;
+    }
+
+    // Adds the ID encoder's parameter tensors under the "pmid" prefix.
+    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
+        if (!enabled || pmid_model == nullptr) {
+            return;
+        }
+
+        pmid_model->get_param_tensors(tensors, "pmid");
+    }
+
+    // Registers the PhotoMaker model file itself as a required LoRA with
+    // multiplier 1.0, restricted by the "lora.model" tensor name prefix filter.
+    void collect_loras(std::vector<ModelManager::LoraSpec>& loras) override {
+        if (!enabled || model_path.empty()) {
+            return;
+        }
+        ModelManager::LoraSpec lora;
+        lora.path                      = model_path;
+        lora.multiplier                = 1.0f;
+        lora.tensor_name_prefix_filter = "lora.model";
+        lora.required                  = true;
+        loras.push_back(std::move(lora));
+    }
+
+    // Adds the "pmid.unet." entry to the ignore set when the extension is enabled.
+    void add_ignore_tensors(std::set<std::string>& ignore_tensors) const override {
+        if (!enabled) {
+            return;
+        }
+        ignore_tensors.insert("pmid.unet.");
+    }
+
+    // Forwards the notification to the ID encoder, if one was created.
+    void runner_done() override {
+        if (pmid_model != nullptr) {
+            pmid_model->runner_done();
+        }
+    }
+
+    // Clears the per-request state written by prepare_condition().
+    void reset_runtime_condition() override {
+        id_condition     = {};
+        start_merge_step = -1;
+    }
+
+    // Builds the ID-stacked condition for one request.
+    // Always starts by clearing the previous request's state. Returns true when
+    // id_condition / start_merge_step were set and the trigger word was removed
+    // from ctx.condition_params.text; returns false ("PhotoMaker turned off for
+    // this request") on any missing input or failure, leaving the state cleared.
+    bool prepare_condition(GenerationExtensionConditionContext& ctx) override {
+        reset_runtime_condition();
+        if (!enabled || pmid_model == nullptr) {
+            return false;
+        }
+
+        bool pmv2 = pmid_model->get_version() == PM_VERSION_2;
+        if (ctx.pm_params.id_images_count <= 0 || ctx.pm_params.id_images == nullptr) {
+            LOG_WARN("Provided PhotoMaker model file, but NO input ID images");
+            LOG_WARN("Turn off PhotoMaker for this request");
+            return false;
+        }
+        // The trigger-word helpers need the CLIP-specific conditioner type.
+        auto* clip_conditioner = dynamic_cast<FrozenCLIPEmbedderWithCustomWords*>(ctx.conditioner);
+        if (clip_conditioner == nullptr) {
+            LOG_WARN("PhotoMaker requires FrozenCLIPEmbedderWithCustomWords conditioner");
+            LOG_WARN("Turn off PhotoMaker for this request");
+            return false;
+        }
+
+        // Each ID image is preprocessed to clip_image_size x clip_image_size and
+        // the results are concatenated along dimension index 3.
+        // NOTE(review): presumably the batch dimension - confirm.
+        int clip_image_size        = 224;
+        pmid_model->style_strength = ctx.pm_params.style_strength;
+        sd::Tensor<float> id_image_tensor;
+        for (int i = 0; i < ctx.pm_params.id_images_count; i++) {
+            auto id_image           = sd_image_to_tensor(ctx.pm_params.id_images[i]);
+            auto processed_id_image = clip_preprocess(id_image, clip_image_size, clip_image_size);
+            if (id_image_tensor.empty()) {
+                id_image_tensor = processed_id_image;
+            } else {
+                id_image_tensor = sd::ops::concat(id_image_tensor, processed_id_image, 3);
+            }
+        }
+
+        int64_t t0                        = ggml_time_ms();
+        // PhotoMaker v2 reserves two token slots per ID image, v1 one.
+        int trigger_token_count           = pmv2 ? 2 * ctx.pm_params.id_images_count : ctx.pm_params.id_images_count;
+        auto cond_tup                     = get_photomaker_condition_with_trigger(*clip_conditioner,
+                                                                                  ctx.n_threads,
+                                                                                  ctx.condition_params,
+                                                                                  trigger_word,
+                                                                                  trigger_token_count);
+        SDCondition prepared_id_condition = std::get<0>(cond_tup);
+        auto class_tokens_mask            = std::get<1>(cond_tup);
+        // Relies on the mask being all-false when the trigger word is absent
+        // from the prompt.
+        if (std::find(class_tokens_mask.begin(), class_tokens_mask.end(), true) == class_tokens_mask.end()) {
+            LOG_WARN("PhotoMaker trigger word '%s' was not found in prompt", trigger_word.c_str());
+            LOG_WARN("Turn off PhotoMaker for this request");
+            return false;
+        }
+
+        // ID embeds are only loaded for v2; a load error is treated the same as
+        // a missing file (empty tensor) and reported by the check below.
+        sd::Tensor<float> id_embeds;
+        if (pmv2 && ctx.pm_params.id_embed_path != nullptr) {
+            try {
+                id_embeds = sd::load_tensor_from_file_as_tensor<float>(ctx.pm_params.id_embed_path);
+            } catch (const std::exception&) {
+                id_embeds = {};
+            }
+        }
+        if (pmv2 && id_embeds.empty()) {
+            LOG_WARN("Provided PhotoMaker images, but NO valid ID embeds file for PM v2");
+            LOG_WARN("Turn off PhotoMaker for this request");
+            return false;
+        }
+        // The embeds file must hold one entry per ID image (shape()[1]).
+        if (pmv2 && ctx.pm_params.id_images_count != id_embeds.shape()[1]) {
+            LOG_WARN("PhotoMaker image count (%d) does NOT match ID embeds (%d). You should run face_detect.py again.",
+                     ctx.pm_params.id_images_count,
+                     static_cast<int>(id_embeds.shape()[1]));
+            LOG_WARN("Turn off PhotoMaker for this request");
+            return false;
+        }
+
+        auto res = pmid_model->compute(ctx.n_threads,
+                                       id_image_tensor,
+                                       prepared_id_condition.c_crossattn,
+                                       id_embeds,
+                                       class_tokens_mask);
+        if (res.empty()) {
+            LOG_ERROR("Photomaker ID Stacking failed");
+            LOG_WARN("Turn off PhotoMaker for this request");
+            return false;
+        }
+
+        // Commit the result: the encoder output replaces the cross-attention
+        // part of the condition, and the trigger word is stripped from the
+        // prompt text.
+        prepared_id_condition.c_crossattn = std::move(res);
+        int64_t t1                        = ggml_time_ms();
+        id_condition                      = std::move(prepared_id_condition);
+        // style_strength is interpreted as a percentage of total_steps.
+        start_merge_step                  = int(ctx.pm_params.style_strength / 100.f * ctx.total_steps);
+        ctx.condition_params.text         = remove_photomaker_trigger_from_prompt(*clip_conditioner,
+                                                                                  ctx.condition_params.text,
+                                                                                  trigger_word);
+        LOG_INFO("Photomaker ID Stacking, taking %" PRId64 " ms", t1 - t0);
+        LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step);
+
+        return true;
+    }
+
+    // Returns the prepared ID condition for steps strictly after
+    // start_merge_step; otherwise (or when nothing was prepared) returns the
+    // incoming condition unchanged.
+    const SDCondition& before_condition(int step,
+                                        const SDCondition& condition) const override {
+        if (!id_condition.empty() && start_merge_step != -1 && step > start_merge_step) {
+            return id_condition;
+        }
+        return condition;
+    }
+};
+
+// Factory: creates a fresh PhotoMakerExtension and hands it out through the
+// GenerationExtension interface.
+std::shared_ptr<GenerationExtension> create_photomaker_extension() {
+    std::shared_ptr<GenerationExtension> extension = std::make_shared<PhotoMakerExtension>();
+    return extension;
+}
--- a/src/gits_noise.inl
+++ b/src/gits_noise.inl
@ -1,349 +0,0 @@
-#ifndef GITS_NOISE_INL
-#define GITS_NOISE_INL
-
-const std::vector<std::vector<float>> GITS_NOISE_0_80 = {
-    { 14.61464119f, 7.49001646f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 6.77309084f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 7.49001646f, 3.07277966f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 2.05039096f, 0.02916753f },
-    { 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 2.05039096f, 0.02916753f },
-    { 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 3.07277966f, 1.56271636f, 0.02916753f },
-    { 14.61464119f, 12.96784878f, 11.54541874f, 8.75849152f, 7.49001646f, 5.85520077f, 3.07277966f, 1.56271636f, 0.02916753f },
-    { 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 8.75849152f, 7.49001646f, 5.85520077f, 3.07277966f, 1.56271636f, 0.02916753f },
-    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 8.75849152f, 7.49001646f, 5.85520077f, 3.07277966f, 1.56271636f, 0.02916753f },
-    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 5.85520077f, 3.07277966f, 1.56271636f, 0.02916753f },
-    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.07277966f, 1.56271636f, 0.02916753f },
-    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.07277966f, 1.56271636f, 0.02916753f },
-    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.07277966f, 1.56271636f, 0.02916753f },
-    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.07277966f, 1.56271636f, 0.02916753f },
-    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.75859547f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.19567990f, 1.98035145f, 0.86115354f, 0.02916753f },
-    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.75859547f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.19567990f, 1.98035145f, 0.86115354f, 0.02916753f },
-    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.75859547f, 9.24142551f, 8.75849152f, 8.30717278f, 7.88507891f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.07277966f, 1.84880662f, 0.83188516f, 0.02916753f }
-};
-
-const std::vector<std::vector<float>> GITS_NOISE_0_85 = {
-    { 14.61464119f, 7.49001646f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 1.84880662f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 6.77309084f, 1.56271636f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 7.11996698f, 3.07277966f, 1.24153244f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 7.49001646f, 5.09240818f, 2.84484982f, 0.95350921f, 0.02916753f },
-    { 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.09240818f, 2.84484982f, 0.95350921f, 0.02916753f },
-    { 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.58536053f, 3.19567990f, 1.84880662f, 0.803307f, 0.02916753f },
-    { 14.61464119f, 12.96784878f, 11.54541874f, 8.75849152f, 7.49001646f, 5.58536053f, 3.19567990f, 1.84880662f, 0.803307f, 0.02916753f },
-    { 14.61464119f, 12.96784878f, 11.54541874f, 8.75849152f, 7.49001646f, 6.14220476f, 4.65472794f, 3.07277966f, 1.84880662f, 0.803307f, 0.02916753f },
-    { 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 8.75849152f, 7.49001646f, 6.14220476f, 4.65472794f, 3.07277966f, 1.84880662f, 0.803307f, 0.02916753f },
-    { 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.65472794f, 3.07277966f, 1.84880662f, 0.803307f, 0.02916753f },
-    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.65472794f, 3.07277966f, 1.84880662f, 0.803307f, 0.02916753f },
-    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.65472794f, 3.07277966f, 1.84880662f, 0.803307f, 0.02916753f },
-    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.60512662f, 2.63833880f, 1.56271636f, 0.72133851f, 0.02916753f },
-    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.46139455f, 2.45070267f, 1.56271636f, 0.72133851f, 0.02916753f },
-    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.46139455f, 2.45070267f, 1.56271636f, 0.72133851f, 0.02916753f },
-    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.46139455f, 2.45070267f, 1.56271636f, 0.72133851f, 0.02916753f },
-    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.75859547f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.46139455f, 2.45070267f, 1.56271636f, 0.72133851f, 0.02916753f },
-    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.75859547f, 9.24142551f, 8.75849152f, 8.30717278f, 7.88507891f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.46139455f, 2.45070267f, 1.56271636f, 0.72133851f, 0.02916753f }
-};
-
-const std::vector<std::vector<float>> GITS_NOISE_0_90 = {
-    { 14.61464119f, 6.77309084f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 1.56271636f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 3.07277966f, 0.95350921f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 4.86714602f, 2.54230714f, 0.89115214f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 7.49001646f, 4.86714602f, 2.54230714f, 0.89115214f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 7.49001646f, 5.09240818f, 3.07277966f, 1.61558151f, 0.69515091f, 0.02916753f },
-    { 14.61464119f, 12.23089790f, 8.75849152f, 7.11996698f, 4.86714602f, 3.07277966f, 1.61558151f, 0.69515091f, 0.02916753f },
-    { 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 2.95596409f, 1.61558151f, 0.69515091f, 0.02916753f },
-    { 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.19988537f, 1.24153244f, 0.57119018f, 0.02916753f },
-    { 14.61464119f, 12.96784878f, 10.90732002f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.19988537f, 1.24153244f, 0.57119018f, 0.02916753f },
-    { 14.61464119f, 12.96784878f, 11.54541874f, 9.24142551f, 8.30717278f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.19988537f, 1.24153244f, 0.57119018f, 0.02916753f },
-    { 14.61464119f, 12.96784878f, 11.54541874f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.75677586f, 2.84484982f, 1.84880662f, 1.08895338f, 0.52423614f, 0.02916753f },
-    { 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.75677586f, 2.84484982f, 1.84880662f, 1.08895338f, 0.52423614f, 0.02916753f },
-    { 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.44769001f, 5.58536053f, 4.45427561f, 3.32507086f, 2.45070267f, 1.61558151f, 0.95350921f, 0.45573691f, 0.02916753f },
-    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.44769001f, 5.58536053f, 4.45427561f, 3.32507086f, 2.45070267f, 1.61558151f, 0.95350921f, 0.45573691f, 0.02916753f },
-    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.86714602f, 3.91689563f, 3.07277966f, 2.27973175f, 1.56271636f, 0.95350921f, 0.45573691f, 0.02916753f },
-    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.86714602f, 3.91689563f, 3.07277966f, 2.27973175f, 1.56271636f, 0.95350921f, 0.45573691f, 0.02916753f },
-    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.86714602f, 3.91689563f, 3.07277966f, 2.27973175f, 1.56271636f, 0.95350921f, 0.45573691f, 0.02916753f },
-    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 5.09240818f, 4.45427561f, 3.60512662f, 2.95596409f, 2.19988537f, 1.51179266f, 0.89115214f, 0.43325692f, 0.02916753f }
-};
-
-const std::vector<std::vector<float>> GITS_NOISE_0_95 = {
-    { 14.61464119f, 6.77309084f, 0.02916753f },
-    { 14.61464119f, 6.77309084f, 1.56271636f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 2.84484982f, 0.89115214f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 4.86714602f, 2.36326075f, 0.803307f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 4.86714602f, 2.95596409f, 1.56271636f, 0.64427125f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 7.49001646f, 4.86714602f, 2.95596409f, 1.56271636f, 0.64427125f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 7.49001646f, 4.86714602f, 3.07277966f, 1.91321158f, 1.08895338f, 0.50118381f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.07277966f, 1.91321158f, 1.08895338f, 0.50118381f, 0.02916753f },
-    { 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 3.07277966f, 1.91321158f, 1.08895338f, 0.50118381f, 0.02916753f },
-    { 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.19988537f, 1.41535246f, 0.803307f, 0.38853383f, 0.02916753f },
-    { 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 4.65472794f, 3.46139455f, 2.63833880f, 1.84880662f, 1.24153244f, 0.72133851f, 0.34370604f, 0.02916753f },
-    { 14.61464119f, 12.96784878f, 10.90732002f, 8.75849152f, 7.49001646f, 5.85520077f, 4.65472794f, 3.46139455f, 2.63833880f, 1.84880662f, 1.24153244f, 0.72133851f, 0.34370604f, 0.02916753f },
-    { 14.61464119f, 12.96784878f, 10.90732002f, 8.75849152f, 7.49001646f, 6.14220476f, 4.86714602f, 3.75677586f, 2.95596409f, 2.19988537f, 1.56271636f, 1.05362725f, 0.64427125f, 0.32104823f, 0.02916753f },
-    { 14.61464119f, 12.96784878f, 10.90732002f, 8.75849152f, 7.49001646f, 6.44769001f, 5.58536053f, 4.65472794f, 3.60512662f, 2.95596409f, 2.19988537f, 1.56271636f, 1.05362725f, 0.64427125f, 0.32104823f, 0.02916753f },
-    { 14.61464119f, 12.96784878f, 11.54541874f, 9.24142551f, 8.30717278f, 7.49001646f, 6.44769001f, 5.58536053f, 4.65472794f, 3.60512662f, 2.95596409f, 2.19988537f, 1.56271636f, 1.05362725f, 0.64427125f, 0.32104823f, 0.02916753f },
-    { 14.61464119f, 12.96784878f, 11.54541874f, 9.24142551f, 8.30717278f, 7.49001646f, 6.44769001f, 5.58536053f, 4.65472794f, 3.75677586f, 3.07277966f, 2.45070267f, 1.78698075f, 1.24153244f, 0.83188516f, 0.50118381f, 0.22545385f, 0.02916753f },
-    { 14.61464119f, 12.96784878f, 11.54541874f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 5.09240818f, 4.45427561f, 3.60512662f, 2.95596409f, 2.36326075f, 1.72759056f, 1.24153244f, 0.83188516f, 0.50118381f, 0.22545385f, 0.02916753f },
-    { 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 5.09240818f, 4.45427561f, 3.60512662f, 2.95596409f, 2.36326075f, 1.72759056f, 1.24153244f, 0.83188516f, 0.50118381f, 0.22545385f, 0.02916753f },
-    { 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 5.09240818f, 4.45427561f, 3.75677586f, 3.07277966f, 2.45070267f, 1.91321158f, 1.46270394f, 1.05362725f, 0.72133851f, 0.43325692f, 0.19894916f, 0.02916753f }
-};
-
-const std::vector<std::vector<float>> GITS_NOISE_1_00 = {
-    { 14.61464119f, 1.56271636f, 0.02916753f },
-    { 14.61464119f, 6.77309084f, 0.95350921f, 0.02916753f },
-    { 14.61464119f, 6.77309084f, 2.36326075f, 0.803307f, 0.02916753f },
-    { 14.61464119f, 7.11996698f, 3.07277966f, 1.56271636f, 0.59516323f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 4.86714602f, 2.84484982f, 1.41535246f, 0.57119018f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 4.86714602f, 2.84484982f, 1.61558151f, 0.86115354f, 0.38853383f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 7.49001646f, 4.86714602f, 2.84484982f, 1.61558151f, 0.86115354f, 0.38853383f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 7.49001646f, 4.86714602f, 3.07277966f, 1.98035145f, 1.24153244f, 0.72133851f, 0.34370604f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.07277966f, 1.98035145f, 1.24153244f, 0.72133851f, 0.34370604f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.27973175f, 1.51179266f, 0.95350921f, 0.54755926f, 0.25053367f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.36326075f, 1.61558151f, 1.08895338f, 0.72133851f, 0.41087446f, 0.17026083f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.36326075f, 1.61558151f, 1.08895338f, 0.72133851f, 0.41087446f, 0.17026083f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.84484982f, 2.12350607f, 1.56271636f, 1.08895338f, 0.72133851f, 0.41087446f, 0.17026083f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.84484982f, 2.19988537f, 1.61558151f, 1.162866f, 0.803307f, 0.50118381f, 0.27464288f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 5.85520077f, 4.65472794f, 3.75677586f, 3.07277966f, 2.45070267f, 1.84880662f, 1.36964464f, 1.01931262f, 0.72133851f, 0.45573691f, 0.25053367f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 6.14220476f, 5.09240818f, 4.26497746f, 3.46139455f, 2.84484982f, 2.19988537f, 1.67050016f, 1.24153244f, 0.92192322f, 0.64427125f, 0.43325692f, 0.25053367f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 6.14220476f, 5.09240818f, 4.26497746f, 3.60512662f, 2.95596409f, 2.45070267f, 1.91321158f, 1.51179266f, 1.12534678f, 0.83188516f, 0.59516323f, 0.38853383f, 0.22545385f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 12.23089790f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 5.09240818f, 4.26497746f, 3.60512662f, 2.95596409f, 2.45070267f, 1.91321158f, 1.51179266f, 1.12534678f, 0.83188516f, 0.59516323f, 0.38853383f, 0.22545385f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 12.23089790f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 5.09240818f, 4.26497746f, 3.60512662f, 2.95596409f, 2.45070267f, 1.91321158f, 1.51179266f, 1.12534678f, 0.83188516f, 0.59516323f, 0.38853383f, 0.22545385f, 0.09824532f, 0.02916753f }
-};
-
-const std::vector<std::vector<float>> GITS_NOISE_1_05 = {
-    { 14.61464119f, 0.95350921f, 0.02916753f },
-    { 14.61464119f, 6.77309084f, 0.89115214f, 0.02916753f },
-    { 14.61464119f, 6.77309084f, 2.05039096f, 0.72133851f, 0.02916753f },
-    { 14.61464119f, 6.77309084f, 2.84484982f, 1.28281462f, 0.52423614f, 0.02916753f },
-    { 14.61464119f, 6.77309084f, 3.07277966f, 1.61558151f, 0.803307f, 0.34370604f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 4.86714602f, 2.84484982f, 1.56271636f, 0.803307f, 0.34370604f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 4.86714602f, 2.84484982f, 1.61558151f, 0.95350921f, 0.52423614f, 0.22545385f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 1.98035145f, 1.24153244f, 0.74807048f, 0.41087446f, 0.17026083f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.27973175f, 1.51179266f, 0.95350921f, 0.59516323f, 0.34370604f, 0.13792117f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 5.09240818f, 3.46139455f, 2.45070267f, 1.61558151f, 1.08895338f, 0.72133851f, 0.45573691f, 0.25053367f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 7.49001646f, 5.09240818f, 3.46139455f, 2.45070267f, 1.61558151f, 1.08895338f, 0.72133851f, 0.45573691f, 0.25053367f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.36326075f, 1.61558151f, 1.08895338f, 0.72133851f, 0.45573691f, 0.25053367f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.45070267f, 1.72759056f, 1.24153244f, 0.86115354f, 0.59516323f, 0.38853383f, 0.22545385f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.84484982f, 2.19988537f, 1.61558151f, 1.162866f, 0.83188516f, 0.59516323f, 0.38853383f, 0.22545385f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.84484982f, 2.19988537f, 1.67050016f, 1.28281462f, 0.95350921f, 0.72133851f, 0.52423614f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.95596409f, 2.36326075f, 1.84880662f, 1.41535246f, 1.08895338f, 0.83188516f, 0.61951244f, 0.45573691f, 0.32104823f, 0.19894916f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.95596409f, 2.45070267f, 1.91321158f, 1.51179266f, 1.20157266f, 0.95350921f, 0.74807048f, 0.57119018f, 0.43325692f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 8.30717278f, 7.11996698f, 5.85520077f, 4.65472794f, 3.60512662f, 2.95596409f, 2.45070267f, 1.91321158f, 1.51179266f, 1.20157266f, 0.95350921f, 0.74807048f, 0.57119018f, 0.43325692f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 8.30717278f, 7.11996698f, 5.85520077f, 4.65472794f, 3.60512662f, 2.95596409f, 2.45070267f, 1.98035145f, 1.61558151f, 1.32549286f, 1.08895338f, 0.86115354f, 0.69515091f, 0.54755926f, 0.41087446f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f }
-};
-
-const std::vector<std::vector<float>> GITS_NOISE_1_10 = {
-    { 14.61464119f, 0.89115214f, 0.02916753f },
-    { 14.61464119f, 2.36326075f, 0.72133851f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 1.61558151f, 0.57119018f, 0.02916753f },
-    { 14.61464119f, 6.77309084f, 2.45070267f, 1.08895338f, 0.45573691f, 0.02916753f },
-    { 14.61464119f, 6.77309084f, 2.95596409f, 1.56271636f, 0.803307f, 0.34370604f, 0.02916753f },
-    { 14.61464119f, 6.77309084f, 3.07277966f, 1.61558151f, 0.89115214f, 0.4783645f, 0.19894916f, 0.02916753f },
-    { 14.61464119f, 6.77309084f, 3.07277966f, 1.84880662f, 1.08895338f, 0.64427125f, 0.34370604f, 0.13792117f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 4.86714602f, 2.84484982f, 1.61558151f, 0.95350921f, 0.54755926f, 0.27464288f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 4.86714602f, 2.95596409f, 1.91321158f, 1.24153244f, 0.803307f, 0.4783645f, 0.25053367f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 2.05039096f, 1.41535246f, 0.95350921f, 0.64427125f, 0.41087446f, 0.22545385f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.27973175f, 1.61558151f, 1.12534678f, 0.803307f, 0.54755926f, 0.36617002f, 0.22545385f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 4.86714602f, 3.32507086f, 2.45070267f, 1.72759056f, 1.24153244f, 0.89115214f, 0.64427125f, 0.45573691f, 0.32104823f, 0.19894916f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 5.09240818f, 3.60512662f, 2.84484982f, 2.05039096f, 1.51179266f, 1.08895338f, 0.803307f, 0.59516323f, 0.43325692f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 5.09240818f, 3.60512662f, 2.84484982f, 2.12350607f, 1.61558151f, 1.24153244f, 0.95350921f, 0.72133851f, 0.54755926f, 0.41087446f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.45070267f, 1.84880662f, 1.41535246f, 1.08895338f, 0.83188516f, 0.64427125f, 0.50118381f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.45070267f, 1.91321158f, 1.51179266f, 1.20157266f, 0.95350921f, 0.74807048f, 0.59516323f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 5.85520077f, 4.45427561f, 3.46139455f, 2.84484982f, 2.19988537f, 1.72759056f, 1.36964464f, 1.08895338f, 0.86115354f, 0.69515091f, 0.54755926f, 0.43325692f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.46139455f, 2.84484982f, 2.19988537f, 1.72759056f, 1.36964464f, 1.08895338f, 0.86115354f, 0.69515091f, 0.54755926f, 0.43325692f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.46139455f, 2.84484982f, 2.19988537f, 1.72759056f, 1.36964464f, 1.08895338f, 0.89115214f, 0.72133851f, 0.59516323f, 0.4783645f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.09824532f, 0.02916753f }
-};
-
-const std::vector<std::vector<float>> GITS_NOISE_1_15 = {
-    { 14.61464119f, 0.83188516f, 0.02916753f },
-    { 14.61464119f, 1.84880662f, 0.59516323f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 1.56271636f, 0.52423614f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 1.91321158f, 0.83188516f, 0.34370604f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.45070267f, 1.24153244f, 0.59516323f, 0.25053367f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.84484982f, 1.51179266f, 0.803307f, 0.41087446f, 0.17026083f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.84484982f, 1.56271636f, 0.89115214f, 0.50118381f, 0.25053367f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 6.77309084f, 3.07277966f, 1.84880662f, 1.12534678f, 0.72133851f, 0.43325692f, 0.22545385f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 6.77309084f, 3.07277966f, 1.91321158f, 1.24153244f, 0.803307f, 0.52423614f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 4.86714602f, 2.95596409f, 1.91321158f, 1.24153244f, 0.803307f, 0.52423614f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 2.05039096f, 1.36964464f, 0.95350921f, 0.69515091f, 0.4783645f, 0.32104823f, 0.19894916f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 2.12350607f, 1.51179266f, 1.08895338f, 0.803307f, 0.59516323f, 0.43325692f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 2.12350607f, 1.51179266f, 1.08895338f, 0.803307f, 0.59516323f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 2.19988537f, 1.61558151f, 1.24153244f, 0.95350921f, 0.74807048f, 0.59516323f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.45070267f, 1.78698075f, 1.32549286f, 1.01931262f, 0.803307f, 0.64427125f, 0.50118381f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.45070267f, 1.78698075f, 1.32549286f, 1.01931262f, 0.803307f, 0.64427125f, 0.52423614f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.45070267f, 1.84880662f, 1.41535246f, 1.12534678f, 0.89115214f, 0.72133851f, 0.59516323f, 0.4783645f, 0.38853383f, 0.32104823f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.45070267f, 1.84880662f, 1.41535246f, 1.12534678f, 0.89115214f, 0.72133851f, 0.59516323f, 0.50118381f, 0.41087446f, 0.34370604f, 0.29807833f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
-};
-
-const std::vector<std::vector<float>> GITS_NOISE_1_20 = {
-    { 14.61464119f, 0.803307f, 0.02916753f },
-    { 14.61464119f, 1.56271636f, 0.52423614f, 0.02916753f },
-    { 14.61464119f, 2.36326075f, 0.92192322f, 0.36617002f, 0.02916753f },
-    { 14.61464119f, 2.84484982f, 1.24153244f, 0.59516323f, 0.25053367f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.05039096f, 0.95350921f, 0.45573691f, 0.17026083f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.45070267f, 1.24153244f, 0.64427125f, 0.29807833f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.45070267f, 1.36964464f, 0.803307f, 0.45573691f, 0.25053367f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 0.95350921f, 0.59516323f, 0.36617002f, 0.19894916f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.84484982f, 1.67050016f, 1.08895338f, 0.74807048f, 0.50118381f, 0.32104823f, 0.19894916f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.95596409f, 1.84880662f, 1.24153244f, 0.83188516f, 0.59516323f, 0.41087446f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 3.07277966f, 1.98035145f, 1.36964464f, 0.95350921f, 0.69515091f, 0.50118381f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 6.77309084f, 3.46139455f, 2.36326075f, 1.56271636f, 1.08895338f, 0.803307f, 0.59516323f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 6.77309084f, 3.46139455f, 2.45070267f, 1.61558151f, 1.162866f, 0.86115354f, 0.64427125f, 0.50118381f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.12350607f, 1.51179266f, 1.08895338f, 0.83188516f, 0.64427125f, 0.50118381f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.12350607f, 1.51179266f, 1.08895338f, 0.83188516f, 0.64427125f, 0.50118381f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.12350607f, 1.51179266f, 1.08895338f, 0.83188516f, 0.64427125f, 0.50118381f, 0.41087446f, 0.34370604f, 0.27464288f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.19988537f, 1.61558151f, 1.20157266f, 0.92192322f, 0.72133851f, 0.57119018f, 0.45573691f, 0.36617002f, 0.29807833f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.19988537f, 1.61558151f, 1.24153244f, 0.95350921f, 0.74807048f, 0.59516323f, 0.4783645f, 0.38853383f, 0.32104823f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.19988537f, 1.61558151f, 1.24153244f, 0.95350921f, 0.74807048f, 0.59516323f, 0.50118381f, 0.41087446f, 0.34370604f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
-};
-
-const std::vector<std::vector<float>> GITS_NOISE_1_25 = {
-    { 14.61464119f, 0.72133851f, 0.02916753f },
-    { 14.61464119f, 1.56271636f, 0.50118381f, 0.02916753f },
-    { 14.61464119f, 2.05039096f, 0.803307f, 0.32104823f, 0.02916753f },
-    { 14.61464119f, 2.36326075f, 0.95350921f, 0.43325692f, 0.17026083f, 0.02916753f },
-    { 14.61464119f, 2.84484982f, 1.24153244f, 0.59516323f, 0.27464288f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 3.07277966f, 1.51179266f, 0.803307f, 0.43325692f, 0.22545385f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.36326075f, 1.24153244f, 0.72133851f, 0.41087446f, 0.22545385f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.45070267f, 1.36964464f, 0.83188516f, 0.52423614f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 0.98595673f, 0.64427125f, 0.43325692f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.84484982f, 1.67050016f, 1.08895338f, 0.74807048f, 0.52423614f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.84484982f, 1.72759056f, 1.162866f, 0.803307f, 0.59516323f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.95596409f, 1.84880662f, 1.24153244f, 0.86115354f, 0.64427125f, 0.4783645f, 0.36617002f, 0.27464288f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.95596409f, 1.84880662f, 1.28281462f, 0.92192322f, 0.69515091f, 0.52423614f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.95596409f, 1.91321158f, 1.32549286f, 0.95350921f, 0.72133851f, 0.54755926f, 0.43325692f, 0.34370604f, 0.27464288f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.95596409f, 1.91321158f, 1.32549286f, 0.95350921f, 0.72133851f, 0.57119018f, 0.45573691f, 0.36617002f, 0.29807833f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.95596409f, 1.91321158f, 1.32549286f, 0.95350921f, 0.74807048f, 0.59516323f, 0.4783645f, 0.38853383f, 0.32104823f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 3.07277966f, 2.05039096f, 1.41535246f, 1.05362725f, 0.803307f, 0.61951244f, 0.50118381f, 0.41087446f, 0.34370604f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 3.07277966f, 2.05039096f, 1.41535246f, 1.05362725f, 0.803307f, 0.64427125f, 0.52423614f, 0.43325692f, 0.36617002f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 3.07277966f, 2.05039096f, 1.46270394f, 1.08895338f, 0.83188516f, 0.66947293f, 0.54755926f, 0.45573691f, 0.38853383f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
-};
-
-const std::vector<std::vector<float>> GITS_NOISE_1_30 = {
-    { 14.61464119f, 0.72133851f, 0.02916753f },
-    { 14.61464119f, 1.24153244f, 0.43325692f, 0.02916753f },
-    { 14.61464119f, 1.56271636f, 0.59516323f, 0.22545385f, 0.02916753f },
-    { 14.61464119f, 1.84880662f, 0.803307f, 0.36617002f, 0.13792117f, 0.02916753f },
-    { 14.61464119f, 2.36326075f, 1.01931262f, 0.52423614f, 0.25053367f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.84484982f, 1.36964464f, 0.74807048f, 0.41087446f, 0.22545385f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 3.07277966f, 1.56271636f, 0.89115214f, 0.54755926f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 3.07277966f, 1.61558151f, 0.95350921f, 0.61951244f, 0.41087446f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.45070267f, 1.36964464f, 0.83188516f, 0.54755926f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.45070267f, 1.41535246f, 0.92192322f, 0.64427125f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.6383388f, 1.56271636f, 1.01931262f, 0.72133851f, 0.50118381f, 0.36617002f, 0.27464288f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 1.05362725f, 0.74807048f, 0.54755926f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 1.08895338f, 0.77538133f, 0.57119018f, 0.43325692f, 0.34370604f, 0.27464288f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 1.08895338f, 0.803307f, 0.59516323f, 0.45573691f, 0.36617002f, 0.29807833f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 1.08895338f, 0.803307f, 0.59516323f, 0.4783645f, 0.38853383f, 0.32104823f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.84484982f, 1.72759056f, 1.162866f, 0.83188516f, 0.64427125f, 0.50118381f, 0.41087446f, 0.34370604f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.84484982f, 1.72759056f, 1.162866f, 0.83188516f, 0.64427125f, 0.52423614f, 0.43325692f, 0.36617002f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.84484982f, 1.78698075f, 1.24153244f, 0.92192322f, 0.72133851f, 0.57119018f, 0.45573691f, 0.38853383f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.84484982f, 1.78698075f, 1.24153244f, 0.92192322f, 0.72133851f, 0.57119018f, 0.4783645f, 0.41087446f, 0.36617002f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
-};
-
-const std::vector<std::vector<float>> GITS_NOISE_1_35 = {
-    { 14.61464119f, 0.69515091f, 0.02916753f },
-    { 14.61464119f, 0.95350921f, 0.34370604f, 0.02916753f },
-    { 14.61464119f, 1.56271636f, 0.57119018f, 0.19894916f, 0.02916753f },
-    { 14.61464119f, 1.61558151f, 0.69515091f, 0.29807833f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 1.84880662f, 0.83188516f, 0.43325692f, 0.22545385f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.45070267f, 1.162866f, 0.64427125f, 0.36617002f, 0.19894916f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.84484982f, 1.36964464f, 0.803307f, 0.50118381f, 0.32104823f, 0.19894916f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.84484982f, 1.41535246f, 0.83188516f, 0.54755926f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.84484982f, 1.56271636f, 0.95350921f, 0.64427125f, 0.45573691f, 0.32104823f, 0.22545385f, 0.17026083f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.84484982f, 1.56271636f, 0.95350921f, 0.64427125f, 0.45573691f, 0.34370604f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 3.07277966f, 1.61558151f, 1.01931262f, 0.72133851f, 0.52423614f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 3.07277966f, 1.61558151f, 1.01931262f, 0.72133851f, 0.52423614f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 3.07277966f, 1.61558151f, 1.05362725f, 0.74807048f, 0.54755926f, 0.43325692f, 0.34370604f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 3.07277966f, 1.72759056f, 1.12534678f, 0.803307f, 0.59516323f, 0.45573691f, 0.36617002f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 3.07277966f, 1.72759056f, 1.12534678f, 0.803307f, 0.59516323f, 0.4783645f, 0.38853383f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.45070267f, 1.51179266f, 1.01931262f, 0.74807048f, 0.57119018f, 0.45573691f, 0.36617002f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.6383388f, 1.61558151f, 1.08895338f, 0.803307f, 0.61951244f, 0.50118381f, 0.41087446f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.6383388f, 1.61558151f, 1.08895338f, 0.803307f, 0.64427125f, 0.52423614f, 0.43325692f, 0.36617002f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 5.85520077f, 2.6383388f, 1.61558151f, 1.08895338f, 0.803307f, 0.64427125f, 0.52423614f, 0.45573691f, 0.38853383f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
-};
-
-const std::vector<std::vector<float>> GITS_NOISE_1_40 = {
-    { 14.61464119f, 0.59516323f, 0.02916753f },
-    { 14.61464119f, 0.95350921f, 0.34370604f, 0.02916753f },
-    { 14.61464119f, 1.08895338f, 0.43325692f, 0.13792117f, 0.02916753f },
-    { 14.61464119f, 1.56271636f, 0.64427125f, 0.27464288f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 1.61558151f, 0.803307f, 0.43325692f, 0.22545385f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.05039096f, 0.95350921f, 0.54755926f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.45070267f, 1.24153244f, 0.72133851f, 0.43325692f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.45070267f, 1.24153244f, 0.74807048f, 0.50118381f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.45070267f, 1.28281462f, 0.803307f, 0.52423614f, 0.36617002f, 0.27464288f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.45070267f, 1.28281462f, 0.803307f, 0.54755926f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.84484982f, 1.41535246f, 0.86115354f, 0.59516323f, 0.43325692f, 0.32104823f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.84484982f, 1.51179266f, 0.95350921f, 0.64427125f, 0.45573691f, 0.34370604f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.84484982f, 1.51179266f, 0.95350921f, 0.64427125f, 0.4783645f, 0.36617002f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.84484982f, 1.56271636f, 0.98595673f, 0.69515091f, 0.52423614f, 0.41087446f, 0.34370604f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.84484982f, 1.56271636f, 1.01931262f, 0.72133851f, 0.54755926f, 0.43325692f, 0.36617002f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.84484982f, 1.61558151f, 1.05362725f, 0.74807048f, 0.57119018f, 0.45573691f, 0.38853383f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.84484982f, 1.61558151f, 1.08895338f, 0.803307f, 0.61951244f, 0.50118381f, 0.41087446f, 0.36617002f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.84484982f, 1.61558151f, 1.08895338f, 0.803307f, 0.61951244f, 0.50118381f, 0.43325692f, 0.38853383f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.84484982f, 1.61558151f, 1.08895338f, 0.803307f, 0.64427125f, 0.52423614f, 0.45573691f, 0.41087446f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
-};
-
-const std::vector<std::vector<float>> GITS_NOISE_1_45 = {
-    { 14.61464119f, 0.59516323f, 0.02916753f },
-    { 14.61464119f, 0.803307f, 0.25053367f, 0.02916753f },
-    { 14.61464119f, 0.95350921f, 0.34370604f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 1.24153244f, 0.54755926f, 0.25053367f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 1.56271636f, 0.72133851f, 0.36617002f, 0.19894916f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 1.61558151f, 0.803307f, 0.45573691f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 1.91321158f, 0.95350921f, 0.57119018f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.19988537f, 1.08895338f, 0.64427125f, 0.41087446f, 0.27464288f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.45070267f, 1.24153244f, 0.74807048f, 0.50118381f, 0.34370604f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.45070267f, 1.24153244f, 0.74807048f, 0.50118381f, 0.36617002f, 0.27464288f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.45070267f, 1.28281462f, 0.803307f, 0.54755926f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.45070267f, 1.28281462f, 0.803307f, 0.57119018f, 0.43325692f, 0.34370604f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.45070267f, 1.28281462f, 0.83188516f, 0.59516323f, 0.45573691f, 0.36617002f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.45070267f, 1.28281462f, 0.83188516f, 0.59516323f, 0.45573691f, 0.36617002f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.84484982f, 1.51179266f, 0.95350921f, 0.69515091f, 0.52423614f, 0.41087446f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.84484982f, 1.51179266f, 0.95350921f, 0.69515091f, 0.52423614f, 0.43325692f, 0.36617002f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.84484982f, 1.56271636f, 0.98595673f, 0.72133851f, 0.54755926f, 0.45573691f, 0.38853383f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.84484982f, 1.56271636f, 1.01931262f, 0.74807048f, 0.57119018f, 0.4783645f, 0.41087446f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.84484982f, 1.56271636f, 1.01931262f, 0.74807048f, 0.59516323f, 0.50118381f, 0.43325692f, 0.38853383f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
-};
-
-const std::vector<std::vector<float>> GITS_NOISE_1_50 = {
-    { 14.61464119f, 0.54755926f, 0.02916753f },
-    { 14.61464119f, 0.803307f, 0.25053367f, 0.02916753f },
-    { 14.61464119f, 0.86115354f, 0.32104823f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 1.24153244f, 0.54755926f, 0.25053367f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 1.56271636f, 0.72133851f, 0.36617002f, 0.19894916f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 1.61558151f, 0.803307f, 0.45573691f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 1.61558151f, 0.83188516f, 0.52423614f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 1.84880662f, 0.95350921f, 0.59516323f, 0.38853383f, 0.27464288f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 1.84880662f, 0.95350921f, 0.59516323f, 0.41087446f, 0.29807833f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 1.84880662f, 0.95350921f, 0.61951244f, 0.43325692f, 0.32104823f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.19988537f, 1.12534678f, 0.72133851f, 0.50118381f, 0.36617002f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.19988537f, 1.12534678f, 0.72133851f, 0.50118381f, 0.36617002f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.36326075f, 1.24153244f, 0.803307f, 0.57119018f, 0.43325692f, 0.34370604f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.36326075f, 1.24153244f, 0.803307f, 0.57119018f, 0.43325692f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.36326075f, 1.24153244f, 0.803307f, 0.59516323f, 0.45573691f, 0.36617002f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.36326075f, 1.24153244f, 0.803307f, 0.59516323f, 0.45573691f, 0.38853383f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.45070267f, 1.32549286f, 0.86115354f, 0.64427125f, 0.50118381f, 0.41087446f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.45070267f, 1.36964464f, 0.92192322f, 0.69515091f, 0.54755926f, 0.45573691f, 0.41087446f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
-    { 14.61464119f, 2.45070267f, 1.41535246f, 0.95350921f, 0.72133851f, 0.57119018f, 0.4783645f, 0.43325692f, 0.38853383f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
-};
-
-const std::vector<const std::vector<std::vector<float>>*> GITS_NOISE = {
-    &GITS_NOISE_0_80,
-    &GITS_NOISE_0_85,
-    &GITS_NOISE_0_90,
-    &GITS_NOISE_0_95,
-    &GITS_NOISE_1_00,
-    &GITS_NOISE_1_05,
-    &GITS_NOISE_1_10,
-    &GITS_NOISE_1_15,
-    &GITS_NOISE_1_20,
-    &GITS_NOISE_1_25,
-    &GITS_NOISE_1_30,
-    &GITS_NOISE_1_35,
-    &GITS_NOISE_1_40,
-    &GITS_NOISE_1_45,
-    &GITS_NOISE_1_50
-};
-
-#endif // GITS_NOISE_INL
--- a/src/llm.hpp
+++ b/src/llm.hpp
--- a/src/ltxv.hpp
+++ b/src/ltxv.hpp
@ -1,73 +0,0 @@
-#ifndef __LTXV_HPP__
-#define __LTXV_HPP__
-
-#include "common_block.hpp"
-
-namespace LTXV {
-
-    class CausalConv3d : public GGMLBlock {
-    protected:
-        int time_kernel_size;
-
-    public:
-        CausalConv3d(int64_t in_channels,
-                     int64_t out_channels,
-                     int kernel_size                  = 3,
-                     std::tuple<int, int, int> stride = {1, 1, 1},
-                     int dilation                     = 1,
-                     bool bias                        = true) {
-            time_kernel_size = kernel_size / 2;
-            blocks["conv"]   = std::shared_ptr<GGMLBlock>(new Conv3d(in_channels,
-                                                                     out_channels,
-                                                                     {kernel_size, kernel_size, kernel_size},
-                                                                     stride,
-                                                                     {0, kernel_size / 2, kernel_size / 2},
-                                                                     {dilation, 1, 1},
-                                                                     bias));
-        }
-
-        ggml_tensor* forward(GGMLRunnerContext* ctx,
-                             ggml_tensor* x,
-                             bool causal = true) {
-            // x: [N*IC, ID, IH, IW]
-            // result: [N*OC, OD, OH, OW]
-            auto conv = std::dynamic_pointer_cast<Conv3d>(blocks["conv"]);
-            if (causal) {
-                auto h               = ggml_cont(ctx, ggml_permute(ctx, x, 0, 1, 3, 2));                                                  // [ID, N*IC, IH, IW]
-                auto first_frame     = ggml_view_3d(ctx, h, h->ne[0], h->ne[1], h->ne[2], h->nb[1], h->nb[2], 0);                         // [N*IC, IH, IW]
-                first_frame          = ggml_reshape_4d(ctx, first_frame, first_frame->ne[0], first_frame->ne[1], 1, first_frame->ne[2]);  // [N*IC, 1, IH, IW]
-                auto first_frame_pad = first_frame;
-                for (int i = 1; i < time_kernel_size - 1; i++) {
-                    first_frame_pad = ggml_concat(ctx, first_frame_pad, first_frame, 2);
-                }
-                x = ggml_concat(ctx, first_frame_pad, x, 2);
-            } else {
-                auto h         = ggml_cont(ctx, ggml_permute(ctx, x, 0, 1, 3, 2));  // [ID, N*IC, IH, IW]
-                int64_t offset = h->nb[2] * h->ne[2];
-
-                auto first_frame     = ggml_view_3d(ctx, h, h->ne[0], h->ne[1], h->ne[2], h->nb[1], h->nb[2], 0);                         // [N*IC, IH, IW]
-                first_frame          = ggml_reshape_4d(ctx, first_frame, first_frame->ne[0], first_frame->ne[1], 1, first_frame->ne[2]);  // [N*IC, 1, IH, IW]
-                auto first_frame_pad = first_frame;
-                for (int i = 1; i < (time_kernel_size - 1) / 2; i++) {
-                    first_frame_pad = ggml_concat(ctx, first_frame_pad, first_frame, 2);
-                }
-
-                auto last_frame     = ggml_view_3d(ctx, h, h->ne[0], h->ne[1], h->ne[2], h->nb[1], h->nb[2], offset * (h->ne[3] - 1));  // [N*IC, IH, IW]
-                last_frame          = ggml_reshape_4d(ctx, last_frame, last_frame->ne[0], last_frame->ne[1], 1, last_frame->ne[2]);     // [N*IC, 1, IH, IW]
-                auto last_frame_pad = last_frame;
-                for (int i = 1; i < (time_kernel_size - 1) / 2; i++) {
-                    last_frame_pad = ggml_concat(ctx, last_frame_pad, last_frame, 2);
-                }
-
-                x = ggml_concat(ctx, first_frame_pad, x, 2);
-                x = ggml_concat(ctx, x, last_frame_pad, 2);
-            }
-
-            x = conv->forward(ctx, x);
-            return x;
-        }
-    };
-
-};
-
-#endif
--- a/src/model.h
+++ b/src/model.h
@ -1,24 +1,14 @@
 #ifndef __MODEL_H__
 #define __MODEL_H__

-#include <functional>
-#include <map>
-#include <memory>
-#include <set>
-#include <sstream>
 #include <string>
-#include <tuple>
 #include <utility>
 #include <vector>

+#include "core/ordered_map.hpp"
 #include "ggml-backend.h"
 #include "ggml.h"
-#include "gguf.h"
-#include "json.hpp"
-#include "ordered_map.hpp"
-#include "zip.h"
-
-#define SD_MAX_DIMS 5
+#include "model_io/tensor_storage.h"

 enum SDVersion {
    VERSION_SD1,
@ -49,9 +39,16 @@ enum SDVersion {
    VERSION_ANIMA,
    VERSION_FLUX2,
    VERSION_FLUX2_KLEIN,
+    VERSION_LTXAV,
+    VERSION_HIDREAM_O1,
    VERSION_Z_IMAGE,
    VERSION_OVIS_IMAGE,
    VERSION_ERNIE_IMAGE,
+    VERSION_LENS,
+    VERSION_LONGCAT,
+    VERSION_PID,
+    VERSION_IDEOGRAM4,
+    VERSION_ESRGAN,
    VERSION_COUNT,
 };

@ -111,6 +108,13 @@ static inline bool sd_version_is_flux2(SDVersion version) {
    return false;
 }

+static inline bool sd_version_is_ltxav(SDVersion version) {
+    if (version == VERSION_LTXAV) {
+        return true;
+    }
+    return false;
+}
+
 static inline bool sd_version_is_wan(SDVersion version) {
    if (version == VERSION_WAN2 || version == VERSION_WAN2_2_I2V || version == VERSION_WAN2_2_TI2V) {
        return true;
@ -139,6 +143,13 @@ static inline bool sd_version_is_z_image(SDVersion version) {
    return false;
 }

+static inline bool sd_version_is_longcat(SDVersion version) {
+    if (version == VERSION_LONGCAT) {
+        return true;
+    }
+    return false;
+}
+
 static inline bool sd_version_is_ernie_image(SDVersion version) {
    if (version == VERSION_ERNIE_IMAGE) {
        return true;
@ -146,8 +157,29 @@ static inline bool sd_version_is_ernie_image(SDVersion version) {
    return false;
 }

+static inline bool sd_version_is_lens(SDVersion version) {
+    if (version == VERSION_LENS) {
+        return true;
+    }
+    return false;
+}
+
+static inline bool sd_version_is_pid(SDVersion version) {
+    if (version == VERSION_PID) {
+        return true;
+    }
+    return false;
+}
+
+static inline bool sd_version_is_ideogram4(SDVersion version) {
+    if (version == VERSION_IDEOGRAM4) {
+        return true;
+    }
+    return false;
+}
+
 static inline bool sd_version_uses_flux2_vae(SDVersion version) {
-    if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version)) {
+    if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version) || sd_version_is_lens(version) || sd_version_is_ideogram4(version)) {
        return true;
    }
    return false;
@ -167,12 +199,18 @@ static inline bool sd_version_is_inpaint(SDVersion version) {
 static inline bool sd_version_is_dit(SDVersion version) {
    if (sd_version_is_flux(version) ||
        sd_version_is_flux2(version) ||
+        sd_version_is_ltxav(version) ||
        sd_version_is_sd3(version) ||
        sd_version_is_wan(version) ||
        sd_version_is_qwen_image(version) ||
+        version == VERSION_HIDREAM_O1 ||
        sd_version_is_anima(version) ||
        sd_version_is_z_image(version) ||
-        sd_version_is_ernie_image(version)) {
+        sd_version_is_ernie_image(version) ||
+        sd_version_is_lens(version) ||
+        sd_version_is_longcat(version) ||
+        sd_version_is_pid(version) ||
+        sd_version_is_ideogram4(version)) {
        return true;
    }
    return false;
@ -195,168 +233,7 @@ enum PMVersion {
    PM_VERSION_2,
 };

-struct TensorStorage {
-    std::string name;
-    ggml_type type          = GGML_TYPE_F32;
-    ggml_type expected_type = GGML_TYPE_COUNT;
-    bool is_f8_e4m3         = false;
-    bool is_f8_e5m2         = false;
-    bool is_f64             = false;
-    bool is_i64             = false;
-    int64_t ne[SD_MAX_DIMS] = {1, 1, 1, 1, 1};
-    int n_dims              = 0;
-
-    size_t file_index = 0;
-    int index_in_zip  = -1;  // >= means stored in a zip file
-    uint64_t offset   = 0;   // offset in file
-
-    TensorStorage() = default;
-
-    TensorStorage(std::string name, ggml_type type, const int64_t* ne, int n_dims, size_t file_index, size_t offset = 0)
-        : name(std::move(name)), type(type), n_dims(n_dims), file_index(file_index), offset(offset) {
-        for (int i = 0; i < n_dims; i++) {
-            this->ne[i] = ne[i];
-        }
-    }
-
-    int64_t nelements() const {
-        int64_t n = 1;
-        for (int i = 0; i < SD_MAX_DIMS; i++) {
-            n *= ne[i];
-        }
-        return n;
-    }
-
-    int64_t nbytes() const {
-        return nelements() * ggml_type_size(type) / ggml_blck_size(type);
-    }
-
-    int64_t nbytes_to_read() const {
-        if (is_f8_e4m3 || is_f8_e5m2) {
-            return nbytes() / 2;
-        } else if (is_f64 || is_i64) {
-            return nbytes() * 2;
-        } else {
-            return nbytes();
-        }
-    }
-
-    void unsqueeze() {
-        if (n_dims == 2) {
-            n_dims = 4;
-            ne[3]  = ne[1];
-            ne[2]  = ne[0];
-            ne[1]  = 1;
-            ne[0]  = 1;
-        }
-    }
-
-    std::vector<TensorStorage> chunk(size_t n) {
-        std::vector<TensorStorage> chunks;
-        uint64_t chunk_size = nbytes_to_read() / n;
-        // printf("%d/%d\n", chunk_size, nbytes_to_read());
-        reverse_ne();
-        for (size_t i = 0; i < n; i++) {
-            TensorStorage chunk_i = *this;
-            chunk_i.ne[0]         = ne[0] / n;
-            chunk_i.offset        = offset + i * chunk_size;
-            chunk_i.reverse_ne();
-            chunks.push_back(chunk_i);
-        }
-        reverse_ne();
-        return chunks;
-    }
-
-    void reverse_ne() {
-        int64_t new_ne[SD_MAX_DIMS] = {1, 1, 1, 1, 1};
-        for (int i = 0; i < n_dims; i++) {
-            new_ne[i] = ne[n_dims - 1 - i];
-        }
-        for (int i = 0; i < n_dims; i++) {
-            ne[i] = new_ne[i];
-        }
-    }
-
-    std::string to_string() const {
-        std::stringstream ss;
-        const char* type_name = ggml_type_name(type);
-        if (is_f8_e4m3) {
-            type_name = "f8_e4m3";
-        } else if (is_f8_e5m2) {
-            type_name = "f8_e5m2";
-        } else if (is_f64) {
-            type_name = "f64";
-        } else if (is_i64) {
-            type_name = "i64";
-        }
-        ss << name << " | " << type_name << " | ";
-        ss << n_dims << " [";
-        for (int i = 0; i < SD_MAX_DIMS; i++) {
-            ss << ne[i];
-            if (i != SD_MAX_DIMS - 1) {
-                ss << ", ";
-            }
-        }
-        ss << "]";
-        return ss.str();
-    }
-};
-
-typedef std::function<bool(const TensorStorage&, ggml_tensor**)> on_new_tensor_cb_t;
-
 typedef OrderedMap<std::string, TensorStorage> String2TensorStorage;
-
-class ModelLoader {
-protected:
-    SDVersion version_ = VERSION_COUNT;
-    std::vector<std::string> file_paths_;
-    String2TensorStorage tensor_storage_map;
-
-    void add_tensor_storage(const TensorStorage& tensor_storage);
-
-    bool parse_data_pkl(uint8_t* buffer,
-                        size_t buffer_size,
-                        zip_t* zip,
-                        std::string dir,
-                        size_t file_index,
-                        const std::string prefix);
-
-    bool init_from_gguf_file(const std::string& file_path, const std::string& prefix = "");
-    bool init_from_safetensors_file(const std::string& file_path, const std::string& prefix = "");
-    bool init_from_ckpt_file(const std::string& file_path, const std::string& prefix = "");
-    bool init_from_diffusers_file(const std::string& file_path, const std::string& prefix = "");
-
-public:
-    bool init_from_file(const std::string& file_path, const std::string& prefix = "");
-    void convert_tensors_name();
-    bool init_from_file_and_convert_name(const std::string& file_path,
-                                         const std::string& prefix = "",
-                                         SDVersion version         = VERSION_COUNT);
-    SDVersion get_sd_version();
-    std::map<ggml_type, uint32_t> get_wtype_stat();
-    std::map<ggml_type, uint32_t> get_conditioner_wtype_stat();
-    std::map<ggml_type, uint32_t> get_diffusion_model_wtype_stat();
-    std::map<ggml_type, uint32_t> get_vae_wtype_stat();
-    String2TensorStorage& get_tensor_storage_map() { return tensor_storage_map; }
-    void set_wtype_override(ggml_type wtype, std::string tensor_type_rules = "");
-    bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0, bool use_mmap = false);
-    bool load_tensors(std::map<std::string, ggml_tensor*>& tensors,
-                      std::set<std::string> ignore_tensors = {},
-                      int n_threads                        = 0,
-                      bool use_mmap                        = false);
-
-    std::vector<std::string> get_tensor_names() const {
-        std::vector<std::string> names;
-        for (const auto& [name, tensor_storage] : tensor_storage_map) {
-            names.push_back(name);
-        }
-        return names;
-    }
-
-    bool save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules);
-    bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type);
-    int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT);
-    ~ModelLoader() = default;
-};
+using TensorTypeRules = std::vector<std::pair<std::string, ggml_type>>;

 #endif  // __MODEL_H__
--- a/src/model/adapter/lora.hpp
+++ b/src/model/adapter/lora.hpp
@ -1,8 +1,10 @@
-#ifndef __LORA_HPP__
-#define __LORA_HPP__
+#ifndef __SD_MODEL_ADAPTER_LORA_HPP__
+#define __SD_MODEL_ADAPTER_LORA_HPP__

 #include <mutex>
-#include "ggml_extend.hpp"
+#include "core/ggml_extend.hpp"
+#include "model_loader.h"
+#include "model_manager.h"

 #define LORA_GRAPH_BASE_SIZE 10240

@ -13,7 +15,8 @@ struct LoraModel : public GGMLRunner {
    std::map<ggml_tensor*, ggml_tensor*> original_tensor_to_final_tensor;
    std::set<std::string> applied_lora_tensors;
    std::string file_path;
-    ModelLoader model_loader;
+    std::shared_ptr<ModelManager> model_manager;
+    ggml_backend_t params_backend = nullptr;
    bool load_failed              = false;
    bool applied                  = false;
    bool tensor_preprocessed      = false;
@ -22,12 +25,14 @@ struct LoraModel : public GGMLRunner {

    LoraModel(const std::string& lora_id,
              ggml_backend_t backend,
+              ggml_backend_t params_backend_,
              const std::string& file_path          = "",
              std::string prefix                    = "",
-              SDVersion version            = VERSION_COUNT)
-        : lora_id(lora_id), file_path(file_path), GGMLRunner(backend, false) {
+              SDVersion version                     = VERSION_COUNT,
+              std::shared_ptr<ModelManager> manager = std::make_shared<ModelManager>())
+        : GGMLRunner(backend, manager), lora_id(lora_id), file_path(file_path), model_manager(std::move(manager)), params_backend(params_backend_) {
        prefix = "lora." + prefix;
-        if (!model_loader.init_from_file_and_convert_name(file_path, prefix, version)) {
+        if (model_manager == nullptr || !model_manager->loader().init_from_file_and_convert_name(file_path, prefix, version)) {
            load_failed = true;
        }
    }
@ -69,7 +74,11 @@ struct LoraModel : public GGMLRunner {
            return true;
        };

-        model_loader.load_tensors(on_new_tensor_cb, n_threads);
+        if (model_manager != nullptr) {
+            model_manager->set_n_threads(n_threads);
+        }
+        ModelLoader& model_loader = model_manager->loader();
+        model_loader.load_tensors(on_new_tensor_cb);

        if (tensors_to_create.empty()) {
            return true;
@ -85,22 +94,64 @@ struct LoraModel : public GGMLRunner {
            lora_tensors[name] = real;
        }

-        alloc_params_buffer();
-
-        dry_run = false;
-        model_loader.load_tensors(on_new_tensor_cb, n_threads);
+        std::map<std::string, ggml_tensor*> tensors;
+        for (const auto& pair : lora_tensors) {
+            tensors[pair.first] = pair.second;
+        }
+        if (model_manager == nullptr ||
+            !model_manager->register_param_tensors("LoRA",
+                                                   std::move(tensors),
+                                                   ModelManager::ResidencyMode::ParamBackend,
+                                                   runtime_backend,
+                                                   params_backend) ||
+            !model_manager->validate_registered_tensors()) {
+            LOG_ERROR("lora model manager registration failed");
+            return false;
+        }
+        std::vector<ggml_tensor*> lora_params;
+        lora_params.reserve(lora_tensors.size());
+        for (const auto& pair : lora_tensors) {
+            lora_params.push_back(pair.second);
+        }
+        if (!model_manager->prepare_params(lora_params)) {
+            LOG_ERROR("lora model manager prepare params failed");
+            return false;
+        }

        LOG_DEBUG("finished loaded lora");
        return true;
    }

-    void preprocess_lora_tensors(const std::map<std::string, ggml_tensor*>& model_tensors) {
+    void release_loaded_tensors() {  // discards the loaded LoRA tensors and bookkeeping, leaving the object with a fresh, empty ModelManager
+        runner_done();
+        free_compute_buffer();
+        model_manager.reset();  // give up this member's reference to the current manager
+        free_params_ctx();
+        alloc_params_ctx();
+        model_manager  = std::make_shared<ModelManager>();  // start over with a new manager
+        weight_manager = model_manager;  // NOTE(review): presumably the handle the GGMLRunner base was constructed with - confirm
+        lora_tensors.clear();
+        original_tensor_to_final_tensor.clear();
+        applied_lora_tensors.clear();
+        applied             = false;
+        tensor_preprocessed = false;  // allows preprocess_lora_tensors() to run again instead of returning early
+    }
+
+    // Returns the set of keys (tensor names) of a name -> tensor map.
+    static std::set<std::string> tensor_names(const std::map<std::string, ggml_tensor*>& model_tensors) {
+        std::set<std::string> result;
+        // The map iterates in ascending key order, so each key belongs at the end of the set.
+        for (auto it = model_tensors.begin(); it != model_tensors.end(); ++it) {
+            result.insert(result.end(), it->first);
+        }
+        return result;
+    }
+
+    void preprocess_lora_tensors(const std::set<std::string>& model_tensor_names) {
        if (tensor_preprocessed) {
            return;
        }
        tensor_preprocessed = true;
        // I really hate these hardcoded processes.
-        if (model_tensors.find("cond_stage_model.1.transformer.text_model.encoder.layers.0.self_attn.in_proj.weight") != model_tensors.end()) {
+        if (model_tensor_names.find("cond_stage_model.1.transformer.text_model.encoder.layers.0.self_attn.in_proj.weight") != model_tensor_names.end()) {
            std::unordered_map<std::string, ggml_tensor*> new_lora_tensors;
            for (auto& [old_name, tensor] : lora_tensors) {
                std::string new_name = old_name;
@ -129,7 +180,7 @@ struct LoraModel : public GGMLRunner {
        }
    }

-    ggml_tensor* get_lora_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) {
+    ggml_tensor* get_lora_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_backend_t backend) {
        ggml_tensor* updown = nullptr;
        int index           = 0;
        while (true) {
@ -152,17 +203,17 @@ struct LoraModel : public GGMLRunner {

            auto iter = lora_tensors.find(lora_up_name);
            if (iter != lora_tensors.end()) {
-                lora_up = ggml_ext_cast_f32(ctx, iter->second);
+                lora_up = ggml_ext_cast_f32(ctx, backend, iter->second);
            }

            iter = lora_tensors.find(lora_mid_name);
            if (iter != lora_tensors.end()) {
-                lora_mid = ggml_ext_cast_f32(ctx, iter->second);
+                lora_mid = ggml_ext_cast_f32(ctx, backend, iter->second);
            }

            iter = lora_tensors.find(lora_down_name);
            if (iter != lora_tensors.end()) {
-                lora_down = ggml_ext_cast_f32(ctx, iter->second);
+                lora_down = ggml_ext_cast_f32(ctx, backend, iter->second);
            }

            if (lora_up == nullptr || lora_down == nullptr) {
@ -208,7 +259,7 @@ struct LoraModel : public GGMLRunner {
        return updown;
    }

-    ggml_tensor* get_raw_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) {
+    ggml_tensor* get_raw_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_backend_t backend) {
        ggml_tensor* updown = nullptr;
        int index           = 0;
        while (true) {
@ -225,7 +276,7 @@ struct LoraModel : public GGMLRunner {

            auto iter = lora_tensors.find(diff_name);
            if (iter != lora_tensors.end()) {
-                curr_updown = ggml_ext_cast_f32(ctx, iter->second);
+                curr_updown = ggml_ext_cast_f32(ctx, backend, iter->second);
            } else {
                break;
            }
@ -248,7 +299,7 @@ struct LoraModel : public GGMLRunner {
        return updown;
    }

-    ggml_tensor* get_loha_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) {
+    ggml_tensor* get_loha_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_backend_t backend) {
        ggml_tensor* updown = nullptr;
        int index           = 0;
        while (true) {
@ -276,33 +327,33 @@ struct LoraModel : public GGMLRunner {

            auto iter = lora_tensors.find(hada_1_down_name);
            if (iter != lora_tensors.end()) {
-                hada_1_down = ggml_ext_cast_f32(ctx, iter->second);
+                hada_1_down = ggml_ext_cast_f32(ctx, backend, iter->second);
            }

            iter = lora_tensors.find(hada_1_up_name);
            if (iter != lora_tensors.end()) {
-                hada_1_up = ggml_ext_cast_f32(ctx, iter->second);
+                hada_1_up = ggml_ext_cast_f32(ctx, backend, iter->second);
            }

            iter = lora_tensors.find(hada_1_mid_name);
            if (iter != lora_tensors.end()) {
-                hada_1_mid = ggml_ext_cast_f32(ctx, iter->second);
+                hada_1_mid = ggml_ext_cast_f32(ctx, backend, iter->second);
                hada_1_up  = ggml_cont(ctx, ggml_transpose(ctx, hada_1_up));
            }

            iter = lora_tensors.find(hada_2_down_name);
            if (iter != lora_tensors.end()) {
-                hada_2_down = ggml_ext_cast_f32(ctx, iter->second);
+                hada_2_down = ggml_ext_cast_f32(ctx, backend, iter->second);
            }

            iter = lora_tensors.find(hada_2_up_name);
            if (iter != lora_tensors.end()) {
-                hada_2_up = ggml_ext_cast_f32(ctx, iter->second);
+                hada_2_up = ggml_ext_cast_f32(ctx, backend, iter->second);
            }

            iter = lora_tensors.find(hada_2_mid_name);
            if (iter != lora_tensors.end()) {
-                hada_2_mid = ggml_ext_cast_f32(ctx, iter->second);
+                hada_2_mid = ggml_ext_cast_f32(ctx, backend, iter->second);
                hada_2_up  = ggml_cont(ctx, ggml_transpose(ctx, hada_2_up));
            }

@ -351,7 +402,7 @@ struct LoraModel : public GGMLRunner {
        return updown;
    }

-    ggml_tensor* get_lokr_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) {
+    ggml_tensor* get_lokr_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_backend_t backend) {
        ggml_tensor* updown = nullptr;
        int index           = 0;
        while (true) {
@ -378,24 +429,24 @@ struct LoraModel : public GGMLRunner {

            auto iter = lora_tensors.find(lokr_w1_name);
            if (iter != lora_tensors.end()) {
-                lokr_w1 = ggml_ext_cast_f32(ctx, iter->second);
+                lokr_w1 = ggml_ext_cast_f32(ctx, backend, iter->second);
            }

            iter = lora_tensors.find(lokr_w2_name);
            if (iter != lora_tensors.end()) {
-                lokr_w2 = ggml_ext_cast_f32(ctx, iter->second);
+                lokr_w2 = ggml_ext_cast_f32(ctx, backend, iter->second);
            }

            int64_t rank = 1;
            if (lokr_w1 == nullptr) {
                iter = lora_tensors.find(lokr_w1_a_name);
                if (iter != lora_tensors.end()) {
-                    lokr_w1_a = ggml_ext_cast_f32(ctx, iter->second);
+                    lokr_w1_a = ggml_ext_cast_f32(ctx, backend, iter->second);
                }

                iter = lora_tensors.find(lokr_w1_b_name);
                if (iter != lora_tensors.end()) {
-                    lokr_w1_b = ggml_ext_cast_f32(ctx, iter->second);
+                    lokr_w1_b = ggml_ext_cast_f32(ctx, backend, iter->second);
                }

                if (lokr_w1_a == nullptr || lokr_w1_b == nullptr) {
@ -410,12 +461,12 @@ struct LoraModel : public GGMLRunner {
            if (lokr_w2 == nullptr) {
                iter = lora_tensors.find(lokr_w2_a_name);
                if (iter != lora_tensors.end()) {
-                    lokr_w2_a = ggml_ext_cast_f32(ctx, iter->second);
+                    lokr_w2_a = ggml_ext_cast_f32(ctx, backend, iter->second);
                }

                iter = lora_tensors.find(lokr_w2_b_name);
                if (iter != lora_tensors.end()) {
-                    lokr_w2_b = ggml_ext_cast_f32(ctx, iter->second);
+                    lokr_w2_b = ggml_ext_cast_f32(ctx, backend, iter->second);
                }

                if (lokr_w2_a == nullptr || lokr_w2_b == nullptr) {
@ -468,23 +519,23 @@ struct LoraModel : public GGMLRunner {
        return updown;
    }

-    ggml_tensor* get_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_tensor* model_tensor, bool with_lora_and_lokr = true) {
+    ggml_tensor* get_weight_diff(const std::string& model_tensor_name, ggml_backend_t backend, ggml_context* ctx, ggml_tensor* model_tensor, bool with_lora_and_lokr = true) {
        // lora
        ggml_tensor* diff = nullptr;
        if (with_lora_and_lokr) {
-            diff = get_lora_weight_diff(model_tensor_name, ctx);
+            diff = get_lora_weight_diff(model_tensor_name, ctx, backend);
        }
        // diff
        if (diff == nullptr) {
-            diff = get_raw_weight_diff(model_tensor_name, ctx);
+            diff = get_raw_weight_diff(model_tensor_name, ctx, backend);
        }
        // loha
        if (diff == nullptr) {
-            diff = get_loha_weight_diff(model_tensor_name, ctx);
+            diff = get_loha_weight_diff(model_tensor_name, ctx, backend);
        }
        // lokr
        if (diff == nullptr && with_lora_and_lokr) {
-            diff = get_lokr_weight_diff(model_tensor_name, ctx);
+            diff = get_lokr_weight_diff(model_tensor_name, ctx, backend);
        }
        if (diff != nullptr) {
            if (ggml_nelements(diff) < ggml_nelements(model_tensor)) {
@ -502,6 +553,7 @@ struct LoraModel : public GGMLRunner {
    }

    ggml_tensor* get_out_diff(ggml_context* ctx,
+                              ggml_backend_t backend,
                              ggml_tensor* x,
                              WeightAdapter::ForwardParams forward_params,
                              const std::string& model_tensor_name) {
@ -590,7 +642,7 @@ struct LoraModel : public GGMLRunner {
                }
                scale_value *= multiplier;

-                auto curr_out_diff = ggml_ext_lokr_forward(ctx, x, lokr_w1, lokr_w1_a, lokr_w1_b, lokr_w2, lokr_w2_a, lokr_w2_b, is_conv2d, forward_params.conv2d, scale_value);
+                auto curr_out_diff = ggml_ext_lokr_forward(ctx, backend, x, lokr_w1, lokr_w1_a, lokr_w1_b, lokr_w2, lokr_w2_a, lokr_w2_b, is_conv2d, forward_params.conv2d, scale_value);
                if (out_diff == nullptr) {
                    out_diff = curr_out_diff;
                } else {
@ -606,7 +658,7 @@ struct LoraModel : public GGMLRunner {
                if (lokr_w2)
                    applied_lora_tensors.insert(lokr_w2_name);
                if (lokr_w2_a)
-                    applied_lora_tensors.insert(lokr_w2_name);
+                    applied_lora_tensors.insert(lokr_w2_a_name);
                if (lokr_w2_b)
                    applied_lora_tensors.insert(lokr_w2_b_name);
                applied_lora_tensors.insert(alpha_name);
@ -747,11 +799,13 @@ struct LoraModel : public GGMLRunner {
        return out_diff;
    }

-    ggml_cgraph* build_lora_graph(const std::map<std::string, ggml_tensor*>& model_tensors, SDVersion version) {
+    ggml_cgraph* build_lora_graph(const std::map<std::string, ggml_tensor*>& model_tensors,
+                                  const std::set<std::string>& model_tensor_names,
+                                  SDVersion version) {
        size_t lora_graph_size = LORA_GRAPH_BASE_SIZE + lora_tensors.size() * 10;
        ggml_cgraph* gf        = ggml_new_graph_custom(compute_ctx, lora_graph_size, false);

-        preprocess_lora_tensors(model_tensors);
+        preprocess_lora_tensors(model_tensor_names);

        original_tensor_to_final_tensor.clear();
        applied_lora_tensors.clear();
@ -761,39 +815,43 @@ struct LoraModel : public GGMLRunner {
            ggml_tensor* model_tensor     = it.second;

            // lora
-            ggml_tensor* diff = get_weight_diff(model_tensor_name, compute_ctx, model_tensor);
+            ggml_tensor* diff = get_weight_diff(model_tensor_name, runtime_backend, compute_ctx, model_tensor);
            if (diff == nullptr) {
                continue;
            }

            ggml_tensor* original_tensor = model_tensor;
-            if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer)) {
+            if (!sd_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer)) {
                model_tensor = ggml_dup_tensor(compute_ctx, model_tensor);
                set_backend_tensor_data(model_tensor, original_tensor->data);
            }

            ggml_tensor* final_tensor;
            if (model_tensor->type != GGML_TYPE_F32 && model_tensor->type != GGML_TYPE_F16) {
-                final_tensor = ggml_ext_cast_f32(compute_ctx, model_tensor);
+                final_tensor = ggml_ext_cast_f32(compute_ctx, runtime_backend, model_tensor);
                final_tensor = ggml_add_inplace(compute_ctx, final_tensor, diff);
                final_tensor = ggml_cpy(compute_ctx, final_tensor, model_tensor);
            } else {
                final_tensor = ggml_add_inplace(compute_ctx, model_tensor, diff);
            }
            ggml_build_forward_expand(gf, final_tensor);
-            if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer)) {
+            if (!sd_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer)) {
                original_tensor_to_final_tensor[original_tensor] = final_tensor;
            }
        }
        return gf;
    }

-    void apply(std::map<std::string, ggml_tensor*> model_tensors, SDVersion version, int n_threads) {
+    void apply(std::map<std::string, ggml_tensor*> model_tensors,
+               const std::set<std::string>& model_tensor_names,
+               SDVersion version,
+               int n_threads,
+               bool warn_unused = true) {
        auto get_graph = [&]() -> ggml_cgraph* {
-            return build_lora_graph(model_tensors, version);
+            return build_lora_graph(model_tensors, model_tensor_names, version);
        };
-        GGMLRunner::compute<float>(get_graph, n_threads, false, true);
-        stat();
+        GGMLRunner::compute<float>(get_graph, n_threads, false, false, false, true);
+        stat(!warn_unused);
        for (auto item : original_tensor_to_final_tensor) {
            ggml_tensor* original_tensor = item.first;
            ggml_tensor* final_tensor    = item.second;
@ -804,6 +862,10 @@ struct LoraModel : public GGMLRunner {
        GGMLRunner::free_compute_buffer();
    }

+    // Convenience overload: derives the tensor-name set from model_tensors itself
+    // and forwards to the overload that takes the names explicitly.
+    void apply(std::map<std::string, ggml_tensor*> model_tensors, SDVersion version, int n_threads, bool warn_unused = true) {
+        const std::set<std::string> derived_names = tensor_names(model_tensors);
+        apply(model_tensors, derived_names, version, n_threads, warn_unused);
+    }
+
    void stat(bool at_runntime = false) {
        size_t total_lora_tensors_count   = 0;
        size_t applied_lora_tensors_count = 0;
@ -841,34 +903,35 @@ public:
        : lora_models(lora_models) {
    }

-    ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name, bool with_lora_and_lokr) {
+    ggml_tensor* patch_weight(ggml_context* ctx, ggml_backend_t backend, ggml_tensor* weight, const std::string& weight_name, bool with_lora_and_lokr) {  // adds each lora model's diff for weight_name onto weight; returns weight unchanged when no model yields a diff
        for (auto& lora_model : lora_models) {
-            ggml_tensor* diff = lora_model->get_weight_diff(weight_name, ctx, weight, with_lora_and_lokr);
+            ggml_tensor* diff = lora_model->get_weight_diff(weight_name, backend, ctx, weight, with_lora_and_lokr);  // note the argument order: backend precedes ctx in get_weight_diff
            if (diff == nullptr) {
                continue;
            }

            if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) {
-                weight = ggml_ext_cast_f32(ctx, weight);
+                weight = ggml_ext_cast_f32(ctx, backend, weight);  // weights that are neither F32 nor F16 are cast to F32 before the add
            }
            weight = ggml_add(ctx, weight, diff);
        }
        return weight;
    }

-    ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name) override {
-        return patch_weight(ctx, weight, weight_name, true);
+    ggml_tensor* patch_weight(ggml_context* ctx, ggml_backend_t backend, ggml_tensor* weight, const std::string& weight_name) override {  // forwards to the five-argument overload
+        return patch_weight(ctx, backend, weight, weight_name, true);  // with_lora_and_lokr = true
    }

    ggml_tensor* forward_with_lora(ggml_context* ctx,
+                                   ggml_backend_t backend,
                                   ggml_tensor* x,
                                   ggml_tensor* w,
                                   ggml_tensor* b,
                                   const std::string& prefix,
                                   WeightAdapter::ForwardParams forward_params) override {
-        w = patch_weight(ctx, w, prefix + "weight", false);
+        w = patch_weight(ctx, backend, w, prefix + "weight", false);
        if (b) {
-            b = patch_weight(ctx, b, prefix + "bias", false);
+            b = patch_weight(ctx, backend, b, prefix + "bias", false);
        }
        ggml_tensor* out;
        if (forward_params.op_type == ForwardParams::op_type_t::OP_LINEAR) {
@ -890,7 +953,7 @@ public:
                                   forward_params.conv2d.scale);
        }
        for (auto& lora_model : lora_models) {
-            ggml_tensor* out_diff = lora_model->get_out_diff(ctx, x, forward_params, prefix + "weight");
+            ggml_tensor* out_diff = lora_model->get_out_diff(ctx, backend, x, forward_params, prefix + "weight");
            if (out_diff == nullptr) {
                continue;
            }
@ -908,4 +971,4 @@ public:
    }
 };

-#endif  // __LORA_HPP__
+#endif  // __SD_MODEL_ADAPTER_LORA_HPP__
--- a/src/model/adapter/pmid.hpp
+++ b/src/model/adapter/pmid.hpp
@ -1,10 +1,12 @@
-#ifndef __PMI_HPP__
-#define __PMI_HPP__
+#ifndef __SD_MODEL_ADAPTER_PMID_HPP__
+#define __SD_MODEL_ADAPTER_PMID_HPP__

-#include "ggml_extend.hpp"
+#include "core/ggml_extend.hpp"

-#include "clip.hpp"
-#include "lora.hpp"
+#include "model/adapter/lora.hpp"
+#include "model/common/block.hpp"
+#include "model/te/clip.hpp"
+#include "model_loader.h"

 struct FuseBlock : public GGMLBlock {
    // network hparams
@ -411,13 +413,13 @@ public:

 public:
    PhotoMakerIDEncoder(ggml_backend_t backend,
-                        bool offload_params_to_cpu,
                        const String2TensorStorage& tensor_storage_map,
                        const std::string prefix,
                        SDVersion version                                   = VERSION_SDXL,
                        PMVersion pm_v                                      = PM_VERSION_1,
-                        float sty         = 20.f)
-        : GGMLRunner(backend, offload_params_to_cpu),
+                        float sty                                           = 20.f,
+                        std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
+        : GGMLRunner(backend, weight_manager),
          version(version),
          pm_version(pm_v),
          style_strength(sty) {
@ -556,24 +558,25 @@ public:
            return build_graph(id_pixel_values, prompt_embeds, class_tokens_mask, id_embeds);
        };

-        return take_or_empty(GGMLRunner::compute<float>(get_graph, n_threads, true));
+        return take_or_empty(GGMLRunner::compute<float>(get_graph, n_threads, true, true, true));
    }
 };

 struct PhotoMakerIDEmbed : public GGMLRunner {
    std::map<std::string, ggml_tensor*> tensors;
    std::string file_path;
-    ModelLoader* model_loader;
+    std::shared_ptr<ModelManager> model_manager;
+    ggml_backend_t params_backend = nullptr;
    bool load_failed              = false;
    bool applied                  = false;

    PhotoMakerIDEmbed(ggml_backend_t backend,
-                      bool offload_params_to_cpu,
-                      ModelLoader* ml,
+                      ggml_backend_t params_backend_,  // stored in the params_backend member
+                      std::shared_ptr<ModelManager> manager = std::make_shared<ModelManager>(),  // a fresh manager is created when the caller passes none
                      const std::string& file_path          = "",
                      const std::string& prefix             = "")
-        : file_path(file_path), GGMLRunner(backend, offload_params_to_cpu), model_loader(ml) {
-        if (!model_loader->init_from_file_and_convert_name(file_path, prefix)) {
+        : GGMLRunner(backend, manager), file_path(file_path), model_manager(std::move(manager)), params_backend(params_backend_) {  // the base class receives a copy before manager is moved into model_manager
+        if (model_manager == nullptr || !model_manager->loader().init_from_file_and_convert_name(file_path, prefix)) {  // a null manager or a failed loader init is only recorded in load_failed
            load_failed = true;
        }
    }
@ -614,11 +617,27 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
            return true;
        };

-        model_loader->load_tensors(on_new_tensor_cb, n_threads);
-        alloc_params_buffer();
-
-        dry_run = false;
-        model_loader->load_tensors(on_new_tensor_cb, n_threads);
+        model_manager->set_n_threads(n_threads);
+        ModelLoader& model_loader = model_manager->loader();
+        model_loader.load_tensors(on_new_tensor_cb);
+        if (!model_manager->register_param_tensors("PhotoMaker ID embeds",
+                                                   tensors,
+                                                   ModelManager::ResidencyMode::ParamBackend,
+                                                   runtime_backend,
+                                                   params_backend) ||
+            !model_manager->validate_registered_tensors()) {
+            LOG_ERROR("PhotoMaker ID embeds model manager registration failed");
+            return false;
+        }
+        std::vector<ggml_tensor*> id_embed_params;
+        id_embed_params.reserve(tensors.size());
+        for (const auto& pair : tensors) {
+            id_embed_params.push_back(pair.second);
+        }
+        if (!model_manager->prepare_params(id_embed_params)) {
+            LOG_ERROR("PhotoMaker ID embeds model manager prepare params failed");
+            return false;
+        }

        LOG_DEBUG("finished loading PhotoMaker ID Embeds ");
        return true;
@ -633,4 +652,4 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
    }
 };

-#endif  // __PMI_HPP__
+#endif  // __SD_MODEL_ADAPTER_PMID_HPP__
--- a/src/model/common/block.hpp
+++ b/src/model/common/block.hpp
@ -1,7 +1,9 @@
-#ifndef __COMMON_BLOCK_HPP__
-#define __COMMON_BLOCK_HPP__
+#ifndef __SD_MODEL_COMMON_BLOCK_HPP__
+#define __SD_MODEL_COMMON_BLOCK_HPP__

-#include "ggml_extend.hpp"
+#include "core/ggml_extend.hpp"
+#include "core/util.h"
+#include "ggml-backend.h"

 class DownSampleBlock : public GGMLBlock {
 protected:
@ -225,6 +227,37 @@ public:
    }
 };

+struct Mlp : public GGMLBlock {
+public:
+    // Two linear layers ("fc1", "fc2") with a GELU in between.
+    // Passing -1 for hidden_features or out_features makes that size default to in_features.
+    // Fixed choices relative to the reference module:
+    //   act_layer is always lambda: nn.GELU(approximate="tanh")
+    //   norm_layer is always None
+    //   use_conv is always False
+    Mlp(int64_t in_features,
+        int64_t hidden_features = -1,
+        int64_t out_features    = -1,
+        bool bias               = true) {
+        const int64_t hidden_dim = (hidden_features == -1) ? in_features : hidden_features;
+        const int64_t out_dim    = (out_features == -1) ? in_features : out_features;
+
+        blocks["fc1"] = std::shared_ptr<GGMLBlock>(new Linear(in_features, hidden_dim, bias));
+        blocks["fc2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_dim, out_dim, bias));
+    }
+
+    // x: [N, n_token, in_features]
+    // Applies fc1, then GELU, then fc2 and returns the result of fc2.
+    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
+        auto input_proj  = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
+        auto output_proj = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);
+
+        ggml_tensor* hidden    = input_proj->forward(ctx, x);
+        ggml_tensor* activated = ggml_ext_gelu(ctx->ggml_ctx, hidden, true);
+        return output_proj->forward(ctx, activated);
+    }
+};
+
 class FeedForward : public GGMLBlock {
 public:
    enum class Activation {
@ -248,9 +281,6 @@ public:
        float scale         = 1.f;
        if (precision_fix) {
            scale = 1.f / 128.f;
-#ifdef SD_USE_VULKAN
-            force_prec_f32 = true;
-#endif
        }
        // The purpose of the scale here is to prevent NaN issues in certain situations.
        // For example, when using Vulkan without enabling force_prec_f32,
@ -264,6 +294,9 @@ public:

        auto net_0 = std::dynamic_pointer_cast<UnaryBlock>(blocks["net.0"]);
        auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);
+        if (sd_backend_is(ctx->backend, "Vulkan")) {
+            net_2->set_force_prec_f32(true);
+        }

        x = net_0->forward(ctx, x);  // [ne3, ne2, ne1, inner_dim]
        x = net_2->forward(ctx, x);  // [ne3, ne2, ne1, dim_out]
@ -527,11 +560,11 @@ protected:
        params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
    }

-    float get_alpha() {
+    ggml_tensor* get_alpha(GGMLRunnerContext* ctx) {  // builds sigmoid(mix_factor) with ggml ops and returns the resulting tensor
        // image_only_indicator is always tensor([0.]) and since mix_factor.shape is [1,]
        // so learned_with_images is same as learned
-        float alpha = ggml_ext_backend_tensor_get_f32(params["mix_factor"]);
-        return sigmoid(alpha);
+        auto mix_factor = ggml_ext_cast_f32(ctx->ggml_ctx, ctx->backend, params["mix_factor"]);  // the parameter is created with type wtype, so it is cast to F32 first
+        return ggml_sigmoid(ctx->ggml_ctx, mix_factor);
    }

 public:
@ -545,11 +578,12 @@ public:
                         ggml_tensor* x_spatial,
                         ggml_tensor* x_temporal) {
        // image_only_indicator is always tensor([0.])
-        float alpha = get_alpha();
-        auto x      = ggml_add(ctx->ggml_ctx,
-                               ggml_ext_scale(ctx->ggml_ctx, x_spatial, alpha),
-                               ggml_ext_scale(ctx->ggml_ctx, x_temporal, 1.0f - alpha));
-        return x;
+        auto alpha = get_alpha(ctx);
+        return ggml_add(ctx->ggml_ctx,
+                        x_temporal,
+                        ggml_mul(ctx->ggml_ctx,
+                                 ggml_sub(ctx->ggml_ctx, x_spatial, x_temporal),
+                                 alpha));
    }
 };

@ -601,4 +635,4 @@ public:
    }
 };

-#endif  // __COMMON_BLOCK_HPP__
+#endif  // __SD_MODEL_COMMON_BLOCK_HPP__
--- a/src/model/common/rope.hpp
+++ b/src/model/common/rope.hpp
@ -1,10 +1,10 @@
-#ifndef __ROPE_HPP__
-#define __ROPE_HPP__
+#ifndef __SD_MODEL_COMMON_ROPE_HPP__
+#define __SD_MODEL_COMMON_ROPE_HPP__

 #include <algorithm>
 #include <cmath>
 #include <vector>
-#include "ggml_extend.hpp"
+#include "core/ggml_extend.hpp"

 namespace Rope {
    enum class EmbedNDLayout {
@ -111,6 +111,16 @@ namespace Rope {
        return txt_ids;
    }

+    // Generates text position ids: bs * context_len rows of axes_dim_num floats.
+    // In every row, entries 1 and 2 hold the token's index within its sequence
+    // (i % context_len); all other entries stay 0.
+    // axes_dim_num must be at least 3 because entries 1 and 2 are written unconditionally.
+    __STATIC_INLINE__ std::vector<std::vector<float>> gen_longcat_txt_ids(int bs, int context_len, int axes_dim_num) {
+        // Guard against an out-of-bounds write into rows that are too short.
+        GGML_ASSERT(axes_dim_num >= 3);
+        auto txt_ids = std::vector<std::vector<float>>(bs * context_len, std::vector<float>(axes_dim_num, 0.0f));
+        for (int i = 0; i < bs * context_len; i++) {
+            float token_index = static_cast<float>(i % context_len);
+            txt_ids[i][1]     = token_index;
+            txt_ids[i][2]     = token_index;
+        }
+        return txt_ids;
+    }
+
    __STATIC_INLINE__ std::vector<std::vector<float>> gen_flux_img_ids(int h,
                                                                       int w,
                                                                       int patch_size,
@ -122,7 +132,6 @@ namespace Rope {
                                                                       bool scale_rope = false) {
        int h_len = (h + (patch_size / 2)) / patch_size;
        int w_len = (w + (patch_size / 2)) / patch_size;
-
        std::vector<std::vector<float>> img_ids(h_len * w_len, std::vector<float>(axes_dim_num, 0.0));

        int h_start = h_offset;
@ -135,7 +144,6 @@ namespace Rope {

        std::vector<float> row_ids = linspace<float>(1.f * h_start, 1.f * h_start + h_len - 1, h_len);
        std::vector<float> col_ids = linspace<float>(1.f * w_start, 1.f * w_start + w_len - 1, w_len);
-
        for (int i = 0; i < h_len; ++i) {
            for (int j = 0; j < w_len; ++j) {
                img_ids[i * w_len + j][0] = 1.f * index;
@ -241,17 +249,111 @@ namespace Rope {
        return embed_nd(ids, bs, axis_thetas, axes_dim, wrap_dims, layout);
    }

+    // Interleaved multi-axis RoPE table.
+    //
+    // A full-width rope() table is built for each of the first three id axes. The axis-0
+    // table is the base; axes 1 and 2 then overwrite every third frequency slot (starting
+    // at slot 1 and slot 2 respectively) below min(mrope_section[axis] * 3, head_dim / 2).
+    // Each frequency slot is a group of 4 consecutive floats. An entry of axis_wrap_dims,
+    // when present for an axis, is forwarded to rope() for that axis.
+    //
+    // Returns the merged per-position table flattened with flatten().
+    // NOTE(review): assumes `ids` has at least 3 columns and that rope() yields
+    // head_dim / 2 slots of 4 floats per position -- confirm against transpose()/rope().
+    __STATIC_INLINE__ std::vector<float> embed_interleaved_mrope(const std::vector<std::vector<float>>& ids,
+                                                                 int bs,
+                                                                 float theta,
+                                                                 int head_dim,
+                                                                 const std::vector<int>& mrope_section,
+                                                                 const std::vector<std::vector<int>>& axis_wrap_dims = {}) {
+        GGML_ASSERT(bs > 0);
+        GGML_ASSERT(head_dim % 2 == 0);
+        GGML_ASSERT(mrope_section.size() >= 3);
+
+        const int num_axes         = 3;
+        const int slot_width       = 4;
+        const int num_freqs        = head_dim / 2;
+        const size_t num_positions = bs * (ids.size() / bs);
+
+        std::vector<std::vector<float>> ids_by_axis = transpose(ids);
+
+        // One rope table per axis, each built over the full head_dim.
+        std::vector<std::vector<std::vector<float>>> tables;
+        tables.reserve(num_axes);
+        for (int a = 0; a < num_axes; ++a) {
+            std::vector<int> wrap;
+            if (a < static_cast<int>(axis_wrap_dims.size())) {
+                wrap = axis_wrap_dims[a];
+            }
+            tables.push_back(rope(ids_by_axis[a], head_dim, theta, wrap));
+        }
+
+        // Start from axis 0 and splice in the strided slots owned by axes 1 and 2.
+        // The two axes touch disjoint slots, so the visiting order does not matter.
+        std::vector<std::vector<float>> merged = tables[0];
+        for (size_t p = 0; p < num_positions; ++p) {
+            for (int a = 1; a < num_axes; ++a) {
+                const int limit = std::min<int>(mrope_section[a] * 3, num_freqs);
+                for (int f = a; f < limit; f += 3) {
+                    const int slot = slot_width * f;
+                    for (int k = 0; k < slot_width; ++k) {
+                        merged[p][slot + k] = tables[a][p][slot + k];
+                    }
+                }
+            }
+        }
+
+        return flatten(merged);
+    }
+
+    // Builds an interleaved 2D rotary table for a height x width grid (row-major positions).
+    //
+    // Coordinates along each axis are spread evenly over [0, scale]; an axis with a single
+    // cell uses 0. Each axis gets dim / 2 rotary dims through rope(). When ref_grid_h and
+    // ref_grid_w are both positive (and dim / 2 > 2), each axis' theta is multiplied by
+    // (size / ref_size) ^ (dim_axis / (dim_axis - 2)).
+    //
+    // Returns height * width * (dim / 2) * 4 floats: per position, 4-float slots alternate
+    // between the x table (even slots) and the y table (odd slots).
+    // NOTE(review): assumes rope() returns (dim_axis / 2) * 4 floats per position -- confirm.
+    __STATIC_INLINE__ std::vector<float> embed_2d_interleaved(int height,
+                                                              int width,
+                                                              int dim,
+                                                              float theta    = 10000.f,
+                                                              float scale    = 16.f,
+                                                              int ref_grid_h = 0,
+                                                              int ref_grid_w = 0) {
+        // GGML_ASSERT instead of assert: plain assert is compiled out under NDEBUG, and
+        // the rest of this header validates its inputs with GGML_ASSERT.
+        GGML_ASSERT(dim % 4 == 0);
+        // Negative sizes would turn into enormous size_t values in the allocations below.
+        GGML_ASSERT(height >= 0 && width >= 0);
+
+        int half_dim      = dim / 2;
+        int dim_axis      = dim / 2;
+        int axis_half_dim = dim_axis / 2;
+
+        // Position count in size_t so large grids cannot overflow int.
+        const size_t num_positions = static_cast<size_t>(height) * static_cast<size_t>(width);
+
+        float h_ntk = 1.f;
+        float w_ntk = 1.f;
+        if (ref_grid_h > 0 && ref_grid_w > 0 && dim_axis > 2) {
+            float power = static_cast<float>(dim_axis) / static_cast<float>(dim_axis - 2);
+            h_ntk       = std::pow(static_cast<float>(height) / static_cast<float>(ref_grid_h), power);
+            w_ntk       = std::pow(static_cast<float>(width) / static_cast<float>(ref_grid_w), power);
+        }
+
+        std::vector<float> x_pos;
+        std::vector<float> y_pos;
+        x_pos.reserve(num_positions);
+        y_pos.reserve(num_positions);
+        for (int iy = 0; iy < height; ++iy) {
+            // A single-row / single-column grid has no extent, so its coordinate is 0.
+            float y = height == 1 ? 0.f : scale * static_cast<float>(iy) / static_cast<float>(height - 1);
+            for (int ix = 0; ix < width; ++ix) {
+                float x = width == 1 ? 0.f : scale * static_cast<float>(ix) / static_cast<float>(width - 1);
+                x_pos.push_back(x);
+                y_pos.push_back(y);
+            }
+        }
+
+        auto x_emb = rope(x_pos, dim_axis, theta * w_ntk);
+        auto y_emb = rope(y_pos, dim_axis, theta * h_ntk);
+
+        std::vector<float> out(num_positions * half_dim * 4);
+        for (size_t pos = 0; pos < num_positions; ++pos) {
+            for (int i = 0; i < axis_half_dim; ++i) {
+                // Slot 2i takes the i-th x frequency, slot 2i + 1 the i-th y frequency.
+                int jx        = 2 * i;
+                int jy        = 2 * i + 1;
+                size_t base_x = pos * half_dim * 4 + static_cast<size_t>(jx) * 4;
+                size_t base_y = pos * half_dim * 4 + static_cast<size_t>(jy) * 4;
+                size_t axis   = static_cast<size_t>(i) * 4;
+                for (int k = 0; k < 4; ++k) {
+                    out[base_x + k] = x_emb[pos][axis + k];
+                    out[base_y + k] = y_emb[pos][axis + k];
+                }
+            }
+        }
+        return out;
+    }
+
    __STATIC_INLINE__ std::vector<std::vector<float>> gen_refs_ids(int patch_size,
                                                                   int bs,
                                                                   int axes_dim_num,
+                                                                   int start_index,
                                                                   const std::vector<ggml_tensor*>& ref_latents,
                                                                   bool increase_ref_index,
                                                                   float ref_index_scale,
-                                                                   bool scale_rope) {
+                                                                   bool scale_rope,
+                                                                   int base_offset = 0) {
        std::vector<std::vector<float>> ids;
        int curr_h_offset = 0;
        int curr_w_offset = 0;
-        int index         = 1;
+        int index         = start_index;
        for (ggml_tensor* ref : ref_latents) {
            int h_offset = 0;
            int w_offset = 0;
@ -270,8 +372,8 @@ namespace Rope {
                                            bs,
                                            axes_dim_num,
                                            static_cast<int>(index * ref_index_scale),
-                                            h_offset,
-                                            w_offset,
+                                            h_offset + base_offset,
+                                            w_offset + base_offset,
                                            scale_rope);
            ids          = concat_ids(ids, ref_ids, bs);

@ -294,13 +396,17 @@ namespace Rope {
                                                                   std::set<int> txt_arange_dims,
                                                                   const std::vector<ggml_tensor*>& ref_latents,
                                                                   bool increase_ref_index,
-                                                                   float ref_index_scale) {
-        auto txt_ids = gen_flux_txt_ids(bs, context_len, axes_dim_num, txt_arange_dims);
-        auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num);
+                                                                   float ref_index_scale,
+                                                                   bool is_longcat) {
+        int x_index = is_longcat ? 1 : 0;
+
+        auto txt_ids = is_longcat ? gen_longcat_txt_ids(bs, context_len, axes_dim_num) : gen_flux_txt_ids(bs, context_len, axes_dim_num, txt_arange_dims);
+        int offset   = is_longcat ? context_len : 0;
+        auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num, x_index, offset, offset);

        auto ids = concat_ids(txt_ids, img_ids, bs);
        if (ref_latents.size() > 0) {
-            auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, ref_latents, increase_ref_index, ref_index_scale, false);
+            auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, x_index + 1, ref_latents, increase_ref_index, ref_index_scale, false, offset);
            ids           = concat_ids(ids, refs_ids, bs);
        }
        return ids;
@ -319,7 +425,8 @@ namespace Rope {
                                                     int theta,
                                                     bool circular_h,
                                                     bool circular_w,
-                                                     const std::vector<int>& axes_dim) {
+                                                     const std::vector<int>& axes_dim,
+                                                     bool is_longcat) {
        std::vector<std::vector<float>> ids = gen_flux_ids(h,
                                                           w,
                                                           patch_size,
@ -329,7 +436,8 @@ namespace Rope {
                                                           txt_arange_dims,
                                                           ref_latents,
                                                           increase_ref_index,
-                                                           ref_index_scale);
+                                                           ref_index_scale,
+                                                           is_longcat);
        std::vector<std::vector<int>> wrap_dims;
        if ((circular_h || circular_w) && bs > 0 && axes_dim.size() >= 3) {
            int h_len = (h + (patch_size / 2)) / patch_size;
@ -394,7 +502,7 @@ namespace Rope {
        auto img_ids     = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num, 0, 0, 0, true);
        auto ids         = concat_ids(txt_ids_repeated, img_ids, bs);
        if (ref_latents.size() > 0) {
-            auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, ref_latents, increase_ref_index, 1.f, true);
+            auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, 1, ref_latents, increase_ref_index, 1.f, true);
            ids           = concat_ids(ids, refs_ids, bs);
        }
        return ids;
@ -462,6 +570,52 @@ namespace Rope {
        return embed_nd(ids, bs, static_cast<float>(theta), axes_dim, wrap_dims);
    }

+    // Position ids for the "lens" layout: image tokens first, text tokens after.
+    //
+    // Image ids come from gen_flux_img_ids() with patch size 1 and 3 axes. Text ids are
+    // context_len values from linspace(start, start + context_len, context_len), copied
+    // onto all three axes and repeated for each of the bs batch entries, where start is
+    // max(h / 2, w / 2) when scale_rope is set and 0 otherwise.
+    // NOTE(review): if linspace() includes its end point, the text step is
+    // context_len / (context_len - 1) rather than 1 -- confirm this is intended.
+    __STATIC_INLINE__ std::vector<std::vector<float>> gen_lens_ids(int h,
+                                                                   int w,
+                                                                   int bs,
+                                                                   int context_len,
+                                                                   bool scale_rope = true) {
+        const int txt_origin = scale_rope ? std::max(h / 2, w / 2) : 0;
+        const auto txt_pos   = linspace<float>(1.f * txt_origin, 1.f * context_len + txt_origin, context_len);
+
+        // One 3-axis row per (batch, token); every axis carries the same position value.
+        std::vector<std::vector<float>> txt_rows(bs * context_len, std::vector<float>(3));
+        for (int b = 0; b < bs; ++b) {
+            const size_t row_base = b * txt_pos.size();
+            for (size_t t = 0; t < txt_pos.size(); ++t) {
+                txt_rows[row_base + t] = {txt_pos[t], txt_pos[t], txt_pos[t]};
+            }
+        }
+
+        auto img_rows = gen_flux_img_ids(h, w, 1, bs, 3, 0, 0, 0, scale_rope);
+        return concat_ids(img_rows, txt_rows, bs);
+    }
+
+    // Rotary position table for the "lens" layout.
+    //
+    // Generates ids with gen_lens_ids(h, w, bs, context_len, true) and feeds them to
+    // embed_nd(). When circular_h or circular_w is requested (and bs > 0 with at least
+    // three axes), a per-axis wrap table is passed along: the first h * w positions get
+    // wrap length h on axis 1 and/or w on axis 2; every other entry stays 0.
+    __STATIC_INLINE__ std::vector<float> gen_lens_pe(int h,
+                                                     int w,
+                                                     int bs,
+                                                     int context_len,
+                                                     int theta,
+                                                     bool circular_h,
+                                                     bool circular_w,
+                                                     const std::vector<int>& axes_dim) {
+        std::vector<std::vector<float>> ids = gen_lens_ids(h, w, bs, context_len, true);
+
+        std::vector<std::vector<int>> wrap_dims;
+        const bool wants_wrap = circular_h || circular_w;
+        if (wants_wrap && bs > 0 && axes_dim.size() >= 3) {
+            const size_t pos_len    = ids.size() / bs;
+            const size_t img_tokens = static_cast<size_t>(h) * static_cast<size_t>(w);
+            wrap_dims.assign(axes_dim.size(), std::vector<int>(pos_len, 0));
+
+            // Only the leading image tokens wrap; the text tokens keep 0.
+            if (circular_h) {
+                for (size_t tok = 0; tok < img_tokens; ++tok) {
+                    wrap_dims[1][tok] = h;
+                }
+            }
+            if (circular_w) {
+                for (size_t tok = 0; tok < img_tokens; ++tok) {
+                    wrap_dims[2][tok] = w;
+                }
+            }
+        }
+
+        return embed_nd(ids, bs, static_cast<float>(theta), axes_dim, wrap_dims);
+    }
+
    __STATIC_INLINE__ std::vector<std::vector<float>> gen_ernie_image_ids(int h,
                                                                          int w,
                                                                          int patch_size,
@ -753,4 +907,4 @@ namespace Rope {
    }
 };  // namespace Rope

-#endif  // __ROPE_HPP__
+#endif  // __SD_MODEL_COMMON_ROPE_HPP__
--- a/src/model/diffusion/anima.hpp
+++ b/src/model/diffusion/anima.hpp
@ -1,18 +1,61 @@
-#ifndef __ANIMA_HPP__
-#define __ANIMA_HPP__
+#ifndef __SD_MODEL_DIFFUSION_ANIMA_HPP__
+#define __SD_MODEL_DIFFUSION_ANIMA_HPP__

+#include <algorithm>
 #include <cmath>
 #include <memory>
 #include <utility>
 #include <vector>

-#include "common_block.hpp"
-#include "flux.hpp"
-#include "rope.hpp"
+#include "model/common/block.hpp"
+#include "model/common/rope.hpp"
+#include "model/diffusion/flux.hpp"
+#include "model/diffusion/model.hpp"

 namespace Anima {
    constexpr int ANIMA_GRAPH_SIZE = 65536;

+    // Hyper-parameters of the Anima network. The defaults below are used unless
+    // detect_from_weights() overrides num_layers.
+    struct AnimaConfig {
+        int64_t in_channels       = 16;
+        int64_t out_channels      = 16;
+        int64_t hidden_size       = 2048;
+        int64_t text_embed_dim    = 1024;
+        int64_t num_heads         = 16;
+        int64_t head_dim          = 128;
+        int patch_size            = 2;
+        int64_t num_layers        = 28;
+        std::vector<int> axes_dim = {44, 42, 42};
+        int theta                 = 10000;
+
+        // Returns a default config whose num_layers is inferred from tensor names.
+        //
+        // Every name containing "<prefix>.blocks.<id>." (or "blocks.<id>." when prefix is
+        // empty) contributes a layer id parsed with atoll(); num_layers becomes the
+        // largest id + 1. When no such name exists the default num_layers is kept and
+        // nothing is logged.
+        static AnimaConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
+            const std::string layer_tag = prefix.empty() ? "blocks." : prefix + ".blocks.";
+
+            int64_t layer_count = 0;
+            for (const auto& entry : tensor_storage_map) {
+                const std::string& tensor_name = entry.first;
+
+                const size_t tag_pos = tensor_name.find(layer_tag);
+                if (tag_pos == std::string::npos) {
+                    continue;
+                }
+                // The layer id is the text between the tag and the next '.'.
+                const size_t id_begin = tag_pos + layer_tag.size();
+                const size_t id_end   = tensor_name.find('.', id_begin);
+                if (id_end == std::string::npos) {
+                    continue;
+                }
+                const int64_t layer_id = atoll(tensor_name.substr(id_begin, id_end - id_begin).c_str());
+                layer_count            = std::max(layer_count, layer_id + 1);
+            }
+
+            AnimaConfig config;
+            if (layer_count <= 0) {
+                return config;
+            }
+            config.num_layers = layer_count;
+            LOG_DEBUG("anima: num_layers = %" PRId64 ", hidden_size = %" PRId64 ", num_heads = %" PRId64 ", head_dim = %" PRId64,
+                      config.num_layers,
+                      config.hidden_size,
+                      config.num_heads,
+                      config.head_dim);
+            return config;
+        }
+    };
+
    __STATIC_INLINE__ ggml_tensor* apply_gate(ggml_context* ctx,
                                              ggml_tensor* x,
                                              ggml_tensor* gate) {
@ -417,31 +460,22 @@ namespace Anima {

    struct AnimaNet : public GGMLBlock {
    public:
-        int64_t in_channels       = 16;
-        int64_t out_channels      = 16;
-        int64_t hidden_size       = 2048;
-        int64_t text_embed_dim    = 1024;
-        int64_t num_heads         = 16;
-        int64_t head_dim          = 128;
-        int patch_size            = 2;
-        int64_t num_layers        = 28;
-        std::vector<int> axes_dim = {44, 42, 42};
-        int theta                 = 10000;
+        AnimaConfig config;

    public:
        AnimaNet() = default;
-        explicit AnimaNet(int64_t num_layers)
-            : num_layers(num_layers) {
-            blocks["x_embedder"]       = std::make_shared<XEmbedder>((in_channels + 1) * patch_size * patch_size, hidden_size);
-            blocks["t_embedder"]       = std::make_shared<TimestepEmbedder>(hidden_size, hidden_size * 3);
-            blocks["t_embedding_norm"] = std::make_shared<RMSNorm>(hidden_size, 1e-6f);
-            for (int i = 0; i < num_layers; i++) {
-                blocks["blocks." + std::to_string(i)] = std::make_shared<TransformerBlock>(hidden_size,
-                                                                                           text_embed_dim,
-                                                                                           num_heads,
-                                                                                           head_dim);
+        explicit AnimaNet(AnimaConfig config)
+            : config(config) {
+            blocks["x_embedder"]       = std::make_shared<XEmbedder>((config.in_channels + 1) * config.patch_size * config.patch_size, config.hidden_size);
+            blocks["t_embedder"]       = std::make_shared<TimestepEmbedder>(config.hidden_size, config.hidden_size * 3);
+            blocks["t_embedding_norm"] = std::make_shared<RMSNorm>(config.hidden_size, 1e-6f);
+            for (int i = 0; i < config.num_layers; i++) {
+                blocks["blocks." + std::to_string(i)] = std::make_shared<TransformerBlock>(config.hidden_size,
+                                                                                           config.text_embed_dim,
+                                                                                           config.num_heads,
+                                                                                           config.head_dim);
            }
-            blocks["final_layer"] = std::make_shared<FinalLayer>(hidden_size, patch_size, out_channels);
+            blocks["final_layer"] = std::make_shared<FinalLayer>(config.hidden_size, config.patch_size, config.out_channels);
            blocks["llm_adapter"] = std::make_shared<LLMAdapter>(1024, 1024, 1024, 6, 16);
        }

@ -468,11 +502,11 @@ namespace Anima {
            auto padding_mask = ggml_ext_zeros(ctx->ggml_ctx, x->ne[0], x->ne[1], 1, x->ne[3]);
            x                 = ggml_concat(ctx->ggml_ctx, x, padding_mask, 2);  // [N, C + 1, H, W]

-            x = DiT::pad_and_patchify(ctx, x, patch_size, patch_size);  // [N, h*w, (C+1)*ph*pw]
+            x = DiT::pad_and_patchify(ctx, x, config.patch_size, config.patch_size);  // [N, h*w, (C+1)*ph*pw]

            x = x_embedder->forward(ctx, x);

-            auto timestep_proj     = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep, static_cast<int>(hidden_size));
+            auto timestep_proj     = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep, static_cast<int>(config.hidden_size));
            auto temb              = t_embedder->forward(ctx, timestep_proj);
            auto embedded_timestep = t_embedding_norm->forward(ctx, timestep_proj);

@ -499,53 +533,40 @@ namespace Anima {
                encoder_hidden_states = adapted_context;
            }

-            for (int i = 0; i < num_layers; i++) {
+            sd::ggml_graph_cut::mark_graph_cut(x, "anima.prelude", "x");
+            sd::ggml_graph_cut::mark_graph_cut(embedded_timestep, "anima.prelude", "embedded_timestep");
+            sd::ggml_graph_cut::mark_graph_cut(temb, "anima.prelude", "temb");
+            sd::ggml_graph_cut::mark_graph_cut(encoder_hidden_states, "anima.prelude", "context");
+
+            for (int i = 0; i < config.num_layers; i++) {
                auto block = std::dynamic_pointer_cast<TransformerBlock>(blocks["blocks." + std::to_string(i)]);
                x          = block->forward(ctx, x, encoder_hidden_states, embedded_timestep, temb, image_pe);
+                sd::ggml_graph_cut::mark_graph_cut(x, "anima.blocks." + std::to_string(i), "x");
            }

            x = final_layer->forward(ctx, x, embedded_timestep, temb);  // [N, h*w, ph*pw*C]

-            x = DiT::unpatchify_and_crop(ctx->ggml_ctx, x, H, W, patch_size, patch_size, false);  // [N, C, H, W]
+            x = DiT::unpatchify_and_crop(ctx->ggml_ctx, x, H, W, config.patch_size, config.patch_size, false);  // [N, C, H, W]

            return x;
        }
    };

-    struct AnimaRunner : public GGMLRunner {
+    struct AnimaRunner : public DiffusionModelRunner {
    public:
        std::vector<float> image_pe_vec;
        std::vector<float> adapter_q_pe_vec;
        std::vector<float> adapter_k_pe_vec;
+        AnimaConfig config;
        AnimaNet net;

        AnimaRunner(ggml_backend_t backend,
-                    bool offload_params_to_cpu,
                    const String2TensorStorage& tensor_storage_map      = {},
-                    const std::string prefix                       = "model.diffusion_model")
-            : GGMLRunner(backend, offload_params_to_cpu) {
-            int64_t num_layers    = 0;
-            std::string layer_tag = prefix + ".net.blocks.";
-            for (const auto& kv : tensor_storage_map) {
-                const std::string& tensor_name = kv.first;
-                size_t pos                     = tensor_name.find(layer_tag);
-                if (pos == std::string::npos) {
-                    continue;
-                }
-                size_t start = pos + layer_tag.size();
-                size_t end   = tensor_name.find('.', start);
-                if (end == std::string::npos) {
-                    continue;
-                }
-                int64_t layer_id = atoll(tensor_name.substr(start, end - start).c_str());
-                num_layers       = std::max(num_layers, layer_id + 1);
-            }
-            if (num_layers <= 0) {
-                num_layers = 28;
-            }
-            LOG_INFO("anima net layers: %" PRId64, num_layers);
-
-            net = AnimaNet(num_layers);
+                    const std::string prefix                            = "model.diffusion_model",
+                    std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
+            : DiffusionModelRunner(backend, prefix, weight_manager),
+              config(AnimaConfig::detect_from_weights(tensor_storage_map, prefix + ".net")) {
+            net = AnimaNet(config);
            net.init(params_ctx, tensor_storage_map, prefix + ".net");
        }

@ -553,7 +574,7 @@ namespace Anima {
            return "anima";
        }

-        void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
+        void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
            net.get_param_tensors(tensors, prefix + ".net");
        }

@ -592,7 +613,8 @@ namespace Anima {
                                          {},
                                          empty_ref_latents,
                                          false,
-                                          1.0f);
+                                          1.0f,
+                                          false);

            std::vector<float> axis_thetas = {
                static_cast<float>(theta) * calc_ntk_factor(t_extrapolation_ratio, axes_dim[0]),
@ -615,22 +637,22 @@ namespace Anima {
            GGML_ASSERT(x->ne[3] == 1);
            ggml_cgraph* gf = new_graph_custom(ANIMA_GRAPH_SIZE);

-            int64_t pad_h = (net.patch_size - x->ne[1] % net.patch_size) % net.patch_size;
-            int64_t pad_w = (net.patch_size - x->ne[0] % net.patch_size) % net.patch_size;
+            int64_t pad_h = (config.patch_size - x->ne[1] % config.patch_size) % config.patch_size;
+            int64_t pad_w = (config.patch_size - x->ne[0] % config.patch_size) % config.patch_size;
            int64_t h_pad = x->ne[1] + pad_h;
            int64_t w_pad = x->ne[0] + pad_w;

            image_pe_vec          = gen_anima_image_pe_vec(1,
                                                           static_cast<int>(h_pad),
                                                           static_cast<int>(w_pad),
-                                                           static_cast<int>(net.patch_size),
-                                                           net.theta,
-                                                           net.axes_dim,
+                                                           static_cast<int>(config.patch_size),
+                                                           config.theta,
+                                                           config.axes_dim,
                                                           4.0f,
                                                           4.0f,
                                                           1.0f);
-            int64_t image_pos_len = static_cast<int64_t>(image_pe_vec.size()) / (2 * 2 * (net.head_dim / 2));
-            auto image_pe         = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, net.head_dim / 2, image_pos_len);
+            int64_t image_pos_len = static_cast<int64_t>(image_pe_vec.size()) / (2 * 2 * (config.head_dim / 2));
+            auto image_pe         = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.head_dim / 2, image_pos_len);
            set_backend_tensor_data(image_pe, image_pe_vec.data());

            ggml_tensor* adapter_q_pe = nullptr;
@ -675,9 +697,22 @@ namespace Anima {
            auto get_graph = [&]() -> ggml_cgraph* {
                return build_graph(x, timesteps, context, t5_ids, t5_weights);
            };
-            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
+            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
+        }
+
+        sd::Tensor<float> compute(int n_threads,
+                                  const DiffusionParams& diffusion_params) override {
+            GGML_ASSERT(diffusion_params.x != nullptr);
+            GGML_ASSERT(diffusion_params.timesteps != nullptr);
+            const auto* extra = diffusion_extra_as<AnimaDiffusionExtra>(diffusion_params);
+            return compute(n_threads,
+                           *diffusion_params.x,
+                           *diffusion_params.timesteps,
+                           tensor_or_empty(diffusion_params.context),
+                           tensor_or_empty(extra->t5_ids),
+                           tensor_or_empty(extra->t5_weights));
        }
    };
 }  // namespace Anima

-#endif  // __ANIMA_HPP__
+#endif  // __SD_MODEL_DIFFUSION_ANIMA_HPP__
--- a/src/model/diffusion/control.hpp
+++ b/src/model/diffusion/control.hpp
@ -1,8 +1,9 @@
-#ifndef __CONTROL_HPP__
-#define __CONTROL_HPP__
+#ifndef __SD_MODEL_DIFFUSION_CONTROL_HPP__
+#define __SD_MODEL_DIFFUSION_CONTROL_HPP__

-#include "common_block.hpp"
-#include "model.h"
+#include "model/common/block.hpp"
+#include "model_loader.h"
+#include "model_manager.h"

 #define CONTROL_NET_GRAPH_SIZE 1536

@ -309,73 +310,47 @@ public:
 struct ControlNet : public GGMLRunner {
    SDVersion version = VERSION_SD1;
    ControlNetBlock control_net;
+    std::string weight_prefix;

-    ggml_backend_buffer_t control_buffer = nullptr;
-    ggml_context* control_ctx            = nullptr;
    std::vector<ggml_tensor*> control_outputs_ggml;
    ggml_tensor* guided_hint_output_ggml = nullptr;
    std::vector<sd::Tensor<float>> controls;
-    sd::Tensor<float> guided_hint;
    bool guided_hint_cached = false;
+    std::shared_ptr<ModelManager> owned_model_manager;
+    ggml_backend_t params_backend = nullptr;
+
+    static const char* guided_hint_cache_name() {
+        return "controlnet.guided_hint";
+    }

    ControlNet(ggml_backend_t backend,
-               bool offload_params_to_cpu,
+               ggml_backend_t params_backend_,
               const String2TensorStorage& tensor_storage_map      = {},
-               SDVersion version                              = VERSION_SD1)
-        : GGMLRunner(backend, offload_params_to_cpu), control_net(version) {
-        control_net.init(params_ctx, tensor_storage_map, "");
+               SDVersion version                                   = VERSION_SD1,
+               const std::string& prefix                           = "",
+               std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
+        : GGMLRunner(backend, weight_manager), version(version), control_net(version), weight_prefix(prefix), params_backend(params_backend_) {
+        control_net.init(params_ctx, tensor_storage_map, prefix);
    }

    ~ControlNet() override {
        free_control_ctx();
    }

-    void alloc_control_ctx(std::vector<ggml_tensor*> outs) {
-        ggml_init_params params;
-        params.mem_size   = static_cast<size_t>(outs.size() * ggml_tensor_overhead()) + 1024 * 1024;
-        params.mem_buffer = nullptr;
-        params.no_alloc   = true;
-        control_ctx       = ggml_init(params);
-
-        control_outputs_ggml.resize(outs.size() - 1);
-
-        size_t control_buffer_size = 0;
-
-        guided_hint_output_ggml = ggml_dup_tensor(control_ctx, outs[0]);
-        control_buffer_size += ggml_nbytes(guided_hint_output_ggml);
-
-        for (int i = 0; i < outs.size() - 1; i++) {
-            control_outputs_ggml[i] = ggml_dup_tensor(control_ctx, outs[i + 1]);
-            control_buffer_size += ggml_nbytes(control_outputs_ggml[i]);
-        }
-
-        control_buffer = ggml_backend_alloc_ctx_tensors(control_ctx, runtime_backend);
-
-        LOG_DEBUG("control buffer size %.2fMB", control_buffer_size * 1.f / 1024.f / 1024.f);
-    }
-
    void free_control_ctx() {
-        if (control_buffer != nullptr) {
-            ggml_backend_buffer_free(control_buffer);
-            control_buffer = nullptr;
-        }
-        if (control_ctx != nullptr) {
-            ggml_free(control_ctx);
-            control_ctx = nullptr;
-        }
        guided_hint_output_ggml = nullptr;
        guided_hint_cached      = false;
-        guided_hint             = {};
        control_outputs_ggml.clear();
        controls.clear();
+        free_cache_ctx_and_buffer();
    }

    std::string get_desc() override {
        return "control_net";
    }

-    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
-        control_net.get_param_tensors(tensors, prefix);
+    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) {
+        control_net.get_param_tensors(tensors, weight_prefix);
    }

    ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,
@ -391,11 +366,17 @@ struct ControlNet : public GGMLRunner {
        ggml_tensor* context   = make_optional_input(context_tensor);
        ggml_tensor* y         = make_optional_input(y_tensor);

+        guided_hint_output_ggml = nullptr;
+        control_outputs_ggml.clear();
+
        ggml_tensor* guided_hint_input = nullptr;
-        if (guided_hint_cached && !guided_hint.empty()) {
-            guided_hint_input = make_input(guided_hint);
-            hint              = nullptr;
-        } else {
+        if (guided_hint_cached) {
+            guided_hint_input = get_cache_tensor_by_name(guided_hint_cache_name());
+            if (guided_hint_input == nullptr) {
+                guided_hint_cached = false;
+            }
+        }
+        if (guided_hint_input == nullptr) {
            hint = make_input(hint_tensor);
        }

@ -409,13 +390,19 @@ struct ControlNet : public GGMLRunner {
                                        context,
                                        y);

-        if (control_ctx == nullptr) {
-            alloc_control_ctx(outs);
+        if (guided_hint_input == nullptr && !outs.empty()) {
+            guided_hint_output_ggml = outs[0];
+            ggml_set_output(guided_hint_output_ggml);
+            cache(guided_hint_cache_name(), guided_hint_output_ggml);
+            ggml_build_forward_expand(gf, guided_hint_output_ggml);
        }

-        ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[0], guided_hint_output_ggml));
-        for (int i = 0; i < outs.size() - 1; i++) {
-            ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[i + 1], control_outputs_ggml[i]));
+        control_outputs_ggml.reserve(outs.size() > 0 ? outs.size() - 1 : 0);
+        for (size_t i = 1; i < outs.size(); i++) {
+            ggml_tensor* control_output = outs[i];
+            ggml_set_output(control_output);
+            ggml_build_forward_expand(gf, control_output);
+            control_outputs_ggml.push_back(control_output);
        }

        return gf;
@ -435,15 +422,12 @@ struct ControlNet : public GGMLRunner {
            return build_graph(x, hint, timesteps, context, y);
        };

-        auto compute_result = GGMLRunner::compute<float>(get_graph, n_threads, false);
+        auto compute_result = GGMLRunner::compute<float>(get_graph, n_threads, false, false, false, true);
        if (!compute_result.has_value()) {
            return std::nullopt;
        }

-        if (guided_hint_output_ggml != nullptr) {
-            guided_hint = restore_trailing_singleton_dims(sd::make_sd_tensor_from_ggml<float>(guided_hint_output_ggml),
-                                                          4);
-        }
+        guided_hint_cached = get_cache_tensor_by_name(guided_hint_cache_name()) != nullptr;
        controls.clear();
        controls.reserve(control_outputs_ggml.size());
        for (ggml_tensor* control : control_outputs_ggml) {
@ -451,33 +435,41 @@ struct ControlNet : public GGMLRunner {
            GGML_ASSERT(!control_host.empty());
            controls.push_back(std::move(control_host));
        }
-        guided_hint_cached = true;
        return controls;
    }

    bool load_from_file(const std::string& file_path, int n_threads) {
        LOG_INFO("loading control net from '%s'", file_path.c_str());
-        alloc_params_buffer();
        std::map<std::string, ggml_tensor*> tensors;
        control_net.get_param_tensors(tensors);
-        std::set<std::string> ignore_tensors;

-        ModelLoader model_loader;
+        auto manager = std::dynamic_pointer_cast<ModelManager>(weight_manager.lock());
+        if (manager == nullptr) {
+            owned_model_manager = std::make_shared<ModelManager>();
+            weight_manager      = owned_model_manager;
+            manager             = owned_model_manager;
+        }
+
+        ModelLoader& model_loader = manager->loader();
        if (!model_loader.init_from_file_and_convert_name(file_path)) {
            LOG_ERROR("init control net model loader from file failed: '%s'", file_path.c_str());
            return false;
        }

-        bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads);
-
-        if (!success) {
-            LOG_ERROR("load control net tensors from model loader failed");
+        manager->set_n_threads(n_threads);
+        if (!manager->register_param_tensors("ControlNet",
+                                             std::move(tensors),
+                                             ModelManager::ResidencyMode::ParamBackend,
+                                             runtime_backend,
+                                             params_backend) ||
+            !manager->validate_registered_tensors()) {
+            LOG_ERROR("register control net tensors with model manager failed");
            return false;
        }

        LOG_INFO("control net model loaded");
-        return success;
+        return true;
    }
 };

-#endif  // __CONTROL_HPP__
+#endif  // __SD_MODEL_DIFFUSION_CONTROL_HPP__
--- a/src/model/diffusion/dit.hpp
+++ b/src/model/diffusion/dit.hpp
@ -1,7 +1,7 @@
-#ifndef __COMMON_DIT_HPP__
-#define __COMMON_DIT_HPP__
+#ifndef __SD_MODEL_DIFFUSION_DIT_HPP__
+#define __SD_MODEL_DIFFUSION_DIT_HPP__

-#include "ggml_extend.hpp"
+#include "core/ggml_extend.hpp"

 namespace DiT {
    inline ggml_tensor* patchify(ggml_context* ctx,
@ -103,6 +103,64 @@ namespace DiT {
        x         = ggml_ext_slice(ctx, x, 0, 0, W);               // [N, C, H, W]
        return x;
    }
+
+    inline ggml_tensor* patchify(ggml_context* ctx,
+                                 ggml_tensor* x,
+                                 int pt,
+                                 int ph,
+                                 int pw,
+                                 int64_t N = 1) {
+        // x: [N*C, T, H, W] (T, H and W must be divisible by pt, ph and pw respectively; asserted below)
+        // return: [N, t_len*h_len*w_len, C*pt*ph*pw], where t_len = T/pt, h_len = H/ph, w_len = W/pw
+        int64_t C     = x->ne[3] / N;
+        int64_t T     = x->ne[2];
+        int64_t H     = x->ne[1];
+        int64_t W     = x->ne[0];
+        int64_t t_len = T / pt;
+        int64_t h_len = H / ph;
+        int64_t w_len = W / pw;
+
+        GGML_ASSERT(C * N == x->ne[3]);
+        GGML_ASSERT(t_len * pt == T && h_len * ph == H && w_len * pw == W);
+
+        x = ggml_reshape_4d(ctx, x, pw * w_len, ph * h_len, pt, t_len * C * N);      // [N*C*t_len, pt, h_len*ph, w_len*pw]
+        x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));          // [N*C*t_len, h_len*ph, pt, w_len*pw]
+        x = ggml_reshape_4d(ctx, x, pw * w_len, pt, ph, h_len * t_len * C * N);      // [N*C*t_len*h_len, ph, pt, w_len*pw]
+        x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));          // [N*C*t_len*h_len, pt, ph, w_len*pw]
+        x = ggml_reshape_4d(ctx, x, pw, w_len, ph * pt, h_len * t_len * C * N);      // [N*C*t_len*h_len, pt*ph, w_len, pw]
+        x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));          // [N*C*t_len*h_len, w_len, pt*ph, pw]
+        x = ggml_reshape_4d(ctx, x, pw * ph * pt, w_len * h_len * t_len, C, N);      // [N, C, t_len*h_len*w_len, pt*ph*pw]
+        x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));          // [N, t_len*h_len*w_len, C, pt*ph*pw]
+        x = ggml_reshape_4d(ctx, x, pw * ph * pt * C, w_len * h_len * t_len, N, 1);  // [N, t_len*h_len*w_len, C*pt*ph*pw]
+        return x;
+    }
+
+    inline ggml_tensor* unpatchify(ggml_context* ctx,
+                                   ggml_tensor* x,
+                                   int64_t t_len,
+                                   int64_t h_len,
+                                   int64_t w_len,
+                                   int pt,
+                                   int ph,
+                                   int pw) {
+        // x: [N, t_len*h_len*w_len, pt*ph*pw*C] (C is the innermost factor of the last dim, as split by the first reshape)
+        // return: [N*C, t_len*pt, h_len*ph, w_len*pw]
+        int64_t N = x->ne[3];
+        int64_t C = x->ne[0] / pt / ph / pw;
+
+        GGML_ASSERT(C * pt * ph * pw == x->ne[0]);
+
+        x = ggml_reshape_4d(ctx, x, C, pw * ph * pt, w_len * h_len * t_len, N);  // [N, t_len*h_len*w_len, pt*ph*pw, C]
+        x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 1, 2, 0, 3));      // [N, C, t_len*h_len*w_len, pt*ph*pw]
+        x = ggml_reshape_4d(ctx, x, pw, ph * pt, w_len, h_len * t_len * C * N);  // [N*C*t_len*h_len, w_len, pt*ph, pw]
+        x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));      // [N*C*t_len*h_len, pt*ph, w_len, pw]
+        x = ggml_reshape_4d(ctx, x, pw * w_len, ph, pt, h_len * t_len * C * N);  // [N*C*t_len*h_len, pt, ph, w_len*pw]
+        x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));      // [N*C*t_len*h_len, ph, pt, w_len*pw]
+        x = ggml_reshape_4d(ctx, x, pw * w_len, pt, ph * h_len, t_len * C * N);  // [N*C*t_len, h_len*ph, pt, w_len*pw]
+        x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));      // [N*C*t_len, pt, h_len*ph, w_len*pw]
+        x = ggml_reshape_4d(ctx, x, pw * w_len, ph * h_len, pt * t_len, C * N);  // [N*C, t_len*pt, h_len*ph, w_len*pw]
+        return x;
+    }
 }  // namespace DiT

-#endif  // __COMMON_DIT_HPP__
+#endif  // __SD_MODEL_DIFFUSION_DIT_HPP__
--- a/src/model/diffusion/ernie_image.hpp
+++ b/src/model/diffusion/ernie_image.hpp
@ -1,17 +1,88 @@
-#ifndef __SD_ERNIE_IMAGE_HPP__
-#define __SD_ERNIE_IMAGE_HPP__
+#ifndef __SD_MODEL_DIFFUSION_ERNIE_IMAGE_HPP__
+#define __SD_MODEL_DIFFUSION_ERNIE_IMAGE_HPP__

 #include <memory>
 #include <vector>

-#include "common_dit.hpp"
-#include "flux.hpp"
-#include "qwen_image.hpp"
-#include "rope.hpp"
+#include "model/common/rope.hpp"
+#include "model/diffusion/dit.hpp"
+#include "model/diffusion/flux.hpp"
+#include "model/diffusion/model.hpp"
+#include "model/diffusion/qwen_image.hpp"

 namespace ErnieImage {
    constexpr int ERNIE_IMAGE_GRAPH_SIZE = 40960;

+    // Hyper-parameters of the ERNIE-Image diffusion transformer.
+    // The member initializers are the fallback values; detect_from_weights() overrides
+    // whichever of them can be read off the tensor shapes of a checkpoint.
+    struct ErnieImageConfig {
+        int64_t hidden_size       = 4096;
+        int64_t num_heads         = 32;
+        int64_t num_layers        = 36;
+        int64_t ffn_hidden_size   = 12288;
+        int64_t in_channels       = 128;
+        int64_t out_channels      = 128;
+        int patch_size            = 1;
+        int64_t text_in_dim       = 3072;
+        int theta                 = 256;
+        std::vector<int> axes_dim = {32, 48, 48};
+        int axes_dim_sum          = 128;  // recomputed from axes_dim by detect_from_weights()
+        float eps                 = 1e-6f;
+
+        // Builds a config by scanning every tensor whose name starts with `prefix`.
+        //
+        // tensor_storage_map: tensor name -> storage description (shape in `ne`, rank in `n_dims`).
+        // prefix:             only names starting with this prefix are inspected.
+        // returns:            a config where each detectable field reflects the checkpoint and
+        //                     every other field keeps its default.
+        //
+        // Values that are derived from more than one tensor (num_heads, out_channels) are only
+        // recorded inside the loop and resolved after it, so the result does not depend on the
+        // order in which the map yields the tensors.
+        static ErnieImageConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
+            ErnieImageConfig config;
+            config.num_layers         = 0;
+            int64_t detected_head_dim = 0;
+            int64_t detected_out_dim  = 0;
+            for (const auto& [name, tensor_storage] : tensor_storage_map) {
+                if (!starts_with(name, prefix)) {
+                    continue;
+                }
+                if (ends_with(name, "x_embedder.proj.weight") && tensor_storage.n_dims == 4) {
+                    config.patch_size  = static_cast<int>(tensor_storage.ne[0]);
+                    config.in_channels = tensor_storage.ne[2];
+                    config.hidden_size = tensor_storage.ne[3];
+                } else if (ends_with(name, "text_proj.weight") && tensor_storage.n_dims == 2) {
+                    config.text_in_dim = tensor_storage.ne[0];
+                } else if (ends_with(name, "layers.0.self_attention.norm_q.weight")) {
+                    detected_head_dim = tensor_storage.ne[0];
+                } else if (ends_with(name, "layers.0.mlp.gate_proj.weight") && tensor_storage.n_dims == 2) {
+                    config.ffn_hidden_size = tensor_storage.ne[1];
+                } else if (ends_with(name, "final_linear.weight") && tensor_storage.n_dims == 2) {
+                    // Only remember the raw output width here: patch_size comes from
+                    // x_embedder.proj.weight, which may not have been visited yet.
+                    detected_out_dim = tensor_storage.ne[1];
+                }
+
+                // Any "layers.<i>." tensor raises the layer count to at least i + 1.
+                size_t pos = name.find("layers.");
+                if (pos != std::string::npos) {
+                    auto items = split_string(name.substr(pos), '.');
+                    if (items.size() > 1) {
+                        int block_index = atoi(items[1].c_str());
+                        if (block_index + 1 > config.num_layers) {
+                            config.num_layers = block_index + 1;
+                        }
+                    }
+                }
+            }
+            if (config.num_layers == 0) {
+                // No "layers.*" tensor was seen: fall back to the default depth.
+                config.num_layers = 36;
+            }
+            if (detected_head_dim > 0) {
+                config.num_heads = config.hidden_size / detected_head_dim;
+            }
+            if (detected_out_dim > 0) {
+                // final_linear emits patch_size * patch_size * out_channels values per token.
+                int64_t patch_area = static_cast<int64_t>(config.patch_size) * config.patch_size;
+                if (patch_area > 0) {
+                    config.out_channels = detected_out_dim / patch_area;
+                }
+            }
+            config.axes_dim_sum = 0;
+            for (int axis_dim : config.axes_dim) {
+                config.axes_dim_sum += axis_dim;
+            }
+            LOG_DEBUG("ernie_image: num_layers = %" PRId64 ", hidden_size = %" PRId64 ", num_heads = %" PRId64 ", ffn_hidden_size = %" PRId64 ", in_channels = %" PRId64 ", out_channels = %" PRId64,
+                      config.num_layers,
+                      config.hidden_size,
+                      config.num_heads,
+                      config.ffn_hidden_size,
+                      config.in_channels,
+                      config.out_channels);
+            return config;
+        }
+    };
+
    __STATIC_INLINE__ ggml_tensor* timestep_embedding_sin_cos(ggml_context* ctx,
                                                              ggml_tensor* timesteps,
                                                              int dim,
@ -207,51 +278,36 @@ namespace ErnieImage {
        }
    };

-    struct ErnieImageParams {
-        int64_t hidden_size       = 4096;
-        int64_t num_heads         = 32;
-        int64_t num_layers        = 36;
-        int64_t ffn_hidden_size   = 12288;
-        int64_t in_channels       = 128;
-        int64_t out_channels      = 128;
-        int patch_size            = 1;
-        int64_t text_in_dim       = 3072;
-        int theta                 = 256;
-        std::vector<int> axes_dim = {32, 48, 48};
-        int axes_dim_sum          = 128;
-        float eps                 = 1e-6f;
-    };
-
    class ErnieImageModel : public GGMLBlock {
    public:
-        ErnieImageParams params;
+        ErnieImageConfig config;

        ErnieImageModel() = default;
-        ErnieImageModel(ErnieImageParams params)
-            : params(params) {
-            blocks["x_embedder.proj"] = std::make_shared<Conv2d>(params.in_channels,
-                                                                 params.hidden_size,
-                                                                 std::pair<int, int>{params.patch_size, params.patch_size},
-                                                                 std::pair<int, int>{params.patch_size, params.patch_size},
+        ErnieImageModel(ErnieImageConfig config)
+            : config(config) {
+            blocks["x_embedder.proj"] = std::make_shared<Conv2d>(config.in_channels,
+                                                                 config.hidden_size,
+                                                                 std::pair<int, int>{config.patch_size, config.patch_size},
+                                                                 std::pair<int, int>{config.patch_size, config.patch_size},
                                                                 std::pair<int, int>{0, 0},
                                                                 std::pair<int, int>{1, 1},
                                                                 true);
-            if (params.text_in_dim != params.hidden_size) {
-                blocks["text_proj"] = std::make_shared<Linear>(params.text_in_dim, params.hidden_size, false);
+            if (config.text_in_dim != config.hidden_size) {
+                blocks["text_proj"] = std::make_shared<Linear>(config.text_in_dim, config.hidden_size, false);
            }
-            blocks["time_embedding"]     = std::make_shared<Qwen::TimestepEmbedding>(params.hidden_size, params.hidden_size);
-            blocks["adaLN_modulation.1"] = std::make_shared<Linear>(params.hidden_size, 6 * params.hidden_size, true);
+            blocks["time_embedding"]     = std::make_shared<Qwen::TimestepEmbedding>(config.hidden_size, config.hidden_size);
+            blocks["adaLN_modulation.1"] = std::make_shared<Linear>(config.hidden_size, 6 * config.hidden_size, true);

-            for (int i = 0; i < params.num_layers; i++) {
-                blocks["layers." + std::to_string(i)] = std::make_shared<ErnieImageSharedAdaLNBlock>(params.hidden_size,
-                                                                                                     params.num_heads,
-                                                                                                     params.ffn_hidden_size,
-                                                                                                     params.eps);
+            for (int i = 0; i < config.num_layers; i++) {
+                blocks["layers." + std::to_string(i)] = std::make_shared<ErnieImageSharedAdaLNBlock>(config.hidden_size,
+                                                                                                     config.num_heads,
+                                                                                                     config.ffn_hidden_size,
+                                                                                                     config.eps);
            }

-            blocks["final_norm"]   = std::make_shared<ErnieImageAdaLNContinuous>(params.hidden_size, params.eps);
-            blocks["final_linear"] = std::make_shared<Linear>(params.hidden_size,
-                                                              params.patch_size * params.patch_size * params.out_channels,
+            blocks["final_norm"]   = std::make_shared<ErnieImageAdaLNContinuous>(config.hidden_size, config.eps);
+            blocks["final_linear"] = std::make_shared<Linear>(config.hidden_size,
+                                                              config.patch_size * config.patch_size * config.out_channels,
                                                              true);
        }

@ -264,12 +320,12 @@ namespace ErnieImage {
            // context: [N, text_tokens, 3072]
            // pe: [image_tokens + text_tokens, head_dim/2, 2, 2]
            GGML_ASSERT(context != nullptr);
-            GGML_ASSERT(x->ne[1] % params.patch_size == 0 && x->ne[0] % params.patch_size == 0);
+            GGML_ASSERT(x->ne[1] % config.patch_size == 0 && x->ne[0] % config.patch_size == 0);

            int64_t W     = x->ne[0];
            int64_t H     = x->ne[1];
-            int64_t Hp    = H / params.patch_size;
-            int64_t Wp    = W / params.patch_size;
+            int64_t Hp    = H / config.patch_size;
+            int64_t Wp    = W / config.patch_size;
            int64_t n_img = Hp * Wp;
            int64_t N     = x->ne[3];

@ -291,10 +347,12 @@ namespace ErnieImage {

            auto hidden_states = ggml_concat(ctx->ggml_ctx, img, txt, 1);  // [N, image_tokens + text_tokens, hidden_size]

-            auto sample = timestep_embedding_sin_cos(ctx->ggml_ctx, timestep, static_cast<int>(params.hidden_size));
+            auto sample = timestep_embedding_sin_cos(ctx->ggml_ctx, timestep, static_cast<int>(config.hidden_size));
            auto c      = time_embedding->forward(ctx, sample);  // [N, hidden_size]

            auto mod_params = adaLN_mod->forward(ctx, ggml_silu(ctx->ggml_ctx, c));  // [N, 6 * hidden_size]
+            sd::ggml_graph_cut::mark_graph_cut(hidden_states, "ernie_image.prelude", "hidden_states");
+            // sd::ggml_graph_cut::mark_graph_cut(mod_params, "ernie_image.prelude", "mod_params");
            auto chunks = ggml_ext_chunk(ctx->ggml_ctx, mod_params, 6, 0);
            std::vector<ggml_tensor*> temb;
            temb.reserve(6);
@ -302,9 +360,10 @@ namespace ErnieImage {
                temb.push_back(ggml_reshape_3d(ctx->ggml_ctx, chunk, chunk->ne[0], 1, chunk->ne[1]));  // [N, 1, hidden_size]
            }

-            for (int i = 0; i < params.num_layers; i++) {
+            for (int i = 0; i < config.num_layers; i++) {
                auto layer    = std::dynamic_pointer_cast<ErnieImageSharedAdaLNBlock>(blocks["layers." + std::to_string(i)]);
                hidden_states = layer->forward(ctx, hidden_states, pe, temb);
+                sd::ggml_graph_cut::mark_graph_cut(hidden_states, "ernie_image.layers." + std::to_string(i), "hidden_states");
            }

            hidden_states = final_norm->forward(ctx, hidden_states, c);
@ -315,74 +374,25 @@ namespace ErnieImage {
                                       patches,
                                       Hp,
                                       Wp,
-                                       params.patch_size,
-                                       params.patch_size,
+                                       config.patch_size,
+                                       config.patch_size,
                                       false);  // [N, out_channels, H, W]
            return out;
        }
    };

-    struct ErnieImageRunner : public GGMLRunner {
-        ErnieImageParams ernie_params;
+    struct ErnieImageRunner : public DiffusionModelRunner {
+        ErnieImageConfig config;
        ErnieImageModel ernie_image;
        std::vector<float> pe_vec;

        ErnieImageRunner(ggml_backend_t backend,
-                         bool offload_params_to_cpu,
                         const String2TensorStorage& tensor_storage_map      = {},
-                         const std::string prefix                       = "")
-            : GGMLRunner(backend, offload_params_to_cpu) {
-            ernie_params.num_layers = 0;
-            for (const auto& [name, tensor_storage] : tensor_storage_map) {
-                if (!starts_with(name, prefix)) {
-                    continue;
-                }
-                if (ends_with(name, "x_embedder.proj.weight") && tensor_storage.n_dims == 4) {
-                    ernie_params.patch_size  = static_cast<int>(tensor_storage.ne[0]);
-                    ernie_params.in_channels = tensor_storage.ne[2];
-                    ernie_params.hidden_size = tensor_storage.ne[3];
-                } else if (ends_with(name, "text_proj.weight") && tensor_storage.n_dims == 2) {
-                    ernie_params.text_in_dim = tensor_storage.ne[0];
-                } else if (ends_with(name, "layers.0.self_attention.norm_q.weight")) {
-                    int64_t head_dim       = tensor_storage.ne[0];
-                    ernie_params.num_heads = ernie_params.hidden_size / head_dim;
-                } else if (ends_with(name, "layers.0.mlp.gate_proj.weight") && tensor_storage.n_dims == 2) {
-                    ernie_params.ffn_hidden_size = tensor_storage.ne[1];
-                } else if (ends_with(name, "final_linear.weight") && tensor_storage.n_dims == 2) {
-                    int64_t out_dim           = tensor_storage.ne[1];
-                    ernie_params.out_channels = out_dim / ernie_params.patch_size / ernie_params.patch_size;
-                }
-
-                size_t pos = name.find("layers.");
-                if (pos != std::string::npos) {
-                    std::string layer_name = name.substr(pos);
-                    auto items             = split_string(layer_name, '.');
-                    if (items.size() > 1) {
-                        int block_index = atoi(items[1].c_str());
-                        if (block_index + 1 > ernie_params.num_layers) {
-                            ernie_params.num_layers = block_index + 1;
-                        }
-                    }
-                }
-            }
-            if (ernie_params.num_layers == 0) {
-                ernie_params.num_layers = 36;
-            }
-            ernie_params.axes_dim_sum = 0;
-            for (int axis_dim : ernie_params.axes_dim) {
-                ernie_params.axes_dim_sum += axis_dim;
-            }
-
-            LOG_INFO("ernie_image: layers = %" PRId64 ", hidden_size = %" PRId64 ", heads = %" PRId64
-                     ", ffn_hidden_size = %" PRId64 ", in_channels = %" PRId64 ", out_channels = %" PRId64,
-                     ernie_params.num_layers,
-                     ernie_params.hidden_size,
-                     ernie_params.num_heads,
-                     ernie_params.ffn_hidden_size,
-                     ernie_params.in_channels,
-                     ernie_params.out_channels);
-
-            ernie_image = ErnieImageModel(ernie_params);
+                         const std::string prefix                            = "",
+                         std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
+            : DiffusionModelRunner(backend, prefix, weight_manager),
+              config(ErnieImageConfig::detect_from_weights(tensor_storage_map, prefix)) {
+            ernie_image = ErnieImageModel(config);
            ernie_image.init(params_ctx, tensor_storage_map, prefix);
        }

@ -390,7 +400,7 @@ namespace ErnieImage {
            return "ernie_image";
        }

-        void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
+        void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
            ernie_image.get_param_tensors(tensors, prefix);
        }

@ -406,15 +416,15 @@ namespace ErnieImage {

            pe_vec      = Rope::gen_ernie_image_pe(static_cast<int>(x->ne[1]),
                                                   static_cast<int>(x->ne[0]),
-                                                   ernie_params.patch_size,
+                                                   config.patch_size,
                                                   static_cast<int>(x->ne[3]),
                                                   static_cast<int>(context->ne[1]),
-                                                   ernie_params.theta,
+                                                   config.theta,
                                                   circular_y_enabled,
                                                   circular_x_enabled,
-                                                   ernie_params.axes_dim);
-            int pos_len = static_cast<int>(pe_vec.size() / ernie_params.axes_dim_sum / 2);
-            auto pe     = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, ernie_params.axes_dim_sum, 1, pos_len, 2);
+                                                   config.axes_dim);
+            int pos_len = static_cast<int>(pe_vec.size() / config.axes_dim_sum / 2);
+            auto pe     = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, config.axes_dim_sum, 1, pos_len, 2);
            set_backend_tensor_data(pe, pe_vec.data());

            auto runner_ctx  = get_context();
@ -430,9 +440,19 @@ namespace ErnieImage {
            auto get_graph = [&]() -> ggml_cgraph* {
                return build_graph(x, timesteps, context);
            };
-            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
+            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
+        }
+
+        sd::Tensor<float> compute(int n_threads,
+                                  const DiffusionParams& diffusion_params) override {
+            // Generic DiffusionParams entry point: x and timesteps are required,
+            // the text context is forwarded through tensor_or_empty().
+            GGML_ASSERT(diffusion_params.x != nullptr);
+            GGML_ASSERT(diffusion_params.timesteps != nullptr);
+            const auto& latent         = *diffusion_params.x;
+            const auto& step_timesteps = *diffusion_params.timesteps;
+            return compute(n_threads, latent, step_timesteps, tensor_or_empty(diffusion_params.context));
+        }
    };
 }  // namespace ErnieImage

-#endif  // __SD_ERNIE_IMAGE_HPP__
+#endif  // __SD_MODEL_DIFFUSION_ERNIE_IMAGE_HPP__
--- a/src/model/diffusion/flux.hpp
+++ b/src/model/diffusion/flux.hpp
@ -1,17 +1,167 @@
-#ifndef __FLUX_HPP__
-#define __FLUX_HPP__
+#ifndef __SD_MODEL_DIFFUSION_FLUX_HPP__
+#define __SD_MODEL_DIFFUSION_FLUX_HPP__

 #include <memory>
 #include <vector>

-#include "common_dit.hpp"
-#include "model.h"
-#include "rope.hpp"
+#include "model/common/rope.hpp"
+#include "model/diffusion/dit.hpp"
+#include "model/diffusion/model.hpp"
+#include "model_loader.h"

 #define FLUX_GRAPH_SIZE 10240

 namespace Flux {

+    struct ChromaRadianceConfig {
+        int64_t nerf_hidden_size = 64;
+        int nerf_mlp_ratio       = 4;
+        int nerf_depth           = 4;
+        int nerf_max_freqs       = 8;
+        bool use_x0              = false;  // FluxConfig::detect_from_weights sets this when a tensor name contains "__x0__"
+        bool fake_patch_size_x2  = false;  // FluxConfig::detect_from_weights sets this when img_in_patch.weight's ne[0] is half the configured patch_size
+    };
+
+    // Hyper-parameters of the Flux family of diffusion transformers.
+    // The member initializers are the fallback values; detect_from_weights() first applies
+    // per-version presets and then overrides whatever can be read off the checkpoint tensors.
+    struct FluxConfig {
+        SDVersion version         = VERSION_FLUX;
+        bool is_chroma            = false;
+        int patch_size            = 2;
+        int64_t in_channels       = 64;
+        int64_t out_channels      = 64;
+        int64_t vec_in_dim        = 768;
+        int64_t context_in_dim    = 4096;
+        int64_t hidden_size       = 3072;
+        float mlp_ratio           = 4.0f;
+        int num_heads             = 24;
+        int depth                 = 19;
+        int depth_single_blocks   = 38;
+        std::vector<int> axes_dim = {16, 56, 56};
+        int axes_dim_sum          = 128;  // recomputed from axes_dim by detect_from_weights()
+        int theta                 = 10000;
+        bool qkv_bias             = true;
+        bool guidance_embed       = true;
+        int64_t in_dim            = 64;
+        bool disable_bias         = false;
+        bool share_modulation     = false;
+        bool semantic_txt_norm    = false;
+        bool use_yak_mlp          = false;
+        bool use_mlp_silu_act     = false;
+        float ref_index_scale     = 1.f;
+        ChromaRadianceConfig chroma_radiance_params;
+
+        // Builds a config for `version` by scanning every tensor whose name starts with `prefix`.
+        //
+        // tensor_storage_map: tensor name -> storage description (shape in `ne`).
+        // prefix:             only names starting with this prefix are inspected.
+        // version:            model variant; selects the preset applied before the scan.
+        // returns:            the preset for `version`, refined with the block counts, widths and
+        //                     feature flags found in the checkpoint.
+        //
+        // guidance_embed, depth and depth_single_blocks are reset before the scan, so they end
+        // up false / 0 when no matching tensor is present.
+        static FluxConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
+                                              const std::string& prefix,
+                                              SDVersion version = VERSION_FLUX) {
+            FluxConfig config;
+            config.version             = version;
+            config.guidance_embed      = false;
+            config.depth               = 0;
+            config.depth_single_blocks = 0;
+            if (version == VERSION_FLUX_FILL) {
+                config.in_channels = 384;
+            } else if (version == VERSION_FLUX_CONTROLS) {
+                config.in_channels = 128;
+            } else if (version == VERSION_FLEX_2) {
+                config.in_channels = 196;
+            } else if (version == VERSION_CHROMA_RADIANCE) {
+                config.in_channels = 3;
+                config.patch_size  = 16;
+            } else if (version == VERSION_OVIS_IMAGE) {
+                config.semantic_txt_norm = true;
+                config.use_yak_mlp       = true;
+                config.vec_in_dim        = 0;
+            } else if (sd_version_is_flux2(version)) {
+                config.in_channels      = 128;
+                config.patch_size       = 1;
+                config.out_channels     = 128;
+                config.mlp_ratio        = 3.f;
+                config.theta            = 2000;
+                config.axes_dim         = {32, 32, 32, 32};
+                config.vec_in_dim       = 0;
+                config.qkv_bias         = false;
+                config.disable_bias     = true;
+                config.share_modulation = true;
+                config.ref_index_scale  = 10.f;
+                config.use_mlp_silu_act = true;
+            } else if (sd_version_is_longcat(version)) {
+                config.context_in_dim = 3584;
+                config.vec_in_dim     = 0;
+            }
+
+            // Returns (block index + 1) parsed from the digits that directly follow `marker`
+            // in `tensor_name`, or 0 when `marker` does not occur. atoi stops at the first
+            // non-digit, so the remainder of the tensor name is ignored.
+            auto block_count_from_name = [](const std::string& tensor_name, const std::string& marker) -> int {
+                size_t marker_pos = tensor_name.find(marker);
+                if (marker_pos == std::string::npos) {
+                    return 0;
+                }
+                return atoi(tensor_name.c_str() + marker_pos + marker.size()) + 1;
+            };
+            const std::string double_blocks_marker = "double_blocks.";
+            const std::string single_blocks_marker = "single_blocks.";
+
+            int64_t head_dim                   = 0;
+            int64_t actual_radiance_patch_size = -1;
+            for (const auto& [name, tensor_storage] : tensor_storage_map) {
+                if (!starts_with(name, prefix)) {
+                    continue;
+                }
+                if (name.find("guidance_in.in_layer.weight") != std::string::npos) {
+                    config.guidance_embed = true;
+                }
+                if (name.find("__x0__") != std::string::npos) {
+                    LOG_DEBUG("using x0 prediction");
+                    config.chroma_radiance_params.use_x0 = true;
+                }
+                if (name.find("__32x32__") != std::string::npos) {
+                    LOG_DEBUG("using patch size 32");
+                    config.patch_size = 32;
+                }
+                if (name.find("img_in_patch.weight") != std::string::npos) {
+                    actual_radiance_patch_size = tensor_storage.ne[0];
+                    LOG_DEBUG("actual radiance patch size: %" PRId64, actual_radiance_patch_size);
+                }
+                if (name.find("distilled_guidance_layer.in_proj.weight") != std::string::npos) {
+                    config.is_chroma = true;
+                }
+                // The highest block index seen determines the number of blocks of each kind.
+                int double_block_count = block_count_from_name(name, double_blocks_marker);
+                if (double_block_count > config.depth) {
+                    config.depth = double_block_count;
+                }
+                int single_block_count = block_count_from_name(name, single_blocks_marker);
+                if (single_block_count > config.depth_single_blocks) {
+                    config.depth_single_blocks = single_block_count;
+                }
+                if (ends_with(name, "txt_in.weight")) {
+                    config.context_in_dim = tensor_storage.ne[0];
+                    config.hidden_size    = tensor_storage.ne[1];
+                }
+                if (ends_with(name, "single_blocks.0.norm.key_norm.scale")) {
+                    head_dim = tensor_storage.ne[0];
+                }
+                if (ends_with(name, "double_blocks.0.txt_attn.norm.key_norm.scale")) {
+                    head_dim = tensor_storage.ne[0];
+                }
+            }
+            // Resolved after the scan because patch_size may be changed by a "__32x32__" marker
+            // that is visited after img_in_patch.weight.
+            if (actual_radiance_patch_size > 0 && actual_radiance_patch_size != config.patch_size) {
+                GGML_ASSERT(config.patch_size == 2 * actual_radiance_patch_size);
+                LOG_DEBUG("using fake x2 patch size");
+                config.chroma_radiance_params.fake_patch_size_x2 = true;
+            }
+            // Resolved after the scan because hidden_size comes from txt_in.weight.
+            if (head_dim > 0) {
+                config.num_heads = static_cast<int>(config.hidden_size / head_dim);
+            }
+            config.axes_dim_sum = 0;
+            for (int axis_dim : config.axes_dim) {
+                config.axes_dim_sum += axis_dim;
+            }
+            LOG_DEBUG("flux: depth = %d, depth_single_blocks = %d, guidance_embed = %s, context_in_dim = %" PRId64 ", hidden_size = %" PRId64 ", num_heads = %d",
+                      config.depth,
+                      config.depth_single_blocks,
+                      config.guidance_embed ? "true" : "false",
+                      config.context_in_dim,
+                      config.hidden_size,
+                      config.num_heads);
+            return config;
+        }
+    };
+
    struct MLPEmbedder : public UnaryBlock {
    public:
        MLPEmbedder(int64_t in_dim, int64_t hidden_dim, bool bias = true) {
@ -446,7 +596,6 @@ namespace Flux {
            if (use_yak_mlp || use_mlp_silu_act) {
                mlp_mult_factor = 2;
            }
-
            blocks["linear1"]  = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size * 3 + mlp_hidden_dim * mlp_mult_factor, mlp_proj_bias));
            blocks["linear2"]  = std::shared_ptr<GGMLBlock>(new Linear(hidden_size + mlp_hidden_dim, hidden_size, mlp_proj_bias));
            blocks["norm"]     = std::shared_ptr<GGMLBlock>(new QKNorm(head_dim));
@ -723,127 +872,90 @@ namespace Flux {
        }
    };

-    struct ChromaRadianceParams {
-        int64_t nerf_hidden_size = 64;
-        int nerf_mlp_ratio       = 4;
-        int nerf_depth           = 4;
-        int nerf_max_freqs       = 8;
-        bool use_x0              = false;
-        bool fake_patch_size_x2  = false;
-    };
-
-    struct FluxParams {
-        SDVersion version         = VERSION_FLUX;
-        bool is_chroma            = false;
-        int patch_size            = 2;
-        int64_t in_channels       = 64;
-        int64_t out_channels      = 64;
-        int64_t vec_in_dim        = 768;
-        int64_t context_in_dim    = 4096;
-        int64_t hidden_size       = 3072;
-        float mlp_ratio           = 4.0f;
-        int num_heads             = 24;
-        int depth                 = 19;
-        int depth_single_blocks   = 38;
-        std::vector<int> axes_dim = {16, 56, 56};
-        int axes_dim_sum          = 128;
-        int theta                 = 10000;
-        bool qkv_bias             = true;
-        bool guidance_embed       = true;
-        int64_t in_dim            = 64;
-        bool disable_bias         = false;
-        bool share_modulation     = false;
-        bool semantic_txt_norm    = false;
-        bool use_yak_mlp          = false;
-        bool use_mlp_silu_act     = false;
-        float ref_index_scale     = 1.f;
-        ChromaRadianceParams chroma_radiance_params;
-    };
-
    struct Flux : public GGMLBlock {
    public:
-        FluxParams params;
+        FluxConfig config;
        Flux() {}
-        Flux(FluxParams params)
-            : params(params) {
-            if (params.version == VERSION_CHROMA_RADIANCE) {
-                std::pair<int, int> kernel_size = {params.patch_size, params.patch_size};
-                if (params.chroma_radiance_params.fake_patch_size_x2) {
-                    kernel_size = {params.patch_size / 2, params.patch_size / 2};
+        Flux(FluxConfig config)  // populates `blocks` with every sub-module selected by the config
+            : config(config) {
+            if (config.version == VERSION_CHROMA_RADIANCE) {  // Chroma Radiance: Conv2d patch embedding ("img_in_patch") instead of a Linear
+                std::pair<int, int> kernel_size = {config.patch_size, config.patch_size};
+                if (config.chroma_radiance_params.fake_patch_size_x2) {
+                    kernel_size = {config.patch_size / 2, config.patch_size / 2};  // fake x2 mode: kernel (and therefore stride) is half the configured patch size
                }
                std::pair<int, int> stride = kernel_size;

-                blocks["img_in_patch"] = std::make_shared<Conv2d>(params.in_channels,
-                                                                  params.hidden_size,
+                blocks["img_in_patch"] = std::make_shared<Conv2d>(config.in_channels,
+                                                                  config.hidden_size,
                                                                  kernel_size,
                                                                  stride);
            } else {
-                blocks["img_in"] = std::make_shared<Linear>(params.in_channels, params.hidden_size, !params.disable_bias);
+                blocks["img_in"] = std::make_shared<Linear>(config.in_channels, config.hidden_size, !config.disable_bias);  // every other version: linear input projection
            }
-            if (params.is_chroma) {
-                blocks["distilled_guidance_layer"] = std::make_shared<ChromaApproximator>(params.in_dim, params.hidden_size);
+            if (config.is_chroma) {  // Chroma: a ChromaApproximator replaces the time/vector/guidance embedders
+                blocks["distilled_guidance_layer"] = std::make_shared<ChromaApproximator>(config.in_dim, config.hidden_size);
            } else {
-                blocks["time_in"] = std::make_shared<MLPEmbedder>(256, params.hidden_size, !params.disable_bias);
-                if (params.vec_in_dim > 0) {
-                    blocks["vector_in"] = std::make_shared<MLPEmbedder>(params.vec_in_dim, params.hidden_size, !params.disable_bias);
+                blocks["time_in"] = std::make_shared<MLPEmbedder>(256, config.hidden_size, !config.disable_bias);  // 256 is the same value the forward pass hands to ggml_ext_timestep_embedding for time_in
+                if (config.vec_in_dim > 0) {  // "vector_in" exists only when vec_in_dim is positive
+                    blocks["vector_in"] = std::make_shared<MLPEmbedder>(config.vec_in_dim, config.hidden_size, !config.disable_bias);
                }
-                if (params.guidance_embed) {
-                    blocks["guidance_in"] = std::make_shared<MLPEmbedder>(256, params.hidden_size, !params.disable_bias);
+                if (config.guidance_embed) {  // "guidance_in" exists only when guidance_embed is set
+                    blocks["guidance_in"] = std::make_shared<MLPEmbedder>(256, config.hidden_size, !config.disable_bias);
                }
            }
-            if (params.semantic_txt_norm) {
-                blocks["txt_norm"] = std::make_shared<RMSNorm>(params.context_in_dim);
+            if (config.semantic_txt_norm) {  // optional RMSNorm that the forward pass applies to txt before txt_in
+                blocks["txt_norm"] = std::make_shared<RMSNorm>(config.context_in_dim);
            }
-            blocks["txt_in"] = std::make_shared<Linear>(params.context_in_dim, params.hidden_size, !params.disable_bias);
+            blocks["txt_in"] = std::make_shared<Linear>(config.context_in_dim, config.hidden_size, !config.disable_bias);

-            for (int i = 0; i < params.depth; i++) {
-                blocks["double_blocks." + std::to_string(i)] = std::make_shared<DoubleStreamBlock>(params.hidden_size,
-                                                                                                   params.num_heads,
-                                                                                                   params.mlp_ratio,
+            for (int i = 0; i < config.depth; i++) {  // one DoubleStreamBlock per index, keyed "double_blocks.<i>"
+                blocks["double_blocks." + std::to_string(i)] = std::make_shared<DoubleStreamBlock>(config.hidden_size,
+                                                                                                   config.num_heads,
+                                                                                                   config.mlp_ratio,
                                                                                                   i,
-                                                                                                   params.qkv_bias,
-                                                                                                   params.is_chroma,
-                                                                                                   params.share_modulation,
-                                                                                                   !params.disable_bias,
-                                                                                                   params.use_yak_mlp,
-                                                                                                   params.use_mlp_silu_act);
+                                                                                                   config.qkv_bias,
+                                                                                                   config.is_chroma,
+                                                                                                   config.share_modulation,
+                                                                                                   !config.disable_bias,
+                                                                                                   config.use_yak_mlp,
+                                                                                                   config.use_mlp_silu_act);
            }

-            for (int i = 0; i < params.depth_single_blocks; i++) {
-                blocks["single_blocks." + std::to_string(i)] = std::make_shared<SingleStreamBlock>(params.hidden_size,
-                                                                                                   params.num_heads,
-                                                                                                   params.mlp_ratio,
+            for (int i = 0; i < config.depth_single_blocks; i++) {  // one SingleStreamBlock per index, keyed "single_blocks.<i>"
+                blocks["single_blocks." + std::to_string(i)] = std::make_shared<SingleStreamBlock>(config.hidden_size,
+                                                                                                   config.num_heads,
+                                                                                                   config.mlp_ratio,
                                                                                                   i,
                                                                                                   0.f,
-                                                                                                   params.is_chroma,
-                                                                                                   params.share_modulation,
-                                                                                                   !params.disable_bias,
-                                                                                                   params.use_yak_mlp,
-                                                                                                   params.use_mlp_silu_act);
+                                                                                                   config.is_chroma,
+                                                                                                   config.share_modulation,
+                                                                                                   !config.disable_bias,
+                                                                                                   config.use_yak_mlp,
+                                                                                                   config.use_mlp_silu_act);
            }

-            if (params.version == VERSION_CHROMA_RADIANCE) {
-                blocks["nerf_image_embedder"] = std::make_shared<NerfEmbedder>(params.in_channels,
-                                                                               params.chroma_radiance_params.nerf_hidden_size,
-                                                                               params.chroma_radiance_params.nerf_max_freqs);
+            if (config.version == VERSION_CHROMA_RADIANCE) {  // Radiance output head: NerfEmbedder + nerf_depth NerfGLUBlocks + NerfFinalLayerConv
+                blocks["nerf_image_embedder"] = std::make_shared<NerfEmbedder>(config.in_channels,
+                                                                               config.chroma_radiance_params.nerf_hidden_size,
+                                                                               config.chroma_radiance_params.nerf_max_freqs);

-                for (int i = 0; i < params.chroma_radiance_params.nerf_depth; i++) {
-                    blocks["nerf_blocks." + std::to_string(i)] = std::make_shared<NerfGLUBlock>(params.hidden_size,
-                                                                                                params.chroma_radiance_params.nerf_hidden_size,
-                                                                                                params.chroma_radiance_params.nerf_mlp_ratio);
+                for (int i = 0; i < config.chroma_radiance_params.nerf_depth; i++) {
+                    blocks["nerf_blocks." + std::to_string(i)] = std::make_shared<NerfGLUBlock>(config.hidden_size,
+                                                                                                config.chroma_radiance_params.nerf_hidden_size,
+                                                                                                config.chroma_radiance_params.nerf_mlp_ratio);
                }

-                blocks["nerf_final_layer_conv"] = std::make_shared<NerfFinalLayerConv>(params.chroma_radiance_params.nerf_hidden_size,
-                                                                                       params.in_channels);
+                blocks["nerf_final_layer_conv"] = std::make_shared<NerfFinalLayerConv>(config.chroma_radiance_params.nerf_hidden_size,
+                                                                                       config.in_channels);

            } else {
-                blocks["final_layer"] = std::make_shared<LastLayer>(params.hidden_size, 1, params.out_channels, params.is_chroma, !params.disable_bias);
+                blocks["final_layer"] = std::make_shared<LastLayer>(config.hidden_size, 1, config.out_channels, config.is_chroma, !config.disable_bias);  // non-Radiance versions use LastLayer as the output head
            }

-            if (params.share_modulation) {
-                blocks["double_stream_modulation_img"] = std::make_shared<Modulation>(params.hidden_size, true, !params.disable_bias);
-                blocks["double_stream_modulation_txt"] = std::make_shared<Modulation>(params.hidden_size, true, !params.disable_bias);
-                blocks["single_stream_modulation"]     = std::make_shared<Modulation>(params.hidden_size, false, !params.disable_bias);
+            if (config.share_modulation) {  // model-level Modulation blocks; the forward pass reads them only when share_modulation is set
+                blocks["double_stream_modulation_img"] = std::make_shared<Modulation>(config.hidden_size, true, !config.disable_bias);
+                blocks["double_stream_modulation_txt"] = std::make_shared<Modulation>(config.hidden_size, true, !config.disable_bias);
+                blocks["single_stream_modulation"]     = std::make_shared<Modulation>(config.hidden_size, false, !config.disable_bias);
            }
        }

@ -866,7 +978,7 @@ namespace Flux {

            ggml_tensor* vec;
            ggml_tensor* txt_img_mask = nullptr;
-            if (params.is_chroma) {
+            if (config.is_chroma) {
                int64_t mod_index_length = 344;
                auto approx              = std::dynamic_pointer_cast<ChromaApproximator>(blocks["distilled_guidance_layer"]);
                auto distill_timestep    = ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 16, 10000, 1000.f);
@ -894,7 +1006,7 @@ namespace Flux {
            } else {
                auto time_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["time_in"]);
                vec          = time_in->forward(ctx, ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 256, 10000, 1000.f));
-                if (params.guidance_embed) {
+                if (config.guidance_embed) {
                    GGML_ASSERT(guidance != nullptr);
                    auto guidance_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["guidance_in"]);
                    // bf16 and fp16 result is different
@ -902,7 +1014,7 @@ namespace Flux {
                    vec       = ggml_add(ctx->ggml_ctx, vec, guidance_in->forward(ctx, g_in));
                }

-                if (params.vec_in_dim > 0) {
+                if (config.vec_in_dim > 0) {
                    auto vector_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["vector_in"]);
                    vec            = ggml_add(ctx->ggml_ctx, vec, vector_in->forward(ctx, y));
                }
@ -911,7 +1023,7 @@ namespace Flux {
            std::vector<ModulationOut> ds_img_mods;
            std::vector<ModulationOut> ds_txt_mods;
            std::vector<ModulationOut> ss_mods;
-            if (params.share_modulation) {
+            if (config.share_modulation) {
                auto double_stream_modulation_img = std::dynamic_pointer_cast<Modulation>(blocks["double_stream_modulation_img"]);
                auto double_stream_modulation_txt = std::dynamic_pointer_cast<Modulation>(blocks["double_stream_modulation_txt"]);
                auto single_stream_modulation     = std::dynamic_pointer_cast<Modulation>(blocks["single_stream_modulation"]);
@ -921,15 +1033,18 @@ namespace Flux {
                ss_mods     = single_stream_modulation->forward(ctx, vec);
            }

-            if (params.semantic_txt_norm) {
+            if (config.semantic_txt_norm) {
                auto semantic_txt_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm"]);

                txt = semantic_txt_norm->forward(ctx, txt);
            }

            txt = txt_in->forward(ctx, txt);
+            sd::ggml_graph_cut::mark_graph_cut(img, "flux.prelude", "img");
+            sd::ggml_graph_cut::mark_graph_cut(txt, "flux.prelude", "txt");
+            sd::ggml_graph_cut::mark_graph_cut(vec, "flux.prelude", "vec");

-            for (int i = 0; i < params.depth; i++) {
+            for (int i = 0; i < config.depth; i++) {
                if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i) != skip_layers.end()) {
                    continue;
                }
@ -939,16 +1054,19 @@ namespace Flux {
                auto img_txt = block->forward(ctx, img, txt, vec, pe, txt_img_mask, ds_img_mods, ds_txt_mods);
                img          = img_txt.first;   // [N, n_img_token, hidden_size]
                txt          = img_txt.second;  // [N, n_txt_token, hidden_size]
+                sd::ggml_graph_cut::mark_graph_cut(img, "flux.double_blocks." + std::to_string(i), "img");
+                sd::ggml_graph_cut::mark_graph_cut(txt, "flux.double_blocks." + std::to_string(i), "txt");
            }

            auto txt_img = ggml_concat(ctx->ggml_ctx, txt, img, 1);  // [N, n_txt_token + n_img_token, hidden_size]
-            for (int i = 0; i < params.depth_single_blocks; i++) {
-                if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i + params.depth) != skip_layers.end()) {
+            for (int i = 0; i < config.depth_single_blocks; i++) {
+                if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i + config.depth) != skip_layers.end()) {
                    continue;
                }
                auto block = std::dynamic_pointer_cast<SingleStreamBlock>(blocks["single_blocks." + std::to_string(i)]);

                txt_img = block->forward(ctx, txt_img, vec, pe, txt_img_mask, ss_mods);
+                sd::ggml_graph_cut::mark_graph_cut(txt_img, "flux.single_blocks." + std::to_string(i), "txt_img");
            }

            img = ggml_view_3d(ctx->ggml_ctx,
@ -993,14 +1111,14 @@ namespace Flux {
            int64_t W      = x->ne[0];
            int64_t H      = x->ne[1];
            int64_t C      = x->ne[2];
-            int patch_size = params.patch_size;
+            int patch_size = config.patch_size;
            int pad_h      = (patch_size - H % patch_size) % patch_size;
            int pad_w      = (patch_size - W % patch_size) % patch_size;

-            auto img      = DiT::pad_to_patch_size(ctx, x, params.patch_size, params.patch_size);
+            auto img      = DiT::pad_to_patch_size(ctx, x, config.patch_size, config.patch_size);
            auto orig_img = img;

-            if (params.chroma_radiance_params.fake_patch_size_x2) {
+            if (config.chroma_radiance_params.fake_patch_size_x2) {
                // It's supposed to be using GGML_SCALE_MODE_NEAREST, but this seems more stable
                // Maybe the implementation of nearest-neighbor interpolation in ggml behaves differently than the one in PyTorch?
                // img = F.interpolate(img, size=(H//2, W//2), mode="nearest")
@ -1031,7 +1149,7 @@ namespace Flux {
            auto nerf_hidden = ggml_reshape_2d(ctx->ggml_ctx, out, out->ne[0], out->ne[1] * out->ne[2]);  // [N*num_patches, hidden_size]
            auto img_dct     = nerf_image_embedder->forward(ctx, nerf_pixels, dct);                       // [N*num_patches, patch_size*patch_size, nerf_hidden_size]

-            for (int i = 0; i < params.chroma_radiance_params.nerf_depth; i++) {
+            for (int i = 0; i < config.chroma_radiance_params.nerf_depth; i++) {
                auto block = std::dynamic_pointer_cast<NerfGLUBlock>(blocks["nerf_blocks." + std::to_string(i)]);

                img_dct = block->forward(ctx, img_dct, nerf_hidden);
@ -1043,7 +1161,7 @@ namespace Flux {

            out = nerf_final_layer_conv->forward(ctx, img_dct);  // [N, C, H, W]

-            if (params.chroma_radiance_params.use_x0) {
+            if (config.chroma_radiance_params.use_x0) {
                out = _apply_x0_residual(ctx, out, orig_img, timestep);
            }

@ -1067,14 +1185,14 @@ namespace Flux {
            int64_t W      = x->ne[0];
            int64_t H      = x->ne[1];
            int64_t C      = x->ne[2];
-            int patch_size = params.patch_size;
+            int patch_size = config.patch_size;
            int pad_h      = (patch_size - H % patch_size) % patch_size;
            int pad_w      = (patch_size - W % patch_size) % patch_size;

            auto img           = DiT::pad_and_patchify(ctx, x, patch_size, patch_size);
            int64_t img_tokens = img->ne[1];

-            if (params.version == VERSION_FLUX_FILL) {
+            if (config.version == VERSION_FLUX_FILL) {
                GGML_ASSERT(c_concat != nullptr);
                ggml_tensor* masked = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0);
                ggml_tensor* mask   = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 8 * 8, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);
@ -1083,7 +1201,7 @@ namespace Flux {
                mask   = DiT::pad_and_patchify(ctx, mask, patch_size, patch_size);

                img = ggml_concat(ctx->ggml_ctx, img, ggml_concat(ctx->ggml_ctx, masked, mask, 0), 0);
-            } else if (params.version == VERSION_FLEX_2) {
+            } else if (config.version == VERSION_FLEX_2) {
                GGML_ASSERT(c_concat != nullptr);
                ggml_tensor* masked  = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0);
                ggml_tensor* mask    = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 1, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);
@ -1094,7 +1212,7 @@ namespace Flux {
                control = DiT::pad_and_patchify(ctx, control, patch_size, patch_size);

                img = ggml_concat(ctx->ggml_ctx, img, ggml_concat(ctx->ggml_ctx, ggml_concat(ctx->ggml_ctx, masked, mask, 0), control, 0), 0);
-            } else if (params.version == VERSION_FLUX_CONTROLS) {
+            } else if (config.version == VERSION_FLUX_CONTROLS) {
                GGML_ASSERT(c_concat != nullptr);

                auto control = DiT::pad_and_patchify(ctx, c_concat, patch_size, patch_size);
@ -1141,7 +1259,7 @@ namespace Flux {
            // pe: (L, d_head/2, 2, 2)
            // return: (N, C, H, W)

-            if (params.version == VERSION_CHROMA_RADIANCE) {
+            if (config.version == VERSION_CHROMA_RADIANCE) {
                return forward_chroma_radiance(ctx,
                                               x,
                                               timestep,
@ -1171,9 +1289,9 @@ namespace Flux {
        }
    };

-    struct FluxRunner : public GGMLRunner {
+    struct FluxRunner : public DiffusionModelRunner {
    public:
-        FluxParams flux_params;
+        FluxConfig config;
        Flux flux;
        std::vector<float> pe_vec;
        std::vector<float> mod_index_arange_vec;
@ -1183,116 +1301,20 @@ namespace Flux {
        bool use_mask = false;

        FluxRunner(ggml_backend_t backend,
-                   bool offload_params_to_cpu,
                   const String2TensorStorage& tensor_storage_map      = {},
                   const std::string prefix                            = "",
                   SDVersion version                                   = VERSION_FLUX,
-                   bool use_mask                                  = false)
-            : GGMLRunner(backend, offload_params_to_cpu), version(version), use_mask(use_mask) {
-            flux_params.version             = version;
-            flux_params.guidance_embed      = false;
-            flux_params.depth               = 0;
-            flux_params.depth_single_blocks = 0;
-            if (version == VERSION_FLUX_FILL) {
-                flux_params.in_channels = 384;
-            } else if (version == VERSION_FLUX_CONTROLS) {
-                flux_params.in_channels = 128;
-            } else if (version == VERSION_FLEX_2) {
-                flux_params.in_channels = 196;
-            } else if (version == VERSION_CHROMA_RADIANCE) {
-                flux_params.in_channels = 3;
-                flux_params.patch_size  = 16;
-            } else if (version == VERSION_OVIS_IMAGE) {
-                flux_params.semantic_txt_norm = true;
-                flux_params.use_yak_mlp       = true;
-                flux_params.vec_in_dim        = 0;
-            } else if (sd_version_is_flux2(version)) {
-                flux_params.in_channels      = 128;
-                flux_params.patch_size       = 1;
-                flux_params.out_channels     = 128;
-                flux_params.mlp_ratio        = 3.f;
-                flux_params.theta            = 2000;
-                flux_params.axes_dim         = {32, 32, 32, 32};
-                flux_params.vec_in_dim       = 0;
-                flux_params.qkv_bias         = false;
-                flux_params.disable_bias     = true;
-                flux_params.share_modulation = true;
-                flux_params.ref_index_scale  = 10.f;
-                flux_params.use_mlp_silu_act = true;
-            }
-            int64_t head_dim                   = 0;
-            int64_t actual_radiance_patch_size = -1;
-            for (auto pair : tensor_storage_map) {
-                std::string tensor_name = pair.first;
-                if (!starts_with(tensor_name, prefix))
-                    continue;
-                if (tensor_name.find("guidance_in.in_layer.weight") != std::string::npos) {
-                    flux_params.guidance_embed = true;
-                }
-                if (tensor_name.find("__x0__") != std::string::npos) {
-                    LOG_DEBUG("using x0 prediction");
-                    flux_params.chroma_radiance_params.use_x0 = true;
-                }
-                if (tensor_name.find("__32x32__") != std::string::npos) {
-                    LOG_DEBUG("using patch size 32");
-                    flux_params.patch_size = 32;
-                }
-                if (tensor_name.find("img_in_patch.weight") != std::string::npos) {
-                    actual_radiance_patch_size = pair.second.ne[0];
-                    LOG_DEBUG("actual radiance patch size: %d", actual_radiance_patch_size);
-                }
-                if (tensor_name.find("distilled_guidance_layer.in_proj.weight") != std::string::npos) {
-                    // Chroma
-                    flux_params.is_chroma = true;
-                }
-                size_t db = tensor_name.find("double_blocks.");
-                if (db != std::string::npos) {
-                    tensor_name     = tensor_name.substr(db);  // remove prefix
-                    int block_depth = atoi(tensor_name.substr(14, tensor_name.find(".", 14)).c_str());
-                    if (block_depth + 1 > flux_params.depth) {
-                        flux_params.depth = block_depth + 1;
-                    }
-                }
-                size_t sb = tensor_name.find("single_blocks.");
-                if (sb != std::string::npos) {
-                    tensor_name     = tensor_name.substr(sb);  // remove prefix
-                    int block_depth = atoi(tensor_name.substr(14, tensor_name.find(".", 14)).c_str());
-                    if (block_depth + 1 > flux_params.depth_single_blocks) {
-                        flux_params.depth_single_blocks = block_depth + 1;
-                    }
-                }
-                if (ends_with(tensor_name, "txt_in.weight")) {
-                    flux_params.context_in_dim = pair.second.ne[0];
-                    flux_params.hidden_size    = pair.second.ne[1];
-                }
-                if (ends_with(tensor_name, "single_blocks.0.norm.key_norm.scale")) {
-                    head_dim = pair.second.ne[0];
-                }
-                if (ends_with(tensor_name, "double_blocks.0.txt_attn.norm.key_norm.scale")) {
-                    head_dim = pair.second.ne[0];
-                }
-            }
-            if (actual_radiance_patch_size > 0 && actual_radiance_patch_size != flux_params.patch_size) {
-                GGML_ASSERT(flux_params.patch_size == 2 * actual_radiance_patch_size);
-                LOG_DEBUG("using fake x2 patch size");
-                flux_params.chroma_radiance_params.fake_patch_size_x2 = true;
-            }
-
-            flux_params.num_heads = static_cast<int>(flux_params.hidden_size / head_dim);
-
-            LOG_INFO("flux: depth = %d, depth_single_blocks = %d, guidance_embed = %s, context_in_dim = %" PRId64
-                     ", hidden_size = %" PRId64 ", num_heads = %d",
-                     flux_params.depth,
-                     flux_params.depth_single_blocks,
-                     flux_params.guidance_embed ? "true" : "false",
-                     flux_params.context_in_dim,
-                     flux_params.hidden_size,
-                     flux_params.num_heads);
-            if (flux_params.is_chroma) {
+                   bool use_mask                                       = false,
+                   std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)  // optional; handed straight to the DiffusionModelRunner base
+            : DiffusionModelRunner(backend, prefix, weight_manager),
+              config(FluxConfig::detect_from_weights(tensor_storage_map, prefix, version)),  // hyper-parameter detection now lives in FluxConfig::detect_from_weights instead of this constructor
+              version(version),
+              use_mask(use_mask) {
+            if (config.is_chroma) {
                LOG_INFO("Using pruned modulation (Chroma)");
            }

-            flux = Flux(flux_params);
+            flux = Flux(config);  // build the block tree from the detected config; init() below is then called with params_ctx and the tensor storage map
            flux.init(params_ctx, tensor_storage_map, prefix);
        }

@ -1300,7 +1322,7 @@ namespace Flux {
            return "flux";
        }

-        void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
+        void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {  // delegates to the wrapped Flux model; prefix is now taken by const reference
            flux.get_param_tensors(tensors, prefix);
        }

@ -1368,10 +1390,10 @@ namespace Flux {
            ggml_tensor* context   = make_optional_input(context_tensor);
            ggml_tensor* c_concat  = make_optional_input(c_concat_tensor);
            ggml_tensor* y         = make_optional_input(y_tensor);
-            if (flux_params.guidance_embed || flux_params.is_chroma) {
+            if (config.guidance_embed || config.is_chroma) {
                if (!guidance_tensor.empty()) {
                    this->guidance_tensor = guidance_tensor;
-                    if (flux_params.is_chroma) {
+                    if (config.is_chroma) {
                        this->guidance_tensor.fill_(0.f);
                    }
                }
@ -1389,7 +1411,7 @@ namespace Flux {
            ggml_tensor* mod_index_arange = nullptr;
            ggml_tensor* dct              = nullptr;  // for chroma radiance

-            if (flux_params.is_chroma) {
+            if (config.is_chroma) {
                if (!use_mask) {
                    y = nullptr;
                }
@ -1406,31 +1428,31 @@ namespace Flux {
            } else if (version == VERSION_OVIS_IMAGE) {
                txt_arange_dims = {1, 2};
            }
-
            pe_vec      = Rope::gen_flux_pe(static_cast<int>(x->ne[1]),
                                            static_cast<int>(x->ne[0]),
-                                            flux_params.patch_size,
+                                            config.patch_size,
                                            static_cast<int>(x->ne[3]),
                                            static_cast<int>(context->ne[1]),
                                            txt_arange_dims,
                                            ref_latents,
                                            increase_ref_index,
-                                            flux_params.ref_index_scale,
-                                            flux_params.theta,
+                                            config.ref_index_scale,
+                                            config.theta,
                                            circular_y_enabled,
                                            circular_x_enabled,
-                                            flux_params.axes_dim);
-            int pos_len = static_cast<int>(pe_vec.size() / flux_params.axes_dim_sum / 2);
+                                            config.axes_dim,
+                                            sd_version_is_longcat(version));
+            int pos_len = static_cast<int>(pe_vec.size() / config.axes_dim_sum / 2);
            // LOG_DEBUG("pos_len %d", pos_len);
-            auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, flux_params.axes_dim_sum / 2, pos_len);
+            auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.axes_dim_sum / 2, pos_len);
            // pe->data = pe_vec.data();
            // print_ggml_tensor(pe);
            // pe->data = nullptr;
            set_backend_tensor_data(pe, pe_vec.data());

            if (version == VERSION_CHROMA_RADIANCE) {
-                int patch_size     = flux_params.patch_size;
-                int nerf_max_freqs = flux_params.chroma_radiance_params.nerf_max_freqs;
+                int patch_size     = config.patch_size;
+                int nerf_max_freqs = config.chroma_radiance_params.nerf_max_freqs;
                dct_vec            = fetch_dct_pos(patch_size, nerf_max_freqs);
                dct                = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_F32, nerf_max_freqs * nerf_max_freqs, patch_size * patch_size);
                // dct->data = dct_vec.data();
@ -1478,10 +1500,29 @@ namespace Flux {
                return build_graph(x, timesteps, context, c_concat, y, guidance, ref_latents, increase_ref_index, skip_layers);
            };

-            auto result = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
+            auto result = restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
            return result;
        }

+        // Adapter from the generic DiffusionParams interface to the Flux-specific
+        // compute() overload: unpacks the mandatory latent/timestep tensors, maps
+        // absent optional inputs to empty values, and forwards everything else.
+        //
+        // Aborts via GGML_ASSERT when x, timesteps, or the Flux extra block is null.
+        sd::Tensor<float> compute(int n_threads,
+                                  const DiffusionParams& diffusion_params) override {
+            GGML_ASSERT(diffusion_params.x != nullptr);
+            GGML_ASSERT(diffusion_params.timesteps != nullptr);
+            const auto* extra = diffusion_extra_as<FluxDiffusionExtra>(diffusion_params);
+            // extra is dereferenced below; fail loudly instead of reading through a null pointer.
+            GGML_ASSERT(extra != nullptr);
+            // Shared empty fallbacks so the reference parameters always bind to a live object.
+            static const std::vector<sd::Tensor<float>> empty_ref_latents;
+            static const std::vector<int> empty_skip_layers;
+            return compute(n_threads,
+                           *diffusion_params.x,
+                           *diffusion_params.timesteps,
+                           tensor_or_empty(diffusion_params.context),
+                           tensor_or_empty(diffusion_params.c_concat),
+                           tensor_or_empty(diffusion_params.y),
+                           tensor_or_empty(extra->guidance),
+                           diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents,
+                           diffusion_params.increase_ref_index,
+                           extra->skip_layers ? *extra->skip_layers : empty_skip_layers);
+        }
+
        void test() {
            ggml_init_params params;
            params.mem_size   = static_cast<size_t>(1024 * 1024) * 1024;  // 1GB
@ -1539,10 +1580,11 @@ namespace Flux {

        static void load_from_file_and_test(const std::string& file_path) {
            // ggml_backend_t backend = ggml_backend_cuda_init(0);
-            ggml_backend_t backend    = ggml_backend_cpu_init();
+            ggml_backend_t backend    = sd_backend_cpu_init();
            ggml_type model_data_type = GGML_TYPE_COUNT;

-            ModelLoader model_loader;
+            auto model_manager        = std::make_shared<ModelManager>();
+            ModelLoader& model_loader = model_manager->loader();
            if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) {
                LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
                return;
@ -1558,20 +1600,20 @@ namespace Flux {
            }

            std::shared_ptr<FluxRunner> flux = std::make_shared<FluxRunner>(backend,
-                                                                            false,
                                                                            tensor_storage_map,
                                                                            "model.diffusion_model",
                                                                            VERSION_FLUX2,
-                                                                            false);
+                                                                            false,
+                                                                            model_manager);

-            flux->alloc_params_buffer();
-            std::map<std::string, ggml_tensor*> tensors;
-            flux->get_param_tensors(tensors, "model.diffusion_model");
-
-            bool success = model_loader.load_tensors(tensors);
-
-            if (!success) {
-                LOG_ERROR("load tensors from model loader failed");
+            if (!model_manager->register_runner_params("Flux test",
+                                                       *flux,
+                                                       "model.diffusion_model",
+                                                       ModelManager::ResidencyMode::ParamBackend,
+                                                       backend,
+                                                       backend) ||
+                !model_manager->validate_registered_tensors()) {
+                LOG_ERROR("register flux tensors with model manager failed");
                return;
            }

@ -1582,4 +1624,4 @@ namespace Flux {

 }  // namespace Flux

-#endif  // __FLUX_HPP__
+#endif  // __SD_MODEL_DIFFUSION_FLUX_HPP__
--- a/src/model/diffusion/hidream_o1.hpp
+++ b/src/model/diffusion/hidream_o1.hpp
@ -0,0 +1,674 @@
+#ifndef __SD_MODEL_DIFFUSION_HIDREAM_O1_HPP__
+#define __SD_MODEL_DIFFUSION_HIDREAM_O1_HPP__
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "conditioning/conditioner.hpp"
+#include "core/util.h"
+#include "model/diffusion/dit.hpp"
+#include "model/diffusion/model.hpp"
+#include "model/te/llm.hpp"
+
+namespace HiDreamO1 {
+    constexpr int HIDREAM_O1_GRAPH_SIZE = 32768;  // size argument handed to new_graph_custom() by the runners in this file
+    constexpr int PATCH_SIZE            = 32;  // side length of one square patch; image sizes are snapped to multiples of it
+    constexpr int TIMESTEP_TOKEN_NUM    = 1;  // NOTE(review): presumably the number of sequence slots used by the timestep embedding - confirm at use sites
+    constexpr int IMAGE_TOKEN_ID        = 151655;  // token id that build_position_ids() treats as the start of an image span
+    constexpr int VISION_START_TOKEN_ID = 151652;  // NOTE(review): presumably the tokenizer id of the vision-start marker - confirm against the tokenizer
+
+    // Hyper-parameter bundle for HiDream-O1: the LLM configuration (text decoder
+    // plus vision tower) and the patch size used to size the patch embedder and
+    // the output projection.
+    struct HiDreamO1Config {
+        LLM::LLMConfig llm;
+        int patch_size = PATCH_SIZE;
+
+        // Produces the configuration. Neither argument is inspected: every value
+        // assigned below is a fixed constant.
+        static HiDreamO1Config detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
+            (void)tensor_storage_map;
+            (void)prefix;
+
+            HiDreamO1Config detected;
+
+            // Text decoder.
+            auto& text             = detected.llm;
+            text.arch              = LLM::LLMArch::QWEN3_VL;
+            text.hidden_size       = 4096;
+            text.intermediate_size = 12288;
+            text.num_layers        = 36;
+            text.num_heads         = 32;
+            text.num_kv_heads      = 8;
+            text.head_dim          = 128;
+            text.qkv_bias          = false;
+            text.qk_norm           = true;
+            text.vocab_size        = 151936;
+            text.rms_norm_eps      = 1e-6f;
+
+            // Vision tower.
+            auto& vit                   = detected.llm.vision;
+            vit.arch                    = LLM::LLMVisionArch::QWEN3_VL;
+            vit.num_layers              = 27;
+            vit.hidden_size             = 1152;
+            vit.intermediate_size       = 4304;
+            vit.num_heads               = 16;
+            vit.out_hidden_size         = 4096;
+            vit.patch_size              = 16;
+            vit.spatial_merge_size      = 2;
+            vit.temporal_patch_size     = 2;
+            vit.num_position_embeddings = 2304;
+
+            return detected;
+        }
+    };
+
+    // Returns `token` concatenated with itself `count` times.
+    // A non-positive `count` or an empty `token` yields an empty string. The
+    // guard also keeps a negative count from being cast to a huge size_t, which
+    // would make reserve() throw std::length_error.
+    static inline std::string repeat_special_token(const std::string& token, int64_t count) {
+        std::string out;
+        if (count <= 0 || token.empty()) {
+            return out;
+        }
+        out.reserve(static_cast<size_t>(count) * token.size());
+        for (int64_t i = 0; i < count; ++i) {
+            out += token;
+        }
+        return out;
+    }
+
+    // Picks a {width, height} pair whose area is about max_size^2 and whose
+    // width/height proportion is about `ratio`, then snaps both sides down to a
+    // multiple of PATCH_SIZE, never going below one patch.
+    // `ratio` must be positive: it is used under a square root and as a divisor.
+    static inline std::pair<int, int> calculate_dimensions(int max_size, double ratio) {
+        // Square in double precision: the int product max_size * max_size
+        // overflows once max_size exceeds 46340.
+        const double target_area = static_cast<double>(max_size) * static_cast<double>(max_size);
+
+        int width  = static_cast<int>(std::sqrt(target_area * ratio));
+        int height = static_cast<int>(width / ratio);
+        width      = (width / PATCH_SIZE) * PATCH_SIZE;
+        height     = (height / PATCH_SIZE) * PATCH_SIZE;
+        width      = std::max(width, PATCH_SIZE);
+        height     = std::max(height, PATCH_SIZE);
+        return {width, height};
+    }
+
+    // Resizes `image` to a size whose sides are multiples of PATCH_SIZE and whose
+    // area does not exceed image_size^2 (when such a candidate exists). The image
+    // is bicubically scaled so that one side matches the target exactly, then the
+    // other side is center-cropped. Tensor dimension 0 is treated as width and
+    // dimension 1 as height.
+    static inline sd::Tensor<float> resize_to_area(const sd::Tensor<float>& image, int image_size) {
+        const int64_t src_w    = image.shape()[0];
+        const int64_t src_h    = image.shape()[1];
+        const int64_t max_area = static_cast<int64_t>(image_size) * image_size;
+        const double factor    = std::sqrt(static_cast<double>(max_area) / static_cast<double>(src_w * src_h));
+
+        // Rounds a length down to the nearest multiple of PATCH_SIZE.
+        auto snap = [](int64_t length) -> int64_t {
+            return (length / PATCH_SIZE) * PATCH_SIZE;
+        };
+        const int64_t w_round = snap(static_cast<int64_t>(std::llround(src_w * factor)));
+        const int64_t w_floor = snap(static_cast<int64_t>(std::floor(src_w * factor)));
+        const int64_t h_round = snap(static_cast<int64_t>(std::llround(src_h * factor)));
+        const int64_t h_floor = snap(static_cast<int64_t>(std::floor(src_h * factor)));
+
+        // Every rounded/floored combination, ordered by area, largest first.
+        std::vector<std::pair<int64_t, int64_t>> candidates = {
+            {w_round, h_round},
+            {w_round, h_floor},
+            {w_floor, h_round},
+            {w_floor, h_floor},
+        };
+        std::sort(candidates.begin(), candidates.end(), [](const auto& lhs, const auto& rhs) {
+            return lhs.first * lhs.second > rhs.first * rhs.second;
+        });
+
+        // Take the first non-degenerate candidate that fits the area budget;
+        // if none qualifies, fall back to the last (smallest-area) entry.
+        auto fitting = std::find_if(candidates.begin(), candidates.end(), [max_area](const auto& candidate) {
+            return candidate.first > 0 && candidate.second > 0 && candidate.first * candidate.second <= max_area;
+        });
+        const std::pair<int64_t, int64_t> target = (fitting != candidates.end()) ? *fitting : candidates.back();
+
+        const double ratio_w = static_cast<double>(src_w) / static_cast<double>(target.first);
+        const double ratio_h = static_cast<double>(src_h) / static_cast<double>(target.second);
+
+        if (ratio_w < ratio_h) {
+            // Width matches the target exactly; the height overshoots and is trimmed evenly.
+            const int64_t scaled_h   = static_cast<int64_t>(std::llround(src_h / ratio_w));
+            sd::Tensor<float> scaled = sd::ops::interpolate(image,
+                                                            {target.first, scaled_h, image.shape()[2], image.shape()[3]},
+                                                            sd::ops::InterpolateMode::Bicubic);
+            const int64_t top        = (scaled_h - target.second) / 2;
+            return sd::ops::slice(scaled, 1, top, top + target.second);
+        }
+
+        // Height matches the target exactly; the width overshoots and is trimmed evenly.
+        const int64_t scaled_w   = static_cast<int64_t>(std::llround(src_w / ratio_h));
+        sd::Tensor<float> scaled = sd::ops::interpolate(image,
+                                                        {scaled_w, target.second, image.shape()[2], image.shape()[3]},
+                                                        sd::ops::InterpolateMode::Bicubic);
+        const int64_t left       = (scaled_w - target.first) / 2;
+        return sd::ops::slice(scaled, 0, left, left + target.first);
+    }
+
+    // Builds the flattened position ids for a token sequence that mixes text
+    // tokens and image spans.
+    //
+    // input_ids               - token ids; an image span begins at an entry equal
+    //                           to IMAGE_TOKEN_ID and covers t*h*w positions.
+    // image_grids             - one {t, h, w} grid per image span, in order.
+    // skip_vision_start_token - one entry per image span; its value is subtracted
+    //                           from the length of the text run before the image,
+    //                           and a non-zero value changes how the image base
+    //                           position is chosen (see below).
+    //
+    // Returns 4 * input_ids.size() values laid out as four consecutive streams
+    // [t..., h..., w..., 0...]. Aborts via GGML_ASSERT if there are more image
+    // spans than grid/skip entries, or if the generated streams do not match the
+    // input length.
+    static inline std::vector<int32_t> build_position_ids(const std::vector<int32_t>& input_ids,
+                                                          const std::vector<std::array<int32_t, 3>>& image_grids,
+                                                          const std::vector<int32_t>& skip_vision_start_token) {
+        std::vector<int32_t> position_ids(4 * input_ids.size(), 0);
+        int image_index = 0;
+        int st          = 0;
+        // Base position for the first image whose skip flag is set; consumed once.
+        int fix_point   = 4096;
+        std::vector<int32_t> out_t;
+        std::vector<int32_t> out_h;
+        std::vector<int32_t> out_w;
+
+        while (st < static_cast<int>(input_ids.size())) {
+            // Scan forward to the next image token (or the end of the sequence).
+            int ed = st;
+            while (ed < static_cast<int>(input_ids.size()) && input_ids[ed] != IMAGE_TOKEN_ID) {
+                ed++;
+            }
+
+            if (ed >= static_cast<int>(input_ids.size())) {
+                // Trailing text: all three axes advance together from max(t) + 1.
+                int st_idx = out_t.empty() ? 0 : (*std::max_element(out_t.begin(), out_t.end()) + 1);
+                for (int i = 0; i < static_cast<int>(input_ids.size()) - st; ++i) {
+                    out_t.push_back(st_idx + i);
+                    out_h.push_back(st_idx + i);
+                    out_w.push_back(st_idx + i);
+                }
+                break;
+            }
+
+            // An image span was found: both per-image tables must have an entry
+            // for it, otherwise the indexing below would read out of bounds.
+            GGML_ASSERT(static_cast<size_t>(image_index) < image_grids.size());
+            GGML_ASSERT(static_cast<size_t>(image_index) < skip_vision_start_token.size());
+
+            int text_len = std::max(0, ed - st - skip_vision_start_token[image_index]);
+            int st_idx   = out_t.empty() ? 0 : (*std::max_element(out_t.begin(), out_t.end()) + 1);
+            for (int i = 0; i < text_len; ++i) {
+                out_t.push_back(st_idx + i);
+                out_h.push_back(st_idx + i);
+                out_w.push_back(st_idx + i);
+            }
+
+            const auto& grid = image_grids[image_index];
+            int base;
+            if (skip_vision_start_token[image_index]) {
+                // The first flagged image starts at fix_point; later flagged
+                // images start at st_idx.
+                if (fix_point > 0) {
+                    base      = fix_point;
+                    fix_point = 0;
+                } else {
+                    base = st_idx;
+                }
+            } else {
+                // Unflagged images continue right after the preceding text run.
+                base = text_len + st_idx;
+            }
+            for (int32_t ti = 0; ti < grid[0]; ++ti) {
+                for (int32_t hi = 0; hi < grid[1]; ++hi) {
+                    for (int32_t wi = 0; wi < grid[2]; ++wi) {
+                        out_t.push_back(base + ti);
+                        out_h.push_back(base + hi);
+                        out_w.push_back(base + wi);
+                    }
+                }
+            }
+
+            st = ed + grid[0] * grid[1] * grid[2];
+            image_index++;
+        }
+
+        GGML_ASSERT(out_t.size() == input_ids.size());
+        for (size_t i = 0; i < input_ids.size(); ++i) {
+            // ggml IMROPE consumes 4 flattened position streams:
+            //   [t, h, w, e]
+            // llama.cpp's generic Qwen-VL fallback expands text positions as
+            // [pos, pos, pos, 0]. Keep the extra stream zeroed here too.
+            position_ids[i]                        = out_t[i];
+            position_ids[input_ids.size() + i]     = out_h[i];
+            position_ids[input_ids.size() * 2 + i] = out_w[i];
+            position_ids[input_ids.size() * 3 + i] = 0;
+        }
+        return position_ids;
+    }
+
+    // Maps a timestep tensor to a hidden_size embedding: features from
+    // ggml_ext_timestep_embedding (width frequency_embedding_size) are passed
+    // through Linear -> SiLU -> Linear.
+    struct TimestepEmbedder : public GGMLBlock {
+        int frequency_embedding_size = 256;
+
+        // Registers the two biased linear layers under "mlp.0" and "mlp.2".
+        TimestepEmbedder(int64_t hidden_size) {
+            blocks["mlp.0"] = std::make_shared<Linear>(frequency_embedding_size, hidden_size, true);
+            blocks["mlp.2"] = std::make_shared<Linear>(hidden_size, hidden_size, true);
+        }
+
+        // Returns the embedding for timestep tensor `t`.
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* t) {
+            auto fc_in  = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]);
+            auto fc_out = std::dynamic_pointer_cast<Linear>(blocks["mlp.2"]);
+
+            auto features  = ggml_ext_timestep_embedding(ctx->ggml_ctx, t, frequency_embedding_size, 10000, 1000.0f);
+            auto projected = fc_in->forward(ctx, features);
+            auto activated = ggml_silu_inplace(ctx->ggml_ctx, projected);
+            return fc_out->forward(ctx, activated);
+        }
+    };
+
+    // Patch embedder made of two stacked linear layers: in_dim -> pca_dim
+    // (no bias) followed by pca_dim -> embed_dim (with bias).
+    struct BottleneckPatchEmbed : public GGMLBlock {
+        BottleneckPatchEmbed(int64_t in_dim, int64_t pca_dim, int64_t embed_dim) {
+            blocks["proj1"] = std::make_shared<Linear>(in_dim, pca_dim, false);
+            blocks["proj2"] = std::make_shared<Linear>(pca_dim, embed_dim, true);
+        }
+
+        // Applies proj1 and then proj2 to `x`.
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
+            auto first_proj  = std::dynamic_pointer_cast<Linear>(blocks["proj1"]);
+            auto second_proj = std::dynamic_pointer_cast<Linear>(blocks["proj2"]);
+
+            auto reduced = first_proj->forward(ctx, x);
+            return second_proj->forward(ctx, reduced);
+        }
+    };
+
+    // Output head: a single biased linear projection hidden_size -> out_dim.
+    struct FinalLayer : public GGMLBlock {
+        FinalLayer(int64_t hidden_size, int64_t out_dim) {
+            blocks["linear"] = std::make_shared<Linear>(hidden_size, out_dim, true);
+        }
+
+        // Projects `x` through the "linear" block.
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
+            auto projection = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
+            auto projected  = projection->forward(ctx, x);
+            return projected;
+        }
+    };
+
+    // Parameter container for HiDream-O1. Registers four sub-blocks (language
+    // model, timestep embedder, patch embedder, output projection) and exposes
+    // typed accessors for each of them.
+    struct HiDreamO1Model : public GGMLBlock {
+        HiDreamO1Config config;
+
+        HiDreamO1Model() = default;
+        explicit HiDreamO1Model(HiDreamO1Config config)
+            : config(std::move(config)) {
+            // The parameter has been moved from; read everything from the member.
+            auto& cfg            = this->config;
+            const auto hidden    = cfg.llm.hidden_size;
+            const auto patch_dim = cfg.patch_size * cfg.patch_size * 3;  // patch_size^2 * 3 values per patch
+
+            blocks["language_model"] = std::make_shared<LLM::TextModel>(cfg.llm);
+            blocks["t_embedder1"]    = std::make_shared<TimestepEmbedder>(hidden);
+            blocks["x_embedder"]     = std::make_shared<BottleneckPatchEmbed>(patch_dim, hidden / 4, hidden);
+            blocks["final_layer2"]   = std::make_shared<FinalLayer>(hidden, patch_dim);
+        }
+
+        // Block registered as "language_model".
+        std::shared_ptr<LLM::TextModel> text_model() {
+            auto block = blocks["language_model"];
+            return std::dynamic_pointer_cast<LLM::TextModel>(block);
+        }
+
+        // Block registered as "t_embedder1".
+        std::shared_ptr<TimestepEmbedder> timestep_embedder() {
+            auto block = blocks["t_embedder1"];
+            return std::dynamic_pointer_cast<TimestepEmbedder>(block);
+        }
+
+        // Block registered as "x_embedder".
+        std::shared_ptr<BottleneckPatchEmbed> patch_embedder() {
+            auto block = blocks["x_embedder"];
+            return std::dynamic_pointer_cast<BottleneckPatchEmbed>(block);
+        }
+
+        // Block registered as "final_layer2".
+        std::shared_ptr<FinalLayer> final_layer() {
+            auto block = blocks["final_layer2"];
+            return std::dynamic_pointer_cast<FinalLayer>(block);
+        }
+    };
+
+    // Runner that owns the vision tower (LLM::VisionModel) and turns an image
+    // tensor into image embeddings via LLM::LLMRunner::encode_image_common.
+    struct HiDreamO1VisionRunner : public GGMLRunner {
+        HiDreamO1Config config;
+        std::shared_ptr<LLM::VisionModel> model;
+
+        // Scratch buffers handed to encode_image_common() on every call.
+        std::vector<int> window_index_vec;
+        std::vector<int> window_inverse_index_vec;
+        std::vector<float> window_mask_vec;
+        std::vector<float> pe_vec;
+        std::array<std::vector<int32_t>, 4> pos_embed_idx_data_;
+        std::array<std::vector<float>, 4> pos_embed_weight_data_;
+
+        // Builds the vision model from the fixed HiDream-O1 configuration and
+        // initialises its parameters under `prefix`.
+        HiDreamO1VisionRunner(ggml_backend_t backend,
+                              const String2TensorStorage& tensor_storage_map      = {},
+                              const std::string& prefix                           = "model.visual",
+                              std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
+            : GGMLRunner(backend, weight_manager),
+              config(HiDreamO1Config::detect_from_weights(tensor_storage_map, prefix)),
+              model(std::make_shared<LLM::VisionModel>(false, config.llm.vision)) {
+            model->init(params_ctx, tensor_storage_map, prefix);
+        }
+
+        std::string get_desc() override {
+            return "hidream_o1_vision";
+        }
+
+        // Collects the vision model's parameter tensors into `tensors`.
+        void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix = "model.visual") {
+            model->get_param_tensors(tensors, prefix);
+        }
+
+        // Adds the image-encoding nodes for `image` to the current compute
+        // context and returns the resulting embedding tensor.
+        ggml_tensor* encode_image(GGMLRunnerContext* runner_ctx, ggml_tensor* image) {
+            return LLM::LLMRunner::encode_image_common(this,
+                                                       compute_ctx,
+                                                       runner_ctx,
+                                                       image,
+                                                       config.llm.vision,
+                                                       model,
+                                                       window_index_vec,
+                                                       window_inverse_index_vec,
+                                                       window_mask_vec,
+                                                       pe_vec,
+                                                       pos_embed_idx_data_,
+                                                       pos_embed_weight_data_);
+        }
+
+        // Builds the forward graph that encodes `image_tensor`.
+        ggml_cgraph* build_graph(const sd::Tensor<float>& image_tensor) {
+            ggml_cgraph* graph  = new_graph_custom(HIDREAM_O1_GRAPH_SIZE);
+            ggml_tensor* pixels = make_input(image_tensor);
+            auto ctx            = get_context();
+            ggml_build_forward_expand(graph, encode_image(&ctx, pixels));
+            return graph;
+        }
+
+        // Runs the vision graph and returns the image embeddings, or an empty
+        // tensor when the underlying compute produced no value.
+        sd::Tensor<float> compute(int n_threads,
+                                  const sd::Tensor<float>& image,
+                                  bool auto_free           = true,
+                                  bool free_compute_buffer = true,
+                                  bool free_compute_params = true) {
+            auto graph_builder = [&]() {
+                return build_graph(image);
+            };
+            auto maybe_embeds = GGMLRunner::compute<float>(graph_builder, n_threads, auto_free, free_compute_buffer, free_compute_params);
+            if (!maybe_embeds.has_value()) {
+                return sd::Tensor<float>();
+            }
+            return std::move(maybe_embeds.value());
+        }
+    };
+
+    struct HiDreamO1Runner : public DiffusionModelRunner {  // diffusion runner that builds and evaluates the HiDreamO1Model graph
+        HiDreamO1Config config;
+        HiDreamO1Model model;
+
+        std::vector<float> attention_mask_vec;  // host copy of the attention mask, refilled by every build_graph() call
+
+        HiDreamO1Runner(ggml_backend_t backend,
+                        const String2TensorStorage& tensor_storage_map      = {},
+                        const std::string& prefix                           = "model",
+                        std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
+            : DiffusionModelRunner(backend, prefix, weight_manager),
+              config(HiDreamO1Config::detect_from_weights(tensor_storage_map, prefix)) {
+            model = HiDreamO1Model(config);  // replace the default-constructed model with one that registers its sub-blocks
+            model.init(params_ctx, tensor_storage_map, prefix);
+        }
+
+        std::string get_desc() override {
+            return "hidream_o1";
+        }
+
+        void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
+            model.get_param_tensors(tensors, prefix);
+        }
+
+        ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,  // builds one denoising step; the final node is (x - x_pred) / sigma
+                                 const sd::Tensor<float>& timestep_tensor,
+                                 const sd::Tensor<int32_t>& input_ids_tensor,
+                                 const sd::Tensor<int32_t>& input_pos_tensor,
+                                 const sd::Tensor<int32_t>& token_types_tensor,
+                                 const sd::Tensor<int32_t>& vinput_mask_tensor,
+                                 const std::vector<std::pair<int, sd::Tensor<float>>>& image_embeds_tensor,
+                                 const std::vector<sd::Tensor<float>>& ref_images) {
+            ggml_cgraph* gf        = new_graph_custom(HIDREAM_O1_GRAPH_SIZE);
+            ggml_tensor* x         = make_input(x_tensor);
+            ggml_tensor* timestep  = make_input(timestep_tensor);
+            ggml_tensor* input_ids = make_input(input_ids_tensor);
+            ggml_tensor* input_pos = make_input(input_pos_tensor);
+
+            auto text_model   = model.text_model();
+            auto t_embedder1  = model.timestep_embedder();
+            auto x_embedder   = model.patch_embedder();
+            auto final_layer2 = model.final_layer();
+
+            std::vector<ggml_tensor*> ref_image_tensors;
+            for (const auto& image : ref_images) {
+                ref_image_tensors.push_back(make_input(image));
+            }
+
+            attention_mask_vec    = std::vector<float>(static_cast<size_t>(token_types_tensor.shape()[0] * token_types_tensor.shape()[0]), 0.0f);  // square mask, 0 = attend
+            int64_t total_seq_len = token_types_tensor.shape()[0];
+            for (int64_t query = 0; query < total_seq_len; ++query) {
+                bool is_gen = token_types_tensor.values()[static_cast<size_t>(query)] > 0;  // rows with token type > 0 keep a full (unmasked) row
+                for (int64_t key = 0; key < total_seq_len; ++key) {
+                    if (!is_gen && key > query) {  // all other rows are causal: keys after the query are blocked
+                        attention_mask_vec[static_cast<size_t>(query * total_seq_len + key)] = -INFINITY;
+                    }
+                }
+            }
+            auto attention_mask = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_F32, total_seq_len, total_seq_len);
+            set_backend_tensor_data(attention_mask, attention_mask_vec.data());  // NOTE(review): presumably the member buffer must stay alive until the data is uploaded - confirm
+
+            auto runner_ctx = get_context();
+            auto txt        = text_model->embed(&runner_ctx, input_ids);
+            std::vector<std::pair<int, ggml_tensor*>> image_embeds;
+            image_embeds.reserve(image_embeds_tensor.size());
+            for (const auto& image_embed : image_embeds_tensor) {
+                image_embeds.emplace_back(image_embed.first, make_input(image_embed.second));
+            }
+            txt = LLM::splice_image_embeds(&runner_ctx, txt, image_embeds);
+
+            auto t_emb          = t_embedder1->forward(&runner_ctx, timestep);
+            int64_t txt_seq_len = input_ids->ne[0];
+            if (txt_seq_len > 1) {
+                auto prefix = ggml_ext_slice(compute_ctx, txt, 1, 0, txt_seq_len - 1);  // NOTE(review): presumably keeps every text position except the last - confirm ggml_ext_slice bounds
+                txt         = ggml_concat(compute_ctx, prefix, ggml_reshape_3d(compute_ctx, t_emb, t_emb->ne[0], 1, 1), 1);  // the timestep embedding takes the final text slot
+            } else {
+                txt = ggml_reshape_3d(compute_ctx, t_emb, t_emb->ne[0], 1, 1);  // single-token case: the sequence is just the timestep embedding
+            }
+
+            auto vinputs          = DiT::pad_and_patchify(&runner_ctx, x, PATCH_SIZE, PATCH_SIZE);
+            int64_t target_tokens = vinputs->ne[1];  // patch count of the target image only, taken before reference images are appended
+            for (ggml_tensor* ref_image : ref_image_tensors) {
+                auto ref = DiT::pad_and_patchify(&runner_ctx, ref_image, PATCH_SIZE, PATCH_SIZE);
+                vinputs  = ggml_concat(compute_ctx, vinputs, ref, 1);  // reference patches follow the target patches along dim 1
+            }
+            auto vis = x_embedder->forward(&runner_ctx, vinputs);
+
+            auto inputs_embeds = ggml_concat(compute_ctx, txt, vis, 1);  // final sequence layout: text embeddings, then visual embeddings
+            auto hidden_states = text_model->forward_embeds(&runner_ctx, inputs_embeds, input_pos, attention_mask, {});
+            auto x_pred_all    = final_layer2->forward(&runner_ctx, hidden_states);
+
+            int64_t x_pred_start = txt_seq_len;  // default: the prediction starts right after the text part
+            if (!vinput_mask_tensor.empty()) {
+                int64_t seq_len      = static_cast<int64_t>(vinput_mask_tensor.shape()[0]);
+                int64_t first_vinput = 0;
+                while (first_vinput < seq_len && vinput_mask_tensor.values()[static_cast<size_t>(first_vinput)] == 0) {
+                    first_vinput++;
+                }
+                x_pred_start = first_vinput;  // otherwise: index of the first non-zero mask entry (seq_len if the mask is all zeros)
+            }
+            auto x_pred = ggml_ext_slice(compute_ctx, x_pred_all, 1, x_pred_start, x_pred_start + target_tokens);
+            x_pred      = DiT::unpatchify_and_crop(compute_ctx, x_pred, x->ne[1], x->ne[0], PATCH_SIZE, PATCH_SIZE);  // NOTE(review): presumably restores x's spatial size - confirm argument order
+
+            float sigma = 1.0f - timestep_tensor.values()[0];  // read on the host at graph-build time, from the first timestep value only
+            sigma       = std::max(1e-6f, sigma);  // clamp so the division below stays finite
+            auto out    = ggml_scale(compute_ctx, ggml_sub(compute_ctx, x, x_pred), 1.0f / sigma);
+
+            ggml_build_forward_expand(gf, out);
+            return gf;
+        }
+
+        sd::Tensor<float> compute(int n_threads,
+                                  const sd::Tensor<float>& x,
+                                  const sd::Tensor<float>& timestep,
+                                  const sd::Tensor<int32_t>& input_ids,
+                                  const sd::Tensor<int32_t>& input_pos,
+                                  const sd::Tensor<int32_t>& token_types,
+                                  const sd::Tensor<int32_t>& vinput_mask,
+                                  const std::vector<std::pair<int, sd::Tensor<float>>>& image_embeds,
+                                  const std::vector<sd::Tensor<float>>& ref_images) {
+            auto get_graph = [&]() {
+                return build_graph(x, timestep, input_ids, input_pos, token_types, vinput_mask, image_embeds, ref_images);
+            };
+            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());  // NOTE(review): presumably the three false flags keep the compute buffers allocated between steps - confirm against GGMLRunner::compute
+        }
+
+        sd::Tensor<float> compute(int n_threads,  // adapter from the generic DiffusionParams interface to the overload above
+                                  const DiffusionParams& diffusion_params) override {
+            GGML_ASSERT(diffusion_params.x != nullptr);
+            GGML_ASSERT(diffusion_params.timesteps != nullptr);
+            const auto* extra = diffusion_extra_as<HiDreamO1DiffusionExtra>(diffusion_params);
+            GGML_ASSERT(extra != nullptr);
+            GGML_ASSERT(extra->input_ids != nullptr);
+            GGML_ASSERT(extra->input_pos != nullptr);
+            GGML_ASSERT(extra->token_types != nullptr);
+            static const std::vector<sd::Tensor<float>> empty_images;  // shared empty fallbacks for the optional inputs
+            static const std::vector<std::pair<int, sd::Tensor<float>>> empty_image_embeds;
+            return compute(n_threads,
+                           *diffusion_params.x,
+                           *diffusion_params.timesteps,
+                           *extra->input_ids,
+                           *extra->input_pos,
+                           *extra->token_types,
+                           tensor_or_empty(extra->vinput_mask),
+                           extra->image_embeds ? *extra->image_embeds : empty_image_embeds,
+                           diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_images);  // ref_latents are forwarded as the ref_images argument
+        }
+    };
+
+    // Conditioner for HiDream-O1. Tokenizes the chat-formatted prompt, lays out the
+    // placeholder tokens for the image being generated (and for any reference images),
+    // and encodes reference images with the vision runner.
+    struct HiDreamO1Conditioner : public Conditioner {
+        Qwen2Tokenizer tokenizer;
+        std::shared_ptr<HiDreamO1VisionRunner> vision_runner;
+
+        // The vision runner is created with the "model.visual" tensor-name prefix.
+        HiDreamO1Conditioner(ggml_backend_t backend,
+                             const String2TensorStorage& tensor_storage_map      = {},
+                             std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
+            : vision_runner(std::make_shared<HiDreamO1VisionRunner>(backend, tensor_storage_map, "model.visual", weight_manager)) {}
+
+        // The overrides below simply forward to the vision runner.
+        void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
+            vision_runner->get_param_tensors(tensors);
+        }
+
+        void set_max_graph_vram_bytes(size_t max_graph_vram_bytes) override {
+            vision_runner->set_max_graph_vram_bytes(max_graph_vram_bytes);
+        }
+
+        void set_flash_attention_enabled(bool enabled) override {
+            vision_runner->set_flash_attention_enabled(enabled);
+        }
+
+        void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+            vision_runner->set_weight_adapter(adapter);
+        }
+
+        void runner_done() override {
+            vision_runner->runner_done();
+        }
+
+        // Builds the conditioning for one prompt.
+        //
+        // Layout of the padded token sequence that position ids, token types and the
+        // vision-input mask are computed over:
+        //   1. the tokenized chat prompt (with one <|vision_start|>...<|vision_end|> span
+        //      per reference image placed before the user text),
+        //   2. VISION_START + (target_image_len - 1) IMAGE tokens for the image to generate,
+        //   3. VISION_START + (ref_len - 1) IMAGE tokens for each reference image.
+        // c_input_ids only holds part 1.
+        //
+        // The same code path serves the text-only case: with no reference images every
+        // per-reference loop is a no-op and the result only carries the four id/mask tensors.
+        //
+        // Returns an empty SDCondition (after logging) when the requested size or a
+        // reference image is smaller than one patch, when the tokenized prompt is shorter
+        // than the timestep-token window, or when the vision encoder fails.
+        SDCondition get_learned_condition(int n_threads,
+                                          const ConditionerParams& conditioner_params) override {
+            SDCondition result;
+
+            int width  = conditioner_params.width;
+            int height = conditioner_params.height;
+            if (width / PATCH_SIZE <= 0 || height / PATCH_SIZE <= 0) {
+                // (target_image_len - 1) is used as an unsigned element count below; a
+                // zero-length grid would wrap around to an enormous insert.
+                LOG_ERROR("hidream_o1 conditioner: invalid target size %dx%d", width, height);
+                return SDCondition();
+            }
+            int64_t target_image_len = static_cast<int64_t>(width / PATCH_SIZE) * static_cast<int64_t>(height / PATCH_SIZE);
+
+            // Bind to the caller's images instead of deep-copying every tensor.
+            static const std::vector<sd::Tensor<float>> no_ref_images;
+            const std::vector<sd::Tensor<float>>& ref_images =
+                conditioner_params.ref_images != nullptr ? *conditioner_params.ref_images : no_ref_images;
+
+            std::vector<std::pair<int, sd::Tensor<float>>> vlm_images;
+            std::vector<std::array<int32_t, 3>> image_grids;
+            std::vector<int32_t> skip_vision_start;
+
+            std::string prompt = "<|im_start|>user\n";
+
+            // The more reference images there are, the smaller each one is made.
+            int K = static_cast<int>(ref_images.size());
+            int max_size;
+            if (K == 1) {
+                max_size = std::max(height, width);
+            } else if (K == 2) {
+                max_size = std::max(height, width) * 48 / 64;
+            } else if (K <= 4) {
+                max_size = std::max(height, width) / 2;
+            } else if (K <= 8) {
+                max_size = std::max(height, width) * 24 / 64;
+            } else {
+                max_size = std::max(height, width) / 4;
+            }
+
+            int cond_img_size;
+            if (K <= 4) {
+                cond_img_size = 384;
+            } else if (K <= 8) {
+                cond_img_size = 384 * 48 / 64;
+            } else {
+                cond_img_size = 384 / 2;
+            }
+
+            for (const auto& ref_image : ref_images) {
+                auto resized_ref = resize_to_area(ref_image, max_size);
+                resized_ref      = sd::ops::clamp(resized_ref, 0.0f, 1.0f);
+
+                // VLM image: Qwen3-VL expects mean=[0.5]/std=[0.5] (i.e. range [-1,1]),
+                // not CLIP normalization. Resize the already-resized ref directly to
+                // (cond_w, cond_h) to match the Python pipeline's pil_r.resize().
+                auto dims                   = calculate_dimensions(cond_img_size,
+                                                                   static_cast<double>(resized_ref.shape()[0]) / static_cast<double>(resized_ref.shape()[1]));
+                sd::Tensor<float> vlm_image = sd::ops::interpolate(
+                    resized_ref,
+                    {dims.first, dims.second, resized_ref.shape()[2], resized_ref.shape()[3]});
+                vlm_image            = vlm_image * 2.0f - 1.0f;
+                int64_t image_tokens = static_cast<int64_t>(dims.first / PATCH_SIZE) * static_cast<int64_t>(dims.second / PATCH_SIZE);
+
+                auto patch_img = resized_ref * 2.0f - 1.0f;
+                result.c_ref_images.push_back(std::move(patch_img));
+                // Token count of the prompt up to and including this image's <|vision_start|>;
+                // stored alongside the image so its embedding can be placed later.
+                int64_t prompt_start = static_cast<int64_t>(tokenizer.encode(prompt + "<|vision_start|>", nullptr).size());
+                prompt += "<|vision_start|>";
+                prompt += repeat_special_token("<|image_pad|>", image_tokens);
+                prompt += "<|vision_end|>";
+                vlm_images.emplace_back(static_cast<int>(prompt_start), std::move(vlm_image));
+                image_grids.push_back({1, dims.second / PATCH_SIZE, dims.first / PATCH_SIZE});
+                skip_vision_start.push_back(0);
+            }
+
+            prompt += conditioner_params.text;
+            prompt += "<|im_end|>\n<|im_start|>assistant\n<|boi_token|><|tms_token|>";
+            auto input_ids = tokenizer.encode(prompt, nullptr);
+
+            int txt_seq_len = static_cast<int>(input_ids.size());
+            int bgn         = txt_seq_len - TIMESTEP_TOKEN_NUM;
+            if (bgn < 0) {
+                // Would otherwise index token_types with a negative offset.
+                LOG_ERROR("hidream_o1 conditioner: tokenized prompt is too short (%d tokens)", txt_seq_len);
+                return SDCondition();
+            }
+
+            // Placeholder tokens for the image being generated.
+            std::vector<int32_t> input_ids_pad = input_ids;
+            input_ids_pad.push_back(VISION_START_TOKEN_ID);
+            input_ids_pad.insert(input_ids_pad.end(), target_image_len - 1, IMAGE_TOKEN_ID);
+            image_grids.push_back({1, static_cast<int32_t>(height / PATCH_SIZE), static_cast<int32_t>(width / PATCH_SIZE)});
+            skip_vision_start.push_back(1);
+
+            // Placeholder tokens for each reference image.
+            for (const auto& ref_image : result.c_ref_images) {
+                int64_t ref_len = static_cast<int64_t>(ref_image.shape()[0] / PATCH_SIZE) * static_cast<int64_t>(ref_image.shape()[1] / PATCH_SIZE);
+                if (ref_len <= 0) {
+                    // Same unsigned wrap-around hazard as for the target image.
+                    LOG_ERROR("hidream_o1 conditioner: reference image is smaller than one patch");
+                    return SDCondition();
+                }
+                input_ids_pad.push_back(VISION_START_TOKEN_ID);
+                input_ids_pad.insert(input_ids_pad.end(), ref_len - 1, IMAGE_TOKEN_ID);
+                image_grids.push_back({1, static_cast<int32_t>(ref_image.shape()[1] / PATCH_SIZE), static_cast<int32_t>(ref_image.shape()[0] / PATCH_SIZE)});
+                skip_vision_start.push_back(1);
+            }
+
+            // Token type 1 from the last TIMESTEP_TOKEN_NUM prompt tokens to the end, 0 before.
+            std::vector<int32_t> token_types(input_ids_pad.size(), 0);
+            for (int i = bgn; i < static_cast<int>(token_types.size()); ++i) {
+                token_types[i] = 1;
+            }
+
+            // The vision-input mask is 1 for every token appended after the prompt.
+            std::vector<int32_t> vinput_mask(token_types.size(), 0);
+            for (int i = txt_seq_len; i < static_cast<int>(vinput_mask.size()); ++i) {
+                vinput_mask[static_cast<size_t>(i)] = 1;
+            }
+
+            std::vector<int64_t> input_shape{static_cast<int64_t>(input_ids.size())};
+            std::vector<int64_t> position_shape{static_cast<int64_t>(input_ids_pad.size() * 4)};
+            std::vector<int64_t> token_type_shape{static_cast<int64_t>(token_types.size())};
+            std::vector<int64_t> vinput_mask_shape{static_cast<int64_t>(vinput_mask.size())};
+
+            result.c_input_ids    = sd::Tensor<int32_t>(input_shape, std::move(input_ids));
+            result.c_position_ids = sd::Tensor<int32_t>(position_shape, build_position_ids(input_ids_pad, image_grids, skip_vision_start));
+            result.c_token_types  = sd::Tensor<int32_t>(token_type_shape, std::move(token_types));
+            result.c_vinput_mask  = sd::Tensor<int32_t>(vinput_mask_shape, std::move(vinput_mask));
+
+            // Encode each reference image; the pair's first member is its prompt offset.
+            result.c_image_embeds.reserve(vlm_images.size());
+            for (const auto& vlm_image : vlm_images) {
+                auto image_embed = vision_runner->compute(n_threads, vlm_image.second, false, true, true);
+                if (image_embed.empty()) {
+                    LOG_ERROR("hidream_o1 conditioner: encode VLM image failed");
+                    return SDCondition();
+                }
+                result.c_image_embeds.emplace_back(vlm_image.first, std::move(image_embed));
+            }
+            return result;
+        }
+    };
+}  // namespace HiDreamO1
+
+#endif  // __SD_MODEL_DIFFUSION_HIDREAM_O1_HPP__
--- a/src/model/diffusion/ideogram4.hpp
+++ b/src/model/diffusion/ideogram4.hpp
@ -0,0 +1,557 @@
+#ifndef __SD_MODEL_DIFFUSION_IDEOGRAM4_HPP__
+#define __SD_MODEL_DIFFUSION_IDEOGRAM4_HPP__
+
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "core/ggml_extend.hpp"
+#include "core/ggml_graph_cut.h"
+#include "model/common/rope.hpp"
+#include "model/diffusion/model.hpp"
+
+namespace Ideogram4 {
+    // Not referenced in the visible part of this header; presumably the ggml graph
+    // node budget used by the runner — TODO confirm.
+    constexpr int IDEOGRAM4_GRAPH_SIZE    = 65536;
+    // Not referenced in the visible part of this header; presumably the indicator id
+    // assigned to output-image tokens — TODO confirm against build_graph.
+    constexpr int OUTPUT_IMAGE_INDICATOR  = 2;
+    // Added to every image-token position id in gen_ideogram4_pe.
+    constexpr int IMAGE_POSITION_OFFSET   = 65536;
+    // Default entries of Ideogram4Config::mrope_section, in this order.
+    constexpr int DEFAULT_MROPE_SECTION_T = 24;
+    constexpr int DEFAULT_MROPE_SECTION_H = 20;
+    constexpr int DEFAULT_MROPE_SECTION_W = 20;
+    // Passed to ggml_ext_timestep_embedding by timestep_embedding_sin_cos.
+    constexpr int TIMESTEP_MAX_PERIOD     = 10000;
+    // Number of LLM hidden-state layers packed along the feature axis of the context
+    // tensor; interleave_hidden_state_layers asserts the feature size is a multiple of it.
+    constexpr int LLM_HIDDEN_STATE_LAYERS = 13;
+
+    // Hyper-parameters of the Ideogram4 transformer. Every field has a built-in default;
+    // detect_from_weights() only overrides num_layers.
+    struct Ideogram4Config {
+        int64_t emb_dim                = 4608;
+        int64_t num_layers             = 34;
+        int64_t num_heads              = 18;
+        int64_t intermediate_size      = 12288;
+        int64_t adanln_dim             = 512;
+        int64_t in_channels            = 128;
+        int64_t llm_features_dim       = 53248;
+        int64_t rope_theta             = 5000000;
+        float norm_eps                 = 1e-5f;
+        int patch_size                 = 2;
+        int ae_channels                = 32;
+        std::vector<int> mrope_section = {DEFAULT_MROPE_SECTION_T,
+                                          DEFAULT_MROPE_SECTION_H,
+                                          DEFAULT_MROPE_SECTION_W};
+
+        // Returns a default config whose num_layers is (highest layer index + 1) among the
+        // tensors named "<prefix>.layers.<N>.<...>" (or "layers.<N>.<...>" when prefix is
+        // empty). If no such tensor exists the default num_layers is kept.
+        //
+        // Only purely numeric <N> segments are counted: a malformed name such as
+        // "layers.foo.weight" is skipped rather than being read as layer 0.
+        static Ideogram4Config detect_from_weights(const String2TensorStorage& tensor_storage_map,
+                                                   const std::string& prefix) {
+            Ideogram4Config config;
+            int64_t detected_layers  = 0;
+            std::string layer_prefix = prefix.empty() ? "layers." : prefix + ".layers.";
+            for (const auto& [name, _] : tensor_storage_map) {
+                // Anchored prefix test; does not scan the rest of the name.
+                if (name.compare(0, layer_prefix.size(), layer_prefix) != 0) {
+                    continue;
+                }
+                std::string tail = name.substr(layer_prefix.size());
+                size_t dot       = tail.find('.');
+                if (dot == std::string::npos) {
+                    continue;
+                }
+                const std::string index_text = tail.substr(0, dot);
+                // At most 9 decimal digits always fits in an int, so atoi cannot overflow.
+                const bool is_layer_index = !index_text.empty() &&
+                                            index_text.size() <= 9 &&
+                                            std::all_of(index_text.begin(), index_text.end(),
+                                                        [](char c) { return c >= '0' && c <= '9'; });
+                if (!is_layer_index) {
+                    continue;
+                }
+                int layer_idx   = std::atoi(index_text.c_str());
+                detected_layers = std::max<int64_t>(detected_layers, layer_idx + 1);
+            }
+            if (detected_layers > 0) {
+                config.num_layers = detected_layers;
+                LOG_DEBUG("ideogram4: num_layers = %" PRId64 ", emb_dim = %" PRId64 ", num_heads = %" PRId64 ", intermediate_size = %" PRId64,
+                          config.num_layers,
+                          config.emb_dim,
+                          config.num_heads,
+                          config.intermediate_size);
+            }
+            return config;
+        }
+    };
+
+    // Timestep embedding with its two halves swapped: the embedding produced by
+    // ggml_ext_timestep_embedding is split in two along dim 0 and re-joined with the
+    // second half first. `dim` must be even.
+    __STATIC_INLINE__ ggml_tensor* timestep_embedding_sin_cos(ggml_context* ctx,
+                                                              ggml_tensor* timesteps,
+                                                              int dim) {
+        GGML_ASSERT(dim % 2 == 0);
+        auto raw_embedding = ggml_ext_timestep_embedding(ctx, timesteps, dim, TIMESTEP_MAX_PERIOD, 10.f);
+        auto halves        = ggml_ext_chunk(ctx, raw_embedding, 2, 0);
+        auto first_half    = halves[0];
+        auto second_half   = halves[1];
+        return ggml_concat(ctx, second_half, first_half, 0);
+    }
+
+    // Gives a per-sample modulation tensor a singleton token axis so it can be combined
+    // with per-token activations: [N, C] -> [N, 1, C] in PyTorch layout. Tensors that
+    // already have at least 3 dims and ne[1] == 1 are returned untouched.
+    __STATIC_INLINE__ ggml_tensor* to_token_modulation(ggml_context* ctx, ggml_tensor* x) {
+        const bool has_token_axis = ggml_n_dims(x) >= 3 && x->ne[1] == 1;
+        if (has_token_axis) {
+            return x;
+        }
+        return ggml_reshape_3d(ctx, x, x->ne[0], 1, x->ne[1]);
+    }
+
+    __STATIC_INLINE__ ggml_tensor* interleave_hidden_state_layers(ggml_context* ctx, ggml_tensor* x) {
+        // Match upstream stack(...).permute(1, 2, 3, 0).reshape(...):
+        // [layers * hidden, tokens, batch] -> [hidden * layers, tokens, batch].
+        // The feature axis must hold exactly LLM_HIDDEN_STATE_LAYERS equally sized slices.
+        const int64_t features = x->ne[0];
+        GGML_ASSERT(features % LLM_HIDDEN_STATE_LAYERS == 0);
+        const int64_t per_layer = features / LLM_HIDDEN_STATE_LAYERS;
+        const int64_t n_tokens  = x->ne[1];
+        const int64_t n_batch   = x->ne[2];
+
+        // Expose the layer axis, swap it with the hidden axis, then flatten back.
+        auto split   = ggml_reshape_4d(ctx, x, per_layer, LLM_HIDDEN_STATE_LAYERS, n_tokens, n_batch);
+        auto swapped = ggml_cont(ctx, ggml_permute(ctx, split, 1, 0, 2, 3));
+        return ggml_reshape_3d(ctx, swapped, per_layer * LLM_HIDDEN_STATE_LAYERS, n_tokens, n_batch);
+    }
+
+    // Scale-only modulation: returns x + x * scale, with `scale` first given a token axis
+    // by to_token_modulation.
+    __STATIC_INLINE__ ggml_tensor* modulate(ggml_context* ctx, ggml_tensor* x, ggml_tensor* scale) {
+        auto token_scale = to_token_modulation(ctx, scale);
+        auto scaled      = ggml_mul(ctx, x, token_scale);
+        return ggml_add(ctx, x, scaled);
+    }
+
+    __STATIC_INLINE__ ggml_tensor* patchify(ggml_context* ctx, ggml_tensor* x, const Ideogram4Config& config) {
+        // x: [N, 128, H, W] with channel order [ae, ph, pw].
+        // return: [N, H*W, 128] with token channel order [ph, pw, ae].
+        // Only batch size 1 is supported.
+        const int64_t width    = x->ne[0];
+        const int64_t height   = x->ne[1];
+        const int64_t channels = x->ne[2];
+        const int64_t batch    = x->ne[3];
+
+        GGML_ASSERT(batch == 1);
+        GGML_ASSERT(channels == config.ae_channels * config.patch_size * config.patch_size);
+
+        const int64_t n_tokens = width * height;
+
+        // Split the channel axis into its factors, move the spatial axis outermost and
+        // the ae axis innermost, then collapse the factors back into one channel axis.
+        auto contiguous = ggml_cont(ctx, x);
+        auto factored   = ggml_reshape_4d(ctx, contiguous, n_tokens, config.patch_size, config.patch_size, config.ae_channels);
+        auto reordered  = ggml_cont(ctx, ggml_permute(ctx, factored, 3, 1, 2, 0));
+        return ggml_reshape_3d(ctx, reordered, channels, n_tokens, batch);
+    }
+
+    // Inverse of patchify: turns a token tensor with ne = [C, H*W, 1] back into a
+    // feature map with ne = [W, H, C, 1]. Only batch size 1 is supported, and C must
+    // equal ae_channels * patch_size^2.
+    __STATIC_INLINE__ ggml_tensor* unpatchify(ggml_context* ctx,
+                                              ggml_tensor* x,
+                                              int64_t H,
+                                              int64_t W,
+                                              const Ideogram4Config& config) {
+        const int64_t channels = x->ne[0];
+        const int64_t batch    = x->ne[2];
+        const int64_t n_tokens = H * W;
+
+        GGML_ASSERT(batch == 1);
+        GGML_ASSERT(channels == config.ae_channels * config.patch_size * config.patch_size);
+        GGML_ASSERT(x->ne[1] == n_tokens);
+
+        // Factor the channel axis, swap the ae axis with the token axis, then restore
+        // the spatial layout.
+        auto factored  = ggml_reshape_4d(ctx, x, config.ae_channels, config.patch_size, config.patch_size, n_tokens);
+        auto reordered = ggml_cont(ctx, ggml_permute(ctx, factored, 3, 1, 2, 0));
+        return ggml_reshape_4d(ctx, reordered, W, H, channels, batch);
+    }
+
+    // Factory for every Linear layer in this model: only the feature sizes and the bias
+    // flag vary, the remaining Linear constructor arguments are fixed here.
+    __STATIC_INLINE__ std::shared_ptr<Linear> make_linear(int64_t in_features,
+                                                          int64_t out_features,
+                                                          bool bias = true) {
+        auto layer = std::make_shared<Linear>(in_features, out_features, bias, false, false, 1.f, true);
+        return layer;
+    }
+
+    // Builds the rotary position-embedding table for a sequence made of `context_len`
+    // text tokens followed by a grid_h x grid_w image grid (row-major).
+    //
+    // Each token gets a 3-component position id:
+    //   - text token i:        (i, i, i)
+    //   - image token (y, x):  (OFFSET, OFFSET + y, OFFSET + x), OFFSET = IMAGE_POSITION_OFFSET
+    // When circular_y / circular_x is set, the image tokens get a wrap size of
+    // grid_h / grid_w on axis 1 / 2, which is handed to Rope::embed_interleaved_mrope.
+    // Only bs == 1 is supported.
+    __STATIC_INLINE__ std::vector<float> gen_ideogram4_pe(int grid_h,
+                                                          int grid_w,
+                                                          int bs,
+                                                          int context_len,
+                                                          int head_dim,
+                                                          int rope_theta,
+                                                          const std::vector<int>& mrope_section,
+                                                          bool circular_x = false,
+                                                          bool circular_y = false) {
+        GGML_ASSERT(bs == 1);
+        const size_t total_len = static_cast<size_t>(bs) * (context_len + grid_h * grid_w);
+
+        std::vector<std::vector<float>> ids(total_len, std::vector<float>(3, 0.f));
+
+        // Text tokens: the same running index on all three axes.
+        for (int token = 0; token < context_len; ++token) {
+            const float pos = static_cast<float>(token);
+            ids[token]      = {pos, pos, pos};
+        }
+
+        // Image tokens, row by row, directly after the text tokens.
+        const float image_t = static_cast<float>(IMAGE_POSITION_OFFSET);
+        for (int row = 0; row < grid_h; ++row) {
+            const float image_y = static_cast<float>(IMAGE_POSITION_OFFSET + row);
+            for (int col = 0; col < grid_w; ++col) {
+                ids[context_len + row * grid_w + col] = {image_t,
+                                                         image_y,
+                                                         static_cast<float>(IMAGE_POSITION_OFFSET + col)};
+            }
+        }
+
+        // Wrap sizes stay empty unless at least one axis is circular.
+        std::vector<std::vector<int>> axis_wrap_dims(3);
+        if (circular_x || circular_y) {
+            axis_wrap_dims[1].assign(total_len, 0);
+            axis_wrap_dims[2].assign(total_len, 0);
+            for (size_t idx = static_cast<size_t>(context_len); idx < total_len; ++idx) {
+                if (circular_y) {
+                    axis_wrap_dims[1][idx] = grid_h;
+                }
+                if (circular_x) {
+                    axis_wrap_dims[2][idx] = grid_w;
+                }
+            }
+        }
+
+        return Rope::embed_interleaved_mrope(ids,
+                                             bs,
+                                             static_cast<float>(rope_theta),
+                                             head_dim,
+                                             mrope_section,
+                                             axis_wrap_dims);
+    }
+
+    // Self-attention block: fused bias-free QKV projection, RMSNorm on the per-head
+    // queries and keys, Rope::attention, then a bias-free output projection.
+    class Ideogram4Attention : public GGMLBlock {
+    protected:
+        int64_t hidden_size;
+        int64_t num_heads;
+        int64_t head_dim;
+
+    public:
+        // hidden_size must be divisible by num_heads; eps is the RMSNorm epsilon.
+        Ideogram4Attention(int64_t hidden_size, int64_t num_heads, float eps)
+            : hidden_size(hidden_size), num_heads(num_heads), head_dim(hidden_size / num_heads) {
+            GGML_ASSERT(hidden_size % num_heads == 0);
+            blocks["qkv"]    = make_linear(hidden_size, 3 * hidden_size, false);
+            blocks["norm_q"] = std::make_shared<RMSNorm>(head_dim, eps);
+            blocks["norm_k"] = std::make_shared<RMSNorm>(head_dim, eps);
+            blocks["o"]      = make_linear(hidden_size, hidden_size, false);
+        }
+
+        // x has ne = [hidden, tokens, batch]; pe and mask are passed straight to Rope::attention.
+        ggml_tensor* forward(GGMLRunnerContext* ctx,
+                             ggml_tensor* x,
+                             ggml_tensor* pe,
+                             ggml_tensor* mask = nullptr) {
+            const int64_t seq_len = x->ne[1];
+            const int64_t batch   = x->ne[2];
+
+            auto to_qkv     = std::dynamic_pointer_cast<Linear>(blocks["qkv"]);
+            auto query_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_q"]);
+            auto key_norm   = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_k"]);
+            auto to_out     = std::dynamic_pointer_cast<Linear>(blocks["o"]);
+
+            // Project once, split into Q/K/V, and expose the head axis on each.
+            auto fused = to_qkv->forward(ctx, x);
+            auto parts = split_qkv(ctx->ggml_ctx, fused);
+            auto query = ggml_reshape_4d(ctx->ggml_ctx, parts[0], head_dim, num_heads, seq_len, batch);
+            auto key   = ggml_reshape_4d(ctx->ggml_ctx, parts[1], head_dim, num_heads, seq_len, batch);
+            auto value = ggml_reshape_4d(ctx->ggml_ctx, parts[2], head_dim, num_heads, seq_len, batch);
+
+            query = query_norm->forward(ctx, query);
+            key   = key_norm->forward(ctx, key);
+
+            auto attended = Rope::attention(ctx, query, key, value, pe, mask, 1.f / 128.f, false);
+            return to_out->forward(ctx, attended);
+        }
+    };
+
+    // Gated feed-forward block: w2(silu(w1(x)) * w3(x)), all projections bias-free.
+    class Ideogram4MLP : public GGMLBlock {
+    public:
+        Ideogram4MLP(int64_t dim, int64_t hidden_dim) {
+            blocks["w1"] = make_linear(dim, hidden_dim, false);
+            blocks["w2"] = make_linear(hidden_dim, dim, false);
+            blocks["w3"] = make_linear(dim, hidden_dim, false);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
+            auto gate_proj = std::dynamic_pointer_cast<Linear>(blocks["w1"]);
+            auto down_proj = std::dynamic_pointer_cast<Linear>(blocks["w2"]);
+            auto up_proj   = std::dynamic_pointer_cast<Linear>(blocks["w3"]);
+
+            auto gate  = ggml_silu(ctx->ggml_ctx, gate_proj->forward(ctx, x));
+            auto up    = up_proj->forward(ctx, x);
+            auto gated = ggml_mul(ctx->ggml_ctx, gate, up);
+            return down_proj->forward(ctx, gated);
+        }
+    };
+
+    // One transformer layer. Attention and feed-forward are each wrapped as
+    //   x = x + tanh(gate) * norm2(sublayer(modulate(norm1(x), scale)))
+    // where the four scale/gate tensors come from a single linear map of adaln_input.
+    class Ideogram4TransformerBlock : public GGMLBlock {
+    public:
+        Ideogram4TransformerBlock(const Ideogram4Config& config) {
+            const int64_t dim = config.emb_dim;
+            const float eps   = config.norm_eps;
+
+            blocks["attention"]        = std::make_shared<Ideogram4Attention>(dim, config.num_heads, eps);
+            blocks["feed_forward"]     = std::make_shared<Ideogram4MLP>(dim, config.intermediate_size);
+            blocks["attention_norm1"]  = std::make_shared<RMSNorm>(dim, eps);
+            blocks["ffn_norm1"]        = std::make_shared<RMSNorm>(dim, eps);
+            blocks["attention_norm2"]  = std::make_shared<RMSNorm>(dim, eps);
+            blocks["ffn_norm2"]        = std::make_shared<RMSNorm>(dim, eps);
+            blocks["adaln_modulation"] = make_linear(config.adanln_dim, 4 * dim, true);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx,
+                             ggml_tensor* x,
+                             ggml_tensor* pe,
+                             ggml_tensor* adaln_input,
+                             ggml_tensor* mask = nullptr) {
+            auto attn       = std::dynamic_pointer_cast<Ideogram4Attention>(blocks["attention"]);
+            auto mlp        = std::dynamic_pointer_cast<Ideogram4MLP>(blocks["feed_forward"]);
+            auto attn_pre   = std::dynamic_pointer_cast<RMSNorm>(blocks["attention_norm1"]);
+            auto mlp_pre    = std::dynamic_pointer_cast<RMSNorm>(blocks["ffn_norm1"]);
+            auto attn_post  = std::dynamic_pointer_cast<RMSNorm>(blocks["attention_norm2"]);
+            auto mlp_post   = std::dynamic_pointer_cast<RMSNorm>(blocks["ffn_norm2"]);
+            auto modulation = std::dynamic_pointer_cast<Linear>(blocks["adaln_modulation"]);
+
+            // Chunk order along dim 0: attention scale, attention gate, MLP scale, MLP gate.
+            auto packed      = modulation->forward(ctx, adaln_input);
+            auto chunks      = ggml_ext_chunk(ctx->ggml_ctx, packed, 4, 0);
+            auto attn_scale  = chunks[0];
+            auto attn_gate   = to_token_modulation(ctx->ggml_ctx, ggml_tanh(ctx->ggml_ctx, chunks[1]));
+            auto mlp_scale   = chunks[2];
+            auto mlp_gate    = to_token_modulation(ctx->ggml_ctx, ggml_tanh(ctx->ggml_ctx, chunks[3]));
+
+            // Attention sub-layer with gated residual.
+            auto branch = attn_pre->forward(ctx, x);
+            branch      = modulate(ctx->ggml_ctx, branch, attn_scale);
+            branch      = attn->forward(ctx, branch, pe, mask);
+            branch      = attn_post->forward(ctx, branch);
+            x           = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, branch, attn_gate));
+
+            // Feed-forward sub-layer with gated residual.
+            branch = mlp_pre->forward(ctx, x);
+            branch = modulate(ctx->ggml_ctx, branch, mlp_scale);
+            branch = mlp->forward(ctx, branch);
+            branch = mlp_post->forward(ctx, branch);
+            x      = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, branch, mlp_gate));
+
+            return x;
+        }
+    };
+
+    // Embeds a scalar (used for the timestep): timestep_embedding_sin_cos of width `dim`
+    // followed by a two-layer MLP (linear, SiLU, linear).
+    class Ideogram4EmbedScalar : public GGMLBlock {
+    protected:
+        int64_t dim;
+
+    public:
+        Ideogram4EmbedScalar(int64_t dim)
+            : dim(dim) {
+            blocks["mlp_in"]  = make_linear(dim, dim, true);
+            blocks["mlp_out"] = make_linear(dim, dim, true);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
+            auto first_linear  = std::dynamic_pointer_cast<Linear>(blocks["mlp_in"]);
+            auto second_linear = std::dynamic_pointer_cast<Linear>(blocks["mlp_out"]);
+
+            auto embedded  = timestep_embedding_sin_cos(ctx->ggml_ctx, x, static_cast<int>(dim));
+            auto activated = ggml_silu(ctx->ggml_ctx, first_linear->forward(ctx, embedded));
+            return second_linear->forward(ctx, activated);
+        }
+    };
+
+    // Output head: LayerNorm, scale-only modulation derived from the conditioning vector
+    // `c` (after SiLU), then a linear projection from emb_dim to in_channels.
+    class Ideogram4FinalLayer : public GGMLBlock {
+    public:
+        Ideogram4FinalLayer(const Ideogram4Config& config) {
+            blocks["norm_final"]       = std::make_shared<LayerNorm>(config.emb_dim, 1e-6f, false);
+            blocks["linear"]           = make_linear(config.emb_dim, config.in_channels, true);
+            blocks["adaln_modulation"] = make_linear(config.adanln_dim, config.emb_dim, true);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* c) {
+            auto final_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["norm_final"]);
+            auto out_proj   = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
+            auto scale_proj = std::dynamic_pointer_cast<Linear>(blocks["adaln_modulation"]);
+
+            auto scale     = scale_proj->forward(ctx, ggml_silu(ctx->ggml_ctx, c));
+            auto normed    = final_norm->forward(ctx, x);
+            auto modulated = modulate(ctx->ggml_ctx, normed, scale);
+            return out_proj->forward(ctx, modulated);
+        }
+    };
+
+    // Full Ideogram4 diffusion transformer: patch embedding, optional LLM-feature context
+    // tokens prepended to the image tokens, a stack of transformer blocks conditioned on
+    // the timestep, and a final projection back to the latent layout.
+    class Ideogram4Transformer : public GGMLBlock {
+    protected:
+        Ideogram4Config config;
+
+    public:
+        Ideogram4Transformer() = default;
+        explicit Ideogram4Transformer(Ideogram4Config config)
+            : config(std::move(config)) {
+            // The parameter was moved from; always read the member from here on.
+            const Ideogram4Config& cfg = this->config;
+
+            blocks["input_proj"]            = make_linear(cfg.in_channels, cfg.emb_dim, true);
+            blocks["llm_cond_norm"]         = std::make_shared<RMSNorm>(cfg.llm_features_dim, 1e-6f);
+            blocks["llm_cond_proj"]         = make_linear(cfg.llm_features_dim, cfg.emb_dim, true);
+            blocks["t_embedding"]           = std::make_shared<Ideogram4EmbedScalar>(cfg.emb_dim);
+            blocks["adaln_proj"]            = make_linear(cfg.emb_dim, cfg.adanln_dim, true);
+            blocks["embed_image_indicator"] = std::make_shared<Embedding>(2, cfg.emb_dim);
+
+            for (int layer = 0; layer < cfg.num_layers; ++layer) {
+                blocks["layers." + std::to_string(layer)] = std::make_shared<Ideogram4TransformerBlock>(cfg);
+            }
+            blocks["final_layer"] = std::make_shared<Ideogram4FinalLayer>(cfg);
+        }
+
+        // x:                   latent with ne = [W, H, C, 1]
+        // timestep:            input to the timestep embedder
+        // context:             optional LLM features; nullptr means image tokens only
+        // pe:                  positional-embedding tensor handed to every block
+        // image_indicator_ids: ids looked up in the indicator embedding and added to the tokens
+        // Returns the negated prediction in the same [W, H, C, 1] layout as x.
+        ggml_tensor* forward(GGMLRunnerContext* ctx,
+                             ggml_tensor* x,
+                             ggml_tensor* timestep,
+                             ggml_tensor* context,
+                             ggml_tensor* pe,
+                             ggml_tensor* image_indicator_ids) {
+            const int64_t latent_w = x->ne[0];
+            const int64_t latent_h = x->ne[1];
+            const int64_t batch    = x->ne[3];
+            GGML_ASSERT(batch == 1);
+
+            auto patch_proj     = std::dynamic_pointer_cast<Linear>(blocks["input_proj"]);
+            auto context_norm   = std::dynamic_pointer_cast<RMSNorm>(blocks["llm_cond_norm"]);
+            auto context_proj   = std::dynamic_pointer_cast<Linear>(blocks["llm_cond_proj"]);
+            auto time_embedder  = std::dynamic_pointer_cast<Ideogram4EmbedScalar>(blocks["t_embedding"]);
+            auto cond_proj      = std::dynamic_pointer_cast<Linear>(blocks["adaln_proj"]);
+            auto indicator_embd = std::dynamic_pointer_cast<Embedding>(blocks["embed_image_indicator"]);
+            auto head           = std::dynamic_pointer_cast<Ideogram4FinalLayer>(blocks["final_layer"]);
+
+            auto image_tokens = patchify(ctx->ggml_ctx, x, config);
+            image_tokens      = patch_proj->forward(ctx, image_tokens);
+
+            // Without context the sequence is just the image tokens.
+            ggml_tensor* hidden  = image_tokens;
+            int64_t n_ctx_tokens = 0;
+            if (context != nullptr) {
+                if (ggml_n_dims(context) < 3) {
+                    context = ggml_reshape_3d(ctx->ggml_ctx, context, context->ne[0], context->ne[1], 1);
+                }
+                context          = interleave_hidden_state_layers(ctx->ggml_ctx, context);
+                n_ctx_tokens     = context->ne[1];
+                auto text_tokens = context_norm->forward(ctx, context);
+                text_tokens      = context_proj->forward(ctx, text_tokens);
+                // Context tokens come first, image tokens after.
+                hidden = ggml_concat(ctx->ggml_ctx, text_tokens, image_tokens, 1);
+            }
+
+            auto indicator = indicator_embd->forward(ctx, image_indicator_ids);
+            hidden         = ggml_add(ctx->ggml_ctx, hidden, indicator);
+
+            auto time_cond = time_embedder->forward(ctx, timestep);
+            auto cond      = ggml_silu(ctx->ggml_ctx, cond_proj->forward(ctx, time_cond));
+
+            for (int layer = 0; layer < config.num_layers; ++layer) {
+                const std::string layer_name = "layers." + std::to_string(layer);
+                auto block                   = std::dynamic_pointer_cast<Ideogram4TransformerBlock>(blocks[layer_name]);
+                hidden                       = block->forward(ctx, hidden, pe, cond, nullptr);
+                sd::ggml_graph_cut::mark_graph_cut(hidden, "ideogram4." + layer_name, "hidden");
+            }
+
+            hidden = head->forward(ctx, hidden, cond);
+            if (n_ctx_tokens > 0) {
+                // Drop the context tokens; only the image tokens are unpatchified.
+                hidden = ggml_ext_slice(ctx->ggml_ctx, hidden, 1, n_ctx_tokens, hidden->ne[1]);
+            }
+
+            hidden = unpatchify(ctx->ggml_ctx, hidden, latent_h, latent_w, config);
+            return ggml_ext_scale(ctx->ggml_ctx, hidden, -1.f);
+        }
+    };
+
+    class Ideogram4Runner : public DiffusionModelRunner {
+    protected:
+        // True when a separate unconditional model was loaded and this call carries no
+        // context but a non-empty `y`.
+        bool should_use_uncond_model(const DiffusionParams& diffusion_params) const {
+            if (!has_uncond_model) {
+                return false;
+            }
+            if (diffusion_params.context != nullptr) {
+                return false;
+            }
+            if (diffusion_params.y == nullptr) {
+                return false;
+            }
+            return !diffusion_params.y->empty();
+        }
+
+    public:
+        Ideogram4Config config;
+        Ideogram4Transformer model;
+        Ideogram4Transformer uncond_model;
+        bool has_uncond_model = false;
+        std::string uncond_prefix;
+        std::vector<float> pe_vec;
+        std::vector<int32_t> image_indicator_vec;
+
+        Ideogram4Runner(ggml_backend_t backend,
+                        const String2TensorStorage& tensor_storage_map      = {},
+                        const std::string prefix                            = "",
+                        std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
+            : DiffusionModelRunner(backend, prefix, weight_manager),
+              config(Ideogram4Config::detect_from_weights(tensor_storage_map, prefix)),
+              uncond_prefix(prefix + ".uncond") {
+            model = Ideogram4Transformer(config);
+            model.init(params_ctx, tensor_storage_map, prefix);
+            for (const auto& pair : tensor_storage_map) {
+                const std::string& name = pair.first;
+                if (starts_with(name, uncond_prefix)) {
+                    has_uncond_model = true;
+                    break;
+                }
+            }
+            if (has_uncond_model) {
+                LOG_DEBUG("using uncond model");
+                uncond_model = Ideogram4Transformer(config);
+                uncond_model.init(params_ctx, tensor_storage_map, uncond_prefix);
+            }
+        }
+
+        std::string get_desc() override {
+            return "ideogram4";
+        }
+
+        void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
+            model.get_param_tensors(tensors, prefix);
+            if (has_uncond_model) {
+                uncond_model.get_param_tensors(tensors, this->uncond_prefix);
+            }
+        }
+
+        ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,
+                                 const sd::Tensor<float>& timesteps_tensor,
+                                 const sd::Tensor<float>& context_tensor,
+                                 bool use_uncond_model = false) {
+            ggml_cgraph* gf        = new_graph_custom(IDEOGRAM4_GRAPH_SIZE);
+            ggml_tensor* x         = make_input(x_tensor);
+            ggml_tensor* timesteps = make_input(timesteps_tensor);
+            GGML_ASSERT(x->ne[3] == 1);
+            Ideogram4Transformer& active_model = use_uncond_model ? uncond_model : model;
+
+            ggml_tensor* context = nullptr;
+            int64_t context_len  = 0;
+            if (!context_tensor.empty()) {
+                context     = make_input(context_tensor);
+                context_len = context->ne[1];
+            }
+
+            int64_t grid_w   = x->ne[0];
+            int64_t grid_h   = x->ne[1];
+            int64_t pos_len  = context_len + grid_h * grid_w;
+            int64_t head_dim = config.emb_dim / config.num_heads;
+
+            auto runner_ctx = get_context();
+            pe_vec          = gen_ideogram4_pe(static_cast<int>(grid_h),
+                                               static_cast<int>(grid_w),
+                                               static_cast<int>(x->ne[3]),
+                                               static_cast<int>(context_len),
+                                               static_cast<int>(head_dim),
+                                               static_cast<int>(config.rope_theta),
+                                               config.mrope_section,
+                                               runner_ctx.circular_x_enabled,
+                                               runner_ctx.circular_y_enabled);
+            auto pe         = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, head_dim / 2, pos_len);
+            set_backend_tensor_data(pe, pe_vec.data());
+
+            image_indicator_vec.assign(static_cast<size_t>(pos_len), 1);
+            for (int64_t i = 0; i < context_len; ++i) {
+                image_indicator_vec[static_cast<size_t>(i)] = 0;
+            }
+            auto indicator = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_I32, pos_len, x->ne[3]);
+            set_backend_tensor_data(indicator, image_indicator_vec.data());
+
+            ggml_tensor* out = active_model.forward(&runner_ctx, x, timesteps, context, pe, indicator);
+            ggml_build_forward_expand(gf, out);
+            return gf;
+        }
+
+        sd::Tensor<float> compute(int n_threads,
+                                  const sd::Tensor<float>& x,
+                                  const sd::Tensor<float>& timesteps,
+                                  const sd::Tensor<float>& context,
+                                  bool use_uncond_model = false) {
+            auto get_graph = [&]() -> ggml_cgraph* {
+                return build_graph(x, timesteps, context, use_uncond_model);
+            };
+            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
+        }
+
+        sd::Tensor<float> compute(int n_threads,
+                                  const DiffusionParams& diffusion_params) override {
+            GGML_ASSERT(diffusion_params.x != nullptr);
+            GGML_ASSERT(diffusion_params.timesteps != nullptr);
+            bool use_uncond_model = should_use_uncond_model(diffusion_params);
+            return compute(n_threads,
+                           *diffusion_params.x,
+                           *diffusion_params.timesteps,
+                           tensor_or_empty(diffusion_params.context),
+                           use_uncond_model);
+        }
+    };
+}  // namespace Ideogram4
+
+#endif  // __SD_MODEL_DIFFUSION_IDEOGRAM4_HPP__
--- a/src/model/diffusion/lens.hpp
+++ b/src/model/diffusion/lens.hpp
@ -0,0 +1,426 @@
+#ifndef __SD_MODEL_DIFFUSION_LENS_HPP__
+#define __SD_MODEL_DIFFUSION_LENS_HPP__
+
+#include <memory>
+#include <vector>
+
+#include "model/common/block.hpp"
+#include "model/common/rope.hpp"
+#include "model/diffusion/flux.hpp"
+#include "model/diffusion/model.hpp"
+#include "model/diffusion/qwen_image.hpp"
+
+namespace Lens {
+    constexpr int LENS_GRAPH_SIZE = 40960;
+
+    // Hyper-parameters of the Lens diffusion transformer. The member initialisers
+    // are the defaults; detect_from_weights() overrides the ones it can infer
+    // from tensor shapes found in a checkpoint.
+    struct LensConfig {
+        int patch_size              = 2;
+        int64_t in_channels         = 128;
+        int64_t out_channels        = 32;
+        int num_layers              = 48;
+        int64_t attention_head_dim  = 64;
+        int64_t num_attention_heads = 24;
+        int64_t joint_attention_dim = 2880;
+        int selected_layer_count    = 4;
+        int theta                   = 10000;
+        std::vector<int> axes_dim   = {8, 28, 28};
+        int axes_dim_sum            = 64;
+
+        // Scans every tensor whose name starts with `prefix` and derives the
+        // configuration from the shapes it recognises:
+        //   - img_in.weight    -> in_channels (ne[0]) and the inner width (ne[1])
+        //   - txt_in.weight    -> selected_layer_count (ne[0] / joint_attention_dim)
+        //   - proj_out.weight  -> out_channels (ne[1] / patch_size^2)
+        //   - transformer_blocks.0.attn.norm_q.weight -> attention_head_dim
+        //   - highest "transformer_blocks.<i>" index  -> num_layers
+        // Fields that cannot be inferred keep their defaults; num_layers falls
+        // back to 48 when no transformer block is found.
+        static LensConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
+            LensConfig config;
+            config.num_layers = 0;
+
+            // Width of img_in's output, or -1 if img_in.weight was not seen. The
+            // head count is derived from it only after the scan, so that the
+            // result does not depend on whether norm_q.weight (which sets
+            // attention_head_dim) is visited before or after img_in.weight.
+            int64_t inner_dim = -1;
+
+            for (const auto& [name, tensor_storage] : tensor_storage_map) {
+                if (!starts_with(name, prefix)) {
+                    continue;
+                }
+                if (ends_with(name, "img_in.weight") && tensor_storage.n_dims == 2) {
+                    config.in_channels = tensor_storage.ne[0];
+                    inner_dim          = tensor_storage.ne[1];
+                } else if (ends_with(name, "txt_in.weight") && tensor_storage.n_dims == 2) {
+                    config.selected_layer_count = static_cast<int>(tensor_storage.ne[0] / config.joint_attention_dim);
+                } else if (ends_with(name, "proj_out.weight") && tensor_storage.n_dims == 2) {
+                    int64_t patch_area  = config.patch_size * config.patch_size;
+                    config.out_channels = tensor_storage.ne[1] / patch_area;
+                } else if (ends_with(name, "transformer_blocks.0.attn.norm_q.weight") && tensor_storage.n_dims == 1) {
+                    config.attention_head_dim = tensor_storage.ne[0];
+                }
+
+                // Track the largest block index to recover the layer count.
+                size_t pos = name.find("transformer_blocks.");
+                if (pos != std::string::npos) {
+                    auto items = split_string(name.substr(pos), '.');
+                    if (items.size() > 1) {
+                        int block_index = atoi(items[1].c_str());
+                        if (block_index + 1 > config.num_layers) {
+                            config.num_layers = block_index + 1;
+                        }
+                    }
+                }
+            }
+
+            // Resolve the head count with the final head dimension.
+            if (inner_dim >= 0 && config.attention_head_dim > 0) {
+                config.num_attention_heads = inner_dim / config.attention_head_dim;
+            }
+            if (config.num_layers == 0) {
+                config.num_layers = 48;
+            }
+
+            // Keep the cached sum consistent with axes_dim.
+            config.axes_dim_sum = 0;
+            for (int axis_dim : config.axes_dim) {
+                config.axes_dim_sum += axis_dim;
+            }
+            LOG_DEBUG("lens: num_layers = %d, selected_layer_count = %d, hidden_size = %" PRId64 ", num_attention_heads = %" PRId64 ", attention_head_dim = %" PRId64 ", in_channels = %" PRId64 ", out_channels = %" PRId64,
+                      config.num_layers,
+                      config.selected_layer_count,
+                      config.num_attention_heads * config.attention_head_dim,
+                      config.num_attention_heads,
+                      config.attention_head_dim,
+                      config.in_channels,
+                      config.out_channels);
+            return config;
+        }
+    };
+
+    // Turns raw timesteps into the conditioning vector: a 256-wide timestep
+    // embedding followed by the Qwen TimestepEmbedding block.
+    struct LensTimestepProjEmbeddings : public GGMLBlock {
+        LensTimestepProjEmbeddings(int64_t embedding_dim) {
+            const int proj_dim          = 256;
+            blocks["timestep_embedder"] = std::make_shared<Qwen::TimestepEmbedding>(proj_dim, embedding_dim);
+        }
+
+        // Returns the embedded timesteps produced by the "timestep_embedder" block.
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* timesteps) {
+            const int proj_dim = 256;
+            auto embedder      = std::dynamic_pointer_cast<Qwen::TimestepEmbedding>(blocks["timestep_embedder"]);
+            auto projected     = ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, proj_dim, 10000, 1000.f);
+            return embedder->forward(ctx, projected);
+        }
+    };
+
+    // Gated feed-forward block without biases: w2(silu(w1(x)) * w3(x)).
+    struct LensGateMLP : public GGMLBlock {
+        LensGateMLP(int64_t dim, int64_t hidden_dim) {
+            const bool use_bias = false;
+            blocks["w1"]        = std::make_shared<Linear>(dim, hidden_dim, use_bias);
+            blocks["w2"]        = std::make_shared<Linear>(hidden_dim, dim, use_bias);
+            blocks["w3"]        = std::make_shared<Linear>(dim, hidden_dim, use_bias);
+        }
+
+        // Applies the gated MLP to `x` and returns the projection back to `dim`.
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
+            auto gate_proj = std::dynamic_pointer_cast<Linear>(blocks["w1"]);
+            auto down_proj = std::dynamic_pointer_cast<Linear>(blocks["w2"]);
+            auto up_proj   = std::dynamic_pointer_cast<Linear>(blocks["w3"]);
+
+            // The SiLU-activated gate branch is built first, then the linear branch.
+            auto activated = ggml_silu(ctx->ggml_ctx, gate_proj->forward(ctx, x));
+            auto linear_up = up_proj->forward(ctx, x);
+            auto gated     = ggml_mul(ctx->ggml_ctx, activated, linear_up);
+            return down_proj->forward(ctx, gated);
+        }
+    };
+
+    // Joint attention over image and text tokens. Each stream has its own fused
+    // q/k/v projection, per-head RMSNorm on q and k, and output projection; the
+    // attention itself runs once over the concatenated [image, text] sequence.
+    struct LensJointAttention : public GGMLBlock {
+        int64_t dim_head;
+        int64_t num_heads;
+
+        LensJointAttention(int64_t query_dim,
+                           int64_t dim_head,
+                           int64_t num_heads,
+                           float eps = 1e-5f)
+            : dim_head(dim_head), num_heads(num_heads) {
+            const int64_t width = dim_head * num_heads;
+
+            // Fused q/k/v projections, one per stream.
+            blocks["img_qkv"] = std::make_shared<Linear>(query_dim, width * 3, true);
+            blocks["txt_qkv"] = std::make_shared<Linear>(query_dim, width * 3, true);
+
+            // Per-head normalisation of queries and keys ("added" = text stream).
+            blocks["norm_q"]       = std::make_shared<RMSNorm>(dim_head, eps);
+            blocks["norm_k"]       = std::make_shared<RMSNorm>(dim_head, eps);
+            blocks["norm_added_q"] = std::make_shared<RMSNorm>(dim_head, eps);
+            blocks["norm_added_k"] = std::make_shared<RMSNorm>(dim_head, eps);
+
+            // Output projections back to the model width.
+            blocks["to_out.0"]   = std::make_shared<Linear>(width, query_dim, true);
+            blocks["to_add_out"] = std::make_shared<Linear>(width, query_dim, true);
+        }
+
+        // Returns {image output, text output}, each projected by its own linear.
+        std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
+                                                      ggml_tensor* img,
+                                                      ggml_tensor* txt,
+                                                      ggml_tensor* pe,
+                                                      ggml_tensor* mask = nullptr) {
+            auto img_proj     = std::dynamic_pointer_cast<Linear>(blocks["img_qkv"]);
+            auto txt_proj     = std::dynamic_pointer_cast<Linear>(blocks["txt_qkv"]);
+            auto img_q_norm   = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_q"]);
+            auto img_k_norm   = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_k"]);
+            auto txt_q_norm   = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_added_q"]);
+            auto txt_k_norm   = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_added_k"]);
+            auto img_out_proj = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
+            auto txt_out_proj = std::dynamic_pointer_cast<Linear>(blocks["to_add_out"]);
+
+            auto gctx                = ctx->ggml_ctx;
+            const int64_t img_tokens = img->ne[1];
+            const int64_t txt_tokens = txt->ne[1];
+            const int64_t batch      = img->ne[2];
+            const int64_t width      = dim_head * num_heads;
+
+            // Reshapes a projected tensor to (dim_head, num_heads, tokens, batch).
+            auto split_heads = [&](ggml_tensor* t, int64_t n_tokens) {
+                return ggml_reshape_4d(gctx, t, dim_head, num_heads, n_tokens, batch);
+            };
+
+            auto img_parts = split_qkv(gctx, img_proj->forward(ctx, img));
+            auto txt_parts = split_qkv(gctx, txt_proj->forward(ctx, txt));
+
+            auto q_img = split_heads(img_parts[0], img_tokens);
+            auto k_img = split_heads(img_parts[1], img_tokens);
+            auto v_img = split_heads(img_parts[2], img_tokens);
+
+            q_img = img_q_norm->forward(ctx, q_img);
+            k_img = img_k_norm->forward(ctx, k_img);
+
+            auto q_txt = split_heads(txt_parts[0], txt_tokens);
+            auto k_txt = split_heads(txt_parts[1], txt_tokens);
+            auto v_txt = split_heads(txt_parts[2], txt_tokens);
+
+            q_txt = txt_q_norm->forward(ctx, q_txt);
+            k_txt = txt_k_norm->forward(ctx, k_txt);
+
+            // Image tokens come first along the token axis, text tokens follow.
+            auto q_all = ggml_concat(gctx, q_img, q_txt, 2);
+            auto k_all = ggml_concat(gctx, k_img, k_txt, 2);
+            auto v_all = ggml_concat(gctx, v_img, v_txt, 2);
+
+            // NOTE(review): the meaning of the trailing 1/128 factor is defined by
+            // Rope::attention and is not visible here -- confirm before changing.
+            auto joint = Rope::attention(ctx, q_all, k_all, v_all, pe, mask, (1.0f / 128.f));
+
+            // Split the joint result back: rows [0, img_tokens) are image rows,
+            // the text rows start img_tokens rows further in.
+            auto img_rows = ggml_view_3d(gctx,
+                                         joint,
+                                         width,
+                                         img_tokens,
+                                         batch,
+                                         joint->nb[1],
+                                         joint->nb[2],
+                                         0);
+            auto txt_rows = ggml_view_3d(gctx,
+                                         joint,
+                                         width,
+                                         txt_tokens,
+                                         batch,
+                                         joint->nb[1],
+                                         joint->nb[2],
+                                         img_tokens * joint->nb[1]);
+
+            // Each view is made contiguous before it is fed to its projection.
+            auto img_result = img_out_proj->forward(ctx, ggml_cont(gctx, img_rows));
+            auto txt_result = txt_out_proj->forward(ctx, ggml_cont(gctx, txt_rows));
+            return {img_result, txt_result};
+        }
+    };
+
+    // One dual-stream transformer layer. The timestep embedding yields six
+    // modulation tensors per stream: (shift, scale, gate) for the attention
+    // sub-layer and (shift, scale, gate) for the MLP sub-layer.
+    struct LensTransformerBlock : public GGMLBlock {
+        LensTransformerBlock(int64_t dim,
+                             int64_t num_attention_heads,
+                             int64_t attention_head_dim,
+                             float eps = 1e-6f) {
+            const int64_t ffn_dim = dim / 3 * 8;
+            blocks["img_mod.1"]   = std::make_shared<Linear>(dim, 6 * dim, true);
+            blocks["txt_mod.1"]   = std::make_shared<Linear>(dim, 6 * dim, true);
+            blocks["img_norm1"]   = std::make_shared<RMSNorm>(dim, eps);
+            blocks["img_norm2"]   = std::make_shared<RMSNorm>(dim, eps);
+            blocks["txt_norm1"]   = std::make_shared<RMSNorm>(dim, eps);
+            blocks["txt_norm2"]   = std::make_shared<RMSNorm>(dim, eps);
+            blocks["img_mlp"]     = std::make_shared<LensGateMLP>(dim, ffn_dim);
+            blocks["txt_mlp"]     = std::make_shared<LensGateMLP>(dim, ffn_dim);
+            blocks["attn"]        = std::make_shared<LensJointAttention>(dim, attention_head_dim, num_attention_heads);
+        }
+
+        // Returns the updated {image, text} hidden states.
+        std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
+                                                      ggml_tensor* img,
+                                                      ggml_tensor* txt,
+                                                      ggml_tensor* t_emb,
+                                                      ggml_tensor* pe) {
+            auto img_mod_proj  = std::dynamic_pointer_cast<Linear>(blocks["img_mod.1"]);
+            auto txt_mod_proj  = std::dynamic_pointer_cast<Linear>(blocks["txt_mod.1"]);
+            auto img_pre_attn  = std::dynamic_pointer_cast<RMSNorm>(blocks["img_norm1"]);
+            auto img_pre_mlp   = std::dynamic_pointer_cast<RMSNorm>(blocks["img_norm2"]);
+            auto txt_pre_attn  = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm1"]);
+            auto txt_pre_mlp   = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm2"]);
+            auto img_ffn       = std::dynamic_pointer_cast<LensGateMLP>(blocks["img_mlp"]);
+            auto txt_ffn       = std::dynamic_pointer_cast<LensGateMLP>(blocks["txt_mlp"]);
+            auto joint_attn    = std::dynamic_pointer_cast<LensJointAttention>(blocks["attn"]);
+
+            // residual + update * gate
+            auto add_gated = [&](ggml_tensor* residual, ggml_tensor* update, ggml_tensor* gate) {
+                return ggml_add(ctx->ggml_ctx, residual, ggml_mul(ctx->ggml_ctx, update, gate));
+            };
+
+            auto activated_t = ggml_silu(ctx->ggml_ctx, t_emb);
+
+            // Six modulation chunks per stream, split along dim 0.
+            auto img_mod_all = img_mod_proj->forward(ctx, activated_t);
+            auto img_mods    = ggml_ext_chunk(ctx->ggml_ctx, img_mod_all, 6, 0);
+            auto txt_mod_all = txt_mod_proj->forward(ctx, activated_t);
+            auto txt_mods    = ggml_ext_chunk(ctx->ggml_ctx, txt_mod_all, 6, 0);
+
+            // Attention sub-layer: norm -> modulate(chunks 0, 1) -> joint attention -> gate(chunk 2).
+            auto img_n1   = img_pre_attn->forward(ctx, img);
+            auto img_attn_in = Flux::modulate(ctx->ggml_ctx, img_n1, img_mods[0], img_mods[1]);
+            auto txt_n1   = txt_pre_attn->forward(ctx, txt);
+            auto txt_attn_in = Flux::modulate(ctx->ggml_ctx, txt_n1, txt_mods[0], txt_mods[1]);
+
+            auto attn_out = joint_attn->forward(ctx, img_attn_in, txt_attn_in, pe);
+
+            img = add_gated(img, attn_out.first, img_mods[2]);
+            txt = add_gated(txt, attn_out.second, txt_mods[2]);
+
+            // MLP sub-layer: norm -> modulate(chunks 3, 4) -> gated MLP -> gate(chunk 5).
+            auto img_n2     = img_pre_mlp->forward(ctx, img);
+            auto img_mlp_in = Flux::modulate(ctx->ggml_ctx, img_n2, img_mods[3], img_mods[4]);
+            auto txt_n2     = txt_pre_mlp->forward(ctx, txt);
+            auto txt_mlp_in = Flux::modulate(ctx->ggml_ctx, txt_n2, txt_mods[3], txt_mods[4]);
+
+            img = add_gated(img, img_ffn->forward(ctx, img_mlp_in), img_mods[5]);
+            txt = add_gated(txt, txt_ffn->forward(ctx, txt_mlp_in), txt_mods[5]);
+            return {img, txt};
+        }
+    };
+
+    // Final adaptive norm: ggml_norm on the input, then a scale/shift modulation
+    // predicted from the conditioning vector by a single linear layer.
+    struct LensAdaLayerNormContinuous : public GGMLBlock {
+        int64_t hidden_size;
+        float eps;
+
+        LensAdaLayerNormContinuous(int64_t hidden_size, float eps = 1e-6f)
+            : hidden_size(hidden_size), eps(eps) {
+            blocks["linear"] = std::make_shared<Linear>(hidden_size, hidden_size * 2, true);
+        }
+
+        // The linear output is split in two along dim 0: the first half is used
+        // as the scale, the second half as the shift.
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* conditioning) {
+            auto proj      = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
+            auto activated = ggml_silu(ctx->ggml_ctx, conditioning);
+            auto halves    = ggml_ext_chunk(ctx->ggml_ctx, proj->forward(ctx, activated), 2, 0);
+
+            ggml_tensor* scale_part = halves[0];
+            ggml_tensor* shift_part = halves[1];
+
+            auto normed = ggml_norm(ctx->ggml_ctx, x, eps);
+            return Flux::modulate(ctx->ggml_ctx, normed, shift_part, scale_part);
+        }
+    };
+
+    // The full Lens diffusion transformer: input projections for the image and
+    // text streams, a stack of dual-stream blocks, and the output head.
+    class LensModel : public GGMLBlock {
+    public:
+        LensConfig config;
+
+        LensModel() = default;
+        LensModel(LensConfig config)
+            : config(config) {
+            const int64_t width       = config.num_attention_heads * config.attention_head_dim;
+            blocks["time_text_embed"] = std::make_shared<LensTimestepProjEmbeddings>(width);
+            blocks["img_in"]          = std::make_shared<Linear>(config.in_channels, width, true);
+            blocks["txt_in"]          = std::make_shared<Linear>(config.joint_attention_dim * config.selected_layer_count, width, true);
+
+            // One RMSNorm per selected text-encoder layer.
+            for (int layer = 0; layer < config.selected_layer_count; ++layer) {
+                blocks["txt_norm." + std::to_string(layer)] = std::make_shared<RMSNorm>(config.joint_attention_dim, 1e-5f);
+            }
+            for (int layer = 0; layer < config.num_layers; ++layer) {
+                blocks["transformer_blocks." + std::to_string(layer)] = std::make_shared<LensTransformerBlock>(width,
+                                                                                                               config.num_attention_heads,
+                                                                                                               config.attention_head_dim);
+            }
+            blocks["norm_out"] = std::make_shared<LensAdaLayerNormContinuous>(width, 1e-6f);
+            blocks["proj_out"] = std::make_shared<Linear>(width, config.patch_size * config.patch_size * config.out_channels, true);
+        }
+
+        // Builds the forward pass. `x` is read as (W, H, C, N); `context` must be
+        // non-null and is split along dim 0 into selected_layer_count chunks.
+        // The result is reshaped to (W, H, patch_size^2 * out_channels, N).
+        ggml_tensor* forward(GGMLRunnerContext* ctx,
+                             ggml_tensor* x,
+                             ggml_tensor* timestep,
+                             ggml_tensor* context,
+                             ggml_tensor* pe) {
+            GGML_ASSERT(context != nullptr);
+            const int64_t width_px  = x->ne[0];
+            const int64_t height_px = x->ne[1];
+            const int64_t channels  = x->ne[2];
+            const int64_t batch     = x->ne[3];
+
+            auto time_embedder = std::dynamic_pointer_cast<LensTimestepProjEmbeddings>(blocks["time_text_embed"]);
+            auto img_embedder  = std::dynamic_pointer_cast<Linear>(blocks["img_in"]);
+            auto txt_embedder  = std::dynamic_pointer_cast<Linear>(blocks["txt_in"]);
+            auto final_norm    = std::dynamic_pointer_cast<LensAdaLayerNormContinuous>(blocks["norm_out"]);
+            auto final_proj    = std::dynamic_pointer_cast<Linear>(blocks["proj_out"]);
+
+            auto cond = time_embedder->forward(ctx, timestep);
+
+            // Flatten the spatial grid into a token axis and move channels to dim 0.
+            auto img_tokens = ggml_reshape_3d(ctx->ggml_ctx, x, width_px * height_px, channels, batch);
+            img_tokens      = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, img_tokens, 1, 0, 2, 3));
+            img_tokens      = img_embedder->forward(ctx, img_tokens);
+
+            // Normalise each text chunk with its own RMSNorm, then re-join them along dim 0.
+            std::vector<ggml_tensor*> ctx_chunks = ggml_ext_chunk(ctx->ggml_ctx, context, config.selected_layer_count, 0);
+            ggml_tensor* txt_tokens              = nullptr;
+            for (int layer = 0; layer < config.selected_layer_count; ++layer) {
+                auto chunk_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm." + std::to_string(layer)]);
+                auto normed     = chunk_norm->forward(ctx, ctx_chunks[layer]);
+                if (txt_tokens == nullptr) {
+                    txt_tokens = normed;
+                } else {
+                    txt_tokens = ggml_concat(ctx->ggml_ctx, txt_tokens, normed, 0);
+                }
+            }
+            txt_tokens = txt_embedder->forward(ctx, txt_tokens);
+
+            sd::ggml_graph_cut::mark_graph_cut(img_tokens, "lens.prelude", "img");
+            sd::ggml_graph_cut::mark_graph_cut(txt_tokens, "lens.prelude", "txt");
+
+            for (int layer = 0; layer < config.num_layers; ++layer) {
+                auto block  = std::dynamic_pointer_cast<LensTransformerBlock>(blocks["transformer_blocks." + std::to_string(layer)]);
+                auto states = block->forward(ctx, img_tokens, txt_tokens, cond, pe);
+                img_tokens  = states.first;
+                txt_tokens  = states.second;
+                sd::ggml_graph_cut::mark_graph_cut(img_tokens, "lens.transformer_blocks." + std::to_string(layer), "img");
+                sd::ggml_graph_cut::mark_graph_cut(txt_tokens, "lens.transformer_blocks." + std::to_string(layer), "txt");
+            }
+
+            // Only the image stream feeds the output head.
+            img_tokens = final_norm->forward(ctx, img_tokens, cond);
+            img_tokens = final_proj->forward(ctx, img_tokens);
+
+            // Undo the token layout: permute back, then restore the spatial grid.
+            auto result = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, img_tokens, 1, 0, 2, 3));
+            result      = ggml_reshape_4d(ctx->ggml_ctx, result, width_px, height_px, config.patch_size * config.patch_size * config.out_channels, batch);
+            return result;
+        }
+    };
+
+    // Runner that owns a LensModel, builds its ggml graph for one latent and
+    // executes it. A non-empty text context is mandatory.
+    struct LensRunner : public DiffusionModelRunner {
+        LensConfig config;
+        LensModel lens;
+        std::vector<float> pe_vec;  // host storage behind the positional-embedding input
+
+        // Detects the configuration from the weights and initialises the model.
+        LensRunner(ggml_backend_t backend,
+                   const String2TensorStorage& tensor_storage_map      = {},
+                   const std::string prefix                            = "",
+                   std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
+            : DiffusionModelRunner(backend, prefix, weight_manager),
+              config(LensConfig::detect_from_weights(tensor_storage_map, prefix)) {
+            lens = LensModel(config);
+            lens.init(params_ctx, tensor_storage_map, prefix);
+        }
+
+        std::string get_desc() override {
+            return "lens";
+        }
+
+        void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
+            lens.get_param_tensors(tensors, prefix);
+        }
+
+        // Builds the forward graph. Asserts a batch size of 1 and a non-empty context.
+        ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,
+                                 const sd::Tensor<float>& timesteps_tensor,
+                                 const sd::Tensor<float>& context_tensor) {
+            ggml_cgraph* graph   = new_graph_custom(LENS_GRAPH_SIZE);
+            ggml_tensor* latent  = make_input(x_tensor);
+            ggml_tensor* t_input = make_input(timesteps_tensor);
+            GGML_ASSERT(latent->ne[3] == 1);
+            GGML_ASSERT(!context_tensor.empty());
+            ggml_tensor* text_ctx = make_input(context_tensor);
+
+            // Arguments are (ne[1], ne[0], ne[3], context ne[1], ...); the circular
+            // flags are passed y first, then x.
+            pe_vec            = Rope::gen_lens_pe(static_cast<int>(latent->ne[1]),
+                                                  static_cast<int>(latent->ne[0]),
+                                                  static_cast<int>(latent->ne[3]),
+                                                  static_cast<int>(text_ctx->ne[1]),
+                                                  config.theta,
+                                                  circular_y_enabled,
+                                                  circular_x_enabled,
+                                                  config.axes_dim);
+            // The tensor below holds 2 * 2 * (axes_dim_sum / 2) floats per position,
+            // so the position count is recovered from the buffer size.
+            int n_positions   = static_cast<int>(pe_vec.size() / config.axes_dim_sum / 2);
+            auto pe_input     = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.axes_dim_sum / 2, n_positions);
+            set_backend_tensor_data(pe_input, pe_vec.data());
+
+            auto runner_ctx     = get_context();
+            ggml_tensor* result = lens.forward(&runner_ctx, latent, t_input, text_ctx, pe_input);
+            ggml_build_forward_expand(graph, result);
+            return graph;
+        }
+
+        // Runs the graph and restores trailing singleton dims so the result has
+        // the same rank as `x`.
+        sd::Tensor<float> compute(int n_threads,
+                                  const sd::Tensor<float>& x,
+                                  const sd::Tensor<float>& timesteps,
+                                  const sd::Tensor<float>& context) {
+            auto make_graph = [&]() -> ggml_cgraph* {
+                return build_graph(x, timesteps, context);
+            };
+            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(make_graph, n_threads, false, false, false), x.dim());
+        }
+
+        // DiffusionModelRunner entry point: unpacks the parameter bundle. A missing
+        // context becomes an empty tensor, which build_graph() then rejects.
+        sd::Tensor<float> compute(int n_threads,
+                                  const DiffusionParams& diffusion_params) override {
+            GGML_ASSERT(diffusion_params.x != nullptr);
+            GGML_ASSERT(diffusion_params.timesteps != nullptr);
+            return compute(n_threads,
+                           *diffusion_params.x,
+                           *diffusion_params.timesteps,
+                           tensor_or_empty(diffusion_params.context));
+        }
+    };
+}  // namespace Lens
+
+#endif  // __SD_MODEL_DIFFUSION_LENS_HPP__
--- a/src/model/diffusion/ltxv.hpp
+++ b/src/model/diffusion/ltxv.hpp
--- a/src/model/diffusion/mmdit.hpp
+++ b/src/model/diffusion/mmdit.hpp
@ -1,41 +1,137 @@
-#ifndef __MMDIT_HPP__
-#define __MMDIT_HPP__
+#ifndef __SD_MODEL_DIFFUSION_MMDIT_HPP__
+#define __SD_MODEL_DIFFUSION_MMDIT_HPP__

+#include <algorithm>
 #include <memory>
+#include <string>
+#include <vector>

-#include "ggml_extend.hpp"
-#include "model.h"
+#include "core/ggml_extend.hpp"
+#include "model/common/block.hpp"
+#include "model/diffusion/model.hpp"
+#include "model_loader.h"

 #define MMDIT_GRAPH_SIZE 10240

-struct Mlp : public GGMLBlock {
-public:
-    Mlp(int64_t in_features,
-        int64_t hidden_features = -1,
-        int64_t out_features    = -1,
-        bool bias               = true) {
-        // act_layer is always lambda: nn.GELU(approximate="tanh")
-        // norm_layer is always None
-        // use_conv is always False
-        if (hidden_features == -1) {
-            hidden_features = in_features;
-        }
-        if (out_features == -1) {
-            out_features = in_features;
-        }
-        blocks["fc1"] = std::shared_ptr<GGMLBlock>(new Linear(in_features, hidden_features, bias));
-        blocks["fc2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_features, out_features, bias));
+// Hyper-parameters of the MMDiT diffusion transformer. Every field keeps its
+// default unless detect_from_weights() finds a tensor that overrides it.
+struct MMDiTConfig {
+    int64_t input_size               = -1;
+    int patch_size                   = 2;
+    int64_t in_channels              = 16;
+    int64_t d_self                   = -1;  // >=0 for MMdiT-X; highest joint block index that has an attn2 tensor
+    int64_t depth                    = 24;  // number of joint blocks; detection only ever raises this value
+    float mlp_ratio                  = 4.0f;
+    int64_t adm_in_channels          = 2048;
+    int64_t out_channels             = 16;
+    int64_t pos_embed_max_size       = 192;
+    int64_t num_patches              = 36864;  // 192 * 192
+    int64_t context_size             = 4096;
+    int64_t context_embedder_out_dim = 1536;
+    int64_t hidden_size              = 1536;
+    std::string qk_norm;  // "ln", "rms", or empty when no attn.ln* tensor exists
+
+    // Derives a config from the shapes and names of the tensors whose name
+    // starts with `prefix`. Tensors outside the prefix are ignored. Values that
+    // depend on more than one tensor are resolved after the scan so that the
+    // result does not depend on the iteration order of `tensor_storage_map`.
+    static MMDiTConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
+        MMDiTConfig config;
+        bool has_weight_config = false;
+        bool has_pos_embed     = false;
+        bool has_hidden_size   = false;
+        bool has_context_embed = false;
+        bool has_final_layer   = false;
+        int64_t final_out_dim  = 0;  // ne[1] of final_layer.linear.weight
+
+        for (const auto& [name, tensor_storage] : tensor_storage_map) {
+            if (!starts_with(name, prefix)) {
+                continue;
            }

-    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
-        // x: [N, n_token, in_features]
-        auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
-        auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);
+            if (name.find("x_embedder.proj.weight") != std::string::npos && tensor_storage.n_dims == 4) {
+                has_weight_config  = true;
+                has_hidden_size    = true;
+                config.patch_size  = static_cast<int>(tensor_storage.ne[0]);
+                config.in_channels = tensor_storage.ne[2];
+                config.hidden_size = tensor_storage.ne[3];
+            } else if (name.find("t_embedder.mlp.0.weight") != std::string::npos && tensor_storage.n_dims == 2) {
+                has_weight_config  = true;
+                has_hidden_size    = true;
+                config.hidden_size = tensor_storage.ne[1];
+            } else if (name.find("y_embedder.mlp.0.weight") != std::string::npos && tensor_storage.n_dims == 2) {
+                has_weight_config      = true;
+                has_hidden_size        = true;
+                config.adm_in_channels = tensor_storage.ne[0];
+                config.hidden_size     = tensor_storage.ne[1];
+            } else if (name.find("context_embedder.weight") != std::string::npos && tensor_storage.n_dims == 2) {
+                has_weight_config               = true;
+                has_context_embed               = true;
+                config.context_size             = tensor_storage.ne[0];
+                config.context_embedder_out_dim = tensor_storage.ne[1];
+            } else if (name.find("final_layer.linear.weight") != std::string::npos && tensor_storage.n_dims == 2) {
+                has_weight_config  = true;
+                has_hidden_size    = true;
+                has_final_layer    = true;
+                config.hidden_size = tensor_storage.ne[0];
+                // out_channels also needs patch_size, which comes from
+                // x_embedder.proj.weight and may not have been visited yet.
+                // Remember the raw width and divide once the scan is done.
+                final_out_dim = tensor_storage.ne[1];
+            } else if (name.find("pos_embed") != std::string::npos && tensor_storage.n_dims == 3) {
+                has_weight_config  = true;
+                has_pos_embed      = true;
+                has_hidden_size    = true;
+                config.hidden_size = tensor_storage.ne[0];
+                config.num_patches = tensor_storage.ne[1];
+                // pos_embed_max_size is the integer square root of num_patches;
+                // it is left untouched when num_patches is not a perfect square.
+                for (int64_t size = 1; size * size <= config.num_patches; size++) {
+                    if (size * size == config.num_patches) {
+                        config.pos_embed_max_size = size;
+                        break;
+                    }
+                }
+            }

-        x = fc1->forward(ctx, x);
-        x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
-        x = fc2->forward(ctx, x);
-        return x;
+            size_t jb = name.find("joint_blocks.");
+            if (jb == std::string::npos) {
+                continue;
+            }
+
+            has_weight_config      = true;
+            std::string block_name = name.substr(jb);
+            // The block index sits between "joint_blocks." (13 characters) and
+            // the next '.'. substr() takes a length, not an end position.
+            const size_t index_begin = 13;
+            const size_t index_end   = block_name.find(".", index_begin);
+            const size_t index_len   = index_end == std::string::npos ? std::string::npos : index_end - index_begin;
+            int64_t block_depth      = atoi(block_name.substr(index_begin, index_len).c_str());
+            if (block_depth + 1 > config.depth) {
+                config.depth = block_depth + 1;
+            }
+            if (block_name.find("attn.ln") != std::string::npos) {
+                // A bias tensor identifies LayerNorm. Keep "ln" once seen so a
+                // weight tensor visited later cannot downgrade it to "rms".
+                if (block_name.find(".bias") != std::string::npos) {
+                    config.qk_norm = "ln";
+                } else if (config.qk_norm != "ln") {
+                    config.qk_norm = "rms";
+                }
+            }
+            if (block_name.find("attn2") != std::string::npos) {
+                if (block_depth > config.d_self) {
+                    config.d_self = block_depth;
+                }
+            }
+        }
+
+        if (has_final_layer) {
+            // final_layer.linear emits patch_size * patch_size * out_channels values per token.
+            int64_t patch_area = static_cast<int64_t>(config.patch_size) * config.patch_size;
+            if (patch_area > 0) {
+                config.out_channels = final_out_dim / patch_area;
+            }
+        }
+        // Without a pos_embed tensor, MMDiT-X checkpoints (d_self >= 0) use a
+        // grid twice as large per side as the default.
+        if (!has_pos_embed && config.d_self >= 0) {
+            config.pos_embed_max_size *= 2;
+            config.num_patches *= 4;
+        }
+        if (!has_hidden_size || config.hidden_size <= 0) {
+            config.hidden_size = 64 * config.depth;
+        }
+        if (!has_context_embed || config.context_embedder_out_dim <= 0) {
+            config.context_embedder_out_dim = config.hidden_size;
+        }
+
+        if (has_weight_config) {
+            LOG_DEBUG("mmdit: num_layers = %" PRId64 ", num_mmdit_x_layers = %" PRId64 ", hidden_size = %" PRId64 ", patch_size = %d, in_channels = %" PRId64 ", out_channels = %" PRId64 ", context_size = %" PRId64 ", adm_in_channels = %" PRId64 ", qk_norm = %s",
+                      config.depth,
+                      config.d_self + 1,
+                      config.hidden_size,
+                      config.patch_size,
+                      config.in_channels,
+                      config.out_channels,
+                      config.context_size,
+                      config.adm_in_channels,
+                      config.qk_norm.empty() ? "none" : config.qk_norm.c_str());
+        }
+        return config;
    }
 };

@ -611,28 +707,16 @@ public:
 struct MMDiT : public GGMLBlock {
    // Diffusion model with a Transformer backbone.
 protected:
-    int64_t input_size               = -1;
-    int patch_size                   = 2;
-    int64_t in_channels              = 16;
-    int64_t d_self                   = -1;  // >=0 for MMdiT-X
-    int64_t depth                    = 24;
-    float mlp_ratio                  = 4.0f;
-    int64_t adm_in_channels          = 2048;
-    int64_t out_channels             = 16;
-    int64_t pos_embed_max_size       = 192;
-    int64_t num_patchs               = 36864;  // 192 * 192
-    int64_t context_size             = 4096;
-    int64_t context_embedder_out_dim = 1536;
-    int64_t hidden_size;
-    std::string qk_norm;
-
    void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
        enum ggml_type wtype = GGML_TYPE_F32;
-        params["pos_embed"]  = ggml_new_tensor_3d(ctx, wtype, hidden_size, num_patchs, 1);
+        params["pos_embed"]  = ggml_new_tensor_3d(ctx, wtype, config.hidden_size, config.num_patches, 1);  // always F32; sized from the detected config, tensor_storage_map/prefix are not consulted here
    }

 public:
-    MMDiT(const String2TensorStorage& tensor_storage_map = {}) {
+    MMDiTConfig config;
+
+    explicit MMDiT(MMDiTConfig config = {})
+        : config(config) {
        // input_size is always None
        // learn_sigma is always False
        // register_length is alwalys 0
@ -645,64 +729,30 @@ public:
        // pos_embed_offset is not used
        // context_embedder_config is always {'target': 'torch.nn.Linear', 'params': {'in_features': 4096, 'out_features': 1536}}

-        for (auto pair : tensor_storage_map) {
-            std::string tensor_name = pair.first;
-            if (tensor_name.find("model.diffusion_model.") == std::string::npos)
-                continue;
-            size_t jb = tensor_name.find("joint_blocks.");
-            if (jb != std::string::npos) {
-                tensor_name     = tensor_name.substr(jb);  // remove prefix
-                int block_depth = atoi(tensor_name.substr(13, tensor_name.find(".", 13)).c_str());
-                if (block_depth + 1 > depth) {
-                    depth = block_depth + 1;
-                }
-                if (tensor_name.find("attn.ln") != std::string::npos) {
-                    if (tensor_name.find(".bias") != std::string::npos) {
-                        qk_norm = "ln";
-                    } else {
-                        qk_norm = "rms";
-                    }
-                }
-                if (tensor_name.find("attn2") != std::string::npos) {
-                    if (block_depth > d_self) {
-                        d_self = block_depth;
-                    }
-                }
-            }
+        blocks["x_embedder"] = std::shared_ptr<GGMLBlock>(new PatchEmbed(config.input_size,
+                                                                         config.patch_size,
+                                                                         config.in_channels,
+                                                                         config.hidden_size,
+                                                                         true));
+        blocks["t_embedder"] = std::shared_ptr<GGMLBlock>(new TimestepEmbedder(config.hidden_size));
+
+        if (config.adm_in_channels != -1) {
+            blocks["y_embedder"] = std::shared_ptr<GGMLBlock>(new VectorEmbedder(config.adm_in_channels, config.hidden_size));
        }

-        if (d_self >= 0) {
-            pos_embed_max_size *= 2;
-            num_patchs *= 4;
-        }
+        blocks["context_embedder"] = std::shared_ptr<GGMLBlock>(new Linear(config.context_size, config.context_embedder_out_dim, true, true));

-        LOG_INFO("MMDiT layers: %d (including %d MMDiT-x layers)", depth, d_self + 1);
-
-        int64_t default_out_channels = in_channels;
-        hidden_size                  = 64 * depth;
-        context_embedder_out_dim     = 64 * depth;
-        int64_t num_heads            = depth;
-
-        blocks["x_embedder"] = std::shared_ptr<GGMLBlock>(new PatchEmbed(input_size, patch_size, in_channels, hidden_size, true));
-        blocks["t_embedder"] = std::shared_ptr<GGMLBlock>(new TimestepEmbedder(hidden_size));
-
-        if (adm_in_channels != -1) {
-            blocks["y_embedder"] = std::shared_ptr<GGMLBlock>(new VectorEmbedder(adm_in_channels, hidden_size));
-        }
-
-        blocks["context_embedder"] = std::shared_ptr<GGMLBlock>(new Linear(4096, context_embedder_out_dim, true, true));
-
-        for (int i = 0; i < depth; i++) {
-            blocks["joint_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new JointBlock(hidden_size,
-                                                                                                    num_heads,
-                                                                                                    mlp_ratio,
-                                                                                                    qk_norm,
+        for (int i = 0; i < config.depth; i++) {
+            blocks["joint_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new JointBlock(config.hidden_size,
+                                                                                                    config.depth,
+                                                                                                    config.mlp_ratio,
+                                                                                                    config.qk_norm,
                                                                                                    true,
-                                                                                                    i == depth - 1,
-                                                                                                    i <= d_self));
+                                                                                                    i == config.depth - 1,
+                                                                                                    i <= config.d_self));
        }

-        blocks["final_layer"] = std::shared_ptr<GGMLBlock>(new FinalLayer(hidden_size, patch_size, out_channels));
+        blocks["final_layer"] = std::shared_ptr<GGMLBlock>(new FinalLayer(config.hidden_size, config.patch_size, config.out_channels));
    }

    ggml_tensor*
@ -711,22 +761,22 @@ public:
                      int64_t w) {
        auto pos_embed = params["pos_embed"];

-        h = (h + 1) / patch_size;
-        w = (w + 1) / patch_size;
+        h = (h + 1) / config.patch_size;
+        w = (w + 1) / config.patch_size;

-        GGML_ASSERT(h <= pos_embed_max_size && h > 0);
-        GGML_ASSERT(w <= pos_embed_max_size && w > 0);
+        GGML_ASSERT(h <= config.pos_embed_max_size && h > 0);
+        GGML_ASSERT(w <= config.pos_embed_max_size && w > 0);

-        int64_t top  = (pos_embed_max_size - h) / 2;
-        int64_t left = (pos_embed_max_size - w) / 2;
+        int64_t top  = (config.pos_embed_max_size - h) / 2;
+        int64_t left = (config.pos_embed_max_size - w) / 2;

-        auto spatial_pos_embed = ggml_reshape_3d(ctx, pos_embed, hidden_size, pos_embed_max_size, pos_embed_max_size);
+        auto spatial_pos_embed = ggml_reshape_3d(ctx, pos_embed, config.hidden_size, config.pos_embed_max_size, config.pos_embed_max_size);

        // spatial_pos_embed = spatial_pos_embed[:, top : top + h, left : left + w, :]
        spatial_pos_embed = ggml_view_3d(ctx,
                                         spatial_pos_embed,
-                                         hidden_size,
-                                         pos_embed_max_size,
+                                         config.hidden_size,
+                                         config.pos_embed_max_size,
                                         h,
                                         spatial_pos_embed->nb[1],
                                         spatial_pos_embed->nb[2],
@ -734,14 +784,14 @@ public:
        spatial_pos_embed = ggml_cont(ctx, ggml_permute(ctx, spatial_pos_embed, 0, 2, 1, 3));  // [pos_embed_max_size, h, hidden_size]
        spatial_pos_embed = ggml_view_3d(ctx,
                                         spatial_pos_embed,
-                                         hidden_size,
+                                         config.hidden_size,
                                         h,
                                         w,
                                         spatial_pos_embed->nb[1],
                                         spatial_pos_embed->nb[2],
                                         spatial_pos_embed->nb[2] * left);                          // [w, h, hidden_size]
        spatial_pos_embed = ggml_cont(ctx, ggml_permute(ctx, spatial_pos_embed, 0, 2, 1, 3));       // [h, w, hidden_size]
-        spatial_pos_embed = ggml_reshape_3d(ctx, spatial_pos_embed, hidden_size, h * w, 1);    // [1, h*w, hidden_size]
+        spatial_pos_embed = ggml_reshape_3d(ctx, spatial_pos_embed, config.hidden_size, h * w, 1);  // [1, h*w, hidden_size]
        return spatial_pos_embed;
    }

@ -756,7 +806,7 @@ public:
        // return: [N, N*W, patch_size * patch_size * out_channels]
        auto final_layer = std::dynamic_pointer_cast<FinalLayer>(blocks["final_layer"]);

-        for (int i = 0; i < depth; i++) {
+        for (int i = 0; i < config.depth; i++) {
            // skip iteration if i is in skip_layers
            if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i) != skip_layers.end()) {
                continue;
@ -767,6 +817,8 @@ public:
            auto context_x = block->forward(ctx, context, x, c_mod);
            context        = context_x.first;
            x              = context_x.second;
+            sd::ggml_graph_cut::mark_graph_cut(context, "mmdit.joint_blocks." + std::to_string(i), "context");
+            sd::ggml_graph_cut::mark_graph_cut(x, "mmdit.joint_blocks." + std::to_string(i), "x");
        }

        x = final_layer->forward(ctx, x, c_mod);  // (N, T, patch_size ** 2 * out_channels)
@ -797,7 +849,7 @@ public:
        x                = ggml_add(ctx->ggml_ctx, patch_embed, pos_embed);  // [N, H*W, hidden_size]

        auto c = t_embedder->forward(ctx, t);  // [N, hidden_size]
-        if (y != nullptr && adm_in_channels != -1) {
+        if (y != nullptr && config.adm_in_channels != -1) {
            auto y_embedder = std::dynamic_pointer_cast<VectorEmbedder>(blocks["y_embedder"]);

            y = y_embedder->forward(ctx, y);  // [N, hidden_size]
@ -809,22 +861,30 @@ public:

            context = context_embedder->forward(ctx, context);  // [N, L, D] aka [N, L, 1536]
        }
+        sd::ggml_graph_cut::mark_graph_cut(x, "mmdit.prelude", "x");
+        sd::ggml_graph_cut::mark_graph_cut(c, "mmdit.prelude", "c");
+        if (context != nullptr) {
+            sd::ggml_graph_cut::mark_graph_cut(context, "mmdit.prelude", "context");
+        }

        x = forward_core_with_concat(ctx, x, c, context, skip_layers);  // (N, H*W, patch_size ** 2 * out_channels)

-        x = DiT::unpatchify_and_crop(ctx->ggml_ctx, x, H, W, patch_size, patch_size, /*patch_last*/ false);  // [N, C, H, W]
+        x = DiT::unpatchify_and_crop(ctx->ggml_ctx, x, H, W, config.patch_size, config.patch_size, /*patch_last*/ false);  // [N, C, H, W]

        return x;
    }
 };
-struct MMDiTRunner : public GGMLRunner {
+struct MMDiTRunner : public DiffusionModelRunner {
+    MMDiTConfig config;
    MMDiT mmdit;

    MMDiTRunner(ggml_backend_t backend,
-                bool offload_params_to_cpu,
                const String2TensorStorage& tensor_storage_map      = {},
-                const std::string prefix                       = "")
-        : GGMLRunner(backend, offload_params_to_cpu), mmdit(tensor_storage_map) {
+                const std::string prefix                            = "",
+                std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
+        : DiffusionModelRunner(backend, prefix, weight_manager),
+          config(MMDiTConfig::detect_from_weights(tensor_storage_map, prefix)),  // initialized before mmdit because `config` is declared first
+          mmdit(config) {  // MMDiT takes the config by value and keeps its own copy
        mmdit.init(params_ctx, tensor_storage_map, prefix);
    }

@ -832,7 +892,7 @@ struct MMDiTRunner : public GGMLRunner {
        return "mmdit";
    }

-    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
+    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {  // implements the DiffusionModelRunner hook by delegating to the wrapped MMDiT block
        mmdit.get_param_tensors(tensors, prefix);
    }

@ -875,7 +935,21 @@ struct MMDiTRunner : public GGMLRunner {
            return build_graph(x, timesteps, context, y, skip_layers);
        };

-        return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
+        return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
+    }
+
+    // DiffusionModelRunner entry point. `x` and `timesteps` are mandatory;
+    // `context` and `y` fall back to empty tensors when null. The extra payload
+    // must hold a SkipLayerDiffusionExtra (diffusion_extra_as asserts this);
+    // a null skip_layers pointer inside it means "skip nothing".
+    sd::Tensor<float> compute(int n_threads,
+                              const DiffusionParams& diffusion_params) override {
+        GGML_ASSERT(diffusion_params.x != nullptr);
+        GGML_ASSERT(diffusion_params.timesteps != nullptr);
+        const auto* extra = diffusion_extra_as<SkipLayerDiffusionExtra>(diffusion_params);
+        static const std::vector<int> empty_skip_layers;
+        const bool has_skip_list            = extra->skip_layers != nullptr;
+        const std::vector<int>& skip_layers = has_skip_list ? *extra->skip_layers : empty_skip_layers;
+        const sd::Tensor<float>& context    = tensor_or_empty(diffusion_params.context);
+        const sd::Tensor<float>& y          = tensor_or_empty(diffusion_params.y);
+        return compute(n_threads, *diffusion_params.x, *diffusion_params.timesteps, context, y, skip_layers);
    }

    void test() {
@ -925,26 +999,27 @@ struct MMDiTRunner : public GGMLRunner {

    static void load_from_file_and_test(const std::string& file_path) {
        // ggml_backend_t backend    = ggml_backend_cuda_init(0);
-        ggml_backend_t backend             = ggml_backend_cpu_init();
+        ggml_backend_t backend             = sd_backend_cpu_init();
        ggml_type model_data_type          = GGML_TYPE_F16;
-        std::shared_ptr<MMDiTRunner> mmdit = std::make_shared<MMDiTRunner>(backend, false);
+        auto model_manager                 = std::make_shared<ModelManager>();
+        std::shared_ptr<MMDiTRunner> mmdit = std::make_shared<MMDiTRunner>(backend, String2TensorStorage{}, "", model_manager);
        {
            LOG_INFO("loading from '%s'", file_path.c_str());

-            mmdit->alloc_params_buffer();
-            std::map<std::string, ggml_tensor*> tensors;
-            mmdit->get_param_tensors(tensors, "model.diffusion_model");
-
-            ModelLoader model_loader;
+            ModelLoader& model_loader = model_manager->loader();
            if (!model_loader.init_from_file_and_convert_name(file_path)) {
                LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
                return;
            }

-            bool success = model_loader.load_tensors(tensors);
-
-            if (!success) {
-                LOG_ERROR("load tensors from model loader failed");
+            if (!model_manager->register_runner_params("MMDiT test",
+                                                       *mmdit,
+                                                       "model.diffusion_model",
+                                                       ModelManager::ResidencyMode::ParamBackend,
+                                                       backend,
+                                                       backend) ||
+                !model_manager->validate_registered_tensors()) {
+                LOG_ERROR("register mmdit tensors with model manager failed");
                return;
            }

@ -954,4 +1029,4 @@ struct MMDiTRunner : public GGMLRunner {
    }
 };

-#endif
+#endif  // __SD_MODEL_DIFFUSION_MMDIT_HPP__
--- a/src/model/diffusion/model.hpp
+++ b/src/model/diffusion/model.hpp
@ -0,0 +1,108 @@
+#ifndef __SD_MODEL_DIFFUSION_MODEL_HPP__
+#define __SD_MODEL_DIFFUSION_MODEL_HPP__
+
+#include <string>
+#include <utility>
+#include <variant>
+
+#include "core/ggml_extend.hpp"
+#include "core/tensor_ggml.hpp"
+#include "model_manager.h"
+
+struct UNetDiffusionExtra {  // one alternative of DiffusionExtraParams; NOTE(review): presumably consumed by the UNet runner — confirm
+    int num_video_frames                           = -1;  // defaults to -1; NOTE(review): presumably "not a video batch" — confirm
+    const std::vector<sd::Tensor<float>>* controls = nullptr;  // optional, nullptr by default
+    float control_strength                         = 0.f;
+};
+
+struct SkipLayerDiffusionExtra {  // one alternative of DiffusionExtraParams; read by MMDiTRunner::compute
+    const std::vector<int>* skip_layers = nullptr;  // nullptr is treated as an empty list by MMDiTRunner::compute
+};
+
+struct FluxDiffusionExtra {  // one alternative of DiffusionExtraParams; NOTE(review): presumably consumed by the Flux runner — confirm
+    const sd::Tensor<float>* guidance   = nullptr;  // optional, nullptr by default
+    const std::vector<int>* skip_layers = nullptr;  // optional, nullptr by default
+};
+
+struct AnimaDiffusionExtra {  // one alternative of DiffusionExtraParams; NOTE(review): presumably consumed by the Anima runner — confirm
+    const sd::Tensor<int32_t>* t5_ids   = nullptr;  // optional, nullptr by default
+    const sd::Tensor<float>* t5_weights = nullptr;  // optional, nullptr by default
+};
+
+struct WanDiffusionExtra {  // one alternative of DiffusionExtraParams; NOTE(review): presumably consumed by the Wan runner — confirm
+    const sd::Tensor<float>* vace_context = nullptr;  // optional, nullptr by default
+    float vace_strength                   = 1.f;  // defaults to 1
+};
+
+struct HiDreamO1DiffusionExtra {  // one alternative of DiffusionExtraParams; NOTE(review): presumably consumed by the HiDream-O1 runner — confirm
+    const sd::Tensor<int32_t>* input_ids                               = nullptr;
+    const sd::Tensor<int32_t>* input_pos                               = nullptr;
+    const sd::Tensor<int32_t>* token_types                             = nullptr;
+    const sd::Tensor<int32_t>* vinput_mask                             = nullptr;
+    const std::vector<std::pair<int, sd::Tensor<float>>>* image_embeds = nullptr;  // list of (int, tensor) pairs; NOTE(review): meaning of the int key is not visible here — confirm
+};
+
+struct LTXAVDiffusionExtra {  // one alternative of DiffusionExtraParams; NOTE(review): presumably consumed by the LTX audio/video runner — confirm
+    const sd::Tensor<float>* audio_x         = nullptr;
+    const sd::Tensor<float>* audio_timesteps = nullptr;
+    int audio_length                         = 0;
+    float frame_rate                         = 24.f;  // defaults to 24
+    const sd::Tensor<float>* video_positions = nullptr;
+};
+
+using DiffusionExtraParams = std::variant<std::monostate,  // monostate is the default held by DiffusionParams::extra
+                                          UNetDiffusionExtra,
+                                          SkipLayerDiffusionExtra,
+                                          FluxDiffusionExtra,
+                                          AnimaDiffusionExtra,
+                                          WanDiffusionExtra,
+                                          HiDreamO1DiffusionExtra,
+                                          LTXAVDiffusionExtra>;  // exactly one alternative is active; read it with diffusion_extra_as<T>()
+
+struct DiffusionParams {  // argument bundle for DiffusionModelRunner::compute; all tensors are passed by pointer and default to nullptr
+    const sd::Tensor<float>* x                        = nullptr;  // asserted non-null by the Lens and MMDiT runners
+    const sd::Tensor<float>* timesteps                = nullptr;  // asserted non-null by the Lens and MMDiT runners
+    const sd::Tensor<float>* context                  = nullptr;  // may be null; those runners substitute an empty tensor
+    const sd::Tensor<float>* c_concat                 = nullptr;
+    const sd::Tensor<float>* y                        = nullptr;  // may be null; the MMDiT runner substitutes an empty tensor
+    const std::vector<sd::Tensor<float>>* ref_latents = nullptr;
+    bool increase_ref_index                           = false;
+    DiffusionExtraParams extra                        = std::monostate{};  // model-specific payload; monostate when none is supplied
+};
+
+// Returns a pointer to the `T` alternative stored in `params.extra`.
+// Aborts through GGML_ASSERT when a different alternative is active, so the
+// returned pointer is never null.
+template <typename T>
+static inline const T* diffusion_extra_as(const DiffusionParams& params) {
+    const T* const extra = std::get_if<T>(&params.extra);
+    GGML_ASSERT(extra != nullptr);
+    return extra;
+}
+
+// Dereferences `tensor`, or yields a shared default-constructed tensor when
+// the pointer is null. The fallback is one function-local static per `T`.
+template <typename T>
+static inline const sd::Tensor<T>& tensor_or_empty(const sd::Tensor<T>* tensor) {
+    static const sd::Tensor<T> kEmpty;
+    if (tensor == nullptr) {
+        return kEmpty;
+    }
+    return *tensor;
+}
+
+// Common base of the diffusion model runners: a GGMLRunner that remembers the
+// tensor-name prefix it was created with and exposes a uniform compute() entry
+// point taking a DiffusionParams bundle.
+struct DiffusionModelRunner : public GGMLRunner {
+protected:
+    // Tensor-name prefix supplied at construction.
+    std::string prefix;
+
+public:
+    DiffusionModelRunner(ggml_backend_t backend,
+                         const std::string& prefix,
+                         std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
+        : GGMLRunner(backend, weight_manager),
+          prefix(prefix) {}
+
+    // Runs the model for one set of inputs; implemented by each concrete runner.
+    virtual sd::Tensor<float> compute(int n_threads,
+                                      const DiffusionParams& diffusion_params) = 0;
+
+    // Convenience overload: collects the parameter tensors under the stored prefix.
+    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) {
+        this->get_param_tensors(tensors, this->prefix);
+    }
+
+    // Collects the runner's parameter tensors into `tensors` under `prefix`.
+    virtual void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors,
+                                   const std::string& prefix) = 0;
+};
+
+#endif  // __SD_MODEL_DIFFUSION_MODEL_HPP__
--- a/src/model/diffusion/pid.hpp
+++ b/src/model/diffusion/pid.hpp
@ -0,0 +1,847 @@
+#ifndef __SD_MODEL_DIFFUSION_PID_HPP__
+#define __SD_MODEL_DIFFUSION_PID_HPP__
+
+#include <cmath>
+#include <cstdlib>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "core/ggml_extend.hpp"
+#include "model/common/rope.hpp"
+#include "model/diffusion/dit.hpp"
+#include "model/diffusion/mmdit.hpp"
+
+namespace Pid {
+    // NOTE(review): presumably the node budget for the ggml compute graph — confirm at the use site.
+    constexpr int PID_GRAPH_SIZE = 196608;
+    // Pi as a single-precision constant.
+    constexpr float PID_PI       = 3.14159265358979323846f;
+
+    // Hyper-parameters of the PixelDiT model. Fields keep the defaults below
+    // unless detect_from_weights() overrides them from the checkpoint.
+    struct PixelDiTConfig {
+        int64_t in_channels            = 3;
+        int64_t hidden_size            = 1536;
+        int64_t num_groups             = 24;
+        int64_t patch_mlp_hidden_dim   = 4096;
+        int64_t pixel_hidden_size      = 16;
+        int64_t pixel_attn_hidden_size = 1152;
+        int64_t pixel_num_groups       = 16;
+        int64_t patch_depth            = 14;
+        int64_t pixel_depth            = 2;
+        int64_t patch_size             = 16;
+        int64_t txt_embed_dim          = 2304;
+        int64_t txt_max_length         = 300;
+        float text_rope_theta          = 10000.f;
+        int64_t lq_latent_channels     = 16;
+        int64_t lq_hidden_dim          = 512;
+        int64_t lq_num_res_blocks      = 4;
+        int64_t lq_interval            = 2;
+        int64_t lq_sr_scale            = 4;
+        int64_t lq_latent_down_factor  = 8;
+        int64_t rope_ref_grid_h        = 64;
+        int64_t rope_ref_grid_w        = 64;
+
+        // Builds a config from the tensors whose names start with `prefix`.
+        //
+        // Derived fields:
+        //   - patch_depth / pixel_depth: highest "patch_blocks.<i>" / "pixel_blocks.<i>"
+        //     index found, plus one. The defaults are kept only when no such tensor
+        //     exists, so a checkpoint with fewer blocks than the defaults gets its
+        //     real depth instead of being rounded up to the default.
+        //   - lq_latent_channels: ne[2] of "lq_proj.latent_proj.0.weight";
+        //     lq_latent_down_factor becomes 16 when that value is >= 64, otherwise 8.
+        //   - patch_mlp_hidden_dim: ne[1] of "patch_blocks.0.mlp_x.w1.weight".
+        // Every other field keeps its default. The result is logged at debug level.
+        static PixelDiTConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
+            PixelDiTConfig config;
+
+            // Number of blocks implied by `name` for the stack tagged `marker`
+            // ("<marker><index>..." -> index + 1), or 0 when `name` does not belong to it.
+            auto block_count_from_name = [](const std::string& name, const char* marker) -> int64_t {
+                size_t pos = name.find(marker);
+                if (pos == std::string::npos) {
+                    return 0;
+                }
+                auto items = split_string(name.substr(pos), '.');
+                if (items.size() <= 1) {
+                    return 0;
+                }
+                return static_cast<int64_t>(atoi(items[1].c_str())) + 1;
+            };
+
+            // Start from 0 rather than from the defaults so detection can also
+            // yield a depth smaller than the default.
+            int64_t detected_patch_depth = 0;
+            int64_t detected_pixel_depth = 0;
+
+            for (const auto& [name, tensor_storage] : tensor_storage_map) {
+                if (!starts_with(name, prefix)) {
+                    continue;
+                }
+                detected_patch_depth = std::max<int64_t>(detected_patch_depth, block_count_from_name(name, "patch_blocks."));
+                detected_pixel_depth = std::max<int64_t>(detected_pixel_depth, block_count_from_name(name, "pixel_blocks."));
+
+                if (name.find("lq_proj.latent_proj.0.weight") != std::string::npos) {
+                    config.lq_latent_channels    = tensor_storage.ne[2];
+                    config.lq_latent_down_factor = config.lq_latent_channels >= 64 ? 16 : 8;
+                }
+                if (name.find("patch_blocks.0.mlp_x.w1.weight") != std::string::npos) {
+                    config.patch_mlp_hidden_dim = tensor_storage.ne[1];
+                }
+            }
+
+            // Fall back to the defaults only when no block tensor was seen at all.
+            if (detected_patch_depth > 0) {
+                config.patch_depth = detected_patch_depth;
+            }
+            if (detected_pixel_depth > 0) {
+                config.pixel_depth = detected_pixel_depth;
+            }
+
+            LOG_DEBUG("pid: patch_depth = %" PRId64 ", pixel_depth = %" PRId64 ", patch_mlp_hidden_dim = %" PRId64 ", lq_latent_channels = %" PRId64 ", lq_latent_down_factor = %" PRId64,
+                      config.patch_depth,
+                      config.pixel_depth,
+                      config.patch_mlp_hidden_dim,
+                      config.lq_latent_channels,
+                      config.lq_latent_down_factor);
+            return config;
+        }
+    };
+
+    // Builds a flattened 1-D rotary table: Rope::linspace(0, length - 1, length)
+    // gives the positions, Rope::rope() expands them for `dim` / `theta`, and
+    // Rope::flatten() packs the result into one vector. `dim` must be even.
+    inline std::vector<float> make_rope_1d(int length,
+                                           int dim,
+                                           float theta) {
+        GGML_ASSERT(dim % 2 == 0);
+        const float last_position = static_cast<float>(length - 1);
+        auto positions            = Rope::linspace(0.f, last_position, length);
+        auto table                = Rope::rope(positions, dim, theta);
+        return Rope::flatten(table);
+    }
+
+    // Thin wrapper around Rope::embed_2d_interleaved(): checks that `dim` is a
+    // multiple of 4 and forwards every argument unchanged.
+    // NOTE(review): the meaning of `scale` and of ref_grid_h/ref_grid_w == 0 is
+    // defined by Rope::embed_2d_interleaved — confirm there.
+    inline std::vector<float> make_rope_2d(int height,
+                                           int width,
+                                           int dim,
+                                           float theta    = 10000.f,
+                                           float scale    = 16.f,
+                                           int ref_grid_h = 0,
+                                           int ref_grid_w = 0) {
+        GGML_ASSERT(dim % 4 == 0);
+        return Rope::embed_2d_interleaved(height, width, dim, theta, scale, ref_grid_h, ref_grid_w);
+    }
+
+    // Builds an absolute position table for a height x width grid.
+    // Each grid cell (row-major, x varying fastest) gets `dim` floats: the first
+    // dim / 2 come from timestep_embedding() of its x index, the remaining
+    // dim / 2 from timestep_embedding() of its y index. `dim` must be a multiple of 4.
+    inline std::vector<float> make_pixel_abs_pos(int height,
+                                                 int width,
+                                                 int dim) {
+        GGML_ASSERT(dim % 4 == 0);
+        const int half_dim = dim / 2;
+
+        // Per-cell x / y coordinates in row-major order.
+        std::vector<float> x_pos;
+        std::vector<float> y_pos;
+        x_pos.reserve(static_cast<size_t>(height) * width);
+        y_pos.reserve(static_cast<size_t>(height) * width);
+        for (int row = 0; row < height; ++row) {
+            const float row_coord = static_cast<float>(row);
+            for (int col = 0; col < width; ++col) {
+                x_pos.push_back(static_cast<float>(col));
+                y_pos.push_back(row_coord);
+            }
+        }
+
+        auto x_emb = timestep_embedding(x_pos, half_dim, 10000, false);
+        auto y_emb = timestep_embedding(y_pos, half_dim, 10000, false);
+
+        // Lay out [x half | y half] for every cell.
+        std::vector<float> out(static_cast<size_t>(dim) * height * width);
+        for (int cell = 0; cell < height * width; ++cell) {
+            const size_t dst = static_cast<size_t>(cell) * dim;
+            const size_t src = static_cast<size_t>(cell) * half_dim;
+            for (int i = 0; i < half_dim; ++i) {
+                out[dst + i] = x_emb[src + i];
+            }
+            for (int i = 0; i < half_dim; ++i) {
+                out[dst + half_dim + i] = y_emb[src + i];
+            }
+        }
+        return out;
+    }
+
+    // Adaptive modulation: returns x + x * scale + shift as ggml graph nodes.
+    inline ggml_tensor* apply_adaln(ggml_context* ctx,
+                                    ggml_tensor* x,
+                                    ggml_tensor* shift,
+                                    ggml_tensor* scale) {
+        ggml_tensor* scaled    = ggml_mul(ctx, x, scale);
+        ggml_tensor* modulated = ggml_add(ctx, x, scaled);
+        return ggml_add(ctx, modulated, shift);
+    }
+
+    // Linear projection from `in_chans` to `embed_dim`, optionally followed by an RMSNorm.
+    struct PatchTokenEmbedder : public GGMLBlock {
+        bool use_rms_norm;
+
+        // Registers "proj" (Linear, `bias` forwarded as its third argument) and,
+        // only when `use_rms_norm` is set, "norm" (RMSNorm(embed_dim, 1e-6f)).
+        PatchTokenEmbedder(int64_t in_chans,
+                           int64_t embed_dim,
+                           bool use_rms_norm = false,
+                           bool bias         = true)
+            : use_rms_norm(use_rms_norm) {
+            blocks["proj"] = std::make_shared<Linear>(in_chans, embed_dim, bias);
+            if (!use_rms_norm) {
+                return;
+            }
+            blocks["norm"] = std::make_shared<RMSNorm>(embed_dim, 1e-6f);
+        }
+
+        // Applies the projection, then the norm when it was enabled at construction.
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
+            auto projection        = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
+            ggml_tensor* projected = projection->forward(ctx, x);
+            if (!use_rms_norm) {
+                return projected;
+            }
+            auto rms = std::dynamic_pointer_cast<RMSNorm>(blocks["norm"]);
+            return rms->forward(ctx, projected);
+        }
+    };
+
+    // Timestep embedder: frequency embedding -> Linear -> SiLU -> Linear.
+    struct PixelDiTTimestepEmbedder : public GGMLBlock {
+        int frequency_embedding_size;
+
+        // Registers the two Linears as "mlp.0" (frequency_embedding_size -> hidden_size)
+        // and "mlp.2" (hidden_size -> hidden_size).
+        PixelDiTTimestepEmbedder(int64_t hidden_size,
+                                 int frequency_embedding_size = 256)
+            : frequency_embedding_size(frequency_embedding_size) {
+            blocks["mlp.0"] = std::make_shared<Linear>(frequency_embedding_size, hidden_size, true, true);
+            blocks["mlp.2"] = std::make_shared<Linear>(hidden_size, hidden_size, true, true);
+        }
+
+        // Embeds timesteps `t`. The trailing 10 passed to ggml_ext_timestep_embedding
+        // is kept exactly as in the model definition.
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* t) {
+            auto fc_in  = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]);
+            auto fc_out = std::dynamic_pointer_cast<Linear>(blocks["mlp.2"]);
+
+            auto emb = ggml_ext_timestep_embedding(ctx->ggml_ctx, t, frequency_embedding_size, 10);
+            emb      = fc_in->forward(ctx, emb);
+            emb      = ggml_silu_inplace(ctx->ggml_ctx, emb);
+            emb      = fc_out->forward(ctx, emb);
+            return emb;
+        }
+    };
+
+    // Gated feed-forward block computing w2(silu(w1(x)) * w3(x)).
+    // All three Linears are created with `false` as their third (bias) argument.
+    struct FeedForward : public GGMLBlock {
+        FeedForward(int64_t dim, int64_t hidden_dim) {
+            blocks["w1"] = std::make_shared<Linear>(dim, hidden_dim, false);
+            blocks["w2"] = std::make_shared<Linear>(hidden_dim, dim, false);
+            blocks["w3"] = std::make_shared<Linear>(dim, hidden_dim, false);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
+            auto gate_proj = std::dynamic_pointer_cast<Linear>(blocks["w1"]);
+            auto down_proj = std::dynamic_pointer_cast<Linear>(blocks["w2"]);
+            auto up_proj   = std::dynamic_pointer_cast<Linear>(blocks["w3"]);
+
+            // The in-place ops reuse the buffer of the freshly created w1 output.
+            ggml_tensor* gated = gate_proj->forward(ctx, x);
+            gated              = ggml_silu_inplace(ctx->ggml_ctx, gated);
+            ggml_tensor* up    = up_proj->forward(ctx, x);
+            gated              = ggml_mul_inplace(ctx->ggml_ctx, gated, up);
+            return down_proj->forward(ctx, gated);
+        }
+    };
+
+    // Output head: RMSNorm over `hidden_size` followed by a Linear to `out_channels`.
+    struct FinalLayer : public GGMLBlock {
+        FinalLayer(int64_t hidden_size, int64_t out_channels) {
+            blocks["norm"]   = std::make_shared<RMSNorm>(hidden_size, 1e-6f);
+            blocks["linear"] = std::make_shared<Linear>(hidden_size, out_channels, true);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
+            auto rms               = std::dynamic_pointer_cast<RMSNorm>(blocks["norm"]);
+            auto head              = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
+            ggml_tensor* normed    = rms->forward(ctx, x);
+            ggml_tensor* projected = head->forward(ctx, normed);
+            return projected;
+        }
+    };
+
+    // Self-attention with a fused QKV projection, per-head RMSNorm on q and k,
+    // rotary positions supplied by the caller, and an output projection.
+    struct RotaryAttention : public GGMLBlock {
+        int64_t dim;
+        int64_t num_heads;
+
+        // Registers "qkv" (dim -> 3 * dim, third argument false), "q_norm" / "k_norm"
+        // (RMSNorm over dim / num_heads) and "proj" (dim -> dim, third argument true).
+        RotaryAttention(int64_t dim, int64_t num_heads)
+            : dim(dim), num_heads(num_heads) {
+            const int64_t per_head = dim / num_heads;
+            blocks["qkv"]          = std::make_shared<Linear>(dim, dim * 3, false);
+            blocks["q_norm"]       = std::make_shared<RMSNorm>(per_head, 1e-6f);
+            blocks["k_norm"]       = std::make_shared<RMSNorm>(per_head, 1e-6f);
+            blocks["proj"]         = std::make_shared<Linear>(dim, dim, true);
+        }
+
+        // `x` is read with ne[1] and ne[2] as the two outer extents; `pos` is handed
+        // to Rope::attention unchanged.
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* pos) {
+            auto fused_proj = std::dynamic_pointer_cast<Linear>(blocks["qkv"]);
+            auto norm_q     = std::dynamic_pointer_cast<RMSNorm>(blocks["q_norm"]);
+            auto norm_k     = std::dynamic_pointer_cast<RMSNorm>(blocks["k_norm"]);
+            auto out_proj   = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
+
+            auto fused = fused_proj->forward(ctx, x);
+            auto parts = split_qkv(ctx->ggml_ctx, fused);
+
+            const int64_t n_tokens = x->ne[1];
+            const int64_t n_outer  = x->ne[2];
+            const int64_t per_head = dim / num_heads;
+
+            // View one of the q/k/v slices as (per_head, num_heads, n_tokens, n_outer).
+            auto to_heads = [&](ggml_tensor* t) {
+                return ggml_reshape_4d(ctx->ggml_ctx, t, per_head, num_heads, n_tokens, n_outer);
+            };
+
+            auto q = to_heads(parts[0]);
+            auto k = to_heads(parts[1]);
+            auto v = to_heads(parts[2]);
+            q      = norm_q->forward(ctx, q);
+            k      = norm_k->forward(ctx, k);
+
+            auto attended = Rope::attention(ctx, q, k, v, pos, nullptr, 1.0f / 128.f, true);
+            return out_proj->forward(ctx, attended);
+        }
+    };
+
+    // Joint attention over two token streams `x` and `y`. Each stream has its own
+    // QKV projection, per-head q/k RMSNorm and output projection; the streams are
+    // concatenated and attended over in a single Rope::attention call.
+    // NOTE(review): the pos_img / pos_txt pairing suggests x = image tokens and
+    // y = text tokens — confirm against the caller.
+    struct MMDiTJointAttention : public GGMLBlock {
+        int64_t dim;
+        int64_t num_heads;
+
+        // Registers the per-stream children ("*_x" / "*_y"); the q/k norms operate
+        // on dim / num_heads features.
+        MMDiTJointAttention(int64_t dim, int64_t num_heads)
+            : dim(dim), num_heads(num_heads) {
+            int64_t head_dim   = dim / num_heads;
+            blocks["qkv_x"]    = std::make_shared<Linear>(dim, dim * 3, false);
+            blocks["qkv_y"]    = std::make_shared<Linear>(dim, dim * 3, false);
+            blocks["q_norm_x"] = std::make_shared<RMSNorm>(head_dim, 1e-6f);
+            blocks["k_norm_x"] = std::make_shared<RMSNorm>(head_dim, 1e-6f);
+            blocks["q_norm_y"] = std::make_shared<RMSNorm>(head_dim, 1e-6f);
+            blocks["k_norm_y"] = std::make_shared<RMSNorm>(head_dim, 1e-6f);
+            blocks["proj_x"]   = std::make_shared<Linear>(dim, dim, true);
+            blocks["proj_y"]   = std::make_shared<Linear>(dim, dim, true);
+        }
+
+        // Returns {projected x-stream output, projected y-stream output}.
+        std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
+                                                      ggml_tensor* x,
+                                                      ggml_tensor* y,
+                                                      ggml_tensor* pos_img,
+                                                      ggml_tensor* pos_txt) {
+            auto qkv_x_proj = std::dynamic_pointer_cast<Linear>(blocks["qkv_x"]);
+            auto qkv_y_proj = std::dynamic_pointer_cast<Linear>(blocks["qkv_y"]);
+            auto q_norm_x   = std::dynamic_pointer_cast<RMSNorm>(blocks["q_norm_x"]);
+            auto k_norm_x   = std::dynamic_pointer_cast<RMSNorm>(blocks["k_norm_x"]);
+            auto q_norm_y   = std::dynamic_pointer_cast<RMSNorm>(blocks["q_norm_y"]);
+            auto k_norm_y   = std::dynamic_pointer_cast<RMSNorm>(blocks["k_norm_y"]);
+            auto proj_x     = std::dynamic_pointer_cast<Linear>(blocks["proj_x"]);
+            auto proj_y     = std::dynamic_pointer_cast<Linear>(blocks["proj_y"]);
+
+            // Token counts come from ne[1] of each stream; N (ne[2] of x) is reused
+            // for both streams, so y is assumed to share it.
+            int64_t Nx       = x->ne[1];
+            int64_t Ny       = y->ne[1];
+            int64_t N        = x->ne[2];
+            int64_t head_dim = dim / num_heads;
+
+            // x stream: project, split into q/k/v, view as (head_dim, num_heads, Nx, N), norm q and k.
+            auto qkv_x = split_qkv(ctx->ggml_ctx, qkv_x_proj->forward(ctx, x));
+            auto qx    = ggml_reshape_4d(ctx->ggml_ctx, qkv_x[0], head_dim, num_heads, Nx, N);
+            auto kx    = ggml_reshape_4d(ctx->ggml_ctx, qkv_x[1], head_dim, num_heads, Nx, N);
+            auto vx    = ggml_reshape_4d(ctx->ggml_ctx, qkv_x[2], head_dim, num_heads, Nx, N);
+            qx         = q_norm_x->forward(ctx, qx);
+            kx         = k_norm_x->forward(ctx, kx);
+
+            // y stream: same steps with its own weights, viewed as (head_dim, num_heads, Ny, N).
+            auto qkv_y = split_qkv(ctx->ggml_ctx, qkv_y_proj->forward(ctx, y));
+            auto qy    = ggml_reshape_4d(ctx->ggml_ctx, qkv_y[0], head_dim, num_heads, Ny, N);
+            auto ky    = ggml_reshape_4d(ctx->ggml_ctx, qkv_y[1], head_dim, num_heads, Ny, N);
+            auto vy    = ggml_reshape_4d(ctx->ggml_ctx, qkv_y[2], head_dim, num_heads, Ny, N);
+            qy         = q_norm_y->forward(ctx, qy);
+            ky         = k_norm_y->forward(ctx, ky);
+
+            // Concatenate along the token axis (axis 2) with y first, then x. The
+            // positions are concatenated in the same y-then-x order (on their axis 3)
+            // so they stay aligned with the tokens.
+            auto q_joint   = ggml_concat(ctx->ggml_ctx, qy, qx, 2);
+            auto k_joint   = ggml_concat(ctx->ggml_ctx, ky, kx, 2);
+            auto v_joint   = ggml_concat(ctx->ggml_ctx, vy, vx, 2);
+            auto pos_joint = ggml_concat(ctx->ggml_ctx, pos_txt, pos_img, 3);
+            auto out       = Rope::attention(ctx, q_joint, k_joint, v_joint, pos_joint, nullptr, 1.0f, true);
+
+            // Split the result back along axis 1: the first Ny entries belong to y,
+            // the following Nx entries to x.
+            auto out_y = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, Ny);
+            auto out_x = ggml_ext_slice(ctx->ggml_ctx, out, 1, Ny, Ny + Nx);
+            return {proj_x->forward(ctx, out_x), proj_y->forward(ctx, out_y)};
+        }
+    };
+
+    // Two-stream transformer block: joint attention plus a per-stream gated
+    // feed-forward, each modulated by six chunks derived from the conditioning `c`.
+    struct MMDiTBlockT2I : public GGMLBlock {
+        int64_t hidden_size;
+
+        // `groups` is passed to MMDiTJointAttention as its head count;
+        // `mlp_hidden_dim` is the inner width of both FeedForward blocks.
+        MMDiTBlockT2I(int64_t hidden_size, int64_t groups, int64_t mlp_hidden_dim)
+            : hidden_size(hidden_size) {
+            blocks["norm_x1"]                = std::make_shared<RMSNorm>(hidden_size, 1e-6f);
+            blocks["norm_y1"]                = std::make_shared<RMSNorm>(hidden_size, 1e-6f);
+            blocks["attn"]                   = std::make_shared<MMDiTJointAttention>(hidden_size, groups);
+            blocks["norm_x2"]                = std::make_shared<RMSNorm>(hidden_size, 1e-6f);
+            blocks["norm_y2"]                = std::make_shared<RMSNorm>(hidden_size, 1e-6f);
+            blocks["mlp_x"]                  = std::make_shared<FeedForward>(hidden_size, mlp_hidden_dim);
+            blocks["mlp_y"]                  = std::make_shared<FeedForward>(hidden_size, mlp_hidden_dim);
+            blocks["adaLN_modulation_img.0"] = std::make_shared<Linear>(hidden_size, 6 * hidden_size, true);
+            blocks["adaLN_modulation_txt.0"] = std::make_shared<Linear>(hidden_size, 6 * hidden_size, true);
+        }
+
+        // Returns the updated {x, y} streams.
+        // `c` feeds both modulation Linears; pos_img / pos_txt are forwarded to the attention.
+        std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
+                                                      ggml_tensor* x,
+                                                      ggml_tensor* y,
+                                                      ggml_tensor* c,
+                                                      ggml_tensor* pos_img,
+                                                      ggml_tensor* pos_txt) {
+            auto norm_x1 = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_x1"]);
+            auto norm_y1 = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_y1"]);
+            auto attn    = std::dynamic_pointer_cast<MMDiTJointAttention>(blocks["attn"]);
+            auto norm_x2 = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_x2"]);
+            auto norm_y2 = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_y2"]);
+            auto mlp_x   = std::dynamic_pointer_cast<FeedForward>(blocks["mlp_x"]);
+            auto mlp_y   = std::dynamic_pointer_cast<FeedForward>(blocks["mlp_y"]);
+            auto ada_img = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation_img.0"]);
+            auto ada_txt = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation_txt.0"]);
+
+            // Six modulation tensors per stream, split along axis 0. As used below:
+            // [0]/[1] = shift/scale before attention, [2] = attention gate,
+            // [3]/[4] = shift/scale before the MLP, [5] = MLP gate.
+            auto mx = ggml_ext_chunk(ctx->ggml_ctx, ada_img->forward(ctx, c), 6, 0);
+            auto my = ggml_ext_chunk(ctx->ggml_ctx, ada_txt->forward(ctx, c), 6, 0);
+
+            auto x_norm   = apply_adaln(ctx->ggml_ctx, norm_x1->forward(ctx, x), mx[0], mx[1]);
+            auto y_norm   = apply_adaln(ctx->ggml_ctx, norm_y1->forward(ctx, y), my[0], my[1]);
+            auto attn_out = attn->forward(ctx, x_norm, y_norm, pos_img, pos_txt);
+
+            // Gated residual for the attention output (.first is the x stream, .second the y stream).
+            x = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, attn_out.first, mx[2]));
+            y = ggml_add(ctx->ggml_ctx, y, ggml_mul(ctx->ggml_ctx, attn_out.second, my[2]));
+
+            // Gated residual for the feed-forward output.
+            auto x_mlp = mlp_x->forward(ctx, apply_adaln(ctx->ggml_ctx, norm_x2->forward(ctx, x), mx[3], mx[4]));
+            auto y_mlp = mlp_y->forward(ctx, apply_adaln(ctx->ggml_ctx, norm_y2->forward(ctx, y), my[3], my[4]));
+            x          = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, x_mlp, mx[5]));
+            y          = ggml_add(ctx->ggml_ctx, y, ggml_mul(ctx->ggml_ctx, y_mlp, my[5]));
+            return {x, y};
+        }
+    };
+
+    // Projects every pixel from `in_channels` to `hidden_size_output`, adds a
+    // positional tensor, and regroups the result into patches.
+    struct PixelTokenEmbedder : public GGMLBlock {
+        int64_t in_channels;
+        int64_t hidden_size_output;
+
+        PixelTokenEmbedder(int64_t in_channels, int64_t hidden_size_output)
+            : in_channels(in_channels), hidden_size_output(hidden_size_output) {
+            blocks["proj"] = std::make_shared<Linear>(in_channels, hidden_size_output, true);
+        }
+
+        // `inputs` is read as ne[0] = W, ne[1] = H, ne[3] = B.
+        // `pos_full` is added to the (hidden_size_output, W * H, B) projection, so it
+        // must be addable to that shape.
+        // Returns a tensor reshaped to (hidden_size_output, patch_size^2, L * B),
+        // where L = (W / patch_size) * (H / patch_size) using integer division.
+        ggml_tensor* forward(GGMLRunnerContext* ctx,
+                             ggml_tensor* inputs,
+                             int64_t patch_size,
+                             ggml_tensor* pos_full) {
+            auto proj  = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
+            int64_t W  = inputs->ne[0];
+            int64_t H  = inputs->ne[1];
+            int64_t B  = inputs->ne[3];
+            int64_t L  = (W / patch_size) * (H / patch_size);
+            int64_t P2 = patch_size * patch_size;
+
+            // Permute, make contiguous, and flatten so each of the W * H positions
+            // is an in_channels-wide row for the Linear.
+            auto x = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, inputs, 2, 0, 1, 3));
+            x      = ggml_reshape_3d(ctx->ggml_ctx, x, in_channels, W * H, B);
+            x      = proj->forward(ctx, x);
+            x      = ggml_add(ctx->ggml_ctx, x, pos_full);
+            // Restore the spatial axes, permute again, then cut into patches.
+            x      = ggml_reshape_4d(ctx->ggml_ctx, x, hidden_size_output, W, H, B);
+            x      = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, x, 1, 2, 0, 3));
+            x      = DiT::patchify(ctx->ggml_ctx, x, static_cast<int>(patch_size), static_cast<int>(patch_size), false);
+            x      = ggml_reshape_3d(ctx->ggml_ctx, x, hidden_size_output, P2, L * B);
+            return x;
+        }
+    };
+
+    // Pixel-level transformer block. Each patch (patch_size^2 pixels of
+    // `pixel_dim` features) is compressed into one `attn_dim` token for attention,
+    // expanded back to pixels, and then passed through a per-pixel MLP. Both
+    // sub-layers are modulated by six chunks derived from `s_cond`.
+    struct PiTBlock : public GGMLBlock {
+        int64_t pixel_dim;
+        int64_t context_dim;
+        int64_t attn_dim;
+        int64_t num_heads;
+        int64_t patch_size;
+
+        // Note the argument order: patch_size comes before attn_dim and num_heads.
+        PiTBlock(int64_t pixel_dim,
+                 int64_t context_dim,
+                 int64_t patch_size,
+                 int64_t attn_dim,
+                 int64_t num_heads)
+            : pixel_dim(pixel_dim),
+              context_dim(context_dim),
+              attn_dim(attn_dim),
+              num_heads(num_heads),
+              patch_size(patch_size) {
+            int64_t p2                   = patch_size * patch_size;
+            blocks["compress_to_attn"]   = std::make_shared<Linear>(p2 * pixel_dim, attn_dim, true);
+            blocks["expand_from_attn"]   = std::make_shared<Linear>(attn_dim, p2 * pixel_dim, true);
+            blocks["norm1"]              = std::make_shared<RMSNorm>(pixel_dim, 1e-6f);
+            blocks["attn"]               = std::make_shared<RotaryAttention>(attn_dim, num_heads);
+            blocks["norm2"]              = std::make_shared<RMSNorm>(pixel_dim, 1e-6f);
+            blocks["mlp"]                = std::make_shared<Mlp>(pixel_dim, pixel_dim * 4);
+            blocks["adaLN_modulation.0"] = std::make_shared<Linear>(context_dim, 6 * pixel_dim * p2, true);
+        }
+
+        // `x` is used as (pixel_dim, patch_size^2, B * L) with
+        // L = (image_height / patch_size) * (image_width / patch_size).
+        // `s_cond` feeds the modulation Linear; `pos_comp` is forwarded to the attention.
+        // Returns a tensor with the same (pixel_dim, patch_size^2, B * L) layout.
+        ggml_tensor* forward(GGMLRunnerContext* ctx,
+                             ggml_tensor* x,
+                             ggml_tensor* s_cond,
+                             int64_t image_height,
+                             int64_t image_width,
+                             ggml_tensor* pos_comp) {
+            auto compress = std::dynamic_pointer_cast<Linear>(blocks["compress_to_attn"]);
+            auto expand   = std::dynamic_pointer_cast<Linear>(blocks["expand_from_attn"]);
+            auto norm1    = std::dynamic_pointer_cast<RMSNorm>(blocks["norm1"]);
+            auto attn     = std::dynamic_pointer_cast<RotaryAttention>(blocks["attn"]);
+            auto norm2    = std::dynamic_pointer_cast<RMSNorm>(blocks["norm2"]);
+            auto mlp      = std::dynamic_pointer_cast<Mlp>(blocks["mlp"]);
+            auto ada      = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.0"]);
+
+            // B is recovered from the fused B * L axis of x (ne[2]).
+            int64_t Hs = image_height / patch_size;
+            int64_t Ws = image_width / patch_size;
+            int64_t L  = Hs * Ws;
+            int64_t BL = x->ne[2];
+            int64_t B  = BL / L;
+            int64_t P2 = patch_size * patch_size;
+
+            // Six modulation tensors split along axis 0. As used below:
+            // [0]/[1] = shift/scale before attention, [2] = attention gate,
+            // [3]/[4] = shift/scale before the MLP, [5] = MLP gate.
+            auto ada_params = ada->forward(ctx, s_cond);
+            ada_params      = ggml_reshape_3d(ctx->ggml_ctx, ada_params, 6 * pixel_dim, P2, BL);
+            auto mod        = ggml_ext_chunk(ctx->ggml_ctx, ada_params, 6, 0);
+
+            // Attention branch: flatten each patch, compress to one token, attend
+            // over the L tokens, expand back to per-pixel features, gated residual.
+            auto x_norm    = apply_adaln(ctx->ggml_ctx, norm1->forward(ctx, x), mod[0], mod[1]);
+            auto x_flat    = ggml_reshape_2d(ctx->ggml_ctx, x_norm, P2 * pixel_dim, BL);
+            auto x_comp    = compress->forward(ctx, x_flat);
+            x_comp         = ggml_reshape_3d(ctx->ggml_ctx, x_comp, attn_dim, L, B);
+            auto attn_out  = attn->forward(ctx, x_comp, pos_comp);
+            auto attn_flat = expand->forward(ctx, ggml_reshape_2d(ctx->ggml_ctx, attn_out, attn_dim, BL));
+            auto attn_exp  = ggml_reshape_3d(ctx->ggml_ctx, attn_flat, pixel_dim, P2, BL);
+            x              = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, attn_exp, mod[2]));
+
+            // MLP branch with its own shift/scale and gate.
+            auto mlp_out = mlp->forward(ctx, apply_adaln(ctx->ggml_ctx, norm2->forward(ctx, x), mod[3], mod[4]));
+            return ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, mlp_out, mod[5]));
+        }
+    };
+
+    // Gated residual injection of `lq` into `x`:
+    //   out = x + sigmoid(content_proj(concat(x, lq)) - exp(log_alpha) * sigma) * lq
+    struct SigmaAwareGate : public GGMLBlock {
+        int64_t dim;
+
+        // "content_proj" maps the concatenated 2 * dim features back to dim.
+        SigmaAwareGate(int64_t dim)
+            : dim(dim) {
+            blocks["content_proj"] = std::make_shared<Linear>(dim * 2, dim, true);
+        }
+
+        // Registers the single-element F32 parameter "log_alpha"; the storage map
+        // and prefix arguments are not consulted here.
+        void init_params(ggml_context* ctx,
+                         const String2TensorStorage& tensor_storage_map = {},
+                         std::string prefix                             = "") override {
+            params["log_alpha"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
+        }
+
+        // `x` and `lq` are concatenated on axis 0; `sigma` is viewed as
+        // (1, 1, sigma->ne[0]) before it is combined with the content logit.
+        ggml_tensor* forward(GGMLRunnerContext* ctx,
+                             ggml_tensor* x,
+                             ggml_tensor* lq,
+                             ggml_tensor* sigma) {
+            auto projector = std::dynamic_pointer_cast<Linear>(blocks["content_proj"]);
+            auto gctx      = ctx->ggml_ctx;
+
+            auto joined      = ggml_concat(gctx, x, lq, 0);
+            auto logit       = projector->forward(ctx, joined);
+            auto sigma_view  = ggml_reshape_3d(gctx, sigma, 1, 1, sigma->ne[0]);
+            auto alpha       = ggml_exp(gctx, params["log_alpha"]);
+            auto penalty     = ggml_mul(gctx, alpha, sigma_view);
+            auto neg_penalty = ggml_neg(gctx, penalty);
+            auto gate_value  = ggml_sigmoid(gctx, ggml_add(gctx, logit, neg_penalty));
+            auto injected    = ggml_mul(gctx, gate_value, lq);
+            return ggml_add(gctx, x, injected);
+        }
+    };
+
+    // Residual block: x + conv(silu(norm(conv(silu(norm(x)))))), with both
+    // convolutions keeping the channel count.
+    struct PiDResBlock : public GGMLBlock {
+        // Registers GroupNorm(4, channels, 1e-5f) at "block.0" / "block.3" and
+        // Conv2d(channels -> channels) at "block.2" / "block.5".
+        PiDResBlock(int64_t channels) {
+            const std::pair<int, int> three{3, 3};
+            const std::pair<int, int> one{1, 1};
+
+            blocks["block.0"] = std::make_shared<GroupNorm>(4, channels, 1e-5f);
+            blocks["block.2"] = std::make_shared<Conv2d>(channels, channels, three, one, one);
+            blocks["block.3"] = std::make_shared<GroupNorm>(4, channels, 1e-5f);
+            blocks["block.5"] = std::make_shared<Conv2d>(channels, channels, three, one, one);
+        }
+
+        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
+            auto gn_a   = std::dynamic_pointer_cast<GroupNorm>(blocks["block.0"]);
+            auto conv_a = std::dynamic_pointer_cast<Conv2d>(blocks["block.2"]);
+            auto gn_b   = std::dynamic_pointer_cast<GroupNorm>(blocks["block.3"]);
+            auto conv_b = std::dynamic_pointer_cast<Conv2d>(blocks["block.5"]);
+
+            // The in-place SiLU overwrites the freshly produced norm output, never `x`.
+            auto branch = gn_a->forward(ctx, x);
+            branch      = ggml_silu_inplace(ctx->ggml_ctx, branch);
+            branch      = conv_a->forward(ctx, branch);
+            branch      = gn_b->forward(ctx, branch);
+            branch      = ggml_silu_inplace(ctx->ggml_ctx, branch);
+            branch      = conv_b->forward(ctx, branch);
+            return ggml_add(ctx->ggml_ctx, x, branch);
+        }
+    };
+
+    // Turns the `lq_latent` input into per-token features and provides one
+    // output head plus one SigmaAwareGate for every `lq_interval`-th patch block.
+    struct LQProjection2D : public GGMLBlock {
+        PixelDiTConfig config;
+
+        // Registers the conv stem ("latent_proj.0", "latent_proj.2"), the residual
+        // blocks ("latent_proj.3" onward, lq_num_res_blocks of them) and
+        // ceil(patch_depth / lq_interval) pairs of "output_heads.<i>" / "gate_modules.<i>".
+        LQProjection2D(const PixelDiTConfig& config)
+            : config(config) {
+            blocks["latent_proj.0"] = std::make_shared<Conv2d>(config.lq_latent_channels, config.lq_hidden_dim, std::pair<int, int>{3, 3}, std::pair<int, int>{1, 1}, std::pair<int, int>{1, 1});
+            blocks["latent_proj.2"] = std::make_shared<Conv2d>(config.lq_hidden_dim, config.lq_hidden_dim, std::pair<int, int>{3, 3}, std::pair<int, int>{1, 1}, std::pair<int, int>{1, 1});
+            for (int i = 0; i < config.lq_num_res_blocks; ++i) {
+                blocks["latent_proj." + std::to_string(3 + i)] = std::make_shared<PiDResBlock>(config.lq_hidden_dim);
+            }
+
+            // Ceiling division: one head/gate pair per started interval.
+            int num_outputs = static_cast<int>((config.patch_depth + config.lq_interval - 1) / config.lq_interval);
+            for (int i = 0; i < num_outputs; ++i) {
+                blocks["output_heads." + std::to_string(i)] = std::make_shared<Linear>(config.lq_hidden_dim, config.hidden_size, true);
+                blocks["gate_modules." + std::to_string(i)] = std::make_shared<SigmaAwareGate>(config.hidden_size);
+            }
+        }
+
+        // True when `block_idx` is a multiple of lq_interval.
+        bool is_gate_active(int block_idx) const {
+            return block_idx % config.lq_interval == 0;
+        }
+
+        // Index of the head/gate pair for `block_idx` (integer division by lq_interval).
+        int get_output_index(int block_idx) const {
+            return block_idx / static_cast<int>(config.lq_interval);
+        }
+
+        // Applies "gate_modules.<out_idx>" to (x, lq, sigma).
+        // `out_idx` must name a registered gate; no check is made here.
+        ggml_tensor* gate(GGMLRunnerContext* ctx,
+                          ggml_tensor* x,
+                          ggml_tensor* lq,
+                          ggml_tensor* sigma,
+                          int out_idx) {
+            auto gate_module = std::dynamic_pointer_cast<SigmaAwareGate>(blocks["gate_modules." + std::to_string(out_idx)]);
+            return gate_module->forward(ctx, x, lq, sigma);
+        }
+
+        // Returns one tensor per output head, all computed from the same token
+        // features. `lq_latent` is resized to target_pW x target_pH (ne[0] x ne[1])
+        // when its size differs.
+        std::vector<ggml_tensor*> forward(GGMLRunnerContext* ctx,
+                                          ggml_tensor* lq_latent,
+                                          int64_t target_pH,
+                                          int64_t target_pW) {
+            auto conv0             = std::dynamic_pointer_cast<Conv2d>(blocks["latent_proj.0"]);
+            auto conv2             = std::dynamic_pointer_cast<Conv2d>(blocks["latent_proj.2"]);
+            // Sanity check on the config only; the ratio is not used afterwards.
+            float z_to_patch_ratio = static_cast<float>(config.lq_sr_scale * config.lq_latent_down_factor) /
+                                     static_cast<float>(config.patch_size);
+            GGML_ASSERT(z_to_patch_ratio >= 1.0f);
+            // Nearest-neighbour resize to the target grid; channel and outer extents are kept.
+            if (lq_latent->ne[0] != target_pW || lq_latent->ne[1] != target_pH) {
+                lq_latent = ggml_interpolate(ctx->ggml_ctx,
+                                             lq_latent,
+                                             target_pW,
+                                             target_pH,
+                                             lq_latent->ne[2],
+                                             lq_latent->ne[3],
+                                             GGML_SCALE_MODE_NEAREST);
+            }
+
+            // Conv stem: conv -> SiLU -> conv -> residual blocks.
+            auto feat = conv0->forward(ctx, lq_latent);
+            feat      = ggml_silu_inplace(ctx->ggml_ctx, feat);
+            feat      = conv2->forward(ctx, feat);
+            for (int i = 0; i < config.lq_num_res_blocks; ++i) {
+                auto block = std::dynamic_pointer_cast<PiDResBlock>(blocks["latent_proj." + std::to_string(3 + i)]);
+                feat       = block->forward(ctx, feat);
+            }
+
+            // Permute and flatten the feature map into (C, L, B) tokens, L = target_pH * target_pW.
+            int64_t B   = feat->ne[3];
+            int64_t C   = feat->ne[2];
+            int64_t L   = target_pH * target_pW;
+            auto tokens = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, feat, 2, 0, 1, 3));
+            tokens      = ggml_reshape_3d(ctx->ggml_ctx, tokens, C, L, B);
+
+            // Same count as in the constructor, so every head looked up here exists.
+            int num_outputs = static_cast<int>((config.patch_depth + config.lq_interval - 1) / config.lq_interval);
+            std::vector<ggml_tensor*> outputs;
+            outputs.reserve(num_outputs);
+            for (int i = 0; i < num_outputs; ++i) {
+                auto head = std::dynamic_pointer_cast<Linear>(blocks["output_heads." + std::to_string(i)]);
+                outputs.push_back(head->forward(ctx, tokens));
+            }
+            return outputs;
+        }
+    };
+
+    struct PixelDiT : public GGMLBlock {
+        PixelDiTConfig config;
+
+        PixelDiT() = default;
+
+        // Builds every sub-block of the network from `config` and registers each one in
+        // `blocks` under the key that forward() later uses to look it up.
+        PixelDiT(const PixelDiTConfig& config)
+            : config(config) {
+            // Input width handed to the patch embedder: in_channels * patch_size^2.
+            const auto patch_token_dim = config.in_channels * config.patch_size * config.patch_size;
+
+            // Embedders for pixels, patch tokens, timesteps and text.
+            blocks["pixel_embedder"] = std::make_shared<PixelTokenEmbedder>(config.in_channels, config.pixel_hidden_size);
+            blocks["s_embedder"]     = std::make_shared<PatchTokenEmbedder>(patch_token_dim, config.hidden_size, false, true);
+            blocks["t_embedder"]     = std::make_shared<PixelDiTTimestepEmbedder>(config.hidden_size);
+            blocks["y_embedder"]     = std::make_shared<PatchTokenEmbedder>(config.txt_embed_dim, config.hidden_size, true, true);
+
+            // Patch-level stack: `patch_depth` blocks keyed "patch_blocks.<i>".
+            for (int idx = 0; idx < config.patch_depth; ++idx) {
+                const std::string key = "patch_blocks." + std::to_string(idx);
+                blocks[key]           = std::make_shared<MMDiTBlockT2I>(config.hidden_size, config.num_groups, config.patch_mlp_hidden_dim);
+            }
+
+            // Pixel-level stack: `pixel_depth` blocks keyed "pixel_blocks.<i>".
+            for (int idx = 0; idx < config.pixel_depth; ++idx) {
+                const std::string key = "pixel_blocks." + std::to_string(idx);
+                blocks[key]           = std::make_shared<PiTBlock>(config.pixel_hidden_size,
+                                                                   config.hidden_size,
+                                                                   config.patch_size,
+                                                                   config.pixel_attn_hidden_size,
+                                                                   config.pixel_num_groups);
+            }
+
+            // Output head and the LQ projection that is applied inside the patch stack.
+            blocks["final_layer"] = std::make_shared<FinalLayer>(config.pixel_hidden_size, config.in_channels);
+            blocks["lq_proj"]     = std::make_shared<LQProjection2D>(config);
+        }
+
+        // Declares the block's own parameter: `y_pos_embedding`, an F32 tensor with
+        // ne = [hidden_size, txt_max_length, 1]. `tensor_storage_map` and `prefix`
+        // are part of the overridden signature but are not consulted here.
+        void init_params(ggml_context* ctx,
+                         const String2TensorStorage& tensor_storage_map = {},
+                         std::string prefix                             = "") override {
+            ggml_tensor* y_pos_table  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, config.hidden_size, config.txt_max_length, 1);
+            params["y_pos_embedding"] = y_pos_table;
+        }
+
+        // Builds the graph for one model evaluation and returns the output tensor,
+        // sliced back to the spatial extents `x` had before patch-size padding.
+        //
+        //   x               input tensor; ne[0] / ne[1] / ne[3] are read as W / H / B.
+        //   timesteps       fed to the timestep embedder.
+        //   context         text conditioning, must be non-null; only the first
+        //                   min(context->ne[1], config.txt_max_length) entries of dim 1 are used.
+        //   lq_latent       input of the LQ projection, which returns a list of feature tensors.
+        //   degrade_sigma   forwarded unchanged to lq_proj->gate().
+        //   pos_img/pos_txt positional tensors forwarded to every patch block.
+        //   pixel_pos_full  forwarded to the pixel embedder.
+        //   pixel_pos_comp  forwarded to every pixel block.
+        ggml_tensor* forward(GGMLRunnerContext* ctx,
+                             ggml_tensor* x,
+                             ggml_tensor* timesteps,
+                             ggml_tensor* context,
+                             ggml_tensor* lq_latent,
+                             ggml_tensor* degrade_sigma,
+                             ggml_tensor* pos_img,
+                             ggml_tensor* pos_txt,
+                             ggml_tensor* pixel_pos_full,
+                             ggml_tensor* pixel_pos_comp) {
+            // Look up the sub-blocks registered by the constructor.
+            auto pixel_embedder = std::dynamic_pointer_cast<PixelTokenEmbedder>(blocks["pixel_embedder"]);
+            auto s_embedder     = std::dynamic_pointer_cast<PatchTokenEmbedder>(blocks["s_embedder"]);
+            auto t_embedder     = std::dynamic_pointer_cast<PixelDiTTimestepEmbedder>(blocks["t_embedder"]);
+            auto y_embedder     = std::dynamic_pointer_cast<PatchTokenEmbedder>(blocks["y_embedder"]);
+            auto final_layer    = std::dynamic_pointer_cast<FinalLayer>(blocks["final_layer"]);
+            auto lq_proj        = std::dynamic_pointer_cast<LQProjection2D>(blocks["lq_proj"]);
+
+            // Remember the caller's extents: the result is sliced back to them at the end.
+            int64_t W_orig = x->ne[0];
+            int64_t H_orig = x->ne[1];
+            x              = DiT::pad_to_patch_size(ctx, x, static_cast<int>(config.patch_size), static_cast<int>(config.patch_size));
+            // Extents after padding, the patch grid (Hs x Ws), the patch-token count L
+            // and the number of pixels per patch P2.
+            int64_t W      = x->ne[0];
+            int64_t H      = x->ne[1];
+            int64_t B      = x->ne[3];
+            int64_t Hs     = H / config.patch_size;
+            int64_t Ws     = W / config.patch_size;
+            int64_t L      = Hs * Ws;
+            int64_t P2     = config.patch_size * config.patch_size;
+
+            auto x_patches = DiT::patchify(ctx->ggml_ctx, x, static_cast<int>(config.patch_size), static_cast<int>(config.patch_size), true);
+            auto t_emb     = t_embedder->forward(ctx, timesteps);
+            // The patch blocks receive SiLU(t_emb); the raw t_emb is reused after the patch stack.
+            auto condition = ggml_silu(ctx->ggml_ctx, t_emb);
+
+            // Text branch: truncate to at most txt_max_length entries, embed, and add the
+            // matching slice of the y_pos_embedding parameter.
+            // NOTE(review): ggml_ext_slice arguments are presumably (dim, start, end) - confirm.
+            GGML_ASSERT(context != nullptr);
+            int64_t Ltxt = std::min<int64_t>(context->ne[1], config.txt_max_length);
+            auto y       = ggml_ext_slice(ctx->ggml_ctx, context, 1, 0, Ltxt);
+            auto y_emb   = y_embedder->forward(ctx, y);
+            auto y_pos   = ggml_ext_slice(ctx->ggml_ctx, params["y_pos_embedding"], 1, 0, Ltxt);
+            y_emb        = ggml_add(ctx->ggml_ctx, y_emb, y_pos);
+
+            // LQ features are computed once, on the patch grid, and consumed inside the loop below.
+            std::vector<ggml_tensor*> lq_features = lq_proj->forward(ctx, lq_latent, Hs, Ws);
+
+            auto s = s_embedder->forward(ctx, x_patches);
+
+            // Patch-level stack. Before block i, the LQ gate is applied when lq_proj reports
+            // it active for i and the corresponding feature tensor exists.
+            for (int i = 0; i < config.patch_depth; ++i) {
+                if (lq_proj->is_gate_active(i)) {
+                    int out_idx = lq_proj->get_output_index(i);
+                    if (out_idx < static_cast<int>(lq_features.size())) {
+                        s = lq_proj->gate(ctx, s, lq_features[out_idx], degrade_sigma, out_idx);
+                    }
+                }
+                auto block = std::dynamic_pointer_cast<MMDiTBlockT2I>(blocks["patch_blocks." + std::to_string(i)]);
+                auto out   = block->forward(ctx,
+                                            s,
+                                            y_emb,
+                                            condition,
+                                            pos_img,
+                                            pos_txt);
+                // Each block returns the updated (patch stream, text stream) pair.
+                s          = out.first;
+                y_emb      = out.second;
+                // NOTE(review): mark_graph_cut tags both streams with a per-block name; its
+                // exact effect is defined outside this view.
+                sd::ggml_graph_cut::mark_graph_cut(s, "pid.patch_blocks." + std::to_string(i), "s");
+                sd::ggml_graph_cut::mark_graph_cut(y_emb, "pid.patch_blocks." + std::to_string(i), "y");
+            }
+            // Add the raw timestep embedding to the patch stream and apply SiLU.
+            s = ggml_silu(ctx->ggml_ctx, ggml_add(ctx->ggml_ctx, s, t_emb));
+
+            // Flatten the patch stream to [hidden_size, L * B]; it is passed to every pixel block.
+            auto s_cond = ggml_reshape_2d(ctx->ggml_ctx, s, config.hidden_size, L * B);
+            auto pixels = pixel_embedder->forward(ctx, x, config.patch_size, pixel_pos_full);
+            for (int i = 0; i < config.pixel_depth; ++i) {
+                auto block = std::dynamic_pointer_cast<PiTBlock>(blocks["pixel_blocks." + std::to_string(i)]);
+                pixels     = block->forward(ctx, pixels, s_cond, H, W, pixel_pos_comp);
+                sd::ggml_graph_cut::mark_graph_cut(pixels, "pid.pixel_blocks." + std::to_string(i), "pixels");
+            }
+
+            // Output head, then regroup as [in_channels * P2, L, B] so unpatchify can
+            // rebuild the image from the Hs x Ws patch grid.
+            pixels   = final_layer->forward(ctx, pixels);
+            pixels   = ggml_reshape_3d(ctx->ggml_ctx, pixels, config.in_channels * P2, L, B);
+            auto out = DiT::unpatchify(ctx->ggml_ctx,
+                                       pixels,
+                                       Hs,
+                                       Ws,
+                                       static_cast<int>(config.patch_size),
+                                       static_cast<int>(config.patch_size),
+                                       false);
+            // Slice dims 1 and 0 back to the pre-padding extents.
+            out      = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, H_orig);
+            out      = ggml_ext_slice(ctx->ggml_ctx, out, 0, 0, W_orig);
+            return out;
+        }
+    };
+
+    struct PiDRunner : public DiffusionModelRunner {
+        PixelDiTConfig config;
+        PixelDiT model;
+        std::vector<float> pos_img_vec;
+        std::vector<float> pos_txt_vec;
+        std::vector<float> pixel_pos_vec;
+        std::vector<float> pixel_pos_comp_vec;
+
+        // Constructs the runner: the model configuration is derived from the tensors in
+        // `tensor_storage_map` under `prefix`, then the model is built from it and
+        // initialised against `params_ctx`.
+        //
+        //   backend             forwarded to DiffusionModelRunner.
+        //   tensor_storage_map  source for config detection and model initialisation.
+        //   prefix              tensor-name prefix, defaults to "model.diffusion_model".
+        //   weight_manager      forwarded to DiffusionModelRunner; may be null.
+        PiDRunner(ggml_backend_t backend,
+                  const String2TensorStorage& tensor_storage_map,
+                  const std::string prefix                            = "model.diffusion_model",
+                  std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
+            : DiffusionModelRunner(backend, prefix, weight_manager),
+              config(PixelDiTConfig::detect_from_weights(tensor_storage_map, prefix)) {
+            // `config` is declared before `model`, so it is already set when the model is rebuilt here.
+            model = PixelDiT(config);
+            model.init(params_ctx, tensor_storage_map, prefix);
+        }
+
+        // Short identifier of this runner.
+        std::string get_desc() override {
+            return std::string("PiD");
+        }
+
+        // Collects the model's parameter tensors into `tensors`, delegating to the
+        // wrapped PixelDiT with the given name prefix.
+        void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
+            this->model.get_param_tensors(tensors, prefix);
+        }
+
+        // Builds the compute graph for one model evaluation.
+        //
+        // The five host tensors become graph inputs via make_input(). Four positional
+        // tables are generated on the host, sized from the padded input extents and the
+        // text length, and attached to freshly created tensors:
+        //   pos_img        2-D rope table for the Hs x Ws patch grid (patch-block head dim)
+        //   pos_txt        1-D rope table for the Ltxt text entries
+        //   pixel_pos      absolute position table for the Hp x Wp padded pixels
+        //   pixel_pos_comp 2-D rope table for the Hs x Ws patch grid (pixel-block head dim)
+        //
+        // The tables live in the *_vec members, and set_backend_tensor_data() is handed
+        // their data() pointers.
+        // NOTE(review): presumably the host pointer must stay valid until the data is
+        // uploaded, which would be why members rather than locals are used - confirm.
+        //
+        // Returns the graph expanded from the model's output tensor.
+        ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,
+                                 const sd::Tensor<float>& timesteps_tensor,
+                                 const sd::Tensor<float>& context_tensor,
+                                 const sd::Tensor<float>& lq_latent_tensor,
+                                 const sd::Tensor<float>& degrade_sigma_tensor) {
+            ggml_cgraph* gf            = new_graph_custom(PID_GRAPH_SIZE);
+            ggml_tensor* x             = make_input(x_tensor);
+            ggml_tensor* timesteps     = make_input(timesteps_tensor);
+            ggml_tensor* context       = make_input(context_tensor);
+            ggml_tensor* lq_latent     = make_input(lq_latent_tensor);
+            ggml_tensor* degrade_sigma = make_input(degrade_sigma_tensor);
+
+            // Spatial extents rounded up to a multiple of patch_size, and the resulting patch grid.
+            int64_t W  = x->ne[0];
+            int64_t H  = x->ne[1];
+            int64_t Wp = align_up(static_cast<int>(W), static_cast<int>(config.patch_size));
+            int64_t Hp = align_up(static_cast<int>(H), static_cast<int>(config.patch_size));
+            int64_t Hs = Hp / config.patch_size;
+            int64_t Ws = Wp / config.patch_size;
+
+            // Rope table for the patch tokens.
+            pos_img_vec  = make_rope_2d(static_cast<int>(Hs),
+                                        static_cast<int>(Ws),
+                                        static_cast<int>(config.hidden_size / config.num_groups),
+                                        10000.f,
+                                        16.f,
+                                        static_cast<int>(config.rope_ref_grid_h),
+                                        static_cast<int>(config.rope_ref_grid_w));
+            auto pos_img = ggml_new_tensor_4d(compute_ctx,
+                                              GGML_TYPE_F32,
+                                              2,
+                                              2,
+                                              config.hidden_size / config.num_groups / 2,
+                                              Hs * Ws);
+            set_backend_tensor_data(pos_img, pos_img_vec.data());
+
+            // Rope table for the text entries; uses the same truncation as PixelDiT::forward.
+            int64_t Ltxt = std::min<int64_t>(context->ne[1], config.txt_max_length);
+            pos_txt_vec  = make_rope_1d(static_cast<int>(Ltxt),
+                                        static_cast<int>(config.hidden_size / config.num_groups),
+                                        config.text_rope_theta);
+            auto pos_txt = ggml_new_tensor_4d(compute_ctx,
+                                              GGML_TYPE_F32,
+                                              2,
+                                              2,
+                                              config.hidden_size / config.num_groups / 2,
+                                              Ltxt);
+            set_backend_tensor_data(pos_txt, pos_txt_vec.data());
+
+            // Absolute position table covering every padded pixel.
+            pixel_pos_vec  = make_pixel_abs_pos(static_cast<int>(Hp),
+                                                static_cast<int>(Wp),
+                                                static_cast<int>(config.pixel_hidden_size));
+            auto pixel_pos = ggml_new_tensor_3d(compute_ctx,
+                                                GGML_TYPE_F32,
+                                                config.pixel_hidden_size,
+                                                Wp * Hp,
+                                                1);
+            set_backend_tensor_data(pixel_pos, pixel_pos_vec.data());
+
+            // Rope table for the pixel blocks, again on the patch grid.
+            pixel_pos_comp_vec  = make_rope_2d(static_cast<int>(Hs),
+                                               static_cast<int>(Ws),
+                                               static_cast<int>(config.pixel_attn_hidden_size / config.pixel_num_groups),
+                                               10000.f,
+                                               16.f,
+                                               static_cast<int>(config.rope_ref_grid_h),
+                                               static_cast<int>(config.rope_ref_grid_w));
+            auto pixel_pos_comp = ggml_new_tensor_4d(compute_ctx,
+                                                     GGML_TYPE_F32,
+                                                     2,
+                                                     2,
+                                                     config.pixel_attn_hidden_size / config.pixel_num_groups / 2,
+                                                     Hs * Ws);
+            set_backend_tensor_data(pixel_pos_comp, pixel_pos_comp_vec.data());
+
+            auto runner_ctx = get_context();
+            auto out        = model.forward(&runner_ctx,
+                                            x,
+                                            timesteps,
+                                            context,
+                                            lq_latent,
+                                            degrade_sigma,
+                                            pos_img,
+                                            pos_txt,
+                                            pixel_pos,
+                                            pixel_pos_comp);
+            ggml_build_forward_expand(gf, out);
+            return gf;
+        }
+
+        // Runs one model evaluation on explicit tensors. The graph is built lazily
+        // through a callback, and the result is passed through
+        // restore_trailing_singleton_dims() together with x.dim().
+        sd::Tensor<float> compute(int n_threads,
+                                  const sd::Tensor<float>& x,
+                                  const sd::Tensor<float>& timesteps,
+                                  const sd::Tensor<float>& context,
+                                  const sd::Tensor<float>& lq_latent,
+                                  const sd::Tensor<float>& degrade_sigma) {
+            // Graph factory handed to GGMLRunner::compute; it only refers to the arguments above.
+            auto graph_factory = [this, &x, &timesteps, &context, &lq_latent, &degrade_sigma]() -> ggml_cgraph* {
+                return build_graph(x, timesteps, context, lq_latent, degrade_sigma);
+            };
+            return restore_trailing_singleton_dims(
+                GGMLRunner::compute<float>(graph_factory, n_threads, false, false, false),
+                x.dim());
+        }
+
+        // DiffusionParams-based override. Requires x, timesteps, context and at least one
+        // reference latent; the first reference latent is passed as the LQ latent and the
+        // degrade sigma is fixed to a single 0.0f value.
+        sd::Tensor<float> compute(int n_threads,
+                                  const DiffusionParams& diffusion_params) override {
+            GGML_ASSERT(diffusion_params.x != nullptr);
+            GGML_ASSERT(diffusion_params.timesteps != nullptr);
+            GGML_ASSERT(diffusion_params.context != nullptr);
+            GGML_ASSERT(diffusion_params.ref_latents != nullptr);
+            GGML_ASSERT(!diffusion_params.ref_latents->empty());
+
+            const auto& lq_latent    = diffusion_params.ref_latents->front();
+            const auto degrade_sigma = sd::Tensor<float>::from_vector({0.0f});
+
+            return compute(n_threads,
+                           *diffusion_params.x,
+                           *diffusion_params.timesteps,
+                           *diffusion_params.context,
+                           lq_latent,
+                           degrade_sigma);
+        }
+    };
+}  // namespace Pid
+
+#endif  // __SD_MODEL_DIFFUSION_PID_HPP__
--- a/src/model/diffusion/qwen_image.hpp
+++ b/src/model/diffusion/qwen_image.hpp
@ -1,14 +1,58 @@
-#ifndef __QWEN_IMAGE_HPP__
-#define __QWEN_IMAGE_HPP__
+#ifndef __SD_MODEL_DIFFUSION_QWEN_IMAGE_HPP__
+#define __SD_MODEL_DIFFUSION_QWEN_IMAGE_HPP__

 #include <memory>

-#include "common_block.hpp"
-#include "flux.hpp"
+#include "model/common/block.hpp"
+#include "model/diffusion/flux.hpp"
+#include "model/diffusion/model.hpp"
+#include "model_loader.h"

 namespace Qwen {
    constexpr int QWEN_IMAGE_GRAPH_SIZE = 20480;

+    // Hyper-parameters of the Qwen image model. The member initialisers are the
+    // defaults; detect_from_weights() recomputes num_layers and may set zero_cond_t.
+    struct QwenImageConfig {
+        int patch_size              = 2;
+        int64_t in_channels         = 64;
+        int64_t out_channels        = 16;
+        int num_layers              = 60;
+        int64_t attention_head_dim  = 128;
+        int64_t num_attention_heads = 24;
+        int64_t joint_attention_dim = 3584;
+        int theta                   = 10000;
+        std::vector<int> axes_dim   = {16, 56, 56};
+        int axes_dim_sum            = 128;
+        bool zero_cond_t            = false;
+
+        // Derives a config from the tensor names that start with `prefix`:
+        //   - num_layers is one more than the highest index found after "transformer_blocks."
+        //     (0 when no such tensor exists);
+        //   - zero_cond_t is set when any name contains "__index_timestep_zero__".
+        // All other fields keep their defaults.
+        static QwenImageConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
+            QwenImageConfig config;
+            config.num_layers = 0;
+
+            for (const auto& entry : tensor_storage_map) {
+                const auto& tensor_name = entry.first;
+                if (starts_with(tensor_name, prefix)) {
+                    if (tensor_name.find("__index_timestep_zero__") != std::string::npos) {
+                        config.zero_cond_t = true;
+                    }
+
+                    const size_t block_pos = tensor_name.find("transformer_blocks.");
+                    if (block_pos != std::string::npos) {
+                        // "transformer_blocks.<index>.<rest>": the second field is the block index.
+                        const auto parts = split_string(tensor_name.substr(block_pos), '.');
+                        if (parts.size() > 1) {
+                            const int layer_count = atoi(parts[1].c_str()) + 1;
+                            if (layer_count > config.num_layers) {
+                                config.num_layers = layer_count;
+                            }
+                        }
+                    }
+                }
+            }
+
+            LOG_DEBUG("qwen_image: num_layers = %d, zero_cond_t = %s",
+                      config.num_layers,
+                      config.zero_cond_t ? "true" : "false");
+            return config;
+        }
+    };
+
    struct TimestepEmbedding : public GGMLBlock {
    public:
        TimestepEmbedding(int64_t in_channels,
@ -95,9 +139,7 @@ namespace Qwen {

            float scale         = 1.f / 32.f;
            bool force_prec_f32 = false;
-#ifdef SD_USE_VULKAN
-            force_prec_f32 = true;
-#endif
+
            // The purpose of the scale here is to prevent NaN issues in certain situations.
            // For example when using CUDA but the weights are k-quants (not all prompts).
            blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_dim, out_bias, false, force_prec_f32, scale));
@ -124,6 +166,10 @@ namespace Qwen {
            auto to_v     = std::dynamic_pointer_cast<Linear>(blocks["to_v"]);
            auto to_out_0 = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);

+            if (sd_backend_is(ctx->backend, "Vulkan")) {
+                to_out_0->set_force_prec_f32(true);
+            }
+
            auto norm_added_q = std::dynamic_pointer_cast<UnaryBlock>(blocks["norm_added_q"]);
            auto norm_added_k = std::dynamic_pointer_cast<UnaryBlock>(blocks["norm_added_k"]);

@ -347,46 +393,32 @@ namespace Qwen {
        }
    };

-    struct QwenImageParams {
-        int patch_size              = 2;
-        int64_t in_channels         = 64;
-        int64_t out_channels        = 16;
-        int num_layers              = 60;
-        int64_t attention_head_dim  = 128;
-        int64_t num_attention_heads = 24;
-        int64_t joint_attention_dim = 3584;
-        int theta                   = 10000;
-        std::vector<int> axes_dim   = {16, 56, 56};
-        int axes_dim_sum            = 128;
-        bool zero_cond_t            = false;
-    };
-
    class QwenImageModel : public GGMLBlock {
    protected:
-        QwenImageParams params;
+        QwenImageConfig config;

    public:
        QwenImageModel() {}
-        QwenImageModel(QwenImageParams params)
-            : params(params) {
-            int64_t inner_dim         = params.num_attention_heads * params.attention_head_dim;
+        QwenImageModel(QwenImageConfig config)
+            : config(config) {
+            int64_t inner_dim         = config.num_attention_heads * config.attention_head_dim;
            blocks["time_text_embed"] = std::shared_ptr<GGMLBlock>(new QwenTimestepProjEmbeddings(inner_dim));
-            blocks["txt_norm"]        = std::shared_ptr<GGMLBlock>(new RMSNorm(params.joint_attention_dim, 1e-6f));
-            blocks["img_in"]          = std::shared_ptr<GGMLBlock>(new Linear(params.in_channels, inner_dim));
-            blocks["txt_in"]          = std::shared_ptr<GGMLBlock>(new Linear(params.joint_attention_dim, inner_dim));
+            blocks["txt_norm"]        = std::shared_ptr<GGMLBlock>(new RMSNorm(config.joint_attention_dim, 1e-6f));
+            blocks["img_in"]          = std::shared_ptr<GGMLBlock>(new Linear(config.in_channels, inner_dim));
+            blocks["txt_in"]          = std::shared_ptr<GGMLBlock>(new Linear(config.joint_attention_dim, inner_dim));

            // blocks
-            for (int i = 0; i < params.num_layers; i++) {
+            for (int i = 0; i < config.num_layers; i++) {
                auto block                                        = std::shared_ptr<GGMLBlock>(new QwenImageTransformerBlock(inner_dim,
-                                                                                                                             params.num_attention_heads,
-                                                                                                                             params.attention_head_dim,
+                                                                                                                             config.num_attention_heads,
+                                                                                                                             config.attention_head_dim,
                                                                                                                             1e-6f,
-                                                                                                                             params.zero_cond_t));
+                                                                                                                             config.zero_cond_t));
                blocks["transformer_blocks." + std::to_string(i)] = block;
            }

            blocks["norm_out"] = std::shared_ptr<GGMLBlock>(new AdaLayerNormContinuous(inner_dim, inner_dim, false, 1e-6f));
-            blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, params.patch_size * params.patch_size * params.out_channels));
+            blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, config.patch_size * config.patch_size * config.out_channels));
        }

        ggml_tensor* forward_orig(GGMLRunnerContext* ctx,
@ -403,23 +435,28 @@ namespace Qwen {
            auto proj_out        = std::dynamic_pointer_cast<Linear>(blocks["proj_out"]);

            auto t_emb = time_text_embed->forward(ctx, timestep);
-            if (params.zero_cond_t) {
+            if (config.zero_cond_t) {
                auto t_emb_0 = time_text_embed->forward(ctx, ggml_ext_zeros_like(ctx->ggml_ctx, timestep));
                t_emb        = ggml_concat(ctx->ggml_ctx, t_emb, t_emb_0, 1);
            }
            auto img = img_in->forward(ctx, x);
            auto txt = txt_norm->forward(ctx, context);
            txt      = txt_in->forward(ctx, txt);
+            sd::ggml_graph_cut::mark_graph_cut(img, "qwen_image.prelude", "img");
+            sd::ggml_graph_cut::mark_graph_cut(txt, "qwen_image.prelude", "txt");
+            // sd::ggml_graph_cut::mark_graph_cut(t_emb, "qwen_image.prelude", "t_emb");

-            for (int i = 0; i < params.num_layers; i++) {
+            for (int i = 0; i < config.num_layers; i++) {
                auto block = std::dynamic_pointer_cast<QwenImageTransformerBlock>(blocks["transformer_blocks." + std::to_string(i)]);

                auto result = block->forward(ctx, img, txt, t_emb, pe, modulate_index);
                img         = result.first;
                txt         = result.second;
+                sd::ggml_graph_cut::mark_graph_cut(img, "qwen_image.transformer_blocks." + std::to_string(i), "img");
+                sd::ggml_graph_cut::mark_graph_cut(txt, "qwen_image.transformer_blocks." + std::to_string(i), "txt");
            }

-            if (params.zero_cond_t) {
+            if (config.zero_cond_t) {
                t_emb = ggml_ext_chunk(ctx->ggml_ctx, t_emb, 2, 1)[0];
            }

@ -448,12 +485,12 @@ namespace Qwen {
            int64_t C = x->ne[2];
            int64_t N = x->ne[3];

-            auto img           = DiT::pad_and_patchify(ctx, x, params.patch_size, params.patch_size);
+            auto img           = DiT::pad_and_patchify(ctx, x, config.patch_size, config.patch_size);
            int64_t img_tokens = img->ne[1];

            if (ref_latents.size() > 0) {
                for (ggml_tensor* ref : ref_latents) {
-                    ref = DiT::pad_and_patchify(ctx, ref, params.patch_size, params.patch_size);
+                    ref = DiT::pad_and_patchify(ctx, ref, config.patch_size, config.patch_size);
                    img = ggml_concat(ctx->ggml_ctx, img, ref, 1);
                }
            }
@ -466,54 +503,30 @@ namespace Qwen {
                out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, out, 0, 2, 1, 3));  // [N, h*w, C * patch_size * patch_size]
            }

-            out = DiT::unpatchify_and_crop(ctx->ggml_ctx, out, H, W, params.patch_size, params.patch_size);  // [N, C, H, W]
+            out = DiT::unpatchify_and_crop(ctx->ggml_ctx, out, H, W, config.patch_size, config.patch_size);  // [N, C, H, W]

            return out;
        }
    };

-    struct QwenImageRunner : public GGMLRunner {
+    struct QwenImageRunner : public DiffusionModelRunner {
    public:
-        QwenImageParams qwen_image_params;
+        QwenImageConfig config;
        QwenImageModel qwen_image;
        std::vector<float> pe_vec;
        std::vector<float> modulate_index_vec;
        SDVersion version;

        QwenImageRunner(ggml_backend_t backend,
-                        bool offload_params_to_cpu,
                        const String2TensorStorage& tensor_storage_map      = {},
                        const std::string prefix                            = "",
                        SDVersion version                                   = VERSION_QWEN_IMAGE,
-                        bool zero_cond_t                               = false)
-            : GGMLRunner(backend, offload_params_to_cpu) {
-            qwen_image_params.num_layers  = 0;
-            qwen_image_params.zero_cond_t = zero_cond_t;
-            for (auto pair : tensor_storage_map) {
-                std::string tensor_name = pair.first;
-                if (tensor_name.find(prefix) == std::string::npos)
-                    continue;
-                if (tensor_name.find("__index_timestep_zero__") != std::string::npos) {
-                    qwen_image_params.zero_cond_t = true;
-                }
-                size_t pos = tensor_name.find("transformer_blocks.");
-                if (pos != std::string::npos) {
-                    tensor_name = tensor_name.substr(pos);  // remove prefix
-                    auto items  = split_string(tensor_name, '.');
-                    if (items.size() > 1) {
-                        int block_index = atoi(items[1].c_str());
-                        if (block_index + 1 > qwen_image_params.num_layers) {
-                            qwen_image_params.num_layers = block_index + 1;
-                        }
-                    }
-                    continue;
-                }
-            }
-            LOG_INFO("qwen_image_params.num_layers: %ld", qwen_image_params.num_layers);
-            if (qwen_image_params.zero_cond_t) {
-                LOG_INFO("use zero_cond_t");
-            }
-            qwen_image = QwenImageModel(qwen_image_params);
+                        bool zero_cond_t                                    = false,
+                        std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
+            : DiffusionModelRunner(backend, prefix, weight_manager),
+              config(QwenImageConfig::detect_from_weights(tensor_storage_map, prefix)) {
+            config.zero_cond_t = config.zero_cond_t || zero_cond_t;
+            qwen_image         = QwenImageModel(config);
            qwen_image.init(params_ctx, tensor_storage_map, prefix);
        }

@ -521,7 +534,7 @@ namespace Qwen {
            return "qwen_image";
        }

-        void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
+        void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
            qwen_image.get_param_tensors(tensors, prefix);
        }

@ -544,36 +557,36 @@ namespace Qwen {

            pe_vec      = Rope::gen_qwen_image_pe(static_cast<int>(x->ne[1]),
                                                  static_cast<int>(x->ne[0]),
-                                                  qwen_image_params.patch_size,
+                                                  config.patch_size,
                                                  static_cast<int>(x->ne[3]),
                                                  static_cast<int>(context->ne[1]),
                                                  ref_latents,
                                                  increase_ref_index,
-                                                  qwen_image_params.theta,
+                                                  config.theta,
                                                  circular_y_enabled,
                                                  circular_x_enabled,
-                                                  qwen_image_params.axes_dim);
-            int pos_len = static_cast<int>(pe_vec.size() / qwen_image_params.axes_dim_sum / 2);
+                                                  config.axes_dim);
+            int pos_len = static_cast<int>(pe_vec.size() / config.axes_dim_sum / 2);
            // LOG_DEBUG("pos_len %d", pos_len);
-            auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, qwen_image_params.axes_dim_sum / 2, pos_len);
+            auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.axes_dim_sum / 2, pos_len);
            // pe->data = pe_vec.data();
            // print_ggml_tensor(pe, true, "pe");
            // pe->data = nullptr;
            set_backend_tensor_data(pe, pe_vec.data());

            ggml_tensor* modulate_index = nullptr;
-            if (qwen_image_params.zero_cond_t) {
+            if (config.zero_cond_t) {
                modulate_index_vec.clear();

-                int64_t h_len          = ((x->ne[1] + (qwen_image_params.patch_size / 2)) / qwen_image_params.patch_size);
-                int64_t w_len          = ((x->ne[0] + (qwen_image_params.patch_size / 2)) / qwen_image_params.patch_size);
+                int64_t h_len          = ((x->ne[1] + (config.patch_size / 2)) / config.patch_size);
+                int64_t w_len          = ((x->ne[0] + (config.patch_size / 2)) / config.patch_size);
                int64_t num_img_tokens = h_len * w_len;

                modulate_index_vec.insert(modulate_index_vec.end(), num_img_tokens, 0.f);
                int64_t num_ref_img_tokens = 0;
                for (ggml_tensor* ref : ref_latents) {
-                    int64_t h_len = ((ref->ne[1] + (qwen_image_params.patch_size / 2)) / qwen_image_params.patch_size);
-                    int64_t w_len = ((ref->ne[0] + (qwen_image_params.patch_size / 2)) / qwen_image_params.patch_size);
+                    int64_t h_len = ((ref->ne[1] + (config.patch_size / 2)) / config.patch_size);
+                    int64_t w_len = ((ref->ne[0] + (config.patch_size / 2)) / config.patch_size);

                    num_ref_img_tokens += h_len * w_len;
                }
@ -614,7 +627,20 @@ namespace Qwen {
                return build_graph(x, timesteps, context, ref_latents, increase_ref_index);
            };

-            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
+            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
+        }
+
+        // DiffusionParams-based override. Requires x and timesteps; the context is passed
+        // through tensor_or_empty(), and an empty list stands in for missing reference
+        // latents before forwarding to the tensor-based compute() overload.
+        sd::Tensor<float> compute(int n_threads,
+                                  const DiffusionParams& diffusion_params) override {
+            GGML_ASSERT(diffusion_params.x != nullptr);
+            GGML_ASSERT(diffusion_params.timesteps != nullptr);
+
+            // Shared fallback used when the caller supplied no reference latents.
+            static const std::vector<sd::Tensor<float>> no_ref_latents;
+            const auto& ref_latents = diffusion_params.ref_latents ? *diffusion_params.ref_latents : no_ref_latents;
+
+            return compute(n_threads,
+                           *diffusion_params.x,
+                           *diffusion_params.timesteps,
+                           tensor_or_empty(diffusion_params.context),
+                           ref_latents,
+                           diffusion_params.increase_ref_index);
         }

        void test() {
@ -662,10 +688,11 @@ namespace Qwen {
            // cuda q8: pass
            // cuda q8 fa: pass
            // ggml_backend_t backend    = ggml_backend_cuda_init(0);
-            ggml_backend_t backend    = ggml_backend_cpu_init();
+            ggml_backend_t backend    = sd_backend_cpu_init();
            ggml_type model_data_type = GGML_TYPE_Q8_0;

-            ModelLoader model_loader;
+            auto model_manager        = std::make_shared<ModelManager>();
+            ModelLoader& model_loader = model_manager->loader();
            if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) {
                LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
                return;
@ -679,19 +706,20 @@ namespace Qwen {
            }

            std::shared_ptr<QwenImageRunner> qwen_image = std::make_shared<QwenImageRunner>(backend,
-                                                                                            false,
                                                                                            tensor_storage_map,
                                                                                            "model.diffusion_model",
-                                                                                            VERSION_QWEN_IMAGE);
+                                                                                            VERSION_QWEN_IMAGE,
+                                                                                            false,
+                                                                                            model_manager);

-            qwen_image->alloc_params_buffer();
-            std::map<std::string, ggml_tensor*> tensors;
-            qwen_image->get_param_tensors(tensors, "model.diffusion_model");
-
-            bool success = model_loader.load_tensors(tensors);
-
-            if (!success) {
-                LOG_ERROR("load tensors from model loader failed");
+            if (!model_manager->register_runner_params("Qwen image test",
+                                                       *qwen_image,
+                                                       "model.diffusion_model",
+                                                       ModelManager::ResidencyMode::ParamBackend,
+                                                       backend,
+                                                       backend) ||
+                !model_manager->validate_registered_tensors()) {
+                LOG_ERROR("register qwen_image tensors with model manager failed");
                return;
            }

@ -702,4 +730,4 @@ namespace Qwen {

 }  // namespace name

-#endif  // __QWEN_IMAGE_HPP__
+#endif  // __SD_MODEL_DIFFUSION_QWEN_IMAGE_HPP__
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
leejet	bb90bfa00f	feat: support backend-specific max-vram budgets	2026-06-14 22:46:32 +08:00
leejet	517abc777d	sync: update ggml (#1656 )	2026-06-14 20:45:05 +08:00
leejet	6f00939f75	docs: refresh README guide links	2026-06-14 17:58:58 +08:00
stduhpf	c2df4e1228	feat: add RPC support (#1629 )	2026-06-14 17:30:23 +08:00
leejet	9838264c49	refactor: simplify ControlNet output caching (#1655 )	2026-06-14 16:58:37 +08:00
leejet	17d70b91e6	docs: replace example option lists with help commands	2026-06-14 16:55:15 +08:00
leejet	5db680c2c7	refactor: route cpu placement through backend specs (#1654 )	2026-06-14 15:52:24 +08:00
leejet	749186c0eb	refactor: remove vae_decode_only context flag (#1653 )	2026-06-14 15:23:29 +08:00
leejet	bdb431ad95	feat: support disk params backend (#1651 )	2026-06-14 14:48:50 +08:00
leejet	276025e054	fix: mark LoKR w2_a tensor as applied (#1650 )	2026-06-14 02:11:02 +08:00
leejet	8d4c7af95b	refactor: route all runner params through model manager (#1649 )	2026-06-14 02:05:23 +08:00
leejet	9b0fceb41b	refactor: manage upscaler params through model manager (#1645 )	2026-06-13 15:39:57 +08:00
leejet	563137a592	refactor: centralize runner weight staging and cleanup (#1644 )	2026-06-13 13:19:13 +08:00
Wyatt Caldwell	3a54597776	fix: SD3 conditioning crash when clip_l text encoder is missing (#1638 )	2026-06-13 13:16:59 +08:00
Cyberhan123	1365008348	chore: add script for automatic code formatting (#1636 )	2026-06-13 13:13:07 +08:00
Cyberhan123	1fb6b22850	feat: add free_sd_images function to manage memory for C API (#1633 )	2026-06-13 13:08:14 +08:00
stduhpf	c20769b2c8	feat: add circular RoPE support for ideogram4 (#1627 )	2026-06-13 13:06:34 +08:00
RapidMark	1b702a51e7	fix: correct mask shape for masked flash attention (#1625 )	2026-06-13 13:01:20 +08:00
RapidMark	19bdfe22d2	feat: set tensor names on block params (#1622 )	2026-06-08 23:25:52 +08:00
stduhpf	138da14cc3	apg: normalize diff_norm calculation by tensor size (#1620 )	2026-06-08 21:56:15 +08:00
fszontagh	17a2b4a315	perf: cap planner budget when model dwarfs the streaming budget (#1612 )	2026-06-08 21:53:54 +08:00
leejet	b3d56d0ba1	refactor: split model loader from model definitions (#1619 )	2026-06-07 23:20:12 +08:00
leejet	2a07540c2a	refactor: move photomaker into generation extension (#1618 )	2026-06-07 22:40:02 +08:00
Wagner Bruna	81abfb2548	chore: rename and reformat gits_noise.inl (#1617 )	2026-06-07 22:30:20 +08:00
leejet	f3fd359b58	refactor: reorganize src model layout (#1615 )	2026-06-07 03:21:12 +08:00
leejet	dfb2390dd4	refactor: extract Wan VAE implementation (#1614 )	2026-06-07 01:33:49 +08:00
leejet	cfbc19d186	refactor: unify model config detection (#1613 )	2026-06-07 01:05:12 +08:00
leejet	b9254dda0d	feat: add ideogram4 support (#1609 )	2026-06-06 16:34:16 +08:00
fszontagh	0648f4426b	perf: ratchet streaming budget so plan stops re-merging every step (#1611 )	2026-06-06 16:32:03 +08:00
YOSHIDA Keiji	74f513d512	fix: Suppress spurious error output for --help (#1607 ) (#1608 ) Signed-off-by: kei-g <km.8k6ce+github@gmail.com>	2026-06-06 16:23:44 +08:00
fszontagh	064001b524	perf: allocate CPU-offloaded params from runtime device pinned host buffer (#1601 )	2026-06-06 16:22:18 +08:00
leejet	1f9ee88e09	fix: zero Wan2.2 TI2V timesteps for fixed frames (#1604 )	2026-06-03 23:32:31 +08:00
fszontagh	a7f2e03da4	perf: keep chunk-K residency engaged with runtime LoRA (#1598 )	2026-06-03 23:12:00 +08:00
stduhpf	4513e3fda9	refactor: img-cond->img_uncond (#1594 ) * refactor: img-cond->img_uncond * align APG and CFG++ with img-uncond CFG * set default img_cfg to 1.f --------- Co-authored-by: leejet <leejet714@gmail.com>	2026-06-03 22:57:42 +08:00
leejet	2d40a8b2ad	feat: make Wan2.2 5B FLF2V work (#1110 )	2026-06-02 23:16:09 +08:00
leejet	9c7f9a20b3	chore: embed server web UI in Docker images (#1597 )	2026-06-02 22:46:25 +08:00
fszontagh	ed74577c40	feat: `--stream-layers` for streaming weights from CPU during generation (#1576 )	2026-06-02 22:35:28 +08:00
RapidMark	7948df8ac1	fix(cmake): build HIP backend with PIC so the static-lib PIE link succeeds (#1593 )	2026-06-02 00:07:48 +08:00
Wagner Bruna	02f06370a7	refactor: call CPU backend functions dynamically (#1591 ) Co-authored-by: leejet <leejet714@gmail.com>	2026-06-01 23:41:21 +08:00
stduhpf	f8935d6f25	feat: support img-cfg for edit models (#929 ) Co-authored-by: leejet <leejet714@gmail.com>	2026-06-01 22:54:25 +08:00
stduhpf	be65ac7511	feat: add support for APG (adaptive projected guidance) + unconditionnal SLG (#593 )	2026-06-01 00:55:49 +08:00
leejet	20901f6d8e	fix: remove kv padding from flash attention wrapper (#1453 )	2026-05-31 23:23:19 +08:00
leejet	0982807139	feat: add PiD support (#1585 )	2026-05-31 22:38:39 +08:00
leejet	d2797b8667	fix: correct Gemma3 rope settings and vram limit propagation (#1583 )	2026-05-30 22:23:49 +08:00
leejet	d3b2cb047e	fix: split tokens before normalization (#1582 )	2026-05-30 18:38:46 +08:00
akleine	b4ba55d8d7	fix: prevent crash in case of a mem alloc error and graceful exit (#1566 )	2026-05-30 18:34:07 +08:00
Wagner Bruna	b54bd83a3f	fix: explicitly exclude f8, f64 and i64 tensors from mmap (#1575 )	2026-05-30 18:31:08 +08:00
Wagner Bruna	0e4ee04488	fix: correct tae for models that use the flux2 vae (#1571 )	2026-05-28 09:13:16 +08:00
leejet	29ab511fc7	fix: resolve LLM norm tensor names by architecture (#1570 )	2026-05-28 00:36:16 +08:00
leejet	55c2aed52c	refactor: simplify diffusion model runner params (#1569 )	2026-05-28 00:12:35 +08:00
leejet	8eded497e5	fix: preserve frontend tooling in ROCm CI build (#1568 )	2026-05-27 21:26:16 +08:00
leejet	92dc7268fc	feat: add microsoft lens support (#1560 )	2026-05-27 01:04:17 +08:00
schirik	07b2b18e70	fix: skip permission denied errors in recursive_directory_iterator (#1564 ) Co-authored-by: Serge F. Chirik <s.chirik@timbel.info>	2026-05-27 00:56:16 +08:00
leejet	1ceb5bd9df	fix: package ROCm BLAS runtime in Windows artifacts (#1562 )	2026-05-26 00:57:37 +08:00
leejet	202c6154a2	fix: use flux flow prediction for LTXAV (#1561 )	2026-05-26 00:23:39 +08:00
stduhpf	a397e03488	feat: add Longcat-Image / Longcat-Image-Edit support (#1053 ) Co-authored-by: leejet <leejet714@gmail.com>	2026-05-24 02:02:02 +08:00
leejet	72e512a0cc	fix: make macOS binaries use relocatable rpaths (#1552 )	2026-05-23 12:27:06 +08:00
leejet	0baf721215	feat: add LTX temporal latent upscaler support (#1551 )	2026-05-23 01:35:13 +08:00
leejet	645e6e9089	feat: add LTX rational latent upscaler (#1549 )	2026-05-23 00:28:15 +08:00
stduhpf	cbf92191c3	fix: strip trailing latent channels for preview decode (#1548 )	2026-05-23 00:26:40 +08:00
stduhpf	8cf55a3b3b	fix: load TAESD preview-only model correctly (#1547 )	2026-05-23 00:22:35 +08:00
leejet	3a8788cb7d	refactor: unify extra argument parsing (#1540 )	2026-05-22 01:00:03 +08:00
leejet	449165caf5	feat: stream LTX VAE temporal tile decoding (#1539 )	2026-05-22 00:25:04 +08:00
stduhpf	adaa599a3b	Feat: Temporal tile custom size with overlap (#1510 ) * Temporal tile size + overlap * add --extra-tiling-args support --------- Co-authored-by: leejet <leejet714@gmail.com>	2026-05-21 23:44:12 +08:00
leejet	2e3514625a	perf: run LTX audio VAE decode in one ggml graph (#1538 )	2026-05-21 22:43:14 +08:00
stduhpf	47d8198b69	feat: add taeltx2_3_wide support (#1535 )	2026-05-21 22:34:12 +08:00
leejet	ef92a0027e	feat: add graph cut markers for LTXAV transformer (#1534 )	2026-05-20 23:22:10 +08:00
leejet	b3374e6a71	feat: add LTX spatial latent upscale hires support (#1533 )	2026-05-20 22:27:09 +08:00
stduhpf	bdd937f29a	feat: add taeltx2/taeltx2.3 support (#1531 )	2026-05-20 22:14:05 +08:00
stduhpf	c51ec7cad9	fix: always load runtimle lora params on runtime backend (#1532 )	2026-05-20 22:13:15 +08:00
leejet	5b0267e941	fix: avoid Vulkan f16 repeat in LTX audio VAE (#1528 )	2026-05-19 23:15:26 +08:00
leejet	0045a72b96	fix: trigger ci for docker image changes (#1527 )	2026-05-19 22:05:03 +08:00
leejet	99bd062546	fix: update sycl docker image to oneapi 2025.3 (#1526 )	2026-05-19 21:59:15 +08:00
leejet	9d8c9e4279	fix: build web UI for Windows ROCm server releases (#1525 )	2026-05-19 21:53:48 +08:00
George Sofianos	caa823a8c0	ci: add RDNA1 + RDNA2 targets for ROCm 7.13 (#1511 )	2026-05-19 01:38:02 +08:00
leejet	22c8c40b0d	sync: update ggml (#1520 )	2026-05-19 01:30:11 +08:00
leejet	b706d682ad	fix: restore singleton dims for LLM outputs (#1518 )	2026-05-18 23:47:10 +08:00
leejet	b758b7de13	fix: only enable TAE after successful load (#1517 )	2026-05-18 23:32:03 +08:00
Wagner Bruna	f683c88a28	feat: make negative max_vram control the amount of spare vram (#1503 )	2026-05-18 23:00:06 +08:00
Christoph	21fd4e6788	ci: add CUDA Docker image support for NVIDIA Spark GB10 (#1512 )	2026-05-18 22:52:01 +08:00
leejet	830804262b	docs: update news	2026-05-18 00:24:29 +08:00
leejet	82e03ef137	ci: add inactive pr clean up workflow	2026-05-18 00:09:45 +08:00
leejet	baf7eda1e4	refactor: minify vocab files (#1509 )	2026-05-17 23:06:58 +08:00
Wagner Bruna	e7eb92fd84	feat: add Gradient Estimation sampler (#1484 )	2026-05-17 22:54:28 +08:00
leejet	50134e51dd	refactor: split guidance composition (#1506 )	2026-05-17 20:20:16 +08:00
leejet	e43b24cf48	feat: add ltx2.3 flf2v support (#1505 )	2026-05-17 18:40:14 +08:00
stduhpf	06accf2b39	feat: add ltxav latent2rgb projection matrix (#1502 )	2026-05-17 17:52:05 +08:00
stduhpf	cde20d5ef0	fix: handle stereo format in sd_audio (#1489 ) Co-authored-by: leejet <leejet714@gmail.com>	2026-05-17 16:55:39 +08:00
leejet	67dda3f897	feat: add ltx2.3 support (#1463 ) * add GemmaTokenizer * add basic ltx2.3 support * change vocab file encoding * fix ci * fix ubuntu build * add temporal tiling support * add ltx audio support * update ggml submodule url * fix generate_video * add i2v support * minify bundled Gemma tokenizer vocab sources * pass video fps into temporal rope embeddings * fix av_ca_timestep_scale_multiplier * add LTX2Scheduler support * update docs * fix ci	2026-05-17 16:46:20 +08:00
Mario Limonciello	3b4d26f3d9	ci: update ROCm builds for Windows and Linux to use ROCm 7.13 (#1504 )	2026-05-17 16:32:19 +08:00
Taylor	bd17f53b73	docs: update zit example to 8 steps (#1294 )	2026-05-16 21:32:03 +08:00
leejet	d7ecbe1d01	fix: avoid repeated T5 EOS tokens in Anima prompt weights (#1501 )	2026-05-16 21:22:46 +08:00
leejet	36330724bd	feat: add module backend assignment support (#1500 ) Co-authored-by: Stéphane du Hamel <stephduh@live.fr>	2026-05-16 20:27:06 +08:00
Mario Limonciello	0c1ca170ca	ci: update ROCm Windows builds (#1282 )	2026-05-16 20:25:38 +08:00
Mario Limonciello	839f6a94d2	ci: switch over ROCm builds to artifacts both for stable and preview releases (#1281 )	2026-05-16 20:23:26 +08:00
leejet	38b14adb67	feat: auto-detect max VRAM budget with --max-vram -1 (#1498 )	2026-05-16 16:14:25 +08:00
Wagner Bruna	fd1a2794f3	refactor: unify Euler, Euler Ancestral and DDIM implementations (#1474 )	2026-05-16 16:13:28 +08:00
cphlipot	db08b84607	fix: Fix broken GCC 16 build (enforce C11/C++17 compile ) (#1478 )	2026-05-16 16:10:16 +08:00
Wagner Bruna	686856edca	chore: do not report the fake VAE "allocation" as an error (#1494 )	2026-05-16 16:08:31 +08:00
leejet	0b8296915c	docs: add .github/pull_request_template.md	2026-05-15 01:16:21 +08:00
leejet	381e0df50f	docs: add CONTRIBUTING.md	2026-05-15 01:09:45 +08:00
leejet	0665a7f8bf	feat: add hidream o1 image support (#1485 )	2026-05-15 00:40:21 +08:00
Craig Andrews	eeac950b44	fix: Use PkgConfig for WebP and WebM (#1400 )	2026-05-15 00:31:10 +08:00
Wagner Bruna	57ff2eb0f4	feat: support for memory-mapping model weights (#1414 ) Co-authored-by: Piotr Wilkin <piotr.wilkin@syndatis.com> Co-authored-by: Junmo Kim <me@junmo.kim> Co-authored-by: leejet <leejet714@gmail.com>	2026-05-15 00:30:03 +08:00
Daniele	9d683417cb	feat: add Euler CFG++ and Euler-A CFG++ samplers (#1354 )	2026-05-15 00:29:04 +08:00
l8bloom	60477fd50f	docs: add new go bindings for stable-diffusion.cpp (#1480 )	2026-05-14 23:59:06 +08:00
cphlipot	6ee0684d74	feat: display server url with "http://" prefix. (#1486 )	2026-05-14 23:57:22 +08:00
leejet	90e87bc846	feat: add max-vram based segmented param offload (#1476 )	2026-05-06 21:56:02 +08:00
Wagner Bruna	586b6f1481	feat: adapt res samplers for flow models for eta > 0 (#1436 )	2026-05-06 21:49:06 +08:00
fszontagh	9097ce5211	fix: skip empty MultiLoraAdapter when no LoRAs target a model (#1469 )	2026-05-06 21:45:47 +08:00
leejet	3d6064b37e	perf: speed up tensor_to_sd_image conversion (#1466 )	2026-04-30 01:13:56 +08:00
Wagner Bruna	b8079e253d	feat: transition from compile-time to runtime backend discovery (#1448 ) Co-authored-by: Stéphane du Hamel <stephduh@live.fr> Co-authored-by: Cyberhan123 <255542417@qq.com> Co-authored-by: leejet <leejet714@gmail.com>	2026-04-29 23:26:57 +08:00
Wagner Bruna	331cfa5387	fix: release VAE compute buffer after tiled encoding (#1465 )	2026-04-29 22:25:30 +08:00
Douglas Griffith	a81677f59c	docs: performance tips markup (#1460 )	2026-04-27 22:55:30 +08:00
leejet	f40a707d0f	feat: add sdcpp-specific generation metadata to image outputs (#1462 )	2026-04-27 22:43:13 +08:00
akleine	970c4a3312	chore: replace some NULL with nullptr + use "%zu" for printing some size_t data (#1457 )	2026-04-27 22:42:57 +08:00
leejet	b8bdffc199	feat: add more built-in highres upscalers (#1456 )	2026-04-23 22:17:58 +08:00
leejet	c97702e105	feat: add sd-webui style Hires. fix support (#1451 )	2026-04-22 23:51:09 +08:00
leejet	44cca3d626	feat: support safetensors export in convert mode (#1444 )	2026-04-20 00:22:11 +08:00
leejet	0a7ae07f94	feat: add restricted torch legacy checkpoint loading (#1443 )	2026-04-19 23:09:43 +08:00
leejet	66143340b6	refactor: move model file IO into dedicated module (#1442 )	2026-04-19 17:52:56 +08:00
Wagner Bruna	7023fc4cfb	fix: correct image to image DDIM and TCD (#1410 )	2026-04-19 17:51:28 +08:00
Wagner Bruna	e77e4c46bf	feat: adapt LCM for flow models (#1413 )	2026-04-19 17:49:46 +08:00
leejet	7d33d4b2dd	chore: enable MSVC parallel compilation with /MP (#1438 )	2026-04-18 15:44:43 +08:00
leejet	3c99f700de	ci: skip docker image build job on pull requests (#1439 )	2026-04-18 15:25:04 +08:00
leejet	4d626d24b2	feat(server): implement vid_gen async API and mode-aware capabilities (#1437 )	2026-04-18 15:06:36 +08:00
Wagner Bruna	f3f69e2fbe	feat: add DPM++ (2S) Ancestral implementation for flow models (#1428 )	2026-04-18 15:05:09 +08:00
Erik Scholz	6a9cb31150	fix: tune ernie-image default flow shift (#1433 )	2026-04-18 14:58:00 +08:00