fix: resolve embedding loading issue when calling generate_image multiple times (#1078 )

fix: avoid crash loading LoRAs with bf16 weights (#1077 )
feat: align the spatial size to the corresponding multiple (#1073 )
2025-12-13 05:48:56 +00:00 · 2025-12-12 23:08:12 +08:00 · 2025-12-12 22:36:54 +08:00 · 2025-12-10 23:15:08 +08:00 · 2025-12-10 22:25:19 +08:00 · 2025-12-10 00:26:07 +08:00
118 changed files with 1415128 additions and 5931 deletions
--- a/.clang-tidy
+++ b/.clang-tidy
@ -0,0 +1,10 @@
+Checks: >
+  modernize-make-shared,
+  modernize-use-nullptr,
+  modernize-use-override,
+  modernize-pass-by-value,
+  modernize-return-braced-init-list,
+  modernize-deprecated-headers,
+HeaderFilterRegex: '^$'
+WarningsAsErrors: ''
+FormatStyle: none
--- a/.github/ISSUE_TEMPLATE/bug_report.yml
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@ -0,0 +1,73 @@
+name: 🐞 Bug Report
+description: Report a bug or unexpected behavior
+title: "[Bug] "
+labels: ["bug"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Please use this template and include as many details as possible to help us reproduce and fix the issue.
+  - type: textarea
+    id: commit
+    attributes:
+      label: Git commit
+      description: Which commit are you trying to compile?
+      placeholder: |
+        $git rev-parse HEAD
+        40a6a8710ec15b1b5db6b5a098409f6bc8f654a4
+    validations:
+      required: true
+  - type: input
+    id: os
+    attributes:
+      label: Operating System & Version
+      placeholder: e.g. “Ubuntu 22.04”, “Windows 11 23H2”, “macOS 14.3”
+    validations:
+      required: true
+  - type: dropdown
+    id: backends
+    attributes:
+        label: GGML backends
+        description: Which GGML backends do you know to be affected?
+        options: [CPU, CUDA, HIP, Metal, Musa, SYCL, Vulkan, OpenCL]
+        multiple: true
+    validations:
+      required: true
+  - type: input
+    id: cmd_arguments
+    attributes:
+      label: Command-line arguments used
+      placeholder: The full command line you ran (with all flags)
+    validations:
+      required: true
+  - type: textarea
+    id: steps_to_reproduce
+    attributes:
+      label: Steps to reproduce
+      placeholder: A step-by-step list of what you did
+    validations:
+      required: true
+  - type: textarea
+    id: expected_behavior
+    attributes:
+      label: What you expected to happen
+      placeholder: Describe the expected behavior or result
+    validations:
+      required: true
+  - type: textarea
+    id: actual_behavior
+    attributes:
+      label: What actually happened
+      placeholder: Describe what you saw instead (errors, logs, crash, etc.)
+    validations:
+      required: true
+  - type: textarea
+    id: logs_and_errors
+    attributes:
+      label: Logs / error messages / stack trace
+      placeholder: Paste complete logs or error output
+  - type: textarea
+    id: additional_info
+    attributes:
+      label: Additional context / environment details
+      placeholder: e.g. CPU model, GPU, RAM, model file versions, quantization type, etc.
--- a/.github/ISSUE_TEMPLATE/feature_request.yml
+++ b/.github/ISSUE_TEMPLATE/feature_request.yml
@ -0,0 +1,33 @@
+name: 💡 Feature Request
+description: Suggest a new feature or improvement
+title: "[Feature] "
+labels: ["enhancement"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Thank you for suggesting an improvement! Please fill in the fields below.
+  - type: input
+    id: summary
+    attributes:
+      label: Feature Summary
+      placeholder: A one-line summary of the feature you’d like
+    validations:
+      required: true
+  - type: textarea
+    id: description
+    attributes:
+      label: Detailed Description
+      placeholder: What problem does this solve? How do you expect it to work?
+    validations:
+      required: true
+  - type: textarea
+    id: alternatives
+    attributes:
+      label: Alternatives you considered
+      placeholder: Any alternative designs or workarounds you tried
+  - type: textarea
+    id: additional_context
+    attributes:
+      label: Additional context
+      placeholder: Any extra information (use cases, related functionalities, constraints)
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -65,7 +65,7 @@ jobs:

      - name: Get commit hash
        id: commit
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }}
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        uses: pr-mpt/actions-commit-hash@v2

      - name: Fetch system info
@ -118,7 +118,7 @@ jobs:

      - name: Get commit hash
        id: commit
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }}
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        uses: pr-mpt/actions-commit-hash@v2

      - name: Fetch system info
@ -146,26 +146,24 @@ jobs:
            sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip

  windows-latest-cmake:
-    runs-on: windows-2019
+    runs-on: windows-2025

    env:
-      VULKAN_VERSION: 1.3.261.1
+      VULKAN_VERSION: 1.4.328.1

    strategy:
      matrix:
        include:
          - build: "noavx"
-            defines: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DSD_BUILD_SHARED_LIBS=ON"
+            defines: "-DGGML_NATIVE=OFF -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DSD_BUILD_SHARED_LIBS=ON"
          - build: "avx2"
-            defines: "-DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
+            defines: "-DGGML_NATIVE=OFF -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
          - build: "avx"
-            defines: "-DGGML_AVX2=OFF -DSD_BUILD_SHARED_LIBS=ON"
+            defines: "-DGGML_NATIVE=OFF -DGGML_AVX=ON -DGGML_AVX2=OFF -DSD_BUILD_SHARED_LIBS=ON"
          - build: "avx512"
-            defines: "-DGGML_AVX512=ON -DSD_BUILD_SHARED_LIBS=ON"
+            defines: "-DGGML_NATIVE=OFF -DGGML_AVX512=ON -DGGML_AVX=ON -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
          - build: "cuda12"
-            defines: "-DSD_CUBLAS=ON -DSD_BUILD_SHARED_LIBS=ON"
-          - build: "rocm5.5"
-            defines: '-G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1100;gfx1102;gfx1030" -DSD_BUILD_SHARED_LIBS=ON'
+            defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES='61;70;75;80;86;89;90;100;120'"
          - build: 'vulkan'
            defines: "-DSD_VULKAN=ON -DSD_BUILD_SHARED_LIBS=ON"
    steps:
@ -178,30 +176,17 @@ jobs:
      - name: Install cuda-toolkit
        id: cuda-toolkit
        if: ${{ matrix.build == 'cuda12' }}
-        uses: Jimver/cuda-toolkit@v0.2.11
+        uses: Jimver/cuda-toolkit@v0.2.22
        with:
-          cuda: "12.2.0"
+          cuda: "12.8.1"
          method: "network"
          sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'

-      - name: Install rocm-toolkit
-        id: rocm-toolkit
-        if: ${{ matrix.build == 'rocm5.5' }}
-        uses: Cyberhan123/rocm-toolkit@v0.1.0
-        with:
-          rocm: "5.5.0"
-
-      - name: Install Ninja
-        id: install-ninja
-        if: ${{ matrix.build == 'rocm5.5' }}
-        uses: urkle/action-get-ninja@v1
-        with:
-          version: 1.11.1
      - name: Install Vulkan SDK
        id: get_vulkan
        if: ${{ matrix.build == 'vulkan' }}
        run: |
-          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
+          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
@ -254,7 +239,7 @@ jobs:

      - name: Copy and pack Cuda runtime
        id: pack_cuda_runtime
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' && matrix.build == 'cuda12' ) || github.event.inputs.create_release == 'true' }}
+        if: ${{ matrix.build == 'cuda12' && (github.event_name == 'push' && github.ref == 'refs/heads/master' || github.event.inputs.create_release == 'true') }}
        run: |
          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
          $dst='.\build\bin\cudart\'
@ -262,7 +247,7 @@ jobs:
          7z a cudart-sd-bin-win-cu12-x64.zip $dst\*

      - name: Upload Cuda runtime
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' && matrix.build == 'cuda12' ) || github.event.inputs.create_release == 'true' }}
+        if: ${{ matrix.build == 'cuda12' && (github.event_name == 'push' && github.ref == 'refs/heads/master' || github.event.inputs.create_release == 'true') }}
        uses: actions/upload-artifact@v4
        with:
          name: sd-cudart-sd-bin-win-cu12-x64.zip
@ -277,6 +262,104 @@ jobs:
          path: |
            sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip

+  windows-latest-cmake-hip:
+    runs-on: windows-2022
+
+    env:
+      HIPSDK_INSTALLER_VERSION: "25.Q3"
+      GPU_TARGETS: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
+
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: recursive
+
+      - name: Cache ROCm Installation
+        id: cache-rocm
+        uses: actions/cache@v4
+        with:
+          path: C:\Program Files\AMD\ROCm
+          key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: windows-latest-cmake-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-x64
+          evict-old-files: 1d
+
+      - name: Install ROCm
+        if: steps.cache-rocm.outputs.cache-hit != 'true'
+        run: |
+          $ErrorActionPreference = "Stop"
+          write-host "Downloading AMD HIP SDK Installer"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          write-host "Installing AMD HIP SDK"
+          $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
+          $completed = $proc.WaitForExit(600000)
+          if (-not $completed) {
+              Write-Error "ROCm installation timed out after 10 minutes. Killing the process"
+              $proc.Kill()
+              exit 1
+          }
+          if ($proc.ExitCode -ne 0) {
+              Write-Error "ROCm installation failed with exit code $($proc.ExitCode)"
+              exit 1
+          }
+          write-host "Completed AMD HIP SDK installation"
+
+      - name: Verify ROCm
+        run: |
+          # Find and test ROCm installation
+          $clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
+          if (-not $clangPath) {
+            Write-Error "ROCm installation not found"
+            exit 1
+          }
+          & $clangPath.FullName --version
+          # Set HIP_PATH environment variable for later steps
+          echo "HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)" >> $env:GITHUB_ENV
+
+      - name: Build
+        run: |
+          mkdir build
+          cd build
+          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
+          cmake .. `
+            -G "Unix Makefiles" `
+            -DSD_HIPBLAS=ON `
+            -DSD_BUILD_SHARED_LIBS=ON `
+            -DGGML_NATIVE=OFF `
+            -DCMAKE_C_COMPILER=clang `
+            -DCMAKE_CXX_COMPILER=clang++ `
+            -DCMAKE_BUILD_TYPE=Release `
+            -DGPU_TARGETS="${{ env.GPU_TARGETS }}"
+          cmake --build . --config Release --parallel ${env:NUMBER_OF_PROCESSORS}
+
+      - name: Get commit hash
+        id: commit
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: pr-mpt/actions-commit-hash@v2
+
+      - name: Pack artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          md "build\bin\rocblas\library\"
+          md "build\bin\hipblaslt\library"
+          cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
+          cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\"
+          7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip .\build\bin\*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
+          path: |
+            sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
+
  release:
    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

@ -286,8 +369,14 @@ jobs:
      - ubuntu-latest-cmake
      - macOS-latest-cmake
      - windows-latest-cmake
+      - windows-latest-cmake-hip

    steps:
+      - name: Clone
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
      - name: Download artifacts
        id: download-artifact
        uses: actions/download-artifact@v4
@ -296,20 +385,27 @@ jobs:
          pattern: sd-*
          merge-multiple: true

+      - name: Get commit count
+        id: commit_count
+        run: |
+          echo "count=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+
      - name: Get commit hash
        id: commit
        uses: pr-mpt/actions-commit-hash@v2

      - name: Create release
        id: create_release
+        if: ${{ github.event_name == 'workflow_dispatch' || github.ref_name == 'master' }}
        uses: anzz1/action-create-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
-          tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}
+          tag_name: ${{ format('{0}-{1}-{2}', env.BRANCH_NAME, steps.commit_count.outputs.count, steps.commit.outputs.short) }}

      - name: Upload release
        id: upload_release
+        if: ${{ github.event_name == 'workflow_dispatch' || github.ref_name == 'master' }}
        uses: actions/github-script@v3
        with:
          github-token: ${{secrets.GITHUB_TOKEN}}
--- a/.gitignore
+++ b/.gitignore
@ -1,9 +1,10 @@
 build*/
+cmake-build-*/
 test/
 .vscode/
+.idea/
 .cache/
 *.swp
-.vscode/
 *.bat
 *.bin
 *.exe
@ -11,3 +12,4 @@ test/
 output*.png
 models*
 *.log
+preview.png
--- a/.gitmodules
+++ b/.gitmodules
@ -1,3 +1,3 @@
 [submodule "ggml"]
    path = ggml
-	url = https://github.com/ggerganov/ggml.git
+	url = https://github.com/ggml-org/ggml.git
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -24,20 +24,23 @@ endif()
 # general
 #option(SD_BUILD_TESTS                "sd: build tests"    ${SD_STANDALONE})
 option(SD_BUILD_EXAMPLES             "sd: build examples" ${SD_STANDALONE})
-option(SD_CUBLAS                     "sd: cuda backend" OFF)
+option(SD_CUDA                       "sd: cuda backend" OFF)
 option(SD_HIPBLAS                    "sd: rocm backend" OFF)
 option(SD_METAL                      "sd: metal backend" OFF)
 option(SD_VULKAN                     "sd: vulkan backend" OFF)
+option(SD_OPENCL                     "sd: opencl backend" OFF)
 option(SD_SYCL                       "sd: sycl backend" OFF)
-option(SD_FLASH_ATTN                 "sd: use flash attention for x4 less memory usage" OFF)
+option(SD_MUSA                       "sd: musa backend" OFF)
 option(SD_FAST_SOFTMAX               "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
 option(SD_BUILD_SHARED_LIBS          "sd: build shared libs" OFF)
+option(SD_BUILD_SHARED_GGML_LIB      "sd: build ggml as a separate shared lib" OFF)
+option(SD_USE_SYSTEM_GGML            "sd: use system-installed GGML library" OFF)
 #option(SD_BUILD_SERVER               "sd: build server example"                           ON)

-if(SD_CUBLAS)
-    message("-- Use CUBLAS as backend stable-diffusion")
+if(SD_CUDA)
+    message("-- Use CUDA as backend stable-diffusion")
    set(GGML_CUDA ON)
-    add_definitions(-DSD_USE_CUBLAS)
+    add_definitions(-DSD_USE_CUDA)
 endif()

 if(SD_METAL)
@ -52,18 +55,28 @@ if (SD_VULKAN)
    add_definitions(-DSD_USE_VULKAN)
 endif ()

+if (SD_OPENCL)
+    message("-- Use OpenCL as backend stable-diffusion")
+    set(GGML_OPENCL ON)
+    add_definitions(-DSD_USE_OPENCL)
+endif ()
+
 if (SD_HIPBLAS)
    message("-- Use HIPBLAS as backend stable-diffusion")
-    set(GGML_HIPBLAS ON)
-    add_definitions(-DSD_USE_CUBLAS)
+    set(GGML_HIP ON)
+    add_definitions(-DSD_USE_CUDA)
    if(SD_FAST_SOFTMAX)
        set(GGML_CUDA_FAST_SOFTMAX ON)
    endif()
 endif ()

-if(SD_FLASH_ATTN)
-    message("-- Use Flash Attention for memory optimization")
-    add_definitions(-DSD_USE_FLASH_ATTENTION)
+if(SD_MUSA)
+    message("-- Use MUSA as backend stable-diffusion")
+    set(GGML_MUSA ON)
+    add_definitions(-DSD_USE_CUDA)
+    if(SD_FAST_SOFTMAX)
+        set(GGML_CUDA_FAST_SOFTMAX ON)
+    endif()
 endif()

 set(SD_LIB stable-diffusion)
@ -74,24 +87,60 @@ file(GLOB SD_LIB_SOURCES
    "*.hpp"
 )

-# we can get only one share lib
+find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
+if(GIT_EXE)
+    execute_process(COMMAND ${GIT_EXE} describe --tags --abbrev=7 --dirty=+
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        OUTPUT_VARIABLE SDCPP_BUILD_VERSION
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        ERROR_QUIET
+    )
+    execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        OUTPUT_VARIABLE SDCPP_BUILD_COMMIT
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        ERROR_QUIET
+    )
+endif()
+
+if(NOT SDCPP_BUILD_VERSION)
+    set(SDCPP_BUILD_VERSION unknown)
+endif()
+message(STATUS "stable-diffusion.cpp version ${SDCPP_BUILD_VERSION}")
+
+if(NOT SDCPP_BUILD_COMMIT)
+    set(SDCPP_BUILD_COMMIT unknown)
+endif()
+message(STATUS "stable-diffusion.cpp commit ${SDCPP_BUILD_COMMIT}")
+
+set_property(
+  SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/version.cpp
+  APPEND PROPERTY COMPILE_DEFINITIONS
+  SDCPP_BUILD_COMMIT=${SDCPP_BUILD_COMMIT} SDCPP_BUILD_VERSION=${SDCPP_BUILD_VERSION}
+)
+
 if(SD_BUILD_SHARED_LIBS)
    message("-- Build shared library")
    message(${SD_LIB_SOURCES})
+    if(NOT SD_BUILD_SHARED_GGML_LIB)
        set(BUILD_SHARED_LIBS OFF)
+    endif()
    add_library(${SD_LIB} SHARED ${SD_LIB_SOURCES})
    add_definitions(-DSD_BUILD_SHARED_LIB)
    target_compile_definitions(${SD_LIB} PRIVATE -DSD_BUILD_DLL)
    set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 else()
    message("-- Build static library")
+    if(NOT SD_BUILD_SHARED_GGML_LIB)
        set(BUILD_SHARED_LIBS OFF)
+    endif()
    add_library(${SD_LIB} STATIC ${SD_LIB_SOURCES})
 endif()

 if(SD_SYCL)
    message("-- Use SYCL as backend stable-diffusion")
    set(GGML_SYCL ON)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
    add_definitions(-DSD_USE_SYCL)
    # disable fast-math on host, see:
    # https://www.intel.com/content/www/us/en/docs/cpp-compiler/developer-guide-reference/2021-10/fp-model-fp.html
@ -106,23 +155,37 @@ endif()

 set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)

+if (NOT SD_USE_SYSTEM_GGML)
    # see https://github.com/ggerganov/ggml/pull/682
    add_definitions(-DGGML_MAX_NAME=128)
+endif()

 # deps
 # Only add ggml if it hasn't been added yet
 if (NOT TARGET ggml)
+    if (SD_USE_SYSTEM_GGML)
+        find_package(ggml REQUIRED)
+        if (NOT ggml_FOUND)
+            message(FATAL_ERROR "System-installed GGML library not found.")
+        endif()
+        add_library(ggml ALIAS ggml::ggml)
+    else()
        add_subdirectory(ggml)
    endif()
+endif()

 add_subdirectory(thirdparty)

 target_link_libraries(${SD_LIB} PUBLIC ggml zip)
 target_include_directories(${SD_LIB} PUBLIC . thirdparty)
-target_compile_features(${SD_LIB} PUBLIC cxx_std_11)
+target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17)


 if (SD_BUILD_EXAMPLES)
    add_subdirectory(examples)
 endif()

+set(SD_PUBLIC_HEADERS stable-diffusion.h)
+set_target_properties(${SD_LIB} PROPERTIES PUBLIC_HEADER "${SD_PUBLIC_HEADERS}")
+
+install(TARGETS ${SD_LIB} LIBRARY PUBLIC_HEADER)
--- a/13
+++ b/13
@ -1,16 +1,21 @@
 ARG UBUNTU_VERSION=22.04

-FROM ubuntu:$UBUNTU_VERSION as build
+FROM ubuntu:$UBUNTU_VERSION AS build

-RUN apt-get update && apt-get install -y build-essential git cmake
+RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake

 WORKDIR /sd.cpp

 COPY . .

-RUN mkdir build && cd build && cmake .. && cmake --build . --config Release
+RUN cmake . -B ./build
+RUN cmake --build ./build --config Release --parallel

-FROM ubuntu:$UBUNTU_VERSION as runtime
+FROM ubuntu:$UBUNTU_VERSION AS runtime
+
+RUN apt-get update && \
+    apt-get install --yes --no-install-recommends libgomp1 && \
+    apt-get clean

 COPY --from=build /sd.cpp/build/bin/sd /sd

--- a/Dockerfile.musa
+++ b/Dockerfile.musa
@ -0,0 +1,23 @@
+ARG MUSA_VERSION=rc4.2.0
+ARG UBUNTU_VERSION=22.04
+
+FROM mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64 as build
+
+RUN apt-get update && apt-get install -y ccache cmake git
+
+WORKDIR /sd.cpp
+
+COPY . .
+
+RUN mkdir build && cd build && \
+    cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ \
+        -DCMAKE_C_FLAGS="${CMAKE_C_FLAGS} -fopenmp -I/usr/lib/llvm-14/lib/clang/14.0.0/include -L/usr/lib/llvm-14/lib" \
+        -DCMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fopenmp -I/usr/lib/llvm-14/lib/clang/14.0.0/include -L/usr/lib/llvm-14/lib" \
+        -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release && \
+    cmake --build . --config Release
+
+FROM mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64 as runtime
+
+COPY --from=build /sd.cpp/build/bin/sd /sd
+
+ENTRYPOINT [ "/sd" ]
--- a/Dockerfile.sycl
+++ b/Dockerfile.sycl
@ -0,0 +1,19 @@
+ARG SYCL_VERSION=2025.1.0-0
+
+FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS build
+
+RUN apt-get update && apt-get install -y cmake
+
+WORKDIR /sd.cpp
+
+COPY . .
+
+RUN mkdir build && cd build && \
+    cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DSD_SYCL=ON -DCMAKE_BUILD_TYPE=Release && \
+    cmake --build . --config Release -j$(nproc)
+
+FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS runtime
+
+COPY --from=build /sd.cpp/build/bin/sd /sd
+
+ENTRYPOINT [ "/sd" ]
--- a/README.md
+++ b/README.md
@ -1,39 +1,86 @@
 <p align="center">
-  <img src="./assets/cat_with_sd_cpp_42.png" width="360x">
+  <img src="./assets/logo.png" width="360x">
 </p>

 # stable-diffusion.cpp

-Inference of Stable Diffusion and Flux in pure C/C++
+<div align="center">
+<a href="https://trendshift.io/repositories/9714" target="_blank"><img src="https://trendshift.io/api/badge/repositories/9714" alt="leejet%2Fstable-diffusion.cpp | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
+</div>
+
+Diffusion model(SD,Flux,Wan,...) inference in pure C/C++
+
+***Note that this project is under active development. \
+API and command-line options may change frequently.***
+
+## 🔥Important News
+
+* **2025/12/01** 🚀 stable-diffusion.cpp now supports **Z-Image**  
+  👉 Details: [PR #1020](https://github.com/leejet/stable-diffusion.cpp/pull/1020)
+
+* **2025/11/30** 🚀 stable-diffusion.cpp now supports **FLUX.2-dev**  
+  👉 Details: [PR #1016](https://github.com/leejet/stable-diffusion.cpp/pull/1016)
+
+* **2025/10/13** 🚀 stable-diffusion.cpp now supports **Qwen-Image-Edit / Qwen-Image-Edit 2509**  
+  👉 Details: [PR #877](https://github.com/leejet/stable-diffusion.cpp/pull/877)
+
+* **2025/10/12** 🚀 stable-diffusion.cpp now supports **Qwen-Image**  
+  👉 Details: [PR #851](https://github.com/leejet/stable-diffusion.cpp/pull/851)
+
+* **2025/09/14** 🚀 stable-diffusion.cpp now supports **Wan2.1 Vace**  
+  👉 Details: [PR #819](https://github.com/leejet/stable-diffusion.cpp/pull/819)
+
+* **2025/09/06** 🚀 stable-diffusion.cpp now supports **Wan2.1 / Wan2.2**  
+  👉 Details: [PR #778](https://github.com/leejet/stable-diffusion.cpp/pull/778)

 ## Features

- Plain C/C++ implementation based on [ggml](https://github.com/ggerganov/ggml), working in the same way as [llama.cpp](https://github.com/ggerganov/llama.cpp)
+- Plain C/C++ implementation based on [ggml](https://github.com/ggml-org/ggml), working in the same way as [llama.cpp](https://github.com/ggml-org/llama.cpp)
 - Super lightweight and without external dependencies
- SD1.x, SD2.x, SDXL and [SD3/SD3.5](./docs/sd3.md) support
-    - !!!The VAE in SDXL encounters NaN issues under FP16, but unfortunately, the ggml_conv_2d only operates under FP16. Hence, a parameter is needed to specify the VAE that has fixed the FP16 NaN issue. You can find it here: [SDXL VAE FP16 Fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors).
- [Flux-dev/Flux-schnell Support](./docs/flux.md)
-
- [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo) and [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo) support
+- Supported models
+  - Image Models
+    - SD1.x, SD2.x, [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo)
+    - SDXL, [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo)
+    - [Some SD1.x and SDXL distilled models](./docs/distilled_sd.md)
+    - [SD3/SD3.5](./docs/sd3.md)
+    - [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
+    - [FLUX.2-dev](./docs/flux2.md)
+    - [Chroma](./docs/chroma.md)
+    - [Chroma1-Radiance](./docs/chroma_radiance.md)
+    - [Qwen Image](./docs/qwen_image.md)
+    - [Z-Image](./docs/z_image.md)
+    - [Ovis-Image](./docs/ovis_image.md)
+  - Image Edit Models
+    - [FLUX.1-Kontext-dev](./docs/kontext.md)
+    - [Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md)
+  - Video Models
+    - [Wan2.1/Wan2.2](./docs/wan.md)
  - [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
- 16-bit, 32-bit float support
- 2-bit, 3-bit, 4-bit, 5-bit and 8-bit integer quantization support
- Accelerated memory-efficient CPU inference
-    - Only requires ~2.3GB when using txt2img with fp16 precision to generate a 512x512 image, enabling Flash Attention just requires ~1.8GB.
- AVX, AVX2 and AVX512 support for x86 architectures
- Full CUDA, Metal, Vulkan and SYCL backend for GPU acceleration.
- Can load ckpt, safetensors and diffusers models/checkpoints. Standalone VAEs models
-    - No need to convert to `.ggml` or `.gguf` anymore!
- Flash Attention for memory usage optimization (only cpu for now)
- Original `txt2img` and `img2img` mode
- Negative prompt
- [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) style tokenizer (not all the features, only token weighting for now)
+  - Control Net support with SD 1.5
  - LoRA support, same as [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora)
  - Latent Consistency Models support (LCM/LCM-LoRA)
  - Faster and memory efficient latent decoding with [TAESD](https://github.com/madebyollin/taesd)
  - Upscale images generated with [ESRGAN](https://github.com/xinntao/Real-ESRGAN)
+- Supported backends
+  - CPU (AVX, AVX2 and AVX512 support for x86 architectures)
+  - CUDA
+  - Vulkan
+  - Metal
+  - OpenCL
+  - SYCL
+- Supported weight formats
+  - PyTorch checkpoint (`.ckpt` or `.pth`)
+  - Safetensors (`.safetensors`)
+  - GGUF (`.gguf`)
+- Supported platforms
+    - Linux
+    - Mac OS
+    - Windows
+    - Android (via Termux, [Local Diffusion](https://github.com/rmatif/Local-Diffusion))
+- Flash Attention for memory usage optimization
+- Negative prompt
+- [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) style tokenizer (not all the features, only token weighting for now)
 - VAE tiling processing for reduce memory usage
- Control Net support with SD 1.5
 - Sampling method
    - `Euler A`
    - `Euler`
@ -43,241 +90,52 @@ Inference of Stable Diffusion and Flux in pure C/C++
    - [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457)
    - `DPM++ 2S a`
    - [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952)
- Cross-platform reproducibility (`--rng cuda`, consistent with the `stable-diffusion-webui GPU RNG`)
+- Cross-platform reproducibility
+    - `--rng cuda`, default, consistent with the `stable-diffusion-webui GPU RNG`
+    - `--rng cpu`, consistent with the `comfyui RNG`
 - Embeds generation parameters into png output as a webui-compatible text string
- Supported platforms
-    - Linux
-    - Mac OS
-    - Windows
-    - Android (via Termux)

-### TODO
+## Quick Start

- [ ] More sampling methods
- [ ] Make inference faster
-    - The current implementation of ggml_conv_2d is slow and has high memory usage
- [ ] Continuing to reduce memory usage (quantizing the weights of ggml_conv_2d)
- [ ] Implement Inpainting support
+### Get the sd executable

-## Usage
+- Download pre-built binaries from the [releases page](https://github.com/leejet/stable-diffusion.cpp/releases)
+- Or build from source by following the [build guide](./docs/build.md)

-For most users, you can download the built executable program from the latest [release](https://github.com/leejet/stable-diffusion.cpp/releases/latest).
-If the built product does not meet your requirements, you can choose to build it manually.
+### Download model weights

-### Get the Code
-
-```
-git clone --recursive https://github.com/leejet/stable-diffusion.cpp
-cd stable-diffusion.cpp
-```
-
- If you have already cloned the repository, you can use the following command to update the repository to the latest code.
-
-```
-cd stable-diffusion.cpp
-git pull origin master
-git submodule init
-git submodule update
-```
-
-### Download weights
-
- download original weights(.ckpt or .safetensors). For example
-    - Stable Diffusion v1.4 from https://huggingface.co/CompVis/stable-diffusion-v-1-4-original
-    - Stable Diffusion v1.5 from https://huggingface.co/runwayml/stable-diffusion-v1-5
-    - Stable Diffuison v2.1 from https://huggingface.co/stabilityai/stable-diffusion-2-1
-    - Stable Diffusion 3 2B from https://huggingface.co/stabilityai/stable-diffusion-3-medium
-
-    ```shell
-    curl -L -O https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt
-    # curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
-    # curl -L -O https://huggingface.co/stabilityai/stable-diffusion-2-1/resolve/main/v2-1_768-nonema-pruned.safetensors
-    # curl -L -O https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips_t5xxlfp16.safetensors
-    ```
-
-### Build
-
-#### Build from scratch
-
-```shell
-mkdir build
-cd build
-cmake ..
-cmake --build . --config Release
-```
-
-##### Using OpenBLAS
-
-```
-cmake .. -DGGML_OPENBLAS=ON
-cmake --build . --config Release
-```
-
-##### Using CUBLAS
-
-This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). Recommended to have at least 4 GB of VRAM.
-
-```
-cmake .. -DSD_CUBLAS=ON
-cmake --build . --config Release
-```
-
-##### Using HipBLAS
-This provides BLAS acceleration using the ROCm cores of your AMD GPU. Make sure to have the ROCm toolkit installed.
-
-Windows User Refer to [docs/hipBLAS_on_Windows.md](docs%2FhipBLAS_on_Windows.md) for a comprehensive guide.
-
-```
-cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=gfx1100
-cmake --build . --config Release
-```
-
-
-##### Using Metal
-
-Using Metal makes the computation run on the GPU. Currently, there are some issues with Metal when performing operations on very large matrices, making it highly inefficient at the moment. Performance improvements are expected in the near future.
-
-```
-cmake .. -DSD_METAL=ON
-cmake --build . --config Release
-```
-
-##### Using Vulkan
-
-Install Vulkan SDK from https://www.lunarg.com/vulkan-sdk/.
-
-```
-cmake .. -DSD_VULKAN=ON
-cmake --build . --config Release
-```
-
-##### Using SYCL
-
-Using SYCL makes the computation run on the Intel GPU. Please make sure you have installed the related driver and [Intel® oneAPI Base toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) before start. More details and steps can refer to [llama.cpp SYCL backend](https://github.com/ggerganov/llama.cpp/blob/master/docs/backend/SYCL.md#linux).
-
-```
-# Export relevant ENV variables
-source /opt/intel/oneapi/setvars.sh
-
-# Option 1: Use FP32 (recommended for better performance in most cases)
-cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-
-# Option 2: Use FP16
-cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
-
-cmake --build . --config Release
-```
-
-Example of text2img by using SYCL backend:
-
- download `stable-diffusion` model weight, refer to [download-weight](#download-weights).
-
- run `./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors --cfg-scale 5 --steps 30 --sampling-method euler  -H 1024 -W 1024 --seed 42 -p "fantasy medieval village world inside a glass sphere , high detail, fantasy, realistic, light effect, hyper detail, volumetric lighting, cinematic, macro, depth of field, blur, red light and clouds from the back, highly detailed epic cinematic concept art cg render made in maya, blender and photoshop, octane render, excellent composition, dynamic dramatic cinematic lighting, aesthetic, very inspirational, world inside a glass sphere by james gurney by artgerm with james jean, joe fenton and tristan eaton by ross tran, fine details, 4k resolution"`
-
-<p align="center">
-  <img src="./assets/sycl_sd3_output.png" width="360x">
-</p>
-
-
-
-##### Using Flash Attention
-
-Enabling flash attention reduces memory usage by at least 400 MB. At the moment, it is not supported when CUBLAS is enabled because the kernel implementation is missing.
-
-```
-cmake .. -DSD_FLASH_ATTN=ON
-cmake --build . --config Release
-```
-
-### Run
-
-```
-usage: ./bin/sd [arguments]
-
-arguments:
-  -h, --help                         show this help message and exit
-  -M, --mode [MODEL]                 run mode (txt2img or img2img or convert, default: txt2img)
-  -t, --threads N                    number of threads to use during computation (default: -1)
-                                     If threads <= 0, then threads will be set to the number of CPU physical cores
-  -m, --model [MODEL]                path to full model
-  --diffusion-model                  path to the standalone diffusion model
-  --clip_l                           path to the clip-l text encoder
-  --clip_g                           path to the clip-l text encoder
-  --t5xxl                            path to the the t5xxl text encoder
-  --vae [VAE]                        path to vae
-  --taesd [TAESD_PATH]               path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
-  --control-net [CONTROL_PATH]       path to control net model
-  --embd-dir [EMBEDDING_PATH]        path to embeddings
-  --stacked-id-embd-dir [DIR]        path to PHOTOMAKER stacked id embeddings
-  --input-id-images-dir [DIR]        path to PHOTOMAKER input id images dir
-  --normalize-input                  normalize PHOTOMAKER input id images
-  --upscale-model [ESRGAN_PATH]      path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now
-  --upscale-repeats                  Run the ESRGAN upscaler this many times (default 1)
-  --type [TYPE]                      weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_k, q3_k, q4_k)
-                                     If not specified, the default is the type of the weight file
-  --lora-model-dir [DIR]             lora model directory
-  -i, --init-img [IMAGE]             path to the input image, required by img2img
-  --control-image [IMAGE]            path to image condition, control net
-  -o, --output OUTPUT                path to write result image to (default: ./output.png)
-  -p, --prompt [PROMPT]              the prompt to render
-  -n, --negative-prompt PROMPT       the negative prompt (default: "")
-  --cfg-scale SCALE                  unconditional guidance scale: (default: 7.0)
-  --strength STRENGTH                strength for noising/unnoising (default: 0.75)
-  --style-ratio STYLE-RATIO          strength for keeping input identity (default: 20%)
-  --control-strength STRENGTH        strength to apply Control Net (default: 0.9)
-                                     1.0 corresponds to full destruction of information in init image
-  -H, --height H                     image height, in pixel space (default: 512)
-  -W, --width W                      image width, in pixel space (default: 512)
-  --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm}
-                                     sampling method (default: "euler_a")
-  --steps  STEPS                     number of sample steps (default: 20)
-  --rng {std_default, cuda}          RNG (default: cuda)
-  -s SEED, --seed SEED               RNG seed (default: 42, use random seed for < 0)
-  -b, --batch-count COUNT            number of images to generate
-  --schedule {discrete, karras, exponential, ays, gits} Denoiser sigma schedule (default: discrete)
-  --clip-skip N                      ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)
-                                     <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
-  --vae-tiling                       process vae in tiles to reduce memory usage
-  --vae-on-cpu                       keep vae in cpu (for low vram)
-  --clip-on-cpu                      keep clip in cpu (for low vram)
-  --control-net-cpu                  keep controlnet in cpu (for low vram)
-  --canny                            apply canny preprocessor (edge detection)
-  --color                            Colors the logging tags according to level
-  -v, --verbose                      print extra info
-```
-
-#### txt2img example
+- Download weights (.ckpt, .safetensors or .gguf). For example:
+    - Stable Diffusion v1.5 from https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5

    ```sh
-./bin/sd -m ../models/sd-v1-4.ckpt -p "a lovely cat"
-# ./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
-# ./bin/sd -m ../models/sd_xl_base_1.0.safetensors --vae ../models/sdxl_vae-fp16-fix.safetensors -H 1024 -W 1024 -p "a lovely cat" -v
-# ./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable Diffusion CPP\"' --cfg-scale 4.5 --sampling-method euler -v
-# ./bin/sd --diffusion-model  ../models/flux1-dev-q3_k.gguf --vae ../models/ae.sft --clip_l ../models/clip_l.safetensors --t5xxl ../models/t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v
-# ./bin/sd -m  ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors  -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v
+    curl -L -O https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
    ```

-Using formats of different precisions will yield results of varying quality.
+### Generate an image with just one command

-| f32  | f16  |q8_0  |q5_0  |q5_1  |q4_0  |q4_1  |
-| ----  |----  |----  |----  |----  |----  |----  |
-| ![](./assets/f32.png) |![](./assets/f16.png) |![](./assets/q8_0.png) |![](./assets/q5_0.png) |![](./assets/q5_1.png) |![](./assets/q4_0.png) |![](./assets/q4_1.png) |
-
-#### img2img example
-
- `./output.png` is the image generated from the above txt2img pipeline
-
-
-```
-./bin/sd --mode img2img -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
+```sh
+./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
 ```

-<p align="center">
-  <img src="./assets/img2img_output.png" width="256x">
-</p>
+***For detailed command-line arguments, check out [cli doc](./examples/cli/README.md).***
+
+## Performance
+
+If you want to improve performance or reduce VRAM/RAM usage, please refer to [performance guide](./docs/performance.md).

 ## More Guides

+- [SD1.x/SD2.x/SDXL](./docs/sd.md)
+- [SD3/SD3.5](./docs/sd3.md)
+- [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
+- [FLUX.2-dev](./docs/flux2.md)
+- [FLUX.1-Kontext-dev](./docs/kontext.md)
+- [Chroma](./docs/chroma.md)
+- [🔥Qwen Image](./docs/qwen_image.md)
+- [🔥Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md)
+- [🔥Wan2.1/Wan2.2](./docs/wan.md)
+- [🔥Z-Image](./docs/z_image.md)
+- [Ovis-Image](./docs/ovis_image.md)
 - [LoRA](./docs/lora.md)
 - [LCM/LCM-LoRA](./docs/lcm.md)
 - [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)
@ -290,14 +148,25 @@ Using formats of different precisions will yield results of varying quality.

 These projects wrap `stable-diffusion.cpp` for easier use in other languages/frameworks.

-* Golang: [seasonjs/stable-diffusion](https://github.com/seasonjs/stable-diffusion)
+* Golang (non-cgo): [seasonjs/stable-diffusion](https://github.com/seasonjs/stable-diffusion)
+* Golang (cgo): [Binozo/GoStableDiffusion](https://github.com/Binozo/GoStableDiffusion)
 * C#: [DarthAffe/StableDiffusion.NET](https://github.com/DarthAffe/StableDiffusion.NET)
+* Python: [william-murray1204/stable-diffusion-cpp-python](https://github.com/william-murray1204/stable-diffusion-cpp-python)
+* Rust: [newfla/diffusion-rs](https://github.com/newfla/diffusion-rs)
+* Flutter/Dart: [rmatif/Local-Diffusion](https://github.com/rmatif/Local-Diffusion)

 ## UIs

 These projects use `stable-diffusion.cpp` as a backend for their image generation.

 - [Jellybox](https://jellybox.com)
+- [Stable Diffusion GUI](https://github.com/fszontagh/sd.cpp.gui.wx)
+- [Stable Diffusion CLI-GUI](https://github.com/piallai/stable-diffusion.cpp)
+- [Local Diffusion](https://github.com/rmatif/Local-Diffusion)
+- [sd.cpp-webui](https://github.com/daniandtheweb/sd.cpp-webui)
+- [LocalAI](https://github.com/mudler/LocalAI)
+- [Neural-Pixel](https://github.com/Luiz-Alcantara/Neural-Pixel)
+- [KoboldCpp](https://github.com/LostRuins/koboldcpp)

 ## Contributors

@ -311,7 +180,8 @@ Thank you to all the people who have already contributed to stable-diffusion.cpp

 ## References

- [ggml](https://github.com/ggerganov/ggml)
+- [ggml](https://github.com/ggml-org/ggml)
+- [diffusers](https://github.com/huggingface/diffusers)
 - [stable-diffusion](https://github.com/CompVis/stable-diffusion)
 - [sd3-ref](https://github.com/Stability-AI/sd3-ref)
 - [stable-diffusion-stability-ai](https://github.com/Stability-AI/stablediffusion)
@ -321,3 +191,5 @@ Thank you to all the people who have already contributed to stable-diffusion.cpp
 - [latent-consistency-model](https://github.com/luosiallen/latent-consistency-model)
 - [generative-models](https://github.com/Stability-AI/generative-models/)
 - [PhotoMaker](https://github.com/TencentARC/PhotoMaker)
+- [Wan2.1](https://github.com/Wan-Video/Wan2.1)
+- [Wan2.2](https://github.com/Wan-Video/Wan2.2)
--- a/assets/flux/chroma1-radiance.png
+++ b/assets/flux/chroma1-radiance.png
--- a/assets/flux/chroma_v40.png
+++ b/assets/flux/chroma_v40.png
--- a/assets/flux/kontext1_dev_output.png
+++ b/assets/flux/kontext1_dev_output.png
--- a/assets/flux2/example.png
+++ b/assets/flux2/example.png
--- a/assets/logo.png
+++ b/assets/logo.png
--- a/assets/ovis_image/example.png
+++ b/assets/ovis_image/example.png
--- a/assets/qwen/example.png
+++ b/assets/qwen/example.png
--- a/assets/qwen/qwen_image_edit.png
+++ b/assets/qwen/qwen_image_edit.png
--- a/assets/qwen/qwen_image_edit_2509.png
+++ b/assets/qwen/qwen_image_edit_2509.png
--- a/assets/wan/Wan2.1_1.3B_t2v.mp4
+++ b/assets/wan/Wan2.1_1.3B_t2v.mp4
--- a/assets/wan/Wan2.1_1.3B_vace_r2v.mp4
+++ b/assets/wan/Wan2.1_1.3B_vace_r2v.mp4
--- a/assets/wan/Wan2.1_1.3B_vace_t2v.mp4
+++ b/assets/wan/Wan2.1_1.3B_vace_t2v.mp4
--- a/assets/wan/Wan2.1_1.3B_vace_v2v.mp4
+++ b/assets/wan/Wan2.1_1.3B_vace_v2v.mp4
--- a/assets/wan/Wan2.1_14B_flf2v.mp4
+++ b/assets/wan/Wan2.1_14B_flf2v.mp4
--- a/assets/wan/Wan2.1_14B_i2v.mp4
+++ b/assets/wan/Wan2.1_14B_i2v.mp4
--- a/assets/wan/Wan2.1_14B_t2v.mp4
+++ b/assets/wan/Wan2.1_14B_t2v.mp4
--- a/assets/wan/Wan2.1_14B_vace_r2v.mp4
+++ b/assets/wan/Wan2.1_14B_vace_r2v.mp4
--- a/assets/wan/Wan2.1_14B_vace_t2v.mp4
+++ b/assets/wan/Wan2.1_14B_vace_t2v.mp4
--- a/assets/wan/Wan2.1_14B_vace_v2v.mp4
+++ b/assets/wan/Wan2.1_14B_vace_v2v.mp4
--- a/assets/wan/Wan2.2_14B_flf2v.mp4
+++ b/assets/wan/Wan2.2_14B_flf2v.mp4
--- a/assets/wan/Wan2.2_14B_i2v.mp4
+++ b/assets/wan/Wan2.2_14B_i2v.mp4
--- a/assets/wan/Wan2.2_14B_t2i.png
+++ b/assets/wan/Wan2.2_14B_t2i.png
--- a/assets/wan/Wan2.2_14B_t2v.mp4
+++ b/assets/wan/Wan2.2_14B_t2v.mp4
--- a/assets/wan/Wan2.2_14B_t2v_lora.mp4
+++ b/assets/wan/Wan2.2_14B_t2v_lora.mp4
--- a/assets/wan/Wan2.2_5B_i2v.mp4
+++ b/assets/wan/Wan2.2_5B_i2v.mp4
--- a/assets/wan/Wan2.2_5B_t2v.mp4
+++ b/assets/wan/Wan2.2_5B_t2v.mp4
--- a/assets/z_image/bf16.png
+++ b/assets/z_image/bf16.png
--- a/assets/z_image/q2_K.png
+++ b/assets/z_image/q2_K.png
--- a/assets/z_image/q3_K.png
+++ b/assets/z_image/q3_K.png
--- a/assets/z_image/q4_0.png
+++ b/assets/z_image/q4_0.png
--- a/assets/z_image/q4_K.png
+++ b/assets/z_image/q4_K.png
--- a/assets/z_image/q5_0.png
+++ b/assets/z_image/q5_0.png
--- a/assets/z_image/q6_K.png
+++ b/assets/z_image/q6_K.png
--- a/assets/z_image/q8_0.png
+++ b/assets/z_image/q8_0.png
--- a/clip.hpp
+++ b/clip.hpp
@ -3,35 +3,11 @@

 #include "ggml_extend.hpp"
 #include "model.h"
+#include "tokenize_util.h"

 /*================================================== CLIPTokenizer ===================================================*/

-std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remove_lora(std::string text) {
-    std::regex re("<lora:([^:]+):([^>]+)>");
-    std::smatch matches;
-    std::unordered_map<std::string, float> filename2multiplier;
-
-    while (std::regex_search(text, matches, re)) {
-        std::string filename = matches[1].str();
-        float multiplier     = std::stof(matches[2].str());
-
-        text = std::regex_replace(text, re, "", std::regex_constants::format_first_only);
-
-        if (multiplier == 0.f) {
-            continue;
-        }
-
-        if (filename2multiplier.find(filename) == filename2multiplier.end()) {
-            filename2multiplier[filename] = multiplier;
-        } else {
-            filename2multiplier[filename] += multiplier;
-        }
-    }
-
-    return std::make_pair(filename2multiplier, text);
-}
-
-std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
+__STATIC_INLINE__ std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
    std::vector<std::pair<int, std::u32string>> byte_unicode_pairs;
    std::set<int> byte_set;
    for (int b = static_cast<int>('!'); b <= static_cast<int>('~'); ++b) {
@ -72,6 +48,8 @@ private:
    int encoder_len;
    int bpe_len;

+    std::vector<std::string> special_tokens;
+
 public:
    const std::string UNK_TOKEN = "<|endoftext|>";
    const std::string BOS_TOKEN = "<|startoftext|>";
@ -117,6 +95,15 @@ private:
        return pairs;
    }

+    bool is_special_token(const std::string& token) {
+        for (auto& special_token : special_tokens) {
+            if (special_token == token) {
+                return true;
+            }
+        }
+        return false;
+    }
+
 public:
    CLIPTokenizer(int pad_token_id = 49407, const std::string& merges_utf8_str = "")
        : PAD_TOKEN_ID(pad_token_id) {
@ -125,6 +112,8 @@ public:
        } else {
            load_from_merges(ModelLoader::load_merges());
        }
+        add_special_token("<|startoftext|>");
+        add_special_token("<|endoftext|>");
    }

    void load_from_merges(const std::string& merges_utf8_str) {
@ -201,6 +190,10 @@ public:
        }
    }

+    void add_special_token(const std::string& token) {
+        special_tokens.push_back(token);
+    }
+
    std::u32string bpe(const std::u32string& token) {
        std::vector<std::u32string> word;

@ -343,6 +336,13 @@ public:
        }
    }

+    std::string clean_up_tokenization(std::string& text) {
+        std::regex pattern(R"( ,)");
+        // Replace " ," with ","
+        std::string result = std::regex_replace(text, pattern, ",");
+        return result;
+    }
+
    std::string decode(const std::vector<int>& tokens) {
        std::string text = "";
        for (int t : tokens) {
@ -351,8 +351,12 @@ public:
            std::u32string ts = decoder[t];
            // printf("%d, %s \n", t,  utf32_to_utf8(ts).c_str());
            std::string s = utf32_to_utf8(ts);
-            if (s.length() >= 4 && ends_with(s, "</w>")) {
-                text += " " + s.replace(s.length() - 4, s.length() - 1, "");
+            if (s.length() >= 4) {
+                if (ends_with(s, "</w>")) {
+                    text += s.replace(s.length() - 4, s.length() - 1, "") + " ";
+                } else {
+                    text += s;
+                }
            } else {
                text += " " + s;
            }
@ -364,28 +368,58 @@ public:

        // std::string s((char *)bytes.data());
        // std::string s = "";
+        text = clean_up_tokenization(text);
        return trim(text);
    }

+    std::vector<std::string> token_split(const std::string& text) {
+        std::regex pat(R"('s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)",
+                       std::regex::icase);
+        std::sregex_iterator iter(text.begin(), text.end(), pat);
+        std::sregex_iterator end;
+
+        std::vector<std::string> result;
+        for (; iter != end; ++iter) {
+            result.emplace_back(iter->str());
+        }
+
+        return result;
+    }
+
    std::vector<int> encode(std::string text, on_new_token_cb_t on_new_token_cb) {
        std::string original_text = text;
        std::vector<int32_t> bpe_tokens;
        text = whitespace_clean(text);
        std::transform(text.begin(), text.end(), text.begin(), [](unsigned char c) { return std::tolower(c); });

-        std::regex pat(R"(<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)",
-                       std::regex::icase);
-
-        std::smatch matches;
        std::string str = text;
        std::vector<std::string> token_strs;
-        while (std::regex_search(str, matches, pat)) {
-            bool skip = on_new_token_cb(str, bpe_tokens);
+
+        auto splited_texts = split_with_special_tokens(text, special_tokens);
+
+        for (auto& splited_text : splited_texts) {
+            LOG_DEBUG("token %s", splited_text.c_str());
+            if (is_special_token(splited_text)) {
+                LOG_DEBUG("special %s", splited_text.c_str());
+                bool skip = on_new_token_cb(splited_text, bpe_tokens);
                if (skip) {
+                    token_strs.push_back(splited_text);
                    continue;
                }
-            for (auto& token : matches) {
-                std::string token_str = token.str();
+                continue;
+            }
+
+            auto tokens = token_split(splited_text);
+            for (auto& token : tokens) {
+                if (on_new_token_cb != nullptr) {
+                    bool skip = on_new_token_cb(token, bpe_tokens);
+                    if (skip) {
+                        token_strs.push_back(token);
+                        continue;
+                    }
+                }
+
+                std::string token_str = token;
                std::u32string utf32_token;
                for (int i = 0; i < token_str.length(); i++) {
                    unsigned char b = token_str[i];
@ -405,14 +439,13 @@ public:
                bpe_tokens.push_back(encoder[bpe_str]);
                token_strs.push_back(utf32_to_utf8(bpe_str));
            }
-            str = matches.suffix();
        }
-        std::stringstream ss;
-        ss << "[";
-        for (auto token : token_strs) {
-            ss << "\"" << token << "\", ";
-        }
-        ss << "]";
+        // std::stringstream ss;
+        // ss << "[";
+        // for (auto token : token_strs) {
+        //     ss << "\"" << token << "\", ";
+        // }
+        // ss << "]";
        // LOG_DEBUG("split prompt \"%s\" to tokens %s", original_text.c_str(), ss.str().c_str());
        // printf("split prompt \"%s\" to tokens %s \n", original_text.c_str(), ss.str().c_str());
        return bpe_tokens;
@ -439,16 +472,16 @@ public:
        }
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [N, n_token, d_model]
        auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
        auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);

        x = fc1->forward(ctx, x);
        if (use_gelu) {
-            x = ggml_gelu_inplace(ctx, x);
+            x = ggml_gelu_inplace(ctx->ggml_ctx, x);
        } else {
-            x = ggml_gelu_quick_inplace(ctx, x);
+            x = ggml_gelu_quick_inplace(ctx->ggml_ctx, x);
        }
        x = fc2->forward(ctx, x);
        return x;
@ -464,11 +497,12 @@ protected:
 public:
    CLIPLayer(int64_t d_model,
              int64_t n_head,
-              int64_t intermediate_size)
+              int64_t intermediate_size,
+              bool proj_in = false)
        : d_model(d_model),
          n_head(n_head),
          intermediate_size(intermediate_size) {
-        blocks["self_attn"] = std::shared_ptr<GGMLBlock>(new MultiheadAttention(d_model, n_head, true, true));
+        blocks["self_attn"] = std::shared_ptr<GGMLBlock>(new MultiheadAttention(d_model, n_head, true, true, proj_in));

        blocks["layer_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_model));
        blocks["layer_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_model));
@ -476,15 +510,15 @@ public:
        blocks["mlp"] = std::shared_ptr<GGMLBlock>(new CLIPMLP(d_model, intermediate_size));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, bool mask = true) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, bool mask = true) {
        // x: [N, n_token, d_model]
        auto self_attn   = std::dynamic_pointer_cast<MultiheadAttention>(blocks["self_attn"]);
        auto layer_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm1"]);
        auto layer_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm2"]);
        auto mlp         = std::dynamic_pointer_cast<CLIPMLP>(blocks["mlp"]);

-        x = ggml_add(ctx, x, self_attn->forward(ctx, layer_norm1->forward(ctx, x), mask));
-        x = ggml_add(ctx, x, mlp->forward(ctx, layer_norm2->forward(ctx, x)));
+        x = ggml_add(ctx->ggml_ctx, x, self_attn->forward(ctx, layer_norm1->forward(ctx, x), mask));
+        x = ggml_add(ctx->ggml_ctx, x, mlp->forward(ctx, layer_norm2->forward(ctx, x)));
        return x;
    }
 };
@ -497,15 +531,19 @@ public:
    CLIPEncoder(int64_t n_layer,
                int64_t d_model,
                int64_t n_head,
-                int64_t intermediate_size)
+                int64_t intermediate_size,
+                bool proj_in = false)
        : n_layer(n_layer) {
        for (int i = 0; i < n_layer; i++) {
            std::string name = "layers." + std::to_string(i);
-            blocks[name]     = std::shared_ptr<GGMLBlock>(new CLIPLayer(d_model, n_head, intermediate_size));
+            blocks[name]     = std::shared_ptr<GGMLBlock>(new CLIPLayer(d_model, n_head, intermediate_size, proj_in));
        }
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, int clip_skip = -1, bool mask = true) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                struct ggml_tensor* x,
+                                int clip_skip = -1,
+                                bool mask     = true) {
        // x: [N, n_token, d_model]
        int layer_idx = n_layer - 1;
        // LOG_DEBUG("clip_skip %d", clip_skip);
@ -532,26 +570,37 @@ protected:
    int64_t embed_dim;
    int64_t vocab_size;
    int64_t num_positions;
+    bool force_clip_f32;

-    void init_params(struct ggml_context* ctx, ggml_type wtype) {
-        params["token_embedding.weight"]    = ggml_new_tensor_2d(ctx, wtype, embed_dim, vocab_size);
-        params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, embed_dim, num_positions);
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+        enum ggml_type token_wtype = GGML_TYPE_F32;
+        if (!force_clip_f32) {
+            token_wtype = get_type(prefix + "token_embedding.weight", tensor_storage_map, GGML_TYPE_F32);
+            if (!support_get_rows(token_wtype)) {
+                token_wtype = GGML_TYPE_F32;
+            }
+        }
+        enum ggml_type position_wtype       = GGML_TYPE_F32;
+        params["token_embedding.weight"]    = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
+        params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, position_wtype, embed_dim, num_positions);
    }

 public:
    CLIPEmbeddings(int64_t embed_dim,
                   int64_t vocab_size    = 49408,
-                   int64_t num_positions = 77)
+                   int64_t num_positions = 77,
+                   bool force_clip_f32   = false)
        : embed_dim(embed_dim),
          vocab_size(vocab_size),
-          num_positions(num_positions) {
+          num_positions(num_positions),
+          force_clip_f32(force_clip_f32) {
    }

    struct ggml_tensor* get_token_embed_weight() {
        return params["token_embedding.weight"];
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* input_ids,
                                struct ggml_tensor* custom_embed_weight) {
        // input_ids: [N, n_token]
@ -559,12 +608,12 @@ public:
        auto position_embed_weight = params["position_embedding.weight"];

        GGML_ASSERT(input_ids->ne[0] == position_embed_weight->ne[1]);
-        input_ids            = ggml_reshape_3d(ctx, input_ids, input_ids->ne[0], 1, input_ids->ne[1]);
-        auto token_embedding = ggml_get_rows(ctx, custom_embed_weight != NULL ? custom_embed_weight : token_embed_weight, input_ids);
-        token_embedding      = ggml_reshape_3d(ctx, token_embedding, token_embedding->ne[0], token_embedding->ne[1], token_embedding->ne[3]);
+        input_ids            = ggml_reshape_3d(ctx->ggml_ctx, input_ids, input_ids->ne[0], 1, input_ids->ne[1]);
+        auto token_embedding = ggml_get_rows(ctx->ggml_ctx, custom_embed_weight != nullptr ? custom_embed_weight : token_embed_weight, input_ids);
+        token_embedding      = ggml_reshape_3d(ctx->ggml_ctx, token_embedding, token_embedding->ne[0], token_embedding->ne[1], token_embedding->ne[3]);

        // token_embedding + position_embedding
-        auto x = ggml_add(ctx,
+        auto x = ggml_add(ctx->ggml_ctx,
                          token_embedding,
                          position_embed_weight);  // [N, n_token, embed_dim]
        return x;
@ -580,10 +629,14 @@ protected:
    int64_t num_patches;
    int64_t num_positions;

-    void init_params(struct ggml_context* ctx, ggml_type wtype) {
-        params["patch_embedding.weight"]    = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, patch_size, patch_size, num_channels, embed_dim);
-        params["class_embedding"]           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, embed_dim);
-        params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, embed_dim, num_positions);
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+        enum ggml_type patch_wtype    = GGML_TYPE_F16;
+        enum ggml_type class_wtype    = GGML_TYPE_F32;
+        enum ggml_type position_wtype = GGML_TYPE_F32;
+
+        params["patch_embedding.weight"]    = ggml_new_tensor_4d(ctx, patch_wtype, patch_size, patch_size, num_channels, embed_dim);
+        params["class_embedding"]           = ggml_new_tensor_1d(ctx, class_wtype, embed_dim);
+        params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, position_wtype, embed_dim, num_positions);
    }

 public:
@ -599,7 +652,7 @@ public:
        num_positions = num_patches + 1;
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* pixel_values) {
        // pixel_values: [N, num_channels, image_size, image_size]
        // return: [N, num_positions, embed_dim]
        GGML_ASSERT(pixel_values->ne[0] == image_size && pixel_values->ne[1] == image_size && pixel_values->ne[2] == num_channels);
@ -611,18 +664,18 @@ public:
        // concat(patch_embedding, class_embedding) + position_embedding
        struct ggml_tensor* patch_embedding;
        int64_t N       = pixel_values->ne[3];
-        patch_embedding = ggml_nn_conv_2d(ctx, pixel_values, patch_embed_weight, NULL, patch_size, patch_size);  // [N, embed_dim, image_size // pacht_size, image_size // pacht_size]
-        patch_embedding = ggml_reshape_3d(ctx, patch_embedding, num_patches, embed_dim, N);                      // [N, embed_dim, num_patches]
-        patch_embedding = ggml_cont(ctx, ggml_permute(ctx, patch_embedding, 1, 0, 2, 3));                        // [N, num_patches, embed_dim]
-        patch_embedding = ggml_reshape_4d(ctx, patch_embedding, 1, embed_dim, num_patches, N);                   // [N, num_patches, embed_dim, 1]
+        patch_embedding = ggml_ext_conv_2d(ctx->ggml_ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size);  // [N, embed_dim, image_size // pacht_size, image_size // pacht_size]
+        patch_embedding = ggml_reshape_3d(ctx->ggml_ctx, patch_embedding, num_patches, embed_dim, N);                          // [N, embed_dim, num_patches]
+        patch_embedding = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, patch_embedding, 1, 0, 2, 3));                  // [N, num_patches, embed_dim]
+        patch_embedding = ggml_reshape_4d(ctx->ggml_ctx, patch_embedding, 1, embed_dim, num_patches, N);                       // [N, num_patches, embed_dim, 1]

-        struct ggml_tensor* class_embedding = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, embed_dim, N);
-        class_embedding                     = ggml_repeat(ctx, class_embed_weight, class_embedding);      // [N, embed_dim]
-        class_embedding                     = ggml_reshape_4d(ctx, class_embedding, 1, embed_dim, 1, N);  // [N, 1, embed_dim, 1]
+        struct ggml_tensor* class_embedding = ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, embed_dim, N);
+        class_embedding                     = ggml_repeat(ctx->ggml_ctx, class_embed_weight, class_embedding);      // [N, embed_dim]
+        class_embedding                     = ggml_reshape_4d(ctx->ggml_ctx, class_embedding, 1, embed_dim, 1, N);  // [N, 1, embed_dim, 1]

-        struct ggml_tensor* x = ggml_concat(ctx, class_embedding, patch_embedding, 2);  // [N, num_positions, embed_dim, 1]
-        x                     = ggml_reshape_3d(ctx, x, embed_dim, num_positions, N);   // [N, num_positions, embed_dim]
-        x                     = ggml_add(ctx, x, position_embed_weight);
+        struct ggml_tensor* x = ggml_concat(ctx->ggml_ctx, class_embedding, patch_embedding, 2);  // [N, num_positions, embed_dim, 1]
+        x                     = ggml_reshape_3d(ctx->ggml_ctx, x, embed_dim, num_positions, N);   // [N, num_positions, embed_dim]
+        x                     = ggml_add(ctx->ggml_ctx, x, position_embed_weight);
        return x;  // [N, num_positions, embed_dim]
    }
 };
@ -639,9 +692,10 @@ enum CLIPVersion {

 class CLIPTextModel : public GGMLBlock {
 protected:
-    void init_params(struct ggml_context* ctx, ggml_type wtype) {
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
        if (version == OPEN_CLIP_VIT_BIGG_14) {
-            params["text_projection"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, projection_dim, hidden_size);
+            enum ggml_type wtype      = GGML_TYPE_F32;
+            params["text_projection"] = ggml_new_tensor_2d(ctx, wtype, projection_dim, hidden_size);
        }
    }

@ -655,12 +709,12 @@ public:
    int32_t n_head            = 12;
    int32_t n_layer           = 12;    // num_hidden_layers
    int32_t projection_dim    = 1280;  // only for OPEN_CLIP_VIT_BIGG_14
-    int32_t clip_skip         = -1;
    bool with_final_ln        = true;

    CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
-                  int clip_skip_value = -1,
-                  bool with_final_ln  = true)
+                  bool with_final_ln  = true,
+                  bool force_clip_f32 = false,
+                  bool proj_in        = false)
        : version(version), with_final_ln(with_final_ln) {
        if (version == OPEN_CLIP_VIT_H_14) {
            hidden_size       = 1024;
@ -673,30 +727,23 @@ public:
            n_head            = 20;
            n_layer           = 32;
        }
-        set_clip_skip(clip_skip_value);

-        blocks["embeddings"]       = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token));
-        blocks["encoder"]          = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
+        blocks["embeddings"]       = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token, force_clip_f32));
+        blocks["encoder"]          = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size, proj_in));
        blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
    }

-    void set_clip_skip(int skip) {
-        if (skip <= 0) {
-            return;
-        }
-        clip_skip = skip;
-    }
-
    struct ggml_tensor* get_token_embed_weight() {
        auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
        return embeddings->get_token_embed_weight();
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* input_ids,
                                struct ggml_tensor* tkn_embeddings,
                                size_t max_token_idx = 0,
-                                bool return_pooled   = false) {
+                                bool return_pooled   = false,
+                                int clip_skip        = -1) {
        // input_ids: [N, n_token]
        auto embeddings       = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
        auto encoder          = std::dynamic_pointer_cast<CLIPEncoder>(blocks["encoder"]);
@ -710,11 +757,11 @@ public:

        if (return_pooled) {
            auto text_projection = params["text_projection"];
-            ggml_tensor* pooled  = ggml_view_1d(ctx, x, hidden_size, x->nb[1] * max_token_idx);
-            if (text_projection != NULL) {
-                pooled           = ggml_nn_linear(ctx, pooled, text_projection, NULL);
+            ggml_tensor* pooled  = ggml_view_1d(ctx->ggml_ctx, x, hidden_size, x->nb[1] * max_token_idx);
+            if (text_projection != nullptr) {
+                pooled = ggml_ext_linear(ctx->ggml_ctx, pooled, text_projection, nullptr);
            } else {
-                LOG_DEBUG("Missing text_projection matrix, assuming identity...");
+                LOG_DEBUG("identity projection");
            }
            return pooled;  // [hidden_size, 1, 1]
        }
@ -736,7 +783,7 @@ public:
    int32_t n_layer           = 24;

 public:
-    CLIPVisionModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14) {
+    CLIPVisionModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14, bool proj_in = false) {
        if (version == OPEN_CLIP_VIT_H_14) {
            hidden_size       = 1280;
            intermediate_size = 5120;
@ -751,11 +798,14 @@ public:

        blocks["embeddings"]     = std::shared_ptr<GGMLBlock>(new CLIPVisionEmbeddings(hidden_size, num_channels, patch_size, image_size));
        blocks["pre_layernorm"]  = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
-        blocks["encoder"]        = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
+        blocks["encoder"]        = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size, proj_in));
        blocks["post_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values, bool return_pooled = true) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                struct ggml_tensor* pixel_values,
+                                bool return_pooled = true,
+                                int clip_skip      = -1) {
        // pixel_values: [N, num_channels, image_size, image_size]
        auto embeddings     = std::dynamic_pointer_cast<CLIPVisionEmbeddings>(blocks["embeddings"]);
        auto pre_layernorm  = std::dynamic_pointer_cast<LayerNorm>(blocks["pre_layernorm"]);
@ -764,15 +814,18 @@ public:

        auto x = embeddings->forward(ctx, pixel_values);  // [N, num_positions, embed_dim]
        x      = pre_layernorm->forward(ctx, x);
-        x      = encoder->forward(ctx, x, -1, false);
+        x      = encoder->forward(ctx, x, clip_skip, false);
+        // print_ggml_tensor(x, true, "ClipVisionModel x: ");
+        auto last_hidden_state = x;
        x                      = post_layernorm->forward(ctx, x);  // [N, n_token, hidden_size]

        GGML_ASSERT(x->ne[3] == 1);
        if (return_pooled) {
-            ggml_tensor* pooled = ggml_cont(ctx, ggml_view_2d(ctx, x, x->ne[0], x->ne[2], x->nb[2], 0));
+            ggml_tensor* pooled = ggml_cont(ctx->ggml_ctx, ggml_view_2d(ctx->ggml_ctx, x, x->ne[0], x->ne[2], x->nb[2], 0));
            return pooled;  // [N, hidden_size]
        } else {
-            return x;  // [N, n_token, hidden_size]
+            // return x;  // [N, n_token, hidden_size]
+            return last_hidden_state;  // [N, n_token, hidden_size]
        }
    }
 };
@ -783,9 +836,9 @@ protected:
    int64_t out_features;
    bool transpose_weight;

-    void init_params(struct ggml_context* ctx, ggml_type wtype) {
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+        enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32);
        if (transpose_weight) {
-            LOG_ERROR("transpose_weight");
            params["weight"] = ggml_new_tensor_2d(ctx, wtype, out_features, in_features);
        } else {
            params["weight"] = ggml_new_tensor_2d(ctx, wtype, in_features, out_features);
@ -800,12 +853,12 @@ public:
          out_features(out_features),
          transpose_weight(transpose_weight) {}

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
        struct ggml_tensor* w = params["weight"];
        if (transpose_weight) {
-            w = ggml_cont(ctx, ggml_transpose(ctx, w));
+            w = ggml_cont(ctx->ggml_ctx, ggml_transpose(ctx->ggml_ctx, w));
        }
-        return ggml_nn_linear(ctx, x, w, NULL);
+        return ggml_ext_linear(ctx->ggml_ctx, x, w, nullptr);
    }
 };

@ -817,7 +870,8 @@ public:

 public:
    CLIPVisionModelProjection(CLIPVersion version   = OPENAI_CLIP_VIT_L_14,
-                              bool transpose_proj_w = false) {
+                              bool transpose_proj_w = false,
+                              bool proj_in          = false) {
        if (version == OPEN_CLIP_VIT_H_14) {
            hidden_size    = 1280;
            projection_dim = 1024;
@ -825,20 +879,26 @@ public:
            hidden_size = 1664;
        }

-        blocks["vision_model"]      = std::shared_ptr<GGMLBlock>(new CLIPVisionModel(version));
+        blocks["vision_model"]      = std::shared_ptr<GGMLBlock>(new CLIPVisionModel(version, proj_in));
        blocks["visual_projection"] = std::shared_ptr<GGMLBlock>(new CLIPProjection(hidden_size, projection_dim, transpose_proj_w));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                struct ggml_tensor* pixel_values,
+                                bool return_pooled = true,
+                                int clip_skip      = -1) {
        // pixel_values: [N, num_channels, image_size, image_size]
-        // return: [N, projection_dim]
+        // return: [N, projection_dim] if return_pooled else [N, n_token, hidden_size]
        auto vision_model      = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]);
        auto visual_projection = std::dynamic_pointer_cast<CLIPProjection>(blocks["visual_projection"]);

-        auto x = vision_model->forward(ctx, pixel_values);  // [N, hidden_size]
-        x      = visual_projection->forward(ctx, x);        // [N, projection_dim]
+        auto x = vision_model->forward(ctx, pixel_values, return_pooled, clip_skip);  // [N, hidden_size] or [N, n_token, hidden_size]

-        return x;  // [N, projection_dim]
+        if (return_pooled) {
+            x = visual_projection->forward(ctx, x);  // [N, projection_dim]
+        }
+
+        return x;
    }
 };

@ -846,83 +906,97 @@ struct CLIPTextModelRunner : public GGMLRunner {
    CLIPTextModel model;

    CLIPTextModelRunner(ggml_backend_t backend,
-                        ggml_type wtype,
+                        bool offload_params_to_cpu,
+                        const String2TensorStorage& tensor_storage_map,
+                        const std::string prefix,
                        CLIPVersion version = OPENAI_CLIP_VIT_L_14,
-                        int clip_skip_value = 1,
-                        bool with_final_ln  = true)
-        : GGMLRunner(backend, wtype), model(version, clip_skip_value, with_final_ln) {
-        model.init(params_ctx, wtype);
+                        bool with_final_ln  = true,
+                        bool force_clip_f32 = false)
+        : GGMLRunner(backend, offload_params_to_cpu) {
+        bool proj_in = false;
+        for (const auto& [name, tensor_storage] : tensor_storage_map) {
+            if (!starts_with(name, prefix)) {
+                continue;
+            }
+            if (contains(name, "self_attn.in_proj")) {
+                proj_in = true;
+                break;
+            }
+        }
+        model = CLIPTextModel(version, with_final_ln, force_clip_f32, proj_in);
+        model.init(params_ctx, tensor_storage_map, prefix);
    }

-    std::string get_desc() {
+    std::string get_desc() override {
        return "clip";
    }

-    void set_clip_skip(int clip_skip) {
-        model.set_clip_skip(clip_skip);
-    }
-
    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
        model.get_param_tensors(tensors, prefix);
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* input_ids,
                                struct ggml_tensor* embeddings,
                                size_t max_token_idx = 0,
-                                bool return_pooled   = false) {
+                                bool return_pooled   = false,
+                                int clip_skip        = -1) {
        size_t N       = input_ids->ne[1];
        size_t n_token = input_ids->ne[0];
        if (input_ids->ne[0] > model.n_token) {
            GGML_ASSERT(input_ids->ne[0] % model.n_token == 0);
-            input_ids = ggml_reshape_2d(ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
+            input_ids = ggml_reshape_2d(ctx->ggml_ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
        }

-        return model.forward(ctx, input_ids, embeddings, max_token_idx, return_pooled);
+        return model.forward(ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
    }

    struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
                                    int num_custom_embeddings    = 0,
-                                    void* custom_embeddings_data = NULL,
+                                    void* custom_embeddings_data = nullptr,
                                    size_t max_token_idx         = 0,
-                                    bool return_pooled           = false) {
-        struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
+                                    bool return_pooled           = false,
+                                    int clip_skip                = -1) {
+        struct ggml_cgraph* gf = new_graph_custom(2048);

        input_ids = to_backend(input_ids);

-        struct ggml_tensor* embeddings = NULL;
+        struct ggml_tensor* embeddings = nullptr;

-        if (num_custom_embeddings > 0 && custom_embeddings_data != NULL) {
+        if (num_custom_embeddings > 0 && custom_embeddings_data != nullptr) {
+            auto token_embed_weight = model.get_token_embed_weight();
            auto custom_embeddings  = ggml_new_tensor_2d(compute_ctx,
-                                                        wtype,
+                                                         token_embed_weight->type,
                                                         model.hidden_size,
                                                         num_custom_embeddings);
            set_backend_tensor_data(custom_embeddings, custom_embeddings_data);

-            auto token_embed_weight = model.get_token_embed_weight();
            // concatenate custom embeddings
            embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
        }

-        struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, embeddings, max_token_idx, return_pooled);
+        auto runner_ctx = get_context();
+
+        struct ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);

        ggml_build_forward_expand(gf, hidden_states);

        return gf;
    }

-    void compute(const int n_threads,
+    bool compute(const int n_threads,
                 struct ggml_tensor* input_ids,
                 int num_custom_embeddings,
                 void* custom_embeddings_data,
                 size_t max_token_idx,
                 bool return_pooled,
+                 int clip_skip,
                 ggml_tensor** output,
-                 ggml_context* output_ctx = NULL) {
+                 ggml_context* output_ctx = nullptr) {
        auto get_graph = [&]() -> struct ggml_cgraph* {
-            return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled);
+            return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip);
        };
-        GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
+        return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
    }
 };

--- a/common.hpp
+++ b/common.hpp
@ -23,12 +23,12 @@ public:
        }
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [N, channels, h, w]
        if (vae_downsample) {
            auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);

-            x = ggml_pad(ctx, x, 1, 1, 0, 0);
+            x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0);
            x = conv->forward(ctx, x);
        } else {
            auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["op"]);
@ -52,11 +52,11 @@ public:
        blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [N, channels, h, w]
        auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);

-        x = ggml_upscale(ctx, x, 2);  // [N, channels, h*2, w*2]
+        x = ggml_upscale(ctx->ggml_ctx, x, 2, GGML_SCALE_MODE_NEAREST);  // [N, channels, h*2, w*2]
        x = conv->forward(ctx, x);                                       // [N, out_channels, h*2, w*2]
        return x;
    }
@ -121,7 +121,7 @@ public:
        }
    }

-    virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = NULL) {
+    virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = nullptr) {
        // For dims==3, we reduce dimension from 5d to 4d by merging h and w, in order not to change ggml
        // [N, c, t, h, w] => [N, c, t, h * w]
        // x: [N, channels, h, w] if dims == 2 else [N, channels, t, h, w]
@ -131,38 +131,38 @@ public:
        auto out_layers_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["out_layers.0"]);
        auto out_layers_3 = std::dynamic_pointer_cast<UnaryBlock>(blocks["out_layers.3"]);

-        if (emb == NULL) {
+        if (emb == nullptr) {
            GGML_ASSERT(skip_t_emb);
        }

        // in_layers
        auto h = in_layers_0->forward(ctx, x);
-        h      = ggml_silu_inplace(ctx, h);
+        h      = ggml_silu_inplace(ctx->ggml_ctx, h);
        h      = in_layers_2->forward(ctx, h);  // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]

        // emb_layers
        if (!skip_t_emb) {
            auto emb_layer_1 = std::dynamic_pointer_cast<Linear>(blocks["emb_layers.1"]);

-            auto emb_out = ggml_silu(ctx, emb);
+            auto emb_out = ggml_silu(ctx->ggml_ctx, emb);
            emb_out      = emb_layer_1->forward(ctx, emb_out);  // [N, out_channels] if dims == 2 else [N, t, out_channels]

            if (dims == 2) {
-                emb_out = ggml_reshape_4d(ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]);  // [N, out_channels, 1, 1]
+                emb_out = ggml_reshape_4d(ctx->ggml_ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]);  // [N, out_channels, 1, 1]
            } else {
-                emb_out = ggml_reshape_4d(ctx, emb_out, 1, emb_out->ne[0], emb_out->ne[1], emb_out->ne[2]);  // [N, t, out_channels, 1]
+                emb_out = ggml_reshape_4d(ctx->ggml_ctx, emb_out, 1, emb_out->ne[0], emb_out->ne[1], emb_out->ne[2]);  // [N, t, out_channels, 1]
                if (exchange_temb_dims) {
                    // emb_out = rearrange(emb_out, "b t c ... -> b c t ...")
-                    emb_out = ggml_cont(ctx, ggml_permute(ctx, emb_out, 0, 2, 1, 3));  // [N, out_channels, t, 1]
+                    emb_out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, emb_out, 0, 2, 1, 3));  // [N, out_channels, t, 1]
                }
            }

-            h = ggml_add(ctx, h, emb_out);  // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
+            h = ggml_add(ctx->ggml_ctx, h, emb_out);  // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
        }

        // out_layers
        h = out_layers_0->forward(ctx, h);
-        h = ggml_silu_inplace(ctx, h);
+        h = ggml_silu_inplace(ctx->ggml_ctx, h);
        // dropout, skip for inference
        h = out_layers_3->forward(ctx, h);

@ -172,65 +172,95 @@ public:
            x                    = skip_connection->forward(ctx, x);  // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
        }

-        h = ggml_add(ctx, h, x);
+        h = ggml_add(ctx->ggml_ctx, h, x);
        return h;  // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
    }
 };

-class GEGLU : public GGMLBlock {
+class GEGLU : public UnaryBlock {
 protected:
    int64_t dim_in;
    int64_t dim_out;

-    void init_params(struct ggml_context* ctx, ggml_type wtype) {
-        params["proj.weight"] = ggml_new_tensor_2d(ctx, wtype, dim_in, dim_out * 2);
-        params["proj.bias"]   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, dim_out * 2);
-    }
-
 public:
    GEGLU(int64_t dim_in, int64_t dim_out)
-        : dim_in(dim_in), dim_out(dim_out) {}
+        : dim_in(dim_in), dim_out(dim_out) {
+        blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out * 2));
+    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
        // x: [ne3, ne2, ne1, dim_in]
        // return: [ne3, ne2, ne1, dim_out]
-        struct ggml_tensor* w = params["proj.weight"];
-        struct ggml_tensor* b = params["proj.bias"];
+        auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);

-        auto x_w    = ggml_view_2d(ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], 0);                        // [dim_out, dim_in]
-        auto x_b    = ggml_view_1d(ctx, b, b->ne[0] / 2, 0);                                            // [dim_out, dim_in]
-        auto gate_w = ggml_view_2d(ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], w->nb[1] * w->ne[1] / 2);  // [dim_out, ]
-        auto gate_b = ggml_view_1d(ctx, b, b->ne[0] / 2, b->nb[0] * b->ne[0] / 2);                      // [dim_out, ]
+        x          = proj->forward(ctx, x);  // [ne3, ne2, ne1, dim_out*2]
+        auto x_vec = ggml_ext_chunk(ctx->ggml_ctx, x, 2, 0);
+        x          = x_vec[0];  // [ne3, ne2, ne1, dim_out]
+        auto gate  = x_vec[1];  // [ne3, ne2, ne1, dim_out]

-        auto x_in = x;
-        x         = ggml_nn_linear(ctx, x_in, x_w, x_b);        // [ne3, ne2, ne1, dim_out]
-        auto gate = ggml_nn_linear(ctx, x_in, gate_w, gate_b);  // [ne3, ne2, ne1, dim_out]
+        gate = ggml_gelu_inplace(ctx->ggml_ctx, gate);

-        gate = ggml_gelu_inplace(ctx, gate);
+        x = ggml_mul(ctx->ggml_ctx, x, gate);  // [ne3, ne2, ne1, dim_out]

-        x = ggml_mul(ctx, x, gate);  // [ne3, ne2, ne1, dim_out]
+        return x;
+    }
+};

+class GELU : public UnaryBlock {
+public:
+    GELU(int64_t dim_in, int64_t dim_out, bool bias = true) {
+        blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out, bias));
+    }
+
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
+        // x: [ne3, ne2, ne1, dim_in]
+        // return: [ne3, ne2, ne1, dim_out]
+        auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
+
+        x = proj->forward(ctx, x);
+        x = ggml_gelu_inplace(ctx->ggml_ctx, x);
        return x;
    }
 };

 class FeedForward : public GGMLBlock {
 public:
+    enum class Activation {
+        GEGLU,
+        GELU
+    };
    FeedForward(int64_t dim,
                int64_t dim_out,
-                int64_t mult = 4) {
+                int64_t mult          = 4,
+                Activation activation = Activation::GEGLU,
+                bool precision_fix    = false) {
        int64_t inner_dim = dim * mult;
-
+        if (activation == Activation::GELU) {
+            blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GELU(dim, inner_dim));
+        } else {
            blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GEGLU(dim, inner_dim));
-        // net_1 is nn.Dropout(), skip for inference
-        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out));
        }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+        // net_1 is nn.Dropout(), skip for inference
+        bool force_prec_f32 = false;
+        float scale         = 1.f;
+        if (precision_fix) {
+            scale = 1.f / 128.f;
+#ifdef SD_USE_VULKAN
+            force_prec_f32 = true;
+#endif
+        }
+        // The purpose of the scale here is to prevent NaN issues in certain situations.
+        // For example, when using Vulkan without enabling force_prec_f32,
+        // or when using CUDA but the weights are k-quants.
+        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, force_prec_f32, scale));
+    }
+
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [ne3, ne2, ne1, dim]
        // return: [ne3, ne2, ne1, dim_out]

-        auto net_0 = std::dynamic_pointer_cast<GEGLU>(blocks["net.0"]);
+        auto net_0 = std::dynamic_pointer_cast<UnaryBlock>(blocks["net.0"]);
        auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);

        x = net_0->forward(ctx, x);  // [ne3, ne2, ne1, inner_dim]
@ -265,7 +295,9 @@ public:
        // to_out_1 is nn.Dropout(), skip for inference
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                struct ggml_tensor* x,
+                                struct ggml_tensor* context) {
        // x: [N, n_token, query_dim]
        // context: [N, n_context, context_dim]
        // return: [N, n_token, query_dim]
@ -283,7 +315,7 @@ public:
        auto k = to_k->forward(ctx, context);  // [N, n_context, inner_dim]
        auto v = to_v->forward(ctx, context);  // [N, n_context, inner_dim]

-        x = ggml_nn_attention_ext(ctx, q, k, v, n_head, NULL, false);  // [N, n_token, inner_dim]
+        x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, false, ctx->flash_attn_enabled);  // [N, n_token, inner_dim]

        x = to_out_0->forward(ctx, x);  // [N, n_token, query_dim]
        return x;
@ -321,7 +353,9 @@ public:
        }
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                struct ggml_tensor* x,
+                                struct ggml_tensor* context) {
        // x: [N, n_token, query_dim]
        // context: [N, n_context, context_dim]
        // return: [N, n_token, query_dim]
@ -341,21 +375,21 @@ public:
            x           = norm_in->forward(ctx, x);
            x           = ff_in->forward(ctx, x);
            // self.is_res is always True
-            x = ggml_add(ctx, x, x_skip);
+            x = ggml_add(ctx->ggml_ctx, x, x_skip);
        }

        auto r = x;
        x      = norm1->forward(ctx, x);
        x      = attn1->forward(ctx, x, x);  // self-attention
-        x      = ggml_add(ctx, x, r);
+        x      = ggml_add(ctx->ggml_ctx, x, r);
        r      = x;
        x      = norm2->forward(ctx, x);
        x      = attn2->forward(ctx, x, context);  // cross-attention
-        x      = ggml_add(ctx, x, r);
+        x      = ggml_add(ctx->ggml_ctx, x, r);
        r      = x;
        x      = norm3->forward(ctx, x);
        x      = ff->forward(ctx, x);
-        x      = ggml_add(ctx, x, r);
+        x      = ggml_add(ctx->ggml_ctx, x, r);

        return x;
    }
@ -368,38 +402,66 @@ protected:
    int64_t d_head;
    int64_t depth       = 1;    // 1
    int64_t context_dim = 768;  // hidden_size, 1024 for VERSION_SD2
+    bool use_linear     = false;
+
+    // Reconciles proj_in/proj_out with tensor_storage_map: if "<prefix>proj_out.weight" is stored as a
+    // 4-D tensor the projections become 1x1 Conv2d, if it is 2-D they become Linear. When the tensor is
+    // missing, or already matches use_linear, the blocks created by the constructor are kept.
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+        auto iter = tensor_storage_map.find(prefix + "proj_out.weight");
+        if (iter != tensor_storage_map.end()) {
+            int64_t inner_dim = n_head * d_head;
+            if (iter->second.n_dims == 4 && use_linear) {
+                // stored weight is convolutional but Linear was requested: switch to Conv2d
+                use_linear         = false;
+                blocks["proj_in"]  = std::make_shared<Conv2d>(in_channels, inner_dim, std::pair{1, 1});
+                blocks["proj_out"] = std::make_shared<Conv2d>(inner_dim, in_channels, std::pair{1, 1});
+            } else if (iter->second.n_dims == 2 && !use_linear) {
+                // stored weight is a matrix but Conv2d was requested: switch to Linear
+                use_linear         = true;
+                blocks["proj_in"]  = std::make_shared<Linear>(in_channels, inner_dim);
+                blocks["proj_out"] = std::make_shared<Linear>(inner_dim, in_channels);
+            }
+        }
+    }

 public:
    SpatialTransformer(int64_t in_channels,
                       int64_t n_head,
                       int64_t d_head,
                       int64_t depth,
-                       int64_t context_dim)
+                       int64_t context_dim,
+                       bool use_linear)  // true: proj_in/proj_out are Linear; false: 1x1 Conv2d
        : in_channels(in_channels),
          n_head(n_head),
          d_head(d_head),
          depth(depth),
-          context_dim(context_dim) {
-        // We will convert unet transformer linear to conv2d 1x1 when loading the weights, so use_linear is always False
+          context_dim(context_dim),
+          use_linear(use_linear) {  // initial choice only; init_params() may flip it based on the stored proj_out.weight rank
        // disable_self_attn is always False
        int64_t inner_dim = n_head * d_head;  // in_channels
        blocks["norm"]    = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
+        if (use_linear) {  // input projection: Linear when requested, 1x1 Conv2d otherwise
+            blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, inner_dim));
+        } else {
            blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}));
+        }

        for (int i = 0; i < depth; i++) {
            std::string name = "transformer_blocks." + std::to_string(i);
-            blocks[name]     = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim));
+            blocks[name]     = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false));  // NOTE(review): the trailing false is defined by BasicTransformerBlock's constructor, not visible here
        }

+        if (use_linear) {  // output projection mirrors proj_in: inner_dim back to in_channels
+            blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, in_channels));
+        } else {
            blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}));
        }
+    }

-    virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
+    virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                        struct ggml_tensor* x,
+                                        struct ggml_tensor* context) {
        // x: [N, in_channels, h, w]
        // context: [N, max_position(aka n_token), hidden_size(aka context_dim)]
        auto norm     = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
-        auto proj_in  = std::dynamic_pointer_cast<Conv2d>(blocks["proj_in"]);
-        auto proj_out = std::dynamic_pointer_cast<Conv2d>(blocks["proj_out"]);
+        auto proj_in  = std::dynamic_pointer_cast<UnaryBlock>(blocks["proj_in"]);
+        auto proj_out = std::dynamic_pointer_cast<UnaryBlock>(blocks["proj_out"]);

        auto x_in         = x;
        int64_t n         = x->ne[3];
@ -408,10 +470,15 @@ public:
        int64_t inner_dim = n_head * d_head;

        x = norm->forward(ctx, x);
+        if (use_linear) {
+            x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3));  // [N, h, w, in_channels] (proj_in has not run yet on this path)
+            x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n);                // [N, h * w, inner_dim]; NOTE(review): reshaping with inner_dim before proj_in assumes inner_dim == in_channels
            x = proj_in->forward(ctx, x);                                              // Linear path: input is the flattened [N, h * w, C] layout; presumably returns [N, h * w, inner_dim]
-
-        x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 2, 0, 3));  // [N, h, w, inner_dim]
-        x = ggml_reshape_3d(ctx, x, inner_dim, w * h, n);      // [N, h * w, inner_dim]
+        } else {
+            x = proj_in->forward(ctx, x);                                              // [N, inner_dim, h, w]
+            x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3));  // [N, h, w, inner_dim]
+            x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n);                // [N, h * w, inner_dim]
+        }

        for (int i = 0; i < depth; i++) {
            std::string name       = "transformer_blocks." + std::to_string(i);
@ -420,27 +487,37 @@ public:
            x = transformer_block->forward(ctx, x, context);
        }

-        x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3));  // [N, inner_dim, h * w]
-        x = ggml_reshape_4d(ctx, x, w, h, inner_dim, n);       // [N, inner_dim, h, w]
-
+        if (use_linear) {
            // proj_out
            x = proj_out->forward(ctx, x);  // Linear path: input is still [N, h * w, inner_dim]; presumably returns [N, h * w, in_channels]

-        x = ggml_add(ctx, x, x_in);
+            x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3));  // [N, C, h * w], where C is proj_out's output channel count
+            x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n);                 // [N, inner_dim, h, w]; NOTE(review): uses inner_dim after proj_out, so it assumes inner_dim == in_channels
+        } else {
+            x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3));  // [N, inner_dim, h * w]
+            x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n);                 // [N, inner_dim, h, w]
+
+            // proj_out
+            x = proj_out->forward(ctx, x);  // [N, in_channels, h, w]
+        }
+
+        x = ggml_add(ctx->ggml_ctx, x, x_in);
        return x;
    }
 };

 class AlphaBlender : public GGMLBlock {
 protected:
-    void init_params(struct ggml_context* ctx, ggml_type wtype) {
-        params["mix_factor"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
+        // "mix_factor" is always allocated as a 1-element F32 tensor; tensor_storage_map and prefix are not consulted here
+        enum ggml_type wtype = GGML_TYPE_F32;
+        params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
    }

    float get_alpha() {
        // image_only_indicator is always tensor([0.]) and since mix_factor.shape is [1,]
        // so learned_with_images is same as learned
-        float alpha = ggml_backend_tensor_get_f32(params["mix_factor"]);
+        float alpha = ggml_ext_backend_tensor_get_f32(params["mix_factor"]);
        return sigmoid(alpha);
    }

@ -451,14 +528,14 @@ public:
        // since mix_factor.shape is [1,], we don't need rearrange using  rearrange_pattern
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x_spatial,
                                struct ggml_tensor* x_temporal) {
        // image_only_indicator is always tensor([0.])
        float alpha = get_alpha();
-        auto x      = ggml_add(ctx,
-                               ggml_scale(ctx, x_spatial, alpha),
-                               ggml_scale(ctx, x_temporal, 1.0f - alpha));
+        auto x      = ggml_add(ctx->ggml_ctx,
+                               ggml_scale(ctx->ggml_ctx, x_spatial, alpha),
+                               ggml_scale(ctx->ggml_ctx, x_temporal, 1.0f - alpha));
        return x;
    }
 };
@ -476,7 +553,7 @@ public:
        blocks["time_mixer"] = std::shared_ptr<GGMLBlock>(new AlphaBlender());
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x,
                                struct ggml_tensor* emb,
                                int num_video_frames) {
@ -494,18 +571,18 @@ public:
        int64_t H = x->ne[1];
        int64_t W = x->ne[0];

-        x          = ggml_reshape_4d(ctx, x, W * H, C, T, B);           // (b t) c h w -> b t c (h w)
-        x          = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // b t c (h w) -> b c t (h w)
+        x          = ggml_reshape_4d(ctx->ggml_ctx, x, W * H, C, T, B);                     // (b t) c h w -> b t c (h w)
+        x          = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3));  // b t c (h w) -> b c t (h w)
        auto x_mix = x;

-        emb = ggml_reshape_4d(ctx, emb, emb->ne[0], T, B, emb->ne[3]);  // (b t) ... -> b t ...
+        emb = ggml_reshape_4d(ctx->ggml_ctx, emb, emb->ne[0], T, B, emb->ne[3]);  // (b t) ... -> b t ...

        x = time_stack->forward(ctx, x, emb);  // b t c (h w)

        x = time_mixer->forward(ctx, x_mix, x);  // b t c (h w)

-        x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // b c t (h w) -> b t c (h w)
-        x = ggml_reshape_4d(ctx, x, W, H, C, T * B);           // b t c (h w) -> (b t) c h w
+        x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3));  // b c t (h w) -> b t c (h w)
+        x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B);                     // b t c (h w) -> (b t) c h w

        return x;
    }
--- a/conditioner.hpp
+++ b/conditioner.hpp
--- a/control.hpp
+++ b/control.hpp
@ -27,6 +27,7 @@ protected:
    int num_heads                          = 8;
    int num_head_channels                  = -1;   // channels // num_heads
    int context_dim                        = 768;  // 1024 for VERSION_SD2, 2048 for VERSION_SDXL
+    bool use_linear_projection             = false;

 public:
    int model_channels  = 320;
@ -34,11 +35,11 @@ public:

    ControlNetBlock(SDVersion version = VERSION_SD1)
        : version(version) {
-        if (version == VERSION_SD2) {
+        if (sd_version_is_sd2(version)) {
            context_dim       = 1024;
            num_head_channels = 64;
            num_heads         = -1;
-        } else if (version == VERSION_SDXL) {
+        } else if (sd_version_is_sdxl(version)) {
            context_dim           = 2048;
            attention_resolutions = {4, 2};
            channel_mult          = {1, 2, 4};
@ -58,7 +59,7 @@ public:
        // time_embed_1 is nn.SiLU()
        blocks["time_embed.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));

-        if (version == VERSION_SDXL || version == VERSION_SVD) {
+        if (sd_version_is_sdxl(version) || version == VERSION_SVD) {
            blocks["label_emb.0.0"] = std::shared_ptr<GGMLBlock>(new Linear(adm_in_channels, time_embed_dim));
            // label_emb_1 is nn.SiLU()
            blocks["label_emb.0.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
@ -82,7 +83,7 @@ public:
                                       int64_t d_head,
                                       int64_t depth,
                                       int64_t context_dim) -> SpatialTransformer* {
-            return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim);
+            return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
        };

        auto make_zero_conv = [&](int64_t channels) {
@ -165,7 +166,7 @@ public:
    }

    struct ggml_tensor* resblock_forward(std::string name,
-                                         struct ggml_context* ctx,
+                                         GGMLRunnerContext* ctx,
                                         struct ggml_tensor* x,
                                         struct ggml_tensor* emb) {
        auto block = std::dynamic_pointer_cast<ResBlock>(blocks[name]);
@ -173,14 +174,14 @@ public:
    }

    struct ggml_tensor* attention_layer_forward(std::string name,
-                                                struct ggml_context* ctx,
+                                                GGMLRunnerContext* ctx,
                                                struct ggml_tensor* x,
                                                struct ggml_tensor* context) {
        auto block = std::dynamic_pointer_cast<SpatialTransformer>(blocks[name]);
        return block->forward(ctx, x, context);
    }

-    struct ggml_tensor* input_hint_block_forward(struct ggml_context* ctx,
+    struct ggml_tensor* input_hint_block_forward(GGMLRunnerContext* ctx,
                                                 struct ggml_tensor* hint,
                                                 struct ggml_tensor* emb,
                                                 struct ggml_tensor* context) {
@ -192,32 +193,32 @@ public:

                h = block->forward(ctx, h);
            } else {
-                h = ggml_silu_inplace(ctx, h);
+                h = ggml_silu_inplace(ctx->ggml_ctx, h);
            }
        }
        return h;
    }

-    std::vector<struct ggml_tensor*> forward(struct ggml_context* ctx,
+    std::vector<struct ggml_tensor*> forward(GGMLRunnerContext* ctx,
                                             struct ggml_tensor* x,
                                             struct ggml_tensor* hint,
                                             struct ggml_tensor* guided_hint,
                                             struct ggml_tensor* timesteps,
                                             struct ggml_tensor* context,
-                                             struct ggml_tensor* y = NULL) {
+                                             struct ggml_tensor* y = nullptr) {
        // x: [N, in_channels, h, w] or [N, in_channels/2, h, w]
        // timesteps: [N,]
        // context: [N, max_position, hidden_size] or [1, max_position, hidden_size]. for example, [N, 77, 768]
        // y: [N, adm_in_channels] or [1, adm_in_channels]
-        if (context != NULL) {
+        if (context != nullptr) {
            if (context->ne[2] != x->ne[3]) {
-                context = ggml_repeat(ctx, context, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
+                context = ggml_repeat(ctx->ggml_ctx, context, ggml_new_tensor_3d(ctx->ggml_ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
            }
        }

-        if (y != NULL) {
+        if (y != nullptr) {
            if (y->ne[1] != x->ne[3]) {
-                y = ggml_repeat(ctx, y, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, y->ne[0], x->ne[3]));
+                y = ggml_repeat(ctx->ggml_ctx, y, ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, y->ne[0], x->ne[3]));
            }
        }

@ -228,27 +229,27 @@ public:

        auto middle_block_out = std::dynamic_pointer_cast<Conv2d>(blocks["middle_block_out.0"]);

-        auto t_emb = ggml_nn_timestep_embedding(ctx, timesteps, model_channels);  // [N, model_channels]
+        auto t_emb = ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, model_channels);  // [N, model_channels]

        auto emb = time_embed_0->forward(ctx, t_emb);
-        emb      = ggml_silu_inplace(ctx, emb);
+        emb      = ggml_silu_inplace(ctx->ggml_ctx, emb);
        emb      = time_embed_2->forward(ctx, emb);  // [N, time_embed_dim]

        // SDXL/SVD
-        if (y != NULL) {
+        if (y != nullptr) {
            auto label_embed_0 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.0"]);
            auto label_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.2"]);

            auto label_emb = label_embed_0->forward(ctx, y);
-            label_emb      = ggml_silu_inplace(ctx, label_emb);
+            label_emb      = ggml_silu_inplace(ctx->ggml_ctx, label_emb);
            label_emb      = label_embed_2->forward(ctx, label_emb);  // [N, time_embed_dim]

-            emb = ggml_add(ctx, emb, label_emb);  // [N, time_embed_dim]
+            emb = ggml_add(ctx->ggml_ctx, emb, label_emb);  // [N, time_embed_dim]
        }

        std::vector<struct ggml_tensor*> outs;

-        if (guided_hint == NULL) {
+        if (guided_hint == nullptr) {
            guided_hint = input_hint_block_forward(ctx, hint, emb, context);
        }
        outs.push_back(guided_hint);
@ -257,7 +258,7 @@ public:

        // input block 0
        auto h = input_blocks_0_0->forward(ctx, x);
-        h      = ggml_add(ctx, h, guided_hint);
+        h      = ggml_add(ctx->ggml_ctx, h, guided_hint);
        outs.push_back(zero_convs_0->forward(ctx, h));

        // input block 1-11
@ -310,27 +311,28 @@ struct ControlNet : public GGMLRunner {
    SDVersion version = VERSION_SD1;
    ControlNetBlock control_net;

-    ggml_backend_buffer_t control_buffer = NULL;  // keep control output tensors in backend memory
-    ggml_context* control_ctx            = NULL;
+    ggml_backend_buffer_t control_buffer = nullptr;  // keep control output tensors in backend memory
+    ggml_context* control_ctx            = nullptr;
    std::vector<struct ggml_tensor*> controls;  // (12 input block outputs, 1 middle block output) SD 1.5
-    struct ggml_tensor* guided_hint = NULL;     // guided_hint cache, for faster inference
+    struct ggml_tensor* guided_hint = nullptr;  // guided_hint cache, for faster inference
    bool guided_hint_cached         = false;

    ControlNet(ggml_backend_t backend,
-               ggml_type wtype,
+               bool offload_params_to_cpu,
+               const String2TensorStorage& tensor_storage_map = {},
               SDVersion version                              = VERSION_SD1)
-        : GGMLRunner(backend, wtype), control_net(version) {
-        control_net.init(params_ctx, wtype);
+        // The `version` parameter shadows the `version` member, so the member has to be initialised
+        // explicitly; without it the member keeps its VERSION_SD1 default whatever the caller passes.
+        : GGMLRunner(backend, offload_params_to_cpu), version(version), control_net(version) {
+        control_net.init(params_ctx, tensor_storage_map, "");  // empty prefix: tensor names are looked up unprefixed
    }

-    ~ControlNet() {
+    ~ControlNet() override {
        free_control_ctx();
    }

    void alloc_control_ctx(std::vector<struct ggml_tensor*> outs) {
        struct ggml_init_params params;
        params.mem_size   = static_cast<size_t>(outs.size() * ggml_tensor_overhead()) + 1024 * 1024;
-        params.mem_buffer = NULL;
+        params.mem_buffer = nullptr;
        params.no_alloc   = true;
        control_ctx       = ggml_init(params);

@ -346,26 +348,26 @@ struct ControlNet : public GGMLRunner {
            control_buffer_size += ggml_nbytes(controls[i]);
        }

-        control_buffer = ggml_backend_alloc_ctx_tensors(control_ctx, backend);
+        control_buffer = ggml_backend_alloc_ctx_tensors(control_ctx, runtime_backend);

        LOG_DEBUG("control buffer size %.2fMB", control_buffer_size * 1.f / 1024.f / 1024.f);
    }

    void free_control_ctx() {
-        if (control_buffer != NULL) {
+        if (control_buffer != nullptr) {
            ggml_backend_buffer_free(control_buffer);
-            control_buffer = NULL;
+            control_buffer = nullptr;
        }
-        if (control_ctx != NULL) {
+        if (control_ctx != nullptr) {
            ggml_free(control_ctx);
-            control_ctx = NULL;
+            control_ctx = nullptr;
        }
-        guided_hint        = NULL;
+        guided_hint        = nullptr;
        guided_hint_cached = false;
        controls.clear();
    }

-    std::string get_desc() {
+    std::string get_desc() override {
        return "control_net";
    }

@ -377,12 +379,12 @@ struct ControlNet : public GGMLRunner {
                                    struct ggml_tensor* hint,
                                    struct ggml_tensor* timesteps,
                                    struct ggml_tensor* context,
-                                    struct ggml_tensor* y = NULL) {
-        struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, CONTROL_NET_GRAPH_SIZE, false);
+                                    struct ggml_tensor* y = nullptr) {
+        struct ggml_cgraph* gf = new_graph_custom(CONTROL_NET_GRAPH_SIZE);

        x = to_backend(x);
        if (guided_hint_cached) {
-            hint = NULL;
+            hint = nullptr;
        } else {
            hint = to_backend(hint);
        }
@ -390,15 +392,17 @@ struct ControlNet : public GGMLRunner {
        y         = to_backend(y);
        timesteps = to_backend(timesteps);

-        auto outs = control_net.forward(compute_ctx,
+        auto runner_ctx = get_context();
+
+        auto outs = control_net.forward(&runner_ctx,
                                        x,
                                        hint,
-                                        guided_hint_cached ? guided_hint : NULL,
+                                        guided_hint_cached ? guided_hint : nullptr,
                                        timesteps,
                                        context,
                                        y);

-        if (control_ctx == NULL) {
+        if (control_ctx == nullptr) {
            alloc_control_ctx(outs);
        }

@ -410,14 +414,14 @@ struct ControlNet : public GGMLRunner {
        return gf;
    }

-    void compute(int n_threads,
+    bool compute(int n_threads,
                 struct ggml_tensor* x,
                 struct ggml_tensor* hint,
                 struct ggml_tensor* timesteps,
                 struct ggml_tensor* context,
                 struct ggml_tensor* y,
-                 struct ggml_tensor** output     = NULL,
-                 struct ggml_context* output_ctx = NULL) {
+                 struct ggml_tensor** output     = nullptr,
+                 struct ggml_context* output_ctx = nullptr) {
        // x: [N, in_channels, h, w]
        // timesteps: [N, ]
        // context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size]
@ -426,11 +430,15 @@ struct ControlNet : public GGMLRunner {
            return build_graph(x, hint, timesteps, context, y);
        };

-        GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+        bool res = GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+        if (res) {
+            // cache guided_hint
            guided_hint_cached = true;
        }
+        return res;
+    }

-    bool load_from_file(const std::string& file_path) {
+    bool load_from_file(const std::string& file_path, int n_threads) {
        LOG_INFO("loading control net from '%s'", file_path.c_str());
        alloc_params_buffer();
        std::map<std::string, ggml_tensor*> tensors;
@ -438,12 +446,12 @@ struct ControlNet : public GGMLRunner {
        std::set<std::string> ignore_tensors;

        ModelLoader model_loader;
-        if (!model_loader.init_from_file(file_path)) {
+        if (!model_loader.init_from_file_and_convert_name(file_path)) {
            LOG_ERROR("init control net model loader from file failed: '%s'", file_path.c_str());
            return false;
        }

-        bool success = model_loader.load_tensors(tensors, backend, ignore_tensors);
+        bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads);

        if (!success) {
            LOG_ERROR("load control net tensors from model loader failed");
--- a/denoiser.hpp
+++ b/denoiser.hpp
--- a/diffusion_model.hpp
+++ b/diffusion_model.hpp
@ -3,78 +3,103 @@

 #include "flux.hpp"
 #include "mmdit.hpp"
+#include "qwen_image.hpp"
 #include "unet.hpp"
+#include "wan.hpp"
+#include "z_image.hpp"
+
+struct DiffusionParams {  // argument bundle taken by DiffusionModel::compute(); each model wrapper forwards only the fields it uses
+    struct ggml_tensor* x                     = nullptr;
+    struct ggml_tensor* timesteps             = nullptr;
+    struct ggml_tensor* context               = nullptr;
+    struct ggml_tensor* c_concat              = nullptr;  // UNetModel::compute forwards this; MMDiTModel::compute does not
+    struct ggml_tensor* y                     = nullptr;
+    struct ggml_tensor* guidance              = nullptr;  // not read by UNetModel::compute or MMDiTModel::compute
+    std::vector<ggml_tensor*> ref_latents     = {};
+    bool increase_ref_index                   = false;
+    int num_video_frames                      = -1;  // UNetModel::compute forwards this; MMDiTModel::compute does not
+    std::vector<struct ggml_tensor*> controls = {};  // UNetModel::compute forwards this; MMDiTModel::compute does not
+    float control_strength                    = 0.f;  // forwarded together with controls
+    struct ggml_tensor* vace_context          = nullptr;
+    float vace_strength                       = 1.f;
+    std::vector<int> skip_layers              = {};  // MMDiTModel::compute forwards this; UNetModel::compute does not
+};

 struct DiffusionModel {
-    virtual void compute(int n_threads,
-                         struct ggml_tensor* x,
-                         struct ggml_tensor* timesteps,
-                         struct ggml_tensor* context,
-                         struct ggml_tensor* c_concat,
-                         struct ggml_tensor* y,
-                         struct ggml_tensor* guidance,
-                         int num_video_frames                      = -1,
-                         std::vector<struct ggml_tensor*> controls = {},
-                         float control_strength                    = 0.f,
-                         struct ggml_tensor** output               = NULL,
-                         struct ggml_context* output_ctx           = NULL,
-                         std::vector<int> skip_layers              = std::vector<int>())             = 0;
+    virtual ~DiffusionModel() = default;  // polymorphic base: lets derived models be destroyed through a DiffusionModel pointer
+    virtual std::string get_desc()                                                      = 0;
+    virtual bool compute(int n_threads,
+                         DiffusionParams diffusion_params,
+                         struct ggml_tensor** output     = nullptr,
+                         struct ggml_context* output_ctx = nullptr)                     = 0;  // returns the wrapped runner's success flag
    virtual void alloc_params_buffer()                                                  = 0;
    virtual void free_params_buffer()                                                   = 0;
    virtual void free_compute_buffer()                                                  = 0;
    virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0;
    virtual size_t get_params_buffer_size()                                             = 0;
+    virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {}  // optional hook; the default does nothing
    virtual int64_t get_adm_in_channels()             = 0;
+    virtual void set_flash_attn_enabled(bool enabled) = 0;
 };

 struct UNetModel : public DiffusionModel {
    UNetModelRunner unet;

    UNetModel(ggml_backend_t backend,
-              ggml_type wtype,
+              bool offload_params_to_cpu,
+              const String2TensorStorage& tensor_storage_map = {},
              SDVersion version                              = VERSION_SD1)
-        : unet(backend, wtype, version) {
+        : unet(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version) {
    }

-    void alloc_params_buffer() {
+    std::string get_desc() override {
+        return unet.get_desc();
+    }
+
+    void alloc_params_buffer() override {
        unet.alloc_params_buffer();
    }

-    void free_params_buffer() {
+    void free_params_buffer() override {
        unet.free_params_buffer();
    }

-    void free_compute_buffer() {
+    void free_compute_buffer() override {
        unet.free_compute_buffer();
    }

-    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
+    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
        unet.get_param_tensors(tensors, "model.diffusion_model");
    }

-    size_t get_params_buffer_size() {
+    size_t get_params_buffer_size() override {
        return unet.get_params_buffer_size();
    }

-    int64_t get_adm_in_channels() {
+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        unet.set_weight_adapter(adapter);
+    }
+
+    int64_t get_adm_in_channels() override {
        return unet.unet.adm_in_channels;
    }

-    void compute(int n_threads,
-                 struct ggml_tensor* x,
-                 struct ggml_tensor* timesteps,
-                 struct ggml_tensor* context,
-                 struct ggml_tensor* c_concat,
-                 struct ggml_tensor* y,
-                 struct ggml_tensor* guidance,
-                 int num_video_frames                      = -1,
-                 std::vector<struct ggml_tensor*> controls = {},
-                 float control_strength                    = 0.f,
-                 struct ggml_tensor** output               = NULL,
-                 struct ggml_context* output_ctx           = NULL,
-                 std::vector<int> skip_layers              = std::vector<int>()) {
-        (void)skip_layers;  // SLG doesn't work with UNet models
-        return unet.compute(n_threads, x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength, output, output_ctx);
+    void set_flash_attn_enabled(bool enabled) override {  // implements DiffusionModel's pure virtual by forwarding to the wrapped runner
+        unet.set_flash_attention_enabled(enabled);
+    }
+
+    bool compute(int n_threads,
+                 DiffusionParams diffusion_params,
+                 struct ggml_tensor** output     = nullptr,
+                 struct ggml_context* output_ctx = nullptr) override {
+        return unet.compute(n_threads,
+                            diffusion_params.x,
+                            diffusion_params.timesteps,
+                            diffusion_params.context,
+                            diffusion_params.c_concat,
+                            diffusion_params.y,
+                            diffusion_params.num_video_frames,
+                            diffusion_params.controls,
+                            diffusion_params.control_strength, output, output_ctx);
    }
 };

@ -82,49 +107,59 @@ struct MMDiTModel : public DiffusionModel {
    MMDiTRunner mmdit;

    MMDiTModel(ggml_backend_t backend,
-               ggml_type wtype,
-               SDVersion version = VERSION_SD3_2B)
-        : mmdit(backend, wtype, version) {
+               bool offload_params_to_cpu,
+               const String2TensorStorage& tensor_storage_map = {})
+        : mmdit(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model") {
    }

-    void alloc_params_buffer() {
+    std::string get_desc() override {
+        return mmdit.get_desc();
+    }
+
+    void alloc_params_buffer() override {
        mmdit.alloc_params_buffer();
    }

-    void free_params_buffer() {
+    void free_params_buffer() override {
        mmdit.free_params_buffer();
    }

-    void free_compute_buffer() {
+    void free_compute_buffer() override {
        mmdit.free_compute_buffer();
    }

-    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
+    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
        mmdit.get_param_tensors(tensors, "model.diffusion_model");
    }

-    size_t get_params_buffer_size() {
+    size_t get_params_buffer_size() override {
        return mmdit.get_params_buffer_size();
    }

-    int64_t get_adm_in_channels() {
+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        mmdit.set_weight_adapter(adapter);
+    }
+
+    int64_t get_adm_in_channels() override {
        return 768 + 1280;
    }

-    void compute(int n_threads,
-                 struct ggml_tensor* x,
-                 struct ggml_tensor* timesteps,
-                 struct ggml_tensor* context,
-                 struct ggml_tensor* c_concat,
-                 struct ggml_tensor* y,
-                 struct ggml_tensor* guidance,
-                 int num_video_frames                      = -1,
-                 std::vector<struct ggml_tensor*> controls = {},
-                 float control_strength                    = 0.f,
-                 struct ggml_tensor** output               = NULL,
-                 struct ggml_context* output_ctx           = NULL,
-                 std::vector<int> skip_layers              = std::vector<int>()) {
-        return mmdit.compute(n_threads, x, timesteps, context, y, output, output_ctx, skip_layers);
+    void set_flash_attn_enabled(bool enabled) override {  // implements DiffusionModel's pure virtual by forwarding to the wrapped runner
+        mmdit.set_flash_attention_enabled(enabled);
+    }
+
+    bool compute(int n_threads,
+                 DiffusionParams diffusion_params,
+                 struct ggml_tensor** output     = nullptr,
+                 struct ggml_context* output_ctx = nullptr) override {
+        return mmdit.compute(n_threads,
+                             diffusion_params.x,
+                             diffusion_params.timesteps,
+                             diffusion_params.context,
+                             diffusion_params.y,
+                             output,
+                             output_ctx,
+                             diffusion_params.skip_layers);
    }
 };

@ -132,49 +167,257 @@ struct FluxModel : public DiffusionModel {
    Flux::FluxRunner flux;

    FluxModel(ggml_backend_t backend,
-              ggml_type wtype,
-              SDVersion version = VERSION_FLUX_DEV)
-        : flux(backend, wtype, version) {
+              bool offload_params_to_cpu,
+              const String2TensorStorage& tensor_storage_map = {},
+              SDVersion version                              = VERSION_FLUX,
+              bool use_mask                                  = false)
+        : flux(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version, use_mask) {
    }

-    void alloc_params_buffer() {
+    std::string get_desc() override {
+        return flux.get_desc();
+    }
+
+    void alloc_params_buffer() override {
        flux.alloc_params_buffer();
    }

-    void free_params_buffer() {
+    void free_params_buffer() override {
        flux.free_params_buffer();
    }

-    void free_compute_buffer() {
+    void free_compute_buffer() override {
        flux.free_compute_buffer();
    }

-    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
+    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
        flux.get_param_tensors(tensors, "model.diffusion_model");
    }

-    size_t get_params_buffer_size() {
+    size_t get_params_buffer_size() override {
        return flux.get_params_buffer_size();
    }

-    int64_t get_adm_in_channels() {
+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        flux.set_weight_adapter(adapter);
+    }
+
+    int64_t get_adm_in_channels() override {
        return 768;
    }

-    void compute(int n_threads,
-                 struct ggml_tensor* x,
-                 struct ggml_tensor* timesteps,
-                 struct ggml_tensor* context,
-                 struct ggml_tensor* c_concat,
-                 struct ggml_tensor* y,
-                 struct ggml_tensor* guidance,
-                 int num_video_frames                      = -1,
-                 std::vector<struct ggml_tensor*> controls = {},
-                 float control_strength                    = 0.f,
-                 struct ggml_tensor** output               = NULL,
-                 struct ggml_context* output_ctx           = NULL,
-                 std::vector<int> skip_layers              = std::vector<int>()) {
-        return flux.compute(n_threads, x, timesteps, context, y, guidance, output, output_ctx, skip_layers);
+    void set_flash_attn_enabled(bool enabled) {
+        flux.set_flash_attention_enabled(enabled);
+    }
+
+    bool compute(int n_threads,
+                 DiffusionParams diffusion_params,
+                 struct ggml_tensor** output     = nullptr,
+                 struct ggml_context* output_ctx = nullptr) override {
+        return flux.compute(n_threads,
+                            diffusion_params.x,
+                            diffusion_params.timesteps,
+                            diffusion_params.context,
+                            diffusion_params.c_concat,
+                            diffusion_params.y,
+                            diffusion_params.guidance,
+                            diffusion_params.ref_latents,
+                            diffusion_params.increase_ref_index,
+                            output,
+                            output_ctx,
+                            diffusion_params.skip_layers);
+    }
+};
+
+struct WanModel : public DiffusionModel {  // DiffusionModel implementation that forwards every call to a wrapped WAN::WanRunner
+    std::string prefix;  // tensor-name prefix; given to the runner's constructor and reused in get_param_tensors()
+    WAN::WanRunner wan;  // the runner every method below delegates to
+
+    WanModel(ggml_backend_t backend,
+             bool offload_params_to_cpu,
+             const String2TensorStorage& tensor_storage_map = {},
+             const std::string prefix                       = "model.diffusion_model",
+             SDVersion version                              = VERSION_WAN2)
+        : prefix(prefix), wan(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) {  // all arguments are passed straight to the runner
+    }
+
+    std::string get_desc() override {  // returns the runner's description string
+        return wan.get_desc();
+    }
+
+    void alloc_params_buffer() override {
+        wan.alloc_params_buffer();
+    }
+
+    void free_params_buffer() override {
+        wan.free_params_buffer();
+    }
+
+    void free_compute_buffer() override {
+        wan.free_compute_buffer();
+    }
+
+    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {  // fills `tensors` via the runner, using the stored prefix
+        wan.get_param_tensors(tensors, prefix);
+    }
+
+    size_t get_params_buffer_size() override {
+        return wan.get_params_buffer_size();
+    }
+
+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        wan.set_weight_adapter(adapter);
+    }
+
+    int64_t get_adm_in_channels() override {
+        return 768;  // fixed constant; does not depend on the version passed to the constructor
+    }
+
+    void set_flash_attn_enabled(bool enabled) {  // NOTE(review): not marked `override`, unlike the other methods here - confirm whether the base class declares it virtual
+        wan.set_flash_attention_enabled(enabled);
+    }
+
+    bool compute(int n_threads,
+                 DiffusionParams diffusion_params,
+                 struct ggml_tensor** output     = nullptr,
+                 struct ggml_context* output_ctx = nullptr) override {  // unpacks diffusion_params into the runner's positional arguments and returns the runner's result
+        return wan.compute(n_threads,
+                           diffusion_params.x,
+                           diffusion_params.timesteps,
+                           diffusion_params.context,
+                           diffusion_params.y,
+                           diffusion_params.c_concat,
+                           nullptr,  // NOTE(review): always null here; its meaning is defined by WanRunner::compute, which is not visible in this file
+                           diffusion_params.vace_context,
+                           diffusion_params.vace_strength,
+                           output,
+                           output_ctx);  // diffusion_params.skip_layers is not forwarded by this model
+    }
+};
+
+struct QwenImageModel : public DiffusionModel {  // DiffusionModel implementation that forwards every call to a wrapped Qwen::QwenImageRunner
+    std::string prefix;  // tensor-name prefix; given to the runner's constructor and reused in get_param_tensors()
+    Qwen::QwenImageRunner qwen_image;  // the runner every method below delegates to
+
+    QwenImageModel(ggml_backend_t backend,
+                   bool offload_params_to_cpu,
+                   const String2TensorStorage& tensor_storage_map = {},
+                   const std::string prefix                       = "model.diffusion_model",
+                   SDVersion version                              = VERSION_QWEN_IMAGE)
+        : prefix(prefix), qwen_image(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) {  // all arguments are passed straight to the runner
+    }
+
+    std::string get_desc() override {  // returns the runner's description string
+        return qwen_image.get_desc();
+    }
+
+    void alloc_params_buffer() override {
+        qwen_image.alloc_params_buffer();
+    }
+
+    void free_params_buffer() override {
+        qwen_image.free_params_buffer();
+    }
+
+    void free_compute_buffer() override {
+        qwen_image.free_compute_buffer();
+    }
+
+    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {  // fills `tensors` via the runner, using the stored prefix
+        qwen_image.get_param_tensors(tensors, prefix);
+    }
+
+    size_t get_params_buffer_size() override {
+        return qwen_image.get_params_buffer_size();
+    }
+
+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        qwen_image.set_weight_adapter(adapter);
+    }
+
+    int64_t get_adm_in_channels() override {
+        return 768;  // fixed constant; does not depend on the version passed to the constructor
+    }
+
+    void set_flash_attn_enabled(bool enabled) {  // NOTE(review): not marked `override`, unlike the other methods here - confirm whether the base class declares it virtual
+        qwen_image.set_flash_attention_enabled(enabled);
+    }
+
+    bool compute(int n_threads,
+                 DiffusionParams diffusion_params,
+                 struct ggml_tensor** output     = nullptr,
+                 struct ggml_context* output_ctx = nullptr) override {  // forwards x, timesteps, context and ref_latents; other DiffusionParams fields are not used here
+        return qwen_image.compute(n_threads,
+                                  diffusion_params.x,
+                                  diffusion_params.timesteps,
+                                  diffusion_params.context,
+                                  diffusion_params.ref_latents,
+                                  true,  // increase_ref_index: hard-coded to true; diffusion_params.increase_ref_index is not consulted
+                                  output,
+                                  output_ctx);
+    }
+};
+
+struct ZImageModel : public DiffusionModel {  // DiffusionModel implementation that forwards every call to a wrapped ZImage::ZImageRunner
+    std::string prefix;  // tensor-name prefix; given to the runner's constructor and reused in get_param_tensors()
+    ZImage::ZImageRunner z_image;  // the runner every method below delegates to
+
+    ZImageModel(ggml_backend_t backend,
+                bool offload_params_to_cpu,
+                const String2TensorStorage& tensor_storage_map = {},
+                const std::string prefix                       = "model.diffusion_model",
+                SDVersion version                              = VERSION_Z_IMAGE)
+        : prefix(prefix), z_image(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) {  // all arguments are passed straight to the runner
+    }
+
+    std::string get_desc() override {  // returns the runner's description string
+        return z_image.get_desc();
+    }
+
+    void alloc_params_buffer() override {
+        z_image.alloc_params_buffer();
+    }
+
+    void free_params_buffer() override {
+        z_image.free_params_buffer();
+    }
+
+    void free_compute_buffer() override {
+        z_image.free_compute_buffer();
+    }
+
+    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {  // fills `tensors` via the runner, using the stored prefix
+        z_image.get_param_tensors(tensors, prefix);
+    }
+
+    size_t get_params_buffer_size() override {
+        return z_image.get_params_buffer_size();
+    }
+
+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        z_image.set_weight_adapter(adapter);
+    }
+
+    int64_t get_adm_in_channels() override {
+        return 768;  // fixed constant; does not depend on the version passed to the constructor
+    }
+
+    void set_flash_attn_enabled(bool enabled) {  // NOTE(review): not marked `override`, unlike the other methods here - confirm whether the base class declares it virtual
+        z_image.set_flash_attention_enabled(enabled);
+    }
+
+    bool compute(int n_threads,
+                 DiffusionParams diffusion_params,
+                 struct ggml_tensor** output     = nullptr,
+                 struct ggml_context* output_ctx = nullptr) override {  // forwards x, timesteps, context and ref_latents; other DiffusionParams fields are not used here
+        return z_image.compute(n_threads,
+                               diffusion_params.x,
+                               diffusion_params.timesteps,
+                               diffusion_params.context,
+                               diffusion_params.ref_latents,
+                               true,  // increase_ref_index: hard-coded to true; diffusion_params.increase_ref_index is not consulted
+                               output,
+                               output_ctx);
    }
 };

--- a/docs/build.md
+++ b/docs/build.md
@ -0,0 +1,173 @@
+# Build from scratch
+
+## Get the Code
+
+```
+git clone --recursive https://github.com/leejet/stable-diffusion.cpp
+cd stable-diffusion.cpp
+```
+
+- If you have already cloned the repository, you can use the following command to update the repository to the latest code.
+
+```
+cd stable-diffusion.cpp
+git pull origin master
+git submodule init
+git submodule update
+```
+
+## Build (CPU only)
+
+If you don't have a GPU or CUDA installed, you can build a CPU-only version.
+
+```shell
+mkdir build && cd build
+cmake ..
+cmake --build . --config Release
+```
+
+## Build with OpenBLAS
+
+```shell
+mkdir build && cd build
+cmake .. -DGGML_OPENBLAS=ON
+cmake --build . --config Release
+```
+
+## Build with CUDA
+
+This provides GPU acceleration using an NVIDIA GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). At least 4 GB of VRAM is recommended.
+
+```shell
+mkdir build && cd build
+cmake .. -DSD_CUDA=ON
+cmake --build . --config Release
+```
+
+## Build with HipBLAS
+
+This provides GPU acceleration using an AMD GPU. Make sure to have the ROCm toolkit installed.
+To build for a GPU architecture other than the one installed in your system, set `$GFX_NAME` manually to the desired architecture (replacing the first command). This is also necessary if your GPU is not officially supported by ROCm; for example, you have to set `$GFX_NAME` manually to `gfx1030` for consumer RDNA2 cards.
+
+Windows users: refer to [docs/hipBLAS_on_Windows.md](docs%2FhipBLAS_on_Windows.md) for a comprehensive guide.
+
+```shell
+mkdir build && cd build
+if command -v rocminfo; then export GFX_NAME=$(rocminfo | awk '/ *Name: +gfx[1-9]/ {print $2; exit}'); else echo "rocminfo missing!"; fi
+if [ -z "${GFX_NAME}" ]; then echo "Error: Couldn't detect GPU!"; else echo "Building for GPU: ${GFX_NAME}"; fi
+cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DAMDGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+cmake --build . --config Release
+```
+
+## Build with MUSA
+
+This provides GPU acceleration using a Moore Threads GPU. Make sure to have the MUSA toolkit installed.
+
+```shell
+mkdir build && cd build
+cmake .. -DCMAKE_C_COMPILER=/usr/local/musa/bin/clang -DCMAKE_CXX_COMPILER=/usr/local/musa/bin/clang++ -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release
+cmake --build . --config Release
+```
+
+## Build with Metal
+
+Using Metal makes the computation run on the GPU. Currently, there are some issues with Metal when performing operations on very large matrices, making it highly inefficient at the moment. Performance improvements are expected in the near future.
+
+```shell
+mkdir build && cd build
+cmake .. -DSD_METAL=ON
+cmake --build . --config Release
+```
+
+## Build with Vulkan
+
+Install Vulkan SDK from https://www.lunarg.com/vulkan-sdk/.
+
+```shell
+mkdir build && cd build
+cmake .. -DSD_VULKAN=ON
+cmake --build . --config Release
+```
+
+## Build with OpenCL (for Adreno GPU)
+
+Currently, it supports only Adreno GPUs and is primarily optimized for the Q4_0 type.
+
+To build for Windows ARM, please refer to [Windows 11 Arm64](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/OPENCL.md#windows-11-arm64).
+
+Building for Android:
+
+  Android NDK:
+       Download and install the Android NDK from the [official Android developer site](https://developer.android.com/ndk/downloads).
+
+Setup OpenCL Dependencies for NDK:
+
+You need to provide OpenCL headers and the ICD loader library to your NDK sysroot.
+
+*   OpenCL Headers:
+    ```bash
+    # In a temporary working directory
+    git clone https://github.com/KhronosGroup/OpenCL-Headers
+    cd OpenCL-Headers
+    # Replace <YOUR_NDK_PATH> with your actual NDK installation path
+    # e.g., cp -r CL /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
+    sudo cp -r CL <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
+    cd ..
+    ```
+
+*   OpenCL ICD Loader:
+    ```shell
+    # In the same temporary working directory
+    git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
+    cd OpenCL-ICD-Loader
+    mkdir build_ndk && cd build_ndk
+
+    # Replace <YOUR_NDK_PATH> in the CMAKE_TOOLCHAIN_FILE and OPENCL_ICD_LOADER_HEADERS_DIR
+    cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
+      -DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
+      -DOPENCL_ICD_LOADER_HEADERS_DIR=<YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
+      -DANDROID_ABI=arm64-v8a \
+      -DANDROID_PLATFORM=24 \
+      -DANDROID_STL=c++_shared
+
+    ninja
+    # Replace <YOUR_NDK_PATH>
+    # e.g., cp libOpenCL.so /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
+    sudo cp libOpenCL.so <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
+    cd ../..
+    ```
+
+Build `stable-diffusion.cpp` for Android with OpenCL:
+
+```shell
+mkdir build-android && cd build-android
+
+# Replace <YOUR_NDK_PATH> with your actual NDK installation path
+# e.g., -DCMAKE_TOOLCHAIN_FILE=/path/to/android-ndk-r26c/build/cmake/android.toolchain.cmake
+cmake .. -G Ninja \
+  -DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
+  -DANDROID_ABI=arm64-v8a \
+  -DANDROID_PLATFORM=android-28 \
+  -DGGML_OPENMP=OFF \
+  -DSD_OPENCL=ON
+
+ninja
+```
+*(Note: Don't forget to include `LD_LIBRARY_PATH=/vendor/lib64` in your command line before running the binary)*
+
+## Build with SYCL
+
+Using SYCL makes the computation run on the Intel GPU. Please make sure you have installed the related driver and the [Intel® oneAPI Base toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) before you start. For more details and steps, refer to the [llama.cpp SYCL backend](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/SYCL.md#linux) documentation.
+
+```shell
+# Export relevant ENV variables
+source /opt/intel/oneapi/setvars.sh
+
+# Option 1: Use FP32 (recommended for better performance in most cases)
+cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+
+# Option 2: Use FP16
+cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
+
+cmake --build . --config Release
+```
--- a/docs/chroma.md
+++ b/docs/chroma.md
@ -0,0 +1,33 @@
+# How to Use
+
+You can run Chroma using stable-diffusion.cpp with a GPU that has 6GB or even 4GB of VRAM, without needing to offload to RAM.
+
+## Download weights
+
+- Download Chroma
+    - If you don't want to do the conversion yourself, download the preconverted gguf model from [silveroxides/Chroma-GGUF](https://huggingface.co/silveroxides/Chroma-GGUF)
+    - Otherwise, download chroma's safetensors from [lodestones/Chroma](https://huggingface.co/lodestones/Chroma)
+- Download vae from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
+- Download t5xxl from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
+
+## Convert Chroma weights
+
+You can download the preconverted gguf weights from [silveroxides/Chroma-GGUF](https://huggingface.co/silveroxides/Chroma-GGUF), this way you don't have to do the conversion yourself.
+
+```
+.\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\chroma-unlocked-v40.safetensors -o ..\models\chroma-unlocked-v40-q8_0.gguf -v --type q8_0
+```
+
+## Run
+
+### Example
+For example:
+
+```
+ .\bin\Release\sd.exe --diffusion-model  ..\models\chroma-unlocked-v40-q8_0.gguf --vae ..\models\ae.sft --t5xxl ..\models\t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'chroma.cpp'" --cfg-scale 4.0 --sampling-method euler -v --chroma-disable-dit-mask --clip-on-cpu
+```
+
+![](../assets/flux/chroma_v40.png)
+
+
+
--- a/docs/chroma_radiance.md
+++ b/docs/chroma_radiance.md
@ -0,0 +1,21 @@
+# How to Use
+
+## Download weights
+
+- Download Chroma1-Radiance
+    - safetensors: https://huggingface.co/lodestones/Chroma1-Radiance/tree/main
+    - gguf: https://huggingface.co/silveroxides/Chroma1-Radiance-GGUF/tree/main
+
+- Download t5xxl
+    - safetensors: https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
+
+## Examples
+
+```
+.\bin\Release\sd.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Chroma1-Radiance-v0.4-Q8_0.gguf --t5xxl ..\..\ComfyUI\models\clip\t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'chroma  radiance cpp'" --cfg-scale 4.0 --sampling-method euler -v
+```
+
+<img alt="Chroma1-Radiance" src="../assets/flux/chroma1-radiance.png" />
+
+
+
--- a/docs/distilled_sd.md
+++ b/docs/distilled_sd.md
@ -0,0 +1,99 @@
+# Running distilled models: SSD1B and SDx.x with tiny U-Nets
+
+## Preface 
+
+These models feature a reduced U-Net architecture. Unlike standard SDXL models, the SSD-1B U-Net contains only one middle block and fewer attention layers in its up- and down-blocks, resulting in significantly smaller file sizes. Using these models can reduce inference time by more than 33%. For more details, refer to Segmind's paper: https://arxiv.org/abs/2401.02677v1.
+Similarly, SD1.x- and SD2.x-style models with a tiny U-Net consist of only 6 U-Net blocks, leading to very small files and time savings of up to 50%. For more information, see the paper: https://arxiv.org/pdf/2305.15798.pdf.
+
+## SSD1B
+
+Note that not all of these models follow the standard parameter naming conventions. However, several useful SSD-1B models are available online, such as:
+
+ * https://huggingface.co/segmind/SSD-1B/resolve/main/SSD-1B-A1111.safetensors
+ * https://huggingface.co/hassenhamdi/SSD-1B-fp8_e4m3fn/resolve/main/SSD-1B_fp8_e4m3fn.safetensors
+
+Useful LoRAs are also available:
+
+ * https://huggingface.co/seungminh/lora-swarovski-SSD-1B/resolve/main/pytorch_lora_weights.safetensors
+ * https://huggingface.co/kylielee505/mylcmlorassd/resolve/main/pytorch_lora_weights.safetensors
+
+These files can be used out-of-the-box, unlike the models described in the next section.
+
+
+## SD1.x, SD2.x with tiny U-Nets
+
+These models require conversion before use. You will need a Python script provided by the diffusers team, available on GitHub:
+
+ * https://raw.githubusercontent.com/huggingface/diffusers/refs/heads/main/scripts/convert_diffusers_to_original_stable_diffusion.py
+
+### SD2.x
+
+NotaAI provides the following model online:
+
+* https://huggingface.co/nota-ai/bk-sdm-v2-tiny
+
+Creating a .safetensors file involves two steps. First, run this short Python script to download the model from Hugging Face:
+
+```python
+from diffusers import StableDiffusionPipeline
+pipe = StableDiffusionPipeline.from_pretrained("nota-ai/bk-sdm-v2-tiny",cache_dir="./")
+```
+
+Second, create the .safetensors file by running:
+
+```bash
+python convert_diffusers_to_original_stable_diffusion.py \
+      --model_path  models--nota-ai--bk-sdm-v2-tiny/snapshots/68277af553777858cd47e133f92e4db47321bc74 \
+      --checkpoint_path bk-sdm-v2-tiny.safetensors --half --use_safetensors
+```
+
+This will generate the file **bk-sdm-v2-tiny.safetensors**, which is now ready for use with sd.cpp.
+
+### SD1.x
+
+Several Tiny SD 1.x models are available online, such as:
+
+ * https://huggingface.co/segmind/tiny-sd
+ * https://huggingface.co/segmind/portrait-finetuned
+ * https://huggingface.co/nota-ai/bk-sdm-tiny
+
+These models also require conversion, partly because some tensors are stored in a non-contiguous manner. To create a usable checkpoint file, follow these simple steps:
+Download and prepare the model using Python: 
+
+##### Download the model using Python on your computer, for example this way:
+
+```python
+import torch
+from diffusers import StableDiffusionPipeline
+pipe = StableDiffusionPipeline.from_pretrained("segmind/tiny-sd")
+unet=pipe.unet
+for param in unet.parameters():
+    param.data = param.data.contiguous()     # <- important here
+pipe.save_pretrained("segmindtiny-sd", safe_serialization=True)
+```
+
+##### Run the conversion script:
+
+```bash
+python convert_diffusers_to_original_stable_diffusion.py \
+      --model_path  ./segmindtiny-sd \
+      --checkpoint_path ./segmind_tiny-sd.ckpt --half
+```
+
+The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
+
+
+### Another available .ckpt file:
+
+ * https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt
+
+To use this file, you must first adjust its non-contiguous tensors:
+
+```python
+import torch
+ckpt = torch.load("tinySDdistilled.ckpt", map_location=torch.device('cpu'))
+for key, value in ckpt['state_dict'].items():
+    if isinstance(value, torch.Tensor):
+        ckpt['state_dict'][key] = value.contiguous()
+torch.save(ckpt, "tinySDdistilled_fixed.ckpt")
+```
--- a/docs/flux.md
+++ b/docs/flux.md
@ -15,7 +15,7 @@ You can run Flux using stable-diffusion.cpp with a GPU that has 6GB or even 4GB

 You can download the preconverted gguf weights from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell](https://huggingface.co/leejet/FLUX.1-schnell-gguf), this way you don't have to do the conversion yourself.

-Using fp16 will lead to overflow, but ggml's support for bf16 is not yet fully developed. Therefore, we need to convert flux to gguf format here, which also saves VRAM. For example:
+For example:
 ```
 .\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-dev.sft -o ..\models\flux1-dev-q8_0.gguf -v --type q8_0
 ```
@ -28,7 +28,7 @@ Using fp16 will lead to overflow, but ggml's support for bf16 is not yet fully d
 For example:

 ```
- .\bin\Release\sd.exe --diffusion-model  ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v
+ .\bin\Release\sd.exe --diffusion-model  ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
 ```

 Using formats of different precisions will yield results of varying quality.
@ -44,7 +44,7 @@ Using formats of different precisions will yield results of varying quality.


 ```
- .\bin\Release\sd.exe --diffusion-model  ..\models\flux1-schnell-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --steps 4
+ .\bin\Release\sd.exe --diffusion-model  ..\models\flux1-schnell-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --steps 4 --clip-on-cpu
 ```

 | q8_0  |
@ -60,7 +60,7 @@ Since many flux LoRA training libraries have used various LoRA naming formats, i
 - LoRA model from https://huggingface.co/XLabs-AI/flux-lora-collection/tree/main (using comfy converted version!!!)

 ```
-.\bin\Release\sd.exe --diffusion-model  ..\models\flux1-dev-q8_0.gguf --vae ...\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'flux.cpp'<lora:realism_lora_comfy_converted:1>" --cfg-scale 1.0 --sampling-method euler -v --lora-model-dir ../models
+.\bin\Release\sd.exe --diffusion-model  ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'flux.cpp'<lora:realism_lora_comfy_converted:1>" --cfg-scale 1.0 --sampling-method euler -v --lora-model-dir ../models --clip-on-cpu
 ```

 ![output](../assets/flux/flux1-dev-q8_0%20with%20lora.png)
--- a/docs/flux2.md
+++ b/docs/flux2.md
@ -0,0 +1,21 @@
+# How to Use
+
+## Download weights
+
+- Download FLUX.2-dev
+    - gguf: https://huggingface.co/city96/FLUX.2-dev-gguf/tree/main
+- Download vae
+    - safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
+- Download Mistral-Small-3.2-24B-Instruct-2506-GGUF
+    - gguf: https://huggingface.co/unsloth/Mistral-Small-3.2-24B-Instruct-2506-GGUF/tree/main
+
+## Examples
+
+```
+.\bin\Release\sd.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\flux2-dev-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\Mistral-Small-3.2-24B-Instruct-2506-Q4_K_M.gguf -r .\kontext_input.png -p "change 'flux.cpp' to 'flux2-dev.cpp'" --cfg-scale 1.0 --sampling-method euler -v --diffusion-fa --offload-to-cpu
+```
+
+<img alt="flux2 example" src="../assets/flux2/example.png" />
+
+
+
--- a/docs/kontext.md
+++ b/docs/kontext.md
@ -0,0 +1,39 @@
+# How to Use
+
+You can run Kontext using stable-diffusion.cpp with a GPU that has 6GB or even 4GB of VRAM, without needing to offload to RAM.
+
+## Download weights
+
+- Download Kontext
+    - If you don't want to do the conversion yourself, download the preconverted gguf model from [FLUX.1-Kontext-dev-GGUF](https://huggingface.co/QuantStack/FLUX.1-Kontext-dev-GGUF)
+    - Otherwise, download FLUX.1-Kontext-dev from https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev/blob/main/flux1-kontext-dev.safetensors
+- Download vae from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
+- Download clip_l from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/clip_l.safetensors
+- Download t5xxl from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
+
+## Convert Kontext weights
+
+You can download the preconverted gguf weights from [FLUX.1-Kontext-dev-GGUF](https://huggingface.co/QuantStack/FLUX.1-Kontext-dev-GGUF), this way you don't have to do the conversion yourself.
+
+```
+.\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-kontext-dev.safetensors -o ..\models\flux1-kontext-dev-q8_0.gguf -v --type q8_0
+```
+
+## Run
+
+- `--cfg-scale` is recommended to be set to 1. 
+
+### Example
+For example:
+
+```
+ .\bin\Release\sd.exe -r .\flux1-dev-q8_0.png --diffusion-model  ..\models\flux1-kontext-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "change 'flux.cpp' to 'kontext.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
+```
+
+
+| ref_image | prompt  | output  |
+| ---- | ----  |----  |
+| ![](../assets/flux/flux1-dev-q8_0.png) | change 'flux.cpp' to 'kontext.cpp' |![](../assets/flux/kontext1_dev_output.png) |
+
+
+
--- a/docs/lora.md
+++ b/docs/lora.md
@ -11,3 +11,16 @@ Here's a simple example:
 ```

 `../models/marblesh.safetensors` or `../models/marblesh.ckpt` will be applied to the model
+
+# LoRA Apply Mode
+
+There are two ways to apply LoRA: **immediately** and **at_runtime**. You can specify it using the `--lora-apply-mode` parameter.
+
+By default, the mode is selected automatically:
+
+* If the model weights contain any quantized parameters, the **at_runtime** mode is used;
+* Otherwise, the **immediately** mode is used.
+
+The **immediately** mode may have precision and compatibility issues with quantized parameters, but it usually offers faster inference speed and, in some cases, lower memory usage.
+In contrast, the **at_runtime** mode provides better compatibility and higher precision, but inference may be slower and memory usage may be higher in some cases.
+
--- a/docs/ovis_image.md
+++ b/docs/ovis_image.md
@ -0,0 +1,19 @@
+# How to Use
+
+## Download weights
+
+- Download Ovis-Image-7B
+    - safetensors: https://huggingface.co/Comfy-Org/Ovis-Image/tree/main/split_files/diffusion_models
+    - gguf: https://huggingface.co/leejet/Ovis-Image-7B-GGUF
+- Download vae
+    - safetensors: https://huggingface.co/black-forest-labs/FLUX.1-schnell/tree/main
+- Download Ovis 2.5
+    - safetensors: https://huggingface.co/Comfy-Org/Ovis-Image/tree/main/split_files/text_encoders
+
+## Examples
+
+```
+.\bin\Release\sd.exe --diffusion-model  ovis_image-Q4_0.gguf --vae ..\..\ComfyUI\models\vae\ae.sft  --llm ..\..\ComfyUI\models\text_encoders\ovis_2.5.safetensors -p "a lovely cat" --cfg-scale 5.0 -v --offload-to-cpu --diffusion-fa
+```
+
+<img alt="ovis image example" src="../assets/ovis_image/example.png" />
--- a/docs/performance.md
+++ b/docs/performance.md
@ -0,0 +1,26 @@
+## Use Flash Attention to save memory and improve speed.
+
+Enabling flash attention for the diffusion model reduces memory usage by an amount that varies with the model and resolution.
+e.g.:
+ - flux 768x768 ~600mb
+ - SD2 768x768 ~1400mb
+
+On most backends it slows generation down, but on CUDA it generally speeds it up as well.
+At the moment, it is only supported for some models and some backends (like cpu, cuda/rocm, metal).
+
+Run by adding `--diffusion-fa` to the arguments and watch for:
+```
+[INFO ] stable-diffusion.cpp:312  - Using flash attention in the diffusion model
+```
+and for the compute buffer to shrink in the debug log:
+```
+[DEBUG] ggml_extend.hpp:1004 - flux compute buffer size: 650.00 MB(VRAM)
+```
+
+## Offload weights to the CPU to save VRAM without reducing generation speed.
+
+Using `--offload-to-cpu` allows you to offload weights to the CPU, saving VRAM without reducing generation speed.
+
+## Use quantization to reduce memory usage.
+
+[quantization](./quantization_and_gguf.md)
--- a/docs/photo_maker.md
+++ b/docs/photo_maker.md
@ -6,16 +6,15 @@ You can use [PhotoMaker](https://github.com/TencentARC/PhotoMaker) to personaliz

 Download PhotoMaker model file (in safetensor format) [here](https://huggingface.co/bssrdf/PhotoMaker). The official release of the model file (in .bin format) does not work with ```stablediffusion.cpp```.

- Specify the PhotoMaker model path using the `--stacked-id-embd-dir PATH` parameter.
- Specify the input images path using the `--input-id-images-dir PATH` parameter.
-  - input images **must** have the same width and height for preprocessing (to be improved)
+- Specify the PhotoMaker model path using the `--photo-maker PATH` parameter.
+- Specify the input images path using the `--pm-id-images-dir PATH` parameter.

 In prompt, make sure you have a class word followed by the trigger word ```"img"``` (hard-coded for now). The class word could be one of ```"man, woman, girl, boy"```. If input ID images contain asian faces, add ```Asian``` before the class
 word.

 Another PhotoMaker specific parameter:

- ```--style-ratio  (0-100)%```: default is 20 and 10-20 typically gets good results. Lower ratio means more faithfully following input ID (not necessarily better quality).
+- ```--pm-style-strength  (0-100)%```: default is 20 and 10-20 typically gets good results. Lower ratio means more faithfully following input ID (not necessarily better quality).

 Other parameters recommended for running Photomaker:

@ -28,5 +27,27 @@ If on low memory GPUs (<= 8GB), recommend running with ```--vae-on-cpu``` option
 Example:

 ```bash
-bin/sd -m ../models/sdxlUnstableDiffusers_v11.safetensors  --vae ../models/sdxl_vae.safetensors --stacked-id-embd-dir ../models/photomaker-v1.safetensors --input-id-images-dir ../assets/photomaker_examples/scarletthead_woman -p "a girl img, retro futurism, retro game art style but extremely beautiful, intricate details, masterpiece, best quality, space-themed, cosmic, celestial, stars, galaxies, nebulas, planets, science fiction, highly detailed" -n "realistic, photo-realistic, worst quality, greyscale, bad anatomy, bad hands, error, text" --cfg-scale 5.0  --sampling-method euler -H 1024 -W 1024 --style-ratio 10 --vae-on-cpu -o output.png
+bin/sd -m ../models/sdxlUnstableDiffusers_v11.safetensors  --vae ../models/sdxl_vae.safetensors --photo-maker ../models/photomaker-v1.safetensors --pm-id-images-dir ../assets/photomaker_examples/scarletthead_woman -p "a girl img, retro futurism, retro game art style but extremely beautiful, intricate details, masterpiece, best quality, space-themed, cosmic, celestial, stars, galaxies, nebulas, planets, science fiction, highly detailed" -n "realistic, photo-realistic, worst quality, greyscale, bad anatomy, bad hands, error, text" --cfg-scale 5.0  --sampling-method euler -H 1024 -W 1024 --pm-style-strength 10 --vae-on-cpu --steps 50
 ```
+
+## PhotoMaker Version 2
+
+[PhotoMaker Version 2 (PMV2)](https://github.com/TencentARC/PhotoMaker/blob/main/README_pmv2.md) has some key improvements. Unfortunately it has a very heavy dependency which makes running it a bit involved in ```SD.cpp```. 
+
+Running PMV2 is now a two-step process:
+
+- Run a python script ```face_detect.py``` to obtain **id_embeds** for the given input images
+```
+python face_detect.py input_image_dir
+```
+An ```id_embeds.bin``` file will be generated in ```input_image_dir```.
+
+**Note: this step only needs to be run once; the same ```id_embeds``` can be reused.**
+
+- Run the same command as in version 1 but replacing ```photomaker-v1.safetensors``` with ```photomaker-v2.safetensors```.
+
+  You can download ```photomaker-v2.safetensors``` from [here](https://huggingface.co/bssrdf/PhotoMakerV2)
+
+- All the command line parameters from Version 1 remain the same for Version 2, plus one extra parameter pointing to a valid ```id_embeds``` file: ```--pm-id-embed-path [path_to_id_embeds.bin]```
+
+
--- a/docs/qwen_image.md
+++ b/docs/qwen_image.md
@ -0,0 +1,23 @@
+# How to Use
+
+## Download weights
+
+- Download Qwen Image
+    - safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/diffusion_models
+    - gguf: https://huggingface.co/QuantStack/Qwen-Image-GGUF/tree/main
+- Download vae
+    - safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/vae
+- Download qwen_2.5_vl 7b
+    - safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/text_encoders
+    - gguf: https://huggingface.co/mradermacher/Qwen2.5-VL-7B-Instruct-GGUF/tree/main
+
+## Examples
+
+```
+.\bin\Release\sd.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\qwen-image-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf  -p '一个穿着"QWEN"标志的T恤的中国美女正拿着黑色的马克笔面相镜头微笑。她身后的玻璃板上手写体写着 “一、Qwen-Image的技术路线： 探索视觉生成基础模型的极限，开创理解与生成一体化的未来。二、Qwen-Image的模型特色：1、复杂文字渲染。支持中英渲染、自动布局； 2、精准图像编辑。支持文字编辑、物体增减、风格变换。三、Qwen-Image的未来愿景：赋能专业内容创作、助力生成式AI发展。”' --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu -H 1024 -W 1024 --diffusion-fa --flow-shift 3
+```
+
+<img alt="qwen example" src="../assets/qwen/example.png" />
+
+
+
--- a/docs/qwen_image_edit.md
+++ b/docs/qwen_image_edit.md
@ -0,0 +1,35 @@
+# How to Use
+
+## Download weights
+
+- Download Qwen Image
+    - Qwen Image Edit
+        - safetensors: https://huggingface.co/Comfy-Org/Qwen-Image-Edit_ComfyUI/tree/main/split_files/diffusion_models
+        - gguf: https://huggingface.co/QuantStack/Qwen-Image-Edit-GGUF/tree/main
+    - Qwen Image Edit 2509
+        - safetensors: https://huggingface.co/Comfy-Org/Qwen-Image-Edit_ComfyUI/tree/main/split_files/diffusion_models
+        - gguf: https://huggingface.co/QuantStack/Qwen-Image-Edit-2509-GGUF/tree/main
+- Download vae
+    - safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/vae
+- Download qwen_2.5_vl 7b
+    - safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/text_encoders
+    - gguf: https://huggingface.co/mradermacher/Qwen2.5-VL-7B-Instruct-GGUF/tree/main
+
+## Examples
+
+### Qwen Image Edit
+
+```
+.\bin\Release\sd.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Qwen_Image_Edit-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'" --seed 1118877715456453
+```
+
+<img alt="qwen_image_edit" src="../assets/qwen/qwen_image_edit.png" />
+
+
+### Qwen Image Edit 2509
+
+```
+.\bin\Release\sd.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Qwen-Image-Edit-2509-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf --llm_vision ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct.mmproj-Q8_0.gguf --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'Qwen Image Edit 2509'"
+```
+
+<img alt="qwen_image_edit_2509" src="../assets/qwen/qwen_image_edit_2509.png" />
--- a/docs/sd.md
+++ b/docs/sd.md
@ -0,0 +1,37 @@
+## Download weights
+
+- Download the original weights (.ckpt or .safetensors). For example:
+    - Stable Diffusion v1.4 from https://huggingface.co/CompVis/stable-diffusion-v-1-4-original
+    - Stable Diffusion v1.5 from https://huggingface.co/runwayml/stable-diffusion-v1-5
+    - Stable Diffusion v2.1 from https://huggingface.co/stabilityai/stable-diffusion-2-1
+    - Stable Diffusion 3 2B from https://huggingface.co/stabilityai/stable-diffusion-3-medium
+
+### txt2img example
+
+```sh
+./bin/sd -m ../models/sd-v1-4.ckpt -p "a lovely cat"
+# ./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
+# ./bin/sd -m ../models/sd_xl_base_1.0.safetensors --vae ../models/sdxl_vae-fp16-fix.safetensors -H 1024 -W 1024 -p "a lovely cat" -v
+# ./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable Diffusion CPP\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
+# ./bin/sd --diffusion-model  ../models/flux1-dev-q3_k.gguf --vae ../models/ae.sft --clip_l ../models/clip_l.safetensors --t5xxl ../models/t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
+# ./bin/sd -m  ../models/sd3.5_large.safetensors --clip_l ../models/clip_l.safetensors --clip_g ../models/clip_g.safetensors --t5xxl ../models/t5xxl_fp16.safetensors  -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
+```
+
+Using formats of different precisions will yield results of varying quality.
+
+| f32  | f16  |q8_0  |q5_0  |q5_1  |q4_0  |q4_1  |
+| ----  |----  |----  |----  |----  |----  |----  |
+| ![](../assets/f32.png) |![](../assets/f16.png) |![](../assets/q8_0.png) |![](../assets/q5_0.png) |![](../assets/q5_1.png) |![](../assets/q4_0.png) |![](../assets/q4_1.png) |
+
+### img2img example
+
+- `./output.png` is the image generated from the above txt2img pipeline
+
+
+```
+./bin/sd -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
+```
+
+<p align="center">
+  <img src="../assets/img2img_output.png" width="256x">
+</p>
--- a/docs/sd3.md
+++ b/docs/sd3.md
@ -14,7 +14,7 @@
 For example:

 ```
-.\bin\Release\sd.exe -m  ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors  -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v
+.\bin\Release\sd.exe -m  ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors  -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
 ```

 ![](../assets/sd3.5_large.png)
--- a/docs/taesd.md
+++ b/docs/taesd.md
@ -7,7 +7,7 @@ You can use TAESD to accelerate the decoding of latent images by following these
 Or curl

 ```bash
-curl -L -O https://huggingface.co/madebyollin/taesd/blob/main/diffusion_pytorch_model.safetensors
+curl -L -O https://huggingface.co/madebyollin/taesd/resolve/main/diffusion_pytorch_model.safetensors
 ```

 - Specify the model path using the `--taesd PATH` parameter. example:
--- a/docs/wan.md
+++ b/docs/wan.md
@ -0,0 +1,204 @@
+# How to Use
+
+## Download weights
+
+- Download Wan
+    - Wan2.1
+        - Wan2.1 T2V 1.3B
+            - safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
+        - Wan2.1 T2V 14B
+            - safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
+            - gguf: https://huggingface.co/city96/Wan2.1-T2V-14B-gguf/tree/main
+        - Wan2.1 I2V 14B 480P
+            - safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
+            - gguf: https://huggingface.co/city96/Wan2.1-I2V-14B-480P-gguf/tree/main
+        - Wan2.1 I2V 14B 720P
+            - safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
+            - gguf: https://huggingface.co/city96/Wan2.1-I2V-14B-720P-gguf/tree/main
+        - Wan2.1 FLF2V 14B 720P
+            - safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
+            - gguf: https://huggingface.co/city96/Wan2.1-FLF2V-14B-720P-gguf/tree/main
+        - Wan2.1 VACE 1.3B
+            - safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
+            - gguf: https://huggingface.co/calcuis/wan-1.3b-gguf/tree/main
+        - Wan2.1 VACE 14B
+            - safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/tree/main/split_files/diffusion_models
+            - gguf: https://huggingface.co/QuantStack/Wan2.1_14B_VACE-GGUF/tree/main
+    - Wan2.2
+        - Wan2.2 TI2V 5B
+            - safetensors: https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/tree/main/split_files/diffusion_models
+            - gguf: https://huggingface.co/QuantStack/Wan2.2-TI2V-5B-GGUF/tree/main
+        - Wan2.2 T2V A14B
+            - safetensors: https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/tree/main/split_files/diffusion_models
+            - gguf: https://huggingface.co/QuantStack/Wan2.2-T2V-A14B-GGUF/tree/main
+        - Wan2.2 I2V A14B
+            - safetensors: https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/tree/main/split_files/diffusion_models
+            - gguf: https://huggingface.co/QuantStack/Wan2.2-I2V-A14B-GGUF/tree/main
+- Download vae
+    - wan_2.1_vae (for all Wan models except Wan2.2 TI2V 5B)
+        - safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/vae/wan_2.1_vae.safetensors
+    - wan_2.2_vae (for Wan2.2 TI2V 5B only)
+        - safetensors: https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/blob/main/split_files/vae/wan2.2_vae.safetensors
+- Download umt5_xxl
+    - safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/text_encoders/umt5_xxl_fp16.safetensors
+    - gguf: https://huggingface.co/city96/umt5-xxl-encoder-gguf/tree/main
+
+- Download clip_vision_h (for Wan2.1 I2V/FLF2V only)
+    - safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/clip_vision/clip_vision_h.safetensors
+
+
+## Examples
+
+### Wan2.1 T2V 1.3B
+
+```
+.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\wan2.1_t2v_1.3B_fp16.safetensors --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部， 畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa --video-frames 33 --flow-shift 3.0
+```
+
+<video src=../assets/wan/Wan2.1_1.3B_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
+
+### Wan2.1 T2V 14B
+
+```
+.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\wan2.1-t2v-14b-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa  --offload-to-cpu --video-frames 33 --flow-shift 3.0
+```
+
+<video src=../assets/wan/Wan2.1_14B_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
+
+
+
+### Wan2.1 I2V 14B
+
+```
+.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\wan2.1-i2v-14b-480p-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf --clip_vision ..\..\ComfyUI\models\clip_vision\clip_vision_h.safetensors -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 480 -H 832 --diffusion-fa --video-frames 33 --offload-to-cpu -i ..\assets\cat_with_sd_cpp_42.png --flow-shift 3.0
+```
+
+<video src=../assets/wan/Wan2.1_14B_i2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
+
+### Wan2.2 T2V A14B
+
+```
+.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --video-frames 33 --flow-shift 3.0
+```
+
+<video src=../assets/wan/Wan2.2_14B_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
+
+### Wan2.2 I2V A14B
+
+```
+.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --video-frames 33 -i ..\assets\cat_with_sd_cpp_42.png --flow-shift 3.0
+```
+
+<video src=../assets/wan/Wan2.2_14B_i2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
+
+### Wan2.2 T2V A14B T2I
+
+```
+.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --flow-shift 3.0
+```
+
+<img width="832" height="480" alt="Wan2 2_14B_t2i" src="../assets/wan/Wan2.2_14B_t2i.png" />
+
+### Wan2.2 T2V A14B with LoRA
+
+```
+.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat<lora:wan2.2_t2v_lightx2v_4steps_lora_v1.1_low_noise:1><lora:|high_noise|wan2.2_t2v_lightx2v_4steps_lora_v1.1_high_noise:1>" --cfg-scale 3.5 --sampling-method euler --steps 4 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 4 -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --lora-model-dir ..\..\ComfyUI\models\loras --video-frames 33 --flow-shift 3.0
+```
+
+<video src=../assets/wan/Wan2.2_14B_t2v_lora.mp4 controls="controls" muted="muted" type="video/mp4"></video>
+
+
+
+### Wan2.2 TI2V 5B
+
+#### T2V
+
+```
+.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\wan2.2_ti2v_5B_fp16.safetensors --vae ..\..\ComfyUI\models\vae\wan2.2_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 480 -H 832 --diffusion-fa --offload-to-cpu --video-frames 33 --flow-shift 3.0
+```
+
+<video src=../assets/wan/Wan2.2_5B_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
+
+#### I2V
+
+```
+.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\wan2.2_ti2v_5B_fp16.safetensors --vae ..\..\ComfyUI\models\vae\wan2.2_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 480 -H 832 --diffusion-fa --offload-to-cpu --video-frames 33 -i ..\assets\cat_with_sd_cpp_42.png --flow-shift 3.0
+```
+
+<video src=../assets/wan/Wan2.2_5B_i2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
+
+### Wan2.1 FLF2V 14B
+
+```
+.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\wan2.1-flf2v-14b-720p-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf --clip_vision ..\..\ComfyUI\models\clip_vision\clip_vision_h.safetensors -p "glass flower blossom" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 480 -H 832 --diffusion-fa --video-frames 33 --offload-to-cpu --init-img ..\..\ComfyUI\input\start_image.png --end-img ..\..\ComfyUI\input\end_image.png --flow-shift 3.0
+```
+
+
+<video src=../assets/wan/Wan2.1_14B_flf2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
+
+### Wan2.2 FLF2V 14B
+
+```
+.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -p "glass flower blossom" -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 480 -H 832 --diffusion-fa --video-frames 33 --offload-to-cpu --init-img ..\..\ComfyUI\input\start_image.png --end-img ..\..\ComfyUI\input\end_image.png --flow-shift 3.0
+```
+
+<video src=../assets/wan/Wan2.2_14B_flf2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
+
+### Wan2.1 VACE 1.3B
+
+#### T2V
+
+```
+.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部， 畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa --video-frames 1 --offload-to-cpu
+```
+
+<video src=../assets/wan/Wan2.1_1.3B_vace_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
+
+
+#### R2V
+
+```
+.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部， 畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa -i ..\assets\cat_with_sd_cpp_42.png --video-frames 33 --offload-to-cpu
+```
+
+<video src=../assets/wan/Wan2.1_1.3B_vace_r2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
+
+
+#### V2V
+
+```
+mkdir post+depth
+ffmpeg -i ..\..\ComfyUI\input\post+depth.mp4 -qscale:v 1 -vf fps=8 post+depth\frame_%04d.jpg
+.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "The girl is dancing in a sea of flowers, slowly moving her hands. There is a close - up shot of her upper body. The character is surrounded by other transparent glass flowers in the style of Nicoletta Ceccoli, creating a beautiful, surreal, and emotionally expressive movie scene with a white. transparent feel and a dreamyl atmosphere." --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部， 畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 480 -H 832 --diffusion-fa -i ..\..\ComfyUI\input\dance_girl.jpg --control-video ./post+depth --video-frames 33 --offload-to-cpu
+```
+
+<video src=../assets/wan/Wan2.1_1.3B_vace_v2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
+
+### Wan2.1 VACE 14B
+
+#### T2V
+
+```
+.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部， 畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa --video-frames 33 --offload-to-cpu
+```
+
+<video src=../assets/wan/Wan2.1_14B_vace_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
+
+
+#### R2V
+
+```
+.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部， 畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa -i ..\assets\cat_with_sd_cpp_42.png --video-frames 33 --offload-to-cpu
+```
+
+<video src=../assets/wan/Wan2.1_14B_vace_r2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
+
+
+
+#### V2V
+
+```
+.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "The girl is dancing in a sea of flowers, slowly moving her hands. There is a close - up shot of her upper body. The character is surrounded by other transparent glass flowers in the style of Nicoletta Ceccoli, creating a beautiful, surreal, and emotionally expressive movie scene with a white. transparent feel and a dreamyl atmosphere." --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部， 畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 480 -H 832 --diffusion-fa -i ..\..\ComfyUI\input\dance_girl.jpg --control-video ./post+depth --video-frames 33 --offload-to-cpu
+```
+
+<video src=../assets/wan/Wan2.1_14B_vace_v2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
--- a/docs/z_image.md
+++ b/docs/z_image.md
@ -0,0 +1,28 @@
+# How to Use
+
+You can run Z-Image with stable-diffusion.cpp on GPUs with 4GB of VRAM — or even less.
+
+## Download weights
+
+- Download Z-Image-Turbo
+    - safetensors: https://huggingface.co/Comfy-Org/z_image_turbo/tree/main/split_files/diffusion_models
+    - gguf: https://huggingface.co/leejet/Z-Image-Turbo-GGUF/tree/main
+- Download vae
+    - safetensors: https://huggingface.co/black-forest-labs/FLUX.1-schnell/tree/main
+- Download Qwen3 4b
+    - safetensors: https://huggingface.co/Comfy-Org/z_image_turbo/tree/main/split_files/text_encoders
+    - gguf: https://huggingface.co/unsloth/Qwen3-4B-Instruct-2507-GGUF/tree/main
+
+## Examples
+
+```
+.\bin\Release\sd.exe --diffusion-model  z_image_turbo-Q3_K.gguf --vae ..\..\ComfyUI\models\vae\ae.sft  --llm ..\..\ComfyUI\models\text_encoders\Qwen3-4B-Instruct-2507-Q4_K_M.gguf -p "A cinematic, melancholic photograph of a solitary hooded figure walking through a sprawling, rain-slicked metropolis at night. The city lights are a chaotic blur of neon orange and cool blue, reflecting on the wet asphalt. The scene evokes a sense of being a single component in a vast machine. Superimposed over the image in a sleek, modern, slightly glitched font is the philosophical quote: 'THE CITY IS A CIRCUIT BOARD, AND I AM A BROKEN TRANSISTOR.' -- moody, atmospheric, profound, dark academic" --cfg-scale 1.0 -v --offload-to-cpu --diffusion-fa -H 1024 -W 512
+```
+
+<img width="256" alt="z-image example" src="../assets/z_image/q3_K.png" />
+
+## Comparison of Different Quantization Types
+
+| bf16 | q8_0 | q6_K | q5_0 | q4_K | q4_0 | q3_K | q2_K|
+|---|---|---|---|---|---|---|---|
+| <img width="256" alt="bf16" src="../assets/z_image/bf16.png" /> | <img width="256" alt="q8_0" src="../assets/z_image/q8_0.png" /> | <img width="256" alt="q6_K" src="../assets/z_image/q6_K.png" /> | <img width="256" alt="q5_0" src="../assets/z_image/q5_0.png" />  | <img width="256" alt="q4_K" src="../assets/z_image/q4_K.png" /> | <img width="256" alt="q4_0" src="../assets/z_image/q4_0.png" /> | <img width="256" alt="q3_K" src="../assets/z_image/q3_K.png" /> | <img width="256" alt="q2_K" src="../assets/z_image/q2_K.png" /> |
--- a/easycache.hpp
+++ b/easycache.hpp
@ -0,0 +1,265 @@
+#include <cmath>
+#include <limits>
+#include <unordered_map>
+#include <vector>
+
+#include "denoiser.hpp"
+#include "ggml_extend.hpp"
+
+struct EasyCacheConfig {
+    bool enabled          = false;
+    float reuse_threshold = 0.2f;
+    float start_percent   = 0.15f;
+    float end_percent     = 0.95f;
+};
+
+struct EasyCacheCacheEntry {
+    std::vector<float> diff;
+};
+
+struct EasyCacheState {
+    EasyCacheConfig config;
+    Denoiser* denoiser                  = nullptr;
+    float start_sigma                   = std::numeric_limits<float>::max();
+    float end_sigma                     = 0.0f;
+    bool initialized                    = false;
+    bool initial_step                   = true;
+    bool skip_current_step              = false;
+    bool step_active                    = false;
+    const SDCondition* anchor_condition = nullptr;
+    std::unordered_map<const SDCondition*, EasyCacheCacheEntry> cache_diffs;
+    std::vector<float> prev_input;
+    std::vector<float> prev_output;
+    float output_prev_norm                = 0.0f;
+    bool has_prev_input                   = false;
+    bool has_prev_output                  = false;
+    bool has_output_prev_norm             = false;
+    bool has_relative_transformation_rate = false;
+    float relative_transformation_rate    = 0.0f;
+    float cumulative_change_rate          = 0.0f;
+    float last_input_change               = 0.0f;
+    bool has_last_input_change            = false;
+    int total_steps_skipped               = 0;
+    int current_step_index                = -1;
+
+    void reset_runtime() {
+        initial_step      = true;
+        skip_current_step = false;
+        step_active       = false;
+        anchor_condition  = nullptr;
+        cache_diffs.clear();
+        prev_input.clear();
+        prev_output.clear();
+        output_prev_norm                 = 0.0f;
+        has_prev_input                   = false;
+        has_prev_output                  = false;
+        has_output_prev_norm             = false;
+        has_relative_transformation_rate = false;
+        relative_transformation_rate     = 0.0f;
+        cumulative_change_rate           = 0.0f;
+        last_input_change                = 0.0f;
+        has_last_input_change            = false;
+        total_steps_skipped              = 0;
+        current_step_index               = -1;
+    }
+
+    void init(const EasyCacheConfig& cfg, Denoiser* d) {
+        config      = cfg;
+        denoiser    = d;
+        initialized = cfg.enabled && d != nullptr;
+        reset_runtime();
+        if (initialized) {
+            start_sigma = percent_to_sigma(config.start_percent);
+            end_sigma   = percent_to_sigma(config.end_percent);
+        }
+    }
+
+    bool enabled() const {
+        return initialized && config.enabled;
+    }
+
+    float percent_to_sigma(float percent) const {
+        if (!denoiser) {
+            return 0.0f;
+        }
+        if (percent <= 0.0f) {
+            return std::numeric_limits<float>::max();
+        }
+        if (percent >= 1.0f) {
+            return 0.0f;
+        }
+        float t = (1.0f - percent) * (TIMESTEPS - 1);
+        return denoiser->t_to_sigma(t);
+    }
+
+    void begin_step(int step_index, float sigma) {
+        if (!enabled()) {
+            return;
+        }
+        if (step_index == current_step_index) {
+            return;
+        }
+        current_step_index    = step_index;
+        skip_current_step     = false;
+        has_last_input_change = false;
+        step_active           = false;
+        if (sigma > start_sigma) {
+            return;
+        }
+        if (!(sigma > end_sigma)) {
+            return;
+        }
+        step_active = true;
+    }
+
+    bool step_is_active() const {
+        return enabled() && step_active;
+    }
+
+    bool is_step_skipped() const {
+        return enabled() && step_active && skip_current_step;
+    }
+
+    bool has_cache(const SDCondition* cond) const {
+        auto it = cache_diffs.find(cond);
+        return it != cache_diffs.end() && !it->second.diff.empty();
+    }
+
+    void update_cache(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) {
+        EasyCacheCacheEntry& entry = cache_diffs[cond];
+        size_t ne                  = static_cast<size_t>(ggml_nelements(output));
+        entry.diff.resize(ne);
+        float* out_data = (float*)output->data;
+        float* in_data  = (float*)input->data;
+        for (size_t i = 0; i < ne; ++i) {
+            entry.diff[i] = out_data[i] - in_data[i];
+        }
+    }
+
+    void apply_cache(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) {
+        auto it = cache_diffs.find(cond);
+        if (it == cache_diffs.end() || it->second.diff.empty()) {
+            return;
+        }
+        copy_ggml_tensor(output, input);
+        float* out_data                = (float*)output->data;
+        const std::vector<float>& diff = it->second.diff;
+        for (size_t i = 0; i < diff.size(); ++i) {
+            out_data[i] += diff[i];
+        }
+    }
+
+    bool before_condition(const SDCondition* cond,
+                          ggml_tensor* input,
+                          ggml_tensor* output,
+                          float sigma,
+                          int step_index) {
+        if (!enabled() || step_index < 0) {
+            return false;
+        }
+        if (step_index != current_step_index) {
+            begin_step(step_index, sigma);
+        }
+        if (!step_active) {
+            return false;
+        }
+        if (initial_step) {
+            anchor_condition = cond;
+            initial_step     = false;
+        }
+        bool is_anchor = (cond == anchor_condition);
+        if (skip_current_step) {
+            if (has_cache(cond)) {
+                apply_cache(cond, input, output);
+                return true;
+            }
+            return false;
+        }
+        if (!is_anchor) {
+            return false;
+        }
+        if (!has_prev_input || !has_prev_output || !has_cache(cond)) {
+            return false;
+        }
+        size_t ne = static_cast<size_t>(ggml_nelements(input));
+        if (prev_input.size() != ne) {
+            return false;
+        }
+        float* input_data = (float*)input->data;
+        last_input_change = 0.0f;
+        for (size_t i = 0; i < ne; ++i) {
+            last_input_change += std::fabs(input_data[i] - prev_input[i]);
+        }
+        if (ne > 0) {
+            last_input_change /= static_cast<float>(ne);
+        }
+        has_last_input_change = true;
+
+        if (has_output_prev_norm && has_relative_transformation_rate && last_input_change > 0.0f && output_prev_norm > 0.0f) {
+            float approx_output_change_rate = (relative_transformation_rate * last_input_change) / output_prev_norm;
+            cumulative_change_rate += approx_output_change_rate;
+            if (cumulative_change_rate < config.reuse_threshold) {
+                skip_current_step = true;
+                total_steps_skipped++;
+                apply_cache(cond, input, output);
+                return true;
+            } else {
+                cumulative_change_rate = 0.0f;
+            }
+        }
+
+        return false;
+    }
+
+    void after_condition(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) {
+        if (!step_is_active()) {
+            return;
+        }
+        update_cache(cond, input, output);
+        if (cond != anchor_condition) {
+            return;
+        }
+
+        size_t ne      = static_cast<size_t>(ggml_nelements(input));
+        float* in_data = (float*)input->data;
+        prev_input.resize(ne);
+        for (size_t i = 0; i < ne; ++i) {
+            prev_input[i] = in_data[i];
+        }
+        has_prev_input = true;
+
+        float* out_data     = (float*)output->data;
+        float output_change = 0.0f;
+        if (has_prev_output && prev_output.size() == ne) {
+            for (size_t i = 0; i < ne; ++i) {
+                output_change += std::fabs(out_data[i] - prev_output[i]);
+            }
+            if (ne > 0) {
+                output_change /= static_cast<float>(ne);
+            }
+        }
+
+        prev_output.resize(ne);
+        for (size_t i = 0; i < ne; ++i) {
+            prev_output[i] = out_data[i];
+        }
+        has_prev_output = true;
+
+        float mean_abs = 0.0f;
+        for (size_t i = 0; i < ne; ++i) {
+            mean_abs += std::fabs(out_data[i]);
+        }
+        output_prev_norm     = (ne > 0) ? (mean_abs / static_cast<float>(ne)) : 0.0f;
+        has_output_prev_norm = output_prev_norm > 0.0f;
+
+        if (has_last_input_change && last_input_change > 0.0f && output_change > 0.0f) {
+            float rate = output_change / last_input_change;
+            if (std::isfinite(rate)) {
+                relative_transformation_rate     = rate;
+                has_relative_transformation_rate = true;
+            }
+        }
+        cumulative_change_rate = 0.0f;
+        has_last_input_change  = false;
+    }
+};
--- a/esrgan.hpp
+++ b/esrgan.hpp
@ -27,11 +27,11 @@ public:
        blocks["conv5"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 4 * num_grow_ch, num_feat, {3, 3}, {1, 1}, {1, 1}));
    }

-    struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) {
-        return ggml_leaky_relu(ctx, x, 0.2f, true);
+    struct ggml_tensor* lrelu(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+        return ggml_leaky_relu(ctx->ggml_ctx, x, 0.2f, true);
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [n, num_feat, h, w]
        // return: [n, num_feat, h, w]

@ -42,16 +42,16 @@ public:
        auto conv5 = std::dynamic_pointer_cast<Conv2d>(blocks["conv5"]);

        auto x1    = lrelu(ctx, conv1->forward(ctx, x));
-        auto x_cat = ggml_concat(ctx, x, x1, 2);
+        auto x_cat = ggml_concat(ctx->ggml_ctx, x, x1, 2);
        auto x2    = lrelu(ctx, conv2->forward(ctx, x_cat));
-        x_cat      = ggml_concat(ctx, x_cat, x2, 2);
+        x_cat      = ggml_concat(ctx->ggml_ctx, x_cat, x2, 2);
        auto x3    = lrelu(ctx, conv3->forward(ctx, x_cat));
-        x_cat      = ggml_concat(ctx, x_cat, x3, 2);
+        x_cat      = ggml_concat(ctx->ggml_ctx, x_cat, x3, 2);
        auto x4    = lrelu(ctx, conv4->forward(ctx, x_cat));
-        x_cat      = ggml_concat(ctx, x_cat, x4, 2);
+        x_cat      = ggml_concat(ctx->ggml_ctx, x_cat, x4, 2);
        auto x5    = conv5->forward(ctx, x_cat);

-        x5 = ggml_add(ctx, ggml_scale(ctx, x5, 0.2f), x);
+        x5 = ggml_add(ctx->ggml_ctx, ggml_scale(ctx->ggml_ctx, x5, 0.2f), x);
        return x5;
    }
 };
@ -64,7 +64,7 @@ public:
        blocks["rdb3"] = std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [n, num_feat, h, w]
        // return: [n, num_feat, h, w]

@ -76,46 +76,51 @@ public:
        out      = rdb2->forward(ctx, out);
        out      = rdb3->forward(ctx, out);

-        out = ggml_add(ctx, ggml_scale(ctx, out, 0.2f), x);
+        out = ggml_add(ctx->ggml_ctx, ggml_scale(ctx->ggml_ctx, out, 0.2f), x);
        return out;
    }
 };

 class RRDBNet : public GGMLBlock {
 protected:
-    int scale       = 4;  // default RealESRGAN_x4plus_anime_6B
-    int num_block   = 6;  // default RealESRGAN_x4plus_anime_6B
+    int scale       = 4;
+    int num_block   = 23;
    int num_in_ch   = 3;
    int num_out_ch  = 3;
-    int num_feat    = 64;  // default RealESRGAN_x4plus_anime_6B
-    int num_grow_ch = 32;  // default RealESRGAN_x4plus_anime_6B
+    int num_feat    = 64;
+    int num_grow_ch = 32;

 public:
-    RRDBNet() {
+    RRDBNet(int scale, int num_block, int num_in_ch, int num_out_ch, int num_feat, int num_grow_ch)
+        : scale(scale), num_block(num_block), num_in_ch(num_in_ch), num_out_ch(num_out_ch), num_feat(num_feat), num_grow_ch(num_grow_ch) {
        blocks["conv_first"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_in_ch, num_feat, {3, 3}, {1, 1}, {1, 1}));
        for (int i = 0; i < num_block; i++) {
            std::string name = "body." + std::to_string(i);
            blocks[name]     = std::shared_ptr<GGMLBlock>(new RRDB(num_feat, num_grow_ch));
        }
        blocks["conv_body"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
-        // upsample
+        if (scale >= 2) {
            blocks["conv_up1"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
+        }
+        if (scale == 4) {
            blocks["conv_up2"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
+        }
        blocks["conv_hr"]   = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
        blocks["conv_last"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_out_ch, {3, 3}, {1, 1}, {1, 1}));
    }

-    struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) {
-        return ggml_leaky_relu(ctx, x, 0.2f, true);
+    int get_scale() { return scale; }
+    int get_num_block() { return num_block; }
+
+    struct ggml_tensor* lrelu(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+        return ggml_leaky_relu(ctx->ggml_ctx, x, 0.2f, true);
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [n, num_in_ch, h, w]
-        // return: [n, num_out_ch, h*4, w*4]
+        // return: [n, num_out_ch, h*scale, w*scale]
        auto conv_first = std::dynamic_pointer_cast<Conv2d>(blocks["conv_first"]);
        auto conv_body  = std::dynamic_pointer_cast<Conv2d>(blocks["conv_body"]);
-        auto conv_up1   = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up1"]);
-        auto conv_up2   = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up2"]);
        auto conv_hr    = std::dynamic_pointer_cast<Conv2d>(blocks["conv_hr"]);
        auto conv_last  = std::dynamic_pointer_cast<Conv2d>(blocks["conv_last"]);

@ -128,70 +133,235 @@ public:
            body_feat = block->forward(ctx, body_feat);
        }
        body_feat = conv_body->forward(ctx, body_feat);
-        feat      = ggml_add(ctx, feat, body_feat);
+        feat      = ggml_add(ctx->ggml_ctx, feat, body_feat);
        // upsample
-        feat     = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx, feat, 2)));
-        feat     = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx, feat, 2)));
+        if (scale >= 2) {
+            auto conv_up1 = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up1"]);
+            feat          = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx->ggml_ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
+            if (scale == 4) {
+                auto conv_up2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up2"]);
+                feat          = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx->ggml_ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
+            }
+        }
+        // for all scales
        auto out = conv_last->forward(ctx, lrelu(ctx, conv_hr->forward(ctx, feat)));
        return out;
    }
 };

 struct ESRGAN : public GGMLRunner {
-    RRDBNet rrdb_net;
+    std::unique_ptr<RRDBNet> rrdb_net;
    int scale     = 4;
    int tile_size = 128;  // avoid cuda OOM for 4gb VRAM

    ESRGAN(ggml_backend_t backend,
-           ggml_type wtype)
-        : GGMLRunner(backend, wtype) {
-        rrdb_net.init(params_ctx, wtype);
+           bool offload_params_to_cpu,
+           int tile_size                                  = 128,
+           const String2TensorStorage& tensor_storage_map = {})
+        : GGMLRunner(backend, offload_params_to_cpu) {
+        this->tile_size = tile_size;
    }

-    std::string get_desc() {
+    std::string get_desc() override {
        return "esrgan";
    }

-    bool load_from_file(const std::string& file_path) {
+    bool load_from_file(const std::string& file_path, int n_threads) {
        LOG_INFO("loading esrgan from '%s'", file_path.c_str());

-        alloc_params_buffer();
-        std::map<std::string, ggml_tensor*> esrgan_tensors;
-        rrdb_net.get_param_tensors(esrgan_tensors);
-
        ModelLoader model_loader;
-        if (!model_loader.init_from_file(file_path)) {
+        if (!model_loader.init_from_file_and_convert_name(file_path)) {
            LOG_ERROR("init esrgan model loader from file failed: '%s'", file_path.c_str());
            return false;
        }

-        bool success = model_loader.load_tensors(esrgan_tensors, backend);
+        // Get tensor names
+        auto tensor_names = model_loader.get_tensor_names();
+
+        // Detect if it's ESRGAN format
+        bool is_ESRGAN = std::find(tensor_names.begin(), tensor_names.end(), "model.0.weight") != tensor_names.end();
+
+        // Detect parameters from tensor names
+        int detected_num_block = 0;
+        if (is_ESRGAN) {
+            for (const auto& name : tensor_names) {
+                if (name.find("model.1.sub.") == 0) {
+                    size_t first_dot = name.find('.', 12);
+                    if (first_dot != std::string::npos) {
+                        size_t second_dot = name.find('.', first_dot + 1);
+                        if (second_dot != std::string::npos && name.substr(first_dot + 1, 3) == "RDB") {
+                            try {
+                                int idx            = std::stoi(name.substr(12, first_dot - 12));
+                                detected_num_block = std::max(detected_num_block, idx + 1);
+                            } catch (...) {
+                            }
+                        }
+                    }
+                }
+            }
+        } else {
+            // Original format
+            for (const auto& name : tensor_names) {
+                if (name.find("body.") == 0) {
+                    size_t pos = name.find('.', 5);
+                    if (pos != std::string::npos) {
+                        try {
+                            int idx            = std::stoi(name.substr(5, pos - 5));
+                            detected_num_block = std::max(detected_num_block, idx + 1);
+                        } catch (...) {
+                        }
+                    }
+                }
+            }
+        }
+
+        int detected_scale = 4;  // default
+        if (is_ESRGAN) {
+            // For ESRGAN format, detect scale by highest model number
+            int max_model_num = 0;
+            for (const auto& name : tensor_names) {
+                if (name.find("model.") == 0) {
+                    size_t dot_pos = name.find('.', 6);
+                    if (dot_pos != std::string::npos) {
+                        try {
+                            int num       = std::stoi(name.substr(6, dot_pos - 6));
+                            max_model_num = std::max(max_model_num, num);
+                        } catch (...) {
+                        }
+                    }
+                }
+            }
+            if (max_model_num <= 4) {
+                detected_scale = 1;
+            } else if (max_model_num <= 7) {
+                detected_scale = 2;
+            } else {
+                detected_scale = 4;
+            }
+        } else {
+            // Original format
+            bool has_conv_up2 = std::any_of(tensor_names.begin(), tensor_names.end(), [](const std::string& name) {
+                return name == "conv_up2.weight";
+            });
+            bool has_conv_up1 = std::any_of(tensor_names.begin(), tensor_names.end(), [](const std::string& name) {
+                return name == "conv_up1.weight";
+            });
+            if (has_conv_up2) {
+                detected_scale = 4;
+            } else if (has_conv_up1) {
+                detected_scale = 2;
+            } else {
+                detected_scale = 1;
+            }
+        }
+
+        int detected_num_in_ch   = 3;
+        int detected_num_out_ch  = 3;
+        int detected_num_feat    = 64;
+        int detected_num_grow_ch = 32;
+
+        // Create RRDBNet with detected parameters
+        rrdb_net = std::make_unique<RRDBNet>(detected_scale, detected_num_block, detected_num_in_ch, detected_num_out_ch, detected_num_feat, detected_num_grow_ch);
+        rrdb_net->init(params_ctx, {}, "");
+
+        alloc_params_buffer();
+        std::map<std::string, ggml_tensor*> esrgan_tensors;
+        rrdb_net->get_param_tensors(esrgan_tensors);
+
+        bool success;
+        if (is_ESRGAN) {
+            // Build name mapping for ESRGAN format
+            std::map<std::string, std::string> expected_to_model;
+            expected_to_model["conv_first.weight"] = "model.0.weight";
+            expected_to_model["conv_first.bias"]   = "model.0.bias";
+
+            for (int i = 0; i < detected_num_block; i++) {
+                for (int j = 1; j <= 3; j++) {
+                    for (int k = 1; k <= 5; k++) {
+                        std::string expected_weight        = "body." + std::to_string(i) + ".rdb" + std::to_string(j) + ".conv" + std::to_string(k) + ".weight";
+                        std::string model_weight           = "model.1.sub." + std::to_string(i) + ".RDB" + std::to_string(j) + ".conv" + std::to_string(k) + ".0.weight";
+                        expected_to_model[expected_weight] = model_weight;
+
+                        std::string expected_bias        = "body." + std::to_string(i) + ".rdb" + std::to_string(j) + ".conv" + std::to_string(k) + ".bias";
+                        std::string model_bias           = "model.1.sub." + std::to_string(i) + ".RDB" + std::to_string(j) + ".conv" + std::to_string(k) + ".0.bias";
+                        expected_to_model[expected_bias] = model_bias;
+                    }
+                }
+            }
+
+            if (detected_scale == 1) {
+                expected_to_model["conv_body.weight"] = "model.1.sub." + std::to_string(detected_num_block) + ".weight";
+                expected_to_model["conv_body.bias"]   = "model.1.sub." + std::to_string(detected_num_block) + ".bias";
+                expected_to_model["conv_hr.weight"]   = "model.2.weight";
+                expected_to_model["conv_hr.bias"]     = "model.2.bias";
+                expected_to_model["conv_last.weight"] = "model.4.weight";
+                expected_to_model["conv_last.bias"]   = "model.4.bias";
+            } else {
+                expected_to_model["conv_body.weight"] = "model.1.sub." + std::to_string(detected_num_block) + ".weight";
+                expected_to_model["conv_body.bias"]   = "model.1.sub." + std::to_string(detected_num_block) + ".bias";
+                if (detected_scale >= 2) {
+                    expected_to_model["conv_up1.weight"] = "model.3.weight";
+                    expected_to_model["conv_up1.bias"]   = "model.3.bias";
+                }
+                if (detected_scale == 4) {
+                    expected_to_model["conv_up2.weight"]  = "model.6.weight";
+                    expected_to_model["conv_up2.bias"]    = "model.6.bias";
+                    expected_to_model["conv_hr.weight"]   = "model.8.weight";
+                    expected_to_model["conv_hr.bias"]     = "model.8.bias";
+                    expected_to_model["conv_last.weight"] = "model.10.weight";
+                    expected_to_model["conv_last.bias"]   = "model.10.bias";
+                } else if (detected_scale == 2) {
+                    expected_to_model["conv_hr.weight"]   = "model.5.weight";
+                    expected_to_model["conv_hr.bias"]     = "model.5.bias";
+                    expected_to_model["conv_last.weight"] = "model.7.weight";
+                    expected_to_model["conv_last.bias"]   = "model.7.bias";
+                }
+            }
+
+            std::map<std::string, ggml_tensor*> model_tensors;
+            for (auto& p : esrgan_tensors) {
+                auto it = expected_to_model.find(p.first);
+                if (it != expected_to_model.end()) {
+                    model_tensors[it->second] = p.second;
+                }
+            }
+
+            success = model_loader.load_tensors(model_tensors, {}, n_threads);
+        } else {
+            success = model_loader.load_tensors(esrgan_tensors, {}, n_threads);
+        }

        if (!success) {
            LOG_ERROR("load esrgan tensors from model loader failed");
            return false;
        }

-        LOG_INFO("esrgan model loaded");
+        scale = rrdb_net->get_scale();
+        LOG_INFO("esrgan model loaded with scale=%d, num_block=%d", scale, detected_num_block);
        return success;
    }

    struct ggml_cgraph* build_graph(struct ggml_tensor* x) {
-        struct ggml_cgraph* gf  = ggml_new_graph(compute_ctx);
+        if (!rrdb_net)
+            return nullptr;
+        constexpr int kGraphNodes = 1 << 16;  // 65k
+        struct ggml_cgraph* gf    = new_graph_custom(kGraphNodes);
        x                         = to_backend(x);
-        struct ggml_tensor* out = rrdb_net.forward(compute_ctx, x);
+
+        auto runner_ctx         = get_context();
+        struct ggml_tensor* out = rrdb_net->forward(&runner_ctx, x);
        ggml_build_forward_expand(gf, out);
        return gf;
    }

-    void compute(const int n_threads,
+    bool compute(const int n_threads,
                 struct ggml_tensor* x,
                 ggml_tensor** output,
-                 ggml_context* output_ctx = NULL) {
+                 ggml_context* output_ctx = nullptr) {
        auto get_graph = [&]() -> struct ggml_cgraph* {
            return build_graph(x);
        };
-        GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+        return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
    }
 };

--- a/examples/cli/CMakeLists.txt
+++ b/examples/cli/CMakeLists.txt
@ -3,4 +3,4 @@ set(TARGET sd)
 add_executable(${TARGET} main.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE stable-diffusion ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PUBLIC cxx_std_11)
+target_compile_features(${TARGET} PUBLIC c_std_11 cxx_std_17)
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@ -0,0 +1,128 @@
+# Run
+
+```
+usage: ./bin/sd  [options]
+
+CLI Options:
+  -o, --output <string>       path to write result image to (default: ./output.png)
+  --preview-path <string>     path to write preview image to (default: ./preview.png)
+  --preview-interval <int>    interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at
+                              every step)
+  --canny                     apply canny preprocessor (edge detection)
+  -v, --verbose               print extra info
+  --color                     colors the logging tags according to level
+  --taesd-preview-only        prevents usage of taesd for decoding the final image. (for use with --preview tae)
+  --preview-noisy             enables previewing noisy inputs of the models rather than the denoised outputs
+  -M, --mode                  run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen
+  --preview                   preview method. must be one of the following [none, proj, tae, vae] (default is none)
+  -h, --help                  show this help message and exit
+
+Context Options:
+  -m, --model <string>                     path to full model
+  --clip_l <string>                        path to the clip-l text encoder
+  --clip_g <string>                        path to the clip-g text encoder
+  --clip_vision <string>                   path to the clip-vision encoder
+  --t5xxl <string>                         path to the t5xxl text encoder
+  --llm <string>                           path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)
+  --llm_vision <string>                    path to the llm vit
+  --qwen2vl <string>                       alias of --llm. Deprecated.
+  --qwen2vl_vision <string>                alias of --llm_vision. Deprecated.
+  --diffusion-model <string>               path to the standalone diffusion model
+  --high-noise-diffusion-model <string>    path to the standalone high noise diffusion model
+  --vae <string>                           path to standalone vae model
+  --taesd <string>                         path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
+  --control-net <string>                   path to control net model
+  --embd-dir <string>                      embeddings directory
+  --lora-model-dir <string>                lora model directory
+  --tensor-type-rules <string>             weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
+  --photo-maker <string>                   path to PHOTOMAKER model
+  --upscale-model <string>                 path to esrgan model.
+  -t, --threads <int>                      number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
+                                           CPU physical cores
+  --chroma-t5-mask-pad <int>               t5 mask pad size of chroma
+  --vae-tile-overlap <float>               tile overlap for vae tiling, in fraction of tile size (default: 0.5)
+  --flow-shift <float>                     shift value for Flow models like SD3.x or WAN (default: auto)
+  --vae-tiling                             process vae in tiles to reduce memory usage
+  --force-sdxl-vae-conv-scale              force use of conv scale on sdxl vae
+  --offload-to-cpu                         place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
+  --control-net-cpu                        keep controlnet in cpu (for low vram)
+  --clip-on-cpu                            keep clip in cpu (for low vram)
+  --vae-on-cpu                             keep vae in cpu (for low vram)
+  --diffusion-fa                           use flash attention in the diffusion model
+  --diffusion-conv-direct                  use ggml_conv2d_direct in the diffusion model
+  --vae-conv-direct                        use ggml_conv2d_direct in the vae model
+  --chroma-disable-dit-mask                disable dit mask for chroma
+  --chroma-enable-t5-mask                  enable t5 mask for chroma
+  --type                                   weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
+                                           type of the weight file
+  --rng                                    RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
+  --sampler-rng                            sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
+  --prediction                             prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]
+  --lora-apply-mode                        the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights
+                                           contain any quantized parameters, the at_runtime mode will be used; otherwise,
+                                           immediately will be used. The immediately mode may have precision and
+                                           compatibility issues with quantized parameters, but it usually offers faster inference
+                                           speed and, in some cases, lower memory usage. The at_runtime mode, on the
+                                           other hand, is exactly the opposite.
+  --vae-tile-size                          tile size for vae tiling, format [X]x[Y] (default: 32x32)
+  --vae-relative-tile-size                 relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
+                                           (overrides --vae-tile-size)
+
+Generation Options:
+  -p, --prompt <string>                    the prompt to render
+  -n, --negative-prompt <string>           the negative prompt (default: "")
+  -i, --init-img <string>                  path to the init image
+  --end-img <string>                       path to the end image, required by flf2v
+  --mask <string>                          path to the mask image
+  --control-image <string>                 path to control image, control net
+  --control-video <string>                 path to control video frames. It must be a directory path. The video frames inside should be stored as images in
+                                           lexicographical (character) order. For example, if the control video path is
+                                           `frames`, the directory contains images such as 00.png, 01.png, etc.
+  --pm-id-images-dir <string>              path to PHOTOMAKER input id images dir
+  --pm-id-embed-path <string>              path to PHOTOMAKER v2 id embed
+  -H, --height <int>                       image height, in pixel space (default: 512)
+  -W, --width <int>                        image width, in pixel space (default: 512)
+  --steps <int>                            number of sample steps (default: 20)
+  --high-noise-steps <int>                 (high noise) number of sample steps (default: -1 = auto)
+  --clip-skip <int>                        ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified,
+                                           will be 1 for SD1.x, 2 for SD2.x
+  -b, --batch-count <int>                  batch count
+  --video-frames <int>                     video frames (default: 1)
+  --fps <int>                              fps (default: 24)
+  --timestep-shift <int>                   shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for
+                                           NitroSD-Vibrant
+  --upscale-repeats <int>                  Run the ESRGAN upscaler this many times (default: 1)
+  --cfg-scale <float>                      unconditional guidance scale: (default: 7.0)
+  --img-cfg-scale <float>                  image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
+  --guidance <float>                       distilled guidance scale for models with guidance input (default: 3.5)
+  --slg-scale <float>                      skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5
+                                           medium
+  --skip-layer-start <float>               SLG enabling point (default: 0.01)
+  --skip-layer-end <float>                 SLG disabling point (default: 0.2)
+  --eta <float>                            eta in DDIM, only for DDIM and TCD (default: 0)
+  --high-noise-cfg-scale <float>           (high noise) unconditional guidance scale: (default: 7.0)
+  --high-noise-img-cfg-scale <float>       (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
+  --high-noise-guidance <float>            (high noise) distilled guidance scale for models with guidance input (default: 3.5)
+  --high-noise-slg-scale <float>           (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
+  --high-noise-skip-layer-start <float>    (high noise) SLG enabling point (default: 0.01)
+  --high-noise-skip-layer-end <float>      (high noise) SLG disabling point (default: 0.2)
+  --high-noise-eta <float>                 (high noise) eta in DDIM, only for DDIM and TCD (default: 0)
+  --strength <float>                       strength for noising/unnoising (default: 0.75)
+  --pm-style-strength <float>
+  --control-strength <float>               strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image
+  --moe-boundary <float>                   timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1
+  --vace-strength <float>                  wan vace strength
+  --increase-ref-index                     automatically increase the indices of reference images based on the order they are listed (starting with 1).
+  --disable-auto-resize-ref-image          disable auto resize of ref images
+  -s, --seed                               RNG seed (default: 42, use random seed for < 0)
+  --sampling-method                        sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
+                                           tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise)
+  --high-noise-sampling-method             (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
+                                           ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise
+  --scheduler                              denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, lcm],
+                                           default: discrete
+  --skip-layers                            layers to skip for SLG steps (default: [7,8,9])
+  --high-noise-skip-layers                 (high noise) layers to skip for SLG steps (default: [7,8,9])
+  -r, --ref-image                          reference image for Flux Kontext models (can be used multiple times)
+  --easycache                              enable EasyCache for DiT models with optional "threshold,start_percent,end_percent" (default: 0.2,0.15,0.95)
+```
--- a/examples/cli/avi_writer.h
+++ b/examples/cli/avi_writer.h
@ -0,0 +1,217 @@
+#ifndef __AVI_WRITER_H__
+#define __AVI_WRITER_H__
+
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include "stable-diffusion.h"
+
+#ifndef INCLUDE_STB_IMAGE_WRITE_H
+#include "stb_image_write.h"
+#endif
+
+// Location and size of one encoded video frame. One entry is filled in per
+// image while the frames are written and later emitted into the 'idx1' chunk.
+typedef struct {
+    uint32_t offset;  // file position of the frame's '00dc' chunk header (ftell() - 8 right after the header is written)
+    uint32_t size;    // size in bytes of the frame's JPEG payload (padding byte not included)
+} avi_index_entry;
+
+// Write a 32-bit integer to `f` in little-endian byte order.
+// The bytes are emitted explicitly (least significant first) so the output does
+// not depend on the host's endianness. The function is static inline because
+// its body lives in a header that may be included from several translation units.
+static inline void write_u32_le(FILE* f, uint32_t val) {
+    const uint8_t bytes[4] = {
+        (uint8_t)(val & 0xFF),
+        (uint8_t)((val >> 8) & 0xFF),
+        (uint8_t)((val >> 16) & 0xFF),
+        (uint8_t)((val >> 24) & 0xFF),
+    };
+    fwrite(bytes, 1, sizeof(bytes), f);
+}
+
+// Write a 16-bit integer to `f` in little-endian byte order.
+// The bytes are emitted explicitly (least significant first) so the output does
+// not depend on the host's endianness. The function is static inline because
+// its body lives in a header that may be included from several translation units.
+static inline void write_u16_le(FILE* f, uint16_t val) {
+    const uint8_t bytes[2] = {
+        (uint8_t)(val & 0xFF),
+        (uint8_t)((val >> 8) & 0xFF),
+    };
+    fwrite(bytes, 1, sizeof(bytes), f);
+}
+
+/**
+ * Create an MJPG AVI file from an array of sd_image_t images.
+ * Every image is JPEG-encoded in memory with stb_image_write and stored as one
+ * '00dc' chunk inside the 'movi' list; an 'idx1' index is appended afterwards.
+ * The frame size written to the headers and the channel count used for encoding
+ * are taken from images[0].
+ *
+ * @param filename Output AVI file name.
+ * @param images Array of input images (3 or 4 channels).
+ * @param num_images Number of images in the array; must be > 0.
+ * @param fps Frames per second for the video; must be > 0.
+ * @param quality JPEG quality (0-100).
+ * @return 0 on success, -1 on failure (invalid arguments, file cannot be opened,
+ *         out of memory, JPEG encoding failure or write error).
+ */
+int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality = 90) {
+    // Validate everything before touching the file system so that a bad call
+    // does not leave an empty output file behind.
+    if (images == nullptr || num_images <= 0) {
+        fprintf(stderr, "Error: Image array is empty.\n");
+        return -1;
+    }
+    if (fps <= 0) {
+        // fps is used as a divisor for the microseconds-per-frame field below.
+        fprintf(stderr, "Error: Invalid fps: %d\n", fps);
+        return -1;
+    }
+
+    uint32_t width    = images[0].width;
+    uint32_t height   = images[0].height;
+    uint32_t channels = images[0].channel;
+    if (channels != 3 && channels != 4) {
+        fprintf(stderr, "Error: Unsupported channel count: %u\n", channels);
+        return -1;
+    }
+
+    avi_index_entry* index = (avi_index_entry*)malloc(sizeof(avi_index_entry) * (size_t)num_images);
+    if (!index) {
+        fprintf(stderr, "Error: Out of memory.\n");
+        return -1;
+    }
+
+    FILE* f = fopen(filename, "wb");
+    if (!f) {
+        perror("Error opening file for writing");
+        free(index);
+        return -1;
+    }
+
+    // --- RIFF AVI Header ---
+    fwrite("RIFF", 4, 1, f);
+    long riff_size_pos = ftell(f);
+    write_u32_le(f, 0);  // Placeholder for file size
+    fwrite("AVI ", 4, 1, f);
+
+    // 'hdrl' LIST (header list)
+    // size = "hdrl" + avih chunk (8 + 56) + strl LIST header (8) + strl payload (4 + 8 + 56 + 8 + 40)
+    fwrite("LIST", 4, 1, f);
+    write_u32_le(f, 4 + 8 + 56 + 8 + 4 + 8 + 56 + 8 + 40);
+    fwrite("hdrl", 4, 1, f);
+
+    // 'avih' chunk (AVI main header)
+    fwrite("avih", 4, 1, f);
+    write_u32_le(f, 56);
+    write_u32_le(f, 1000000 / fps);       // Microseconds per frame
+    write_u32_le(f, 0);                   // Max bytes per second
+    write_u32_le(f, 0);                   // Padding granularity
+    write_u32_le(f, 0x110);               // Flags (HASINDEX | ISINTERLEAVED)
+    write_u32_le(f, num_images);          // Total frames
+    write_u32_le(f, 0);                   // Initial frames
+    write_u32_le(f, 1);                   // Number of streams
+    write_u32_le(f, width * height * 3);  // Suggested buffer size
+    write_u32_le(f, width);
+    write_u32_le(f, height);
+    write_u32_le(f, 0);  // Reserved
+    write_u32_le(f, 0);  // Reserved
+    write_u32_le(f, 0);  // Reserved
+    write_u32_le(f, 0);  // Reserved
+
+    // 'strl' LIST (stream list)
+    fwrite("LIST", 4, 1, f);
+    write_u32_le(f, 4 + 8 + 56 + 8 + 40);
+    fwrite("strl", 4, 1, f);
+
+    // 'strh' chunk (stream header)
+    fwrite("strh", 4, 1, f);
+    write_u32_le(f, 56);
+    fwrite("vids", 4, 1, f);              // Stream type: video
+    fwrite("MJPG", 4, 1, f);              // Codec: Motion JPEG
+    write_u32_le(f, 0);                   // Flags
+    write_u16_le(f, 0);                   // Priority
+    write_u16_le(f, 0);                   // Language
+    write_u32_le(f, 0);                   // Initial frames
+    write_u32_le(f, 1);                   // Scale
+    write_u32_le(f, fps);                 // Rate
+    write_u32_le(f, 0);                   // Start
+    write_u32_le(f, num_images);          // Length
+    write_u32_le(f, width * height * 3);  // Suggested buffer size
+    write_u32_le(f, (uint32_t)-1);        // Quality
+    write_u32_le(f, 0);                   // Sample size
+    write_u16_le(f, 0);                   // rcFrame.left
+    write_u16_le(f, 0);                   // rcFrame.top
+    write_u16_le(f, 0);                   // rcFrame.right
+    write_u16_le(f, 0);                   // rcFrame.bottom
+
+    // 'strf' chunk (stream format: BITMAPINFOHEADER)
+    fwrite("strf", 4, 1, f);
+    write_u32_le(f, 40);
+    write_u32_le(f, 40);  // biSize
+    write_u32_le(f, width);
+    write_u32_le(f, height);
+    write_u16_le(f, 1);                   // biPlanes
+    write_u16_le(f, 24);                  // biBitCount
+    fwrite("MJPG", 4, 1, f);              // biCompression (FOURCC)
+    write_u32_le(f, width * height * 3);  // biSizeImage
+    write_u32_le(f, 0);                   // XPelsPerMeter
+    write_u32_le(f, 0);                   // YPelsPerMeter
+    write_u32_le(f, 0);                   // Colors used
+    write_u32_le(f, 0);                   // Colors important
+
+    // 'movi' LIST (video frames)
+    fwrite("LIST", 4, 1, f);
+    long movi_size_pos = ftell(f);
+    write_u32_le(f, 0);  // Placeholder for movi size
+    fwrite("movi", 4, 1, f);
+
+    // In-memory JPEG accumulator filled by the stb write callback.
+    struct {
+        uint8_t* buf;
+        size_t size;
+        bool failed;  // set when growing the buffer failed
+    } jpeg_data;
+
+    bool frames_ok = true;
+    for (int i = 0; i < num_images && frames_ok; i++) {
+        jpeg_data.buf    = nullptr;
+        jpeg_data.size   = 0;
+        jpeg_data.failed = false;
+
+        // Callback function to collect JPEG data into memory
+        auto write_to_buf = [](void* context, void* data, int size) {
+            auto jd = (decltype(jpeg_data)*)context;
+            if (jd->failed || size <= 0) {
+                return;
+            }
+            // Keep the old pointer until realloc succeeds so it can still be freed.
+            uint8_t* grown = (uint8_t*)realloc(jd->buf, jd->size + (size_t)size);
+            if (!grown) {
+                jd->failed = true;
+                return;
+            }
+            jd->buf = grown;
+            memcpy(jd->buf + jd->size, data, (size_t)size);
+            jd->size += (size_t)size;
+        };
+
+        // Encode to JPEG in memory
+        int encoded = stbi_write_jpg_to_func(
+            write_to_buf,
+            &jpeg_data,
+            images[i].width,
+            images[i].height,
+            channels,
+            images[i].data,
+            quality);
+
+        if (!encoded || jpeg_data.failed || jpeg_data.size == 0) {
+            fprintf(stderr, "Error: Failed to encode frame %d as JPEG.\n", i);
+            frames_ok = false;
+        } else {
+            // Write '00dc' chunk (video frame)
+            fwrite("00dc", 4, 1, f);
+            write_u32_le(f, (uint32_t)jpeg_data.size);
+            index[i].offset = (uint32_t)(ftell(f) - 8);  // position of the chunk header just written
+            index[i].size   = (uint32_t)jpeg_data.size;
+            fwrite(jpeg_data.buf, 1, jpeg_data.size, f);
+
+            // Align to even byte size
+            if (jpeg_data.size % 2) {
+                fputc(0, f);
+            }
+        }
+
+        free(jpeg_data.buf);
+    }
+
+    if (!frames_ok) {
+        free(index);
+        fclose(f);
+        return -1;
+    }
+
+    // Finalize 'movi' size
+    long cur_pos   = ftell(f);
+    long movi_size = cur_pos - movi_size_pos - 4;
+    fseek(f, movi_size_pos, SEEK_SET);
+    write_u32_le(f, (uint32_t)movi_size);
+    fseek(f, cur_pos, SEEK_SET);
+
+    // Write 'idx1' index
+    fwrite("idx1", 4, 1, f);
+    write_u32_le(f, num_images * 16);
+    for (int i = 0; i < num_images; i++) {
+        fwrite("00dc", 4, 1, f);
+        write_u32_le(f, 0x10);
+        write_u32_le(f, index[i].offset);
+        write_u32_le(f, index[i].size);
+    }
+
+    // Finalize RIFF size
+    cur_pos        = ftell(f);
+    long file_size = cur_pos - riff_size_pos - 4;
+    fseek(f, riff_size_pos, SEEK_SET);
+    write_u32_le(f, (uint32_t)file_size);
+    fseek(f, cur_pos, SEEK_SET);
+
+    free(index);
+
+    // Individual writes above are not checked; the stream error flag and the
+    // result of the final flush in fclose() tell whether any of them failed.
+    int result = ferror(f) ? -1 : 0;
+    if (fclose(f) != 0) {
+        result = -1;
+    }
+    if (result != 0) {
+        fprintf(stderr, "Error: Failed to write AVI file.\n");
+    }
+    return result;
+}
+
+#endif  // __AVI_WRITER_H__
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
--- a/face_detect.py
+++ b/face_detect.py
@ -0,0 +1,88 @@
+import os
+import sys
+
+import numpy as np
+import torch
+from diffusers.utils import load_image
+# pip install insightface==0.7.3
+from insightface.app import FaceAnalysis
+from insightface.data import get_image as ins_get_image
+from safetensors.torch import save_file
+
+### 
+# https://github.com/cubiq/ComfyUI_IPAdapter_plus/issues/165#issue-2055829543
+###
+class FaceAnalysis2(FaceAnalysis):
+    # NOTE: allows setting det_size for each detection call.
+    # the model allows it but the wrapping code from insightface
+    # doesn't show it, and people end up loading duplicate models
+    # for different sizes where there is absolutely no need to
+    def get(self, img, max_num=0, det_size=(640, 640)):
+        if det_size is not None:
+            self.det_model.input_size = det_size
+
+        return super().get(img, max_num)
+
+def analyze_faces(face_analysis: FaceAnalysis, img_data: np.ndarray, det_size=(640, 640)):
+    # NOTE: try detect faces, if no faces detected, lower det_size until it does
+    detection_sizes = [None] + [(size, size) for size in range(640, 256, -64)] + [(256, 256)]
+
+    for size in detection_sizes:
+        faces = face_analysis.get(img_data, det_size=size)
+        if len(faces) > 0:
+            return faces
+
+    return []
+
+if __name__ == "__main__":
+    #face_detector = FaceAnalysis2(providers=['CUDAExecutionProvider'], allowed_modules=['detection', 'recognition'])
+    face_detector = FaceAnalysis2(providers=['CPUExecutionProvider'], allowed_modules=['detection', 'recognition'])
+    face_detector.prepare(ctx_id=0, det_size=(640, 640))
+    #input_folder_name = './scarletthead_woman'
+    input_folder_name = sys.argv[1]
+    image_basename_list = os.listdir(input_folder_name)
+    image_path_list = sorted([os.path.join(input_folder_name, basename) for basename in image_basename_list])
+
+    input_id_images = []
+    for image_path in image_path_list:
+        input_id_images.append(load_image(image_path))
+    
+    id_embed_list = []
+    
+    for img in input_id_images:
+        img = np.array(img)
+        img = img[:, :, ::-1]
+        faces = analyze_faces(face_detector, img)
+        if len(faces) > 0:
+            id_embed_list.append(torch.from_numpy((faces[0]['embedding'])))
+    
+    if len(id_embed_list) == 0:
+        raise ValueError(f"No face detected in input image pool")
+    
+    id_embeds = torch.stack(id_embed_list)    
+    
+    # for r in id_embeds:
+    #     print(r)
+    # #torch.save(id_embeds, input_folder_name+'/id_embeds.pt');
+    # weights = dict()
+    # weights["id_embeds"] = id_embeds
+    # save_file(weights, input_folder_name+'/id_embeds.safetensors')
+
+    binary_data = id_embeds.numpy().tobytes()
+    two = 4
+    zero = 0
+    one = 1
+    tensor_name = "id_embeds"
+# Write binary data to a file
+    with open(input_folder_name+'/id_embeds.bin', "wb") as f:
+        f.write(two.to_bytes(4, byteorder='little'))
+        f.write((len(tensor_name)).to_bytes(4, byteorder='little'))
+        f.write(zero.to_bytes(4, byteorder='little'))
+        f.write((id_embeds.shape[1]).to_bytes(4, byteorder='little'))
+        f.write((id_embeds.shape[0]).to_bytes(4, byteorder='little'))
+        f.write(one.to_bytes(4, byteorder='little'))
+        f.write(one.to_bytes(4, byteorder='little'))
+        f.write(tensor_name.encode('ascii'))
+        f.write(binary_data)
+
+    
--- a/flux.hpp
+++ b/flux.hpp
--- a/format-code.sh
+++ b/format-code.sh
@ -1,2 +1,8 @@
-clang-format -style=file -i *.cpp *.h *.hpp
-clang-format -style=file -i examples/cli/*.cpp
+# Run clang-format in place on the C/C++ sources and headers in the repository
+# root and in examples/cli.
+for f in *.cpp *.h *.hpp examples/cli/*.cpp examples/cli/*.h; do
+  # A glob that matches nothing stays as the literal pattern; skip it rather
+  # than handing a non-existent path to clang-format.
+  [ -e "$f" ] || continue
+  # Skip the vocab* files. `case` is used instead of `[[ ]]` so the script also
+  # works when run with a plain POSIX sh (the script has no shebang line).
+  case "$f" in
+    vocab*) continue ;;
+  esac
+  echo "formatting '$f'"
+  # if [ "$f" != "stable-diffusion.h" ]; then
+  #   clang-tidy -fix -p build_linux/ "$f"
+  # fi
+  clang-format -style=file -i "$f"
+done
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit 21d3a308fcb7f31cb9beceaeebad4fb622f3c337
+Subproject commit 2d3876d554551d35c06dccc5852be50d5fd2a275
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
--- a/gguf_reader.hpp
+++ b/gguf_reader.hpp
@ -0,0 +1,231 @@
+#ifndef __GGUF_READER_HPP__
+#define __GGUF_READER_HPP__
+
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "ggml.h"
+#include "util.h"
+
+// Description of one tensor as parsed from a GGUF tensor-info record.
+struct GGUFTensorInfo {
+    std::string name;            // tensor name as stored in the file
+    ggml_type type;              // element type, cast from the on-disk 32-bit type id
+    std::vector<int64_t> shape;  // dimension sizes in file order; dims beyond GGML_MAX_DIMS are folded into the last kept one
+    size_t offset;               // offset field as stored in the file (NOTE(review): presumably relative to data_offset() - confirm)
+};
+
+// Value-type ids of GGUF metadata entries. The numeric values are compared
+// against the 32-bit type id read from the file, so they must not be changed.
+enum class GGUFMetadataType : uint32_t {
+    UINT8   = 0,
+    INT8    = 1,
+    UINT16  = 2,
+    INT16   = 3,
+    UINT32  = 4,
+    INT32   = 5,
+    FLOAT32 = 6,
+    BOOL    = 7,
+    STRING  = 8,   // 64-bit length followed by that many bytes
+    ARRAY   = 9,   // 32-bit element type id, 64-bit element count, then the elements
+    UINT64  = 10,
+    INT64   = 11,
+    FLOAT64 = 12,
+};
+
+// Minimal GGUF parser: reads the file header, skips over the metadata entries
+// (only "general.alignment" is interpreted) and collects the tensor-info
+// records plus the aligned offset at which the tensor data section starts.
+class GGUFReader {
+private:
+    static constexpr size_t DEFAULT_ALIGNMENT = 32;
+    // Sanity caps that keep a corrupted length field from triggering a huge allocation.
+    static constexpr uint64_t MAX_NAME_LEN    = 1u << 16;
+    static constexpr uint32_t MAX_TENSOR_DIMS = 32;
+
+    std::vector<GGUFTensorInfo> tensors_;
+    size_t data_offset_ = 0;
+    size_t alignment_   = DEFAULT_ALIGNMENT;
+
+    // Read one trivially-copyable value; false if the stream ran short or failed.
+    template <typename T>
+    bool safe_read(std::ifstream& fin, T& value) {
+        fin.read(reinterpret_cast<char*>(&value), sizeof(T));
+        return fin.good();
+    }
+
+    // Read `size` raw bytes into `buffer`.
+    bool safe_read(std::ifstream& fin, char* buffer, size_t size) {
+        fin.read(buffer, size);
+        return fin.good();
+    }
+
+    bool safe_seek(std::ifstream& fin, std::streamoff offset, std::ios::seekdir dir) {
+        fin.seekg(offset, dir);
+        return fin.good();
+    }
+
+    // Advance the read position by `count` bytes; rejects counts that do not fit a signed offset.
+    bool skip_bytes(std::ifstream& fin, uint64_t count) {
+        if (count > static_cast<uint64_t>(INT64_MAX)) {
+            return false;
+        }
+        return safe_seek(fin, static_cast<std::streamoff>(count), std::ios::cur);
+    }
+
+    // Size in bytes of a fixed-width metadata value type, or 0 for STRING, ARRAY and unknown ids.
+    static size_t scalar_size(uint32_t type) {
+        switch (static_cast<GGUFMetadataType>(type)) {
+            case GGUFMetadataType::UINT8:
+            case GGUFMetadataType::INT8:
+            case GGUFMetadataType::BOOL:
+                return 1;
+
+            case GGUFMetadataType::UINT16:
+            case GGUFMetadataType::INT16:
+                return 2;
+
+            case GGUFMetadataType::UINT32:
+            case GGUFMetadataType::INT32:
+            case GGUFMetadataType::FLOAT32:
+                return 4;
+
+            case GGUFMetadataType::UINT64:
+            case GGUFMetadataType::INT64:
+            case GGUFMetadataType::FLOAT64:
+                return 8;
+
+            default:
+                return 0;
+        }
+    }
+
+    // Skip one metadata value of the given type. Array elements are bare values
+    // (no key and no per-element type id), so they are skipped with this
+    // function rather than with read_metadata().
+    bool skip_value(std::ifstream& fin, uint32_t type) {
+        const size_t fixed_size = scalar_size(type);
+        if (fixed_size != 0) {
+            return skip_bytes(fin, fixed_size);
+        }
+
+        switch (static_cast<GGUFMetadataType>(type)) {
+            case GGUFMetadataType::STRING: {
+                uint64_t len = 0;
+                if (!safe_read(fin, len))
+                    return false;
+                return skip_bytes(fin, len);
+            }
+
+            case GGUFMetadataType::ARRAY: {
+                uint32_t elem_type = 0;
+                uint64_t len       = 0;
+                if (!safe_read(fin, elem_type))
+                    return false;
+                if (!safe_read(fin, len))
+                    return false;
+
+                const size_t elem_size = scalar_size(elem_type);
+                if (elem_size != 0) {
+                    // Fixed-width elements: skip the whole payload with a single seek.
+                    if (len > static_cast<uint64_t>(INT64_MAX) / elem_size)
+                        return false;
+                    return skip_bytes(fin, len * elem_size);
+                }
+
+                // Strings / nested arrays have variable size and are skipped one by one.
+                for (uint64_t i = 0; i < len; i++) {
+                    if (!skip_value(fin, elem_type))
+                        return false;
+                }
+                return true;
+            }
+
+            default:
+                LOG_ERROR("Unknown metadata type=%u", type);
+                return false;
+        }
+    }
+
+    // Read one metadata entry (key, value type, value). The value is skipped
+    // unless the key is "general.alignment", which updates alignment_.
+    bool read_metadata(std::ifstream& fin) {
+        uint64_t key_len = 0;
+        if (!safe_read(fin, key_len))
+            return false;
+        if (key_len > MAX_NAME_LEN) {
+            LOG_ERROR("metadata key length %llu is too large", (unsigned long long)key_len);
+            return false;
+        }
+
+        std::string key(static_cast<size_t>(key_len), '\0');
+        if (!safe_read(fin, (char*)key.data(), key.size()))
+            return false;
+
+        uint32_t type = 0;
+        if (!safe_read(fin, type))
+            return false;
+
+        if (key == "general.alignment") {
+            if (static_cast<GGUFMetadataType>(type) != GGUFMetadataType::UINT32) {
+                // Do not reinterpret a value of another type as uint32; skip it by its real type.
+                LOG_ERROR("general.alignment has unexpected type=%u, fallback to default %zu", type, alignment_);
+                return skip_value(fin, type);
+            }
+
+            uint32_t align_val = 0;
+            if (!safe_read(fin, align_val))
+                return false;
+
+            // Only non-zero powers of two are accepted.
+            if (align_val != 0 && (align_val & (align_val - 1)) == 0) {
+                alignment_ = align_val;
+                LOG_DEBUG("Found alignment: %zu", alignment_);
+            } else {
+                LOG_ERROR("Invalid alignment value %u, fallback to default %zu", align_val, alignment_);
+            }
+            return true;
+        }
+
+        return skip_value(fin, type);
+    }
+
+    // Read one tensor-info record. Throws std::runtime_error on a short read or
+    // an implausible length field.
+    GGUFTensorInfo read_tensor_info(std::ifstream& fin) {
+        GGUFTensorInfo info;
+
+        uint64_t name_len = 0;
+        if (!safe_read(fin, name_len))
+            throw std::runtime_error("read tensor name length failed");
+        if (name_len > MAX_NAME_LEN)
+            throw std::runtime_error("tensor name length is too large");
+
+        info.name.resize(static_cast<size_t>(name_len));
+        if (!safe_read(fin, (char*)info.name.data(), info.name.size()))
+            throw std::runtime_error("read tensor name failed");
+
+        uint32_t n_dims = 0;
+        if (!safe_read(fin, n_dims))
+            throw std::runtime_error("read tensor dims failed");
+        if (n_dims > MAX_TENSOR_DIMS)
+            throw std::runtime_error("tensor has too many dimensions");
+
+        info.shape.resize(n_dims);
+        for (uint32_t i = 0; i < n_dims; i++) {
+            if (!safe_read(fin, info.shape[i]))
+                throw std::runtime_error("read tensor shape failed");
+        }
+
+        if (n_dims > GGML_MAX_DIMS) {
+            for (uint32_t i = GGML_MAX_DIMS; i < n_dims; i++) {
+                info.shape[GGML_MAX_DIMS - 1] *= info.shape[i];  // stack to last dim;
+            }
+            info.shape.resize(GGML_MAX_DIMS);
+        }
+
+        uint32_t type = 0;
+        if (!safe_read(fin, type))
+            throw std::runtime_error("read tensor type failed");
+        info.type = static_cast<ggml_type>(type);
+
+        if (!safe_read(fin, info.offset))
+            throw std::runtime_error("read tensor offset failed");
+
+        return info;
+    }
+
+    // Parse header, metadata and tensor infos from an already opened stream.
+    bool parse(std::ifstream& fin) {
+        // --- Header ---
+        char magic[4];
+        if (!safe_read(fin, magic, 4) || strncmp(magic, "GGUF", 4) != 0) {
+            LOG_ERROR("not a valid GGUF file");
+            return false;
+        }
+
+        uint32_t version = 0;
+        if (!safe_read(fin, version))
+            return false;
+
+        uint64_t tensor_count = 0, metadata_kv_count = 0;
+        if (!safe_read(fin, tensor_count))
+            return false;
+        if (!safe_read(fin, metadata_kv_count))
+            return false;
+
+        LOG_DEBUG("GGUF v%u, tensor_count=%llu, metadata_kv_count=%llu",
+                  version, (unsigned long long)tensor_count, (unsigned long long)metadata_kv_count);
+
+        try {
+            // --- Read Metadata ---
+            for (uint64_t i = 0; i < metadata_kv_count; i++) {
+                if (!read_metadata(fin)) {
+                    LOG_ERROR("read meta data failed");
+                    return false;
+                }
+            }
+
+            // --- Tensor Infos ---
+            for (uint64_t i = 0; i < tensor_count; i++) {
+                tensors_.push_back(read_tensor_info(fin));
+            }
+        } catch (const std::exception& e) {
+            // Covers the runtime_error thrown by read_tensor_info as well as
+            // allocation failures caused by corrupted input.
+            LOG_ERROR("%s", e.what());
+            return false;
+        }
+
+        const std::streamoff pos = fin.tellg();
+        if (pos < 0) {
+            LOG_ERROR("failed to query tensor data offset");
+            return false;
+        }
+
+        // The data section starts at the next multiple of the alignment.
+        data_offset_ = static_cast<size_t>(pos);
+        data_offset_ = ((data_offset_ + alignment_ - 1) / alignment_) * alignment_;
+        return true;
+    }
+
+public:
+    // Parse `file_path`. Returns true on success; on failure an error is logged,
+    // tensors() is empty and data_offset() is 0. State from a previous load()
+    // call is discarded first.
+    bool load(const std::string& file_path) {
+        tensors_.clear();
+        data_offset_ = 0;
+        alignment_   = DEFAULT_ALIGNMENT;
+
+        std::ifstream fin(file_path, std::ios::binary);
+        if (!fin) {
+            LOG_ERROR("failed to open '%s'", file_path.c_str());
+            return false;
+        }
+
+        if (!parse(fin)) {
+            tensors_.clear();
+            data_offset_ = 0;
+            return false;
+        }
+        return true;
+    }
+
+    // Tensor-info records collected by the last successful load().
+    const std::vector<GGUFTensorInfo>& tensors() const { return tensors_; }
+    // Aligned file offset of the tensor data section computed by the last successful load().
+    size_t data_offset() const { return data_offset_; }
+};
+
+#endif  // __GGUF_READER_HPP__
--- a/gits_noise.inl
+++ b/gits_noise.inl
@ -329,21 +329,21 @@ const std::vector<std::vector<float>> GITS_NOISE_1_50 = {
 };

 const std::vector<const std::vector<std::vector<float>>*> GITS_NOISE = {
-    { &GITS_NOISE_0_80 },
-    { &GITS_NOISE_0_85 },
-    { &GITS_NOISE_0_90 },
-    { &GITS_NOISE_0_95 },
-    { &GITS_NOISE_1_00 },
-    { &GITS_NOISE_1_05 },
-    { &GITS_NOISE_1_10 },
-    { &GITS_NOISE_1_15 },
-    { &GITS_NOISE_1_20 },
-    { &GITS_NOISE_1_25 },
-    { &GITS_NOISE_1_30 },
-    { &GITS_NOISE_1_35 },
-    { &GITS_NOISE_1_40 },
-    { &GITS_NOISE_1_45 },
-    { &GITS_NOISE_1_50 }
+    &GITS_NOISE_0_80,
+    &GITS_NOISE_0_85,
+    &GITS_NOISE_0_90,
+    &GITS_NOISE_0_95,
+    &GITS_NOISE_1_00,
+    &GITS_NOISE_1_05,
+    &GITS_NOISE_1_10,
+    &GITS_NOISE_1_15,
+    &GITS_NOISE_1_20,
+    &GITS_NOISE_1_25,
+    &GITS_NOISE_1_30,
+    &GITS_NOISE_1_35,
+    &GITS_NOISE_1_40,
+    &GITS_NOISE_1_45,
+    &GITS_NOISE_1_50
 };

 #endif // GITS_NOISE_INL
--- a/latent-preview.h
+++ b/latent-preview.h
@ -0,0 +1,234 @@
+#include <cstddef>
+#include <cstdint>
+#include "ggml.h"
+
+// Latent -> RGB preview coefficients: 16 rows (one per latent channel) of
+// {R, G, B} weights, shaped for the `latent_rgb_proj` argument of
+// preview_latent_video(); the matching per-colour bias follows the table.
+// NOTE(review): by its name this targets Wan 2.1 latents - confirm at the call site.
+const float wan_21_latent_rgb_proj[16][3] = {
+    {0.015123f, -0.148418f, 0.479828f},
+    {0.003652f, -0.010680f, -0.037142f},
+    {0.212264f, 0.063033f, 0.016779f},
+    {0.232999f, 0.406476f, 0.220125f},
+    {-0.051864f, -0.082384f, -0.069396f},
+    {0.085005f, -0.161492f, 0.010689f},
+    {-0.245369f, -0.506846f, -0.117010f},
+    {-0.151145f, 0.017721f, 0.007207f},
+    {-0.293239f, -0.207936f, -0.421135f},
+    {-0.187721f, 0.050783f, 0.177649f},
+    {-0.013067f, 0.265964f, 0.166578f},
+    {0.028327f, 0.109329f, 0.108642f},
+    {-0.205343f, 0.043991f, 0.148914f},
+    {0.014307f, -0.048647f, -0.007219f},
+    {0.217150f, 0.053074f, 0.319923f},
+    {0.155357f, 0.083156f, 0.064780f}};
+float wan_21_latent_rgb_bias[3] = {-0.270270f, -0.234976f, -0.456853f};
+
+// Latent -> RGB preview coefficients: 48 rows (one per latent channel) of
+// {R, G, B} weights, shaped for the `latent_rgb_proj` argument of
+// preview_latent_video(); the matching per-colour bias follows the table.
+// NOTE(review): by its name this targets Wan 2.2 latents - confirm at the call site.
+const float wan_22_latent_rgb_proj[48][3] = {
+    {0.017126f, -0.027230f, -0.019257f},
+    {-0.113739f, -0.028715f, -0.022885f},
+    {-0.000106f, 0.021494f, 0.004629f},
+    {-0.013273f, -0.107137f, -0.033638f},
+    {-0.000381f, 0.000279f, 0.025877f},
+    {-0.014216f, -0.003975f, 0.040528f},
+    {0.001638f, -0.000748f, 0.011022f},
+    {0.029238f, -0.006697f, 0.035933f},
+    {0.021641f, -0.015874f, 0.040531f},
+    {-0.101984f, -0.070160f, -0.028855f},
+    {0.033207f, -0.021068f, 0.002663f},
+    {-0.104711f, 0.121673f, 0.102981f},
+    {0.082647f, -0.004991f, 0.057237f},
+    {-0.027375f, 0.031581f, 0.006868f},
+    {-0.045434f, 0.029444f, 0.019287f},
+    {-0.046572f, -0.012537f, 0.006675f},
+    {0.074709f, 0.033690f, 0.025289f},
+    {-0.008251f, -0.002745f, -0.006999f},
+    {0.012685f, -0.061856f, -0.048658f},
+    {0.042304f, -0.007039f, 0.000295f},
+    {-0.007644f, -0.060843f, -0.033142f},
+    {0.159909f, 0.045628f, 0.367541f},
+    {0.095171f, 0.086438f, 0.010271f},
+    {0.006812f, 0.019643f, 0.029637f},
+    {0.003467f, -0.010705f, 0.014252f},
+    {-0.099681f, -0.066272f, -0.006243f},
+    {0.047357f, 0.037040f, 0.000185f},
+    {-0.041797f, -0.089225f, -0.032257f},
+    {0.008928f, 0.017028f, 0.018684f},
+    {-0.042255f, 0.016045f, 0.006849f},
+    {0.011268f, 0.036462f, 0.037387f},
+    {0.011553f, -0.016375f, -0.048589f},
+    {0.046266f, -0.027189f, 0.056979f},
+    {0.009640f, -0.017576f, 0.030324f},
+    {-0.045794f, -0.036083f, -0.010616f},
+    {0.022418f, 0.039783f, -0.032939f},
+    {-0.052714f, -0.015525f, 0.007438f},
+    {0.193004f, 0.223541f, 0.264175f},
+    {-0.059406f, -0.008188f, 0.022867f},
+    {-0.156742f, -0.263791f, -0.007385f},
+    {-0.015717f, 0.016570f, 0.033969f},
+    {0.037969f, 0.109835f, 0.200449f},
+    {-0.000782f, -0.009566f, -0.008058f},
+    {0.010709f, 0.052960f, -0.044195f},
+    {0.017271f, 0.045839f, 0.034569f},
+    {0.009424f, 0.013088f, -0.001714f},
+    {-0.024805f, -0.059378f, -0.033756f},
+    {-0.078293f, 0.029070f, 0.026129f}};
+float wan_22_latent_rgb_bias[3] = {0.013160f, -0.096492f, -0.071323f};
+
+// Latent -> RGB preview coefficients: 16 rows (one per latent channel) of
+// {R, G, B} weights, shaped for the `latent_rgb_proj` argument of
+// preview_latent_video(); the matching per-colour bias follows the table.
+// NOTE(review): by its name this targets Flux latents - confirm at the call site.
+const float flux_latent_rgb_proj[16][3] = {
+    {-0.041168f, 0.019917f, 0.097253f},
+    {0.028096f, 0.026730f, 0.129576f},
+    {0.065618f, -0.067950f, -0.014651f},
+    {-0.012998f, -0.014762f, 0.081251f},
+    {0.078567f, 0.059296f, -0.024687f},
+    {-0.015987f, -0.003697f, 0.005012f},
+    {0.033605f, 0.138999f, 0.068517f},
+    {-0.024450f, -0.063567f, -0.030101f},
+    {-0.040194f, -0.016710f, 0.127185f},
+    {0.112681f, 0.088764f, -0.041940f},
+    {-0.023498f, 0.093664f, 0.025543f},
+    {0.082899f, 0.048320f, 0.007491f},
+    {0.075712f, 0.074139f, 0.081965f},
+    {-0.143501f, 0.018263f, -0.136138f},
+    {-0.025767f, -0.082035f, -0.040023f},
+    {-0.111849f, -0.055589f, -0.032361f}};
+float flux_latent_rgb_bias[3] = {0.024600f, -0.006937f, -0.008089f};
+
+// Latent -> RGB preview coefficients: 32 rows (one per latent channel) of
+// {R, G, B} weights, shaped for the `latent_rgb_proj` argument of
+// preview_latent_video(); the matching per-colour bias follows the table.
+// NOTE(review): by its name this targets Flux 2 latents - confirm at the call site.
+const float flux2_latent_rgb_proj[32][3] = {
+    {0.000736f, -0.008385f, -0.019710f},
+    {-0.001352f, -0.016392f, 0.020693f},
+    {-0.006376f, 0.002428f, 0.036736f},
+    {0.039384f, 0.074167f, 0.119789f},
+    {0.007464f, -0.005705f, -0.004734f},
+    {-0.004086f, 0.005287f, -0.000409f},
+    {-0.032835f, 0.050802f, -0.028120f},
+    {-0.003158f, -0.000835f, 0.000406f},
+    {-0.112840f, -0.084337f, -0.023083f},
+    {0.001462f, -0.006656f, 0.000549f},
+    {-0.009980f, -0.007480f, 0.009702f},
+    {0.032540f, 0.000214f, -0.061388f},
+    {0.011023f, 0.000694f, 0.007143f},
+    {-0.001468f, -0.006723f, -0.001678f},
+    {-0.005921f, -0.010320f, -0.003907f},
+    {-0.028434f, 0.027584f, 0.018457f},
+    {0.014349f, 0.011523f, 0.000441f},
+    {0.009874f, 0.003081f, 0.001507f},
+    {0.002218f, 0.005712f, 0.001563f},
+    {0.053010f, -0.019844f, 0.008683f},
+    {-0.002507f, 0.005384f, 0.000938f},
+    {-0.002177f, -0.011366f, 0.003559f},
+    {-0.000261f, 0.015121f, -0.003240f},
+    {-0.003944f, -0.002083f, 0.005043f},
+    {-0.009138f, 0.011336f, 0.003781f},
+    {0.011429f, 0.003985f, -0.003855f},
+    {0.010518f, -0.005586f, 0.010131f},
+    {0.007883f, 0.002912f, -0.001473f},
+    {-0.003318f, -0.003160f, 0.003684f},
+    {-0.034560f, -0.008740f, 0.012996f},
+    {0.000166f, 0.001079f, -0.012153f},
+    {0.017772f, 0.000937f, -0.011953f}};
+float flux2_latent_rgb_bias[3] = {-0.028738f, -0.098463f, -0.107619f};
+
+// Latent -> RGB preview coefficients: 16 rows (one per latent channel) of
+// {R, G, B} weights, shaped for the `latent_rgb_proj` argument of
+// preview_latent_video(); the matching (all-zero) bias follows the table.
+// This one was taken straight from
+// https://github.com/Stability-AI/sd3.5/blob/8565799a3b41eb0c7ba976d18375f0f753f56402/sd3_impls.py#L288-L303
+// (MIT License)
+const float sd3_latent_rgb_proj[16][3] = {
+    {-0.0645f, 0.0177f, 0.1052f},
+    {0.0028f, 0.0312f, 0.0650f},
+    {0.1848f, 0.0762f, 0.0360f},
+    {0.0944f, 0.0360f, 0.0889f},
+    {0.0897f, 0.0506f, -0.0364f},
+    {-0.0020f, 0.1203f, 0.0284f},
+    {0.0855f, 0.0118f, 0.0283f},
+    {-0.0539f, 0.0658f, 0.1047f},
+    {-0.0057f, 0.0116f, 0.0700f},
+    {-0.0412f, 0.0281f, -0.0039f},
+    {0.1106f, 0.1171f, 0.1220f},
+    {-0.0248f, 0.0682f, -0.0481f},
+    {0.0815f, 0.0846f, 0.1207f},
+    {-0.0120f, -0.0055f, -0.0867f},
+    {-0.0749f, -0.0634f, -0.0456f},
+    {-0.1418f, -0.1457f, -0.1259f},
+};
+float sd3_latent_rgb_bias[3] = {0, 0, 0};
+
+// Latent -> RGB preview coefficients: 4 rows (one per latent channel) of
+// {R, G, B} weights, shaped for the `latent_rgb_proj` argument of
+// preview_latent_video(); the matching per-colour bias follows the table.
+// NOTE(review): by its name this targets SDXL latents - confirm at the call site.
+const float sdxl_latent_rgb_proj[4][3] = {
+    {0.258303f, 0.277640f, 0.329699f},
+    {-0.299701f, 0.105446f, 0.014194f},
+    {0.050522f, 0.186163f, -0.143257f},
+    {-0.211938f, -0.149892f, -0.080036f}};
+float sdxl_latent_rgb_bias[3] = {0.144381f, -0.033313f, 0.007061f};
+
+// Latent -> RGB preview coefficients: 4 rows (one per latent channel) of
+// {R, G, B} weights, shaped for the `latent_rgb_proj` argument of
+// preview_latent_video(); the matching per-colour bias follows the table.
+// NOTE(review): by its name this targets SD 1.x/2.x latents - confirm at the call site.
+const float sd_latent_rgb_proj[4][3] = {
+    {0.337366f, 0.216344f, 0.257386f},
+    {0.165636f, 0.386828f, 0.046994f},
+    {-0.267803f, 0.237036f, 0.223517f},
+    {-0.178022f, -0.200862f, -0.678514f}};
+float sd_latent_rgb_bias[3] = {-0.017478f, -0.055834f, -0.105825f};
+
+// Render a latent tensor into a packed 8-bit RGB preview.
+//
+// buffer          - output, 3 bytes per pixel, frame after frame, rows of
+//                   (latent width * patch_size) pixels
+// latents         - tensor read as float through its byte strides (nb); ne[0] is the
+//                   width, ne[1] the height, the last dimension holds the channels and,
+//                   for 4-D tensors, ne[2] is the number of frames
+// latent_rgb_proj - per-channel {R, G, B} weights, or nullptr to copy the first three
+//                   channels as R, G, B
+// latent_rgb_bias - per-colour offset added after the projection, or nullptr for none
+// patch_size      - every latent position expands to patch_size x patch_size pixels,
+//                   the channel dimension being split accordingly
+void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int patch_size) {
+    const int n_dims            = ggml_n_dims(latents);
+    const size_t channel_stride = latents->nb[n_dims - 1];
+    const char* latent_bytes    = (const char*)latents->data;
+
+    const uint32_t latent_w = latents->ne[0];
+    const uint32_t latent_h = latents->ne[1];
+    const uint32_t channels = latents->ne[n_dims - 1];
+    const uint32_t n_frames = (n_dims == 4) ? (uint32_t)latents->ne[2] : 1;
+
+    const uint32_t out_w = latent_w * patch_size;
+    const uint32_t out_h = latent_h * patch_size;
+
+    // number of projection rows once the patch pixels are split off the channel dimension
+    const uint32_t proj_rows = channels / (patch_size * patch_size);
+
+    // map a value from [-1, 1] to [0, 1], clamp it and quantize to a byte
+    auto to_byte = [](float v) -> uint8_t {
+        v = v * .5f + .5f;
+        if (!(v >= 0)) {
+            v = 0;
+        } else if (!(v <= 1)) {
+            v = 1;
+        }
+        return (uint8_t)(v * 255);
+    };
+
+    for (uint32_t frame = 0; frame < n_frames; frame++) {
+        for (uint32_t px = 0; px < out_w; px++) {
+            for (uint32_t py = 0; py < out_h; py++) {
+                const int lx = (int)px / patch_size;
+                const int ly = (int)py / patch_size;
+
+                // position of this pixel inside its patch, as an offset into the channel dimension
+                const int in_patch = (patch_size > 1)
+                                         ? (((int)py % patch_size) * patch_size + ((int)px % patch_size))
+                                         : 0;
+
+                const size_t base = (lx * latents->nb[0] + ly * latents->nb[1] + frame * latents->nb[2]);
+
+                float red = 0, green = 0, blue = 0;
+                if (latent_rgb_proj == nullptr) {
+                    // interpret first 3 channels as RGB
+                    red   = *(const float*)(latent_bytes + base + 0 * channel_stride);
+                    green = *(const float*)(latent_bytes + base + 1 * channel_stride);
+                    blue  = *(const float*)(latent_bytes + base + 2 * channel_stride);
+                } else {
+                    for (uint32_t row = 0; row < proj_rows; row++) {
+                        const int channel   = (int)row * patch_size * patch_size + in_patch;
+                        const float latent  = *(const float*)(latent_bytes + base + channel * channel_stride);
+                        const float* weight = latent_rgb_proj[row];
+                        red += latent * weight[0];
+                        green += latent * weight[1];
+                        blue += latent * weight[2];
+                    }
+                }
+
+                if (latent_rgb_bias != nullptr) {
+                    red += latent_rgb_bias[0];
+                    green += latent_rgb_bias[1];
+                    blue += latent_rgb_bias[2];
+                }
+
+                // pixels are stored frame by frame, row by row
+                const size_t pixel = frame * out_w * out_h + py * out_w + px;
+                uint8_t* out       = buffer + pixel * 3;
+                out[0]             = to_byte(red);
+                out[1]             = to_byte(green);
+                out[2]             = to_byte(blue);
+            }
+        }
+    }
+}
--- a/llm.hpp
+++ b/llm.hpp
--- a/lora.hpp
+++ b/lora.hpp
@ -1,35 +1,42 @@
 #ifndef __LORA_HPP__
 #define __LORA_HPP__

+#include <mutex>
 #include "ggml_extend.hpp"

-#define LORA_GRAPH_SIZE 10240
+#define LORA_GRAPH_BASE_SIZE 10240

 struct LoraModel : public GGMLRunner {
+    std::string lora_id;
    float multiplier = 1.0f;
-    std::map<std::string, struct ggml_tensor*> lora_tensors;
+    std::unordered_map<std::string, struct ggml_tensor*> lora_tensors;
+    std::map<ggml_tensor*, ggml_tensor*> original_tensor_to_final_tensor;
+    std::set<std::string> applied_lora_tensors;
    std::string file_path;
    ModelLoader model_loader;
    bool load_failed         = false;
    bool applied             = false;
-    std::vector<int> zero_index_vec = {0};
-    ggml_tensor* zero_index         = NULL;
+    bool tensor_preprocessed = false;

-    LoraModel(ggml_backend_t backend,
-              ggml_type wtype,
+    typedef std::function<bool(const std::string&)> filter_t;
+
+    LoraModel(const std::string& lora_id,
+              ggml_backend_t backend,
              const std::string& file_path = "",
-              const std::string& prefix    = "")
-        : file_path(file_path), GGMLRunner(backend, wtype) {
-        if (!model_loader.init_from_file(file_path, prefix)) {
+              std::string prefix           = "",
+              SDVersion version            = VERSION_COUNT)
+        : lora_id(lora_id), file_path(file_path), GGMLRunner(backend, false) {
+        prefix = "lora." + prefix;
+        if (!model_loader.init_from_file_and_convert_name(file_path, prefix, version)) {
            load_failed = true;
        }
    }

-    std::string get_desc() {
+    std::string get_desc() override {
        return "lora";
    }

-    bool load_from_file(bool filter_tensor = false) {
+    bool load_from_file(int n_threads, filter_t filter = nullptr) {
        LOG_INFO("loading LoRA from '%s'", file_path.c_str());

        if (load_failed) {
@ -37,143 +44,671 @@ struct LoraModel : public GGMLRunner {
            return false;
        }

+        std::unordered_map<std::string, TensorStorage> tensors_to_create;
+        std::mutex lora_mutex;
        bool dry_run          = true;
        auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
+            if (dry_run) {
                const std::string& name = tensor_storage.name;

-            if (filter_tensor && !contains(name, "lora")) {
-                // LOG_INFO("skipping LoRA tesnor '%s'", name.c_str());
+                if (filter && !filter(name)) {
                    return true;
                }

-            if (dry_run) {
-                struct ggml_tensor* real = ggml_new_tensor(params_ctx,
-                                                           tensor_storage.type,
-                                                           tensor_storage.n_dims,
-                                                           tensor_storage.ne);
-                lora_tensors[name]       = real;
-            } else {
-                auto real   = lora_tensors[name];
-                *dst_tensor = real;
+                {
+                    std::lock_guard<std::mutex> lock(lora_mutex);
+                    tensors_to_create[name] = tensor_storage;
+                }
+            } else {
+                const std::string& name = tensor_storage.name;
+                auto iter               = lora_tensors.find(name);
+                if (iter != lora_tensors.end()) {
+                    *dst_tensor = iter->second;
+                }
            }
-
            return true;
        };

-        model_loader.load_tensors(on_new_tensor_cb, backend);
+        model_loader.load_tensors(on_new_tensor_cb, n_threads);
+
+        if (tensors_to_create.empty()) {
+            return true;
+        }
+
+        for (const auto& pair : tensors_to_create) {
+            const auto& name         = pair.first;
+            const auto& ts           = pair.second;
+            struct ggml_tensor* real = ggml_new_tensor(params_ctx,
+                                                       ts.type,
+                                                       ts.n_dims,
+                                                       ts.ne);
+            lora_tensors[name]       = real;
+        }
+
        alloc_params_buffer();

        dry_run = false;
-        model_loader.load_tensors(on_new_tensor_cb, backend);
+        model_loader.load_tensors(on_new_tensor_cb, n_threads);

        LOG_DEBUG("finished loaded lora");
        return true;
    }

-    ggml_tensor* to_f32(ggml_context* ctx, ggml_tensor* a) {
-        auto out = ggml_reshape_1d(ctx, a, ggml_nelements(a));
-        out      = ggml_get_rows(ctx, out, zero_index);
-        out      = ggml_reshape(ctx, out, a);
-        return out;
+    void preprocess_lora_tensors(const std::map<std::string, ggml_tensor*>& model_tensors) {
+        if (tensor_preprocessed) {
+            return;
        }
+        tensor_preprocessed = true;
+        // I really hate these hardcoded processes.
+        if (model_tensors.find("cond_stage_model.1.transformer.text_model.encoder.layers.0.self_attn.in_proj.weight") != model_tensors.end()) {
+            std::unordered_map<std::string, ggml_tensor*> new_lora_tensors;
+            for (auto& [old_name, tensor] : lora_tensors) {
+                std::string new_name = old_name;

-    struct ggml_cgraph* build_lora_graph(std::map<std::string, struct ggml_tensor*> model_tensors) {
-        struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, LORA_GRAPH_SIZE, false);
-
-        zero_index = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, 1);
-        set_backend_tensor_data(zero_index, zero_index_vec.data());
-        ggml_build_forward_expand(gf, zero_index);
-
-        std::set<std::string> applied_lora_tensors;
-        for (auto it : model_tensors) {
-            std::string k_tensor       = it.first;
-            struct ggml_tensor* weight = model_tensors[it.first];
-
-            size_t k_pos = k_tensor.find(".weight");
-            if (k_pos == std::string::npos) {
-                continue;
+                if (contains(new_name, "cond_stage_model.1.transformer.text_model.encoder.layers")) {
+                    std::vector<std::pair<std::string, std::string>> qkv_name_map = {
+                        {"self_attn.q_proj.weight", "self_attn.in_proj.weight"},
+                        {"self_attn.q_proj.bias", "self_attn.in_proj.bias"},
+                        {"self_attn.k_proj.weight", "self_attn.in_proj.weight.1"},
+                        {"self_attn.k_proj.bias", "self_attn.in_proj.bias.1"},
+                        {"self_attn.v_proj.weight", "self_attn.in_proj.weight.2"},
+                        {"self_attn.v_proj.bias", "self_attn.in_proj.bias.2"},
+                    };
+                    for (auto kv : qkv_name_map) {
+                        size_t pos = new_name.find(kv.first);
+                        if (pos != std::string::npos) {
+                            new_name.replace(pos, kv.first.size(), kv.second);
                        }
-            k_tensor = k_tensor.substr(0, k_pos);
-            replace_all_chars(k_tensor, '.', '_');
-            // LOG_DEBUG("k_tensor %s", k_tensor.c_str());
-            std::string lora_up_name = "lora." + k_tensor + ".lora_up.weight";
-            if (lora_tensors.find(lora_up_name) == lora_tensors.end()) {
-                if (k_tensor == "model_diffusion_model_output_blocks_2_2_conv") {
-                    // fix for some sdxl lora, like lcm-lora-xl
-                    k_tensor     = "model_diffusion_model_output_blocks_2_1_conv";
-                    lora_up_name = "lora." + k_tensor + ".lora_up.weight";
                    }
                }

-            std::string lora_down_name = "lora." + k_tensor + ".lora_down.weight";
-            std::string alpha_name     = "lora." + k_tensor + ".alpha";
-            std::string scale_name     = "lora." + k_tensor + ".scale";
-
-            ggml_tensor* lora_up   = NULL;
-            ggml_tensor* lora_down = NULL;
-
-            if (lora_tensors.find(lora_up_name) != lora_tensors.end()) {
-                lora_up = lora_tensors[lora_up_name];
+                new_lora_tensors[new_name] = tensor;
            }

-            if (lora_tensors.find(lora_down_name) != lora_tensors.end()) {
-                lora_down = lora_tensors[lora_down_name];
+            lora_tensors = std::move(new_lora_tensors);
+        }
    }

-            if (lora_up == NULL || lora_down == NULL) {
-                continue;
+    ggml_tensor* get_lora_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) {
+        ggml_tensor* updown = nullptr;
+        int index           = 0;
+        while (true) {
+            std::string key;
+            if (index == 0) {
+                key = model_tensor_name;
+            } else {
+                key = model_tensor_name + "." + std::to_string(index);
+            }
+
+            std::string lora_down_name = "lora." + key + ".lora_down";
+            std::string lora_up_name   = "lora." + key + ".lora_up";
+            std::string lora_mid_name  = "lora." + key + ".lora_mid";
+            std::string scale_name     = "lora." + key + ".scale";
+            std::string alpha_name     = "lora." + key + ".alpha";
+
+            ggml_tensor* lora_up   = nullptr;
+            ggml_tensor* lora_mid  = nullptr;
+            ggml_tensor* lora_down = nullptr;
+
+            auto iter = lora_tensors.find(lora_up_name);
+            if (iter != lora_tensors.end()) {
+                lora_up = ggml_ext_cast_f32(ctx, iter->second);
+            }
+
+            iter = lora_tensors.find(lora_mid_name);
+            if (iter != lora_tensors.end()) {
+                lora_mid = ggml_ext_cast_f32(ctx, iter->second);
+            }
+
+            iter = lora_tensors.find(lora_down_name);
+            if (iter != lora_tensors.end()) {
+                lora_down = ggml_ext_cast_f32(ctx, iter->second);
+            }
+
+            if (lora_up == nullptr || lora_down == nullptr) {
+                break;
            }

            applied_lora_tensors.insert(lora_up_name);
            applied_lora_tensors.insert(lora_down_name);
-            applied_lora_tensors.insert(alpha_name);
-            applied_lora_tensors.insert(scale_name);

-            // calc_cale
-            int64_t dim       = lora_down->ne[ggml_n_dims(lora_down) - 1];
+            if (lora_mid) {
+                applied_lora_tensors.insert(lora_mid_name);
+            }
+
            float scale_value = 1.0f;
-            if (lora_tensors.find(scale_name) != lora_tensors.end()) {
-                scale_value = ggml_backend_tensor_get_f32(lora_tensors[scale_name]);
-            } else if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
-                float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
-                scale_value = alpha / dim;
+
+            int64_t rank = lora_down->ne[ggml_n_dims(lora_down) - 1];
+            iter         = lora_tensors.find(scale_name);
+            if (iter != lora_tensors.end()) {
+                scale_value = ggml_ext_backend_tensor_get_f32(iter->second);
+                applied_lora_tensors.insert(scale_name);
+            } else {
+                iter = lora_tensors.find(alpha_name);
+                if (iter != lora_tensors.end()) {
+                    float alpha = ggml_ext_backend_tensor_get_f32(iter->second);
+                    scale_value = alpha / rank;
+                    // LOG_DEBUG("rank %s %ld %.2f %.2f", alpha_name.c_str(), rank, alpha, scale_value);
+                    applied_lora_tensors.insert(alpha_name);
+                }
            }
            scale_value *= multiplier;

-            // flat lora tensors to multiply it
-            int64_t lora_up_rows   = lora_up->ne[ggml_n_dims(lora_up) - 1];
-            lora_up                = ggml_reshape_2d(compute_ctx, lora_up, ggml_nelements(lora_up) / lora_up_rows, lora_up_rows);
-            int64_t lora_down_rows = lora_down->ne[ggml_n_dims(lora_down) - 1];
-            lora_down              = ggml_reshape_2d(compute_ctx, lora_down, ggml_nelements(lora_down) / lora_down_rows, lora_down_rows);
+            auto curr_updown = ggml_ext_merge_lora(ctx, lora_down, lora_up, lora_mid);
+            curr_updown      = ggml_scale_inplace(ctx, curr_updown, scale_value);

-            // ggml_mul_mat requires tensor b transposed
-            lora_down                  = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, lora_down));
-            struct ggml_tensor* updown = ggml_mul_mat(compute_ctx, lora_up, lora_down);
-            updown                     = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, updown));
-            updown                     = ggml_reshape(compute_ctx, updown, weight);
-            GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight));
-            updown = ggml_scale_inplace(compute_ctx, updown, scale_value);
-            ggml_tensor* final_weight;
-            if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) {
-                // final_weight = ggml_new_tensor(compute_ctx, GGML_TYPE_F32, ggml_n_dims(weight), weight->ne);
-                // final_weight = ggml_cpy(compute_ctx, weight, final_weight);
-                final_weight = to_f32(compute_ctx, weight);
-                final_weight = ggml_add_inplace(compute_ctx, final_weight, updown);
-                final_weight = ggml_cpy(compute_ctx, final_weight, weight);
+            if (updown == nullptr) {
+                updown = curr_updown;
            } else {
-                final_weight = ggml_add_inplace(compute_ctx, weight, updown);
-            }
-            // final_weight = ggml_add_inplace(compute_ctx, weight, updown);  // apply directly
-            ggml_build_forward_expand(gf, final_weight);
+                updown = ggml_concat(ctx, updown, curr_updown, ggml_n_dims(updown) - 1);
            }

+            index++;
+        }
+        return updown;
+    }
+
+    // Collects the full-weight ".diff" tensors stored for `model_tensor_name`.
+    //
+    // Part 0 is looked up as "lora.<name>.diff", later parts as
+    // "lora.<name>.<index>.diff". Each part is cast to f32, scaled by
+    // `multiplier` and concatenated along the last dimension of the result
+    // accumulated so far. Every tensor that is used is recorded in
+    // applied_lora_tensors.
+    //
+    // Returns nullptr when no ".diff" tensor exists for part 0.
+    ggml_tensor* get_raw_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) {
+        ggml_tensor* merged = nullptr;
+        for (int part = 0;; ++part) {
+            const std::string key       = part == 0 ? model_tensor_name : model_tensor_name + "." + std::to_string(part);
+            const std::string diff_name = "lora." + key + ".diff";
+
+            auto found = lora_tensors.find(diff_name);
+            if (found == lora_tensors.end()) {
+                // the first missing part ends the sequence
+                break;
+            }
+
+            ggml_tensor* piece = ggml_ext_cast_f32(ctx, found->second);
+            applied_lora_tensors.insert(diff_name);
+
+            const float scale_value = 1.0f * multiplier;
+            piece                   = ggml_scale_inplace(ctx, piece, scale_value);
+
+            merged = (merged == nullptr) ? piece
+                                         : ggml_concat(ctx, merged, piece, ggml_n_dims(merged) - 1);
+        }
+        return merged;
+    }
+
+    // Builds the LoHa (Hadamard product) weight delta for `model_tensor_name`.
+    //
+    // Part 0 uses the key "<name>", later parts use "<name>.<index>". A part
+    // needs hada_w1_a, hada_w1_b, hada_w2_a and hada_w2_b; hada_t1 / hada_t2
+    // are optional. The two halves are merged with ggml_ext_merge_lora,
+    // multiplied element-wise, scaled by (alpha / rank) * multiplier (or by
+    // `multiplier` alone when no alpha tensor exists) and concatenated along the
+    // last dimension of the result accumulated so far. Every tensor that is
+    // used is recorded in applied_lora_tensors.
+    //
+    // Returns nullptr when part 0 is incomplete.
+    ggml_tensor* get_loha_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) {
+        // Returns the tensor stored under `name` cast to f32, or nullptr when absent.
+        auto find_f32 = [&](const std::string& name) -> ggml_tensor* {
+            auto iter = lora_tensors.find(name);
+            return iter != lora_tensors.end() ? ggml_ext_cast_f32(ctx, iter->second) : nullptr;
+        };
+
+        ggml_tensor* updown = nullptr;
+        int index           = 0;
+        while (true) {
+            std::string key;
+            if (index == 0) {
+                key = model_tensor_name;
+            } else {
+                key = model_tensor_name + "." + std::to_string(index);
+            }
+            std::string hada_1_down_name = "lora." + key + ".hada_w1_b";
+            std::string hada_1_mid_name  = "lora." + key + ".hada_t1";
+            std::string hada_1_up_name   = "lora." + key + ".hada_w1_a";
+            std::string hada_2_down_name = "lora." + key + ".hada_w2_b";
+            std::string hada_2_mid_name  = "lora." + key + ".hada_t2";
+            std::string hada_2_up_name   = "lora." + key + ".hada_w2_a";
+            std::string alpha_name       = "lora." + key + ".alpha";
+
+            ggml_tensor* hada_1_down = find_f32(hada_1_down_name);
+            ggml_tensor* hada_1_up   = find_f32(hada_1_up_name);
+            ggml_tensor* hada_1_mid  = find_f32(hada_1_mid_name);  // tau for tucker decomposition
+            // Only transpose when the up tensor really exists: a file that has
+            // hada_t1 but lacks hada_w1_a must fall through to the completeness
+            // check below instead of handing a null tensor to ggml_transpose.
+            if (hada_1_mid != nullptr && hada_1_up != nullptr) {
+                hada_1_up = ggml_cont(ctx, ggml_transpose(ctx, hada_1_up));
+            }
+
+            ggml_tensor* hada_2_down = find_f32(hada_2_down_name);
+            ggml_tensor* hada_2_up   = find_f32(hada_2_up_name);
+            ggml_tensor* hada_2_mid  = find_f32(hada_2_mid_name);  // tau for tucker decomposition
+            if (hada_2_mid != nullptr && hada_2_up != nullptr) {
+                hada_2_up = ggml_cont(ctx, ggml_transpose(ctx, hada_2_up));
+            }
+
+            if (hada_1_up == nullptr || hada_1_down == nullptr || hada_2_up == nullptr || hada_2_down == nullptr) {
+                break;
+            }
+
+            applied_lora_tensors.insert(hada_1_down_name);
+            applied_lora_tensors.insert(hada_1_up_name);
+            applied_lora_tensors.insert(hada_2_down_name);
+            applied_lora_tensors.insert(hada_2_up_name);
+
+            if (hada_1_mid) {
+                applied_lora_tensors.insert(hada_1_mid_name);
+            }
+
+            if (hada_2_mid) {
+                applied_lora_tensors.insert(hada_2_mid_name);
+            }
+
+            float scale_value = 1.0f;
+
+            // calc_scale
+            // TODO: .dora_scale?
+            int64_t rank = hada_1_down->ne[ggml_n_dims(hada_1_down) - 1];
+            auto iter    = lora_tensors.find(alpha_name);
+            if (iter != lora_tensors.end()) {
+                float alpha = ggml_ext_backend_tensor_get_f32(iter->second);
+                scale_value = alpha / rank;
+                // alpha is marked as applied only when it was actually read
+                applied_lora_tensors.insert(alpha_name);
+            }
+            scale_value *= multiplier;
+
+            struct ggml_tensor* updown_1 = ggml_ext_merge_lora(ctx, hada_1_down, hada_1_up, hada_1_mid);
+            struct ggml_tensor* updown_2 = ggml_ext_merge_lora(ctx, hada_2_down, hada_2_up, hada_2_mid);
+            auto curr_updown             = ggml_mul_inplace(ctx, updown_1, updown_2);
+            curr_updown                  = ggml_scale_inplace(ctx, curr_updown, scale_value);
+            if (updown == nullptr) {
+                updown = curr_updown;
+            } else {
+                updown = ggml_concat(ctx, updown, curr_updown, ggml_n_dims(updown) - 1);
+            }
+            index++;
+        }
+        return updown;
+    }
+
+    // Builds the LoKr (Kronecker product) weight delta for `model_tensor_name`.
+    //
+    // Part 0 uses the key "<name>", later parts use "<name>.<index>". Each
+    // factor is taken either from a full tensor (lokr_w1 / lokr_w2) or, when
+    // that is absent, from its low-rank pair (lokr_wN_a, lokr_wN_b) merged with
+    // ggml_ext_merge_lora. The factors are combined with ggml_ext_kronecker,
+    // scaled, and concatenated along the last dimension of the result
+    // accumulated so far. Every tensor that is used is recorded in
+    // applied_lora_tensors.
+    //
+    // Returns nullptr when part 0 cannot be assembled.
+    ggml_tensor* get_lokr_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) {
+        // Returns the tensor stored under `name` cast to f32, or nullptr when absent.
+        auto fetch_f32 = [&](const std::string& name) -> ggml_tensor* {
+            auto found = lora_tensors.find(name);
+            if (found == lora_tensors.end()) {
+                return nullptr;
+            }
+            return ggml_ext_cast_f32(ctx, found->second);
+        };
+
+        ggml_tensor* merged = nullptr;
+        for (int part = 0;; ++part) {
+            const std::string base = "lora." + (part == 0 ? model_tensor_name : model_tensor_name + "." + std::to_string(part));
+
+            const std::string w1_name    = base + ".lokr_w1";
+            const std::string w1_a_name  = base + ".lokr_w1_a";
+            const std::string w1_b_name  = base + ".lokr_w1_b";
+            const std::string w2_name    = base + ".lokr_w2";
+            const std::string w2_a_name  = base + ".lokr_w2_a";
+            const std::string w2_b_name  = base + ".lokr_w2_b";
+            const std::string alpha_name = base + ".alpha";
+
+            ggml_tensor* w1   = fetch_f32(w1_name);
+            ggml_tensor* w2   = fetch_f32(w2_name);
+            ggml_tensor* w1_a = nullptr;
+            ggml_tensor* w1_b = nullptr;
+            ggml_tensor* w2_a = nullptr;
+            ggml_tensor* w2_b = nullptr;
+
+            // rank stays 1 unless one of the factors is given in low-rank form
+            int64_t rank = 1;
+
+            if (w1 == nullptr) {
+                w1_a = fetch_f32(w1_a_name);
+                w1_b = fetch_f32(w1_b_name);
+                if (w1_a == nullptr || w1_b == nullptr) {
+                    break;
+                }
+                rank = w1_b->ne[ggml_n_dims(w1_b) - 1];
+                w1   = ggml_ext_merge_lora(ctx, w1_b, w1_a);
+            }
+
+            if (w2 == nullptr) {
+                w2_a = fetch_f32(w2_a_name);
+                w2_b = fetch_f32(w2_b_name);
+                if (w2_a == nullptr || w2_b == nullptr) {
+                    break;
+                }
+                // when both factors are low-rank, the second one decides the rank
+                rank = w2_b->ne[ggml_n_dims(w2_b) - 1];
+                w2   = ggml_ext_merge_lora(ctx, w2_b, w2_a);
+            }
+
+            if (w1_a != nullptr) {
+                applied_lora_tensors.insert(w1_a_name);
+                applied_lora_tensors.insert(w1_b_name);
+            } else {
+                applied_lora_tensors.insert(w1_name);
+            }
+
+            if (w2_a != nullptr) {
+                applied_lora_tensors.insert(w2_a_name);
+                applied_lora_tensors.insert(w2_b_name);
+            } else {
+                applied_lora_tensors.insert(w2_name);
+            }
+
+            float scale_value = 1.0f;
+            auto alpha_iter   = lora_tensors.find(alpha_name);
+            if (alpha_iter != lora_tensors.end()) {
+                const float alpha = ggml_ext_backend_tensor_get_f32(alpha_iter->second);
+                scale_value       = alpha / rank;
+                applied_lora_tensors.insert(alpha_name);
+            }
+            // a rank of 1 ignores alpha entirely
+            if (rank == 1) {
+                scale_value = 1.0f;
+            }
+            scale_value *= multiplier;
+
+            ggml_tensor* piece = ggml_ext_kronecker(ctx, w1, w2);
+            piece              = ggml_scale_inplace(ctx, piece, scale_value);
+
+            merged = (merged == nullptr) ? piece
+                                         : ggml_concat(ctx, merged, piece, ggml_n_dims(merged) - 1);
+        }
+        return merged;
+    }
+
+    // Returns the weight delta for `model_tensor_name`, reshaped like
+    // `model_tensor`, or nullptr when this file holds nothing for that tensor.
+    //
+    // The formats are tried in order and the first one that yields a result
+    // wins: plain LoRA (skipped when `with_lora` is false), raw ".diff", LoHa,
+    // LoKr. A 2-D delta that has the same ne[0] as the 2-D model tensor but
+    // fewer elements is zero-padded along dimension 1 before the element count
+    // is asserted.
+    ggml_tensor* get_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_tensor* model_tensor, bool with_lora = true) {
+        ggml_tensor* delta = with_lora ? get_lora_weight_diff(model_tensor_name, ctx) : nullptr;
+        if (delta == nullptr) {
+            delta = get_raw_weight_diff(model_tensor_name, ctx);
+        }
+        if (delta == nullptr) {
+            delta = get_loha_weight_diff(model_tensor_name, ctx);
+        }
+        if (delta == nullptr) {
+            delta = get_lokr_weight_diff(model_tensor_name, ctx);
+        }
+        if (delta == nullptr) {
+            return nullptr;
+        }
+
+        const bool too_small = ggml_nelements(delta) < ggml_nelements(model_tensor);
+        if (too_small && ggml_n_dims(delta) == 2 && ggml_n_dims(model_tensor) == 2 && delta->ne[0] == model_tensor->ne[0]) {
+            LOG_WARN("pad for %s", model_tensor_name.c_str());
+            ggml_tensor* zeros = ggml_ext_zeros(ctx, delta->ne[0], model_tensor->ne[1] - delta->ne[1], 1, 1);
+            delta              = ggml_concat(ctx, delta, zeros, 1);
+        }
+
+        GGML_ASSERT(ggml_nelements(delta) == ggml_nelements(model_tensor));
+        return ggml_reshape(ctx, delta, model_tensor);
+    }
+
+    // Computes the LoRA contribution to a layer's output directly from the
+    // input `x`, instead of producing a weight delta.
+    //
+    // Part 0 uses the key "<name>", later parts use "<name>.<index>"; a part
+    // needs lora_up and lora_down, lora_mid is optional. For OP_CONV2D the
+    // tensors are cast to f16 when they are not f16 already, lora_down is
+    // applied with the caller's conv parameters and lora_mid / lora_up as
+    // stride-1, padding-0, dilation-1 convolutions. For every other op type
+    // the tensors are chained with ggml_ext_linear. Each part is scaled by
+    // `scale` (or alpha / rank) times `multiplier`, and the parts are
+    // concatenated along dimension 0. Every tensor that is used is recorded in
+    // applied_lora_tensors.
+    //
+    // Returns nullptr when part 0 has no lora_up / lora_down pair.
+    ggml_tensor* get_out_diff(ggml_context* ctx,
+                              ggml_tensor* x,
+                              WeightAdapter::ForwardParams forward_params,
+                              const std::string& model_tensor_name) {
+        const bool is_conv2d = forward_params.op_type == WeightAdapter::ForwardParams::op_type_t::OP_CONV2D;
+
+        // Looks up `name`; conv2d kernels are cast to f16 when needed.
+        auto fetch = [&](const std::string& name) -> ggml_tensor* {
+            auto found = lora_tensors.find(name);
+            if (found == lora_tensors.end()) {
+                return nullptr;
+            }
+            ggml_tensor* t = found->second;
+            if (is_conv2d && t->type != GGML_TYPE_F16) {
+                t = ggml_cast(ctx, t, GGML_TYPE_F16);
+            }
+            return t;
+        };
+
+        // Bias-free linear layer using the caller's precision / scale settings.
+        auto linear = [&](ggml_tensor* input, ggml_tensor* weight) -> ggml_tensor* {
+            return ggml_ext_linear(ctx,
+                                   input,
+                                   weight,
+                                   nullptr,
+                                   forward_params.linear.force_prec_f32,
+                                   forward_params.linear.scale);
+        };
+
+        // Bias-free convolution with stride 1, padding 0 and dilation 1.
+        auto conv_unit = [&](ggml_tensor* input, ggml_tensor* kernel) -> ggml_tensor* {
+            return ggml_ext_conv_2d(ctx,
+                                    input,
+                                    kernel,
+                                    nullptr,
+                                    1,
+                                    1,
+                                    0,
+                                    0,
+                                    1,
+                                    1,
+                                    forward_params.conv2d.direct,
+                                    forward_params.conv2d.scale);
+        };
+
+        ggml_tensor* total = nullptr;
+        for (int part = 0;; ++part) {
+            const std::string base = "lora." + (part == 0 ? model_tensor_name : model_tensor_name + "." + std::to_string(part));
+
+            const std::string lora_down_name = base + ".lora_down";
+            const std::string lora_up_name   = base + ".lora_up";
+            const std::string lora_mid_name  = base + ".lora_mid";
+            const std::string scale_name     = base + ".scale";
+            const std::string alpha_name     = base + ".alpha";
+
+            ggml_tensor* lora_up   = fetch(lora_up_name);
+            ggml_tensor* lora_mid  = fetch(lora_mid_name);
+            ggml_tensor* lora_down = fetch(lora_down_name);
+
+            if (lora_up == nullptr || lora_down == nullptr) {
+                break;
+            }
+
+            applied_lora_tensors.insert(lora_up_name);
+            applied_lora_tensors.insert(lora_down_name);
+            if (lora_mid != nullptr) {
+                applied_lora_tensors.insert(lora_mid_name);
+            }
+
+            // an explicit scale tensor takes precedence over alpha / rank
+            float scale_value  = 1.0f;
+            const int64_t rank = lora_down->ne[ggml_n_dims(lora_down) - 1];
+            auto scale_iter    = lora_tensors.find(scale_name);
+            if (scale_iter != lora_tensors.end()) {
+                scale_value = ggml_ext_backend_tensor_get_f32(scale_iter->second);
+                applied_lora_tensors.insert(scale_name);
+            } else {
+                auto alpha_iter = lora_tensors.find(alpha_name);
+                if (alpha_iter != lora_tensors.end()) {
+                    const float alpha = ggml_ext_backend_tensor_get_f32(alpha_iter->second);
+                    scale_value       = alpha / rank;
+                    applied_lora_tensors.insert(alpha_name);
+                }
+            }
+            scale_value *= multiplier;
+
+            ggml_tensor* lx = nullptr;
+            if (is_conv2d) {
+                // only the down projection uses the layer's own conv geometry
+                lx = ggml_ext_conv_2d(ctx,
+                                      x,
+                                      lora_down,
+                                      nullptr,
+                                      forward_params.conv2d.s0,
+                                      forward_params.conv2d.s1,
+                                      forward_params.conv2d.p0,
+                                      forward_params.conv2d.p1,
+                                      forward_params.conv2d.d0,
+                                      forward_params.conv2d.d1,
+                                      forward_params.conv2d.direct,
+                                      forward_params.conv2d.scale);
+                if (lora_mid != nullptr) {
+                    lx = conv_unit(lx, lora_mid);
+                }
+                lx = conv_unit(lx, lora_up);
+            } else {
+                lx = linear(x, lora_down);
+                if (lora_mid != nullptr) {
+                    lx = linear(lx, lora_mid);
+                }
+                lx = linear(lx, lora_up);
+            }
+
+            ggml_tensor* piece = ggml_scale_inplace(ctx, lx, scale_value);
+            total              = (total == nullptr) ? piece : ggml_concat(ctx, total, piece, 0);
+        }
+        return total;
+    }
+
+    // Builds the compute graph that adds this LoRA's weight deltas onto the
+    // tensors in `model_tensors`.
+    //
+    // The graph capacity is LORA_GRAPH_BASE_SIZE plus 10 nodes per LoRA tensor.
+    // original_tensor_to_final_tensor and applied_lora_tensors are reset on
+    // every call. Model tensors without a matching delta are skipped.
+    // `version` is not used by this function body.
+    //
+    // Returns the graph; its nodes are allocated in compute_ctx.
+    struct ggml_cgraph* build_lora_graph(const std::map<std::string, ggml_tensor*>& model_tensors, SDVersion version) {
+        size_t lora_graph_size = LORA_GRAPH_BASE_SIZE + lora_tensors.size() * 10;
+        struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, lora_graph_size, false);
+
+        preprocess_lora_tensors(model_tensors);
+
+        original_tensor_to_final_tensor.clear();
+        applied_lora_tensors.clear();
+
+        // iterate by reference: copying each (name, tensor) pair would duplicate
+        // every tensor name string for no benefit
+        for (const auto& it : model_tensors) {
+            const std::string& model_tensor_name = it.first;
+            ggml_tensor* model_tensor            = it.second;
+
+            // lora
+            ggml_tensor* diff = get_weight_diff(model_tensor_name, compute_ctx, model_tensor);
+            if (diff == nullptr) {
+                continue;
+            }
+
+            ggml_tensor* original_tensor = model_tensor;
+
+            // A weight kept in a host buffer while the runtime backend is not the
+            // CPU is duplicated into compute_ctx and patched there; the mapping
+            // recorded below lets apply() copy the result back afterwards.
+            const bool needs_staging = !ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer);
+            if (needs_staging) {
+                model_tensor = ggml_dup_tensor(compute_ctx, model_tensor);
+                set_backend_tensor_data(model_tensor, original_tensor->data);
+            }
+
+            ggml_tensor* final_tensor;
+            if (model_tensor->type != GGML_TYPE_F32 && model_tensor->type != GGML_TYPE_F16) {
+                // other weight types: add in f32, then copy back into the weight
+                final_tensor = ggml_ext_cast_f32(compute_ctx, model_tensor);
+                final_tensor = ggml_add_inplace(compute_ctx, final_tensor, diff);
+                final_tensor = ggml_cpy(compute_ctx, final_tensor, model_tensor);
+            } else {
+                final_tensor = ggml_add_inplace(compute_ctx, model_tensor, diff);
+            }
+            ggml_build_forward_expand(gf, final_tensor);
+            if (needs_staging) {
+                original_tensor_to_final_tensor[original_tensor] = final_tensor;
+            }
+        }
+        return gf;
+    }
+
+    // Applies this LoRA to `model_tensors`.
+    //
+    // Runs the graph produced by build_lora_graph through GGMLRunner::compute,
+    // reports usage statistics via stat(), copies every staged result recorded
+    // in original_tensor_to_final_tensor back into its original tensor, and
+    // finally releases the compute buffer.
+    //
+    // `model_tensors` is taken by const reference so the map is not copied on
+    // every call; existing call sites compile unchanged.
+    void apply(const std::map<std::string, struct ggml_tensor*>& model_tensors, SDVersion version, int n_threads) {
+        auto get_graph = [&]() -> struct ggml_cgraph* {
+            return build_lora_graph(model_tensors, version);
+        };
+        GGMLRunner::compute(get_graph, n_threads, false);
+        stat();
+        for (const auto& item : original_tensor_to_final_tensor) {
+            ggml_tensor* original_tensor = item.first;
+            ggml_tensor* final_tensor    = item.second;
+
+            ggml_backend_tensor_copy(final_tensor, original_tensor);
+        }
+        original_tensor_to_final_tensor.clear();
+        GGMLRunner::free_compute_buffer();
+    }
+
+    void stat(bool at_runntime = false) {
        size_t total_lora_tensors_count   = 0;
        size_t applied_lora_tensors_count = 0;

        for (auto& kv : lora_tensors) {
            total_lora_tensors_count++;
            if (applied_lora_tensors.find(kv.first) == applied_lora_tensors.end()) {
-                LOG_WARN("unused lora tensor %s", kv.first.c_str());
+                if (!at_runntime) {
+                    LOG_WARN("unused lora tensor |%s|", kv.first.c_str());
+                    print_ggml_tensor(kv.second, true);
+                }
            } else {
                applied_lora_tensors_count++;
            }
@ -181,22 +716,87 @@ struct LoraModel : public GGMLRunner {
        /* Don't worry if this message shows up twice in the logs per LoRA,
         * this function is called once to calculate the required buffer size
         * and then again to actually generate a graph to be used */
-        if (applied_lora_tensors_count != total_lora_tensors_count) {
-            LOG_WARN("Only (%lu / %lu) LoRA tensors have been applied",
-                     applied_lora_tensors_count, total_lora_tensors_count);
+        if (!at_runntime && applied_lora_tensors_count != total_lora_tensors_count) {
+            LOG_WARN("Only (%lu / %lu) LoRA tensors have been applied, lora_file_path = %s",
+                     applied_lora_tensors_count, total_lora_tensors_count, file_path.c_str());
        } else {
-            LOG_DEBUG("(%lu / %lu) LoRA tensors applied successfully",
-                      applied_lora_tensors_count, total_lora_tensors_count);
+            LOG_INFO("(%lu / %lu) LoRA tensors have been applied, lora_file_path = %s",
+                     applied_lora_tensors_count, total_lora_tensors_count, file_path.c_str());
        }
-
-        return gf;
    }
-
-    void apply(std::map<std::string, struct ggml_tensor*> model_tensors, int n_threads) {
-        auto get_graph = [&]() -> struct ggml_cgraph* {
-            return build_lora_graph(model_tensors);
 };
-        GGMLRunner::compute(get_graph, n_threads, true);
+
+// WeightAdapter that applies a list of LoRA models, in list order, either to a
+// weight tensor (patch_weight) or to the output of a linear / conv2d op
+// (forward_with_lora).
+struct MultiLoraAdapter : public WeightAdapter {
+protected:
+    std::vector<std::shared_ptr<LoraModel>> lora_models;
+
+public:
+    explicit MultiLoraAdapter(const std::vector<std::shared_ptr<LoraModel>>& lora_models)
+        : lora_models(lora_models) {
+    }
+
+    // Adds to `weight` every non-null diff that the LoRA models return for
+    // `weight_name`. A weight that is neither F32 nor F16 is cast to F32 before
+    // the first addition. `with_lora` is forwarded to get_weight_diff.
+    // Returns the patched tensor, or `weight` itself when no model has a diff.
+    ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name, bool with_lora) {
+        for (const auto& model : lora_models) {
+            ggml_tensor* delta = model->get_weight_diff(weight_name, ctx, weight, with_lora);
+            if (delta != nullptr) {
+                const bool is_f32_or_f16 = weight->type == GGML_TYPE_F32 || weight->type == GGML_TYPE_F16;
+                if (!is_f32_or_f16) {
+                    weight = ggml_ext_cast_f32(ctx, weight);
+                }
+                weight = ggml_add(ctx, weight, delta);
+            }
+        }
+        return weight;
+    }
+
+    // WeightAdapter entry point: same as the four-argument overload with
+    // with_lora = true.
+    ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name) override {
+        return patch_weight(ctx, weight, weight_name, true);
+    }
+
+    // Runs the linear or conv2d op selected by forward_params on x, using the
+    // weight/bias patched with with_lora = false, then adds every non-null
+    // output diff the LoRA models return for "<prefix>weight".
+    ggml_tensor* forward_with_lora(ggml_context* ctx,
+                                   ggml_tensor* x,
+                                   ggml_tensor* w,
+                                   ggml_tensor* b,
+                                   const std::string& prefix,
+                                   WeightAdapter::ForwardParams forward_params) override {
+        const std::string weight_key = prefix + "weight";
+
+        w = patch_weight(ctx, w, weight_key, false);
+        if (b != nullptr) {
+            b = patch_weight(ctx, b, prefix + "bias", false);
+        }
+
+        // Any op type other than OP_LINEAR is handled as conv2d.
+        const bool is_linear = forward_params.op_type == ForwardParams::op_type_t::OP_LINEAR;
+        ggml_tensor* out     = is_linear
+                                   ? ggml_ext_linear(ctx, x, w, b, forward_params.linear.force_prec_f32, forward_params.linear.scale)
+                                   : ggml_ext_conv_2d(ctx,
+                                                      x,
+                                                      w,
+                                                      b,
+                                                      forward_params.conv2d.s0,
+                                                      forward_params.conv2d.s1,
+                                                      forward_params.conv2d.p0,
+                                                      forward_params.conv2d.p1,
+                                                      forward_params.conv2d.d0,
+                                                      forward_params.conv2d.d1,
+                                                      forward_params.conv2d.direct,
+                                                      forward_params.conv2d.scale);
+
+        for (const auto& model : lora_models) {
+            ggml_tensor* delta = model->get_out_diff(ctx, x, forward_params, weight_key);
+            if (delta != nullptr) {
+                out = ggml_add_inplace(ctx, out, delta);
+            }
+        }
+        return out;
+    }
+
+    // Extra graph size requested for the LoRA ops: LORA_GRAPH_BASE_SIZE plus 10
+    // per LoRA tensor, summed over all models.
+    size_t get_extra_graph_size() override {
+        size_t tensor_count = 0;
+        for (const auto& model : lora_models) {
+            tensor_count += model->lora_tensors.size();
+        }
+        return LORA_GRAPH_BASE_SIZE + tensor_count * 10;
    }
 };

--- a/ltxv.hpp
+++ b/ltxv.hpp
@ -0,0 +1,74 @@
+#ifndef __LTXV_HPP__
+#define __LTXV_HPP__
+
+#include "common.hpp"
+#include "ggml_extend.hpp"
+
+namespace LTXV {
+
+    // 3D convolution that pads the depth (time) axis by replicating edge frames
+    // instead of letting the convolution pad it: the wrapped Conv3d is built with
+    // depth padding 0 and height/width padding kernel_size / 2.
+    class CausalConv3d : public GGMLBlock {
+    protected:
+        // Kernel extent along the depth (time) axis. forward() pads
+        // time_kernel_size - 1 frames in total, which is exactly what the
+        // depth-unpadded convolution consumes, so the frame count is preserved
+        // (for odd kernel sizes in the non-causal case).
+        int time_kernel_size;
+
+        // Builds `count` copies of depth slice `index` of `h`, stacked along the
+        // depth axis. Returns nullptr when no padding is required (count <= 0).
+        // h: [ID, N*IC, IH, IW], contiguous
+        // result: [N*IC, count, IH, IW]
+        static struct ggml_tensor* replicate_frame(struct ggml_context* ctx,
+                                                   struct ggml_tensor* h,
+                                                   int64_t index,
+                                                   int count) {
+            if (count <= 0) {
+                return nullptr;
+            }
+            // h is contiguous, so slice `index` starts at index * nb[3].
+            auto frame = ggml_view_3d(ctx, h, h->ne[0], h->ne[1], h->ne[2], h->nb[1], h->nb[2], h->nb[3] * index);  // [N*IC, IH, IW]
+            frame      = ggml_reshape_4d(ctx, frame, frame->ne[0], frame->ne[1], 1, frame->ne[2]);                 // [N*IC, 1, IH, IW]
+            auto pad   = frame;
+            for (int i = 1; i < count; i++) {
+                pad = ggml_concat(ctx, pad, frame, 2);
+            }
+            return pad;
+        }
+
+    public:
+        CausalConv3d(int64_t in_channels,
+                     int64_t out_channels,
+                     int kernel_size                  = 3,
+                     std::tuple<int, int, int> stride = {1, 1, 1},
+                     int dilation                     = 1,
+                     bool bias                        = true) {
+            // Must be the full temporal kernel extent (not kernel_size / 2):
+            // forward() computes its pad counts as time_kernel_size - 1.
+            time_kernel_size = kernel_size;
+            blocks["conv"]   = std::shared_ptr<GGMLBlock>(new Conv3d(in_channels,
+                                                                     out_channels,
+                                                                     {kernel_size, kernel_size, kernel_size},
+                                                                     stride,
+                                                                     {0, kernel_size / 2, kernel_size / 2},
+                                                                     {dilation, 1, 1},
+                                                                     bias));
+        }
+
+        // causal = true:  pads time_kernel_size - 1 copies of the first frame in
+        //                 front of x.
+        // causal = false: pads (time_kernel_size - 1) / 2 copies of the first
+        //                 frame in front and of the last frame behind x.
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                    struct ggml_tensor* x,
+                                    bool causal = true) {
+            // x: [N*IC, ID, IH, IW]
+            // result: [N*OC, OD, OH, OW]
+            auto conv = std::dynamic_pointer_cast<Conv3d>(blocks["conv"]);
+
+            // Raw ggml ops take the ggml_context held by the runner context.
+            struct ggml_context* gctx = ctx->ggml_ctx;
+
+            const int front_pad = causal ? time_kernel_size - 1 : (time_kernel_size - 1) / 2;
+            const int back_pad  = causal ? 0 : (time_kernel_size - 1) / 2;
+
+            if (front_pad > 0 || back_pad > 0) {
+                auto h = ggml_cont(gctx, ggml_permute(gctx, x, 0, 1, 3, 2));  // [ID, N*IC, IH, IW]
+
+                auto first_frame_pad = replicate_frame(gctx, h, 0, front_pad);
+                auto last_frame_pad  = replicate_frame(gctx, h, h->ne[3] - 1, back_pad);
+
+                if (first_frame_pad != nullptr) {
+                    x = ggml_concat(gctx, first_frame_pad, x, 2);
+                }
+                if (last_frame_pad != nullptr) {
+                    x = ggml_concat(gctx, x, last_frame_pad, 2);
+                }
+            }
+
+            x = conv->forward(ctx, x);
+            return x;
+        }
+    };
+
+};
+
+#endif
--- a/mmdit.hpp
+++ b/mmdit.hpp
@ -1,6 +1,8 @@
 #ifndef __MMDIT_HPP__
 #define __MMDIT_HPP__

+#include <memory>
+
 #include "ggml_extend.hpp"
 #include "model.h"

@ -25,13 +27,13 @@ public:
        blocks["fc2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_features, out_features, bias));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [N, n_token, in_features]
        auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
        auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);

        x = fc1->forward(ctx, x);
-        x = ggml_gelu_inplace(ctx, x);
+        x = ggml_gelu_inplace(ctx->ggml_ctx, x);
        x = fc2->forward(ctx, x);
        return x;
    }
@ -70,7 +72,7 @@ public:
                                                               bias));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [N, C, H, W]
        // return: [N, H*W, embed_dim]
        auto proj = std::dynamic_pointer_cast<Conv2d>(blocks["proj"]);
@ -80,13 +82,13 @@ public:
            int64_t H = x->ne[1];
            int pad_h = (patch_size - H % patch_size) % patch_size;
            int pad_w = (patch_size - W % patch_size) % patch_size;
-            x         = ggml_pad(ctx, x, pad_w, pad_h, 0, 0);  // TODO: reflect pad mode
+            x         = ggml_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0);  // TODO: reflect pad mode
        }
        x = proj->forward(ctx, x);

        if (flatten) {
-            x = ggml_reshape_3d(ctx, x, x->ne[0] * x->ne[1], x->ne[2], x->ne[3]);
-            x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3));
+            x = ggml_reshape_3d(ctx->ggml_ctx, x, x->ne[0] * x->ne[1], x->ne[2], x->ne[3]);
+            x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3));
        }
        return x;
    }
@ -99,22 +101,26 @@ protected:

 public:
    TimestepEmbedder(int64_t hidden_size,
-                     int64_t frequency_embedding_size = 256)
+                     int64_t frequency_embedding_size = 256,
+                     int64_t out_channels             = 0)
        : frequency_embedding_size(frequency_embedding_size) {
+        if (out_channels <= 0) {
+            out_channels = hidden_size;
+        }
        blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(frequency_embedding_size, hidden_size, true, true));
-        blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size, true, true));
+        blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, out_channels, true, true));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* t) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* t) {
        // t: [N, ]
        // return: [N, hidden_size]
        auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]);
        auto mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["mlp.2"]);

-        auto t_freq = ggml_nn_timestep_embedding(ctx, t, frequency_embedding_size);  // [N, frequency_embedding_size]
+        auto t_freq = ggml_ext_timestep_embedding(ctx->ggml_ctx, t, frequency_embedding_size);  // [N, frequency_embedding_size]

        auto t_emb = mlp_0->forward(ctx, t_freq);
-        t_emb      = ggml_silu_inplace(ctx, t_emb);
+        t_emb      = ggml_silu_inplace(ctx->ggml_ctx, t_emb);
        t_emb      = mlp_2->forward(ctx, t_emb);
        return t_emb;
    }
@ -129,42 +135,19 @@ public:
        blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size, true, true));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [N, input_dim]
        // return: [N, hidden_size]
        auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]);
        auto mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["mlp.2"]);

        x = mlp_0->forward(ctx, x);
-        x = ggml_silu_inplace(ctx, x);
+        x = ggml_silu_inplace(ctx->ggml_ctx, x);
        x = mlp_2->forward(ctx, x);
        return x;
    }
 };

-class RMSNorm : public UnaryBlock {
-protected:
-    int64_t hidden_size;
-    float eps;
-
-    void init_params(struct ggml_context* ctx, ggml_type wtype) {
-        params["weight"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
-    }
-
-public:
-    RMSNorm(int64_t hidden_size,
-            float eps = 1e-06f)
-        : hidden_size(hidden_size),
-          eps(eps) {}
-
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
-        struct ggml_tensor* w = params["weight"];
-        x                     = ggml_rms_norm(ctx, x, eps);
-        x                     = ggml_mul(ctx, x, w);
-        return x;
-    }
-};
-
 class SelfAttention : public GGMLBlock {
 public:
    int64_t num_heads;
@ -192,14 +175,14 @@ public:
        }
    }

-    std::vector<struct ggml_tensor*> pre_attention(struct ggml_context* ctx, struct ggml_tensor* x) {
+    std::vector<struct ggml_tensor*> pre_attention(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        auto qkv_proj = std::dynamic_pointer_cast<Linear>(blocks["qkv"]);

        auto qkv         = qkv_proj->forward(ctx, x);
-        auto qkv_vec     = split_qkv(ctx, qkv);
+        auto qkv_vec     = split_qkv(ctx->ggml_ctx, qkv);
        int64_t head_dim = qkv_vec[0]->ne[0] / num_heads;
-        auto q           = ggml_reshape_4d(ctx, qkv_vec[0], head_dim, num_heads, qkv_vec[0]->ne[1], qkv_vec[0]->ne[2]);  // [N, n_token, n_head, d_head]
-        auto k           = ggml_reshape_4d(ctx, qkv_vec[1], head_dim, num_heads, qkv_vec[1]->ne[1], qkv_vec[1]->ne[2]);  // [N, n_token, n_head, d_head]
+        auto q           = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[0], head_dim, num_heads, qkv_vec[0]->ne[1], qkv_vec[0]->ne[2]);  // [N, n_token, n_head, d_head]
+        auto k           = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[1], head_dim, num_heads, qkv_vec[1]->ne[1], qkv_vec[1]->ne[2]);  // [N, n_token, n_head, d_head]
        auto v           = qkv_vec[2];                                                                                             // [N, n_token, n_head*d_head]

        if (qk_norm == "rms" || qk_norm == "ln") {
@ -209,13 +192,13 @@ public:
            k         = ln_k->forward(ctx, k);
        }

-        q = ggml_reshape_3d(ctx, q, q->ne[0] * q->ne[1], q->ne[2], q->ne[3]);  // [N, n_token, n_head*d_head]
-        k = ggml_reshape_3d(ctx, k, k->ne[0] * k->ne[1], k->ne[2], k->ne[3]);  // [N, n_token, n_head*d_head]
+        q = ggml_reshape_3d(ctx->ggml_ctx, q, q->ne[0] * q->ne[1], q->ne[2], q->ne[3]);  // [N, n_token, n_head*d_head]
+        k = ggml_reshape_3d(ctx->ggml_ctx, k, k->ne[0] * k->ne[1], k->ne[2], k->ne[3]);  // [N, n_token, n_head*d_head]

        return {q, k, v};
    }

-    struct ggml_tensor* post_attention(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* post_attention(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        GGML_ASSERT(!pre_only);

        auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
@ -225,9 +208,10 @@ public:
    }

    // x: [N, n_token, dim]
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                struct ggml_tensor* x) {
        auto qkv = pre_attention(ctx, x);
-        x        = ggml_nn_attention_ext(ctx, qkv[0], qkv[1], qkv[2], num_heads);  // [N, n_token, dim]
+        x        = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, false, ctx->flash_attn_enabled);  // [N, n_token, dim]
        x        = post_attention(ctx, x);                                                                                                                  // [N, n_token, dim]
        return x;
    }
@ -289,7 +273,7 @@ public:
        blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, n_mods * hidden_size));
    }

-    std::tuple<std::vector<struct ggml_tensor*>, std::vector<struct ggml_tensor*>, std::vector<struct ggml_tensor*>> pre_attention_x(struct ggml_context* ctx,
+    std::tuple<std::vector<ggml_tensor*>, std::vector<ggml_tensor*>, std::vector<ggml_tensor*>> pre_attention_x(GGMLRunnerContext* ctx,
                                                                                                                struct ggml_tensor* x,
                                                                                                                struct ggml_tensor* c) {
        GGML_ASSERT(self_attn);
@ -301,35 +285,35 @@ public:
        auto adaLN_modulation_1 = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]);

        int64_t n_mods = 9;
-        auto m         = adaLN_modulation_1->forward(ctx, ggml_silu(ctx, c));  // [N, n_mods * hidden_size]
-        m              = ggml_reshape_3d(ctx, m, c->ne[0], n_mods, c->ne[1]);  // [N, n_mods, hidden_size]
-        m              = ggml_cont(ctx, ggml_permute(ctx, m, 0, 2, 1, 3));     // [n_mods, N, hidden_size]
+        auto m         = adaLN_modulation_1->forward(ctx, ggml_silu(ctx->ggml_ctx, c));         // [N, n_mods * hidden_size]
+        m              = ggml_reshape_3d(ctx->ggml_ctx, m, c->ne[0], n_mods, c->ne[1]);         // [N, n_mods, hidden_size]
+        m              = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, m, 0, 2, 1, 3));  // [n_mods, N, hidden_size]

        int64_t offset = m->nb[1] * m->ne[1];
-        auto shift_msa = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0);  // [N, hidden_size]
-        auto scale_msa = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1);  // [N, hidden_size]
-        auto gate_msa  = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 2);  // [N, hidden_size]
+        auto shift_msa = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0);  // [N, hidden_size]
+        auto scale_msa = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1);  // [N, hidden_size]
+        auto gate_msa  = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 2);  // [N, hidden_size]

-        auto shift_mlp = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 3);  // [N, hidden_size]
-        auto scale_mlp = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 4);  // [N, hidden_size]
-        auto gate_mlp  = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 5);  // [N, hidden_size]
+        auto shift_mlp = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 3);  // [N, hidden_size]
+        auto scale_mlp = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 4);  // [N, hidden_size]
+        auto gate_mlp  = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 5);  // [N, hidden_size]

-        auto shift_msa2 = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 6);  // [N, hidden_size]
-        auto scale_msa2 = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 7);  // [N, hidden_size]
-        auto gate_msa2  = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 8);  // [N, hidden_size]
+        auto shift_msa2 = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 6);  // [N, hidden_size]
+        auto scale_msa2 = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 7);  // [N, hidden_size]
+        auto gate_msa2  = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 8);  // [N, hidden_size]

        auto x_norm = norm1->forward(ctx, x);

-        auto attn_in = modulate(ctx, x_norm, shift_msa, scale_msa);
+        auto attn_in = modulate(ctx->ggml_ctx, x_norm, shift_msa, scale_msa);
        auto qkv     = attn->pre_attention(ctx, attn_in);

-        auto attn2_in = modulate(ctx, x_norm, shift_msa2, scale_msa2);
+        auto attn2_in = modulate(ctx->ggml_ctx, x_norm, shift_msa2, scale_msa2);
        auto qkv2     = attn2->pre_attention(ctx, attn2_in);

        return {qkv, qkv2, {x, gate_msa, shift_mlp, scale_mlp, gate_mlp, gate_msa2}};
    }

-    std::pair<std::vector<struct ggml_tensor*>, std::vector<struct ggml_tensor*>> pre_attention(struct ggml_context* ctx,
+    std::pair<std::vector<struct ggml_tensor*>, std::vector<struct ggml_tensor*>> pre_attention(GGMLRunnerContext* ctx,
                                                                                                struct ggml_tensor* x,
                                                                                                struct ggml_tensor* c) {
        // x: [N, n_token, hidden_size]
@ -342,33 +326,33 @@ public:
        if (pre_only) {
            n_mods = 2;
        }
-        auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx, c));  // [N, n_mods * hidden_size]
-        m      = ggml_reshape_3d(ctx, m, c->ne[0], n_mods, c->ne[1]);  // [N, n_mods, hidden_size]
-        m      = ggml_cont(ctx, ggml_permute(ctx, m, 0, 2, 1, 3));     // [n_mods, N, hidden_size]
+        auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx->ggml_ctx, c));         // [N, n_mods * hidden_size]
+        m      = ggml_reshape_3d(ctx->ggml_ctx, m, c->ne[0], n_mods, c->ne[1]);         // [N, n_mods, hidden_size]
+        m      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, m, 0, 2, 1, 3));  // [n_mods, N, hidden_size]

        int64_t offset = m->nb[1] * m->ne[1];
-        auto shift_msa = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0);  // [N, hidden_size]
-        auto scale_msa = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1);  // [N, hidden_size]
+        auto shift_msa = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0);  // [N, hidden_size]
+        auto scale_msa = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1);  // [N, hidden_size]
        if (!pre_only) {
-            auto gate_msa  = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 2);  // [N, hidden_size]
-            auto shift_mlp = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 3);  // [N, hidden_size]
-            auto scale_mlp = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 4);  // [N, hidden_size]
-            auto gate_mlp  = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 5);  // [N, hidden_size]
+            auto gate_msa  = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 2);  // [N, hidden_size]
+            auto shift_mlp = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 3);  // [N, hidden_size]
+            auto scale_mlp = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 4);  // [N, hidden_size]
+            auto gate_mlp  = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 5);  // [N, hidden_size]

-            auto attn_in = modulate(ctx, norm1->forward(ctx, x), shift_msa, scale_msa);
+            auto attn_in = modulate(ctx->ggml_ctx, norm1->forward(ctx, x), shift_msa, scale_msa);

            auto qkv = attn->pre_attention(ctx, attn_in);

            return {qkv, {x, gate_msa, shift_mlp, scale_mlp, gate_mlp}};
        } else {
-            auto attn_in = modulate(ctx, norm1->forward(ctx, x), shift_msa, scale_msa);
+            auto attn_in = modulate(ctx->ggml_ctx, norm1->forward(ctx, x), shift_msa, scale_msa);
            auto qkv     = attn->pre_attention(ctx, attn_in);

-            return {qkv, {NULL, NULL, NULL, NULL, NULL}};
+            return {qkv, {nullptr, nullptr, nullptr, nullptr, nullptr}};
        }
    }

-    struct ggml_tensor* post_attention_x(struct ggml_context* ctx,
+    struct ggml_tensor* post_attention_x(GGMLRunnerContext* ctx,
                                         struct ggml_tensor* attn_out,
                                         struct ggml_tensor* attn2_out,
                                         struct ggml_tensor* x,
@ -391,22 +375,22 @@ public:
        auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
        auto mlp   = std::dynamic_pointer_cast<Mlp>(blocks["mlp"]);

-        gate_msa  = ggml_reshape_3d(ctx, gate_msa, gate_msa->ne[0], 1, gate_msa->ne[1]);     // [N, 1, hidden_size]
-        gate_mlp  = ggml_reshape_3d(ctx, gate_mlp, gate_mlp->ne[0], 1, gate_mlp->ne[1]);     // [N, 1, hidden_size]
-        gate_msa2 = ggml_reshape_3d(ctx, gate_msa2, gate_msa2->ne[0], 1, gate_msa2->ne[1]);  // [N, 1, hidden_size]
+        gate_msa  = ggml_reshape_3d(ctx->ggml_ctx, gate_msa, gate_msa->ne[0], 1, gate_msa->ne[1]);     // [N, 1, hidden_size]
+        gate_mlp  = ggml_reshape_3d(ctx->ggml_ctx, gate_mlp, gate_mlp->ne[0], 1, gate_mlp->ne[1]);     // [N, 1, hidden_size]
+        gate_msa2 = ggml_reshape_3d(ctx->ggml_ctx, gate_msa2, gate_msa2->ne[0], 1, gate_msa2->ne[1]);  // [N, 1, hidden_size]

        attn_out  = attn->post_attention(ctx, attn_out);
        attn2_out = attn2->post_attention(ctx, attn2_out);

-        x            = ggml_add(ctx, x, ggml_mul(ctx, attn_out, gate_msa));
-        x            = ggml_add(ctx, x, ggml_mul(ctx, attn2_out, gate_msa2));
-        auto mlp_out = mlp->forward(ctx, modulate(ctx, norm2->forward(ctx, x), shift_mlp, scale_mlp));
-        x            = ggml_add(ctx, x, ggml_mul(ctx, mlp_out, gate_mlp));
+        x            = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, attn_out, gate_msa));
+        x            = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, attn2_out, gate_msa2));
+        auto mlp_out = mlp->forward(ctx, modulate(ctx->ggml_ctx, norm2->forward(ctx, x), shift_mlp, scale_mlp));
+        x            = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, mlp_out, gate_mlp));

        return x;
    }

-    struct ggml_tensor* post_attention(struct ggml_context* ctx,
+    struct ggml_tensor* post_attention(GGMLRunnerContext* ctx,
                                       struct ggml_tensor* attn_out,
                                       struct ggml_tensor* x,
                                       struct ggml_tensor* gate_msa,
@ -426,19 +410,21 @@ public:
        auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
        auto mlp   = std::dynamic_pointer_cast<Mlp>(blocks["mlp"]);

-        gate_msa = ggml_reshape_3d(ctx, gate_msa, gate_msa->ne[0], 1, gate_msa->ne[1]);  // [N, 1, hidden_size]
-        gate_mlp = ggml_reshape_3d(ctx, gate_mlp, gate_mlp->ne[0], 1, gate_mlp->ne[1]);  // [N, 1, hidden_size]
+        gate_msa = ggml_reshape_3d(ctx->ggml_ctx, gate_msa, gate_msa->ne[0], 1, gate_msa->ne[1]);  // [N, 1, hidden_size]
+        gate_mlp = ggml_reshape_3d(ctx->ggml_ctx, gate_mlp, gate_mlp->ne[0], 1, gate_mlp->ne[1]);  // [N, 1, hidden_size]

        attn_out = attn->post_attention(ctx, attn_out);

-        x            = ggml_add(ctx, x, ggml_mul(ctx, attn_out, gate_msa));
-        auto mlp_out = mlp->forward(ctx, modulate(ctx, norm2->forward(ctx, x), shift_mlp, scale_mlp));
-        x            = ggml_add(ctx, x, ggml_mul(ctx, mlp_out, gate_mlp));
+        x            = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, attn_out, gate_msa));
+        auto mlp_out = mlp->forward(ctx, modulate(ctx->ggml_ctx, norm2->forward(ctx, x), shift_mlp, scale_mlp));
+        x            = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, mlp_out, gate_mlp));

        return x;
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* c) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                struct ggml_tensor* x,
+                                struct ggml_tensor* c) {
        // x: [N, n_token, hidden_size]
        // c: [N, hidden_size]
        // return: [N, n_token, hidden_size]
@ -453,8 +439,8 @@ public:
            auto qkv2          = std::get<1>(qkv_intermediates);
            auto intermediates = std::get<2>(qkv_intermediates);

-            auto attn_out  = ggml_nn_attention_ext(ctx, qkv[0], qkv[1], qkv[2], num_heads);     // [N, n_token, dim]
-            auto attn2_out = ggml_nn_attention_ext(ctx, qkv2[0], qkv2[1], qkv2[2], num_heads);  // [N, n_token, dim]
+            auto attn_out  = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, false, ctx->flash_attn_enabled);     // [N, n_token, dim]
+            auto attn2_out = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv2[0], qkv2[1], qkv2[2], num_heads, nullptr, false, false, ctx->flash_attn_enabled);  // [N, n_token, dim]
            x              = post_attention_x(ctx,
                                              attn_out,
                                              attn2_out,
@ -470,7 +456,7 @@ public:
            auto qkv               = qkv_intermediates.first;
            auto intermediates     = qkv_intermediates.second;

-            auto attn_out = ggml_nn_attention_ext(ctx, qkv[0], qkv[1], qkv[2], num_heads);  // [N, n_token, dim]
+            auto attn_out = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, false, ctx->flash_attn_enabled);  // [N, n_token, dim]
            x             = post_attention(ctx,
                                           attn_out,
                                           intermediates[0],
@ -484,7 +470,7 @@ public:
 };

 __STATIC_INLINE__ std::pair<struct ggml_tensor*, struct ggml_tensor*>
-block_mixing(struct ggml_context* ctx,
+block_mixing(GGMLRunnerContext* ctx,
             struct ggml_tensor* context,
             struct ggml_tensor* x,
             struct ggml_tensor* c,
@ -511,12 +497,12 @@ block_mixing(struct ggml_context* ctx,
    }
    std::vector<struct ggml_tensor*> qkv;
    for (int i = 0; i < 3; i++) {
-        qkv.push_back(ggml_concat(ctx, context_qkv[i], x_qkv[i], 1));
+        qkv.push_back(ggml_concat(ctx->ggml_ctx, context_qkv[i], x_qkv[i], 1));
    }

-    auto attn         = ggml_nn_attention_ext(ctx, qkv[0], qkv[1], qkv[2], x_block->num_heads);  // [N, n_context + n_token, hidden_size]
-    attn              = ggml_cont(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3));                     // [n_context + n_token, N, hidden_size]
-    auto context_attn = ggml_view_3d(ctx,
+    auto attn         = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv[0], qkv[1], qkv[2], x_block->num_heads, nullptr, false, false, ctx->flash_attn_enabled);  // [N, n_context + n_token, hidden_size]
+    attn              = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, attn, 0, 2, 1, 3));                                                                          // [n_context + n_token, N, hidden_size]
+    auto context_attn = ggml_view_3d(ctx->ggml_ctx,
                                     attn,
                                     attn->ne[0],
                                     attn->ne[1],
@ -524,8 +510,8 @@ block_mixing(struct ggml_context* ctx,
                                     attn->nb[1],
                                     attn->nb[2],
                                     0);                                                                  // [n_context, N, hidden_size]
-    context_attn      = ggml_cont(ctx, ggml_permute(ctx, context_attn, 0, 2, 1, 3));  // [N, n_context, hidden_size]
-    auto x_attn       = ggml_view_3d(ctx,
+    context_attn      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, context_attn, 0, 2, 1, 3));  // [N, n_context, hidden_size]
+    auto x_attn       = ggml_view_3d(ctx->ggml_ctx,
                                     attn,
                                     attn->ne[0],
                                     attn->ne[1],
@ -533,7 +519,7 @@ block_mixing(struct ggml_context* ctx,
                                     attn->nb[1],
                                     attn->nb[2],
                                     attn->nb[2] * context->ne[1]);                                 // [n_token, N, hidden_size]
-    x_attn            = ggml_cont(ctx, ggml_permute(ctx, x_attn, 0, 2, 1, 3));  // [N, n_token, hidden_size]
+    x_attn            = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x_attn, 0, 2, 1, 3));  // [N, n_token, hidden_size]

    if (!context_block->pre_only) {
        context = context_block->post_attention(ctx,
@ -544,11 +530,11 @@ block_mixing(struct ggml_context* ctx,
                                                context_intermediates[3],
                                                context_intermediates[4]);
    } else {
-        context = NULL;
+        context = nullptr;
    }

    if (x_block->self_attn) {
-        auto attn2 = ggml_nn_attention_ext(ctx, x_qkv2[0], x_qkv2[1], x_qkv2[2], x_block->num_heads);  // [N, n_token, hidden_size]
+        auto attn2 = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, x_qkv2[0], x_qkv2[1], x_qkv2[2], x_block->num_heads, nullptr, false, false, ctx->flash_attn_enabled);  // [N, n_token, hidden_size]

        x = x_block->post_attention_x(ctx,
                                      x_attn,
@ -581,11 +567,11 @@ public:
               bool qkv_bias       = false,
               bool pre_only       = false,
               bool self_attn_x    = false) {
-        blocks["context_block"] = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, pre_only));
+        blocks["context_block"] = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, pre_only, false));
        blocks["x_block"]       = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, false, self_attn_x));
    }

-    std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx,
+    std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(GGMLRunnerContext* ctx,
                                                                struct ggml_tensor* context,
                                                                struct ggml_tensor* x,
                                                                struct ggml_tensor* c) {
@ -608,7 +594,7 @@ public:
        blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, 2 * hidden_size));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x,
                                struct ggml_tensor* c) {
        // x: [N, n_token, hidden_size]
@ -618,15 +604,15 @@ public:
        auto linear             = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
        auto adaLN_modulation_1 = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]);

-        auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx, c));  // [N, 2 * hidden_size]
-        m      = ggml_reshape_3d(ctx, m, c->ne[0], 2, c->ne[1]);       // [N, 2, hidden_size]
-        m      = ggml_cont(ctx, ggml_permute(ctx, m, 0, 2, 1, 3));     // [2, N, hidden_size]
+        auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx->ggml_ctx, c));         // [N, 2 * hidden_size]
+        m      = ggml_reshape_3d(ctx->ggml_ctx, m, c->ne[0], 2, c->ne[1]);              // [N, 2, hidden_size]
+        m      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, m, 0, 2, 1, 3));  // [2, N, hidden_size]

        int64_t offset = m->nb[1] * m->ne[1];
-        auto shift     = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0);  // [N, hidden_size]
-        auto scale     = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1);  // [N, hidden_size]
+        auto shift     = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0);  // [N, hidden_size]
+        auto scale     = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1);  // [N, hidden_size]

-        x = modulate(ctx, norm_final->forward(ctx, x), shift, scale);
+        x = modulate(ctx->ggml_ctx, norm_final->forward(ctx, x), shift, scale);
        x = linear->forward(ctx, x);

        return x;
@ -636,7 +622,6 @@ public:
 struct MMDiT : public GGMLBlock {
    // Diffusion model with a Transformer backbone.
 protected:
-    SDVersion version                = VERSION_SD3_2B;
    int64_t input_size               = -1;
    int64_t patch_size               = 2;
    int64_t in_channels              = 16;
@ -652,13 +637,13 @@ protected:
    int64_t hidden_size;
    std::string qk_norm;

-    void init_params(struct ggml_context* ctx, ggml_type wtype) {
-        params["pos_embed"] = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hidden_size, num_patchs, 1);
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
+        enum ggml_type wtype = GGML_TYPE_F32;
+        params["pos_embed"]  = ggml_new_tensor_3d(ctx, wtype, hidden_size, num_patchs, 1);
    }

 public:
-    MMDiT(SDVersion version = VERSION_SD3_2B)
-        : version(version) {
+    MMDiT(const String2TensorStorage& tensor_storage_map = {}) {
        // input_size is always None
        // learn_sigma is always False
        // register_length is alwalys 0
@ -670,48 +655,43 @@ public:
        // pos_embed_scaling_factor is not used
        // pos_embed_offset is not used
        // context_embedder_config is always {'target': 'torch.nn.Linear', 'params': {'in_features': 4096, 'out_features': 1536}}
-        if (version == VERSION_SD3_2B) {
-            input_size               = -1;
-            patch_size               = 2;
-            in_channels              = 16;
-            depth                    = 24;
-            mlp_ratio                = 4.0f;
-            adm_in_channels          = 2048;
-            out_channels             = 16;
-            pos_embed_max_size       = 192;
-            num_patchs               = 36864;  // 192 * 192
-            context_size             = 4096;
-            context_embedder_out_dim = 1536;
-        } else if (version == VERSION_SD3_5_8B) {
-            input_size               = -1;
-            patch_size               = 2;
-            in_channels              = 16;
-            depth                    = 38;
-            mlp_ratio                = 4.0f;
-            adm_in_channels          = 2048;
-            out_channels             = 16;
-            pos_embed_max_size       = 192;
-            num_patchs               = 36864;  // 192 * 192
-            context_size             = 4096;
-            context_embedder_out_dim = 2432;
-            qk_norm                  = "rms";
-        } else if (version == VERSION_SD3_5_2B) {
-            input_size               = -1;
-            patch_size               = 2;
-            in_channels              = 16;
-            depth                    = 24;
-            d_self                   = 12;
-            mlp_ratio                = 4.0f;
-            adm_in_channels          = 2048;
-            out_channels             = 16;
-            pos_embed_max_size       = 384;
-            num_patchs               = 147456;
-            context_size             = 4096;
-            context_embedder_out_dim = 1536;
+
+        for (auto pair : tensor_storage_map) {
+            std::string tensor_name = pair.first;
+            if (tensor_name.find("model.diffusion_model.") == std::string::npos)
+                continue;
+            size_t jb = tensor_name.find("joint_blocks.");
+            if (jb != std::string::npos) {
+                tensor_name     = tensor_name.substr(jb);  // remove prefix
+                int block_depth = atoi(tensor_name.substr(13, tensor_name.find(".", 13)).c_str());
+                if (block_depth + 1 > depth) {
+                    depth = block_depth + 1;
+                }
+                if (tensor_name.find("attn.ln") != std::string::npos) {
+                    if (tensor_name.find(".bias") != std::string::npos) {
+                        qk_norm = "ln";
+                    } else {
                        qk_norm = "rms";
                    }
+                }
+                if (tensor_name.find("attn2") != std::string::npos) {
+                    if (block_depth > d_self) {
+                        d_self = block_depth;
+                    }
+                }
+            }
+        }
+
+        if (d_self >= 0) {
+            pos_embed_max_size *= 2;
+            num_patchs *= 4;
+        }
+
+        LOG_INFO("MMDiT layers: %d (including %d MMDiT-x layers)", depth, d_self + 1);
+
        int64_t default_out_channels = in_channels;
        hidden_size                  = 64 * depth;
+        context_embedder_out_dim     = 64 * depth;
        int64_t num_heads            = depth;

        blocks["x_embedder"] = std::shared_ptr<GGMLBlock>(new PatchEmbed(input_size, patch_size, in_channels, hidden_size, true));
@ -798,7 +778,7 @@ public:
        return x;
    }

-    struct ggml_tensor* forward_core_with_concat(struct ggml_context* ctx,
+    struct ggml_tensor* forward_core_with_concat(GGMLRunnerContext* ctx,
                                                 struct ggml_tensor* x,
                                                 struct ggml_tensor* c_mod,
                                                 struct ggml_tensor* context,
@ -827,11 +807,11 @@ public:
        return x;
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x,
                                struct ggml_tensor* t,
-                                struct ggml_tensor* y        = NULL,
-                                struct ggml_tensor* context  = NULL,
+                                struct ggml_tensor* y        = nullptr,
+                                struct ggml_tensor* context  = nullptr,
                                std::vector<int> skip_layers = std::vector<int>()) {
        // Forward pass of DiT.
        // x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
@ -846,18 +826,18 @@ public:
        int64_t h = x->ne[1];

        auto patch_embed = x_embedder->forward(ctx, x);                      // [N, H*W, hidden_size]
-        auto pos_embed   = cropped_pos_embed(ctx, h, w);           // [1, H*W, hidden_size]
-        x                = ggml_add(ctx, patch_embed, pos_embed);  // [N, H*W, hidden_size]
+        auto pos_embed   = cropped_pos_embed(ctx->ggml_ctx, h, w);           // [1, H*W, hidden_size]
+        x                = ggml_add(ctx->ggml_ctx, patch_embed, pos_embed);  // [N, H*W, hidden_size]

        auto c = t_embedder->forward(ctx, t);  // [N, hidden_size]
-        if (y != NULL && adm_in_channels != -1) {
+        if (y != nullptr && adm_in_channels != -1) {
            auto y_embedder = std::dynamic_pointer_cast<VectorEmbedder>(blocks["y_embedder"]);

            y = y_embedder->forward(ctx, y);  // [N, hidden_size]
-            c = ggml_add(ctx, c, y);
+            c = ggml_add(ctx->ggml_ctx, c, y);
        }

-        if (context != NULL) {
+        if (context != nullptr) {
            auto context_embedder = std::dynamic_pointer_cast<Linear>(blocks["context_embedder"]);

            context = context_embedder->forward(ctx, context);  // [N, L, D] aka [N, L, 1536]
@ -865,23 +845,23 @@ public:

        x = forward_core_with_concat(ctx, x, c, context, skip_layers);  // (N, H*W, patch_size ** 2 * out_channels)

-        x = unpatchify(ctx, x, h, w);  // [N, C, H, W]
+        x = unpatchify(ctx->ggml_ctx, x, h, w);  // [N, C, H, W]

        return x;
    }
 };
-
 struct MMDiTRunner : public GGMLRunner {
    MMDiT mmdit;

    MMDiTRunner(ggml_backend_t backend,
-                ggml_type wtype,
-                SDVersion version = VERSION_SD3_2B)
-        : GGMLRunner(backend, wtype), mmdit(version) {
-        mmdit.init(params_ctx, wtype);
+                bool offload_params_to_cpu,
+                const String2TensorStorage& tensor_storage_map = {},
+                const std::string prefix                       = "")
+        : GGMLRunner(backend, offload_params_to_cpu), mmdit(tensor_storage_map) {
+        mmdit.init(params_ctx, tensor_storage_map, prefix);
    }

-    std::string get_desc() {
+    std::string get_desc() override {
        return "mmdit";
    }

@ -894,14 +874,15 @@ struct MMDiTRunner : public GGMLRunner {
                                    struct ggml_tensor* context,
                                    struct ggml_tensor* y,
                                    std::vector<int> skip_layers = std::vector<int>()) {
-        struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, MMDIT_GRAPH_SIZE, false);
+        struct ggml_cgraph* gf = new_graph_custom(MMDIT_GRAPH_SIZE);

        x         = to_backend(x);
        context   = to_backend(context);
        y         = to_backend(y);
        timesteps = to_backend(timesteps);

-        struct ggml_tensor* out = mmdit.forward(compute_ctx,
+        auto runner_ctx         = get_context();
+        struct ggml_tensor* out = mmdit.forward(&runner_ctx,
                                                x,
                                                timesteps,
                                                y,
@ -913,13 +894,13 @@ struct MMDiTRunner : public GGMLRunner {
        return gf;
    }

-    void compute(int n_threads,
+    bool compute(int n_threads,
                 struct ggml_tensor* x,
                 struct ggml_tensor* timesteps,
                 struct ggml_tensor* context,
                 struct ggml_tensor* y,
-                 struct ggml_tensor** output     = NULL,
-                 struct ggml_context* output_ctx = NULL,
+                 struct ggml_tensor** output     = nullptr,
+                 struct ggml_context* output_ctx = nullptr,
                 std::vector<int> skip_layers    = std::vector<int>()) {
        // x: [N, in_channels, h, w]
        // timesteps: [N, ]
@ -929,17 +910,17 @@ struct MMDiTRunner : public GGMLRunner {
            return build_graph(x, timesteps, context, y, skip_layers);
        };

-        GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+        return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
    }

    void test() {
        struct ggml_init_params params;
        params.mem_size   = static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
-        params.mem_buffer = NULL;
+        params.mem_buffer = nullptr;
        params.no_alloc   = false;

        struct ggml_context* work_ctx = ggml_init(params);
-        GGML_ASSERT(work_ctx != NULL);
+        GGML_ASSERT(work_ctx != nullptr);

        {
            // cpu f16: pass
@ -960,7 +941,7 @@ struct MMDiTRunner : public GGMLRunner {
            ggml_set_f32(y, 0.01f);
            // print_ggml_tensor(y);

-            struct ggml_tensor* out = NULL;
+            struct ggml_tensor* out = nullptr;

            int t0 = ggml_time_ms();
            compute(8, x, timesteps, context, y, &out, work_ctx);
@ -975,7 +956,7 @@ struct MMDiTRunner : public GGMLRunner {
        // ggml_backend_t backend    = ggml_backend_cuda_init(0);
        ggml_backend_t backend             = ggml_backend_cpu_init();
        ggml_type model_data_type          = GGML_TYPE_F16;
-        std::shared_ptr<MMDiTRunner> mmdit = std::shared_ptr<MMDiTRunner>(new MMDiTRunner(backend, model_data_type));
+        std::shared_ptr<MMDiTRunner> mmdit = std::make_shared<MMDiTRunner>(backend, false);
        {
            LOG_INFO("loading from '%s'", file_path.c_str());

@ -984,12 +965,12 @@ struct MMDiTRunner : public GGMLRunner {
            mmdit->get_param_tensors(tensors, "model.diffusion_model");

            ModelLoader model_loader;
-            if (!model_loader.init_from_file(file_path)) {
+            if (!model_loader.init_from_file_and_convert_name(file_path)) {
                LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
                return;
            }

-            bool success = model_loader.load_tensors(tensors, backend);
+            bool success = model_loader.load_tensors(tensors);

            if (!success) {
                LOG_ERROR("load tensors from model loader failed");
--- a/model.cpp
+++ b/model.cpp
--- a/model.h
+++ b/model.h
@ -8,44 +8,183 @@
 #include <sstream>
 #include <string>
 #include <tuple>
+#include <utility>
 #include <vector>

 #include "ggml-backend.h"
 #include "ggml.h"
+#include "gguf.h"
 #include "json.hpp"
+#include "ordered_map.hpp"
 #include "zip.h"

 #define SD_MAX_DIMS 5

 enum SDVersion {
    VERSION_SD1,
+    VERSION_SD1_INPAINT,
+    VERSION_SD1_PIX2PIX,
+    VERSION_SD1_TINY_UNET,
    VERSION_SD2,
+    VERSION_SD2_INPAINT,
+    VERSION_SD2_TINY_UNET,
    VERSION_SDXL,
+    VERSION_SDXL_INPAINT,
+    VERSION_SDXL_PIX2PIX,
+    VERSION_SDXL_SSD1B,
    VERSION_SVD,
-    VERSION_SD3_2B,
-    VERSION_FLUX_DEV,
-    VERSION_FLUX_SCHNELL,
-    VERSION_SD3_5_8B,
-    VERSION_SD3_5_2B,
+    VERSION_SD3,
+    VERSION_FLUX,
+    VERSION_FLUX_FILL,
+    VERSION_FLUX_CONTROLS,
+    VERSION_FLEX_2,
+    VERSION_CHROMA_RADIANCE,
+    VERSION_WAN2,
+    VERSION_WAN2_2_I2V,
+    VERSION_WAN2_2_TI2V,
+    VERSION_QWEN_IMAGE,
+    VERSION_FLUX2,
+    VERSION_Z_IMAGE,
+    VERSION_OVIS_IMAGE,
    VERSION_COUNT,
 };

+// True for VERSION_SD1 and its inpaint, pix2pix and tiny-UNet variants.
+static inline bool sd_version_is_sd1(SDVersion version) {
+    return version == VERSION_SD1 ||
+           version == VERSION_SD1_INPAINT ||
+           version == VERSION_SD1_PIX2PIX ||
+           version == VERSION_SD1_TINY_UNET;
+}
+
+// True for VERSION_SD2 and its inpaint and tiny-UNet variants.
+static inline bool sd_version_is_sd2(SDVersion version) {
+    return version == VERSION_SD2 ||
+           version == VERSION_SD2_INPAINT ||
+           version == VERSION_SD2_TINY_UNET;
+}
+
+// True for VERSION_SDXL and its inpaint, pix2pix and SSD1B variants.
+static inline bool sd_version_is_sdxl(SDVersion version) {
+    return version == VERSION_SDXL ||
+           version == VERSION_SDXL_INPAINT ||
+           version == VERSION_SDXL_PIX2PIX ||
+           version == VERSION_SDXL_SSD1B;
+}
+
+// True when the version falls in any of the SD1, SD2 or SDXL families.
+static inline bool sd_version_is_unet(SDVersion version) {
+    const bool in_sd1  = sd_version_is_sd1(version);
+    const bool in_sd2  = in_sd1 || sd_version_is_sd2(version);
+    const bool in_sdxl = in_sd2 || sd_version_is_sdxl(version);
+    return in_sdxl;
+}
+
+// True only for VERSION_SD3.
+static inline bool sd_version_is_sd3(SDVersion version) {
+    return version == VERSION_SD3;
+}
+
+// True for the versions grouped with Flux here: FLUX, FLUX_FILL,
+// FLUX_CONTROLS, FLEX_2, OVIS_IMAGE and CHROMA_RADIANCE.
+static inline bool sd_version_is_flux(SDVersion version) {
+    switch (version) {
+        case VERSION_FLUX:
+        case VERSION_FLUX_FILL:
+        case VERSION_FLUX_CONTROLS:
+        case VERSION_FLEX_2:
+        case VERSION_OVIS_IMAGE:
+        case VERSION_CHROMA_RADIANCE:
+            return true;
+        default:
+            return false;
+    }
+}
+
+// True only for VERSION_FLUX2.
+static inline bool sd_version_is_flux2(SDVersion version) {
+    return version == VERSION_FLUX2;
+}
+
+// True for VERSION_WAN2, VERSION_WAN2_2_I2V and VERSION_WAN2_2_TI2V.
+static inline bool sd_version_is_wan(SDVersion version) {
+    return version == VERSION_WAN2 ||
+           version == VERSION_WAN2_2_I2V ||
+           version == VERSION_WAN2_2_TI2V;
+}
+
+// True only for VERSION_QWEN_IMAGE.
+static inline bool sd_version_is_qwen_image(SDVersion version) {
+    return version == VERSION_QWEN_IMAGE;
+}
+
+// True only for VERSION_Z_IMAGE.
+static inline bool sd_version_is_z_image(SDVersion version) {
+    return version == VERSION_Z_IMAGE;
+}
+
+// True for the versions treated as inpainting models here: the SD1, SD2 and
+// SDXL inpaint variants plus FLUX_FILL and FLEX_2.
+static inline bool sd_version_is_inpaint(SDVersion version) {
+    switch (version) {
+        case VERSION_SD1_INPAINT:
+        case VERSION_SD2_INPAINT:
+        case VERSION_SDXL_INPAINT:
+        case VERSION_FLUX_FILL:
+        case VERSION_FLEX_2:
+            return true;
+        default:
+            return false;
+    }
+}
+
+// True when any of the flux, flux2, sd3, wan, qwen_image or z_image
+// predicates accepts the version.
+static inline bool sd_version_is_dit(SDVersion version) {
+    return sd_version_is_flux(version) ||
+           sd_version_is_flux2(version) ||
+           sd_version_is_sd3(version) ||
+           sd_version_is_wan(version) ||
+           sd_version_is_qwen_image(version) ||
+           sd_version_is_z_image(version);
+}
+
+// True only for the pix2pix variants (VERSION_SD1_PIX2PIX, VERSION_SDXL_PIX2PIX).
+static inline bool sd_version_is_unet_edit(SDVersion version) {
+    switch (version) {
+        case VERSION_SD1_PIX2PIX:
+        case VERSION_SDXL_PIX2PIX:
+            return true;
+        default:
+            return false;
+    }
+}
+
+// True only for VERSION_FLUX_CONTROLS and VERSION_FLEX_2.
+static inline bool sd_version_is_control(SDVersion version) {
+    switch (version) {
+        case VERSION_FLUX_CONTROLS:
+        case VERSION_FLEX_2:
+            return true;
+        default:
+            return false;
+    }
+}
+
+// True when the version is accepted by any of the unet-edit, inpaint or
+// control predicates.
+// Declared `static inline` like the other predicates in this header so that
+// translation units which include it without calling this function do not
+// get an unused-function warning.
+static inline bool sd_version_is_inpaint_or_unet_edit(SDVersion version) {
+    return sd_version_is_unet_edit(version) || sd_version_is_inpaint(version) || sd_version_is_control(version);
+}
+
+enum PMVersion {  // NOTE(review): presumably PhotoMaker model revisions - confirm against the code that consumes it
+    PM_VERSION_1,
+    PM_VERSION_2,
+};
+
 struct TensorStorage {
    std::string name;
    ggml_type type          = GGML_TYPE_F32;
-    bool is_bf16            = false;
+    ggml_type expected_type = GGML_TYPE_COUNT;
    bool is_f8_e4m3         = false;
+    bool is_f8_e5m2         = false;
+    bool is_f64             = false;
+    bool is_i64             = false;
    int64_t ne[SD_MAX_DIMS] = {1, 1, 1, 1, 1};
    int n_dims              = 0;

    size_t file_index = 0;
    int index_in_zip  = -1;  // >= means stored in a zip file
-    size_t offset     = 0;   // offset in file
+    uint64_t offset   = 0;   // offset in file

    TensorStorage() = default;

-    TensorStorage(const std::string& name, ggml_type type, int64_t* ne, int n_dims, size_t file_index, size_t offset = 0)
-        : name(name), type(type), n_dims(n_dims), file_index(file_index), offset(offset) {
+    TensorStorage(std::string name, ggml_type type, const int64_t* ne, int n_dims, size_t file_index, size_t offset = 0)
+        : name(std::move(name)), type(type), n_dims(n_dims), file_index(file_index), offset(offset) {
        for (int i = 0; i < n_dims; i++) {
            this->ne[i] = ne[i];
        }
@ -64,8 +203,10 @@ struct TensorStorage {
    }

    int64_t nbytes_to_read() const {
-        if (is_bf16 || is_f8_e4m3) {
+        if (is_f8_e4m3 || is_f8_e5m2) {
            return nbytes() / 2;
+        } else if (is_f64 || is_i64) {
+            return nbytes() * 2;
        } else {
            return nbytes();
        }
@ -83,10 +224,10 @@ struct TensorStorage {

    std::vector<TensorStorage> chunk(size_t n) {
        std::vector<TensorStorage> chunks;
-        size_t chunk_size = nbytes_to_read() / n;
+        uint64_t chunk_size = nbytes_to_read() / n;
        // printf("%d/%d\n", chunk_size, nbytes_to_read());
        reverse_ne();
-        for (int i = 0; i < n; i++) {
+        for (size_t i = 0; i < n; i++) {
            TensorStorage chunk_i = *this;
            chunk_i.ne[0]         = ne[0] / n;
            chunk_i.offset        = offset + i * chunk_size;
@ -110,10 +251,14 @@ struct TensorStorage {
    std::string to_string() const {
        std::stringstream ss;
        const char* type_name = ggml_type_name(type);
-        if (is_bf16) {
-            type_name = "bf16";
-        } else if (is_f8_e4m3) {
+        if (is_f8_e4m3) {
            type_name = "f8_e4m3";
+        } else if (is_f8_e5m2) {
+            type_name = "f8_e5m2";
+        } else if (is_f64) {
+            type_name = "f64";
+        } else if (is_i64) {
+            type_name = "i64";
        }
        ss << name << " | " << type_name << " | ";
        ss << n_dims << " [";
@ -130,17 +275,22 @@ struct TensorStorage {

 typedef std::function<bool(const TensorStorage&, ggml_tensor**)> on_new_tensor_cb_t;

+typedef OrderedMap<std::string, TensorStorage> String2TensorStorage;
+
 class ModelLoader {
 protected:
+    SDVersion version_ = VERSION_COUNT;
    std::vector<std::string> file_paths_;
-    std::vector<TensorStorage> tensor_storages;
+    String2TensorStorage tensor_storage_map;
+
+    void add_tensor_storage(const TensorStorage& tensor_storage);

    bool parse_data_pkl(uint8_t* buffer,
                        size_t buffer_size,
                        zip_t* zip,
                        std::string dir,
                        size_t file_index,
-                        const std::string& prefix);
+                        const std::string prefix);

    bool init_from_gguf_file(const std::string& file_path, const std::string& prefix = "");
    bool init_from_safetensors_file(const std::string& file_path, const std::string& prefix = "");
@ -149,22 +299,41 @@ protected:

 public:
    bool init_from_file(const std::string& file_path, const std::string& prefix = "");
+    void convert_tensors_name();
+    bool init_from_file_and_convert_name(const std::string& file_path,
+                                         const std::string& prefix = "",
+                                         SDVersion version         = VERSION_COUNT);
    SDVersion get_sd_version();
-    ggml_type get_sd_wtype();
-    ggml_type get_conditioner_wtype();
-    ggml_type get_diffusion_model_wtype();
-    ggml_type get_vae_wtype();
-    bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend);
+    std::map<ggml_type, uint32_t> get_wtype_stat();
+    std::map<ggml_type, uint32_t> get_conditioner_wtype_stat();
+    std::map<ggml_type, uint32_t> get_diffusion_model_wtype_stat();
+    std::map<ggml_type, uint32_t> get_vae_wtype_stat();
+    String2TensorStorage& get_tensor_storage_map() { return tensor_storage_map; }
+    void set_wtype_override(ggml_type wtype, std::string tensor_type_rules = "");
+    bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0);
    bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
-                      ggml_backend_t backend,
-                      std::set<std::string> ignore_tensors = {});
-    bool save_to_gguf_file(const std::string& file_path, ggml_type type);
+                      std::set<std::string> ignore_tensors = {},
+                      int n_threads                        = 0);
+
+    // Returns the keys of tensor_storage_map, in the map's iteration order.
+    std::vector<std::string> get_tensor_names() const {
+        std::vector<std::string> result;
+        for (auto it = tensor_storage_map.begin(); it != tensor_storage_map.end(); ++it) {
+            result.push_back(it->first);
+        }
+        return result;
+    }
+
+    bool save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules);
    bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type);
    int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT);
    ~ModelLoader() = default;

    static std::string load_merges();
+    static std::string load_qwen2_merges();
+    static std::string load_mistral_merges();
+    static std::string load_mistral_vocab_json();
    static std::string load_t5_tokenizer_json();
+    static std::string load_umt5_tokenizer_json();
 };

 #endif  // __MODEL_H__
--- a/name_conversion.cpp
+++ b/name_conversion.cpp
--- a/name_conversion.h
+++ b/name_conversion.h
@ -0,0 +1,14 @@
+#ifndef __NAME_CONVERSION_H__
+#define __NAME_CONVERSION_H__
+
+#include <string>
+
+#include "model.h"
+
+// Tensor-name classification helpers.
+// NOTE(review): judging by the names only, each presumably reports whether
+// `name` belongs to the conditioner (cond stage), the diffusion model, or the
+// first-stage model respectively - confirm against name_conversion.cpp.
+bool is_cond_stage_model_name(const std::string& name);
+bool is_diffusion_model_name(const std::string& name);
+bool is_first_stage_model_name(const std::string& name);
+
+// Returns the converted form of `name` for the given model `version`.
+// NOTE(review): the mapping rules live in name_conversion.cpp and are not
+// visible from this header.
+std::string convert_tensor_name(std::string name, SDVersion version);
+
+#endif  // __NAME_CONVERSION_H__
--- a/ordered_map.hpp
+++ b/ordered_map.hpp
@ -0,0 +1,177 @@
+#ifndef __ORDERED_MAP_HPP__
+#define __ORDERED_MAP_HPP__
+
+#include <iostream>
+#include <list>
+#include <string>
+#include <unordered_map>
+
+#include <initializer_list>
+#include <iterator>
+#include <list>
+#include <stdexcept>
+#include <unordered_map>
+#include <utility>
+
+// Associative container that iterates in insertion order.
+//
+// Entries are stored in a std::list, which fixes the iteration order, and an
+// unordered_map from key to list iterator provides hashed lookup. Inserting a
+// key that is already present never overwrites the stored value and never
+// changes its position.
+//
+// Because the index stores iterators into this object's own list, copying
+// must rebuild the index rather than copy it member-wise; otherwise the copy
+// would look up (and erase) nodes that belong to the source object.
+template <typename Key, typename T>
+class OrderedMap {
+public:
+    using key_type        = Key;
+    using mapped_type     = T;
+    using value_type      = std::pair<const Key, T>;
+    using list_type       = std::list<value_type>;
+    using size_type       = typename list_type::size_type;
+    using difference_type = typename list_type::difference_type;
+    using iterator        = typename list_type::iterator;
+    using const_iterator  = typename list_type::const_iterator;
+
+private:
+    list_type data_;                           // entries, in insertion order
+    std::unordered_map<Key, iterator> index_;  // key -> node inside data_
+
+    // Re-points index_ at the nodes of this object's own data_.
+    void rebuild_index() {
+        index_.clear();
+        index_.reserve(data_.size());
+        for (auto it = data_.begin(); it != data_.end(); ++it) {
+            index_.emplace(it->first, it);
+        }
+    }
+
+public:
+    // --- constructors ---
+    OrderedMap() = default;
+
+    // Inserts the entries in order; later duplicates of a key are ignored.
+    OrderedMap(std::initializer_list<value_type> init) {
+        for (const auto& kv : init)
+            insert(kv);
+    }
+
+    // Deep copy: duplicates the entries, then indexes the new nodes.
+    OrderedMap(const OrderedMap& other)
+        : data_(other.data_) {
+        rebuild_index();
+    }
+
+    // Moving a std::list keeps iterators to its elements valid, so the moved
+    // index still refers to the (now owned) nodes.
+    OrderedMap(OrderedMap&&) noexcept = default;
+
+    // Copy-and-swap: avoids element-wise assignment, which is not available
+    // for pair<const Key, T>, and leaves *this untouched if the copy throws.
+    OrderedMap& operator=(const OrderedMap& other) {
+        if (this != &other) {
+            OrderedMap tmp(other);
+            swap(tmp);
+        }
+        return *this;
+    }
+
+    OrderedMap& operator=(OrderedMap&&) noexcept = default;
+
+    // --- element access ---
+    // Returns the value for `key`; throws std::out_of_range if absent.
+    T& at(const Key& key) {
+        auto it = index_.find(key);
+        if (it == index_.end())
+            throw std::out_of_range("OrderedMap::at: key not found");
+        return it->second->second;
+    }
+
+    const T& at(const Key& key) const {
+        auto it = index_.find(key);
+        if (it == index_.end())
+            throw std::out_of_range("OrderedMap::at: key not found");
+        return it->second->second;
+    }
+
+    // Returns the value for `key`, appending a value-initialized entry first
+    // when the key is not present.
+    T& operator[](const Key& key) {
+        auto it = index_.find(key);
+        if (it == index_.end()) {
+            data_.emplace_back(key, T{});
+            auto iter   = std::prev(data_.end());
+            index_[key] = iter;
+            return iter->second;
+        }
+        return it->second->second;
+    }
+
+    // --- iterators ---
+    iterator begin() noexcept { return data_.begin(); }
+    const_iterator begin() const noexcept { return data_.begin(); }
+    const_iterator cbegin() const noexcept { return data_.cbegin(); }
+
+    iterator end() noexcept { return data_.end(); }
+    const_iterator end() const noexcept { return data_.end(); }
+    const_iterator cend() const noexcept { return data_.cend(); }
+
+    // --- capacity ---
+    bool empty() const noexcept { return data_.empty(); }
+    size_type size() const noexcept { return data_.size(); }
+
+    // --- modifiers ---
+    void clear() noexcept {
+        data_.clear();
+        index_.clear();
+    }
+
+    // Appends `value` unless its key already exists. Returns the iterator to
+    // the entry for that key and whether an insertion took place.
+    std::pair<iterator, bool> insert(const value_type& value) {
+        auto it = index_.find(value.first);
+        if (it != index_.end()) {
+            return {it->second, false};
+        }
+        data_.push_back(value);
+        auto iter           = std::prev(data_.end());
+        index_[value.first] = iter;
+        return {iter, true};
+    }
+
+    std::pair<iterator, bool> insert(value_type&& value) {
+        auto it = index_.find(value.first);
+        if (it != index_.end()) {
+            return {it->second, false};
+        }
+        data_.push_back(std::move(value));
+        auto iter = std::prev(data_.end());
+        // `value` has been moved from, so take the key from the stored node.
+        index_[iter->first] = iter;
+        return {iter, true};
+    }
+
+    // Removes the entry for `key`; does nothing when the key is absent.
+    void erase(const Key& key) {
+        auto it = index_.find(key);
+        if (it != index_.end()) {
+            data_.erase(it->second);
+            index_.erase(it);
+        }
+    }
+
+    // Removes the entry at `pos` and returns the iterator following it.
+    iterator erase(iterator pos) {
+        index_.erase(pos->first);
+        return data_.erase(pos);
+    }
+
+    // --- lookup ---
+    size_type count(const Key& key) const {
+        return index_.count(key);
+    }
+
+    // Returns the iterator to the entry for `key`, or end() when absent.
+    iterator find(const Key& key) {
+        auto it = index_.find(key);
+        if (it == index_.end())
+            return data_.end();
+        return it->second;
+    }
+
+    const_iterator find(const Key& key) const {
+        auto it = index_.find(key);
+        if (it == index_.end())
+            return data_.end();
+        return it->second;
+    }
+
+    bool contains(const Key& key) const {
+        return index_.find(key) != index_.end();
+    }
+
+    // --- comparison ---
+    // Equal when both hold the same entries in the same order.
+    bool operator==(const OrderedMap& other) const {
+        return data_ == other.data_;
+    }
+
+    bool operator!=(const OrderedMap& other) const {
+        return !(*this == other);
+    }
+
+    // Constructs a value_type from `args` and appends it unless its key
+    // already exists (in which case the constructed value is discarded).
+    template <typename... Args>
+    std::pair<iterator, bool> emplace(Args&&... args) {
+        value_type value(std::forward<Args>(args)...);
+        auto it = index_.find(value.first);
+        if (it != index_.end()) {
+            return {it->second, false};
+        }
+        data_.push_back(std::move(value));
+        auto iter           = std::prev(data_.end());
+        index_[iter->first] = iter;
+        return {iter, true};
+    }
+
+    // Swapping lists keeps element iterators valid, so both indexes stay
+    // consistent with the list they end up next to.
+    void swap(OrderedMap& other) noexcept {
+        data_.swap(other.data_);
+        index_.swap(other.index_);
+    }
+};
+
+#endif  // __ORDERED_MAP_HPP__
--- a/pmid.hpp
+++ b/pmid.hpp
@ -21,7 +21,7 @@ public:
        blocks["layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(in_dim));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [N, channels, h, w]

        auto fc1        = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
@ -29,19 +29,221 @@ public:
        auto layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["layernorm"]);

        struct ggml_tensor* r = x;
-        // x = ggml_nn_layer_norm(ctx, x, ln_w, ln_b);
+        // x = ggml_ext_layer_norm(ctx, x, ln_w, ln_b);
        x = layer_norm->forward(ctx, x);
        // x = ggml_add(ctx, ggml_mul_mat(ctx, fc1_w, x),  fc1_b);
        x = fc1->forward(ctx, x);
-        x = ggml_gelu_inplace(ctx, x);
+        x = ggml_gelu_inplace(ctx->ggml_ctx, x);
        x = fc2->forward(ctx, x);
        // x = ggml_add(ctx, ggml_mul_mat(ctx, fc2_w, x),  fc2_b);
        if (use_residue)
-            x = ggml_add(ctx, x, r);
+            x = ggml_add(ctx->ggml_ctx, x, r);
        return x;
    }
 };

+// Feed-forward sub-block: blocks["0"] is a LayerNorm(dim) and blocks["1"] an
+// Mlp(dim, dim * multi, dim, false); forward() applies them in that order.
+struct PMFeedForward : public GGMLBlock {
+    // network hparams
+    int dim;
+
+public:
+    PMFeedForward(int d, int multi = 4)
+        : dim(d) {
+        const int hidden_dim = dim * multi;
+        blocks["0"]          = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
+        blocks["1"]          = std::shared_ptr<GGMLBlock>(new Mlp(dim, hidden_dim, dim, false));
+    }
+
+    // Normalizes x, then runs the result through the MLP.
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                struct ggml_tensor* x) {
+        auto layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["0"]);
+        auto mlp        = std::dynamic_pointer_cast<Mlp>(blocks["1"]);
+
+        struct ggml_tensor* normed = layer_norm->forward(ctx, x);
+        return mlp->forward(ctx, normed);
+    }
+};
+
+// Cross-attention block used by the perceiver resampler: queries are projected
+// from `latents`, keys and values from the concatenation of `x` and `latents`
+// along dim 1.
+struct PerceiverAttention : public GGMLBlock {
+    // network hparams
+    float scale;   // dim_head**-0.5 after construction; forward() overwrites it with dim_head**-0.25
+    int dim_head;  // = dim_head
+    int heads;     // = heads
+public:
+    PerceiverAttention(int dim, int dim_h = 64, int h = 8)
+        : scale(powf(dim_h, -0.5)), dim_head(dim_h), heads(h) {
+        int inner_dim    = dim_head * heads;
+        blocks["norm1"]  = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
+        blocks["norm2"]  = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
+        blocks["to_q"]   = std::shared_ptr<GGMLBlock>(new Linear(dim, inner_dim, false));
+        blocks["to_kv"]  = std::shared_ptr<GGMLBlock>(new Linear(dim, inner_dim * 2, false));
+        blocks["to_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim, false));
+    }
+
+    // Splits dim 0 of `x` into `heads` groups of x->ne[0] / heads elements and
+    // swaps the new head axis with the axis that follows it; the result is
+    // made contiguous.
+    struct ggml_tensor* reshape_tensor(struct ggml_context* ctx,
+                                       struct ggml_tensor* x,
+                                       int heads) {
+        x = ggml_reshape_4d(ctx, x, x->ne[0] / heads, heads, x->ne[1], x->ne[2]);
+        x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));
+        return x;
+    }
+
+    // Splits `x` in two along dim 0. Both views keep the row strides of `x`
+    // (nb[1..3]); the second view starts half a row into the data. Returns
+    // contiguous copies {first half, second half}.
+    std::vector<struct ggml_tensor*> chunk_half(struct ggml_context* ctx,
+                                                struct ggml_tensor* x) {
+        auto tlo = ggml_view_4d(ctx, x, x->ne[0] / 2, x->ne[1], x->ne[2], x->ne[3], x->nb[1], x->nb[2], x->nb[3], 0);
+        auto tli = ggml_view_4d(ctx, x, x->ne[0] / 2, x->ne[1], x->ne[2], x->ne[3], x->nb[1], x->nb[2], x->nb[3], x->nb[0] * x->ne[0] / 2);
+        return {ggml_cont(ctx, tlo),
+                ggml_cont(ctx, tli)};
+    }
+
+    // Builds the attention graph and returns the to_out projection of the
+    // attended values, reshaped to the first two dims of `latents`.
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                struct ggml_tensor* x,
+                                struct ggml_tensor* latents) {
+        // x (torch.Tensor): image features
+        //     shape (b, n1, D)
+        // latent (torch.Tensor): latent features
+        //     shape (b, n2, D)
+
+        // remember the incoming latent dims; they define the output shape below
+        int64_t ne[4];
+        for (int i = 0; i < 4; ++i)
+            ne[i] = latents->ne[i];
+
+        auto norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm1"]);
+        auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
+        x          = norm1->forward(ctx, x);
+        latents    = norm2->forward(ctx, latents);
+        auto to_q  = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
+        auto q     = to_q->forward(ctx, latents);
+
+        auto kv_input = ggml_concat(ctx->ggml_ctx, x, latents, 1);
+        auto to_kv    = std::dynamic_pointer_cast<Linear>(blocks["to_kv"]);
+        auto kv       = to_kv->forward(ctx, kv_input);
+        // k is the first half of every kv row, v the second half. The split has
+        // to keep kv's own row strides: views built with halved strides walk the
+        // buffer as if its rows were half as wide and therefore interleave k and
+        // v half-rows. chunk_half() performs the split with the full strides and
+        // already returns contiguous tensors.
+        // NOTE(review): this changes the values fed to attention compared with
+        // the previous halved-stride views - confirm against reference outputs.
+        auto kv_halves = chunk_half(ctx->ggml_ctx, kv);
+        auto k         = kv_halves[0];
+        auto v         = kv_halves[1];
+        q              = reshape_tensor(ctx->ggml_ctx, q, heads);
+        k              = reshape_tensor(ctx->ggml_ctx, k, heads);
+        v              = reshape_tensor(ctx->ggml_ctx, v, heads);
+        // dim_head**-0.25 is applied to both q and k, so their product is
+        // scaled by dim_head**-0.5 overall
+        scale = 1.f / sqrt(sqrt((float)dim_head));
+        k     = ggml_scale_inplace(ctx->ggml_ctx, k, scale);
+        q     = ggml_scale_inplace(ctx->ggml_ctx, q, scale);
+        auto weight = ggml_mul_mat(ctx->ggml_ctx, k, q);  // NOTE order of mul is opposite to pytorch
+
+        // GGML's softmax() is equivalent to pytorch's softmax(x, dim=-1)
+        // in this case, dimension along which Softmax will be computed is the last dim
+        // in torch and the first dim in GGML, consistent with the convention that pytorch's
+        // last dimension (varying most rapidly) corresponds to GGML's first (varying most rapidly).
+        weight = ggml_soft_max_inplace(ctx->ggml_ctx, weight);
+        v      = ggml_cont(ctx->ggml_ctx, ggml_transpose(ctx->ggml_ctx, v));
+        auto out    = ggml_mul_mat(ctx->ggml_ctx, v, weight);  // NOTE order of mul is opposite to pytorch
+        out         = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, out, 0, 2, 1, 3));
+        out         = ggml_reshape_3d(ctx->ggml_ctx, out, ne[0], ne[1], ggml_nelements(out) / (ne[0] * ne[1]));
+        auto to_out = std::dynamic_pointer_cast<Linear>(blocks["to_out"]);
+        out         = to_out->forward(ctx, out);
+        return out;
+    }
+};
+
+// Stack of `depth` (PerceiverAttention, PMFeedForward) pairs registered as
+// "layers.<i>.0" / "layers.<i>.1", framed by an input projection applied to
+// `x` and an output projection + LayerNorm applied to the latents.
+struct FacePerceiverResampler : public GGMLBlock {
+    // network hparams
+    int depth;
+
+public:
+    FacePerceiverResampler(int dim           = 768,
+                           int d             = 4,
+                           int dim_head      = 64,
+                           int heads         = 16,
+                           int embedding_dim = 1280,
+                           int output_dim    = 768,
+                           int ff_mult       = 4)
+        : depth(d) {
+        blocks["proj_in"]  = std::shared_ptr<GGMLBlock>(new Linear(embedding_dim, dim, true));
+        blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(dim, output_dim, true));
+        blocks["norm_out"] = std::shared_ptr<GGMLBlock>(new LayerNorm(output_dim));
+
+        for (int layer = 0; layer < depth; ++layer) {
+            const std::string attn_key = "layers." + std::to_string(layer) + ".0";
+            blocks[attn_key]           = std::shared_ptr<GGMLBlock>(new PerceiverAttention(dim, dim_head, heads));
+            const std::string ff_key   = "layers." + std::to_string(layer) + ".1";
+            blocks[ff_key]             = std::shared_ptr<GGMLBlock>(new PMFeedForward(dim, ff_mult));
+        }
+    }
+
+    // Projects `x` once, then for every layer adds the attention output and
+    // afterwards the feed-forward output onto `latents` (residual adds).
+    // Returns norm_out(proj_out(latents)).
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                struct ggml_tensor* latents,
+                                struct ggml_tensor* x) {
+        auto proj_in  = std::dynamic_pointer_cast<Linear>(blocks["proj_in"]);
+        auto proj_out = std::dynamic_pointer_cast<Linear>(blocks["proj_out"]);
+        auto norm_out = std::dynamic_pointer_cast<LayerNorm>(blocks["norm_out"]);
+
+        x = proj_in->forward(ctx, x);
+        for (int layer = 0; layer < depth; ++layer) {
+            std::string attn_key = "layers." + std::to_string(layer) + ".0";
+            auto attn            = std::dynamic_pointer_cast<PerceiverAttention>(blocks[attn_key]);
+            std::string ff_key   = "layers." + std::to_string(layer) + ".1";
+            auto ff              = std::dynamic_pointer_cast<PMFeedForward>(blocks[ff_key]);
+
+            struct ggml_tensor* attended = attn->forward(ctx, x, latents);
+            latents                      = ggml_add(ctx->ggml_ctx, attended, latents);
+
+            struct ggml_tensor* fed = ff->forward(ctx, latents);
+            latents                 = ggml_add(ctx->ggml_ctx, fed, latents);
+        }
+
+        struct ggml_tensor* projected = proj_out->forward(ctx, latents);
+        return norm_out->forward(ctx, projected);
+    }
+};
+
+// Expands an id embedding into `num_tokens` tokens of width
+// `cross_attention_dim` (token_proj + token_norm) and refines them with a
+// FacePerceiverResampler that attends to `last_hidden_state`.
+struct QFormerPerceiver : public GGMLBlock {
+    // network hparams
+    int num_tokens;
+    int cross_attention_dim;
+    bool use_residul;  // when true, forward() adds the normalized tokens onto the resampler output
+
+public:
+    // Members are initialized in declaration order (num_tokens first), so the
+    // initializer list is written in that order as well.
+    QFormerPerceiver(int id_embeddings_dim, int cross_attention_d, int num_t, int embedding_dim = 1024, bool use_r = true, int ratio = 4)
+        : num_tokens(num_t), cross_attention_dim(cross_attention_d), use_residul(use_r) {
+        blocks["token_proj"]          = std::shared_ptr<GGMLBlock>(new Mlp(id_embeddings_dim,
+                                                                           id_embeddings_dim * ratio,
+                                                                           cross_attention_dim * num_tokens,
+                                                                           true));
+        blocks["token_norm"]          = std::shared_ptr<GGMLBlock>(new LayerNorm(cross_attention_d));
+        blocks["perceiver_resampler"] = std::shared_ptr<GGMLBlock>(new FacePerceiverResampler(
+            cross_attention_dim,
+            4,
+            128,
+            cross_attention_dim / 128,
+            embedding_dim,
+            cross_attention_dim,
+            4));
+    }
+
+    // x:                 id embedding fed to token_proj
+    //                    (presumably id_embeddings_dim wide - TODO confirm)
+    // last_hidden_state: features the resampler attends to
+    // Returns the resampler output, plus the normalized tokens when
+    // use_residul is set.
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                struct ggml_tensor* x,
+                                struct ggml_tensor* last_hidden_state) {
+        auto token_proj          = std::dynamic_pointer_cast<Mlp>(blocks["token_proj"]);
+        auto token_norm          = std::dynamic_pointer_cast<LayerNorm>(blocks["token_norm"]);
+        auto perceiver_resampler = std::dynamic_pointer_cast<FacePerceiverResampler>(blocks["perceiver_resampler"]);
+
+        x           = token_proj->forward(ctx, x);
+        int64_t nel = ggml_nelements(x);
+        // regroup the projection into [cross_attention_dim, num_tokens, remaining elements]
+        x = ggml_reshape_3d(ctx->ggml_ctx, x, cross_attention_dim, num_tokens, nel / (cross_attention_dim * num_tokens));
+        x = token_norm->forward(ctx, x);
+        // the tokens act as the resampler's latents, last_hidden_state as its input features
+        struct ggml_tensor* out = perceiver_resampler->forward(ctx, x, last_hidden_state);
+        if (use_residul)
+            out = ggml_add(ctx->ggml_ctx, x, out);
+        return out;
+    }
+};
+
 struct FuseModule : public GGMLBlock {
    // network hparams
    int embed_dim;
@ -54,33 +256,24 @@ public:
        blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(embed_dim));
    }

-    struct ggml_tensor* fuse_fn(struct ggml_context* ctx,
+    struct ggml_tensor* fuse_fn(GGMLRunnerContext* ctx,
                                struct ggml_tensor* prompt_embeds,
                                struct ggml_tensor* id_embeds) {
        auto mlp1       = std::dynamic_pointer_cast<FuseBlock>(blocks["mlp1"]);
        auto mlp2       = std::dynamic_pointer_cast<FuseBlock>(blocks["mlp2"]);
        auto layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm"]);

-        auto prompt_embeds0 = ggml_cont(ctx, ggml_permute(ctx, prompt_embeds, 2, 0, 1, 3));
-        auto id_embeds0     = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 2, 0, 1, 3));
-        // concat is along dim 2
-        auto stacked_id_embeds = ggml_concat(ctx, prompt_embeds0, id_embeds0, 2);
-        stacked_id_embeds      = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 1, 2, 0, 3));
-
-        // stacked_id_embeds = mlp1.forward(ctx, stacked_id_embeds);
-        // stacked_id_embeds = ggml_add(ctx, stacked_id_embeds, prompt_embeds);
-        // stacked_id_embeds = mlp2.forward(ctx, stacked_id_embeds);
-        // stacked_id_embeds = ggml_nn_layer_norm(ctx, stacked_id_embeds, ln_w, ln_b);
+        auto stacked_id_embeds = ggml_concat(ctx->ggml_ctx, prompt_embeds, id_embeds, 0);

        stacked_id_embeds = mlp1->forward(ctx, stacked_id_embeds);
-        stacked_id_embeds = ggml_add(ctx, stacked_id_embeds, prompt_embeds);
+        stacked_id_embeds = ggml_add(ctx->ggml_ctx, stacked_id_embeds, prompt_embeds);
        stacked_id_embeds = mlp2->forward(ctx, stacked_id_embeds);
        stacked_id_embeds = layer_norm->forward(ctx, stacked_id_embeds);

        return stacked_id_embeds;
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* prompt_embeds,
                                struct ggml_tensor* id_embeds,
                                struct ggml_tensor* class_tokens_mask,
@ -91,29 +284,27 @@ public:

        struct ggml_tensor* valid_id_embeds = id_embeds;
        // # slice out the image token embeddings
-        // print_ggml_tensor(class_tokens_mask_pos, false);
        ggml_set_name(class_tokens_mask_pos, "class_tokens_mask_pos");
        ggml_set_name(prompt_embeds, "prompt_embeds");
-        // print_ggml_tensor(valid_id_embeds, true, "valid_id_embeds");
-        // print_ggml_tensor(class_tokens_mask_pos, true, "class_tokens_mask_pos");
-        struct ggml_tensor* image_token_embeds = ggml_get_rows(ctx, prompt_embeds, class_tokens_mask_pos);
+        struct ggml_tensor* image_token_embeds = ggml_get_rows(ctx->ggml_ctx, prompt_embeds, class_tokens_mask_pos);
        ggml_set_name(image_token_embeds, "image_token_embeds");
+        valid_id_embeds                       = ggml_reshape_2d(ctx->ggml_ctx, valid_id_embeds, valid_id_embeds->ne[0],
+                                                                ggml_nelements(valid_id_embeds) / valid_id_embeds->ne[0]);
        struct ggml_tensor* stacked_id_embeds = fuse_fn(ctx, image_token_embeds, valid_id_embeds);

-        stacked_id_embeds = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 0, 2, 1, 3));
        if (left && right) {
-            stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds, 2);
-            stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right, 2);
+            stacked_id_embeds = ggml_concat(ctx->ggml_ctx, left, stacked_id_embeds, 1);
+            stacked_id_embeds = ggml_concat(ctx->ggml_ctx, stacked_id_embeds, right, 1);
        } else if (left) {
-            stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds, 2);
+            stacked_id_embeds = ggml_concat(ctx->ggml_ctx, left, stacked_id_embeds, 1);
        } else if (right) {
-            stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right, 2);
+            stacked_id_embeds = ggml_concat(ctx->ggml_ctx, stacked_id_embeds, right, 1);
        }
-        stacked_id_embeds                         = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 0, 2, 1, 3));
-        class_tokens_mask                         = ggml_cont(ctx, ggml_transpose(ctx, class_tokens_mask));
-        class_tokens_mask                         = ggml_repeat(ctx, class_tokens_mask, prompt_embeds);
-        prompt_embeds                             = ggml_mul(ctx, prompt_embeds, class_tokens_mask);
-        struct ggml_tensor* updated_prompt_embeds = ggml_add(ctx, prompt_embeds, stacked_id_embeds);
+
+        class_tokens_mask                         = ggml_cont(ctx->ggml_ctx, ggml_transpose(ctx->ggml_ctx, class_tokens_mask));
+        class_tokens_mask                         = ggml_repeat(ctx->ggml_ctx, class_tokens_mask, prompt_embeds);
+        prompt_embeds                             = ggml_mul(ctx->ggml_ctx, prompt_embeds, class_tokens_mask);
+        struct ggml_tensor* updated_prompt_embeds = ggml_add(ctx->ggml_ctx, prompt_embeds, stacked_id_embeds);
        ggml_set_name(updated_prompt_embeds, "updated_prompt_embeds");
        return updated_prompt_embeds;
    }
@ -126,7 +317,7 @@ struct PhotoMakerIDEncoderBlock : public CLIPVisionModelProjection {
        blocks["fuse_module"]         = std::shared_ptr<GGMLBlock>(new FuseModule(2048));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* id_pixel_values,
                                struct ggml_tensor* prompt_embeds,
                                struct ggml_tensor* class_tokens_mask,
@ -143,11 +334,53 @@ struct PhotoMakerIDEncoderBlock : public CLIPVisionModelProjection {
        struct ggml_tensor* id_embeds        = visual_projection->forward(ctx, shared_id_embeds);    // [N, proj_dim(768)]
        struct ggml_tensor* id_embeds_2      = visual_projection_2->forward(ctx, shared_id_embeds);  // [N, 1280]

-        id_embeds   = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 2, 0, 1, 3));
-        id_embeds_2 = ggml_cont(ctx, ggml_permute(ctx, id_embeds_2, 2, 0, 1, 3));
+        id_embeds   = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, id_embeds, 2, 0, 1, 3));
+        id_embeds_2 = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, id_embeds_2, 2, 0, 1, 3));

-        id_embeds = ggml_concat(ctx, id_embeds, id_embeds_2, 2);  // [batch_size, seq_length, 1, 2048] check whether concat at dim 2 is right
-        id_embeds = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 1, 2, 0, 3));
+        id_embeds = ggml_concat(ctx->ggml_ctx, id_embeds, id_embeds_2, 2);  // [batch_size, seq_length, 1, 2048] check whether concat at dim 2 is right
+        id_embeds = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, id_embeds, 1, 2, 0, 3));
+
+        struct ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx,
+                                                                         prompt_embeds,
+                                                                         id_embeds,
+                                                                         class_tokens_mask,
+                                                                         class_tokens_mask_pos,
+                                                                         left, right);
+        return updated_prompt_embeds;
+    }
+};
+
+struct PhotoMakerIDEncoder_CLIPInsightfaceExtendtokenBlock : public CLIPVisionModelProjection {
+    int cross_attention_dim;
+    int num_tokens;
+
+    PhotoMakerIDEncoder_CLIPInsightfaceExtendtokenBlock(int id_embeddings_dim = 512)
+        : CLIPVisionModelProjection(OPENAI_CLIP_VIT_L_14),
+          cross_attention_dim(2048),
+          num_tokens(2) {
+        blocks["visual_projection_2"] = std::shared_ptr<GGMLBlock>(new Linear(1024, 1280, false));
+        blocks["fuse_module"]         = std::shared_ptr<GGMLBlock>(new FuseModule(2048));
+        blocks["qformer_perceiver"]   = std::shared_ptr<GGMLBlock>(new QFormerPerceiver(id_embeddings_dim,
+                                                                                        cross_attention_dim,
+                                                                                        num_tokens));
+    }
+
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                struct ggml_tensor* id_pixel_values,
+                                struct ggml_tensor* prompt_embeds,
+                                struct ggml_tensor* class_tokens_mask,
+                                struct ggml_tensor* class_tokens_mask_pos,
+                                struct ggml_tensor* id_embeds,
+                                struct ggml_tensor* left,
+                                struct ggml_tensor* right) {
+        // x: [N, channels, h, w]
+        auto vision_model      = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]);
+        auto fuse_module       = std::dynamic_pointer_cast<FuseModule>(blocks["fuse_module"]);
+        auto qformer_perceiver = std::dynamic_pointer_cast<QFormerPerceiver>(blocks["qformer_perceiver"]);
+
+        // struct ggml_tensor* last_hidden_state = vision_model->forward(ctx, id_pixel_values);          // [N, hidden_size]
+        struct ggml_tensor* last_hidden_state = vision_model->forward(ctx, id_pixel_values, false);  // [N, hidden_size]
+        id_embeds                             = qformer_perceiver->forward(ctx, id_embeds, last_hidden_state);

        struct ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx,
                                                                         prompt_embeds,
@ -162,7 +395,9 @@ struct PhotoMakerIDEncoderBlock : public CLIPVisionModelProjection {
 struct PhotoMakerIDEncoder : public GGMLRunner {
 public:
    SDVersion version    = VERSION_SDXL;
+    PMVersion pm_version = PM_VERSION_1;
    PhotoMakerIDEncoderBlock id_encoder;
+    PhotoMakerIDEncoder_CLIPInsightfaceExtendtokenBlock id_encoder2;
    float style_strength;

    std::vector<float> ctm;
@ -175,25 +410,44 @@ public:
    std::vector<float> zeros_right;

 public:
-    PhotoMakerIDEncoder(ggml_backend_t backend, ggml_type wtype, SDVersion version = VERSION_SDXL, float sty = 20.f)
-        : GGMLRunner(backend, wtype),
+    PhotoMakerIDEncoder(ggml_backend_t backend,
+                        bool offload_params_to_cpu,
+                        const String2TensorStorage& tensor_storage_map,
+                        const std::string prefix,
+                        SDVersion version = VERSION_SDXL,
+                        PMVersion pm_v    = PM_VERSION_1,
+                        float sty         = 20.f)
+        : GGMLRunner(backend, offload_params_to_cpu),
          version(version),
+          pm_version(pm_v),
          style_strength(sty) {
-        id_encoder.init(params_ctx, wtype);
+        if (pm_version == PM_VERSION_1) {
+            id_encoder.init(params_ctx, tensor_storage_map, prefix);
+        } else if (pm_version == PM_VERSION_2) {
+            id_encoder2.init(params_ctx, tensor_storage_map, prefix);
+        }
    }

    std::string get_desc() {
        return "pmid";
    }

+    PMVersion get_version() const {
+        return pm_version;
+    }
+
    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
+        if (pm_version == PM_VERSION_1)
            id_encoder.get_param_tensors(tensors, prefix);
+        else if (pm_version == PM_VERSION_2)
+            id_encoder2.get_param_tensors(tensors, prefix);
    }

    struct ggml_cgraph* build_graph(  // struct ggml_allocr* allocr,
        struct ggml_tensor* id_pixel_values,
        struct ggml_tensor* prompt_embeds,
-        std::vector<bool>& class_tokens_mask) {
+        std::vector<bool>& class_tokens_mask,
+        struct ggml_tensor* id_embeds) {
        ctm.clear();
        ctmf16.clear();
        ctmpos.clear();
@ -202,7 +456,7 @@ public:
        zeros_right.clear();
        zeros_right_16.clear();

-        ggml_context* ctx0 = compute_ctx;
+        auto runner_ctx = get_context();

        struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);

@ -210,31 +464,38 @@ public:
        int64_t seq_length  = prompt_embeds->ne[1];
        ggml_type type      = GGML_TYPE_F32;

-        struct ggml_tensor* class_tokens_mask_d = ggml_new_tensor_1d(ctx0, type, class_tokens_mask.size());
+        struct ggml_tensor* class_tokens_mask_d = ggml_new_tensor_1d(runner_ctx.ggml_ctx, type, class_tokens_mask.size());

        struct ggml_tensor* id_pixel_values_d = to_backend(id_pixel_values);
        struct ggml_tensor* prompt_embeds_d   = to_backend(prompt_embeds);
+        struct ggml_tensor* id_embeds_d       = to_backend(id_embeds);

-        struct ggml_tensor* left  = NULL;
-        struct ggml_tensor* right = NULL;
+        struct ggml_tensor* left  = nullptr;
+        struct ggml_tensor* right = nullptr;
        for (int i = 0; i < class_tokens_mask.size(); i++) {
            if (class_tokens_mask[i]) {
+                // printf(" 1,");
                ctm.push_back(0.f);                        // here use 0.f instead of 1.f to make a scale mask
                ctmf16.push_back(ggml_fp32_to_fp16(0.f));  // here use 0.f instead of 1.f to make a scale mask
                ctmpos.push_back(i);
            } else {
+                // printf(" 0,");
                ctm.push_back(1.f);                        // here use 1.f instead of 0.f to make a scale mask
                ctmf16.push_back(ggml_fp32_to_fp16(1.f));  // here use 0.f instead of 1.f to make a scale mask
            }
        }
+        // printf("\n");
        if (ctmpos[0] > 0) {
-            left = ggml_new_tensor_3d(ctx0, type, hidden_size, 1, ctmpos[0]);
+            // left = ggml_new_tensor_3d(runner_ctx.ggml_ctx, type, hidden_size, 1, ctmpos[0]);
+            left = ggml_new_tensor_3d(runner_ctx.ggml_ctx, type, hidden_size, ctmpos[0], 1);
        }
        if (ctmpos[ctmpos.size() - 1] < seq_length - 1) {
-            right = ggml_new_tensor_3d(ctx0, type,
-                                       hidden_size, 1, seq_length - ctmpos[ctmpos.size() - 1] - 1);
+            // right = ggml_new_tensor_3d(runner_ctx.ggml_ctx, type,
+            //                            hidden_size, 1, seq_length - ctmpos[ctmpos.size() - 1] - 1);
+            right = ggml_new_tensor_3d(runner_ctx.ggml_ctx, type,
+                                       hidden_size, seq_length - ctmpos[ctmpos.size() - 1] - 1, 1);
        }
-        struct ggml_tensor* class_tokens_mask_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ctmpos.size());
+        struct ggml_tensor* class_tokens_mask_pos = ggml_new_tensor_1d(runner_ctx.ggml_ctx, GGML_TYPE_I32, ctmpos.size());

        {
            if (type == GGML_TYPE_F16)
@ -265,30 +526,115 @@ public:
                }
            }
        }
-        struct ggml_tensor* updated_prompt_embeds = id_encoder.forward(ctx0,
+        struct ggml_tensor* updated_prompt_embeds = nullptr;
+        if (pm_version == PM_VERSION_1)
+            updated_prompt_embeds = id_encoder.forward(&runner_ctx,
                                                       id_pixel_values_d,
                                                       prompt_embeds_d,
                                                       class_tokens_mask_d,
                                                       class_tokens_mask_pos,
                                                       left, right);
+        else if (pm_version == PM_VERSION_2)
+            updated_prompt_embeds = id_encoder2.forward(&runner_ctx,
+                                                        id_pixel_values_d,
+                                                        prompt_embeds_d,
+                                                        class_tokens_mask_d,
+                                                        class_tokens_mask_pos,
+                                                        id_embeds_d,
+                                                        left, right);
+
        ggml_build_forward_expand(gf, updated_prompt_embeds);

        return gf;
    }

-    void compute(const int n_threads,
+    bool compute(const int n_threads,
                 struct ggml_tensor* id_pixel_values,
                 struct ggml_tensor* prompt_embeds,
+                 struct ggml_tensor* id_embeds,
                 std::vector<bool>& class_tokens_mask,
                 struct ggml_tensor** updated_prompt_embeds,
                 ggml_context* output_ctx) {
        auto get_graph = [&]() -> struct ggml_cgraph* {
            // return build_graph(compute_allocr, id_pixel_values, prompt_embeds, class_tokens_mask);
-            return build_graph(id_pixel_values, prompt_embeds, class_tokens_mask);
+            return build_graph(id_pixel_values, prompt_embeds, class_tokens_mask, id_embeds);
        };

        // GGMLRunner::compute(get_graph, n_threads, updated_prompt_embeds);
-        GGMLRunner::compute(get_graph, n_threads, true, updated_prompt_embeds, output_ctx);
+        return GGMLRunner::compute(get_graph, n_threads, true, updated_prompt_embeds, output_ctx);
+    }
+};
+
+// Loads the tensors of a PhotoMaker id-embedding file into this runner's
+// parameter context and exposes the one named "pmid.id_embeds".
+struct PhotoMakerIDEmbed : public GGMLRunner {
+    std::map<std::string, struct ggml_tensor*> tensors;  // tensor name -> tensor created in params_ctx
+    std::string file_path;
+    ModelLoader* model_loader;
+    bool load_failed = false;  // set when the loader could not be initialized from file_path
+    bool applied     = false;
+
+    // The base class is always constructed before any member, so it is listed
+    // first in the initializer list.
+    PhotoMakerIDEmbed(ggml_backend_t backend,
+                      bool offload_params_to_cpu,
+                      ModelLoader* ml,
+                      const std::string& file_path = "",
+                      const std::string& prefix    = "")
+        : GGMLRunner(backend, offload_params_to_cpu), file_path(file_path), model_loader(ml) {
+        if (!model_loader->init_from_file_and_convert_name(file_path, prefix)) {
+            load_failed = true;
+        }
+    }
+
+    std::string get_desc() {
+        return "id_embeds";
+    }
+
+    // Two-pass load: the first pass (dry run) only creates the destination
+    // tensors, then the parameter buffer is allocated, and the second pass
+    // hands those tensors to the loader as copy targets.
+    // filter_tensor: when true, tensors whose name does not contain
+    //                "pmid.id_embeds" are ignored.
+    // Returns false only when the constructor failed to initialize the loader.
+    // NOTE(review): the results of load_tensors()/alloc_params_buffer() are
+    // not inspected here - confirm whether they can report failure.
+    bool load_from_file(bool filter_tensor, int n_threads) {
+        LOG_INFO("loading PhotoMaker ID Embeds from '%s'", file_path.c_str());
+
+        if (load_failed) {
+            LOG_ERROR("init photomaker id embed from file failed: '%s'", file_path.c_str());
+            return false;
+        }
+
+        bool dry_run = true;
+        std::mutex tensor_mutex;
+        auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
+            const std::string& name = tensor_storage.name;
+
+            if (filter_tensor && !contains(name, "pmid.id_embeds")) {
+                // not an id-embedding tensor: nothing to create or fill
+                return true;
+            }
+            if (dry_run) {
+                std::lock_guard<std::mutex> lock(tensor_mutex);
+                struct ggml_tensor* real = ggml_new_tensor(params_ctx,
+                                                           tensor_storage.type,
+                                                           tensor_storage.n_dims,
+                                                           tensor_storage.ne);
+                tensors[name]            = real;
+            } else {
+                // Read-only lookup: this branch runs without the mutex
+                // (presumably from several loader threads), so it must not go
+                // through operator[], which may insert into the map. A name
+                // that was never registered still yields a null destination.
+                auto pos    = tensors.find(name);
+                *dst_tensor = (pos != tensors.end()) ? pos->second : nullptr;
+            }
+
+            return true;
+        };
+
+        model_loader->load_tensors(on_new_tensor_cb, n_threads);
+        alloc_params_buffer();
+
+        dry_run = false;
+        model_loader->load_tensors(on_new_tensor_cb, n_threads);
+
+        LOG_DEBUG("finished loading PhotoMaker ID Embeds ");
+        return true;
+    }
+
+    // Returns the tensor registered as "pmid.id_embeds", or nullptr when no
+    // such tensor has been loaded.
+    struct ggml_tensor* get() {
+        std::map<std::string, struct ggml_tensor*>::iterator pos;
+        pos = tensors.find("pmid.id_embeds");
+        if (pos != tensors.end())
+            return pos->second;
+        return nullptr;
    }
 };

--- a/preprocessing.hpp
+++ b/preprocessing.hpp
@ -6,8 +6,8 @@

 void convolve(struct ggml_tensor* input, struct ggml_tensor* output, struct ggml_tensor* kernel, int padding) {
    struct ggml_init_params params;
-    params.mem_size                 = 20 * 1024 * 1024;  // 10
-    params.mem_buffer               = NULL;
+    params.mem_size                 = 80 * input->ne[0] * input->ne[1];  // 20M for 512x512
+    params.mem_buffer               = nullptr;
    params.no_alloc                 = false;
    struct ggml_context* ctx0       = ggml_init(params);
    struct ggml_tensor* kernel_fp16 = ggml_new_tensor_4d(ctx0, GGML_TYPE_F16, kernel->ne[0], kernel->ne[1], 1, 1);
@ -28,7 +28,7 @@ void gaussian_kernel(struct ggml_tensor* kernel) {
        for (int x = 0; x < kernel->ne[1]; x++) {
            float gy = -ks_mid + x;
            float k_ = expf(-((gx * gx + gy * gy) / (2.0f * powf(sigma, 2.0f)))) * normal;
-            ggml_tensor_set_f32(kernel, k_, x, y);
+            ggml_ext_tensor_set_f32(kernel, k_, x, y);
        }
    }
 }
@ -36,11 +36,11 @@ void gaussian_kernel(struct ggml_tensor* kernel) {
 void grayscale(struct ggml_tensor* rgb_img, struct ggml_tensor* grayscale) {
    for (int iy = 0; iy < rgb_img->ne[1]; iy++) {
        for (int ix = 0; ix < rgb_img->ne[0]; ix++) {
-            float r    = ggml_tensor_get_f32(rgb_img, ix, iy);
-            float g    = ggml_tensor_get_f32(rgb_img, ix, iy, 1);
-            float b    = ggml_tensor_get_f32(rgb_img, ix, iy, 2);
+            float r    = ggml_ext_tensor_get_f32(rgb_img, ix, iy);
+            float g    = ggml_ext_tensor_get_f32(rgb_img, ix, iy, 1);
+            float b    = ggml_ext_tensor_get_f32(rgb_img, ix, iy, 2);
            float gray = 0.2989f * r + 0.5870f * g + 0.1140f * b;
-            ggml_tensor_set_f32(grayscale, gray, ix, iy);
+            ggml_ext_tensor_set_f32(grayscale, gray, ix, iy);
        }
    }
 }
@ -81,37 +81,37 @@ void normalize_tensor(struct ggml_tensor* g) {
 void non_max_supression(struct ggml_tensor* result, struct ggml_tensor* G, struct ggml_tensor* D) {
    for (int iy = 1; iy < result->ne[1] - 1; iy++) {
        for (int ix = 1; ix < result->ne[0] - 1; ix++) {
-            float angle = ggml_tensor_get_f32(D, ix, iy) * 180.0f / M_PI_;
+            float angle = ggml_ext_tensor_get_f32(D, ix, iy) * 180.0f / M_PI_;
            angle       = angle < 0.0f ? angle += 180.0f : angle;
            float q     = 1.0f;
            float r     = 1.0f;

            // angle 0
            if ((0 >= angle && angle < 22.5f) || (157.5f >= angle && angle <= 180)) {
-                q = ggml_tensor_get_f32(G, ix, iy + 1);
-                r = ggml_tensor_get_f32(G, ix, iy - 1);
+                q = ggml_ext_tensor_get_f32(G, ix, iy + 1);
+                r = ggml_ext_tensor_get_f32(G, ix, iy - 1);
            }
            // angle 45
            else if (22.5f >= angle && angle < 67.5f) {
-                q = ggml_tensor_get_f32(G, ix + 1, iy - 1);
-                r = ggml_tensor_get_f32(G, ix - 1, iy + 1);
+                q = ggml_ext_tensor_get_f32(G, ix + 1, iy - 1);
+                r = ggml_ext_tensor_get_f32(G, ix - 1, iy + 1);
            }
            // angle 90
            else if (67.5f >= angle && angle < 112.5) {
-                q = ggml_tensor_get_f32(G, ix + 1, iy);
-                r = ggml_tensor_get_f32(G, ix - 1, iy);
+                q = ggml_ext_tensor_get_f32(G, ix + 1, iy);
+                r = ggml_ext_tensor_get_f32(G, ix - 1, iy);
            }
            // angle 135
            else if (112.5 >= angle && angle < 157.5f) {
-                q = ggml_tensor_get_f32(G, ix - 1, iy - 1);
-                r = ggml_tensor_get_f32(G, ix + 1, iy + 1);
+                q = ggml_ext_tensor_get_f32(G, ix - 1, iy - 1);
+                r = ggml_ext_tensor_get_f32(G, ix + 1, iy + 1);
            }

-            float cur = ggml_tensor_get_f32(G, ix, iy);
+            float cur = ggml_ext_tensor_get_f32(G, ix, iy);
            if ((cur >= q) && (cur >= r)) {
-                ggml_tensor_set_f32(result, cur, ix, iy);
+                ggml_ext_tensor_set_f32(result, cur, ix, iy);
            } else {
-                ggml_tensor_set_f32(result, 0.0f, ix, iy);
+                ggml_ext_tensor_set_f32(result, 0.0f, ix, iy);
            }
        }
    }
@ -138,9 +138,9 @@ void threshold_hystersis(struct ggml_tensor* img, float high_threshold, float lo
    for (int iy = 0; iy < img->ne[1]; iy++) {
        for (int ix = 0; ix < img->ne[0]; ix++) {
            if (ix >= 3 && ix <= img->ne[0] - 3 && iy >= 3 && iy <= img->ne[1] - 3) {
-                ggml_tensor_set_f32(img, ggml_tensor_get_f32(img, ix, iy), ix, iy);
+                ggml_ext_tensor_set_f32(img, ggml_ext_tensor_get_f32(img, ix, iy), ix, iy);
            } else {
-                ggml_tensor_set_f32(img, 0.0f, ix, iy);
+                ggml_ext_tensor_set_f32(img, 0.0f, ix, iy);
            }
        }
    }
@ -148,30 +148,30 @@ void threshold_hystersis(struct ggml_tensor* img, float high_threshold, float lo
    // hysteresis
    for (int iy = 1; iy < img->ne[1] - 1; iy++) {
        for (int ix = 1; ix < img->ne[0] - 1; ix++) {
-            float imd_v = ggml_tensor_get_f32(img, ix, iy);
+            float imd_v = ggml_ext_tensor_get_f32(img, ix, iy);
            if (imd_v == weak) {
-                if (ggml_tensor_get_f32(img, ix + 1, iy - 1) == strong || ggml_tensor_get_f32(img, ix + 1, iy) == strong ||
-                    ggml_tensor_get_f32(img, ix, iy - 1) == strong || ggml_tensor_get_f32(img, ix, iy + 1) == strong ||
-                    ggml_tensor_get_f32(img, ix - 1, iy - 1) == strong || ggml_tensor_get_f32(img, ix - 1, iy) == strong) {
-                    ggml_tensor_set_f32(img, strong, ix, iy);
+                if (ggml_ext_tensor_get_f32(img, ix + 1, iy - 1) == strong || ggml_ext_tensor_get_f32(img, ix + 1, iy) == strong ||
+                    ggml_ext_tensor_get_f32(img, ix, iy - 1) == strong || ggml_ext_tensor_get_f32(img, ix, iy + 1) == strong ||
+                    ggml_ext_tensor_get_f32(img, ix - 1, iy - 1) == strong || ggml_ext_tensor_get_f32(img, ix - 1, iy) == strong) {
+                    ggml_ext_tensor_set_f32(img, strong, ix, iy);
                } else {
-                    ggml_tensor_set_f32(img, 0.0f, ix, iy);
+                    ggml_ext_tensor_set_f32(img, 0.0f, ix, iy);
                }
            }
        }
    }
 }

-uint8_t* preprocess_canny(uint8_t* img, int width, int height, float high_threshold, float low_threshold, float weak, float strong, bool inverse) {
+bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold, float weak, float strong, bool inverse) {
    struct ggml_init_params params;
-    params.mem_size               = static_cast<size_t>(10 * 1024 * 1024);  // 10
-    params.mem_buffer             = NULL;
+    params.mem_size               = static_cast<size_t>(40 * img.width * img.height);  // 10MB for 512x512
+    params.mem_buffer             = nullptr;
    params.no_alloc               = false;
    struct ggml_context* work_ctx = ggml_init(params);

    if (!work_ctx) {
        LOG_ERROR("ggml_init() failed");
-        return NULL;
+        return false;
    }

    float kX[9] = {
@ -192,13 +192,13 @@ uint8_t* preprocess_canny(uint8_t* img, int width, int height, float high_thresh
    struct ggml_tensor* sf_ky = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1);
    memcpy(sf_ky->data, kY, ggml_nbytes(sf_ky));
    gaussian_kernel(gkernel);
-    struct ggml_tensor* image      = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
-    struct ggml_tensor* image_gray = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1);
+    struct ggml_tensor* image      = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, img.width, img.height, 3, 1);
+    struct ggml_tensor* image_gray = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, img.width, img.height, 1, 1);
    struct ggml_tensor* iX         = ggml_dup_tensor(work_ctx, image_gray);
    struct ggml_tensor* iY         = ggml_dup_tensor(work_ctx, image_gray);
    struct ggml_tensor* G          = ggml_dup_tensor(work_ctx, image_gray);
    struct ggml_tensor* tetha      = ggml_dup_tensor(work_ctx, image_gray);
-    sd_image_to_tensor(img, image);
+    sd_image_to_ggml_tensor(img, image);
    grayscale(image, image_gray);
    convolve(image_gray, image_gray, gkernel, 2);
    convolve(image_gray, iX, sf_kx, 1);
@ -209,19 +209,18 @@ uint8_t* preprocess_canny(uint8_t* img, int width, int height, float high_thresh
    non_max_supression(image_gray, G, tetha);
    threshold_hystersis(image_gray, high_threshold, low_threshold, weak, strong);
    // to RGB channels
-    for (int iy = 0; iy < height; iy++) {
-        for (int ix = 0; ix < width; ix++) {
-            float gray = ggml_tensor_get_f32(image_gray, ix, iy);
+    for (int iy = 0; iy < img.height; iy++) {
+        for (int ix = 0; ix < img.width; ix++) {
+            float gray = ggml_ext_tensor_get_f32(image_gray, ix, iy);
            gray       = inverse ? 1.0f - gray : gray;
-            ggml_tensor_set_f32(image, gray, ix, iy);
-            ggml_tensor_set_f32(image, gray, ix, iy, 1);
-            ggml_tensor_set_f32(image, gray, ix, iy, 2);
+            ggml_ext_tensor_set_f32(image, gray, ix, iy);
+            ggml_ext_tensor_set_f32(image, gray, ix, iy, 1);
+            ggml_ext_tensor_set_f32(image, gray, ix, iy, 2);
        }
    }
-    free(img);
-    uint8_t* output = sd_tensor_to_image(image);
+    ggml_tensor_to_sd_image(image, img.data);
    ggml_free(work_ctx);
-    return output;
+    return true;
 }

 #endif  // __PREPROCESSING_HPP__
--- a/qwen_image.hpp
+++ b/qwen_image.hpp
@ -0,0 +1,687 @@
+#ifndef __QWEN_IMAGE_HPP__
+#define __QWEN_IMAGE_HPP__
+
+#include <memory>
+
+#include "common.hpp"
+#include "flux.hpp"
+#include "ggml_extend.hpp"
+
+namespace Qwen {
+    constexpr int QWEN_IMAGE_GRAPH_SIZE = 20480;
+
+    // Two-layer MLP (linear_1 -> SiLU -> linear_2) with an optional bias-free
+    // "cond_proj" whose output is added to the input before linear_1.
+    struct TimestepEmbedding : public GGMLBlock {
+    public:
+        // in_channels:      input width of linear_1 (and output width of cond_proj)
+        // time_embed_dim:   width between linear_1 and linear_2
+        // out_dim:          output width of linear_2; values <= 0 fall back to time_embed_dim
+        // cond_proj_dim:    when > 0, registers "cond_proj" (cond_proj_dim -> in_channels, no bias)
+        // sample_proj_bias: bias flag passed to linear_1 and linear_2
+        TimestepEmbedding(int64_t in_channels,
+                          int64_t time_embed_dim,
+                          int64_t out_dim       = 0,
+                          int64_t cond_proj_dim = 0,
+                          bool sample_proj_bias = true) {
+            blocks["linear_1"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, time_embed_dim, sample_proj_bias));
+            if (cond_proj_dim > 0) {
+                blocks["cond_proj"] = std::shared_ptr<GGMLBlock>(new Linear(cond_proj_dim, in_channels, false));
+            }
+            if (out_dim <= 0) {
+                out_dim = time_embed_dim;
+            }
+            blocks["linear_2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, out_dim, sample_proj_bias));
+        }
+
+        // sample:    input to linear_1
+        // condition: optional; only valid when the block was constructed with cond_proj_dim > 0
+        // return:    output of linear_2
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                    struct ggml_tensor* sample,
+                                    struct ggml_tensor* condition = nullptr) {
+            if (condition != nullptr) {
+                // Use find() rather than operator[]: on a map-style container operator[] would
+                // insert a null "cond_proj" entry when the block was never created, and the
+                // call below would then dereference a null pointer.
+                auto cond_entry = blocks.find("cond_proj");
+                GGML_ASSERT(cond_entry != blocks.end() && "condition passed but cond_proj_dim was 0");
+                auto cond_proj = std::dynamic_pointer_cast<Linear>(cond_entry->second);
+                GGML_ASSERT(cond_proj != nullptr);
+                sample = ggml_add(ctx->ggml_ctx, sample, cond_proj->forward(ctx, condition));
+            }
+            auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["linear_1"]);
+            auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["linear_2"]);
+
+            sample = linear_1->forward(ctx, sample);
+            sample = ggml_silu_inplace(ctx->ggml_ctx, sample);
+            sample = linear_2->forward(ctx, sample);
+            return sample;
+        }
+    };
+
+    // Projects raw timesteps to 256 features, then feeds them through a
+    // TimestepEmbedding(256 -> embedding_dim).
+    struct QwenTimestepProjEmbeddings : public GGMLBlock {
+    public:
+        QwenTimestepProjEmbeddings(int64_t embedding_dim) {
+            std::shared_ptr<GGMLBlock> embedder(new TimestepEmbedding(256, embedding_dim));
+            blocks["timestep_embedder"] = embedder;
+        }
+
+        // timesteps: [N,]
+        // return: [N, embedding_dim]
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                    struct ggml_tensor* timesteps) {
+            auto embedder  = std::dynamic_pointer_cast<TimestepEmbedding>(blocks["timestep_embedder"]);
+            auto projected = ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 256, 10000, 1.f);
+            return embedder->forward(ctx, projected);
+        }
+    };
+
+    // Joint attention over a text stream and an image stream.
+    // Each stream has its own q/k/v projections and its own RMSNorm on q and k.
+    // The per-stream q/k/v are concatenated (text first) along the token axis,
+    // passed through Rope::attention, then split back and projected by separate
+    // output layers ("to_out.0" for image, "to_add_out" for text).
+    struct QwenImageAttention : public GGMLBlock {
+    protected:
+        // Per-head width; used in forward() to derive the head count from the projection width.
+        int64_t dim_head;
+
+    public:
+        // query_dim:       input width of every q/k/v projection (both streams)
+        // dim_head:        width of one attention head
+        // num_heads:       head count; only used for inner_dim when out_dim <= 0
+        // out_dim:         if > 0, used as inner_dim and as the output width of "to_out.0";
+        //                  otherwise inner_dim = dim_head * num_heads and the output width is query_dim
+        // out_context_dim: output width of "to_add_out"; values <= 0 fall back to query_dim
+        // bias:            bias flag for the q/k/v projections
+        // out_bias:        bias flag for both output projections
+        // eps:             epsilon passed to the four RMSNorm blocks
+        QwenImageAttention(int64_t query_dim,
+                           int64_t dim_head,
+                           int64_t num_heads,
+                           int64_t out_dim         = 0,
+                           int64_t out_context_dim = 0,
+                           bool bias               = true,
+                           bool out_bias           = true,
+                           float eps               = 1e-6)
+            : dim_head(dim_head) {
+            // inner_dim must be computed before out_dim receives its fallback value below.
+            int64_t inner_dim = out_dim > 0 ? out_dim : dim_head * num_heads;
+            out_dim           = out_dim > 0 ? out_dim : query_dim;
+            out_context_dim   = out_context_dim > 0 ? out_context_dim : query_dim;
+
+            // Image-stream projections.
+            blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, bias));
+            blocks["to_k"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, bias));
+            blocks["to_v"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, bias));
+
+            blocks["norm_q"] = std::shared_ptr<GGMLBlock>(new RMSNorm(dim_head, eps));
+            blocks["norm_k"] = std::shared_ptr<GGMLBlock>(new RMSNorm(dim_head, eps));
+
+            // Text-stream projections.
+            blocks["add_q_proj"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, bias));
+            blocks["add_k_proj"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, bias));
+            blocks["add_v_proj"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, bias));
+
+            blocks["norm_added_q"] = std::shared_ptr<GGMLBlock>(new RMSNorm(dim_head, eps));
+            blocks["norm_added_k"] = std::shared_ptr<GGMLBlock>(new RMSNorm(dim_head, eps));
+
+            float scale         = 1.f / 32.f;
+            bool force_prec_f32 = false;
+#ifdef SD_USE_VULKAN
+            force_prec_f32 = true;
+#endif
+            // The purpose of the scale here is to prevent NaN issues in certain situations.
+            // For example when using CUDA but the weights are k-quants (not all prompts).
+            blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_dim, out_bias, false, force_prec_f32, scale));
+            // to_out.1 is nn.Dropout
+
+            // NOTE(review): unlike "to_out.0", this passes a literal `false` where force_prec_f32
+            // goes, so the Vulkan override above does not reach the text output projection —
+            // confirm this is intended.
+            blocks["to_add_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_context_dim, out_bias, false, false, scale));
+        }
+
+        // Returns {image output, text output}.
+        std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
+                                                      struct ggml_tensor* img,
+                                                      struct ggml_tensor* txt,
+                                                      struct ggml_tensor* pe,
+                                                      struct ggml_tensor* mask = nullptr) {
+            // img: [N, n_img_token, hidden_size]
+            // txt: [N, n_txt_token, hidden_size]
+            // pe: [n_img_token + n_txt_token, d_head/2, 2, 2]
+            // return: ([N, n_img_token, hidden_size], [N, n_txt_token, hidden_size])
+
+            auto norm_q = std::dynamic_pointer_cast<UnaryBlock>(blocks["norm_q"]);
+            auto norm_k = std::dynamic_pointer_cast<UnaryBlock>(blocks["norm_k"]);
+
+            auto to_q     = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
+            auto to_k     = std::dynamic_pointer_cast<Linear>(blocks["to_k"]);
+            auto to_v     = std::dynamic_pointer_cast<Linear>(blocks["to_v"]);
+            auto to_out_0 = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
+
+            auto norm_added_q = std::dynamic_pointer_cast<UnaryBlock>(blocks["norm_added_q"]);
+            auto norm_added_k = std::dynamic_pointer_cast<UnaryBlock>(blocks["norm_added_k"]);
+
+            auto add_q_proj = std::dynamic_pointer_cast<Linear>(blocks["add_q_proj"]);
+            auto add_k_proj = std::dynamic_pointer_cast<Linear>(blocks["add_k_proj"]);
+            auto add_v_proj = std::dynamic_pointer_cast<Linear>(blocks["add_v_proj"]);
+            auto to_add_out = std::dynamic_pointer_cast<Linear>(blocks["to_add_out"]);
+
+            int64_t N           = img->ne[2];
+            int64_t n_img_token = img->ne[1];
+            int64_t n_txt_token = txt->ne[1];
+
+            // The head count is recovered from the projection width rather than stored.
+            auto img_q        = to_q->forward(ctx, img);
+            int64_t num_heads = img_q->ne[0] / dim_head;
+            img_q             = ggml_reshape_4d(ctx->ggml_ctx, img_q, dim_head, num_heads, n_img_token, N);  // [N, n_img_token, n_head, d_head]
+            auto img_k        = to_k->forward(ctx, img);
+            img_k             = ggml_reshape_4d(ctx->ggml_ctx, img_k, dim_head, num_heads, n_img_token, N);  // [N, n_img_token, n_head, d_head]
+            auto img_v        = to_v->forward(ctx, img);
+            img_v             = ggml_reshape_4d(ctx->ggml_ctx, img_v, dim_head, num_heads, n_img_token, N);  // [N, n_img_token, n_head, d_head]
+
+            // Only q and k are normalized; v is used as projected.
+            img_q = norm_q->forward(ctx, img_q);
+            img_k = norm_k->forward(ctx, img_k);
+
+            auto txt_q = add_q_proj->forward(ctx, txt);
+            txt_q      = ggml_reshape_4d(ctx->ggml_ctx, txt_q, dim_head, num_heads, n_txt_token, N);  // [N, n_txt_token, n_head, d_head]
+            auto txt_k = add_k_proj->forward(ctx, txt);
+            txt_k      = ggml_reshape_4d(ctx->ggml_ctx, txt_k, dim_head, num_heads, n_txt_token, N);  // [N, n_txt_token, n_head, d_head]
+            auto txt_v = add_v_proj->forward(ctx, txt);
+            txt_v      = ggml_reshape_4d(ctx->ggml_ctx, txt_v, dim_head, num_heads, n_txt_token, N);  // [N, n_txt_token, n_head, d_head]
+
+            txt_q = norm_added_q->forward(ctx, txt_q);
+            txt_k = norm_added_k->forward(ctx, txt_k);
+
+            // Text tokens come first along the token axis (dim 2); the split below relies on that order.
+            auto q = ggml_concat(ctx->ggml_ctx, txt_q, img_q, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
+            auto k = ggml_concat(ctx->ggml_ctx, txt_k, img_k, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
+            auto v = ggml_concat(ctx->ggml_ctx, txt_v, img_v, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
+
+            // NOTE(review): the meaning of the trailing 1/128 argument is defined by Rope::attention,
+            // which is not visible here — confirm against its signature.
+            auto attn         = Rope::attention(ctx, q, k, v, pe, mask, (1.0f / 128.f));                  // [N, n_txt_token + n_img_token, n_head*d_head]
+            attn              = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, attn, 0, 2, 1, 3));  // [n_txt_token + n_img_token, N, hidden_size]
+            // Text slice: the first txt->ne[1] entries along the outermost viewed axis (offset 0).
+            auto txt_attn_out = ggml_view_3d(ctx->ggml_ctx,
+                                             attn,
+                                             attn->ne[0],
+                                             attn->ne[1],
+                                             txt->ne[1],
+                                             attn->nb[1],
+                                             attn->nb[2],
+                                             0);                                                                  // [n_txt_token, N, hidden_size]
+            txt_attn_out      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, txt_attn_out, 0, 2, 1, 3));  // [N, n_txt_token, hidden_size]
+            // Image slice: the next img->ne[1] entries, starting right after the text slice.
+            auto img_attn_out = ggml_view_3d(ctx->ggml_ctx,
+                                             attn,
+                                             attn->ne[0],
+                                             attn->ne[1],
+                                             img->ne[1],
+                                             attn->nb[1],
+                                             attn->nb[2],
+                                             attn->nb[2] * txt->ne[1]);                                           // [n_img_token, N, hidden_size]
+            img_attn_out      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, img_attn_out, 0, 2, 1, 3));  // [N, n_img_token, hidden_size]
+
+            img_attn_out = to_out_0->forward(ctx, img_attn_out);
+            txt_attn_out = to_add_out->forward(ctx, txt_attn_out);
+
+            return {img_attn_out, txt_attn_out};
+        }
+    };
+
+    // One dual-stream transformer layer. The image and text streams each have their own
+    // modulation Linear, two LayerNorms and an MLP; they share a single QwenImageAttention.
+    // Both residual branches (attention and MLP) are gated by values derived from t_emb.
+    class QwenImageTransformerBlock : public GGMLBlock {
+    public:
+        // dim:                 hidden width of both streams
+        // num_attention_heads: head count forwarded to the attention block
+        // attention_head_dim:  per-head width forwarded to the attention block
+        // eps:                 epsilon for the LayerNorms and the attention RMSNorms
+        QwenImageTransformerBlock(int64_t dim,
+                                  int64_t num_attention_heads,
+                                  int64_t attention_head_dim,
+                                  float eps = 1e-6) {
+            // img_mod.0 is nn.SiLU()
+            // Produces 6 * dim values, split into six chunks in forward().
+            blocks["img_mod.1"] = std::shared_ptr<GGMLBlock>(new Linear(dim, 6 * dim, true));
+
+            blocks["img_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim, eps, false));
+            blocks["img_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim, eps, false));
+            // NOTE(review): img_mlp passes an extra trailing `true` that txt_mlp below does not;
+            // its meaning is defined by FeedForward's constructor (not visible here) — confirm.
+            blocks["img_mlp"]   = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim, 4, FeedForward::Activation::GELU, true));
+
+            // txt_mod.0 is nn.SiLU()
+            blocks["txt_mod.1"] = std::shared_ptr<GGMLBlock>(new Linear(dim, 6 * dim, true));
+
+            blocks["txt_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim, eps, false));
+            blocks["txt_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim, eps, false));
+            blocks["txt_mlp"]   = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim, 4, FeedForward::Activation::GELU));
+
+            blocks["attn"] = std::shared_ptr<GGMLBlock>(new QwenImageAttention(dim,
+                                                                               attention_head_dim,
+                                                                               num_attention_heads,
+                                                                               0,     // out_dim
+                                                                               0,     // out_context_dim
+                                                                               true,  // bias
+                                                                               true,  // out_bias
+                                                                               eps));
+        }
+
+        // t_emb is the conditioning both modulation Linears are applied to (after SiLU).
+        // Returns the updated {img, txt} pair.
+        virtual std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
+                                                              struct ggml_tensor* img,
+                                                              struct ggml_tensor* txt,
+                                                              struct ggml_tensor* t_emb,
+                                                              struct ggml_tensor* pe) {
+            // img: [N, n_img_token, hidden_size]
+            // txt: [N, n_txt_token, hidden_size]
+            // pe: [n_img_token + n_txt_token, d_head/2, 2, 2]
+            // return: ([N, n_img_token, hidden_size], [N, n_txt_token, hidden_size])
+
+            auto img_mod_1 = std::dynamic_pointer_cast<Linear>(blocks["img_mod.1"]);
+            auto img_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["img_norm1"]);
+            auto img_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["img_norm2"]);
+            auto img_mlp   = std::dynamic_pointer_cast<FeedForward>(blocks["img_mlp"]);
+
+            auto txt_mod_1 = std::dynamic_pointer_cast<Linear>(blocks["txt_mod.1"]);
+            auto txt_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["txt_norm1"]);
+            auto txt_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["txt_norm2"]);
+            auto txt_mlp   = std::dynamic_pointer_cast<FeedForward>(blocks["txt_mlp"]);
+
+            auto attn = std::dynamic_pointer_cast<QwenImageAttention>(blocks["attn"]);
+
+            // Six modulation chunks per stream, used below as:
+            //   [0], [1] -> Flux::modulate arguments before attention, [2] -> attention gate
+            //   [3], [4] -> Flux::modulate arguments before the MLP,   [5] -> MLP gate
+            auto img_mod_params    = ggml_silu(ctx->ggml_ctx, t_emb);
+            img_mod_params         = img_mod_1->forward(ctx, img_mod_params);
+            auto img_mod_param_vec = ggml_ext_chunk(ctx->ggml_ctx, img_mod_params, 6, 0);
+
+            auto txt_mod_params    = ggml_silu(ctx->ggml_ctx, t_emb);
+            txt_mod_params         = txt_mod_1->forward(ctx, txt_mod_params);
+            auto txt_mod_param_vec = ggml_ext_chunk(ctx->ggml_ctx, txt_mod_params, 6, 0);
+
+            auto img_normed    = img_norm1->forward(ctx, img);
+            auto img_modulated = Flux::modulate(ctx->ggml_ctx, img_normed, img_mod_param_vec[0], img_mod_param_vec[1]);
+            auto img_gate1     = img_mod_param_vec[2];
+
+            auto txt_normed    = txt_norm1->forward(ctx, txt);
+            auto txt_modulated = Flux::modulate(ctx->ggml_ctx, txt_normed, txt_mod_param_vec[0], txt_mod_param_vec[1]);
+            auto txt_gate1     = txt_mod_param_vec[2];
+
+            // No mask is passed, so the attention block's default (nullptr) applies.
+            auto [img_attn_output, txt_attn_output] = attn->forward(ctx, img_modulated, txt_modulated, pe);
+
+            // Gated residual for the attention branch.
+            img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_attn_output, img_gate1));
+            txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_attn_output, txt_gate1));
+
+            auto img_normed2    = img_norm2->forward(ctx, img);
+            auto img_modulated2 = Flux::modulate(ctx->ggml_ctx, img_normed2, img_mod_param_vec[3], img_mod_param_vec[4]);
+            auto img_gate2      = img_mod_param_vec[5];
+
+            auto txt_normed2    = txt_norm2->forward(ctx, txt);
+            auto txt_modulated2 = Flux::modulate(ctx->ggml_ctx, txt_normed2, txt_mod_param_vec[3], txt_mod_param_vec[4]);
+            auto txt_gate2      = txt_mod_param_vec[5];
+
+            auto img_mlp_out = img_mlp->forward(ctx, img_modulated2);
+            auto txt_mlp_out = txt_mlp->forward(ctx, txt_modulated2);
+
+            // Gated residual for the MLP branch.
+            img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_mlp_out, img_gate2));
+            txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_mlp_out, txt_gate2));
+
+            return {img, txt};
+        }
+    };
+
+    // Adaptive layer norm: normalizes x, then modulates it with a scale/shift pair
+    // predicted from the conditioning tensor c via SiLU -> Linear.
+    struct AdaLayerNormContinuous : public GGMLBlock {
+    public:
+        // embedding_dim:              width of x, and of each predicted scale / shift half
+        // conditioning_embedding_dim: width of c (input width of "linear")
+        // elementwise_affine, eps:    forwarded to the LayerNorm
+        // bias:                       forwarded to both the LayerNorm and the Linear
+        AdaLayerNormContinuous(int64_t embedding_dim,
+                               int64_t conditioning_embedding_dim,
+                               bool elementwise_affine = true,
+                               float eps               = 1e-5f,
+                               bool bias               = true) {
+            // The norm is applied to x, so it must be sized by embedding_dim (the width the
+            // scale / shift halves are produced for), not by the width of the conditioning input.
+            blocks["norm"]   = std::shared_ptr<GGMLBlock>(new LayerNorm(embedding_dim, eps, elementwise_affine, bias));
+            blocks["linear"] = std::shared_ptr<GGMLBlock>(new Linear(conditioning_embedding_dim, embedding_dim * 2, bias));
+        }
+
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                    struct ggml_tensor* x,
+                                    struct ggml_tensor* c) {
+            // x: [N, n_token, hidden_size]
+            // c: [N, hidden_size]
+            // return: [N, n_token, hidden_size]
+
+            auto norm   = std::dynamic_pointer_cast<LayerNorm>(blocks["norm"]);
+            auto linear = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
+
+            // The Linear output is split in two along dim 0: first half scale, second half shift.
+            auto emb   = linear->forward(ctx, ggml_silu(ctx->ggml_ctx, c));
+            auto mods  = ggml_ext_chunk(ctx->ggml_ctx, emb, 2, 0);
+            auto scale = mods[0];
+            auto shift = mods[1];
+
+            x = norm->forward(ctx, x);
+            x = Flux::modulate(ctx->ggml_ctx, x, shift, scale);
+
+            return x;
+        }
+    };
+
+    // Hyper-parameters held by QwenImageModel. The defaults give
+    // inner_dim = num_attention_heads * attention_head_dim = 24 * 128 = 3072.
+    struct QwenImageParams {
+        // Patch side length used by pad_to_patch_size / patchify / unpatchify.
+        int64_t patch_size          = 2;
+        // Input width of the "img_in" Linear.
+        int64_t in_channels         = 64;
+        // "proj_out" emits patch_size * patch_size * out_channels values per token.
+        int64_t out_channels        = 16;
+        // Number of "transformer_blocks.N" entries created and run.
+        int64_t num_layers          = 60;
+        // Per-head width and head count; their product is the model's inner_dim.
+        int64_t attention_head_dim  = 128;
+        int64_t num_attention_heads = 24;
+        // Width of the text context fed to "txt_norm" and "txt_in".
+        int64_t joint_attention_dim = 3584;
+        // NOTE(review): theta / axes_dim / axes_dim_sum are not read in the code visible here;
+        // presumably rotary position-embedding settings — confirm at the call site.
+        // The default axes_dim entries sum to 128, matching the axes_dim_sum default.
+        float theta                 = 10000;
+        std::vector<int> axes_dim   = {16, 56, 56};
+        int64_t axes_dim_sum        = 128;
+    };
+
+    class QwenImageModel : public GGMLBlock {
+    protected:
+        QwenImageParams params;
+
+    public:
+        QwenImageModel() {}
+        // Registers the timestep embedder, the text/image input projections,
+        // `num_layers` transformer blocks and the output head.
+        QwenImageModel(QwenImageParams params)
+            : params(params) {
+            const int64_t inner_dim = params.num_attention_heads * params.attention_head_dim;
+
+            blocks["time_text_embed"] = std::shared_ptr<GGMLBlock>(new QwenTimestepProjEmbeddings(inner_dim));
+            blocks["txt_norm"]        = std::shared_ptr<GGMLBlock>(new RMSNorm(params.joint_attention_dim, 1e-6f));
+            blocks["img_in"]          = std::shared_ptr<GGMLBlock>(new Linear(params.in_channels, inner_dim));
+            blocks["txt_in"]          = std::shared_ptr<GGMLBlock>(new Linear(params.joint_attention_dim, inner_dim));
+
+            // One entry per layer, keyed "transformer_blocks.<index>".
+            for (int layer = 0; layer < params.num_layers; layer++) {
+                std::shared_ptr<GGMLBlock> layer_block(new QwenImageTransformerBlock(inner_dim,
+                                                                                     params.num_attention_heads,
+                                                                                     params.attention_head_dim,
+                                                                                     1e-6f));
+                const std::string key = "transformer_blocks." + std::to_string(layer);
+                blocks[key]           = layer_block;
+            }
+
+            const int64_t patch_out_dim = params.patch_size * params.patch_size * params.out_channels;
+            blocks["norm_out"]          = std::shared_ptr<GGMLBlock>(new AdaLayerNormContinuous(inner_dim, inner_dim, false, 1e-6f));
+            blocks["proj_out"]          = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, patch_out_dim));
+        }
+
+        // Pads dims 0 (W) and 1 (H) of x up to the next multiple of params.patch_size;
+        // no padding is added along a dim that is already a multiple.
+        struct ggml_tensor* pad_to_patch_size(struct ggml_context* ctx,
+                                              struct ggml_tensor* x) {
+            const int64_t p = params.patch_size;
+
+            const int pad_w = (p - x->ne[0] % p) % p;
+            const int pad_h = (p - x->ne[1] % p) % p;
+
+            return ggml_pad(ctx, x, pad_w, pad_h, 0, 0);  // [N, C, H + pad_h, W + pad_w]
+        }
+
+        // Rearranges x into one token per p x p patch (p = params.patch_size).
+        // x: [N, C, H, W]
+        // return: [N, h*w, C * patch_size * patch_size]
+        struct ggml_tensor* patchify(struct ggml_context* ctx,
+                                     struct ggml_tensor* x) {
+            const int64_t p = params.patch_size;
+            const int64_t W = x->ne[0];
+            const int64_t H = x->ne[1];
+            const int64_t C = x->ne[2];
+            const int64_t N = x->ne[3];
+            const int64_t w = W / p;
+            const int64_t h = H / p;
+
+            // H and W must be exact multiples of the patch size.
+            GGML_ASSERT(h * p == H && w * p == W);
+
+            auto patches = ggml_reshape_4d(ctx, x, p, w, p, h * C * N);             // [N*C*h, p, w, p]
+            patches      = ggml_cont(ctx, ggml_permute(ctx, patches, 0, 2, 1, 3));  // [N*C*h, w, p, p]
+            patches      = ggml_reshape_4d(ctx, patches, p * p, w * h, C, N);       // [N, C, h*w, p*p]
+            patches      = ggml_cont(ctx, ggml_permute(ctx, patches, 0, 2, 1, 3));  // [N, h*w, C, p*p]
+            return ggml_reshape_3d(ctx, patches, p * p * C, w * h, N);              // [N, h*w, C*p*p]
+        }
+
+        // Pads x to a multiple of the patch size, then converts it to patch tokens.
+        struct ggml_tensor* process_img(struct ggml_context* ctx,
+                                        struct ggml_tensor* x) {
+            auto padded = pad_to_patch_size(ctx, x);
+            return patchify(ctx, padded);
+        }
+
+        // Inverse of patchify: turns patch tokens back into an image-shaped tensor.
+        // h and w are the patch-grid height and width.
+        // x: [N, h*w, C*patch_size*patch_size]
+        // return: [N, C, H, W]
+        struct ggml_tensor* unpatchify(struct ggml_context* ctx,
+                                       struct ggml_tensor* x,
+                                       int64_t h,
+                                       int64_t w) {
+            const int64_t p = params.patch_size;
+            const int64_t N = x->ne[2];
+            const int64_t C = x->ne[0] / p / p;
+            const int64_t H = h * p;
+            const int64_t W = w * p;
+
+            // The token width must split evenly into C channels of p*p values.
+            GGML_ASSERT(C * p * p == x->ne[0]);
+
+            auto pixels = ggml_reshape_4d(ctx, x, p * p, C, w * h, N);            // [N, h*w, C, p*p]
+            pixels      = ggml_cont(ctx, ggml_permute(ctx, pixels, 0, 2, 1, 3));  // [N, C, h*w, p*p]
+            pixels      = ggml_reshape_4d(ctx, pixels, p, p, w, h * C * N);       // [N*C*h, w, p, p]
+            pixels      = ggml_cont(ctx, ggml_permute(ctx, pixels, 0, 2, 1, 3));  // [N*C*h, p, w, p]
+            return ggml_reshape_4d(ctx, pixels, W, H, C, N);                      // [N, C, h*p, w*p]
+        }
+
+        // Transformer pass over already patchified tokens.
+        //   x:        image tokens, projected by `img_in`
+        //   timestep: input of `time_text_embed`
+        //   context:  text features, normalised by `txt_norm` and projected by `txt_in`
+        //   pe:       positional embedding handed to every transformer block
+        // Returns the image stream after `norm_out` and `proj_out`.
+        struct ggml_tensor* forward_orig(GGMLRunnerContext* ctx,
+                                         struct ggml_tensor* x,
+                                         struct ggml_tensor* timestep,
+                                         struct ggml_tensor* context,
+                                         struct ggml_tensor* pe) {
+            auto time_embedder = std::dynamic_pointer_cast<QwenTimestepProjEmbeddings>(blocks["time_text_embed"]);
+            auto text_norm     = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm"]);
+            auto image_proj    = std::dynamic_pointer_cast<Linear>(blocks["img_in"]);
+            auto text_proj     = std::dynamic_pointer_cast<Linear>(blocks["txt_in"]);
+            auto final_norm    = std::dynamic_pointer_cast<AdaLayerNormContinuous>(blocks["norm_out"]);
+            auto final_proj    = std::dynamic_pointer_cast<Linear>(blocks["proj_out"]);
+
+            auto temb       = time_embedder->forward(ctx, timestep);
+            auto img_stream = image_proj->forward(ctx, x);
+            auto txt_stream = text_proj->forward(ctx, text_norm->forward(ctx, context));
+
+            // Each block consumes both streams and returns their updated versions.
+            for (int layer = 0; layer < params.num_layers; ++layer) {
+                const std::string key = "transformer_blocks." + std::to_string(layer);
+                auto block            = std::dynamic_pointer_cast<QwenImageTransformerBlock>(blocks[key]);
+
+                auto streams = block->forward(ctx, img_stream, txt_stream, temb, pe);
+                img_stream   = streams.first;
+                txt_stream   = streams.second;
+            }
+
+            img_stream = final_norm->forward(ctx, img_stream, temb);
+            return final_proj->forward(ctx, img_stream);
+        }
+
+        // Forward pass of the DiT.
+        //   x:           [N, C, H, W] latent to denoise
+        //   timestep:    [N,]
+        //   context:     [N, L, D]
+        //   pe:          [L, d_head/2, 2, 2]
+        //   ref_latents: optional reference latents; their tokens are appended
+        //                after the tokens of x and dropped again from the output
+        //   return:      [N, C, H, W]
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                    struct ggml_tensor* x,
+                                    struct ggml_tensor* timestep,
+                                    struct ggml_tensor* context,
+                                    struct ggml_tensor* pe,
+                                    std::vector<ggml_tensor*> ref_latents = {}) {
+            const int64_t W = x->ne[0];
+            const int64_t H = x->ne[1];
+
+            auto img = process_img(ctx->ggml_ctx, x);
+            // Signed, like ggml's ne[], so the comparison below is not mixed-sign.
+            const int64_t img_tokens = img->ne[1];
+
+            for (ggml_tensor* ref : ref_latents) {
+                ref = process_img(ctx->ggml_ctx, ref);
+                img = ggml_concat(ctx->ggml_ctx, img, ref, 1);
+            }
+
+            // Patch counts of x along height / width.
+            const int64_t h_len = ((H + (params.patch_size / 2)) / params.patch_size);
+            const int64_t w_len = ((W + (params.patch_size / 2)) / params.patch_size);
+
+            auto out = forward_orig(ctx, img, timestep, context, pe);  // [N, h_len*w_len, ph*pw*C]
+
+            if (out->ne[1] > img_tokens) {
+                // Keep only the leading img_tokens tokens (those of x); the
+                // remaining ones belong to the reference latents.
+                out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, out, 0, 2, 1, 3));  // [num_tokens, N, C * patch_size * patch_size]
+                out = ggml_view_3d(ctx->ggml_ctx, out, out->ne[0], out->ne[1], img_tokens, out->nb[1], out->nb[2], 0);
+                out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, out, 0, 2, 1, 3));  // [N, h*w, C * patch_size * patch_size]
+            }
+
+            out = unpatchify(ctx->ggml_ctx, out, h_len, w_len);  // [N, C, H + pad_h, W + pad_w]
+
+            // Cut the padding off again.
+            out = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, H);  // [N, C, H, W + pad_w]
+            out = ggml_ext_slice(ctx->ggml_ctx, out, 0, 0, W);  // [N, C, H, W]
+
+            return out;
+        }
+    };
+
+    struct QwenImageRunner : public GGMLRunner {
+    public:
+        QwenImageParams qwen_image_params;
+        QwenImageModel qwen_image;
+        std::vector<float> pe_vec;
+        SDVersion version;
+
+        // Build a runner for the tensors found in `tensor_storage_map`.
+        //   backend / offload_params_to_cpu: forwarded to GGMLRunner
+        //   tensor_storage_map: tensor name -> storage; scanned to count the
+        //                       transformer blocks present in the checkpoint
+        //   prefix:  only tensor names containing this string are considered
+        //   version: stored in the `version` member
+        QwenImageRunner(ggml_backend_t backend,
+                        bool offload_params_to_cpu,
+                        const String2TensorStorage& tensor_storage_map = {},
+                        const std::string prefix                       = "",
+                        SDVersion version                              = VERSION_QWEN_IMAGE)
+            : GGMLRunner(backend, offload_params_to_cpu), version(version) {
+            // num_layers = highest "transformer_blocks.<i>" index seen, plus one.
+            qwen_image_params.num_layers = 0;
+            for (const auto& pair : tensor_storage_map) {
+                const std::string& full_name = pair.first;
+                if (full_name.find(prefix) == std::string::npos)
+                    continue;
+                size_t pos = full_name.find("transformer_blocks.");
+                if (pos == std::string::npos)
+                    continue;
+                // Drop everything before "transformer_blocks." so items[1] is the index.
+                auto items = split_string(full_name.substr(pos), '.');
+                if (items.size() > 1) {
+                    int block_index = atoi(items[1].c_str());
+                    if (block_index + 1 > qwen_image_params.num_layers) {
+                        qwen_image_params.num_layers = block_index + 1;
+                    }
+                }
+            }
+            // Cast so the format specifier is right whatever integer type num_layers has.
+            LOG_INFO("qwen_image_params.num_layers: %lld", static_cast<long long>(qwen_image_params.num_layers));
+            qwen_image = QwenImageModel(qwen_image_params);
+            qwen_image.init(params_ctx, tensor_storage_map, prefix);
+        }
+
+        // Short identifier of this runner.
+        std::string get_desc() override {
+            return "qwen_image";
+        }
+
+        // Forward to QwenImageModel::get_param_tensors, which fills `tensors`
+        // for the given name `prefix`.
+        void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
+            qwen_image.get_param_tensors(tensors, prefix);
+        }
+
+        // Build the compute graph for one denoising step.
+        //   x:                  latent; ne[3] (batch) must be 1
+        //   timesteps, context: model inputs, moved to the backend first
+        //   ref_latents:        optional reference latents (also moved to the backend)
+        //   increase_ref_index: forwarded to Rope::gen_qwen_image_pe
+        // Returns the graph whose final node is the model output.
+        struct ggml_cgraph* build_graph(struct ggml_tensor* x,
+                                        struct ggml_tensor* timesteps,
+                                        struct ggml_tensor* context,
+                                        std::vector<ggml_tensor*> ref_latents = {},
+                                        bool increase_ref_index               = false) {
+            GGML_ASSERT(x->ne[3] == 1);
+            struct ggml_cgraph* gf = new_graph_custom(QWEN_IMAGE_GRAPH_SIZE);
+
+            x         = to_backend(x);
+            context   = to_backend(context);
+            timesteps = to_backend(timesteps);
+
+            for (ggml_tensor*& ref : ref_latents) {
+                ref = to_backend(ref);
+            }
+
+            // pe_vec is a member so the host data stays alive after this function
+            // returns; it is handed to set_backend_tensor_data() below.
+            pe_vec = Rope::gen_qwen_image_pe(static_cast<int>(x->ne[1]),
+                                             static_cast<int>(x->ne[0]),
+                                             static_cast<int>(qwen_image_params.patch_size),
+                                             static_cast<int>(x->ne[3]),
+                                             static_cast<int>(context->ne[1]),
+                                             ref_latents,
+                                             increase_ref_index,
+                                             qwen_image_params.theta,
+                                             qwen_image_params.axes_dim);
+
+            // Every position owns axes_dim_sum / 2 blocks of 2x2 values.
+            const int64_t pos_len = static_cast<int64_t>(pe_vec.size() / qwen_image_params.axes_dim_sum / 2);
+            auto pe               = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, qwen_image_params.axes_dim_sum / 2, pos_len);
+            set_backend_tensor_data(pe, pe_vec.data());
+
+            auto runner_ctx = get_context();
+
+            struct ggml_tensor* out = qwen_image.forward(&runner_ctx,
+                                                         x,
+                                                         timesteps,
+                                                         context,
+                                                         pe,
+                                                         ref_latents);
+
+            ggml_build_forward_expand(gf, out);
+
+            return gf;
+        }
+
+        // Run the model once by delegating to GGMLRunner::compute.
+        //   n_threads:          forwarded to GGMLRunner::compute
+        //   x:                  [N, in_channels, h, w]
+        //   timesteps:          [N, ]
+        //   context:            [N, max_position, hidden_size]
+        //   ref_latents, increase_ref_index: forwarded to build_graph()
+        //   output, output_ctx: forwarded to GGMLRunner::compute
+        // Returns whatever GGMLRunner::compute returns.
+        bool compute(int n_threads,
+                     struct ggml_tensor* x,
+                     struct ggml_tensor* timesteps,
+                     struct ggml_tensor* context,
+                     std::vector<ggml_tensor*> ref_latents = {},
+                     bool increase_ref_index               = false,
+                     struct ggml_tensor** output           = nullptr,
+                     struct ggml_context* output_ctx       = nullptr) {
+            // x: [N, in_channels, h, w]
+            // timesteps: [N, ]
+            // context: [N, max_position, hidden_size]
+            // The graph is built lazily through this callback; it captures the
+            // arguments by reference, so it must not outlive this call.
+            auto get_graph = [&]() -> struct ggml_cgraph* {
+                return build_graph(x, timesteps, context, ref_latents, increase_ref_index);
+            };
+
+            // NOTE(review): the meaning of the literal `false` (third argument) is
+            // defined by GGMLRunner::compute, which is not visible here — confirm.
+            return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+        }
+
+        // Smoke test: load an input latent and a text context from
+        // ./qwen_image_x.bin and ./qwen_image_context.bin, run one forward pass
+        // at timestep 1000 and print the result together with the elapsed time.
+        void test() {
+            struct ggml_init_params params;
+            params.mem_size   = static_cast<size_t>(1024 * 1024) * 1024;  // 1GB
+            params.mem_buffer = nullptr;
+            params.no_alloc   = false;
+
+            struct ggml_context* work_ctx = ggml_init(params);
+            GGML_ASSERT(work_ctx != nullptr);
+
+            auto x = load_tensor_from_file(work_ctx, "./qwen_image_x.bin");
+            if (x == nullptr) {
+                LOG_ERROR("load tensor from file failed: '%s'", "./qwen_image_x.bin");
+                ggml_free(work_ctx);
+                return;
+            }
+            print_ggml_tensor(x);
+
+            std::vector<float> timesteps_vec(1, 1000.f);
+            auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
+
+            auto context = load_tensor_from_file(work_ctx, "./qwen_image_context.bin");
+            if (context == nullptr) {
+                LOG_ERROR("load tensor from file failed: '%s'", "./qwen_image_context.bin");
+                ggml_free(work_ctx);
+                return;
+            }
+            print_ggml_tensor(context);
+
+            struct ggml_tensor* out = nullptr;
+
+            // ggml_time_ms() is 64-bit; keep the full width until the difference
+            // has been taken.
+            int64_t t0 = ggml_time_ms();
+            compute(8, x, timesteps, context, {}, false, &out, work_ctx);
+            int64_t t1 = ggml_time_ms();
+
+            if (out != nullptr) {
+                print_ggml_tensor(out);
+            } else {
+                LOG_ERROR("qwen_image test produced no output");
+            }
+            LOG_DEBUG("qwen_image test done in %dms", static_cast<int>(t1 - t0));
+
+            // Release the 1GB work context; every tensor above lives inside it.
+            ggml_free(work_ctx);
+        }
+
+        // Load a checkpoint from `file_path`, build a QwenImageRunner on the CPU
+        // backend with Q8_0 as the expected type of every "*weight" tensor, load
+        // the parameters and run test(). Errors are logged and abort the run.
+        static void load_from_file_and_test(const std::string& file_path) {
+            // cuda q8: pass
+            // cuda q8 fa: pass
+            // ggml_backend_t backend    = ggml_backend_cuda_init(0);
+            // NOTE(review): `backend` is never released in this function — confirm
+            // whether the runner takes ownership of it.
+            ggml_backend_t backend    = ggml_backend_cpu_init();
+            ggml_type model_data_type = GGML_TYPE_Q8_0;
+
+            ModelLoader model_loader;
+            if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) {
+                LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
+                return;
+            }
+
+            // Request model_data_type for every tensor whose name ends in "weight".
+            auto& tensor_storage_map = model_loader.get_tensor_storage_map();
+            for (auto& [name, tensor_storage] : tensor_storage_map) {
+                if (ends_with(name, "weight")) {
+                    tensor_storage.expected_type = model_data_type;
+                }
+            }
+
+            std::shared_ptr<QwenImageRunner> qwen_image = std::make_shared<QwenImageRunner>(backend,
+                                                                                            false,
+                                                                                            tensor_storage_map,
+                                                                                            "model.diffusion_model",
+                                                                                            VERSION_QWEN_IMAGE);
+
+            // Allocate the parameter buffer, collect the parameter tensors by
+            // name, then let the loader fill them.
+            qwen_image->alloc_params_buffer();
+            std::map<std::string, ggml_tensor*> tensors;
+            qwen_image->get_param_tensors(tensors, "model.diffusion_model");
+
+            bool success = model_loader.load_tensors(tensors);
+
+            if (!success) {
+                LOG_ERROR("load tensors from model loader failed");
+                return;
+            }
+
+            LOG_INFO("qwen_image model loaded");
+            qwen_image->test();
+        }
+    };
+
+}  // namespace name
+
+#endif  // __QWEN_IMAGE_HPP__
--- a/rng.hpp
+++ b/rng.hpp
@ -15,11 +15,11 @@ private:
    std::default_random_engine generator;

 public:
-    void manual_seed(uint64_t seed) {
+    void manual_seed(uint64_t seed) override {
        generator.seed((unsigned int)seed);
    }

-    std::vector<float> randn(uint32_t n) {
+    std::vector<float> randn(uint32_t n) override {
        std::vector<float> result;
        float mean   = 0.0;
        float stddev = 1.0;
--- a/rng_mt19937.hpp
+++ b/rng_mt19937.hpp
@ -0,0 +1,147 @@
+#ifndef __RNG_MT19937_HPP__
+#define __RNG_MT19937_HPP__
+
+#include <array>
+#include <cmath>
+#include <cstdint>
+#include <limits>
+#include <vector>
+
+#include "rng.hpp"
+
+// RNG imitating torch's randn on CPU.
+// Port from pytorch, original license: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/LICENSE
+// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/TransformationHelper.h, for uniform_real
+// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/native/cpu/DistributionTemplates.h, for normal_kernel/normal_fill/normal_fill_16
+// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/MT19937RNGEngine.h, for mt19937_engine
+// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/DistributionsHelper.h, for uniform_real_distribution/normal_distribution
+// Mersenne Twister (MT19937) based implementation of the RNG interface
+// (manual_seed / randn). Statement order and arithmetic are kept as in the
+// PyTorch sources referenced above so the generated sequence stays the same.
+class MT19937RNG : public RNG {
+    static const int N             = 624;          // number of 32-bit words in the state
+    static const int M             = 397;          // offset of the second word used by next_state()
+    static const uint32_t MATRIX_A = 0x9908b0dfU;  // xor-ed in by twist() when the low bit of v is set
+    static const uint32_t UMASK    = 0x80000000U;  // most significant bit
+    static const uint32_t LMASK    = 0x7fffffffU;  // lower 31 bits
+
+    // Complete generator state.
+    struct State {
+        uint64_t seed_;                  // seed passed to manual_seed()
+        int left_;                       // decremented per draw; next_state() runs when it reaches 0
+        bool seeded_;                    // set to true by manual_seed()
+        uint32_t next_;                  // index into state_ of the next word to output
+        std::array<uint32_t, N> state_;  // the N state words
+        bool has_next_gauss = false;     // true while next_gauss holds an unused sample
+        double next_gauss   = 0.0f;      // second sample cached by normal_double_value()
+    };
+
+    State s;
+
+    // Top bit of u combined with the lower 31 bits of v.
+    uint32_t mix_bits(uint32_t u, uint32_t v) { return (u & UMASK) | (v & LMASK); }
+    // mix_bits(u, v) shifted right by one, xor MATRIX_A when v is odd.
+    uint32_t twist(uint32_t u, uint32_t v) { return (mix_bits(u, v) >> 1) ^ ((v & 1) ? MATRIX_A : 0); }
+    // Regenerate all N state words in place and restart reading at index 0.
+    void next_state() {
+        uint32_t* p = s.state_.data();
+        s.left_     = N;
+        s.next_     = 0;
+        // First N - M words: the partner word lies M positions ahead.
+        for (int j = N - M + 1; --j; p++)
+            p[0] = p[M] ^ twist(p[0], p[1]);
+        // Next M - 1 words: the partner index wraps around (M - N is negative).
+        for (int j = M; --j; p++)
+            p[0] = p[M - N] ^ twist(p[0], p[1]);
+        // Last word: its successor is state_[0].
+        p[0] = p[M - N] ^ twist(p[0], s.state_[0]);
+    }
+
+    // Next 32-bit output: take one state word (refreshing the state first when
+    // left_ reaches 0) and apply the shift/mask sequence below.
+    uint32_t rand_uint32() {
+        if (--s.left_ == 0)
+            next_state();
+        uint32_t y = s.state_[s.next_++];
+        y ^= (y >> 11);
+        y ^= (y << 7) & 0x9d2c5680U;
+        y ^= (y << 15) & 0xefc60000U;
+        y ^= (y >> 18);
+        return y;
+    }
+
+    // 64-bit value from two consecutive 32-bit draws; the first draw forms the
+    // high half.
+    uint64_t rand_uint64() {
+        uint64_t high = (uint64_t)rand_uint32();
+        uint64_t low  = (uint64_t)rand_uint32();
+        return (high << 32) | low;
+    }
+
+    // Map random bits `val` to a value of type T between `from` and `to`: keep
+    // the lowest numeric_limits<T>::digits bits, scale them by 2^-digits and
+    // stretch the result onto the requested range.
+    template <typename T, typename V>
+    T uniform_real(V val, T from, T to) {
+        constexpr auto MASK    = static_cast<V>((static_cast<uint64_t>(1) << std::numeric_limits<T>::digits) - 1);
+        constexpr auto DIVISOR = static_cast<T>(1) / (static_cast<uint64_t>(1) << std::numeric_limits<T>::digits);
+        T x                    = (val & MASK) * DIVISOR;
+        return (x * (to - from) + from);
+    }
+
+    // One normally distributed double via the Box-Muller transform. Each
+    // transform yields two samples; the second is cached in the state and
+    // returned by the following call without drawing new random bits.
+    double normal_double_value(double mean, double std) {
+        if (s.has_next_gauss) {
+            s.has_next_gauss = false;
+            return s.next_gauss;
+        }
+        double u1 = uniform_real(rand_uint64(), 0., 1.);  // double
+        double u2 = uniform_real(rand_uint64(), 0., 1.);  // double
+
+        double r         = std::sqrt(-2.0 * std::log1p(-u2));
+        double theta     = 2.0 * 3.14159265358979323846 * u1;
+        double value     = r * std::cos(theta) * std + mean;
+        s.next_gauss     = r * std::sin(theta) * std + mean;
+        s.has_next_gauss = true;
+        return value;
+    }
+
+    // Convert 16 uniform values in `data` to normal values in place. Element j
+    // and element j + 8 form one Box-Muller pair.
+    void normal_fill_16(float* data, float mean, float std) {
+        for (int j = 0; j < 8; ++j) {
+            float u1    = 1.0f - data[j];
+            float u2    = data[j + 8];
+            float r     = std::sqrt(-2.0f * std::log(u1));
+            float theta = 2.0f * 3.14159265358979323846 * u2;
+            data[j]     = r * std::cos(theta) * std + mean;
+            data[j + 8] = r * std::sin(theta) * std + mean;
+        }
+    }
+
+    // Fill data[0..size) with normal samples.
+    //   size >= 16: fill with uniform floats, then transform in chunks of 16;
+    //               a trailing partial chunk is handled by regenerating the
+    //               last 16 elements.
+    //   size <  16: draw each value through normal_double_value().
+    void randn(float* data, int64_t size, float mean = 0.0f, float std = 1.0f) {
+        if (size >= 16) {
+            for (int64_t i = 0; i < size; i++) {
+                data[i] = uniform_real(rand_uint32(), 0.f, 1.f);
+            }
+            for (int64_t i = 0; i < size - 15; i += 16) {
+                normal_fill_16(data + i, mean, std);
+            }
+            if (size % 16 != 0) {
+                // Recompute the last 16 values.
+                data = data + size - 16;
+                for (int64_t i = 0; i < 16; i++) {
+                    data[i] = uniform_real(rand_uint32(), 0.f, 1.f);
+                }
+                normal_fill_16(data, mean, std);
+            }
+        } else {
+            // Strange handling, hard to understand, but keeping it consistent with PyTorch.
+            for (int64_t i = 0; i < size; i++) {
+                data[i] = (float)normal_double_value(mean, std);
+            }
+        }
+    }
+
+public:
+    // Construct and seed the generator (seed defaults to 0).
+    MT19937RNG(uint64_t seed = 0) { manual_seed(seed); }
+
+    // Re-initialise the whole state from the lower 32 bits of `seed` and drop
+    // any cached normal sample.
+    void manual_seed(uint64_t seed) override {
+        s.seed_     = seed;
+        s.seeded_   = true;
+        s.state_[0] = (uint32_t)(seed & 0xffffffffU);
+        for (int j = 1; j < N; j++) {
+            uint32_t prev = s.state_[j - 1];
+            s.state_[j]   = 1812433253U * (prev ^ (prev >> 30)) + j;
+        }
+        // left_ == 1 makes the first rand_uint32() call run next_state().
+        s.left_          = 1;
+        s.next_          = 0;
+        s.has_next_gauss = false;
+    }
+
+    // Return `n` samples drawn with mean 0 and standard deviation 1.
+    std::vector<float> randn(uint32_t n) override {
+        std::vector<float> out;
+        out.resize(n);
+        randn((float*)out.data(), out.size());
+        return out;
+    }
+};
+
+#endif  // __RNG_MT19937_HPP__
--- a/rng_philox.hpp
+++ b/rng_philox.hpp
@ -93,12 +93,12 @@ public:
        this->offset = 0;
    }

-    void manual_seed(uint64_t seed) {
+    void manual_seed(uint64_t seed) override {
        this->seed   = seed;
        this->offset = 0;
    }

-    std::vector<float> randn(uint32_t n) {
+    std::vector<float> randn(uint32_t n) override {
        std::vector<std::vector<uint32_t>> counter(4, std::vector<uint32_t>(n, 0));
        for (uint32_t i = 0; i < n; i++) {
            counter[0][i] = this->offset;
--- a/rope.hpp
+++ b/rope.hpp
@ -0,0 +1,496 @@
+#ifndef __ROPE_HPP__
+#define __ROPE_HPP__
+
+#include <vector>
+#include "ggml_extend.hpp"
+
+namespace Rope {
+    // `num` evenly spaced values running from `start` to `end`; for num == 1
+    // the single value is `start`.
+    template <class T>
+    __STATIC_INLINE__ std::vector<T> linspace(T start, T end, int num) {
+        std::vector<T> values(num);
+        if (num == 1) {
+            values[0] = start;
+        } else {
+            const T step = (end - start) / (num - 1);
+            int index    = 0;
+            for (T& value : values) {
+                value = start + index * step;
+                ++index;
+            }
+        }
+        return values;
+    }
+
+    // Transpose a rectangular matrix stored as a vector of rows. The column
+    // count is taken from the first row; an empty matrix yields an empty result.
+    __STATIC_INLINE__ std::vector<std::vector<float>> transpose(const std::vector<std::vector<float>>& mat) {
+        if (mat.empty()) {
+            // Nothing to transpose; also avoids reading mat[0] of an empty vector.
+            return {};
+        }
+        const size_t rows = mat.size();
+        const size_t cols = mat[0].size();
+        std::vector<std::vector<float>> transposed(cols, std::vector<float>(rows));
+        for (size_t i = 0; i < rows; ++i) {
+            for (size_t j = 0; j < cols; ++j) {
+                transposed[j][i] = mat[i][j];
+            }
+        }
+        return transposed;
+    }
+
+    // Concatenate all rows of `vec` into one flat vector, in row order.
+    __STATIC_INLINE__ std::vector<float> flatten(const std::vector<std::vector<float>>& vec) {
+        // Size the result once instead of growing it row by row.
+        size_t total = 0;
+        for (const auto& sub_vec : vec) {
+            total += sub_vec.size();
+        }
+
+        std::vector<float> flat_vec;
+        flat_vec.reserve(total);
+        for (const auto& sub_vec : vec) {
+            flat_vec.insert(flat_vec.end(), sub_vec.begin(), sub_vec.end());
+        }
+        return flat_vec;
+    }
+
+    // Rotation-matrix entries for every position in `pos`.
+    //   dim:   even embedding width; dim / 2 frequencies are used
+    //   theta: base of the frequency schedule, omega_k = 1 / theta^(scale_k)
+    // Each output row holds, per frequency, the four values
+    // [cos, -sin, sin, cos] of the angle pos * omega_k.
+    __STATIC_INLINE__ std::vector<std::vector<float>> rope(const std::vector<float>& pos, int dim, int theta) {
+        assert(dim % 2 == 0);
+        const int half_dim = dim / 2;
+
+        const std::vector<float> exponents = linspace(0.f, (dim * 1.f - 2) / dim, half_dim);
+
+        std::vector<float> omega(half_dim);
+        for (int k = 0; k < half_dim; ++k) {
+            omega[k] = 1.0 / std::pow(theta, exponents[k]);
+        }
+
+        const int pos_size = pos.size();
+        std::vector<std::vector<float>> result(pos_size, std::vector<float>(half_dim * 4));
+        for (int i = 0; i < pos_size; ++i) {
+            std::vector<float>& row = result[i];
+            for (int k = 0; k < half_dim; ++k) {
+                const float angle  = pos[i] * omega[k];
+                const float cosine = std::cos(angle);
+                const float sine   = std::sin(angle);
+                row[4 * k]         = cosine;
+                row[4 * k + 1]     = -sine;
+                row[4 * k + 2]     = sine;
+                row[4 * k + 3]     = cosine;
+            }
+        }
+
+        return result;
+    }
+
+    // Generate IDs for image patches and text
+    // Position ids for the text tokens: bs * context_len rows of axes_dim_num
+    // values. Axes listed in `arange_dims` receive the token index within its
+    // sequence (0 .. context_len - 1); every other axis stays 0.
+    __STATIC_INLINE__ std::vector<std::vector<float>> gen_flux_txt_ids(int bs, int context_len, int axes_dim_num, std::set<int> arange_dims) {
+        const int total_rows = bs * context_len;
+        std::vector<std::vector<float>> txt_ids(total_rows, std::vector<float>(axes_dim_num, 0.0f));
+        for (int axis : arange_dims) {
+            // Ignore requested axes that do not exist.
+            if (axis < 0 || axis >= axes_dim_num) {
+                continue;
+            }
+            for (int row = 0; row < total_rows; row++) {
+                txt_ids[row][axis] = (row % context_len);
+            }
+        }
+        return txt_ids;
+    }
+
+    // Position ids for the patches of an h x w image, repeated `bs` times.
+    // Every row is (index, patch row + h_offset, patch column + w_offset) in
+    // axes 0..2; any further axes stay 0. Patches are listed row by row.
+    __STATIC_INLINE__ std::vector<std::vector<float>> gen_flux_img_ids(int h,
+                                                                       int w,
+                                                                       int patch_size,
+                                                                       int bs,
+                                                                       int axes_dim_num,
+                                                                       int index    = 0,
+                                                                       int h_offset = 0,
+                                                                       int w_offset = 0) {
+        const int h_len = (h + (patch_size / 2)) / patch_size;
+        const int w_len = (w + (patch_size / 2)) / patch_size;
+
+        const std::vector<float> row_ids = linspace<float>(h_offset, h_len - 1 + h_offset, h_len);
+        const std::vector<float> col_ids = linspace<float>(w_offset, w_len - 1 + w_offset, w_len);
+
+        // Ids of one image.
+        std::vector<std::vector<float>> single(h_len * w_len, std::vector<float>(axes_dim_num, 0.0));
+        int patch = 0;
+        for (int r = 0; r < h_len; ++r) {
+            for (int c = 0; c < w_len; ++c, ++patch) {
+                std::vector<float>& id = single[patch];
+                id[0]                  = index;
+                id[1]                  = row_ids[r];
+                id[2]                  = col_ids[c];
+            }
+        }
+
+        // The same ids are used for every batch entry.
+        const size_t per_image = single.size();
+        std::vector<std::vector<float>> repeated(bs * per_image, std::vector<float>(3));
+        for (int b = 0; b < bs; ++b) {
+            for (size_t k = 0; k < per_image; ++k) {
+                repeated[b * per_image + k] = single[k];
+            }
+        }
+        return repeated;
+    }
+
+    // Concatenate two batched id lists along the sequence axis: for each of
+    // the `bs` batch entries the rows of `a` are followed by the rows of `b`.
+    __STATIC_INLINE__ std::vector<std::vector<float>> concat_ids(const std::vector<std::vector<float>>& a,
+                                                                 const std::vector<std::vector<float>>& b,
+                                                                 int bs) {
+        const size_t a_len  = a.size() / bs;
+        const size_t b_len  = b.size() / bs;
+        const size_t stride = a_len + b_len;  // rows per batch entry in the result
+
+        std::vector<std::vector<float>> merged(a.size() + b.size(), std::vector<float>(3));
+        for (int batch = 0; batch < bs; ++batch) {
+            const size_t base = batch * stride;
+            for (size_t k = 0; k < a_len; ++k) {
+                merged[base + k] = a[batch * a_len + k];
+            }
+            for (size_t k = 0; k < b_len; ++k) {
+                merged[base + a_len + k] = b[batch * b_len + k];
+            }
+        }
+        return merged;
+    }
+
+    // Build the flattened rotary embedding for a batched list of n-d position ids.
+    //   ids:      bs * pos_len rows, one value per axis
+    //   bs:       batch size
+    //   theta:    rope base
+    //   axes_dim: embedding width of each axis
+    // For every row the rope() output of all axes is laid out back to back.
+    // Returns an empty vector when `ids` is empty or `bs` is not positive.
+    __STATIC_INLINE__ std::vector<float> embed_nd(const std::vector<std::vector<float>>& ids,
+                                                  int bs,
+                                                  int theta,
+                                                  const std::vector<int>& axes_dim) {
+        // Guard the division by bs and the [0] accesses further down.
+        if (ids.empty() || bs <= 0) {
+            return {};
+        }
+
+        std::vector<std::vector<float>> trans_ids = transpose(ids);
+        const size_t pos_len                      = ids.size() / bs;
+        const int num_axes                        = static_cast<int>(axes_dim.size());
+
+        int emb_dim = 0;
+        for (int d : axes_dim)
+            emb_dim += d / 2;
+
+        std::vector<std::vector<float>> emb(bs * pos_len, std::vector<float>(emb_dim * 2 * 2, 0.0));
+        size_t offset = 0;
+        for (int i = 0; i < num_axes; ++i) {
+            std::vector<std::vector<float>> rope_emb = rope(trans_ids[i], axes_dim[i], theta);  // [bs*pos_len, axes_dim[i]/2 * 2 * 2]
+            const size_t width                       = rope_emb[0].size();
+            for (int b = 0; b < bs; ++b) {
+                for (size_t j = 0; j < pos_len; ++j) {
+                    // rope_emb is batch-major like ids, so read the row of batch b.
+                    const std::vector<float>& src = rope_emb[b * pos_len + j];
+                    std::vector<float>& dst       = emb[b * pos_len + j];
+                    for (size_t k = 0; k < width; ++k) {
+                        dst[offset + k] = src[k];
+                    }
+                }
+            }
+            offset += width;
+        }
+
+        return flatten(emb);
+    }
+
+    // Position ids for all reference latents, concatenated in order.
+    //   increase_ref_index == true:  reference k gets index (k + 1) (scaled by
+    //                                ref_index_scale) and no spatial offset.
+    //   increase_ref_index == false: every reference keeps index 1 and is
+    //                                shifted along the width or the height,
+    //                                past the extent already occupied.
+    __STATIC_INLINE__ std::vector<std::vector<float>> gen_refs_ids(int patch_size,
+                                                                   int bs,
+                                                                   int axes_dim_num,
+                                                                   const std::vector<ggml_tensor*>& ref_latents,
+                                                                   bool increase_ref_index,
+                                                                   float ref_index_scale) {
+        std::vector<std::vector<float>> all_ids;
+        uint64_t used_h = 0;  // largest (height + offset) placed so far
+        uint64_t used_w = 0;  // largest (width + offset) placed so far
+        int ref_index   = 1;
+        for (ggml_tensor* ref : ref_latents) {
+            uint64_t h_offset = 0;
+            uint64_t w_offset = 0;
+            if (!increase_ref_index) {
+                // Shift along the axis that ends up with the smaller extent.
+                const bool shift_width = ref->ne[1] + used_h > ref->ne[0] + used_w;
+                if (shift_width) {
+                    w_offset = used_w;
+                } else {
+                    h_offset = used_h;
+                }
+            }
+
+            all_ids = concat_ids(all_ids,
+                                 gen_flux_img_ids(ref->ne[1],
+                                                  ref->ne[0],
+                                                  patch_size,
+                                                  bs,
+                                                  axes_dim_num,
+                                                  static_cast<int>(ref_index * ref_index_scale),
+                                                  h_offset,
+                                                  w_offset),
+                                 bs);
+
+            if (increase_ref_index) {
+                ++ref_index;
+            }
+
+            used_h = std::max(used_h, ref->ne[1] + h_offset);
+            used_w = std::max(used_w, ref->ne[0] + w_offset);
+        }
+        return all_ids;
+    }
+
+    // Full id list for a flux-style model: text ids, then image ids, then (if
+    // any) the ids of the reference latents, concatenated per batch entry.
+    __STATIC_INLINE__ std::vector<std::vector<float>> gen_flux_ids(int h,
+                                                                   int w,
+                                                                   int patch_size,
+                                                                   int bs,
+                                                                   int axes_dim_num,
+                                                                   int context_len,
+                                                                   std::set<int> txt_arange_dims,
+                                                                   const std::vector<ggml_tensor*>& ref_latents,
+                                                                   bool increase_ref_index,
+                                                                   float ref_index_scale) {
+        const auto text_ids  = gen_flux_txt_ids(bs, context_len, axes_dim_num, txt_arange_dims);
+        const auto image_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num);
+
+        auto combined = concat_ids(text_ids, image_ids, bs);
+        if (ref_latents.empty()) {
+            return combined;
+        }
+
+        const auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, ref_latents, increase_ref_index, ref_index_scale);
+        return concat_ids(combined, refs_ids, bs);
+    }
+
+    // Generate the flux positional embedding: build the ids with
+    // gen_flux_ids() (one axis per entry of axes_dim) and embed them with
+    // embed_nd().
+    __STATIC_INLINE__ std::vector<float> gen_flux_pe(int h,
+                                                     int w,
+                                                     int patch_size,
+                                                     int bs,
+                                                     int context_len,
+                                                     std::set<int> txt_arange_dims,
+                                                     const std::vector<ggml_tensor*>& ref_latents,
+                                                     bool increase_ref_index,
+                                                     float ref_index_scale,
+                                                     int theta,
+                                                     const std::vector<int>& axes_dim) {
+        const int axes_dim_num = static_cast<int>(axes_dim.size());
+
+        const std::vector<std::vector<float>> position_ids = gen_flux_ids(h,
+                                                                          w,
+                                                                          patch_size,
+                                                                          bs,
+                                                                          axes_dim_num,
+                                                                          context_len,
+                                                                          txt_arange_dims,
+                                                                          ref_latents,
+                                                                          increase_ref_index,
+                                                                          ref_index_scale);
+        return embed_nd(position_ids, bs, theta, axes_dim);
+    }
+
+    // Position ids for Qwen-Image: text ids, then image ids, then (if any) the
+    // ids of the reference latents, concatenated per batch entry.
+    // Text token j gets the id max(h_len, w_len) + j on all three axes, i.e.
+    // consecutive integer positions starting right after the largest image
+    // coordinate.
+    __STATIC_INLINE__ std::vector<std::vector<float>> gen_qwen_image_ids(int h,
+                                                                         int w,
+                                                                         int patch_size,
+                                                                         int bs,
+                                                                         int context_len,
+                                                                         const std::vector<ggml_tensor*>& ref_latents,
+                                                                         bool increase_ref_index) {
+        const int axes_dim_num = 3;
+        const int h_len        = (h + (patch_size / 2)) / patch_size;
+        const int w_len        = (w + (patch_size / 2)) / patch_size;
+        const int txt_id_start = std::max(h_len, w_len);
+
+        // Step is exactly 1 between neighbouring text tokens. The previous
+        // linspace(start, start + context_len, context_len) call included the
+        // end point and therefore stretched the step to
+        // context_len / (context_len - 1).
+        std::vector<std::vector<float>> txt_ids_repeated(bs * context_len, std::vector<float>(axes_dim_num));
+        for (int i = 0; i < bs; ++i) {
+            for (int j = 0; j < context_len; ++j) {
+                const float id                        = static_cast<float>(txt_id_start + j);
+                txt_ids_repeated[i * context_len + j] = {id, id, id};
+            }
+        }
+
+        auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num);
+        auto ids     = concat_ids(txt_ids_repeated, img_ids, bs);
+        if (!ref_latents.empty()) {
+            auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, ref_latents, increase_ref_index, 1.f);
+            ids           = concat_ids(ids, refs_ids, bs);
+        }
+        return ids;
+    }
+
+    // Qwen-Image positional embeddings: position ids from gen_qwen_image_ids(),
+    // converted by embed_nd() with the given theta and axes_dim.
+    __STATIC_INLINE__ std::vector<float> gen_qwen_image_pe(int h,
+                                                           int w,
+                                                           int patch_size,
+                                                           int bs,
+                                                           int context_len,
+                                                           const std::vector<ggml_tensor*>& ref_latents,
+                                                           bool increase_ref_index,
+                                                           int theta,
+                                                           const std::vector<int>& axes_dim) {
+        std::vector<std::vector<float>> position_ids =
+            gen_qwen_image_ids(h, w, patch_size, bs, context_len, ref_latents, increase_ref_index);
+        return embed_nd(position_ids, bs, theta, axes_dim);
+    }
+
+    // Builds one {t, h, w} position id per video patch and repeats the whole set
+    // for each of the bs batch entries. Patches are ordered frame-major, then row,
+    // then column. The per-axis positions come from
+    // linspace<float>(offset, count - 1 + offset, count).
+    __STATIC_INLINE__ std::vector<std::vector<float>> gen_vid_ids(int t,
+                                                                  int h,
+                                                                  int w,
+                                                                  int pt,
+                                                                  int ph,
+                                                                  int pw,
+                                                                  int bs,
+                                                                  int t_offset = 0,
+                                                                  int h_offset = 0,
+                                                                  int w_offset = 0) {
+        // Patch counts per axis: extent plus half a patch, integer-divided by the patch size.
+        const int frames = (t + (pt / 2)) / pt;
+        const int rows   = (h + (ph / 2)) / ph;
+        const int cols   = (w + (pw / 2)) / pw;
+
+        // Ids for a single batch entry.
+        std::vector<std::vector<float>> single(frames * rows * cols, std::vector<float>(3, 0.0));
+
+        std::vector<float> frame_pos = linspace<float>(t_offset, frames - 1 + t_offset, frames);
+        std::vector<float> row_pos   = linspace<float>(h_offset, rows - 1 + h_offset, rows);
+        std::vector<float> col_pos   = linspace<float>(w_offset, cols - 1 + w_offset, cols);
+
+        // `flat` walks the output in the same order the nested loops visit patches.
+        int flat = 0;
+        for (int f = 0; f < frames; ++f) {
+            for (int r = 0; r < rows; ++r) {
+                for (int c = 0; c < cols; ++c, ++flat) {
+                    std::vector<float>& id = single[flat];
+                    id[0]                  = frame_pos[f];
+                    id[1]                  = row_pos[r];
+                    id[2]                  = col_pos[c];
+                }
+            }
+        }
+
+        // Copy the single-entry ids once per batch entry.
+        const size_t per_batch = single.size();
+        std::vector<std::vector<float>> vid_ids_repeated(bs * per_batch, std::vector<float>(3));
+        for (int b = 0; b < bs; ++b) {
+            const size_t base = b * per_batch;
+            for (size_t n = 0; n < per_batch; ++n) {
+                vid_ids_repeated[base + n] = single[n];
+            }
+        }
+        return vid_ids_repeated;
+    }
+
+    // Wan positional embeddings: video position ids from gen_vid_ids() (offsets
+    // left at their defaults), converted by embed_nd() with the given theta and axes_dim.
+    __STATIC_INLINE__ std::vector<float> gen_wan_pe(int t,
+                                                    int h,
+                                                    int w,
+                                                    int pt,
+                                                    int ph,
+                                                    int pw,
+                                                    int bs,
+                                                    int theta,
+                                                    const std::vector<int>& axes_dim) {
+        std::vector<std::vector<float>> position_ids = gen_vid_ids(t, h, w, pt, ph, pw, bs);
+        return embed_nd(position_ids, bs, theta, axes_dim);
+    }
+
+    // Builds a {row, col} position id for every cell of a grid_h x grid_w grid.
+    // The grid is walked in merge_size x merge_size tiles (tiles in row-major order,
+    // cells in row-major order inside a tile). The n-th tile visited is written to
+    // tile slot window_index[n], i.e. its cells fill output rows
+    // window_index[n] * merge_size^2 ... window_index[n] * merge_size^2 + merge_size^2 - 1.
+    // Asserted preconditions: merge_size > 0, window_index has an entry for every
+    // visited tile, and every destination row lies in [0, grid_h * grid_w).
+    __STATIC_INLINE__ std::vector<std::vector<float>> gen_qwen2vl_ids(int grid_h,
+                                                                      int grid_w,
+                                                                      int merge_size,
+                                                                      const std::vector<int>& window_index) {
+        // A non-positive step would never advance the tile loops below.
+        GGML_ASSERT(merge_size > 0);
+
+        const int cell_count     = grid_h * grid_w;
+        const int cells_per_tile = merge_size * merge_size;
+
+        std::vector<std::vector<float>> ids(cell_count, std::vector<float>(2, 0.0));
+        int index = 0;  // number of cells visited so far
+        for (int ih = 0; ih < grid_h; ih += merge_size) {
+            for (int iw = 0; iw < grid_w; iw += merge_size) {
+                // `index` grows by cells_per_tile per tile, so this is the tile's visit order.
+                // Validate it before indexing window_index to avoid an out-of-bounds read.
+                const int tile = index / cells_per_tile;
+                GGML_ASSERT(tile < static_cast<int>(window_index.size()));
+                const int inverse_index = window_index[tile];
+
+                for (int iy = 0; iy < merge_size; iy++) {
+                    for (int ix = 0; ix < merge_size; ix++) {
+                        const int i = inverse_index * cells_per_tile + index % cells_per_tile;
+
+                        // Reject negative rows as well as rows past the end.
+                        GGML_ASSERT(i >= 0 && i < cell_count);
+
+                        ids[i][0] = ih + iy;
+                        ids[i][1] = iw + ix;
+                        index++;
+                    }
+                }
+            }
+        }
+        return ids;
+    }
+
+    // Qwen2-VL positional embeddings: grid position ids from gen_qwen2vl_ids(),
+    // converted by embed_nd() with a batch size of 1.
+    __STATIC_INLINE__ std::vector<float> gen_qwen2vl_pe(int grid_h,
+                                                        int grid_w,
+                                                        int merge_size,
+                                                        const std::vector<int>& window_index,
+                                                        int theta,
+                                                        const std::vector<int>& axes_dim) {
+        std::vector<std::vector<float>> position_ids =
+            gen_qwen2vl_ids(grid_h, grid_w, merge_size, window_index);
+        return embed_nd(position_ids, 1, theta, axes_dim);
+    }
+
+    // Returns the amount that must be added to `a` to land on a multiple of `m`
+    // (0 when `a` already is one; for positive `m` the result is in [0, m)).
+    // `m` must be non-zero because it is used as a divisor.
+    __STATIC_INLINE__ int bound_mod(int a, int m) {
+        const int remainder = a % m;
+        const int shortfall = m - remainder;  // equals m when remainder is 0 ...
+        return shortfall % m;                 // ... so fold that case back to 0
+    }
+
+    // Builds the Z-Image position ids: text ids followed by image ids.
+    //  - The text length is padded up to a multiple of seq_multi_of; text token p
+    //    (0-based, per batch entry) gets {p + 1, 0, 0}.
+    //  - Image ids come from gen_flux_img_ids(), which is also handed
+    //    padded_context_len + 1 as its last argument; all-zero rows are appended so
+    //    the per-batch image length is a multiple of seq_multi_of as well.
+    // ref_latents and increase_ref_index are accepted but not used here.
+    __STATIC_INLINE__ std::vector<std::vector<float>> gen_z_image_ids(int h,
+                                                                      int w,
+                                                                      int patch_size,
+                                                                      int bs,
+                                                                      int context_len,
+                                                                      int seq_multi_of,
+                                                                      const std::vector<ggml_tensor*>& ref_latents,
+                                                                      bool increase_ref_index) {
+        const int padded_context_len = context_len + bound_mod(context_len, seq_multi_of);
+        const int txt_rows           = bs * padded_context_len;
+
+        std::vector<std::vector<float>> txt_ids(txt_rows, std::vector<float>(3, 0.0f));
+        for (int row = 0; row < txt_rows; row++) {
+            const int pos_in_seq = row % padded_context_len;  // position within its batch entry
+            txt_ids[row][0]      = pos_in_seq + 1.f;
+        }
+
+        int axes_dim_num = 3;
+        int index        = padded_context_len + 1;
+        auto img_ids     = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num, index);
+
+        // Zero-pad the image ids of every batch entry up to the next multiple of seq_multi_of.
+        const int img_pad_len = bound_mod(static_cast<int>(img_ids.size() / bs), seq_multi_of);
+        if (img_pad_len > 0) {
+            std::vector<std::vector<float>> img_pad_ids(bs * img_pad_len, std::vector<float>(3, 0.f));
+            img_ids = concat_ids(img_ids, img_pad_ids, bs);
+        }
+
+        // ignore ref_latents for now
+        return concat_ids(txt_ids, img_ids, bs);
+    }
+
+    // Z-Image positional embeddings: position ids from gen_z_image_ids(),
+    // converted by embed_nd() with the given theta and axes_dim.
+    __STATIC_INLINE__ std::vector<float> gen_z_image_pe(int h,
+                                                        int w,
+                                                        int patch_size,
+                                                        int bs,
+                                                        int context_len,
+                                                        int seq_multi_of,
+                                                        const std::vector<ggml_tensor*>& ref_latents,
+                                                        bool increase_ref_index,
+                                                        int theta,
+                                                        const std::vector<int>& axes_dim) {
+        std::vector<std::vector<float>> position_ids =
+            gen_z_image_ids(h, w, patch_size, bs, context_len, seq_multi_of, ref_latents, increase_ref_index);
+        return embed_nd(position_ids, bs, theta, axes_dim);
+    }
+
+    __STATIC_INLINE__ struct ggml_tensor* apply_rope(struct ggml_context* ctx,
+                                                     struct ggml_tensor* x,
+                                                     struct ggml_tensor* pe,
+                                                     bool rope_interleaved = true) {
+        // Rotates each (x_0, x_1) element pair of x with the matching entries of pe: out = x_0 * pe_0 + x_1 * pe_1. x: [N, L, n_head, d_head]
+        // pe: [L, d_head/2, 2, 2], [[cos, -sin], [sin, cos]]. rope_interleaved picks how pairs are formed (adjacent elements vs. first/second half of d_head). Returns [N*n_head, L, d_head].
+        int64_t d_head = x->ne[0];  // ne[] lists dims innermost first, so the shape comments (outermost first) read in reverse
+        int64_t n_head = x->ne[1];
+        int64_t L      = x->ne[2];
+        int64_t N      = x->ne[3];
+        x              = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // [N, n_head, L, d_head]
+        if (rope_interleaved) {
+            x = ggml_reshape_4d(ctx, x, 2, d_head / 2, L, n_head * N);  // [N * n_head, L, d_head/2, 2] (pair = two adjacent elements)
+            x = ggml_cont(ctx, ggml_permute(ctx, x, 3, 0, 1, 2));       // [2, N * n_head, L, d_head/2]
+        } else {
+            x = ggml_reshape_4d(ctx, x, d_head / 2, 2, L, n_head * N);       // [N * n_head, L, 2, d_head/2] (pair = element i and element i + d_head/2)
+            x = ggml_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 3, 1));  // [2, N * n_head, L, d_head/2]
+        }
+
+        int64_t offset = x->nb[2] * x->ne[2];  // bytes covered by one slice of the outermost (size-2) dim, i.e. the distance between the two pair members
+        auto x_0       = ggml_view_3d(ctx, x, x->ne[0], x->ne[1], x->ne[2], x->nb[1], x->nb[2], offset * 0);  // [N * n_head, L, d_head/2], first member of every pair
+        auto x_1       = ggml_view_3d(ctx, x, x->ne[0], x->ne[1], x->ne[2], x->nb[1], x->nb[2], offset * 1);  // [N * n_head, L, d_head/2], second member of every pair
+        x_0            = ggml_reshape_4d(ctx, x_0, 1, x_0->ne[0], x_0->ne[1], x_0->ne[2]);                    // [N * n_head, L, d_head/2, 1]
+        x_1            = ggml_reshape_4d(ctx, x_1, 1, x_1->ne[0], x_1->ne[1], x_1->ne[2]);                    // [N * n_head, L, d_head/2, 1]
+        auto temp_x    = ggml_new_tensor_4d(ctx, x_0->type, 2, x_0->ne[1], x_0->ne[2], x_0->ne[3]);  // only supplies the target shape for the two ggml_repeat calls below
+        x_0            = ggml_repeat(ctx, x_0, temp_x);  // [N * n_head, L, d_head/2, 2]
+        x_1            = ggml_repeat(ctx, x_1, temp_x);  // [N * n_head, L, d_head/2, 2]
+
+        pe        = ggml_cont(ctx, ggml_permute(ctx, pe, 3, 0, 1, 2));  // [2, L, d_head/2, 2]
+        offset    = pe->nb[2] * pe->ne[2];  // same slicing as for x: byte distance between the two outermost slices of pe
+        auto pe_0 = ggml_view_3d(ctx, pe, pe->ne[0], pe->ne[1], pe->ne[2], pe->nb[1], pe->nb[2], offset * 0);  // [L, d_head/2, 2]
+        auto pe_1 = ggml_view_3d(ctx, pe, pe->ne[0], pe->ne[1], pe->ne[2], pe->nb[1], pe->nb[2], offset * 1);  // [L, d_head/2, 2]
+
+        auto x_out = ggml_add_inplace(ctx, ggml_mul(ctx, x_0, pe_0), ggml_mul(ctx, x_1, pe_1));  // [N * n_head, L, d_head/2, 2]
+        if (!rope_interleaved) {
+            x_out = ggml_cont(ctx, ggml_permute(ctx, x_out, 1, 0, 2, 3));  // [N * n_head, L, 2, d_head/2], back to the half-split layout
+        }
+        x_out = ggml_reshape_3d(ctx, x_out, d_head, L, n_head * N);  // [N*n_head, L, d_head]
+        return x_out;
+    }
+
+    // Applies apply_rope() with pe to q and k (v is left as-is), then hands the
+    // rotated q/k together with v, mask and kv_scale to ggml_ext_attention_ext().
+    // The head count passed on is read from v->ne[1].
+    __STATIC_INLINE__ struct ggml_tensor* attention(GGMLRunnerContext* ctx,
+                                                    struct ggml_tensor* q,
+                                                    struct ggml_tensor* k,
+                                                    struct ggml_tensor* v,
+                                                    struct ggml_tensor* pe,
+                                                    struct ggml_tensor* mask,
+                                                    float kv_scale        = 1.0f,
+                                                    bool rope_interleaved = true) {
+        // q,k,v: [N, L, n_head, d_head]
+        // pe: [L, d_head/2, 2, 2]
+        // return: [N, L, n_head*d_head]
+        struct ggml_tensor* q_rot = apply_rope(ctx->ggml_ctx, q, pe, rope_interleaved);  // [N*n_head, L, d_head]
+        struct ggml_tensor* k_rot = apply_rope(ctx->ggml_ctx, k, pe, rope_interleaved);  // [N*n_head, L, d_head]
+
+        // [N, L, n_head*d_head]
+        return ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q_rot, k_rot, v, v->ne[1], mask, false, true, ctx->flash_attn_enabled, kv_scale);
+    }
+};  // namespace Rope
+
+#endif  // __ROPE_HPP__
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
--- a/Show More
+++ b/Show More