Compare commits

160 Commits

Author SHA1 Message Date
leejet
3ba6399b74 refactor: update c api 2025-07-13 18:46:48 +08:00
stduhpf
a772dca27a
feat: add Instruct-Pix2pix/CosXL-Edit support (#679)
* Instruct-p2p support

* support 2 conditionings cfg

* Do not re-encode the exact same image twice

* fixes for 2-cfg

* Fix pix2pix latent inputs + improve inpainting a bit + fix naming

* prepare for other pix2pix-like models

* Support sdxl ip2p

* fix reference image embeddings

* Support 2-cond cfg properly in cli

* fix typo in help

* Support masks for ip2p models

* unify code style

* delete unused code

* use edit mode

* add img_cond

* format code

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-07-12 15:36:45 +08:00
Wagner Bruna
6d84a30c66
feat: overriding quant types for specific tensors on model conversion (#724) 2025-07-08 00:11:38 +08:00
stduhpf
dafc32d0dd
feat: add support for f64/i64 and clip_g diffusers model (#681) 2025-07-06 23:24:55 +08:00
idostyle
225162f270
fix: mark encoder.embed_tokens.weight as unused tensor (#721) 2025-07-06 23:10:10 +08:00
leejet
b9e4718fac fix: correct --chroma-enable-t5-mask argument 2025-07-06 11:11:47 +08:00
leejet
1ce1c1adca feat: make lora graph size variable 2025-07-05 22:44:22 +08:00
stduhpf
19fbfd8639
feat: override text encoders for unet models (#682) 2025-07-04 22:19:47 +08:00
Wagner Bruna
76c72628b1
fix: fix a few typos on cli help and error messages (#714) 2025-07-04 22:15:41 +08:00
vmobilis
3bae667f3d
fix: break the line after skipping tensors in VAE (#591) 2025-07-03 22:50:42 +08:00
stduhpf
8d0819c548
fix: actually use embeddings with SDXL (#657) 2025-07-03 22:39:57 +08:00
Binozo
7a8ff2e819
docs: add golang cgo bindings to README (#635) 2025-07-02 23:19:49 +08:00
rmatif
0927e8e322
docs: add Android app to README (#647) 2025-07-02 23:18:16 +08:00
stduhpf
83ef4e44ce
feat: add T5 with llama.cpp naming convention support (#654) 2025-07-02 23:13:00 +08:00
leejet
7dac89ad75 refactor: reuse some code 2025-07-01 23:33:50 +08:00
stduhpf
9251756086
feat: add CosXL support (#683) 2025-07-01 23:13:04 +08:00
leejet
ecf5db97ae chore: fix windows build and release 2025-07-01 23:05:48 +08:00
stduhpf
ea46fd6948
fix: force zero-initialize output of tiling (#703) 2025-07-01 23:01:29 +08:00
leejet
23de7fc44a chore: avoid warnings when building on linux 2025-06-30 23:49:52 +08:00
rmatif
d42fd59464
feat: add OpenCL backend support (#680) 2025-06-30 23:32:23 +08:00
Wagner Bruna
0d8b39f0ba
fix: avoid crash on sdxl loras (#658)
Some SDXL LoRAs (e.g. PCM) can exceed 12k nodes.
2025-06-30 23:29:32 +08:00
R0CKSTAR
539b5b9374
fix: fix musa docker build (#662)
Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
2025-06-30 23:27:40 +08:00
Wagner Bruna
b1fc16b504
fix: allow resetting clip_skip to its default value (#697) 2025-06-30 23:23:21 +08:00
leejet
d6c87dce5c docs: add chroma doc 2025-06-29 23:58:15 +08:00
leejet
a28d04dd81 fix: fix the issue in parsing --chroma-disable-dit-mask 2025-06-29 23:52:36 +08:00
leejet
45d0ebb30c style: format code 2025-06-29 23:40:55 +08:00
stduhpf
b1cc40c35c
feat: add Chroma support (#696)
---------

Co-authored-by: Green Sky <Green-Sky@users.noreply.github.com>
Co-authored-by: leejet <leejet714@gmail.com>
2025-06-29 23:36:42 +08:00
leejet
884e23eeeb docs: add kontext doc 2025-06-29 10:35:31 +08:00
stduhpf
c9b5735116
feat: add FLUX.1 Kontext dev support (#707)
* Kontext support
* add edit mode

---------

Co-authored-by: leejet <leejet714@gmail.com>
2025-06-29 10:08:53 +08:00
vmobilis
10c6501bd0
fix missing argument in prototype of stbi_write_jpg (#613) 2025-03-09 12:30:10 +08:00
vmobilis
10feacf031
fix: correct img2img time (#616) 2025-03-09 12:29:08 +08:00
vmobilis
655f8a5169
fix: clang complains about needless braces (#618) 2025-03-09 12:26:41 +08:00
idostyle
d7c7a34712
fix: ModelLoader::load_tensors duplicated check (#623)
Introduced in 2b6ec97fe244d03c40aa8d70131d40bb086099b0
2025-03-09 12:23:23 +08:00
vmobilis
81556f3136
chore: silence some warnings about precision loss (#620) 2025-03-09 12:22:39 +08:00
stduhpf
3fb275a67b
fix: support sdxl embeddings (#621) 2025-03-09 12:21:23 +08:00
leejet
30b3ac8e62 fix: avoid potential dangling pointer problem 2025-03-01 16:58:26 +08:00
leejet
195d170136 sync: update ggml 2025-03-01 12:09:55 +08:00
stduhpf
f50a7f66aa
fix: fix race condition causing inconsistent value for decoder_only (#609) 2025-03-01 11:49:06 +08:00
stduhpf
85e9a12988
fix: preprocess tensor names in tensor types map (#607)
Thank you for your contribution
2025-03-01 11:48:04 +08:00
stduhpf
fbd42b6fc1
fix: fix embeddings with quantized models (#601) 2025-03-01 11:45:39 +08:00
yslai
19d876ee30
feat: implement DDIM with the "trailing" timestep spacing and TCD (#568) 2025-02-22 21:34:22 +08:00
lalala
f27f2b2aa2
docs: add missing --mask and --guidance options to print_usage (#572) 2025-02-22 21:32:37 +08:00
piallai
99609761dc
docs: fix typo in readme (#574) 2025-02-22 21:30:28 +08:00
stduhpf
69c73789fe
fix: force binary mask for inpaint models (#589)
Co-authored-by: leejet <leejet714@gmail.com>
2025-02-22 21:29:57 +08:00
Meng, Hengyu
838beb9b5e
chore: add global SYCL compile flags (#597) 2025-02-22 21:23:58 +08:00
stduhpf
f23b803a6b
fix: unapply current loras properly (#590) 2025-02-22 21:22:22 +08:00
stduhpf
1be2491dcf
feat: partial LyCORIS support (tucker decomposition for LoCon + LoHa + LoKr) (#577) 2025-02-22 21:19:26 +08:00
Matti Pulkkinen
3753223982
fix: make get_files_from_dir work with absolute paths (#598)
Co-authored-by: Matti Pulkkinen <pulkkinen@ultimatium.com>
2025-02-22 21:16:50 +08:00
R0CKSTAR
59ca2b0f16
chore: bump MUSA SDK version to rc3.1.1 (#599)
Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
2025-02-22 21:14:26 +08:00
vmobilis
d46ed5e184
feat: support JPEG compression (#583) 2025-02-05 16:18:02 +08:00
ag2s20150909
2535ad5a43
chore: fix cuda on github action (#580) 2025-02-05 16:15:41 +08:00
stduhpf
e500d95abd
fix: fix rank 1 loras (#575) 2025-02-05 16:13:17 +08:00
R0CKSTAR
a3cbdf6dcb
chore: SD_USE_CUBLAS => SD_USE_CUDA for MUSA backend (#578)
Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
2025-02-05 16:11:26 +08:00
piallai
5eb15ef4d0
docs: add CLI-GUI to list (#546) 2025-01-18 13:16:54 +08:00
stduhpf
d9b5942d98
feat: add sdxl v-pred support (#536) 2025-01-18 13:15:54 +08:00
stduhpf
587a37b2e2
fix: avoid sd2 (non-inpaint) crash on v-pred check (#537) 2025-01-18 13:13:34 +08:00
ag2s20150909
4fe83d52cf
chore: fix CUDA on GitHub Action (#567) 2025-01-18 13:12:26 +08:00
null-define
b70aaa672a
chore: fix amd rocm build (#571) 2025-01-18 13:11:39 +08:00
idostyle
27edb765a5
chore: fix CI windows release artifacts (#532) 2025-01-18 13:09:22 +08:00
leejet
dcf91f9e0f chore: change SD_CUBLAS/SD_USE_CUBLAS to SD_CUDA/SD_USE_CUDA 2024-12-28 13:27:51 +08:00
stduhpf
348a54e34a
feat: use pretty-progress for tensor loading (#516) 2024-12-28 13:14:52 +08:00
stduhpf
d50473dc49
feat: support 16 channel tae (taesd/taef1) (#527) 2024-12-28 13:13:48 +08:00
piallai
b5cc1422da
fix: fix typo for skip layers parameters (#492) 2024-12-28 13:12:08 +08:00
R0CKSTAR
5cc74d1f09
feat: support Moore Threads GPU (#529)
Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
2024-12-28 13:08:36 +08:00
stduhpf
0d9d6659a7
fix: fix metal build (#513) 2024-12-28 13:06:17 +08:00
stduhpf
8f4ab9add3
feat: support Inpaint models (#511) 2024-12-28 13:04:49 +08:00
stduhpf
cc92a6a1b3
feat: support more LoRA models (#520) 2024-12-28 12:56:44 +08:00
leejet
9578fdcc46 chore: remove rocm5.5 build temporarily 2024-11-30 14:26:29 +08:00
stduhpf
9148b980be
feat: remove type restrictions (#489) 2024-11-30 14:22:15 +08:00
stduhpf
7ce63e740c
feat: flexible model architecture for dit models (Flux & SD3) (#490)
* Refactor: wtype per tensor

* Fix default args

* refactor: fix flux

* Refactor photomaker v2 support

* unet: refactor the refactoring

* Refactor: fix controlnet and tae

* refactor: upscaler

* Refactor: fix runtime type override

* upscaler: use fp16 again

* Refactor: Flexible sd3 arch

* Refactor: Flexible Flux arch

* format code

---------

Co-authored-by: leejet <leejet714@gmail.com>
2024-11-30 14:18:53 +08:00
leejet
4570715727 fix: use ggml_nn_attention in vae 2024-11-24 18:21:31 +08:00
stduhpf
53b415f787
fix: remove default variables in c headers (#478) 2024-11-24 18:10:25 +08:00
leejet
c3eeb669cd sync: update ggml 2024-11-23 13:29:32 +08:00
leejet
b5f4932696 refactor: add some sd version helper functions 2024-11-23 13:02:44 +08:00
Erik Scholz
1c168d98a5
fix: repair flash attention support (#386)
* repair flash attention in _ext
this does not fix the currently broken fa behind the define, which is only used by VAE

Co-authored-by: FSSRepo <FSSRepo@users.noreply.github.com>

* make flash attention in the diffusion model a runtime flag
no support for sd3 or video

* remove old flash attention option and switch vae over to attn_ext

* update docs

* format code

---------

Co-authored-by: FSSRepo <FSSRepo@users.noreply.github.com>
Co-authored-by: leejet <leejet714@gmail.com>
2024-11-23 12:39:08 +08:00
William Murray
ea9b647080
docs: update readme, add python bindings (#423) 2024-11-23 11:52:33 +08:00
bssrdf
2b1bc06477
feat: add PhotoMaker Version 2 support (#358)
* first attempt at updating to photomaker v2

* continue adding photomaker v2 modules

* finishing the last few pieces for photomaker v2; id_embeds need to be done by a manual step and passed as an input file

* added a name converter for Photomaker V2; build ok

* more debugging underway

* failing at cuda mat_mul

* updated chunk_half to be more efficient; redo feedforward

* fixed a bug: carefully using ggml_view_4d to get chunks of a tensor; strides need to be recalculated or set properly; still failing at soft_max cuda op

* redo weight calculation and weight*v

* fixed a bug; now Photomaker V2 is kind of working

* add python script for face detection (Photomaker V2 needs)

* updated readme for photomaker

* fixed a bug causing PMV1 to crash; both V1 and V2 work

* fixed clean_input_ids for PMV2

* fixed a double counting bug in tokenize_with_trigger_token

* updated photomaker readme

* removed some commented code

* improved reconstructing class word free prompt

* changed reading id_embed to raw binary using existing load tensor function; this is more efficient than using model load and also makes it easier to work with sd server

* minor clean up

---------

Co-authored-by: bssrdf <bssrdf@gmail.com>
2024-11-23 11:50:14 +08:00
Flavio Bizzarri
b99cbfe4dc
docs: update README.md (#452) 2024-11-23 11:46:50 +08:00
Plamen Minev
8c7719fe9a
fix: typo in clip-g encoder arg (#472) 2024-11-23 11:46:00 +08:00
LostRuins Concedo
8f94efafa3
feat: add support for loading F8_E5M2 weights (#460) 2024-11-23 11:45:11 +08:00
fszontagh
07585448ad
docs: update readme (#462) 2024-11-23 11:42:12 +08:00
stduhpf
6ea812256e
feat: add flux 1 lite 8B (freepik) support (#474)
* Flux Lite (Freepik) support

* format code

---------

Co-authored-by: leejet <leejet714@gmail.com>
2024-11-23 11:41:30 +08:00
stduhpf
9b1d90bc23
fix: improve clip text_projection support (#397) 2024-11-23 11:19:27 +08:00
stduhpf
65fa646684
feat: add sd3.5 medium and skip layer guidance support (#451)
* mmdit-x

* add support for sd3.5 medium

* add skip layer guidance support (mmdit only)

* ignore slg if slg_scale is zero (optimization)

* init out_skip once

* slg support for flux (experimental)

* warn if version doesn't support slg

* refactor slg cli args

* set default slg_scale to 0 (oops)

* format code

---------

Co-authored-by: leejet <leejet714@gmail.com>
2024-11-23 11:15:31 +08:00
leejet
ac54e00760
feat: add sd3.5 support (#445) 2024-10-24 21:58:03 +08:00
stduhpf
14206fd488
fix: fix clip tokenizer (#383) 2024-09-02 22:31:46 +08:00
zhentaoyu
e410aeb534
sync: update ggml to fix large image generation with SYCL backend (#380)
* turn off fast-math on host in SYCL backend

Signed-off-by: zhentaoyu <zhentao.yu@intel.com>

* update ggml for sync some sycl ops

Signed-off-by: zhentaoyu <zhentao.yu@intel.com>

* update sycl readme and ggml

Signed-off-by: zhentaoyu <zhentao.yu@intel.com>

---------

Signed-off-by: zhentaoyu <zhentao.yu@intel.com>
2024-09-02 22:29:35 +08:00
leejet
58d54738e2 docs: add star history 2024-08-28 00:27:54 +08:00
leejet
4f87b232c2 docs: add Vulkan build command 2024-08-28 00:25:31 +08:00
Erik Scholz
e71ddcedad
fix: improve VAE tiling (#372)
* fix and improve: VAE tiling
- properly handle the upper left corner interpolating both x and y
- refactor out lerp
- use smootherstep to preserve more detail and spend less area blending

* actually fix vae tile merging

Co-authored-by: stduhpf <stephduh@live.fr>

* remove the now unused lerp function

---------

Co-authored-by: stduhpf <stephduh@live.fr>
2024-08-28 00:21:12 +08:00
stduhpf
f4c937cb94
fix: add some missing cli args to usage (#363) 2024-08-28 00:17:46 +08:00
Daniele
0362cc4874
fix: fix some typos (#361) 2024-08-28 00:15:37 +08:00
Yu Xing
6c88ad3fd6
fix: resolve naming conflict while llama.cpp and sd.cpp both build (#351) 2024-08-28 00:14:41 +08:00
Daniele
dc0882cdc9
feat: add exponential scheduler (#346)
* feat: added exponential scheduler

* updated README

* improved exponential formatting

---------

Co-authored-by: leejet <leejet714@gmail.com>
2024-08-28 00:13:35 +08:00
Daniele
d00c94844d
feat: add ipndm and ipndm_v samplers (#344) 2024-08-28 00:03:41 +08:00
Daniele
2d4a2f7982
feat: add GITS scheduler (#343) 2024-08-28 00:02:17 +08:00
Tim Miller
353ee93e2d
fix: add enum type to sd_type_t (#293) 2024-08-27 23:57:24 +08:00
soham
2027b16fda
feat: add vulkan backend support (#291)
* Fix includes and init vulkan the same as llama.cpp

* Add Windows Vulkan CI

* Updated ggml submodule

* support epsilon as a parameter for ggml_group_norm

---------

Co-authored-by: Cloudwalk <cloudwalk@icculus.org>
Co-authored-by: Oleg Skutte <00.00.oleg.00.00@gmail.com>
Co-authored-by: leejet <leejet714@gmail.com>
2024-08-27 23:56:09 +08:00
leejet
8847114abf fix: fix issue when applying lora 2024-08-25 22:39:39 +08:00
leejet
5c561eab31 feat: do not convert more flux tensors 2024-08-25 16:01:36 +08:00
leejet
f5997a1951 fix: do not force using f32 for some flux layers
This sometimes leads to worse results
2024-08-25 14:07:22 +08:00
leejet
1bdc767aaf feat: force using f32 for some layers 2024-08-25 13:53:16 +08:00
leejet
79c9fe9556 feat: do not convert some tensors 2024-08-25 13:37:37 +08:00
leejet
28a614769a docs: update docs/flux.md 2024-08-25 13:11:34 +08:00
leejet
c837c5d9cc style: format code 2024-08-25 00:19:37 +08:00
leejet
d08d7fa632 docs: update README.md 2024-08-24 14:38:44 +08:00
leejet
64d231f384
feat: add flux support (#356)
* add flux support

* avoid build failures in non-CUDA environments

* fix schnell support

* add k quants support

* add support for applying lora to quantized tensors

* add inplace conversion support for f8_e4m3 (#359)

in the same way it is done for bf16
like how bf16 converts losslessly to fp32,
f8_e4m3 converts losslessly to fp16

* add xlabs flux comfy converted lora support

* update docs

---------

Co-authored-by: Erik Scholz <Green-Sky@users.noreply.github.com>
2024-08-24 14:29:52 +08:00
zhentaoyu
697d000f49
feat: add SYCL Backend Support for Intel GPUs (#330)
* update ggml and add SYCL CMake option

Signed-off-by: zhentaoyu <zhentao.yu@intel.com>

* hacky CMakeLists.txt for updating ggml in cpu backend

Signed-off-by: zhentaoyu <zhentao.yu@intel.com>

* rebase and clean code

Signed-off-by: zhentaoyu <zhentao.yu@intel.com>

* add sycl in README

Signed-off-by: zhentaoyu <zhentao.yu@intel.com>

* rebase ggml commit

Signed-off-by: zhentaoyu <zhentao.yu@intel.com>

* refine README

Signed-off-by: zhentaoyu <zhentao.yu@intel.com>

* update ggml for supporting sycl tsembd op

Signed-off-by: zhentaoyu <zhentao.yu@intel.com>

---------

Signed-off-by: zhentaoyu <zhentao.yu@intel.com>
2024-08-10 13:42:50 +08:00
leejet
5b8d16aa68 docs: reorganize README.md 2024-08-03 12:06:34 +08:00
leejet
3d854f7917 sync: update ggml submodule url 2024-08-03 11:42:12 +08:00
leejet
4a6e36edc5 sync: update ggml 2024-07-28 18:30:35 +08:00
leejet
73c2176648
feat: add sd3 support (#298) 2024-07-28 15:44:08 +08:00
Phu Tran
9c51d8787f
chore: fix cuda CI (#286) 2024-06-12 23:13:24 +08:00
leejet
f9f0d4685b fix: sample_k_diffusion should be static 2024-06-10 23:04:02 +08:00
leejet
8d2050a5cf sync: update ggml 2024-06-10 22:59:36 +08:00
leejet
08f5b41956 refactor: make the sampling module more independent 2024-06-10 22:42:15 +08:00
Eugene
b6daf5c55b
fix: use PRI64 instead of %i for some log (#269) 2024-06-01 14:01:58 +08:00
leejet
be6cd1a4bf sync: update ggml 2024-06-01 13:44:09 +08:00
Justine Tunney
e1384defca
perf: make crc32 100x faster on x86-64 (#278)
This change makes checkpoints load significantly faster by optimizing
pkzip's cyclic redundancy check. This code was developed by Intel,
Google, and Mozilla. See Chromium's zlib codebase for further details.
2024-06-01 12:58:30 +08:00
Phu Tran
814280343c
chore: update artifact actions (#267) 2024-06-01 12:33:13 +08:00
leejet
1d2af5ca3f fix: set n_dims of tensor storage to 1 when it's 0 2024-05-14 23:06:52 +08:00
Grauho
ce1bcc74a6
feat: add AYS(Align Your Steps) scheduler (#241)
Added NVIDIA's new "Align Your Steps" style scheduler in accordance with their
quick start guide. Currently has handling for SD1.5, SDXL, and SVD, using the
noise levels from their paper to generate the sigma values. Can be selected
using the --schedule ays command line switch. Updates the main.cpp help
message and README to reflect this option; they now also inform the user
of the --color switch.

---------

Co-authored-by: leejet <leejet714@gmail.com>
2024-04-29 23:21:32 +08:00
Eugene
760cfaa618
fix: ignore tensors with the particular dim while loading (#233) 2024-04-29 23:04:27 +08:00
Eugene
6d16f6853e
fix: correct upscale progressbar (#232) 2024-04-29 22:59:46 +08:00
leejet
036ba9e6d8 feat: enable controlnet and photo maker for img2img mode 2024-04-14 16:36:08 +08:00
leejet
ec82d5279a refactor: remove some useless code 2024-04-14 14:04:52 +08:00
bssrdf
afea457eda
fix: support more SDXL LoRA names (#216)
* apply pmid lora only once for multiple txt2img calls

* add better support for SDXL LoRA

* fix for some sdxl lora, like lcm-lora-xl

---------

Co-authored-by: bssrdf <bssrdf@gmail.com>
Co-authored-by: leejet <leejet714@gmail.com>
2024-04-06 17:12:03 +08:00
null-define
646e77638e
fix: fix tiles_ctx not freed in sd_tiling (#219) 2024-04-06 16:51:48 +08:00
leejet
3ac48ea1a7 fix: use static implementation of stb_image_resize 2024-04-06 16:37:08 +08:00
Phu Tran
607e39489f
docs: add Jellybox as UI using sd.cpp (#214) 2024-04-02 12:31:54 +08:00
delldu
ccae95aec9
feat: support RGBA image input of flexible size (#212)
* Support png images and resize images to multiples of 64 pixels in img2img mode

* update the error information

---------

Co-authored-by: leejet <leejet714@gmail.com>
2024-04-02 12:29:18 +08:00
bssrdf
90e9178d18
fix: apply pmid lora only once for multiple txt2img calls (#208)
Co-authored-by: bssrdf <bssrdf@gmail.com>
2024-04-02 11:58:29 +08:00
Grauho
48bcce493f
fix: avoid double free and fix sdxl lora naming conversion
* Fixed a double free issue when running multiple backends on the CPU, e.g. CLIP
and the primary backend, as this would result in the *_backend pointers both
pointing to the same thing resulting in a segfault when calling the
StableDiffusionGGML destructor.

* Improve logging to allow for a color switch on the command line interface.
Changed the base log_printf function to not bake the log level directly into
the log buffer, as that information is already passed to the logging function via
the level parameter, and it's easier to add it there than to strip it out.

* Added a fix for certain SDXL LoRAs that don't seem to follow the expected
naming convention, converts over the tensor name during the LoRA model
loading. Added some logging of useful LoRA loading information. Had to
increase the base size of the GGML graph as the existing size results in an
insufficient graph memory error when using SDXL LoRAs.

* small fixes

---------

Co-authored-by: leejet <leejet714@gmail.com>
2024-03-20 22:00:22 +08:00
bssrdf
a469688e30
feat: add TencentARC PhotoMaker support (#179)
* first efforts at implementing photomaker; lots more to do

* added PhotoMakerIDEncoder model in SD

* fixed some bugs; now photomaker model weights can be loaded into their tensor buffers

* added input id image loading

* added preprocessing of input id images

* finished get_num_tensors

* fixed a bug in remove_duplicates

* add a get_learned_condition_with_trigger function to do photomaker stuff

* add a convert_token_to_id function for photomaker to extract trigger word's token id

* making progress; need to implement tokenizer decoder

* making more progress; finishing vision model forward

* debugging vision_model outputs

* corrected clip vision model output

* continue making progress in id fusion process

* finished stacked id embedding; to be tested

* remove garbage file

* debugging graph compute

* more progress; now alloc buffer failed

* fixed wtype issue; only 1 input image works because of an issue with the transformer when batch size > 1 (to be investigated)

* added delayed subject conditioning; now photomaker runs and generates images

* fixed stat_merge_step

* added photomaker lora model (to be tested)

* reworked pmid lora

* finished applying pmid lora; to be tested

* finalized pmid lora

* add a few print tensor; tweak in sample again

* small tweak; still not getting ID faces

* fixed a bug in FuseBlock forward; also removed the diag_mask op in the vision transformer; getting better results

* disable pmid lora apply for now; 1 input image seems working; > 1 not working

* turn pmid lora apply back on

* fixed a decode bug

* fixed a bug in ggml's conv_2d, and now > 1 input images working

* add style_ratio as a cli param; reworked encode with trigger for attention weights

* merge commit fixing lora free param buffer error

* change default style ratio to 10%

* added an option to offload vae decoder to CPU for mem-limited gpus

* removing the image normalization step seems to make ID fidelity much higher

* revert default style ratio back to 20%

* added an option for normalizing input ID images; cleaned up debugging code

* more clean up

* fixed bugs; now failed with cuda error; likely out-of-mem on GPU

* free pmid model params when required

* photomaker working properly now after merging and adapting to GGMLBlock API

* remove tensor renaming;  fixing names in the photomaker model file

* updated README.md to include instructions and notes for running PhotoMaker

* a bit clean up

* remove -DGGML_CUDA_FORCE_MMQ; more clean up and README update

* add input image requirement in README

* bring back freeing pmid lora params buffer; simply pooled output of CLIPvision

* remove MultiheadAttention2; customized MultiheadAttention

* added a WIN32 get_files_from_dir; turn off Photomaker if receiving no input images

* update docs

* fix ci error

* make stable-diffusion.h a pure c header file

This reverts commit 27887b630db6a92f269f0aef8de9bc9832ab50a9.

* fix ci error

* format code

* reuse get_learned_condition

* reuse pad_tokens

* reuse CLIPVisionModel

* reuse LoraModel

* add --clip-on-cpu

* fix lora name conversion for SDXL

---------

Co-authored-by: bssrdf <bssrdf@gmail.com>
Co-authored-by: leejet <leejet714@gmail.com>
2024-03-12 23:15:17 +08:00
leejet
61980171a1 sync: update ggml 2024-03-10 17:23:11 +08:00
Cyberhan123
583cc5bba2
docs: add binding (#189) 2024-03-03 13:27:07 +08:00
Phu Tran
1ce9470f27
fix: fix building shared library (#188) 2024-03-03 13:24:59 +08:00
leejet
a65c410463 sync: update ggml 2024-03-02 19:49:41 +08:00
leejet
a17ae7b7d2 sync: update ggml 2024-03-02 19:23:11 +08:00
leejet
e1b37b4ef6 fix: update ggml submodule url 2024-03-02 17:34:08 +08:00
fszontagh
7be65faa7c
feat: add progress callback (#170) 2024-03-02 17:28:41 +08:00
Phu Tran
d164236b2a
fix: fix metal build issues (#183) 2024-03-02 17:17:57 +08:00
leejet
ef5c3f7401 feat: add support for prompts longer than 77 tokens 2024-03-02 17:13:18 +08:00
Cyberhan123
b7870a0f89
chore: improve ci (#150)
---------

Co-authored-by: leejet <leejet714@gmail.com>
2024-02-26 22:01:34 +08:00
leejet
4a8190405a fix: fix the issue with dynamic linking 2024-02-25 21:39:01 +08:00
leejet
730585d515
sync: update ggml (#180) 2024-02-25 21:11:01 +08:00
Sean Bailey
193fb620b1
feat: add capability to repeatedly run the upscaler in a row (#174)
* Add in upscale repeater logic

---------

Co-authored-by: leejet <leejet714@gmail.com>
2024-02-24 21:31:01 +08:00
leejet
b6368868d9
feat: introduce GGMLBlock and implement SVD(Broken) (#159)
* introduce GGMLBlock and implement SVD(Broken)

* add sdxl vae warning
2024-02-24 20:06:39 +08:00
leejet
349439f239 style: format code 2024-01-29 23:05:18 +08:00
Steward Garcia
36ec16ac99
feat: Control Net support + Textual Inversion (embeddings) (#131)
* add controlnet to pipeline

* add cli params

* control strength cli param

* cli param keep controlnet in cpu

* add Textual Inversion

* add canny preprocessor

* refactor: change ggml_type_sizef to ggml_row_size

* process hint once time

* ignore the embedding name case

---------

Co-authored-by: leejet <leejet714@gmail.com>
2024-01-29 22:38:51 +08:00
旺旺碎冰冰
c6071fa82f
feat: add hipBlas support (#94) 2024-01-14 11:53:42 +08:00
leejet
5c614e4bc2
feat: add convert api (#142) 2024-01-14 11:43:24 +08:00
leejet
2b6ec97fe2
sync: update ggml (#134) 2024-01-05 23:18:41 +08:00
leejet
db382348cc fix: change GGML_MAX_NAME to 128 2024-01-03 22:42:42 +08:00
leejet
7cb41b190f fix: avoid encountering 'std::set undefined' in some environments 2024-01-02 22:37:01 +08:00
leejet
7fb8a51318 chore: make SD_BUILD_DLL visible only to SD_LIB 2024-01-02 22:31:40 +08:00
leejet
2c5f3fc53a chore: add support for building shared library 2024-01-02 21:05:44 +08:00
Erik Scholz
f2e4d9793b
fix: avoid some memory leaks (#136)
---------

Co-authored-by: leejet <leejet714@gmail.com>
2024-01-01 23:27:29 +08:00
Erik Scholz
4a5e7b58e2
fix: never use a log message as a format string (#135) 2024-01-01 20:43:47 +08:00
leejet
2e79a82f85
refactor: reorganize code and use c api (#133) 2024-01-01 16:22:18 +08:00
86 changed files with 2446357 additions and 4947 deletions

.clang-format

@@ -3,7 +3,6 @@ UseTab: Never
IndentWidth: 4
TabWidth: 4
AllowShortIfStatementsOnASingleLine: false
IndentCaseLabels: false
ColumnLimit: 0
AccessModifierOffset: -4
NamespaceIndentation: All

.github/workflows/build.yml

@@ -4,17 +4,36 @@ on:
workflow_dispatch: # allows manual triggering
inputs:
create_release:
description: 'Create new release'
description: "Create new release"
required: true
type: boolean
push:
branches:
- master
- ci
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
paths:
[
".github/workflows/**",
"**/CMakeLists.txt",
"**/Makefile",
"**/*.h",
"**/*.hpp",
"**/*.c",
"**/*.cpp",
"**/*.cu",
]
pull_request:
types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
paths:
[
"**/CMakeLists.txt",
"**/Makefile",
"**/*.h",
"**/*.hpp",
"**/*.c",
"**/*.cpp",
"**/*.cu",
]
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
@@ -30,7 +49,6 @@ jobs:
with:
submodules: recursive
- name: Dependencies
id: depends
run: |
@@ -42,14 +60,37 @@ jobs:
run: |
mkdir build
cd build
cmake ..
cmake .. -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON
cmake --build . --config Release
#- name: Test
#id: cmake_test
#run: |
#cd build
#ctest --verbose --timeout 900
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }}
uses: pr-mpt/actions-commit-hash@v2
- name: Fetch system info
id: system-info
run: |
echo "CPU_ARCH=`uname -m`" >> "$GITHUB_OUTPUT"
echo "OS_NAME=`lsb_release -s -i`" >> "$GITHUB_OUTPUT"
echo "OS_VERSION=`lsb_release -s -r`" >> "$GITHUB_OUTPUT"
echo "OS_TYPE=`uname -s`" >> "$GITHUB_OUTPUT"
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp ggml/LICENSE ./build/bin/ggml.txt
cp LICENSE ./build/bin/stable-diffusion.cpp.txt
zip -j sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip ./build/bin/*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
macOS-latest-cmake:
runs-on: macos-latest
@@ -63,9 +104,8 @@ jobs:
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
brew install zip
- name: Build
id: cmake_build
@@ -73,30 +113,61 @@ jobs:
sysctl -a
mkdir build
cd build
cmake ..
cmake .. -DGGML_AVX2=ON -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" -DSD_BUILD_SHARED_LIBS=ON
cmake --build . --config Release
#- name: Test
#id: cmake_test
#run: |
#cd build
#ctest --verbose --timeout 900
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }}
uses: pr-mpt/actions-commit-hash@v2
- name: Fetch system info
id: system-info
run: |
echo "CPU_ARCH=`uname -m`" >> "$GITHUB_OUTPUT"
echo "OS_NAME=`sw_vers -productName`" >> "$GITHUB_OUTPUT"
echo "OS_VERSION=`sw_vers -productVersion`" >> "$GITHUB_OUTPUT"
echo "OS_TYPE=`uname -s`" >> "$GITHUB_OUTPUT"
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp ggml/LICENSE ./build/bin/ggml.txt
cp LICENSE ./build/bin/stable-diffusion.cpp.txt
zip -j sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip ./build/bin/*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
windows-latest-cmake:
runs-on: windows-latest
runs-on: windows-2025
env:
VULKAN_VERSION: 1.3.261.1
strategy:
matrix:
include:
- build: 'noavx'
defines: '-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
- build: 'avx2'
defines: '-DGGML_AVX2=ON'
- build: 'avx'
defines: '-DGGML_AVX2=OFF'
- build: 'avx512'
defines: '-DGGML_AVX512=ON'
- build: "noavx"
defines: "-DGGML_NATIVE=OFF -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DSD_BUILD_SHARED_LIBS=ON"
- build: "avx2"
defines: "-DGGML_NATIVE=OFF -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
- build: "avx"
defines: "-DGGML_NATIVE=OFF -DGGML_AVX=ON -DGGML_AVX2=OFF -DSD_BUILD_SHARED_LIBS=ON"
- build: "avx512"
defines: "-DGGML_NATIVE=OFF -DGGML_AVX512=ON -DGGML_AVX=ON -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
- build: "cuda12"
defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES=90;89;80;75"
# - build: "rocm5.5"
# defines: '-G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1100;gfx1102;gfx1030" -DSD_BUILD_SHARED_LIBS=ON'
- build: 'vulkan'
defines: "-DSD_VULKAN=ON -DSD_BUILD_SHARED_LIBS=ON"
steps:
- name: Clone
id: checkout
@@ -104,6 +175,37 @@
with:
submodules: recursive
- name: Install cuda-toolkit
id: cuda-toolkit
if: ${{ matrix.build == 'cuda12' }}
uses: Jimver/cuda-toolkit@v0.2.19
with:
cuda: "12.6.2"
method: "network"
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
- name: Install rocm-toolkit
id: rocm-toolkit
if: ${{ matrix.build == 'rocm5.5' }}
uses: Cyberhan123/rocm-toolkit@v0.1.0
with:
rocm: "5.5.0"
- name: Install Ninja
id: install-ninja
if: ${{ matrix.build == 'rocm5.5' }}
uses: urkle/action-get-ninja@v1
with:
version: 1.11.1
- name: Install Vulkan SDK
id: get_vulkan
if: ${{ matrix.build == 'vulkan' }}
run: |
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
- name: Build
id: cmake_build
run: |
@@ -125,12 +227,6 @@
& $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main
.\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO"
#- name: Test
#id: cmake_test
#run: |
#cd build
#ctest -C Release --verbose --timeout 900
- name: Get commit hash
id: commit
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -140,14 +236,44 @@
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
Copy-Item ggml/LICENSE .\build\bin\Release\ggml.txt
Copy-Item LICENSE .\build\bin\Release\stable-diffusion.cpp.txt
7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
$filePath = ".\build\bin\Release\*"
if (Test-Path $filePath) {
echo "Exists at path $filePath"
Copy-Item ggml/LICENSE .\build\bin\Release\ggml.txt
Copy-Item LICENSE .\build\bin\Release\stable-diffusion.cpp.txt
} elseif (Test-Path ".\build\bin\stable-diffusion.dll") {
$filePath = ".\build\bin\*"
echo "Exists at path $filePath"
Copy-Item ggml/LICENSE .\build\bin\ggml.txt
Copy-Item LICENSE .\build\bin\stable-diffusion.cpp.txt
} else {
ls .\build\bin
throw "Can't find stable-diffusion.dll"
}
7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip $filePath
- name: Copy and pack Cuda runtime
id: pack_cuda_runtime
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' && matrix.build == 'cuda12' ) || github.event.inputs.create_release == 'true' }}
run: |
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
$dst='.\build\bin\cudart\'
robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
7z a cudart-sd-bin-win-cu12-x64.zip $dst\*
- name: Upload Cuda runtime
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' && matrix.build == 'cuda12' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
name: sd-cudart-sd-bin-win-cu12-x64.zip
path: |
cudart-sd-bin-win-cu12-x64.zip
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
@@ -164,7 +290,11 @@ jobs:
steps:
- name: Download artifacts
id: download-artifact
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
with:
path: ./artifact
pattern: sd-*
merge-multiple: true
- name: Get commit hash
id: commit

.gitignore (vendored, 1 change)

@@ -10,5 +10,4 @@ test/
*.gguf
output*.png
models*
!taesd-model.gguf
*.log

.gitmodules (vendored, 2 changes)

@@ -1,3 +1,3 @@
[submodule "ggml"]
path = ggml
url = https://github.com/leejet/ggml.git
url = https://github.com/ggerganov/ggml.git
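
After a submodule URL change like this, existing checkouts usually need a resync before the submodule pulls from the new remote; a minimal sketch using standard git commands (not part of this diff):

```sh
# Point the local submodule config at the new URL, then re-fetch
git submodule sync
git submodule update --init --recursive
```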

CMakeLists.txt

@@ -24,44 +24,111 @@ endif()
# general
#option(SD_BUILD_TESTS "sd: build tests" ${SD_STANDALONE})
option(SD_BUILD_EXAMPLES "sd: build examples" ${SD_STANDALONE})
option(SD_CUBLAS "sd: cuda backend" OFF)
option(SD_CUDA "sd: cuda backend" OFF)
option(SD_HIPBLAS "sd: rocm backend" OFF)
option(SD_METAL "sd: metal backend" OFF)
option(SD_FLASH_ATTN "sd: use flash attention for x4 less memory usage" OFF)
option(SD_VULKAN "sd: vulkan backend" OFF)
option(SD_OPENCL "sd: opencl backend" OFF)
option(SD_SYCL "sd: sycl backend" OFF)
option(SD_MUSA "sd: musa backend" OFF)
option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
option(BUILD_SHARED_LIBS "sd: build shared libs" OFF)
option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
#option(SD_BUILD_SERVER "sd: build server example" ON)
if(SD_CUBLAS)
message("Use CUBLAS as backend stable-diffusion")
set(GGML_CUBLAS ON)
add_definitions(-DSD_USE_CUBLAS)
if(SD_CUDA)
message("-- Use CUDA as backend stable-diffusion")
set(GGML_CUDA ON)
add_definitions(-DSD_USE_CUDA)
endif()
if(SD_METAL)
message("-- Use Metal as backend stable-diffusion")
set(GGML_METAL ON)
add_definitions(-DSD_USE_METAL)
endif()
if (SD_VULKAN)
message("-- Use Vulkan as backend stable-diffusion")
set(GGML_VULKAN ON)
add_definitions(-DSD_USE_VULKAN)
endif ()
if (SD_OPENCL)
message("-- Use OpenCL as backend stable-diffusion")
set(GGML_OPENCL ON)
add_definitions(-DSD_USE_OPENCL)
endif ()
if (SD_HIPBLAS)
message("-- Use HIPBLAS as backend stable-diffusion")
set(GGML_HIP ON)
add_definitions(-DSD_USE_CUDA)
if(SD_FAST_SOFTMAX)
set(GGML_CUDA_FAST_SOFTMAX ON)
endif()
endif ()
if(SD_MUSA)
message("-- Use MUSA as backend stable-diffusion")
set(GGML_MUSA ON)
add_definitions(-DSD_USE_CUDA)
if(SD_FAST_SOFTMAX)
set(GGML_CUDA_FAST_SOFTMAX ON)
endif()
endif()
if(SD_METAL)
message("Use Metal as backend stable-diffusion")
set(GGML_METAL ON)
add_definitions(-DSD_USE_METAL)
set(SD_LIB stable-diffusion)
file(GLOB SD_LIB_SOURCES
"*.h"
"*.cpp"
"*.hpp"
)
# we can get only one shared lib
if(SD_BUILD_SHARED_LIBS)
message("-- Build shared library")
message(${SD_LIB_SOURCES})
set(BUILD_SHARED_LIBS OFF)
add_library(${SD_LIB} SHARED ${SD_LIB_SOURCES})
add_definitions(-DSD_BUILD_SHARED_LIB)
target_compile_definitions(${SD_LIB} PRIVATE -DSD_BUILD_DLL)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
else()
message("-- Build static library")
set(BUILD_SHARED_LIBS OFF)
add_library(${SD_LIB} STATIC ${SD_LIB_SOURCES})
endif()
if(SD_FLASH_ATTN)
message("Use Flash Attention for memory optimization")
add_definitions(-DSD_USE_FLASH_ATTENTION)
if(SD_SYCL)
message("-- Use SYCL as backend stable-diffusion")
set(GGML_SYCL ON)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
add_definitions(-DSD_USE_SYCL)
# disable fast-math on host, see:
# https://www.intel.com/content/www/us/en/docs/cpp-compiler/developer-guide-reference/2021-10/fp-model-fp.html
if (WIN32)
set(SYCL_COMPILE_OPTIONS /fp:precise)
else()
set(SYCL_COMPILE_OPTIONS -fp-model=precise)
endif()
message("-- Turn off fast-math for host in SYCL backend")
target_compile_options(${SD_LIB} PRIVATE ${SYCL_COMPILE_OPTIONS})
endif()
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
# see https://github.com/ggerganov/ggml/pull/682
add_definitions(-DGGML_MAX_NAME=128)
# deps
add_subdirectory(ggml)
# Only add ggml if it hasn't been added yet
if (NOT TARGET ggml)
add_subdirectory(ggml)
endif()
add_subdirectory(thirdparty)
set(SD_LIB stable-diffusion)
add_library(${SD_LIB} stable-diffusion.h stable-diffusion.cpp model.h model.cpp util.h util.cpp upscaler.cpp
ggml_extend.hpp clip.hpp common.hpp unet.hpp tae.hpp esrgan.hpp lora.hpp denoiser.hpp rng.hpp rng_philox.hpp)
target_link_libraries(${SD_LIB} PUBLIC ggml zip)
target_include_directories(${SD_LIB} PUBLIC . thirdparty)
target_compile_features(${SD_LIB} PUBLIC cxx_std_11)
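
As a usage note for the options above, a typical configure step might look like this (a sketch; the flag names come from this file, the rest is an assumption):

```sh
# Hypothetical configure: CUDA backend plus a shared library build
mkdir build && cd build
cmake .. -DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON
cmake --build . --config Release
```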

Dockerfile.musa (new file, 22 lines)

@@ -0,0 +1,22 @@
ARG MUSA_VERSION=rc3.1.1
FROM mthreads/musa:${MUSA_VERSION}-devel-ubuntu22.04 as build
RUN apt-get update && apt-get install -y ccache cmake git
WORKDIR /sd.cpp
COPY . .
RUN mkdir build && cd build && \
cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ \
-DCMAKE_C_FLAGS="${CMAKE_C_FLAGS} -fopenmp -I/usr/lib/llvm-14/lib/clang/14.0.0/include -L/usr/lib/llvm-14/lib" \
-DCMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fopenmp -I/usr/lib/llvm-14/lib/clang/14.0.0/include -L/usr/lib/llvm-14/lib" \
-DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release && \
cmake --build . --config Release
FROM mthreads/musa:${MUSA_VERSION}-runtime-ubuntu22.04 as runtime
COPY --from=build /sd.cpp/build/bin/sd /sd
ENTRYPOINT [ "/sd" ]
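
A sketch of how this image might be built and run (the tag is arbitrary; Moore Threads GPU passthrough flags are omitted because they depend on your container toolkit):

```sh
# Hypothetical: build the MUSA image and run txt2img with a mounted models dir
docker build -f Dockerfile.musa -t sd-cpp-musa .
docker run --rm -v "$(pwd)/models:/models" sd-cpp-musa \
  -m /models/sd-v1-4.ckpt -p "a lovely cat" -o /models/output.png
```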

README.md (346 changes)

@@ -1,28 +1,31 @@
<p align="center">
<img src="./assets/a%20lovely%20cat.png" width="256x">
<img src="./assets/cat_with_sd_cpp_42.png" width="360x">
</p>
# stable-diffusion.cpp
Inference of [Stable Diffusion](https://github.com/CompVis/stable-diffusion) in pure C/C++
Inference of Stable Diffusion and Flux in pure C/C++
## Features
- Plain C/C++ implementation based on [ggml](https://github.com/ggerganov/ggml), working in the same way as [llama.cpp](https://github.com/ggerganov/llama.cpp)
- Super lightweight and without external dependencies
- SD1.x, SD2.x and SDXL support
- SD1.x, SD2.x, SDXL and [SD3/SD3.5](./docs/sd3.md) support
- !!!The VAE in SDXL encounters NaN issues under FP16, but unfortunately, the ggml_conv_2d only operates under FP16. Hence, a parameter is needed to specify the VAE that has fixed the FP16 NaN issue. You can find it here: [SDXL VAE FP16 Fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors).
- [Flux-dev/Flux-schnell Support](./docs/flux.md)
- [FLUX.1-Kontext-dev](./docs/kontext.md)
- [Chroma](./docs/chroma.md)
- [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo) and [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo) support
- [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
- 16-bit, 32-bit float support
- 4-bit, 5-bit and 8-bit integer quantization support
- 2-bit, 3-bit, 4-bit, 5-bit and 8-bit integer quantization support
- Accelerated memory-efficient CPU inference
- Only requires ~2.3GB when using txt2img with fp16 precision to generate a 512x512 image; enabling Flash Attention reduces this to ~1.8GB.
- AVX, AVX2 and AVX512 support for x86 architectures
- Full CUDA and Metal backend for GPU acceleration.
- Full CUDA, Metal, Vulkan, OpenCL and SYCL backend for GPU acceleration.
- Can load ckpt, safetensors and diffusers models/checkpoints. Standalone VAEs models
- No need to convert to `.ggml` or `.gguf` anymore!
- Flash Attention for memory usage optimization (only cpu for now)
- Flash Attention for memory usage optimization
- Original `txt2img` and `img2img` mode
- Negative prompt
- [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) style tokenizer (not all the features, only token weighting for now)
@@ -31,6 +34,7 @@ Inference of [Stable Diffusion](https://github.com/CompVis/stable-diffusion) in
- Faster and memory efficient latent decoding with [TAESD](https://github.com/madebyollin/taesd)
- Upscale images generated with [ESRGAN](https://github.com/xinntao/Real-ESRGAN)
- VAE tiling processing to reduce memory usage
- Control Net support with SD 1.5
- Sampling method
- `Euler A`
- `Euler`
@@ -46,21 +50,21 @@ Inference of [Stable Diffusion](https://github.com/CompVis/stable-diffusion) in
- Linux
- Mac OS
- Windows
- Android (via Termux)
- Android (via Termux, [Local Diffusion](https://github.com/rmatif/Local-Diffusion))
### TODO
- [ ] More sampling methods
- [ ] Make inference faster
- The current implementation of ggml_conv_2d is slow and has high memory usage
- Implement Winograd Convolution 2D for 3x3 kernel filtering
- [ ] Continuing to reduce memory usage (quantizing the weights of ggml_conv_2d)
- [ ] Implement Textual Inversion (embeddings)
- [ ] Implement Inpainting support
- [ ] k-quants support
## Usage
For most users, you can download the built executable program from the latest [release](https://github.com/leejet/stable-diffusion.cpp/releases/latest).
If the built product does not meet your requirements, you can choose to build it manually.
### Get the Code
```
@@ -83,11 +87,13 @@ git submodule update
- Stable Diffusion v1.4 from https://huggingface.co/CompVis/stable-diffusion-v-1-4-original
- Stable Diffusion v1.5 from https://huggingface.co/runwayml/stable-diffusion-v1-5
- Stable Diffuison v2.1 from https://huggingface.co/stabilityai/stable-diffusion-2-1
- Stable Diffusion 3 2B from https://huggingface.co/stabilityai/stable-diffusion-3-medium
```shell
curl -L -O https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt
# curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
# curl -L -O https://huggingface.co/stabilityai/stable-diffusion-2-1/resolve/main/v2-1_768-nonema-pruned.safetensors
# curl -L -O https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips_t5xxlfp16.safetensors
```
### Build
@@ -108,12 +114,31 @@ cmake .. -DGGML_OPENBLAS=ON
cmake --build . --config Release
```
##### Using CUBLAS
##### Using CUDA
This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). Recommended to have at least 4 GB of VRAM.
```
cmake .. -DSD_CUBLAS=ON
cmake .. -DSD_CUDA=ON
cmake --build . --config Release
```
##### Using HipBLAS
This provides BLAS acceleration using the ROCm cores of your AMD GPU. Make sure to have the ROCm toolkit installed.
Windows users: refer to [docs/hipBLAS_on_Windows.md](docs%2FhipBLAS_on_Windows.md) for a comprehensive guide.
```
cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=gfx1100
cmake --build . --config Release
```
##### Using MUSA
This provides BLAS acceleration using the MUSA cores of your Moore Threads GPU. Make sure to have the MUSA toolkit installed.
```bash
cmake .. -DCMAKE_C_COMPILER=/usr/local/musa/bin/clang -DCMAKE_CXX_COMPILER=/usr/local/musa/bin/clang++ -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release
cmake --build . --config Release
```
@@ -126,15 +151,130 @@ cmake .. -DSD_METAL=ON
cmake --build . --config Release
```
### Using Flash Attention
##### Using Vulkan
Enabling flash attention reduces memory usage by at least 400 MB. At the moment, it is not supported when CUBLAS is enabled because the kernel implementation is missing.
Install Vulkan SDK from https://www.lunarg.com/vulkan-sdk/.
```
cmake .. -DSD_FLASH_ATTN=ON
cmake .. -DSD_VULKAN=ON
cmake --build . --config Release
```
##### Using OpenCL (for Adreno GPU)
Currently, it supports only Adreno GPUs and is primarily optimized for the Q4_0 type.
To build for Windows ARM, please refer to [Windows 11 Arm64](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/OPENCL.md#windows-11-arm64).
Building for Android:
Android NDK:
Download and install the Android NDK from the [official Android developer site](https://developer.android.com/ndk/downloads).
Setup OpenCL Dependencies for NDK:
You need to provide OpenCL headers and the ICD loader library to your NDK sysroot.
* OpenCL Headers:
```bash
# In a temporary working directory
git clone https://github.com/KhronosGroup/OpenCL-Headers
cd OpenCL-Headers
# Replace <YOUR_NDK_PATH> with your actual NDK installation path
# e.g., cp -r CL /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
sudo cp -r CL <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
cd ..
```
* OpenCL ICD Loader:
```bash
# In the same temporary working directory
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
cd OpenCL-ICD-Loader
mkdir build_ndk && cd build_ndk
# Replace <YOUR_NDK_PATH> in the CMAKE_TOOLCHAIN_FILE and OPENCL_ICD_LOADER_HEADERS_DIR
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
-DOPENCL_ICD_LOADER_HEADERS_DIR=<YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=24 \
-DANDROID_STL=c++_shared
ninja
# Replace <YOUR_NDK_PATH>
# e.g., cp libOpenCL.so /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
sudo cp libOpenCL.so <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
cd ../..
```
Build `stable-diffusion.cpp` for Android with OpenCL:
```bash
mkdir build-android && cd build-android
# Replace <YOUR_NDK_PATH> with your actual NDK installation path
# e.g., -DCMAKE_TOOLCHAIN_FILE=/path/to/android-ndk-r26c/build/cmake/android.toolchain.cmake
cmake .. -G Ninja \
-DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=android-28 \
-DGGML_OPENMP=OFF \
-DSD_OPENCL=ON
ninja
```
*(Note: Don't forget to include `LD_LIBRARY_PATH=/vendor/lib64` in your command line before running the binary)*
##### Using SYCL
Using SYCL makes the computation run on the Intel GPU. Please make sure you have installed the related driver and [Intel® oneAPI Base toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) before starting. For more details and steps, refer to [llama.cpp SYCL backend](https://github.com/ggerganov/llama.cpp/blob/master/docs/backend/SYCL.md#linux).
```
# Export relevant ENV variables
source /opt/intel/oneapi/setvars.sh
# Option 1: Use FP32 (recommended for better performance in most cases)
cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
# Option 2: Use FP16
cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
cmake --build . --config Release
```
Example of text2img by using SYCL backend:
- download `stable-diffusion` model weight, refer to [download-weight](#download-weights).
- run `./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors --cfg-scale 5 --steps 30 --sampling-method euler -H 1024 -W 1024 --seed 42 -p "fantasy medieval village world inside a glass sphere , high detail, fantasy, realistic, light effect, hyper detail, volumetric lighting, cinematic, macro, depth of field, blur, red light and clouds from the back, highly detailed epic cinematic concept art cg render made in maya, blender and photoshop, octane render, excellent composition, dynamic dramatic cinematic lighting, aesthetic, very inspirational, world inside a glass sphere by james gurney by artgerm with james jean, joe fenton and tristan eaton by ross tran, fine details, 4k resolution"`
<p align="center">
<img src="./assets/sycl_sd3_output.png" width="360x">
</p>
##### Using Flash Attention
Enabling flash attention for the diffusion model reduces memory usage by a model-dependent amount, e.g.:
- Flux 768x768: ~600 MB
- SD2 768x768: ~1400 MB

For most backends it slows things down, but for CUDA it generally speeds things up as well.
At the moment, it is only supported for some models and some backends (such as CPU, CUDA/ROCm, and Metal).
Run by adding `--diffusion-fa` to the arguments and watch for:
```
[INFO ] stable-diffusion.cpp:312 - Using flash attention in the diffusion model
```
and the compute buffer shrink in the debug log:
```
[DEBUG] ggml_extend.hpp:1004 - flux compute buffer size: 650.00 MB(VRAM)
```
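
Putting this together, a minimal invocation might look like the following (a sketch; the model path and prompt are placeholders):

```sh
# Hypothetical example: enable flash attention in the diffusion model
./bin/sd -m ../models/sd-v1-4.ckpt -p "a lovely cat" --diffusion-fa -v
```

If the backend supports it, the INFO line above should appear and the compute buffer size in the debug log should shrink.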
### Run
```
@@ -142,53 +282,83 @@ usage: ./bin/sd [arguments]
arguments:
-h, --help show this help message and exit
-M, --mode [txt2img or img2img] generation mode (default: txt2img)
-t, --threads N number of threads to use during computation (default: -1).
-M, --mode [MODE] run mode, one of: [img_gen, convert], default: img_gen
-t, --threads N number of threads to use during computation (default: -1)
If threads <= 0, then threads will be set to the number of CPU physical cores
-m, --model [MODEL] path to model
-m, --model [MODEL] path to full model
--diffusion-model path to the standalone diffusion model
--clip_l path to the clip-l text encoder
--clip_g path to the clip-g text encoder
--t5xxl path to the t5xxl text encoder
--vae [VAE] path to vae
--taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
--upscale-model [ESRGAN_PATH] path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now.
--type [TYPE] weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0)
If not specified, the default is the type of the weight file.
--control-net [CONTROL_PATH] path to control net model
--embd-dir [EMBEDDING_PATH] path to embeddings
--stacked-id-embd-dir [DIR] path to PHOTOMAKER stacked id embeddings
--input-id-images-dir [DIR] path to PHOTOMAKER input id images dir
--normalize-input normalize PHOTOMAKER input id images
--upscale-model [ESRGAN_PATH] path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now
--upscale-repeats Run the ESRGAN upscaler this many times (default 1)
--type [TYPE] weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)
If not specified, the default is the type of the weight file
--tensor-type-rules [EXPRESSION] weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
--lora-model-dir [DIR] lora model directory
-i, --init-img [IMAGE] path to the input image, required by img2img
--mask [MASK] path to the mask image, required by img2img with mask
--control-image [IMAGE] path to image condition, control net
-r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times)
-o, --output OUTPUT path to write result image to (default: ./output.png)
-p, --prompt [PROMPT] the prompt to render
-n, --negative-prompt PROMPT the negative prompt (default: "")
--cfg-scale SCALE unconditional guidance scale: (default: 7.0)
--img-cfg-scale SCALE image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
--guidance SCALE distilled guidance scale for models with guidance input (default: 3.5)
--slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0)
0 means disabled, a value of 2.5 is nice for sd3.5 medium
--eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0)
--skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9])
--skip-layer-start START SLG enabling point: (default: 0.01)
--skip-layer-end END SLG disabling point: (default: 0.2)
SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])
--strength STRENGTH strength for noising/unnoising (default: 0.75)
--style-ratio STYLE-RATIO strength for keeping input identity (default: 20)
--control-strength STRENGTH strength to apply Control Net (default: 0.9)
1.0 corresponds to full destruction of information in init image
-H, --height H image height, in pixel space (default: 512)
-W, --width W image width, in pixel space (default: 512)
--sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, lcm}
--sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
sampling method (default: "euler_a")
--steps STEPS number of sample steps (default: 20)
--rng {std_default, cuda} RNG (default: cuda)
-s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)
-b, --batch-count COUNT number of images to generate.
--schedule {discrete, karras} Denoiser sigma schedule (default: discrete)
--clip-skip N number of layers to skip of clip model (default: 0)
-b, --batch-count COUNT number of images to generate
--schedule {discrete, karras, exponential, ays, gits} Denoiser sigma schedule (default: discrete)
--clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)
<= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
--vae-tiling process vae in tiles to reduce memory usage
--vae-on-cpu keep vae in cpu (for low vram)
--clip-on-cpu keep clip in cpu (for low vram)
--diffusion-fa use flash attention in the diffusion model (for low vram)
Might lower quality, since it implies converting k and v to f16.
This might crash if it is not supported by the backend.
--control-net-cpu keep controlnet in cpu (for low vram)
--canny apply canny preprocessor (edge detection)
--color colors the logging tags according to level
--chroma-disable-dit-mask disable dit mask for chroma
--chroma-enable-t5-mask enable t5 mask for chroma
--chroma-t5-mask-pad PAD_SIZE t5 mask pad size of chroma
-v, --verbose print extra info
```
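As a worked reading of the SLG scheduling above: with the defaults `--steps 20 --skip-layer-start 0.01 --skip-layer-end 0.2`, skip-layer guidance is enabled at step `int(20 * 0.01) = 0` and disabled at step `int(20 * 0.2) = 4`, i.e. layers 7, 8 and 9 are skipped only during the first few steps.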
#### Quantization
You can specify the model weight type using the `--type` parameter. The weights are automatically converted when loading the model.
- `f16` for 16-bit floating-point
- `f32` for 32-bit floating-point
- `q8_0` for 8-bit integer quantization
- `q5_0` or `q5_1` for 5-bit integer quantization
- `q4_0` or `q4_1` for 4-bit integer quantization
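For instance, you can quantize on the fly while generating, or persist a quantized copy with the `convert` mode. The sketch below assumes an SD 1.x checkpoint at `../models/sd-v1-4.ckpt` and an output file name of your choosing, and reuses the `--tensor-type-rules` pattern syntax shown in the options above:

```sh
# quantize all weights to q8_0 at load time, then generate
./bin/sd -m ../models/sd-v1-4.ckpt -p "a lovely cat" --type q8_0

# or write a quantized gguf once, keeping VAE tensors at f16
./bin/sd -M convert -m ../models/sd-v1-4.ckpt -o ../models/sd-v1-4-q8_0.gguf --type q8_0 --tensor-type-rules "^vae\.=f16"
```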
#### txt2img example
```sh
./bin/sd -m ../models/sd-v1-4.ckpt -p "a lovely cat"
# ./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
# ./bin/sd -m ../models/sd_xl_base_1.0.safetensors --vae ../models/sdxl_vae-fp16-fix.safetensors -H 1024 -W 1024 -p "a lovely cat" -v
# ./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable Diffusion CPP\"' --cfg-scale 4.5 --sampling-method euler -v
# ./bin/sd --diffusion-model ../models/flux1-dev-q3_k.gguf --vae ../models/ae.sft --clip_l ../models/clip_l.safetensors --t5xxl ../models/t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v
# ./bin/sd -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v
```
Using formats of different precisions will yield results of varying quality.
@@ -203,93 +373,42 @@

#### img2img example
```
./bin/sd -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
```
<p align="center">
<img src="./assets/img2img_output.png" width="256x">
</p>
#### with LoRA

- You can specify the directory where the lora weights are stored via `--lora-model-dir`. If not specified, the default is the current working directory.
- LoRA is specified via prompt, just like [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora).

Here's a simple example:

```
./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:marblesh:1>" --lora-model-dir ../models
```

`../models/marblesh.safetensors` or `../models/marblesh.ckpt` will be applied to the model.

#### LCM/LCM-LoRA

- Download LCM-LoRA from https://huggingface.co/latent-consistency/lcm-lora-sdv1-5
- Specify LCM-LoRA by adding `<lora:lcm-lora-sdv1-5:1>` to the prompt
- It's advisable to set `--cfg-scale` to `1.0` instead of the default `7.0`. For `--steps`, a range of `2-8` steps is recommended. For `--sampling-method`, `lcm`/`euler_a` is recommended.

Here's a simple example:

```
./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:lcm-lora-sdv1-5:1>" --steps 4 --lora-model-dir ../models -v --cfg-scale 1
```

| without LCM-LoRA (--cfg-scale 7) | with LCM-LoRA (--cfg-scale 1) |
| ---- | ---- |
| ![](./assets/without_lcm.png) | ![](./assets/with_lcm.png) |

## Using TAESD for faster decoding

You can use TAESD to accelerate the decoding of latent images by following these steps:

- Download the model [weights](https://huggingface.co/madebyollin/taesd/blob/main/diffusion_pytorch_model.safetensors), or fetch them with curl:

```bash
curl -L -O https://huggingface.co/madebyollin/taesd/resolve/main/diffusion_pytorch_model.safetensors
```

- Specify the model path using the `--taesd PATH` parameter. Example:

```bash
sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --taesd ../models/diffusion_pytorch_model.safetensors
```

## Memory Requirements

| precision | f32 | f16 | q8_0 | q5_0 | q5_1 | q4_0 | q4_1 |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| **Memory** (txt2img - 512 x 512) | ~2.8G | ~2.3G | ~2.1G | ~2.0G | ~2.0G | ~2.0G | ~2.0G |
| **Memory** (txt2img - 512 x 512) *with Flash Attention* | ~2.4G | ~1.9G | ~1.6G | ~1.5G | ~1.5G | ~1.5G | ~1.5G |

## More Guides

- [LoRA](./docs/lora.md)
- [LCM/LCM-LoRA](./docs/lcm.md)
- [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)
- [Using ESRGAN to upscale results](./docs/esrgan.md)
- [Using TAESD for faster decoding](./docs/taesd.md)
- [Docker](./docs/docker.md)
- [Quantization and GGUF](./docs/quantization_and_gguf.md)

## Bindings

These projects wrap `stable-diffusion.cpp` for easier use in other languages/frameworks.

* Golang (non-cgo): [seasonjs/stable-diffusion](https://github.com/seasonjs/stable-diffusion)
* Golang (cgo): [Binozo/GoStableDiffusion](https://github.com/Binozo/GoStableDiffusion)
* C#: [DarthAffe/StableDiffusion.NET](https://github.com/DarthAffe/StableDiffusion.NET)
* Python: [william-murray1204/stable-diffusion-cpp-python](https://github.com/william-murray1204/stable-diffusion-cpp-python)
* Rust: [newfla/diffusion-rs](https://github.com/newfla/diffusion-rs)
* Flutter/Dart: [rmatif/Local-Diffusion](https://github.com/rmatif/Local-Diffusion)

## UIs

These projects use `stable-diffusion.cpp` as a backend for their image generation.

- [Jellybox](https://jellybox.com)
- [Stable Diffusion GUI](https://github.com/fszontagh/sd.cpp.gui.wx)
- [Stable Diffusion CLI-GUI](https://github.com/piallai/stable-diffusion.cpp)
- [Local Diffusion](https://github.com/rmatif/Local-Diffusion)
## Contributors
@@ -297,12 +416,19 @@

Thank you to all the people who have already contributed to stable-diffusion.cpp!
[![Contributors](https://contrib.rocks/image?repo=leejet/stable-diffusion.cpp)](https://github.com/leejet/stable-diffusion.cpp/graphs/contributors)
## Star History
[![Star History Chart](https://api.star-history.com/svg?repos=leejet/stable-diffusion.cpp&type=Date)](https://star-history.com/#leejet/stable-diffusion.cpp&Date)
## References
- [ggml](https://github.com/ggerganov/ggml)
- [stable-diffusion](https://github.com/CompVis/stable-diffusion)
- [sd3-ref](https://github.com/Stability-AI/sd3-ref)
- [stable-diffusion-stability-ai](https://github.com/Stability-AI/stablediffusion)
- [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui)
- [ComfyUI](https://github.com/comfyanonymous/ComfyUI)
- [k-diffusion](https://github.com/crowsonkb/k-diffusion)
- [latent-consistency-model](https://github.com/luosiallen/latent-consistency-model)
- [generative-models](https://github.com/Stability-AI/generative-models/)
- [PhotoMaker](https://github.com/TencentARC/PhotoMaker)

[Binary image assets added (not shown): assets/control.png, assets/control_2.png, assets/control_3.png, assets/flux/chroma_v40.png, assets/sd3.5_large.png, assets/sycl_sd3_output.png, plus additional sample output images.]

clip.hpp (1192 lines): file diff suppressed because it is too large.

common.hpp

@@ -3,84 +3,521 @@
#include "ggml_extend.hpp"
class DownSampleBlock : public GGMLBlock {
protected:
    int channels;
    int out_channels;
    bool vae_downsample = false;

public:
    DownSampleBlock(int channels,
                    int out_channels,
                    bool vae_downsample = false)
        : channels(channels),
          out_channels(out_channels),
          vae_downsample(vae_downsample) {
        if (vae_downsample) {
            blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {0, 0}));
        } else {
            blocks["op"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {1, 1}));
        }
    }

    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
        // x: [N, channels, h, w]
        if (vae_downsample) {
            auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);

            x = ggml_pad(ctx, x, 1, 1, 0, 0);
            x = conv->forward(ctx, x);
        } else {
            auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["op"]);

            x = conv->forward(ctx, x);
        }
        return x;  // [N, out_channels, h/2, w/2]
    }
};
class UpSampleBlock : public GGMLBlock {
protected:
    int channels;
    int out_channels;

public:
    UpSampleBlock(int channels,
                  int out_channels)
        : channels(channels),
          out_channels(out_channels) {
        blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
    }

    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
        // x: [N, channels, h, w]
        auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);

        x = ggml_upscale(ctx, x, 2, GGML_SCALE_MODE_NEAREST);  // [N, channels, h*2, w*2]
        x = conv->forward(ctx, x);                             // [N, out_channels, h*2, w*2]
        return x;
    }
};
class ResBlock : public GGMLBlock {
protected:
// network hparams
int64_t channels; // model_channels * (1, 1, 1, 2, 2, 4, 4, 4)
int64_t emb_channels; // time_embed_dim
int64_t out_channels; // mult * model_channels
std::pair<int, int> kernel_size;
int dims;
bool skip_t_emb;
bool exchange_temb_dims;
std::shared_ptr<GGMLBlock> conv_nd(int dims,
int64_t in_channels,
int64_t out_channels,
std::pair<int, int> kernel_size,
std::pair<int, int> padding) {
GGML_ASSERT(dims == 2 || dims == 3);
if (dims == 3) {
return std::shared_ptr<GGMLBlock>(new Conv3dnx1x1(in_channels, out_channels, kernel_size.first, 1, padding.first));
} else {
return std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, kernel_size, {1, 1}, padding));
}
}
public:
ResBlock(int64_t channels,
int64_t emb_channels,
int64_t out_channels,
std::pair<int, int> kernel_size = {3, 3},
int dims = 2,
bool exchange_temb_dims = false,
bool skip_t_emb = false)
: channels(channels),
emb_channels(emb_channels),
out_channels(out_channels),
kernel_size(kernel_size),
dims(dims),
skip_t_emb(skip_t_emb),
exchange_temb_dims(exchange_temb_dims) {
std::pair<int, int> padding = {kernel_size.first / 2, kernel_size.second / 2};
blocks["in_layers.0"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(channels));
// in_layer_1 is nn.SILU()
blocks["in_layers.2"] = conv_nd(dims, channels, out_channels, kernel_size, padding);
if (!skip_t_emb) {
// emb_layer_0 is nn.SILU()
blocks["emb_layers.1"] = std::shared_ptr<GGMLBlock>(new Linear(emb_channels, out_channels));
}
blocks["out_layers.0"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(out_channels));
// out_layer_1 is nn.SILU()
// out_layer_2 is nn.Dropout(), skip for inference
blocks["out_layers.3"] = conv_nd(dims, out_channels, out_channels, kernel_size, padding);
if (out_channels != channels) {
blocks["skip_connection"] = conv_nd(dims, channels, out_channels, {1, 1}, {0, 0});
}
}
virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = NULL) {
// For dims==3, we reduce dimension from 5d to 4d by merging h and w, in order not to change ggml
// [N, c, t, h, w] => [N, c, t, h * w]
// x: [N, channels, h, w] if dims == 2 else [N, channels, t, h, w]
// emb: [N, emb_channels] if dims == 2 else [N, t, emb_channels]
auto in_layers_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["in_layers.0"]);
auto in_layers_2 = std::dynamic_pointer_cast<UnaryBlock>(blocks["in_layers.2"]);
auto out_layers_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["out_layers.0"]);
auto out_layers_3 = std::dynamic_pointer_cast<UnaryBlock>(blocks["out_layers.3"]);
if (emb == NULL) {
GGML_ASSERT(skip_t_emb);
}
// in_layers
auto h = in_layers_0->forward(ctx, x);
h = ggml_silu_inplace(ctx, h);
h = in_layers_2->forward(ctx, h); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
// emb_layers
if (!skip_t_emb) {
auto emb_layer_1 = std::dynamic_pointer_cast<Linear>(blocks["emb_layers.1"]);
auto emb_out = ggml_silu(ctx, emb);
emb_out = emb_layer_1->forward(ctx, emb_out); // [N, out_channels] if dims == 2 else [N, t, out_channels]
if (dims == 2) {
emb_out = ggml_reshape_4d(ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]); // [N, out_channels, 1, 1]
} else {
emb_out = ggml_reshape_4d(ctx, emb_out, 1, emb_out->ne[0], emb_out->ne[1], emb_out->ne[2]); // [N, t, out_channels, 1]
if (exchange_temb_dims) {
// emb_out = rearrange(emb_out, "b t c ... -> b c t ...")
emb_out = ggml_cont(ctx, ggml_permute(ctx, emb_out, 0, 2, 1, 3)); // [N, out_channels, t, 1]
}
}
h = ggml_add(ctx, h, emb_out); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
}
// out_layers
h = out_layers_0->forward(ctx, h);
h = ggml_silu_inplace(ctx, h);
// dropout, skip for inference
h = out_layers_3->forward(ctx, h);
// skip connection
if (out_channels != channels) {
auto skip_connection = std::dynamic_pointer_cast<UnaryBlock>(blocks["skip_connection"]);
x = skip_connection->forward(ctx, x); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
}
h = ggml_add(ctx, h, x);
return h; // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
}
};
class GEGLU : public GGMLBlock {
protected:
int64_t dim_in;
int64_t dim_out;
void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, std::string prefix = "") {
enum ggml_type wtype = (tensor_types.find(prefix + "proj.weight") != tensor_types.end()) ? tensor_types[prefix + "proj.weight"] : GGML_TYPE_F32;
enum ggml_type bias_wtype = GGML_TYPE_F32; //(tensor_types.find(prefix + "proj.bias") != tensor_types.end()) ? tensor_types[prefix + "proj.bias"] : GGML_TYPE_F32;
params["proj.weight"] = ggml_new_tensor_2d(ctx, wtype, dim_in, dim_out * 2);
params["proj.bias"] = ggml_new_tensor_1d(ctx, bias_wtype, dim_out * 2);
}
public:
GEGLU(int64_t dim_in, int64_t dim_out)
: dim_in(dim_in), dim_out(dim_out) {}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [ne3, ne2, ne1, dim_in]
// return: [ne3, ne2, ne1, dim_out]
struct ggml_tensor* w = params["proj.weight"];
struct ggml_tensor* b = params["proj.bias"];
auto x_w    = ggml_view_2d(ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], 0);                        // [dim_out, dim_in]
auto x_b    = ggml_view_1d(ctx, b, b->ne[0] / 2, 0);                                            // [dim_out, ]
auto gate_w = ggml_view_2d(ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], w->nb[1] * w->ne[1] / 2);  // [dim_out, dim_in]
auto gate_b = ggml_view_1d(ctx, b, b->ne[0] / 2, b->nb[0] * b->ne[0] / 2);                      // [dim_out, ]
auto x_in = x;
x = ggml_nn_linear(ctx, x_in, x_w, x_b); // [ne3, ne2, ne1, dim_out]
auto gate = ggml_nn_linear(ctx, x_in, gate_w, gate_b); // [ne3, ne2, ne1, dim_out]
gate = ggml_gelu_inplace(ctx, gate);
x = ggml_mul(ctx, x, gate); // [ne3, ne2, ne1, dim_out]
return x;
}
};
class FeedForward : public GGMLBlock {
public:
FeedForward(int64_t dim,
int64_t dim_out,
int64_t mult = 4) {
int64_t inner_dim = dim * mult;
blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GEGLU(dim, inner_dim));
// net_1 is nn.Dropout(), skip for inference
blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out));
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [ne3, ne2, ne1, dim]
// return: [ne3, ne2, ne1, dim_out]
auto net_0 = std::dynamic_pointer_cast<GEGLU>(blocks["net.0"]);
auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);
x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim]
x = net_2->forward(ctx, x); // [ne3, ne2, ne1, dim_out]
return x;
}
};
class CrossAttention : public GGMLBlock {
protected:
int64_t query_dim;
int64_t context_dim;
int64_t n_head;
int64_t d_head;
bool flash_attn;
public:
CrossAttention(int64_t query_dim,
int64_t context_dim,
int64_t n_head,
int64_t d_head,
bool flash_attn = false)
: n_head(n_head),
d_head(d_head),
query_dim(query_dim),
context_dim(context_dim),
flash_attn(flash_attn) {
int64_t inner_dim = d_head * n_head;
blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false));
blocks["to_k"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
blocks["to_v"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, query_dim));
// to_out_1 is nn.Dropout(), skip for inference
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
// x: [N, n_token, query_dim]
// context: [N, n_context, context_dim]
// return: [N, n_token, query_dim]
auto to_q = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
auto to_k = std::dynamic_pointer_cast<Linear>(blocks["to_k"]);
auto to_v = std::dynamic_pointer_cast<Linear>(blocks["to_v"]);
auto to_out_0 = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
int64_t n = x->ne[2];
int64_t n_token = x->ne[1];
int64_t n_context = context->ne[1];
int64_t inner_dim = d_head * n_head;
auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim]
auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]
x = ggml_nn_attention_ext(ctx, q, k, v, n_head, NULL, false, false, flash_attn); // [N, n_token, inner_dim]
x = to_out_0->forward(ctx, x); // [N, n_token, query_dim]
return x;
}
};
class BasicTransformerBlock : public GGMLBlock {
protected:
int64_t n_head;
int64_t d_head;
bool ff_in;
public:
BasicTransformerBlock(int64_t dim,
int64_t n_head,
int64_t d_head,
int64_t context_dim,
bool ff_in = false,
bool flash_attn = false)
: n_head(n_head), d_head(d_head), ff_in(ff_in) {
// disable_self_attn is always False
// disable_temporal_crossattention is always False
// switch_temporal_ca_to_sa is always False
// inner_dim is always None or equal to dim
// gated_ff is always True
blocks["attn1"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, dim, n_head, d_head, flash_attn));
blocks["attn2"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, context_dim, n_head, d_head, flash_attn));
blocks["ff"] = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim));
blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
blocks["norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
blocks["norm3"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
if (ff_in) {
blocks["norm_in"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
blocks["ff_in"] = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim));
}
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
// x: [N, n_token, query_dim]
// context: [N, n_context, context_dim]
// return: [N, n_token, query_dim]
auto attn1 = std::dynamic_pointer_cast<CrossAttention>(blocks["attn1"]);
auto attn2 = std::dynamic_pointer_cast<CrossAttention>(blocks["attn2"]);
auto ff = std::dynamic_pointer_cast<FeedForward>(blocks["ff"]);
auto norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm1"]);
auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
auto norm3 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm3"]);
if (ff_in) {
auto norm_in = std::dynamic_pointer_cast<LayerNorm>(blocks["norm_in"]);
auto ff_in = std::dynamic_pointer_cast<FeedForward>(blocks["ff_in"]);
auto x_skip = x;
x = norm_in->forward(ctx, x);
x = ff_in->forward(ctx, x);
// self.is_res is always True
x = ggml_add(ctx, x, x_skip);
}
auto r = x;
x = norm1->forward(ctx, x);
x = attn1->forward(ctx, x, x); // self-attention
x = ggml_add(ctx, x, r);
r = x;
x = norm2->forward(ctx, x);
x = attn2->forward(ctx, x, context); // cross-attention
x = ggml_add(ctx, x, r);
r = x;
x = norm3->forward(ctx, x);
x = ff->forward(ctx, x);
x = ggml_add(ctx, x, r);
return x;
}
};
class SpatialTransformer : public GGMLBlock {
protected:
int64_t in_channels; // mult * model_channels
int64_t n_head;
int64_t d_head;
int64_t depth = 1; // 1
int64_t context_dim = 768; // hidden_size, 1024 for VERSION_SD2
public:
SpatialTransformer(int64_t in_channels,
int64_t n_head,
int64_t d_head,
int64_t depth,
int64_t context_dim,
bool flash_attn = false)
: in_channels(in_channels),
n_head(n_head),
d_head(d_head),
depth(depth),
context_dim(context_dim) {
// We will convert unet transformer linear to conv2d 1x1 when loading the weights, so use_linear is always False
// disable_self_attn is always False
int64_t inner_dim = n_head * d_head; // in_channels
blocks["norm"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}));
for (int i = 0; i < depth; i++) {
std::string name = "transformer_blocks." + std::to_string(i);
blocks[name] = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false, flash_attn));
}
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}));
}
virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
// x: [N, in_channels, h, w]
// context: [N, max_position(aka n_token), hidden_size(aka context_dim)]
auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
auto proj_in = std::dynamic_pointer_cast<Conv2d>(blocks["proj_in"]);
auto proj_out = std::dynamic_pointer_cast<Conv2d>(blocks["proj_out"]);
auto x_in = x;
int64_t n = x->ne[3];
int64_t h = x->ne[1];
int64_t w = x->ne[0];
int64_t inner_dim = n_head * d_head;
x = norm->forward(ctx, x);
x = proj_in->forward(ctx, x); // [N, inner_dim, h, w]
x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
x = ggml_reshape_3d(ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
for (int i = 0; i < depth; i++) {
std::string name = "transformer_blocks." + std::to_string(i);
auto transformer_block = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[name]);
x = transformer_block->forward(ctx, x, context);
}
x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
x = ggml_reshape_4d(ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
// proj_out
x = proj_out->forward(ctx, x); // [N, in_channels, h, w]
x = ggml_add(ctx, x, x_in);
return x;
}
};
class AlphaBlender : public GGMLBlock {
protected:
void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, std::string prefix = "") {
// Get the type of the "mix_factor" tensor from the input tensors map with the specified prefix
enum ggml_type wtype = GGML_TYPE_F32;  // (tensor_types.find(prefix + "mix_factor") != tensor_types.end()) ? tensor_types[prefix + "mix_factor"] : GGML_TYPE_F32;
params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
}
float get_alpha() {
// image_only_indicator is always tensor([0.]) and since mix_factor.shape is [1,]
// so learned_with_images is same as learned
float alpha = ggml_backend_tensor_get_f32(params["mix_factor"]);
return sigmoid(alpha);
}
public:
AlphaBlender() {
// merge_strategy is always learned_with_images
// for inference, we don't need to set alpha
// since mix_factor.shape is [1,], we don't need rearrange using rearrange_pattern
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x_spatial,
struct ggml_tensor* x_temporal) {
// image_only_indicator is always tensor([0.])
float alpha = get_alpha();
auto x = ggml_add(ctx,
ggml_scale(ctx, x_spatial, alpha),
ggml_scale(ctx, x_temporal, 1.0f - alpha));
return x;
}
};
class VideoResBlock : public ResBlock {
public:
VideoResBlock(int channels,
int emb_channels,
int out_channels,
std::pair<int, int> kernel_size = {3, 3},
int64_t video_kernel_size = 3,
int dims = 2) // always 2
: ResBlock(channels, emb_channels, out_channels, kernel_size, dims) {
blocks["time_stack"] = std::shared_ptr<GGMLBlock>(new ResBlock(out_channels, emb_channels, out_channels, kernel_size, 3, true));
blocks["time_mixer"] = std::shared_ptr<GGMLBlock>(new AlphaBlender());
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* emb,
int num_video_frames) {
// x: [N, channels, h, w] aka [b*t, channels, h, w]
// emb: [N, emb_channels] aka [b*t, emb_channels]
// image_only_indicator is always tensor([0.])
auto time_stack = std::dynamic_pointer_cast<ResBlock>(blocks["time_stack"]);
auto time_mixer = std::dynamic_pointer_cast<AlphaBlender>(blocks["time_mixer"]);
x = ResBlock::forward(ctx, x, emb);
int64_t T = num_video_frames;
int64_t B = x->ne[3] / T;
int64_t C = x->ne[2];
int64_t H = x->ne[1];
int64_t W = x->ne[0];
x = ggml_reshape_4d(ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w)
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w)
auto x_mix = x;
emb = ggml_reshape_4d(ctx, emb, emb->ne[0], T, B, emb->ne[3]); // (b t) ... -> b t ...
x = time_stack->forward(ctx, x, emb); // b t c (h w)
x = time_mixer->forward(ctx, x_mix, x); // b t c (h w)
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w)
x = ggml_reshape_4d(ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w
return x;
}
};
#endif // __COMMON_HPP__

conditioner.hpp (new file, 1428 lines): file diff suppressed because it is too large.

control.hpp (new file, 458 lines)

@@ -0,0 +1,458 @@
#ifndef __CONTROL_HPP__
#define __CONTROL_HPP__
#include "common.hpp"
#include "ggml_extend.hpp"
#include "model.h"
#define CONTROL_NET_GRAPH_SIZE 1536
/*
=================================== ControlNet ===================================
Reference: https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/cldm/cldm.py
*/
class ControlNetBlock : public GGMLBlock {
protected:
SDVersion version = VERSION_SD1;
// network hparams
int in_channels = 4;
int out_channels = 4;
int hint_channels = 3;
int num_res_blocks = 2;
std::vector<int> attention_resolutions = {4, 2, 1};
std::vector<int> channel_mult = {1, 2, 4, 4};
std::vector<int> transformer_depth = {1, 1, 1, 1};
int time_embed_dim = 1280; // model_channels*4
int num_heads = 8;
int num_head_channels = -1; // channels // num_heads
int context_dim = 768; // 1024 for VERSION_SD2, 2048 for VERSION_SDXL
public:
int model_channels = 320;
int adm_in_channels = 2816; // only for VERSION_SDXL
ControlNetBlock(SDVersion version = VERSION_SD1)
: version(version) {
if (sd_version_is_sd2(version)) {
context_dim = 1024;
num_head_channels = 64;
num_heads = -1;
} else if (sd_version_is_sdxl(version)) {
context_dim = 2048;
attention_resolutions = {4, 2};
channel_mult = {1, 2, 4};
transformer_depth = {1, 2, 10};
num_head_channels = 64;
num_heads = -1;
} else if (version == VERSION_SVD) {
in_channels = 8;
out_channels = 4;
context_dim = 1024;
adm_in_channels = 768;
num_head_channels = 64;
num_heads = -1;
}
blocks["time_embed.0"] = std::shared_ptr<GGMLBlock>(new Linear(model_channels, time_embed_dim));
// time_embed_1 is nn.SiLU()
blocks["time_embed.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
if (sd_version_is_sdxl(version) || version == VERSION_SVD) {
blocks["label_emb.0.0"] = std::shared_ptr<GGMLBlock>(new Linear(adm_in_channels, time_embed_dim));
// label_emb_1 is nn.SiLU()
blocks["label_emb.0.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
}
// input_blocks
blocks["input_blocks.0.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1}));
std::vector<int> input_block_chans;
input_block_chans.push_back(model_channels);
int ch = model_channels;
int input_block_idx = 0;
int ds = 1;
auto get_resblock = [&](int64_t channels, int64_t emb_channels, int64_t out_channels) -> ResBlock* {
return new ResBlock(channels, emb_channels, out_channels);
};
auto get_attention_layer = [&](int64_t in_channels,
int64_t n_head,
int64_t d_head,
int64_t depth,
int64_t context_dim) -> SpatialTransformer* {
return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim);
};
auto make_zero_conv = [&](int64_t channels) {
return new Conv2d(channels, channels, {1, 1});
};
blocks["zero_convs.0.0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(model_channels));
blocks["input_hint_block.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(hint_channels, 16, {3, 3}, {1, 1}, {1, 1}));
// nn.SiLU()
blocks["input_hint_block.2"] = std::shared_ptr<GGMLBlock>(new Conv2d(16, 16, {3, 3}, {1, 1}, {1, 1}));
// nn.SiLU()
blocks["input_hint_block.4"] = std::shared_ptr<GGMLBlock>(new Conv2d(16, 32, {3, 3}, {2, 2}, {1, 1}));
// nn.SiLU()
blocks["input_hint_block.6"] = std::shared_ptr<GGMLBlock>(new Conv2d(32, 32, {3, 3}, {1, 1}, {1, 1}));
// nn.SiLU()
blocks["input_hint_block.8"] = std::shared_ptr<GGMLBlock>(new Conv2d(32, 96, {3, 3}, {2, 2}, {1, 1}));
// nn.SiLU()
blocks["input_hint_block.10"] = std::shared_ptr<GGMLBlock>(new Conv2d(96, 96, {3, 3}, {1, 1}, {1, 1}));
// nn.SiLU()
blocks["input_hint_block.12"] = std::shared_ptr<GGMLBlock>(new Conv2d(96, 256, {3, 3}, {2, 2}, {1, 1}));
// nn.SiLU()
blocks["input_hint_block.14"] = std::shared_ptr<GGMLBlock>(new Conv2d(256, model_channels, {3, 3}, {1, 1}, {1, 1}));
size_t len_mults = channel_mult.size();
for (int i = 0; i < len_mults; i++) {
int mult = channel_mult[i];
for (int j = 0; j < num_res_blocks; j++) {
input_block_idx += 1;
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
blocks[name] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, mult * model_channels));
ch = mult * model_channels;
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
int n_head = num_heads;
int d_head = ch / num_heads;
if (num_head_channels != -1) {
d_head = num_head_channels;
n_head = ch / d_head;
}
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
blocks[name] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
n_head,
d_head,
transformer_depth[i],
context_dim));
}
blocks["zero_convs." + std::to_string(input_block_idx) + ".0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(ch));
input_block_chans.push_back(ch);
}
if (i != len_mults - 1) {
input_block_idx += 1;
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
blocks[name] = std::shared_ptr<GGMLBlock>(new DownSampleBlock(ch, ch));
blocks["zero_convs." + std::to_string(input_block_idx) + ".0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(ch));
input_block_chans.push_back(ch);
ds *= 2;
}
}
// middle blocks
int n_head = num_heads;
int d_head = ch / num_heads;
if (num_head_channels != -1) {
d_head = num_head_channels;
n_head = ch / d_head;
}
blocks["middle_block.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
blocks["middle_block.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
n_head,
d_head,
transformer_depth[transformer_depth.size() - 1],
context_dim));
blocks["middle_block.2"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
// middle_block_out
blocks["middle_block_out.0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(ch));
}
struct ggml_tensor* resblock_forward(std::string name,
struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* emb) {
auto block = std::dynamic_pointer_cast<ResBlock>(blocks[name]);
return block->forward(ctx, x, emb);
}
struct ggml_tensor* attention_layer_forward(std::string name,
struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* context) {
auto block = std::dynamic_pointer_cast<SpatialTransformer>(blocks[name]);
return block->forward(ctx, x, context);
}
struct ggml_tensor* input_hint_block_forward(struct ggml_context* ctx,
struct ggml_tensor* hint,
struct ggml_tensor* emb,
struct ggml_tensor* context) {
int num_input_blocks = 15;
auto h = hint;
for (int i = 0; i < num_input_blocks; i++) {
if (i % 2 == 0) {
auto block = std::dynamic_pointer_cast<Conv2d>(blocks["input_hint_block." + std::to_string(i)]);
h = block->forward(ctx, h);
} else {
h = ggml_silu_inplace(ctx, h);
}
}
return h;
}
std::vector<struct ggml_tensor*> forward(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* hint,
struct ggml_tensor* guided_hint,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* y = NULL) {
// x: [N, in_channels, h, w] or [N, in_channels/2, h, w]
// timesteps: [N,]
// context: [N, max_position, hidden_size] or [1, max_position, hidden_size]. for example, [N, 77, 768]
// y: [N, adm_in_channels] or [1, adm_in_channels]
if (context != NULL) {
if (context->ne[2] != x->ne[3]) {
context = ggml_repeat(ctx, context, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
}
}
if (y != NULL) {
if (y->ne[1] != x->ne[3]) {
y = ggml_repeat(ctx, y, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, y->ne[0], x->ne[3]));
}
}
auto time_embed_0 = std::dynamic_pointer_cast<Linear>(blocks["time_embed.0"]);
auto time_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["time_embed.2"]);
auto input_blocks_0_0 = std::dynamic_pointer_cast<Conv2d>(blocks["input_blocks.0.0"]);
auto zero_convs_0 = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs.0.0"]);
auto middle_block_out = std::dynamic_pointer_cast<Conv2d>(blocks["middle_block_out.0"]);
auto t_emb = ggml_nn_timestep_embedding(ctx, timesteps, model_channels); // [N, model_channels]
auto emb = time_embed_0->forward(ctx, t_emb);
emb = ggml_silu_inplace(ctx, emb);
emb = time_embed_2->forward(ctx, emb); // [N, time_embed_dim]
// SDXL/SVD
if (y != NULL) {
auto label_embed_0 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.0"]);
auto label_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.2"]);
auto label_emb = label_embed_0->forward(ctx, y);
label_emb = ggml_silu_inplace(ctx, label_emb);
label_emb = label_embed_2->forward(ctx, label_emb); // [N, time_embed_dim]
emb = ggml_add(ctx, emb, label_emb); // [N, time_embed_dim]
}
std::vector<struct ggml_tensor*> outs;
if (guided_hint == NULL) {
guided_hint = input_hint_block_forward(ctx, hint, emb, context);
}
outs.push_back(guided_hint);
// input_blocks
// input block 0
auto h = input_blocks_0_0->forward(ctx, x);
h = ggml_add(ctx, h, guided_hint);
outs.push_back(zero_convs_0->forward(ctx, h));
// input block 1-11
size_t len_mults = channel_mult.size();
int input_block_idx = 0;
int ds = 1;
for (int i = 0; i < len_mults; i++) {
int mult = channel_mult[i];
for (int j = 0; j < num_res_blocks; j++) {
input_block_idx += 1;
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
h = resblock_forward(name, ctx, h, emb); // [N, mult*model_channels, h, w]
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
h = attention_layer_forward(name, ctx, h, context); // [N, mult*model_channels, h, w]
}
auto zero_conv = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs." + std::to_string(input_block_idx) + ".0"]);
outs.push_back(zero_conv->forward(ctx, h));
}
if (i != len_mults - 1) {
ds *= 2;
input_block_idx += 1;
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
auto block = std::dynamic_pointer_cast<DownSampleBlock>(blocks[name]);
h = block->forward(ctx, h); // [N, mult*model_channels, h/(2^(i+1)), w/(2^(i+1))]
auto zero_conv = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs." + std::to_string(input_block_idx) + ".0"]);
outs.push_back(zero_conv->forward(ctx, h));
}
}
// [N, 4*model_channels, h/8, w/8]
// middle_block
h = resblock_forward("middle_block.0", ctx, h, emb); // [N, 4*model_channels, h/8, w/8]
h = attention_layer_forward("middle_block.1", ctx, h, context); // [N, 4*model_channels, h/8, w/8]
h = resblock_forward("middle_block.2", ctx, h, emb); // [N, 4*model_channels, h/8, w/8]
// out
outs.push_back(middle_block_out->forward(ctx, h));
return outs;
}
};
struct ControlNet : public GGMLRunner {
SDVersion version = VERSION_SD1;
ControlNetBlock control_net;
ggml_backend_buffer_t control_buffer = NULL; // keep control output tensors in backend memory
ggml_context* control_ctx = NULL;
std::vector<struct ggml_tensor*> controls; // (12 input block outputs, 1 middle block output) SD 1.5
struct ggml_tensor* guided_hint = NULL; // guided_hint cache, for faster inference
bool guided_hint_cached = false;
ControlNet(ggml_backend_t backend,
std::map<std::string, enum ggml_type>& tensor_types,
SDVersion version = VERSION_SD1)
: GGMLRunner(backend), control_net(version) {
control_net.init(params_ctx, tensor_types, "");
}
~ControlNet() {
free_control_ctx();
}
void alloc_control_ctx(std::vector<struct ggml_tensor*> outs) {
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(outs.size() * ggml_tensor_overhead()) + 1024 * 1024;
params.mem_buffer = NULL;
params.no_alloc = true;
control_ctx = ggml_init(params);
controls.resize(outs.size() - 1);
size_t control_buffer_size = 0;
guided_hint = ggml_dup_tensor(control_ctx, outs[0]);
control_buffer_size += ggml_nbytes(guided_hint);
for (int i = 0; i < outs.size() - 1; i++) {
controls[i] = ggml_dup_tensor(control_ctx, outs[i + 1]);
control_buffer_size += ggml_nbytes(controls[i]);
}
control_buffer = ggml_backend_alloc_ctx_tensors(control_ctx, backend);
LOG_DEBUG("control buffer size %.2fMB", control_buffer_size * 1.f / 1024.f / 1024.f);
}
void free_control_ctx() {
if (control_buffer != NULL) {
ggml_backend_buffer_free(control_buffer);
control_buffer = NULL;
}
if (control_ctx != NULL) {
ggml_free(control_ctx);
control_ctx = NULL;
}
guided_hint = NULL;
guided_hint_cached = false;
controls.clear();
}
std::string get_desc() {
return "control_net";
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
control_net.get_param_tensors(tensors, prefix);
}
struct ggml_cgraph* build_graph(struct ggml_tensor* x,
struct ggml_tensor* hint,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* y = NULL) {
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, CONTROL_NET_GRAPH_SIZE, false);
x = to_backend(x);
if (guided_hint_cached) {
hint = NULL;
} else {
hint = to_backend(hint);
}
context = to_backend(context);
y = to_backend(y);
timesteps = to_backend(timesteps);
auto outs = control_net.forward(compute_ctx,
x,
hint,
guided_hint_cached ? guided_hint : NULL,
timesteps,
context,
y);
if (control_ctx == NULL) {
alloc_control_ctx(outs);
}
ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[0], guided_hint));
for (int i = 0; i < outs.size() - 1; i++) {
ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[i + 1], controls[i]));
}
return gf;
}
void compute(int n_threads,
struct ggml_tensor* x,
struct ggml_tensor* hint,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* y,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL) {
// x: [N, in_channels, h, w]
// timesteps: [N, ]
// context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size]
// y: [N, adm_in_channels] or [1, adm_in_channels]
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(x, hint, timesteps, context, y);
};
GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
guided_hint_cached = true;
}
bool load_from_file(const std::string& file_path) {
LOG_INFO("loading control net from '%s'", file_path.c_str());
alloc_params_buffer();
std::map<std::string, ggml_tensor*> tensors;
control_net.get_param_tensors(tensors);
std::set<std::string> ignore_tensors;
ModelLoader model_loader;
if (!model_loader.init_from_file(file_path)) {
LOG_ERROR("init control net model loader from file failed: '%s'", file_path.c_str());
return false;
}
bool success = model_loader.load_tensors(tensors, backend, ignore_tensors);
if (!success) {
LOG_ERROR("load control net tensors from model loader failed");
return false;
}
LOG_INFO("control net model loaded");
return success;
}
};
#endif // __CONTROL_HPP__

(another file's diff suppressed because it is too large)

diffusion_model.hpp (new file, 187 lines)

@@ -0,0 +1,187 @@
#ifndef __DIFFUSION_MODEL_H__
#define __DIFFUSION_MODEL_H__
#include "flux.hpp"
#include "mmdit.hpp"
#include "unet.hpp"
struct DiffusionModel {
virtual void compute(int n_threads,
struct ggml_tensor* x,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* c_concat,
struct ggml_tensor* y,
struct ggml_tensor* guidance,
std::vector<ggml_tensor*> ref_latents = {},
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL,
std::vector<int> skip_layers = std::vector<int>()) = 0;
virtual void alloc_params_buffer() = 0;
virtual void free_params_buffer() = 0;
virtual void free_compute_buffer() = 0;
virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0;
virtual size_t get_params_buffer_size() = 0;
virtual int64_t get_adm_in_channels() = 0;
};
struct UNetModel : public DiffusionModel {
UNetModelRunner unet;
UNetModel(ggml_backend_t backend,
std::map<std::string, enum ggml_type>& tensor_types,
SDVersion version = VERSION_SD1,
bool flash_attn = false)
: unet(backend, tensor_types, "model.diffusion_model", version, flash_attn) {
}
void alloc_params_buffer() {
unet.alloc_params_buffer();
}
void free_params_buffer() {
unet.free_params_buffer();
}
void free_compute_buffer() {
unet.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
unet.get_param_tensors(tensors, "model.diffusion_model");
}
size_t get_params_buffer_size() {
return unet.get_params_buffer_size();
}
int64_t get_adm_in_channels() {
return unet.unet.adm_in_channels;
}
void compute(int n_threads,
struct ggml_tensor* x,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* c_concat,
struct ggml_tensor* y,
struct ggml_tensor* guidance,
std::vector<ggml_tensor*> ref_latents = {},
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL,
std::vector<int> skip_layers = std::vector<int>()) {
(void)skip_layers; // SLG doesn't work with UNet models
return unet.compute(n_threads, x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength, output, output_ctx);
}
};
struct MMDiTModel : public DiffusionModel {
MMDiTRunner mmdit;
MMDiTModel(ggml_backend_t backend,
std::map<std::string, enum ggml_type>& tensor_types)
: mmdit(backend, tensor_types, "model.diffusion_model") {
}
void alloc_params_buffer() {
mmdit.alloc_params_buffer();
}
void free_params_buffer() {
mmdit.free_params_buffer();
}
void free_compute_buffer() {
mmdit.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
mmdit.get_param_tensors(tensors, "model.diffusion_model");
}
size_t get_params_buffer_size() {
return mmdit.get_params_buffer_size();
}
int64_t get_adm_in_channels() {
return 768 + 1280;
}
void compute(int n_threads,
struct ggml_tensor* x,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* c_concat,
struct ggml_tensor* y,
struct ggml_tensor* guidance,
std::vector<ggml_tensor*> ref_latents = {},
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL,
std::vector<int> skip_layers = std::vector<int>()) {
return mmdit.compute(n_threads, x, timesteps, context, y, output, output_ctx, skip_layers);
}
};
struct FluxModel : public DiffusionModel {
Flux::FluxRunner flux;
FluxModel(ggml_backend_t backend,
std::map<std::string, enum ggml_type>& tensor_types,
SDVersion version = VERSION_FLUX,
bool flash_attn = false,
bool use_mask = false)
: flux(backend, tensor_types, "model.diffusion_model", version, flash_attn, use_mask) {
}
void alloc_params_buffer() {
flux.alloc_params_buffer();
}
void free_params_buffer() {
flux.free_params_buffer();
}
void free_compute_buffer() {
flux.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
flux.get_param_tensors(tensors, "model.diffusion_model");
}
size_t get_params_buffer_size() {
return flux.get_params_buffer_size();
}
int64_t get_adm_in_channels() {
return 768;
}
void compute(int n_threads,
struct ggml_tensor* x,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* c_concat,
struct ggml_tensor* y,
struct ggml_tensor* guidance,
std::vector<ggml_tensor*> ref_latents = {},
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL,
std::vector<int> skip_layers = std::vector<int>()) {
return flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance, ref_latents, output, output_ctx, skip_layers);
}
};
#endif

docs/chroma.md (new file, 33 lines)

@@ -0,0 +1,33 @@
# How to Use
You can run Chroma using stable-diffusion.cpp with a GPU that has 6GB or even 4GB of VRAM, without needing to offload to RAM.
## Download weights
- Download Chroma
- If you don't want to do the conversion yourself, download the preconverted gguf model from [silveroxides/Chroma-GGUF](https://huggingface.co/silveroxides/Chroma-GGUF)
- Otherwise, download chroma's safetensors from [lodestones/Chroma](https://huggingface.co/lodestones/Chroma)
- Download vae from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
- Download t5xxl from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
## Convert Chroma weights
If you downloaded the preconverted gguf weights from [silveroxides/Chroma-GGUF](https://huggingface.co/silveroxides/Chroma-GGUF), you can skip this step. Otherwise, convert the safetensors yourself:
```
.\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\chroma-unlocked-v40.safetensors -o ..\models\chroma-unlocked-v40-q8_0.gguf -v --type q8_0
```
## Run
### Example
For example:
```
.\bin\Release\sd.exe --diffusion-model ..\models\chroma-unlocked-v40-q8_0.gguf --vae ..\models\ae.sft --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'chroma.cpp'" --cfg-scale 4.0 --sampling-method euler -v --chroma-disable-dit-mask
```
![](../assets/flux/chroma_v40.png)

docs/docker.md (new file, 15 lines)

@@ -0,0 +1,15 @@
## Docker
### Building using Docker
```shell
docker build -t sd .
```
### Run
```shell
docker run -v /path/to/models:/models -v /path/to/output/:/output sd [args...]
# For example
# docker run -v ./models:/models -v ./build:/output sd -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
```

docs/esrgan.md (new file, 9 lines)

@@ -0,0 +1,9 @@
## Using ESRGAN to upscale results
You can use ESRGAN to upscale the generated images. At the moment, only the [RealESRGAN_x4plus_anime_6B.pth](https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth) model is supported. Support for more models of this architecture will be added soon.
- Specify the model path using the `--upscale-model PATH` parameter. example:
```bash
sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --upscale-model ../models/RealESRGAN_x4plus_anime_6B.pth
```

docs/flux.md (new file, 66 lines)

@@ -0,0 +1,66 @@
# How to Use
You can run Flux using stable-diffusion.cpp with a GPU that has 6GB or even 4GB of VRAM, without needing to offload to RAM.
## Download weights
- Download flux
- If you don't want to do the conversion yourself, download the preconverted gguf model from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell](https://huggingface.co/leejet/FLUX.1-schnell-gguf)
- Otherwise, download flux-dev from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/flux1-dev.safetensors or flux-schnell from https://huggingface.co/black-forest-labs/FLUX.1-schnell/blob/main/flux1-schnell.safetensors
- Download vae from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
- Download clip_l from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/clip_l.safetensors
- Download t5xxl from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
## Convert flux weights
If you downloaded the preconverted gguf weights from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell](https://huggingface.co/leejet/FLUX.1-schnell-gguf), you can skip this step.
Using fp16 can lead to overflow, and ggml's support for bf16 is not yet fully developed, so we convert flux to a quantized gguf format here, which also saves VRAM. For example:
```
.\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-dev.sft -o ..\models\flux1-dev-q8_0.gguf -v --type q8_0
```
## Run
- `--cfg-scale` is recommended to be set to 1.
### Flux-dev
For example:
```
.\bin\Release\sd.exe --diffusion-model ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v
```
Using formats of different precisions will yield results of varying quality.
| Type | q8_0 | q4_0 | q4_k | q3_k | q2_k |
|---- | ---- |---- |---- |---- |---- |
| **Memory** | 12068.09 MB | 6394.53 MB | 6395.17 MB | 4888.16 MB | 3735.73 MB |
| **Result** | ![](../assets/flux/flux1-dev-q8_0.png) |![](../assets/flux/flux1-dev-q4_0.png) |![](../assets/flux/flux1-dev-q4_k.png) |![](../assets/flux/flux1-dev-q3_k.png) |![](../assets/flux/flux1-dev-q2_k.png)|
### Flux-schnell
```
.\bin\Release\sd.exe --diffusion-model ..\models\flux1-schnell-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --steps 4
```
| q8_0 |
| ---- |
|![](../assets/flux/flux1-schnell-q8_0.png) |
## Run with LoRA
Since many flux LoRA training libraries have used various LoRA naming formats, it is possible that not all flux LoRA naming formats are supported. It is recommended to use LoRA with naming formats compatible with ComfyUI.
### Flux-dev q8_0 with LoRA
- LoRA model from https://huggingface.co/XLabs-AI/flux-lora-collection/tree/main (using comfy converted version!!!)
```
.\bin\Release\sd.exe --diffusion-model ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'<lora:realism_lora_comfy_converted:1>" --cfg-scale 1.0 --sampling-method euler -v --lora-model-dir ..\models
```
![output](../assets/flux/flux1-dev-q8_0%20with%20lora.png)

(new file, 85 lines; file name not shown)

@@ -0,0 +1,85 @@
# Using hipBLAS on Windows
To get hipBLAS in `stable-diffusion.cpp` working on Windows, go through this guide section by section.
## Build Tools for Visual Studio 2022
Skip this step if you already have Build Tools installed.
To install Build Tools, go to [Visual Studio Downloads](https://visualstudio.microsoft.com/vs/), download `Visual Studio 2022 and other Products` and run the installer.
## CMake
Skip this step if you already have CMake installed: running `cmake --version` should output `cmake version x.y.z`.
Download latest `Windows x64 Installer` from [Download | CMake](https://cmake.org/download/) and run it.
## ROCm
Skip this step if you already have ROCm installed.

The [validation tools](https://rocm.docs.amd.com/en/latest/reference/validation_tools.html) are not supported on Windows, so you will have to confirm your ROCm version yourself. Fortunately, AMD provides complete documentation that you can follow to install [ROCm](https://rocm.docs.amd.com/en/latest/deploy/windows/quick_start.html).

>**If you run into [AMD ROCm Windows Installation Error 215](https://github.com/RadeonOpenCompute/ROCm/issues/2363), don't worry: ROCm itself installed correctly and only the Visual Studio plugin failed, which we can ignore.**

Then we must set the ROCm toolchain as environment variables before running cmake. If you installed according to the official tutorial and did not modify the ROCm path, it will most likely be `C:\Program Files\AMD\ROCm\5.5\bin`.

This is what I use to set the clang compilers:
```Commandline
set CC=C:\Program Files\AMD\ROCm\5.5\bin\clang.exe
set CXX=C:\Program Files\AMD\ROCm\5.5\bin\clang++.exe
```
## Ninja
Skip this step if you already have Ninja installed: running `ninja --version` should output a version number such as `1.11.1`.
Download the latest `ninja-win.zip` from the [GitHub Releases Page](https://github.com/ninja-build/ninja/releases/tag/v1.11.1) and unzip it, then set it as an environment variable. I unzipped it in `C:\Program Files\ninja`, so I set it like this:
```Commandline
set ninja=C:\Program Files\ninja\ninja.exe
```
## Building stable-diffusion.cpp
What differs from the regular CPU build are the flags `-DSD_HIPBLAS=ON`, `-G "Ninja"`, `-DCMAKE_C_COMPILER=clang`, `-DCMAKE_CXX_COMPILER=clang++`, and `-DAMDGPU_TARGETS=gfx1100`.
>**Notice**: check the `clang` and `clang++` information:
```Commandline
clang --version
clang++ --version
```
Both commands should report something like the following; if so, you can continue:
```
clang version 17.0.0 (git@github.amd.com:Compute-Mirrors/llvm-project e3201662d21c48894f2156d302276eb1cf47c7be)
Target: x86_64-pc-windows-msvc
Thread model: posix
InstalledDir: C:\Program Files\AMD\ROCm\5.5\bin
```
>**Notice**: `gfx1100` is the architecture of my GPU; change it to match your GPU. See [LLVM Target](https://rocm.docs.amd.com/en/latest/release/windows_support.html#windows-supported-gpus) to find your architecture.
My GPU is an AMD Radeon™ RX 7900 XTX, so I set it to `gfx1100`.
Then build:
```commandline
mkdir build
cd build
cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=gfx1100
cmake --build . --config Release
```
If everything went OK, the `build\bin\sd.exe` file should appear.
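As a quick smoke test (a minimal sketch; the model path below is an assumption, so point `-m` at whatever SD model you have), you can try:
```Commandline
.\build\bin\sd.exe -m ..\models\v1-5-pruned-emaonly.safetensors -p "a lovely cat" -v
```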

docs/kontext.md Normal file
@ -0,0 +1,39 @@
# How to Use
You can run Kontext using stable-diffusion.cpp with a GPU that has 6GB or even 4GB of VRAM, without needing to offload to RAM.
## Download weights
- Download Kontext
- If you don't want to do the conversion yourself, download the preconverted gguf model from [FLUX.1-Kontext-dev-GGUF](https://huggingface.co/QuantStack/FLUX.1-Kontext-dev-GGUF)
- Otherwise, download FLUX.1-Kontext-dev from https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev/blob/main/flux1-kontext-dev.safetensors
- Download vae from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
- Download clip_l from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/clip_l.safetensors
- Download t5xxl from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
## Convert Kontext weights
If you downloaded the preconverted gguf weights from [FLUX.1-Kontext-dev-GGUF](https://huggingface.co/QuantStack/FLUX.1-Kontext-dev-GGUF), you can skip this step. Otherwise, convert the weights yourself:
```
.\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-kontext-dev.safetensors -o ..\models\flux1-kontext-dev-q8_0.gguf -v --type q8_0
```
## Run
- Setting `--cfg-scale` to 1 is recommended.
### Example
```
.\bin\Release\sd.exe -r .\flux1-dev-q8_0.png --diffusion-model ..\models\flux1-kontext-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "change 'flux.cpp' to 'kontext.cpp'" --cfg-scale 1.0 --sampling-method euler -v
```
| ref_image | prompt | output |
| ---- | ---- |---- |
| ![](../assets/flux/flux1-dev-q8_0.png) | change 'flux.cpp' to 'kontext.cpp' |![](../assets/flux/kontext1_dev_output.png) |

docs/lcm.md Normal file
@ -0,0 +1,15 @@
## LCM/LCM-LoRA
- Download LCM-LoRA from https://huggingface.co/latent-consistency/lcm-lora-sdv1-5
- Specify LCM-LoRA by adding `<lora:lcm-lora-sdv1-5:1>` to prompt
- It's advisable to set `--cfg-scale` to `1.0` instead of the default `7.0`. For `--steps`, a range of `2-8` steps is recommended. For `--sampling-method`, `lcm`/`euler_a` is recommended.
Here's a simple example:
```
./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:lcm-lora-sdv1-5:1>" --steps 4 --lora-model-dir ../models -v --cfg-scale 1
```
| without LCM-LoRA (--cfg-scale 7) | with LCM-LoRA (--cfg-scale 1) |
| ---- |---- |
| ![](../assets/without_lcm.png) |![](../assets/with_lcm.png) |

docs/lora.md Normal file
@ -0,0 +1,13 @@
## LoRA
- You can specify the directory where the lora weights are stored via `--lora-model-dir`. If not specified, the default is the current working directory.
- LoRA is specified via prompt, just like [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora).
Here's a simple example:
```
./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:marblesh:1>" --lora-model-dir ../models
```
`../models/marblesh.safetensors` or `../models/marblesh.ckpt` will be applied to the model.
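The number after the second colon is the LoRA strength. For example, a strength of `0.5` (an illustrative value) applies the same LoRA at half weight:
```
./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:marblesh:0.5>" --lora-model-dir ../models
```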

docs/photo_maker.md Normal file
@ -0,0 +1,54 @@
## Using PhotoMaker to personalize image generation
You can use [PhotoMaker](https://github.com/TencentARC/PhotoMaker) to personalize generated images with your own ID.
**NOTE**, currently PhotoMaker **ONLY** works with **SDXL** (any SDXL model files will work).
Download the PhotoMaker model file (in safetensors format) [here](https://huggingface.co/bssrdf/PhotoMaker). The official release of the model file (in .bin format) does not work with ```stablediffusion.cpp```.
- Specify the PhotoMaker model path using the `--stacked-id-embd-dir PATH` parameter.
- Specify the input images path using the `--input-id-images-dir PATH` parameter.
- input images **must** have the same width and height for preprocessing (to be improved)
In the prompt, make sure you have a class word followed by the trigger word ```"img"``` (hard-coded for now). The class word could be one of ```"man, woman, girl, boy"```. If the input ID images contain Asian faces, add ```Asian``` before the class word.
Another PhotoMaker-specific parameter:
- ```--style-ratio (0-100)%```: default is 20; 10-20 typically gives good results. A lower ratio follows the input ID more faithfully (though not necessarily with better quality).
Other parameters recommended for running Photomaker:
- ```--cfg-scale 5.0```
- ```-H 1024```
- ```-W 1024```
On low-memory GPUs (<= 8GB), running with the ```--vae-on-cpu``` option is recommended to get artifact-free images.
Example:
```bash
bin/sd -m ../models/sdxlUnstableDiffusers_v11.safetensors --vae ../models/sdxl_vae.safetensors --stacked-id-embd-dir ../models/photomaker-v1.safetensors --input-id-images-dir ../assets/photomaker_examples/scarletthead_woman -p "a girl img, retro futurism, retro game art style but extremely beautiful, intricate details, masterpiece, best quality, space-themed, cosmic, celestial, stars, galaxies, nebulas, planets, science fiction, highly detailed" -n "realistic, photo-realistic, worst quality, greyscale, bad anatomy, bad hands, error, text" --cfg-scale 5.0 --sampling-method euler -H 1024 -W 1024 --style-ratio 10 --vae-on-cpu -o output.png
```
## PhotoMaker Version 2
[PhotoMaker Version 2 (PMV2)](https://github.com/TencentARC/PhotoMaker/blob/main/README_pmv2.md) has some key improvements. Unfortunately it has a very heavy dependency which makes running it a bit involved in ```SD.cpp```.
Running PMV2 is now a two-step process:
- Run the Python script ```face_detect.py``` to obtain the **id_embeds** for the given input images:
```
python face_detect.py input_image_dir
```
An ```id_embeds.bin``` file will be generated in ```input_image_dir``` (the script below writes this binary format; its safetensors output is commented out).
**Note: this step only needs to be run once; the same ```id_embeds``` can be reused.**
- Run the same command as in version 1, but replace ```photomaker-v1.safetensors``` with ```photomaker-v2.safetensors```, as shown in the example after this list.
You can download ```photomaker-v2.safetensors``` from [here](https://huggingface.co/bssrdf/PhotoMakerV2)
- All the command line parameters from Version 1 remain the same for Version 2
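Putting the two steps together, the version 2 invocation mirrors the version 1 example above, with only the model file swapped:
```bash
bin/sd -m ../models/sdxlUnstableDiffusers_v11.safetensors --vae ../models/sdxl_vae.safetensors --stacked-id-embd-dir ../models/photomaker-v2.safetensors --input-id-images-dir ../assets/photomaker_examples/scarletthead_woman -p "a girl img, retro futurism, retro game art style but extremely beautiful, intricate details, masterpiece, best quality, space-themed, cosmic, celestial, stars, galaxies, nebulas, planets, science fiction, highly detailed" -n "realistic, photo-realistic, worst quality, greyscale, bad anatomy, bad hands, error, text" --cfg-scale 5.0 --sampling-method euler -H 1024 -W 1024 --style-ratio 10 --vae-on-cpu -o output.png
```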

@ -0,0 +1,27 @@
## Quantization
You can specify the model weight type using the `--type` parameter. The weights are automatically converted when loading the model; see the example after the list below.
- `f16` for 16-bit floating-point
- `f32` for 32-bit floating-point
- `q8_0` for 8-bit integer quantization
- `q5_0` or `q5_1` for 5-bit integer quantization
- `q4_0` or `q4_1` for 4-bit integer quantization
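For example, to quantize a model to `q8_0` on the fly while loading it (a minimal sketch; adjust the model path to your setup):
```sh
./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors --type q8_0 -p "a lovely cat"
```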
### Memory Requirements of Stable Diffusion 1.x
| precision | f32 | f16 |q8_0 |q5_0 |q5_1 |q4_0 |q4_1 |
| ---- | ---- |---- |---- |---- |---- |---- |---- |
| **Memory** (txt2img - 512 x 512) | ~2.8G | ~2.3G | ~2.1G | ~2.0G | ~2.0G | ~2.0G | ~2.0G |
| **Memory** (txt2img - 512 x 512) *with Flash Attention* | ~2.4G | ~1.9G | ~1.6G | ~1.5G | ~1.5G | ~1.5G | ~1.5G |
## Convert to GGUF
You can also convert weights in the formats `ckpt/safetensors/diffusers` to gguf and perform quantization in advance, avoiding the need for quantization every time you load them.
For example:
```sh
./bin/sd -M convert -m ../models/v1-5-pruned-emaonly.safetensors -o ../models/v1-5-pruned-emaonly.q8_0.gguf -v --type q8_0
```
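The converted file can then be loaded directly in place of the original weights, e.g. (assuming the output path from the conversion above):
```sh
./bin/sd -m ../models/v1-5-pruned-emaonly.q8_0.gguf -p "a lovely cat"
```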

docs/sd3.md Normal file
@ -0,0 +1,20 @@
# How to Use
## Download weights
- Download sd3.5_large from https://huggingface.co/stabilityai/stable-diffusion-3.5-large/blob/main/sd3.5_large.safetensors
- Download clip_g from https://huggingface.co/Comfy-Org/stable-diffusion-3.5-fp8/blob/main/text_encoders/clip_g.safetensors
- Download clip_l from https://huggingface.co/Comfy-Org/stable-diffusion-3.5-fp8/blob/main/text_encoders/clip_l.safetensors
- Download t5xxl from https://huggingface.co/Comfy-Org/stable-diffusion-3.5-fp8/blob/main/text_encoders/t5xxl_fp16.safetensors
## Run
### SD3.5 Large
For example:
```
.\bin\Release\sd.exe -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v
```
![](../assets/sd3.5_large.png)

docs/taesd.md Normal file
@ -0,0 +1,17 @@
## Using TAESD for faster decoding
You can use TAESD to accelerate the decoding of latent images by following these steps:
- Download the model [weights](https://huggingface.co/madebyollin/taesd/blob/main/diffusion_pytorch_model.safetensors), or fetch them with curl:
```bash
curl -L -O https://huggingface.co/madebyollin/taesd/resolve/main/diffusion_pytorch_model.safetensors
```
- Specify the model path using the `--taesd PATH` parameter. Example:
```bash
sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --taesd ../models/diffusion_pytorch_model.safetensors
```

@ -12,279 +12,151 @@
*/
class ResidualDenseBlock : public GGMLBlock {
protected:
    int num_feat;
    int num_grow_ch;

public:
    ResidualDenseBlock(int num_feat = 64, int num_grow_ch = 32)
        : num_feat(num_feat), num_grow_ch(num_grow_ch) {
        blocks["conv1"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
        blocks["conv2"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
        blocks["conv3"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 2 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
        blocks["conv4"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 3 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
        blocks["conv5"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 4 * num_grow_ch, num_feat, {3, 3}, {1, 1}, {1, 1}));
    }

    struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) {
        return ggml_leaky_relu(ctx, x, 0.2f, true);
    }

    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
        // x: [n, num_feat, h, w]
        // return: [n, num_feat, h, w]
        auto conv1 = std::dynamic_pointer_cast<Conv2d>(blocks["conv1"]);
        auto conv2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv2"]);
        auto conv3 = std::dynamic_pointer_cast<Conv2d>(blocks["conv3"]);
        auto conv4 = std::dynamic_pointer_cast<Conv2d>(blocks["conv4"]);
        auto conv5 = std::dynamic_pointer_cast<Conv2d>(blocks["conv5"]);

        auto x1    = lrelu(ctx, conv1->forward(ctx, x));
        auto x_cat = ggml_concat(ctx, x, x1, 2);
        auto x2    = lrelu(ctx, conv2->forward(ctx, x_cat));
        x_cat      = ggml_concat(ctx, x_cat, x2, 2);
        auto x3    = lrelu(ctx, conv3->forward(ctx, x_cat));
        x_cat      = ggml_concat(ctx, x_cat, x3, 2);
        auto x4    = lrelu(ctx, conv4->forward(ctx, x_cat));
        x_cat      = ggml_concat(ctx, x_cat, x4, 2);
        auto x5    = conv5->forward(ctx, x_cat);

        // return x5 * 0.2 + x
        x5 = ggml_add(ctx, ggml_scale(ctx, x5, 0.2f), x);
        return x5;
    }
};

class RRDB : public GGMLBlock {
public:
    RRDB(int num_feat, int num_grow_ch = 32) {
        blocks["rdb1"] = std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch));
        blocks["rdb2"] = std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch));
        blocks["rdb3"] = std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch));
    }

    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
        // x: [n, num_feat, h, w]
        // return: [n, num_feat, h, w]
        auto rdb1 = std::dynamic_pointer_cast<ResidualDenseBlock>(blocks["rdb1"]);
        auto rdb2 = std::dynamic_pointer_cast<ResidualDenseBlock>(blocks["rdb2"]);
        auto rdb3 = std::dynamic_pointer_cast<ResidualDenseBlock>(blocks["rdb3"]);

        auto out = rdb1->forward(ctx, x);
        out      = rdb2->forward(ctx, out);
        out      = rdb3->forward(ctx, out);

        // return out * 0.2 + x
        out = ggml_add(ctx, ggml_scale(ctx, out, 0.2f), x);
        return out;
    }
};

class RRDBNet : public GGMLBlock {
protected:
    int scale       = 4;   // default RealESRGAN_x4plus_anime_6B
    int num_block   = 6;   // default RealESRGAN_x4plus_anime_6B
    int num_in_ch   = 3;
    int num_out_ch  = 3;
    int num_feat    = 64;  // default RealESRGAN_x4plus_anime_6B
    int num_grow_ch = 32;  // default RealESRGAN_x4plus_anime_6B

public:
    RRDBNet() {
        blocks["conv_first"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_in_ch, num_feat, {3, 3}, {1, 1}, {1, 1}));
        for (int i = 0; i < num_block; i++) {
            std::string name = "body." + std::to_string(i);
            blocks[name]     = std::shared_ptr<GGMLBlock>(new RRDB(num_feat, num_grow_ch));
        }
        blocks["conv_body"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
        // upsample
        blocks["conv_up1"]  = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
        blocks["conv_up2"]  = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
        blocks["conv_hr"]   = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
        blocks["conv_last"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_out_ch, {3, 3}, {1, 1}, {1, 1}));
    }

    struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) {
        return ggml_leaky_relu(ctx, x, 0.2f, true);
    }

    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
        // x: [n, num_in_ch, h, w]
        // return: [n, num_out_ch, h*4, w*4]
        auto conv_first = std::dynamic_pointer_cast<Conv2d>(blocks["conv_first"]);
        auto conv_body  = std::dynamic_pointer_cast<Conv2d>(blocks["conv_body"]);
        auto conv_up1   = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up1"]);
        auto conv_up2   = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up2"]);
        auto conv_hr    = std::dynamic_pointer_cast<Conv2d>(blocks["conv_hr"]);
        auto conv_last  = std::dynamic_pointer_cast<Conv2d>(blocks["conv_last"]);

        auto feat      = conv_first->forward(ctx, x);
        auto body_feat = feat;
        for (int i = 0; i < num_block; i++) {
            std::string name = "body." + std::to_string(i);
            auto block       = std::dynamic_pointer_cast<RRDB>(blocks[name]);
            body_feat        = block->forward(ctx, body_feat);
        }
        body_feat = conv_body->forward(ctx, body_feat);
        feat      = ggml_add(ctx, feat, body_feat);
        // upsample
        feat     = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
        feat     = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
        auto out = conv_last->forward(ctx, lrelu(ctx, conv_hr->forward(ctx, feat)));
        return out;
    }
};

struct ESRGAN : public GGMLRunner {
    RRDBNet rrdb_net;
    int scale     = 4;
    int tile_size = 128;  // avoid cuda OOM for 4gb VRAM

    ESRGAN(ggml_backend_t backend, std::map<std::string, enum ggml_type>& tensor_types)
        : GGMLRunner(backend) {
        rrdb_net.init(params_ctx, tensor_types, "");
    }

    std::string get_desc() {
        return "esrgan";
    }

    bool load_from_file(const std::string& file_path) {
        LOG_INFO("loading esrgan from '%s'", file_path.c_str());
        alloc_params_buffer();
        std::map<std::string, ggml_tensor*> esrgan_tensors;
        rrdb_net.get_param_tensors(esrgan_tensors);
        ModelLoader model_loader;
        if (!model_loader.init_from_file(file_path)) {
@ -303,120 +175,22 @@ struct ESRGAN : public GGMLModule {
        return success;
    }

    struct ggml_cgraph* build_graph(struct ggml_tensor* x) {
        struct ggml_cgraph* gf  = ggml_new_graph(compute_ctx);
        x                       = to_backend(x);
        struct ggml_tensor* out = rrdb_net.forward(compute_ctx, x);
        ggml_build_forward_expand(gf, out);
        return gf;
    }

    void compute(const int n_threads,
                 struct ggml_tensor* x,
                 ggml_tensor** output,
                 ggml_context* output_ctx = NULL) {
        auto get_graph = [&]() -> struct ggml_cgraph* {
            return build_graph(x);
        };
        GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
    }
};

File diff suppressed because it is too large

face_detect.py Normal file
@ -0,0 +1,88 @@
import os
import sys
import numpy as np
import torch
from diffusers.utils import load_image
# pip install insightface==0.7.3
from insightface.app import FaceAnalysis
from insightface.data import get_image as ins_get_image
from safetensors.torch import save_file
###
# https://github.com/cubiq/ComfyUI_IPAdapter_plus/issues/165#issue-2055829543
###
class FaceAnalysis2(FaceAnalysis):
# NOTE: allows setting det_size for each detection call.
# the model allows it but the wrapping code from insightface
# doesn't show it, and people end up loading duplicate models
# for different sizes where there is absolutely no need to
def get(self, img, max_num=0, det_size=(640, 640)):
if det_size is not None:
self.det_model.input_size = det_size
return super().get(img, max_num)
def analyze_faces(face_analysis: FaceAnalysis, img_data: np.ndarray, det_size=(640, 640)):
# NOTE: try detect faces, if no faces detected, lower det_size until it does
detection_sizes = [None] + [(size, size) for size in range(640, 256, -64)] + [(256, 256)]
for size in detection_sizes:
faces = face_analysis.get(img_data, det_size=size)
if len(faces) > 0:
return faces
return []
if __name__ == "__main__":
#face_detector = FaceAnalysis2(providers=['CUDAExecutionProvider'], allowed_modules=['detection', 'recognition'])
face_detector = FaceAnalysis2(providers=['CPUExecutionProvider'], allowed_modules=['detection', 'recognition'])
face_detector.prepare(ctx_id=0, det_size=(640, 640))
#input_folder_name = './scarletthead_woman'
input_folder_name = sys.argv[1]
image_basename_list = os.listdir(input_folder_name)
image_path_list = sorted([os.path.join(input_folder_name, basename) for basename in image_basename_list])
input_id_images = []
for image_path in image_path_list:
input_id_images.append(load_image(image_path))
id_embed_list = []
for img in input_id_images:
img = np.array(img)
img = img[:, :, ::-1]
faces = analyze_faces(face_detector, img)
if len(faces) > 0:
id_embed_list.append(torch.from_numpy((faces[0]['embedding'])))
if len(id_embed_list) == 0:
raise ValueError(f"No face detected in input image pool")
id_embeds = torch.stack(id_embed_list)
# for r in id_embeds:
# print(r)
# #torch.save(id_embeds, input_folder_name+'/id_embeds.pt');
# weights = dict()
# weights["id_embeds"] = id_embeds
# save_file(weights, input_folder_name+'/id_embeds.safetensors')
binary_data = id_embeds.numpy().tobytes()
two = 4
zero = 0
one = 1
tensor_name = "id_embeds"
# Write binary data to a file
with open(input_folder_name+'/id_embeds.bin', "wb") as f:
f.write(two.to_bytes(4, byteorder='little'))
f.write((len(tensor_name)).to_bytes(4, byteorder='little'))
f.write(zero.to_bytes(4, byteorder='little'))
f.write((id_embeds.shape[1]).to_bytes(4, byteorder='little'))
f.write((id_embeds.shape[0]).to_bytes(4, byteorder='little'))
f.write(one.to_bytes(4, byteorder='little'))
f.write(one.to_bytes(4, byteorder='little'))
f.write(tensor_name.encode('ascii'))
f.write(binary_data)

flux.hpp Normal file

File diff suppressed because it is too large
ggml
@ -1 +1 @@
Subproject commit e5d3412fa2ea3de8c4a696c03dce73c470442dc1
Subproject commit 9e4bee1c5afc2d677a5b32ecb90cbdb483e81fff

File diff suppressed because it is too large

gits_noise.inl Normal file
@ -0,0 +1,349 @@
#ifndef GITS_NOISE_INL
#define GITS_NOISE_INL
const std::vector<std::vector<float>> GITS_NOISE_0_80 = {
{ 14.61464119f, 7.49001646f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 6.77309084f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 3.07277966f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 2.05039096f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 2.05039096f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 3.07277966f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 11.54541874f, 8.75849152f, 7.49001646f, 5.85520077f, 3.07277966f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 8.75849152f, 7.49001646f, 5.85520077f, 3.07277966f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 8.75849152f, 7.49001646f, 5.85520077f, 3.07277966f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 5.85520077f, 3.07277966f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.07277966f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.07277966f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.07277966f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.07277966f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.75859547f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.19567990f, 1.98035145f, 0.86115354f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.75859547f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.19567990f, 1.98035145f, 0.86115354f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.75859547f, 9.24142551f, 8.75849152f, 8.30717278f, 7.88507891f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.07277966f, 1.84880662f, 0.83188516f, 0.02916753f }
};
const std::vector<std::vector<float>> GITS_NOISE_0_85 = {
{ 14.61464119f, 7.49001646f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 1.84880662f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 6.77309084f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.11996698f, 3.07277966f, 1.24153244f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.09240818f, 2.84484982f, 0.95350921f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.09240818f, 2.84484982f, 0.95350921f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.58536053f, 3.19567990f, 1.84880662f, 0.803307f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 11.54541874f, 8.75849152f, 7.49001646f, 5.58536053f, 3.19567990f, 1.84880662f, 0.803307f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 11.54541874f, 8.75849152f, 7.49001646f, 6.14220476f, 4.65472794f, 3.07277966f, 1.84880662f, 0.803307f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 8.75849152f, 7.49001646f, 6.14220476f, 4.65472794f, 3.07277966f, 1.84880662f, 0.803307f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.65472794f, 3.07277966f, 1.84880662f, 0.803307f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.65472794f, 3.07277966f, 1.84880662f, 0.803307f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.65472794f, 3.07277966f, 1.84880662f, 0.803307f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.60512662f, 2.63833880f, 1.56271636f, 0.72133851f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.46139455f, 2.45070267f, 1.56271636f, 0.72133851f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.46139455f, 2.45070267f, 1.56271636f, 0.72133851f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.46139455f, 2.45070267f, 1.56271636f, 0.72133851f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.75859547f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.46139455f, 2.45070267f, 1.56271636f, 0.72133851f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.75859547f, 9.24142551f, 8.75849152f, 8.30717278f, 7.88507891f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.46139455f, 2.45070267f, 1.56271636f, 0.72133851f, 0.02916753f }
};
const std::vector<std::vector<float>> GITS_NOISE_0_90 = {
{ 14.61464119f, 6.77309084f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 3.07277966f, 0.95350921f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.54230714f, 0.89115214f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 4.86714602f, 2.54230714f, 0.89115214f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.09240818f, 3.07277966f, 1.61558151f, 0.69515091f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.11996698f, 4.86714602f, 3.07277966f, 1.61558151f, 0.69515091f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 2.95596409f, 1.61558151f, 0.69515091f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.19988537f, 1.24153244f, 0.57119018f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 10.90732002f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.19988537f, 1.24153244f, 0.57119018f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 11.54541874f, 9.24142551f, 8.30717278f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.19988537f, 1.24153244f, 0.57119018f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 11.54541874f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.75677586f, 2.84484982f, 1.84880662f, 1.08895338f, 0.52423614f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.75677586f, 2.84484982f, 1.84880662f, 1.08895338f, 0.52423614f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.44769001f, 5.58536053f, 4.45427561f, 3.32507086f, 2.45070267f, 1.61558151f, 0.95350921f, 0.45573691f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.44769001f, 5.58536053f, 4.45427561f, 3.32507086f, 2.45070267f, 1.61558151f, 0.95350921f, 0.45573691f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.86714602f, 3.91689563f, 3.07277966f, 2.27973175f, 1.56271636f, 0.95350921f, 0.45573691f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.86714602f, 3.91689563f, 3.07277966f, 2.27973175f, 1.56271636f, 0.95350921f, 0.45573691f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.86714602f, 3.91689563f, 3.07277966f, 2.27973175f, 1.56271636f, 0.95350921f, 0.45573691f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 5.09240818f, 4.45427561f, 3.60512662f, 2.95596409f, 2.19988537f, 1.51179266f, 0.89115214f, 0.43325692f, 0.02916753f }
};
const std::vector<std::vector<float>> GITS_NOISE_0_95 = {
{ 14.61464119f, 6.77309084f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 2.84484982f, 0.89115214f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.36326075f, 0.803307f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.95596409f, 1.56271636f, 0.64427125f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 4.86714602f, 2.95596409f, 1.56271636f, 0.64427125f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 4.86714602f, 3.07277966f, 1.91321158f, 1.08895338f, 0.50118381f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.07277966f, 1.91321158f, 1.08895338f, 0.50118381f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 3.07277966f, 1.91321158f, 1.08895338f, 0.50118381f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.19988537f, 1.41535246f, 0.803307f, 0.38853383f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 4.65472794f, 3.46139455f, 2.63833880f, 1.84880662f, 1.24153244f, 0.72133851f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 10.90732002f, 8.75849152f, 7.49001646f, 5.85520077f, 4.65472794f, 3.46139455f, 2.63833880f, 1.84880662f, 1.24153244f, 0.72133851f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 10.90732002f, 8.75849152f, 7.49001646f, 6.14220476f, 4.86714602f, 3.75677586f, 2.95596409f, 2.19988537f, 1.56271636f, 1.05362725f, 0.64427125f, 0.32104823f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 10.90732002f, 8.75849152f, 7.49001646f, 6.44769001f, 5.58536053f, 4.65472794f, 3.60512662f, 2.95596409f, 2.19988537f, 1.56271636f, 1.05362725f, 0.64427125f, 0.32104823f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 11.54541874f, 9.24142551f, 8.30717278f, 7.49001646f, 6.44769001f, 5.58536053f, 4.65472794f, 3.60512662f, 2.95596409f, 2.19988537f, 1.56271636f, 1.05362725f, 0.64427125f, 0.32104823f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 11.54541874f, 9.24142551f, 8.30717278f, 7.49001646f, 6.44769001f, 5.58536053f, 4.65472794f, 3.75677586f, 3.07277966f, 2.45070267f, 1.78698075f, 1.24153244f, 0.83188516f, 0.50118381f, 0.22545385f, 0.02916753f },
{ 14.61464119f, 12.96784878f, 11.54541874f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 5.09240818f, 4.45427561f, 3.60512662f, 2.95596409f, 2.36326075f, 1.72759056f, 1.24153244f, 0.83188516f, 0.50118381f, 0.22545385f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 5.09240818f, 4.45427561f, 3.60512662f, 2.95596409f, 2.36326075f, 1.72759056f, 1.24153244f, 0.83188516f, 0.50118381f, 0.22545385f, 0.02916753f },
{ 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 5.09240818f, 4.45427561f, 3.75677586f, 3.07277966f, 2.45070267f, 1.91321158f, 1.46270394f, 1.05362725f, 0.72133851f, 0.43325692f, 0.19894916f, 0.02916753f }
};
const std::vector<std::vector<float>> GITS_NOISE_1_00 = {
{ 14.61464119f, 1.56271636f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 0.95350921f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 2.36326075f, 0.803307f, 0.02916753f },
{ 14.61464119f, 7.11996698f, 3.07277966f, 1.56271636f, 0.59516323f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.84484982f, 1.41535246f, 0.57119018f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.84484982f, 1.61558151f, 0.86115354f, 0.38853383f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 4.86714602f, 2.84484982f, 1.61558151f, 0.86115354f, 0.38853383f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 4.86714602f, 3.07277966f, 1.98035145f, 1.24153244f, 0.72133851f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.07277966f, 1.98035145f, 1.24153244f, 0.72133851f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.27973175f, 1.51179266f, 0.95350921f, 0.54755926f, 0.25053367f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.36326075f, 1.61558151f, 1.08895338f, 0.72133851f, 0.41087446f, 0.17026083f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.36326075f, 1.61558151f, 1.08895338f, 0.72133851f, 0.41087446f, 0.17026083f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.84484982f, 2.12350607f, 1.56271636f, 1.08895338f, 0.72133851f, 0.41087446f, 0.17026083f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.84484982f, 2.19988537f, 1.61558151f, 1.162866f, 0.803307f, 0.50118381f, 0.27464288f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 5.85520077f, 4.65472794f, 3.75677586f, 3.07277966f, 2.45070267f, 1.84880662f, 1.36964464f, 1.01931262f, 0.72133851f, 0.45573691f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 6.14220476f, 5.09240818f, 4.26497746f, 3.46139455f, 2.84484982f, 2.19988537f, 1.67050016f, 1.24153244f, 0.92192322f, 0.64427125f, 0.43325692f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 6.14220476f, 5.09240818f, 4.26497746f, 3.60512662f, 2.95596409f, 2.45070267f, 1.91321158f, 1.51179266f, 1.12534678f, 0.83188516f, 0.59516323f, 0.38853383f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 5.09240818f, 4.26497746f, 3.60512662f, 2.95596409f, 2.45070267f, 1.91321158f, 1.51179266f, 1.12534678f, 0.83188516f, 0.59516323f, 0.38853383f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 12.23089790f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 5.09240818f, 4.26497746f, 3.60512662f, 2.95596409f, 2.45070267f, 1.91321158f, 1.51179266f, 1.12534678f, 0.83188516f, 0.59516323f, 0.38853383f, 0.22545385f, 0.09824532f, 0.02916753f }
};
const std::vector<std::vector<float>> GITS_NOISE_1_05 = {
{ 14.61464119f, 0.95350921f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 0.89115214f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 2.05039096f, 0.72133851f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 2.84484982f, 1.28281462f, 0.52423614f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 3.07277966f, 1.61558151f, 0.803307f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.84484982f, 1.56271636f, 0.803307f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.84484982f, 1.61558151f, 0.95350921f, 0.52423614f, 0.22545385f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 1.98035145f, 1.24153244f, 0.74807048f, 0.41087446f, 0.17026083f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.27973175f, 1.51179266f, 0.95350921f, 0.59516323f, 0.34370604f, 0.13792117f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 5.09240818f, 3.46139455f, 2.45070267f, 1.61558151f, 1.08895338f, 0.72133851f, 0.45573691f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.09240818f, 3.46139455f, 2.45070267f, 1.61558151f, 1.08895338f, 0.72133851f, 0.45573691f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.36326075f, 1.61558151f, 1.08895338f, 0.72133851f, 0.45573691f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.45070267f, 1.72759056f, 1.24153244f, 0.86115354f, 0.59516323f, 0.38853383f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.84484982f, 2.19988537f, 1.61558151f, 1.162866f, 0.83188516f, 0.59516323f, 0.38853383f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.84484982f, 2.19988537f, 1.67050016f, 1.28281462f, 0.95350921f, 0.72133851f, 0.52423614f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.95596409f, 2.36326075f, 1.84880662f, 1.41535246f, 1.08895338f, 0.83188516f, 0.61951244f, 0.45573691f, 0.32104823f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.95596409f, 2.45070267f, 1.91321158f, 1.51179266f, 1.20157266f, 0.95350921f, 0.74807048f, 0.57119018f, 0.43325692f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 8.30717278f, 7.11996698f, 5.85520077f, 4.65472794f, 3.60512662f, 2.95596409f, 2.45070267f, 1.91321158f, 1.51179266f, 1.20157266f, 0.95350921f, 0.74807048f, 0.57119018f, 0.43325692f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 8.30717278f, 7.11996698f, 5.85520077f, 4.65472794f, 3.60512662f, 2.95596409f, 2.45070267f, 1.98035145f, 1.61558151f, 1.32549286f, 1.08895338f, 0.86115354f, 0.69515091f, 0.54755926f, 0.41087446f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f }
};
const std::vector<std::vector<float>> GITS_NOISE_1_10 = {
{ 14.61464119f, 0.89115214f, 0.02916753f },
{ 14.61464119f, 2.36326075f, 0.72133851f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 1.61558151f, 0.57119018f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 2.45070267f, 1.08895338f, 0.45573691f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 2.95596409f, 1.56271636f, 0.803307f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 3.07277966f, 1.61558151f, 0.89115214f, 0.4783645f, 0.19894916f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 3.07277966f, 1.84880662f, 1.08895338f, 0.64427125f, 0.34370604f, 0.13792117f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.84484982f, 1.61558151f, 0.95350921f, 0.54755926f, 0.27464288f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.95596409f, 1.91321158f, 1.24153244f, 0.803307f, 0.4783645f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 2.05039096f, 1.41535246f, 0.95350921f, 0.64427125f, 0.41087446f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.27973175f, 1.61558151f, 1.12534678f, 0.803307f, 0.54755926f, 0.36617002f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.32507086f, 2.45070267f, 1.72759056f, 1.24153244f, 0.89115214f, 0.64427125f, 0.45573691f, 0.32104823f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 5.09240818f, 3.60512662f, 2.84484982f, 2.05039096f, 1.51179266f, 1.08895338f, 0.803307f, 0.59516323f, 0.43325692f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 5.09240818f, 3.60512662f, 2.84484982f, 2.12350607f, 1.61558151f, 1.24153244f, 0.95350921f, 0.72133851f, 0.54755926f, 0.41087446f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.45070267f, 1.84880662f, 1.41535246f, 1.08895338f, 0.83188516f, 0.64427125f, 0.50118381f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.45070267f, 1.91321158f, 1.51179266f, 1.20157266f, 0.95350921f, 0.74807048f, 0.59516323f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 5.85520077f, 4.45427561f, 3.46139455f, 2.84484982f, 2.19988537f, 1.72759056f, 1.36964464f, 1.08895338f, 0.86115354f, 0.69515091f, 0.54755926f, 0.43325692f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.46139455f, 2.84484982f, 2.19988537f, 1.72759056f, 1.36964464f, 1.08895338f, 0.86115354f, 0.69515091f, 0.54755926f, 0.43325692f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.46139455f, 2.84484982f, 2.19988537f, 1.72759056f, 1.36964464f, 1.08895338f, 0.89115214f, 0.72133851f, 0.59516323f, 0.4783645f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.09824532f, 0.02916753f }
};
const std::vector<std::vector<float>> GITS_NOISE_1_15 = {
{ 14.61464119f, 0.83188516f, 0.02916753f },
{ 14.61464119f, 1.84880662f, 0.59516323f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 1.56271636f, 0.52423614f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 1.91321158f, 0.83188516f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.45070267f, 1.24153244f, 0.59516323f, 0.25053367f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.51179266f, 0.803307f, 0.41087446f, 0.17026083f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.56271636f, 0.89115214f, 0.50118381f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 3.07277966f, 1.84880662f, 1.12534678f, 0.72133851f, 0.43325692f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 3.07277966f, 1.91321158f, 1.24153244f, 0.803307f, 0.52423614f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 2.95596409f, 1.91321158f, 1.24153244f, 0.803307f, 0.52423614f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 2.05039096f, 1.36964464f, 0.95350921f, 0.69515091f, 0.4783645f, 0.32104823f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 2.12350607f, 1.51179266f, 1.08895338f, 0.803307f, 0.59516323f, 0.43325692f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 2.12350607f, 1.51179266f, 1.08895338f, 0.803307f, 0.59516323f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 2.19988537f, 1.61558151f, 1.24153244f, 0.95350921f, 0.74807048f, 0.59516323f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.45070267f, 1.78698075f, 1.32549286f, 1.01931262f, 0.803307f, 0.64427125f, 0.50118381f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.45070267f, 1.78698075f, 1.32549286f, 1.01931262f, 0.803307f, 0.64427125f, 0.52423614f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.45070267f, 1.84880662f, 1.41535246f, 1.12534678f, 0.89115214f, 0.72133851f, 0.59516323f, 0.4783645f, 0.38853383f, 0.32104823f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.45070267f, 1.84880662f, 1.41535246f, 1.12534678f, 0.89115214f, 0.72133851f, 0.59516323f, 0.50118381f, 0.41087446f, 0.34370604f, 0.29807833f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
};
const std::vector<std::vector<float>> GITS_NOISE_1_20 = {
{ 14.61464119f, 0.803307f, 0.02916753f },
{ 14.61464119f, 1.56271636f, 0.52423614f, 0.02916753f },
{ 14.61464119f, 2.36326075f, 0.92192322f, 0.36617002f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.24153244f, 0.59516323f, 0.25053367f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.05039096f, 0.95350921f, 0.45573691f, 0.17026083f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.45070267f, 1.24153244f, 0.64427125f, 0.29807833f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.45070267f, 1.36964464f, 0.803307f, 0.45573691f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 0.95350921f, 0.59516323f, 0.36617002f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.67050016f, 1.08895338f, 0.74807048f, 0.50118381f, 0.32104823f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.95596409f, 1.84880662f, 1.24153244f, 0.83188516f, 0.59516323f, 0.41087446f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 3.07277966f, 1.98035145f, 1.36964464f, 0.95350921f, 0.69515091f, 0.50118381f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 3.46139455f, 2.36326075f, 1.56271636f, 1.08895338f, 0.803307f, 0.59516323f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 6.77309084f, 3.46139455f, 2.45070267f, 1.61558151f, 1.162866f, 0.86115354f, 0.64427125f, 0.50118381f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.12350607f, 1.51179266f, 1.08895338f, 0.83188516f, 0.64427125f, 0.50118381f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.12350607f, 1.51179266f, 1.08895338f, 0.83188516f, 0.64427125f, 0.50118381f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.12350607f, 1.51179266f, 1.08895338f, 0.83188516f, 0.64427125f, 0.50118381f, 0.41087446f, 0.34370604f, 0.27464288f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.19988537f, 1.61558151f, 1.20157266f, 0.92192322f, 0.72133851f, 0.57119018f, 0.45573691f, 0.36617002f, 0.29807833f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.19988537f, 1.61558151f, 1.24153244f, 0.95350921f, 0.74807048f, 0.59516323f, 0.4783645f, 0.38853383f, 0.32104823f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.19988537f, 1.61558151f, 1.24153244f, 0.95350921f, 0.74807048f, 0.59516323f, 0.50118381f, 0.41087446f, 0.34370604f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
};
const std::vector<std::vector<float>> GITS_NOISE_1_25 = {
{ 14.61464119f, 0.72133851f, 0.02916753f },
{ 14.61464119f, 1.56271636f, 0.50118381f, 0.02916753f },
{ 14.61464119f, 2.05039096f, 0.803307f, 0.32104823f, 0.02916753f },
{ 14.61464119f, 2.36326075f, 0.95350921f, 0.43325692f, 0.17026083f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.24153244f, 0.59516323f, 0.27464288f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 3.07277966f, 1.51179266f, 0.803307f, 0.43325692f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.36326075f, 1.24153244f, 0.72133851f, 0.41087446f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.45070267f, 1.36964464f, 0.83188516f, 0.52423614f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 0.98595673f, 0.64427125f, 0.43325692f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.67050016f, 1.08895338f, 0.74807048f, 0.52423614f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.72759056f, 1.162866f, 0.803307f, 0.59516323f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.95596409f, 1.84880662f, 1.24153244f, 0.86115354f, 0.64427125f, 0.4783645f, 0.36617002f, 0.27464288f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.95596409f, 1.84880662f, 1.28281462f, 0.92192322f, 0.69515091f, 0.52423614f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.95596409f, 1.91321158f, 1.32549286f, 0.95350921f, 0.72133851f, 0.54755926f, 0.43325692f, 0.34370604f, 0.27464288f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.95596409f, 1.91321158f, 1.32549286f, 0.95350921f, 0.72133851f, 0.57119018f, 0.45573691f, 0.36617002f, 0.29807833f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.95596409f, 1.91321158f, 1.32549286f, 0.95350921f, 0.74807048f, 0.59516323f, 0.4783645f, 0.38853383f, 0.32104823f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 3.07277966f, 2.05039096f, 1.41535246f, 1.05362725f, 0.803307f, 0.61951244f, 0.50118381f, 0.41087446f, 0.34370604f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 3.07277966f, 2.05039096f, 1.41535246f, 1.05362725f, 0.803307f, 0.64427125f, 0.52423614f, 0.43325692f, 0.36617002f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 3.07277966f, 2.05039096f, 1.46270394f, 1.08895338f, 0.83188516f, 0.66947293f, 0.54755926f, 0.45573691f, 0.38853383f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
};
const std::vector<std::vector<float>> GITS_NOISE_1_30 = {
{ 14.61464119f, 0.72133851f, 0.02916753f },
{ 14.61464119f, 1.24153244f, 0.43325692f, 0.02916753f },
{ 14.61464119f, 1.56271636f, 0.59516323f, 0.22545385f, 0.02916753f },
{ 14.61464119f, 1.84880662f, 0.803307f, 0.36617002f, 0.13792117f, 0.02916753f },
{ 14.61464119f, 2.36326075f, 1.01931262f, 0.52423614f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.36964464f, 0.74807048f, 0.41087446f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 3.07277966f, 1.56271636f, 0.89115214f, 0.54755926f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 3.07277966f, 1.61558151f, 0.95350921f, 0.61951244f, 0.41087446f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.45070267f, 1.36964464f, 0.83188516f, 0.54755926f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.45070267f, 1.41535246f, 0.92192322f, 0.64427125f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.6383388f, 1.56271636f, 1.01931262f, 0.72133851f, 0.50118381f, 0.36617002f, 0.27464288f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 1.05362725f, 0.74807048f, 0.54755926f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 1.08895338f, 0.77538133f, 0.57119018f, 0.43325692f, 0.34370604f, 0.27464288f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 1.08895338f, 0.803307f, 0.59516323f, 0.45573691f, 0.36617002f, 0.29807833f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 1.08895338f, 0.803307f, 0.59516323f, 0.4783645f, 0.38853383f, 0.32104823f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.72759056f, 1.162866f, 0.83188516f, 0.64427125f, 0.50118381f, 0.41087446f, 0.34370604f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.72759056f, 1.162866f, 0.83188516f, 0.64427125f, 0.52423614f, 0.43325692f, 0.36617002f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.78698075f, 1.24153244f, 0.92192322f, 0.72133851f, 0.57119018f, 0.45573691f, 0.38853383f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.84484982f, 1.78698075f, 1.24153244f, 0.92192322f, 0.72133851f, 0.57119018f, 0.4783645f, 0.41087446f, 0.36617002f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
};
const std::vector<std::vector<float>> GITS_NOISE_1_35 = {
{ 14.61464119f, 0.69515091f, 0.02916753f },
{ 14.61464119f, 0.95350921f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 1.56271636f, 0.57119018f, 0.19894916f, 0.02916753f },
{ 14.61464119f, 1.61558151f, 0.69515091f, 0.29807833f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.84880662f, 0.83188516f, 0.43325692f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.162866f, 0.64427125f, 0.36617002f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.36964464f, 0.803307f, 0.50118381f, 0.32104823f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.41535246f, 0.83188516f, 0.54755926f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.56271636f, 0.95350921f, 0.64427125f, 0.45573691f, 0.32104823f, 0.22545385f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.56271636f, 0.95350921f, 0.64427125f, 0.45573691f, 0.34370604f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 3.07277966f, 1.61558151f, 1.01931262f, 0.72133851f, 0.52423614f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 3.07277966f, 1.61558151f, 1.01931262f, 0.72133851f, 0.52423614f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 3.07277966f, 1.61558151f, 1.05362725f, 0.74807048f, 0.54755926f, 0.43325692f, 0.34370604f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 3.07277966f, 1.72759056f, 1.12534678f, 0.803307f, 0.59516323f, 0.45573691f, 0.36617002f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 3.07277966f, 1.72759056f, 1.12534678f, 0.803307f, 0.59516323f, 0.4783645f, 0.38853383f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.45070267f, 1.51179266f, 1.01931262f, 0.74807048f, 0.57119018f, 0.45573691f, 0.36617002f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.6383388f, 1.61558151f, 1.08895338f, 0.803307f, 0.61951244f, 0.50118381f, 0.41087446f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.6383388f, 1.61558151f, 1.08895338f, 0.803307f, 0.64427125f, 0.52423614f, 0.43325692f, 0.36617002f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 5.85520077f, 2.6383388f, 1.61558151f, 1.08895338f, 0.803307f, 0.64427125f, 0.52423614f, 0.45573691f, 0.38853383f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
};
const std::vector<std::vector<float>> GITS_NOISE_1_40 = {
{ 14.61464119f, 0.59516323f, 0.02916753f },
{ 14.61464119f, 0.95350921f, 0.34370604f, 0.02916753f },
{ 14.61464119f, 1.08895338f, 0.43325692f, 0.13792117f, 0.02916753f },
{ 14.61464119f, 1.56271636f, 0.64427125f, 0.27464288f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.61558151f, 0.803307f, 0.43325692f, 0.22545385f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.05039096f, 0.95350921f, 0.54755926f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.24153244f, 0.72133851f, 0.43325692f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.24153244f, 0.74807048f, 0.50118381f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.28281462f, 0.803307f, 0.52423614f, 0.36617002f, 0.27464288f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.28281462f, 0.803307f, 0.54755926f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.41535246f, 0.86115354f, 0.59516323f, 0.43325692f, 0.32104823f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.51179266f, 0.95350921f, 0.64427125f, 0.45573691f, 0.34370604f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.51179266f, 0.95350921f, 0.64427125f, 0.4783645f, 0.36617002f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.56271636f, 0.98595673f, 0.69515091f, 0.52423614f, 0.41087446f, 0.34370604f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.56271636f, 1.01931262f, 0.72133851f, 0.54755926f, 0.43325692f, 0.36617002f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.61558151f, 1.05362725f, 0.74807048f, 0.57119018f, 0.45573691f, 0.38853383f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.61558151f, 1.08895338f, 0.803307f, 0.61951244f, 0.50118381f, 0.41087446f, 0.36617002f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.61558151f, 1.08895338f, 0.803307f, 0.61951244f, 0.50118381f, 0.43325692f, 0.38853383f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.61558151f, 1.08895338f, 0.803307f, 0.64427125f, 0.52423614f, 0.45573691f, 0.41087446f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
};
const std::vector<std::vector<float>> GITS_NOISE_1_45 = {
{ 14.61464119f, 0.59516323f, 0.02916753f },
{ 14.61464119f, 0.803307f, 0.25053367f, 0.02916753f },
{ 14.61464119f, 0.95350921f, 0.34370604f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.24153244f, 0.54755926f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.56271636f, 0.72133851f, 0.36617002f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.61558151f, 0.803307f, 0.45573691f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.91321158f, 0.95350921f, 0.57119018f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.19988537f, 1.08895338f, 0.64427125f, 0.41087446f, 0.27464288f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.24153244f, 0.74807048f, 0.50118381f, 0.34370604f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.24153244f, 0.74807048f, 0.50118381f, 0.36617002f, 0.27464288f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.28281462f, 0.803307f, 0.54755926f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.28281462f, 0.803307f, 0.57119018f, 0.43325692f, 0.34370604f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.28281462f, 0.83188516f, 0.59516323f, 0.45573691f, 0.36617002f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.28281462f, 0.83188516f, 0.59516323f, 0.45573691f, 0.36617002f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.51179266f, 0.95350921f, 0.69515091f, 0.52423614f, 0.41087446f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.51179266f, 0.95350921f, 0.69515091f, 0.52423614f, 0.43325692f, 0.36617002f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.56271636f, 0.98595673f, 0.72133851f, 0.54755926f, 0.45573691f, 0.38853383f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.56271636f, 1.01931262f, 0.74807048f, 0.57119018f, 0.4783645f, 0.41087446f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.84484982f, 1.56271636f, 1.01931262f, 0.74807048f, 0.59516323f, 0.50118381f, 0.43325692f, 0.38853383f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
};
const std::vector<std::vector<float>> GITS_NOISE_1_50 = {
{ 14.61464119f, 0.54755926f, 0.02916753f },
{ 14.61464119f, 0.803307f, 0.25053367f, 0.02916753f },
{ 14.61464119f, 0.86115354f, 0.32104823f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.24153244f, 0.54755926f, 0.25053367f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.56271636f, 0.72133851f, 0.36617002f, 0.19894916f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.61558151f, 0.803307f, 0.45573691f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.61558151f, 0.83188516f, 0.52423614f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.84880662f, 0.95350921f, 0.59516323f, 0.38853383f, 0.27464288f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.84880662f, 0.95350921f, 0.59516323f, 0.41087446f, 0.29807833f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 1.84880662f, 0.95350921f, 0.61951244f, 0.43325692f, 0.32104823f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.19988537f, 1.12534678f, 0.72133851f, 0.50118381f, 0.36617002f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.19988537f, 1.12534678f, 0.72133851f, 0.50118381f, 0.36617002f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.36326075f, 1.24153244f, 0.803307f, 0.57119018f, 0.43325692f, 0.34370604f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.36326075f, 1.24153244f, 0.803307f, 0.57119018f, 0.43325692f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.36326075f, 1.24153244f, 0.803307f, 0.59516323f, 0.45573691f, 0.36617002f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.36326075f, 1.24153244f, 0.803307f, 0.59516323f, 0.45573691f, 0.38853383f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.32549286f, 0.86115354f, 0.64427125f, 0.50118381f, 0.41087446f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.36964464f, 0.92192322f, 0.69515091f, 0.54755926f, 0.45573691f, 0.41087446f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
{ 14.61464119f, 2.45070267f, 1.41535246f, 0.95350921f, 0.72133851f, 0.57119018f, 0.4783645f, 0.43325692f, 0.38853383f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
};
const std::vector<const std::vector<std::vector<float>>*> GITS_NOISE = {
&GITS_NOISE_0_80,
&GITS_NOISE_0_85,
&GITS_NOISE_0_90,
&GITS_NOISE_0_95,
&GITS_NOISE_1_00,
&GITS_NOISE_1_05,
&GITS_NOISE_1_10,
&GITS_NOISE_1_15,
&GITS_NOISE_1_20,
&GITS_NOISE_1_25,
&GITS_NOISE_1_30,
&GITS_NOISE_1_35,
&GITS_NOISE_1_40,
&GITS_NOISE_1_45,
&GITS_NOISE_1_50
};
#endif // GITS_NOISE_INL
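The tables above are pure data: each GITS_NOISE_* constant holds one pre-computed sigma schedule per step count, and the GITS_NOISE index maps coefficients 0.80 through 1.50 in 0.05 increments to those tables. A minimal lookup sketch follows (a hypothetical helper, not part of gits_noise.inl; it assumes row 0 of each table is the 2-step schedule):

#include <cmath>
#include <vector>

// Hypothetical helper (illustration only): pick the sigma schedule for a
// given GITS coefficient and sampling step count.
static const std::vector<float>* gits_lookup(float coeff, int steps) {
    // GITS_NOISE covers coefficients 0.80 .. 1.50 in 0.05 increments.
    int idx = (int)roundf((coeff - 0.80f) / 0.05f);
    if (idx < 0) idx = 0;
    if (idx >= (int)GITS_NOISE.size()) idx = (int)GITS_NOISE.size() - 1;
    const auto& table = *GITS_NOISE[idx];
    int row = steps - 2;  // assumption: row 0 holds the 2-step schedule
    if (row < 0 || row >= (int)table.size()) return nullptr;
    return &table[row];
}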

lora.hpp (905 lines changed)

@@ -3,35 +3,117 @@
#include "ggml_extend.hpp"
#define LORA_GRAPH_SIZE 10240
#define LORA_GRAPH_BASE_SIZE 10240
struct LoraModel : public GGMLRunner {
enum lora_t {
REGULAR = 0,
DIFFUSERS = 1,
DIFFUSERS_2 = 2,
DIFFUSERS_3 = 3,
TRANSFORMERS = 4,
LORA_TYPE_COUNT
};
const std::string lora_ups[LORA_TYPE_COUNT] = {
".lora_up",
"_lora.up",
".lora_B",
".lora.up",
".lora_linear_layer.up",
};
const std::string lora_downs[LORA_TYPE_COUNT] = {
".lora_down",
"_lora.down",
".lora_A",
".lora.down",
".lora_linear_layer.down",
};
const std::string lora_pre[LORA_TYPE_COUNT] = {
"lora.",
"",
"",
"",
"",
};
const std::map<std::string, std::string> alt_names = {
// mmdit
{"final_layer.adaLN_modulation.1", "norm_out.linear"},
{"pos_embed", "pos_embed.proj"},
{"final_layer.linear", "proj_out"},
{"y_embedder.mlp.0", "time_text_embed.text_embedder.linear_1"},
{"y_embedder.mlp.2", "time_text_embed.text_embedder.linear_2"},
{"t_embedder.mlp.0", "time_text_embed.timestep_embedder.linear_1"},
{"t_embedder.mlp.2", "time_text_embed.timestep_embedder.linear_2"},
{"x_block.mlp.fc1", "ff.net.0.proj"},
{"x_block.mlp.fc2", "ff.net.2"},
{"context_block.mlp.fc1", "ff_context.net.0.proj"},
{"context_block.mlp.fc2", "ff_context.net.2"},
{"x_block.adaLN_modulation.1", "norm1.linear"},
{"context_block.adaLN_modulation.1", "norm1_context.linear"},
{"context_block.attn.proj", "attn.to_add_out"},
{"x_block.attn.proj", "attn.to_out.0"},
{"x_block.attn2.proj", "attn2.to_out.0"},
// flux
// singlestream
{"linear2", "proj_out"},
{"modulation.lin", "norm.linear"},
// doublestream
{"txt_attn.proj", "attn.to_add_out"},
{"img_attn.proj", "attn.to_out.0"},
{"txt_mlp.0", "ff_context.net.0.proj"},
{"txt_mlp.2", "ff_context.net.2"},
{"img_mlp.0", "ff.net.0.proj"},
{"img_mlp.2", "ff.net.2"},
{"txt_mod.lin", "norm1_context.linear"},
{"img_mod.lin", "norm1.linear"},
};
const std::map<std::string, std::string> qkv_prefixes = {
// mmdit
{"context_block.attn.qkv", "attn.add_"}, // suffix "_proj"
{"x_block.attn.qkv", "attn.to_"},
{"x_block.attn2.qkv", "attn2.to_"},
// flux
// doublestream
{"txt_attn.qkv", "attn.add_"}, // suffix "_proj"
{"img_attn.qkv", "attn.to_"},
};
const std::map<std::string, std::string> qkvm_prefixes = {
// flux
// singlestream
{"linear1", ""},
};
const std::string* type_fingerprints = lora_ups;
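// Hypothetical helper (illustration only, not in the original header): the
// tables above are combined into full tensor names as prefix + key +
// up/down suffix. For a REGULAR lora, the up weight for key
// "unet_down_blocks_0_attn_to_q" is looked up as
// "lora.unet_down_blocks_0_attn_to_q.lora_up.weight"; for DIFFUSERS_2 it
// would be "<key>.lora_B.weight".
std::string compose_up_name(lora_t t, const std::string& key) const {
    return lora_pre[t] + key + lora_ups[t] + ".weight";
}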
float multiplier = 1.0f;
std::map<std::string, struct ggml_tensor*> lora_tensors;
std::string file_path;
ModelLoader model_loader;
bool load_failed = false;
bool applied = false;
std::vector<int> zero_index_vec = {0};
ggml_tensor* zero_index = NULL;
enum lora_t type = REGULAR;
LoraModel(ggml_backend_t backend,
const std::string& file_path = "",
const std::string prefix = "")
: file_path(file_path), GGMLRunner(backend) {
if (!model_loader.init_from_file(file_path, prefix)) {
load_failed = true;
}
}
std::string get_desc() {
return "lora";
}
bool load_from_file(bool filter_tensor = false) {
LOG_INFO("loading LoRA from '%s'", file_path.c_str());
if (load_failed) {
@@ -39,147 +121,726 @@ struct LoraModel : public GGMLModule {
return false;
}
bool dry_run = true;
auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
const std::string& name = tensor_storage.name;
if (filter_tensor && !contains(name, "lora")) {
// LOG_INFO("skipping LoRA tesnor '%s'", name.c_str());
return true;
}
// LOG_INFO("%s", name.c_str());
for (int i = 0; i < LORA_TYPE_COUNT; i++) {
if (name.find(type_fingerprints[i]) != std::string::npos) {
type = (lora_t)i;
break;
}
}
if (dry_run) {
struct ggml_tensor* real = ggml_new_tensor(params_ctx,
tensor_storage.type,
tensor_storage.n_dims,
tensor_storage.ne);
lora_tensors[name] = real;
} else {
auto real = lora_tensors[name];
*dst_tensor = real;
}
return true;
};
model_loader.load_tensors(on_new_tensor_cb, backend);
alloc_params_buffer();
// exit(0);
dry_run = false;
model_loader.load_tensors(on_new_tensor_cb, backend);
LOG_DEBUG("lora type: \"%s\"/\"%s\"", lora_downs[type].c_str(), lora_ups[type].c_str());
LOG_DEBUG("finished loaded lora");
return true;
}
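// to_f32 below dequantizes a tensor of any ggml type to f32 on the backend:
// it flattens the tensor to a single row, gathers that row with
// ggml_get_rows through the constant zero index (ggml_get_rows always
// yields f32 output), and restores the original shape.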
ggml_tensor* to_f32(ggml_context* ctx, ggml_tensor* a) {
auto out = ggml_reshape_1d(ctx, a, ggml_nelements(a));
out = ggml_get_rows(ctx, out, zero_index);
out = ggml_reshape(ctx, out, a);
return out;
}
std::vector<std::string> to_lora_keys(std::string blk_name, SDVersion version) {
std::vector<std::string> keys;
// if (!sd_version_is_sd3(version) || blk_name != "model.diffusion_model.pos_embed") {
size_t k_pos = blk_name.find(".weight");
if (k_pos == std::string::npos) {
return keys;
}
blk_name = blk_name.substr(0, k_pos);
// }
keys.push_back(blk_name);
keys.push_back("lora." + blk_name);
if (sd_version_is_dit(version)) {
if (blk_name.find("model.diffusion_model") != std::string::npos) {
blk_name.replace(blk_name.find("model.diffusion_model"), sizeof("model.diffusion_model") - 1, "transformer");
}
if (blk_name.find(".single_blocks") != std::string::npos) {
blk_name.replace(blk_name.find(".single_blocks"), sizeof(".single_blocks") - 1, ".single_transformer_blocks");
}
if (blk_name.find(".double_blocks") != std::string::npos) {
blk_name.replace(blk_name.find(".double_blocks"), sizeof(".double_blocks") - 1, ".transformer_blocks");
}
if (blk_name.find(".joint_blocks") != std::string::npos) {
blk_name.replace(blk_name.find(".joint_blocks"), sizeof(".joint_blocks") - 1, ".transformer_blocks");
}
if (blk_name.find("text_encoders.clip_l") != std::string::npos) {
blk_name.replace(blk_name.find("text_encoders.clip_l"), sizeof("text_encoders.clip_l") - 1, "cond_stage_model");
}
for (const auto& item : alt_names) {
size_t match = blk_name.find(item.first);
if (match != std::string::npos) {
blk_name = blk_name.substr(0, match) + item.second;
}
}
for (const auto& prefix : qkv_prefixes) {
size_t match = blk_name.find(prefix.first);
if (match != std::string::npos) {
std::string split_blk = "SPLIT|" + blk_name.substr(0, match) + prefix.second;
keys.push_back(split_blk);
}
}
for (const auto& prefix : qkvm_prefixes) {
size_t match = blk_name.find(prefix.first);
if (match != std::string::npos) {
std::string split_blk = "SPLIT_L|" + blk_name.substr(0, match) + prefix.second;
keys.push_back(split_blk);
}
}
keys.push_back(blk_name);
}
std::vector<std::string> ret;
for (std::string& key : keys) {
ret.push_back(key);
replace_all_chars(key, '.', '_');
// fix for some sdxl lora, like lcm-lora-xl
if (key == "model_diffusion_model_output_blocks_2_2_conv") {
ret.push_back("model_diffusion_model_output_blocks_2_1_conv");
}
ret.push_back(key);
}
return ret;
}
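// Worked example of the mapping above (derived from the replacements and
// alt_names tables): a model key such as
//   "model.diffusion_model.double_blocks.7.img_attn.proj.weight"
// produces candidate lora keys including
//   "transformer.transformer_blocks.7.attn.to_out.0"
// plus "lora."-prefixed and underscore-separated variants, so one model
// weight can be matched against several lora naming conventions.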
struct ggml_cgraph* build_lora_graph(std::map<std::string, struct ggml_tensor*> model_tensors, SDVersion version) {
size_t lora_graph_size = LORA_GRAPH_BASE_SIZE + lora_tensors.size() * 10;
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, lora_graph_size, false);
zero_index = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, 1);
set_backend_tensor_data(zero_index, zero_index_vec.data());
ggml_build_forward_expand(gf, zero_index);
std::set<std::string> applied_lora_tensors;
for (auto it : model_tensors) {
std::string k_tensor = it.first;
struct ggml_tensor* weight = model_tensors[it.first];
std::vector<std::string> keys = to_lora_keys(k_tensor, version);
if (keys.size() == 0)
continue;
for (auto& key : keys) {
bool is_qkv_split = starts_with(key, "SPLIT|");
if (is_qkv_split) {
key = key.substr(sizeof("SPLIT|") - 1);
}
bool is_qkvm_split = starts_with(key, "SPLIT_L|");
if (is_qkvm_split) {
key = key.substr(sizeof("SPLIT_L|") - 1);
}
struct ggml_tensor* updown = NULL;
float scale_value = 1.0f;
std::string fk = lora_pre[type] + key;
if (lora_tensors.find(fk + ".hada_w1_a") != lora_tensors.end()) {
// LoHa mode
// TODO: split qkv convention for LoHas (is it ever used?)
if (is_qkv_split || is_qkvm_split) {
LOG_ERROR("Split qkv isn't supported for LoHa models.");
break;
}
std::string alpha_name = "";
ggml_tensor* hada_1_mid = NULL; // tau for tucker decomposition
ggml_tensor* hada_1_up = NULL;
ggml_tensor* hada_1_down = NULL;
ggml_tensor* hada_2_mid = NULL; // tau for tucker decomposition
ggml_tensor* hada_2_up = NULL;
ggml_tensor* hada_2_down = NULL;
std::string hada_1_mid_name = "";
std::string hada_1_down_name = "";
std::string hada_1_up_name = "";
std::string hada_2_mid_name = "";
std::string hada_2_down_name = "";
std::string hada_2_up_name = "";
hada_1_down_name = fk + ".hada_w1_b";
hada_1_up_name = fk + ".hada_w1_a";
hada_1_mid_name = fk + ".hada_t1";
if (lora_tensors.find(hada_1_down_name) != lora_tensors.end()) {
hada_1_down = to_f32(compute_ctx, lora_tensors[hada_1_down_name]);
}
if (lora_tensors.find(hada_1_up_name) != lora_tensors.end()) {
hada_1_up = to_f32(compute_ctx, lora_tensors[hada_1_up_name]);
}
if (lora_tensors.find(hada_1_mid_name) != lora_tensors.end()) {
hada_1_mid = to_f32(compute_ctx, lora_tensors[hada_1_mid_name]);
applied_lora_tensors.insert(hada_1_mid_name);
hada_1_up = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, hada_1_up));
}
hada_2_down_name = fk + ".hada_w2_b";
hada_2_up_name = fk + ".hada_w2_a";
hada_2_mid_name = fk + ".hada_t2";
if (lora_tensors.find(hada_2_down_name) != lora_tensors.end()) {
hada_2_down = to_f32(compute_ctx, lora_tensors[hada_2_down_name]);
}
if (lora_tensors.find(hada_2_up_name) != lora_tensors.end()) {
hada_2_up = to_f32(compute_ctx, lora_tensors[hada_2_up_name]);
}
if (lora_tensors.find(hada_2_mid_name) != lora_tensors.end()) {
hada_2_mid = to_f32(compute_ctx, lora_tensors[hada_2_mid_name]);
applied_lora_tensors.insert(hada_2_mid_name);
hada_2_up = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, hada_2_up));
}
alpha_name = fk + ".alpha";
applied_lora_tensors.insert(hada_1_down_name);
applied_lora_tensors.insert(hada_1_up_name);
applied_lora_tensors.insert(hada_2_down_name);
applied_lora_tensors.insert(hada_2_up_name);
applied_lora_tensors.insert(alpha_name);
if (hada_1_up == NULL || hada_1_down == NULL || hada_2_up == NULL || hada_2_down == NULL) {
continue;
}
struct ggml_tensor* updown_1 = ggml_merge_lora(compute_ctx, hada_1_down, hada_1_up, hada_1_mid);
struct ggml_tensor* updown_2 = ggml_merge_lora(compute_ctx, hada_2_down, hada_2_up, hada_2_mid);
updown = ggml_mul_inplace(compute_ctx, updown_1, updown_2);
// calc_scale
// TODO: .dora_scale?
int64_t rank = hada_1_down->ne[ggml_n_dims(hada_1_down) - 1];
if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
scale_value = alpha / rank;
}
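// Net LoHa update, as assembled above: delta_W = (alpha / rank) *
// (hada_1_up x hada_1_down) elementwise-multiplied by
// (hada_2_up x hada_2_down), where the optional hada_t1/hada_t2 tensors
// act as a Tucker core for convolution weights.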
} else if (lora_tensors.find(fk + ".lokr_w1") != lora_tensors.end() || lora_tensors.find(fk + ".lokr_w1_a") != lora_tensors.end()) {
// LoKr mode
// TODO: split qkv convention for LoKrs (is it ever used?)
if (is_qkv_split || is_qkvm_split) {
LOG_ERROR("Split qkv isn't supported for LoKr models.");
break;
}
std::string alpha_name = fk + ".alpha";
ggml_tensor* lokr_w1 = NULL;
ggml_tensor* lokr_w2 = NULL;
std::string lokr_w1_name = "";
std::string lokr_w2_name = "";
lokr_w1_name = fk + ".lokr_w1";
lokr_w2_name = fk + ".lokr_w2";
if (lora_tensors.find(lokr_w1_name) != lora_tensors.end()) {
lokr_w1 = to_f32(compute_ctx, lora_tensors[lokr_w1_name]);
applied_lora_tensors.insert(lokr_w1_name);
} else {
ggml_tensor* down = NULL;
ggml_tensor* up = NULL;
std::string down_name = lokr_w1_name + "_b";
std::string up_name = lokr_w1_name + "_a";
if (lora_tensors.find(down_name) != lora_tensors.end()) {
// w1 should not be low rank normally, sometimes w1 and w2 are swapped
down = to_f32(compute_ctx, lora_tensors[down_name]);
applied_lora_tensors.insert(down_name);
int64_t rank = down->ne[ggml_n_dims(down) - 1];
if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
scale_value = alpha / rank;
}
}
if (lora_tensors.find(up_name) != lora_tensors.end()) {
up = to_f32(compute_ctx, lora_tensors[up_name]);
applied_lora_tensors.insert(up_name);
}
lokr_w1 = ggml_merge_lora(compute_ctx, down, up);
}
if (lora_tensors.find(lokr_w2_name) != lora_tensors.end()) {
lokr_w2 = to_f32(compute_ctx, lora_tensors[lokr_w2_name]);
applied_lora_tensors.insert(lokr_w2_name);
} else {
ggml_tensor* down = NULL;
ggml_tensor* up = NULL;
std::string down_name = lokr_w2_name + "_b";
std::string up_name = lokr_w2_name + "_a";
if (lora_tensors.find(down_name) != lora_tensors.end()) {
down = to_f32(compute_ctx, lora_tensors[down_name]);
applied_lora_tensors.insert(down_name);
int64_t rank = down->ne[ggml_n_dims(down) - 1];
if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
scale_value = alpha / rank;
}
}
if (lora_tensors.find(up_name) != lora_tensors.end()) {
up = to_f32(compute_ctx, lora_tensors[up_name]);
applied_lora_tensors.insert(up_name);
}
lokr_w2 = ggml_merge_lora(compute_ctx, down, up);
}
// Technically it might be unused, but I believe it's the expected behavior
applied_lora_tensors.insert(alpha_name);
updown = ggml_kronecker(compute_ctx, lokr_w1, lokr_w2);
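// Net LoKr update: delta_W = kron(lokr_w1, lokr_w2); each factor may itself
// be stored as a low-rank _a/_b pair and merged above, in which case the
// alpha / rank scale computed there applies.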
} else {
// LoRA mode
ggml_tensor* lora_mid = NULL; // tau for tucker decomposition
ggml_tensor* lora_up = NULL;
ggml_tensor* lora_down = NULL;
std::string alpha_name = "";
std::string scale_name = "";
std::string split_q_scale_name = "";
std::string lora_mid_name = "";
std::string lora_down_name = "";
std::string lora_up_name = "";
if (is_qkv_split) {
std::string suffix = "";
auto split_q_d_name = fk + "q" + suffix + lora_downs[type] + ".weight";
if (lora_tensors.find(split_q_d_name) == lora_tensors.end()) {
suffix = "_proj";
split_q_d_name = fk + "q" + suffix + lora_downs[type] + ".weight";
}
if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) {
// print_ggml_tensor(it.second, true); //[3072, 21504, 1, 1]
// find qkv and mlp up parts in LoRA model
auto split_k_d_name = fk + "k" + suffix + lora_downs[type] + ".weight";
auto split_v_d_name = fk + "v" + suffix + lora_downs[type] + ".weight";
auto split_q_u_name = fk + "q" + suffix + lora_ups[type] + ".weight";
auto split_k_u_name = fk + "k" + suffix + lora_ups[type] + ".weight";
auto split_v_u_name = fk + "v" + suffix + lora_ups[type] + ".weight";
auto split_q_scale_name = fk + "q" + suffix + ".scale";
auto split_k_scale_name = fk + "k" + suffix + ".scale";
auto split_v_scale_name = fk + "v" + suffix + ".scale";
auto split_q_alpha_name = fk + "q" + suffix + ".alpha";
auto split_k_alpha_name = fk + "k" + suffix + ".alpha";
auto split_v_alpha_name = fk + "v" + suffix + ".alpha";
ggml_tensor* lora_q_down = NULL;
ggml_tensor* lora_q_up = NULL;
ggml_tensor* lora_k_down = NULL;
ggml_tensor* lora_k_up = NULL;
ggml_tensor* lora_v_down = NULL;
ggml_tensor* lora_v_up = NULL;
lora_q_down = to_f32(compute_ctx, lora_tensors[split_q_d_name]);
if (lora_tensors.find(split_q_u_name) != lora_tensors.end()) {
lora_q_up = to_f32(compute_ctx, lora_tensors[split_q_u_name]);
}
if (lora_tensors.find(split_k_d_name) != lora_tensors.end()) {
lora_k_down = to_f32(compute_ctx, lora_tensors[split_k_d_name]);
}
if (lora_tensors.find(split_k_u_name) != lora_tensors.end()) {
lora_k_up = to_f32(compute_ctx, lora_tensors[split_k_u_name]);
}
if (lora_tensors.find(split_v_d_name) != lora_tensors.end()) {
lora_v_down = to_f32(compute_ctx, lora_tensors[split_v_d_name]);
}
if (lora_tensors.find(split_v_u_name) != lora_tensors.end()) {
lora_v_up = to_f32(compute_ctx, lora_tensors[split_v_u_name]);
}
float q_rank = lora_q_up->ne[0];
float k_rank = lora_k_up->ne[0];
float v_rank = lora_v_up->ne[0];
float lora_q_scale = 1;
float lora_k_scale = 1;
float lora_v_scale = 1;
if (lora_tensors.find(split_q_scale_name) != lora_tensors.end()) {
lora_q_scale = ggml_backend_tensor_get_f32(lora_tensors[split_q_scale_name]);
applied_lora_tensors.insert(split_q_scale_name);
}
if (lora_tensors.find(split_k_scale_name) != lora_tensors.end()) {
lora_k_scale = ggml_backend_tensor_get_f32(lora_tensors[split_k_scale_name]);
applied_lora_tensors.insert(split_k_scale_name);
}
if (lora_tensors.find(split_v_scale_name) != lora_tensors.end()) {
lora_v_scale = ggml_backend_tensor_get_f32(lora_tensors[split_v_scale_name]);
applied_lora_tensors.insert(split_v_scale_name);
}
if (lora_tensors.find(split_q_alpha_name) != lora_tensors.end()) {
float lora_q_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_q_alpha_name]);
applied_lora_tensors.insert(split_q_alpha_name);
lora_q_scale = lora_q_alpha / q_rank;
}
if (lora_tensors.find(split_k_alpha_name) != lora_tensors.end()) {
float lora_k_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_k_alpha_name]);
applied_lora_tensors.insert(split_k_alpha_name);
lora_k_scale = lora_k_alpha / k_rank;
}
if (lora_tensors.find(split_v_alpha_name) != lora_tensors.end()) {
float lora_v_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_v_alpha_name]);
applied_lora_tensors.insert(split_v_alpha_name);
lora_v_scale = lora_v_alpha / v_rank;
}
ggml_scale_inplace(compute_ctx, lora_q_down, lora_q_scale);
ggml_scale_inplace(compute_ctx, lora_k_down, lora_k_scale);
ggml_scale_inplace(compute_ctx, lora_v_down, lora_v_scale);
// print_ggml_tensor(lora_q_down, true); //[3072, R, 1, 1]
// print_ggml_tensor(lora_k_down, true); //[3072, R, 1, 1]
// print_ggml_tensor(lora_v_down, true); //[3072, R, 1, 1]
// print_ggml_tensor(lora_q_up, true); //[R, 3072, 1, 1]
// print_ggml_tensor(lora_k_up, true); //[R, 3072, 1, 1]
// print_ggml_tensor(lora_v_up, true); //[R, 3072, 1, 1]
// these need to be stitched together this way:
// |q_up,0 ,0 |
// |0 ,k_up,0 |
// |0 ,0 ,v_up|
// (q_down,k_down,v_down) . (q ,k ,v)
// up_concat will be [9216, R*3, 1, 1]
// down_concat will be [R*3, 3072, 1, 1]
ggml_tensor* lora_down_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, lora_q_down, lora_k_down, 1), lora_v_down, 1);
ggml_tensor* z = ggml_dup_tensor(compute_ctx, lora_q_up);
ggml_scale(compute_ctx, z, 0);
ggml_tensor* zz = ggml_concat(compute_ctx, z, z, 1);
ggml_tensor* q_up = ggml_concat(compute_ctx, lora_q_up, zz, 1);
ggml_tensor* k_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, z, lora_k_up, 1), z, 1);
ggml_tensor* v_up = ggml_concat(compute_ctx, zz, lora_v_up, 1);
// print_ggml_tensor(q_up, true); //[R, 9216, 1, 1]
// print_ggml_tensor(k_up, true); //[R, 9216, 1, 1]
// print_ggml_tensor(v_up, true); //[R, 9216, 1, 1]
ggml_tensor* lora_up_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, q_up, k_up, 0), v_up, 0);
// print_ggml_tensor(lora_up_concat, true); //[R*3, 9216, 1, 1]
lora_down = ggml_cont(compute_ctx, lora_down_concat);
lora_up = ggml_cont(compute_ctx, lora_up_concat);
applied_lora_tensors.insert(split_q_u_name);
applied_lora_tensors.insert(split_k_u_name);
applied_lora_tensors.insert(split_v_u_name);
applied_lora_tensors.insert(split_q_d_name);
applied_lora_tensors.insert(split_k_d_name);
applied_lora_tensors.insert(split_v_d_name);
}
} else if (is_qkvm_split) {
auto split_q_d_name = fk + "attn.to_q" + lora_downs[type] + ".weight";
if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) {
// print_ggml_tensor(it.second, true); //[3072, 21504, 1, 1]
// find qkv and mlp up parts in LoRA model
auto split_k_d_name = fk + "attn.to_k" + lora_downs[type] + ".weight";
auto split_v_d_name = fk + "attn.to_v" + lora_downs[type] + ".weight";
auto split_q_u_name = fk + "attn.to_q" + lora_ups[type] + ".weight";
auto split_k_u_name = fk + "attn.to_k" + lora_ups[type] + ".weight";
auto split_v_u_name = fk + "attn.to_v" + lora_ups[type] + ".weight";
auto split_m_d_name = fk + "proj_mlp" + lora_downs[type] + ".weight";
auto split_m_u_name = fk + "proj_mlp" + lora_ups[type] + ".weight";
auto split_q_scale_name = fk + "attn.to_q" + ".scale";
auto split_k_scale_name = fk + "attn.to_k" + ".scale";
auto split_v_scale_name = fk + "attn.to_v" + ".scale";
auto split_m_scale_name = fk + "proj_mlp" + ".scale";
auto split_q_alpha_name = fk + "attn.to_q" + ".alpha";
auto split_k_alpha_name = fk + "attn.to_k" + ".alpha";
auto split_v_alpha_name = fk + "attn.to_v" + ".alpha";
auto split_m_alpha_name = fk + "proj_mlp" + ".alpha";
ggml_tensor* lora_q_down = NULL;
ggml_tensor* lora_q_up = NULL;
ggml_tensor* lora_k_down = NULL;
ggml_tensor* lora_k_up = NULL;
ggml_tensor* lora_v_down = NULL;
ggml_tensor* lora_v_up = NULL;
ggml_tensor* lora_m_down = NULL;
ggml_tensor* lora_m_up = NULL;
lora_q_up = to_f32(compute_ctx, lora_tensors[split_q_u_name]);
if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) {
lora_q_down = to_f32(compute_ctx, lora_tensors[split_q_d_name]);
}
if (lora_tensors.find(split_q_u_name) != lora_tensors.end()) {
lora_q_up = to_f32(compute_ctx, lora_tensors[split_q_u_name]);
}
if (lora_tensors.find(split_k_d_name) != lora_tensors.end()) {
lora_k_down = to_f32(compute_ctx, lora_tensors[split_k_d_name]);
}
if (lora_tensors.find(split_k_u_name) != lora_tensors.end()) {
lora_k_up = to_f32(compute_ctx, lora_tensors[split_k_u_name]);
}
if (lora_tensors.find(split_v_d_name) != lora_tensors.end()) {
lora_v_down = to_f32(compute_ctx, lora_tensors[split_v_d_name]);
}
if (lora_tensors.find(split_v_u_name) != lora_tensors.end()) {
lora_v_up = to_f32(compute_ctx, lora_tensors[split_v_u_name]);
}
if (lora_tensors.find(split_m_d_name) != lora_tensors.end()) {
lora_m_down = to_f32(compute_ctx, lora_tensors[split_m_d_name]);
}
if (lora_tensors.find(split_m_u_name) != lora_tensors.end()) {
lora_m_up = to_f32(compute_ctx, lora_tensors[split_m_u_name]);
}
float q_rank = lora_q_up->ne[0];
float k_rank = lora_k_up->ne[0];
float v_rank = lora_v_up->ne[0];
float m_rank = lora_m_up->ne[0];
float lora_q_scale = 1;
float lora_k_scale = 1;
float lora_v_scale = 1;
float lora_m_scale = 1;
if (lora_tensors.find(split_q_scale_name) != lora_tensors.end()) {
lora_q_scale = ggml_backend_tensor_get_f32(lora_tensors[split_q_scale_name]);
applied_lora_tensors.insert(split_q_scale_name);
}
if (lora_tensors.find(split_k_scale_name) != lora_tensors.end()) {
lora_k_scale = ggml_backend_tensor_get_f32(lora_tensors[split_k_scale_name]);
applied_lora_tensors.insert(split_k_scale_name);
}
if (lora_tensors.find(split_v_scale_name) != lora_tensors.end()) {
lora_v_scale = ggml_backend_tensor_get_f32(lora_tensors[split_v_scale_name]);
applied_lora_tensors.insert(split_v_scale_name);
}
if (lora_tensors.find(split_m_scale_name) != lora_tensors.end()) {
lora_m_scale = ggml_backend_tensor_get_f32(lora_tensors[split_m_scale_name]);
applied_lora_tensors.insert(split_m_scale_name);
}
if (lora_tensors.find(split_q_alpha_name) != lora_tensors.end()) {
float lora_q_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_q_alpha_name]);
applied_lora_tensors.insert(split_q_alpha_name);
lora_q_scale = lora_q_alpha / q_rank;
}
if (lora_tensors.find(split_k_alpha_name) != lora_tensors.end()) {
float lora_k_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_k_alpha_name]);
applied_lora_tensors.insert(split_k_alpha_name);
lora_k_scale = lora_k_alpha / k_rank;
}
if (lora_tensors.find(split_v_alpha_name) != lora_tensors.end()) {
float lora_v_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_v_alpha_name]);
applied_lora_tensors.insert(split_v_alpha_name);
lora_v_scale = lora_v_alpha / v_rank;
}
if (lora_tensors.find(split_m_alpha_name) != lora_tensors.end()) {
float lora_m_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_m_alpha_name]);
applied_lora_tensors.insert(split_m_alpha_name);
lora_m_scale = lora_m_alpha / m_rank;
}
ggml_scale_inplace(compute_ctx, lora_q_down, lora_q_scale);
ggml_scale_inplace(compute_ctx, lora_k_down, lora_k_scale);
ggml_scale_inplace(compute_ctx, lora_v_down, lora_v_scale);
ggml_scale_inplace(compute_ctx, lora_m_down, lora_m_scale);
// print_ggml_tensor(lora_q_down, true); //[3072, R, 1, 1]
// print_ggml_tensor(lora_k_down, true); //[3072, R, 1, 1]
// print_ggml_tensor(lora_v_down, true); //[3072, R, 1, 1]
// print_ggml_tensor(lora_m_down, true); //[3072, R, 1, 1]
// print_ggml_tensor(lora_q_up, true); //[R, 3072, 1, 1]
// print_ggml_tensor(lora_k_up, true); //[R, 3072, 1, 1]
// print_ggml_tensor(lora_v_up, true); //[R, 3072, 1, 1]
// print_ggml_tensor(lora_m_up, true); //[R, 12288, 1, 1]
// these need to be stitched together this way:
// |q_up,0 ,0 ,0 |
// |0 ,k_up,0 ,0 |
// |0 ,0 ,v_up,0 |
// |0 ,0 ,0 ,m_up|
// (q_down,k_down,v_down,m_down) . (q ,k ,v ,m)
// up_concat will be [21504, R*4, 1, 1]
// down_concat will be [R*4, 3072, 1, 1]
ggml_tensor* lora_down_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, lora_q_down, lora_k_down, 1), ggml_concat(compute_ctx, lora_v_down, lora_m_down, 1), 1);
// print_ggml_tensor(lora_down_concat, true); //[3072, R*4, 1, 1]
// this also means that if rank is bigger than 672, it is less memory efficient to do it this way (should be fine)
// print_ggml_tensor(lora_q_up, true); //[3072, R, 1, 1]
ggml_tensor* z = ggml_dup_tensor(compute_ctx, lora_q_up);
ggml_tensor* mlp_z = ggml_dup_tensor(compute_ctx, lora_m_up);
ggml_scale(compute_ctx, z, 0);
ggml_scale(compute_ctx, mlp_z, 0);
ggml_tensor* zz = ggml_concat(compute_ctx, z, z, 1);
ggml_tensor* q_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, lora_q_up, zz, 1), mlp_z, 1);
ggml_tensor* k_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, z, lora_k_up, 1), ggml_concat(compute_ctx, z, mlp_z, 1), 1);
ggml_tensor* v_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, zz, lora_v_up, 1), mlp_z, 1);
ggml_tensor* m_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, zz, z, 1), lora_m_up, 1);
// print_ggml_tensor(q_up, true); //[R, 21504, 1, 1]
// print_ggml_tensor(k_up, true); //[R, 21504, 1, 1]
// print_ggml_tensor(v_up, true); //[R, 21504, 1, 1]
// print_ggml_tensor(m_up, true); //[R, 21504, 1, 1]
ggml_tensor* lora_up_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, q_up, k_up, 0), ggml_concat(compute_ctx, v_up, m_up, 0), 0);
// print_ggml_tensor(lora_up_concat, true); //[R*4, 21504, 1, 1]
lora_down = ggml_cont(compute_ctx, lora_down_concat);
lora_up = ggml_cont(compute_ctx, lora_up_concat);
applied_lora_tensors.insert(split_q_u_name);
applied_lora_tensors.insert(split_k_u_name);
applied_lora_tensors.insert(split_v_u_name);
applied_lora_tensors.insert(split_m_u_name);
applied_lora_tensors.insert(split_q_d_name);
applied_lora_tensors.insert(split_k_d_name);
applied_lora_tensors.insert(split_v_d_name);
applied_lora_tensors.insert(split_m_d_name);
}
} else {
lora_up_name = fk + lora_ups[type] + ".weight";
lora_down_name = fk + lora_downs[type] + ".weight";
lora_mid_name = fk + ".lora_mid.weight";
alpha_name = fk + ".alpha";
scale_name = fk + ".scale";
if (lora_tensors.find(lora_up_name) != lora_tensors.end()) {
lora_up = to_f32(compute_ctx, lora_tensors[lora_up_name]);
}
if (lora_tensors.find(lora_down_name) != lora_tensors.end()) {
lora_down = to_f32(compute_ctx, lora_tensors[lora_down_name]);
}
if (lora_tensors.find(lora_mid_name) != lora_tensors.end()) {
lora_mid = to_f32(compute_ctx, lora_tensors[lora_mid_name]);
applied_lora_tensors.insert(lora_mid_name);
}
applied_lora_tensors.insert(lora_up_name);
applied_lora_tensors.insert(lora_down_name);
applied_lora_tensors.insert(alpha_name);
applied_lora_tensors.insert(scale_name);
}
if (lora_up == NULL || lora_down == NULL) {
continue;
}
// calc_scale
// TODO: .dora_scale?
int64_t rank = lora_down->ne[ggml_n_dims(lora_down) - 1];
if (lora_tensors.find(scale_name) != lora_tensors.end()) {
scale_value = ggml_backend_tensor_get_f32(lora_tensors[scale_name]);
} else if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
scale_value = alpha / rank;
}
updown = ggml_merge_lora(compute_ctx, lora_down, lora_up, lora_mid);
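// Standard LoRA update: delta_W = lora_up x lora_down (optionally routed
// through lora_mid as a Tucker core for conv layers), scaled below by
// .scale or alpha / rank and then by the user-supplied multiplier.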
}
scale_value *= multiplier;
updown = ggml_reshape(compute_ctx, updown, weight);
GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight));
updown = ggml_scale_inplace(compute_ctx, updown, scale_value);
ggml_tensor* final_weight;
if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) {
// final_weight = ggml_new_tensor(compute_ctx, GGML_TYPE_F32, ggml_n_dims(weight), weight->ne);
// final_weight = ggml_cpy(compute_ctx, weight, final_weight);
final_weight = to_f32(compute_ctx, weight);
final_weight = ggml_add_inplace(compute_ctx, final_weight, updown);
final_weight = ggml_cpy(compute_ctx, final_weight, weight);
} else {
final_weight = ggml_add_inplace(compute_ctx, weight, updown);
}
// final_weight = ggml_add_inplace(compute_ctx, weight, updown); // apply directly
ggml_build_forward_expand(gf, final_weight);
break;
}
}
size_t total_lora_tensors_count = 0;
size_t applied_lora_tensors_count = 0;
for (auto& kv : lora_tensors) {
total_lora_tensors_count++;
if (applied_lora_tensors.find(kv.first) == applied_lora_tensors.end()) {
LOG_WARN("unused lora tensor %s", kv.first.c_str());
LOG_WARN("unused lora tensor |%s|", kv.first.c_str());
print_ggml_tensor(kv.second, true);
// exit(0);
} else {
applied_lora_tensors_count++;
}
}
/* Don't worry if this message shows up twice in the logs per LoRA,
* this function is called once to calculate the required buffer size
* and then again to actually generate a graph to be used */
if (applied_lora_tensors_count != total_lora_tensors_count) {
LOG_WARN("Only (%lu / %lu) LoRA tensors have been applied",
applied_lora_tensors_count, total_lora_tensors_count);
} else {
LOG_DEBUG("(%lu / %lu) LoRA tensors applied successfully",
applied_lora_tensors_count, total_lora_tensors_count);
}
return gf;
}
void apply(std::map<std::string, struct ggml_tensor*> model_tensors, SDVersion version, int n_threads) {
    auto get_graph = [&]() -> struct ggml_cgraph* {
        return build_lora_graph(model_tensors, version);
    };
    GGMLRunner::compute(get_graph, n_threads, true);
}
};
#endif // __LORA_HPP__
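For reference, the graph built above computes the standard LoRA update W' = W + multiplier * (alpha / rank) * (lora_up · lora_down) for each matched weight. A minimal CPU sketch of the same math (apply_lora_ref is a hypothetical helper, not part of the file; row-major arrays assumed):

void apply_lora_ref(float* W, const float* up, const float* down,
                    int out_dim, int in_dim, int rank,
                    float alpha, float multiplier) {
    float scale = multiplier * (alpha / rank); // matches scale_value above
    for (int o = 0; o < out_dim; o++) {
        for (int i = 0; i < in_dim; i++) {
            float acc = 0.f;
            for (int k = 0; k < rank; k++) {
                acc += up[o * rank + k] * down[k * in_dim + i]; // (up · down)[o][i]
            }
            W[o * in_dim + i] += scale * acc; // same effect as ggml_add_inplace(weight, updown)
        }
    }
}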

mmdit.hpp (new file, 1002 lines; diff suppressed because it is too large)

model.cpp (982 lines changed; diff suppressed because it is too large)

model.h (152 lines changed)

@@ -4,27 +4,106 @@
#include <functional>
#include <map>
#include <memory>
#include <set>
#include <sstream>
#include <string>
#include <tuple>
#include <vector>
#include "ggml/ggml-backend.h"
#include "ggml/ggml.h"
#include "ggml-backend.h"
#include "ggml.h"
#include "gguf.h"
#include "json.hpp"
#include "zip.h"
#define SD_MAX_DIMS 5
enum SDVersion {
    VERSION_SD1,
    VERSION_SD1_INPAINT,
    VERSION_SD1_PIX2PIX,
    VERSION_SD2,
    VERSION_SD2_INPAINT,
    VERSION_SDXL,
    VERSION_SDXL_INPAINT,
    VERSION_SDXL_PIX2PIX,
    VERSION_SVD,
    VERSION_SD3,
    VERSION_FLUX,
    VERSION_FLUX_FILL,
    VERSION_COUNT,
};
static inline bool sd_version_is_flux(SDVersion version) {
if (version == VERSION_FLUX || version == VERSION_FLUX_FILL) {
return true;
}
return false;
}
static inline bool sd_version_is_sd3(SDVersion version) {
if (version == VERSION_SD3) {
return true;
}
return false;
}
static inline bool sd_version_is_sd1(SDVersion version) {
if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX) {
return true;
}
return false;
}
static inline bool sd_version_is_sd2(SDVersion version) {
if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT) {
return true;
}
return false;
}
static inline bool sd_version_is_sdxl(SDVersion version) {
if (version == VERSION_SDXL || version == VERSION_SDXL_INPAINT || version == VERSION_SDXL_PIX2PIX) {
return true;
}
return false;
}
static inline bool sd_version_is_inpaint(SDVersion version) {
if (version == VERSION_SD1_INPAINT || version == VERSION_SD2_INPAINT || version == VERSION_SDXL_INPAINT || version == VERSION_FLUX_FILL) {
return true;
}
return false;
}
static inline bool sd_version_is_dit(SDVersion version) {
if (sd_version_is_flux(version) || sd_version_is_sd3(version)) {
return true;
}
return false;
}
static inline bool sd_version_is_unet_edit(SDVersion version) {
return version == VERSION_SD1_PIX2PIX || version == VERSION_SDXL_PIX2PIX;
}
static bool sd_version_is_inpaint_or_unet_edit(SDVersion version) {
return sd_version_is_unet_edit(version) || sd_version_is_inpaint(version);
}
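// Usage sketch (illustrative, not from this file): the helpers above gate
// version-specific behavior, e.g.
//     SDVersion version = model_loader.get_sd_version();
//     if (sd_version_is_inpaint(version)) {
//         // prepare mask + masked-image latent channels
//     } else if (sd_version_is_unet_edit(version)) {
//         // prepare reference-image latents (instruct-pix2pix style)
//     }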
enum PMVersion {
PM_VERSION_1,
PM_VERSION_2,
};
struct TensorStorage {
    std::string name;
    ggml_type type  = GGML_TYPE_F32;
    bool is_bf16    = false;
    bool is_f8_e4m3 = false;
    bool is_f8_e5m2 = false;
    int64_t ne[SD_MAX_DIMS] = {1, 1, 1, 1, 1};
    int n_dims      = 0;
size_t file_index = 0;
int index_in_zip = -1; // >= 0 means stored in a zip file
@@ -40,7 +119,11 @@ struct TensorStorage {
}
int64_t nelements() const {
    int64_t n = 1;
    for (int i = 0; i < SD_MAX_DIMS; i++) {
        n *= ne[i];
    }
    return n;
}
int64_t nbytes() const {
@@ -48,7 +131,7 @@ struct TensorStorage {
}
int64_t nbytes_to_read() const {
if (is_bf16 || is_f8_e4m3 || is_f8_e5m2) {
return nbytes() / 2;
} else {
return nbytes();
@@ -68,6 +151,7 @@ struct TensorStorage {
std::vector<TensorStorage> chunk(size_t n) {
std::vector<TensorStorage> chunks;
size_t chunk_size = nbytes_to_read() / n;
// printf("%d/%d\n", chunk_size, nbytes_to_read());
reverse_ne();
for (int i = 0; i < n; i++) {
TensorStorage chunk_i = *this;
@@ -81,7 +165,7 @@ struct TensorStorage {
}
void reverse_ne() {
int64_t new_ne[SD_MAX_DIMS] = {1, 1, 1, 1, 1};
for (int i = 0; i < n_dims; i++) {
new_ne[i] = ne[n_dims - 1 - i];
}
@@ -89,10 +173,31 @@ struct TensorStorage {
ne[i] = new_ne[i];
}
}
std::string to_string() const {
std::stringstream ss;
const char* type_name = ggml_type_name(type);
if (is_bf16) {
type_name = "bf16";
} else if (is_f8_e4m3) {
type_name = "f8_e4m3";
} else if (is_f8_e5m2) {
type_name = "f8_e5m2";
}
ss << name << " | " << type_name << " | ";
ss << n_dims << " [";
for (int i = 0; i < SD_MAX_DIMS; i++) {
ss << ne[i];
if (i != SD_MAX_DIMS - 1) {
ss << ", ";
}
}
ss << "]";
return ss.str();
}
};
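// Worked example (illustrative): file formats store shapes outermost-first while
// ggml's ne[] is innermost-first, so a safetensors shape [2, 3, 4] is parsed as
// n_dims = 3, ne = {2, 3, 4, 1, 1} and becomes ne = {4, 3, 2, 1, 1} after
// reverse_ne(); nelements() returns 24 either way.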
typedef std::function<bool(const TensorStorage&, ggml_tensor**)> on_new_tensor_cb_t;
typedef std::function<void(const std::string&, int32_t)> on_new_token_cb_t;
class ModelLoader {
protected:
@@ -104,7 +209,7 @@ protected:
zip_t* zip,
std::string dir,
size_t file_index,
const std::string& prefix);
const std::string prefix);
bool init_from_gguf_file(const std::string& file_path, const std::string& prefix = "");
bool init_from_safetensors_file(const std::string& file_path, const std::string& prefix = "");
@@ -112,15 +217,28 @@ protected:
bool init_from_diffusers_file(const std::string& file_path, const std::string& prefix = "");
public:
std::map<std::string, enum ggml_type> tensor_storages_types;
bool init_from_file(const std::string& file_path, const std::string& prefix = "");
bool model_is_unet();
SDVersion get_sd_version();
ggml_type get_sd_wtype();
ggml_type get_conditioner_wtype();
ggml_type get_diffusion_model_wtype();
ggml_type get_vae_wtype();
void set_wtype_override(ggml_type wtype, std::string prefix = "");
bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend);
bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
                  ggml_backend_t backend,
                  std::set<std::string> ignore_tensors = {});
bool save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules);
bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type);
int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT);
~ModelLoader() = default;
static std::string load_merges();
static std::string load_t5_tokenizer_json();
};
#endif // __MODEL_H__

pmid.hpp (new file, 845 lines)

@@ -0,0 +1,845 @@
#ifndef __PMI_HPP__
#define __PMI_HPP__
#include "ggml_extend.hpp"
#include "clip.hpp"
#include "lora.hpp"
struct FuseBlock : public GGMLBlock {
// network hparams
int in_dim;
int out_dim;
int hidden_dim;
bool use_residue;
public:
FuseBlock(int i_d, int o_d, int h_d, bool use_residue = true)
: in_dim(i_d), out_dim(o_d), hidden_dim(h_d), use_residue(use_residue) {
blocks["fc1"] = std::shared_ptr<GGMLBlock>(new Linear(in_dim, hidden_dim, true));
blocks["fc2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_dim, out_dim, true));
blocks["layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(in_dim));
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, channels, h, w]
auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);
auto layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["layernorm"]);
struct ggml_tensor* r = x;
// x = ggml_nn_layer_norm(ctx, x, ln_w, ln_b);
x = layer_norm->forward(ctx, x);
// x = ggml_add(ctx, ggml_mul_mat(ctx, fc1_w, x), fc1_b);
x = fc1->forward(ctx, x);
x = ggml_gelu_inplace(ctx, x);
x = fc2->forward(ctx, x);
// x = ggml_add(ctx, ggml_mul_mat(ctx, fc2_w, x), fc2_b);
if (use_residue)
x = ggml_add(ctx, x, r);
return x;
}
};
/*
class QFormerPerceiver(nn.Module):
def __init__(self, id_embeddings_dim, cross_attention_dim, num_tokens, embedding_dim=1024, use_residual=True, ratio=4):
super().__init__()
self.num_tokens = num_tokens
self.cross_attention_dim = cross_attention_dim
self.use_residual = use_residual
print(cross_attention_dim*num_tokens)
self.token_proj = nn.Sequential(
nn.Linear(id_embeddings_dim, id_embeddings_dim*ratio),
nn.GELU(),
nn.Linear(id_embeddings_dim*ratio, cross_attention_dim*num_tokens),
)
self.token_norm = nn.LayerNorm(cross_attention_dim)
self.perceiver_resampler = FacePerceiverResampler(
dim=cross_attention_dim,
depth=4,
dim_head=128,
heads=cross_attention_dim // 128,
embedding_dim=embedding_dim,
output_dim=cross_attention_dim,
ff_mult=4,
)
def forward(self, x, last_hidden_state):
x = self.token_proj(x)
x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
x = self.token_norm(x) # cls token
out = self.perceiver_resampler(x, last_hidden_state) # retrieve from patch tokens
if self.use_residual: # TODO: if use_residual is not true
out = x + 1.0 * out
return out
*/
struct PMFeedForward : public GGMLBlock {
// network hparams
int dim;
public:
PMFeedForward(int d, int multi = 4)
: dim(d) {
int inner_dim = dim * multi;
blocks["0"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
blocks["1"] = std::shared_ptr<GGMLBlock>(new Mlp(dim, inner_dim, dim, false));
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x) {
auto norm = std::dynamic_pointer_cast<LayerNorm>(blocks["0"]);
auto ff = std::dynamic_pointer_cast<Mlp>(blocks["1"]);
x = norm->forward(ctx, x);
x = ff->forward(ctx, x);
return x;
}
};
struct PerceiverAttention : public GGMLBlock {
// network hparams
float scale; // = dim_head**-0.5
int dim_head; // = dim_head
int heads; // = heads
public:
PerceiverAttention(int dim, int dim_h = 64, int h = 8)
: scale(powf(dim_h, -0.5)), dim_head(dim_h), heads(h) {
int inner_dim = dim_head * heads;
blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
blocks["norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(dim, inner_dim, false));
blocks["to_kv"] = std::shared_ptr<GGMLBlock>(new Linear(dim, inner_dim * 2, false));
blocks["to_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim, false));
}
struct ggml_tensor* reshape_tensor(struct ggml_context* ctx,
struct ggml_tensor* x,
int heads) {
int64_t ne[4];
for (int i = 0; i < 4; ++i)
ne[i] = x->ne[i];
// print_ggml_tensor(x, true, "PerceiverAttention reshape x 0: ");
// printf("heads = %d \n", heads);
// x = ggml_view_4d(ctx, x, x->ne[0], x->ne[1], heads, x->ne[2]/heads,
// x->nb[1], x->nb[2], x->nb[3], 0);
x = ggml_reshape_4d(ctx, x, x->ne[0] / heads, heads, x->ne[1], x->ne[2]);
// x = ggml_view_4d(ctx, x, x->ne[0]/heads, heads, x->ne[1], x->ne[2],
// x->nb[1], x->nb[2], x->nb[3], 0);
// x = ggml_cont(ctx, x);
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));
// print_ggml_tensor(x, true, "PerceiverAttention reshape x 1: ");
// x = ggml_reshape_4d(ctx, x, ne[0], heads, ne[1], ne[2]/heads);
return x;
}
std::vector<struct ggml_tensor*> chunk_half(struct ggml_context* ctx,
struct ggml_tensor* x) {
auto tlo = ggml_view_4d(ctx, x, x->ne[0] / 2, x->ne[1], x->ne[2], x->ne[3], x->nb[1], x->nb[2], x->nb[3], 0);
auto tli = ggml_view_4d(ctx, x, x->ne[0] / 2, x->ne[1], x->ne[2], x->ne[3], x->nb[1], x->nb[2], x->nb[3], x->nb[0] * x->ne[0] / 2);
return {ggml_cont(ctx, tlo),
ggml_cont(ctx, tli)};
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* latents) {
// x (torch.Tensor): image features
// shape (b, n1, D)
// latent (torch.Tensor): latent features
// shape (b, n2, D)
int64_t ne[4];
for (int i = 0; i < 4; ++i)
ne[i] = latents->ne[i];
auto norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm1"]);
auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
x = norm1->forward(ctx, x);
latents = norm2->forward(ctx, latents);
auto to_q = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
auto q = to_q->forward(ctx, latents);
auto kv_input = ggml_concat(ctx, x, latents, 1);
auto to_kv = std::dynamic_pointer_cast<Linear>(blocks["to_kv"]);
auto kv = to_kv->forward(ctx, kv_input);
auto k = ggml_view_4d(ctx, kv, kv->ne[0] / 2, kv->ne[1], kv->ne[2], kv->ne[3], kv->nb[1] / 2, kv->nb[2] / 2, kv->nb[3] / 2, 0);
auto v = ggml_view_4d(ctx, kv, kv->ne[0] / 2, kv->ne[1], kv->ne[2], kv->ne[3], kv->nb[1] / 2, kv->nb[2] / 2, kv->nb[3] / 2, kv->nb[0] * (kv->ne[0] / 2));
k = ggml_cont(ctx, k);
v = ggml_cont(ctx, v);
q = reshape_tensor(ctx, q, heads);
k = reshape_tensor(ctx, k, heads);
v = reshape_tensor(ctx, v, heads);
scale = 1.f / sqrt(sqrt((float)dim_head));
k = ggml_scale_inplace(ctx, k, scale);
q = ggml_scale_inplace(ctx, q, scale);
// auto weight = ggml_mul_mat(ctx, q, k);
auto weight = ggml_mul_mat(ctx, k, q); // NOTE order of mul is opposite to pytorch
// GGML's softmax() is equivalent to pytorch's softmax(x, dim=-1)
// in this case, dimension along which Softmax will be computed is the last dim
// in torch and the first dim in GGML, consistent with the convention that pytorch's
// last dimension (varying most rapidly) corresponds to GGML's first (varying most rapidly).
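// Worked shape trace (illustrative), in ggml order [ne0, ne1, ne2, ne3]:
// q: [dim_head, n2, heads, b], k: [dim_head, n_kv, heads, b]
// => ggml_mul_mat(ctx, k, q) gives weight: [n_kv, n2, heads, b],
//    i.e. pytorch's q @ k.transpose(-2, -1); the softmax below runs over ne0 = n_kv.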
// weight = ggml_soft_max(ctx, weight);
weight = ggml_soft_max_inplace(ctx, weight);
v = ggml_cont(ctx, ggml_transpose(ctx, v));
// auto out = ggml_mul_mat(ctx, weight, v);
auto out = ggml_mul_mat(ctx, v, weight); // NOTE order of mul is opposite to pytorch
out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3));
out = ggml_reshape_3d(ctx, out, ne[0], ne[1], ggml_nelements(out) / (ne[0] * ne[1]));
auto to_out = std::dynamic_pointer_cast<Linear>(blocks["to_out"]);
out = to_out->forward(ctx, out);
return out;
}
};
struct FacePerceiverResampler : public GGMLBlock {
// network hparams
int depth;
public:
FacePerceiverResampler(int dim = 768,
int d = 4,
int dim_head = 64,
int heads = 16,
int embedding_dim = 1280,
int output_dim = 768,
int ff_mult = 4)
: depth(d) {
blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Linear(embedding_dim, dim, true));
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(dim, output_dim, true));
blocks["norm_out"] = std::shared_ptr<GGMLBlock>(new LayerNorm(output_dim));
for (int i = 0; i < depth; i++) {
std::string name = "layers." + std::to_string(i) + ".0";
blocks[name] = std::shared_ptr<GGMLBlock>(new PerceiverAttention(dim, dim_head, heads));
name = "layers." + std::to_string(i) + ".1";
blocks[name] = std::shared_ptr<GGMLBlock>(new PMFeedForward(dim, ff_mult));
}
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* latents,
struct ggml_tensor* x) {
// x: [N, channels, h, w]
auto proj_in = std::dynamic_pointer_cast<Linear>(blocks["proj_in"]);
auto proj_out = std::dynamic_pointer_cast<Linear>(blocks["proj_out"]);
auto norm_out = std::dynamic_pointer_cast<LayerNorm>(blocks["norm_out"]);
x = proj_in->forward(ctx, x);
for (int i = 0; i < depth; i++) {
std::string name = "layers." + std::to_string(i) + ".0";
auto attn = std::dynamic_pointer_cast<PerceiverAttention>(blocks[name]);
name = "layers." + std::to_string(i) + ".1";
auto ff = std::dynamic_pointer_cast<PMFeedForward>(blocks[name]);
auto t = attn->forward(ctx, x, latents);
latents = ggml_add(ctx, t, latents);
t = ff->forward(ctx, latents);
latents = ggml_add(ctx, t, latents);
}
latents = proj_out->forward(ctx, latents);
latents = norm_out->forward(ctx, latents);
return latents;
}
};
struct QFormerPerceiver : public GGMLBlock {
// network hparams
int num_tokens;
int cross_attention_dim;
bool use_residual;
public:
QFormerPerceiver(int id_embeddings_dim, int cross_attention_d, int num_t, int embedding_dim = 1024, bool use_r = true, int ratio = 4)
: cross_attention_dim(cross_attention_d), num_tokens(num_t), use_residual(use_r) {
blocks["token_proj"] = std::shared_ptr<GGMLBlock>(new Mlp(id_embeddings_dim,
id_embeddings_dim * ratio,
cross_attention_dim * num_tokens,
true));
blocks["token_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(cross_attention_d));
blocks["perceiver_resampler"] = std::shared_ptr<GGMLBlock>(new FacePerceiverResampler(
cross_attention_dim,
4,
128,
cross_attention_dim / 128,
embedding_dim,
cross_attention_dim,
4));
}
/*
def forward(self, x, last_hidden_state):
x = self.token_proj(x)
x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
x = self.token_norm(x) # cls token
out = self.perceiver_resampler(x, last_hidden_state) # retrieve from patch tokens
if self.use_residual: # TODO: if use_residual is not true
out = x + 1.0 * out
return out
*/
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* last_hidden_state) {
// x: [N, channels, h, w]
auto token_proj = std::dynamic_pointer_cast<Mlp>(blocks["token_proj"]);
auto token_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["token_norm"]);
auto perceiver_resampler = std::dynamic_pointer_cast<FacePerceiverResampler>(blocks["perceiver_resampler"]);
x = token_proj->forward(ctx, x);
int64_t nel = ggml_nelements(x);
x = ggml_reshape_3d(ctx, x, cross_attention_dim, num_tokens, nel / (cross_attention_dim * num_tokens));
x = token_norm->forward(ctx, x);
struct ggml_tensor* out = perceiver_resampler->forward(ctx, x, last_hidden_state);
if (use_residual)
out = ggml_add(ctx, x, out);
return out;
}
};
/*
class FacePerceiverResampler(torch.nn.Module):
def __init__(
self,
*,
dim=768,
depth=4,
dim_head=64,
heads=16,
embedding_dim=1280,
output_dim=768,
ff_mult=4,
):
super().__init__()
self.proj_in = torch.nn.Linear(embedding_dim, dim)
self.proj_out = torch.nn.Linear(dim, output_dim)
self.norm_out = torch.nn.LayerNorm(output_dim)
self.layers = torch.nn.ModuleList([])
for _ in range(depth):
self.layers.append(
torch.nn.ModuleList(
[
PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
FeedForward(dim=dim, mult=ff_mult),
]
)
)
def forward(self, latents, x):
x = self.proj_in(x)
for attn, ff in self.layers:
latents = attn(x, latents) + latents
latents = ff(latents) + latents
latents = self.proj_out(latents)
return self.norm_out(latents)
*/
/*
def FeedForward(dim, mult=4):
inner_dim = int(dim * mult)
return nn.Sequential(
nn.LayerNorm(dim),
nn.Linear(dim, inner_dim, bias=False),
nn.GELU(),
nn.Linear(inner_dim, dim, bias=False),
)
def reshape_tensor(x, heads):
bs, length, width = x.shape
# (bs, length, width) --> (bs, length, n_heads, dim_per_head)
x = x.view(bs, length, heads, -1)
# (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
x = x.transpose(1, 2)
# (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head)
x = x.reshape(bs, heads, length, -1)
return x
class PerceiverAttention(nn.Module):
def __init__(self, *, dim, dim_head=64, heads=8):
super().__init__()
self.scale = dim_head**-0.5
self.dim_head = dim_head
self.heads = heads
inner_dim = dim_head * heads
self.norm1 = nn.LayerNorm(dim)
self.norm2 = nn.LayerNorm(dim)
self.to_q = nn.Linear(dim, inner_dim, bias=False)
self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
self.to_out = nn.Linear(inner_dim, dim, bias=False)
def forward(self, x, latents):
"""
Args:
x (torch.Tensor): image features
shape (b, n1, D)
latent (torch.Tensor): latent features
shape (b, n2, D)
"""
x = self.norm1(x)
latents = self.norm2(latents)
b, l, _ = latents.shape
q = self.to_q(latents)
kv_input = torch.cat((x, latents), dim=-2)
k, v = self.to_kv(kv_input).chunk(2, dim=-1)
q = reshape_tensor(q, self.heads)
k = reshape_tensor(k, self.heads)
v = reshape_tensor(v, self.heads)
# attention
scale = 1 / math.sqrt(math.sqrt(self.dim_head))
weight = (q * scale) @ (k * scale).transpose(-2, -1) # More stable with f16 than dividing afterwards
weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
out = weight @ v
out = out.permute(0, 2, 1, 3).reshape(b, l, -1)
return self.to_out(out)
*/
struct FuseModule : public GGMLBlock {
// network hparams
int embed_dim;
public:
FuseModule(int imb_d)
: embed_dim(imb_d) {
blocks["mlp1"] = std::shared_ptr<GGMLBlock>(new FuseBlock(imb_d * 2, imb_d, imb_d, false));
blocks["mlp2"] = std::shared_ptr<GGMLBlock>(new FuseBlock(imb_d, imb_d, imb_d, true));
blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(embed_dim));
}
struct ggml_tensor* fuse_fn(struct ggml_context* ctx,
struct ggml_tensor* prompt_embeds,
struct ggml_tensor* id_embeds) {
auto mlp1 = std::dynamic_pointer_cast<FuseBlock>(blocks["mlp1"]);
auto mlp2 = std::dynamic_pointer_cast<FuseBlock>(blocks["mlp2"]);
auto layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm"]);
// print_ggml_tensor(id_embeds, true, "Fuseblock id_embeds: ");
// print_ggml_tensor(prompt_embeds, true, "Fuseblock prompt_embeds: ");
// auto prompt_embeds0 = ggml_cont(ctx, ggml_permute(ctx, prompt_embeds, 2, 0, 1, 3));
// auto id_embeds0 = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 2, 0, 1, 3));
// print_ggml_tensor(id_embeds0, true, "Fuseblock id_embeds0: ");
// print_ggml_tensor(prompt_embeds0, true, "Fuseblock prompt_embeds0: ");
// concat is along dim 2
// auto stacked_id_embeds = ggml_concat(ctx, prompt_embeds0, id_embeds0, 2);
auto stacked_id_embeds = ggml_concat(ctx, prompt_embeds, id_embeds, 0);
// print_ggml_tensor(stacked_id_embeds, true, "Fuseblock stacked_id_embeds 0: ");
// stacked_id_embeds = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 1, 2, 0, 3));
// print_ggml_tensor(stacked_id_embeds, true, "Fuseblock stacked_id_embeds 1: ");
// stacked_id_embeds = mlp1.forward(ctx, stacked_id_embeds);
// stacked_id_embeds = ggml_add(ctx, stacked_id_embeds, prompt_embeds);
// stacked_id_embeds = mlp2.forward(ctx, stacked_id_embeds);
// stacked_id_embeds = ggml_nn_layer_norm(ctx, stacked_id_embeds, ln_w, ln_b);
stacked_id_embeds = mlp1->forward(ctx, stacked_id_embeds);
stacked_id_embeds = ggml_add(ctx, stacked_id_embeds, prompt_embeds);
stacked_id_embeds = mlp2->forward(ctx, stacked_id_embeds);
stacked_id_embeds = layer_norm->forward(ctx, stacked_id_embeds);
// print_ggml_tensor(stacked_id_embeds, true, "Fuseblock stacked_id_embeds 1: ");
return stacked_id_embeds;
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* prompt_embeds,
struct ggml_tensor* id_embeds,
struct ggml_tensor* class_tokens_mask,
struct ggml_tensor* class_tokens_mask_pos,
struct ggml_tensor* left,
struct ggml_tensor* right) {
// x: [N, channels, h, w]
struct ggml_tensor* valid_id_embeds = id_embeds;
// # slice out the image token embeddings
// print_ggml_tensor(class_tokens_mask_pos, false);
ggml_set_name(class_tokens_mask_pos, "class_tokens_mask_pos");
ggml_set_name(prompt_embeds, "prompt_embeds");
// print_ggml_tensor(valid_id_embeds, true, "valid_id_embeds");
// print_ggml_tensor(class_tokens_mask_pos, true, "class_tokens_mask_pos");
struct ggml_tensor* image_token_embeds = ggml_get_rows(ctx, prompt_embeds, class_tokens_mask_pos);
ggml_set_name(image_token_embeds, "image_token_embeds");
valid_id_embeds = ggml_reshape_2d(ctx, valid_id_embeds, valid_id_embeds->ne[0],
ggml_nelements(valid_id_embeds) / valid_id_embeds->ne[0]);
struct ggml_tensor* stacked_id_embeds = fuse_fn(ctx, image_token_embeds, valid_id_embeds);
// stacked_id_embeds = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 0, 2, 1, 3));
// print_ggml_tensor(stacked_id_embeds, true, "AA stacked_id_embeds");
// print_ggml_tensor(left, true, "AA left");
// print_ggml_tensor(right, true, "AA right");
if (left && right) {
stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds, 1);
stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right, 1);
} else if (left) {
stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds, 1);
} else if (right) {
stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right, 1);
}
// print_ggml_tensor(stacked_id_embeds, true, "BB stacked_id_embeds");
// stacked_id_embeds = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 0, 2, 1, 3));
// print_ggml_tensor(stacked_id_embeds, true, "CC stacked_id_embeds");
class_tokens_mask = ggml_cont(ctx, ggml_transpose(ctx, class_tokens_mask));
class_tokens_mask = ggml_repeat(ctx, class_tokens_mask, prompt_embeds);
prompt_embeds = ggml_mul(ctx, prompt_embeds, class_tokens_mask);
struct ggml_tensor* updated_prompt_embeds = ggml_add(ctx, prompt_embeds, stacked_id_embeds);
ggml_set_name(updated_prompt_embeds, "updated_prompt_embeds");
// print_ggml_tensor(updated_prompt_embeds, true, "updated_prompt_embeds: ");
return updated_prompt_embeds;
}
};
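// Note (from build_graph below): class_tokens_mask is uploaded with 0.0 at
// class-token positions and 1.0 elsewhere, so prompt_embeds * mask zeroes
// exactly the slots that stacked_id_embeds then fills via the final add.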
struct PhotoMakerIDEncoderBlock : public CLIPVisionModelProjection {
PhotoMakerIDEncoderBlock()
: CLIPVisionModelProjection(OPENAI_CLIP_VIT_L_14) {
blocks["visual_projection_2"] = std::shared_ptr<GGMLBlock>(new Linear(1024, 1280, false));
blocks["fuse_module"] = std::shared_ptr<GGMLBlock>(new FuseModule(2048));
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* id_pixel_values,
struct ggml_tensor* prompt_embeds,
struct ggml_tensor* class_tokens_mask,
struct ggml_tensor* class_tokens_mask_pos,
struct ggml_tensor* left,
struct ggml_tensor* right) {
// x: [N, channels, h, w]
auto vision_model = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]);
auto visual_projection = std::dynamic_pointer_cast<CLIPProjection>(blocks["visual_projection"]);
auto visual_projection_2 = std::dynamic_pointer_cast<Linear>(blocks["visual_projection_2"]);
auto fuse_module = std::dynamic_pointer_cast<FuseModule>(blocks["fuse_module"]);
struct ggml_tensor* shared_id_embeds = vision_model->forward(ctx, id_pixel_values); // [N, hidden_size]
struct ggml_tensor* id_embeds = visual_projection->forward(ctx, shared_id_embeds); // [N, proj_dim(768)]
struct ggml_tensor* id_embeds_2 = visual_projection_2->forward(ctx, shared_id_embeds); // [N, 1280]
id_embeds = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 2, 0, 1, 3));
id_embeds_2 = ggml_cont(ctx, ggml_permute(ctx, id_embeds_2, 2, 0, 1, 3));
id_embeds = ggml_concat(ctx, id_embeds, id_embeds_2, 2); // [batch_size, seq_length, 1, 2048] check whether concat at dim 2 is right
id_embeds = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 1, 2, 0, 3));
struct ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx,
prompt_embeds,
id_embeds,
class_tokens_mask,
class_tokens_mask_pos,
left, right);
return updated_prompt_embeds;
}
};
struct PhotoMakerIDEncoder_CLIPInsightfaceExtendtokenBlock : public CLIPVisionModelProjection {
int cross_attention_dim;
int num_tokens;
PhotoMakerIDEncoder_CLIPInsightfaceExtendtokenBlock(int id_embeddings_dim = 512)
: CLIPVisionModelProjection(OPENAI_CLIP_VIT_L_14),
cross_attention_dim(2048),
num_tokens(2) {
blocks["visual_projection_2"] = std::shared_ptr<GGMLBlock>(new Linear(1024, 1280, false));
blocks["fuse_module"] = std::shared_ptr<GGMLBlock>(new FuseModule(2048));
/*
cross_attention_dim = 2048
# projection
self.num_tokens = 2
self.cross_attention_dim = cross_attention_dim
self.qformer_perceiver = QFormerPerceiver(
id_embeddings_dim,
cross_attention_dim,
self.num_tokens,
)*/
blocks["qformer_perceiver"] = std::shared_ptr<GGMLBlock>(new QFormerPerceiver(id_embeddings_dim,
cross_attention_dim,
num_tokens));
}
/*
def forward(self, id_pixel_values, prompt_embeds, class_tokens_mask, id_embeds):
b, num_inputs, c, h, w = id_pixel_values.shape
id_pixel_values = id_pixel_values.view(b * num_inputs, c, h, w)
last_hidden_state = self.vision_model(id_pixel_values)[0]
id_embeds = id_embeds.view(b * num_inputs, -1)
id_embeds = self.qformer_perceiver(id_embeds, last_hidden_state)
id_embeds = id_embeds.view(b, num_inputs, self.num_tokens, -1)
updated_prompt_embeds = self.fuse_module(prompt_embeds, id_embeds, class_tokens_mask)
*/
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* id_pixel_values,
struct ggml_tensor* prompt_embeds,
struct ggml_tensor* class_tokens_mask,
struct ggml_tensor* class_tokens_mask_pos,
struct ggml_tensor* id_embeds,
struct ggml_tensor* left,
struct ggml_tensor* right) {
// x: [N, channels, h, w]
auto vision_model = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]);
auto fuse_module = std::dynamic_pointer_cast<FuseModule>(blocks["fuse_module"]);
auto qformer_perceiver = std::dynamic_pointer_cast<QFormerPerceiver>(blocks["qformer_perceiver"]);
// struct ggml_tensor* last_hidden_state = vision_model->forward(ctx, id_pixel_values); // [N, hidden_size]
struct ggml_tensor* last_hidden_state = vision_model->forward(ctx, id_pixel_values, false); // [N, hidden_size]
id_embeds = qformer_perceiver->forward(ctx, id_embeds, last_hidden_state);
struct ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx,
prompt_embeds,
id_embeds,
class_tokens_mask,
class_tokens_mask_pos,
left, right);
return updated_prompt_embeds;
}
};
struct PhotoMakerIDEncoder : public GGMLRunner {
public:
SDVersion version = VERSION_SDXL;
PMVersion pm_version = PM_VERSION_1;
PhotoMakerIDEncoderBlock id_encoder;
PhotoMakerIDEncoder_CLIPInsightfaceExtendtokenBlock id_encoder2;
float style_strength;
std::vector<float> ctm;
std::vector<ggml_fp16_t> ctmf16;
std::vector<int> ctmpos;
std::vector<ggml_fp16_t> zeros_left_16;
std::vector<float> zeros_left;
std::vector<ggml_fp16_t> zeros_right_16;
std::vector<float> zeros_right;
public:
PhotoMakerIDEncoder(ggml_backend_t backend, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix, SDVersion version = VERSION_SDXL, PMVersion pm_v = PM_VERSION_1, float sty = 20.f)
: GGMLRunner(backend),
version(version),
pm_version(pm_v),
style_strength(sty) {
if (pm_version == PM_VERSION_1) {
id_encoder.init(params_ctx, tensor_types, prefix);
} else if (pm_version == PM_VERSION_2) {
id_encoder2.init(params_ctx, tensor_types, prefix);
}
}
std::string get_desc() {
return "pmid";
}
PMVersion get_version() const {
return pm_version;
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
if (pm_version == PM_VERSION_1)
id_encoder.get_param_tensors(tensors, prefix);
else if (pm_version == PM_VERSION_2)
id_encoder2.get_param_tensors(tensors, prefix);
}
struct ggml_cgraph* build_graph( // struct ggml_allocr* allocr,
struct ggml_tensor* id_pixel_values,
struct ggml_tensor* prompt_embeds,
std::vector<bool>& class_tokens_mask,
struct ggml_tensor* id_embeds) {
ctm.clear();
ctmf16.clear();
ctmpos.clear();
zeros_left.clear();
zeros_left_16.clear();
zeros_right.clear();
zeros_right_16.clear();
ggml_context* ctx0 = compute_ctx;
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
int64_t hidden_size = prompt_embeds->ne[0];
int64_t seq_length = prompt_embeds->ne[1];
ggml_type type = GGML_TYPE_F32;
struct ggml_tensor* class_tokens_mask_d = ggml_new_tensor_1d(ctx0, type, class_tokens_mask.size());
struct ggml_tensor* id_pixel_values_d = to_backend(id_pixel_values);
struct ggml_tensor* prompt_embeds_d = to_backend(prompt_embeds);
struct ggml_tensor* id_embeds_d = to_backend(id_embeds);
struct ggml_tensor* left = NULL;
struct ggml_tensor* right = NULL;
for (int i = 0; i < class_tokens_mask.size(); i++) {
if (class_tokens_mask[i]) {
// printf(" 1,");
ctm.push_back(0.f); // here use 0.f instead of 1.f to make a scale mask
ctmf16.push_back(ggml_fp32_to_fp16(0.f)); // here use 0.f instead of 1.f to make a scale mask
ctmpos.push_back(i);
} else {
// printf(" 0,");
ctm.push_back(1.f); // here use 1.f instead of 0.f to make a scale mask
ctmf16.push_back(ggml_fp32_to_fp16(1.f)); // here use 1.f instead of 0.f to make a scale mask
}
}
// printf("\n");
if (ctmpos[0] > 0) {
// left = ggml_new_tensor_3d(ctx0, type, hidden_size, 1, ctmpos[0]);
left = ggml_new_tensor_3d(ctx0, type, hidden_size, ctmpos[0], 1);
}
if (ctmpos[ctmpos.size() - 1] < seq_length - 1) {
// right = ggml_new_tensor_3d(ctx0, type,
// hidden_size, 1, seq_length - ctmpos[ctmpos.size() - 1] - 1);
right = ggml_new_tensor_3d(ctx0, type,
hidden_size, seq_length - ctmpos[ctmpos.size() - 1] - 1, 1);
}
struct ggml_tensor* class_tokens_mask_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ctmpos.size());
{
if (type == GGML_TYPE_F16)
set_backend_tensor_data(class_tokens_mask_d, ctmf16.data());
else
set_backend_tensor_data(class_tokens_mask_d, ctm.data());
set_backend_tensor_data(class_tokens_mask_pos, ctmpos.data());
if (left) {
if (type == GGML_TYPE_F16) {
for (int i = 0; i < ggml_nelements(left); ++i)
zeros_left_16.push_back(ggml_fp32_to_fp16(0.f));
set_backend_tensor_data(left, zeros_left_16.data());
} else {
for (int i = 0; i < ggml_nelements(left); ++i)
zeros_left.push_back(0.f);
set_backend_tensor_data(left, zeros_left.data());
}
}
if (right) {
if (type == GGML_TYPE_F16) {
for (int i = 0; i < ggml_nelements(right); ++i)
zeros_right_16.push_back(ggml_fp32_to_fp16(0.f));
set_backend_tensor_data(right, zeros_right_16.data());
} else {
for (int i = 0; i < ggml_nelements(right); ++i)
zeros_right.push_back(0.f);
set_backend_tensor_data(right, zeros_right.data());
}
}
}
struct ggml_tensor* updated_prompt_embeds = NULL;
if (pm_version == PM_VERSION_1)
updated_prompt_embeds = id_encoder.forward(ctx0,
id_pixel_values_d,
prompt_embeds_d,
class_tokens_mask_d,
class_tokens_mask_pos,
left, right);
else if (pm_version == PM_VERSION_2)
updated_prompt_embeds = id_encoder2.forward(ctx0,
id_pixel_values_d,
prompt_embeds_d,
class_tokens_mask_d,
class_tokens_mask_pos,
id_embeds_d,
left, right);
ggml_build_forward_expand(gf, updated_prompt_embeds);
return gf;
}
void compute(const int n_threads,
struct ggml_tensor* id_pixel_values,
struct ggml_tensor* prompt_embeds,
struct ggml_tensor* id_embeds,
std::vector<bool>& class_tokens_mask,
struct ggml_tensor** updated_prompt_embeds,
ggml_context* output_ctx) {
auto get_graph = [&]() -> struct ggml_cgraph* {
// return build_graph(compute_allocr, id_pixel_values, prompt_embeds, class_tokens_mask);
return build_graph(id_pixel_values, prompt_embeds, class_tokens_mask, id_embeds);
};
// GGMLRunner::compute(get_graph, n_threads, updated_prompt_embeds);
GGMLRunner::compute(get_graph, n_threads, true, updated_prompt_embeds, output_ctx);
}
};
struct PhotoMakerIDEmbed : public GGMLRunner {
std::map<std::string, struct ggml_tensor*> tensors;
std::string file_path;
ModelLoader* model_loader;
bool load_failed = false;
bool applied = false;
PhotoMakerIDEmbed(ggml_backend_t backend,
ModelLoader* ml,
const std::string& file_path = "",
const std::string& prefix = "")
: GGMLRunner(backend), file_path(file_path), model_loader(ml) {
if (!model_loader->init_from_file(file_path, prefix)) {
load_failed = true;
}
}
std::string get_desc() {
return "id_embeds";
}
bool load_from_file(bool filter_tensor = false) {
LOG_INFO("loading PhotoMaker ID Embeds from '%s'", file_path.c_str());
if (load_failed) {
LOG_ERROR("init photomaker id embed from file failed: '%s'", file_path.c_str());
return false;
}
bool dry_run = true;
auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
const std::string& name = tensor_storage.name;
if (filter_tensor && !contains(name, "pmid.id_embeds")) {
// LOG_INFO("skipping LoRA tesnor '%s'", name.c_str());
return true;
}
if (dry_run) {
struct ggml_tensor* real = ggml_new_tensor(params_ctx,
tensor_storage.type,
tensor_storage.n_dims,
tensor_storage.ne);
tensors[name] = real;
} else {
auto real = tensors[name];
*dst_tensor = real;
}
return true;
};
model_loader->load_tensors(on_new_tensor_cb, backend);
alloc_params_buffer();
dry_run = false;
model_loader->load_tensors(on_new_tensor_cb, backend);
LOG_DEBUG("finished loading PhotoMaker ID Embeds ");
return true;
}
struct ggml_tensor* get() {
std::map<std::string, struct ggml_tensor*>::iterator pos;
pos = tensors.find("pmid.id_embeds");
if (pos != tensors.end())
return pos->second;
return NULL;
}
};
#endif // __PMI_HPP__

preprocessing.hpp (new file, 227 lines)

@@ -0,0 +1,227 @@
#ifndef __PREPROCESSING_HPP__
#define __PREPROCESSING_HPP__
#include "ggml_extend.hpp"
#define M_PI_ 3.14159265358979323846
void convolve(struct ggml_tensor* input, struct ggml_tensor* output, struct ggml_tensor* kernel, int padding) {
struct ggml_init_params params;
params.mem_size = 20 * 1024 * 1024; // 20 MB
params.mem_buffer = NULL;
params.no_alloc = false;
struct ggml_context* ctx0 = ggml_init(params);
struct ggml_tensor* kernel_fp16 = ggml_new_tensor_4d(ctx0, GGML_TYPE_F16, kernel->ne[0], kernel->ne[1], 1, 1);
ggml_fp32_to_fp16_row((float*)kernel->data, (ggml_fp16_t*)kernel_fp16->data, ggml_nelements(kernel));
ggml_tensor* h = ggml_conv_2d(ctx0, kernel_fp16, input, 1, 1, padding, padding, 1, 1);
ggml_cgraph* gf = ggml_new_graph(ctx0);
ggml_build_forward_expand(gf, ggml_cpy(ctx0, h, output));
ggml_graph_compute_with_ctx(ctx0, gf, 1);
ggml_free(ctx0);
}
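// gaussian_kernel below fills `kernel` with samples of the 2D Gaussian
//     G(x, y) = exp(-(x^2 + y^2) / (2 * sigma^2)) / (2 * pi * sigma^2)
// with sigma = 1.4, centered on the middle of the kernel.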
void gaussian_kernel(struct ggml_tensor* kernel) {
int ks_mid = kernel->ne[0] / 2;
float sigma = 1.4f;
float normal = 1.f / (2.0f * M_PI_ * powf(sigma, 2.0f));
for (int y = 0; y < kernel->ne[0]; y++) {
float gx = -ks_mid + y;
for (int x = 0; x < kernel->ne[1]; x++) {
float gy = -ks_mid + x;
float k_ = expf(-((gx * gx + gy * gy) / (2.0f * powf(sigma, 2.0f)))) * normal;
ggml_tensor_set_f32(kernel, k_, x, y);
}
}
}
void grayscale(struct ggml_tensor* rgb_img, struct ggml_tensor* grayscale) {
for (int iy = 0; iy < rgb_img->ne[1]; iy++) {
for (int ix = 0; ix < rgb_img->ne[0]; ix++) {
float r = ggml_tensor_get_f32(rgb_img, ix, iy);
float g = ggml_tensor_get_f32(rgb_img, ix, iy, 1);
float b = ggml_tensor_get_f32(rgb_img, ix, iy, 2);
float gray = 0.2989f * r + 0.5870f * g + 0.1140f * b;
ggml_tensor_set_f32(grayscale, gray, ix, iy);
}
}
}
void prop_hypot(struct ggml_tensor* x, struct ggml_tensor* y, struct ggml_tensor* h) {
int n_elements = ggml_nelements(h);
float* dx = (float*)x->data;
float* dy = (float*)y->data;
float* dh = (float*)h->data;
for (int i = 0; i < n_elements; i++) {
dh[i] = sqrtf(dx[i] * dx[i] + dy[i] * dy[i]);
}
}
void prop_arctan2(struct ggml_tensor* x, struct ggml_tensor* y, struct ggml_tensor* h) {
int n_elements = ggml_nelements(h);
float* dx = (float*)x->data;
float* dy = (float*)y->data;
float* dh = (float*)h->data;
for (int i = 0; i < n_elements; i++) {
dh[i] = atan2f(dy[i], dx[i]);
}
}
void normalize_tensor(struct ggml_tensor* g) {
int n_elements = ggml_nelements(g);
float* dg = (float*)g->data;
float max = -INFINITY;
for (int i = 0; i < n_elements; i++) {
max = dg[i] > max ? dg[i] : max;
}
max = 1.0f / max;
for (int i = 0; i < n_elements; i++) {
dg[i] *= max;
}
}
void non_max_suppression(struct ggml_tensor* result, struct ggml_tensor* G, struct ggml_tensor* D) {
for (int iy = 1; iy < result->ne[1] - 1; iy++) {
for (int ix = 1; ix < result->ne[0] - 1; ix++) {
float angle = ggml_tensor_get_f32(D, ix, iy) * 180.0f / M_PI_;
if (angle < 0.0f) {
    angle += 180.0f;
}
float q = 1.0f;
float r = 1.0f;
// angle 0
if ((0.0f <= angle && angle < 22.5f) || (157.5f <= angle && angle <= 180.0f)) {
q = ggml_tensor_get_f32(G, ix, iy + 1);
r = ggml_tensor_get_f32(G, ix, iy - 1);
}
// angle 45
else if (22.5f <= angle && angle < 67.5f) {
q = ggml_tensor_get_f32(G, ix + 1, iy - 1);
r = ggml_tensor_get_f32(G, ix - 1, iy + 1);
}
// angle 90
else if (67.5f <= angle && angle < 112.5f) {
q = ggml_tensor_get_f32(G, ix + 1, iy);
r = ggml_tensor_get_f32(G, ix - 1, iy);
}
// angle 135
else if (112.5f <= angle && angle < 157.5f) {
q = ggml_tensor_get_f32(G, ix - 1, iy - 1);
r = ggml_tensor_get_f32(G, ix + 1, iy + 1);
}
float cur = ggml_tensor_get_f32(G, ix, iy);
if ((cur >= q) && (cur >= r)) {
ggml_tensor_set_f32(result, cur, ix, iy);
} else {
ggml_tensor_set_f32(result, 0.0f, ix, iy);
}
}
}
}
void threshold_hysteresis(struct ggml_tensor* img, float high_threshold, float low_threshold, float weak, float strong) {
int n_elements = ggml_nelements(img);
float* imd = (float*)img->data;
float max = -INFINITY;
for (int i = 0; i < n_elements; i++) {
max = imd[i] > max ? imd[i] : max;
}
float ht = max * high_threshold;
float lt = ht * low_threshold;
for (int i = 0; i < n_elements; i++) {
float img_v = imd[i];
if (img_v >= ht) { // strong pixel
imd[i] = strong;
} else if (img_v >= lt) { // weak pixel
imd[i] = weak;
}
}
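// keep interior pixels as-is and zero out everything within 3 pixels of the border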
for (int iy = 0; iy < img->ne[1]; iy++) {
for (int ix = 0; ix < img->ne[0]; ix++) {
if (ix >= 3 && ix <= img->ne[0] - 3 && iy >= 3 && iy <= img->ne[1] - 3) {
ggml_tensor_set_f32(img, ggml_tensor_get_f32(img, ix, iy), ix, iy);
} else {
ggml_tensor_set_f32(img, 0.0f, ix, iy);
}
}
}
// hysteresis
for (int iy = 1; iy < img->ne[1] - 1; iy++) {
for (int ix = 1; ix < img->ne[0] - 1; ix++) {
float imd_v = ggml_tensor_get_f32(img, ix, iy);
if (imd_v == weak) {
if (ggml_tensor_get_f32(img, ix + 1, iy - 1) == strong || ggml_tensor_get_f32(img, ix + 1, iy) == strong ||
ggml_tensor_get_f32(img, ix, iy - 1) == strong || ggml_tensor_get_f32(img, ix, iy + 1) == strong ||
ggml_tensor_get_f32(img, ix - 1, iy - 1) == strong || ggml_tensor_get_f32(img, ix - 1, iy) == strong) {
ggml_tensor_set_f32(img, strong, ix, iy);
} else {
ggml_tensor_set_f32(img, 0.0f, ix, iy);
}
}
}
}
}
uint8_t* preprocess_canny(uint8_t* img, int width, int height, float high_threshold, float low_threshold, float weak, float strong, bool inverse) {
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10
params.mem_buffer = NULL;
params.no_alloc = false;
struct ggml_context* work_ctx = ggml_init(params);
if (!work_ctx) {
LOG_ERROR("ggml_init() failed");
return NULL;
}
float kX[9] = {
-1, 0, 1,
-2, 0, 2,
-1, 0, 1};
float kY[9] = {
1, 2, 1,
0, 0, 0,
-1, -2, -1};
// generate kernel
int kernel_size = 5;
struct ggml_tensor* gkernel = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, kernel_size, kernel_size, 1, 1);
struct ggml_tensor* sf_kx = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1);
memcpy(sf_kx->data, kX, ggml_nbytes(sf_kx));
struct ggml_tensor* sf_ky = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1);
memcpy(sf_ky->data, kY, ggml_nbytes(sf_ky));
gaussian_kernel(gkernel);
struct ggml_tensor* image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
struct ggml_tensor* image_gray = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1);
struct ggml_tensor* iX = ggml_dup_tensor(work_ctx, image_gray);
struct ggml_tensor* iY = ggml_dup_tensor(work_ctx, image_gray);
struct ggml_tensor* G = ggml_dup_tensor(work_ctx, image_gray);
struct ggml_tensor* theta = ggml_dup_tensor(work_ctx, image_gray);
sd_image_to_tensor(img, image);
grayscale(image, image_gray);
convolve(image_gray, image_gray, gkernel, 2);
convolve(image_gray, iX, sf_kx, 1);
convolve(image_gray, iY, sf_ky, 1);
prop_hypot(iX, iY, G);
normalize_tensor(G);
prop_arctan2(iX, iY, theta);
non_max_suppression(image_gray, G, theta);
threshold_hysteresis(image_gray, high_threshold, low_threshold, weak, strong);
// to RGB channels
for (int iy = 0; iy < height; iy++) {
for (int ix = 0; ix < width; ix++) {
float gray = ggml_tensor_get_f32(image_gray, ix, iy);
gray = inverse ? 1.0f - gray : gray;
ggml_tensor_set_f32(image, gray, ix, iy);
ggml_tensor_set_f32(image, gray, ix, iy, 1);
ggml_tensor_set_f32(image, gray, ix, iy, 2);
}
}
free(img);
uint8_t* output = sd_tensor_to_image(image);
ggml_free(work_ctx);
return output;
}
#endif // __PREPROCESSING_HPP__
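Usage sketch (argument values are illustrative, not defaults taken from this file):

// uint8_t* rgb = ...; // width * height * 3 bytes; preprocess_canny takes ownership (frees it)
// uint8_t* edges = preprocess_canny(rgb, width, height,
//                                   0.08f /* high_threshold */, 0.08f /* low_threshold */,
//                                   0.8f /* weak */, 1.0f /* strong */,
//                                   false /* inverse */);
// // `edges` is a newly allocated RGB image suitable as a control hint; free() it when done.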

(file diff suppressed because it is too large)

stable-diffusion.h

@@ -30,7 +30,8 @@ extern "C" {
enum rng_type_t {
STD_DEFAULT_RNG,
CUDA_RNG,
RNG_TYPE_COUNT
};
enum sample_method_t {
@@ -41,15 +42,22 @@ enum sample_method_t {
DPMPP2S_A,
DPMPP2M,
DPMPP2Mv2,
IPNDM,
IPNDM_V,
LCM,
DDIM_TRAILING,
TCD,
SAMPLE_METHOD_COUNT
};
enum schedule_t {
DEFAULT,
DISCRETE,
KARRAS,
EXPONENTIAL,
AYS,
GITS,
SCHEDULE_COUNT
};
// same as enum ggml_type
@@ -59,26 +67,43 @@ enum sd_type_t {
SD_TYPE_Q4_0 = 2,
SD_TYPE_Q4_1 = 3,
// SD_TYPE_Q4_2 = 4, support has been removed
// SD_TYPE_Q4_3 = 5, support has been removed
SD_TYPE_Q5_0 = 6,
SD_TYPE_Q5_1 = 7,
SD_TYPE_Q8_0 = 8,
SD_TYPE_Q8_1 = 9,
SD_TYPE_Q2_K = 10,
SD_TYPE_Q3_K = 11,
SD_TYPE_Q4_K = 12,
SD_TYPE_Q5_K = 13,
SD_TYPE_Q6_K = 14,
SD_TYPE_Q8_K = 15,
SD_TYPE_IQ2_XXS = 16,
SD_TYPE_IQ2_XS = 17,
SD_TYPE_IQ3_XXS = 18,
SD_TYPE_IQ1_S = 19,
SD_TYPE_IQ4_NL = 20,
SD_TYPE_IQ3_S = 21,
SD_TYPE_IQ2_S = 22,
SD_TYPE_IQ4_XS = 23,
SD_TYPE_I8 = 24,
SD_TYPE_I16 = 25,
SD_TYPE_I32 = 26,
SD_TYPE_I64 = 27,
SD_TYPE_F64 = 28,
SD_TYPE_IQ1_M = 29,
SD_TYPE_BF16 = 30,
// SD_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
// SD_TYPE_Q4_0_4_8 = 32,
// SD_TYPE_Q4_0_8_8 = 33,
SD_TYPE_TQ1_0 = 34,
SD_TYPE_TQ2_0 = 35,
// SD_TYPE_IQ4_NL_4_4 = 36,
// SD_TYPE_IQ4_NL_4_8 = 37,
// SD_TYPE_IQ4_NL_8_8 = 38,
SD_TYPE_COUNT = 39,
};
enum sd_log_level_t {
SD_LOG_DEBUG,
SD_LOG_INFO,
@@ -86,11 +111,33 @@ enum sd_log_level_t {
SD_LOG_ERROR
};
typedef struct {
const char* model_path;
const char* clip_l_path;
const char* clip_g_path;
const char* t5xxl_path;
const char* diffusion_model_path;
const char* vae_path;
const char* taesd_path;
const char* control_net_path;
const char* lora_model_dir;
const char* embedding_dir;
const char* stacked_id_embed_dir;
bool vae_decode_only;
bool vae_tiling;
bool free_params_immediately;
int n_threads;
enum sd_type_t wtype;
enum rng_type_t rng_type;
enum schedule_t schedule;
bool keep_clip_on_cpu;
bool keep_control_net_on_cpu;
bool keep_vae_on_cpu;
bool diffusion_flash_attn;
bool chroma_use_dit_mask;
bool chroma_use_t5_mask;
int chroma_t5_mask_pad;
} sd_ctx_params_t;
typedef struct {
uint32_t width;
@@ -99,59 +146,118 @@ typedef struct {
uint8_t* data;
} sd_image_t;
typedef struct {
int* layers;
size_t layer_count;
float layer_start;
float layer_end;
float scale;
} sd_slg_params_t;
typedef struct {
float txt_cfg;
float img_cfg;
float min_cfg;
float distilled_guidance;
sd_slg_params_t slg;
} sd_guidance_params_t;
typedef struct {
const char* prompt;
const char* negative_prompt;
int clip_skip;
sd_guidance_params_t guidance;
sd_image_t init_image;
sd_image_t* ref_images;
int ref_images_count;
sd_image_t mask_image;
int width;
int height;
enum sample_method_t sample_method;
int sample_steps;
float eta;
float strength;
int64_t seed;
int batch_count;
const sd_image_t* control_cond;
float control_strength;
float style_strength;
bool normalize_input;
const char* input_id_images_path;
} sd_img_gen_params_t;
typedef struct {
sd_image_t init_image;
int width;
int height;
sd_guidance_params_t guidance;
enum sample_method_t sample_method;
int sample_steps;
float strength;
int64_t seed;
int video_frames;
int motion_bucket_id;
int fps;
float augmentation_level;
} sd_vid_gen_params_t;
typedef struct sd_ctx_t sd_ctx_t;
typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);
typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data);
SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
SD_API int32_t get_num_physical_cores();
SD_API const char* sd_get_system_info();
SD_API const char* sd_type_name(enum sd_type_t type);
SD_API enum sd_type_t str_to_sd_type(const char* str);
SD_API const char* sd_rng_type_name(enum rng_type_t rng_type);
SD_API enum rng_type_t str_to_rng_type(const char* str);
SD_API const char* sd_sample_method_name(enum sample_method_t sample_method);
SD_API enum sample_method_t str_to_sample_method(const char* str);
SD_API const char* sd_schedule_name(enum schedule_t schedule);
SD_API enum schedule_t str_to_schedule(const char* str);
SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params);
SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);
SD_API sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params);
SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params);
SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params);
SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params);
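/* Minimal txt2img sketch with this API (paths and values are illustrative):

   sd_ctx_params_t ctx_params;
   sd_ctx_params_init(&ctx_params);
   ctx_params.model_path = "model.safetensors";
   ctx_params.n_threads  = get_num_physical_cores();
   sd_ctx_t* ctx = new_sd_ctx(&ctx_params);

   sd_img_gen_params_t gen_params;
   sd_img_gen_params_init(&gen_params);
   gen_params.prompt = "a photo of a cat";
   gen_params.width  = 512;
   gen_params.height = 512;
   sd_image_t* imgs = generate_image(ctx, &gen_params);
   // imgs is an array of gen_params.batch_count images; free the pixel data when done
   free_sd_ctx(ctx);
*/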
SD_API void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params);
SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params); // broken
typedef struct upscaler_ctx_t upscaler_ctx_t;
SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
                                        int n_threads);
SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor);
SD_API bool convert(const char* input_path,
const char* vae_path,
const char* output_path,
enum sd_type_t output_type,
const char* tensor_type_rules);
SD_API uint8_t* preprocess_canny(uint8_t* img,
int width,
int height,
float high_threshold,
float low_threshold,
float weak,
float strong,
bool inverse);
#ifdef __cplusplus
}
#endif
#endif // __STABLE_DIFFUSION_H__

t5.hpp (new file, 1008 lines; diff suppressed because it is too large)

tae.hpp (651 lines changed)

@@ -8,88 +8,45 @@
/*
=================================== TinyAutoEncoder ===================================
References:
https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/autoencoder_tiny.py
https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/autoencoders/vae.py
https://github.com/madebyollin/taesd/blob/main/taesd.py
*/
class TAEBlock : public UnaryBlock {
protected:
    int n_in;
    int n_out;

public:
    TAEBlock(int n_in, int n_out)
        : n_in(n_in), n_out(n_out) {
        blocks["conv.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_in, n_out, {3, 3}, {1, 1}, {1, 1}));
        blocks["conv.2"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1}));
        blocks["conv.4"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1}));
        if (n_in != n_out) {
            blocks["skip"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_in, n_out, {1, 1}, {1, 1}, {1, 1}, {1, 1}, false));
        }
    }

    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
        // x: [n, n_in, h, w]
        // return: [n, n_out, h, w]
        auto conv_0 = std::dynamic_pointer_cast<Conv2d>(blocks["conv.0"]);
        auto conv_2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv.2"]);
        auto conv_4 = std::dynamic_pointer_cast<Conv2d>(blocks["conv.4"]);

        auto h = conv_0->forward(ctx, x);
        h = ggml_relu_inplace(ctx, h);
        h = conv_2->forward(ctx, h);
        h = ggml_relu_inplace(ctx, h);
        h = conv_4->forward(ctx, h);

        if (n_in != n_out) {
            auto skip = std::dynamic_pointer_cast<Conv2d>(blocks["skip"]);
            LOG_DEBUG("skip");
            x = skip->forward(ctx, x);
        }

        h = ggml_add(ctx, h, x);
@@ -98,425 +55,169 @@ struct TAEBlock {
}
};
-struct TinyEncoder {
+class TinyEncoder : public UnaryBlock {
     int in_channels = 3;
-    int z_channels = 4;
     int channels = 64;
+    int z_channels = 4;
     int num_blocks = 3;
-    // input
-    ggml_tensor* conv_input_w; // [channels, in_channels, 3, 3]
-    ggml_tensor* conv_input_b; // [channels]
-    TAEBlock initial_block;
+public:
+    TinyEncoder(int z_channels = 4)
+        : z_channels(z_channels) {
+        int index = 0;
+        blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, channels, {3, 3}, {1, 1}, {1, 1}));
+        blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
-    ggml_tensor* conv_1_w; // [channels, channels, 3, 3]
-    TAEBlock input_blocks[3];
-    // middle
-    ggml_tensor* conv_2_w; // [channels, channels, 3, 3]
-    TAEBlock middle_blocks[3];
-    // output
-    ggml_tensor* conv_3_w; // [channels, channels, 3, 3]
-    TAEBlock output_blocks[3];
-    // final
-    ggml_tensor* conv_final_w; // [z_channels, channels, 3, 3]
-    ggml_tensor* conv_final_b; // [z_channels]
-    TinyEncoder() {
+        blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false));
         for (int i = 0; i < num_blocks; i++) {
-            input_blocks[i].in_channels = channels;
-            input_blocks[i].out_channels = channels;
-            middle_blocks[i].in_channels = channels;
-            middle_blocks[i].out_channels = channels;
-            output_blocks[i].in_channels = channels;
-            output_blocks[i].out_channels = channels;
+            blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
         }
-        initial_block.in_channels = channels;
-        initial_block.out_channels = channels;
+        blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false));
+        for (int i = 0; i < num_blocks; i++) {
+            blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
+        }
+        blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false));
+        for (int i = 0; i < num_blocks; i++) {
+            blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
+        }
+        blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, z_channels, {3, 3}, {1, 1}, {1, 1}));
     }
-    size_t calculate_mem_size() {
-        size_t mem_size = channels * in_channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_input_w
-        mem_size += channels * ggml_type_size(GGML_TYPE_F32); // conv_input_b
+    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+        // x: [n, in_channels, h, w]
+        // return: [n, z_channels, h/8, w/8]
-        mem_size += initial_block.calculate_mem_size();
+        for (int i = 0; i < num_blocks * 3 + 6; i++) {
+            auto block = std::dynamic_pointer_cast<UnaryBlock>(blocks[std::to_string(i)]);
-        mem_size += channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_1_w
-        mem_size += channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_2_w
-        mem_size += channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_3_w
-        for (int i = 0; i < num_blocks; i++) {
-            mem_size += input_blocks[i].calculate_mem_size();
-            mem_size += middle_blocks[i].calculate_mem_size();
-            mem_size += output_blocks[i].calculate_mem_size();
-        }
-        mem_size += z_channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_input_w
-        mem_size += z_channels * ggml_type_size(GGML_TYPE_F32); // conv_input_b
-        return mem_size;
-    }
-    int get_num_tensors() {
-        int num_tensors = 7;
-        for (int i = 0; i < num_blocks; i++) {
-            num_tensors += input_blocks[i].get_num_tensors();
-            num_tensors += middle_blocks[i].get_num_tensors();
-            num_tensors += output_blocks[i].get_num_tensors();
-        }
-        num_tensors += initial_block.get_num_tensors();
-        return num_tensors;
-    }
-    void init_params(ggml_context* ctx) {
-        conv_input_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, in_channels, channels);
-        conv_input_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, channels);
-        initial_block.init_params(ctx);
-        conv_1_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, channels);
-        conv_2_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, channels);
-        conv_3_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, channels);
-        conv_final_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, z_channels);
-        conv_final_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, z_channels);
-        for (int i = 0; i < num_blocks; i++) {
-            input_blocks[i].init_params(ctx);
-            middle_blocks[i].init_params(ctx);
-            output_blocks[i].init_params(ctx);
-        }
-    }
-    void map_by_name(std::map<std::string, ggml_tensor*>& tensors, std::string prefix) {
-        tensors[prefix + "0.weight"] = conv_input_w;
-        tensors[prefix + "0.bias"] = conv_input_b;
-        initial_block.map_by_name(tensors, prefix + "1.");
-        tensors[prefix + "2.weight"] = conv_1_w;
-        for (int i = 0; i < num_blocks; i++) {
-            input_blocks[i].map_by_name(tensors, prefix + std::to_string(i + 3) + ".");
+            x = block->forward(ctx, x);
         }
-        tensors[prefix + "6.weight"] = conv_2_w;
-        for (int i = 0; i < num_blocks; i++) {
-            middle_blocks[i].map_by_name(tensors, prefix + std::to_string(i + 7) + ".");
-        }
-        tensors[prefix + "10.weight"] = conv_3_w;
-        for (int i = 0; i < num_blocks; i++) {
-            output_blocks[i].map_by_name(tensors, prefix + std::to_string(i + 11) + ".");
-        }
-        tensors[prefix + "14.weight"] = conv_final_w;
-        tensors[prefix + "14.bias"] = conv_final_b;
-    }
-    ggml_tensor* forward(ggml_context* ctx, ggml_tensor* x) {
-        // conv(3, 64)
-        auto z = ggml_nn_conv_2d(ctx, x, conv_input_w, conv_input_b, 1, 1, 1, 1);
-        // Block(64, 64)
-        z = initial_block.forward(ctx, z);
-        // conv(64, 64, stride=2, bias=False)
-        z = ggml_nn_conv_2d(ctx, z, conv_1_w, NULL, 2, 2, 1, 1);
-        // Block(64, 64), Block(64, 64), Block(64, 64)
-        for (int i = 0; i < num_blocks; i++) {
-            z = input_blocks[i].forward(ctx, z);
-        }
-        // conv(64, 64, stride=2, bias=False)
-        z = ggml_nn_conv_2d(ctx, z, conv_2_w, NULL, 2, 2, 1, 1);
-        // Block(64, 64), Block(64, 64), Block(64, 64)
-        for (int i = 0; i < num_blocks; i++) {
-            z = middle_blocks[i].forward(ctx, z);
-        }
-        // conv(64, 64, stride=2, bias=False)
-        z = ggml_nn_conv_2d(ctx, z, conv_3_w, NULL, 2, 2, 1, 1);
-        // Block(64, 64), Block(64, 64), Block(64, 64)
-        for (int i = 0; i < num_blocks; i++) {
-            z = output_blocks[i].forward(ctx, z);
-        }
-        // conv(64, 4)
-        z = ggml_nn_conv_2d(ctx, z, conv_final_w, conv_final_b, 1, 1, 1, 1);
-        return z;
+        return x;
     }
 };
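For reference, the loop bound `num_blocks * 3 + 6` in the new forward() matches the constructor's sequential layout: one input conv, one TAEBlock, three repetitions of (strided conv + num_blocks TAEBlocks), and a final conv. A standalone sanity check of that arithmetic (not part of the diff):

// 1 + 1 + 3 * (1 + num_blocks) + 1 == num_blocks * 3 + 6 registered blocks.
#include <cassert>

int main() {
    const int num_blocks = 3;
    int index = 0;
    index += 1;                       // Conv2d(in_channels, channels)
    index += 1;                       // TAEBlock(channels, channels)
    for (int stage = 0; stage < 3; stage++) {
        index += 1;                   // downsampling Conv2d (stride 2, no bias)
        index += num_blocks;          // num_blocks TAEBlocks
    }
    index += 1;                       // Conv2d(channels, z_channels)
    assert(index == num_blocks * 3 + 6);  // 15 blocks, indices 0..14
    return 0;
}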
-struct TinyDecoder {
-    int z_channels = 4;
-    int channels = 64;
-    int output_channels = 3;
-    int num_blocks = 3;
+class TinyDecoder : public UnaryBlock {
+    int z_channels = 4;
+    int channels = 64;
+    int out_channels = 3;
+    int num_blocks = 3;
-    // input
-    ggml_tensor* conv_input_w; // [channels, z_channels, 3, 3]
-    ggml_tensor* conv_input_b; // [channels]
-    TAEBlock input_blocks[3];
-    ggml_tensor* conv_1_w; // [channels, channels, 3, 3]
+public:
+    TinyDecoder(int z_channels = 4)
+        : z_channels(z_channels) {
+        int index = 0;
-    // middle
-    TAEBlock middle_blocks[3];
-    ggml_tensor* conv_2_w; // [channels, channels, 3, 3]
+        blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(z_channels, channels, {3, 3}, {1, 1}, {1, 1}));
+        index++; // nn.ReLU()
-    // output
-    TAEBlock output_blocks[3];
-    ggml_tensor* conv_3_w; // [channels, channels, 3, 3]
-    // final
-    TAEBlock final_block;
-    ggml_tensor* conv_final_w; // [output_channels, channels, 3, 3]
-    ggml_tensor* conv_final_b; // [output_channels]
-    ggml_tensor* in_scale_1d3; // [1]
-    ggml_tensor* in_scale_3; // [1]
-    TinyDecoder() {
         for (int i = 0; i < num_blocks; i++) {
-            input_blocks[i].in_channels = channels;
-            input_blocks[i].out_channels = channels;
-            middle_blocks[i].in_channels = channels;
-            middle_blocks[i].out_channels = channels;
-            output_blocks[i].in_channels = channels;
-            output_blocks[i].out_channels = channels;
+            blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
         }
+        index++; // nn.Upsample()
+        blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false));
-        final_block.in_channels = channels;
-        final_block.out_channels = channels;
+        for (int i = 0; i < num_blocks; i++) {
+            blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
+        }
+        index++; // nn.Upsample()
+        blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false));
+        for (int i = 0; i < num_blocks; i++) {
+            blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
+        }
+        index++; // nn.Upsample()
+        blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false));
+        blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
+        blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
     }
-    size_t calculate_mem_size() {
-        size_t mem_size = channels * z_channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_input_w
-        mem_size += channels * ggml_type_size(GGML_TYPE_F32); // conv_input_b
+    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) {
+        // z: [n, z_channels, h, w]
+        // return: [n, out_channels, h*8, w*8]
-        for (int i = 0; i < num_blocks; i++) {
-            mem_size += input_blocks[i].calculate_mem_size();
-        }
-        mem_size += channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_1_w
-        for (int i = 0; i < num_blocks; i++) {
-            mem_size += middle_blocks[i].calculate_mem_size();
-        }
-        mem_size += channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_2_w
-        for (int i = 0; i < num_blocks; i++) {
-            mem_size += output_blocks[i].calculate_mem_size();
-        }
-        mem_size += channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_3_w
-        mem_size += final_block.calculate_mem_size();
-        mem_size += output_channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_input_w
-        mem_size += output_channels * ggml_type_size(GGML_TYPE_F32); // conv_input_b
-        return mem_size;
-    }
-    int get_num_tensors() {
-        int num_tensors = 9;
-        for (int i = 0; i < num_blocks; i++) {
-            num_tensors += input_blocks[i].get_num_tensors();
-            num_tensors += middle_blocks[i].get_num_tensors();
-            num_tensors += output_blocks[i].get_num_tensors();
-        }
-        num_tensors += final_block.get_num_tensors();
-        return num_tensors;
-    }
-    void init_params(ggml_allocr* alloc, ggml_context* ctx) {
-        conv_input_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, z_channels, channels);
-        conv_input_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, channels);
-        conv_1_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, channels);
-        conv_2_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, channels);
-        conv_3_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, channels);
-        conv_final_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, output_channels);
-        conv_final_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, output_channels);
-        for (int i = 0; i < num_blocks; i++) {
-            input_blocks[i].init_params(ctx);
-            middle_blocks[i].init_params(ctx);
-            output_blocks[i].init_params(ctx);
-        }
-        final_block.init_params(ctx);
-        // initialize constants scales
-        in_scale_1d3 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
-        in_scale_3 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
-        ggml_allocr_alloc(alloc, in_scale_1d3);
-        float scale_1d3 = 1.0f / 3.0f;
-        ggml_backend_tensor_set(in_scale_1d3, &scale_1d3, 0, sizeof(scale_1d3));
-        ggml_allocr_alloc(alloc, in_scale_3);
-        float scale_3 = 3.0f;
-        ggml_backend_tensor_set(in_scale_3, &scale_3, 0, sizeof(scale_3));
-    }
-    void map_by_name(std::map<std::string, ggml_tensor*>& tensors, std::string prefix) {
-        tensors[prefix + "0.weight"] = conv_input_w;
-        tensors[prefix + "0.bias"] = conv_input_b;
-        for (int i = 0; i < num_blocks; i++) {
-            input_blocks[i].map_by_name(tensors, prefix + std::to_string(i + 2) + ".");
-        }
-        tensors[prefix + "6.weight"] = conv_1_w;
-        for (int i = 0; i < num_blocks; i++) {
-            middle_blocks[i].map_by_name(tensors, prefix + std::to_string(i + 7) + ".");
-        }
-        tensors[prefix + "11.weight"] = conv_2_w;
-        for (int i = 0; i < num_blocks; i++) {
-            output_blocks[i].map_by_name(tensors, prefix + std::to_string(i + 12) + ".");
-        }
-        tensors[prefix + "16.weight"] = conv_3_w;
-        final_block.map_by_name(tensors, prefix + "17.");
-        tensors[prefix + "18.weight"] = conv_final_w;
-        tensors[prefix + "18.bias"] = conv_final_b;
-    }
-    ggml_tensor* forward(ggml_context* ctx, ggml_tensor* z) {
         // torch.tanh(x / 3) * 3
-        auto h = ggml_scale(ctx, z, in_scale_1d3);
+        auto h = ggml_scale(ctx, z, 1.0f / 3.0f);
         h = ggml_tanh_inplace(ctx, h);
-        h = ggml_scale(ctx, h, in_scale_3);
+        h = ggml_scale(ctx, h, 3.0f);
-        // conv(4, 64)
-        h = ggml_nn_conv_2d(ctx, h, conv_input_w, conv_input_b, 1, 1, 1, 1);
+        for (int i = 0; i < num_blocks * 3 + 10; i++) {
+            if (blocks.find(std::to_string(i)) == blocks.end()) {
+                if (i == 1) {
+                    h = ggml_relu_inplace(ctx, h);
+                } else {
+                    h = ggml_upscale(ctx, h, 2, GGML_SCALE_MODE_NEAREST);
+                }
+                continue;
+            }
+            auto block = std::dynamic_pointer_cast<UnaryBlock>(blocks[std::to_string(i)]);
-        // nn.ReLU()
-        h = ggml_relu_inplace(ctx, h);
-        // Block(64, 64), Block(64, 64), Block(64, 64)
-        for (int i = 0; i < num_blocks; i++) {
-            h = input_blocks[i].forward(ctx, h);
+            h = block->forward(ctx, h);
         }
-        // nn.Upsample(scale_factor=2)
-        h = ggml_upscale(ctx, h, 2);
-        // conv(64, 64, bias=False)
-        h = ggml_nn_conv_2d(ctx, h, conv_1_w, NULL, 1, 1, 1, 1);
-        // Block(64, 64), Block(64, 64), Block(64, 64)
-        for (int i = 0; i < num_blocks; i++) {
-            h = middle_blocks[i].forward(ctx, h);
-        }
-        // nn.Upsample(scale_factor=2)
-        h = ggml_upscale(ctx, h, 2);
-        // conv(64, 64, bias=False)
-        h = ggml_nn_conv_2d(ctx, h, conv_2_w, NULL, 1, 1, 1, 1);
-        // Block(64, 64), Block(64, 64), Block(64, 64)
-        for (int i = 0; i < num_blocks; i++) {
-            h = output_blocks[i].forward(ctx, h);
-        }
-        // nn.Upsample(scale_factor=2)
-        h = ggml_upscale(ctx, h, 2);
-        // conv(64, 64, bias=False)
-        h = ggml_nn_conv_2d(ctx, h, conv_3_w, NULL, 1, 1, 1, 1);
-        // Block(64, 64)
-        h = final_block.forward(ctx, h);
-        // conv(64, 3)
-        h = ggml_nn_conv_2d(ctx, h, conv_final_w, conv_final_b, 1, 1, 1, 1);
         return h;
     }
 };
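The decoder's forward loop probes `blocks.find()` because four indices are intentionally never registered: index 1 stands in for nn.ReLU and three later gaps stand in for the nn.Upsample layers. A standalone sketch (not part of the diff) reproducing that index layout:

// The constructor registers 19 slots (num_blocks * 3 + 10) but leaves
// indices 1, 5, 10 and 15 empty; forward() handles those as ReLU/upsample.
#include <cassert>
#include <set>

int main() {
    const int num_blocks = 3;
    std::set<int> gaps;
    int index = 0;
    index++;                                  // 0: Conv2d(z_channels, channels)
    gaps.insert(index++);                     // 1: nn.ReLU()
    for (int stage = 0; stage < 3; stage++) {
        for (int i = 0; i < num_blocks; i++)
            index++;                          // TAEBlocks
        gaps.insert(index++);                 // nn.Upsample(scale_factor=2)
        index++;                              // Conv2d(channels, channels, bias=False)
    }
    index++;                                  // final TAEBlock
    index++;                                  // Conv2d(channels, out_channels)
    assert(index == num_blocks * 3 + 10);     // 19 slots, indices 0..18
    assert(gaps == std::set<int>({1, 5, 10, 15}));
    return 0;
}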
-struct TinyAutoEncoder : public GGMLModule {
-    TinyEncoder encoder;
-    TinyDecoder decoder;
+class TAESD : public GGMLBlock {
+protected:
+    bool decode_only;
+public:
+    TAESD(bool decode_only = true, SDVersion version = VERSION_SD1)
+        : decode_only(decode_only) {
+        int z_channels = 4;
+        if (sd_version_is_dit(version)) {
+            z_channels = 16;
+        }
+        blocks["decoder.layers"] = std::shared_ptr<GGMLBlock>(new TinyDecoder(z_channels));
+        if (!decode_only) {
+            blocks["encoder.layers"] = std::shared_ptr<GGMLBlock>(new TinyEncoder(z_channels));
+        }
+    }
+    struct ggml_tensor* decode(struct ggml_context* ctx, struct ggml_tensor* z) {
+        auto decoder = std::dynamic_pointer_cast<TinyDecoder>(blocks["decoder.layers"]);
+        return decoder->forward(ctx, z);
+    }
+    struct ggml_tensor* encode(struct ggml_context* ctx, struct ggml_tensor* x) {
+        auto encoder = std::dynamic_pointer_cast<TinyEncoder>(blocks["encoder.layers"]);
+        return encoder->forward(ctx, x);
+    }
+};
+struct TinyAutoEncoder : public GGMLRunner {
+    TAESD taesd;
     bool decode_only = false;
-    TinyAutoEncoder(bool decoder_only_ = true)
-        : decode_only(decoder_only_) {
-        name = "tae";
+    TinyAutoEncoder(ggml_backend_t backend,
+                    std::map<std::string, enum ggml_type>& tensor_types,
+                    const std::string prefix,
+                    bool decoder_only = true,
+                    SDVersion version = VERSION_SD1)
+        : decode_only(decoder_only),
+          taesd(decoder_only, version),
+          GGMLRunner(backend) {
+        taesd.init(params_ctx, tensor_types, prefix);
     }
-    size_t calculate_mem_size() {
-        size_t mem_size = decoder.calculate_mem_size();
-        if (!decode_only) {
-            mem_size += encoder.calculate_mem_size();
-        }
-        mem_size += 1024; // padding
-        return mem_size;
+    std::string get_desc() {
+        return "taesd";
     }
-    size_t get_num_tensors() {
-        size_t num_tensors = decoder.get_num_tensors();
-        if (!decode_only) {
-            num_tensors += encoder.get_num_tensors();
-        }
-        return num_tensors;
-    }
-    void init_params() {
-        ggml_allocr* alloc = ggml_allocr_new_from_buffer(params_buffer);
-        decoder.init_params(alloc, params_ctx);
-        if (!decode_only) {
-            encoder.init_params(params_ctx);
-        }
-        // alloc all tensors linked to this context
-        for (struct ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != NULL; t = ggml_get_next_tensor(params_ctx, t)) {
-            if (t->data == NULL) {
-                ggml_allocr_alloc(alloc, t);
-            }
-        }
-        ggml_allocr_free(alloc);
-    }
-    void map_by_name(std::map<std::string, ggml_tensor*>& tensors) {
-        decoder.map_by_name(tensors, "decoder.layers.");
-        encoder.map_by_name(tensors, "encoder.layers.");
-    }
-    bool load_from_file(const std::string& file_path, ggml_backend_t backend) {
-        LOG_INFO("loading taesd from '%s'", file_path.c_str());
-        if (!alloc_params_buffer(backend)) {
-            return false;
-        }
+    bool load_from_file(const std::string& file_path) {
+        LOG_INFO("loading taesd from '%s', decode_only = %s", file_path.c_str(), decode_only ? "true" : "false");
+        alloc_params_buffer();
         std::map<std::string, ggml_tensor*> taesd_tensors;
-        // prepare memory for the weights
-        {
-            init_params();
-            map_by_name(taesd_tensors);
-        }
-        std::map<std::string, struct ggml_tensor*> tensors_need_to_load;
+        taesd.get_param_tensors(taesd_tensors);
         std::set<std::string> ignore_tensors;
-        for (auto& pair : taesd_tensors) {
-            const std::string& name = pair.first;
-            if (decode_only && starts_with(name, "encoder")) {
-                ignore_tensors.insert(name);
-                continue;
-            }
-            tensors_need_to_load.insert(pair);
+        if (decode_only) {
+            ignore_tensors.insert("encoder.");
         }
         ModelLoader model_loader;
@@ -525,7 +226,7 @@ struct TinyAutoEncoder : public GGMLModule {
             return false;
         }
-        bool success = model_loader.load_tensors(tensors_need_to_load, backend, ignore_tensors);
+        bool success = model_loader.load_tensors(taesd_tensors, backend, ignore_tensors);
         if (!success) {
             LOG_ERROR("load tae tensors from model loader failed");
@@ -537,57 +238,23 @@ struct TinyAutoEncoder : public GGMLModule {
    }
    struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) {
-        // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
-        static size_t buf_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
-        static std::vector<uint8_t> buf(buf_size);
-        struct ggml_init_params params = {
-            /*.mem_size =*/buf_size,
-            /*.mem_buffer =*/buf.data(),
-            /*.no_alloc =*/true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
-        };
-        // LOG_DEBUG("mem_size %u ", params.mem_size);
-        struct ggml_context* ctx0 = ggml_init(params);
-        struct ggml_cgraph* gf = ggml_new_graph(ctx0);
-        struct ggml_tensor* z_ = NULL;
-        // it's performing a compute, check if backend isn't cpu
-        if (!ggml_backend_is_cpu(backend)) {
-            // pass input tensors to gpu memory
-            z_ = ggml_dup_tensor(ctx0, z);
-            ggml_allocr_alloc(compute_allocr, z_);
-            // pass data to device backend
-            if (!ggml_allocr_is_measure(compute_allocr)) {
-                ggml_backend_tensor_set(z_, z->data, 0, ggml_nbytes(z));
-            }
-        } else {
-            z_ = z;
-        }
-        struct ggml_tensor* out = decode_graph ? decoder.forward(ctx0, z_) : encoder.forward(ctx0, z_);
+        struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
+        z = to_backend(z);
+        struct ggml_tensor* out = decode_graph ? taesd.decode(compute_ctx, z) : taesd.encode(compute_ctx, z);
         ggml_build_forward_expand(gf, out);
-        ggml_free(ctx0);
         return gf;
     }
-    void alloc_compute_buffer(struct ggml_tensor* x, bool decode) {
-        auto get_graph = [&]() -> struct ggml_cgraph* {
-            return build_graph(x, decode);
-        };
-        GGMLModule::alloc_compute_buffer(get_graph);
-    }
-    void compute(struct ggml_tensor* work_result, int n_threads, struct ggml_tensor* z, bool decode_graph) {
+    void compute(const int n_threads,
+                 struct ggml_tensor* z,
+                 bool decode_graph,
+                 struct ggml_tensor** output,
+                 struct ggml_context* output_ctx = NULL) {
         auto get_graph = [&]() -> struct ggml_cgraph* {
            return build_graph(z, decode_graph);
        };
-        GGMLModule::compute(get_graph, n_threads, work_result);
+        GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
    }
};
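A hypothetical decode-path sketch built only from the signatures visible in this diff (constructor, load_from_file, compute); the model path, the prefix, and the latent tensor are placeholders supplied by the caller:

// Not the repo's own call site; a usage sketch under stated assumptions.
#include "tae.hpp"  // assumed header location for TinyAutoEncoder

ggml_tensor* decode_latent(ggml_backend_t backend,
                           std::map<std::string, enum ggml_type>& tensor_types,
                           ggml_tensor* latent,      // [n, z_channels, h, w]
                           ggml_context* output_ctx,
                           int n_threads) {
    TinyAutoEncoder tae(backend, tensor_types, /*prefix=*/"",  // prefix is a placeholder
                        /*decoder_only=*/true, VERSION_SD1);
    if (!tae.load_from_file("taesd.safetensors")) {  // placeholder path
        return nullptr;
    }
    ggml_tensor* image = nullptr;
    tae.compute(n_threads, latent, /*decode_graph=*/true, &image, output_ctx);
    return image;  // decoder upsamples 8x: [n, 3, h*8, w*8]
}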

2
thirdparty/.clang-format vendored Normal file

@@ -0,0 +1,2 @@
+DisableFormat: true
+SortIncludes: Never

10
thirdparty/LICENSE.darts_clone.txt vendored Normal file

@@ -0,0 +1,10 @@
+Copyright (c) 2008-2011, Susumu Yata
+All rights reserved.
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+- Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+- Neither the name of the <ORGANIZATION> nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


@@ -1,2 +1,3 @@
 - json.hpp library from: https://github.com/nlohmann/json
-- ZIP Library from: https://github.com/kuba--/zip
+- ZIP Library from: https://github.com/kuba--/zip
+- darts.h from: https://github.com/google/sentencepiece/tree/master/third_party/darts_clone

1926
thirdparty/darts.h vendored Normal file

File diff suppressed because it is too large

2585
thirdparty/stb_image_resize.h vendored Normal file

File diff suppressed because it is too large

thirdparty/stb_image_write.h vendored

@@ -177,7 +177,7 @@ STBIWDEF int stbi_write_png(char const *filename, int w, int h, int comp, const
 STBIWDEF int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data);
 STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data);
 STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
-STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality);
+STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality, const char* parameters = NULL);
 #ifdef STBIW_WINDOWS_UTF8
 STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
@@ -1412,7 +1412,7 @@ static int stbiw__jpg_processDU(stbi__write_context *s, int *bitBuf, int *bitCnt
     return DU[0];
 }
-static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, int comp, const void* data, int quality) {
+static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, int comp, const void* data, int quality, const char* parameters) {
     // Constants that don't pollute global namespace
     static const unsigned char std_dc_luminance_nrcodes[] = {0,0,1,5,1,1,1,1,1,1,0,0,0,0,0,0,0};
     static const unsigned char std_dc_luminance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
@@ -1521,6 +1521,20 @@ static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, in
     s->func(s->context, (void*)YTable, sizeof(YTable));
     stbiw__putc(s, 1);
     s->func(s->context, UVTable, sizeof(UVTable));
+    // comment block with parameters of generation
+    if(parameters != NULL) {
+        stbiw__putc(s, 0xFF /* marker prefix */ );
+        stbiw__putc(s, 0xFE /* COM (comment) marker */ );
+        size_t param_length = std::min(2 + strlen("parameters") + 1 + strlen(parameters) + 1, (size_t) 0xFFFF);
+        stbiw__putc(s, param_length >> 8); // no need to mask, length < 65536
+        stbiw__putc(s, param_length & 0xFF);
+        s->func(s->context, (void*)"parameters", strlen("parameters") + 1); // std::string is zero-terminated
+        s->func(s->context, (void*)parameters, std::min(param_length, (size_t) 65534) - 2 - strlen("parameters") - 1);
+        if(param_length > 65534) stbiw__putc(s, 0); // always zero-terminate for safety
+        if(param_length & 1) stbiw__putc(s, 0xFF); // pad to even length
+    }
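For context, a JPEG comment segment is the two-byte marker 0xFF 0xFE (COM) followed by a big-endian 16-bit length that counts the length field itself plus the payload; the block above packs the literal key "parameters", a NUL, and the parameter string into one such segment. A minimal standalone sketch of the same layout, where the byte sink is an assumption standing in for stb's s->func:

// Writes a JPEG COM segment into `out`; `out` replaces stb's callback sink.
#include <cstdint>
#include <string>
#include <vector>

static void write_jpeg_comment(std::vector<uint8_t>& out, const std::string& text) {
    size_t payload = text.size();
    if (payload > 0xFFFF - 2)        // length field counts its own 2 bytes
        payload = 0xFFFF - 2;
    uint16_t length = (uint16_t)(payload + 2);
    out.push_back(0xFF);             // marker prefix
    out.push_back(0xFE);             // COM (comment) marker
    out.push_back(length >> 8);      // big-endian length
    out.push_back(length & 0xFF);
    out.insert(out.end(), text.begin(), text.begin() + payload);
}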
     s->func(s->context, (void*)head1, sizeof(head1));
     s->func(s->context, (void*)(std_dc_luminance_nrcodes+1), sizeof(std_dc_luminance_nrcodes)-1);
     s->func(s->context, (void*)std_dc_luminance_values, sizeof(std_dc_luminance_values));
@@ -1625,16 +1639,16 @@ STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x,
 {
     stbi__write_context s = { 0 };
     stbi__start_write_callbacks(&s, func, context);
-    return stbi_write_jpg_core(&s, x, y, comp, (void *) data, quality);
+    return stbi_write_jpg_core(&s, x, y, comp, (void *) data, quality, NULL);
 }
 #ifndef STBI_WRITE_NO_STDIO
-STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality)
+STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality, const char* parameters)
 {
     stbi__write_context s = { 0 };
     if (stbi__start_write_file(&s,filename)) {
-        int r = stbi_write_jpg_core(&s, x, y, comp, data, quality);
+        int r = stbi_write_jpg_core(&s, x, y, comp, data, quality, parameters);
         stbi__end_write_file(&s);
         return r;
     } else
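A hedged usage sketch for the extended prototype above; the pixel buffer, dimensions, and parameters string are placeholders:

// The new trailing argument embeds the generation parameters as a JPEG
// comment segment; passing NULL (the default) writes a plain JPEG.
#include <vector>
// #include "stb_image_write.h"  // with STB_IMAGE_WRITE_IMPLEMENTATION defined once

void save_with_metadata(const std::vector<unsigned char>& rgb, int w, int h) {
    stbi_write_jpg("out.jpg", w, h, 3, rgb.data(), 90,
                   "a photo of a cat, steps=20, cfg=7");  // placeholder prompt string
}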

232
thirdparty/zip.c vendored

@@ -36,6 +36,7 @@
#include <unistd.h>
#endif
#define USE_EXTERNAL_MZCRC
#include "miniz.h"
#include "zip.h"
@@ -1834,3 +1835,234 @@ int zip_extract(const char *zipname, const char *dir,
return zip_archive_extract(&zip_archive, dir, on_extract, arg);
}
#if defined(__SSE4_2__) || defined(__AVX512F__)
#include <immintrin.h>
#endif
// Phil Katz 32-Bit Cyclic Redundancy Check Uber Alles
// Goes 73 GiB/s on an AMD Ryzen Threadripper PRO 7995WX
// "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
// V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
mz_ulong mz_crc32(mz_ulong init, const uint8_t *buf, size_t len) {
uint32_t crc = ~init;
#if defined(__AVX512F__) && defined(__VPCLMULQDQ__) && defined(__PCLMUL__)
if (len >= 256) {
_Alignas(__m512) static const uint64_t k1k2[] = {
0x011542778a, 0x01322d1430, 0x011542778a, 0x01322d1430,
0x011542778a, 0x01322d1430, 0x011542778a, 0x01322d1430,
};
_Alignas(__m512) static const uint64_t k3k4[] = {
0x0154442bd4, 0x01c6e41596, 0x0154442bd4, 0x01c6e41596,
0x0154442bd4, 0x01c6e41596, 0x0154442bd4, 0x01c6e41596,
};
_Alignas(__m512) static const uint64_t k5k6[] = {
0x01751997d0,
0x00ccaa009e,
};
_Alignas(__m512) static const uint64_t k7k8[] = {
0x0163cd6124,
0x0000000000,
};
_Alignas(__m512) static const uint64_t poly[] = {
0x01db710641,
0x01f7011641,
};
__m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
__m128i a0, a1, a2, a3;
x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
x0 = _mm512_load_si512((__m512i *)k1k2);
buf += 256;
len -= 256;
while (len >= 256) {
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);
y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
x1 = _mm512_xor_si512(x1, x5);
x2 = _mm512_xor_si512(x2, x6);
x3 = _mm512_xor_si512(x3, x7);
x4 = _mm512_xor_si512(x4, x8);
x1 = _mm512_xor_si512(x1, y5);
x2 = _mm512_xor_si512(x2, y6);
x3 = _mm512_xor_si512(x3, y7);
x4 = _mm512_xor_si512(x4, y8);
buf += 256;
len -= 256;
}
x0 = _mm512_load_si512((__m512i *)k3k4);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x2);
x1 = _mm512_xor_si512(x1, x5);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x3);
x1 = _mm512_xor_si512(x1, x5);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x4);
x1 = _mm512_xor_si512(x1, x5);
while (len >= 64) {
x2 = _mm512_loadu_si512((__m512i *)buf);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x2);
x1 = _mm512_xor_si512(x1, x5);
buf += 64;
len -= 64;
}
a0 = _mm_load_si128((__m128i *)k5k6);
a1 = _mm512_extracti32x4_epi32(x1, 0);
a2 = _mm512_extracti32x4_epi32(x1, 1);
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
a1 = _mm_xor_si128(a1, a3);
a1 = _mm_xor_si128(a1, a2);
a2 = _mm512_extracti32x4_epi32(x1, 2);
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
a1 = _mm_xor_si128(a1, a3);
a1 = _mm_xor_si128(a1, a2);
a2 = _mm512_extracti32x4_epi32(x1, 3);
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
a1 = _mm_xor_si128(a1, a3);
a1 = _mm_xor_si128(a1, a2);
a2 = _mm_clmulepi64_si128(a1, a0, 0x10);
a3 = _mm_setr_epi32(~0, 0, ~0, 0);
a1 = _mm_srli_si128(a1, 8);
a1 = _mm_xor_si128(a1, a2);
a0 = _mm_loadl_epi64((__m128i *)k7k8);
a2 = _mm_srli_si128(a1, 4);
a1 = _mm_and_si128(a1, a3);
a1 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_xor_si128(a1, a2);
a0 = _mm_load_si128((__m128i *)poly);
a2 = _mm_and_si128(a1, a3);
a2 = _mm_clmulepi64_si128(a2, a0, 0x10);
a2 = _mm_and_si128(a2, a3);
a2 = _mm_clmulepi64_si128(a2, a0, 0x00);
a1 = _mm_xor_si128(a1, a2);
crc = _mm_extract_epi32(a1, 1);
}
#endif
#if defined(__SSE4_2__) && defined(__PCLMUL__)
if (len >= 64) {
_Alignas(__m128) static const uint64_t k1k2[] = {
0x0154442bd4,
0x01c6e41596,
};
_Alignas(__m128) static const uint64_t k3k4[] = {
0x01751997d0,
0x00ccaa009e,
};
_Alignas(__m128) static const uint64_t k5k0[] = {
0x0163cd6124,
0x0000000000,
};
_Alignas(__m128) static const uint64_t poly[] = {
0x01db710641,
0x01f7011641,
};
__m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));
x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
x0 = _mm_load_si128((__m128i *)k1k2);
buf += 64;
len -= 64;
while (len >= 64) {
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));
x1 = _mm_xor_si128(x1, x5);
x2 = _mm_xor_si128(x2, x6);
x3 = _mm_xor_si128(x3, x7);
x4 = _mm_xor_si128(x4, x8);
x1 = _mm_xor_si128(x1, y5);
x2 = _mm_xor_si128(x2, y6);
x3 = _mm_xor_si128(x3, y7);
x4 = _mm_xor_si128(x4, y8);
buf += 64;
len -= 64;
}
x0 = _mm_load_si128((__m128i *)k3k4);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x2);
x1 = _mm_xor_si128(x1, x5);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x3);
x1 = _mm_xor_si128(x1, x5);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x4);
x1 = _mm_xor_si128(x1, x5);
while (len >= 16) {
x2 = _mm_loadu_si128((__m128i *)buf);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x2);
x1 = _mm_xor_si128(x1, x5);
buf += 16;
len -= 16;
}
x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
x3 = _mm_setr_epi32(~0, 0, ~0, 0);
x1 = _mm_srli_si128(x1, 8);
x1 = _mm_xor_si128(x1, x2);
x0 = _mm_loadl_epi64((__m128i *)k5k0);
x2 = _mm_srli_si128(x1, 4);
x1 = _mm_and_si128(x1, x3);
x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_xor_si128(x1, x2);
x0 = _mm_load_si128((__m128i *)poly);
x2 = _mm_and_si128(x1, x3);
x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
x2 = _mm_and_si128(x2, x3);
x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
x1 = _mm_xor_si128(x1, x2);
crc = _mm_extract_epi32(x1, 1);
}
#endif
static uint32_t tab[256];
if (!tab[255]) {
// generates table for byte-wise crc calculation on the polynomial
// x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1
uint32_t polynomial = 0xedb88320; // bits are reversed
for (int d = 0; d < 256; ++d) {
uint32_t r = d;
for (int i = 0; i < 8; ++i)
r = r >> 1 ^ (r & 1 ? polynomial : 0);
tab[d] = r;
}
}
for (size_t i = 0; i < len; ++i)
crc = crc >> 8 ^ tab[(crc & 255) ^ buf[i]];
return ~crc & 0xffffffff;
}
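Whichever SIMD path runs, the result must agree with the table-driven fallback above; the standard sanity check for this reflected polynomial (0xEDB88320) is that the CRC-32 of the ASCII string "123456789" equals 0xCBF43926. A self-contained check of the same byte-wise logic:

// Bit-at-a-time CRC-32, equivalent to the table fallback in mz_crc32.
#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t crc32_ref(uint32_t init, const uint8_t* buf, size_t len) {
    uint32_t crc = ~init;
    for (size_t i = 0; i < len; ++i) {
        crc ^= buf[i];
        for (int b = 0; b < 8; ++b)
            crc = (crc >> 1) ^ (crc & 1 ? 0xEDB88320u : 0);
    }
    return ~crc;
}

int main() {
    const char* s = "123456789";
    assert(crc32_ref(0, (const uint8_t*)s, strlen(s)) == 0xCBF43926u);
    return 0;
}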

1337
unet.hpp

File diff suppressed because it is too large

upscaler.cpp

@@ -6,7 +6,7 @@
 struct UpscalerGGML {
     ggml_backend_t backend = NULL; // general backend
     ggml_type model_data_type = GGML_TYPE_F16;
-    ESRGAN esrgan_upscaler;
+    std::shared_ptr<ESRGAN> esrgan_upscaler;
     std::string esrgan_path;
     int n_threads;
@@ -15,22 +15,39 @@
     }
     bool load_from_file(const std::string& esrgan_path) {
-#ifdef SD_USE_CUBLAS
+#ifdef SD_USE_CUDA
         LOG_DEBUG("Using CUDA backend");
         backend = ggml_backend_cuda_init(0);
 #endif
 #ifdef SD_USE_METAL
         LOG_DEBUG("Using Metal backend");
-        ggml_metal_log_set_callback(ggml_log_callback_default, nullptr);
+        ggml_log_set(ggml_log_callback_default, nullptr);
         backend = ggml_backend_metal_init();
 #endif
+#ifdef SD_USE_VULKAN
+        LOG_DEBUG("Using Vulkan backend");
+        backend = ggml_backend_vk_init(0);
+#endif
+#ifdef SD_USE_OPENCL
+        LOG_DEBUG("Using OpenCL backend");
+        backend = ggml_backend_opencl_init();
+#endif
+#ifdef SD_USE_SYCL
+        LOG_DEBUG("Using SYCL backend");
+        backend = ggml_backend_sycl_init(0);
+#endif
+        ModelLoader model_loader;
+        if (!model_loader.init_from_file(esrgan_path)) {
+            LOG_ERROR("init model loader from file failed: '%s'", esrgan_path.c_str());
+        }
+        model_loader.set_wtype_override(model_data_type);
         if (!backend) {
             LOG_DEBUG("Using CPU backend");
             backend = ggml_backend_cpu_init();
         }
         LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
-        if (!esrgan_upscaler.load_from_file(esrgan_path, backend)) {
+        esrgan_upscaler = std::make_shared<ESRGAN>(backend, model_loader.tensor_storages_types);
+        if (!esrgan_upscaler->load_from_file(esrgan_path)) {
             return false;
         }
         return true;
@@ -39,8 +56,8 @@
     sd_image_t upscale(sd_image_t input_image, uint32_t upscale_factor) {
         // upscale_factor, unused for RealESRGAN_x4plus_anime_6B.pth
         sd_image_t upscaled_image = {0, 0, 0, NULL};
-        int output_width = (int)input_image.width * esrgan_upscaler.scale;
-        int output_height = (int)input_image.height * esrgan_upscaler.scale;
+        int output_width = (int)input_image.width * esrgan_upscaler->scale;
+        int output_height = (int)input_image.height * esrgan_upscaler->scale;
         LOG_INFO("upscaling from (%i x %i) to (%i x %i)",
                  input_image.width, input_image.height, output_width, output_height);
@@ -62,15 +79,11 @@
         ggml_tensor* upscaled = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, output_width, output_height, 3, 1);
         auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
-            if (init) {
-                esrgan_upscaler.alloc_compute_buffer(in);
-            } else {
-                esrgan_upscaler.compute(out, n_threads, in);
-            }
+            esrgan_upscaler->compute(n_threads, in, &out);
         };
         int64_t t0 = ggml_time_ms();
-        sd_tiling(input_image_tensor, upscaled, esrgan_upscaler.scale, esrgan_upscaler.tile_size, 0.25f, on_tiling);
-        esrgan_upscaler.free_compute_buffer();
+        sd_tiling(input_image_tensor, upscaled, esrgan_upscaler->scale, esrgan_upscaler->tile_size, 0.25f, on_tiling);
+        esrgan_upscaler->free_compute_buffer();
         ggml_tensor_clamp(upscaled, 0.f, 1.f);
         uint8_t* upscaled_data = sd_tensor_to_image(upscaled);
         ggml_free(upscale_ctx);
@@ -91,8 +104,7 @@
 struct upscaler_ctx_t {
 };
 upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
-                                 int n_threads,
-                                 enum sd_type_t wtype) {
+                                 int n_threads) {
     upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t));
     if (upscaler_ctx == NULL) {
         return NULL;
482
util.cpp

@@ -1,6 +1,7 @@
#include "util.h"
#include <stdarg.h>
#include <algorithm>
#include <cmath>
#include <codecvt>
#include <fstream>
#include <locale>
@@ -9,6 +10,7 @@
#include <thread>
#include <unordered_set>
#include <vector>
#include "preprocessing.hpp"
#if defined(__APPLE__) && defined(__MACH__)
#include <sys/sysctl.h>
@@ -20,9 +22,13 @@
#include <unistd.h>
#endif
#include "ggml/ggml.h"
#include "ggml-cpu.h"
#include "ggml.h"
#include "stable-diffusion.h"
#define STB_IMAGE_RESIZE_IMPLEMENTATION
#include "stb_image_resize.h"
bool ends_with(const std::string& str, const std::string& ending) {
if (str.length() >= ending.length()) {
return (str.compare(str.length() - ending.length(), ending.length(), ending) == 0);
@@ -38,6 +44,13 @@ bool starts_with(const std::string& str, const std::string& start) {
return false;
}
bool contains(const std::string& str, const std::string& substr) {
if (str.find(substr) != std::string::npos) {
return true;
}
return false;
}
void replace_all_chars(std::string& str, char target, char replacement) {
for (size_t i = 0; i < str.length(); ++i) {
if (str[i] == target) {
@@ -72,6 +85,70 @@ bool is_directory(const std::string& path) {
return (attributes != INVALID_FILE_ATTRIBUTES && (attributes & FILE_ATTRIBUTE_DIRECTORY));
}
std::string get_full_path(const std::string& dir, const std::string& filename) {
std::string full_path = dir + "\\" + filename;
WIN32_FIND_DATA find_file_data;
HANDLE hFind = FindFirstFile(full_path.c_str(), &find_file_data);
if (hFind != INVALID_HANDLE_VALUE) {
FindClose(hFind);
return full_path;
} else {
return "";
}
}
std::vector<std::string> get_files_from_dir(const std::string& dir) {
std::vector<std::string> files;
WIN32_FIND_DATA findFileData;
HANDLE hFind;
char currentDirectory[MAX_PATH];
GetCurrentDirectory(MAX_PATH, currentDirectory);
char directoryPath[MAX_PATH]; // this is absolute path
sprintf(directoryPath, "%s\\%s\\*", currentDirectory, dir.c_str());
// Find the first file in the directory
hFind = FindFirstFile(directoryPath, &findFileData);
bool isAbsolutePath = false;
// Check if the directory was found
if (hFind == INVALID_HANDLE_VALUE) {
printf("Unable to find directory. Try with original path \n");
char directoryPathAbsolute[MAX_PATH];
sprintf(directoryPathAbsolute, "%s*", dir.c_str());
hFind = FindFirstFile(directoryPathAbsolute, &findFileData);
isAbsolutePath = true;
if (hFind == INVALID_HANDLE_VALUE) {
printf("Absolute path was also wrong.\n");
return files;
}
}
// Loop through all files in the directory
do {
// Check if the found file is a regular file (not a directory)
if (!(findFileData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)) {
if (isAbsolutePath) {
files.push_back(dir + "\\" + std::string(findFileData.cFileName));
} else {
files.push_back(std::string(currentDirectory) + "\\" + dir + "\\" + std::string(findFileData.cFileName));
}
}
} while (FindNextFile(hFind, &findFileData) != 0);
// Close the handle
FindClose(hFind);
sort(files.begin(), files.end());
return files;
}
#else // Unix
#include <dirent.h>
#include <sys/stat.h>
@@ -86,6 +163,47 @@ bool is_directory(const std::string& path) {
return (stat(path.c_str(), &buffer) == 0 && S_ISDIR(buffer.st_mode));
}
// TODO: add windows version
std::string get_full_path(const std::string& dir, const std::string& filename) {
DIR* dp = opendir(dir.c_str());
if (dp != nullptr) {
struct dirent* entry;
while ((entry = readdir(dp)) != nullptr) {
if (strcasecmp(entry->d_name, filename.c_str()) == 0) {
closedir(dp);
return dir + "/" + entry->d_name;
}
}
closedir(dp);
}
return "";
}
std::vector<std::string> get_files_from_dir(const std::string& dir) {
std::vector<std::string> files;
DIR* dp = opendir(dir.c_str());
if (dp != nullptr) {
struct dirent* entry;
while ((entry = readdir(dp)) != nullptr) {
std::string fname = dir + "/" + entry->d_name;
if (!is_directory(fname))
files.push_back(fname);
}
closedir(dp);
}
sort(files.begin(), files.end());
return files;
}
#endif
// get_num_physical_cores is copy from
@@ -126,6 +244,9 @@ int32_t get_num_physical_cores() {
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}
static sd_progress_cb_t sd_progress_cb = NULL;
void* sd_progress_cb_data = NULL;
std::u32string utf8_to_utf32(const std::string& utf8_str) {
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
return converter.from_bytes(utf8_str);
@@ -141,7 +262,7 @@ std::u32string unicode_value_to_utf32(int unicode_value) {
return utf32_string;
}
-std::string sd_basename(const std::string& path) {
+static std::string sd_basename(const std::string& path) {
size_t pos = path.find_last_of('/');
if (pos != std::string::npos) {
return path.substr(pos + 1);
@@ -169,7 +290,64 @@ std::string path_join(const std::string& p1, const std::string& p2) {
return p1 + "/" + p2;
}
std::vector<std::string> splitString(const std::string& str, char delimiter) {
std::vector<std::string> result;
size_t start = 0;
size_t end = str.find(delimiter);
while (end != std::string::npos) {
result.push_back(str.substr(start, end - start));
start = end + 1;
end = str.find(delimiter, start);
}
// Add the last segment after the last delimiter
result.push_back(str.substr(start));
return result;
}
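A quick behavior note for splitString() above: segments between consecutive delimiters are kept as empty strings, which matters when callers pass lists with optional fields. A small check (identifier values are illustrative only):

#include <cassert>
// std::vector<std::string> splitString(const std::string&, char);  // declared above

void splitString_example() {
    auto parts = splitString("unet,clip_l,,t5xxl", ',');
    assert(parts.size() == 4);
    assert(parts[2].empty());  // the empty segment survives
}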
sd_image_t* preprocess_id_image(sd_image_t* img) {
int shortest_edge = 224;
int size = shortest_edge;
sd_image_t* resized = NULL;
uint32_t w = img->width;
uint32_t h = img->height;
uint32_t c = img->channel;
// 1. do resize using stb_resize functions
unsigned char* buf = (unsigned char*)malloc(sizeof(unsigned char) * 3 * size * size);
if (!stbir_resize_uint8(img->data, w, h, 0,
buf, size, size, 0,
c)) {
fprintf(stderr, "%s: resize operation failed \n ", __func__);
return resized;
}
// 2. do center crop (likely unnecessary due to step 1)
// 3. do rescale
// 4. do normalize
// 3 and 4 will need to be done in float format.
resized = new sd_image_t{(uint32_t)shortest_edge,
(uint32_t)shortest_edge,
3,
buf};
return resized;
}
void pretty_progress(int step, int steps, float time) {
if (sd_progress_cb) {
sd_progress_cb(step, steps, time, sd_progress_cb_data);
return;
}
if (step == 0) {
return;
}
std::string progress = " |";
int max_progress = 50;
int32_t current = (int32_t)(step * 1.f * max_progress / steps);
@@ -183,7 +361,7 @@ void pretty_progress(int step, int steps, float time) {
}
}
progress += "|";
-    printf(time > 1.0f ? "\r%s %i/%i - %.2fs/it" : "\r%s %i/%i - %.2fit/s",
+    printf(time > 1.0f ? "\r%s %i/%i - %.2fs/it" : "\r%s %i/%i - %.2fit/s\033[K",
progress.c_str(), step, steps,
time > 1.0f || time == 0 ? time : (1.0f / time));
fflush(stdout); // for linux
@@ -192,6 +370,24 @@
}
}
std::string ltrim(const std::string& s) {
auto it = std::find_if(s.begin(), s.end(), [](int ch) {
return !std::isspace(ch);
});
return std::string(it, s.end());
}
std::string rtrim(const std::string& s) {
auto it = std::find_if(s.rbegin(), s.rend(), [](int ch) {
return !std::isspace(ch);
});
return std::string(s.begin(), it.base());
}
std::string trim(const std::string& s) {
return rtrim(ltrim(s));
}
static sd_log_cb_t sd_log_cb = NULL;
void* sd_log_cb_data = NULL;
@@ -201,23 +397,13 @@ void log_printf(sd_log_level_t level, const char* file, int line, const char* fo
va_list args;
va_start(args, format);
-    const char* level_str = "DEBUG";
-    if (level == SD_LOG_INFO) {
-        level_str = "INFO ";
-    } else if (level == SD_LOG_WARN) {
-        level_str = "WARN ";
-    } else if (level == SD_LOG_ERROR) {
-        level_str = "ERROR";
-    }
-    static char log_buffer[LOG_BUFFER_SIZE];
-    int written = snprintf(log_buffer, LOG_BUFFER_SIZE, "[%s] %s:%-4d - ", level_str, sd_basename(file).c_str(), line);
+    static char log_buffer[LOG_BUFFER_SIZE + 1];
+    int written = snprintf(log_buffer, LOG_BUFFER_SIZE, "%s:%-4d - ", sd_basename(file).c_str(), line);
     if (written >= 0 && written < LOG_BUFFER_SIZE) {
         vsnprintf(log_buffer + written, LOG_BUFFER_SIZE - written, format, args);
-        strncat(log_buffer, "\n", LOG_BUFFER_SIZE - strlen(log_buffer) - 1);
     }
+    strncat(log_buffer, "\n", LOG_BUFFER_SIZE - strlen(log_buffer));
if (sd_log_cb) {
sd_log_cb(level, log_buffer, sd_log_cb_data);
@@ -230,12 +416,14 @@ void sd_set_log_callback(sd_log_cb_t cb, void* data) {
sd_log_cb = cb;
sd_log_cb_data = data;
}
void sd_set_progress_callback(sd_progress_cb_t cb, void* data) {
sd_progress_cb = cb;
sd_progress_cb_data = data;
}
const char* sd_get_system_info() {
static char buffer[1024];
std::stringstream ss;
ss << "System Info: \n";
ss << " BLAS = " << ggml_cpu_has_blas() << std::endl;
ss << " SSE3 = " << ggml_cpu_has_sse3() << std::endl;
ss << " AVX = " << ggml_cpu_has_avx() << std::endl;
ss << " AVX2 = " << ggml_cpu_has_avx2() << std::endl;
@@ -253,6 +441,258 @@ const char* sd_get_system_info() {
return buffer;
}
-const char* sd_type_name(enum sd_type_t type) {
-    return ggml_type_name((ggml_type)type);
sd_image_f32_t sd_image_t_to_sd_image_f32_t(sd_image_t image) {
sd_image_f32_t converted_image;
converted_image.width = image.width;
converted_image.height = image.height;
converted_image.channel = image.channel;
// Allocate memory for float data
converted_image.data = (float*)malloc(image.width * image.height * image.channel * sizeof(float));
for (int i = 0; i < image.width * image.height * image.channel; i++) {
// Convert uint8_t to float
converted_image.data[i] = (float)image.data[i];
}
return converted_image;
}
// Function to perform double linear interpolation
float interpolate(float v1, float v2, float v3, float v4, float x_ratio, float y_ratio) {
return v1 * (1 - x_ratio) * (1 - y_ratio) + v2 * x_ratio * (1 - y_ratio) + v3 * (1 - x_ratio) * y_ratio + v4 * x_ratio * y_ratio;
}
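interpolate() above is standard bilinear interpolation: with corners ordered (top-left, top-right, bottom-left, bottom-right), the value at the cell center is the average of the four corners. A worked check against the function as written:

#include <cassert>
#include <cmath>

void interpolate_example() {
    // x_ratio = y_ratio = 0.5 weights each corner by 0.25.
    float center = interpolate(0.0f, 1.0f, 2.0f, 3.0f, 0.5f, 0.5f);
    assert(std::fabs(center - 1.5f) < 1e-6f);  // (0 + 1 + 2 + 3) / 4
}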
sd_image_f32_t resize_sd_image_f32_t(sd_image_f32_t image, int target_width, int target_height) {
sd_image_f32_t resized_image;
resized_image.width = target_width;
resized_image.height = target_height;
resized_image.channel = image.channel;
// Allocate memory for resized float data
resized_image.data = (float*)malloc(target_width * target_height * image.channel * sizeof(float));
for (int y = 0; y < target_height; y++) {
for (int x = 0; x < target_width; x++) {
float original_x = (float)x * image.width / target_width;
float original_y = (float)y * image.height / target_height;
int x1 = (int)original_x;
int y1 = (int)original_y;
int x2 = x1 + 1;
int y2 = y1 + 1;
for (int k = 0; k < image.channel; k++) {
float v1 = *(image.data + y1 * image.width * image.channel + x1 * image.channel + k);
float v2 = *(image.data + y1 * image.width * image.channel + x2 * image.channel + k);
float v3 = *(image.data + y2 * image.width * image.channel + x1 * image.channel + k);
float v4 = *(image.data + y2 * image.width * image.channel + x2 * image.channel + k);
float x_ratio = original_x - x1;
float y_ratio = original_y - y1;
float value = interpolate(v1, v2, v3, v4, x_ratio, y_ratio);
*(resized_image.data + y * target_width * image.channel + x * image.channel + k) = value;
}
}
}
return resized_image;
}
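One caveat worth noting about the resize loop above: for the rightmost column and bottom row, x2 = x1 + 1 (and likewise y2) can index one texel past the source image. A hedged suggestion, not part of the diff, that keeps the bilinear lookup in bounds:

// Clamp the second sample coordinate when original_x/original_y land on the
// last column or row of the source image.
int x2 = x1 + 1 < (int)image.width ? x1 + 1 : x1;
int y2 = y1 + 1 < (int)image.height ? y1 + 1 : y1;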
void normalize_sd_image_f32_t(sd_image_f32_t image, float means[3], float stds[3]) {
for (int y = 0; y < image.height; y++) {
for (int x = 0; x < image.width; x++) {
for (int k = 0; k < image.channel; k++) {
int index = (y * image.width + x) * image.channel + k;
image.data[index] = (image.data[index] - means[k]) / stds[k];
}
}
}
}
// Constants for means and std
float means[3] = {0.48145466, 0.4578275, 0.40821073};
float stds[3] = {0.26862954, 0.26130258, 0.27577711};
// Function to clip and preprocess sd_image_f32_t
sd_image_f32_t clip_preprocess(sd_image_f32_t image, int size) {
float scale = (float)size / fmin(image.width, image.height);
// Interpolation
int new_width = (int)(scale * image.width);
int new_height = (int)(scale * image.height);
float* resized_data = (float*)malloc(new_width * new_height * image.channel * sizeof(float));
for (int y = 0; y < new_height; y++) {
for (int x = 0; x < new_width; x++) {
float original_x = (float)x * image.width / new_width;
float original_y = (float)y * image.height / new_height;
int x1 = (int)original_x;
int y1 = (int)original_y;
int x2 = x1 + 1;
int y2 = y1 + 1;
for (int k = 0; k < image.channel; k++) {
float v1 = *(image.data + y1 * image.width * image.channel + x1 * image.channel + k);
float v2 = *(image.data + y1 * image.width * image.channel + x2 * image.channel + k);
float v3 = *(image.data + y2 * image.width * image.channel + x1 * image.channel + k);
float v4 = *(image.data + y2 * image.width * image.channel + x2 * image.channel + k);
float x_ratio = original_x - x1;
float y_ratio = original_y - y1;
float value = interpolate(v1, v2, v3, v4, x_ratio, y_ratio);
*(resized_data + y * new_width * image.channel + x * image.channel + k) = value;
}
}
}
// Clip and preprocess
int h = (new_height - size) / 2;
int w = (new_width - size) / 2;
sd_image_f32_t result;
result.width = size;
result.height = size;
result.channel = image.channel;
result.data = (float*)malloc(size * size * image.channel * sizeof(float));
for (int k = 0; k < image.channel; k++) {
for (int i = 0; i < size; i++) {
for (int j = 0; j < size; j++) {
*(result.data + i * size * image.channel + j * image.channel + k) =
fmin(fmax(*(resized_data + (i + h) * new_width * image.channel + (j + w) * image.channel + k), 0.0f), 255.0f) / 255.0f;
}
}
}
// Free allocated memory
free(resized_data);
// Normalize
for (int k = 0; k < image.channel; k++) {
for (int i = 0; i < size; i++) {
for (int j = 0; j < size; j++) {
// *(result.data + i * size * image.channel + j * image.channel + k) = 0.5f;
int offset = i * size * image.channel + j * image.channel + k;
float value = *(result.data + offset);
value = (value - means[k]) / stds[k];
// value = 0.5f;
*(result.data + offset) = value;
}
}
}
return result;
}
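The means/stds constants above are the standard CLIP image-normalization values, and clip_preprocess() expects float input, so the typical call chain starts from the uint8-to-float converter defined earlier. A usage sketch; the input image is assumed to be a valid RGB sd_image_t from the caller:

sd_image_f32_t prepare_clip_input(sd_image_t input_image) {
    sd_image_f32_t f32 = sd_image_t_to_sd_image_f32_t(input_image);  // uint8 -> float
    // Resize short side to 224, center-crop, scale to [0,1], then normalize.
    sd_image_f32_t clip_in = clip_preprocess(f32, 224);
    free(f32.data);  // clip_preprocess allocates a fresh output buffer
    return clip_in;
}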
// Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/prompt_parser.py#L345
//
// Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
// Accepted tokens are:
// (abc) - increases attention to abc by a multiplier of 1.1
// (abc:3.12) - increases attention to abc by a multiplier of 3.12
// [abc] - decreases attention to abc by a multiplier of 1.1
// \( - literal character '('
// \[ - literal character '['
// \) - literal character ')'
// \] - literal character ']'
// \\ - literal character '\'
// anything else - just text
//
// >>> parse_prompt_attention('normal text')
// [['normal text', 1.0]]
// >>> parse_prompt_attention('an (important) word')
// [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
// >>> parse_prompt_attention('(unbalanced')
// [['unbalanced', 1.1]]
// >>> parse_prompt_attention('\(literal\]')
// [['(literal]', 1.0]]
// >>> parse_prompt_attention('(unnecessary)(parens)')
// [['unnecessaryparens', 1.1]]
// >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
// [['a ', 1.0],
// ['house', 1.5730000000000004],
// [' ', 1.1],
// ['on', 1.0],
// [' a ', 1.1],
// ['hill', 0.55],
// [', sun, ', 1.1],
// ['sky', 1.4641000000000006],
// ['.', 1.1]]
std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text) {
std::vector<std::pair<std::string, float>> res;
std::vector<int> round_brackets;
std::vector<int> square_brackets;
float round_bracket_multiplier = 1.1f;
float square_bracket_multiplier = 1 / 1.1f;
std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|[^\\()\[\]:]+|:)");
std::regex re_break(R"(\s*\bBREAK\b\s*)");
auto multiply_range = [&](int start_position, float multiplier) {
for (int p = start_position; p < res.size(); ++p) {
res[p].second *= multiplier;
}
};
std::smatch m;
std::string remaining_text = text;
while (std::regex_search(remaining_text, m, re_attention)) {
std::string text = m[0];
std::string weight = m[1];
if (text == "(") {
round_brackets.push_back((int)res.size());
} else if (text == "[") {
square_brackets.push_back((int)res.size());
} else if (!weight.empty()) {
if (!round_brackets.empty()) {
multiply_range(round_brackets.back(), std::stof(weight));
round_brackets.pop_back();
}
} else if (text == ")" && !round_brackets.empty()) {
multiply_range(round_brackets.back(), round_bracket_multiplier);
round_brackets.pop_back();
} else if (text == "]" && !square_brackets.empty()) {
multiply_range(square_brackets.back(), square_bracket_multiplier);
square_brackets.pop_back();
} else if (text == "\\(") {
res.push_back({text.substr(1), 1.0f});
} else {
res.push_back({text, 1.0f});
}
remaining_text = m.suffix();
}
for (int pos : round_brackets) {
multiply_range(pos, round_bracket_multiplier);
}
for (int pos : square_brackets) {
multiply_range(pos, square_bracket_multiplier);
}
if (res.empty()) {
res.push_back({"", 1.0f});
}
int i = 0;
while (i + 1 < res.size()) {
if (res[i].second == res[i + 1].second) {
res[i].first += res[i + 1].first;
res.erase(res.begin() + i + 1);
} else {
++i;
}
}
return res;
}
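A usage sketch for parse_prompt_attention(); the expected output follows the docstring above, where a parenthesised span gets the 1.1 multiplier:

#include <cstdio>

void prompt_attention_example() {
    auto parsed = parse_prompt_attention("an (important) word");
    for (auto& p : parsed)
        printf("'%s' -> %.2f\n", p.first.c_str(), p.second);
    // 'an ' -> 1.00
    // 'important' -> 1.10
    // ' word' -> 1.00
}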

33
util.h

@@ -3,11 +3,16 @@
#include <cstdint>
#include <string>
#include <vector>
#include "stable-diffusion.h"
#define SAFE_STR(s) ((s) ? (s) : "")
#define BOOL_STR(b) ((b) ? "true" : "false")
bool ends_with(const std::string& str, const std::string& ending);
bool starts_with(const std::string& str, const std::string& start);
bool contains(const std::string& str, const std::string& substr);
std::string format(const char* fmt, ...);
@@ -15,19 +20,43 @@ void replace_all_chars(std::string& str, char target, char replacement);
bool file_exists(const std::string& filename);
bool is_directory(const std::string& path);
std::string get_full_path(const std::string& dir, const std::string& filename);
std::vector<std::string> get_files_from_dir(const std::string& dir);
std::u32string utf8_to_utf32(const std::string& utf8_str);
std::string utf32_to_utf8(const std::u32string& utf32_str);
std::u32string unicode_value_to_utf32(int unicode_value);
-std::string sd_basename(const std::string& path);
+sd_image_t* preprocess_id_image(sd_image_t* img);
+// std::string sd_basename(const std::string& path);
typedef struct {
uint32_t width;
uint32_t height;
uint32_t channel;
float* data;
} sd_image_f32_t;
void normalize_sd_image_f32_t(sd_image_f32_t image, float means[3], float stds[3]);
sd_image_f32_t sd_image_t_to_sd_image_f32_t(sd_image_t image);
sd_image_f32_t resize_sd_image_f32_t(sd_image_f32_t image, int target_width, int target_height);
sd_image_f32_t clip_preprocess(sd_image_f32_t image, int size);
std::string path_join(const std::string& p1, const std::string& p2);
std::vector<std::string> splitString(const std::string& str, char delimiter);
void pretty_progress(int step, int steps, float time);
void log_printf(sd_log_level_t level, const char* file, int line, const char* format, ...);
std::string trim(const std::string& s);
std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text);
#define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_WARN(format, ...) log_printf(SD_LOG_WARN, __FILE__, __LINE__, format, ##__VA_ARGS__)

1096
vae.hpp

File diff suppressed because it is too large

2424069
vocab.hpp

File diff suppressed because it is too large