fix: resolve embedding loading issue when calling generate_image multiple times (#1078)

fix: avoid crash loading LoRAs with bf16 weights (#1077)
feat: align the spatial size to the corresponding multiple (#1073)
2025-12-13 05:48:56 +00:00 · 2025-12-12 23:08:12 +08:00 · 2025-12-12 22:36:54 +08:00 · 2025-12-10 23:15:08 +08:00 · 2025-12-10 22:25:19 +08:00 · 2025-12-10 00:26:07 +08:00
63 changed files with 497355 additions and 4090 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -65,7 +65,7 @@ jobs:

      - name: Get commit hash
        id: commit
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }}
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        uses: pr-mpt/actions-commit-hash@v2

      - name: Fetch system info
@ -118,7 +118,7 @@ jobs:

      - name: Get commit hash
        id: commit
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }}
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        uses: pr-mpt/actions-commit-hash@v2

      - name: Fetch system info
@ -163,9 +163,7 @@ jobs:
          - build: "avx512"
            defines: "-DGGML_NATIVE=OFF -DGGML_AVX512=ON -DGGML_AVX=ON -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
          - build: "cuda12"
-            defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES=90;89;86;80;75"
-          # - build: "rocm5.5"
-          #   defines: '-G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1100;gfx1102;gfx1030" -DSD_BUILD_SHARED_LIBS=ON'
+            defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES='61;70;75;80;86;89;90;100;120'"
          - build: 'vulkan'
            defines: "-DSD_VULKAN=ON -DSD_BUILD_SHARED_LIBS=ON"
    steps:
@ -178,28 +176,15 @@ jobs:
      - name: Install cuda-toolkit
        id: cuda-toolkit
        if: ${{ matrix.build == 'cuda12' }}
-        uses: Jimver/cuda-toolkit@v0.2.19
+        uses: Jimver/cuda-toolkit@v0.2.22
        with:
-          cuda: "12.6.2"
+          cuda: "12.8.1"
          method: "network"
          sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'

-      - name: Install rocm-toolkit
-        id: rocm-toolkit
-        if: ${{ matrix.build == 'rocm5.5' }}
-        uses: Cyberhan123/rocm-toolkit@v0.1.0
-        with:
-          rocm: "5.5.0"
-
-      - name: Install Ninja
-        id: install-ninja
-        if: ${{ matrix.build == 'rocm5.5' }}
-        uses: urkle/action-get-ninja@v1
-        with:
-          version: 1.11.1
      - name: Install Vulkan SDK
        id: get_vulkan
-        if: ${{ matrix.build == 'vulkan' }} https://sdk.lunarg.com/sdk/download/1.4.328.1/windows/vulkansdk-windows-X64-1.4.328.1.exe
+        if: ${{ matrix.build == 'vulkan' }}
        run: |
          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
@ -277,6 +262,104 @@ jobs:
          path: |
            sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip

+  windows-latest-cmake-hip:
+    runs-on: windows-2022
+
+    env:
+      HIPSDK_INSTALLER_VERSION: "25.Q3"
+      GPU_TARGETS: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
+
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: recursive
+
+      - name: Cache ROCm Installation
+        id: cache-rocm
+        uses: actions/cache@v4
+        with:
+          path: C:\Program Files\AMD\ROCm
+          key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: windows-latest-cmake-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-x64
+          evict-old-files: 1d
+
+      - name: Install ROCm
+        if: steps.cache-rocm.outputs.cache-hit != 'true'
+        run: |
+          $ErrorActionPreference = "Stop"
+          write-host "Downloading AMD HIP SDK Installer"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          write-host "Installing AMD HIP SDK"
+          $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
+          $completed = $proc.WaitForExit(600000)
+          if (-not $completed) {
+              Write-Error "ROCm installation timed out after 10 minutes. Killing the process"
+              $proc.Kill()
+              exit 1
+          }
+          if ($proc.ExitCode -ne 0) {
+              Write-Error "ROCm installation failed with exit code $($proc.ExitCode)"
+              exit 1
+          }
+          write-host "Completed AMD HIP SDK installation"
+
+      - name: Verify ROCm
+        run: |
+          # Find and test ROCm installation
+          $clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
+          if (-not $clangPath) {
+            Write-Error "ROCm installation not found"
+            exit 1
+          }
+          & $clangPath.FullName --version
+          # Set HIP_PATH environment variable for later steps
+          echo "HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)" >> $env:GITHUB_ENV
+
+      - name: Build
+        run: |
+          mkdir build
+          cd build
+          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
+          cmake .. `
+            -G "Unix Makefiles" `
+            -DSD_HIPBLAS=ON `
+            -DSD_BUILD_SHARED_LIBS=ON `
+            -DGGML_NATIVE=OFF `
+            -DCMAKE_C_COMPILER=clang `
+            -DCMAKE_CXX_COMPILER=clang++ `
+            -DCMAKE_BUILD_TYPE=Release `
+            -DGPU_TARGETS="${{ env.GPU_TARGETS }}"
+          cmake --build . --config Release --parallel ${env:NUMBER_OF_PROCESSORS}
+
+      - name: Get commit hash
+        id: commit
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: pr-mpt/actions-commit-hash@v2
+
+      - name: Pack artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          md "build\bin\rocblas\library\"
+          md "build\bin\hipblaslt\library"
+          cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
+          cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\"
+          7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip .\build\bin\*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
+          path: |
+            sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
+
  release:
    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

@ -286,6 +369,7 @@ jobs:
      - ubuntu-latest-cmake
      - macOS-latest-cmake
      - windows-latest-cmake
+      - windows-latest-cmake-hip

    steps:
      - name: Clone
--- a/.gitignore
+++ b/.gitignore
@ -12,3 +12,4 @@ test/
 output*.png
 models*
 *.log
+preview.png
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -87,6 +87,38 @@ file(GLOB SD_LIB_SOURCES
    "*.hpp"
 )

+find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
+if(GIT_EXE)
+    execute_process(COMMAND ${GIT_EXE} describe --tags --abbrev=7 --dirty=+
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        OUTPUT_VARIABLE SDCPP_BUILD_VERSION
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        ERROR_QUIET
+    )
+    execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        OUTPUT_VARIABLE SDCPP_BUILD_COMMIT
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        ERROR_QUIET
+    )
+endif()
+
+if(NOT SDCPP_BUILD_VERSION)
+    set(SDCPP_BUILD_VERSION unknown)
+endif()
+message(STATUS "stable-diffusion.cpp version ${SDCPP_BUILD_VERSION}")
+
+if(NOT SDCPP_BUILD_COMMIT)
+    set(SDCPP_BUILD_COMMIT unknown)
+endif()
+message(STATUS "stable-diffusion.cpp commit ${SDCPP_BUILD_COMMIT}")
+
+set_property(
+  SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/version.cpp
+  APPEND PROPERTY COMPILE_DEFINITIONS
+  SDCPP_BUILD_COMMIT=${SDCPP_BUILD_COMMIT} SDCPP_BUILD_VERSION=${SDCPP_BUILD_VERSION}
+)
+
 if(SD_BUILD_SHARED_LIBS)
    message("-- Build shared library")
    message(${SD_LIB_SOURCES})
--- a/README.md
+++ b/README.md
@ -1,5 +1,5 @@
 <p align="center">
-  <img src="./assets/cat_with_sd_cpp_42.png" width="360x">
+  <img src="./assets/logo.png" width="360x">
 </p>

 # stable-diffusion.cpp
@ -15,6 +15,12 @@ API and command-line option may change frequently.***

 ## 🔥Important News

+* **2025/12/01** 🚀 stable-diffusion.cpp now supports **Z-Image**  
+  👉 Details: [PR #1020](https://github.com/leejet/stable-diffusion.cpp/pull/1020)
+
+* **2025/11/30** 🚀 stable-diffusion.cpp now supports **FLUX.2-dev**  
+  👉 Details: [PR #1016](https://github.com/leejet/stable-diffusion.cpp/pull/1016)
+
 * **2025/10/13** 🚀 stable-diffusion.cpp now supports **Qwen-Image-Edit / Qwen-Image-Edit 2509**  
  👉 Details: [PR #877](https://github.com/leejet/stable-diffusion.cpp/pull/877)

@ -37,10 +43,13 @@ API and command-line option may change frequently.***
    - SDXL, [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo)
    - [Some SD1.x and SDXL distilled models](./docs/distilled_sd.md)
    - [SD3/SD3.5](./docs/sd3.md)
-    - [Flux-dev/Flux-schnell](./docs/flux.md)
+    - [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
+    - [FLUX.2-dev](./docs/flux2.md)
    - [Chroma](./docs/chroma.md)
    - [Chroma1-Radiance](./docs/chroma_radiance.md)
    - [Qwen Image](./docs/qwen_image.md)
+    - [Z-Image](./docs/z_image.md)
+    - [Ovis-Image](./docs/ovis_image.md)
  - Image Edit Models
    - [FLUX.1-Kontext-dev](./docs/kontext.md)
    - [Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md)
@ -81,7 +90,9 @@ API and command-line option may change frequently.***
    - [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457)
    - `DPM++ 2S a`
    - [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952)
- Cross-platform reproducibility (`--rng cuda`, consistent with the `stable-diffusion-webui GPU RNG`)
+- Cross-platform reproducibility
+    - `--rng cuda`, default, consistent with the `stable-diffusion-webui GPU RNG`
+    - `--rng cpu`, consistent with the `comfyui RNG`
 - Embeds generation parameters into PNG output as a webui-compatible text string

 ## Quick Start
@ -94,7 +105,7 @@ API and command-line option may change frequently.***
 ### Download model weights

 - download weights(.ckpt or .safetensors or .gguf). For example
-    - Stable Diffusion v1.5 from https://huggingface.co/runwayml/stable-diffusion-v1-5
+    - Stable Diffusion v1.5 from https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5

    ```sh
     curl -L -O https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
@ -116,12 +127,15 @@ If you want to improve performance or reduce VRAM/RAM usage, please refer to [pe

 - [SD1.x/SD2.x/SDXL](./docs/sd.md)
 - [SD3/SD3.5](./docs/sd3.md)
- [Flux-dev/Flux-schnell](./docs/flux.md)
+- [FLUX.1-dev/FLUX.1-schnell](./docs/flux.md)
+- [FLUX.2-dev](./docs/flux2.md)
 - [FLUX.1-Kontext-dev](./docs/kontext.md)
 - [Chroma](./docs/chroma.md)
 - [🔥Qwen Image](./docs/qwen_image.md)
 - [🔥Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md)
 - [🔥Wan2.1/Wan2.2](./docs/wan.md)
+- [🔥Z-Image](./docs/z_image.md)
+- [Ovis-Image](./docs/ovis_image.md)
 - [LoRA](./docs/lora.md)
 - [LCM/LCM-LoRA](./docs/lcm.md)
 - [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)
--- a/assets/flux2/example.png
+++ b/assets/flux2/example.png
--- a/assets/logo.png
+++ b/assets/logo.png
--- a/assets/ovis_image/example.png
+++ b/assets/ovis_image/example.png
--- a/assets/z_image/bf16.png
+++ b/assets/z_image/bf16.png
--- a/assets/z_image/q2_K.png
+++ b/assets/z_image/q2_K.png
--- a/assets/z_image/q3_K.png
+++ b/assets/z_image/q3_K.png
--- a/assets/z_image/q4_0.png
+++ b/assets/z_image/q4_0.png
--- a/assets/z_image/q4_K.png
+++ b/assets/z_image/q4_K.png
--- a/assets/z_image/q5_0.png
+++ b/assets/z_image/q5_0.png
--- a/assets/z_image/q6_K.png
+++ b/assets/z_image/q6_K.png
--- a/assets/z_image/q8_0.png
+++ b/assets/z_image/q8_0.png
--- a/clip.hpp
+++ b/clip.hpp
@ -3,34 +3,10 @@

 #include "ggml_extend.hpp"
 #include "model.h"
+#include "tokenize_util.h"

 /*================================================== CLIPTokenizer ===================================================*/

-__STATIC_INLINE__ std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remove_lora(std::string text) {
-    std::regex re("<lora:([^:]+):([^>]+)>");
-    std::smatch matches;
-    std::unordered_map<std::string, float> filename2multiplier;
-
-    while (std::regex_search(text, matches, re)) {
-        std::string filename = matches[1].str();
-        float multiplier     = std::stof(matches[2].str());
-
-        text = std::regex_replace(text, re, "", std::regex_constants::format_first_only);
-
-        if (multiplier == 0.f) {
-            continue;
-        }
-
-        if (filename2multiplier.find(filename) == filename2multiplier.end()) {
-            filename2multiplier[filename] = multiplier;
-        } else {
-            filename2multiplier[filename] += multiplier;
-        }
-    }
-
-    return std::make_pair(filename2multiplier, text);
-}
-
 __STATIC_INLINE__ std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
    std::vector<std::pair<int, std::u32string>> byte_unicode_pairs;
    std::set<int> byte_set;
@ -72,6 +48,8 @@ private:
    int encoder_len;
    int bpe_len;

+    std::vector<std::string> special_tokens;
+
 public:
    const std::string UNK_TOKEN = "<|endoftext|>";
    const std::string BOS_TOKEN = "<|startoftext|>";
@ -117,6 +95,15 @@ private:
        return pairs;
    }

+    bool is_special_token(const std::string& token) {
+        for (auto& special_token : special_tokens) {
+            if (special_token == token) {
+                return true;
+            }
+        }
+        return false;
+    }
+
 public:
    CLIPTokenizer(int pad_token_id = 49407, const std::string& merges_utf8_str = "")
        : PAD_TOKEN_ID(pad_token_id) {
@ -125,6 +112,8 @@ public:
        } else {
            load_from_merges(ModelLoader::load_merges());
        }
+        add_special_token("<|startoftext|>");
+        add_special_token("<|endoftext|>");
    }

    void load_from_merges(const std::string& merges_utf8_str) {
@ -201,6 +190,10 @@ public:
        }
    }

+    void add_special_token(const std::string& token) {
+        special_tokens.push_back(token);
+    }
+
    std::u32string bpe(const std::u32string& token) {
        std::vector<std::u32string> word;

@ -379,25 +372,54 @@ public:
        return trim(text);
    }

+    std::vector<std::string> token_split(const std::string& text) {
+        std::regex pat(R"('s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)",
+                       std::regex::icase);
+        std::sregex_iterator iter(text.begin(), text.end(), pat);
+        std::sregex_iterator end;
+
+        std::vector<std::string> result;
+        for (; iter != end; ++iter) {
+            result.emplace_back(iter->str());
+        }
+
+        return result;
+    }
+
    std::vector<int> encode(std::string text, on_new_token_cb_t on_new_token_cb) {
        std::string original_text = text;
        std::vector<int32_t> bpe_tokens;
        text = whitespace_clean(text);
        std::transform(text.begin(), text.end(), text.begin(), [](unsigned char c) { return std::tolower(c); });

-        std::regex pat(R"(<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)",
-                       std::regex::icase);
-
-        std::smatch matches;
        std::string str = text;
        std::vector<std::string> token_strs;
-        while (std::regex_search(str, matches, pat)) {
-            bool skip = on_new_token_cb(str, bpe_tokens);
-            if (skip) {
+
+        auto splited_texts = split_with_special_tokens(text, special_tokens);
+
+        for (auto& splited_text : splited_texts) {
+            LOG_DEBUG("token %s", splited_text.c_str());
+            if (is_special_token(splited_text)) {
+                LOG_DEBUG("special %s", splited_text.c_str());
+                bool skip = on_new_token_cb(splited_text, bpe_tokens);
+                if (skip) {
+                    token_strs.push_back(splited_text);
+                    continue;
+                }
                continue;
            }
-            for (auto& token : matches) {
-                std::string token_str = token.str();
+
+            auto tokens = token_split(splited_text);
+            for (auto& token : tokens) {
+                if (on_new_token_cb != nullptr) {
+                    bool skip = on_new_token_cb(token, bpe_tokens);
+                    if (skip) {
+                        token_strs.push_back(token);
+                        continue;
+                    }
+                }
+
+                std::string token_str = token;
                std::u32string utf32_token;
                for (int i = 0; i < token_str.length(); i++) {
                    unsigned char b = token_str[i];
@ -417,14 +439,13 @@ public:
                bpe_tokens.push_back(encoder[bpe_str]);
                token_strs.push_back(utf32_to_utf8(bpe_str));
            }
-            str = matches.suffix();
        }
-        std::stringstream ss;
-        ss << "[";
-        for (auto token : token_strs) {
-            ss << "\"" << token << "\", ";
-        }
-        ss << "]";
+        // std::stringstream ss;
+        // ss << "[";
+        // for (auto token : token_strs) {
+        //     ss << "\"" << token << "\", ";
+        // }
+        // ss << "]";
        // LOG_DEBUG("split prompt \"%s\" to tokens %s", original_text.c_str(), ss.str().c_str());
        // printf("split prompt \"%s\" to tokens %s \n", original_text.c_str(), ss.str().c_str());
        return bpe_tokens;
@ -936,7 +957,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
                                    size_t max_token_idx         = 0,
                                    bool return_pooled           = false,
                                    int clip_skip                = -1) {
-        struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
+        struct ggml_cgraph* gf = new_graph_custom(2048);

        input_ids = to_backend(input_ids);

@ -963,7 +984,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
        return gf;
    }

-    void compute(const int n_threads,
+    bool compute(const int n_threads,
                 struct ggml_tensor* input_ids,
                 int num_custom_embeddings,
                 void* custom_embeddings_data,
@ -975,7 +996,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
        auto get_graph = [&]() -> struct ggml_cgraph* {
            return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip);
        };
-        GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
+        return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
    }
 };

--- a/common.hpp
+++ b/common.hpp
@ -182,31 +182,21 @@ protected:
    int64_t dim_in;
    int64_t dim_out;

-    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
-        enum ggml_type wtype      = get_type(prefix + "proj.weight", tensor_storage_map, GGML_TYPE_F32);
-        enum ggml_type bias_wtype = GGML_TYPE_F32;
-        params["proj.weight"]     = ggml_new_tensor_2d(ctx, wtype, dim_in, dim_out * 2);
-        params["proj.bias"]       = ggml_new_tensor_1d(ctx, bias_wtype, dim_out * 2);
-    }
-
 public:
    GEGLU(int64_t dim_in, int64_t dim_out)
-        : dim_in(dim_in), dim_out(dim_out) {}
+        : dim_in(dim_in), dim_out(dim_out) {
+        blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out * 2));
+    }

    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
        // x: [ne3, ne2, ne1, dim_in]
        // return: [ne3, ne2, ne1, dim_out]
-        struct ggml_tensor* w = params["proj.weight"];
-        struct ggml_tensor* b = params["proj.bias"];
+        auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);

-        auto x_w    = ggml_view_2d(ctx->ggml_ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], 0);                        // [dim_out, dim_in]
-        auto x_b    = ggml_view_1d(ctx->ggml_ctx, b, b->ne[0] / 2, 0);                                            // [dim_out, dim_in]
-        auto gate_w = ggml_view_2d(ctx->ggml_ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], w->nb[1] * w->ne[1] / 2);  // [dim_out, ]
-        auto gate_b = ggml_view_1d(ctx->ggml_ctx, b, b->ne[0] / 2, b->nb[0] * b->ne[0] / 2);                      // [dim_out, ]
-
-        auto x_in = x;
-        x         = ggml_ext_linear(ctx->ggml_ctx, x_in, x_w, x_b);        // [ne3, ne2, ne1, dim_out]
-        auto gate = ggml_ext_linear(ctx->ggml_ctx, x_in, gate_w, gate_b);  // [ne3, ne2, ne1, dim_out]
+        x          = proj->forward(ctx, x);  // [ne3, ne2, ne1, dim_out*2]
+        auto x_vec = ggml_ext_chunk(ctx->ggml_ctx, x, 2, 0);
+        x          = x_vec[0];  // [ne3, ne2, ne1, dim_out]
+        auto gate  = x_vec[1];  // [ne3, ne2, ne1, dim_out]

        gate = ggml_gelu_inplace(ctx->ggml_ctx, gate);

@ -252,14 +242,18 @@ public:
        }

        // net_1 is nn.Dropout(), skip for inference
-        float scale = 1.f;
+        bool force_prec_f32 = false;
+        float scale         = 1.f;
        if (precision_fix) {
            scale = 1.f / 128.f;
+#ifdef SD_USE_VULKAN
+            force_prec_f32 = true;
+#endif
        }
        // The purpose of the scale here is to prevent NaN issues in certain situations.
        // For example, when using Vulkan without enabling force_prec_f32,
        // or when using CUDA but the weights are k-quants.
-        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, false, scale));
+        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, force_prec_f32, scale));
    }

    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
@ -410,6 +404,22 @@ protected:
    int64_t context_dim = 768;  // hidden_size, 1024 for VERSION_SD2
    bool use_linear     = false;

+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {
+        auto iter = tensor_storage_map.find(prefix + "proj_out.weight");
+        if (iter != tensor_storage_map.end()) {
+            int64_t inner_dim = n_head * d_head;
+            if (iter->second.n_dims == 4 && use_linear) {
+                use_linear         = false;
+                blocks["proj_in"]  = std::make_shared<Conv2d>(in_channels, inner_dim, std::pair{1, 1});
+                blocks["proj_out"] = std::make_shared<Conv2d>(inner_dim, in_channels, std::pair{1, 1});
+            } else if (iter->second.n_dims == 2 && !use_linear) {
+                use_linear         = true;
+                blocks["proj_in"]  = std::make_shared<Linear>(in_channels, inner_dim);
+                blocks["proj_out"] = std::make_shared<Linear>(inner_dim, in_channels);
+            }
+        }
+    }
+
 public:
    SpatialTransformer(int64_t in_channels,
                       int64_t n_head,
--- a/conditioner.hpp
+++ b/conditioner.hpp
@ -2,7 +2,7 @@
 #define __CONDITIONER_HPP__

 #include "clip.hpp"
-#include "qwenvl.hpp"
+#include "llm.hpp"
 #include "t5.hpp"

 struct SDCondition {
@ -34,6 +34,7 @@ struct Conditioner {
    virtual void free_params_buffer()                                                      = 0;
    virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors)    = 0;
    virtual size_t get_params_buffer_size()                                                = 0;
+    virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {}
    virtual std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(ggml_context* work_ctx,
                                                                                          int n_threads,
                                                                                          const ConditionerParams& conditioner_params) {
@ -55,20 +56,26 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
    std::shared_ptr<CLIPTextModelRunner> text_model2;

    std::string trigger_word = "img";  // should be user settable
-    std::string embd_dir;
+    std::map<std::string, std::string> embedding_map;
    int32_t num_custom_embeddings   = 0;
    int32_t num_custom_embeddings_2 = 0;
    std::vector<uint8_t> token_embed_custom;
-    std::vector<std::string> readed_embeddings;
+    std::map<std::string, std::pair<int, int>> embedding_pos_map;

    FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
                                      bool offload_params_to_cpu,
                                      const String2TensorStorage& tensor_storage_map,
-                                      const std::string& embd_dir,
+                                      const std::map<std::string, std::string>& orig_embedding_map,
                                      SDVersion version = VERSION_SD1,
                                      PMVersion pv      = PM_VERSION_1)
-        : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
-        bool force_clip_f32 = embd_dir.size() > 0;
+        : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407) {
+        for (const auto& kv : orig_embedding_map) {
+            std::string name = kv.first;
+            std::transform(name.begin(), name.end(), name.begin(), [](unsigned char c) { return std::tolower(c); });
+            embedding_map[name] = kv.second;
+            tokenizer.add_special_token(name);
+        }
+        bool force_clip_f32 = !embedding_map.empty();
        if (sd_version_is_sd1(version)) {
            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
        } else if (sd_version_is_sd2(version)) {
@ -108,15 +115,25 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
        return buffer_size;
    }

+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        text_model->set_weight_adapter(adapter);
+        if (sd_version_is_sdxl(version)) {
+            text_model2->set_weight_adapter(adapter);
+        }
+    }
+
    bool load_embedding(std::string embd_name, std::string embd_path, std::vector<int32_t>& bpe_tokens) {
-        // the order matters
        ModelLoader model_loader;
-        if (!model_loader.init_from_file(embd_path)) {
+        if (!model_loader.init_from_file_and_convert_name(embd_path)) {
            LOG_ERROR("embedding '%s' failed", embd_name.c_str());
            return false;
        }
-        if (std::find(readed_embeddings.begin(), readed_embeddings.end(), embd_name) != readed_embeddings.end()) {
+        auto iter = embedding_pos_map.find(embd_name);
+        if (iter != embedding_pos_map.end()) {
            LOG_DEBUG("embedding already read in: %s", embd_name.c_str());
+            for (int i = iter->second.first; i < iter->second.second; i++) {
+                bpe_tokens.push_back(text_model->model.vocab_size + i);
+            }
            return true;
        }
        struct ggml_init_params params;
@ -147,7 +164,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
            return true;
        };
        model_loader.load_tensors(on_load, 1);
-        readed_embeddings.push_back(embd_name);
+        int pos_start = num_custom_embeddings;
        if (embd) {
            int64_t hidden_size = text_model->model.hidden_size;
            token_embed_custom.resize(token_embed_custom.size() + ggml_nbytes(embd));
@ -174,6 +191,11 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
            }
            LOG_DEBUG("embedding '%s' applied, custom embeddings: %i (text model 2)", embd_name.c_str(), num_custom_embeddings_2);
        }
+        int pos_end = num_custom_embeddings;
+        if (pos_end == pos_start) {
+            return false;
+        }
+        embedding_pos_map[embd_name] = std::pair{pos_start, pos_end};
        return true;
    }

@ -188,25 +210,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {

    std::vector<int> convert_token_to_id(std::string text) {
        auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
-            size_t word_end       = str.find(",");
-            std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end);
-            embd_name             = trim(embd_name);
-            std::string embd_path = get_full_path(embd_dir, embd_name + ".pt");
-            if (embd_path.size() == 0) {
-                embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
+            auto iter = embedding_map.find(str);
+            if (iter == embedding_map.end()) {
+                return false;
            }
-            if (embd_path.size() == 0) {
-                embd_path = get_full_path(embd_dir, embd_name + ".safetensors");
-            }
-            if (embd_path.size() > 0) {
-                if (load_embedding(embd_name, embd_path, bpe_tokens)) {
-                    if (word_end != std::string::npos) {
-                        str = str.substr(word_end);
-                    } else {
-                        str = "";
-                    }
-                    return true;
-                }
+            std::string embedding_path = iter->second;
+            if (load_embedding(str, embedding_path, bpe_tokens)) {
+                return true;
            }
            return false;
        };
@ -237,25 +247,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
        }

        auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
-            size_t word_end       = str.find(",");
-            std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end);
-            embd_name             = trim(embd_name);
-            std::string embd_path = get_full_path(embd_dir, embd_name + ".pt");
-            if (embd_path.size() == 0) {
-                embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
+            auto iter = embedding_map.find(str);
+            if (iter == embedding_map.end()) {
+                return false;
            }
-            if (embd_path.size() == 0) {
-                embd_path = get_full_path(embd_dir, embd_name + ".safetensors");
-            }
-            if (embd_path.size() > 0) {
-                if (load_embedding(embd_name, embd_path, bpe_tokens)) {
-                    if (word_end != std::string::npos) {
-                        str = str.substr(word_end);
-                    } else {
-                        str = "";
-                    }
-                    return true;
-                }
+            std::string embedding_path = iter->second;
+            if (load_embedding(str, embedding_path, bpe_tokens)) {
+                return true;
            }
            return false;
        };
@ -270,13 +268,30 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
            const std::string& curr_text = item.first;
            float curr_weight            = item.second;
            // printf(" %s: %f \n", curr_text.c_str(), curr_weight);
+            int32_t clean_index = 0;
+            if (curr_text == "BREAK" && curr_weight == -1.0f) {
+                // Pad token array up to chunk size at this point.
+                // TODO: This is a hardcoded chunk_len, like in stable-diffusion.cpp, make it a parameter for the future?
+                // Also, this is 75 instead of 77 to leave room for BOS and EOS tokens.
+                int padding_size = 75 - (tokens_acc % 75);
+                for (int j = 0; j < padding_size; j++) {
+                    clean_input_ids.push_back(tokenizer.EOS_TOKEN_ID);
+                    clean_index++;
+                }
+
+                // After padding, continue to the next iteration to process the following text as a new segment
+                tokens.insert(tokens.end(), clean_input_ids.begin(), clean_input_ids.end());
+                weights.insert(weights.end(), padding_size, curr_weight);
+                continue;
+            }
+
+            // Regular token, process normally
            std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb);
-            int32_t clean_index          = 0;
            for (uint32_t i = 0; i < curr_tokens.size(); i++) {
                int token_id = curr_tokens[i];
-                if (token_id == image_token)
+                if (token_id == image_token) {
                    class_token_index.push_back(clean_index - 1);
-                else {
+                } else {
                    clean_input_ids.push_back(token_id);
                    clean_index++;
                }
@ -351,25 +366,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
        }

        auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
-            size_t word_end       = str.find(",");
-            std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end);
-            embd_name             = trim(embd_name);
-            std::string embd_path = get_full_path(embd_dir, embd_name + ".pt");
-            if (embd_path.size() == 0) {
-                embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
+            auto iter = embedding_map.find(str);
+            if (iter == embedding_map.end()) {
+                return false;
            }
-            if (embd_path.size() == 0) {
-                embd_path = get_full_path(embd_dir, embd_name + ".safetensors");
-            }
-            if (embd_path.size() > 0) {
-                if (load_embedding(embd_name, embd_path, bpe_tokens)) {
-                    if (word_end != std::string::npos) {
-                        str = str.substr(word_end);
-                    } else {
-                        str = "";
-                    }
-                    return true;
-                }
+            std::string embedding_path = iter->second;
+            if (load_embedding(str, embedding_path, bpe_tokens)) {
+                return true;
            }
            return false;
        };
@ -379,6 +382,22 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
        for (const auto& item : parsed_attention) {
            const std::string& curr_text = item.first;
            float curr_weight            = item.second;
+
+            if (curr_text == "BREAK" && curr_weight == -1.0f) {
+                // Pad token array up to chunk size at this point.
+                // TODO: This is a hardcoded chunk_len, like in stable-diffusion.cpp, make it a parameter for the future?
+                // Also, this is 75 instead of 77 to leave room for BOS and EOS tokens.
+                size_t current_size = tokens.size();
+                size_t padding_size = (75 - (current_size % 75)) % 75;  // Ensure no negative padding
+
+                if (padding_size > 0) {
+                    LOG_DEBUG("BREAK token encountered, padding current chunk by %zu tokens.", padding_size);
+                    tokens.insert(tokens.end(), padding_size, tokenizer.EOS_TOKEN_ID);
+                    weights.insert(weights.end(), padding_size, 1.0f);
+                }
+                continue;  // Skip to the next item after handling BREAK
+            }
+
            std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb);
            tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
            weights.insert(weights.end(), curr_tokens.size(), curr_weight);
@ -662,7 +681,7 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
        return gf;
    }

-    void compute(const int n_threads,
+    bool compute(const int n_threads,
                 ggml_tensor* pixel_values,
                 bool return_pooled,
                 int clip_skip,
@ -671,7 +690,7 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
        auto get_graph = [&]() -> struct ggml_cgraph* {
            return build_graph(pixel_values, return_pooled, clip_skip);
        };
-        GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
+        return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
    }
 };

@ -764,6 +783,18 @@ struct SD3CLIPEmbedder : public Conditioner {
        return buffer_size;
    }

+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {  // hands `adapter` to clip_l, clip_g and t5
+        if (clip_l) {  // each member is skipped when its pointer is null
+            clip_l->set_weight_adapter(adapter);
+        }
+        if (clip_g) {
+            clip_g->set_weight_adapter(adapter);
+        }
+        if (t5) {
+            t5->set_weight_adapter(adapter);
+        }
+    }
+
    std::vector<std::pair<std::vector<int>, std::vector<float>>> tokenize(std::string text,
                                                                          size_t max_length = 0,
                                                                          bool padding      = false) {
@ -1160,6 +1191,15 @@ struct FluxCLIPEmbedder : public Conditioner {
        return buffer_size;
    }

+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {  // hands `adapter` to clip_l and t5
+        if (clip_l) {  // each member is skipped when its pointer is null
+            clip_l->set_weight_adapter(adapter);
+        }
+        if (t5) {
+            t5->set_weight_adapter(adapter);
+        }
+    }
+
    std::vector<std::pair<std::vector<int>, std::vector<float>>> tokenize(std::string text,
                                                                          size_t max_length = 0,
                                                                          bool padding      = false) {
@ -1400,6 +1440,12 @@ struct T5CLIPEmbedder : public Conditioner {
        return buffer_size;
    }

+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {  // hands `adapter` to t5
+        if (t5) {  // nothing to do when t5 is null
+            t5->set_weight_adapter(adapter);
+        }
+    }
+
    std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> tokenize(std::string text,
                                                                                  size_t max_length = 0,
                                                                                  bool padding      = false) {
@ -1555,55 +1601,74 @@ struct T5CLIPEmbedder : public Conditioner {
    }
 };

-struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
-    Qwen::Qwen2Tokenizer tokenizer;
-    std::shared_ptr<Qwen::Qwen2_5_VLRunner> qwenvl;
+struct LLMEmbedder : public Conditioner {
+    SDVersion version;
+    std::shared_ptr<LLM::BPETokenizer> tokenizer;
+    std::shared_ptr<LLM::LLMRunner> llm;

-    Qwen2_5_VLCLIPEmbedder(ggml_backend_t backend,
-                           bool offload_params_to_cpu,
-                           const String2TensorStorage& tensor_storage_map = {},
-                           const std::string prefix                       = "",
-                           bool enable_vision                             = false) {
-        qwenvl = std::make_shared<Qwen::Qwen2_5_VLRunner>(backend,
-                                                          offload_params_to_cpu,
-                                                          tensor_storage_map,
-                                                          "text_encoders.qwen2vl",
-                                                          enable_vision);
+    LLMEmbedder(ggml_backend_t backend,
+                bool offload_params_to_cpu,
+                const String2TensorStorage& tensor_storage_map = {},
+                SDVersion version                              = VERSION_QWEN_IMAGE,
+                const std::string prefix                       = "",  // NOTE(review): not referenced in this constructor; the runner prefix below is hard-coded
+                bool enable_vision                             = false)
+        : version(version) {  // picks an LLM architecture from `version`, then builds the matching tokenizer and runner
+        LLM::LLMArch arch = LLM::LLMArch::QWEN2_5_VL;  // used when neither branch below matches
+        if (sd_version_is_flux2(version)) {
+            arch = LLM::LLMArch::MISTRAL_SMALL_3_2;
+        } else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE) {
+            arch = LLM::LLMArch::QWEN3;
+        }
+        if (arch == LLM::LLMArch::MISTRAL_SMALL_3_2) {  // only the Mistral arch gets MistralTokenizer; every other arch uses Qwen2Tokenizer
+            tokenizer = std::make_shared<LLM::MistralTokenizer>();
+        } else {
+            tokenizer = std::make_shared<LLM::Qwen2Tokenizer>();
+        }
+        llm = std::make_shared<LLM::LLMRunner>(arch,
+                                               backend,
+                                               offload_params_to_cpu,
+                                               tensor_storage_map,
+                                               "text_encoders.llm",
+                                               enable_vision);
+    }

    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
-        qwenvl->get_param_tensors(tensors, "text_encoders.qwen2vl");
+        llm->get_param_tensors(tensors, "text_encoders.llm");
    }

    void alloc_params_buffer() override {
-        qwenvl->alloc_params_buffer();
+        llm->alloc_params_buffer();
    }

    void free_params_buffer() override {
-        qwenvl->free_params_buffer();
+        llm->free_params_buffer();
    }

    size_t get_params_buffer_size() override {
        size_t buffer_size = 0;
-        buffer_size += qwenvl->get_params_buffer_size();
+        buffer_size += llm->get_params_buffer_size();
        return buffer_size;
    }

+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {  // hands `adapter` to llm
+        if (llm) {  // nothing to do when llm is null
+            llm->set_weight_adapter(adapter);
+        }
+    }
+
    std::tuple<std::vector<int>, std::vector<float>> tokenize(std::string text,
-                                                              size_t max_length           = 0,
-                                                              size_t system_prompt_length = 0,
-                                                              bool padding                = false) {
+                                                              std::pair<int, int> attn_range,
+                                                              size_t max_length = 0,
+                                                              bool padding      = false) {
        std::vector<std::pair<std::string, float>> parsed_attention;
-        if (system_prompt_length > 0) {
-            parsed_attention.emplace_back(text.substr(0, system_prompt_length), 1.f);
-            auto new_parsed_attention = parse_prompt_attention(text.substr(system_prompt_length, text.size() - system_prompt_length));
+        parsed_attention.emplace_back(text.substr(0, attn_range.first), 1.f);
+        if (attn_range.second - attn_range.first > 0) {
+            auto new_parsed_attention = parse_prompt_attention(text.substr(attn_range.first, attn_range.second - attn_range.first));
            parsed_attention.insert(parsed_attention.end(),
                                    new_parsed_attention.begin(),
                                    new_parsed_attention.end());
-        } else {
-            parsed_attention = parse_prompt_attention(text);
        }
-
+        parsed_attention.emplace_back(text.substr(attn_range.second), 1.f);
        {
            std::stringstream ss;
            ss << "[";
@ -1619,12 +1684,12 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
        for (const auto& item : parsed_attention) {
            const std::string& curr_text = item.first;
            float curr_weight            = item.second;
-            std::vector<int> curr_tokens = tokenizer.tokenize(curr_text, nullptr);
+            std::vector<int> curr_tokens = tokenizer->tokenize(curr_text, nullptr);
            tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
            weights.insert(weights.end(), curr_tokens.size(), curr_weight);
        }

-        tokenizer.pad_tokens(tokens, weights, max_length, padding);
+        tokenizer->pad_tokens(tokens, weights, max_length, padding);

        // for (int i = 0; i < tokens.size(); i++) {
        //     std::cout << tokens[i] << ":" << weights[i] << ", " << i << std::endl;
@ -1639,9 +1704,11 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
                                      const ConditionerParams& conditioner_params) override {
        std::string prompt;
        std::vector<std::pair<int, ggml_tensor*>> image_embeds;
-        size_t system_prompt_length          = 0;
+        std::pair<int, int> prompt_attn_range;
        int prompt_template_encode_start_idx = 34;
-        if (qwenvl->enable_vision && conditioner_params.ref_images.size() > 0) {
+        int max_length                       = 0;
+        std::set<int> out_layers;
+        if (llm->enable_vision && conditioner_params.ref_images.size() > 0) {
            LOG_INFO("QwenImageEditPlusPipeline");
            prompt_template_encode_start_idx = 64;
            int image_embed_idx              = 64 + 6;
@ -1653,7 +1720,7 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {

            for (int i = 0; i < conditioner_params.ref_images.size(); i++) {
                sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(*conditioner_params.ref_images[i]);
-                double factor        = qwenvl->params.vision.patch_size * qwenvl->params.vision.spatial_merge_size;
+                double factor        = llm->params.vision.patch_size * llm->params.vision.spatial_merge_size;
                int height           = image.height;
                int width            = image.width;
                int h_bar            = static_cast<int>(std::round(height / factor)) * factor;
@ -1683,7 +1750,7 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
                resized_image.data = nullptr;

                ggml_tensor* image_embed = nullptr;
-                qwenvl->encode_image(n_threads, image_tensor, &image_embed, work_ctx);
+                llm->encode_image(n_threads, image_tensor, &image_embed, work_ctx);
                image_embeds.emplace_back(image_embed_idx, image_embed);
                image_embed_idx += 1 + image_embed->ne[1] + 6;

@ -1697,17 +1764,70 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
            }

            prompt = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n";
-
-            system_prompt_length = prompt.size();
-
            prompt += img_prompt;
+
+            prompt_attn_range.first = static_cast<int>(prompt.size());
            prompt += conditioner_params.text;
+            prompt_attn_range.second = static_cast<int>(prompt.size());
+
            prompt += "<|im_end|>\n<|im_start|>assistant\n";
+        } else if (sd_version_is_flux2(version)) {
+            prompt_template_encode_start_idx = 0;
+            out_layers                       = {10, 20, 30};
+
+            prompt = "[SYSTEM_PROMPT]You are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object\nattribution and actions without speculation.[/SYSTEM_PROMPT][INST]";
+
+            prompt_attn_range.first = static_cast<int>(prompt.size());
+            prompt += conditioner_params.text;
+            prompt_attn_range.second = static_cast<int>(prompt.size());
+
+            prompt += "[/INST]";
+        } else if (sd_version_is_z_image(version)) {
+            prompt_template_encode_start_idx = 0;
+            out_layers                       = {35};  // -2
+
+            prompt = "<|im_start|>user\n";
+
+            prompt_attn_range.first = static_cast<int>(prompt.size());
+            prompt += conditioner_params.text;
+            prompt_attn_range.second = static_cast<int>(prompt.size());
+
+            prompt += "<|im_end|>\n<|im_start|>assistant\n";
+        } else if (sd_version_is_flux2(version)) {
+            prompt_template_encode_start_idx = 0;
+            out_layers                       = {10, 20, 30};
+
+            prompt = "[SYSTEM_PROMPT]You are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object\nattribution and actions without speculation.[/SYSTEM_PROMPT][INST]";
+
+            prompt_attn_range.first = prompt.size();
+            prompt += conditioner_params.text;
+            prompt_attn_range.second = prompt.size();
+
+            prompt += "[/INST]";
+        } else if (version == VERSION_OVIS_IMAGE) {
+            prompt_template_encode_start_idx = 28;
+            max_length                       = prompt_template_encode_start_idx + 256;
+
+            prompt = "<|im_start|>user\nDescribe the image by detailing the color, quantity, text, shape, size, texture, spatial relationships of the objects and background:";
+
+            prompt_attn_range.first = static_cast<int>(prompt.size());
+            prompt += " " + conditioner_params.text;
+            prompt_attn_range.second = static_cast<int>(prompt.size());
+
+            prompt += "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
        } else {
-            prompt = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n" + conditioner_params.text + "<|im_end|>\n<|im_start|>assistant\n";
+            prompt_template_encode_start_idx = 34;
+
+            prompt = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n";
+
+            prompt_attn_range.first = static_cast<int>(prompt.size());
+            prompt += conditioner_params.text;
+            prompt_attn_range.second = static_cast<int>(prompt.size());
+
+            prompt += "<|im_end|>\n<|im_start|>assistant\n";
        }

-        auto tokens_and_weights = tokenize(prompt, 0, system_prompt_length, false);
+        auto tokens_and_weights = tokenize(prompt, prompt_attn_range, max_length, max_length > 0);
        auto& tokens            = std::get<0>(tokens_and_weights);
        auto& weights           = std::get<1>(tokens_and_weights);

@ -1716,11 +1836,12 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {

        auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens);

-        qwenvl->compute(n_threads,
-                        input_ids,
-                        image_embeds,
-                        &hidden_states,
-                        work_ctx);
+        llm->compute(n_threads,
+                     input_ids,
+                     image_embeds,
+                     out_layers,
+                     &hidden_states,
+                     work_ctx);
        {
            auto tensor         = hidden_states;
            float original_mean = ggml_ext_tensor_mean(tensor);
@ -1739,17 +1860,34 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {

        GGML_ASSERT(hidden_states->ne[1] > prompt_template_encode_start_idx);

+        int64_t min_length = 0;
+        if (sd_version_is_flux2(version)) {
+            min_length = 512;
+        }
+
+        int64_t zero_pad_len = 0;
+        if (min_length > 0) {
+            if (hidden_states->ne[1] - prompt_template_encode_start_idx < min_length) {
+                zero_pad_len = min_length - hidden_states->ne[1] + prompt_template_encode_start_idx;
+            }
+        }
+
        ggml_tensor* new_hidden_states = ggml_new_tensor_3d(work_ctx,
                                                            GGML_TYPE_F32,
                                                            hidden_states->ne[0],
-                                                            hidden_states->ne[1] - prompt_template_encode_start_idx,
+                                                            hidden_states->ne[1] - prompt_template_encode_start_idx + zero_pad_len,
                                                            hidden_states->ne[2]);

        ggml_ext_tensor_iter(new_hidden_states, [&](ggml_tensor* new_hidden_states, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
-            float value = ggml_ext_tensor_get_f32(hidden_states, i0, i1 + prompt_template_encode_start_idx, i2, i3);
+            float value = 0.f;
+            if (i1 + prompt_template_encode_start_idx < hidden_states->ne[1]) {
+                value = ggml_ext_tensor_get_f32(hidden_states, i0, i1 + prompt_template_encode_start_idx, i2, i3);
+            }
            ggml_ext_tensor_set_f32(new_hidden_states, value, i0, i1, i2, i3);
        });

+        // print_ggml_tensor(new_hidden_states);
+
        int64_t t1 = ggml_time_ms();
        LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
        return {new_hidden_states, nullptr, nullptr};
--- a/control.hpp
+++ b/control.hpp
@ -380,7 +380,7 @@ struct ControlNet : public GGMLRunner {
                                    struct ggml_tensor* timesteps,
                                    struct ggml_tensor* context,
                                    struct ggml_tensor* y = nullptr) {
-        struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, CONTROL_NET_GRAPH_SIZE, false);
+        struct ggml_cgraph* gf = new_graph_custom(CONTROL_NET_GRAPH_SIZE);

        x = to_backend(x);
        if (guided_hint_cached) {
@ -414,7 +414,7 @@ struct ControlNet : public GGMLRunner {
        return gf;
    }

-    void compute(int n_threads,
+    bool compute(int n_threads,
                 struct ggml_tensor* x,
                 struct ggml_tensor* hint,
                 struct ggml_tensor* timesteps,
@ -430,8 +430,12 @@ struct ControlNet : public GGMLRunner {
            return build_graph(x, hint, timesteps, context, y);
        };

-        GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
-        guided_hint_cached = true;
+        bool res = GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+        if (res) {
+            // cache guided_hint
+            guided_hint_cached = true;
+        }
+        return res;
    }

    bool load_from_file(const std::string& file_path, int n_threads) {
@ -442,7 +446,7 @@ struct ControlNet : public GGMLRunner {
        std::set<std::string> ignore_tensors;

        ModelLoader model_loader;
-        if (!model_loader.init_from_file(file_path)) {
+        if (!model_loader.init_from_file_and_convert_name(file_path)) {
            LOG_ERROR("init control net model loader from file failed: '%s'", file_path.c_str());
            return false;
        }
--- a/denoiser.hpp
+++ b/denoiser.hpp
@ -11,14 +11,13 @@
 #define TIMESTEPS 1000
 #define FLUX_TIMESTEPS 1000

-struct SigmaSchedule {
-    int version = 0;
+struct SigmaScheduler {
    typedef std::function<float(float)> t_to_sigma_t;

    virtual std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) = 0;
 };

-struct DiscreteSchedule : SigmaSchedule {
+struct DiscreteScheduler : SigmaScheduler {
    std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
        std::vector<float> result;

@ -42,7 +41,7 @@ struct DiscreteSchedule : SigmaSchedule {
    }
 };

-struct ExponentialSchedule : SigmaSchedule {
+struct ExponentialScheduler : SigmaScheduler {
    std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
        std::vector<float> sigmas;

@ -149,7 +148,10 @@ std::vector<float> log_linear_interpolation(std::vector<float> sigma_in,
 /*
 https://research.nvidia.com/labs/toronto-ai/AlignYourSteps/howto.html
 */
-struct AYSSchedule : SigmaSchedule {
+struct AYSScheduler : SigmaScheduler {
+    SDVersion version;
+    explicit AYSScheduler(SDVersion version)
+        : version(version) {}
    std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
        const std::vector<float> noise_levels[] = {
            /* SD1.5 */
@ -169,19 +171,19 @@ struct AYSSchedule : SigmaSchedule {
        std::vector<float> results(n + 1);

        if (sd_version_is_sd2((SDVersion)version)) {
-            LOG_WARN("AYS not designed for SD2.X models");
+            LOG_WARN("AYS_SCHEDULER not designed for SD2.X models");
        } /* fallthrough */
        else if (sd_version_is_sd1((SDVersion)version)) {
-            LOG_INFO("AYS using SD1.5 noise levels");
+            LOG_INFO("AYS_SCHEDULER using SD1.5 noise levels");
            inputs = noise_levels[0];
        } else if (sd_version_is_sdxl((SDVersion)version)) {
-            LOG_INFO("AYS using SDXL noise levels");
+            LOG_INFO("AYS_SCHEDULER using SDXL noise levels");
            inputs = noise_levels[1];
        } else if (version == VERSION_SVD) {
-            LOG_INFO("AYS using SVD noise levels");
+            LOG_INFO("AYS_SCHEDULER using SVD noise levels");
            inputs = noise_levels[2];
        } else {
-            LOG_ERROR("Version not compatible with AYS scheduler");
+            LOG_ERROR("Version not compatible with AYS_SCHEDULER scheduler");
            return results;
        }

@ -203,7 +205,7 @@ struct AYSSchedule : SigmaSchedule {
 /*
 * GITS Scheduler: https://github.com/zju-pi/diff-sampler/tree/main/gits-main
 */
-struct GITSSchedule : SigmaSchedule {
+struct GITSScheduler : SigmaScheduler {
    std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
        if (sigma_max <= 0.0f) {
            return std::vector<float>{};
@ -232,7 +234,7 @@ struct GITSSchedule : SigmaSchedule {
    }
 };

-struct SGMUniformSchedule : SigmaSchedule {
+struct SGMUniformScheduler : SigmaScheduler {
    std::vector<float> get_sigmas(uint32_t n, float sigma_min_in, float sigma_max_in, t_to_sigma_t t_to_sigma_func) override {
        std::vector<float> result;
        if (n == 0) {
@ -251,7 +253,24 @@ struct SGMUniformSchedule : SigmaSchedule {
    }
 };

-struct KarrasSchedule : SigmaSchedule {
+struct LCMScheduler : SigmaScheduler {  // n sigmas picked from a 50-step grid over TIMESTEPS, followed by a final 0
+    std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
+        std::vector<float> result;
+        result.reserve(n + 1);
+        const uint32_t original_steps = 50;
+        const uint32_t k              = TIMESTEPS / original_steps;
+        for (uint32_t i = 0; i < n; i++) {  // unsigned index: no signed/unsigned comparison against n
+            // the rounding ensures we match the training schedule of the LCM model
+            uint32_t index = (i * original_steps) / n;  // stays below original_steps because i < n
+            int timestep   = static_cast<int>((original_steps - index) * k) - 1;
+            result.push_back(t_to_sigma(static_cast<float>(timestep)));  // t_to_sigma_t takes a float
+        }
+        result.push_back(0.0f);
+        return result;
+    }
+};
+
+struct KarrasScheduler : SigmaScheduler {
    std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
        // These *COULD* be function arguments here,
        // but does anybody ever bother to touch them?
@ -270,7 +289,7 @@ struct KarrasSchedule : SigmaSchedule {
    }
 };

-struct SimpleSchedule : SigmaSchedule {
+struct SimpleScheduler : SigmaScheduler {
    std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
        std::vector<float> result_sigmas;

@ -299,8 +318,8 @@ struct SimpleSchedule : SigmaSchedule {
    }
 };

-// Close to Beta Schedule, but increadably simple in code.
-struct SmoothStepSchedule : SigmaSchedule {
+// Close to Beta Scheduler, but incredibly simple in code.
+struct SmoothStepScheduler : SigmaScheduler {
    static constexpr float smoothstep(float x) {
        return x * x * (3.0f - 2.0f * x);
    }
@ -329,7 +348,6 @@ struct SmoothStepSchedule : SigmaSchedule {
 };

 struct Denoiser {
-    std::shared_ptr<SigmaSchedule> scheduler                                                 = std::make_shared<DiscreteSchedule>();
    virtual float sigma_min()                                                                = 0;
    virtual float sigma_max()                                                                = 0;
    virtual float sigma_to_t(float sigma)                                                    = 0;
@ -338,8 +356,51 @@ struct Denoiser {
    virtual ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) = 0;
    virtual ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent)             = 0;

-    virtual std::vector<float> get_sigmas(uint32_t n) {
+    // Builds the sigma schedule for `n` sampling steps.
+    //
+    // A scheduler object is created per call according to `scheduler_type`
+    // and asked for its sigmas, using this denoiser's sigma_min()/sigma_max()
+    // and t_to_sigma() mapping. The image sequence length is unused by this
+    // base implementation (its parameter name is commented out). `version`
+    // is only forwarded to the Align-Your-Steps scheduler. Any
+    // `scheduler_type` without a dedicated case falls back to the discrete
+    // scheduler.
+    virtual std::vector<float> get_sigmas(uint32_t n, int /*image_seq_len*/, scheduler_t scheduler_type, SDVersion version) {
        auto bound_t_to_sigma = std::bind(&Denoiser::t_to_sigma, this, std::placeholders::_1);
+        std::shared_ptr<SigmaScheduler> scheduler;
+        switch (scheduler_type) {
+            case DISCRETE_SCHEDULER:
+                LOG_INFO("get_sigmas with discrete scheduler");
+                scheduler = std::make_shared<DiscreteScheduler>();
+                break;
+            case KARRAS_SCHEDULER:
+                LOG_INFO("get_sigmas with Karras scheduler");
+                scheduler = std::make_shared<KarrasScheduler>();
+                break;
+            case EXPONENTIAL_SCHEDULER:
+                // message aligned with the "get_sigmas with ..." wording of the other cases
+                LOG_INFO("get_sigmas with exponential scheduler");
+                scheduler = std::make_shared<ExponentialScheduler>();
+                break;
+            case AYS_SCHEDULER:
+                LOG_INFO("get_sigmas with Align-Your-Steps scheduler");
+                scheduler = std::make_shared<AYSScheduler>(version);
+                break;
+            case GITS_SCHEDULER:
+                LOG_INFO("get_sigmas with GITS scheduler");
+                scheduler = std::make_shared<GITSScheduler>();
+                break;
+            case SGM_UNIFORM_SCHEDULER:
+                LOG_INFO("get_sigmas with SGM Uniform scheduler");
+                scheduler = std::make_shared<SGMUniformScheduler>();
+                break;
+            case SIMPLE_SCHEDULER:
+                LOG_INFO("get_sigmas with Simple scheduler");
+                scheduler = std::make_shared<SimpleScheduler>();
+                break;
+            case SMOOTHSTEP_SCHEDULER:
+                LOG_INFO("get_sigmas with SmoothStep scheduler");
+                scheduler = std::make_shared<SmoothStepScheduler>();
+                break;
+            case LCM_SCHEDULER:
+                LOG_INFO("get_sigmas with LCM scheduler");
+                scheduler = std::make_shared<LCMScheduler>();
+                break;
+            default:
+                // unknown scheduler type: keep going with the discrete schedule
+                LOG_INFO("get_sigmas with discrete scheduler (default)");
+                scheduler = std::make_shared<DiscreteScheduler>();
+                break;
+        }
        return scheduler->get_sigmas(n, sigma_min(), sigma_max(), bound_t_to_sigma);
    }
 };
@ -426,7 +487,6 @@ struct EDMVDenoiser : public CompVisVDenoiser {

    EDMVDenoiser(float min_sigma = 0.002, float max_sigma = 120.0)
        : min_sigma(min_sigma), max_sigma(max_sigma) {
-        scheduler = std::make_shared<ExponentialSchedule>();
    }

    float t_to_sigma(float t) override {
@ -522,10 +582,14 @@ struct FluxFlowDenoiser : public Denoiser {
        set_parameters(shift);
    }

-    void set_parameters(float shift = 1.15f) {
+    void set_shift(float shift) {
        this->shift = shift;
-        for (int i = 1; i < TIMESTEPS + 1; i++) {
-            sigmas[i - 1] = t_to_sigma(i / TIMESTEPS * TIMESTEPS);
+    }
+
+    void set_parameters(float shift) {
+        set_shift(shift);
+        for (int i = 0; i < TIMESTEPS; i++) {
+            sigmas[i] = t_to_sigma(i);
        }
    }

@ -567,10 +631,42 @@ struct FluxFlowDenoiser : public Denoiser {
    }
 };

+// Flow denoiser variant that re-derives its shift from the step count and the
+// image sequence length each time a sigma schedule is requested.
+struct Flux2FlowDenoiser : public FluxFlowDenoiser {
+    Flux2FlowDenoiser() = default;
+
+    // Returns the empirical shift ("mu") for `n` sampling steps and
+    // `image_seq_len` image tokens.
+    //
+    // Two linear fits in image_seq_len are evaluated (named m_10 / m_200 in
+    // the fit constants below). For sequence lengths above 4300 the 200-fit
+    // is returned as-is; otherwise the result is the value at `n` of the
+    // straight line through (10, fit_10) and (200, fit_200).
+    float compute_empirical_mu(uint32_t n, int image_seq_len) {
+        const float slope_10   = 8.73809524e-05f;
+        const float offset_10  = 1.89833333f;
+        const float slope_200  = 0.00016927f;
+        const float offset_200 = 0.45666666f;
+
+        // the 200-fit is needed by both branches, so evaluate it once
+        const float fit_200 = slope_200 * image_seq_len + offset_200;
+        if (image_seq_len > 4300) {
+            return fit_200;
+        }
+
+        const float fit_10 = slope_10 * image_seq_len + offset_10;
+
+        // interpolate / extrapolate linearly in the number of steps
+        const float step_slope     = (fit_200 - fit_10) / 190.0f;
+        const float step_intercept = fit_200 - 200.0f * step_slope;
+        const float mu             = step_slope * n + step_intercept;
+
+        return mu;
+    }
+
+    // Updates the shift from the empirical mu, then defers to the base
+    // Denoiser implementation to build the actual schedule.
+    std::vector<float> get_sigmas(uint32_t n, int image_seq_len, scheduler_t scheduler_type, SDVersion version) override {
+        const float empirical_shift = compute_empirical_mu(n, image_seq_len);
+        LOG_DEBUG("Flux2FlowDenoiser: set shift to %.3f", empirical_shift);
+        set_shift(empirical_shift);
+        return Denoiser::get_sigmas(n, image_seq_len, scheduler_type, version);
+    }
+};
+
 typedef std::function<ggml_tensor*(ggml_tensor*, float, int)> denoise_cb_t;

 // k diffusion reverse ODE: dx = (x - D(x;\sigma)) / \sigma dt; \sigma(t) = t
-static void sample_k_diffusion(sample_method_t method,
+static bool sample_k_diffusion(sample_method_t method,
                               denoise_cb_t model,
                               ggml_context* work_ctx,
                               ggml_tensor* x,
@ -580,7 +676,7 @@ static void sample_k_diffusion(sample_method_t method,
    size_t steps = sigmas.size() - 1;
    // sample_euler_ancestral
    switch (method) {
-        case EULER_A: {
+        case EULER_A_SAMPLE_METHOD: {
            struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x);
            struct ggml_tensor* d     = ggml_dup_tensor(work_ctx, x);

@ -589,6 +685,9 @@ static void sample_k_diffusion(sample_method_t method,

                // denoise
                ggml_tensor* denoised = model(x, sigma, i + 1);
+                if (denoised == nullptr) {
+                    return false;
+                }

                // d = (x - denoised) / sigma
                {
@ -633,7 +732,7 @@ static void sample_k_diffusion(sample_method_t method,
                }
            }
        } break;
-        case EULER:  // Implemented without any sigma churn
+        case EULER_SAMPLE_METHOD:  // Implemented without any sigma churn
        {
            struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x);

@ -642,6 +741,9 @@ static void sample_k_diffusion(sample_method_t method,

                // denoise
                ggml_tensor* denoised = model(x, sigma, i + 1);
+                if (denoised == nullptr) {
+                    return false;
+                }

                // d = (x - denoised) / sigma
                {
@ -666,13 +768,16 @@ static void sample_k_diffusion(sample_method_t method,
                }
            }
        } break;
-        case HEUN: {
+        case HEUN_SAMPLE_METHOD: {
            struct ggml_tensor* d  = ggml_dup_tensor(work_ctx, x);
            struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x);

            for (int i = 0; i < steps; i++) {
                // denoise
                ggml_tensor* denoised = model(x, sigmas[i], -(i + 1));
+                if (denoised == nullptr) {
+                    return false;
+                }

                // d = (x - denoised) / sigma
                {
@ -707,7 +812,10 @@ static void sample_k_diffusion(sample_method_t method,
                    }

                    ggml_tensor* denoised = model(x2, sigmas[i + 1], i + 1);
-                    float* vec_denoised   = (float*)denoised->data;
+                    if (denoised == nullptr) {
+                        return false;
+                    }
+                    float* vec_denoised = (float*)denoised->data;
                    for (int j = 0; j < ggml_nelements(x); j++) {
                        float d2 = (vec_x2[j] - vec_denoised[j]) / sigmas[i + 1];
                        vec_d[j] = (vec_d[j] + d2) / 2;
@ -716,13 +824,16 @@ static void sample_k_diffusion(sample_method_t method,
                }
            }
        } break;
-        case DPM2: {
+        case DPM2_SAMPLE_METHOD: {
            struct ggml_tensor* d  = ggml_dup_tensor(work_ctx, x);
            struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x);

            for (int i = 0; i < steps; i++) {
                // denoise
                ggml_tensor* denoised = model(x, sigmas[i], i + 1);
+                if (denoised == nullptr) {
+                    return false;
+                }

                // d = (x - denoised) / sigma
                {
@ -759,7 +870,10 @@ static void sample_k_diffusion(sample_method_t method,
                    }

                    ggml_tensor* denoised = model(x2, sigma_mid, i + 1);
-                    float* vec_denoised   = (float*)denoised->data;
+                    if (denoised == nullptr) {
+                        return false;
+                    }
+                    float* vec_denoised = (float*)denoised->data;
                    for (int j = 0; j < ggml_nelements(x); j++) {
                        float d2 = (vec_x2[j] - vec_denoised[j]) / sigma_mid;
                        vec_x[j] = vec_x[j] + d2 * dt_2;
@ -768,13 +882,16 @@ static void sample_k_diffusion(sample_method_t method,
            }

        } break;
-        case DPMPP2S_A: {
+        case DPMPP2S_A_SAMPLE_METHOD: {
            struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x);
            struct ggml_tensor* x2    = ggml_dup_tensor(work_ctx, x);

            for (int i = 0; i < steps; i++) {
                // denoise
                ggml_tensor* denoised = model(x, sigmas[i], i + 1);
+                if (denoised == nullptr) {
+                    return false;
+                }

                // get_ancestral_step
                float sigma_up   = std::min(sigmas[i + 1],
@ -811,6 +928,9 @@ static void sample_k_diffusion(sample_method_t method,
                    }

                    ggml_tensor* denoised = model(x2, sigmas[i + 1], i + 1);
+                    if (denoised == nullptr) {
+                        return false;
+                    }

                    // Second half-step
                    for (int j = 0; j < ggml_nelements(x); j++) {
@ -832,7 +952,7 @@ static void sample_k_diffusion(sample_method_t method,
                }
            }
        } break;
-        case DPMPP2M:  // DPM++ (2M) from Karras et al (2022)
+        case DPMPP2M_SAMPLE_METHOD:  // DPM++ (2M) from Karras et al (2022)
        {
            struct ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x);

@ -841,6 +961,9 @@ static void sample_k_diffusion(sample_method_t method,
            for (int i = 0; i < steps; i++) {
                // denoise
                ggml_tensor* denoised = model(x, sigmas[i], i + 1);
+                if (denoised == nullptr) {
+                    return false;
+                }

                float t                 = t_fn(sigmas[i]);
                float t_next            = t_fn(sigmas[i + 1]);
@ -871,7 +994,7 @@ static void sample_k_diffusion(sample_method_t method,
                }
            }
        } break;
-        case DPMPP2Mv2:  // Modified DPM++ (2M) from https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457
+        case DPMPP2Mv2_SAMPLE_METHOD:  // Modified DPM++ (2M) from https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457
        {
            struct ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x);

@ -880,6 +1003,9 @@ static void sample_k_diffusion(sample_method_t method,
            for (int i = 0; i < steps; i++) {
                // denoise
                ggml_tensor* denoised = model(x, sigmas[i], i + 1);
+                if (denoised == nullptr) {
+                    return false;
+                }

                float t                 = t_fn(sigmas[i]);
                float t_next            = t_fn(sigmas[i + 1]);
@ -914,7 +1040,7 @@ static void sample_k_diffusion(sample_method_t method,
                }
            }
        } break;
-        case IPNDM:  // iPNDM sampler from https://github.com/zju-pi/diff-sampler/tree/main/diff-solvers-main
+        case IPNDM_SAMPLE_METHOD:  // iPNDM sampler from https://github.com/zju-pi/diff-sampler/tree/main/diff-solvers-main
        {
            int max_order       = 4;
            ggml_tensor* x_next = x;
@ -930,7 +1056,10 @@ static void sample_k_diffusion(sample_method_t method,

                // Denoising step
                ggml_tensor* denoised = model(x_cur, sigma, i + 1);
-                float* vec_denoised   = (float*)denoised->data;
+                if (denoised == nullptr) {
+                    return false;
+                }
+                float* vec_denoised = (float*)denoised->data;
                // d_cur = (x_cur - denoised) / sigma
                struct ggml_tensor* d_cur = ggml_dup_tensor(work_ctx, x_cur);
                float* vec_d_cur          = (float*)d_cur->data;
@ -989,7 +1118,7 @@ static void sample_k_diffusion(sample_method_t method,
                }
            }
        } break;
-        case IPNDM_V:  // iPNDM_v sampler from https://github.com/zju-pi/diff-sampler/tree/main/diff-solvers-main
+        case IPNDM_V_SAMPLE_METHOD:  // iPNDM_v sampler from https://github.com/zju-pi/diff-sampler/tree/main/diff-solvers-main
        {
            int max_order = 4;
            std::vector<ggml_tensor*> buffer_model;
@ -1063,7 +1192,7 @@ static void sample_k_diffusion(sample_method_t method,
                d_cur = ggml_dup_tensor(work_ctx, x_next);
            }
        } break;
-        case LCM:  // Latent Consistency Models
+        case LCM_SAMPLE_METHOD:  // Latent Consistency Models
        {
            struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x);
            struct ggml_tensor* d     = ggml_dup_tensor(work_ctx, x);
@ -1073,6 +1202,9 @@ static void sample_k_diffusion(sample_method_t method,

                // denoise
                ggml_tensor* denoised = model(x, sigma, i + 1);
+                if (denoised == nullptr) {
+                    return false;
+                }

                // x = denoised
                {
@ -1098,8 +1230,8 @@ static void sample_k_diffusion(sample_method_t method,
                }
            }
        } break;
-        case DDIM_TRAILING:  // Denoising Diffusion Implicit Models
-                             // with the "trailing" timestep spacing
+        case DDIM_TRAILING_SAMPLE_METHOD:  // Denoising Diffusion Implicit Models
+                                           // with the "trailing" timestep spacing
        {
            // See J. Song et al., "Denoising Diffusion Implicit
            // Models", arXiv:2010.02502 [cs.LG]
@ -1109,7 +1241,7 @@ static void sample_k_diffusion(sample_method_t method,
            // end beta) (which unfortunately k-diffusion's data
            // structure hides from the denoiser), and the sigmas are
            // also needed to invert the behavior of CompVisDenoiser
-            // (k-diffusion's LMSDiscreteScheduler)
+            // (k-diffusion's LMSDiscreteScheduler)
            float beta_start = 0.00085f;
            float beta_end   = 0.0120f;
            std::vector<double> alphas_cumprod;
@ -1137,7 +1269,7 @@ static void sample_k_diffusion(sample_method_t method,

            for (int i = 0; i < steps; i++) {
                // The "trailing" DDIM timestep, see S. Lin et al.,
-                // "Common Diffusion Noise Schedules and Sample Steps
+                // "Common Diffusion Noise Schedules and Sample Steps
                // are Flawed", arXiv:2305.08891 [cs], p. 4, Table
                // 2. Most variables below follow Diffusers naming
                //
@ -1292,8 +1424,8 @@ static void sample_k_diffusion(sample_method_t method,
                // factor c_in.
            }
        } break;
-        case TCD:  // Strategic Stochastic Sampling (Algorithm 4) in
-                   // Trajectory Consistency Distillation
+        case TCD_SAMPLE_METHOD:  // Strategic Stochastic Sampling (Algorithm 4) in
+                                 // Trajectory Consistency Distillation
        {
            // See J. Zheng et al., "Trajectory Consistency
            // Distillation: Improved Latent Consistency Distillation
@ -1465,8 +1597,9 @@ static void sample_k_diffusion(sample_method_t method,

        default:
            LOG_ERROR("Attempting to sample with nonexisting sample method %i", method);
-            abort();
+            return false;
    }
+    return true;
 }

 #endif  // __DENOISER_HPP__
--- a/diffusion_model.hpp
+++ b/diffusion_model.hpp
@ -6,6 +6,7 @@
 #include "qwen_image.hpp"
 #include "unet.hpp"
 #include "wan.hpp"
+#include "z_image.hpp"

 struct DiffusionParams {
    struct ggml_tensor* x                     = nullptr;
@ -26,7 +27,7 @@ struct DiffusionParams {

 struct DiffusionModel {
    virtual std::string get_desc()                                                      = 0;
-    virtual void compute(int n_threads,
+    virtual bool compute(int n_threads,
                         DiffusionParams diffusion_params,
                         struct ggml_tensor** output     = nullptr,
                         struct ggml_context* output_ctx = nullptr)                     = 0;
@ -35,8 +36,9 @@ struct DiffusionModel {
    virtual void free_compute_buffer()                                                  = 0;
    virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0;
    virtual size_t get_params_buffer_size()                                             = 0;
-    virtual int64_t get_adm_in_channels()                                               = 0;
-    virtual void set_flash_attn_enabled(bool enabled)                                   = 0;
+    virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter){};
+    virtual int64_t get_adm_in_channels()             = 0;
+    virtual void set_flash_attn_enabled(bool enabled) = 0;
 };

 struct UNetModel : public DiffusionModel {
@ -73,6 +75,10 @@ struct UNetModel : public DiffusionModel {
        return unet.get_params_buffer_size();
    }

+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        unet.set_weight_adapter(adapter);
+    }
+
    int64_t get_adm_in_channels() override {
        return unet.unet.adm_in_channels;
    }
@ -81,7 +87,7 @@ struct UNetModel : public DiffusionModel {
        unet.set_flash_attention_enabled(enabled);
    }

-    void compute(int n_threads,
+    bool compute(int n_threads,
                 DiffusionParams diffusion_params,
                 struct ggml_tensor** output     = nullptr,
                 struct ggml_context* output_ctx = nullptr) override {
@ -130,6 +136,10 @@ struct MMDiTModel : public DiffusionModel {
        return mmdit.get_params_buffer_size();
    }

+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        mmdit.set_weight_adapter(adapter);
+    }
+
    int64_t get_adm_in_channels() override {
        return 768 + 1280;
    }
@ -138,7 +148,7 @@ struct MMDiTModel : public DiffusionModel {
        mmdit.set_flash_attention_enabled(enabled);
    }

-    void compute(int n_threads,
+    bool compute(int n_threads,
                 DiffusionParams diffusion_params,
                 struct ggml_tensor** output     = nullptr,
                 struct ggml_context* output_ctx = nullptr) override {
@ -188,6 +198,10 @@ struct FluxModel : public DiffusionModel {
        return flux.get_params_buffer_size();
    }

+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        flux.set_weight_adapter(adapter);
+    }
+
    int64_t get_adm_in_channels() override {
        return 768;
    }
@ -196,7 +210,7 @@ struct FluxModel : public DiffusionModel {
        flux.set_flash_attention_enabled(enabled);
    }

-    void compute(int n_threads,
+    bool compute(int n_threads,
                 DiffusionParams diffusion_params,
                 struct ggml_tensor** output     = nullptr,
                 struct ggml_context* output_ctx = nullptr) override {
@ -251,6 +265,10 @@ struct WanModel : public DiffusionModel {
        return wan.get_params_buffer_size();
    }

+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        wan.set_weight_adapter(adapter);
+    }
+
    int64_t get_adm_in_channels() override {
        return 768;
    }
@ -259,7 +277,7 @@ struct WanModel : public DiffusionModel {
        wan.set_flash_attention_enabled(enabled);
    }

-    void compute(int n_threads,
+    bool compute(int n_threads,
                 DiffusionParams diffusion_params,
                 struct ggml_tensor** output     = nullptr,
                 struct ggml_context* output_ctx = nullptr) override {
@ -313,6 +331,10 @@ struct QwenImageModel : public DiffusionModel {
        return qwen_image.get_params_buffer_size();
    }

+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        qwen_image.set_weight_adapter(adapter);
+    }
+
    int64_t get_adm_in_channels() override {
        return 768;
    }
@ -321,7 +343,7 @@ struct QwenImageModel : public DiffusionModel {
        qwen_image.set_flash_attention_enabled(enabled);
    }

-    void compute(int n_threads,
+    bool compute(int n_threads,
                 DiffusionParams diffusion_params,
                 struct ggml_tensor** output     = nullptr,
                 struct ggml_context* output_ctx = nullptr) override {
@ -336,4 +358,67 @@ struct QwenImageModel : public DiffusionModel {
    }
 };

+// DiffusionModel adapter around ZImage::ZImageRunner: every interface method
+// forwards to the wrapped runner.
+struct ZImageModel : public DiffusionModel {
+    std::string prefix;            // tensor-name prefix, also passed to get_param_tensors()
+    ZImage::ZImageRunner z_image;  // the wrapped runner
+
+    // Stores `prefix` and constructs the runner with all given arguments.
+    ZImageModel(ggml_backend_t backend,
+                bool offload_params_to_cpu,
+                const String2TensorStorage& tensor_storage_map = {},
+                const std::string prefix                       = "model.diffusion_model",
+                SDVersion version                              = VERSION_Z_IMAGE)
+        : prefix(prefix), z_image(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) {
+    }
+
+    std::string get_desc() override {
+        return z_image.get_desc();
+    }
+
+    void alloc_params_buffer() override {
+        z_image.alloc_params_buffer();
+    }
+
+    void free_params_buffer() override {
+        z_image.free_params_buffer();
+    }
+
+    void free_compute_buffer() override {
+        z_image.free_compute_buffer();
+    }
+
+    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
+        z_image.get_param_tensors(tensors, prefix);
+    }
+
+    size_t get_params_buffer_size() override {
+        return z_image.get_params_buffer_size();
+    }
+
+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        z_image.set_weight_adapter(adapter);
+    }
+
+    // Fixed value; not queried from the runner.
+    int64_t get_adm_in_channels() override {
+        return 768;
+    }
+
+    // Marked `override` like the other interface methods so a signature
+    // drift in DiffusionModel becomes a compile error instead of silently
+    // declaring a new, unrelated function.
+    void set_flash_attn_enabled(bool enabled) override {
+        z_image.set_flash_attention_enabled(enabled);
+    }
+
+    // Runs the wrapped runner on x / timesteps / context / ref_latents from
+    // `diffusion_params` and returns the runner's success flag.
+    bool compute(int n_threads,
+                 DiffusionParams diffusion_params,
+                 struct ggml_tensor** output     = nullptr,
+                 struct ggml_context* output_ctx = nullptr) override {
+        return z_image.compute(n_threads,
+                               diffusion_params.x,
+                               diffusion_params.timesteps,
+                               diffusion_params.context,
+                               diffusion_params.ref_latents,
+                               true,  // increase_ref_index
+                               output,
+                               output_ctx);
+    }
+};
+
 #endif
--- a/docs/distilled_sd.md
+++ b/docs/distilled_sd.md
@ -1,40 +1,66 @@
-# Running distilled models: SSD1B and SD1.x with tiny U-Nets
+# Running distilled models: SSD1B and SDx.x with tiny U-Nets

-## Preface
+## Preface 

-This kind of models have a reduced U-Net part. 
-Unlike other SDXL models the U-Net of SSD1B has only one middle block and lesser attention layers in up and down blocks, resulting in relatively smaller files. Running these models saves more than 33% of the time. For more details, refer to Segmind's paper on https://arxiv.org/abs/2401.02677v1 .
-Unlike other SD 1.x models Tiny-UNet models consist of only 6 U-Net blocks, resulting in relatively smaller files (approximately 1 GB). Running these models saves almost 50% of the time. For more details, refer to the paper: https://arxiv.org/pdf/2305.15798.pdf .
+These models feature a reduced U-Net architecture. Unlike standard SDXL models, the SSD-1B U-Net contains only one middle block and fewer attention layers in its up- and down-blocks, resulting in significantly smaller file sizes. Using these models can reduce inference time by more than 33%. For more details, refer to Segmind's paper: https://arxiv.org/abs/2401.02677v1.
+Similarly, SD1.x- and SD2.x-style models with a tiny U-Net consist of only 6 U-Net blocks, leading to very small files and time savings of up to 50%. For more information, see the paper: https://arxiv.org/pdf/2305.15798.pdf.

 ## SSD1B

-Unfortunately not all of this models follow the standard model parameter naming mapping. 
-Anyway there are some very useful SSD1B models available online, such as:
+Note that not all of these models follow the standard parameter naming conventions. However, several useful SSD-1B models are available online, such as:

 * https://huggingface.co/segmind/SSD-1B/resolve/main/SSD-1B-A1111.safetensors
- * https://huggingface.co/hassenhamdi/SSD-1B-fp8_e4m3fn/resolve/main/SSD-1B_fp8_e4m3fn.safetensors 
+ * https://huggingface.co/hassenhamdi/SSD-1B-fp8_e4m3fn/resolve/main/SSD-1B_fp8_e4m3fn.safetensors

-Also there are useful LORAs available:
+Useful LoRAs are also available:

 * https://huggingface.co/seungminh/lora-swarovski-SSD-1B/resolve/main/pytorch_lora_weights.safetensors
- * https://huggingface.co/kylielee505/mylcmlorassd/resolve/main/pytorch_lora_weights.safetensors   
+ * https://huggingface.co/kylielee505/mylcmlorassd/resolve/main/pytorch_lora_weights.safetensors

-You can use this files **out-of-the-box** - unlike models in next section.
+These files can be used out-of-the-box, unlike the models described in the next section.


-## SD1.x with tiny U-Nets
+## SD1.x, SD2.x with tiny U-Nets

-There are some Tiny SD 1.x models available online, such as:
+These models require conversion before use. You will need a Python script provided by the diffusers team, available on GitHub:
+
+ * https://raw.githubusercontent.com/huggingface/diffusers/refs/heads/main/scripts/convert_diffusers_to_original_stable_diffusion.py
+
+### SD2.x
+
+NotaAI provides the following model online:
+
+* https://huggingface.co/nota-ai/bk-sdm-v2-tiny
+
+Creating a .safetensors file involves two steps. First, run this short Python script to download the model from Hugging Face:
+
+```python
+from diffusers import StableDiffusionPipeline
+pipe = StableDiffusionPipeline.from_pretrained("nota-ai/bk-sdm-v2-tiny",cache_dir="./")
+```
+
+Second, create the .safetensors file by running:
+
+```bash
+python convert_diffusers_to_original_stable_diffusion.py \
+      --model_path  models--nota-ai--bk-sdm-v2-tiny/snapshots/68277af553777858cd47e133f92e4db47321bc74 \
+      --checkpoint_path bk-sdm-v2-tiny.safetensors --half --use_safetensors
+```
+
+This will generate the file **bk-sdm-v2-tiny.safetensors**, which is now ready for use with sd.cpp.
+
+### SD1.x
+
+Several Tiny SD 1.x models are available online, such as:

 * https://huggingface.co/segmind/tiny-sd
 * https://huggingface.co/segmind/portrait-finetuned
 * https://huggingface.co/nota-ai/bk-sdm-tiny

-These models need some conversion, for example because partially tensors are **non contiguous** stored. To create a usable checkpoint file, follow these **easy** steps:
+These models also require conversion, partly because some tensors are stored in a non-contiguous manner. To create a usable checkpoint file, follow these simple steps:
+Download and prepare the model using Python: 

-### Download model from Hugging Face
-
-Download the model using Python on your computer, for example this way:
+##### Download the model using Python on your computer, for example this way:

 ```python
 import torch
@ -46,35 +72,22 @@ for param in unet.parameters():
 pipe.save_pretrained("segmindtiny-sd", safe_serialization=True)
 ```

-### Convert that to a ckpt file 
-
-To convert the downloaded model to a checkpoint file, you need another Python script. Download the conversion script from here:
-
- * https://raw.githubusercontent.com/huggingface/diffusers/refs/heads/main/scripts/convert_diffusers_to_original_stable_diffusion.py
-
-
-### Run convert script
-
-Now, run that conversion script:
+##### Run the conversion script:

 ```bash
 python convert_diffusers_to_original_stable_diffusion.py \
-	--model_path  ./segmindtiny-sd \
-	--checkpoint_path ./segmind_tiny-sd.ckpt --half
+      --model_path  ./segmindtiny-sd \
+      --checkpoint_path ./segmind_tiny-sd.ckpt --half
 ```

-The file **segmind_tiny-sd.ckpt**  will be generated and is now ready to use with sd.cpp
-
-You can follow a similar process for other models mentioned above from Hugging Face. 
+The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.


-### Another ckpt file on the net
-
-There is another model file available online: 
+### Another available .ckpt file:

 * https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt
- 
-If you want to use that, you have to adjust some **non-contiguous tensors** first:
+
+To use this file, you must first adjust its non-contiguous tensors:

 ```python
 import torch
--- a/docs/flux.md
+++ b/docs/flux.md
@ -15,7 +15,7 @@ You can run Flux using stable-diffusion.cpp with a GPU that has 6GB or even 4GB

 You can download the preconverted gguf weights from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell](https://huggingface.co/leejet/FLUX.1-schnell-gguf), this way you don't have to do the conversion yourself.

-Using fp16 will lead to overflow, but ggml's support for bf16 is not yet fully developed. Therefore, we need to convert flux to gguf format here, which also saves VRAM. For example:
+If you prefer to convert the weights to gguf yourself, for example:
 ```
 .\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-dev.sft -o ..\models\flux1-dev-q8_0.gguf -v --type q8_0
 ```
--- a/docs/flux2.md
+++ b/docs/flux2.md
@ -0,0 +1,21 @@
+# How to Use
+
+## Download weights
+
+- Download FLUX.2-dev
+    - gguf: https://huggingface.co/city96/FLUX.2-dev-gguf/tree/main
+- Download vae
+    - safetensors: https://huggingface.co/black-forest-labs/FLUX.2-dev/tree/main
+- Download Mistral-Small-3.2-24B-Instruct-2506-GGUF
+    - gguf: https://huggingface.co/unsloth/Mistral-Small-3.2-24B-Instruct-2506-GGUF/tree/main
+
+## Examples
+
+```
+.\bin\Release\sd.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\flux2-dev-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\Mistral-Small-3.2-24B-Instruct-2506-Q4_K_M.gguf -r .\kontext_input.png -p "change 'flux.cpp' to 'flux2-dev.cpp'" --cfg-scale 1.0 --sampling-method euler -v --diffusion-fa --offload-to-cpu
+```
+
+<img alt="flux2 example" src="../assets/flux2/example.png" />
+
+
+
--- a/docs/lora.md
+++ b/docs/lora.md
@ -12,38 +12,15 @@ Here's a simple example:

 `../models/marblesh.safetensors` or `../models/marblesh.ckpt` will be applied to the model

-# Support matrix
+# Lora Apply Mode

-> ℹ️ CUDA `get_rows` support is defined here:  
-> [ggml-org/ggml/src/ggml-cuda/getrows.cu#L156](https://github.com/ggml-org/ggml/blob/7dee1d6a1e7611f238d09be96738388da97c88ed/src/ggml-cuda/getrows.cu#L156)  
-> Currently only the basic types + Q4/Q5/Q8 are implemented. K-quants are **not** supported.
+There are two ways to apply LoRA: **immediately** and **at_runtime**. You can specify it using the `--lora-apply-mode` parameter.

-NOTE: The other backends may have different support.
+By default, the mode is selected automatically:
+
+* If the model weights contain any quantized parameters, the **at_runtime** mode is used;
+* Otherwise, the **immediately** mode is used.
+
+The **immediately** mode may have precision and compatibility issues with quantized parameters, but it usually offers faster inference speed and, in some cases, lower memory usage.
+In contrast, the **at_runtime** mode provides better compatibility and higher precision, but inference may be slower and memory usage may be higher in some cases.

-| Quant / Type | CUDA | Vulkan |
-|--------------|------|--------|
-| F32          | ✔️   | ✔️   |
-| F16          | ✔️   | ✔️   |
-| BF16         | ✔️   | ✔️   |
-| I32          | ✔️   | ❌   |
-| Q4_0         | ✔️   | ✔️   |
-| Q4_1         | ✔️   | ✔️   |
-| Q5_0         | ✔️   | ✔️   |
-| Q5_1         | ✔️   | ✔️   |
-| Q8_0         | ✔️   | ✔️   |
-| Q2_K         | ❌   | ❌   |
-| Q3_K         | ❌   | ❌   |
-| Q4_K         | ❌   | ❌   |
-| Q5_K         | ❌   | ❌   |
-| Q6_K         | ❌   | ❌   |
-| Q8_K         | ❌   | ❌   |
-| IQ1_S        | ❌   | ✔️   |
-| IQ1_M        | ❌   | ✔️   |
-| IQ2_XXS      | ❌   | ✔️   |
-| IQ2_XS       | ❌   | ✔️   |
-| IQ2_S        | ❌   | ✔️   |
-| IQ3_XXS      | ❌   | ✔️   |
-| IQ3_S        | ❌   | ✔️   |
-| IQ4_XS       | ❌   | ✔️   |
-| IQ4_NL       | ❌   | ✔️   |
-| MXFP4        | ❌   | ✔️   |
--- a/docs/ovis_image.md
+++ b/docs/ovis_image.md
@ -0,0 +1,19 @@
+# How to Use
+
+## Download weights
+
+- Download Ovis-Image-7B
+    - safetensors: https://huggingface.co/Comfy-Org/Ovis-Image/tree/main/split_files/diffusion_models
+    - gguf: https://huggingface.co/leejet/Ovis-Image-7B-GGUF
+- Download vae
+    - safetensors: https://huggingface.co/black-forest-labs/FLUX.1-schnell/tree/main
+- Download Ovis 2.5
+    - safetensors: https://huggingface.co/Comfy-Org/Ovis-Image/tree/main/split_files/text_encoders
+
+## Examples
+
+```
+.\bin\Release\sd.exe --diffusion-model  ovis_image-Q4_0.gguf --vae ..\..\ComfyUI\models\vae\ae.sft  --llm ..\..\ComfyUI\models\text_encoders\ovis_2.5.safetensors -p "a lovely cat" --cfg-scale 5.0 -v --offload-to-cpu --diffusion-fa
+```
+
+<img alt="ovis image example" src="../assets/ovis_image/example.png" />
--- a/docs/photo_maker.md
+++ b/docs/photo_maker.md
@ -40,7 +40,7 @@ Running PMV2 is now a two-step process:
 ```
 python face_detect.py input_image_dir
 ```
-An ```id_embeds.safetensors``` file will be generated in ```input_images_dir```
+An ```id_embeds.bin``` file will be generated in ```input_image_dir```

 **Note: this step is only needed to run once; the same ```id_embeds``` can be reused**

@ -48,6 +48,6 @@ An ```id_embeds.safetensors``` file will be generated in ```input_images_dir```

  You can download ```photomaker-v2.safetensors``` from [here](https://huggingface.co/bssrdf/PhotoMakerV2)

- All the command line parameters from Version 1 remain the same for Version 2
+- All the command line parameters from Version 1 remain the same for Version 2, plus one extra parameter pointing to a valid ```id_embeds``` file: ```--pm-id-embed-path [path_to_id_embeds.bin]```


--- a/docs/qwen_image.md
+++ b/docs/qwen_image.md
@ -14,7 +14,7 @@
 ## Examples

 ```
-.\bin\Release\sd.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\qwen-image-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors  --qwen2vl ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf  -p '一个穿着"QWEN"标志的T恤的中国美女正拿着黑色的马克笔面相镜头微笑。她身后的玻璃板上手写体写着 “一、Qwen-Image的技术路线： 探索视觉生成基础模型的极限，开创理解与生成一体化的未来。二、Qwen-Image的模型特色：1、复杂文字渲染。支持中英渲染、自动布局； 2、精准图像编辑。支持文字编辑、物体增减、风格变换。三、Qwen-Image的未来愿景：赋能专业内容创作、助力生成式AI发展。”' --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu -H 1024 -W 1024 --diffusion-fa --flow-shift 3
+.\bin\Release\sd.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\qwen-image-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf  -p '一个穿着"QWEN"标志的T恤的中国美女正拿着黑色的马克笔面相镜头微笑。她身后的玻璃板上手写体写着 “一、Qwen-Image的技术路线： 探索视觉生成基础模型的极限，开创理解与生成一体化的未来。二、Qwen-Image的模型特色：1、复杂文字渲染。支持中英渲染、自动布局； 2、精准图像编辑。支持文字编辑、物体增减、风格变换。三、Qwen-Image的未来愿景：赋能专业内容创作、助力生成式AI发展。”' --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu -H 1024 -W 1024 --diffusion-fa --flow-shift 3
 ```

 <img alt="qwen example" src="../assets/qwen/example.png" />
--- a/docs/qwen_image_edit.md
+++ b/docs/qwen_image_edit.md
@ -20,7 +20,7 @@
 ### Qwen Image Edit

 ```
-.\bin\Release\sd.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Qwen_Image_Edit-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors  --qwen2vl ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'" --seed 1118877715456453
+.\bin\Release\sd.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Qwen_Image_Edit-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'" --seed 1118877715456453
 ```

 <img alt="qwen_image_edit" src="../assets/qwen/qwen_image_edit.png" />
@ -29,7 +29,7 @@
 ### Qwen Image Edit 2509

 ```
-.\bin\Release\sd.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Qwen-Image-Edit-2509-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors  --qwen2vl ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf --qwen2vl_vision ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct.mmproj-Q8_0.gguf --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'Qwen Image Edit 2509'"
+.\bin\Release\sd.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Qwen-Image-Edit-2509-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf --llm_vision ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct.mmproj-Q8_0.gguf --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'Qwen Image Edit 2509'"
 ```

 <img alt="qwen_image_edit_2509" src="../assets/qwen/qwen_image_edit_2509.png" />
--- a/docs/z_image.md
+++ b/docs/z_image.md
@ -0,0 +1,28 @@
+# How to Use
+
+You can run Z-Image with stable-diffusion.cpp on GPUs with 4GB of VRAM — or even less.
+
+## Download weights
+
+- Download Z-Image-Turbo
+    - safetensors: https://huggingface.co/Comfy-Org/z_image_turbo/tree/main/split_files/diffusion_models
+    - gguf: https://huggingface.co/leejet/Z-Image-Turbo-GGUF/tree/main
+- Download vae
+    - safetensors: https://huggingface.co/black-forest-labs/FLUX.1-schnell/tree/main
+- Download Qwen3 4b
+    - safetensors: https://huggingface.co/Comfy-Org/z_image_turbo/tree/main/split_files/text_encoders
+    - gguf: https://huggingface.co/unsloth/Qwen3-4B-Instruct-2507-GGUF/tree/main
+
+## Examples
+
+```
+.\bin\Release\sd.exe --diffusion-model  z_image_turbo-Q3_K.gguf --vae ..\..\ComfyUI\models\vae\ae.sft  --llm ..\..\ComfyUI\models\text_encoders\Qwen3-4B-Instruct-2507-Q4_K_M.gguf -p "A cinematic, melancholic photograph of a solitary hooded figure walking through a sprawling, rain-slicked metropolis at night. The city lights are a chaotic blur of neon orange and cool blue, reflecting on the wet asphalt. The scene evokes a sense of being a single component in a vast machine. Superimposed over the image in a sleek, modern, slightly glitched font is the philosophical quote: 'THE CITY IS A CIRCUIT BOARD, AND I AM A BROKEN TRANSISTOR.' -- moody, atmospheric, profound, dark academic" --cfg-scale 1.0 -v --offload-to-cpu --diffusion-fa -H 1024 -W 512
+```
+
+<img width="256" alt="z-image example" src="../assets/z_image/q3_K.png" />
+
+## Comparison of Different Quantization Types
+
+| bf16 | q8_0 | q6_K | q5_0 | q4_K | q4_0 | q3_K | q2_K|
+|---|---|---|---|---|---|---|---|
+| <img width="256" alt="bf16" src="../assets/z_image/bf16.png" /> | <img width="256" alt="q8_0" src="../assets/z_image/q8_0.png" /> | <img width="256" alt="q6_K" src="../assets/z_image/q6_K.png" /> | <img width="256" alt="q5_0" src="../assets/z_image/q5_0.png" />  | <img width="256" alt="q4_K" src="../assets/z_image/q4_K.png" /> | <img width="256" alt="q4_0" src="../assets/z_image/q4_0.png" /> | <img width="256" alt="q3_K" src="../assets/z_image/q3_K.png" /> | <img width="256" alt="q2_K" src="../assets/z_image/q2_K.png" /> |
--- a/easycache.hpp
+++ b/easycache.hpp
@ -0,0 +1,265 @@
+#include <cmath>
+#include <limits>
+#include <unordered_map>
+#include <vector>
+
+#include "denoiser.hpp"
+#include "ggml_extend.hpp"
+
+// User-facing EasyCache settings. EasyCacheState::init() copies this struct
+// and derives its sigma window from start_percent / end_percent.
+struct EasyCacheConfig {
+    // Master switch; EasyCacheState::init() only marks itself initialized when this is true.
+    bool enabled          = false;
+    // EasyCacheState::before_condition() reuses the cached result while the accumulated
+    // estimated output change rate stays below this value.
+    float reuse_threshold = 0.2f;
+    // Bounds of the caching window; converted to sigmas by
+    // EasyCacheState::percent_to_sigma() (start -> start_sigma, end -> end_sigma).
+    float start_percent   = 0.15f;
+    float end_percent     = 0.95f;
+};
+
+// Cached data for one condition: the element-wise (output - input) difference
+// written by EasyCacheState::update_cache() and re-added by apply_cache().
+struct EasyCacheCacheEntry {
+    std::vector<float> diff;
+};
+
+// Runtime state of the EasyCache step-skipping heuristic.
+//
+// For each condition it remembers the last (output - input) difference, and for
+// the "anchor" condition (the first condition seen on an active step) it tracks
+// how much input and output changed between steps. before_condition() uses these
+// statistics to decide whether the cached difference can be reused instead of a
+// fresh evaluation; after_condition() refreshes them from a real evaluation.
+// NOTE(review): the calling protocol (before_condition -> model -> after_condition)
+// is inferred from the method names; the caller is not visible here - confirm.
+struct EasyCacheState {
+    EasyCacheConfig config;
+    Denoiser* denoiser                  = nullptr;
+    // Sigma window derived from config by init(); a step is active only when
+    // end_sigma < sigma <= start_sigma (see begin_step()).
+    float start_sigma                   = std::numeric_limits<float>::max();
+    float end_sigma                     = 0.0f;
+    // True once init() was called with an enabled config and a non-null denoiser.
+    bool initialized                    = false;
+    // True until the first active step picks the anchor condition.
+    bool initial_step                   = true;
+    // Set by before_condition() when the current step is served from cache.
+    bool skip_current_step              = false;
+    // Whether the current step's sigma lies inside the caching window.
+    bool step_active                    = false;
+    // The condition whose input/output statistics drive the skip decision.
+    const SDCondition* anchor_condition = nullptr;
+    // Cached (output - input) difference, keyed by condition pointer.
+    std::unordered_map<const SDCondition*, EasyCacheCacheEntry> cache_diffs;
+    // Snapshots of the anchor's input/output from the last real evaluation.
+    std::vector<float> prev_input;
+    std::vector<float> prev_output;
+    // Mean absolute value of the anchor's last real output.
+    float output_prev_norm                = 0.0f;
+    bool has_prev_input                   = false;
+    bool has_prev_output                  = false;
+    bool has_output_prev_norm             = false;
+    bool has_relative_transformation_rate = false;
+    // Ratio output_change / last_input_change measured in after_condition().
+    float relative_transformation_rate    = 0.0f;
+    // Sum of estimated output change rates since the last real evaluation or reset.
+    float cumulative_change_rate          = 0.0f;
+    // Mean absolute difference between the anchor's current input and prev_input.
+    float last_input_change               = 0.0f;
+    bool has_last_input_change            = false;
+    // Number of steps skipped on the anchor condition since the last reset.
+    int total_steps_skipped               = 0;
+    // Step index last seen by begin_step(); -1 means none yet.
+    int current_step_index                = -1;
+
+    // Clears all per-run tracking (flags, caches, statistics, counters).
+    // Leaves config, denoiser, initialized, start_sigma and end_sigma untouched.
+    void reset_runtime() {
+        initial_step      = true;
+        skip_current_step = false;
+        step_active       = false;
+        anchor_condition  = nullptr;
+        cache_diffs.clear();
+        prev_input.clear();
+        prev_output.clear();
+        output_prev_norm                 = 0.0f;
+        has_prev_input                   = false;
+        has_prev_output                  = false;
+        has_output_prev_norm             = false;
+        has_relative_transformation_rate = false;
+        relative_transformation_rate     = 0.0f;
+        cumulative_change_rate           = 0.0f;
+        last_input_change                = 0.0f;
+        has_last_input_change            = false;
+        total_steps_skipped              = 0;
+        current_step_index               = -1;
+    }
+
+    // Stores the config and denoiser, resets runtime state, and (only when the
+    // cache ends up initialized) recomputes the sigma window from the config.
+    // When not initialized, start_sigma / end_sigma keep their previous values.
+    void init(const EasyCacheConfig& cfg, Denoiser* d) {
+        config      = cfg;
+        denoiser    = d;
+        initialized = cfg.enabled && d != nullptr;
+        reset_runtime();
+        if (initialized) {
+            start_sigma = percent_to_sigma(config.start_percent);
+            end_sigma   = percent_to_sigma(config.end_percent);
+        }
+    }
+
+    // True when init() succeeded and the stored config is enabled.
+    bool enabled() const {
+        return initialized && config.enabled;
+    }
+
+    // Maps a progress fraction to a sigma value.
+    // Returns 0 without a denoiser, float max for percent <= 0, 0 for percent >= 1;
+    // otherwise asks the denoiser for the sigma at t = (1 - percent) * (TIMESTEPS - 1).
+    float percent_to_sigma(float percent) const {
+        if (!denoiser) {
+            return 0.0f;
+        }
+        if (percent <= 0.0f) {
+            return std::numeric_limits<float>::max();
+        }
+        if (percent >= 1.0f) {
+            return 0.0f;
+        }
+        float t = (1.0f - percent) * (TIMESTEPS - 1);
+        return denoiser->t_to_sigma(t);
+    }
+
+    // Starts tracking a new step. No-op when disabled or when step_index equals
+    // the current one. Clears the per-step flags, then marks the step active
+    // only if end_sigma < sigma <= start_sigma.
+    void begin_step(int step_index, float sigma) {
+        if (!enabled()) {
+            return;
+        }
+        if (step_index == current_step_index) {
+            return;
+        }
+        current_step_index    = step_index;
+        skip_current_step     = false;
+        has_last_input_change = false;
+        step_active           = false;
+        if (sigma > start_sigma) {
+            return;
+        }
+        // Written as a negation so that a NaN sigma also leaves the step inactive.
+        if (!(sigma > end_sigma)) {
+            return;
+        }
+        step_active = true;
+    }
+
+    // True when caching is enabled and the current step is inside the window.
+    bool step_is_active() const {
+        return enabled() && step_active;
+    }
+
+    // True when the current active step has been marked as served from cache.
+    bool is_step_skipped() const {
+        return enabled() && step_active && skip_current_step;
+    }
+
+    // True when a non-empty cached difference exists for cond.
+    bool has_cache(const SDCondition* cond) const {
+        auto it = cache_diffs.find(cond);
+        return it != cache_diffs.end() && !it->second.diff.empty();
+    }
+
+    // Stores (output - input) element-wise as the cached difference for cond,
+    // creating the map entry if needed. The element count is taken from output.
+    // NOTE(review): both tensors' data are read as float and input is assumed to
+    // hold at least as many elements as output - neither is checked here.
+    void update_cache(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) {
+        EasyCacheCacheEntry& entry = cache_diffs[cond];
+        size_t ne                  = static_cast<size_t>(ggml_nelements(output));
+        entry.diff.resize(ne);
+        float* out_data = (float*)output->data;
+        float* in_data  = (float*)input->data;
+        for (size_t i = 0; i < ne; ++i) {
+            entry.diff[i] = out_data[i] - in_data[i];
+        }
+    }
+
+    // Reconstructs output as input + cached difference for cond.
+    // Does nothing when cond has no (non-empty) cache entry.
+    // NOTE(review): the loop is bounded by diff.size(), not by output's element
+    // count; this assumes the tensor shape has not changed since update_cache().
+    void apply_cache(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) {
+        auto it = cache_diffs.find(cond);
+        if (it == cache_diffs.end() || it->second.diff.empty()) {
+            return;
+        }
+        // presumably copy_ggml_tensor(dst, src) - confirm argument order in ggml_extend.hpp
+        copy_ggml_tensor(output, input);
+        float* out_data                = (float*)output->data;
+        const std::vector<float>& diff = it->second.diff;
+        for (size_t i = 0; i < diff.size(); ++i) {
+            out_data[i] += diff[i];
+        }
+    }
+
+    // Decides whether the evaluation of cond on this step can be replaced by the
+    // cached result. Returns true when output has been filled from the cache,
+    // false when nothing was written and a real evaluation is needed.
+    //
+    // Only the anchor condition makes the skip decision; once a step is marked
+    // skipped, every other condition with a cache entry is served from cache too.
+    bool before_condition(const SDCondition* cond,
+                          ggml_tensor* input,
+                          ggml_tensor* output,
+                          float sigma,
+                          int step_index) {
+        if (!enabled() || step_index < 0) {
+            return false;
+        }
+        if (step_index != current_step_index) {
+            begin_step(step_index, sigma);
+        }
+        if (!step_active) {
+            return false;
+        }
+        // The first condition seen on an active step becomes the anchor.
+        if (initial_step) {
+            anchor_condition = cond;
+            initial_step     = false;
+        }
+        bool is_anchor = (cond == anchor_condition);
+        // Step already marked skipped: reuse the cache if this condition has one.
+        if (skip_current_step) {
+            if (has_cache(cond)) {
+                apply_cache(cond, input, output);
+                return true;
+            }
+            return false;
+        }
+        if (!is_anchor) {
+            return false;
+        }
+        // A decision needs a previous real evaluation of the anchor to compare against.
+        if (!has_prev_input || !has_prev_output || !has_cache(cond)) {
+            return false;
+        }
+        size_t ne = static_cast<size_t>(ggml_nelements(input));
+        if (prev_input.size() != ne) {
+            return false;
+        }
+        // Mean absolute change of the input since the last real evaluation.
+        float* input_data = (float*)input->data;
+        last_input_change = 0.0f;
+        for (size_t i = 0; i < ne; ++i) {
+            last_input_change += std::fabs(input_data[i] - prev_input[i]);
+        }
+        if (ne > 0) {
+            last_input_change /= static_cast<float>(ne);
+        }
+        has_last_input_change = true;
+
+        if (has_output_prev_norm && has_relative_transformation_rate && last_input_change > 0.0f && output_prev_norm > 0.0f) {
+            // Estimate the relative output change from the input change and accumulate it;
+            // skip while the accumulated estimate stays below the reuse threshold.
+            float approx_output_change_rate = (relative_transformation_rate * last_input_change) / output_prev_norm;
+            cumulative_change_rate += approx_output_change_rate;
+            if (cumulative_change_rate < config.reuse_threshold) {
+                skip_current_step = true;
+                total_steps_skipped++;
+                apply_cache(cond, input, output);
+                return true;
+            } else {
+                cumulative_change_rate = 0.0f;
+            }
+        }
+
+        return false;
+    }
+
+    // Records the result of a real evaluation. No-op unless the step is active.
+    // Always refreshes the cached difference for cond; for the anchor condition
+    // it additionally refreshes the input/output snapshots and the statistics
+    // used by before_condition(), then resets the accumulated change rate.
+    void after_condition(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) {
+        if (!step_is_active()) {
+            return;
+        }
+        update_cache(cond, input, output);
+        if (cond != anchor_condition) {
+            return;
+        }
+
+        size_t ne      = static_cast<size_t>(ggml_nelements(input));
+        float* in_data = (float*)input->data;
+        prev_input.resize(ne);
+        for (size_t i = 0; i < ne; ++i) {
+            prev_input[i] = in_data[i];
+        }
+        has_prev_input = true;
+
+        // Mean absolute change of the output versus the previous snapshot;
+        // stays 0 when there is no previous output of the same size.
+        float* out_data     = (float*)output->data;
+        float output_change = 0.0f;
+        if (has_prev_output && prev_output.size() == ne) {
+            for (size_t i = 0; i < ne; ++i) {
+                output_change += std::fabs(out_data[i] - prev_output[i]);
+            }
+            if (ne > 0) {
+                output_change /= static_cast<float>(ne);
+            }
+        }
+
+        prev_output.resize(ne);
+        for (size_t i = 0; i < ne; ++i) {
+            prev_output[i] = out_data[i];
+        }
+        has_prev_output = true;
+
+        // Mean absolute value of the current output.
+        float mean_abs = 0.0f;
+        for (size_t i = 0; i < ne; ++i) {
+            mean_abs += std::fabs(out_data[i]);
+        }
+        output_prev_norm     = (ne > 0) ? (mean_abs / static_cast<float>(ne)) : 0.0f;
+        has_output_prev_norm = output_prev_norm > 0.0f;
+
+        // Update the output/input change ratio only when both changes were measured and positive.
+        if (has_last_input_change && last_input_change > 0.0f && output_change > 0.0f) {
+            float rate = output_change / last_input_change;
+            if (std::isfinite(rate)) {
+                relative_transformation_rate     = rate;
+                has_relative_transformation_rate = true;
+            }
+        }
+        cumulative_change_rate = 0.0f;
+        has_last_input_change  = false;
+    }
+};
--- a/esrgan.hpp
+++ b/esrgan.hpp
@ -156,9 +156,10 @@ struct ESRGAN : public GGMLRunner {

    ESRGAN(ggml_backend_t backend,
           bool offload_params_to_cpu,
+           int tile_size                                  = 128,
           const String2TensorStorage& tensor_storage_map = {})
        : GGMLRunner(backend, offload_params_to_cpu) {
-        // rrdb_net will be created in load_from_file
+        this->tile_size = tile_size;
    }

    std::string get_desc() override {
@ -169,7 +170,7 @@ struct ESRGAN : public GGMLRunner {
        LOG_INFO("loading esrgan from '%s'", file_path.c_str());

        ModelLoader model_loader;
-        if (!model_loader.init_from_file(file_path)) {
+        if (!model_loader.init_from_file_and_convert_name(file_path)) {
            LOG_ERROR("init esrgan model loader from file failed: '%s'", file_path.c_str());
            return false;
        }
@ -344,7 +345,7 @@ struct ESRGAN : public GGMLRunner {
        if (!rrdb_net)
            return nullptr;
        constexpr int kGraphNodes = 1 << 16;  // 65k
-        struct ggml_cgraph* gf    = ggml_new_graph_custom(compute_ctx, kGraphNodes, /*grads*/ false);
+        struct ggml_cgraph* gf    = new_graph_custom(kGraphNodes);
        x                         = to_backend(x);

        auto runner_ctx         = get_context();
@ -353,14 +354,14 @@ struct ESRGAN : public GGMLRunner {
        return gf;
    }

-    void compute(const int n_threads,
+    bool compute(const int n_threads,
                 struct ggml_tensor* x,
                 ggml_tensor** output,
                 ggml_context* output_ctx = nullptr) {
        auto get_graph = [&]() -> struct ggml_cgraph* {
            return build_graph(x);
        };
-        GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+        return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
    }
 };

--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@ -3,14 +3,30 @@
 ```
 usage: ./bin/sd  [options]

-Options:
+CLI Options:
+  -o, --output <string>       path to write result image to (default: ./output.png)
+  --preview-path <string>     path to write preview image to (default: ./preview.png)
+  --preview-interval <int>    interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at
+                              every step)
+  --canny                     apply canny preprocessor (edge detection)
+  -v, --verbose               print extra info
+  --color                     colors the logging tags according to level
+  --taesd-preview-only        prevents usage of taesd for decoding the final image. (for use with --preview tae)
+  --preview-noisy             enables previewing noisy inputs of the models rather than the denoised outputs
+  -M, --mode                  run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen
+  --preview                   preview method. must be one of the following [none, proj, tae, vae] (default is none)
+  -h, --help                  show this help message and exit
+
+Context Options:
  -m, --model <string>                     path to full model
  --clip_l <string>                        path to the clip-l text encoder
  --clip_g <string>                        path to the clip-g text encoder
  --clip_vision <string>                   path to the clip-vision encoder
  --t5xxl <string>                         path to the t5xxl text encoder
-  --qwen2vl <string>                       path to the qwen2vl text encoder
-  --qwen2vl_vision <string>                path to the qwen2vl vit
+  --llm <string>                           path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)
+  --llm_vision <string>                    path to the llm vit
+  --qwen2vl <string>                       alias of --llm. Deprecated.
+  --qwen2vl_vision <string>                alias of --llm_vision. Deprecated.
  --diffusion-model <string>               path to the standalone diffusion model
  --high-noise-diffusion-model <string>    path to the standalone high noise diffusion model
  --vae <string>                           path to standalone vae model
@ -18,24 +34,52 @@ Options:
  --control-net <string>                   path to control net model
  --embd-dir <string>                      embeddings directory
  --lora-model-dir <string>                lora model directory
-  -i, --init-img <string>                  path to the init image
-  --end-img <string>                       path to the end image, required by flf2v
  --tensor-type-rules <string>             weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
  --photo-maker <string>                   path to PHOTOMAKER model
-  --pm-id-images-dir <string>              path to PHOTOMAKER input id images dir
-  --pm-id-embed-path <string>              path to PHOTOMAKER v2 id embed
+  --upscale-model <string>                 path to esrgan model.
+  -t, --threads <int>                      number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
+                                           CPU physical cores
+  --chroma-t5-mask-pad <int>               t5 mask pad size of chroma
+  --vae-tile-overlap <float>               tile overlap for vae tiling, in fraction of tile size (default: 0.5)
+  --flow-shift <float>                     shift value for Flow models like SD3.x or WAN (default: auto)
+  --vae-tiling                             process vae in tiles to reduce memory usage
+  --force-sdxl-vae-conv-scale              force use of conv scale on sdxl vae
+  --offload-to-cpu                         place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
+  --control-net-cpu                        keep controlnet in cpu (for low vram)
+  --clip-on-cpu                            keep clip in cpu (for low vram)
+  --vae-on-cpu                             keep vae in cpu (for low vram)
+  --diffusion-fa                           use flash attention in the diffusion model
+  --diffusion-conv-direct                  use ggml_conv2d_direct in the diffusion model
+  --vae-conv-direct                        use ggml_conv2d_direct in the vae model
+  --chroma-disable-dit-mask                disable dit mask for chroma
+  --chroma-enable-t5-mask                  enable t5 mask for chroma
+  --type                                   weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
+                                           type of the weight file
+  --rng                                    RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
+  --sampler-rng                            sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
+  --prediction                             prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]
+  --lora-apply-mode                        the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights
+                                           contain any quantized parameters, the at_runtime mode will be used; otherwise,
+                                           immediately will be used. The immediately mode may have precision and
+                                           compatibility issues with quantized parameters, but it usually offers faster inference
+                                           speed and, in some cases, lower memory usage. The at_runtime mode, on the
+                                           other hand, is exactly the opposite.
+  --vae-tile-size                          tile size for vae tiling, format [X]x[Y] (default: 32x32)
+  --vae-relative-tile-size                 relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
+                                           (overrides --vae-tile-size)
+
+Generation Options:
+  -p, --prompt <string>                    the prompt to render
+  -n, --negative-prompt <string>           the negative prompt (default: "")
+  -i, --init-img <string>                  path to the init image
+  --end-img <string>                       path to the end image, required by flf2v
  --mask <string>                          path to the mask image
  --control-image <string>                 path to control image, control net
  --control-video <string>                 path to control video frames, It must be a directory path. The video frames inside should be stored as images in
                                           lexicographical (character) order. For example, if the control video path is
                                           `frames`, the directory contain images such as 00.png, 01.png, ... etc.
-  -o, --output <string>                    path to write result image to (default: ./output.png)
-  -p, --prompt <string>                    the prompt to render
-  -n, --negative-prompt <string>           the negative prompt (default: "")
-  --upscale-model <string>                 path to esrgan model.
-  -t, --threads <int>                      number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
-                                           CPU physical cores
-  --upscale-repeats <int>                  Run the ESRGAN upscaler this many times (default: 1)
+  --pm-id-images-dir <string>              path to PHOTOMAKER input id images dir
+  --pm-id-embed-path <string>              path to PHOTOMAKER v2 id embed
  -H, --height <int>                       image height, in pixel space (default: 512)
  -W, --width <int>                        image width, in pixel space (default: 512)
  --steps <int>                            number of sample steps (default: 20)
@ -43,11 +87,11 @@ Options:
  --clip-skip <int>                        ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified,
                                           will be 1 for SD1.x, 2 for SD2.x
  -b, --batch-count <int>                  batch count
-  --chroma-t5-mask-pad <int>               t5 mask pad size of chroma
  --video-frames <int>                     video frames (default: 1)
  --fps <int>                              fps (default: 24)
  --timestep-shift <int>                   shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for
                                           NitroSD-Vibrant
+  --upscale-repeats <int>                  Run the ESRGAN upscaler this many times (default: 1)
  --cfg-scale <float>                      unconditional guidance scale: (default: 7.0)
  --img-cfg-scale <float>                  image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
  --guidance <float>                       distilled guidance scale for models with guidance input (default: 3.5)
@ -67,44 +111,18 @@ Options:
  --pm-style-strength <float>
  --control-strength <float>               strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image
  --moe-boundary <float>                   timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1
-  --flow-shift <float>                     shift value for Flow models like SD3.x or WAN (default: auto)
  --vace-strength <float>                  wan vace strength
-  --vae-tile-overlap <float>               tile overlap for vae tiling, in fraction of tile size (default: 0.5)
-  --vae-tiling                             process vae in tiles to reduce memory usage
-  --force-sdxl-vae-conv-scale              force use of conv scale on sdxl vae
-  --offload-to-cpu                         place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
-  --control-net-cpu                        keep controlnet in cpu (for low vram)
-  --clip-on-cpu                            keep clip in cpu (for low vram)
-  --vae-on-cpu                             keep vae in cpu (for low vram)
-  --diffusion-fa                           use flash attention in the diffusion model
-  --diffusion-conv-direct                  use ggml_conv2d_direct in the diffusion model
-  --vae-conv-direct                        use ggml_conv2d_direct in the vae model
-  --canny                                  apply canny preprocessor (edge detection)
-  -v, --verbose                            print extra info
-  --color                                  colors the logging tags according to level
-  --chroma-disable-dit-mask                disable dit mask for chroma
-  --chroma-enable-t5-mask                  enable t5 mask for chroma
  --increase-ref-index                     automatically increase the indices of reference images based on the order they are listed (starting with 1).
  --disable-auto-resize-ref-image          disable auto resize of ref images
-  -M, --mode                               run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen
-  --type                                   weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
-                                           type of the weight file
-  --rng                                    RNG, one of [std_default, cuda], default: cuda
  -s, --seed                               RNG seed (default: 42, use random seed for < 0)
  --sampling-method                        sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
                                           tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise)
-  --prediction                             prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow]
-  --scheduler                              denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple], default:
-                                           discrete
-  --skip-layers                            layers to skip for SLG steps (default: [7,8,9])
  --high-noise-sampling-method             (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
                                           ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise
-  --high-noise-scheduler                   (high noise) denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform,
-                                           simple], default: discrete
+  --scheduler                              denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, lcm],
+                                           default: discrete
+  --skip-layers                            layers to skip for SLG steps (default: [7,8,9])
  --high-noise-skip-layers                 (high noise) layers to skip for SLG steps (default: [7,8,9])
  -r, --ref-image                          reference image for Flux Kontext models (can be used multiple times)
-  -h, --help                               show this help message and exit
-  --vae-tile-size                          tile size for vae tiling, format [X]x[Y] (default: 32x32)
-  --vae-relative-tile-size                 relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
-                                           (overrides --vae-tile-size)
-```
+  --easycache                              enable EasyCache for DiT models with optional "threshold,start_percent,end_percent" (default: 0.2,0.15,0.95)
+```
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
--- a/flux.hpp
+++ b/flux.hpp
@ -14,9 +14,9 @@ namespace Flux {

    struct MLPEmbedder : public UnaryBlock {
    public:
-        MLPEmbedder(int64_t in_dim, int64_t hidden_dim) {
-            blocks["in_layer"]  = std::shared_ptr<GGMLBlock>(new Linear(in_dim, hidden_dim, true));
-            blocks["out_layer"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_dim, hidden_dim, true));
+        MLPEmbedder(int64_t in_dim, int64_t hidden_dim, bool bias = true) {
+            blocks["in_layer"]  = std::shared_ptr<GGMLBlock>(new Linear(in_dim, hidden_dim, bias));
+            blocks["out_layer"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_dim, hidden_dim, bias));
        }

        struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
@ -89,12 +89,13 @@ namespace Flux {
    public:
        SelfAttention(int64_t dim,
                      int64_t num_heads = 8,
-                      bool qkv_bias     = false)
+                      bool qkv_bias     = false,
+                      bool proj_bias    = true)
            : num_heads(num_heads) {
            int64_t head_dim = dim / num_heads;
            blocks["qkv"]    = std::shared_ptr<GGMLBlock>(new Linear(dim, dim * 3, qkv_bias));
            blocks["norm"]   = std::shared_ptr<GGMLBlock>(new QKNorm(head_dim));
-            blocks["proj"]   = std::shared_ptr<GGMLBlock>(new Linear(dim, dim));
+            blocks["proj"]   = std::shared_ptr<GGMLBlock>(new Linear(dim, dim, proj_bias));
        }

        std::vector<struct ggml_tensor*> pre_attention(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
@ -133,6 +134,54 @@ namespace Flux {
        }
    };

+    // Two-layer feed-forward block: Linear "0" -> activation -> Linear "2".
+    // Only indices 0 and 2 are registered as sub-blocks; the activation between
+    // them is applied inline in forward().
+    struct MLP : public UnaryBlock {
+        // Selects which activation forward() applies between the two projections.
+        bool use_mlp_silu_act;
+
+    public:
+        // hidden_size:       input width of "0" and output width of "2".
+        // intermediate_size: input width of "2".
+        // use_mlp_silu_act:  when true, "0" is built twice as wide
+        //                    (intermediate_size * 2) and ggml_ext_silu_act is used
+        //                    instead of GELU.
+        // bias:              forwarded to both Linear layers.
+        MLP(int64_t hidden_size, int64_t intermediate_size, bool use_mlp_silu_act = false, bool bias = false)
+            : use_mlp_silu_act(use_mlp_silu_act) {
+            int64_t mlp_mult_factor = use_mlp_silu_act ? 2 : 1;
+            blocks["0"]             = std::make_shared<Linear>(hidden_size, intermediate_size * mlp_mult_factor, bias);
+            blocks["2"]             = std::make_shared<Linear>(intermediate_size, hidden_size, bias);
+        }
+
+        // Runs x through "0", the selected activation, then "2", and returns the result.
+        // Marked `override` so the compiler checks it against UnaryBlock::forward;
+        // callers invoke it through a UnaryBlock pointer.
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
+            auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["0"]);
+            auto mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["2"]);
+
+            x = mlp_0->forward(ctx, x);
+            if (use_mlp_silu_act) {
+                // NOTE(review): presumably ggml_ext_silu_act halves the last dimension
+                // (gated SiLU), since "2" expects intermediate_size inputs while "0"
+                // emits intermediate_size * 2 — confirm against its definition.
+                x = ggml_ext_silu_act(ctx->ggml_ctx, x);
+            } else {
+                x = ggml_gelu_inplace(ctx->ggml_ctx, x);
+            }
+            x = mlp_2->forward(ctx, x);
+            return x;
+        }
+    };
+
+    // Gated feed-forward block computing
+    //   down_proj(up_proj(x) * silu(gate_proj(x))).
+    struct YakMLP : public UnaryBlock {
+    public:
+        // hidden_size:       input width of gate_proj/up_proj and output width of down_proj.
+        // intermediate_size: output width of gate_proj/up_proj and input width of down_proj.
+        // bias:              forwarded to all three Linear layers.
+        YakMLP(int64_t hidden_size, int64_t intermediate_size, bool bias = true) {
+            blocks["gate_proj"] = std::make_shared<Linear>(hidden_size, intermediate_size, bias);
+            blocks["up_proj"]   = std::make_shared<Linear>(hidden_size, intermediate_size, bias);
+            blocks["down_proj"] = std::make_shared<Linear>(intermediate_size, hidden_size, bias);
+        }
+
+        // Applies the gated projection to x and returns the result.
+        // Marked `override` so the compiler checks it against UnaryBlock::forward;
+        // callers invoke it through a UnaryBlock pointer.
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
+            auto gate_proj = std::dynamic_pointer_cast<Linear>(blocks["gate_proj"]);
+            auto up_proj   = std::dynamic_pointer_cast<Linear>(blocks["up_proj"]);
+            auto down_proj = std::dynamic_pointer_cast<Linear>(blocks["down_proj"]);
+
+            // The gate must be computed from the original x before x is overwritten
+            // by the up projection below.
+            auto gate = gate_proj->forward(ctx, x);
+            gate      = ggml_silu_inplace(ctx->ggml_ctx, gate);
+            x         = up_proj->forward(ctx, x);
+            x         = ggml_mul(ctx->ggml_ctx, x, gate);
+            x         = down_proj->forward(ctx, x);
+            return x;
+        }
+    };
+
    struct ModulationOut {
        ggml_tensor* shift = nullptr;
        ggml_tensor* scale = nullptr;
@ -155,10 +204,10 @@ namespace Flux {
        int multiplier;

    public:
-        Modulation(int64_t dim, bool is_double)
+        Modulation(int64_t dim, bool is_double, bool bias = true)
            : is_double(is_double) {
            multiplier    = is_double ? 6 : 3;
-            blocks["lin"] = std::shared_ptr<GGMLBlock>(new Linear(dim, dim * multiplier));
+            blocks["lin"] = std::shared_ptr<GGMLBlock>(new Linear(dim, dim * multiplier, bias));
        }

        std::vector<ModulationOut> forward(GGMLRunnerContext* ctx, struct ggml_tensor* vec) {
@ -203,32 +252,41 @@ namespace Flux {
        DoubleStreamBlock(int64_t hidden_size,
                          int64_t num_heads,
                          float mlp_ratio,
-                          int idx        = 0,
-                          bool qkv_bias  = false,
-                          bool prune_mod = false)
+                          int idx               = 0,
+                          bool qkv_bias         = false,
+                          bool prune_mod        = false,
+                          bool share_modulation = false,
+                          bool mlp_proj_bias    = true,
+                          bool use_yak_mlp      = false,
+                          bool use_mlp_silu_act = false)
            : idx(idx), prune_mod(prune_mod) {
            int64_t mlp_hidden_dim = hidden_size * mlp_ratio;
-            if (!prune_mod) {
+
+            if (!prune_mod && !share_modulation) {
                blocks["img_mod"] = std::shared_ptr<GGMLBlock>(new Modulation(hidden_size, true));
            }
            blocks["img_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
-            blocks["img_attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qkv_bias));
+            blocks["img_attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qkv_bias, mlp_proj_bias));

            blocks["img_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
-            blocks["img_mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, mlp_hidden_dim));
-            // img_mlp.1 is nn.GELU(approximate="tanh")
-            blocks["img_mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(mlp_hidden_dim, hidden_size));
+            if (use_yak_mlp) {
+                blocks["img_mlp"] = std::shared_ptr<GGMLBlock>(new YakMLP(hidden_size, mlp_hidden_dim, mlp_proj_bias));
+            } else {
+                blocks["img_mlp"] = std::shared_ptr<GGMLBlock>(new MLP(hidden_size, mlp_hidden_dim, use_mlp_silu_act, mlp_proj_bias));
+            }

-            if (!prune_mod) {
+            if (!prune_mod && !share_modulation) {
                blocks["txt_mod"] = std::shared_ptr<GGMLBlock>(new Modulation(hidden_size, true));
            }
            blocks["txt_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
-            blocks["txt_attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qkv_bias));
+            blocks["txt_attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qkv_bias, mlp_proj_bias));

            blocks["txt_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
-            blocks["txt_mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, mlp_hidden_dim));
-            // img_mlp.1 is nn.GELU(approximate="tanh")
-            blocks["txt_mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(mlp_hidden_dim, hidden_size));
+            if (use_yak_mlp) {
+                blocks["txt_mlp"] = std::shared_ptr<GGMLBlock>(new YakMLP(hidden_size, mlp_hidden_dim, mlp_proj_bias));
+            } else {
+                blocks["txt_mlp"] = std::shared_ptr<GGMLBlock>(new MLP(hidden_size, mlp_hidden_dim, use_mlp_silu_act, mlp_proj_bias));
+            }
        }

        std::vector<ModulationOut> get_distil_img_mod(GGMLRunnerContext* ctx, struct ggml_tensor* vec) {
@ -254,7 +312,9 @@ namespace Flux {
                                                                    struct ggml_tensor* txt,
                                                                    struct ggml_tensor* vec,
                                                                    struct ggml_tensor* pe,
-                                                                    struct ggml_tensor* mask = nullptr) {
+                                                                    struct ggml_tensor* mask            = nullptr,
+                                                                    std::vector<ModulationOut> img_mods = {},
+                                                                    std::vector<ModulationOut> txt_mods = {}) {
            // img: [N, n_img_token, hidden_size]
            // txt: [N, n_txt_token, hidden_size]
            // pe: [n_img_token + n_txt_token, d_head/2, 2, 2]
@ -263,31 +323,31 @@ namespace Flux {
            auto img_attn  = std::dynamic_pointer_cast<SelfAttention>(blocks["img_attn"]);

            auto img_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["img_norm2"]);
-            auto img_mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["img_mlp.0"]);
-            auto img_mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["img_mlp.2"]);
+            auto img_mlp   = std::dynamic_pointer_cast<UnaryBlock>(blocks["img_mlp"]);

            auto txt_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["txt_norm1"]);
            auto txt_attn  = std::dynamic_pointer_cast<SelfAttention>(blocks["txt_attn"]);

            auto txt_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["txt_norm2"]);
-            auto txt_mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["txt_mlp.0"]);
-            auto txt_mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["txt_mlp.2"]);
+            auto txt_mlp   = std::dynamic_pointer_cast<UnaryBlock>(blocks["txt_mlp"]);

-            std::vector<ModulationOut> img_mods;
-            if (prune_mod) {
-                img_mods = get_distil_img_mod(ctx, vec);
-            } else {
-                auto img_mod = std::dynamic_pointer_cast<Modulation>(blocks["img_mod"]);
-                img_mods     = img_mod->forward(ctx, vec);
+            if (img_mods.empty()) {
+                if (prune_mod) {
+                    img_mods = get_distil_img_mod(ctx, vec);
+                } else {
+                    auto img_mod = std::dynamic_pointer_cast<Modulation>(blocks["img_mod"]);
+                    img_mods     = img_mod->forward(ctx, vec);
+                }
            }
            ModulationOut img_mod1 = img_mods[0];
            ModulationOut img_mod2 = img_mods[1];
-            std::vector<ModulationOut> txt_mods;
-            if (prune_mod) {
-                txt_mods = get_distil_txt_mod(ctx, vec);
-            } else {
-                auto txt_mod = std::dynamic_pointer_cast<Modulation>(blocks["txt_mod"]);
-                txt_mods     = txt_mod->forward(ctx, vec);
+            if (txt_mods.empty()) {
+                if (prune_mod) {
+                    txt_mods = get_distil_txt_mod(ctx, vec);
+                } else {
+                    auto txt_mod = std::dynamic_pointer_cast<Modulation>(blocks["txt_mod"]);
+                    txt_mods     = txt_mod->forward(ctx, vec);
+                }
            }
            ModulationOut txt_mod1 = txt_mods[0];
            ModulationOut txt_mod2 = txt_mods[1];
@ -337,20 +397,15 @@ namespace Flux {
            // calculate the img blocks
            img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_attn->post_attention(ctx, img_attn_out), img_mod1.gate));

-            auto img_mlp_out = img_mlp_0->forward(ctx, Flux::modulate(ctx->ggml_ctx, img_norm2->forward(ctx, img), img_mod2.shift, img_mod2.scale));
-            img_mlp_out      = ggml_gelu_inplace(ctx->ggml_ctx, img_mlp_out);
-            img_mlp_out      = img_mlp_2->forward(ctx, img_mlp_out);
+            auto img_mlp_out = img_mlp->forward(ctx, Flux::modulate(ctx->ggml_ctx, img_norm2->forward(ctx, img), img_mod2.shift, img_mod2.scale));

            img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_mlp_out, img_mod2.gate));

            // calculate the txt blocks
            txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_attn->post_attention(ctx, txt_attn_out), txt_mod1.gate));

-            auto txt_mlp_out = txt_mlp_0->forward(ctx, Flux::modulate(ctx->ggml_ctx, txt_norm2->forward(ctx, txt), txt_mod2.shift, txt_mod2.scale));
-            txt_mlp_out      = ggml_gelu_inplace(ctx->ggml_ctx, txt_mlp_out);
-            txt_mlp_out      = txt_mlp_2->forward(ctx, txt_mlp_out);
-
-            txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_mlp_out, txt_mod2.gate));
+            auto txt_mlp_out = txt_mlp->forward(ctx, Flux::modulate(ctx->ggml_ctx, txt_norm2->forward(ctx, txt), txt_mod2.shift, txt_mod2.scale));
+            txt              = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_mlp_out, txt_mod2.gate));

            return {img, txt};
        }
@ -363,28 +418,39 @@ namespace Flux {
        int64_t mlp_hidden_dim;
        bool prune_mod;
        int idx = 0;
+        bool use_yak_mlp;
+        bool use_mlp_silu_act;
+        int64_t mlp_mult_factor;

    public:
        SingleStreamBlock(int64_t hidden_size,
                          int64_t num_heads,
-                          float mlp_ratio = 4.0f,
-                          int idx         = 0,
-                          float qk_scale  = 0.f,
-                          bool prune_mod  = false)
-            : hidden_size(hidden_size), num_heads(num_heads), idx(idx), prune_mod(prune_mod) {
+                          float mlp_ratio       = 4.0f,
+                          int idx               = 0,
+                          float qk_scale        = 0.f,
+                          bool prune_mod        = false,
+                          bool share_modulation = false,
+                          bool mlp_proj_bias    = true,
+                          bool use_yak_mlp      = false,
+                          bool use_mlp_silu_act = false)
+            : hidden_size(hidden_size), num_heads(num_heads), idx(idx), prune_mod(prune_mod), use_yak_mlp(use_yak_mlp), use_mlp_silu_act(use_mlp_silu_act) {
            int64_t head_dim = hidden_size / num_heads;
            float scale      = qk_scale;
            if (scale <= 0.f) {
                scale = 1 / sqrt((float)head_dim);
            }
-            mlp_hidden_dim = hidden_size * mlp_ratio;
+            mlp_hidden_dim  = hidden_size * mlp_ratio;
+            mlp_mult_factor = 1;
+            if (use_yak_mlp || use_mlp_silu_act) {
+                mlp_mult_factor = 2;
+            }

-            blocks["linear1"]  = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size * 3 + mlp_hidden_dim));
-            blocks["linear2"]  = std::shared_ptr<GGMLBlock>(new Linear(hidden_size + mlp_hidden_dim, hidden_size));
+            blocks["linear1"]  = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size * 3 + mlp_hidden_dim * mlp_mult_factor, mlp_proj_bias));
+            blocks["linear2"]  = std::shared_ptr<GGMLBlock>(new Linear(hidden_size + mlp_hidden_dim, hidden_size, mlp_proj_bias));
            blocks["norm"]     = std::shared_ptr<GGMLBlock>(new QKNorm(head_dim));
            blocks["pre_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
            // mlp_act is nn.GELU(approximate="tanh")
-            if (!prune_mod) {
+            if (!prune_mod && !share_modulation) {
                blocks["modulation"] = std::shared_ptr<GGMLBlock>(new Modulation(hidden_size, false));
            }
        }
@ -398,7 +464,8 @@ namespace Flux {
                                    struct ggml_tensor* x,
                                    struct ggml_tensor* vec,
                                    struct ggml_tensor* pe,
-                                    struct ggml_tensor* mask = nullptr) {
+                                    struct ggml_tensor* mask        = nullptr,
+                                    std::vector<ModulationOut> mods = {}) {
            // x: [N, n_token, hidden_size]
            // pe: [n_token, d_head/2, 2, 2]
            // return: [N, n_token, hidden_size]
@ -407,14 +474,20 @@ namespace Flux {
            auto linear2  = std::dynamic_pointer_cast<Linear>(blocks["linear2"]);
            auto norm     = std::dynamic_pointer_cast<QKNorm>(blocks["norm"]);
            auto pre_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["pre_norm"]);
-            ModulationOut mod;
-            if (prune_mod) {
-                mod = get_distil_mod(ctx, vec);
-            } else {
-                auto modulation = std::dynamic_pointer_cast<Modulation>(blocks["modulation"]);

-                mod = modulation->forward(ctx, vec)[0];
+            ModulationOut mod;
+            if (!mods.empty()) {
+                mod = mods[0];
+            } else {
+                if (prune_mod) {
+                    mod = get_distil_mod(ctx, vec);
+                } else {
+                    auto modulation = std::dynamic_pointer_cast<Modulation>(blocks["modulation"]);
+
+                    mod = modulation->forward(ctx, vec)[0];
+                }
            }
+
            auto x_mod   = Flux::modulate(ctx->ggml_ctx, pre_norm->forward(ctx, x), mod.shift, mod.scale);
            auto qkv_mlp = linear1->forward(ctx, x_mod);                                                // [N, n_token, hidden_size * 3 + mlp_hidden_dim]
            qkv_mlp      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, qkv_mlp, 2, 0, 1, 3));  // [hidden_size * 3 + mlp_hidden_dim, N, n_token]
@ -432,11 +505,11 @@ namespace Flux {
                                    qkv_mlp,
                                    qkv_mlp->ne[0],
                                    qkv_mlp->ne[1],
-                                    mlp_hidden_dim,
+                                    mlp_hidden_dim * mlp_mult_factor,
                                    qkv_mlp->nb[1],
                                    qkv_mlp->nb[2],
-                                    qkv_mlp->nb[2] * hidden_size * 3);                          // [mlp_hidden_dim , N, n_token]
-            mlp      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, mlp, 1, 2, 0, 3));  // [N, n_token, mlp_hidden_dim]
+                                    qkv_mlp->nb[2] * hidden_size * 3);                          // [mlp_hidden_dim*mlp_mult_factor , N, n_token]
+            mlp      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, mlp, 1, 2, 0, 3));  // [N, n_token, mlp_hidden_dim*mlp_mult_factor]

            auto qkv_vec     = split_qkv(ctx->ggml_ctx, qkv);  // q,k,v: [N, n_token, hidden_size]
            int64_t head_dim = hidden_size / num_heads;
@ -447,8 +520,15 @@ namespace Flux {
            k                = norm->key_norm(ctx, k);
            auto attn        = Rope::attention(ctx, q, k, v, pe, mask);  // [N, n_token, hidden_size]

-            auto attn_mlp = ggml_concat(ctx->ggml_ctx, attn, ggml_gelu_inplace(ctx->ggml_ctx, mlp), 0);  // [N, n_token, hidden_size + mlp_hidden_dim]
-            auto output   = linear2->forward(ctx, attn_mlp);                                             // [N, n_token, hidden_size]
+            if (use_yak_mlp) {
+                mlp = ggml_ext_silu_act(ctx->ggml_ctx, mlp, false);
+            } else if (use_mlp_silu_act) {
+                mlp = ggml_ext_silu_act(ctx->ggml_ctx, mlp);
+            } else {
+                mlp = ggml_gelu_inplace(ctx->ggml_ctx, mlp);
+            }
+            auto attn_mlp = ggml_concat(ctx->ggml_ctx, attn, mlp, 0);  // [N, n_token, hidden_size + mlp_hidden_dim]
+            auto output   = linear2->forward(ctx, attn_mlp);           // [N, n_token, hidden_size]

            output = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, output, mod.gate));
            return output;
@ -462,12 +542,13 @@ namespace Flux {
        LastLayer(int64_t hidden_size,
                  int64_t patch_size,
                  int64_t out_channels,
-                  bool prune_mod = false)
+                  bool prune_mod = false,
+                  bool bias      = true)
            : prune_mod(prune_mod) {
            blocks["norm_final"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-06f, false));
-            blocks["linear"]     = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, patch_size * patch_size * out_channels));
+            blocks["linear"]     = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, patch_size * patch_size * out_channels, bias));
            if (!prune_mod) {
-                blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, 2 * hidden_size));
+                blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, 2 * hidden_size, bias));
            }
        }

@ -684,6 +765,12 @@ namespace Flux {
        bool qkv_bias               = true;
        bool guidance_embed         = true;
        int64_t in_dim              = 64;
+        bool disable_bias           = false;
+        bool share_modulation       = false;
+        bool semantic_txt_norm      = false;
+        bool use_yak_mlp            = false;
+        bool use_mlp_silu_act       = false;
+        float ref_index_scale       = 1.f;
        ChromaRadianceParams chroma_radiance_params;
    };

@ -702,18 +789,23 @@ namespace Flux {
                                                                  kernel_size,
                                                                  stride);
            } else {
-                blocks["img_in"] = std::make_shared<Linear>(params.in_channels, params.hidden_size, true);
+                blocks["img_in"] = std::make_shared<Linear>(params.in_channels, params.hidden_size, !params.disable_bias);
            }
            if (params.is_chroma) {
                blocks["distilled_guidance_layer"] = std::make_shared<ChromaApproximator>(params.in_dim, params.hidden_size);
            } else {
-                blocks["time_in"]   = std::make_shared<MLPEmbedder>(256, params.hidden_size);
-                blocks["vector_in"] = std::make_shared<MLPEmbedder>(params.vec_in_dim, params.hidden_size);
+                blocks["time_in"] = std::make_shared<MLPEmbedder>(256, params.hidden_size, !params.disable_bias);
+                if (params.vec_in_dim > 0) {
+                    blocks["vector_in"] = std::make_shared<MLPEmbedder>(params.vec_in_dim, params.hidden_size, !params.disable_bias);
+                }
                if (params.guidance_embed) {
-                    blocks["guidance_in"] = std::make_shared<MLPEmbedder>(256, params.hidden_size);
+                    blocks["guidance_in"] = std::make_shared<MLPEmbedder>(256, params.hidden_size, !params.disable_bias);
                }
            }
-            blocks["txt_in"] = std::make_shared<Linear>(params.context_in_dim, params.hidden_size, true);
+            if (params.semantic_txt_norm) {
+                blocks["txt_norm"] = std::make_shared<RMSNorm>(params.context_in_dim);
+            }
+            blocks["txt_in"] = std::make_shared<Linear>(params.context_in_dim, params.hidden_size, !params.disable_bias);

            for (int i = 0; i < params.depth; i++) {
                blocks["double_blocks." + std::to_string(i)] = std::make_shared<DoubleStreamBlock>(params.hidden_size,
@ -721,7 +813,11 @@ namespace Flux {
                                                                                                   params.mlp_ratio,
                                                                                                   i,
                                                                                                   params.qkv_bias,
-                                                                                                   params.is_chroma);
+                                                                                                   params.is_chroma,
+                                                                                                   params.share_modulation,
+                                                                                                   !params.disable_bias,
+                                                                                                   params.use_yak_mlp,
+                                                                                                   params.use_mlp_silu_act);
            }

            for (int i = 0; i < params.depth_single_blocks; i++) {
@ -730,7 +826,11 @@ namespace Flux {
                                                                                                   params.mlp_ratio,
                                                                                                   i,
                                                                                                   0.f,
-                                                                                                   params.is_chroma);
+                                                                                                   params.is_chroma,
+                                                                                                   params.share_modulation,
+                                                                                                   !params.disable_bias,
+                                                                                                   params.use_yak_mlp,
+                                                                                                   params.use_mlp_silu_act);
            }

            if (params.version == VERSION_CHROMA_RADIANCE) {
@ -748,7 +848,13 @@ namespace Flux {
                                                                                       params.in_channels);

            } else {
-                blocks["final_layer"] = std::make_shared<LastLayer>(params.hidden_size, 1, params.out_channels, params.is_chroma);
+                blocks["final_layer"] = std::make_shared<LastLayer>(params.hidden_size, 1, params.out_channels, params.is_chroma, !params.disable_bias);
+            }
+
+            if (params.share_modulation) {
+                blocks["double_stream_modulation_img"] = std::make_shared<Modulation>(params.hidden_size, true, !params.disable_bias);
+                blocks["double_stream_modulation_txt"] = std::make_shared<Modulation>(params.hidden_size, true, !params.disable_bias);
+                blocks["single_stream_modulation"]     = std::make_shared<Modulation>(params.hidden_size, false, !params.disable_bias);
            }
        }

@ -861,9 +967,8 @@ namespace Flux {
                    txt_img_mask = ggml_pad(ctx->ggml_ctx, y, img->ne[1], 0, 0, 0);
                }
            } else {
-                auto time_in   = std::dynamic_pointer_cast<MLPEmbedder>(blocks["time_in"]);
-                auto vector_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["vector_in"]);
-                vec            = time_in->forward(ctx, ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 256, 10000, 1000.f));
+                auto time_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["time_in"]);
+                vec          = time_in->forward(ctx, ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 256, 10000, 1000.f));
                if (params.guidance_embed) {
                    GGML_ASSERT(guidance != nullptr);
                    auto guidance_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["guidance_in"]);
@ -872,7 +977,29 @@ namespace Flux {
                    vec       = ggml_add(ctx->ggml_ctx, vec, guidance_in->forward(ctx, g_in));
                }

-                vec = ggml_add(ctx->ggml_ctx, vec, vector_in->forward(ctx, y));
+                if (params.vec_in_dim > 0) {
+                    auto vector_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["vector_in"]);
+                    vec            = ggml_add(ctx->ggml_ctx, vec, vector_in->forward(ctx, y));
+                }
+            }
+
+            std::vector<ModulationOut> ds_img_mods;
+            std::vector<ModulationOut> ds_txt_mods;
+            std::vector<ModulationOut> ss_mods;
+            if (params.share_modulation) {
+                auto double_stream_modulation_img = std::dynamic_pointer_cast<Modulation>(blocks["double_stream_modulation_img"]);
+                auto double_stream_modulation_txt = std::dynamic_pointer_cast<Modulation>(blocks["double_stream_modulation_txt"]);
+                auto single_stream_modulation     = std::dynamic_pointer_cast<Modulation>(blocks["single_stream_modulation"]);
+
+                ds_img_mods = double_stream_modulation_img->forward(ctx, vec);
+                ds_txt_mods = double_stream_modulation_txt->forward(ctx, vec);
+                ss_mods     = single_stream_modulation->forward(ctx, vec);
+            }
+
+            if (params.semantic_txt_norm) {
+                auto semantic_txt_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm"]);
+
+                txt = semantic_txt_norm->forward(ctx, txt);
            }

            txt = txt_in->forward(ctx, txt);
@ -884,7 +1011,7 @@ namespace Flux {

                auto block = std::dynamic_pointer_cast<DoubleStreamBlock>(blocks["double_blocks." + std::to_string(i)]);

-                auto img_txt = block->forward(ctx, img, txt, vec, pe, txt_img_mask);
+                auto img_txt = block->forward(ctx, img, txt, vec, pe, txt_img_mask, ds_img_mods, ds_txt_mods);
                img          = img_txt.first;   // [N, n_img_token, hidden_size]
                txt          = img_txt.second;  // [N, n_txt_token, hidden_size]
            }
@ -896,7 +1023,7 @@ namespace Flux {
                }
                auto block = std::dynamic_pointer_cast<SingleStreamBlock>(blocks["single_blocks." + std::to_string(i)]);

-                txt_img = block->forward(ctx, txt_img, vec, pe, txt_img_mask);
+                txt_img = block->forward(ctx, txt_img, vec, pe, txt_img_mask, ss_mods);
            }

            txt_img = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, txt_img, 0, 2, 1, 3));  // [n_txt_token + n_img_token, N, hidden_size]
@ -1133,6 +1260,27 @@ namespace Flux {
            } else if (version == VERSION_CHROMA_RADIANCE) {
                flux_params.in_channels = 3;
                flux_params.patch_size  = 16;
+            } else if (version == VERSION_OVIS_IMAGE) {
+                flux_params.semantic_txt_norm = true;
+                flux_params.use_yak_mlp       = true;
+                flux_params.context_in_dim    = 2048;
+                flux_params.vec_in_dim        = 0;
+            } else if (sd_version_is_flux2(version)) {
+                flux_params.context_in_dim   = 15360;
+                flux_params.in_channels      = 128;
+                flux_params.hidden_size      = 6144;
+                flux_params.num_heads        = 48;
+                flux_params.patch_size       = 1;
+                flux_params.out_channels     = 128;
+                flux_params.mlp_ratio        = 3.f;
+                flux_params.theta            = 2000;
+                flux_params.axes_dim         = {32, 32, 32, 32};
+                flux_params.vec_in_dim       = 0;
+                flux_params.qkv_bias         = false;
+                flux_params.disable_bias     = true;
+                flux_params.share_modulation = true;
+                flux_params.ref_index_scale  = 10.f;
+                flux_params.use_mlp_silu_act = true;
            }
            for (auto pair : tensor_storage_map) {
                std::string tensor_name = pair.first;
@ -1243,7 +1391,7 @@ namespace Flux {
                                        bool increase_ref_index               = false,
                                        std::vector<int> skip_layers          = {}) {
            GGML_ASSERT(x->ne[3] == 1);
-            struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false);
+            struct ggml_cgraph* gf = new_graph_custom(FLUX_GRAPH_SIZE);

            struct ggml_tensor* mod_index_arange = nullptr;
            struct ggml_tensor* dct              = nullptr;  // for chroma radiance
@ -1275,13 +1423,23 @@ namespace Flux {
                ref_latents[i] = to_backend(ref_latents[i]);
            }

+            std::set<int> txt_arange_dims;
+            if (sd_version_is_flux2(version)) {
+                txt_arange_dims    = {3};
+                increase_ref_index = true;
+            } else if (version == VERSION_OVIS_IMAGE) {
+                txt_arange_dims = {1, 2};
+            }
+
            pe_vec      = Rope::gen_flux_pe(x->ne[1],
                                            x->ne[0],
                                            flux_params.patch_size,
                                            x->ne[3],
                                            context->ne[1],
+                                            txt_arange_dims,
                                            ref_latents,
                                            increase_ref_index,
+                                            flux_params.ref_index_scale,
                                            flux_params.theta,
                                            flux_params.axes_dim);
            int pos_len = pe_vec.size() / flux_params.axes_dim_sum / 2;
@ -1323,7 +1481,7 @@ namespace Flux {
            return gf;
        }

-        void compute(int n_threads,
+        bool compute(int n_threads,
                     struct ggml_tensor* x,
                     struct ggml_tensor* timesteps,
                     struct ggml_tensor* context,
@ -1344,7 +1502,7 @@ namespace Flux {
                return build_graph(x, timesteps, context, c_concat, y, guidance, ref_latents, increase_ref_index, skip_layers);
            };

-            GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+            return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
        }

        void test() {
@ -1360,9 +1518,9 @@ namespace Flux {
                // cpu f16:
                // cuda f16: nan
                // cuda q8_0: pass
-                // auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 16, 16, 16, 1);
+                auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 16, 16, 128, 1);
                // ggml_set_f32(x, 0.01f);
-                auto x = load_tensor_from_file(work_ctx, "chroma_x.bin");
+                // auto x = load_tensor_from_file(work_ctx, "chroma_x.bin");
                // print_ggml_tensor(x);

                std::vector<float> timesteps_vec(1, 1.f);
@ -1371,9 +1529,9 @@ namespace Flux {
                std::vector<float> guidance_vec(1, 0.f);
                auto guidance = vector_to_ggml_tensor(work_ctx, guidance_vec);

-                // auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 4096, 256, 1);
+                auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 15360, 256, 1);
                // ggml_set_f32(context, 0.01f);
-                auto context = load_tensor_from_file(work_ctx, "chroma_context.bin");
+                // auto context = load_tensor_from_file(work_ctx, "chroma_context.bin");
                // print_ggml_tensor(context);

                // auto y = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 768, 1);
@ -1395,18 +1553,20 @@ namespace Flux {
        static void load_from_file_and_test(const std::string& file_path) {
            // ggml_backend_t backend = ggml_backend_cuda_init(0);
            ggml_backend_t backend    = ggml_backend_cpu_init();
-            ggml_type model_data_type = GGML_TYPE_Q8_0;
+            ggml_type model_data_type = GGML_TYPE_COUNT;

            ModelLoader model_loader;
-            if (!model_loader.init_from_file(file_path, "model.diffusion_model.")) {
+            if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) {
                LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
                return;
            }

            auto& tensor_storage_map = model_loader.get_tensor_storage_map();
-            for (auto& [name, tensor_storage] : tensor_storage_map) {
-                if (ends_with(name, "weight")) {
-                    tensor_storage.expected_type = model_data_type;
+            if (model_data_type != GGML_TYPE_COUNT) {
+                for (auto& [name, tensor_storage] : tensor_storage_map) {
+                    if (ends_with(name, "weight")) {
+                        tensor_storage.expected_type = model_data_type;
+                    }
                }
            }

@ -1414,7 +1574,7 @@ namespace Flux {
                                                                            false,
                                                                            tensor_storage_map,
                                                                            "model.diffusion_model",
-                                                                            VERSION_CHROMA_RADIANCE,
+                                                                            VERSION_FLUX2,
                                                                            false);

            flux->alloc_params_buffer();
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@ -60,6 +60,14 @@
 #define SD_UNUSED(x) (void)(x)
 #endif

+// Returns how much has to be added to `n` to land on a multiple of `multiple`;
+// the result is 0 when `n` already is one. `multiple` is used as a modulus and
+// therefore must not be 0.
+__STATIC_INLINE__ int align_up_offset(int n, int multiple) {
+    const int remainder = n % multiple;
+    const int shortfall = multiple - remainder;
+    return shortfall % multiple;
+}
+
+// Rounds `n` up to a multiple of `multiple` (unchanged when it already is one)
+// by adding the padding reported by align_up_offset().
+__STATIC_INLINE__ int align_up(int n, int multiple) {
+    const int padding = align_up_offset(n, multiple);
+    return n + padding;
+}
+
 __STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const char* text, void*) {
    switch (level) {
        case GGML_LOG_LEVEL_DEBUG:
@ -760,6 +768,27 @@ __STATIC_INLINE__ std::vector<struct ggml_tensor*> ggml_ext_chunk(struct ggml_co
    return chunks;
 }

+// Gated SiLU activation: splits `x` with ggml_ext_chunk(ctx, x, 2, 0), applies
+// SiLU to the gate chunk and multiplies the other chunk by it.
+//   x:          [ne3, ne2, ne1, ne0]
+//   gate_first: true  -> chunk 0 is the gate, chunk 1 the value
+//               false -> chunk 0 is the value, chunk 1 the gate
+//   return:     [ne3, ne2, ne1, ne0/2]
+__STATIC_INLINE__ ggml_tensor* ggml_ext_silu_act(ggml_context* ctx, ggml_tensor* x, bool gate_first = true) {
+    auto halves = ggml_ext_chunk(ctx, x, 2, 0);
+
+    const int gate_idx  = gate_first ? 0 : 1;
+    const int value_idx = gate_first ? 1 : 0;
+
+    // The SiLU is applied in place on the gate chunk before the product is built.
+    ggml_tensor* gate  = ggml_silu_inplace(ctx, halves[gate_idx]);
+    ggml_tensor* value = halves[value_idx];
+
+    return ggml_mul(ctx, value, gate);  // [ne3, ne2, ne1, ne0/2]
+}
+
 typedef std::function<void(ggml_tensor*, ggml_tensor*, bool)> on_tile_process;

 __STATIC_INLINE__ void sd_tiling_calc_tiles(int& num_tiles_dim,
@ -875,7 +904,7 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input,
    ggml_tensor* input_tile  = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, input_tile_size_x, input_tile_size_y, input->ne[2], input->ne[3]);
    ggml_tensor* output_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, output_tile_size_x, output_tile_size_y, output->ne[2], output->ne[3]);
    int num_tiles            = num_tiles_x * num_tiles_y;
-    LOG_INFO("processing %i tiles", num_tiles);
+    LOG_DEBUG("processing %i tiles", num_tiles);
    pretty_progress(0, num_tiles, 0.0f);
    int tile_count = 1;
    bool last_y = false, last_x = false;
@ -959,12 +988,15 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_linear(struct ggml_context* ctx,
        int64_t ne3 = x->ne[3];
        x           = ggml_reshape_2d(ctx, x, x->ne[0], x->ne[1] * x->ne[2] * x->ne[3]);
        x           = ggml_mul_mat(ctx, w, x);
-        x           = ggml_reshape_4d(ctx, x, x->ne[0], x->ne[1] / ne2 / ne3, ne2, ne3);
+        if (force_prec_f32) {
+            ggml_mul_mat_set_prec(x, GGML_PREC_F32);
+        }
+        x = ggml_reshape_4d(ctx, x, x->ne[0], x->ne[1] / ne2 / ne3, ne2, ne3);
    } else {
        x = ggml_mul_mat(ctx, w, x);
-    }
-    if (force_prec_f32) {
-        ggml_mul_mat_set_prec(x, GGML_PREC_F32);
+        if (force_prec_f32) {
+            ggml_mul_mat_set_prec(x, GGML_PREC_F32);
+        }
    }
    if (scale != 1.f) {
        x = ggml_scale(ctx, x, 1.f / scale);
@ -994,6 +1026,9 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_conv_2d(struct ggml_context* ctx,
    if (scale != 1.f) {
        x = ggml_scale(ctx, x, scale);
    }
+    if (w->ne[2] != x->ne[2] && ggml_n_dims(w) == 2) {
+        w = ggml_reshape_4d(ctx, w, 1, 1, w->ne[0], w->ne[1]);
+    }
    if (direct) {
        x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1);
    } else {
@ -1119,6 +1154,27 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_ones(struct ggml_context* ctx,
    return ggml_ext_full(ctx, 1.f, ne0, ne1, ne2, ne3);
 }

+// Returns a tensor shaped like `a`, built from ops that are expected to emit F32.
+// NOTE(review): this relies on ggml_get_rows / ggml_mul_mat producing F32 output;
+// confirm against the ggml version in use.
+//   - SD_USE_VULKAN: flattens `a` to 1-D and gathers it through the runner's
+//     built-in "zero_int" index tensor, which must already exist in `ctx`.
+//   - otherwise: multiplies the flattened tensor by a one-element tensor of ones.
+__STATIC_INLINE__ ggml_tensor* ggml_ext_cast_f32(ggml_context* ctx, ggml_tensor* a) {
+    ggml_tensor* out = nullptr;
+#ifdef SD_USE_VULKAN
+    auto zero_index = ggml_get_tensor(ctx, "ggml_runner_build_in_tensor:zero_int");
+    // The lookup yields a null pointer when `ctx` holds no tensor of that name;
+    // stop with a clear assertion instead of crashing inside ggml_get_rows().
+    GGML_ASSERT(zero_index != nullptr);
+    out = ggml_reshape_1d(ctx, a, ggml_nelements(a));
+    out = ggml_get_rows(ctx, out, zero_index);
+    out = ggml_reshape(ctx, out, a);
+    // auto out = ggml_cast(ctx, a, GGML_TYPE_F32);
+#else
+    out              = ggml_reshape_2d(ctx, a, 1, ggml_nelements(a));
+    ggml_tensor* one = ggml_ext_ones(ctx, 1, 1, 1, 1);  // [1,]
+    if (ggml_is_transposed(out)) {
+        out = ggml_mul_mat(ctx, one, out);
+    } else {
+        out = ggml_mul_mat(ctx, out, one);
+    }
+    out = ggml_reshape(ctx, out, a);
+#endif
+    // Single exit shared by both build variants (the Vulkan branch used to
+    // return early, leaving this statement unreachable in that configuration).
+    return out;
+}
+
 // q: [N * n_head, n_token, d_head]
 // k: [N * n_head, n_k, d_head]
 // v: [N * n_head, d_head, n_k]
@ -1344,10 +1400,14 @@ __STATIC_INLINE__ void ggml_ext_backend_tensor_get_and_sync(ggml_backend_t backe
 }

 __STATIC_INLINE__ float ggml_ext_backend_tensor_get_f32(ggml_tensor* tensor) {
-    GGML_ASSERT(tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_I32);
+    GGML_ASSERT(tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_I32 || tensor->type == GGML_TYPE_BF16);
    float value;
    if (tensor->type == GGML_TYPE_F32) {
        ggml_backend_tensor_get(tensor, &value, 0, sizeof(value));
+    } else if (tensor->type == GGML_TYPE_BF16) {
+        ggml_bf16_t bf16_value;
+        ggml_backend_tensor_get(tensor, &bf16_value, 0, sizeof(bf16_value));
+        value = ggml_bf16_to_fp32(bf16_value);
    } else if (tensor->type == GGML_TYPE_F16) {
        ggml_fp16_t f16_value;
        ggml_backend_tensor_get(tensor, &f16_value, 0, sizeof(f16_value));
@ -1460,11 +1520,43 @@ __STATIC_INLINE__ size_t ggml_tensor_num(ggml_context* ctx) {
 #define MAX_PARAMS_TENSOR_NUM 32768
 #define MAX_GRAPH_SIZE 327680

+// Hook interface through which a runner can alter how layer weights are used
+// while a compute graph is being built.
+// NOTE(review): the method names suggest LoRA-style weight patching; the actual
+// behaviour is defined by the implementations, which are not part of this struct.
+struct WeightAdapter {
+    // Selects the operation forward_with_lora() has to build and carries its options.
+    struct ForwardParams {
+        enum class op_type_t {
+            OP_LINEAR,
+            OP_CONV2D,
+        } op_type = op_type_t::OP_LINEAR;  // defaulted so a fresh ForwardParams never holds an indeterminate value
+        // Options for OP_LINEAR.
+        struct {
+            bool force_prec_f32 = false;
+            float scale         = 1.f;
+        } linear;
+        // Options for OP_CONV2D: stride (s0/s1), padding (p0/p1), dilation (d0/d1).
+        struct {
+            int s0      = 1;
+            int s1      = 1;
+            int p0      = 0;
+            int p1      = 0;
+            int d0      = 1;
+            int d1      = 1;
+            bool direct = false;
+            float scale = 1.f;
+        } conv2d;
+    };
+
+    // Implementations are held and released through std::shared_ptr<WeightAdapter>,
+    // so the polymorphic base needs a virtual destructor.
+    virtual ~WeightAdapter() = default;
+
+    // Returns the tensor to use in place of `weight`; callers pass the owning
+    // block's prefix followed by "weight" or "bias" as `weight_name`.
+    virtual ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name) = 0;
+
+    // Builds the complete op described by `forward_params` for input `x`, weight `w`
+    // and bias `b` (nullptr when the layer has no bias); `prefix` is the owning
+    // block's tensor-name prefix. The returned tensor is used as the layer output.
+    virtual ggml_tensor* forward_with_lora(ggml_context* ctx,
+                                           ggml_tensor* x,
+                                           ggml_tensor* w,
+                                           ggml_tensor* b,
+                                           const std::string& prefix,
+                                           ForwardParams forward_params) = 0;
+
+    // Extra graph capacity the runner adds to the size it requests for a new graph.
+    virtual size_t get_extra_graph_size() = 0;
+};
+
+// Settings a runner hands to each block's forward() while a graph is being built.
 struct GGMLRunnerContext {
-    ggml_backend_t backend     = nullptr;
-    ggml_context* ggml_ctx     = nullptr;
-    bool flash_attn_enabled    = false;
-    bool conv2d_direct_enabled = false;
+    ggml_backend_t backend                        = nullptr;
+    ggml_context* ggml_ctx                        = nullptr;
+    bool flash_attn_enabled                       = false;
+    bool conv2d_direct_enabled                    = false;
+    std::shared_ptr<WeightAdapter> weight_adapter = nullptr;  // optional; blocks test it for null before use
 };

 struct GGMLRunner {
@ -1486,9 +1578,14 @@ protected:
    struct ggml_context* compute_ctx    = nullptr;
    struct ggml_gallocr* compute_allocr = nullptr;

+    std::shared_ptr<WeightAdapter> weight_adapter = nullptr;
+
    std::vector<float> one_vec = {1.f};
    ggml_tensor* one_tensor    = nullptr;

+    std::vector<int> zero_int_vec = {0};
+    ggml_tensor* zero_int_tensor  = nullptr;
+
    std::map<struct ggml_tensor*, const void*> backend_tensor_data_map;
    std::map<std::string, struct ggml_tensor*> cache_tensor_map;  // name -> tensor
    const std::string final_result_name = "ggml_runner_final_result_tensor";
@ -1559,17 +1656,31 @@ protected:
        one_tensor = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_F32, 1);
        ggml_set_name(one_tensor, "ggml_runner_build_in_tensor:one");
        set_backend_tensor_data(one_tensor, one_vec.data());
+
+        zero_int_tensor = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, 1);
+        ggml_set_name(zero_int_tensor, "ggml_runner_build_in_tensor:zero_int");
+        set_backend_tensor_data(zero_int_tensor, zero_int_vec.data());
    }

    void prepare_build_in_tensor_after(struct ggml_cgraph* gf) {
        ggml_build_forward_expand(gf, one_tensor);
+        ggml_build_forward_expand(gf, zero_int_tensor);
+    }
+
+    // Creates a graph in compute_ctx, enlarging the requested size by whatever
+    // the weight adapter (when one is set) reports through get_extra_graph_size().
+    struct ggml_cgraph* new_graph_custom(size_t graph_size) {
+        const size_t extra_size = weight_adapter ? weight_adapter->get_extra_graph_size() : 0;
+        return ggml_new_graph_custom(compute_ctx, graph_size + extra_size, false);
    }

+    // Builds the graph via `get_graph`, wrapped by the built-in tensor setup
+    // (before) and registration (after), and names the last node as the final result.
    struct ggml_cgraph* get_compute_graph(get_graph_cb_t get_graph) {
        prepare_build_in_tensor_before();
        struct ggml_cgraph* gf = get_graph();
-        auto result            = ggml_graph_node(gf, -1);
-        ggml_set_name(result, final_result_name.c_str());
+        if (ggml_graph_n_nodes(gf) > 0) {  // an empty graph has no last node to name
+            auto result = ggml_graph_node(gf, -1);
+            ggml_set_name(result, final_result_name.c_str());
+        }
        prepare_build_in_tensor_after(gf);
        return gf;
    }
@ -1758,6 +1869,7 @@ public:
        runner_ctx.backend               = runtime_backend;
        runner_ctx.flash_attn_enabled    = flash_attn_enabled;
        runner_ctx.conv2d_direct_enabled = conv2d_direct_enabled;
+        runner_ctx.weight_adapter        = weight_adapter;
        return runner_ctx;
    }

@ -1844,25 +1956,35 @@ public:
        return ggml_get_tensor(cache_ctx, name.c_str());
    }

-    void compute(get_graph_cb_t get_graph,
+    bool compute(get_graph_cb_t get_graph,
                 int n_threads,
                 bool free_compute_buffer_immediately = true,
                 struct ggml_tensor** output          = nullptr,
                 struct ggml_context* output_ctx      = nullptr) {
        if (!offload_params_to_runtime_backend()) {
            LOG_ERROR("%s offload params to runtime backend failed", get_desc().c_str());
-            return;
+            return false;
+        }
+        if (!alloc_compute_buffer(get_graph)) {
+            LOG_ERROR("%s alloc compute buffer failed", get_desc().c_str());
+            return false;
        }
-        alloc_compute_buffer(get_graph);
        reset_compute_ctx();
        struct ggml_cgraph* gf = get_compute_graph(get_graph);
-        GGML_ASSERT(ggml_gallocr_alloc_graph(compute_allocr, gf));
+        if (!ggml_gallocr_alloc_graph(compute_allocr, gf)) {
+            LOG_ERROR("%s alloc compute graph failed", get_desc().c_str());
+            return false;
+        }
        copy_data_to_backend_tensor();
        if (ggml_backend_is_cpu(runtime_backend)) {
            ggml_backend_cpu_set_n_threads(runtime_backend, n_threads);
        }

-        ggml_backend_graph_compute(runtime_backend, gf);
+        ggml_status status = ggml_backend_graph_compute(runtime_backend, gf);
+        if (status != GGML_STATUS_SUCCESS) {
+            LOG_ERROR("%s compute failed: %s", get_desc().c_str(), ggml_status_to_string(status));
+            return false;
+        }
 #ifdef GGML_PERF
        ggml_graph_print(gf);
 #endif
@ -1880,6 +2002,7 @@ public:
        if (free_compute_buffer_immediately) {
            free_compute_buffer();
        }
+        return true;
    }

    void set_flash_attention_enabled(bool enabled) {
@ -1889,6 +2012,10 @@ public:
    void set_conv2d_direct_enabled(bool enabled) {
        conv2d_direct_enabled = enabled;
    }
+
+    // Sets the weight adapter used by this runner. An empty pointer disables it,
+    // because every visible user of weight_adapter tests it for null first.
+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {
+        weight_adapter = adapter;
+    }
 };

 class GGMLBlock {
@ -1926,8 +2053,8 @@ public:
        if (prefix.size() > 0) {
            prefix = prefix + ".";
        }
-        init_blocks(ctx, tensor_storage_map, prefix);
        init_params(ctx, tensor_storage_map, prefix);
+        init_blocks(ctx, tensor_storage_map, prefix);
    }

    size_t get_params_num() {
@ -2004,8 +2131,10 @@ protected:
    bool force_f32;
    bool force_prec_f32;
    float scale;
+    std::string prefix;

    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+        this->prefix         = prefix;
        enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32);
        if (in_features % ggml_blck_size(wtype) != 0 || force_f32) {
            wtype = GGML_TYPE_F32;
@ -2037,6 +2166,13 @@ public:
        if (bias) {
            b = params["bias"];
        }
+        if (ctx->weight_adapter) {
+            WeightAdapter::ForwardParams forward_params;
+            forward_params.op_type               = WeightAdapter::ForwardParams::op_type_t::OP_LINEAR;
+            forward_params.linear.force_prec_f32 = force_prec_f32;
+            forward_params.linear.scale          = scale;
+            return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, x, w, b, prefix, forward_params);
+        }
        return ggml_ext_linear(ctx->ggml_ctx, x, w, b, force_prec_f32, scale);
    }
 };
@ -2096,8 +2232,10 @@ protected:
    std::pair<int, int> dilation;
    bool bias;
    float scale = 1.f;
+    std::string prefix;

    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map, const std::string prefix = "") override {
+        this->prefix         = prefix;
        enum ggml_type wtype = GGML_TYPE_F16;
        params["weight"]     = ggml_new_tensor_4d(ctx, wtype, kernel_size.second, kernel_size.first, in_channels, out_channels);
        if (bias) {
@ -2136,6 +2274,19 @@ public:
        if (bias) {
            b = params["bias"];
        }
+        if (ctx->weight_adapter) {
+            WeightAdapter::ForwardParams forward_params;
+            forward_params.op_type       = WeightAdapter::ForwardParams::op_type_t::OP_CONV2D;
+            forward_params.conv2d.s0     = stride.second;
+            forward_params.conv2d.s1     = stride.first;
+            forward_params.conv2d.p0     = padding.second;
+            forward_params.conv2d.p1     = padding.first;
+            forward_params.conv2d.d0     = dilation.second;
+            forward_params.conv2d.d1     = dilation.first;
+            forward_params.conv2d.direct = ctx->conv2d_direct_enabled;
+            forward_params.conv2d.scale  = scale;
+            return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, x, w, b, prefix, forward_params);
+        }
        return ggml_ext_conv_2d(ctx->ggml_ctx,
                                x,
                                w,
@ -2207,8 +2358,10 @@ protected:
    std::tuple<int, int, int> padding;
    std::tuple<int, int, int> dilation;
    bool bias;
+    std::string prefix;

    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map, const std::string prefix = "") override {
+        this->prefix         = prefix;
        enum ggml_type wtype = GGML_TYPE_F16;
        params["weight"]     = ggml_new_tensor_4d(ctx,
                                                  wtype,
@ -2240,8 +2393,17 @@ public:
    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        struct ggml_tensor* w = params["weight"];
        struct ggml_tensor* b = nullptr;
+        if (ctx->weight_adapter) {
+            w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight");
+            if (w->type != GGML_TYPE_F16) {
+                w = ggml_cast(ctx->ggml_ctx, w, GGML_TYPE_F16);
+            }
+        }
        if (bias) {
            b = params["bias"];
+            if (ctx->weight_adapter) {
+                b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, b, prefix + "bias");
+            }
        }
        return ggml_ext_conv_3d(ctx->ggml_ctx, x, w, b, in_channels,
                                std::get<2>(stride), std::get<1>(stride), std::get<0>(stride),
@ -2256,8 +2418,10 @@ protected:
    float eps;
    bool elementwise_affine;
    bool bias;
+    std::string prefix;

    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+        this->prefix = prefix;
        if (elementwise_affine) {
            enum ggml_type wtype = GGML_TYPE_F32;
            params["weight"]     = ggml_new_tensor_1d(ctx, wtype, normalized_shape);
@ -2284,8 +2448,14 @@ public:

        if (elementwise_affine) {
            w = params["weight"];
+            if (ctx->weight_adapter) {
+                w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight");
+            }
            if (bias) {
                b = params["bias"];
+                if (ctx->weight_adapter) {
+                    b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, b, prefix + "bias");
+                }
            }
        }
        return ggml_ext_layer_norm(ctx->ggml_ctx, x, w, b, eps);
@ -2298,8 +2468,10 @@ protected:
    int64_t num_channels;
    float eps;
    bool affine;
+    std::string prefix;

    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+        this->prefix = prefix;
        if (affine) {
            enum ggml_type wtype      = GGML_TYPE_F32;
            enum ggml_type bias_wtype = GGML_TYPE_F32;
@ -2324,6 +2496,10 @@ public:
        if (affine) {
            w = params["weight"];
            b = params["bias"];
+            if (ctx->weight_adapter) {
+                w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight");
+                b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, b, prefix + "bias");
+            }
        }
        return ggml_ext_group_norm(ctx->ggml_ctx, x, w, b, num_groups);
    }
@ -2339,8 +2515,10 @@ class RMSNorm : public UnaryBlock {
 protected:
    int64_t hidden_size;
    float eps;
+    std::string prefix;

    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
+        this->prefix         = prefix;
        enum ggml_type wtype = GGML_TYPE_F32;
        params["weight"]     = ggml_new_tensor_1d(ctx, wtype, hidden_size);
    }
@ -2353,8 +2531,11 @@ public:

+    // Applies ggml_rms_norm with `eps` to `x`, then multiplies by the "weight"
+    // parameter (patched by the weight adapter when one is present).
    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        struct ggml_tensor* w = params["weight"];
-        x                     = ggml_rms_norm(ctx->ggml_ctx, x, eps);
-        x                     = ggml_mul_inplace(ctx->ggml_ctx, x, w);
+        if (ctx->weight_adapter) {
+            w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight");  // name = block prefix + "weight"
+        }
+        x = ggml_rms_norm(ctx->ggml_ctx, x, eps);
+        x = ggml_mul_inplace(ctx->ggml_ctx, x, w);  // scale the normalised values by w
        return x;
    }
 };
--- a/latent-preview.h
+++ b/latent-preview.h
@ -0,0 +1,234 @@
+#include <cstddef>
+#include <cstdint>
+#include "ggml.h"
+
+const float wan_21_latent_rgb_proj[16][3] = {
+    {0.015123f, -0.148418f, 0.479828f},
+    {0.003652f, -0.010680f, -0.037142f},
+    {0.212264f, 0.063033f, 0.016779f},
+    {0.232999f, 0.406476f, 0.220125f},
+    {-0.051864f, -0.082384f, -0.069396f},
+    {0.085005f, -0.161492f, 0.010689f},
+    {-0.245369f, -0.506846f, -0.117010f},
+    {-0.151145f, 0.017721f, 0.007207f},
+    {-0.293239f, -0.207936f, -0.421135f},
+    {-0.187721f, 0.050783f, 0.177649f},
+    {-0.013067f, 0.265964f, 0.166578f},
+    {0.028327f, 0.109329f, 0.108642f},
+    {-0.205343f, 0.043991f, 0.148914f},
+    {0.014307f, -0.048647f, -0.007219f},
+    {0.217150f, 0.053074f, 0.319923f},
+    {0.155357f, 0.083156f, 0.064780f}};
+float wan_21_latent_rgb_bias[3] = {-0.270270f, -0.234976f, -0.456853f};
+
+const float wan_22_latent_rgb_proj[48][3] = {
+    {0.017126f, -0.027230f, -0.019257f},
+    {-0.113739f, -0.028715f, -0.022885f},
+    {-0.000106f, 0.021494f, 0.004629f},
+    {-0.013273f, -0.107137f, -0.033638f},
+    {-0.000381f, 0.000279f, 0.025877f},
+    {-0.014216f, -0.003975f, 0.040528f},
+    {0.001638f, -0.000748f, 0.011022f},
+    {0.029238f, -0.006697f, 0.035933f},
+    {0.021641f, -0.015874f, 0.040531f},
+    {-0.101984f, -0.070160f, -0.028855f},
+    {0.033207f, -0.021068f, 0.002663f},
+    {-0.104711f, 0.121673f, 0.102981f},
+    {0.082647f, -0.004991f, 0.057237f},
+    {-0.027375f, 0.031581f, 0.006868f},
+    {-0.045434f, 0.029444f, 0.019287f},
+    {-0.046572f, -0.012537f, 0.006675f},
+    {0.074709f, 0.033690f, 0.025289f},
+    {-0.008251f, -0.002745f, -0.006999f},
+    {0.012685f, -0.061856f, -0.048658f},
+    {0.042304f, -0.007039f, 0.000295f},
+    {-0.007644f, -0.060843f, -0.033142f},
+    {0.159909f, 0.045628f, 0.367541f},
+    {0.095171f, 0.086438f, 0.010271f},
+    {0.006812f, 0.019643f, 0.029637f},
+    {0.003467f, -0.010705f, 0.014252f},
+    {-0.099681f, -0.066272f, -0.006243f},
+    {0.047357f, 0.037040f, 0.000185f},
+    {-0.041797f, -0.089225f, -0.032257f},
+    {0.008928f, 0.017028f, 0.018684f},
+    {-0.042255f, 0.016045f, 0.006849f},
+    {0.011268f, 0.036462f, 0.037387f},
+    {0.011553f, -0.016375f, -0.048589f},
+    {0.046266f, -0.027189f, 0.056979f},
+    {0.009640f, -0.017576f, 0.030324f},
+    {-0.045794f, -0.036083f, -0.010616f},
+    {0.022418f, 0.039783f, -0.032939f},
+    {-0.052714f, -0.015525f, 0.007438f},
+    {0.193004f, 0.223541f, 0.264175f},
+    {-0.059406f, -0.008188f, 0.022867f},
+    {-0.156742f, -0.263791f, -0.007385f},
+    {-0.015717f, 0.016570f, 0.033969f},
+    {0.037969f, 0.109835f, 0.200449f},
+    {-0.000782f, -0.009566f, -0.008058f},
+    {0.010709f, 0.052960f, -0.044195f},
+    {0.017271f, 0.045839f, 0.034569f},
+    {0.009424f, 0.013088f, -0.001714f},
+    {-0.024805f, -0.059378f, -0.033756f},
+    {-0.078293f, 0.029070f, 0.026129f}};
+float wan_22_latent_rgb_bias[3] = {0.013160f, -0.096492f, -0.071323f};
+
+const float flux_latent_rgb_proj[16][3] = {
+    {-0.041168f, 0.019917f, 0.097253f},
+    {0.028096f, 0.026730f, 0.129576f},
+    {0.065618f, -0.067950f, -0.014651f},
+    {-0.012998f, -0.014762f, 0.081251f},
+    {0.078567f, 0.059296f, -0.024687f},
+    {-0.015987f, -0.003697f, 0.005012f},
+    {0.033605f, 0.138999f, 0.068517f},
+    {-0.024450f, -0.063567f, -0.030101f},
+    {-0.040194f, -0.016710f, 0.127185f},
+    {0.112681f, 0.088764f, -0.041940f},
+    {-0.023498f, 0.093664f, 0.025543f},
+    {0.082899f, 0.048320f, 0.007491f},
+    {0.075712f, 0.074139f, 0.081965f},
+    {-0.143501f, 0.018263f, -0.136138f},
+    {-0.025767f, -0.082035f, -0.040023f},
+    {-0.111849f, -0.055589f, -0.032361f}};
+float flux_latent_rgb_bias[3] = {0.024600f, -0.006937f, -0.008089f};
+
+const float flux2_latent_rgb_proj[32][3] = {
+    {0.000736f, -0.008385f, -0.019710f},
+    {-0.001352f, -0.016392f, 0.020693f},
+    {-0.006376f, 0.002428f, 0.036736f},
+    {0.039384f, 0.074167f, 0.119789f},
+    {0.007464f, -0.005705f, -0.004734f},
+    {-0.004086f, 0.005287f, -0.000409f},
+    {-0.032835f, 0.050802f, -0.028120f},
+    {-0.003158f, -0.000835f, 0.000406f},
+    {-0.112840f, -0.084337f, -0.023083f},
+    {0.001462f, -0.006656f, 0.000549f},
+    {-0.009980f, -0.007480f, 0.009702f},
+    {0.032540f, 0.000214f, -0.061388f},
+    {0.011023f, 0.000694f, 0.007143f},
+    {-0.001468f, -0.006723f, -0.001678f},
+    {-0.005921f, -0.010320f, -0.003907f},
+    {-0.028434f, 0.027584f, 0.018457f},
+    {0.014349f, 0.011523f, 0.000441f},
+    {0.009874f, 0.003081f, 0.001507f},
+    {0.002218f, 0.005712f, 0.001563f},
+    {0.053010f, -0.019844f, 0.008683f},
+    {-0.002507f, 0.005384f, 0.000938f},
+    {-0.002177f, -0.011366f, 0.003559f},
+    {-0.000261f, 0.015121f, -0.003240f},
+    {-0.003944f, -0.002083f, 0.005043f},
+    {-0.009138f, 0.011336f, 0.003781f},
+    {0.011429f, 0.003985f, -0.003855f},
+    {0.010518f, -0.005586f, 0.010131f},
+    {0.007883f, 0.002912f, -0.001473f},
+    {-0.003318f, -0.003160f, 0.003684f},
+    {-0.034560f, -0.008740f, 0.012996f},
+    {0.000166f, 0.001079f, -0.012153f},
+    {0.017772f, 0.000937f, -0.011953f}};
+float flux2_latent_rgb_bias[3] = {-0.028738f, -0.098463f, -0.107619f};
+
+// This one was taken straight from
+// https://github.com/Stability-AI/sd3.5/blob/8565799a3b41eb0c7ba976d18375f0f753f56402/sd3_impls.py#L288-L303
+// (MIT License)
+const float sd3_latent_rgb_proj[16][3] = {
+    {-0.0645f, 0.0177f, 0.1052f},
+    {0.0028f, 0.0312f, 0.0650f},
+    {0.1848f, 0.0762f, 0.0360f},
+    {0.0944f, 0.0360f, 0.0889f},
+    {0.0897f, 0.0506f, -0.0364f},
+    {-0.0020f, 0.1203f, 0.0284f},
+    {0.0855f, 0.0118f, 0.0283f},
+    {-0.0539f, 0.0658f, 0.1047f},
+    {-0.0057f, 0.0116f, 0.0700f},
+    {-0.0412f, 0.0281f, -0.0039f},
+    {0.1106f, 0.1171f, 0.1220f},
+    {-0.0248f, 0.0682f, -0.0481f},
+    {0.0815f, 0.0846f, 0.1207f},
+    {-0.0120f, -0.0055f, -0.0867f},
+    {-0.0749f, -0.0634f, -0.0456f},
+    {-0.1418f, -0.1457f, -0.1259f},
+};
+float sd3_latent_rgb_bias[3] = {0, 0, 0};
+
+const float sdxl_latent_rgb_proj[4][3] = {
+    {0.258303f, 0.277640f, 0.329699f},
+    {-0.299701f, 0.105446f, 0.014194f},
+    {0.050522f, 0.186163f, -0.143257f},
+    {-0.211938f, -0.149892f, -0.080036f}};
+float sdxl_latent_rgb_bias[3] = {0.144381f, -0.033313f, 0.007061f};
+
+const float sd_latent_rgb_proj[4][3] = {
+    {0.337366f, 0.216344f, 0.257386f},
+    {0.165636f, 0.386828f, 0.046994f},
+    {-0.267803f, 0.237036f, 0.223517f},
+    {-0.178022f, -0.200862f, -0.678514f}};
+float sd_latent_rgb_bias[3] = {-0.017478f, -0.055834f, -0.105825f};
+
+// Renders a quick RGB preview of a latent tensor into `buffer`.
+//
+// buffer          - destination, filled with packed 8-bit R,G,B triplets: frame after frame, each frame
+//                   row-major. Needs frames * (ne[0] * patch_size) * (ne[1] * patch_size) * 3 bytes.
+// latents         - source tensor; its data is read as float through the byte strides in nb[].
+//                   ne[0] / ne[1] are taken as width / height, the last dimension reported by
+//                   ggml_n_dims() as the channel axis, and ne[2] as the frame count when the tensor
+//                   has exactly 4 dimensions (otherwise a single frame is rendered).
+// latent_rgb_proj - one {r, g, b} weight row per unpatched channel, or nullptr to copy the first three
+//                   channels straight to R, G and B.
+// latent_rgb_bias - {r, g, b} offset added after the projection, or nullptr for no offset.
+// patch_size      - every latent cell becomes patch_size x patch_size pixels; the channel axis is then
+//                   read as groups of patch_size^2 entries, one entry per pixel of the patch.
+//
+// Returns without writing anything when `buffer` or `latents` is null or `patch_size` is not positive.
+//
+// NOTE(review): this is a non-inline function definition in a header; including the header from more
+// than one translation unit would yield duplicate definitions - confirm it is included only once.
+void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int patch_size) {
+    // A non-positive patch size would divide by zero below; null pointers cannot be rendered.
+    if (buffer == nullptr || latents == nullptr || patch_size <= 0) {
+        return;
+    }
+
+    // Loop-invariant tensor layout, looked up once instead of once per element.
+    const int n_dims            = ggml_n_dims(latents);
+    const size_t channel_stride = latents->nb[n_dims - 1];
+    const char* data            = (const char*)latents->data;
+
+    // size_t throughout so that pixel/byte offsets of large previews cannot wrap in 32-bit arithmetic.
+    const size_t latent_width  = (size_t)latents->ne[0];
+    const size_t latent_height = (size_t)latents->ne[1];
+    const size_t dim           = (size_t)latents->ne[n_dims - 1];
+    const size_t frames        = (n_dims == 4) ? (size_t)latents->ne[2] : 1;
+
+    const size_t patch      = (size_t)patch_size;
+    const size_t patch_area = patch * patch;
+    const size_t rgb_width  = latent_width * patch;
+    const size_t rgb_height = latent_height * patch;
+
+    const size_t unpatched_dim = dim / patch_area;
+
+    // Maps a value from [-1, 1] to [0, 255]; out-of-range input is clamped and NaN becomes 0.
+    const auto to_byte = [](float v) -> uint8_t {
+        v = v * .5f + .5f;
+        if (!(v >= 0)) {
+            return 0;
+        }
+        if (v > 1) {
+            v = 1;
+        }
+        return (uint8_t)(v * 255);
+    };
+
+    // Rows outside, columns inside: the output pixel index grows by exactly 1 per iteration,
+    // so the buffer is written sequentially through `out`.
+    uint8_t* out = buffer;
+    for (size_t k = 0; k < frames; k++) {
+        for (size_t rgb_y = 0; rgb_y < rgb_height; rgb_y++) {
+            const size_t latent_y = rgb_y / patch;
+            const char* row       = data + latent_y * latents->nb[1] + k * latents->nb[2];
+
+            for (size_t rgb_x = 0; rgb_x < rgb_width; rgb_x++, out += 3) {
+                const size_t latent_x = rgb_x / patch;
+                const char* cell      = row + latent_x * latents->nb[0];
+
+                // position of this pixel inside its patch; always 0 when patch_size == 1
+                const size_t channel_offset = (rgb_y % patch) * patch + (rgb_x % patch);
+
+                float r = 0, g = 0, b = 0;
+                if (latent_rgb_proj != nullptr) {
+                    for (size_t d = 0; d < unpatched_dim; d++) {
+                        const float value = *(const float*)(cell + (d * patch_area + channel_offset) * channel_stride);
+                        r += value * latent_rgb_proj[d][0];
+                        g += value * latent_rgb_proj[d][1];
+                        b += value * latent_rgb_proj[d][2];
+                    }
+                } else {
+                    // interpret first 3 channels as RGB
+                    r = *(const float*)(cell + 0 * channel_stride);
+                    g = *(const float*)(cell + 1 * channel_stride);
+                    b = *(const float*)(cell + 2 * channel_stride);
+                }
+                if (latent_rgb_bias != nullptr) {
+                    // bias
+                    r += latent_rgb_bias[0];
+                    g += latent_rgb_bias[1];
+                    b += latent_rgb_bias[2];
+                }
+
+                out[0] = to_byte(r);
+                out[1] = to_byte(g);
+                out[2] = to_byte(b);
+            }
+        }
+    }
+}
--- a/qwenvl.hpp
+++ b/qwenvl.hpp
--- a/lora.hpp
+++ b/lora.hpp
--- a/mmdit.hpp
+++ b/mmdit.hpp
@ -101,10 +101,14 @@ protected:

 public:
    TimestepEmbedder(int64_t hidden_size,  // builds the two Linear sub-blocks "mlp.0" and "mlp.2"
-                     int64_t frequency_embedding_size = 256)
+                     int64_t frequency_embedding_size = 256,
+                     int64_t out_channels             = 0)  // <= 0 selects hidden_size as the second argument of the "mlp.2" Linear
        : frequency_embedding_size(frequency_embedding_size) {
+        if (out_channels <= 0) {  // default: fall back to hidden_size, i.e. the same Linear(hidden_size, hidden_size) as before this parameter existed
+            out_channels = hidden_size;
+        }
        blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(frequency_embedding_size, hidden_size, true, true));
-        blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size, true, true));
+        blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, out_channels, true, true));  // presumably Linear(in_features, out_features, ...) - confirm against Linear's constructor
    }

    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* t) {
@ -870,7 +874,7 @@ struct MMDiTRunner : public GGMLRunner {
                                    struct ggml_tensor* context,
                                    struct ggml_tensor* y,
                                    std::vector<int> skip_layers = std::vector<int>()) {
-        struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, MMDIT_GRAPH_SIZE, false);
+        struct ggml_cgraph* gf = new_graph_custom(MMDIT_GRAPH_SIZE);

        x         = to_backend(x);
        context   = to_backend(context);
@ -890,7 +894,7 @@ struct MMDiTRunner : public GGMLRunner {
        return gf;
    }

-    void compute(int n_threads,
+    bool compute(int n_threads,
                 struct ggml_tensor* x,
                 struct ggml_tensor* timesteps,
                 struct ggml_tensor* context,
@ -906,7 +910,7 @@ struct MMDiTRunner : public GGMLRunner {
            return build_graph(x, timesteps, context, y, skip_layers);
        };

-        GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+        return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
    }

    void test() {
@ -961,7 +965,7 @@ struct MMDiTRunner : public GGMLRunner {
            mmdit->get_param_tensors(tensors, "model.diffusion_model");

            ModelLoader model_loader;
-            if (!model_loader.init_from_file(file_path)) {
+            if (!model_loader.init_from_file_and_convert_name(file_path)) {
                LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
                return;
            }
--- a/model.cpp
+++ b/model.cpp
@ -17,6 +17,7 @@
 #include "stable-diffusion.h"
 #include "util.h"
 #include "vocab.hpp"
+#include "vocab_mistral.hpp"
 #include "vocab_qwen.hpp"
 #include "vocab_umt5.hpp"

@ -25,6 +26,7 @@
 #include "ggml-cpu.h"
 #include "ggml.h"

+#include "name_conversion.h"
 #include "stable-diffusion.h"

 #ifdef SD_USE_METAL
@ -75,15 +77,6 @@ uint16_t read_short(uint8_t* buffer) {

 /*================================================= Preprocess ==================================================*/

-std::string self_attn_names[] = {
-    "self_attn.q_proj.weight",
-    "self_attn.k_proj.weight",
-    "self_attn.v_proj.weight",
-    "self_attn.q_proj.bias",
-    "self_attn.k_proj.bias",
-    "self_attn.v_proj.bias",
-};
-
 const char* unused_tensors[] = {
    "betas",
    "alphas_cumprod_prev",
@ -97,9 +90,9 @@ const char* unused_tensors[] = {
    "posterior_mean_coef1",
    "posterior_mean_coef2",
    "cond_stage_model.transformer.text_model.embeddings.position_ids",
+    "cond_stage_model.1.model.text_model.embeddings.position_ids",
    "cond_stage_model.transformer.vision_model.embeddings.position_ids",
    "cond_stage_model.model.logit_scale",
-    "cond_stage_model.model.text_projection",
    "conditioner.embedders.0.transformer.text_model.embeddings.position_ids",
    "conditioner.embedders.0.model.logit_scale",
    "conditioner.embedders.1.model.logit_scale",
@ -111,8 +104,14 @@ const char* unused_tensors[] = {
    "embedding_manager",
    "denoiser.sigmas",
    "text_encoders.t5xxl.transformer.encoder.embed_tokens.weight",  // only used during training
-    "text_encoders.qwen2vl.output.weight",
-    "text_encoders.qwen2vl.lm_head.",
+    "ztsnr",                                                        // Found in some SDXL vpred models
+    "edm_vpred.sigma_min",                                          // Found in CosXL
+    // TODO: find another way to avoid the "unknown tensor" for these two
+    // "edm_vpred.sigma_max", // Used to detect CosXL
+    // "v_pred", // Used to detect SDXL vpred models
+    "text_encoders.llm.output.weight",
+    "text_encoders.llm.lm_head.",
+    "first_stage_model.bn.",
 };

 bool is_unused_tensor(std::string name) {
@ -124,627 +123,6 @@ bool is_unused_tensor(std::string name) {
    return false;
 }

-std::unordered_map<std::string, std::string> open_clip_to_hf_clip_model = {
-    {"model.ln_final.bias", "transformer.text_model.final_layer_norm.bias"},
-    {"model.ln_final.weight", "transformer.text_model.final_layer_norm.weight"},
-    {"model.positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"},
-    {"model.token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"},
-    {"model.text_projection", "transformer.text_model.text_projection"},
-    {"model.visual.class_embedding", "transformer.vision_model.embeddings.class_embedding"},
-    {"model.visual.conv1.weight", "transformer.vision_model.embeddings.patch_embedding.weight"},
-    {"model.visual.ln_post.bias", "transformer.vision_model.post_layernorm.bias"},
-    {"model.visual.ln_post.weight", "transformer.vision_model.post_layernorm.weight"},
-    {"model.visual.ln_pre.bias", "transformer.vision_model.pre_layernorm.bias"},
-    {"model.visual.ln_pre.weight", "transformer.vision_model.pre_layernorm.weight"},
-    {"model.visual.positional_embedding", "transformer.vision_model.embeddings.position_embedding.weight"},
-    {"model.visual.proj", "transformer.visual_projection.weight"},
-};
-
-std::unordered_map<std::string, std::string> open_clip_to_hf_clip_resblock = {
-    {"attn.in_proj_bias", "self_attn.in_proj.bias"},
-    {"attn.in_proj_weight", "self_attn.in_proj.weight"},
-    {"attn.out_proj.bias", "self_attn.out_proj.bias"},
-    {"attn.out_proj.weight", "self_attn.out_proj.weight"},
-    {"ln_1.bias", "layer_norm1.bias"},
-    {"ln_1.weight", "layer_norm1.weight"},
-    {"ln_2.bias", "layer_norm2.bias"},
-    {"ln_2.weight", "layer_norm2.weight"},
-    {"mlp.c_fc.bias", "mlp.fc1.bias"},
-    {"mlp.c_fc.weight", "mlp.fc1.weight"},
-    {"mlp.c_proj.bias", "mlp.fc2.bias"},
-    {"mlp.c_proj.weight", "mlp.fc2.weight"},
-};
-
-std::unordered_map<std::string, std::string> cond_model_name_map = {
-    {"transformer.vision_model.pre_layrnorm.weight", "transformer.vision_model.pre_layernorm.weight"},
-    {"transformer.vision_model.pre_layrnorm.bias", "transformer.vision_model.pre_layernorm.bias"},
-};
-
-std::unordered_map<std::string, std::string> vae_decoder_name_map = {
-    {"first_stage_model.decoder.mid.attn_1.to_k.bias", "first_stage_model.decoder.mid.attn_1.k.bias"},
-    {"first_stage_model.decoder.mid.attn_1.to_k.weight", "first_stage_model.decoder.mid.attn_1.k.weight"},
-    {"first_stage_model.decoder.mid.attn_1.to_out.0.bias", "first_stage_model.decoder.mid.attn_1.proj_out.bias"},
-    {"first_stage_model.decoder.mid.attn_1.to_out.0.weight", "first_stage_model.decoder.mid.attn_1.proj_out.weight"},
-    {"first_stage_model.decoder.mid.attn_1.to_q.bias", "first_stage_model.decoder.mid.attn_1.q.bias"},
-    {"first_stage_model.decoder.mid.attn_1.to_q.weight", "first_stage_model.decoder.mid.attn_1.q.weight"},
-    {"first_stage_model.decoder.mid.attn_1.to_v.bias", "first_stage_model.decoder.mid.attn_1.v.bias"},
-    {"first_stage_model.decoder.mid.attn_1.to_v.weight", "first_stage_model.decoder.mid.attn_1.v.weight"},
-};
-
-std::unordered_map<std::string, std::string> pmid_v2_name_map = {
-    {"pmid.qformer_perceiver.perceiver_resampler.layers.0.1.1.weight",
-     "pmid.qformer_perceiver.perceiver_resampler.layers.0.1.1.fc1.weight"},
-    {"pmid.qformer_perceiver.perceiver_resampler.layers.0.1.3.weight",
-     "pmid.qformer_perceiver.perceiver_resampler.layers.0.1.1.fc2.weight"},
-    {"pmid.qformer_perceiver.perceiver_resampler.layers.1.1.1.weight",
-     "pmid.qformer_perceiver.perceiver_resampler.layers.1.1.1.fc1.weight"},
-    {"pmid.qformer_perceiver.perceiver_resampler.layers.1.1.3.weight",
-     "pmid.qformer_perceiver.perceiver_resampler.layers.1.1.1.fc2.weight"},
-    {"pmid.qformer_perceiver.perceiver_resampler.layers.2.1.1.weight",
-     "pmid.qformer_perceiver.perceiver_resampler.layers.2.1.1.fc1.weight"},
-    {"pmid.qformer_perceiver.perceiver_resampler.layers.2.1.3.weight",
-     "pmid.qformer_perceiver.perceiver_resampler.layers.2.1.1.fc2.weight"},
-    {"pmid.qformer_perceiver.perceiver_resampler.layers.3.1.1.weight",
-     "pmid.qformer_perceiver.perceiver_resampler.layers.3.1.1.fc1.weight"},
-    {"pmid.qformer_perceiver.perceiver_resampler.layers.3.1.3.weight",
-     "pmid.qformer_perceiver.perceiver_resampler.layers.3.1.1.fc2.weight"},
-    {"pmid.qformer_perceiver.token_proj.0.bias",
-     "pmid.qformer_perceiver.token_proj.fc1.bias"},
-    {"pmid.qformer_perceiver.token_proj.2.bias",
-     "pmid.qformer_perceiver.token_proj.fc2.bias"},
-    {"pmid.qformer_perceiver.token_proj.0.weight",
-     "pmid.qformer_perceiver.token_proj.fc1.weight"},
-    {"pmid.qformer_perceiver.token_proj.2.weight",
-     "pmid.qformer_perceiver.token_proj.fc2.weight"},
-};
-
-std::unordered_map<std::string, std::string> qwenvl_name_map{
-    {"token_embd.", "model.embed_tokens."},
-    {"blk.", "model.layers."},
-    {"attn_q.", "self_attn.q_proj."},
-    {"attn_k.", "self_attn.k_proj."},
-    {"attn_v.", "self_attn.v_proj."},
-    {"attn_output.", "self_attn.o_proj."},
-    {"attn_norm.", "input_layernorm."},
-    {"ffn_down.", "mlp.down_proj."},
-    {"ffn_gate.", "mlp.gate_proj."},
-    {"ffn_up.", "mlp.up_proj."},
-    {"ffn_norm.", "post_attention_layernorm."},
-    {"output_norm.", "model.norm."},
-};
-
-std::unordered_map<std::string, std::string> qwenvl_vision_name_map{
-    {"mm.", "merger.mlp."},
-    {"v.post_ln.", "merger.ln_q."},
-    {"v.patch_embd.weight", "patch_embed.proj.0.weight"},
-    {"patch_embed.proj.0.weight.1", "patch_embed.proj.1.weight"},
-    {"v.patch_embd.weight.1", "patch_embed.proj.1.weight"},
-    {"v.blk.", "blocks."},
-    {"attn_q.", "attn.q_proj."},
-    {"attn_k.", "attn.k_proj."},
-    {"attn_v.", "attn.v_proj."},
-    {"attn_out.", "attn.proj."},
-    {"ffn_down.", "mlp.down_proj."},
-    {"ffn_gate.", "mlp.gate_proj."},
-    {"ffn_up.", "mlp.up_proj."},
-    {"ln1.", "norm1."},
-    {"ln2.", "norm2."},
-};
-
-std::string convert_cond_model_name(const std::string& name) {
-    std::string new_name = name;
-    std::string prefix;
-    if (contains(new_name, ".enc.")) {
-        // llama.cpp naming convention for T5
-        size_t pos = new_name.find(".enc.");
-        if (pos != std::string::npos) {
-            new_name.replace(pos, 5, ".encoder.");
-        }
-        pos = new_name.find("blk.");
-        if (pos != std::string::npos) {
-            new_name.replace(pos, 4, "block.");
-        }
-        pos = new_name.find("output_norm.");
-        if (pos != std::string::npos) {
-            new_name.replace(pos, 12, "final_layer_norm.");
-        }
-        pos = new_name.find("attn_k.");
-        if (pos != std::string::npos) {
-            new_name.replace(pos, 7, "layer.0.SelfAttention.k.");
-        }
-        pos = new_name.find("attn_v.");
-        if (pos != std::string::npos) {
-            new_name.replace(pos, 7, "layer.0.SelfAttention.v.");
-        }
-        pos = new_name.find("attn_o.");
-        if (pos != std::string::npos) {
-            new_name.replace(pos, 7, "layer.0.SelfAttention.o.");
-        }
-        pos = new_name.find("attn_q.");
-        if (pos != std::string::npos) {
-            new_name.replace(pos, 7, "layer.0.SelfAttention.q.");
-        }
-        pos = new_name.find("attn_norm.");
-        if (pos != std::string::npos) {
-            new_name.replace(pos, 10, "layer.0.layer_norm.");
-        }
-        pos = new_name.find("ffn_norm.");
-        if (pos != std::string::npos) {
-            new_name.replace(pos, 9, "layer.1.layer_norm.");
-        }
-        pos = new_name.find("ffn_up.");
-        if (pos != std::string::npos) {
-            new_name.replace(pos, 7, "layer.1.DenseReluDense.wi_1.");
-        }
-        pos = new_name.find("ffn_down.");
-        if (pos != std::string::npos) {
-            new_name.replace(pos, 9, "layer.1.DenseReluDense.wo.");
-        }
-        pos = new_name.find("ffn_gate.");
-        if (pos != std::string::npos) {
-            new_name.replace(pos, 9, "layer.1.DenseReluDense.wi_0.");
-        }
-        pos = new_name.find("attn_rel_b.");
-        if (pos != std::string::npos) {
-            new_name.replace(pos, 11, "layer.0.SelfAttention.relative_attention_bias.");
-        }
-    } else if (contains(name, "qwen2vl")) {
-        if (contains(name, "qwen2vl.visual")) {
-            for (auto kv : qwenvl_vision_name_map) {
-                size_t pos = new_name.find(kv.first);
-                if (pos != std::string::npos) {
-                    new_name.replace(pos, kv.first.size(), kv.second);
-                }
-            }
-        } else {
-            for (auto kv : qwenvl_name_map) {
-                size_t pos = new_name.find(kv.first);
-                if (pos != std::string::npos) {
-                    new_name.replace(pos, kv.first.size(), kv.second);
-                }
-            }
-        }
-    } else if (name == "text_encoders.t5xxl.transformer.token_embd.weight") {
-        new_name = "text_encoders.t5xxl.transformer.shared.weight";
-    }
-
-    if (starts_with(new_name, "conditioner.embedders.0.open_clip.")) {
-        prefix   = "cond_stage_model.";
-        new_name = new_name.substr(strlen("conditioner.embedders.0.open_clip."));
-    } else if (starts_with(new_name, "conditioner.embedders.0.")) {
-        prefix   = "cond_stage_model.";
-        new_name = new_name.substr(strlen("conditioner.embedders.0."));
-    } else if (starts_with(new_name, "conditioner.embedders.1.")) {
-        prefix   = "cond_stage_model.1.";
-        new_name = new_name.substr(strlen("conditioner.embedders.0."));
-    } else if (starts_with(new_name, "cond_stage_model.")) {
-        prefix   = "cond_stage_model.";
-        new_name = new_name.substr(strlen("cond_stage_model."));
-    } else if (ends_with(new_name, "vision_model.visual_projection.weight")) {
-        prefix   = new_name.substr(0, new_name.size() - strlen("vision_model.visual_projection.weight"));
-        new_name = prefix + "visual_projection.weight";
-        return new_name;
-    } else if (ends_with(new_name, "transformer.text_projection.weight")) {
-        prefix   = new_name.substr(0, new_name.size() - strlen("transformer.text_projection.weight"));
-        new_name = prefix + "transformer.text_model.text_projection";
-        return new_name;
-    } else {
-        return new_name;
-    }
-
-    if (new_name == "model.text_projection.weight") {
-        new_name = "transformer.text_model.text_projection";
-    }
-
-    if (open_clip_to_hf_clip_model.find(new_name) != open_clip_to_hf_clip_model.end()) {
-        new_name = open_clip_to_hf_clip_model[new_name];
-    }
-
-    if (cond_model_name_map.find(new_name) != cond_model_name_map.end()) {
-        new_name = cond_model_name_map[new_name];
-    }
-
-    std::string open_clip_resblock_prefix = "model.transformer.resblocks.";
-    std::string hf_clip_resblock_prefix   = "transformer.text_model.encoder.layers.";
-
-    auto replace_suffix = [&]() {
-        if (new_name.find(open_clip_resblock_prefix) == 0) {
-            std::string remain = new_name.substr(open_clip_resblock_prefix.length());
-            std::string idx    = remain.substr(0, remain.find("."));
-            std::string suffix = remain.substr(idx.length() + 1);
-
-            if (open_clip_to_hf_clip_resblock.find(suffix) != open_clip_to_hf_clip_resblock.end()) {
-                std::string new_suffix = open_clip_to_hf_clip_resblock[suffix];
-                new_name               = hf_clip_resblock_prefix + idx + "." + new_suffix;
-            }
-        }
-    };
-
-    replace_suffix();
-
-    open_clip_resblock_prefix = "model.visual.transformer.resblocks.";
-    hf_clip_resblock_prefix   = "transformer.vision_model.encoder.layers.";
-
-    replace_suffix();
-
-    return prefix + new_name;
-}
-
-std::string convert_vae_decoder_name(const std::string& name) {
-    if (vae_decoder_name_map.find(name) != vae_decoder_name_map.end()) {
-        return vae_decoder_name_map[name];
-    }
-    return name;
-}
-
-std::string convert_pmid_v2_name(const std::string& name) {
-    if (pmid_v2_name_map.find(name) != pmid_v2_name_map.end()) {
-        return pmid_v2_name_map[name];
-    }
-    return name;
-}
-
-/* If not a SDXL LoRA the unet" prefix will have already been replaced by this
- * point and "te2" and "te1" don't seem to appear in non-SDXL only "te_" */
-std::string convert_sdxl_lora_name(std::string tensor_name) {
-    const std::pair<std::string, std::string> sdxl_lora_name_lookup[] = {
-        {"unet", "model_diffusion_model"},
-        {"te2", "cond_stage_model_1_transformer"},
-        {"te1", "cond_stage_model_transformer"},
-        {"text_encoder_2", "cond_stage_model_1_transformer"},
-        {"text_encoder", "cond_stage_model_transformer"},
-    };
-    for (auto& pair_i : sdxl_lora_name_lookup) {
-        if (tensor_name.compare(0, pair_i.first.length(), pair_i.first) == 0) {
-            tensor_name = std::regex_replace(tensor_name, std::regex(pair_i.first), pair_i.second);
-            break;
-        }
-    }
-    return tensor_name;
-}
-
-std::unordered_map<std::string, std::unordered_map<std::string, std::string>> suffix_conversion_underline = {
-    {
-        "attentions",
-        {
-            {"to_k", "k"},
-            {"to_q", "q"},
-            {"to_v", "v"},
-            {"to_out_0", "proj_out"},
-            {"group_norm", "norm"},
-            {"key", "k"},
-            {"query", "q"},
-            {"value", "v"},
-            {"proj_attn", "proj_out"},
-        },
-    },
-    {
-        "resnets",
-        {
-            {"conv1", "in_layers_2"},
-            {"conv2", "out_layers_3"},
-            {"norm1", "in_layers_0"},
-            {"norm2", "out_layers_0"},
-            {"time_emb_proj", "emb_layers_1"},
-            {"conv_shortcut", "skip_connection"},
-        },
-    },
-};
-
-std::unordered_map<std::string, std::unordered_map<std::string, std::string>> suffix_conversion_dot = {
-    {
-        "attentions",
-        {
-            {"to_k", "k"},
-            {"to_q", "q"},
-            {"to_v", "v"},
-            {"to_out.0", "proj_out"},
-            {"group_norm", "norm"},
-            {"key", "k"},
-            {"query", "q"},
-            {"value", "v"},
-            {"proj_attn", "proj_out"},
-        },
-    },
-    {
-        "resnets",
-        {
-            {"conv1", "in_layers.2"},
-            {"conv2", "out_layers.3"},
-            {"norm1", "in_layers.0"},
-            {"norm2", "out_layers.0"},
-            {"time_emb_proj", "emb_layers.1"},
-            {"conv_shortcut", "skip_connection"},
-        },
-    },
-};
-
-std::string convert_diffusers_name_to_compvis(std::string key, char seq) {
-    std::vector<std::string> m;
-
-    auto match = [](std::vector<std::string>& match_list, const std::regex& regex, const std::string& key) {
-        auto r = std::smatch{};
-        if (!std::regex_match(key, r, regex)) {
-            return false;
-        }
-
-        match_list.clear();
-        for (size_t i = 1; i < r.size(); ++i) {
-            match_list.push_back(r.str(i));
-        }
-        return true;
-    };
-
-    std::unordered_map<std::string, std::unordered_map<std::string, std::string>> suffix_conversion;
-    if (seq == '_') {
-        suffix_conversion = suffix_conversion_underline;
-    } else {
-        suffix_conversion = suffix_conversion_dot;
-    }
-
-    auto get_converted_suffix = [&suffix_conversion](const std::string& outer_key, const std::string& inner_key) {
-        auto outer_iter = suffix_conversion.find(outer_key);
-        if (outer_iter != suffix_conversion.end()) {
-            auto inner_iter = outer_iter->second.find(inner_key);
-            if (inner_iter != outer_iter->second.end()) {
-                return inner_iter->second;
-            }
-        }
-        return inner_key;
-    };
-
-    // convert attn to out
-    if (ends_with(key, "to_out")) {
-        key += format("%c0", seq);
-    }
-
-    // unet
-    if (match(m, std::regex(format("unet%cconv_in(.*)", seq)), key)) {
-        return format("model%cdiffusion_model%cinput_blocks%c0%c0", seq, seq, seq, seq) + m[0];
-    }
-
-    if (match(m, std::regex(format("unet%cconv%cout(.*)", seq, seq)), key)) {
-        return format("model%cdiffusion_model%cout%c2", seq, seq, seq) + m[0];
-    }
-
-    if (match(m, std::regex(format("unet%cconv_norm_out(.*)", seq)), key)) {
-        return format("model%cdiffusion_model%cout%c0", seq, seq, seq) + m[0];
-    }
-
-    if (match(m, std::regex(format("unet%ctime_embedding%clinear_(\\d+)(.*)", seq, seq)), key)) {
-        return format("model%cdiffusion_model%ctime_embed%c", seq, seq, seq) + std::to_string(std::stoi(m[0]) * 2 - 2) + m[1];
-    }
-
-    if (match(m, std::regex(format("unet%cadd_embedding%clinear_(\\d+)(.*)", seq, seq)), key)) {
-        return format("model%cdiffusion_model%clabel_emb%c0%c", seq, seq, seq, seq) + std::to_string(std::stoi(m[0]) * 2 - 2) + m[1];
-    }
-
-    if (match(m, std::regex(format("unet%cdown_blocks%c(\\d+)%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) {
-        std::string suffix = get_converted_suffix(m[1], m[3]);
-        // LOG_DEBUG("%s %s %s %s", m[0].c_str(), m[1].c_str(), m[2].c_str(), m[3].c_str());
-        return format("model%cdiffusion_model%cinput_blocks%c", seq, seq, seq) + std::to_string(1 + std::stoi(m[0]) * 3 + std::stoi(m[2])) + seq +
-               (m[1] == "attentions" ? "1" : "0") + seq + suffix;
-    }
-
-    if (match(m, std::regex(format("unet%cmid_block%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq)), key)) {
-        std::string suffix = get_converted_suffix(m[0], m[2]);
-        return format("model%cdiffusion_model%cmiddle_block%c", seq, seq, seq) + (m[0] == "attentions" ? "1" : std::to_string(std::stoi(m[1]) * 2)) +
-               seq + suffix;
-    }
-
-    if (match(m, std::regex(format("unet%cup_blocks%c(\\d+)%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) {
-        std::string suffix = get_converted_suffix(m[1], m[3]);
-        return format("model%cdiffusion_model%coutput_blocks%c", seq, seq, seq) + std::to_string(std::stoi(m[0]) * 3 + std::stoi(m[2])) + seq +
-               (m[1] == "attentions" ? "1" : "0") + seq + suffix;
-    }
-
-    if (match(m, std::regex(format("unet%cdown_blocks%c(\\d+)%cdownsamplers%c0%cconv", seq, seq, seq, seq, seq)), key)) {
-        return format("model%cdiffusion_model%cinput_blocks%c", seq, seq, seq) + std::to_string(3 + std::stoi(m[0]) * 3) + seq + "0" + seq + "op";
-    }
-
-    if (match(m, std::regex(format("unet%cup_blocks%c(\\d+)%cupsamplers%c0%cconv", seq, seq, seq, seq, seq)), key)) {
-        return format("model%cdiffusion_model%coutput_blocks%c", seq, seq, seq) + std::to_string(2 + std::stoi(m[0]) * 3) + seq +
-               (std::stoi(m[0]) > 0 ? "2" : "1") + seq + "conv";
-    }
-
-    // clip
-    if (match(m, std::regex(format("te%ctext_model%cencoder%clayers%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) {
-        return format("cond_stage_model%ctransformer%ctext_model%cencoder%clayers%c", seq, seq, seq, seq, seq) + m[0] + seq + m[1];
-    }
-
-    if (match(m, std::regex(format("te%ctext_model(.*)", seq)), key)) {
-        return format("cond_stage_model%ctransformer%ctext_model", seq, seq) + m[0];
-    }
-
-    // clip-g
-    if (match(m, std::regex(format("te%c1%ctext_model%cencoder%clayers%c(\\d+)%c(.+)", seq, seq, seq, seq, seq, seq)), key)) {
-        return format("cond_stage_model%c1%ctransformer%ctext_model%cencoder%clayers%c", seq, seq, seq, seq, seq, seq) + m[0] + seq + m[1];
-    }
-
-    if (match(m, std::regex(format("te%c1%ctext_model(.*)", seq, seq)), key)) {
-        return format("cond_stage_model%c1%ctransformer%ctext_model", seq, seq, seq) + m[0];
-    }
-
-    if (match(m, std::regex(format("te%c1%ctext_projection", seq, seq)), key)) {
-        return format("cond_stage_model%c1%ctransformer%ctext_model%ctext_projection", seq, seq, seq, seq);
-    }
-
-    // vae
-    if (match(m, std::regex(format("vae%c(.*)%cconv_norm_out(.*)", seq, seq)), key)) {
-        return format("first_stage_model%c%s%cnorm_out%s", seq, m[0].c_str(), seq, m[1].c_str());
-    }
-
-    if (match(m, std::regex(format("vae%c(.*)%cmid_block%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) {
-        std::string suffix;
-        std::string block_name;
-        if (m[1] == "attentions") {
-            block_name = "attn";
-            suffix     = get_converted_suffix(m[1], m[3]);
-        } else {
-            block_name = "block";
-            suffix     = m[3];
-        }
-        return format("first_stage_model%c%s%cmid%c%s_%d%c%s",
-                      seq, m[0].c_str(), seq, seq, block_name.c_str(), std::stoi(m[2]) + 1, seq, suffix.c_str());
-    }
-
-    if (match(m, std::regex(format("vae%c(.*)%cup_blocks%c(\\d+)%cresnets%c(\\d+)%c(.+)", seq, seq, seq, seq, seq, seq)), key)) {
-        std::string suffix = m[3];
-        if (suffix == "conv_shortcut") {
-            suffix = "nin_shortcut";
-        }
-        return format("first_stage_model%c%s%cup%c%d%cblock%c%s%c%s",
-                      seq, m[0].c_str(), seq, seq, 3 - std::stoi(m[1]), seq, seq, m[2].c_str(), seq, suffix.c_str());
-    }
-
-    if (match(m, std::regex(format("vae%c(.*)%cdown_blocks%c(\\d+)%cdownsamplers%c0%cconv", seq, seq, seq, seq, seq, seq)), key)) {
-        return format("first_stage_model%c%s%cdown%c%d%cdownsample%cconv",
-                      seq, m[0].c_str(), seq, seq, std::stoi(m[1]), seq, seq);
-    }
-
-    if (match(m, std::regex(format("vae%c(.*)%cdown_blocks%c(\\d+)%cresnets%c(\\d+)%c(.+)", seq, seq, seq, seq, seq, seq)), key)) {
-        std::string suffix = m[3];
-        if (suffix == "conv_shortcut") {
-            suffix = "nin_shortcut";
-        }
-        return format("first_stage_model%c%s%cdown%c%d%cblock%c%s%c%s",
-                      seq, m[0].c_str(), seq, seq, std::stoi(m[1]), seq, seq, m[2].c_str(), seq, suffix.c_str());
-    }
-
-    if (match(m, std::regex(format("vae%c(.*)%cup_blocks%c(\\d+)%cupsamplers%c0%cconv", seq, seq, seq, seq, seq, seq)), key)) {
-        return format("first_stage_model%c%s%cup%c%d%cupsample%cconv",
-                      seq, m[0].c_str(), seq, seq, 3 - std::stoi(m[1]), seq, seq);
-    }
-
-    if (match(m, std::regex(format("vae%c(.*)", seq)), key)) {
-        return format("first_stage_model%c", seq) + m[0];
-    }
-
-    return key;
-}
-
-std::string convert_tensor_name(std::string name) {
-    if (starts_with(name, "diffusion_model")) {
-        name = "model." + name;
-    }
-    if (starts_with(name, "model.diffusion_model.up_blocks.0.attentions.0.")) {
-        name.replace(0, sizeof("model.diffusion_model.up_blocks.0.attentions.0.") - 1,
-                     "model.diffusion_model.output_blocks.0.1.");
-    }
-    if (starts_with(name, "model.diffusion_model.up_blocks.0.attentions.1.")) {
-        name.replace(0, sizeof("model.diffusion_model.up_blocks.0.attentions.1.") - 1,
-                     "model.diffusion_model.output_blocks.1.1.");
-    }
-    // size_t pos = name.find("lora_A");
-    // if (pos != std::string::npos) {
-    //     name.replace(pos, strlen("lora_A"), "lora_up");
-    // }
-    // pos = name.find("lora_B");
-    // if (pos != std::string::npos) {
-    //     name.replace(pos, strlen("lora_B"), "lora_down");
-    // }
-    std::string new_name = name;
-    if (starts_with(name, "cond_stage_model.") ||
-        starts_with(name, "conditioner.embedders.") ||
-        starts_with(name, "text_encoders.") ||
-        ends_with(name, ".vision_model.visual_projection.weight") ||
-        starts_with(name, "qwen2vl")) {
-        new_name = convert_cond_model_name(name);
-    } else if (starts_with(name, "first_stage_model.decoder")) {
-        new_name = convert_vae_decoder_name(name);
-    } else if (starts_with(name, "pmid.qformer_perceiver")) {
-        new_name = convert_pmid_v2_name(name);
-    } else if (starts_with(name, "control_model.")) {  // for controlnet pth models
-        size_t pos = name.find('.');
-        if (pos != std::string::npos) {
-            new_name = name.substr(pos + 1);
-        }
-    } else if (starts_with(name, "lora_")) {  // for lora
-        size_t pos = name.find('.');
-        if (pos != std::string::npos) {
-            std::string name_without_network_parts = name.substr(5, pos - 5);
-            std::string network_part               = name.substr(pos + 1);
-
-            // LOG_DEBUG("%s %s", name_without_network_parts.c_str(), network_part.c_str());
-            std::string new_key = convert_diffusers_name_to_compvis(name_without_network_parts, '_');
-            /* For dealing with the new SDXL LoRA tensor naming convention */
-            new_key = convert_sdxl_lora_name(new_key);
-
-            if (new_key.empty()) {
-                new_name = name;
-            } else {
-                new_name = "lora." + new_key + "." + network_part;
-            }
-        } else {
-            new_name = name;
-        }
-    } else if (ends_with(name, ".diff") || ends_with(name, ".diff_b")) {
-        new_name = "lora." + name;
-    } else if (contains(name, "lora_up") || contains(name, "lora_down") ||
-               contains(name, "lora.up") || contains(name, "lora.down") ||
-               contains(name, "lora_linear") || ends_with(name, ".alpha")) {
-        size_t pos = new_name.find(".processor");
-        if (pos != std::string::npos) {
-            new_name.replace(pos, strlen(".processor"), "");
-        }
-        // if (starts_with(new_name, "transformer.transformer_blocks") || starts_with(new_name, "transformer.single_transformer_blocks")) {
-        //     new_name = "model.diffusion_model." + new_name;
-        // }
-        if (ends_with(name, ".alpha")) {
-            pos = new_name.rfind("alpha");
-        } else {
-            pos = new_name.rfind("lora");
-        }
-        if (pos != std::string::npos) {
-            std::string name_without_network_parts = new_name.substr(0, pos - 1);
-            std::string network_part               = new_name.substr(pos);
-            // LOG_DEBUG("%s %s", name_without_network_parts.c_str(), network_part.c_str());
-            std::string new_key = convert_diffusers_name_to_compvis(name_without_network_parts, '.');
-            new_key             = convert_sdxl_lora_name(new_key);
-            replace_all_chars(new_key, '.', '_');
-            size_t npos = network_part.rfind("_linear_layer");
-            if (npos != std::string::npos) {
-                network_part.replace(npos, strlen("_linear_layer"), "");
-            }
-            if (starts_with(network_part, "lora.")) {
-                network_part = "lora_" + network_part.substr(5);
-            }
-            if (new_key.size() > 0) {
-                new_name = "lora." + new_key + "." + network_part;
-            }
-            // LOG_DEBUG("new name: %s", new_name.c_str());
-        }
-    } else if (starts_with(name, "unet") || starts_with(name, "vae") || starts_with(name, "te")) {  // for diffuser
-        size_t pos = name.find_last_of('.');
-        if (pos != std::string::npos) {
-            std::string name_without_network_parts = name.substr(0, pos);
-            std::string network_part               = name.substr(pos + 1);
-            // LOG_DEBUG("%s %s", name_without_network_parts.c_str(), network_part.c_str());
-            std::string new_key = convert_diffusers_name_to_compvis(name_without_network_parts, '.');
-            if (new_key.empty()) {
-                new_name = name;
-            } else if (new_key == "cond_stage_model.1.transformer.text_model.text_projection") {
-                new_name = new_key;
-            } else {
-                new_name = new_key + "." + network_part;
-            }
-        } else {
-            new_name = name;
-        }
-    } else {
-        new_name = name;
-    }
-    // if (new_name != name) {
-    //     LOG_DEBUG("%s => %s", name.c_str(), new_name.c_str());
-    // }
-    return new_name;
-}
-
-float bf16_to_f32(uint16_t bfloat16) {
-    uint32_t val_bits = (static_cast<uint32_t>(bfloat16) << 16);
-    return *reinterpret_cast<float*>(&val_bits);
-}
-
 uint16_t f8_e4m3_to_f16(uint8_t f8) {
    // do we need to support uz?

@ -827,13 +205,6 @@ uint16_t f8_e5m2_to_f16(uint8_t fp8) {
    return fp16_sign | (fp16_exponent << 10) | fp16_mantissa;
 }

-void bf16_to_f32_vec(uint16_t* src, float* dst, int64_t n) {
-    // support inplace op
-    for (int64_t i = n - 1; i >= 0; i--) {
-        dst[i] = bf16_to_f32(src[i]);
-    }
-}
-
 void f8_e4m3_to_f16_vec(uint8_t* src, uint16_t* dst, int64_t n) {
    // support inplace op
    for (int64_t i = n - 1; i >= 0; i--) {
@ -886,8 +257,8 @@ void convert_tensor(void* src,
        } else {
            auto qtype = ggml_get_type_traits(src_type);
            if (qtype->to_float == nullptr) {
-                throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available",
-                                                ggml_type_name(src_type)));
+                throw std::runtime_error(sd_format("type %s unsupported for integer quantization: no dequantization available",
+                                                   ggml_type_name(src_type)));
            }
            qtype->to_float(src, (float*)dst, n);
        }
@ -896,8 +267,8 @@ void convert_tensor(void* src,
        // src_type is quantized => dst_type == GGML_TYPE_F16 or dst_type is quantized
        auto qtype = ggml_get_type_traits(src_type);
        if (qtype->to_float == nullptr) {
-            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available",
-                                            ggml_type_name(src_type)));
+            throw std::runtime_error(sd_format("type %s unsupported for integer quantization: no dequantization available",
+                                               ggml_type_name(src_type)));
        }
        std::vector<char> buf;
        buf.resize(sizeof(float) * n);
@ -916,9 +287,7 @@ void convert_tensor(void* src,
 /*================================================= ModelLoader ==================================================*/

 void ModelLoader::add_tensor_storage(const TensorStorage& tensor_storage) {
-    TensorStorage copy            = tensor_storage;
-    copy.name                     = convert_tensor_name(copy.name);
-    tensor_storage_map[copy.name] = std::move(copy);
+    tensor_storage_map[tensor_storage.name] = tensor_storage;  // stored under the name exactly as given; this function no longer calls convert_tensor_name
 }

 bool is_zip_file(const std::string& file_path) {
@ -1012,6 +381,31 @@ bool ModelLoader::init_from_file(const std::string& file_path, const std::string
    }
 }

+void ModelLoader::convert_tensors_name() {  // Rebuilds tensor_storage_map so every entry is named and keyed by its converted tensor name.
+    SDVersion version = (version_ == VERSION_COUNT) ? get_sd_version() : version_;  // call get_sd_version() only when no version has been recorded in version_
+    String2TensorStorage new_map;
+
+    for (auto& [_, tensor_storage] : tensor_storage_map) {
+        auto new_name = convert_tensor_name(tensor_storage.name, version);
+        // LOG_DEBUG("%s -> %s", tensor_storage.name.c_str(), new_name.c_str());
+        tensor_storage.name = new_name;
+        new_map[new_name]   = std::move(tensor_storage);  // NOTE(review): entries converting to the same name presumably overwrite each other (last one wins) - confirm this is intended
+    }
+
+    tensor_storage_map.swap(new_map);  // install the rebuilt map; the moved-from entries leave with new_map
+}
+
+bool ModelLoader::init_from_file_and_convert_name(const std::string& file_path, const std::string& prefix, SDVersion version) {  // Runs init_from_file(file_path, prefix), then convert_tensors_name(); returns false if init_from_file fails.
+    if (version_ == VERSION_COUNT && version != VERSION_COUNT) {  // adopt the caller's version only when none is recorded yet; an already-set version_ is never replaced
+        version_ = version;
+    }
+    if (!init_from_file(file_path, prefix)) {
+        return false;  // names are left unconverted when loading fails
+    }
+    convert_tensors_name();  // NOTE(review): this walks the whole map, so entries converted by an earlier call are converted again - confirm convert_tensor_name is safe to re-apply
+    return true;
+}
+
 /*================================================= GGUFModelLoader ==================================================*/

 bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::string& prefix) {
@ -1089,7 +483,7 @@ ggml_type str_to_ggml_type(const std::string& dtype) {
    if (dtype == "F16") {
        ttype = GGML_TYPE_F16;
    } else if (dtype == "BF16") {
-        ttype = GGML_TYPE_F32;
+        ttype = GGML_TYPE_BF16;
    } else if (dtype == "F32") {
        ttype = GGML_TYPE_F32;
    } else if (dtype == "F64") {
@ -1217,10 +611,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const

        size_t tensor_data_size = end - begin;

-        if (dtype == "BF16") {
-            tensor_storage.is_bf16 = true;
-            GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size * 2);
-        } else if (dtype == "F8_E4M3") {
+        if (dtype == "F8_E4M3") {
            tensor_storage.is_f8_e4m3 = true;
            // f8 -> f16
            GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size * 2);
@ -1259,32 +650,6 @@ bool ModelLoader::init_from_diffusers_file(const std::string& file_path, const s
    if (!init_from_safetensors_file(unet_path, "unet.")) {
        return false;
    }
-    for (auto& [name, tensor_storage] : tensor_storage_map) {
-        if (name.find("add_embedding") != std::string::npos || name.find("label_emb") != std::string::npos) {
-            // probably SDXL
-            LOG_DEBUG("Fixing name for SDXL output blocks.2.2");
-            String2TensorStorage new_tensor_storage_map;
-
-            for (auto& [name, tensor_storage] : tensor_storage_map) {
-                int len  = 34;
-                auto pos = tensor_storage.name.find("unet.up_blocks.0.upsamplers.0.conv");
-                if (pos == std::string::npos) {
-                    len = 44;
-                    pos = tensor_storage.name.find("model.diffusion_model.output_blocks.2.1.conv");
-                }
-                if (pos != std::string::npos) {
-                    std::string new_name = "model.diffusion_model.output_blocks.2.2.conv" + name.substr(len);
-                    LOG_DEBUG("NEW NAME: %s", new_name.c_str());
-                    tensor_storage.name              = new_name;
-                    new_tensor_storage_map[new_name] = tensor_storage;
-                } else {
-                    new_tensor_storage_map[name] = tensor_storage;
-                }
-            }
-            tensor_storage_map = new_tensor_storage_map;
-            break;
-        }
-    }

    if (!init_from_safetensors_file(vae_path, "vae.")) {
        LOG_WARN("Couldn't find working VAE in %s", file_path.c_str());
@ -1688,6 +1053,15 @@ SDVersion ModelLoader::get_sd_version() {
            if (tensor_storage.name.find("model.diffusion_model.transformer_blocks.0.img_mod.1.weight") != std::string::npos) {
                return VERSION_QWEN_IMAGE;
            }
+            if (tensor_storage.name.find("model.diffusion_model.double_stream_modulation_img.lin.weight") != std::string::npos) {
+                return VERSION_FLUX2;
+            }
+            if (tensor_storage.name.find("model.diffusion_model.double_blocks.0.img_mlp.gate_proj.weight") != std::string::npos) {
+                return VERSION_OVIS_IMAGE;
+            }
+            if (tensor_storage.name.find("model.diffusion_model.cap_embedder.0.weight") != std::string::npos) {
+                return VERSION_Z_IMAGE;
+            }
            if (tensor_storage.name.find("model.diffusion_model.blocks.0.cross_attn.norm_k.weight") != std::string::npos) {
                is_wan = true;
            }
@ -1788,6 +1162,9 @@ SDVersion ModelLoader::get_sd_version() {
        if (is_inpaint) {
            return VERSION_SD2_INPAINT;
        }
+        if (!has_middle_block_1) {
+            return VERSION_SD2_TINY_UNET;
+        }
        return VERSION_SD2;
    }
    return VERSION_COUNT;
@ -1877,15 +1254,59 @@ std::map<ggml_type, uint32_t> ModelLoader::get_vae_wtype_stat() {
    return wtype_stat;
 }

-void ModelLoader::set_wtype_override(ggml_type wtype, std::string prefix) {
+static std::vector<std::pair<std::string, ggml_type>> parse_tensor_type_rules(const std::string& tensor_type_rules) {  // Parses a comma-separated list of "pattern=type" items into (pattern, ggml_type) pairs, in input order.
+    std::vector<std::pair<std::string, ggml_type>> result;
+    for (const auto& item : split_string(tensor_type_rules, ',')) {
+        if (item.size() == 0)
+            continue;  // silently skip empty items
+        std::string::size_type pos = item.find('=');  // the first '=' separates the pattern from the type name
+        if (pos == std::string::npos) {
+            LOG_WARN("ignoring invalid quant override \"%s\"", item.c_str());
+            continue;
+        }
+        std::string tensor_pattern = item.substr(0, pos);
+        std::string type_name      = item.substr(pos + 1);
+
+        ggml_type tensor_type = GGML_TYPE_COUNT;  // GGML_TYPE_COUNT doubles as the "type name not recognized" marker
+
+        if (type_name == "f32") {  // "f32" is accepted directly, bypassing the type-traits lookup below
+            tensor_type = GGML_TYPE_F32;
+        } else {
+            for (size_t i = 0; i < GGML_TYPE_COUNT; i++) {
+                auto trait = ggml_get_type_traits((ggml_type)i);
+                if (trait->to_float && trait->type_size && type_name == trait->type_name) {  // accept only types with a to_float routine and a non-zero type_size
+                    tensor_type = (ggml_type)i;  // no break here: the scan continues, so a later type with the same name would replace this one
+                }
+            }
+        }
+
+        if (tensor_type != GGML_TYPE_COUNT) {
+            result.emplace_back(tensor_pattern, tensor_type);  // the pattern is stored as a plain string; it is not validated here
+        } else {
+            LOG_WARN("ignoring invalid quant override \"%s\"", item.c_str());
+        }
+    }
+    return result;
+}
+
+void ModelLoader::set_wtype_override(ggml_type wtype, std::string tensor_type_rules) {
+    // Chooses a target type for every stored tensor and records it in
+    // expected_type when tensor_should_be_converted() agrees.
+    //
+    // wtype             - default target type; GGML_TYPE_COUNT means "no default",
+    //                     so tensors matched by no rule are left untouched.
+    // tensor_type_rules - comma-separated "pattern=type" items (see
+    //                     parse_tensor_type_rules). Each pattern is a regex searched
+    //                     within the tensor name; the first matching rule wins.
+    //
+    // Each pattern is compiled once here instead of once per tensor per rule.
+    // A pattern that is not a valid regex is reported and skipped, in the same
+    // way other malformed rule items are ignored, instead of throwing
+    // std::regex_error out of the loader.
+    std::vector<std::pair<std::regex, ggml_type>> compiled_rules;
+    for (const auto& [pattern_str, rule_type] : parse_tensor_type_rules(tensor_type_rules)) {
+        try {
+            compiled_rules.emplace_back(std::regex(pattern_str), rule_type);
+        } catch (const std::regex_error& e) {
+            LOG_WARN("ignoring invalid quant override pattern \"%s\": %s", pattern_str.c_str(), e.what());
+        }
+    }
    for (auto& [name, tensor_storage] : tensor_storage_map) {
-        if (!starts_with(name, prefix)) {
+        ggml_type dst_type = wtype;
+        for (const auto& [pattern, rule_type] : compiled_rules) {
+            if (std::regex_search(name, pattern)) {
+                dst_type = rule_type;
+                break;
+            }
+        }
+        if (dst_type == GGML_TYPE_COUNT) {
            continue;
        }
-        if (!tensor_should_be_converted(tensor_storage, wtype)) {
+        if (!tensor_should_be_converted(tensor_storage, dst_type)) {
            continue;
        }
-        tensor_storage.expected_type = wtype;
+        tensor_storage.expected_type = dst_type;
    }
 }

@ -1899,6 +1320,16 @@ std::string ModelLoader::load_qwen2_merges() {
    return merges_utf8_str;
 }

+std::string ModelLoader::load_mistral_merges() {  // Returns the embedded mistral_merges_utf8_c_str data as a std::string.
+    std::string merges_utf8_str(reinterpret_cast<const char*>(mistral_merges_utf8_c_str), sizeof(mistral_merges_utf8_c_str));  // copies all sizeof(array) bytes - NOTE(review): if the array ends with a NUL terminator it becomes part of the string; confirm
+    return merges_utf8_str;
+}
+
+std::string ModelLoader::load_mistral_vocab_json() {  // Returns the embedded mistral_vocab_json_utf8_c_str data as a std::string.
+    std::string json_str(reinterpret_cast<const char*>(mistral_vocab_json_utf8_c_str), sizeof(mistral_vocab_json_utf8_c_str));  // copies all sizeof(array) bytes - NOTE(review): if the array ends with a NUL terminator it becomes part of the string; confirm
+    return json_str;
+}
+
 std::string ModelLoader::load_t5_tokenizer_json() {
    std::string json_str(reinterpret_cast<const char*>(t5_tokenizer_json_str), sizeof(t5_tokenizer_json_str));
    return json_str;
@ -1916,13 +1347,13 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
    std::atomic<int64_t> copy_to_backend_time_ms(0);
    std::atomic<int64_t> convert_time_ms(0);

-    int num_threads_to_use = n_threads_p > 0 ? n_threads_p : get_num_physical_cores();
+    int num_threads_to_use = n_threads_p > 0 ? n_threads_p : sd_get_num_physical_cores();
    LOG_DEBUG("using %d threads for model loading", num_threads_to_use);

    int64_t start_time = ggml_time_ms();

    std::vector<TensorStorage> processed_tensor_storages;
-    for (auto& [name, tensor_storage] : tensor_storage_map) {
+    for (const auto& [name, tensor_storage] : tensor_storage_map) {
        if (is_unused_tensor(tensor_storage.name)) {
            continue;
        }
@ -2079,9 +1510,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
                    read_time_ms.fetch_add(t1 - t0);

                    t0 = ggml_time_ms();
-                    if (tensor_storage.is_bf16) {
-                        bf16_to_f32_vec((uint16_t*)read_buf, (float*)target_buf, tensor_storage.nelements());
-                    } else if (tensor_storage.is_f8_e4m3) {
+                    if (tensor_storage.is_f8_e4m3) {
                        f8_e4m3_to_f16_vec((uint8_t*)read_buf, (uint16_t*)target_buf, tensor_storage.nelements());
                    } else if (tensor_storage.is_f8_e5m2) {
                        f8_e5m2_to_f16_vec((uint8_t*)read_buf, (uint16_t*)target_buf, tensor_storage.nelements());
@ -2226,41 +1655,6 @@ bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tenso
    return true;
 }

-std::vector<std::pair<std::string, ggml_type>> parse_tensor_type_rules(const std::string& tensor_type_rules) {
-    std::vector<std::pair<std::string, ggml_type>> result;
-    for (const auto& item : split_string(tensor_type_rules, ',')) {
-        if (item.size() == 0)
-            continue;
-        std::string::size_type pos = item.find('=');
-        if (pos == std::string::npos) {
-            LOG_WARN("ignoring invalid quant override \"%s\"", item.c_str());
-            continue;
-        }
-        std::string tensor_pattern = item.substr(0, pos);
-        std::string type_name      = item.substr(pos + 1);
-
-        ggml_type tensor_type = GGML_TYPE_COUNT;
-
-        if (type_name == "f32") {
-            tensor_type = GGML_TYPE_F32;
-        } else {
-            for (size_t i = 0; i < GGML_TYPE_COUNT; i++) {
-                auto trait = ggml_get_type_traits((ggml_type)i);
-                if (trait->to_float && trait->type_size && type_name == trait->type_name) {
-                    tensor_type = (ggml_type)i;
-                }
-            }
-        }
-
-        if (tensor_type != GGML_TYPE_COUNT) {
-            result.emplace_back(tensor_pattern, tensor_type);
-        } else {
-            LOG_WARN("ignoring invalid quant override \"%s\"", item.c_str());
-        }
-    }
-    return result;
-}
-
 bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type) {
    const std::string& name = tensor_storage.name;
    if (type != GGML_TYPE_COUNT) {
@ -2391,6 +1785,7 @@ bool convert(const char* input_path, const char* vae_path, const char* output_pa
            return false;
        }
    }
+    model_loader.convert_tensors_name();
    bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type, tensor_type_rules);
    return success;
 }
--- a/model.h
+++ b/model.h
@ -15,6 +15,7 @@
 #include "ggml.h"
 #include "gguf.h"
 #include "json.hpp"
+#include "ordered_map.hpp"
 #include "zip.h"

 #define SD_MAX_DIMS 5
@ -26,6 +27,7 @@ enum SDVersion {
    VERSION_SD1_TINY_UNET,
    VERSION_SD2,
    VERSION_SD2_INPAINT,
+    VERSION_SD2_TINY_UNET,
    VERSION_SDXL,
    VERSION_SDXL_INPAINT,
    VERSION_SDXL_PIX2PIX,
@ -41,6 +43,9 @@ enum SDVersion {
    VERSION_WAN2_2_I2V,
    VERSION_WAN2_2_TI2V,
    VERSION_QWEN_IMAGE,
+    VERSION_FLUX2,
+    VERSION_Z_IMAGE,
+    VERSION_OVIS_IMAGE,
    VERSION_COUNT,
 };

@ -52,7 +57,7 @@ static inline bool sd_version_is_sd1(SDVersion version) {
 }

 static inline bool sd_version_is_sd2(SDVersion version) {
-    if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT) {
+    if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET) {
        return true;
    }
    return false;
@ -86,12 +91,20 @@ static inline bool sd_version_is_flux(SDVersion version) {
        version == VERSION_FLUX_FILL ||
        version == VERSION_FLUX_CONTROLS ||
        version == VERSION_FLEX_2 ||
+        version == VERSION_OVIS_IMAGE ||
        version == VERSION_CHROMA_RADIANCE) {
        return true;
    }
    return false;
 }

+static inline bool sd_version_is_flux2(SDVersion version) {  // True only when version is VERSION_FLUX2.
+    if (version == VERSION_FLUX2) {
+        return true;
+    }
+    return false;
+}
+
 static inline bool sd_version_is_wan(SDVersion version) {
    if (version == VERSION_WAN2 || version == VERSION_WAN2_2_I2V || version == VERSION_WAN2_2_TI2V) {
        return true;
@ -106,8 +119,19 @@ static inline bool sd_version_is_qwen_image(SDVersion version) {
    return false;
 }

+static inline bool sd_version_is_z_image(SDVersion version) {  // True only when version is VERSION_Z_IMAGE.
+    if (version == VERSION_Z_IMAGE) {
+        return true;
+    }
+    return false;
+}
+
 static inline bool sd_version_is_inpaint(SDVersion version) {
-    if (version == VERSION_SD1_INPAINT || version == VERSION_SD2_INPAINT || version == VERSION_SDXL_INPAINT || version == VERSION_FLUX_FILL || version == VERSION_FLEX_2) {
+    if (version == VERSION_SD1_INPAINT ||
+        version == VERSION_SD2_INPAINT ||
+        version == VERSION_SDXL_INPAINT ||
+        version == VERSION_FLUX_FILL ||
+        version == VERSION_FLEX_2) {
        return true;
    }
    return false;
@ -115,9 +139,11 @@ static inline bool sd_version_is_inpaint(SDVersion version) {

 static inline bool sd_version_is_dit(SDVersion version) {
    if (sd_version_is_flux(version) ||
+        sd_version_is_flux2(version) ||
        sd_version_is_sd3(version) ||
        sd_version_is_wan(version) ||
-        sd_version_is_qwen_image(version)) {
+        sd_version_is_qwen_image(version) ||
+        sd_version_is_z_image(version)) {
        return true;
    }
    return false;
@ -144,7 +170,6 @@ struct TensorStorage {
    std::string name;
    ggml_type type          = GGML_TYPE_F32;
    ggml_type expected_type = GGML_TYPE_COUNT;
-    bool is_bf16            = false;
    bool is_f8_e4m3         = false;
    bool is_f8_e5m2         = false;
    bool is_f64             = false;
@ -178,7 +203,7 @@ struct TensorStorage {
    }

    int64_t nbytes_to_read() const {
-        if (is_bf16 || is_f8_e4m3 || is_f8_e5m2) {
+        if (is_f8_e4m3 || is_f8_e5m2) {
            return nbytes() / 2;
        } else if (is_f64 || is_i64) {
            return nbytes() * 2;
@ -226,9 +251,7 @@ struct TensorStorage {
    std::string to_string() const {
        std::stringstream ss;
        const char* type_name = ggml_type_name(type);
-        if (is_bf16) {
-            type_name = "bf16";
-        } else if (is_f8_e4m3) {
+        if (is_f8_e4m3) {
            type_name = "f8_e4m3";
        } else if (is_f8_e5m2) {
            type_name = "f8_e5m2";
@ -252,10 +275,11 @@ struct TensorStorage {

 typedef std::function<bool(const TensorStorage&, ggml_tensor**)> on_new_tensor_cb_t;

-typedef std::map<std::string, TensorStorage> String2TensorStorage;
+typedef OrderedMap<std::string, TensorStorage> String2TensorStorage;

 class ModelLoader {
 protected:
+    SDVersion version_ = VERSION_COUNT;
    std::vector<std::string> file_paths_;
    String2TensorStorage tensor_storage_map;

@ -275,13 +299,17 @@ protected:

 public:
    bool init_from_file(const std::string& file_path, const std::string& prefix = "");
+    void convert_tensors_name();
+    bool init_from_file_and_convert_name(const std::string& file_path,
+                                         const std::string& prefix = "",
+                                         SDVersion version         = VERSION_COUNT);
    SDVersion get_sd_version();
    std::map<ggml_type, uint32_t> get_wtype_stat();
    std::map<ggml_type, uint32_t> get_conditioner_wtype_stat();
    std::map<ggml_type, uint32_t> get_diffusion_model_wtype_stat();
    std::map<ggml_type, uint32_t> get_vae_wtype_stat();
    String2TensorStorage& get_tensor_storage_map() { return tensor_storage_map; }
-    void set_wtype_override(ggml_type wtype, std::string prefix = "");
+    void set_wtype_override(ggml_type wtype, std::string tensor_type_rules = "");
    bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0);
    bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
                      std::set<std::string> ignore_tensors = {},
@ -302,6 +330,8 @@ public:

    static std::string load_merges();
    static std::string load_qwen2_merges();
+    static std::string load_mistral_merges();
+    static std::string load_mistral_vocab_json();
    static std::string load_t5_tokenizer_json();
    static std::string load_umt5_tokenizer_json();
 };
--- a/name_conversion.cpp
+++ b/name_conversion.cpp
--- a/name_conversion.h
+++ b/name_conversion.h
@ -0,0 +1,14 @@
+#ifndef __NAME_CONVERSION_H__
+#define __NAME_CONVERSION_H__
+
+#include <string>
+
+#include "model.h"
+
+// Predicates that classify a tensor name by the model component it belongs to.
+// NOTE(review): the definitions are not visible from this header; presumably
+// each tests the name against that component's naming scheme - confirm.
+bool is_cond_stage_model_name(const std::string& name);
+bool is_diffusion_model_name(const std::string& name);
+bool is_first_stage_model_name(const std::string& name);
+
+// Returns the converted form of tensor `name` for the given model `version`.
+std::string convert_tensor_name(std::string name, SDVersion version);
+
+#endif  // __NAME_CONVERSION_H__
--- a/ordered_map.hpp
+++ b/ordered_map.hpp
@ -0,0 +1,215 @
+#ifndef __ORDERED_MAP_HPP__
+#define __ORDERED_MAP_HPP__
+
+#include <initializer_list>
+#include <iostream>
+#include <iterator>
+#include <list>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <utility>
+
+// Associative container that iterates in insertion order.
+//
+// Elements are stored in a std::list (data_), which defines the iteration order;
+// a std::unordered_map (index_) maps every key to the list iterator of its
+// element so lookups do not have to scan the list.
+template <typename Key, typename T>
+class OrderedMap {
+public:
+    using key_type        = Key;
+    using mapped_type     = T;
+    using value_type      = std::pair<const Key, T>;
+    using list_type       = std::list<value_type>;
+    using size_type       = typename list_type::size_type;
+    using difference_type = typename list_type::difference_type;
+    using iterator        = typename list_type::iterator;
+    using const_iterator  = typename list_type::const_iterator;
+
+private:
+    list_type data_;                           // elements, in insertion order
+    std::unordered_map<Key, iterator> index_;  // key -> element position in data_
+
+    // Points index_ at the elements of this object's own data_.
+    void rebuild_index() {
+        index_.clear();
+        index_.reserve(data_.size());
+        for (auto it = data_.begin(); it != data_.end(); ++it)
+            index_.emplace(it->first, it);
+    }
+
+public:
+    // --- constructors ---
+    OrderedMap() = default;
+
+    // Inserts the entries in order; a key that repeats keeps its first value.
+    OrderedMap(std::initializer_list<value_type> init) {
+        for (const auto& kv : init)
+            insert(kv);
+    }
+
+    // A member-wise copy would leave index_ holding iterators into other.data_,
+    // so the index is rebuilt against the freshly copied list instead.
+    OrderedMap(const OrderedMap& other)
+        : data_(other.data_) {
+        rebuild_index();
+    }
+
+    // Moving a std::list keeps iterators to its elements valid, so the moved
+    // index still refers to the moved elements.
+    OrderedMap(OrderedMap&&) noexcept = default;
+
+    // Copy-and-swap: value_type has a const key, so the list cannot be assigned
+    // element by element, and the index must refer to this object's own list.
+    OrderedMap& operator=(const OrderedMap& other) {
+        if (this != &other) {
+            OrderedMap copy(other);
+            swap(copy);
+        }
+        return *this;
+    }
+
+    OrderedMap& operator=(OrderedMap&&) noexcept = default;
+
+    // --- element access ---
+    // Returns the value stored under key; throws std::out_of_range if absent.
+    T& at(const Key& key) {
+        auto it = index_.find(key);
+        if (it == index_.end())
+            throw std::out_of_range("OrderedMap::at: key not found");
+        return it->second->second;
+    }
+
+    const T& at(const Key& key) const {
+        auto it = index_.find(key);
+        if (it == index_.end())
+            throw std::out_of_range("OrderedMap::at: key not found");
+        return it->second->second;
+    }
+
+    // Returns the value under key, appending a value-initialized element first
+    // when the key is not present yet.
+    T& operator[](const Key& key) {
+        auto it = index_.find(key);
+        if (it == index_.end()) {
+            data_.emplace_back(key, T{});
+            auto iter   = std::prev(data_.end());
+            index_[key] = iter;
+            return iter->second;
+        }
+        return it->second->second;
+    }
+
+    // --- iterators ---
+    iterator begin() noexcept { return data_.begin(); }
+    const_iterator begin() const noexcept { return data_.begin(); }
+    const_iterator cbegin() const noexcept { return data_.cbegin(); }
+
+    iterator end() noexcept { return data_.end(); }
+    const_iterator end() const noexcept { return data_.end(); }
+    const_iterator cend() const noexcept { return data_.cend(); }
+
+    // --- capacity ---
+    bool empty() const noexcept { return data_.empty(); }
+    size_type size() const noexcept { return data_.size(); }
+
+    // --- modifiers ---
+    void clear() noexcept {
+        data_.clear();
+        index_.clear();
+    }
+
+    // Appends value unless its key already exists. Returns the iterator of the
+    // element with that key and whether an insertion took place.
+    std::pair<iterator, bool> insert(const value_type& value) {
+        auto it = index_.find(value.first);
+        if (it != index_.end()) {
+            return {it->second, false};
+        }
+        data_.push_back(value);
+        auto iter           = std::prev(data_.end());
+        index_[value.first] = iter;
+        return {iter, true};
+    }
+
+    std::pair<iterator, bool> insert(value_type&& value) {
+        auto it = index_.find(value.first);
+        if (it != index_.end()) {
+            return {it->second, false};
+        }
+        data_.push_back(std::move(value));
+        auto iter           = std::prev(data_.end());
+        index_[iter->first] = iter;
+        return {iter, true};
+    }
+
+    // Removes the element stored under key, if any.
+    void erase(const Key& key) {
+        auto it = index_.find(key);
+        if (it != index_.end()) {
+            data_.erase(it->second);
+            index_.erase(it);
+        }
+    }
+
+    // Removes the element at pos and returns the iterator following it.
+    iterator erase(iterator pos) {
+        index_.erase(pos->first);
+        return data_.erase(pos);
+    }
+
+    // --- lookup ---
+    size_type count(const Key& key) const {
+        return index_.count(key);
+    }
+
+    iterator find(const Key& key) {
+        auto it = index_.find(key);
+        if (it == index_.end())
+            return data_.end();
+        return it->second;
+    }
+
+    const_iterator find(const Key& key) const {
+        auto it = index_.find(key);
+        if (it == index_.end())
+            return data_.end();
+        return it->second;
+    }
+
+    bool contains(const Key& key) const {
+        return index_.find(key) != index_.end();
+    }
+
+    // --- comparison ---
+    // Equal when both maps hold equal elements in the same order.
+    bool operator==(const OrderedMap& other) const {
+        return data_ == other.data_;
+    }
+
+    bool operator!=(const OrderedMap& other) const {
+        return !(*this == other);
+    }
+
+    // Constructs a value_type from args and appends it unless its key exists.
+    template <typename... Args>
+    std::pair<iterator, bool> emplace(Args&&... args) {
+        value_type value(std::forward<Args>(args)...);
+        auto it = index_.find(value.first);
+        if (it != index_.end()) {
+            return {it->second, false};
+        }
+        data_.push_back(std::move(value));
+        auto iter           = std::prev(data_.end());
+        index_[iter->first] = iter;
+        return {iter, true};
+    }
+
+    void swap(OrderedMap& other) noexcept {
+        data_.swap(other.data_);
+        index_.swap(other.index_);
+    }
+};
+
+#endif  // __ORDERED_MAP_HPP__
--- a/pmid.hpp
+++ b/pmid.hpp
@ -548,7 +548,7 @@ public:
        return gf;
    }

-    void compute(const int n_threads,
+    bool compute(const int n_threads,
                 struct ggml_tensor* id_pixel_values,
                 struct ggml_tensor* prompt_embeds,
                 struct ggml_tensor* id_embeds,
@ -561,7 +561,7 @@ public:
        };

        // GGMLRunner::compute(get_graph, n_threads, updated_prompt_embeds);
-        GGMLRunner::compute(get_graph, n_threads, true, updated_prompt_embeds, output_ctx);
+        return GGMLRunner::compute(get_graph, n_threads, true, updated_prompt_embeds, output_ctx);
    }
 };

@ -578,7 +578,7 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
                      const std::string& file_path = "",
                      const std::string& prefix    = "")
        : file_path(file_path), GGMLRunner(backend, offload_params_to_cpu), model_loader(ml) {
-        if (!model_loader->init_from_file(file_path, prefix)) {
+        if (!model_loader->init_from_file_and_convert_name(file_path, prefix)) {
            load_failed = true;
        }
    }
--- a/qwen_image.hpp
+++ b/qwen_image.hpp
@ -94,10 +94,14 @@ namespace Qwen {
            blocks["norm_added_q"] = std::shared_ptr<GGMLBlock>(new RMSNorm(dim_head, eps));
            blocks["norm_added_k"] = std::shared_ptr<GGMLBlock>(new RMSNorm(dim_head, eps));

-            float scale = 1.f / 32.f;
+            float scale         = 1.f / 32.f;
+            bool force_prec_f32 = false;
+#ifdef SD_USE_VULKAN
+            force_prec_f32 = true;
+#endif
            // The purpose of the scale here is to prevent NaN issues in certain situations.
            // For example when using CUDA but the weights are k-quants (not all prompts).
-            blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_dim, out_bias, false, false, scale));
+            blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_dim, out_bias, false, force_prec_f32, scale));
            // to_out.1 is nn.Dropout

            blocks["to_add_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_context_dim, out_bias, false, false, scale));
@ -543,7 +547,7 @@ namespace Qwen {
                                        std::vector<ggml_tensor*> ref_latents = {},
                                        bool increase_ref_index               = false) {
            GGML_ASSERT(x->ne[3] == 1);
-            struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, QWEN_IMAGE_GRAPH_SIZE, false);
+            struct ggml_cgraph* gf = new_graph_custom(QWEN_IMAGE_GRAPH_SIZE);

            x         = to_backend(x);
            context   = to_backend(context);
@ -584,7 +588,7 @@ namespace Qwen {
            return gf;
        }

-        void compute(int n_threads,
+        bool compute(int n_threads,
                     struct ggml_tensor* x,
                     struct ggml_tensor* timesteps,
                     struct ggml_tensor* context,
@ -599,7 +603,7 @@ namespace Qwen {
                return build_graph(x, timesteps, context, ref_latents, increase_ref_index);
            };

-            GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+            return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
        }

        void test() {
@ -644,7 +648,7 @@ namespace Qwen {
            ggml_type model_data_type = GGML_TYPE_Q8_0;

            ModelLoader model_loader;
-            if (!model_loader.init_from_file(file_path, "model.diffusion_model.")) {
+            if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) {
                LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
                return;
            }
--- a/rng_mt19937.hpp
+++ b/rng_mt19937.hpp
@ -0,0 +1,166 @
+#ifndef __RNG_MT19937_HPP__
+#define __RNG_MT19937_HPP__
+
+#include <array>
+#include <cmath>
+#include <cstdint>
+#include <limits>
+#include <vector>
+
+#include "rng.hpp"
+
+// RNG imitating torch randn on CPU.
+// Port from pytorch, original license: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/LICENSE
+// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/TransformationHelper.h, for uniform_real
+// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/native/cpu/DistributionTemplates.h, for normal_kernel/normal_fill/normal_fill_16
+// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/MT19937RNGEngine.h, for mt19937_engine
+// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/DistributionsHelper.h, for uniform_real_distribution/normal_distribution
+class MT19937RNG : public RNG {
+    // MT19937 parameters: state size N, offset M, twist constant and the upper/lower bit masks.
+    static const int N             = 624;
+    static const int M             = 397;
+    static const uint32_t MATRIX_A = 0x9908b0dfU;
+    static const uint32_t UMASK    = 0x80000000U;
+    static const uint32_t LMASK    = 0x7fffffffU;
+
+    struct State {
+        uint64_t seed_;  // seed passed to the last manual_seed() call
+        int left_;       // countdown; the state is regenerated when it reaches 0
+        bool seeded_;
+        uint32_t next_;  // index of the next state word to hand out
+        std::array<uint32_t, N> state_;
+        bool has_next_gauss = false;  // true while next_gauss holds an unused sample
+        double next_gauss   = 0.0;
+    };
+
+    State s;
+
+    uint32_t mix_bits(uint32_t u, uint32_t v) { return (u & UMASK) | (v & LMASK); }
+    uint32_t twist(uint32_t u, uint32_t v) { return (mix_bits(u, v) >> 1) ^ ((v & 1) ? MATRIX_A : 0); }
+    // Regenerates all N state words in place and restarts reading from word 0.
+    void next_state() {
+        uint32_t* p = s.state_.data();
+        s.left_     = N;
+        s.next_     = 0;
+        for (int j = N - M + 1; --j; p++)
+            p[0] = p[M] ^ twist(p[0], p[1]);
+        for (int j = M; --j; p++)
+            p[0] = p[M - N] ^ twist(p[0], p[1]);
+        p[0] = p[M - N] ^ twist(p[0], s.state_[0]);
+    }
+
+    // Returns the next state word after tempering, regenerating the state when it is used up.
+    uint32_t rand_uint32() {
+        if (--s.left_ == 0)
+            next_state();
+        uint32_t y = s.state_[s.next_++];
+        y ^= (y >> 11);
+        y ^= (y << 7) & 0x9d2c5680U;
+        y ^= (y << 15) & 0xefc60000U;
+        y ^= (y >> 18);
+        return y;
+    }
+
+    // Combines two consecutive 32-bit draws; the first one becomes the high word.
+    uint64_t rand_uint64() {
+        uint64_t high = (uint64_t)rand_uint32();
+        uint64_t low  = (uint64_t)rand_uint32();
+        return (high << 32) | low;
+    }
+
+    // Scales the low numeric_limits<T>::digits bits of val into [0, 1) and then
+    // maps that value linearly onto the range from..to.
+    template <typename T, typename V>
+    T uniform_real(V val, T from, T to) {
+        constexpr auto MASK    = static_cast<V>((static_cast<uint64_t>(1) << std::numeric_limits<T>::digits) - 1);
+        constexpr auto DIVISOR = static_cast<T>(1) / (static_cast<uint64_t>(1) << std::numeric_limits<T>::digits);
+        T x                    = (val & MASK) * DIVISOR;
+        return (x * (to - from) + from);
+    }
+
+    // Box-Muller transform in double precision. Every pair of uniforms yields two
+    // normals; the second is cached in the state and returned by the next call.
+    double normal_double_value(double mean, double std) {
+        if (s.has_next_gauss) {
+            s.has_next_gauss = false;
+            return s.next_gauss;
+        }
+        double u1 = uniform_real(rand_uint64(), 0., 1.);  // double
+        double u2 = uniform_real(rand_uint64(), 0., 1.);  // double
+
+        double r         = std::sqrt(-2.0 * std::log1p(-u2));
+        double theta     = 2.0 * 3.14159265358979323846 * u1;
+        double value     = r * std::cos(theta) * std + mean;
+        s.next_gauss     = r * std::sin(theta) * std + mean;
+        s.has_next_gauss = true;
+        return value;
+    }
+
+    // Turns the 16 uniform samples in data[0..15] into 16 normal samples in place,
+    // pairing data[j] with data[j + 8] (Box-Muller in single precision).
+    void normal_fill_16(float* data, float mean, float std) {
+        for (int j = 0; j < 8; ++j) {
+            float u1    = 1.0f - data[j];
+            float u2    = data[j + 8];
+            float r     = std::sqrt(-2.0f * std::log(u1));
+            float theta = 2.0f * 3.14159265358979323846 * u2;
+            data[j]     = r * std::cos(theta) * std + mean;
+            data[j + 8] = r * std::sin(theta) * std + mean;
+        }
+    }
+
+    // Fills data[0..size) with normal samples. The order of draws follows the
+    // PyTorch normal_fill routine referenced at the top of this file.
+    void randn(float* data, int64_t size, float mean = 0.0f, float std = 1.0f) {
+        if (size >= 16) {
+            // One uniform per element first, then transform each full group of 16.
+            for (int64_t i = 0; i < size; i++) {
+                data[i] = uniform_real(rand_uint32(), 0.f, 1.f);
+            }
+            for (int64_t i = 0; i < size - 15; i += 16) {
+                normal_fill_16(data + i, mean, std);
+            }
+            if (size % 16 != 0) {
+                // Recompute the last 16 values.
+                data = data + size - 16;
+                for (int64_t i = 0; i < 16; i++) {
+                    data[i] = uniform_real(rand_uint32(), 0.f, 1.f);
+                }
+                normal_fill_16(data, mean, std);
+            }
+        } else {
+            // Strange handling, hard to understand, but keeping it consistent with PyTorch.
+            for (int64_t i = 0; i < size; i++) {
+                data[i] = (float)normal_double_value(mean, std);
+            }
+        }
+    }
+
+public:
+    MT19937RNG(uint64_t seed = 0) { manual_seed(seed); }
+
+    // Fills the state from the low 32 bits of seed. left_ = 1 makes the first
+    // draw regenerate the state; any cached normal sample is dropped.
+    void manual_seed(uint64_t seed) override {
+        s.seed_     = seed;
+        s.seeded_   = true;
+        s.state_[0] = (uint32_t)(seed & 0xffffffffU);
+        for (int j = 1; j < N; j++) {
+            uint32_t prev = s.state_[j - 1];
+            s.state_[j]   = 1812433253U * (prev ^ (prev >> 30)) + j;
+        }
+        s.left_          = 1;
+        s.next_          = 0;
+        s.has_next_gauss = false;
+    }
+
+    // Returns n samples drawn with mean 0 and standard deviation 1.
+    std::vector<float> randn(uint32_t n) override {
+        std::vector<float> out;
+        out.resize(n);
+        randn((float*)out.data(), out.size());
+        return out;
+    }
+};
+
+#endif  // __RNG_MT19937_HPP__
--- a/rope.hpp
+++ b/rope.hpp
@ -72,15 +72,30 @@ namespace Rope {
    }

    // Generate IDs for image patches and text
-    __STATIC_INLINE__ std::vector<std::vector<float>> gen_txt_ids(int bs, int context_len) {
-        return std::vector<std::vector<float>>(bs * context_len, std::vector<float>(3, 0.0));
+    __STATIC_INLINE__ std::vector<std::vector<float>> gen_flux_txt_ids(int bs, int context_len, int axes_dim_num, std::set<int> arange_dims) {
+        auto txt_ids = std::vector<std::vector<float>>(bs * context_len, std::vector<float>(axes_dim_num, 0.0f));
+        for (int dim = 0; dim < axes_dim_num; dim++) {
+            if (arange_dims.find(dim) != arange_dims.end()) {
+                for (int i = 0; i < bs * context_len; i++) {
+                    txt_ids[i][dim] = (i % context_len);
+                }
+            }
+        }
+        return txt_ids;
    }

-    __STATIC_INLINE__ std::vector<std::vector<float>> gen_img_ids(int h, int w, int patch_size, int bs, int index = 0, int h_offset = 0, int w_offset = 0) {
+    __STATIC_INLINE__ std::vector<std::vector<float>> gen_flux_img_ids(int h,
+                                                                       int w,
+                                                                       int patch_size,
+                                                                       int bs,
+                                                                       int axes_dim_num,
+                                                                       int index    = 0,
+                                                                       int h_offset = 0,
+                                                                       int w_offset = 0) {
        int h_len = (h + (patch_size / 2)) / patch_size;
        int w_len = (w + (patch_size / 2)) / patch_size;

-        std::vector<std::vector<float>> img_ids(h_len * w_len, std::vector<float>(3, 0.0));
+        std::vector<std::vector<float>> img_ids(h_len * w_len, std::vector<float>(axes_dim_num, 0.0));

        std::vector<float> row_ids = linspace<float>(h_offset, h_len - 1 + h_offset, h_len);
        std::vector<float> col_ids = linspace<float>(w_offset, w_len - 1 + w_offset, w_len);
@ -153,8 +168,10 @@ namespace Rope {

    __STATIC_INLINE__ std::vector<std::vector<float>> gen_refs_ids(int patch_size,
                                                                   int bs,
+                                                                   int axes_dim_num,
                                                                   const std::vector<ggml_tensor*>& ref_latents,
-                                                                   bool increase_ref_index) {
+                                                                   bool increase_ref_index,
+                                                                   float ref_index_scale) {
        std::vector<std::vector<float>> ids;
        uint64_t curr_h_offset = 0;
        uint64_t curr_w_offset = 0;
@ -170,7 +187,14 @@ namespace Rope {
                }
            }

-            auto ref_ids = gen_img_ids(ref->ne[1], ref->ne[0], patch_size, bs, index, h_offset, w_offset);
+            auto ref_ids = gen_flux_img_ids(ref->ne[1],
+                                            ref->ne[0],
+                                            patch_size,
+                                            bs,
+                                            axes_dim_num,
+                                            static_cast<int>(index * ref_index_scale),
+                                            h_offset,
+                                            w_offset);
            ids          = concat_ids(ids, ref_ids, bs);

            if (increase_ref_index) {
@ -187,15 +211,18 @@ namespace Rope {
                                                                   int w,
                                                                   int patch_size,
                                                                   int bs,
+                                                                   int axes_dim_num,
                                                                   int context_len,
+                                                                   std::set<int> txt_arange_dims,
                                                                   const std::vector<ggml_tensor*>& ref_latents,
-                                                                   bool increase_ref_index) {
-        auto txt_ids = gen_txt_ids(bs, context_len);
-        auto img_ids = gen_img_ids(h, w, patch_size, bs);
+                                                                   bool increase_ref_index,
+                                                                   float ref_index_scale) {
+        auto txt_ids = gen_flux_txt_ids(bs, context_len, axes_dim_num, txt_arange_dims);
+        auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num);

        auto ids = concat_ids(txt_ids, img_ids, bs);
        if (ref_latents.size() > 0) {
-            auto refs_ids = gen_refs_ids(patch_size, bs, ref_latents, increase_ref_index);
+            auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, ref_latents, increase_ref_index, ref_index_scale);
            ids           = concat_ids(ids, refs_ids, bs);
        }
        return ids;
@ -207,11 +234,22 @@ namespace Rope {
                                                     int patch_size,
                                                     int bs,
                                                     int context_len,
+                                                     std::set<int> txt_arange_dims,
                                                     const std::vector<ggml_tensor*>& ref_latents,
                                                     bool increase_ref_index,
+                                                     float ref_index_scale,
                                                     int theta,
                                                     const std::vector<int>& axes_dim) {
-        std::vector<std::vector<float>> ids = gen_flux_ids(h, w, patch_size, bs, context_len, ref_latents, increase_ref_index);
+        std::vector<std::vector<float>> ids = gen_flux_ids(h,
+                                                           w,
+                                                           patch_size,
+                                                           bs,
+                                                           static_cast<int>(axes_dim.size()),
+                                                           context_len,
+                                                           txt_arange_dims,
+                                                           ref_latents,
+                                                           increase_ref_index,
+                                                           ref_index_scale);
        return embed_nd(ids, bs, theta, axes_dim);
    }

@ -232,10 +270,11 @@ namespace Rope {
                txt_ids_repeated[i * txt_ids.size() + j] = {txt_ids[j], txt_ids[j], txt_ids[j]};
            }
        }
-        auto img_ids = gen_img_ids(h, w, patch_size, bs);
-        auto ids     = concat_ids(txt_ids_repeated, img_ids, bs);
+        int axes_dim_num = 3;
+        auto img_ids     = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num);
+        auto ids         = concat_ids(txt_ids_repeated, img_ids, bs);
        if (ref_latents.size() > 0) {
-            auto refs_ids = gen_refs_ids(patch_size, bs, ref_latents, increase_ref_index);
+            auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, ref_latents, increase_ref_index, 1.f);
            ids           = concat_ids(ids, refs_ids, bs);
        }
        return ids;
@ -345,6 +384,67 @ namespace Rope {
        return embed_nd(ids, 1, theta, axes_dim);
    }

+    // Returns the smallest non-negative padding p such that (a + p) is a multiple of m.
+    // A non-positive m is treated as "no alignment" and yields 0 rather than dividing by zero.
+    __STATIC_INLINE__ int bound_mod(int a, int m) {
+        if (m <= 0) {
+            return 0;
+        }
+        return (m - (a % m)) % m;
+    }
+
+    // Builds the position ids (3 axes per token) for z_image: text tokens first, then image
+    // patches, each segment padded to a multiple of seq_multi_of.
+    //   - text ids:  axis 0 runs 1..padded_context_len within each batch item, other axes are 0
+    //   - image ids: produced by gen_flux_img_ids with index = padded_context_len + 1
+    //   - image padding ids are all zeros
+    // ref_latents and increase_ref_index are accepted but not used yet.
+    __STATIC_INLINE__ std::vector<std::vector<float>> gen_z_image_ids(int h,
+                                                                      int w,
+                                                                      int patch_size,
+                                                                      int bs,
+                                                                      int context_len,
+                                                                      int seq_multi_of,
+                                                                      const std::vector<ggml_tensor*>& ref_latents,
+                                                                      bool increase_ref_index) {
+        const int axes_dim_num = 3;
+
+        int padded_context_len = context_len + bound_mod(context_len, seq_multi_of);
+        auto txt_ids           = std::vector<std::vector<float>>(bs * padded_context_len, std::vector<float>(axes_dim_num, 0.0f));
+        for (int i = 0; i < bs * padded_context_len; i++) {
+            txt_ids[i][0] = (i % padded_context_len) + 1.f;
+        }
+
+        int index    = padded_context_len + 1;
+        auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num, index);
+
+        int img_pad_len = bound_mod(static_cast<int>(img_ids.size() / bs), seq_multi_of);
+        if (img_pad_len > 0) {
+            std::vector<std::vector<float>> img_pad_ids(bs * img_pad_len, std::vector<float>(axes_dim_num, 0.f));
+            img_ids = concat_ids(img_ids, img_pad_ids, bs);
+        }
+
+        auto ids = concat_ids(txt_ids, img_ids, bs);
+
+        // ignore ref_latents for now
+        return ids;
+    }
+
+    // Generate z_image positional embeddings
+    __STATIC_INLINE__ std::vector<float> gen_z_image_pe(int h,
+                                                        int w,
+                                                        int patch_size,
+                                                        int bs,
+                                                        int context_len,
+                                                        int seq_multi_of,
+                                                        const std::vector<ggml_tensor*>& ref_latents,
+                                                        bool increase_ref_index,
+                                                        int theta,
+                                                        const std::vector<int>& axes_dim) {
+        std::vector<std::vector<float>> ids = gen_z_image_ids(h, w, patch_size, bs, context_len, seq_multi_of, ref_latents, increase_ref_index);
+        return embed_nd(ids, bs, theta, axes_dim);
+    }
+
    __STATIC_INLINE__ struct ggml_tensor* apply_rope(struct ggml_context* ctx,
                                                     struct ggml_tensor* x,
                                                     struct ggml_tensor* pe,
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@ -31,46 +31,46 @@ extern "C" {
 enum rng_type_t {
    STD_DEFAULT_RNG,
    CUDA_RNG,
+    CPU_RNG,
    RNG_TYPE_COUNT
 };

 enum sample_method_t {
-    SAMPLE_METHOD_DEFAULT,
-    EULER,
-    HEUN,
-    DPM2,
-    DPMPP2S_A,
-    DPMPP2M,
-    DPMPP2Mv2,
-    IPNDM,
-    IPNDM_V,
-    LCM,
-    DDIM_TRAILING,
-    TCD,
-    EULER_A,
+    EULER_SAMPLE_METHOD,
+    EULER_A_SAMPLE_METHOD,
+    HEUN_SAMPLE_METHOD,
+    DPM2_SAMPLE_METHOD,
+    DPMPP2S_A_SAMPLE_METHOD,
+    DPMPP2M_SAMPLE_METHOD,
+    DPMPP2Mv2_SAMPLE_METHOD,
+    IPNDM_SAMPLE_METHOD,
+    IPNDM_V_SAMPLE_METHOD,
+    LCM_SAMPLE_METHOD,
+    DDIM_TRAILING_SAMPLE_METHOD,
+    TCD_SAMPLE_METHOD,
    SAMPLE_METHOD_COUNT
 };

 enum scheduler_t {
-    DEFAULT,
-    DISCRETE,
-    KARRAS,
-    EXPONENTIAL,
-    AYS,
-    GITS,
-    SGM_UNIFORM,
-    SIMPLE,
-    SMOOTHSTEP,
-    SCHEDULE_COUNT
+    DISCRETE_SCHEDULER,
+    KARRAS_SCHEDULER,
+    EXPONENTIAL_SCHEDULER,
+    AYS_SCHEDULER,
+    GITS_SCHEDULER,
+    SGM_UNIFORM_SCHEDULER,
+    SIMPLE_SCHEDULER,
+    SMOOTHSTEP_SCHEDULER,
+    LCM_SCHEDULER,
+    SCHEDULER_COUNT
 };

 enum prediction_t {
-    DEFAULT_PRED,
    EPS_PRED,
    V_PRED,
    EDM_V_PRED,
-    SD3_FLOW_PRED,
+    FLOW_PRED,
    FLUX_FLOW_PRED,
+    FLUX2_FLOW_PRED,
    PREDICTION_COUNT
 };

@ -126,6 +126,21 @@ enum sd_log_level_t {
    SD_LOG_ERROR
 };

+enum preview_t {
+    PREVIEW_NONE,
+    PREVIEW_PROJ,
+    PREVIEW_TAE,
+    PREVIEW_VAE,
+    PREVIEW_COUNT
+};
+
+enum lora_apply_mode_t {
+    LORA_APPLY_AUTO,
+    LORA_APPLY_IMMEDIATELY,
+    LORA_APPLY_AT_RUNTIME,
+    LORA_APPLY_MODE_COUNT,
+};
+
 typedef struct {
    bool enabled;
    int tile_size_x;
@ -135,33 +150,43 @@ typedef struct {
    float rel_size_y;
 } sd_tiling_params_t;

+typedef struct {
+    const char* name;
+    const char* path;
+} sd_embedding_t;
+
 typedef struct {
    const char* model_path;
    const char* clip_l_path;
    const char* clip_g_path;
    const char* clip_vision_path;
    const char* t5xxl_path;
-    const char* qwen2vl_path;
-    const char* qwen2vl_vision_path;
+    const char* llm_path;
+    const char* llm_vision_path;
    const char* diffusion_model_path;
    const char* high_noise_diffusion_model_path;
    const char* vae_path;
    const char* taesd_path;
    const char* control_net_path;
    const char* lora_model_dir;
-    const char* embedding_dir;
+    const sd_embedding_t* embeddings;
+    uint32_t embedding_count;
    const char* photo_maker_path;
+    const char* tensor_type_rules;
    bool vae_decode_only;
    bool free_params_immediately;
    int n_threads;
    enum sd_type_t wtype;
    enum rng_type_t rng_type;
+    enum rng_type_t sampler_rng_type;
    enum prediction_t prediction;
+    enum lora_apply_mode_t lora_apply_mode;
    bool offload_params_to_cpu;
    bool keep_clip_on_cpu;
    bool keep_control_net_on_cpu;
    bool keep_vae_on_cpu;
    bool diffusion_flash_attn;
+    bool tae_preview_only;
    bool diffusion_conv_direct;
    bool vae_conv_direct;
    bool force_sdxl_vae_conv_scale;
@ -210,6 +235,21 @@ typedef struct {
 } sd_pm_params_t;  // photo maker

 typedef struct {
+    bool enabled;
+    float reuse_threshold;
+    float start_percent;
+    float end_percent;
+} sd_easycache_params_t;
+
+typedef struct {
+    bool is_high_noise;
+    float multiplier;
+    const char* path;
+} sd_lora_t;
+
+typedef struct {
+    const sd_lora_t* loras;
+    uint32_t lora_count;
    const char* prompt;
    const char* negative_prompt;
    int clip_skip;
@ -229,9 +269,12 @@ typedef struct {
    float control_strength;
    sd_pm_params_t pm_params;
    sd_tiling_params_t vae_tiling_params;
+    sd_easycache_params_t easycache;
 } sd_img_gen_params_t;

 typedef struct {
+    const sd_lora_t* loras;
+    uint32_t lora_count;
    const char* prompt;
    const char* negative_prompt;
    int clip_skip;
@ -248,16 +291,19 @@ typedef struct {
    int64_t seed;
    int video_frames;
    float vace_strength;
+    sd_easycache_params_t easycache;
 } sd_vid_gen_params_t;

 typedef struct sd_ctx_t sd_ctx_t;

 typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);
 typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data);
+typedef void (*sd_preview_cb_t)(int step, int frame_count, sd_image_t* frames, bool is_noisy, void* data);

 SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
 SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
-SD_API int32_t get_num_physical_cores();
+SD_API void sd_set_preview_callback(sd_preview_cb_t cb, enum preview_t mode, int interval, bool denoised, bool noisy, void* data);
+SD_API int32_t sd_get_num_physical_cores();
 SD_API const char* sd_get_system_info();

 SD_API const char* sd_type_name(enum sd_type_t type);
@ -266,21 +312,29 @@ SD_API const char* sd_rng_type_name(enum rng_type_t rng_type);
 SD_API enum rng_type_t str_to_rng_type(const char* str);
 SD_API const char* sd_sample_method_name(enum sample_method_t sample_method);
 SD_API enum sample_method_t str_to_sample_method(const char* str);
-SD_API const char* sd_schedule_name(enum scheduler_t scheduler);
-SD_API enum scheduler_t str_to_schedule(const char* str);
+SD_API const char* sd_scheduler_name(enum scheduler_t scheduler);
+SD_API enum scheduler_t str_to_scheduler(const char* str);
 SD_API const char* sd_prediction_name(enum prediction_t prediction);
 SD_API enum prediction_t str_to_prediction(const char* str);
+SD_API const char* sd_preview_name(enum preview_t preview);
+SD_API enum preview_t str_to_preview(const char* str);
+SD_API const char* sd_lora_apply_mode_name(enum lora_apply_mode_t mode);
+SD_API enum lora_apply_mode_t str_to_lora_apply_mode(const char* str);
+
+SD_API void sd_easycache_params_init(sd_easycache_params_t* easycache_params);

 SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params);
 SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);

 SD_API sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params);
 SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
-SD_API enum sample_method_t sd_get_default_sample_method(const sd_ctx_t* sd_ctx);

 SD_API void sd_sample_params_init(sd_sample_params_t* sample_params);
 SD_API char* sd_sample_params_to_str(const sd_sample_params_t* sample_params);

+SD_API enum sample_method_t sd_get_default_sample_method(const sd_ctx_t* sd_ctx);
+SD_API enum scheduler_t sd_get_default_scheduler(const sd_ctx_t* sd_ctx);
+
 SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params);
 SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params);
 SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params);
@ -293,7 +347,8 @@ typedef struct upscaler_ctx_t upscaler_ctx_t;
 SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
                                        bool offload_params_to_cpu,
                                        bool direct,
-                                        int n_threads);
+                                        int n_threads,
+                                        int tile_size);
 SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);

 SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx,
@ -315,6 +370,9 @@ SD_API bool preprocess_canny(sd_image_t image,
                             float strong,
                             bool inverse);

+SD_API const char* sd_commit(void);
+SD_API const char* sd_version(void);
+
 #ifdef __cplusplus
 }
 #endif
--- a/t5.hpp
+++ b/t5.hpp
@ -820,7 +820,7 @@ struct T5Runner : public GGMLRunner {
        return gf;
    }

-    void compute(const int n_threads,
+    bool compute(const int n_threads,
                 struct ggml_tensor* input_ids,
                 struct ggml_tensor* attention_mask,
                 ggml_tensor** output,
@ -828,7 +828,7 @@ struct T5Runner : public GGMLRunner {
        auto get_graph = [&]() -> struct ggml_cgraph* {
            return build_graph(input_ids, attention_mask);
        };
-        GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
+        return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
    }

    static std::vector<int> _relative_position_bucket(const std::vector<int>& relative_position,
@ -1004,7 +1004,7 @@ struct T5Embedder {
        ggml_type model_data_type = GGML_TYPE_F16;

        ModelLoader model_loader;
-        if (!model_loader.init_from_file(file_path)) {
+        if (!model_loader.init_from_file_and_convert_name(file_path)) {
            LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
            return;
        }
--- a/tae.hpp
+++ b/tae.hpp
@ -222,7 +222,7 @@ struct TinyAutoEncoder : public GGMLRunner {
        }

        ModelLoader model_loader;
-        if (!model_loader.init_from_file(file_path)) {
+        if (!model_loader.init_from_file_and_convert_name(file_path)) {
            LOG_ERROR("init taesd model loader from file failed: '%s'", file_path.c_str());
            return false;
        }
@ -247,7 +247,7 @@ struct TinyAutoEncoder : public GGMLRunner {
        return gf;
    }

-    void compute(const int n_threads,
+    bool compute(const int n_threads,
                 struct ggml_tensor* z,
                 bool decode_graph,
                 struct ggml_tensor** output,
@ -256,7 +256,7 @@ struct TinyAutoEncoder : public GGMLRunner {
            return build_graph(z, decode_graph);
        };

-        GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+        return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
    }
 };

--- a/tokenize_util.cpp
+++ b/tokenize_util.cpp
@ -811,6 +811,8 @@ bool starts_with(const std::vector<char32_t>& text,
    return std::equal(prefix.begin(), prefix.end(), text.begin() + index);
 }

+// mistral: [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+
+// qwen2: (?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+
 std::vector<std::string> token_split(const std::string& text) {
    std::vector<std::string> tokens;
    auto cps = utf8_to_codepoints(text);
--- a/unet.hpp
+++ b/unet.hpp
@ -7,7 +7,7 @@

 /*==================================================== UnetModel =====================================================*/

-#define UNET_GRAPH_SIZE 10240
+#define UNET_GRAPH_SIZE 102400

 class SpatialVideoTransformer : public SpatialTransformer {
 protected:
@ -180,6 +180,7 @@ protected:
    int num_head_channels                  = -1;   // channels // num_heads
    int context_dim                        = 768;  // 1024 for VERSION_SD2, 2048 for VERSION_SDXL
    bool use_linear_projection             = false;
+    bool tiny_unet                         = false;

 public:
    int model_channels  = 320;
@ -208,15 +209,17 @@ public:
            num_head_channels     = 64;
            num_heads             = -1;
            use_linear_projection = true;
-        } else if (version == VERSION_SD1_TINY_UNET) {
-            num_res_blocks = 1;
-            channel_mult   = {1, 2, 4};
        }
        if (sd_version_is_inpaint(version)) {
            in_channels = 9;
        } else if (sd_version_is_unet_edit(version)) {
            in_channels = 8;
        }
+        if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET) {
+            num_res_blocks = 1;
+            channel_mult   = {1, 2, 4};
+            tiny_unet      = true;
+        }

        // dims is always 2
        // use_temporal_attention is always True for SVD
@ -290,7 +293,7 @@ public:
                                                                                  context_dim));
                }
                input_block_chans.push_back(ch);
-                if (version == VERSION_SD1_TINY_UNET) {
+                if (tiny_unet) {
                    input_block_idx++;
                }
            }
@ -311,7 +314,7 @@ public:
            d_head = num_head_channels;
            n_head = ch / d_head;
        }
-        if (version != VERSION_SD1_TINY_UNET) {
+        if (!tiny_unet) {
            blocks["middle_block.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
            if (version != VERSION_SDXL_SSD1B) {
                blocks["middle_block.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
@ -358,7 +361,7 @@ public:
                }

                if (i > 0 && j == num_res_blocks) {
-                    if (version == VERSION_SD1_TINY_UNET) {
+                    if (tiny_unet) {
                        output_block_idx++;
                        if (output_block_idx == 2) {
                            up_sample_idx = 1;
@ -495,7 +498,7 @@ public:
                }
                hs.push_back(h);
            }
-            if (version == VERSION_SD1_TINY_UNET) {
+            if (tiny_unet) {
                input_block_idx++;
            }
            if (i != len_mults - 1) {
@ -512,7 +515,7 @@ public:
        // [N, 4*model_channels, h/8, w/8]

        // middle_block
-        if (version != VERSION_SD1_TINY_UNET) {
+        if (!tiny_unet) {
            h = resblock_forward("middle_block.0", ctx, h, emb, num_video_frames);  // [N, 4*model_channels, h/8, w/8]
            if (version != VERSION_SDXL_SSD1B) {
                h = attention_layer_forward("middle_block.1", ctx, h, context, num_video_frames);  // [N, 4*model_channels, h/8, w/8]
@ -554,7 +557,7 @@ public:
                }

                if (i > 0 && j == num_res_blocks) {
-                    if (version == VERSION_SD1_TINY_UNET) {
+                    if (tiny_unet) {
                        output_block_idx++;
                        if (output_block_idx == 2) {
                            up_sample_idx = 1;
@ -609,7 +612,7 @@ struct UNetModelRunner : public GGMLRunner {
                                    int num_video_frames                      = -1,
                                    std::vector<struct ggml_tensor*> controls = {},
                                    float control_strength                    = 0.f) {
-        struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, UNET_GRAPH_SIZE, false);
+        struct ggml_cgraph* gf = new_graph_custom(UNET_GRAPH_SIZE);

        if (num_video_frames == -1) {
            num_video_frames = x->ne[3];
@ -642,7 +645,7 @@ struct UNetModelRunner : public GGMLRunner {
        return gf;
    }

-    void compute(int n_threads,
+    bool compute(int n_threads,
                 struct ggml_tensor* x,
                 struct ggml_tensor* timesteps,
                 struct ggml_tensor* context,
@ -662,7 +665,7 @@ struct UNetModelRunner : public GGMLRunner {
            return build_graph(x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength);
        };

-        GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+        return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
    }

    void test() {
--- a/upscaler.cpp
+++ b/upscaler.cpp
@ -9,12 +9,15 @@ struct UpscalerGGML {
    std::shared_ptr<ESRGAN> esrgan_upscaler;
    std::string esrgan_path;
    int n_threads;
-    bool direct = false;
+    bool direct   = false;
+    int tile_size = 128;

    UpscalerGGML(int n_threads,
-                 bool direct = false)
+                 bool direct   = false,
+                 int tile_size = 128)
        : n_threads(n_threads),
-          direct(direct) {
+          direct(direct),
+          tile_size(tile_size) {
    }

    bool load_from_file(const std::string& esrgan_path,
@ -42,7 +45,7 @@ struct UpscalerGGML {
        backend = ggml_backend_sycl_init(0);
 #endif
        ModelLoader model_loader;
-        if (!model_loader.init_from_file(esrgan_path)) {
+        if (!model_loader.init_from_file_and_convert_name(esrgan_path)) {
            LOG_ERROR("init model loader from file failed: '%s'", esrgan_path.c_str());
        }
        model_loader.set_wtype_override(model_data_type);
@ -51,7 +54,7 @@ struct UpscalerGGML {
            backend = ggml_backend_cpu_init();
        }
        LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
-        esrgan_upscaler = std::make_shared<ESRGAN>(backend, offload_params_to_cpu, model_loader.get_tensor_storage_map());
+        esrgan_upscaler = std::make_shared<ESRGAN>(backend, offload_params_to_cpu, tile_size, model_loader.get_tensor_storage_map());
        if (direct) {
            esrgan_upscaler->set_conv2d_direct_enabled(true);
        }
@ -113,14 +116,15 @@ struct upscaler_ctx_t {
 upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
                                 bool offload_params_to_cpu,
                                 bool direct,
-                                 int n_threads) {
+                                 int n_threads,
+                                 int tile_size) {
    upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t));
    if (upscaler_ctx == nullptr) {
        return nullptr;
    }
    std::string esrgan_path(esrgan_path_c_str);

-    upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct);
+    upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct, tile_size);
    if (upscaler_ctx->upscaler == nullptr) {
        return nullptr;
    }
--- a/util.cpp
+++ b/util.cpp
@ -5,6 +5,7 @@
 #include <cstdarg>
 #include <fstream>
 #include <locale>
+#include <regex>
 #include <sstream>
 #include <string>
 #include <thread>
@ -56,7 +57,7 @@ void replace_all_chars(std::string& str, char target, char replacement) {
    }
 }

-std::string format(const char* fmt, ...) {
+std::string sd_format(const char* fmt, ...) {
    va_list ap;
    va_list ap2;
    va_start(ap, fmt);
@ -94,20 +95,6 @@ bool is_directory(const std::string& path) {
    return (attributes != INVALID_FILE_ATTRIBUTES && (attributes & FILE_ATTRIBUTE_DIRECTORY));
 }

-std::string get_full_path(const std::string& dir, const std::string& filename) {
-    std::string full_path = dir + "\\" + filename;
-
-    WIN32_FIND_DATA find_file_data;
-    HANDLE hFind = FindFirstFile(full_path.c_str(), &find_file_data);
-
-    if (hFind != INVALID_HANDLE_VALUE) {
-        FindClose(hFind);
-        return full_path;
-    } else {
-        return "";
-    }
-}
-
 #else  // Unix
 #include <dirent.h>
 #include <sys/stat.h>
@ -122,32 +109,12 @@ bool is_directory(const std::string& path) {
    return (stat(path.c_str(), &buffer) == 0 && S_ISDIR(buffer.st_mode));
 }

-// TODO: add windows version
-std::string get_full_path(const std::string& dir, const std::string& filename) {
-    DIR* dp = opendir(dir.c_str());
-
-    if (dp != nullptr) {
-        struct dirent* entry;
-
-        while ((entry = readdir(dp)) != nullptr) {
-            if (strcasecmp(entry->d_name, filename.c_str()) == 0) {
-                closedir(dp);
-                return dir + "/" + entry->d_name;
-            }
-        }
-
-        closedir(dp);
-    }
-
-    return "";
-}
-
 #endif

 // get_num_physical_cores is copy from
 // https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp
 // LICENSE: https://github.com/ggerganov/llama.cpp/blob/master/LICENSE
-int32_t get_num_physical_cores() {
+int32_t sd_get_num_physical_cores() {
 #ifdef __linux__
    // enumerate the set of thread siblings, num entries is num cores
    std::unordered_set<std::string> siblings;
@ -185,6 +152,13 @@ int32_t get_num_physical_cores() {
 static sd_progress_cb_t sd_progress_cb = nullptr;
 void* sd_progress_cb_data              = nullptr;

+static sd_preview_cb_t sd_preview_cb = nullptr;
+static void* sd_preview_cb_data      = nullptr;
+preview_t sd_preview_mode            = PREVIEW_NONE;
+int sd_preview_interval              = 1;
+bool sd_preview_denoised             = true;
+bool sd_preview_noisy                = false;
+
 std::u32string utf8_to_utf32(const std::string& utf8_str) {
    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
    return converter.from_bytes(utf8_str);
@ -266,13 +240,16 @@ void pretty_progress(int step, int steps, float time) {
        }
    }
    progress += "|";
-    printf(time > 1.0f ? "\r%s %i/%i - %.2fs/it" : "\r%s %i/%i - %.2fit/s\033[K",
-           progress.c_str(), step, steps,
-           time > 1.0f || time == 0 ? time : (1.0f / time));
-    fflush(stdout);  // for linux
-    if (step == steps) {
-        printf("\n");
+
+    const char* lf   = (step == steps ? "\n" : "");
+    const char* unit = "s/it";
+    float speed      = time;
+    if (speed < 1.0f && speed > 0.f) {
+        speed = 1.0f / speed;
+        unit  = "it/s";
    }
+    printf("\r%s %i/%i - %.2f%s\033[K%s", progress.c_str(), step, steps, speed, unit, lf);
+    fflush(stdout);  // for linux
 }

 std::string ltrim(const std::string& s) {
@ -328,23 +305,58 @@ void sd_set_progress_callback(sd_progress_cb_t cb, void* data) {
    sd_progress_cb      = cb;
    sd_progress_cb_data = data;
 }
+void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode, int interval, bool denoised, bool noisy, void* data) {
+    sd_preview_cb       = cb;
+    sd_preview_cb_data  = data;
+    sd_preview_mode     = mode;
+    sd_preview_interval = interval;
+    sd_preview_denoised = denoised;
+    sd_preview_noisy    = noisy;
+}
+
+sd_preview_cb_t sd_get_preview_callback() {
+    return sd_preview_cb;
+}
+void* sd_get_preview_callback_data() {
+    return sd_preview_cb_data;
+}
+
+preview_t sd_get_preview_mode() {
+    return sd_preview_mode;
+}
+int sd_get_preview_interval() {
+    return sd_preview_interval;
+}
+bool sd_should_preview_denoised() {
+    return sd_preview_denoised;
+}
+bool sd_should_preview_noisy() {
+    return sd_preview_noisy;
+}
+
+sd_progress_cb_t sd_get_progress_callback() {
+    return sd_progress_cb;
+}
+void* sd_get_progress_callback_data() {
+    return sd_progress_cb_data;
+}
 const char* sd_get_system_info() {
    static char buffer[1024];
    std::stringstream ss;
    ss << "System Info: \n";
-    ss << "    SSE3 = " << ggml_cpu_has_sse3() << std::endl;
-    ss << "    AVX = " << ggml_cpu_has_avx() << std::endl;
-    ss << "    AVX2 = " << ggml_cpu_has_avx2() << std::endl;
-    ss << "    AVX512 = " << ggml_cpu_has_avx512() << std::endl;
-    ss << "    AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << std::endl;
-    ss << "    AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << std::endl;
-    ss << "    FMA = " << ggml_cpu_has_fma() << std::endl;
-    ss << "    NEON = " << ggml_cpu_has_neon() << std::endl;
-    ss << "    ARM_FMA = " << ggml_cpu_has_arm_fma() << std::endl;
-    ss << "    F16C = " << ggml_cpu_has_f16c() << std::endl;
-    ss << "    FP16_VA = " << ggml_cpu_has_fp16_va() << std::endl;
-    ss << "    WASM_SIMD = " << ggml_cpu_has_wasm_simd() << std::endl;
-    ss << "    VSX = " << ggml_cpu_has_vsx() << std::endl;
+    ss << "    SSE3 = " << ggml_cpu_has_sse3() << " | ";
+    ss << "    AVX = " << ggml_cpu_has_avx() << " | ";
+    ss << "    AVX2 = " << ggml_cpu_has_avx2() << " | ";
+    ss << "    AVX512 = " << ggml_cpu_has_avx512() << " | ";
+    ss << "    AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << " | ";
+    ss << "    AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << " | ";
+    ss << "    FMA = " << ggml_cpu_has_fma() << " | ";
+    ss << "    NEON = " << ggml_cpu_has_neon() << " | ";
+    ss << "    ARM_FMA = " << ggml_cpu_has_arm_fma() << " | ";
+    ss << "    F16C = " << ggml_cpu_has_f16c() << " | ";
+    ss << "    FP16_VA = " << ggml_cpu_has_fp16_va() << " | ";
+    ss << "    WASM_SIMD = " << ggml_cpu_has_wasm_simd() << " | ";
+    ss << "    VSX = " << ggml_cpu_has_vsx() << " | ";
    snprintf(buffer, sizeof(buffer), "%s", ss.str().c_str());
    return buffer;
 }
@ -510,6 +522,8 @@ sd_image_f32_t clip_preprocess(sd_image_f32_t image, int target_width, int targe
 //   (abc) - increases attention to abc by a multiplier of 1.1
 //   (abc:3.12) - increases attention to abc by a multiplier of 3.12
 //   [abc] - decreases attention to abc by a multiplier of 1.1
+//   BREAK - separates the prompt into conceptually distinct parts for sequential processing
+//   B - internal helper pattern; prevents 'B' in 'BREAK' from being consumed as normal text
 //   \( - literal character '('
 //   \[ - literal character '['
 //   \) - literal character ')'
@ -545,7 +559,7 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
    float round_bracket_multiplier  = 1.1f;
    float square_bracket_multiplier = 1 / 1.1f;

-    std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|[^\\()\[\]:]+|:)");
+    std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|\bBREAK\b|[^\\()\[\]:B]+|:|\bB)");
    std::regex re_break(R"(\s*\bBREAK\b\s*)");

    auto multiply_range = [&](int start_position, float multiplier) {
@ -554,7 +568,7 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
        }
    };

-    std::smatch m;
+    std::smatch m, m2;
    std::string remaining_text = text;

    while (std::regex_search(remaining_text, m, re_attention)) {
@ -578,6 +592,8 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
            square_brackets.pop_back();
        } else if (text == "\\(") {
            res.push_back({text.substr(1), 1.0f});
+        } else if (std::regex_search(text, m2, re_break)) {
+            res.push_back({"BREAK", -1.0f});
        } else {
            res.push_back({text, 1.0f});
        }
@ -608,4 +624,4 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
    }

    return res;
-}
+}
--- a/util.h
+++ b/util.h
@ -14,7 +14,7 @@ bool ends_with(const std::string& str, const std::string& ending);
 bool starts_with(const std::string& str, const std::string& start);
 bool contains(const std::string& str, const std::string& substr);

-std::string format(const char* fmt, ...);
+std::string sd_format(const char* fmt, ...);

 void replace_all_chars(std::string& str, char target, char replacement);

@ -22,7 +22,6 @@ int round_up_to(int value, int base);

 bool file_exists(const std::string& filename);
 bool is_directory(const std::string& path);
-std::string get_full_path(const std::string& dir, const std::string& filename);

 std::u32string utf8_to_utf32(const std::string& utf8_str);
 std::string utf32_to_utf8(const std::u32string& utf32_str);
@ -54,6 +53,16 @@ std::string trim(const std::string& s);

 std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text);

+sd_progress_cb_t sd_get_progress_callback();
+void* sd_get_progress_callback_data();
+
+sd_preview_cb_t sd_get_preview_callback();
+void* sd_get_preview_callback_data();
+preview_t sd_get_preview_mode();
+int sd_get_preview_interval();
+bool sd_should_preview_denoised();
+bool sd_should_preview_noisy();
+
 #define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__)
 #define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__)
 #define LOG_WARN(format, ...) log_printf(SD_LOG_WARN, __FILE__, __LINE__, format, ##__VA_ARGS__)
--- a/vae.hpp
+++ b/vae.hpp
@ -66,6 +66,25 @@ protected:
    int64_t in_channels;
    bool use_linear;

+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {
+        auto iter = tensor_storage_map.find(prefix + "proj_out.weight");
+        if (iter != tensor_storage_map.end()) {
+            if (iter->second.n_dims == 4 && use_linear) {
+                use_linear         = false;
+                blocks["q"]        = std::make_shared<Conv2d>(in_channels, in_channels, std::pair{1, 1});
+                blocks["k"]        = std::make_shared<Conv2d>(in_channels, in_channels, std::pair{1, 1});
+                blocks["v"]        = std::make_shared<Conv2d>(in_channels, in_channels, std::pair{1, 1});
+                blocks["proj_out"] = std::make_shared<Conv2d>(in_channels, in_channels, std::pair{1, 1});
+            } else if (iter->second.n_dims == 2 && !use_linear) {
+                use_linear         = true;
+                blocks["q"]        = std::make_shared<Linear>(in_channels, in_channels);
+                blocks["k"]        = std::make_shared<Linear>(in_channels, in_channels);
+                blocks["v"]        = std::make_shared<Linear>(in_channels, in_channels);
+                blocks["proj_out"] = std::make_shared<Linear>(in_channels, in_channels);
+            }
+        }
+    }
+
 public:
    AttnBlock(int64_t in_channels, bool use_linear)
        : in_channels(in_channels), use_linear(use_linear) {
@ -468,6 +487,7 @@ public:
 // ldm.models.autoencoder.AutoencoderKL
 class AutoencodingEngine : public GGMLBlock {
 protected:
+    SDVersion version;
    bool decode_only       = true;
    bool use_video_decoder = false;
    bool use_quant         = true;
@ -488,10 +508,15 @@ public:
                       bool decode_only           = true,
                       bool use_linear_projection = false,
                       bool use_video_decoder     = false)
-        : decode_only(decode_only), use_video_decoder(use_video_decoder) {
+        : version(version), decode_only(decode_only), use_video_decoder(use_video_decoder) {
        if (sd_version_is_dit(version)) {
-            dd_config.z_channels = 16;
-            use_quant            = false;
+            if (sd_version_is_flux2(version)) {
+                dd_config.z_channels = 32;
+                embed_dim            = 32;
+            } else {
+                use_quant            = false;
+                dd_config.z_channels = 16;
+            }
        }
        if (use_video_decoder) {
            use_quant = false;
@ -528,6 +553,24 @@ public:

    struct ggml_tensor* decode(GGMLRunnerContext* ctx, struct ggml_tensor* z) {
        // z: [N, z_channels, h, w]
+        if (sd_version_is_flux2(version)) {
+            // [N, C*p*p, h, w] -> [N, C, h*p, w*p]
+            int64_t p = 2;
+
+            int64_t N = z->ne[3];
+            int64_t C = z->ne[2] / p / p;
+            int64_t h = z->ne[1];
+            int64_t w = z->ne[0];
+            int64_t H = h * p;
+            int64_t W = w * p;
+
+            z = ggml_reshape_4d(ctx->ggml_ctx, z, w * h, p * p, C, N);                           // [N, C, p*p, h*w]
+            z = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, z, 1, 0, 2, 3));  // [N, C, h*w, p*p]
+            z = ggml_reshape_4d(ctx->ggml_ctx, z, p, p, w, h * C * N);                           // [N*C*h, w, p, p]
+            z = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, z, 0, 2, 1, 3));  // [N*C*h, p, w, p]
+            z = ggml_reshape_4d(ctx->ggml_ctx, z, W, H, C, N);                                   // [N, C, h*p, w*p]
+        }
+
        if (use_quant) {
            auto post_quant_conv = std::dynamic_pointer_cast<Conv2d>(blocks["post_quant_conv"]);
            z                    = post_quant_conv->forward(ctx, z);  // [N, z_channels, h, w]
@ -544,19 +587,37 @@ public:
        // x: [N, in_channels, h, w]
        auto encoder = std::dynamic_pointer_cast<Encoder>(blocks["encoder"]);

-        auto h = encoder->forward(ctx, x);  // [N, 2*z_channels, h/8, w/8]
+        auto z = encoder->forward(ctx, x);  // [N, 2*z_channels, h/8, w/8]
        if (use_quant) {
            auto quant_conv = std::dynamic_pointer_cast<Conv2d>(blocks["quant_conv"]);
-            h               = quant_conv->forward(ctx, h);  // [N, 2*embed_dim, h/8, w/8]
+            z               = quant_conv->forward(ctx, z);  // [N, 2*embed_dim, h/8, w/8]
        }
-        return h;
+        if (sd_version_is_flux2(version)) {
+            z = ggml_ext_chunk(ctx->ggml_ctx, z, 2, 2)[0];
+
+            // [N, C, H, W] -> [N, C*p*p, H/p, W/p]
+            int64_t p = 2;
+            int64_t N = z->ne[3];
+            int64_t C = z->ne[2];
+            int64_t H = z->ne[1];
+            int64_t W = z->ne[0];
+            int64_t h = H / p;
+            int64_t w = W / p;
+
+            z = ggml_reshape_4d(ctx->ggml_ctx, z, p, w, p, h * C * N);                 // [N*C*h, p, w, p]
+            z = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, z, 0, 2, 1, 3));  // [N*C*h, w, p, p]
+            z = ggml_reshape_4d(ctx->ggml_ctx, z, p * p, w * h, C, N);                 // [N, C, h*w, p*p]
+            z = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, z, 1, 0, 2, 3));  // [N, C, p*p, h*w]
+            z = ggml_reshape_4d(ctx->ggml_ctx, z, w, h, p * p * C, N);                 // [N, C*p*p, h*w]
+        }
+        return z;
    }
 };

 struct VAE : public GGMLRunner {
    VAE(ggml_backend_t backend, bool offload_params_to_cpu)
        : GGMLRunner(backend, offload_params_to_cpu) {}
-    virtual void compute(const int n_threads,
+    virtual bool compute(const int n_threads,
                         struct ggml_tensor* z,
                         bool decode_graph,
                         struct ggml_tensor** output,
@ -568,7 +629,7 @@ struct VAE : public GGMLRunner {
 struct FakeVAE : public VAE {
    FakeVAE(ggml_backend_t backend, bool offload_params_to_cpu)
        : VAE(backend, offload_params_to_cpu) {}
-    void compute(const int n_threads,
+    bool compute(const int n_threads,
                 struct ggml_tensor* z,
                 bool decode_graph,
                 struct ggml_tensor** output,
@ -580,6 +641,7 @@ struct FakeVAE : public VAE {
            float value = ggml_ext_tensor_get_f32(z, i0, i1, i2, i3);
            ggml_ext_tensor_set_f32(*output, value, i0, i1, i2, i3);
        });
+        return true;
    }

    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) override {}
@ -650,7 +712,7 @@ struct AutoEncoderKL : public VAE {
        return gf;
    }

-    void compute(const int n_threads,
+    bool compute(const int n_threads,
                 struct ggml_tensor* z,
                 bool decode_graph,
                 struct ggml_tensor** output,
@ -661,7 +723,7 @@ struct AutoEncoderKL : public VAE {
        };
        // ggml_set_f32(z, 0.5f);
        // print_ggml_tensor(z);
-        GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+        return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
    }

    void test() {
--- a/version.cpp
+++ b/version.cpp
@ -0,0 +1,20 @@
+#include "stable-diffusion.h"
+
+#ifndef SDCPP_BUILD_COMMIT
+#define SDCPP_BUILD_COMMIT unknown
+#endif
+
+#ifndef SDCPP_BUILD_VERSION
+#define SDCPP_BUILD_VERSION unknown
+#endif
+
+#define STRINGIZE2(x) #x
+#define STRINGIZE(x) STRINGIZE2(x)
+
+const char* sd_commit(void) {
+    return STRINGIZE(SDCPP_BUILD_COMMIT);
+}
+
+const char* sd_version(void) {
+    return STRINGIZE(SDCPP_BUILD_VERSION);
+}
--- a/vocab_mistral.hpp
+++ b/vocab_mistral.hpp
--- a/wan.hpp
+++ b/wan.hpp
@ -1133,7 +1133,7 @@ namespace WAN {
        }

        struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) {
-            struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, 10240 * z->ne[2], false);
+            struct ggml_cgraph* gf = new_graph_custom(10240 * z->ne[2]);

            z = to_backend(z);

@ -1147,7 +1147,7 @@ namespace WAN {
        }

        struct ggml_cgraph* build_graph_partial(struct ggml_tensor* z, bool decode_graph, int64_t i) {
-            struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, 20480, false);
+            struct ggml_cgraph* gf = new_graph_custom(20480);

            ae.clear_cache();

@ -1175,7 +1175,7 @@ namespace WAN {
            return gf;
        }

-        void compute(const int n_threads,
+        bool compute(const int n_threads,
                     struct ggml_tensor* z,
                     bool decode_graph,
                     struct ggml_tensor** output,
@ -1184,7 +1184,7 @@ namespace WAN {
                auto get_graph = [&]() -> struct ggml_cgraph* {
                    return build_graph(z, decode_graph);
                };
-                GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
+                return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
            } else {  // chunk 1 result is weird
                ae.clear_cache();
                int64_t t      = z->ne[2];
@ -1193,11 +1193,11 @@ namespace WAN {
                    return build_graph_partial(z, decode_graph, i);
                };
                struct ggml_tensor* out = nullptr;
-                GGMLRunner::compute(get_graph, n_threads, true, &out, output_ctx);
+                bool res                = GGMLRunner::compute(get_graph, n_threads, true, &out, output_ctx);
                ae.clear_cache();
                if (t == 1) {
                    *output = out;
-                    return;
+                    return res;
                }

                *output = ggml_new_tensor_4d(output_ctx, GGML_TYPE_F32, out->ne[0], out->ne[1], (t - 1) * 4 + 1, out->ne[3]);
@ -1221,11 +1221,12 @@ namespace WAN {
                out = ggml_new_tensor_4d(output_ctx, GGML_TYPE_F32, out->ne[0], out->ne[1], 4, out->ne[3]);

                for (i = 1; i < t; i++) {
-                    GGMLRunner::compute(get_graph, n_threads, true, &out);
+                    res = res || GGMLRunner::compute(get_graph, n_threads, true, &out);
                    ae.clear_cache();
                    copy_to_output();
                }
                free_cache_ctx_and_buffer();
+                return res;
            }
        }

@ -1271,7 +1272,7 @@ namespace WAN {
                vae->get_param_tensors(tensors, "first_stage_model");

                ModelLoader model_loader;
-                if (!model_loader.init_from_file(file_path, "vae.")) {
+                if (!model_loader.init_from_file_and_convert_name(file_path, "vae.")) {
                    LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
                    return;
                }
@ -2075,15 +2076,19 @@ namespace WAN {
                    wan_params.text_len  = 512;
                } else {
                    if (wan_params.vace_layers > 0) {
-                        desc = "Wan2.1-VACE-1.3B";
+                        desc              = "Wan2.1-VACE-1.3B";
+                        wan_params.in_dim = 16;
+                    } else if (wan_params.model_type == "i2v") {
+                        desc              = "Wan2.1-I2V-1.3B";
+                        wan_params.in_dim = 36;
                    } else {
-                        desc = "Wan2.1-T2V-1.3B";
+                        desc              = "Wan2.1-T2V-1.3B";
+                        wan_params.in_dim = 16;
                    }
                    wan_params.dim       = 1536;
                    wan_params.eps       = 1e-06;
                    wan_params.ffn_dim   = 8960;
                    wan_params.freq_dim  = 256;
-                    wan_params.in_dim    = 16;
                    wan_params.num_heads = 12;
                    wan_params.out_dim   = 16;
                    wan_params.text_len  = 512;
@ -2142,7 +2147,7 @@ namespace WAN {
                                        struct ggml_tensor* time_dim_concat = nullptr,
                                        struct ggml_tensor* vace_context    = nullptr,
                                        float vace_strength                 = 1.f) {
-            struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, WAN_GRAPH_SIZE, false);
+            struct ggml_cgraph* gf = new_graph_custom(WAN_GRAPH_SIZE);

            x               = to_backend(x);
            timesteps       = to_backend(timesteps);
@ -2190,7 +2195,7 @@ namespace WAN {
            return gf;
        }

-        void compute(int n_threads,
+        bool compute(int n_threads,
                     struct ggml_tensor* x,
                     struct ggml_tensor* timesteps,
                     struct ggml_tensor* context,
@ -2205,7 +2210,7 @@ namespace WAN {
                return build_graph(x, timesteps, context, clip_fea, c_concat, time_dim_concat, vace_context, vace_strength);
            };

-            GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+            return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
        }

        void test() {
@ -2255,7 +2260,7 @@ namespace WAN {
            LOG_INFO("loading from '%s'", file_path.c_str());

            ModelLoader model_loader;
-            if (!model_loader.init_from_file(file_path, "model.diffusion_model.")) {
+            if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) {
                LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
                return;
            }
--- a/z_image.hpp
+++ b/z_image.hpp
@ -0,0 +1,675 @@
+#ifndef __Z_IMAGE_HPP__
+#define __Z_IMAGE_HPP__
+
+#include <algorithm>
+
+#include "flux.hpp"
+#include "ggml_extend.hpp"
+#include "mmdit.hpp"
+
+// Ref: https://github.com/Alpha-VLLM/Lumina-Image-2.0/blob/main/models/model.py
+// Ref: https://github.com/huggingface/diffusers/pull/12703
+
+#ifndef MIN
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#endif
+
+namespace ZImage {
+    constexpr int Z_IMAGE_GRAPH_SIZE = 20480;
+    constexpr int ADALN_EMBED_DIM    = 256;
+    constexpr int SEQ_MULTI_OF       = 32;
+
+    // Attention block with a fused QKV projection.
+    //
+    // A single bias-free Linear ("qkv") produces the query, key and value heads,
+    // an optional RMSNorm over head_dim is applied to Q and K, and the result is
+    // handed to Rope::attention before the bias-free output projection ("out").
+    struct JointAttention : public GGMLBlock {
+    protected:
+        int64_t head_dim;
+        int64_t num_heads;
+        int64_t num_kv_heads;
+        bool qk_norm;
+
+    public:
+        JointAttention(int64_t hidden_size, int64_t head_dim, int64_t num_heads, int64_t num_kv_heads, bool qk_norm)
+            : head_dim(head_dim), num_heads(num_heads), num_kv_heads(num_kv_heads), qk_norm(qk_norm) {
+            // Q contributes num_heads heads; K and V contribute num_kv_heads heads each.
+            const int64_t qkv_out_features = (num_heads + num_kv_heads * 2) * head_dim;
+            blocks["qkv"]                  = std::make_shared<Linear>(hidden_size, qkv_out_features, false);
+
+#if GGML_USE_HIP
+            // Prevent NaN issues with certain ROCm setups
+            const float out_scale = 1.f / 16.f;
+#else
+            const float out_scale = 1.f;
+#endif
+            blocks["out"] = std::make_shared<Linear>(num_heads * head_dim, hidden_size, false, false, false, out_scale);
+
+            // The norm blocks only exist when qk_norm is enabled.
+            if (qk_norm) {
+                blocks["q_norm"] = std::make_shared<RMSNorm>(head_dim);
+                blocks["k_norm"] = std::make_shared<RMSNorm>(head_dim);
+            }
+        }
+
+        // x:    [N, n_token, hidden_size]
+        // pe:   positional-embedding tensor forwarded unchanged to Rope::attention
+        // mask: optional attention mask, forwarded unchanged
+        // return: [N, n_token, hidden_size]
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                    struct ggml_tensor* x,
+                                    struct ggml_tensor* pe,
+                                    struct ggml_tensor* mask = nullptr) {
+            auto gctx       = ctx->ggml_ctx;
+            auto qkv_linear = std::dynamic_pointer_cast<Linear>(blocks["qkv"]);
+            auto out_linear = std::dynamic_pointer_cast<Linear>(blocks["out"]);
+
+            const int64_t fused_heads = num_heads + num_kv_heads * 2;
+
+            auto fused = qkv_linear->forward(ctx, x);                                                   // [N, n_token, fused_heads*head_dim]
+            fused      = ggml_reshape_4d(gctx, fused, head_dim, fused_heads, fused->ne[1], fused->ne[2]);  // [N, n_token, fused_heads, head_dim]
+            fused      = ggml_cont(gctx, ggml_ext_torch_permute(gctx, fused, 0, 2, 3, 1));                 // [fused_heads, N, n_token, head_dim]
+
+            // View of `head_count` consecutive heads starting at `first_head` along the outermost axis.
+            auto head_range = [&](int64_t first_head, int64_t head_count) {
+                return ggml_view_4d(gctx, fused,
+                                    fused->ne[0], fused->ne[1], fused->ne[2], head_count,
+                                    fused->nb[1], fused->nb[2], fused->nb[3],
+                                    fused->nb[3] * first_head);
+            };
+            // [heads, N, n_token, head_dim] -> contiguous [N, n_token, heads, head_dim]
+            auto to_token_major = [&](struct ggml_tensor* t) {
+                return ggml_cont(gctx, ggml_ext_torch_permute(gctx, t, 0, 3, 1, 2));
+            };
+
+            auto q = head_range(0, num_heads);                            // [num_heads, N, n_token, head_dim]
+            auto k = head_range(num_heads, num_kv_heads);                 // [num_kv_heads, N, n_token, head_dim]
+            auto v = head_range(num_heads + num_kv_heads, num_kv_heads);  // [num_kv_heads, N, n_token, head_dim]
+
+            q = to_token_major(q);  // [N, n_token, num_heads, head_dim]
+            k = to_token_major(k);  // [N, n_token, num_kv_heads, head_dim]
+            v = to_token_major(v);  // [N, n_token, num_kv_heads, head_dim]
+
+            if (qk_norm) {
+                auto q_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["q_norm"]);
+                auto k_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["k_norm"]);
+
+                q = q_norm->forward(ctx, q);
+                k = k_norm->forward(ctx, k);
+            }
+
+            auto attn_out = Rope::attention(ctx, q, k, v, pe, mask, 1.f / 128.f);  // [N, n_token, num_heads * head_dim]
+
+            return out_linear->forward(ctx, attn_out);  // [N, n_token, hidden_size]
+        }
+    };
+
+    // Gated feed-forward block: w2(silu(w1(x)) * w3(x)), all three projections bias-free.
+    class FeedForward : public GGMLBlock {
+    public:
+        // dim:                input/output width
+        // hidden_dim:         base inner width; multiplied by ffn_dim_multiplier when that is > 0
+        // multiple_of:        the inner width is rounded up to a multiple of this value
+        // ffn_dim_multiplier: optional scaling factor for hidden_dim (<= 0 disables it)
+        FeedForward(int64_t dim,
+                    int64_t hidden_dim,
+                    int64_t multiple_of,
+                    float ffn_dim_multiplier = 0.f) {
+            if (ffn_dim_multiplier > 0.f) {
+                hidden_dim = static_cast<int64_t>(ffn_dim_multiplier * hidden_dim);
+            }
+            // Round up to the next multiple of `multiple_of`.
+            const int64_t n_multiples = (hidden_dim + multiple_of - 1) / multiple_of;
+            hidden_dim                = multiple_of * n_multiples;
+
+#ifdef SD_USE_VULKAN
+            const bool w2_force_prec_f32 = true;
+#else
+            const bool w2_force_prec_f32 = false;
+#endif
+            // The purpose of the scale here is to prevent NaN issues in certain situations.
+            // For example, when using CUDA but the weights are k-quants.
+            const float w2_scale = 1.f / 128.f;
+
+            blocks["w1"] = std::make_shared<Linear>(dim, hidden_dim, false);
+            blocks["w2"] = std::make_shared<Linear>(hidden_dim, dim, false, false, w2_force_prec_f32, w2_scale);
+            blocks["w3"] = std::make_shared<Linear>(dim, hidden_dim, false);
+        }
+
+        // Applies the gated feed-forward to x and returns a tensor of the same width as x.
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+            auto gate_proj = std::dynamic_pointer_cast<Linear>(blocks["w1"]);
+            auto down_proj = std::dynamic_pointer_cast<Linear>(blocks["w2"]);
+            auto up_proj   = std::dynamic_pointer_cast<Linear>(blocks["w3"]);
+
+            auto gate_in = gate_proj->forward(ctx, x);
+            auto up      = up_proj->forward(ctx, x);
+            auto gated   = ggml_mul(ctx->ggml_ctx, ggml_silu(ctx->ggml_ctx, gate_in), up);
+
+            return down_proj->forward(ctx, gated);
+        }
+    };
+
+    // Scale-only modulation: returns x + x * scale, with scale broadcast over the token axis.
+    //   x:     [N, L, C]
+    //   scale: [N, C]
+    __STATIC_INLINE__ struct ggml_tensor* modulate(struct ggml_context* ctx,
+                                                   struct ggml_tensor* x,
+                                                   struct ggml_tensor* scale) {
+        // Insert a singleton token axis so the scale lines up with x: [N, C] -> [N, 1, C]
+        auto scale_3d = ggml_reshape_3d(ctx, scale, scale->ne[0], 1, scale->ne[1]);
+        auto scaled_x = ggml_mul(ctx, x, scale_3d);
+        return ggml_add(ctx, x, scaled_x);
+    }
+
+    // Transformer block made of an attention sub-layer and a feed-forward sub-layer,
+    // each wrapped in a pre-norm and a post-norm and added back to its input.
+    //
+    // With `modulation` enabled, an extra Linear ("adaLN_modulation.0") turns the
+    // conditioning input into four chunks (scale_msa, gate_msa, scale_mlp, gate_mlp)
+    // that scale the normed inputs and gate (through tanh) the sub-layer outputs.
+    struct JointTransformerBlock : public GGMLBlock {
+    protected:
+        bool modulation;
+
+    public:
+        // layer_id is accepted for interface parity but is not used by this block.
+        JointTransformerBlock(int layer_id,
+                              int64_t hidden_size,
+                              int64_t head_dim,
+                              int64_t num_heads,
+                              int64_t num_kv_heads,
+                              int64_t multiple_of,
+                              float ffn_dim_multiplier,
+                              float norm_eps,
+                              bool qk_norm,
+                              bool modulation = true)
+            : modulation(modulation) {
+            blocks["attention"]       = std::make_shared<JointAttention>(hidden_size, head_dim, num_heads, num_kv_heads, qk_norm);
+            blocks["feed_forward"]    = std::make_shared<FeedForward>(hidden_size, hidden_size, multiple_of, ffn_dim_multiplier);
+            blocks["attention_norm1"] = std::make_shared<RMSNorm>(hidden_size, norm_eps);
+            blocks["ffn_norm1"]       = std::make_shared<RMSNorm>(hidden_size, norm_eps);
+            blocks["attention_norm2"] = std::make_shared<RMSNorm>(hidden_size, norm_eps);
+            blocks["ffn_norm2"]       = std::make_shared<RMSNorm>(hidden_size, norm_eps);
+            if (modulation) {
+                const int64_t adaln_in_features = MIN(hidden_size, ADALN_EMBED_DIM);
+                blocks["adaLN_modulation.0"]    = std::make_shared<Linear>(adaln_in_features, 4 * hidden_size);
+            }
+        }
+
+        // x:           token sequence to transform
+        // pe, mask:    forwarded unchanged to the attention block
+        // adaln_input: conditioning input; must be non-null iff the block was built
+        //              with modulation enabled (enforced by GGML_ASSERT)
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                    struct ggml_tensor* x,
+                                    struct ggml_tensor* pe,
+                                    struct ggml_tensor* mask        = nullptr,
+                                    struct ggml_tensor* adaln_input = nullptr) {
+            auto gctx           = ctx->ggml_ctx;
+            auto attn           = std::dynamic_pointer_cast<JointAttention>(blocks["attention"]);
+            auto ffn            = std::dynamic_pointer_cast<FeedForward>(blocks["feed_forward"]);
+            auto pre_attn_norm  = std::dynamic_pointer_cast<RMSNorm>(blocks["attention_norm1"]);
+            auto pre_ffn_norm   = std::dynamic_pointer_cast<RMSNorm>(blocks["ffn_norm1"]);
+            auto post_attn_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["attention_norm2"]);
+            auto post_ffn_norm  = std::dynamic_pointer_cast<RMSNorm>(blocks["ffn_norm2"]);
+
+            struct ggml_tensor* scale_msa = nullptr;
+            struct ggml_tensor* gate_msa  = nullptr;
+            struct ggml_tensor* scale_mlp = nullptr;
+            struct ggml_tensor* gate_mlp  = nullptr;
+
+            if (modulation) {
+                GGML_ASSERT(adaln_input != nullptr);
+                // Only look this block up when it exists (it is registered only with modulation).
+                auto adaln_linear = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.0"]);
+
+                auto adaln_out = adaln_linear->forward(ctx, adaln_input);  // [N, 4 * hidden_size]
+                auto chunks    = ggml_ext_chunk(gctx, adaln_out, 4, 0);
+                scale_msa      = chunks[0];
+                gate_msa       = chunks[1];
+                scale_mlp      = chunks[2];
+                gate_mlp       = chunks[3];
+            } else {
+                GGML_ASSERT(adaln_input == nullptr);
+            }
+
+            // Attention sub-layer.
+            auto h = pre_attn_norm->forward(ctx, x);
+            if (modulation) {
+                h = modulate(gctx, h, scale_msa);
+            }
+            h = attn->forward(ctx, h, pe, mask);
+            h = post_attn_norm->forward(ctx, h);
+            if (modulation) {
+                h = ggml_mul(gctx, h, ggml_tanh(gctx, gate_msa));
+            }
+            x = ggml_add(gctx, h, x);
+
+            // Feed-forward sub-layer.
+            h = pre_ffn_norm->forward(ctx, x);
+            if (modulation) {
+                h = modulate(gctx, h, scale_mlp);
+            }
+            h = ffn->forward(ctx, h);
+            h = post_ffn_norm->forward(ctx, h);
+            if (modulation) {
+                h = ggml_mul(gctx, h, ggml_tanh(gctx, gate_mlp));
+            }
+            x = ggml_add(gctx, h, x);
+
+            return x;
+        }
+    };
+
+    // Output head: LayerNorm, scale-only modulation derived from the conditioning
+    // vector, then a Linear projection to patch_size * patch_size * out_channels.
+    struct FinalLayer : public GGMLBlock {
+    public:
+        FinalLayer(int64_t hidden_size,
+                   int64_t patch_size,
+                   int64_t out_channels) {
+            const int64_t patch_features = patch_size * patch_size * out_channels;
+            const int64_t adaln_features = MIN(hidden_size, ADALN_EMBED_DIM);
+
+            blocks["norm_final"]         = std::make_shared<LayerNorm>(hidden_size, 1e-06f, false);
+            blocks["linear"]             = std::make_shared<Linear>(hidden_size, patch_features, true, true);
+            blocks["adaLN_modulation.1"] = std::make_shared<Linear>(adaln_features, hidden_size);
+        }
+
+        // x: [N, n_token, hidden_size]
+        // c: [N, hidden_size]
+        // return: [N, n_token, patch_size * patch_size * out_channels]
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                    struct ggml_tensor* x,
+                                    struct ggml_tensor* c) {
+            auto final_norm   = std::dynamic_pointer_cast<LayerNorm>(blocks["norm_final"]);
+            auto out_linear   = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
+            auto adaln_linear = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]);
+
+            // The modulation scale is computed from silu(c).
+            auto scale     = adaln_linear->forward(ctx, ggml_silu(ctx->ggml_ctx, c));  // [N, hidden_size]
+            auto normed    = final_norm->forward(ctx, x);
+            auto modulated = modulate(ctx->ggml_ctx, normed, scale);
+
+            return out_linear->forward(ctx, modulated);
+        }
+    };
+
+    // Hyper-parameters consumed by ZImageModel and ZImageRunner.
+    struct ZImageParams {
+        // Side length of the square patches used by patchify/unpatchify.
+        int64_t patch_size         = 2;
+        // Width of the transformer token embeddings.
+        int64_t hidden_size        = 3840;
+        // Latent channels in / out: x_embedder consumes patch_size*patch_size*in_channels
+        // features, the final layer emits patch_size*patch_size*out_channels features.
+        int64_t in_channels        = 16;
+        int64_t out_channels       = 16;
+        // Number of "layers.N" blocks applied to the joint text+image sequence.
+        int64_t num_layers         = 30;
+        // Number of blocks in each of the "noise_refiner" and "context_refiner" stacks.
+        int64_t num_refiner_layers = 2;
+        // Attention geometry, forwarded to JointAttention.
+        int64_t head_dim           = 128;
+        int64_t num_heads          = 30;
+        int64_t num_kv_heads       = 30;
+        // FeedForward inner width is hidden_size * ffn_dim_multiplier, rounded up to a
+        // multiple of multiple_of.
+        int64_t multiple_of        = 256;
+        float ffn_dim_multiplier   = 8.0 / 3.0f;
+        // Epsilon passed to the RMSNorm blocks.
+        float norm_eps             = 1e-5f;
+        // Whether JointAttention applies RMSNorm to Q and K.
+        bool qk_norm               = true;
+        // Feature width of the text `context` fed to cap_embedder.
+        int64_t cap_feat_dim       = 2560;
+        // theta and axes_dim are passed to Rope::gen_z_image_pe when the graph is built.
+        float theta                = 256.f;
+        std::vector<int> axes_dim  = {32, 48, 48};
+        // Sum of axes_dim (32 + 48 + 48); used to size the positional-embedding tensor.
+        int64_t axes_dim_sum       = 128;
+    };
+
+    // Z-Image diffusion transformer.
+    //
+    // The image latent is padded to a multiple of patch_size, patchified and embedded;
+    // the text context is normed and embedded. Both sequences are padded with pad-token
+    // parameters up to a multiple of SEQ_MULTI_OF, refined separately ("context_refiner"
+    // without modulation, "noise_refiner" with modulation), concatenated as [text, image],
+    // run through the main "layers" stack and the final layer, and the image part is
+    // unpatchified back to the latent layout.
+    class ZImageModel : public GGMLBlock {
+    protected:
+        ZImageParams z_image_params;
+
+        // Allocates the two pad-token parameter tensors, each of length hidden_size.
+        void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+            params["cap_pad_token"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, z_image_params.hidden_size);
+            params["x_pad_token"]   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, z_image_params.hidden_size);
+        }
+
+    public:
+        ZImageModel() = default;
+
+        // Registers every sub-block under the names used by forward_core.
+        ZImageModel(ZImageParams z_image_params)
+            : z_image_params(z_image_params) {
+            blocks["x_embedder"]     = std::make_shared<Linear>(z_image_params.patch_size * z_image_params.patch_size * z_image_params.in_channels, z_image_params.hidden_size);
+            blocks["t_embedder"]     = std::make_shared<TimestepEmbedder>(MIN(z_image_params.hidden_size, 1024), 256, 256);
+            blocks["cap_embedder.0"] = std::make_shared<RMSNorm>(z_image_params.cap_feat_dim, z_image_params.norm_eps);
+            blocks["cap_embedder.1"] = std::make_shared<Linear>(z_image_params.cap_feat_dim, z_image_params.hidden_size);
+
+            // All three stacks use identically shaped blocks; only the modulation flag differs.
+            auto make_block = [&](int layer_id, bool with_modulation) {
+                return std::make_shared<JointTransformerBlock>(layer_id,
+                                                               z_image_params.hidden_size,
+                                                               z_image_params.head_dim,
+                                                               z_image_params.num_heads,
+                                                               z_image_params.num_kv_heads,
+                                                               z_image_params.multiple_of,
+                                                               z_image_params.ffn_dim_multiplier,
+                                                               z_image_params.norm_eps,
+                                                               z_image_params.qk_norm,
+                                                               with_modulation);
+            };
+
+            for (int i = 0; i < z_image_params.num_refiner_layers; i++) {
+                blocks["noise_refiner." + std::to_string(i)] = make_block(i, true);
+            }
+
+            // The context refiner receives no conditioning input, hence no modulation.
+            for (int i = 0; i < z_image_params.num_refiner_layers; i++) {
+                blocks["context_refiner." + std::to_string(i)] = make_block(i, false);
+            }
+
+            for (int i = 0; i < z_image_params.num_layers; i++) {
+                blocks["layers." + std::to_string(i)] = make_block(i, true);
+            }
+
+            blocks["final_layer"] = std::make_shared<FinalLayer>(z_image_params.hidden_size, z_image_params.patch_size, z_image_params.out_channels);
+        }
+
+        // Pads the two innermost axes (W, then H) of x up to the next multiple of patch_size.
+        struct ggml_tensor* pad_to_patch_size(struct ggml_context* ctx,
+                                              struct ggml_tensor* x) {
+            int64_t W = x->ne[0];
+            int64_t H = x->ne[1];
+
+            int pad_h = (z_image_params.patch_size - H % z_image_params.patch_size) % z_image_params.patch_size;
+            int pad_w = (z_image_params.patch_size - W % z_image_params.patch_size) % z_image_params.patch_size;
+            x         = ggml_pad(ctx, x, pad_w, pad_h, 0, 0);  // [N, C, H + pad_h, W + pad_w]
+            return x;
+        }
+
+        // Splits the image into patch_size x patch_size patches, one token per patch.
+        // H and W must already be multiples of patch_size (asserted).
+        struct ggml_tensor* patchify(struct ggml_context* ctx,
+                                     struct ggml_tensor* x) {
+            // x: [N, C, H, W]
+            // return: [N, h*w, patch_size*patch_size*C]
+            int64_t N = x->ne[3];
+            int64_t C = x->ne[2];
+            int64_t H = x->ne[1];
+            int64_t W = x->ne[0];
+            int64_t p = z_image_params.patch_size;
+            int64_t h = H / z_image_params.patch_size;
+            int64_t w = W / z_image_params.patch_size;
+
+            GGML_ASSERT(h * p == H && w * p == W);
+
+            x = ggml_reshape_4d(ctx, x, p, w, p, h * C * N);                 // [N*C*h, p, w, p]
+            x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));            // [N*C*h, w, p, p]
+            x = ggml_reshape_4d(ctx, x, p * p, w * h, C, N);                 // [N, C, h*w, p*p]
+            x = ggml_cont(ctx, ggml_ext_torch_permute(ctx, x, 2, 0, 1, 3));  // [N, h*w, C, p*p]
+            x = ggml_reshape_3d(ctx, x, C * p * p, w * h, N);                // [N, h*w, p*p*C]
+            return x;
+        }
+
+        // Pads x to a multiple of patch_size and converts it to a token sequence.
+        struct ggml_tensor* process_img(struct ggml_context* ctx,
+                                        struct ggml_tensor* x) {
+            x = pad_to_patch_size(ctx, x);
+            x = patchify(ctx, x);
+            return x;
+        }
+
+        // Inverse of patchify: rebuilds the image layout from h*w patch tokens.
+        struct ggml_tensor* unpatchify(struct ggml_context* ctx,
+                                       struct ggml_tensor* x,
+                                       int64_t h,
+                                       int64_t w) {
+            // x: [N, h*w, patch_size*patch_size*C]
+            // return: [N, C, H, W]
+            int64_t N = x->ne[2];
+            int64_t C = x->ne[0] / z_image_params.patch_size / z_image_params.patch_size;
+            int64_t H = h * z_image_params.patch_size;
+            int64_t W = w * z_image_params.patch_size;
+            int64_t p = z_image_params.patch_size;
+
+            GGML_ASSERT(C * p * p == x->ne[0]);
+
+            x = ggml_reshape_4d(ctx, x, C, p * p, w * h, N);                 // [N, h*w, p*p, C]
+            x = ggml_cont(ctx, ggml_ext_torch_permute(ctx, x, 1, 2, 0, 3));  // [N, C, h*w, p*p]
+            x = ggml_reshape_4d(ctx, x, p, p, w, h * C * N);                 // [N*C*h, w, p, p]
+            x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));            // [N*C*h, p, w, p]
+            x = ggml_reshape_4d(ctx, x, W, H, C, N);                         // [N, C, h*p, w*p]
+
+            return x;
+        }
+
+        // Runs the transformer on already-patchified image tokens.
+        //   x:        [N, n_img_token, patch_size*patch_size*C]
+        //   timestep: input to t_embedder; its output conditions every modulated block
+        //   context:  [N, n_txt_token, cap_feat_dim]
+        //   pe:       positional embeddings; pe->ne[3] must equal the padded text length
+        //             plus the padded image length (asserted)
+        //   return:   [N, n_img_token, ph*pw*C] (text tokens and image padding removed)
+        struct ggml_tensor* forward_core(GGMLRunnerContext* ctx,
+                                         struct ggml_tensor* x,
+                                         struct ggml_tensor* timestep,
+                                         struct ggml_tensor* context,
+                                         struct ggml_tensor* pe) {
+            auto x_embedder     = std::dynamic_pointer_cast<Linear>(blocks["x_embedder"]);
+            auto t_embedder     = std::dynamic_pointer_cast<TimestepEmbedder>(blocks["t_embedder"]);
+            auto cap_embedder_0 = std::dynamic_pointer_cast<RMSNorm>(blocks["cap_embedder.0"]);
+            auto cap_embedder_1 = std::dynamic_pointer_cast<Linear>(blocks["cap_embedder.1"]);
+            auto final_layer    = std::dynamic_pointer_cast<FinalLayer>(blocks["final_layer"]);
+            // NOTE: there is deliberately no lookup of blocks["norm_final"] here. This model
+            // never registers such a block (the final norm lives inside FinalLayer), the
+            // result was unused, and, assuming `blocks` is a standard associative container,
+            // operator[] would insert a null entry on every graph build.
+
+            auto txt_pad_token = params["cap_pad_token"];
+            auto img_pad_token = params["x_pad_token"];
+
+            int64_t N           = x->ne[2];
+            int64_t n_img_token = x->ne[1];
+            int64_t n_txt_token = context->ne[1];
+
+            auto t_emb = t_embedder->forward(ctx, timestep);
+
+            auto txt = cap_embedder_1->forward(ctx, cap_embedder_0->forward(ctx, context));  // [N, n_txt_token, hidden_size]
+            auto img = x_embedder->forward(ctx, x);                                          // [N, n_img_token, hidden_size]
+
+            // Pad the text sequence with the text pad token.
+            int64_t n_txt_pad_token = Rope::bound_mod(n_txt_token, SEQ_MULTI_OF);
+            if (n_txt_pad_token > 0) {
+                auto txt_pad_tokens = ggml_repeat_4d(ctx->ggml_ctx, txt_pad_token, txt_pad_token->ne[0], n_txt_pad_token, N, 1);
+                txt                 = ggml_concat(ctx->ggml_ctx, txt, txt_pad_tokens, 1);  // [N, n_txt_token + n_txt_pad_token, hidden_size]
+            }
+
+            // Pad the image sequence with the image pad token.
+            int64_t n_img_pad_token = Rope::bound_mod(n_img_token, SEQ_MULTI_OF);
+            if (n_img_pad_token > 0) {
+                auto img_pad_tokens = ggml_repeat_4d(ctx->ggml_ctx, img_pad_token, img_pad_token->ne[0], n_img_pad_token, N, 1);
+                img                 = ggml_concat(ctx->ggml_ctx, img, img_pad_tokens, 1);  // [N, n_img_token + n_img_pad_token, hidden_size]
+            }
+
+            GGML_ASSERT(txt->ne[1] + img->ne[1] == pe->ne[3]);
+
+            // Positions are laid out as [text..., image...] along pe's outermost axis.
+            auto txt_pe = ggml_ext_slice(ctx->ggml_ctx, pe, 3, 0, txt->ne[1]);
+            auto img_pe = ggml_ext_slice(ctx->ggml_ctx, pe, 3, txt->ne[1], pe->ne[3]);
+
+            for (int i = 0; i < z_image_params.num_refiner_layers; i++) {
+                auto block = std::dynamic_pointer_cast<JointTransformerBlock>(blocks["context_refiner." + std::to_string(i)]);
+
+                txt = block->forward(ctx, txt, txt_pe, nullptr, nullptr);
+            }
+
+            for (int i = 0; i < z_image_params.num_refiner_layers; i++) {
+                auto block = std::dynamic_pointer_cast<JointTransformerBlock>(blocks["noise_refiner." + std::to_string(i)]);
+
+                img = block->forward(ctx, img, img_pe, nullptr, t_emb);
+            }
+
+            auto txt_img = ggml_concat(ctx->ggml_ctx, txt, img, 1);  // [N, n_txt_token + n_txt_pad_token + n_img_token + n_img_pad_token, hidden_size]
+
+            for (int i = 0; i < z_image_params.num_layers; i++) {
+                auto block = std::dynamic_pointer_cast<JointTransformerBlock>(blocks["layers." + std::to_string(i)]);
+
+                txt_img = block->forward(ctx, txt_img, pe, nullptr, t_emb);
+            }
+
+            txt_img = final_layer->forward(ctx, txt_img, t_emb);  // [N, n_txt_token + n_txt_pad_token + n_img_token + n_img_pad_token, ph*pw*C]
+
+            // Keep only the real image tokens: skip the padded text, stop before the image padding.
+            img = ggml_ext_slice(ctx->ggml_ctx, txt_img, 1, n_txt_token + n_txt_pad_token, n_txt_token + n_txt_pad_token + n_img_token);  // [N, n_img_token, ph*pw*C]
+
+            return img;
+        }
+
+        // Forward pass of DiT.
+        //   x:           [N, C, H, W]
+        //   timestep:    [N,]
+        //   context:     [N, L, D]
+        //   pe:          [L, d_head/2, 2, 2]
+        //   ref_latents: optional reference latents; each is patchified and appended to
+        //                the image token sequence, and dropped again from the output
+        //   return:      [N, C, H, W]; the model output is negated before being returned
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                    struct ggml_tensor* x,
+                                    struct ggml_tensor* timestep,
+                                    struct ggml_tensor* context,
+                                    struct ggml_tensor* pe,
+                                    std::vector<ggml_tensor*> ref_latents = {}) {
+            int64_t W = x->ne[0];
+            int64_t H = x->ne[1];
+
+            auto img            = process_img(ctx->ggml_ctx, x);
+            int64_t n_img_token = img->ne[1];  // tokens that belong to x itself (excluding refs)
+
+            for (ggml_tensor* ref : ref_latents) {
+                ref = process_img(ctx->ggml_ctx, ref);
+                img = ggml_concat(ctx->ggml_ctx, img, ref, 1);
+            }
+
+            // Patch-grid size of the padded image: ceil(H / p) x ceil(W / p). This has to
+            // agree with pad_to_patch_size, which rounds up; the previous (H + p/2) / p
+            // form rounds to nearest and only coincides with it for patch_size == 2.
+            int64_t h_len = (H + z_image_params.patch_size - 1) / z_image_params.patch_size;
+            int64_t w_len = (W + z_image_params.patch_size - 1) / z_image_params.patch_size;
+
+            auto out = forward_core(ctx, img, timestep, context, pe);
+
+            out = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, n_img_token);  // [N, n_img_token, ph*pw*C]
+            out = unpatchify(ctx->ggml_ctx, out, h_len, w_len);           // [N, C, H + pad_h, W + pad_w]
+
+            // Remove the padding added by pad_to_patch_size.
+            out = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, H);  // [N, C, H, W + pad_w]
+            out = ggml_ext_slice(ctx->ggml_ctx, out, 0, 0, W);  // [N, C, H, W]
+
+            out = ggml_scale(ctx->ggml_ctx, out, -1.f);
+
+            return out;
+        }
+    };
+
+    struct ZImageRunner : public GGMLRunner {
+    public:
+        ZImageParams z_image_params;
+        ZImageModel z_image;
+        std::vector<float> pe_vec;
+        std::vector<float> timestep_vec;
+        SDVersion version;
+
+        // Builds the Z-Image model with the default ZImageParams and initialises its
+        // parameter tensors in params_ctx.
+        //   backend / offload_params_to_cpu: forwarded to GGMLRunner
+        //   tensor_storage_map, prefix:      forwarded to ZImageModel::init
+        //   version:                         stored in the `version` member (it was
+        //                                    previously accepted but never stored,
+        //                                    leaving the member uninitialized)
+        ZImageRunner(ggml_backend_t backend,
+                     bool offload_params_to_cpu,
+                     const String2TensorStorage& tensor_storage_map = {},
+                     const std::string prefix                       = "",
+                     SDVersion version                              = VERSION_Z_IMAGE)
+            : GGMLRunner(backend, offload_params_to_cpu), version(version) {
+            z_image = ZImageModel(z_image_params);
+            z_image.init(params_ctx, tensor_storage_map, prefix);
+        }
+
+        // Returns the fixed description string for this runner.
+        std::string get_desc() override {
+            return "z_image";
+        }
+
+        // Collects the model's parameter tensors into `tensors` by delegating to
+        // ZImageModel::get_param_tensors with the given name prefix.
+        void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
+            z_image.get_param_tensors(tensors, prefix);
+        }
+
+        // Builds the compute graph for one forward pass of the model.
+        //   x:                  latent input; batch size (ne[3]) must be 1 (asserted)
+        //   timesteps, context: forwarded to ZImageModel::forward
+        //   ref_latents:        optional reference latents, forwarded to the model and to
+        //                       the positional-embedding generator
+        //   increase_ref_index: forwarded to Rope::gen_z_image_pe
+        // The generated positional embeddings are kept in the pe_vec member, whose data
+        // pointer is handed to set_backend_tensor_data.
+        struct ggml_cgraph* build_graph(struct ggml_tensor* x,
+                                        struct ggml_tensor* timesteps,
+                                        struct ggml_tensor* context,
+                                        std::vector<ggml_tensor*> ref_latents = {},
+                                        bool increase_ref_index               = false) {
+            GGML_ASSERT(x->ne[3] == 1);
+            struct ggml_cgraph* graph = new_graph_custom(Z_IMAGE_GRAPH_SIZE);
+
+            x         = to_backend(x);
+            context   = to_backend(context);
+            timesteps = to_backend(timesteps);
+
+            for (auto& ref_latent : ref_latents) {
+                ref_latent = to_backend(ref_latent);
+            }
+
+            const auto latent_h   = x->ne[1];
+            const auto latent_w   = x->ne[0];
+            const auto batch_size = x->ne[3];
+            const auto n_txt      = context->ne[1];
+
+            pe_vec = Rope::gen_z_image_pe(latent_h,
+                                          latent_w,
+                                          z_image_params.patch_size,
+                                          batch_size,
+                                          n_txt,
+                                          SEQ_MULTI_OF,
+                                          ref_latents,
+                                          increase_ref_index,
+                                          z_image_params.theta,
+                                          z_image_params.axes_dim);
+
+            // Number of positions = total floats / axes_dim_sum / 2.
+            int pos_len = pe_vec.size() / z_image_params.axes_dim_sum / 2;
+            auto pe     = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, z_image_params.axes_dim_sum / 2, pos_len);
+            set_backend_tensor_data(pe, pe_vec.data());
+
+            auto runner_ctx = get_context();
+
+            struct ggml_tensor* model_out = z_image.forward(&runner_ctx,
+                                                            x,
+                                                            timesteps,
+                                                            context,
+                                                            pe,
+                                                            ref_latents);
+            ggml_build_forward_expand(graph, model_out);
+
+            return graph;
+        }
+
+        // Runs one forward pass by handing a graph-building callback to
+        // GGMLRunner::compute().
+        //
+        // Params:
+        //   n_threads:          forwarded to GGMLRunner::compute().
+        //   x:                  [N, in_channels, h, w]
+        //   timesteps:          [N, ]
+        //   context:            [N, max_position, hidden_size]
+        //   ref_latents:        optional reference latents, forwarded to build_graph().
+        //   increase_ref_index: forwarded to build_graph().
+        //   output, output_ctx: forwarded to GGMLRunner::compute().
+        //
+        // Returns whatever GGMLRunner::compute() returns.
+        bool compute(int n_threads,
+                     struct ggml_tensor* x,
+                     struct ggml_tensor* timesteps,
+                     struct ggml_tensor* context,
+                     std::vector<ggml_tensor*> ref_latents = {},
+                     bool increase_ref_index               = false,
+                     struct ggml_tensor** output           = nullptr,
+                     struct ggml_context* output_ctx       = nullptr) {
+            // The callback captures the arguments by reference; it is only used
+            // within the GGMLRunner::compute() call below.
+            auto graph_builder = [&]() -> struct ggml_cgraph* {
+                struct ggml_cgraph* graph = build_graph(x, timesteps, context, ref_latents, increase_ref_index);
+                return graph;
+            };
+
+            const bool result = GGMLRunner::compute(graph_builder, n_threads, false, output, output_ctx);
+            return result;
+        }
+
+        // Ad-hoc smoke test: loads `x` and `context` from ./z_image_x.bin and
+        // ./z_image_context.bin, runs a single compute() with timestep 0 on 8
+        // threads, then prints the output tensor and the elapsed time.
+        //
+        // All tensors, including the output, live in a temporary 1GB ggml context
+        // that is released before returning.
+        void test() {
+            struct ggml_init_params params;
+            params.mem_size   = static_cast<size_t>(1024 * 1024) * 1024;  // 1GB
+            params.mem_buffer = nullptr;
+            params.no_alloc   = false;
+
+            struct ggml_context* work_ctx = ggml_init(params);
+            GGML_ASSERT(work_ctx != nullptr);
+
+            {
+                // Synthetic alternative to the file-based input:
+                // auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 16, 16, 16, 1);
+                // ggml_set_f32(x, 0.01f);
+                auto x = load_tensor_from_file(work_ctx, "./z_image_x.bin");
+                print_ggml_tensor(x);
+
+                std::vector<float> timesteps_vec(1, 0.f);
+                auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
+
+                // Synthetic alternative to the file-based input:
+                // auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 2560, 256, 1);
+                // ggml_set_f32(context, 0.01f);
+                auto context = load_tensor_from_file(work_ctx, "./z_image_context.bin");
+                print_ggml_tensor(context);
+
+                struct ggml_tensor* out = nullptr;
+
+                // Keep the timestamps in their native type; only the (small)
+                // difference is narrowed for the %d format below.
+                const auto t0 = ggml_time_ms();
+                const bool ok = compute(8, x, timesteps, context, {}, false, &out, work_ctx);
+                const auto t1 = ggml_time_ms();
+
+                // compute() reports failure through its return value; in that case
+                // `out` may still be nullptr and must not be printed.
+                if (ok && out != nullptr) {
+                    print_ggml_tensor(out);
+                    LOG_DEBUG("z_image test done in %dms", static_cast<int>(t1 - t0));
+                } else {
+                    LOG_ERROR("z_image test failed: compute did not produce an output");
+                }
+            }
+
+            // Release the scratch context (and every tensor allocated in it).
+            ggml_free(work_ctx);
+        }
+
+        // Loads a Z-Image diffusion model from `file_path` on the CPU backend,
+        // overriding the expected type of every tensor whose name ends in "weight"
+        // with Q8_0, and then runs test() on it.
+        //
+        // Failures while opening the file or loading tensors are logged and the
+        // function returns early.
+        static void load_from_file_and_test(const std::string& file_path) {
+            // cuda q8: pass
+            // cuda q8 fa: pass
+            // ggml_backend_t backend = ggml_backend_cuda_init(0);
+            ggml_backend_t backend    = ggml_backend_cpu_init();
+            ggml_type model_data_type = GGML_TYPE_Q8_0;
+
+            ModelLoader model_loader;
+            if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) {
+                LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
+                return;
+            }
+
+            auto& tensor_storage_map = model_loader.get_tensor_storage_map();
+            // GGML_TYPE_COUNT acts as the "no override" value for model_data_type.
+            if (model_data_type != GGML_TYPE_COUNT) {
+                for (auto& [name, tensor_storage] : tensor_storage_map) {
+                    if (ends_with(name, "weight")) {
+                        tensor_storage.expected_type = model_data_type;
+                    }
+                }
+            }
+
+            // Use the Z-Image version tag (the constructor's own default) rather than
+            // the Qwen-Image one, so the test matches how this runner is meant to be
+            // constructed.
+            auto z_image = std::make_shared<ZImageRunner>(backend,
+                                                          false,
+                                                          tensor_storage_map,
+                                                          "model.diffusion_model",
+                                                          VERSION_Z_IMAGE);
+
+            z_image->alloc_params_buffer();
+            std::map<std::string, ggml_tensor*> tensors;
+            z_image->get_param_tensors(tensors, "model.diffusion_model");
+
+            bool success = model_loader.load_tensors(tensors);
+
+            if (!success) {
+                LOG_ERROR("load tensors from model loader failed");
+                return;
+            }
+
+            LOG_INFO("z_image model loaded");
+            z_image->test();
+        }
+    };
+
+}  // namespace ZImage
+
+#endif  // __Z_IMAGE_HPP__
Author	SHA1	Message	Date
leejet	11ab095230	fix: resolve embedding loading issue when calling generate_image multiple times (#1078 )	2025-12-12 23:08:12 +08:00
Wagner Bruna	a3a88fc9b2	fix: avoid crash loading LoRAs with bf16 weights (#1077 )	2025-12-12 22:36:54 +08:00
leejet	8823dc48bc	feat: align the spatial size to the corresponding multiple (#1073 )	2025-12-10 23:15:08 +08:00
Pedrito	1ac5a616de	feat: support custom upscale tile size (#896 )	2025-12-10 22:25:19 +08:00
leejet	d939f6e86a	refactor: optimize the handling of LoRA models (#1070 )	2025-12-10 00:26:07 +08:00
Wagner Bruna	e72aea796e	feat: embed version string and git commit hash (#1008 )	2025-12-09 22:38:54 +08:00
wuhei	a908436729	docs: update download link for Stable Diffusion v1.5 (#1063 )	2025-12-09 22:06:16 +08:00
stduhpf	583a02e29e	feat: add Flux.2 VAE proj matrix for previews (#1017 )	2025-12-09 22:00:45 +08:00
leejet	96c3e64057	refactor: optimize the handling of embedding (#1068 ) * optimize the handling of embedding * support case-insensitive embedding names	2025-12-08 23:59:04 +08:00
Weiqi Gao	0392273e10	chore: add compute kernels to Windows CUDA build (#1062 ) * Fix syntax for CUDA architecture definitions * Extend CUDA support to GTX 10 Series to RTX 50 Series * update cuda installer step version to install cuda 12.8.1 * Remove unsupported compute capability	2025-12-07 22:12:50 +08:00
leejet	bf1a388b44	docs: update logo	2025-12-07 15:09:32 +08:00
leejet	c9005337a8	docs: update logo	2025-12-07 14:56:21 +08:00
leejet	2f0bd31a84	feat: add ovis image support (#1057 )	2025-12-07 12:32:56 +08:00
leejet	bfbb929790	feat: do not convert bf16 to f32 (#1055 )	2025-12-06 23:55:51 +08:00
leejet	689e44c9a8	fix: correct ggml_ext_silu_act (#1056 )	2025-12-06 23:55:28 +08:00
leejet	985aedda32	refactor: optimize the handling of pred type (#1048 )	2025-12-04 23:31:55 +08:00
leejet	3f3610b5cd	chore: optimize lora log (#1047 )	2025-12-04 22:44:58 +08:00
Wagner Bruna	118683de8a	fix: correct preview method selection (#1038 )	2025-12-04 22:43:16 +08:00
stduhpf	bcc9c0d0b3	feat: handle ggml compute failures without crashing the program (#1003 ) * Feat: handle compute failures more gracefully * fix Unreachable code after return Co-authored-by: idostyle <idostyl3@googlemail.com> * adjust z_image.hpp --------- Co-authored-by: idostyle <idostyl3@googlemail.com> Co-authored-by: leejet <leejet714@gmail.com>	2025-12-04 22:04:27 +08:00
leejet	5865b5e703	refactor: split SDParams to SDCliParams/SDContextParams/SDGenerationParams (#1032 )	2025-12-03 22:31:46 +08:00
stduhpf	edf2cb3846	fix: fix CosXL not being detected (#989 )	2025-12-03 22:25:02 +08:00
Wagner Bruna	99e17232a4	fix: prevent NaN issues with Z-Image on certain ROCm setups (#1034 )	2025-12-03 22:19:34 +08:00
leejet	710169df5c	docs: update news	2025-12-01 22:46:15 +08:00
Wagner Bruna	e4c50f1de5	chore: add sd_ prefix to a few functions (#967 )	2025-12-01 22:43:52 +08:00
rmatif	0743a1b3b5	fix: fix vae tiling for flux2 (#1025 )	2025-12-01 22:41:56 +08:00
leejet	34a6fd4e60	feat: add z-image support (#1020 ) * add z-image support * use flux_latent_rgb_proj for z-image * fix qwen3 rope type * add support for qwen3 4b gguf * add support for diffusers format lora * fix nan issue that occurs when using CUDA with k-quants weights * add z-image docs	2025-12-01 22:39:43 +08:00
leejet	3c1187ce83	docs: correct the time of adding flux2 support	2025-11-30 12:40:56 +08:00
leejet	20eb674100	fix: avoid crash when the lora file is not found using immediately mode (#1022 )	2025-11-30 12:19:37 +08:00
leejet	bc80225336	fix: make the immediate LoRA apply mode work better when using Vulkan (#1021 )	2025-11-30 12:08:25 +08:00
leejet	ab7e8d285e	docs: update news	2025-11-30 11:51:23 +08:00
Wagner Bruna	673dbdda17	fix: add missing line cleanup for s/it progress display (#891 )	2025-11-30 11:45:30 +08:00
Wagner Bruna	0249509a30	refactor: add user data pointer to the image preview callback (#1001 )	2025-11-30 11:34:17 +08:00
leejet	52b67c538b	feat: add flux2 support (#1016 ) * add flux2 support * rename qwenvl to llm * add Flux2FlowDenoiser * update docs	2025-11-30 11:32:56 +08:00
leejet	20345888a3	refactor: optimize the handling of sample method (#999 )	2025-11-22 14:00:25 +08:00
akleine	490c51d963	feat: report success/failure when saving PNG/JPG output (#912 )	2025-11-22 13:57:44 +08:00
Wagner Bruna	45c46779af	feat: add LCM scheduler (#983 )	2025-11-22 13:53:31 +08:00
leejet	869d023416	refactor: optimize the handling of scheduler (#998 )	2025-11-22 12:48:53 +08:00
akleine	e9bc3b6c06	fix: check the PhotoMaker id_embeds tensor ONLY in PhotoMaker V2 mode (#987 )	2025-11-22 12:47:40 +08:00
Wagner Bruna	b542894fb9	fix: avoid crash on default video preview path (#997 ) Co-authored-by: masamaru-san	2025-11-22 12:46:27 +08:00
leejet	5498cc0d67	feat: add Wan2.1-I2V-1.3B(SkyReels) support (#988 )	2025-11-19 23:56:46 +08:00
stduhpf	aa2b8e0ca5	fix: patch 1x1 conv weights at runtime (#986 )	2025-11-19 23:27:23 +08:00
rmatif	a14e2b321d	feat: add easycache support (#940 )	2025-11-19 23:19:32 +08:00
leejet	28ffb6c13d	fix: resolve issue with concat multiple LoRA output diffs at runtime (#985 )	2025-11-17 22:56:07 +08:00
leejet	b88cc32346	fix: avoid using same type but diff instances for rng and sampler_rng (#982 )	2025-11-16 23:37:14 +08:00
leejet	f532972d60	fix: avoid precision issues on vulkan backend (#980 )	2025-11-16 20:57:08 +08:00
leejet	d5b05f70c6	feat: support independent sampler rng (#978 )	2025-11-16 17:11:02 +08:00
akleine	6d6dc1b8ed	fix: make PhotoMakerV2 more robust by image count check (#970 )	2025-11-16 17:10:48 +08:00
Wagner Bruna	199e675cc7	feat: support for --tensor-type-rules on generation modes (#932 )	2025-11-16 17:07:32 +08:00
leejet	742a7333c3	feat: add cpu rng (#977 )	2025-11-16 14:48:15 +08:00
Wagner Bruna	e8eb3791c8	fix: typo in --lora-apply-mode help (#972 )	2025-11-16 14:48:00 +08:00
Wagner Bruna	aa44e06890	fix: avoid crash with LoRAs and type override (#974 )	2025-11-16 14:47:36 +08:00
Daniele	6448430dbb	feat: add break pseudo token support (#422 ) --------- Co-authored-by: Urs Ganse <urs.ganse@helsinki.fi>	2025-11-16 14:45:20 +08:00
leejet	347710f68f	feat: support applying LoRA at runtime (#969 )	2025-11-13 21:48:44 +08:00
lcy	59ebdf0bb5	chrore: enable Windows ROCm(HIP) build release (#956 ) * build: fix missing commit sha in macOS and Ubuntu build zip name The build workflows for macOS and Ubuntu incorrectly check for the "main" branch instead of "master" when retrieving the commit hash for naming the build artifacts. * build: correct Vulkan SDK installation condition in build workflow * build: Enable Windows ROCm(HIP) build release Refer to the build workflow of llama.cpp to add a Windows ROCm (HIP) build release to the workflow. Since there are many differences between the HIP build and other builds, this commit add a separate "windows-latest-cmake-hip" job, instead of enabling the ROCm matrix entry in the existing Windows build job. Main differences include: - Install ROCm SDK from AMD official installer. - Add a cache step for ROCm installation and a ccache step for build processing, since the HIP build takes much longer time than other builds. - Include the ROCm/HIP artifact in the release assets.	2025-11-12 00:28:55 +08:00
Flavio Bizzarri	4ffcbcaed7	fix: specify enum modifier in sd_set_preview_callback signature (#959 )	2025-11-12 00:27:23 +08:00
leejet	694f0d9235	refactor: optimize the logic for name conversion and the processing of the LoRA model (#955 )	2025-11-10 00:12:20 +08:00
stduhpf	8ecdf053ac	feat: add image preview support (#522 )	2025-11-10 00:12:02 +08:00
leejet	ee89afc878	fix: resolve issue with pmid (#957 )	2025-11-09 22:47:53 +08:00
akleine	d2d3944f50	feat: add support for SD2.x with TINY U-Nets (#939 )	2025-11-09 22:47:37 +08:00
akleine	0fa3e1a383	fix: prevent core dump in PM V2 in case of incomplete cmd line (#950 )	2025-11-09 22:36:43 +08:00
leejet	c2d8ffc22c	fix: compatibility for models with modified tensor shapes (#951 )	2025-11-07 23:04:41 +08:00
stduhpf	fb748bb8a4	fix: TAE encoding (#935 )	2025-11-07 22:58:59 +08:00