refactor: simplify logic for saving results (#1149)

chore: reformat named cache params description into single line
feat: support mmap for model loading (#1059)
2026-06-25 15:46:40 +00:00 · 2025-12-28 23:27:27 +08:00 · 2025-12-28 22:53:07 +08:00 · 2025-12-28 22:38:29 +08:00 · 2025-12-27 16:48:15 +08:00 · 2025-12-27 15:54:18 +08:00
60 changed files with 19817 additions and 2258 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -163,7 +163,7 @@ jobs:
          - build: "avx512"
            defines: "-DGGML_NATIVE=OFF -DGGML_AVX512=ON -DGGML_AVX=ON -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
          - build: "cuda12"
-            defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES='61;70;75;80;86;89;90;100;120'"
+            defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES='61;70;75;80;86;89;90;100;120' -DCMAKE_CUDA_FLAGS='-Xcudafe \"--diag_suppress=177\" -Xcudafe \"--diag_suppress=550\"'"
          - build: 'vulkan'
            defines: "-DSD_VULKAN=ON -DSD_BUILD_SHARED_LIBS=ON"
    steps:
@ -191,13 +191,17 @@ jobs:
          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
      - name: Activate MSVC environment
        id: msvc_dev_cmd
        uses: ilammy/msvc-dev-cmd@v1
      - name: Build
        id: cmake_build
        run: |
          mkdir build
          cd build
-          cmake .. ${{ matrix.defines }}
+          cmake .. -DCMAKE_CXX_FLAGS='/bigobj' -G Ninja -DCMAKE_C_COMPILER=cl.exe -DCMAKE_CXX_COMPILER=cl.exe ${{ matrix.defines }}
-          cmake --build . --config Release
+          cmake --build .
      - name: Check AVX512F support
        id: check_avx512f
--- a/4
+++ b/4
@ -17,6 +17,6 @@ RUN apt-get update && \
    apt-get install --yes --no-install-recommends libgomp1 && \
    apt-get clean
-COPY --from=build /sd.cpp/build/bin/sd /sd
+COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
-ENTRYPOINT [ "/sd" ]
+ENTRYPOINT [ "/sd-cli" ]
--- a/Dockerfile.musa
+++ b/Dockerfile.musa
@ -18,6 +18,6 @@ RUN mkdir build && cd build && \
 FROM mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64 as runtime
-COPY --from=build /sd.cpp/build/bin/sd /sd
+COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
-ENTRYPOINT [ "/sd" ]
+ENTRYPOINT [ "/sd-cli" ]
--- a/Dockerfile.sycl
+++ b/Dockerfile.sycl
@ -14,6 +14,6 @@ RUN mkdir build && cd build && \
 FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS runtime
-COPY --from=build /sd.cpp/build/bin/sd /sd
+COPY --from=build /sd.cpp/build/bin/sd-cli /sd-cli
-ENTRYPOINT [ "/sd" ]
+ENTRYPOINT [ "/sd-cli" ]
--- a/README.md
+++ b/README.md
@ -52,7 +52,7 @@ API and command-line option may change frequently.***
    - [Ovis-Image](./docs/ovis_image.md)
  - Image Edit Models
    - [FLUX.1-Kontext-dev](./docs/kontext.md)
-    - [Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md)
+    - [Qwen Image Edit series](./docs/qwen_image_edit.md)
  - Video Models
    - [Wan2.1/Wan2.2](./docs/wan.md)
  - [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
@ -114,7 +114,7 @@ API and command-line option may change frequently.***
 ### Generate an image with just one command
 ```sh
-./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
+./bin/sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
 ```
 ***For detailed command-line arguments, check out [cli doc](./examples/cli/README.md).***
@ -132,7 +132,7 @@ If you want to improve performance or reduce VRAM/RAM usage, please refer to [pe
 - [FLUX.1-Kontext-dev](./docs/kontext.md)
 - [Chroma](./docs/chroma.md)
 - [🔥Qwen Image](./docs/qwen_image.md)
- [🔥Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md)
+- [🔥Qwen Image Edit series](./docs/qwen_image_edit.md)
 - [🔥Wan2.1/Wan2.2](./docs/wan.md)
 - [🔥Z-Image](./docs/z_image.md)
 - [Ovis-Image](./docs/ovis_image.md)
@ -143,6 +143,7 @@ If you want to improve performance or reduce VRAM/RAM usage, please refer to [pe
 - [Using TAESD to faster decoding](./docs/taesd.md)
 - [Docker](./docs/docker.md)
 - [Quantization and GGUF](./docs/quantization_and_gguf.md)
 - [Inference acceleration via caching](./docs/caching.md)
 ## Bindings
--- a/assets/qwen/qwen_image_edit_2511.png
+++ b/assets/qwen/qwen_image_edit_2511.png
--- a/cache_dit.hpp
+++ b/cache_dit.hpp
@ -0,0 +1,975 @@
 #ifndef __CACHE_DIT_HPP__
 #define __CACHE_DIT_HPP__
 #include <algorithm>
 #include <cmath>
 #include <limits>
 #include <string>
 #include <unordered_map>
 #include <vector>
 #include "ggml_extend.hpp"
 // Tunables for the residual-difference based step/block cache.
 // Negative values of the max_* limits mean "no limit" (see the begin_step() checks that read them).
 struct DBCacheConfig {
    bool enabled                        = false;   // master switch for this cache mode
    int Fn_compute_blocks               = 8;       // number of leading blocks classified as "Fn"
    int Bn_compute_blocks               = 0;       // number of trailing blocks classified as "Bn" (0 = none)
    float residual_diff_threshold       = 0.08f;   // a relative diff below this value allows reuse of cached residuals
    int max_warmup_steps                = 8;       // initial value of the warmup countdown
    int max_cached_steps                = -1;      // cap on the total number of cached steps; < 0 disables the cap
    int max_continuous_cached_steps     = -1;      // cap on consecutive cached steps; < 0 disables the cap
    float max_accumulated_residual_diff = -1.0f;   // cap on the summed diff of cached steps; < 0 disables the cap
    // Optional per-step mask indexed by step: 0 permits caching, 1 requests a full compute.
    // Steps beyond the end of the mask are not constrained by it.
    std::vector<int> steps_computation_mask;
    // When false, CacheDitState::begin_step() marks a mask-permitted step as cacheable
    // without consulting the warmup / max_* limits.
    bool scm_policy_dynamic = true;
 };
 // Tunables for the Taylor-series output extrapolation.
 struct TaylorSeerConfig {
    bool enabled            = false;  // master switch for this mode
    int n_derivatives       = 1;      // highest finite-difference order tracked (state keeps n_derivatives + 1 terms)
    int max_warmup_steps    = 2;      // steps with index below this never use the approximation
    int skip_interval_steps = 1;      // steps whose index is a multiple of (interval + 1) are not approximated
 };
 // Combined configuration consumed by CacheDitState.
 struct CacheDitConfig {
    DBCacheConfig dbcache;
    TaylorSeerConfig taylorseer;
    // Per-block-type overrides of the Fn/Bn counts; a negative value falls back to
    // dbcache.Fn_compute_blocks / dbcache.Bn_compute_blocks.
    int double_Fn_blocks = -1;
    int double_Bn_blocks = -1;
    int single_Fn_blocks = -1;
    int single_Bn_blocks = -1;
 };
 // Running finite-difference estimates of an output vector, used to extrapolate
 // that output for steps that are not fully computed.
 //
 // dY_current[k] holds the k-th order difference quotient obtained at the most
 // recent computed step (k = 0 is the output itself); dY_prev holds the same
 // terms from the computed step before that and is only used to form new
 // difference quotients.
 struct TaylorSeerState {
    int n_derivatives      = 1;
    int current_step       = -1;
    int last_computed_step = -1;
    std::vector<std::vector<float>> dY_prev;
    std::vector<std::vector<float>> dY_current;

    // Prepare storage for `n_deriv` derivative orders and clear all history.
    // `hidden_size` is accepted for interface compatibility; buffers are sized lazily.
    void init(int n_deriv, size_t hidden_size) {
        (void)hidden_size;
        // A negative order would turn into a huge unsigned count in resize().
        n_derivatives      = std::max(0, n_deriv);
        const size_t order = static_cast<size_t>(n_derivatives) + 1;
        dY_prev.assign(order, std::vector<float>());
        dY_current.assign(order, std::vector<float>());
        current_step       = -1;
        last_computed_step = -1;
    }

    // Drop all recorded values but keep the configured order.
    void reset() {
        for (auto& v : dY_prev)
            v.clear();
        for (auto& v : dY_current)
            v.clear();
        current_step       = -1;
        last_computed_step = -1;
    }

    // True once a computed output is available and enough steps have elapsed.
    bool can_approximate() const {
        return last_computed_step >= n_derivatives && !dY_current.empty() && !dY_current[0].empty();
    }

    // Record a freshly computed output `Y` (length `size`) for step `step` and
    // refresh the difference quotients against the previous computed output.
    void update_derivatives(const float* Y, size_t size, int step) {
        const int n_orders = std::max(0, n_derivatives);
        const size_t order = static_cast<size_t>(n_orders) + 1;
        // Tolerate use without a prior init(): indexing an empty vector is undefined.
        if (dY_current.size() != order)
            dY_current.resize(order);

        dY_prev = dY_current;
        dY_current[0].assign(Y, Y + size);

        int window = step - last_computed_step;
        if (window <= 0)
            window = 1;
        const float inv_window = 1.0f / static_cast<float>(window);

        for (int d = 0; d < n_orders; d++) {
            const std::vector<float>& older = dY_prev[d];
            if (!older.empty() && older.size() == size) {
                const std::vector<float>& newer = dY_current[d];
                std::vector<float>& deriv       = dY_current[d + 1];
                deriv.resize(size);
                for (size_t i = 0; i < size; i++) {
                    deriv[i] = (newer[i] - older[i]) * inv_window;
                }
            } else {
                // No compatible history for this order: the higher order term is unknown.
                dY_current[d + 1].clear();
            }
        }
        current_step       = step;
        last_computed_step = step;
    }

    // Write the Taylor extrapolation for `target_step` into `output`.
    // Leaves `output` untouched when no usable history of length `size` exists.
    //
    // The expansion is taken around the most recent computed step, so it must use
    // dY_current: `elapsed` is measured from last_computed_step, which is the step
    // dY_current was recorded at. dY_prev is one computed step older.
    void approximate(float* output, size_t size, int target_step) const {
        if (!can_approximate() || dY_current[0].size() != size) {
            return;
        }
        int elapsed = target_step - last_computed_step;
        if (elapsed <= 0)
            elapsed = 1;
        std::fill(output, output + size, 0.0f);

        // coeff == elapsed^o / o!, advanced for every order (also skipped ones) so
        // that a missing term cannot shift the factorial of the following terms.
        float coeff     = 1.0f;
        const int order = static_cast<int>(dY_current.size());
        for (int o = 0; o < order; o++) {
            if (o > 0)
                coeff *= static_cast<float>(elapsed) / static_cast<float>(o);
            const std::vector<float>& term = dY_current[o];
            if (term.empty() || term.size() != size)
                continue;
            for (size_t i = 0; i < size; i++) {
                output[i] += coeff * term[i];
            }
        }
    }
 };
 // Cached data for one transformer block.
 // Double-block entries use the *_img / *_txt members, single-block entries use
 // residual / prev_output (see the store_*_block_residual() methods of CacheDitState).
 struct BlockCacheEntry {
    std::vector<float> residual_img;  // output minus input, image part
    std::vector<float> residual_txt;  // output minus input, text part
    std::vector<float> residual;      // output minus input, single-block form
    std::vector<float> prev_img;      // copy of the last stored image output
    std::vector<float> prev_txt;      // copy of the last stored text output
    std::vector<float> prev_output;   // copy of the last stored single-block output
    bool has_prev = false;            // set once a store_* call has filled this entry
 };
 // Block-level cache state: per-block residual storage, the per-step decision
 // flags, and the counters reported by log_metrics() / get_summary().
 struct CacheDitState {
    CacheDitConfig config;
    bool initialized = false;
    int total_double_blocks = 0;
    int total_single_blocks = 0;
    size_t hidden_size      = 0;
    int current_step     = -1;
    int total_steps      = 0;
    int warmup_remaining = 0;
    std::vector<int> cached_steps;
    int continuous_cached_steps     = 0;
    float accumulated_residual_diff = 0.0f;
    std::vector<BlockCacheEntry> double_block_cache;
    std::vector<BlockCacheEntry> single_block_cache;
    // Residual of the leading ("Fn") blocks for the current step and for the previous one.
    std::vector<float> Fn_residual_img;
    std::vector<float> Fn_residual_txt;
    std::vector<float> prev_Fn_residual_img;
    std::vector<float> prev_Fn_residual_txt;
    bool has_prev_Fn_residual = false;
    // Residual accumulated across the trailing ("Bn") blocks.
    std::vector<float> Bn_buffer_img;
    std::vector<float> Bn_buffer_txt;
    std::vector<float> Bn_buffer;
    bool has_Bn_buffer = false;
    TaylorSeerState taylor_state;
    bool can_cache_this_step  = false;
    bool is_caching_this_step = false;
    // NOTE(review): total_blocks_computed is never incremented inside this struct;
    // presumably callers update it directly - confirm.
    int total_blocks_computed = 0;
    int total_blocks_cached   = 0;
    // Store the configuration and block counts, size the per-block caches and
    // reset all runtime state. Returns early (without resetting) when neither
    // cache mode is enabled.
    void init(const CacheDitConfig& cfg, int num_double_blocks, int num_single_blocks, size_t h_size) {
        config              = cfg;
        total_double_blocks = num_double_blocks;
        total_single_blocks = num_single_blocks;
        hidden_size         = h_size;
        initialized = cfg.dbcache.enabled || cfg.taylorseer.enabled;
        if (!initialized)
            return;
        warmup_remaining = cfg.dbcache.max_warmup_steps;
        double_block_cache.resize(total_double_blocks);
        single_block_cache.resize(total_single_blocks);
        if (cfg.taylorseer.enabled) {
            taylor_state.init(cfg.taylorseer.n_derivatives, h_size);
        }
        reset_runtime();
    }
    // Clear counters, flags and cached buffers so a new sampling run starts clean.
    // The cache vectors keep their length; only the entries' contents are cleared.
    void reset_runtime() {
        current_step     = -1;
        total_steps      = 0;
        warmup_remaining = config.dbcache.max_warmup_steps;
        cached_steps.clear();
        continuous_cached_steps   = 0;
        accumulated_residual_diff = 0.0f;
        for (auto& entry : double_block_cache) {
            entry.residual_img.clear();
            entry.residual_txt.clear();
            entry.prev_img.clear();
            entry.prev_txt.clear();
            entry.has_prev = false;
        }
        for (auto& entry : single_block_cache) {
            entry.residual.clear();
            entry.prev_output.clear();
            entry.has_prev = false;
        }
        Fn_residual_img.clear();
        Fn_residual_txt.clear();
        prev_Fn_residual_img.clear();
        prev_Fn_residual_txt.clear();
        has_prev_Fn_residual = false;
        Bn_buffer_img.clear();
        Bn_buffer_txt.clear();
        Bn_buffer.clear();
        has_Bn_buffer = false;
        taylor_state.reset();
        can_cache_this_step  = false;
        is_caching_this_step = false;
        total_blocks_computed = 0;
        total_blocks_cached   = 0;
    }
    // True when init() ran with at least one cache mode switched on.
    bool enabled() const {
        return initialized && (config.dbcache.enabled || config.taylorseer.enabled);
    }
    // Start bookkeeping for `step_index` and decide whether this step is eligible
    // for caching (can_cache_this_step). Repeated calls with the same index are
    // ignored. `sigma` is not read by this implementation.
    void begin_step(int step_index, float sigma = 0.0f) {
        if (!enabled())
            return;
        if (step_index == current_step)
            return;
        current_step = step_index;
        total_steps++;
        // The warmup countdown is consumed before the mask is consulted, so the
        // early return below still counts the step against the warmup budget.
        bool in_warmup = warmup_remaining > 0;
        if (in_warmup) {
            warmup_remaining--;
        }
        bool scm_allows_cache = true;
        if (!config.dbcache.steps_computation_mask.empty()) {
            if (step_index < static_cast<int>(config.dbcache.steps_computation_mask.size())) {
                scm_allows_cache = (config.dbcache.steps_computation_mask[step_index] == 0);
                // Static policy: a mask entry of 0 marks the step cacheable without
                // looking at warmup or the max_* limits.
                if (!config.dbcache.scm_policy_dynamic && scm_allows_cache) {
                    can_cache_this_step  = true;
                    is_caching_this_step = false;
                    return;
                }
            }
        }
        // Each limit is satisfied when it is disabled (negative) or not yet reached.
        bool max_cached_ok = (config.dbcache.max_cached_steps < 0) ||
                             (static_cast<int>(cached_steps.size()) < config.dbcache.max_cached_steps);
        bool max_cont_ok = (config.dbcache.max_continuous_cached_steps < 0) ||
                           (continuous_cached_steps < config.dbcache.max_continuous_cached_steps);
        bool accum_ok = (config.dbcache.max_accumulated_residual_diff < 0.0f) ||
                        (accumulated_residual_diff < config.dbcache.max_accumulated_residual_diff);
        can_cache_this_step  = !in_warmup && scm_allows_cache && max_cached_ok && max_cont_ok && accum_ok && has_prev_Fn_residual;
        is_caching_this_step = false;
    }
    // Record whether the step that just finished was served from cache.
    void end_step(bool was_cached) {
        if (was_cached) {
            cached_steps.push_back(current_step);
            continuous_cached_steps++;
        } else {
            continuous_cached_steps = 0;
        }
    }
    // Relative L1 difference: sum|prev - curr| / (sum|prev| + 1e-6). Returns 0 for size 0.
    static float calculate_residual_diff(const float* prev, const float* curr, size_t size) {
        if (size == 0)
            return 0.0f;
        float sum_diff = 0.0f;
        float sum_abs  = 0.0f;
        for (size_t i = 0; i < size; i++) {
            sum_diff += std::fabs(prev[i] - curr[i]);
            sum_abs += std::fabs(prev[i]);
        }
        return sum_diff / (sum_abs + 1e-6f);
    }
    // Vector overload; returns 1.0 when the vectors are empty or differ in length.
    static float calculate_residual_diff(const std::vector<float>& prev, const std::vector<float>& curr) {
        if (prev.size() != curr.size() || prev.empty())
            return 1.0f;
        return calculate_residual_diff(prev.data(), curr.data(), prev.size());
    }
    // Effective Fn/Bn block counts: the per-type override when it is >= 0,
    // otherwise the shared dbcache value.
    int get_double_Fn_blocks() const {
        return (config.double_Fn_blocks >= 0) ? config.double_Fn_blocks : config.dbcache.Fn_compute_blocks;
    }
    int get_double_Bn_blocks() const {
        return (config.double_Bn_blocks >= 0) ? config.double_Bn_blocks : config.dbcache.Bn_compute_blocks;
    }
    int get_single_Fn_blocks() const {
        return (config.single_Fn_blocks >= 0) ? config.single_Fn_blocks : config.dbcache.Fn_compute_blocks;
    }
    int get_single_Bn_blocks() const {
        return (config.single_Bn_blocks >= 0) ? config.single_Bn_blocks : config.dbcache.Bn_compute_blocks;
    }
    // Block classification by index: Fn = first N blocks, Bn = last N blocks,
    // Mn = everything that is neither.
    bool is_Fn_double_block(int block_idx) const {
        return block_idx < get_double_Fn_blocks();
    }
    bool is_Bn_double_block(int block_idx) const {
        int Bn = get_double_Bn_blocks();
        return Bn > 0 && block_idx >= (total_double_blocks - Bn);
    }
    bool is_Mn_double_block(int block_idx) const {
        return !is_Fn_double_block(block_idx) && !is_Bn_double_block(block_idx);
    }
    bool is_Fn_single_block(int block_idx) const {
        return block_idx < get_single_Fn_blocks();
    }
    bool is_Bn_single_block(int block_idx) const {
        int Bn = get_single_Bn_blocks();
        return Bn > 0 && block_idx >= (total_single_blocks - Bn);
    }
    bool is_Mn_single_block(int block_idx) const {
        return !is_Fn_single_block(block_idx) && !is_Bn_single_block(block_idx);
    }
    // Store the current Fn residual as (img - input_img) and (txt - input_txt).
    void store_Fn_residual(const float* img, const float* txt, size_t img_size, size_t txt_size, const float* input_img, const float* input_txt) {
        Fn_residual_img.resize(img_size);
        Fn_residual_txt.resize(txt_size);
        for (size_t i = 0; i < img_size; i++) {
            Fn_residual_img[i] = img[i] - input_img[i];
        }
        for (size_t i = 0; i < txt_size; i++) {
            Fn_residual_txt[i] = txt[i] - input_txt[i];
        }
    }
    // Compare the current Fn residual with the previous one and decide whether
    // the remaining blocks of this step may be served from cache. On a positive
    // decision the measured diff is added to accumulated_residual_diff.
    bool check_cache_decision() {
        if (!can_cache_this_step) {
            is_caching_this_step = false;
            return false;
        }
        if (!has_prev_Fn_residual || prev_Fn_residual_img.empty()) {
            is_caching_this_step = false;
            return false;
        }
        float diff_img = calculate_residual_diff(prev_Fn_residual_img, Fn_residual_img);
        float diff_txt = calculate_residual_diff(prev_Fn_residual_txt, Fn_residual_txt);
        // Image and text diffs are weighted equally.
        float diff     = (diff_img + diff_txt) / 2.0f;
        if (diff < config.dbcache.residual_diff_threshold) {
            is_caching_this_step = true;
            accumulated_residual_diff += diff;
            return true;
        }
        is_caching_this_step = false;
        return false;
    }
    // Promote the current Fn residual to "previous" for the next step's comparison.
    void update_prev_Fn_residual() {
        prev_Fn_residual_img = Fn_residual_img;
        prev_Fn_residual_txt = Fn_residual_txt;
        has_prev_Fn_residual = !prev_Fn_residual_img.empty();
    }
    // Store the residual (output minus prev_*) and a copy of the output for one
    // double block. Out-of-range indices are ignored.
    void store_double_block_residual(int block_idx, const float* img, const float* txt, size_t img_size, size_t txt_size, const float* prev_img, const float* prev_txt) {
        if (block_idx < 0 || block_idx >= static_cast<int>(double_block_cache.size()))
            return;
        BlockCacheEntry& entry = double_block_cache[block_idx];
        entry.residual_img.resize(img_size);
        entry.residual_txt.resize(txt_size);
        for (size_t i = 0; i < img_size; i++) {
            entry.residual_img[i] = img[i] - prev_img[i];
        }
        for (size_t i = 0; i < txt_size; i++) {
            entry.residual_txt[i] = txt[i] - prev_txt[i];
        }
        entry.prev_img.resize(img_size);
        entry.prev_txt.resize(txt_size);
        for (size_t i = 0; i < img_size; i++) {
            entry.prev_img[i] = img[i];
        }
        for (size_t i = 0; i < txt_size; i++) {
            entry.prev_txt[i] = txt[i];
        }
        entry.has_prev = true;
    }
    // Add the stored residual of a double block onto img/txt in place.
    // Does nothing when the index is out of range or the stored sizes differ.
    void apply_double_block_cache(int block_idx, float* img, float* txt, size_t img_size, size_t txt_size) {
        if (block_idx < 0 || block_idx >= static_cast<int>(double_block_cache.size()))
            return;
        const BlockCacheEntry& entry = double_block_cache[block_idx];
        if (entry.residual_img.size() != img_size || entry.residual_txt.size() != txt_size)
            return;
        for (size_t i = 0; i < img_size; i++) {
            img[i] += entry.residual_img[i];
        }
        for (size_t i = 0; i < txt_size; i++) {
            txt[i] += entry.residual_txt[i];
        }
        total_blocks_cached++;
    }
    // Store the residual (output minus input) and a copy of the output for one
    // single block. Out-of-range indices are ignored.
    void store_single_block_residual(int block_idx, const float* output, size_t size, const float* input) {
        if (block_idx < 0 || block_idx >= static_cast<int>(single_block_cache.size()))
            return;
        BlockCacheEntry& entry = single_block_cache[block_idx];
        entry.residual.resize(size);
        for (size_t i = 0; i < size; i++) {
            entry.residual[i] = output[i] - input[i];
        }
        entry.prev_output.resize(size);
        for (size_t i = 0; i < size; i++) {
            entry.prev_output[i] = output[i];
        }
        entry.has_prev = true;
    }
    // Add the stored residual of a single block onto `output` in place.
    // Does nothing when the index is out of range or the stored size differs.
    void apply_single_block_cache(int block_idx, float* output, size_t size) {
        if (block_idx < 0 || block_idx >= static_cast<int>(single_block_cache.size()))
            return;
        const BlockCacheEntry& entry = single_block_cache[block_idx];
        if (entry.residual.size() != size)
            return;
        for (size_t i = 0; i < size; i++) {
            output[i] += entry.residual[i];
        }
        total_blocks_cached++;
    }
    // Store the difference between the given img/txt and the Bn_start_* values.
    void store_Bn_buffer(const float* img, const float* txt, size_t img_size, size_t txt_size, const float* Bn_start_img, const float* Bn_start_txt) {
        Bn_buffer_img.resize(img_size);
        Bn_buffer_txt.resize(txt_size);
        for (size_t i = 0; i < img_size; i++) {
            Bn_buffer_img[i] = img[i] - Bn_start_img[i];
        }
        for (size_t i = 0; i < txt_size; i++) {
            Bn_buffer_txt[i] = txt[i] - Bn_start_txt[i];
        }
        has_Bn_buffer = true;
    }
    // Add the stored Bn difference onto img/txt in place; no-op without a
    // buffer or when the sizes differ.
    void apply_Bn_buffer(float* img, float* txt, size_t img_size, size_t txt_size) {
        if (!has_Bn_buffer)
            return;
        if (Bn_buffer_img.size() != img_size || Bn_buffer_txt.size() != txt_size)
            return;
        for (size_t i = 0; i < img_size; i++) {
            img[i] += Bn_buffer_img[i];
        }
        for (size_t i = 0; i < txt_size; i++) {
            txt[i] += Bn_buffer_txt[i];
        }
    }
    // Thin wrappers that forward to taylor_state when TaylorSeer is enabled,
    // always using current_step as the step index.
    void taylor_update(const float* hidden_state, size_t size) {
        if (!config.taylorseer.enabled)
            return;
        taylor_state.update_derivatives(hidden_state, size, current_step);
    }
    bool taylor_can_approximate() const {
        return config.taylorseer.enabled && taylor_state.can_approximate();
    }
    void taylor_approximate(float* output, size_t size) {
        if (!config.taylorseer.enabled)
            return;
        taylor_state.approximate(output, size, current_step);
    }
    // Scheduling rule for approximation: never during the TaylorSeer warmup, and
    // afterwards on every step whose index is not a multiple of (interval + 1).
    bool should_use_taylor_this_step() const {
        if (!config.taylorseer.enabled)
            return false;
        if (current_step < config.taylorseer.max_warmup_steps)
            return false;
        int interval = config.taylorseer.skip_interval_steps;
        if (interval <= 0)
            interval = 1;
        return (current_step % (interval + 1)) != 0;
    }
    // Log step- and block-level cache ratios through LOG_INFO.
    void log_metrics() const {
        if (!enabled())
            return;
        int total_blocks  = total_blocks_computed + total_blocks_cached;
        float cache_ratio = (total_blocks > 0) ? (static_cast<float>(total_blocks_cached) / total_blocks * 100.0f) : 0.0f;
        float step_cache_ratio = (total_steps > 0) ? (static_cast<float>(cached_steps.size()) / total_steps * 100.0f) : 0.0f;
        LOG_INFO("CacheDIT: steps_cached=%zu/%d (%.1f%%), blocks_cached=%d/%d (%.1f%%), accum_diff=%.4f",
                 cached_steps.size(), total_steps, step_cache_ratio,
                 total_blocks_cached, total_blocks, cache_ratio,
                 accumulated_residual_diff);
    }
    // One-line human readable summary; output is truncated to the 256-byte buffer.
    std::string get_summary() const {
        char buf[256];
        snprintf(buf, sizeof(buf),
                 "CacheDIT[thresh=%.2f]: cached %zu/%d steps, %d/%d blocks",
                 config.dbcache.residual_diff_threshold,
                 cached_steps.size(), total_steps,
                 total_blocks_cached, total_blocks_computed + total_blocks_cached);
        return std::string(buf);
    }
 };
 // Parse a comma separated list of integers (e.g. "1,1,0,0,1") into a step mask.
 //
 // Empty or whitespace-only entries - as produced by a leading, trailing or
 // doubled comma - are skipped instead of being handed to std::stoi, which
 // would throw std::invalid_argument for them. Entries that contain
 // non-numeric text still raise the std::stoi exceptions.
 inline std::vector<int> parse_scm_mask(const std::string& mask_str) {
    std::vector<int> mask;
    size_t start = 0;
    while (start <= mask_str.size()) {
        const size_t comma = mask_str.find(',', start);
        const size_t stop  = (comma == std::string::npos) ? mask_str.size() : comma;
        const std::string token = mask_str.substr(start, stop - start);
        if (token.find_first_not_of(" \t\r\n") != std::string::npos) {
            mask.push_back(std::stoi(token));
        }
        if (comma == std::string::npos)
            break;
        start = comma + 1;
    }
    return mask;
 }
 // Build a per-step mask of length `total_steps` by alternating runs of
 // "compute" (1) and "cache" (0) entries taken pairwise from `compute_bins`
 // and `cache_bins`.
 //
 // If the bins describe fewer than `total_steps` steps, the remainder is filled
 // with 1 (compute). Previously the mask was returned short, which left the
 // trailing steps unconstrained and put the forced-compute final entry on a
 // step that was not actually the last one. The final step is always 1.
 // Returns an empty mask when total_steps <= 0.
 inline std::vector<int> generate_scm_mask(
    const std::vector<int>& compute_bins,
    const std::vector<int>& cache_bins,
    int total_steps) {
    std::vector<int> mask;
    if (total_steps <= 0)
        return mask;
    const size_t wanted = static_cast<size_t>(total_steps);
    mask.reserve(wanted);

    // Append up to `count` copies of `value` without exceeding the requested length.
    auto append_run = [&mask, wanted](int count, int value) {
        for (int i = 0; i < count && mask.size() < wanted; i++) {
            mask.push_back(value);
        }
    };

    const size_t rounds = std::max(compute_bins.size(), cache_bins.size());
    for (size_t r = 0; r < rounds && mask.size() < wanted; r++) {
        if (r < compute_bins.size())
            append_run(compute_bins[r], 1);
        if (r < cache_bins.size())
            append_run(cache_bins[r], 0);
    }

    // Steps not covered by any bin are computed rather than left ungoverned.
    mask.resize(wanted, 1);
    // The last step must always be a full compute.
    mask.back() = 1;
    return mask;
 }
 // Return the step mask for a named preset ("slow", "medium", "fast", "ultra"
 // or their one-letter forms), or an empty mask for an unknown name.
 // The bin tables are written for 28 steps; for any other positive step count
 // every bin is rescaled proportionally, rounded to nearest and kept >= 1.
 inline std::vector<int> get_scm_preset(const std::string& preset, int total_steps) {
    std::vector<int> compute_bins;
    std::vector<int> cache_bins;

    if (preset == "slow" || preset == "s" || preset == "S") {
        compute_bins = {8, 3, 3, 2, 1, 1};
        cache_bins   = {1, 2, 2, 2, 3};
    } else if (preset == "medium" || preset == "m" || preset == "M") {
        compute_bins = {6, 2, 2, 2, 2, 1};
        cache_bins   = {1, 3, 3, 3, 3};
    } else if (preset == "fast" || preset == "f" || preset == "F") {
        compute_bins = {6, 1, 1, 1, 1, 1};
        cache_bins   = {1, 3, 4, 5, 4};
    } else if (preset == "ultra" || preset == "u" || preset == "U") {
        compute_bins = {4, 1, 1, 1, 1};
        cache_bins   = {2, 5, 6, 7};
    } else {
        return {};
    }

    const bool needs_rescale = (total_steps > 0) && (total_steps != 28);
    if (needs_rescale) {
        const float ratio = static_cast<float>(total_steps) / 28.0f;
        auto rescale      = [ratio](std::vector<int>& bins) {
            for (int& bin : bins) {
                bin = std::max(1, static_cast<int>(bin * ratio + 0.5f));
            }
        };
        rescale(compute_bins);
        rescale(cache_bins);
    }
    return generate_scm_mask(compute_bins, cache_bins, total_steps);
 }
 // Residual-diff threshold associated with a preset name; 0.08 for unknown names.
 inline float get_preset_threshold(const std::string& preset) {
    struct Row {
        const char* full;
        const char* lower;
        const char* upper;
        float value;
    };
    static const Row table[] = {
        {"slow", "s", "S", 0.20f},
        {"medium", "m", "M", 0.25f},
        {"fast", "f", "F", 0.30f},
        {"ultra", "u", "U", 0.34f},
    };
    for (const Row& row : table) {
        if (preset == row.full || preset == row.lower || preset == row.upper) {
            return row.value;
        }
    }
    return 0.08f;
 }
 // Warmup step count associated with a preset name; 8 for unknown names.
 inline int get_preset_warmup(const std::string& preset) {
    struct Row {
        const char* full;
        const char* lower;
        const char* upper;
        int value;
    };
    static const Row table[] = {
        {"slow", "s", "S", 8},
        {"medium", "m", "M", 6},
        {"fast", "f", "F", 6},
        {"ultra", "u", "U", 4},
    };
    for (const Row& row : table) {
        if (preset == row.full || preset == row.lower || preset == row.upper) {
            return row.value;
        }
    }
    return 8;
 }
 // Number of leading (Fn) blocks associated with a preset name; 8 for unknown names.
 inline int get_preset_Fn(const std::string& preset) {
    struct Row {
        const char* full;
        const char* lower;
        const char* upper;
        int value;
    };
    static const Row table[] = {
        {"slow", "s", "S", 8},
        {"medium", "m", "M", 8},
        {"fast", "f", "F", 6},
        {"ultra", "u", "U", 4},
    };
    for (const Row& row : table) {
        if (preset == row.full || preset == row.lower || preset == row.upper) {
            return row.value;
        }
    }
    return 8;
 }
 // Number of trailing (Bn) blocks for a preset. Every preset currently uses 0,
 // so the name is accepted only for symmetry with the other preset getters.
 inline int get_preset_Bn(const std::string& preset) {
    static_cast<void>(preset);
    const int trailing_blocks = 0;
    return trailing_blocks;
 }
 // Parse "Fn,Bn,threshold,warmup,max_cached,max_continuous" into `cfg`.
 //
 // The list may be truncated: sscanf stops at the first field it cannot
 // convert and leaves the remaining targets untouched. The locals are therefore
 // seeded from `cfg`, so fields missing from `opts` keep whatever `cfg` already
 // held (for example values applied from a preset) instead of being overwritten
 // with hard-coded defaults. For a default-constructed `cfg` the result is
 // unchanged. An empty string leaves `cfg` untouched.
 inline void parse_dbcache_options(const std::string& opts, DBCacheConfig& cfg) {
    if (opts.empty())
        return;
    int Fn         = cfg.Fn_compute_blocks;
    int Bn         = cfg.Bn_compute_blocks;
    float thresh   = cfg.residual_diff_threshold;
    int warmup     = cfg.max_warmup_steps;
    int max_cached = cfg.max_cached_steps;
    int max_cont   = cfg.max_continuous_cached_steps;
    sscanf(opts.c_str(), "%d,%d,%f,%d,%d,%d",
           &Fn, &Bn, &thresh, &warmup, &max_cached, &max_cont);
    cfg.Fn_compute_blocks           = Fn;
    cfg.Bn_compute_blocks           = Bn;
    cfg.residual_diff_threshold     = thresh;
    cfg.max_warmup_steps            = warmup;
    cfg.max_cached_steps            = max_cached;
    cfg.max_continuous_cached_steps = max_cont;
 }
 // Parse "n_derivatives,warmup,interval" into `cfg`.
 //
 // The list may be truncated; fields that are absent keep the value already
 // stored in `cfg` rather than being reset to hard-coded defaults (sscanf leaves
 // unconverted targets untouched, so the locals are seeded from `cfg`).
 // An empty string leaves `cfg` untouched.
 inline void parse_taylorseer_options(const std::string& opts, TaylorSeerConfig& cfg) {
    if (opts.empty())
        return;
    int n_deriv  = cfg.n_derivatives;
    int warmup   = cfg.max_warmup_steps;
    int interval = cfg.skip_interval_steps;
    sscanf(opts.c_str(), "%d,%d,%d", &n_deriv, &warmup, &interval);
    cfg.n_derivatives       = n_deriv;
    cfg.max_warmup_steps    = warmup;
    cfg.skip_interval_steps = interval;
 }
 struct CacheDitConditionState {
    DBCacheConfig config;
    TaylorSeerConfig taylor_config;
    bool initialized = false;
    int current_step_index = -1;
    bool step_active       = false;
    bool skip_current_step = false;
    bool initial_step      = true;
    int warmup_remaining   = 0;
    std::vector<int> cached_steps;
    int continuous_cached_steps     = 0;
    float accumulated_residual_diff = 0.0f;
    int total_steps_skipped         = 0;
    const void* anchor_condition = nullptr;
    struct CacheEntry {
        std::vector<float> diff;
        std::vector<float> prev_input;
        std::vector<float> prev_output;
        bool has_prev = false;
    };
    std::unordered_map<const void*, CacheEntry> cache_diffs;
    TaylorSeerState taylor_state;
    float start_sigma = std::numeric_limits<float>::max();
    float end_sigma   = 0.0f;
    void reset_runtime() {
        current_step_index = -1;
        step_active        = false;
        skip_current_step  = false;
        initial_step       = true;
        warmup_remaining   = config.max_warmup_steps;
        cached_steps.clear();
        continuous_cached_steps   = 0;
        accumulated_residual_diff = 0.0f;
        total_steps_skipped       = 0;
        anchor_condition          = nullptr;
        cache_diffs.clear();
        taylor_state.reset();
    }
    void init(const DBCacheConfig& dbcfg, const TaylorSeerConfig& tcfg) {
        config        = dbcfg;
        taylor_config = tcfg;
        initialized   = dbcfg.enabled || tcfg.enabled;
        reset_runtime();
        if (taylor_config.enabled) {
            taylor_state.init(taylor_config.n_derivatives, 0);
        }
    }
    void set_sigmas(const std::vector<float>& sigmas) {
        if (!initialized || sigmas.size() < 2)
            return;
        float start_percent = 0.15f;
        float end_percent   = 0.95f;
        size_t n_steps    = sigmas.size() - 1;
        size_t start_step = static_cast<size_t>(start_percent * n_steps);
        size_t end_step   = static_cast<size_t>(end_percent * n_steps);
        if (start_step >= n_steps)
            start_step = n_steps - 1;
        if (end_step >= n_steps)
            end_step = n_steps - 1;
        start_sigma = sigmas[start_step];
        end_sigma   = sigmas[end_step];
        if (start_sigma < end_sigma) {
            std::swap(start_sigma, end_sigma);
        }
    }
    bool enabled() const {
        return initialized && (config.enabled || taylor_config.enabled);
    }
    void begin_step(int step_index, float sigma) {
        if (!enabled())
            return;
        if (step_index == current_step_index)
            return;
        current_step_index = step_index;
        skip_current_step  = false;
        step_active        = false;
        if (sigma > start_sigma)
            return;
        if (!(sigma > end_sigma))
            return;
        step_active = true;
        if (warmup_remaining > 0) {
            warmup_remaining--;
            return;
        }
        if (!config.steps_computation_mask.empty()) {
            if (step_index < static_cast<int>(config.steps_computation_mask.size())) {
                if (config.steps_computation_mask[step_index] == 1) {
                    return;
                }
            }
        }
        if (config.max_cached_steps >= 0 &&
            static_cast<int>(cached_steps.size()) >= config.max_cached_steps) {
            return;
        }
        if (config.max_continuous_cached_steps >= 0 &&
            continuous_cached_steps >= config.max_continuous_cached_steps) {
            return;
        }
    }
    bool step_is_active() const {
        return enabled() && step_active;
    }
    bool is_step_skipped() const {
        return enabled() && step_active && skip_current_step;
    }
    bool has_cache(const void* cond) const {
        auto it = cache_diffs.find(cond);
        return it != cache_diffs.end() && !it->second.diff.empty();
    }
    void update_cache(const void* cond, const float* input, const float* output, size_t size) {
        CacheEntry& entry = cache_diffs[cond];
        entry.diff.resize(size);
        for (size_t i = 0; i < size; i++) {
            entry.diff[i] = output[i] - input[i];
        }
        entry.prev_input.resize(size);
        entry.prev_output.resize(size);
        for (size_t i = 0; i < size; i++) {
            entry.prev_input[i]  = input[i];
            entry.prev_output[i] = output[i];
        }
        entry.has_prev = true;
    }
    void apply_cache(const void* cond, const float* input, float* output, size_t size) {
        auto it = cache_diffs.find(cond);
        if (it == cache_diffs.end() || it->second.diff.empty())
            return;
        if (it->second.diff.size() != size)
            return;
        for (size_t i = 0; i < size; i++) {
            output[i] = input[i] + it->second.diff[i];
        }
    }
    bool before_condition(const void* cond, struct ggml_tensor* input, struct ggml_tensor* output, float sigma, int step_index) {
        if (!enabled() || step_index < 0)
            return false;
        if (step_index != current_step_index) {
            begin_step(step_index, sigma);
        }
        if (!step_active)
            return false;
        if (initial_step) {
            anchor_condition = cond;
            initial_step     = false;
        }
        bool is_anchor = (cond == anchor_condition);
        if (skip_current_step) {
            if (has_cache(cond)) {
                apply_cache(cond, (float*)input->data, (float*)output->data,
                            static_cast<size_t>(ggml_nelements(output)));
                return true;
            }
            return false;
        }
        if (!is_anchor)
            return false;
        auto it = cache_diffs.find(cond);
        if (it == cache_diffs.end() || !it->second.has_prev)
            return false;
        size_t ne = static_cast<size_t>(ggml_nelements(input));
        if (it->second.prev_input.size() != ne)
            return false;
        float* input_data = (float*)input->data;
        float diff        = CacheDitState::calculate_residual_diff(
                   it->second.prev_input.data(), input_data, ne);
        float effective_threshold = config.residual_diff_threshold;
        if (config.Fn_compute_blocks > 0) {
            float fn_confidence = 1.0f + 0.02f * (config.Fn_compute_blocks - 8);
            fn_confidence       = std::max(0.5f, std::min(2.0f, fn_confidence));
            effective_threshold *= fn_confidence;
        }
        if (config.Bn_compute_blocks > 0) {
            float bn_quality = 1.0f - 0.03f * config.Bn_compute_blocks;
            bn_quality       = std::max(0.5f, std::min(1.0f, bn_quality));
            effective_threshold *= bn_quality;
        }
        if (diff < effective_threshold) {
            skip_current_step = true;
            total_steps_skipped++;
            cached_steps.push_back(current_step_index);
            continuous_cached_steps++;
            accumulated_residual_diff += diff;
            apply_cache(cond, input_data, (float*)output->data, ne);
            return true;
        }
        continuous_cached_steps = 0;
        return false;
    }
    void after_condition(const void* cond, struct ggml_tensor* input, struct ggml_tensor* output) {
        if (!step_is_active())
            return;
        size_t ne = static_cast<size_t>(ggml_nelements(output));
        update_cache(cond, (float*)input->data, (float*)output->data, ne);
        if (cond == anchor_condition && taylor_config.enabled) {
            taylor_state.update_derivatives((float*)output->data, ne, current_step_index);
        }
    }
    void log_metrics() const {
        if (!enabled())
            return;
        LOG_INFO("CacheDIT: steps_skipped=%d/%d (%.1f%%), accum_residual_diff=%.4f",
                 total_steps_skipped,
                 current_step_index + 1,
                 (current_step_index > 0) ? (100.0f * total_steps_skipped / (current_step_index + 1)) : 0.0f,
                 accumulated_residual_diff);
    }
 };
 #endif
--- a/common.hpp
+++ b/common.hpp
@ -28,7 +28,7 @@ public:
        if (vae_downsample) {
            auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
-            x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0);
+            x = ggml_ext_pad(ctx->ggml_ctx, x, 1, 1, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
            x = conv->forward(ctx, x);
        } else {
            auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["op"]);
@ -194,10 +194,12 @@ public:
        auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
        x          = proj->forward(ctx, x);  // [ne3, ne2, ne1, dim_out*2]
-        auto x_vec = ggml_ext_chunk(ctx->ggml_ctx, x, 2, 0);
+        auto x_vec = ggml_ext_chunk(ctx->ggml_ctx, x, 2, 0, false);
        x          = x_vec[0];  // [ne3, ne2, ne1, dim_out]
        auto gate  = x_vec[1];  // [ne3, ne2, ne1, dim_out]
        gate = ggml_cont(ctx->ggml_ctx, gate);
        gate = ggml_gelu_inplace(ctx->ggml_ctx, gate);
        x = ggml_mul(ctx->ggml_ctx, x, gate);  // [ne3, ne2, ne1, dim_out]
--- a/denoiser.hpp
+++ b/denoiser.hpp
@ -347,6 +347,41 @@ struct SmoothStepScheduler : SigmaScheduler {
    }
 };
 // Implementation adapted from https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/15608
 struct KLOptimalScheduler : SigmaScheduler {
    // Builds a schedule of n sigmas, interpolated linearly in the arctangent
    // domain from sigma_max (first entry) down to sigma_min (n-th entry),
    // followed by a terminating 0.
    //   n == 0 -> empty schedule
    //   n == 1 -> {sigma_max, 0}
    // t_to_sigma is not used by this scheduler.
    std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
        if (n == 0) {
            return {};
        }
        if (n == 1) {
            // A single step cannot be interpolated (n - 1 would be zero).
            return {sigma_max, 0.0f};
        }

        const float angle_lo = std::atan(sigma_min);
        const float angle_hi = std::atan(sigma_max);
        const float last_idx = static_cast<float>(n - 1);

        // n interpolated entries plus the trailing zero, which is pre-filled here.
        std::vector<float> schedule(static_cast<size_t>(n) + 1, 0.0f);
        for (uint32_t step = 0; step < n; ++step) {
            // frac runs from 0.0 (sigma_max) to 1.0 (sigma_min)
            const float frac = static_cast<float>(step) / last_idx;
            schedule[step]   = std::tan(frac * angle_lo + (1.0f - frac) * angle_hi);
        }
        return schedule;
    }
 };
 struct Denoiser {
    virtual float sigma_min()                                                                = 0;
    virtual float sigma_max()                                                                = 0;
@ -392,6 +427,10 @@ struct Denoiser {
                LOG_INFO("get_sigmas with SmoothStep scheduler");
                scheduler = std::make_shared<SmoothStepScheduler>();
                break;
            case KL_OPTIMAL_SCHEDULER:
                LOG_INFO("get_sigmas with KL Optimal scheduler");
                scheduler = std::make_shared<KLOptimalScheduler>();
                break;
            case LCM_SCHEDULER:
                LOG_INFO("get_sigmas with LCM scheduler");
                scheduler = std::make_shared<LCMScheduler>();
@ -830,7 +869,7 @@ static bool sample_k_diffusion(sample_method_t method,
            for (int i = 0; i < steps; i++) {
                // denoise
-                ggml_tensor* denoised = model(x, sigmas[i], i + 1);
+                ggml_tensor* denoised = model(x, sigmas[i], -(i + 1));
                if (denoised == nullptr) {
                    return false;
                }
@ -888,7 +927,7 @@ static bool sample_k_diffusion(sample_method_t method,
            for (int i = 0; i < steps; i++) {
                // denoise
-                ggml_tensor* denoised = model(x, sigmas[i], i + 1);
+                ggml_tensor* denoised = model(x, sigmas[i], -(i + 1));
                if (denoised == nullptr) {
                    return false;
                }
--- a/diffusion_model.hpp
+++ b/diffusion_model.hpp
@ -37,8 +37,9 @@ struct DiffusionModel {
    virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0;
    virtual size_t get_params_buffer_size()                                             = 0;
    virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter){};
-    virtual int64_t get_adm_in_channels()             = 0;
+    virtual int64_t get_adm_in_channels()                            = 0;
-    virtual void set_flash_attn_enabled(bool enabled) = 0;
+    virtual void set_flash_attn_enabled(bool enabled)                = 0;
    virtual void set_circular_axes(bool circular_x, bool circular_y) = 0;
 };
 struct UNetModel : public DiffusionModel {
@ -87,6 +88,10 @@ struct UNetModel : public DiffusionModel {
        unet.set_flash_attention_enabled(enabled);
    }
    // Forwards the per-axis circular flags unchanged to `unet`.
    void set_circular_axes(bool circular_x, bool circular_y) override {
        unet.set_circular_axes(circular_x, circular_y);
    }
    bool compute(int n_threads,
                 DiffusionParams diffusion_params,
                 struct ggml_tensor** output     = nullptr,
@ -148,6 +153,10 @@ struct MMDiTModel : public DiffusionModel {
        mmdit.set_flash_attention_enabled(enabled);
    }
    // Forwards the per-axis circular flags unchanged to `mmdit`.
    void set_circular_axes(bool circular_x, bool circular_y) override {
        mmdit.set_circular_axes(circular_x, circular_y);
    }
    bool compute(int n_threads,
                 DiffusionParams diffusion_params,
                 struct ggml_tensor** output     = nullptr,
@ -210,6 +219,10 @@ struct FluxModel : public DiffusionModel {
        flux.set_flash_attention_enabled(enabled);
    }
    // Forwards the per-axis circular flags unchanged to `flux`.
    void set_circular_axes(bool circular_x, bool circular_y) override {
        flux.set_circular_axes(circular_x, circular_y);
    }
    bool compute(int n_threads,
                 DiffusionParams diffusion_params,
                 struct ggml_tensor** output     = nullptr,
@ -277,6 +290,10 @@ struct WanModel : public DiffusionModel {
        wan.set_flash_attention_enabled(enabled);
    }
    // Forwards the per-axis circular flags unchanged to `wan`.
    void set_circular_axes(bool circular_x, bool circular_y) override {
        wan.set_circular_axes(circular_x, circular_y);
    }
    bool compute(int n_threads,
                 DiffusionParams diffusion_params,
                 struct ggml_tensor** output     = nullptr,
@ -303,8 +320,9 @@ struct QwenImageModel : public DiffusionModel {
                   bool offload_params_to_cpu,
                   const String2TensorStorage& tensor_storage_map = {},
                   const std::string prefix                       = "model.diffusion_model",
-                   SDVersion version                              = VERSION_QWEN_IMAGE)
+                   SDVersion version                              = VERSION_QWEN_IMAGE,
-        : prefix(prefix), qwen_image(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) {
+                   bool zero_cond_t                               = false)
        : prefix(prefix), qwen_image(backend, offload_params_to_cpu, tensor_storage_map, prefix, version, zero_cond_t) {
    }
    std::string get_desc() override {
@ -343,6 +361,10 @@ struct QwenImageModel : public DiffusionModel {
        qwen_image.set_flash_attention_enabled(enabled);
    }
    // Forwards the per-axis circular flags unchanged to `qwen_image`.
    void set_circular_axes(bool circular_x, bool circular_y) override {
        qwen_image.set_circular_axes(circular_x, circular_y);
    }
    bool compute(int n_threads,
                 DiffusionParams diffusion_params,
                 struct ggml_tensor** output     = nullptr,
@ -406,6 +428,10 @@ struct ZImageModel : public DiffusionModel {
        z_image.set_flash_attention_enabled(enabled);
    }
    // Forwards the per-axis circular flags unchanged to `z_image`.
    void set_circular_axes(bool circular_x, bool circular_y) override {
        z_image.set_circular_axes(circular_x, circular_y);
    }
    bool compute(int n_threads,
                 DiffusionParams diffusion_params,
                 struct ggml_tensor** output     = nullptr,
--- a/docs/caching.md
+++ b/docs/caching.md
@ -0,0 +1,126 @@
 ## Caching
 Caching methods accelerate diffusion inference by reusing intermediate computations when changes between steps are small.
 ### Cache Modes
 | Mode | Target | Description |
 |------|--------|-------------|
 | `ucache` | UNET models | Condition-level caching with error tracking |
 | `easycache` | DiT models | Condition-level cache |
 | `dbcache` | DiT models | Block-level L1 residual threshold |
 | `taylorseer` | DiT models | Taylor series approximation |
 | `cache-dit` | DiT models | Combined DBCache + TaylorSeer |
 ### UCache (UNET Models)
 UCache caches the residual difference (output - input) and reuses it when input changes are below threshold.
 ```bash
 sd-cli -m model.safetensors -p "a cat" --cache-mode ucache --cache-option "threshold=1.5"
 ```
 #### Parameters
 | Parameter | Description | Default |
 |-----------|-------------|---------|
 | `threshold` | Error threshold for reuse decision | 1.0 |
 | `start` | Start caching at this percent of steps | 0.15 |
 | `end` | Stop caching at this percent of steps | 0.95 |
 | `decay` | Error decay rate (0-1) | 1.0 |
 | `relative` | Scale threshold by output norm (0/1) | 1 |
 | `reset` | Reset error after computing (0/1) | 1 |
 #### Reset Parameter
 The `reset` parameter controls error accumulation behavior:
 - `reset=1` (default): Resets accumulated error after each computed step. More aggressive caching, works well with most samplers.
 - `reset=0`: Keeps error accumulated. More conservative, recommended for `euler_a` sampler.
 ### EasyCache (DiT Models)
 Condition-level caching for DiT models. Caches and reuses outputs when input changes are below threshold.
 ```bash
 --cache-mode easycache --cache-option "threshold=0.3"
 ```
 #### Parameters
 | Parameter | Description | Default |
 |-----------|-------------|---------|
 | `threshold` | Input change threshold for reuse | 0.2 |
 | `start` | Start caching at this percent of steps | 0.15 |
 | `end` | Stop caching at this percent of steps | 0.95 |
 ### Cache-DIT (DiT Models)
 For DiT models like FLUX and QWEN, use block-level caching modes.
 #### DBCache
 Caches blocks based on L1 residual difference threshold:
 ```bash
 --cache-mode dbcache --cache-option "threshold=0.25,warmup=4"
 ```
 #### TaylorSeer
 Uses Taylor series approximation to predict block outputs:
 ```bash
 --cache-mode taylorseer
 ```
 #### Cache-DIT (Combined)
 Combines DBCache and TaylorSeer:
 ```bash
 --cache-mode cache-dit --cache-preset fast
 ```
 #### Parameters
 | Parameter | Description | Default |
 |-----------|-------------|---------|
 | `Fn` | Front blocks to always compute | 8 |
 | `Bn` | Back blocks to always compute | 0 |
 | `threshold` | L1 residual difference threshold | 0.08 |
 | `warmup` | Steps before caching starts | 8 |
 #### Presets
 Available presets: `slow`, `medium`, `fast`, `ultra` (or `s`, `m`, `f`, `u`).
 ```bash
 --cache-mode cache-dit --cache-preset fast
 ```
 #### SCM Options
 Steps Computation Mask controls which steps can be cached:
 ```bash
 --scm-mask "1,1,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,1"
 ```
 Mask values: `1` = compute, `0` = can cache.
 | Policy | Description |
 |--------|-------------|
 | `dynamic` | Check threshold before caching |
 | `static` | Always cache on cacheable steps |
 ```bash
 --scm-policy dynamic
 ```
 ### Performance Tips
 - Start with default thresholds and adjust based on output quality
 - Lower threshold = better quality, less speedup
 - Higher threshold = more speedup, potential quality loss
 - More steps generally means more caching opportunities
--- a/docs/chroma.md
+++ b/docs/chroma.md
@ -15,7 +15,7 @@ You can run Chroma using stable-diffusion.cpp with a GPU that has 6GB or even 4G
 You can download the preconverted gguf weights from [silveroxides/Chroma-GGUF](https://huggingface.co/silveroxides/Chroma-GGUF), this way you don't have to do the conversion yourself.
 ```
-.\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\chroma-unlocked-v40.safetensors -o ..\models\chroma-unlocked-v40-q8_0.gguf -v --type q8_0
+.\bin\Release\sd-cli.exe -M convert -m ..\..\ComfyUI\models\unet\chroma-unlocked-v40.safetensors -o ..\models\chroma-unlocked-v40-q8_0.gguf -v --type q8_0
 ```
 ## Run
@ -24,7 +24,7 @@ You can download the preconverted gguf weights from [silveroxides/Chroma-GGUF](h
 For example:
 ```
- .\bin\Release\sd.exe --diffusion-model  ..\models\chroma-unlocked-v40-q8_0.gguf --vae ..\models\ae.sft --t5xxl ..\models\t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'chroma.cpp'" --cfg-scale 4.0 --sampling-method euler -v --chroma-disable-dit-mask --clip-on-cpu
+ .\bin\Release\sd-cli.exe --diffusion-model  ..\models\chroma-unlocked-v40-q8_0.gguf --vae ..\models\ae.sft --t5xxl ..\models\t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'chroma.cpp'" --cfg-scale 4.0 --sampling-method euler -v --chroma-disable-dit-mask --clip-on-cpu
 ```
 ![](../assets/flux/chroma_v40.png)
--- a/docs/chroma_radiance.md
+++ b/docs/chroma_radiance.md
@ -12,7 +12,7 @@
 ## Examples
 ```
-.\bin\Release\sd.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Chroma1-Radiance-v0.4-Q8_0.gguf --t5xxl ..\..\ComfyUI\models\clip\t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'chroma  radiance cpp'" --cfg-scale 4.0 --sampling-method euler -v
+.\bin\Release\sd-cli.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Chroma1-Radiance-v0.4-Q8_0.gguf --t5xxl ..\..\ComfyUI\models\clip\t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'chroma  radiance cpp'" --cfg-scale 4.0 --sampling-method euler -v
 ```
 <img alt="Chroma1-Radiance" src="../assets/flux/chroma1-radiance.png" />
--- a/docs/docker.md
+++ b/docs/docker.md
@ -9,7 +9,7 @@ docker build -t sd .
 ### Run
 ```shell
-docker run -v /path/to/models:/models -v /path/to/output/:/output sd [args...]
+docker run -v /path/to/models:/models -v /path/to/output/:/output sd-cli [args...]
 # For example
-# docker run -v ./models:/models -v ./build:/output sd -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
+# docker run -v ./models:/models -v ./build:/output sd-cli -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
 ```
--- a/docs/esrgan.md
+++ b/docs/esrgan.md
@ -5,5 +5,5 @@ You can use ESRGAN to upscale the generated images. At the moment, only the [Rea
 - Specify the model path using the `--upscale-model PATH` parameter. example:
 ```bash
-sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --upscale-model ../models/RealESRGAN_x4plus_anime_6B.pth
+sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --upscale-model ../models/RealESRGAN_x4plus_anime_6B.pth
 ```
--- a/docs/flux.md
+++ b/docs/flux.md
@ -17,7 +17,7 @@ You can download the preconverted gguf weights from [FLUX.1-dev-gguf](https://hu
 For example:
 ```
-.\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-dev.sft -o ..\models\flux1-dev-q8_0.gguf -v --type q8_0
+.\bin\Release\sd-cli.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-dev.sft -o ..\models\flux1-dev-q8_0.gguf -v --type q8_0
 ```
 ## Run
@ -28,7 +28,7 @@ For example:
 For example:
 ```
- .\bin\Release\sd.exe --diffusion-model  ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
+ .\bin\Release\sd-cli.exe --diffusion-model  ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
 ```
 Using formats of different precisions will yield results of varying quality.
@ -44,7 +44,7 @@ Using formats of different precisions will yield results of varying quality.
 ```
- .\bin\Release\sd.exe --diffusion-model  ..\models\flux1-schnell-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --steps 4 --clip-on-cpu
+ .\bin\Release\sd-cli.exe --diffusion-model  ..\models\flux1-schnell-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --steps 4 --clip-on-cpu
 ```
 | q8_0  |
@ -60,7 +60,7 @@ Since many flux LoRA training libraries have used various LoRA naming formats, i
 - LoRA model from https://huggingface.co/XLabs-AI/flux-lora-collection/tree/main (using comfy converted version!!!)
 ```
-.\bin\Release\sd.exe --diffusion-model  ..\models\flux1-dev-q8_0.gguf --vae ...\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'flux.cpp'<lora:realism_lora_comfy_converted:1>" --cfg-scale 1.0 --sampling-method euler -v --lora-model-dir ../models --clip-on-cpu
+.\bin\Release\sd-cli.exe --diffusion-model  ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'flux.cpp'<lora:realism_lora_comfy_converted:1>" --cfg-scale 1.0 --sampling-method euler -v --lora-model-dir ../models --clip-on-cpu
 ```
 ![output](../assets/flux/flux1-dev-q8_0%20with%20lora.png)
--- a/docs/flux2.md
+++ b/docs/flux2.md
@ -12,7 +12,7 @@
 ## Examples
 ```
-.\bin\Release\sd.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\flux2-dev-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\Mistral-Small-3.2-24B-Instruct-2506-Q4_K_M.gguf -r .\kontext_input.png -p "change 'flux.cpp' to 'flux2-dev.cpp'" --cfg-scale 1.0 --sampling-method euler -v --diffusion-fa --offload-to-cpu
+.\bin\Release\sd-cli.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\flux2-dev-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\Mistral-Small-3.2-24B-Instruct-2506-Q4_K_M.gguf -r .\kontext_input.png -p "change 'flux.cpp' to 'flux2-dev.cpp'" --cfg-scale 1.0 --sampling-method euler -v --diffusion-fa --offload-to-cpu
 ```
 <img alt="flux2 example" src="../assets/flux2/example.png" />
--- a/docs/hipBLAS_on_Windows.md
+++ b/docs/hipBLAS_on_Windows.md
@ -82,4 +82,4 @@ cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_H
 cmake --build . --config Release
 ```
-If everything went OK, `build\bin\sd.exe` file should appear.
+If everything went OK, `build\bin\sd-cli.exe` file should appear.
--- a/docs/kontext.md
+++ b/docs/kontext.md
@ -16,7 +16,7 @@ You can run Kontext using stable-diffusion.cpp with a GPU that has 6GB or even 4
 You can download the preconverted gguf weights from [FLUX.1-Kontext-dev-GGUF](https://huggingface.co/QuantStack/FLUX.1-Kontext-dev-GGUF), this way you don't have to do the conversion yourself.
 ```
-.\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-kontext-dev.safetensors -o ..\models\flux1-kontext-dev-q8_0.gguf -v --type q8_0
+.\bin\Release\sd-cli.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-kontext-dev.safetensors -o ..\models\flux1-kontext-dev-q8_0.gguf -v --type q8_0
 ```
 ## Run
@ -27,7 +27,7 @@ You can download the preconverted gguf weights from [FLUX.1-Kontext-dev-GGUF](ht
 For example:
 ```
- .\bin\Release\sd.exe -r .\flux1-dev-q8_0.png --diffusion-model  ..\models\flux1-kontext-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "change 'flux.cpp' to 'kontext.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
+ .\bin\Release\sd-cli.exe -r .\flux1-dev-q8_0.png --diffusion-model  ..\models\flux1-kontext-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "change 'flux.cpp' to 'kontext.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
 ```
--- a/docs/lcm.md
+++ b/docs/lcm.md
@ -7,7 +7,7 @@
 Here's a simple example:
 ```
-./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:lcm-lora-sdv1-5:1>" --steps 4 --lora-model-dir ../models -v --cfg-scale 1
+./bin/sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:lcm-lora-sdv1-5:1>" --steps 4 --lora-model-dir ../models -v --cfg-scale 1
 ```
 | without LCM-LoRA (--cfg-scale 7)  | with LCM-LoRA (--cfg-scale 1)  |
--- a/docs/lora.md
+++ b/docs/lora.md
@ -7,7 +7,7 @@
 Here's a simple example:
 ```
-./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:marblesh:1>" --lora-model-dir ../models
+./bin/sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:marblesh:1>" --lora-model-dir ../models
 ```
 `../models/marblesh.safetensors` or `../models/marblesh.ckpt` will be applied to the model
--- a/docs/ovis_image.md
+++ b/docs/ovis_image.md
@ -13,7 +13,7 @@
 ## Examples
 ```
-.\bin\Release\sd.exe --diffusion-model  ovis_image-Q4_0.gguf --vae ..\..\ComfyUI\models\vae\ae.sft  --llm ..\..\ComfyUI\models\text_encoders\ovis_2.5.safetensors -p "a lovely cat" --cfg-scale 5.0 -v --offload-to-cpu --diffusion-fa
+.\bin\Release\sd-cli.exe --diffusion-model  ovis_image-Q4_0.gguf --vae ..\..\ComfyUI\models\vae\ae.sft  --llm ..\..\ComfyUI\models\text_encoders\ovis_2.5.safetensors -p "a lovely cat" --cfg-scale 5.0 -v --offload-to-cpu --diffusion-fa
 ```
 <img alt="ovis image example" src="../assets/ovis_image/example.png" />
--- a/docs/photo_maker.md
+++ b/docs/photo_maker.md
@ -27,7 +27,7 @@ If on low memory GPUs (<= 8GB), recommend running with ```--vae-on-cpu``` option
 Example:
 ```bash
-bin/sd -m ../models/sdxlUnstableDiffusers_v11.safetensors  --vae ../models/sdxl_vae.safetensors --photo-maker ../models/photomaker-v1.safetensors --pm-id-images-dir ../assets/photomaker_examples/scarletthead_woman -p "a girl img, retro futurism, retro game art style but extremely beautiful, intricate details, masterpiece, best quality, space-themed, cosmic, celestial, stars, galaxies, nebulas, planets, science fiction, highly detailed" -n "realistic, photo-realistic, worst quality, greyscale, bad anatomy, bad hands, error, text" --cfg-scale 5.0  --sampling-method euler -H 1024 -W 1024 --pm-style-strength 10 --vae-on-cpu --steps 50
+bin/sd-cli -m ../models/sdxlUnstableDiffusers_v11.safetensors  --vae ../models/sdxl_vae.safetensors --photo-maker ../models/photomaker-v1.safetensors --pm-id-images-dir ../assets/photomaker_examples/scarletthead_woman -p "a girl img, retro futurism, retro game art style but extremely beautiful, intricate details, masterpiece, best quality, space-themed, cosmic, celestial, stars, galaxies, nebulas, planets, science fiction, highly detailed" -n "realistic, photo-realistic, worst quality, greyscale, bad anatomy, bad hands, error, text" --cfg-scale 5.0  --sampling-method euler -H 1024 -W 1024 --pm-style-strength 10 --vae-on-cpu --steps 50
 ```
 ## PhotoMaker Version 2
--- a/docs/quantization_and_gguf.md
+++ b/docs/quantization_and_gguf.md
@ -23,5 +23,5 @@ You can also convert weights in the formats `ckpt/safetensors/diffusers` to gguf
 For example:
 ```sh
-./bin/sd -M convert -m ../models/v1-5-pruned-emaonly.safetensors -o  ../models/v1-5-pruned-emaonly.q8_0.gguf -v --type q8_0
+./bin/sd-cli -M convert -m ../models/v1-5-pruned-emaonly.safetensors -o  ../models/v1-5-pruned-emaonly.q8_0.gguf -v --type q8_0
 ```
--- a/docs/qwen_image.md
+++ b/docs/qwen_image.md
@ -14,7 +14,7 @@
 ## Examples
 ```
-.\bin\Release\sd.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\qwen-image-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf  -p '一个穿着"QWEN"标志的T恤的中国美女正拿着黑色的马克笔面相镜头微笑。她身后的玻璃板上手写体写着 “一、Qwen-Image的技术路线： 探索视觉生成基础模型的极限，开创理解与生成一体化的未来。二、Qwen-Image的模型特色：1、复杂文字渲染。支持中英渲染、自动布局； 2、精准图像编辑。支持文字编辑、物体增减、风格变换。三、Qwen-Image的未来愿景：赋能专业内容创作、助力生成式AI发展。”' --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu -H 1024 -W 1024 --diffusion-fa --flow-shift 3
+.\bin\Release\sd-cli.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\qwen-image-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf  -p '一个穿着"QWEN"标志的T恤的中国美女正拿着黑色的马克笔面相镜头微笑。她身后的玻璃板上手写体写着 “一、Qwen-Image的技术路线： 探索视觉生成基础模型的极限，开创理解与生成一体化的未来。二、Qwen-Image的模型特色：1、复杂文字渲染。支持中英渲染、自动布局； 2、精准图像编辑。支持文字编辑、物体增减、风格变换。三、Qwen-Image的未来愿景：赋能专业内容创作、助力生成式AI发展。”' --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu -H 1024 -W 1024 --diffusion-fa --flow-shift 3
 ```
 <img alt="qwen example" src="../assets/qwen/example.png" />
--- a/docs/qwen_image_edit.md
+++ b/docs/qwen_image_edit.md
@ -9,6 +9,9 @@
    - Qwen Image Edit 2509
        - safetensors: https://huggingface.co/Comfy-Org/Qwen-Image-Edit_ComfyUI/tree/main/split_files/diffusion_models
        - gguf: https://huggingface.co/QuantStack/Qwen-Image-Edit-2509-GGUF/tree/main
    - Qwen Image Edit 2511
        - safetensors: https://huggingface.co/Comfy-Org/Qwen-Image-Edit_ComfyUI/tree/main/split_files/diffusion_models
        - gguf: https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/tree/main
 - Download vae
    - safetensors: https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/tree/main/split_files/vae
 - Download qwen_2.5_vl 7b
@ -20,7 +23,7 @@
 ### Qwen Image Edit
 ```
-.\bin\Release\sd.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Qwen_Image_Edit-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'" --seed 1118877715456453
+.\bin\Release\sd-cli.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Qwen_Image_Edit-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'" --seed 1118877715456453
 ```
 <img alt="qwen_image_edit" src="../assets/qwen/qwen_image_edit.png" />
@ -29,7 +32,17 @@
 ### Qwen Image Edit 2509
 ```
-.\bin\Release\sd.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Qwen-Image-Edit-2509-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf --llm_vision ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct.mmproj-Q8_0.gguf --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'Qwen Image Edit 2509'"
+.\bin\Release\sd-cli.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Qwen-Image-Edit-2509-Q4_K_S.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct-Q8_0.gguf --llm_vision ..\..\ComfyUI\models\text_encoders\Qwen2.5-VL-7B-Instruct.mmproj-Q8_0.gguf --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'Qwen Image Edit 2509'"
 ```
-<img alt="qwen_image_edit_2509" src="../assets/qwen/qwen_image_edit_2509.png" />
+<img alt="qwen_image_edit_2509" src="../assets/qwen/qwen_image_edit_2509.png" />
 ### Qwen Image Edit 2511
 To use the new Qwen Image Edit 2511 model, the `--qwen-image-zero-cond-t` flag must be enabled; otherwise, image editing quality will degrade significantly.
 ```
 .\bin\Release\sd-cli.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\qwen-image-edit-2511-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\qwen_image_vae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\qwen_2.5_vl_7b.safetensors --cfg-scale 2.5 --sampling-method euler -v --offload-to-cpu --diffusion-fa --flow-shift 3 -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'edit.cpp'"  --qwen-image-zero-cond-t
 ```
 <img alt="qwen_image_edit_2511" src="../assets/qwen/qwen_image_edit_2511.png" />
--- a/docs/sd.md
+++ b/docs/sd.md
@ -9,12 +9,12 @@
 ### txt2img example
 ```sh
-./bin/sd -m ../models/sd-v1-4.ckpt -p "a lovely cat"
+./bin/sd-cli -m ../models/sd-v1-4.ckpt -p "a lovely cat"
-# ./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
+# ./bin/sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
-# ./bin/sd -m ../models/sd_xl_base_1.0.safetensors --vae ../models/sdxl_vae-fp16-fix.safetensors -H 1024 -W 1024 -p "a lovely cat" -v
+# ./bin/sd-cli -m ../models/sd_xl_base_1.0.safetensors --vae ../models/sdxl_vae-fp16-fix.safetensors -H 1024 -W 1024 -p "a lovely cat" -v
-# ./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable Diffusion CPP\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
+# ./bin/sd-cli -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable Diffusion CPP\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
-# ./bin/sd --diffusion-model  ../models/flux1-dev-q3_k.gguf --vae ../models/ae.sft --clip_l ../models/clip_l.safetensors --t5xxl ../models/t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
+# ./bin/sd-cli --diffusion-model  ../models/flux1-dev-q3_k.gguf --vae ../models/ae.sft --clip_l ../models/clip_l.safetensors --t5xxl ../models/t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
-# ./bin/sd -m  ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors  -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
+# ./bin/sd-cli -m  ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors  -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
 ```
 Using formats of different precisions will yield results of varying quality.
@ -29,7 +29,7 @@ Using formats of different precisions will yield results of varying quality.
 ```
-./bin/sd -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
+./bin/sd-cli -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
 ```
 <p align="center">
--- a/docs/sd3.md
+++ b/docs/sd3.md
@ -14,7 +14,7 @@
 For example:
 ```
-.\bin\Release\sd.exe -m  ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors  -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
+.\bin\Release\sd-cli.exe -m  ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors  -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
 ```
 ![](../assets/sd3.5_large.png)
--- a/docs/taesd.md
+++ b/docs/taesd.md
@ -13,5 +13,27 @@ curl -L -O https://huggingface.co/madebyollin/taesd/resolve/main/diffusion_pytor
 - Specify the model path using the `--taesd PATH` parameter. example:
 ```bash
-sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --taesd ../models/diffusion_pytorch_model.safetensors
+sd-cli -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --taesd ../models/diffusion_pytorch_model.safetensors
-```
+```
 ### Qwen-Image and wan (TAEHV)
 sd.cpp also supports [TAEHV](https://github.com/madebyollin/taehv) (#937), which can be used for Qwen-Image and wan.
 - For **Qwen-Image and wan2.1 and wan2.2-A14B**, download the wan2.1 tae [safetensors weights](https://github.com/madebyollin/taehv/blob/main/safetensors/taew2_1.safetensors)
  Or curl
  ```bash
  curl -L -O https://github.com/madebyollin/taehv/raw/refs/heads/main/safetensors/taew2_1.safetensors
  ```
 - For **wan2.2-TI2V-5B**, use the wan2.2 tae [safetensors weights](https://github.com/madebyollin/taehv/blob/main/safetensors/taew2_2.safetensors)
  Or curl
  ```bash
  curl -L -O https://github.com/madebyollin/taehv/raw/refs/heads/main/safetensors/taew2_2.safetensors
  ```
 Then simply replace `--vae xxx.safetensors` with `--tae xxx.safetensors` in your commands. If it still runs out of VRAM, add `--vae-conv-direct` to your command, though it might be slower.
--- a/docs/wan.md
+++ b/docs/wan.md
@ -39,6 +39,9 @@
        - safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/vae/wan_2.1_vae.safetensors
    - wan_2.2_vae (for Wan2.2 TI2V 5B only)
        - safetensors: https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/blob/main/split_files/vae/wan2.2_vae.safetensors
    > The Wan VAE requires a lot of VRAM! If you do not have enough VRAM, please try tae instead, though the results may be poorer. For tae usage, please refer to [taesd](taesd.md).
 - Download umt5_xxl
    - safetensors: https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/blob/main/split_files/text_encoders/umt5_xxl_fp16.safetensors
    - gguf: https://huggingface.co/city96/umt5-xxl-encoder-gguf/tree/main
@ -52,7 +55,7 @@
 ### Wan2.1 T2V 1.3B
 ```
-.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\wan2.1_t2v_1.3B_fp16.safetensors --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部， 畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa --video-frames 33 --flow-shift 3.0
+.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\wan2.1_t2v_1.3B_fp16.safetensors --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部， 畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa --video-frames 33 --flow-shift 3.0
 ```
 <video src=../assets/wan/Wan2.1_1.3B_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
@ -60,7 +63,7 @@
 ### Wan2.1 T2V 14B
 ```
-.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\wan2.1-t2v-14b-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa  --offload-to-cpu --video-frames 33 --flow-shift 3.0
+.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\wan2.1-t2v-14b-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa  --offload-to-cpu --video-frames 33 --flow-shift 3.0
 ```
 <video src=../assets/wan/Wan2.1_14B_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
@ -70,7 +73,7 @@
 ### Wan2.1 I2V 14B
 ```
-.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\wan2.1-i2v-14b-480p-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf --clip_vision ..\..\ComfyUI\models\clip_vision\clip_vision_h.safetensors -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 480 -H 832 --diffusion-fa --video-frames 33 --offload-to-cpu -i ..\assets\cat_with_sd_cpp_42.png --flow-shift 3.0
+.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\wan2.1-i2v-14b-480p-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf --clip_vision ..\..\ComfyUI\models\clip_vision\clip_vision_h.safetensors -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 480 -H 832 --diffusion-fa --video-frames 33 --offload-to-cpu -i ..\assets\cat_with_sd_cpp_42.png --flow-shift 3.0
 ```
 <video src=../assets/wan/Wan2.1_14B_i2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
@ -78,7 +81,7 @@
 ### Wan2.2 T2V A14B
 ```
-.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --video-frames 33 --flow-shift 3.0
+.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --video-frames 33 --flow-shift 3.0
 ```
 <video src=../assets/wan/Wan2.2_14B_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
@ -86,7 +89,7 @@
 ### Wan2.2 I2V A14B
 ```
-.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --video-frames 33 --offload-to-cpu -i ..\assets\cat_with_sd_cpp_42.png --flow-shift 3.0
+.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --video-frames 33 --offload-to-cpu -i ..\assets\cat_with_sd_cpp_42.png --flow-shift 3.0
 ```
 <video src=../assets/wan/Wan2.2_14B_i2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
@ -94,7 +97,7 @@
 ### Wan2.2 T2V A14B T2I
 ```
-.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --flow-shift 3.0
+.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --flow-shift 3.0
 ```
 <img width="832" height="480" alt="Wan2 2_14B_t2i" src="../assets/wan/Wan2.2_14B_t2i.png" />
@ -102,7 +105,7 @@
 ### Wan2.2 T2V 14B with Lora
 ```
-.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat<lora:wan2.2_t2v_lightx2v_4steps_lora_v1.1_low_noise:1><lora:|high_noise|wan2.2_t2v_lightx2v_4steps_lora_v1.1_high_noise:1>" --cfg-scale 3.5 --sampling-method euler --steps 4 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 4 -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --lora-model-dir ..\..\ComfyUI\models\loras --video-frames 33 --flow-shift 3.0
+.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-T2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat<lora:wan2.2_t2v_lightx2v_4steps_lora_v1.1_low_noise:1><lora:|high_noise|wan2.2_t2v_lightx2v_4steps_lora_v1.1_high_noise:1>" --cfg-scale 3.5 --sampling-method euler --steps 4 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 4 -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa --offload-to-cpu --lora-model-dir ..\..\ComfyUI\models\loras --video-frames 33 --flow-shift 3.0
 ```
 <video src=../assets/wan/Wan2.2_14B_t2v_lora.mp4 controls="controls" muted="muted" type="video/mp4"></video>
@ -114,7 +117,7 @@
 #### T2V
 ```
-.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\wan2.2_ti2v_5B_fp16.safetensors --vae ..\..\ComfyUI\models\vae\wan2.2_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 480 -H 832 --diffusion-fa --offload-to-cpu --video-frames 33 --flow-shift 3.0
+.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\wan2.2_ti2v_5B_fp16.safetensors --vae ..\..\ComfyUI\models\vae\wan2.2_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 480 -H 832 --diffusion-fa --offload-to-cpu --video-frames 33 --flow-shift 3.0
 ```
 <video src=../assets/wan/Wan2.2_5B_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
@ -122,7 +125,7 @@
 #### I2V
 ```
-.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\wan2.2_ti2v_5B_fp16.safetensors --vae ..\..\ComfyUI\models\vae\wan2.2_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 480 -H 832 --diffusion-fa --offload-to-cpu --video-frames 33 -i ..\assets\cat_with_sd_cpp_42.png --flow-shift 3.0
+.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\wan2.2_ti2v_5B_fp16.safetensors --vae ..\..\ComfyUI\models\vae\wan2.2_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 480 -H 832 --diffusion-fa --offload-to-cpu --video-frames 33 -i ..\assets\cat_with_sd_cpp_42.png --flow-shift 3.0
 ```
 <video src=../assets/wan/Wan2.2_5B_i2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
@ -130,7 +133,7 @@
 ### Wan2.1 FLF2V 14B
 ```
-.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\wan2.1-flf2v-14b-720p-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf --clip_vision ..\..\ComfyUI\models\clip_vision\clip_vision_h.safetensors -p "glass flower blossom" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 480 -H 832 --diffusion-fa --video-frames 33 --offload-to-cpu --init-img ..\..\ComfyUI\input\start_image.png --end-img ..\..\ComfyUI\input\end_image.png --flow-shift 3.0
+.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\wan2.1-flf2v-14b-720p-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf --clip_vision ..\..\ComfyUI\models\clip_vision\clip_vision_h.safetensors -p "glass flower blossom" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 480 -H 832 --diffusion-fa --video-frames 33 --offload-to-cpu --init-img ..\..\ComfyUI\input\start_image.png --end-img ..\..\ComfyUI\input\end_image.png --flow-shift 3.0
 ```
@ -139,7 +142,7 @@
 ### Wan2.2 FLF2V 14B
 ```
-.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -p "glass flower blossom" -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 480 -H 832 --diffusion-fa --video-frames 33 --offload-to-cpu --init-img ..\..\ComfyUI\input\start_image.png --end-img ..\..\ComfyUI\input\end_image.png --flow-shift 3.0
+.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-LowNoise-Q8_0.gguf --high-noise-diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.2-I2V-A14B-HighNoise-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf --cfg-scale 3.5 --sampling-method euler --steps 10 --high-noise-cfg-scale 3.5 --high-noise-sampling-method euler --high-noise-steps 8 -v -p "glass flower blossom" -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 480 -H 832 --diffusion-fa --video-frames 33 --offload-to-cpu --init-img ..\..\ComfyUI\input\start_image.png --end-img ..\..\ComfyUI\input\end_image.png --flow-shift 3.0
 ```
 <video src=../assets/wan/Wan2.2_14B_flf2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
@ -149,7 +152,7 @@
 #### T2V
 ```
-.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部， 畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa --video-frames 1 --offload-to-cpu
+.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部， 畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa --video-frames 1 --offload-to-cpu
 ```
 <video src=../assets/wan/Wan2.1_1.3B_vace_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
@ -158,7 +161,7 @@
 #### R2V
 ```
-.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部， 畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa -i ..\assets\cat_with_sd_cpp_42.png --video-frames 33 --offload-to-cpu
+.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部， 畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa -i ..\assets\cat_with_sd_cpp_42.png --video-frames 33 --offload-to-cpu
 ```
 <video src=../assets/wan/Wan2.1_1.3B_vace_r2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
@ -169,7 +172,7 @@
 ```
 mkdir post+depth
 ffmpeg -i ..\..\ComfyUI\input\post+depth.mp4 -qscale:v 1 -vf fps=8 post+depth\frame_%04d.jpg
-.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "The girl is dancing in a sea of flowers, slowly moving her hands. There is a close - up shot of her upper body. The character is surrounded by other transparent glass flowers in the style of Nicoletta Ceccoli, creating a beautiful, surreal, and emotionally expressive movie scene with a white. transparent feel and a dreamyl atmosphere." --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部， 畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 480 -H 832 --diffusion-fa -i ..\..\ComfyUI\input\dance_girl.jpg --control-video ./post+depth --video-frames 33 --offload-to-cpu
+.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\wan2.1-vace-1.3b-q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "The girl is dancing in a sea of flowers, slowly moving her hands. There is a close - up shot of her upper body. The character is surrounded by other transparent glass flowers in the style of Nicoletta Ceccoli, creating a beautiful, surreal, and emotionally expressive movie scene with a white. transparent feel and a dreamyl atmosphere." --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部， 畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 480 -H 832 --diffusion-fa -i ..\..\ComfyUI\input\dance_girl.jpg --control-video ./post+depth --video-frames 33 --offload-to-cpu
 ```
 <video src=../assets/wan/Wan2.1_1.3B_vace_v2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
@ -179,7 +182,7 @@ ffmpeg -i ..\..\ComfyUI\input\post+depth.mp4 -qscale:v 1 -vf fps=8 post+depth\fr
 #### T2V
 ```
-.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部， 畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa --video-frames 33 --offload-to-cpu
+.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部， 畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa --video-frames 33 --offload-to-cpu
 ```
 <video src=../assets/wan/Wan2.1_14B_vace_t2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
@ -188,7 +191,7 @@ ffmpeg -i ..\..\ComfyUI\input\post+depth.mp4 -qscale:v 1 -vf fps=8 post+depth\fr
 #### R2V
 ```
-.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部， 畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa -i ..\assets\cat_with_sd_cpp_42.png --video-frames 33 --offload-to-cpu
+.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "a lovely cat" --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部， 畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 832 -H 480 --diffusion-fa -i ..\assets\cat_with_sd_cpp_42.png --video-frames 33 --offload-to-cpu
 ```
 <video src=../assets/wan/Wan2.1_14B_vace_r2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
@ -198,7 +201,7 @@ ffmpeg -i ..\..\ComfyUI\input\post+depth.mp4 -qscale:v 1 -vf fps=8 post+depth\fr
 #### V2V
 ```
-.\bin\Release\sd.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "The girl is dancing in a sea of flowers, slowly moving her hands. There is a close - up shot of her upper body. The character is surrounded by other transparent glass flowers in the style of Nicoletta Ceccoli, creating a beautiful, surreal, and emotionally expressive movie scene with a white. transparent feel and a dreamyl atmosphere." --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部， 畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 480 -H 832 --diffusion-fa -i ..\..\ComfyUI\input\dance_girl.jpg --control-video ./post+depth --video-frames 33 --offload-to-cpu
+.\bin\Release\sd-cli.exe -M vid_gen --diffusion-model  ..\..\ComfyUI\models\diffusion_models\Wan2.1_14B_VACE-Q8_0.gguf --vae ..\..\ComfyUI\models\vae\wan_2.1_vae.safetensors --t5xxl ..\..\ComfyUI\models\text_encoders\umt5-xxl-encoder-Q8_0.gguf  -p "The girl is dancing in a sea of flowers, slowly moving her hands. There is a close - up shot of her upper body. The character is surrounded by other transparent glass flowers in the style of Nicoletta Ceccoli, creating a beautiful, surreal, and emotionally expressive movie scene with a white. transparent feel and a dreamyl atmosphere." --cfg-scale 6.0 --sampling-method euler -v -n "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部， 畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" -W 480 -H 832 --diffusion-fa -i ..\..\ComfyUI\input\dance_girl.jpg --control-video ./post+depth --video-frames 33 --offload-to-cpu
 ```
 <video src=../assets/wan/Wan2.1_14B_vace_v2v.mp4 controls="controls" muted="muted" type="video/mp4"></video>
--- a/docs/z_image.md
+++ b/docs/z_image.md
@ -16,7 +16,7 @@ You can run Z-Image with stable-diffusion.cpp on GPUs with 4GB of VRAM — or ev
 ## Examples
 ```
-.\bin\Release\sd.exe --diffusion-model  z_image_turbo-Q3_K.gguf --vae ..\..\ComfyUI\models\vae\ae.sft  --llm ..\..\ComfyUI\models\text_encoders\Qwen3-4B-Instruct-2507-Q4_K_M.gguf -p "A cinematic, melancholic photograph of a solitary hooded figure walking through a sprawling, rain-slicked metropolis at night. The city lights are a chaotic blur of neon orange and cool blue, reflecting on the wet asphalt. The scene evokes a sense of being a single component in a vast machine. Superimposed over the image in a sleek, modern, slightly glitched font is the philosophical quote: 'THE CITY IS A CIRCUIT BOARD, AND I AM A BROKEN TRANSISTOR.' -- moody, atmospheric, profound, dark academic" --cfg-scale 1.0 -v --offload-to-cpu --diffusion-fa -H 1024 -W 512
+.\bin\Release\sd-cli.exe --diffusion-model  z_image_turbo-Q3_K.gguf --vae ..\..\ComfyUI\models\vae\ae.sft  --llm ..\..\ComfyUI\models\text_encoders\Qwen3-4B-Instruct-2507-Q4_K_M.gguf -p "A cinematic, melancholic photograph of a solitary hooded figure walking through a sprawling, rain-slicked metropolis at night. The city lights are a chaotic blur of neon orange and cool blue, reflecting on the wet asphalt. The scene evokes a sense of being a single component in a vast machine. Superimposed over the image in a sleek, modern, slightly glitched font is the philosophical quote: 'THE CITY IS A CIRCUIT BOARD, AND I AM A BROKEN TRANSISTOR.' -- moody, atmospheric, profound, dark academic" --cfg-scale 1.0 -v --offload-to-cpu --diffusion-fa -H 1024 -W 512
 ```
 <img width="256" alt="z-image example" src="../assets/z_image/q3_K.png" />
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -1,3 +1,4 @@
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-add_subdirectory(cli)
+add_subdirectory(cli)
 add_subdirectory(server)
--- a/examples/cli/CMakeLists.txt
+++ b/examples/cli/CMakeLists.txt
@ -1,4 +1,4 @@
-set(TARGET sd)
+set(TARGET sd-cli)
 add_executable(${TARGET} main.cpp)
 install(TARGETS ${TARGET} RUNTIME)
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@ -1,14 +1,16 @@
 # Run
 ```
-usage: ./bin/sd  [options]
+usage: ./bin/sd-cli  [options]
 CLI Options:
-  -o, --output <string>       path to write result image to (default: ./output.png)
+  -o, --output <string>       path to write result image to. you can use printf-style %d format specifiers for image sequences (default: ./output.png) (eg. output_%03d.png)
  --output-begin-idx <int>    starting index for output image sequence, must be non-negative (default 0 if specified %d in output path, 1 otherwise)
  --preview-path <string>     path to write preview image to (default: ./preview.png)
  --preview-interval <int>    interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at
                              every step)
  --canny                     apply canny preprocessor (edge detection)
  --convert-name              convert tensor name (for convert mode)
  -v, --verbose               print extra info
  --color                     colors the logging tags according to level
  --taesd-preview-only        prevents usage of taesd for decoding the final image. (for use with --preview tae)
@ -31,6 +33,7 @@ Context Options:
  --high-noise-diffusion-model <string>    path to the standalone high noise diffusion model
  --vae <string>                           path to standalone vae model
  --taesd <string>                         path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
  --tae <string>                           alias of --taesd
  --control-net <string>                   path to control net model
  --embd-dir <string>                      embeddings directory
  --lora-model-dir <string>                lora model directory
@ -45,12 +48,16 @@ Context Options:
  --vae-tiling                             process vae in tiles to reduce memory usage
  --force-sdxl-vae-conv-scale              force use of conv scale on sdxl vae
  --offload-to-cpu                         place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
  --mmap                                   whether to memory-map model
  --control-net-cpu                        keep controlnet in cpu (for low vram)
  --clip-on-cpu                            keep clip in cpu (for low vram)
  --vae-on-cpu                             keep vae in cpu (for low vram)
  --diffusion-fa                           use flash attention in the diffusion model
  --diffusion-conv-direct                  use ggml_conv2d_direct in the diffusion model
  --vae-conv-direct                        use ggml_conv2d_direct in the vae model
  --circular                               enable circular padding for convolutions
  --circularx                              enable circular RoPE wrapping on x-axis (width) only
  --circulary                              enable circular RoPE wrapping on y-axis (height) only
  --chroma-disable-dit-mask                disable dit mask for chroma
  --chroma-enable-t5-mask                  enable t5 mask for chroma
  --type                                   weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
@ -92,6 +99,7 @@ Generation Options:
  --timestep-shift <int>                   shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for
                                           NitroSD-Vibrant
  --upscale-repeats <int>                  Run the ESRGAN upscaler this many times (default: 1)
  --upscale-tile-size <int>                tile size for ESRGAN upscaling (default: 128)
  --cfg-scale <float>                      unconditional guidance scale: (default: 7.0)
  --img-cfg-scale <float>                  image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
  --guidance <float>                       distilled guidance scale for models with guidance input (default: 3.5)
@ -119,10 +127,17 @@ Generation Options:
                                           tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise)
  --high-noise-sampling-method             (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
                                           ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise
-  --scheduler                              denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, lcm],
+  --scheduler                              denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple,
-                                           default: discrete
+                                           kl_optimal, lcm], default: discrete
  --sigmas                                 custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0").
  --skip-layers                            layers to skip for SLG steps (default: [7,8,9])
  --high-noise-skip-layers                 (high noise) layers to skip for SLG steps (default: [7,8,9])
  -r, --ref-image                          reference image for Flux Kontext models (can be used multiple times)
-  --easycache                              enable EasyCache for DiT models with optional "threshold,start_percent,end_percent" (default: 0.2,0.15,0.95)
+  --cache-mode                             caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level)
  --cache-option                           named cache params (key=value format, comma-separated). easycache/ucache:
                                           threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples:
                                           "threshold=0.25" or "threshold=1.5,reset=0"
  --cache-preset                           cache-dit preset: 'slow'/'s', 'medium'/'m', 'fast'/'f', 'ultra'/'u'
  --scm-mask                               SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
  --scm-policy                             SCM policy: 'dynamic' (default) or 'static'
 ```
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
--- a/examples/common/common.hpp
+++ b/examples/common/common.hpp
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@ -0,0 +1,6 @@
 # Build the HTTP server example from main.cpp as an executable named `sd-server`.
 set(TARGET sd-server)
 add_executable(${TARGET} main.cpp)
 # Install the executable as a RUNTIME artifact.
 install(TARGETS ${TARGET} RUNTIME)
 # Link against `stable-diffusion` and whatever ${CMAKE_THREAD_LIBS_INIT} expands to.
 # NOTE(review): CMAKE_THREAD_LIBS_INIT is presumably populated by a find_package(Threads)
 # call elsewhere in the project - confirm.
 target_link_libraries(${TARGET} PRIVATE stable-diffusion ${CMAKE_THREAD_LIBS_INIT})
 # Require at least C11 / C++17 for this target.
 target_compile_features(${TARGET} PUBLIC c_std_11 cxx_std_17)
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -0,0 +1,135 @@
 # Run
 ```
 usage: ./bin/sd-server  [options]
 Svr Options:
  -l, --listen-ip <string>    server listen ip (default: 127.0.0.1)
  --listen-port <int>         server listen port (default: 1234)
  --serve-html-path <string>  path to HTML file to serve at root (optional)
  -v, --verbose               print extra info
  --color                     colors the logging tags according to level
  -h, --help                  show this help message and exit
 Context Options:
  -m, --model <string>                     path to full model
  --clip_l <string>                        path to the clip-l text encoder
  --clip_g <string>                        path to the clip-g text encoder
  --clip_vision <string>                   path to the clip-vision encoder
  --t5xxl <string>                         path to the t5xxl text encoder
  --llm <string>                           path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)
  --llm_vision <string>                    path to the llm vit
  --qwen2vl <string>                       alias of --llm. Deprecated.
  --qwen2vl_vision <string>                alias of --llm_vision. Deprecated.
  --diffusion-model <string>               path to the standalone diffusion model
  --high-noise-diffusion-model <string>    path to the standalone high noise diffusion model
  --vae <string>                           path to standalone vae model
  --taesd <string>                         path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
  --tae <string>                           alias of --taesd
  --control-net <string>                   path to control net model
  --embd-dir <string>                      embeddings directory
  --lora-model-dir <string>                lora model directory
  --tensor-type-rules <string>             weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
  --photo-maker <string>                   path to PHOTOMAKER model
  --upscale-model <string>                 path to esrgan model.
  -t, --threads <int>                      number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
                                           CPU physical cores
  --chroma-t5-mask-pad <int>               t5 mask pad size of chroma
  --vae-tile-overlap <float>               tile overlap for vae tiling, in fraction of tile size (default: 0.5)
  --flow-shift <float>                     shift value for Flow models like SD3.x or WAN (default: auto)
  --vae-tiling                             process vae in tiles to reduce memory usage
  --force-sdxl-vae-conv-scale              force use of conv scale on sdxl vae
  --offload-to-cpu                         place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
  --control-net-cpu                        keep controlnet in cpu (for low vram)
  --clip-on-cpu                            keep clip in cpu (for low vram)
  --vae-on-cpu                             keep vae in cpu (for low vram)
  --mmap                                   whether to memory-map model
  --diffusion-fa                           use flash attention in the diffusion model
  --diffusion-conv-direct                  use ggml_conv2d_direct in the diffusion model
  --vae-conv-direct                        use ggml_conv2d_direct in the vae model
  --circular                               enable circular padding for convolutions
  --circularx                              enable circular RoPE wrapping on x-axis (width) only
  --circulary                              enable circular RoPE wrapping on y-axis (height) only
  --chroma-disable-dit-mask                disable dit mask for chroma
  --chroma-enable-t5-mask                  enable t5 mask for chroma
  --type                                   weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
                                           type of the weight file
  --rng                                    RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
  --sampler-rng                            sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng
  --prediction                             prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]
  --lora-apply-mode                        the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights
                                           contain any quantized parameters, the at_runtime mode will be used; otherwise,
                                           immediately will be used. The immediately mode may have precision and
                                           compatibility issues with quantized parameters, but it usually offers faster inference
                                           speed and, in some cases, lower memory usage. The at_runtime mode, on the
                                           other hand, is exactly the opposite.
  --vae-tile-size                          tile size for vae tiling, format [X]x[Y] (default: 32x32)
  --vae-relative-tile-size                 relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
                                           (overrides --vae-tile-size)
 Default Generation Options:
  -p, --prompt <string>                    the prompt to render
  -n, --negative-prompt <string>           the negative prompt (default: "")
  -i, --init-img <string>                  path to the init image
  --end-img <string>                       path to the end image, required by flf2v
  --mask <string>                          path to the mask image
  --control-image <string>                 path to control image, control net
  --control-video <string>                 path to control video frames, It must be a directory path. The video frames inside should be stored as images in
                                           lexicographical (character) order. For example, if the control video path is
                                           `frames`, the directory contain images such as 00.png, 01.png, ... etc.
  --pm-id-images-dir <string>              path to PHOTOMAKER input id images dir
  --pm-id-embed-path <string>              path to PHOTOMAKER v2 id embed
  -H, --height <int>                       image height, in pixel space (default: 512)
  -W, --width <int>                        image width, in pixel space (default: 512)
  --steps <int>                            number of sample steps (default: 20)
  --high-noise-steps <int>                 (high noise) number of sample steps (default: -1 = auto)
  --clip-skip <int>                        ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified,
                                           will be 1 for SD1.x, 2 for SD2.x
  -b, --batch-count <int>                  batch count
  --video-frames <int>                     video frames (default: 1)
  --fps <int>                              fps (default: 24)
  --timestep-shift <int>                   shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for
                                           NitroSD-Vibrant
  --upscale-repeats <int>                  Run the ESRGAN upscaler this many times (default: 1)
  --upscale-tile-size <int>                tile size for ESRGAN upscaling (default: 128)
  --cfg-scale <float>                      unconditional guidance scale: (default: 7.0)
  --img-cfg-scale <float>                  image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
  --guidance <float>                       distilled guidance scale for models with guidance input (default: 3.5)
  --slg-scale <float>                      skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5
                                           medium
  --skip-layer-start <float>               SLG enabling point (default: 0.01)
  --skip-layer-end <float>                 SLG disabling point (default: 0.2)
  --eta <float>                            eta in DDIM, only for DDIM and TCD (default: 0)
  --high-noise-cfg-scale <float>           (high noise) unconditional guidance scale: (default: 7.0)
  --high-noise-img-cfg-scale <float>       (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
  --high-noise-guidance <float>            (high noise) distilled guidance scale for models with guidance input (default: 3.5)
  --high-noise-slg-scale <float>           (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
  --high-noise-skip-layer-start <float>    (high noise) SLG enabling point (default: 0.01)
  --high-noise-skip-layer-end <float>      (high noise) SLG disabling point (default: 0.2)
  --high-noise-eta <float>                 (high noise) eta in DDIM, only for DDIM and TCD (default: 0)
  --strength <float>                       strength for noising/unnoising (default: 0.75)
  --pm-style-strength <float>
  --control-strength <float>               strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image
  --moe-boundary <float>                   timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1
  --vace-strength <float>                  wan vace strength
  --increase-ref-index                     automatically increase the indices of reference images based on the order they are listed (starting with 1).
  --disable-auto-resize-ref-image          disable auto resize of ref images
  -s, --seed                               RNG seed (default: 42, use random seed for < 0)
  --sampling-method                        sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
                                           tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise)
  --high-noise-sampling-method             (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
                                           ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise
  --scheduler                              denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple,
                                           kl_optimal, lcm], default: discrete
  --sigmas                                 custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0").
  --skip-layers                            layers to skip for SLG steps (default: [7,8,9])
  --high-noise-skip-layers                 (high noise) layers to skip for SLG steps (default: [7,8,9])
  -r, --ref-image                          reference image for Flux Kontext models (can be used multiple times)
  --cache-mode                             caching method: 'easycache' (DiT), 'ucache' (UNET), 'dbcache'/'taylorseer'/'cache-dit' (DiT block-level)
  --cache-option                           named cache params (key=value format, comma-separated). easycache/ucache:
                                           threshold=,start=,end=,decay=,relative=,reset=; dbcache/taylorseer/cache-dit: Fn=,Bn=,threshold=,warmup=. Examples:
                                           "threshold=0.25" or "threshold=1.5,reset=0"
  --cache-preset                           cache-dit preset: 'slow'/'s', 'medium'/'m', 'fast'/'f', 'ultra'/'u'
  --scm-mask                               SCM steps mask for cache-dit: comma-separated 0/1 (e.g., "1,1,1,0,0,1,0,0,1,0") - 1=compute, 0=can cache
  --scm-policy                             SCM policy: 'dynamic' (default) or 'static'
 ```
--- a/examples/server/main.cpp
+++ b/examples/server/main.cpp
@ -0,0 +1,729 @@
 // main.cpp
 #include <chrono>
 #include <filesystem>
 #include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <mutex>
 #include <sstream>
 #include <vector>
 #include "httplib.h"
 #include "stable-diffusion.h"
 #include "common/common.hpp"
 namespace fs = std::filesystem;
 // ----------------------- helpers -----------------------
 // Standard base64 alphabet (RFC 4648, section 4). The position of a character in this
 // string is the 6-bit value it encodes.
 static const std::string base64_chars =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789+/";
 // Encodes `bytes` as base64 text using `base64_chars`, padded with '=' so that the
 // result length is always a multiple of four. An empty input yields an empty string.
 std::string base64_encode(const std::vector<uint8_t>& bytes) {
    std::string ret;
    ret.reserve(((bytes.size() + 2) / 3) * 4);
    // The accumulator is unsigned on purpose: it is shifted left on every input byte and
    // never masked, so a signed `int` would overflow (undefined behaviour) after a few
    // bytes. With uint32_t the stale high bits simply fall off; only the low 14 bits are
    // ever read below.
    uint32_t val = 0;
    // Number of not-yet-emitted bits in `val`, minus 6.
    int valb = -6;
    for (uint8_t c : bytes) {
        val = (val << 8) | c;
        valb += 8;
        while (valb >= 0) {
            ret.push_back(base64_chars[(val >> valb) & 0x3F]);
            valb -= 6;
        }
    }
    // 2 or 4 bits are left over: emit them left-aligned in a final 6-bit symbol.
    if (valb > -6)
        ret.push_back(base64_chars[((val << 8) >> (valb + 8)) & 0x3F]);
    while (ret.size() % 4)
        ret.push_back('=');
    return ret;
 }
 // Tells whether `c` belongs to the base64 alphabet: an alphanumeric character, '+' or '/'.
 // The padding character '=' is not accepted.
 inline bool is_base64(unsigned char c) {
    if (c == '+' || c == '/') {
        return true;
    }
    return isalnum(c) != 0;
 }
 // Decodes base64 text into raw bytes.
 //
 // Decoding stops silently at the first '=' or at the first character rejected by
 // is_base64() (this includes whitespace and line breaks); everything after that point is
 // ignored and no error is reported. A trailing partial group of k symbols contributes
 // k - 1 bytes.
 std::vector<uint8_t> base64_decode(const std::string& encoded_string) {
    uint8_t char_array_4[4], char_array_3[3];
    std::vector<uint8_t> ret;
    ret.reserve((encoded_string.size() / 4) * 3 + 2);

    // Replaces the four buffered symbols by their 6-bit values and repacks them into
    // three bytes in char_array_3.
    auto unpack_group = [&]() {
        for (int k = 0; k < 4; k++)
            char_array_4[k] = static_cast<uint8_t>(base64_chars.find(char_array_4[k]));
        char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
        char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
        char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
    };

    int i = 0;  // number of symbols currently buffered in char_array_4
    // Index with size_t: narrowing size() into an `int` counter breaks for very large inputs.
    for (size_t pos = 0; pos < encoded_string.size(); pos++) {
        const char c = encoded_string[pos];
        if (c == '=' || !is_base64(c))
            break;
        char_array_4[i++] = c;
        if (i == 4) {
            unpack_group();
            ret.insert(ret.end(), char_array_3, char_array_3 + 3);
            i = 0;
        }
    }
    if (i) {
        // Fill the missing slots of the last group. Their decoded value is irrelevant:
        // only the first i - 1 output bytes are kept and those depend solely on the
        // symbols that were actually read.
        for (int j = i; j < 4; j++)
            char_array_4[j] = 0;
        unpack_group();
        for (int j = 0; j < i - 1; j++)
            ret.push_back(char_array_3[j]);
    }
    return ret;
 }
 // Returns the current system-clock time, broken down with gmtime (UTC), formatted as
 // "YYYY-MM-DDTHH:MM:SSZ".
 std::string iso_timestamp_now() {
    const std::time_t epoch_secs = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
    std::tm utc{};
    // MSVC only ships gmtime_s (with swapped argument order); other toolchains use gmtime_r.
 #ifdef _MSC_VER
    gmtime_s(&utc, &epoch_secs);
 #else
    gmtime_r(&epoch_secs, &utc);
 #endif
    std::ostringstream formatted;
    formatted << std::put_time(&utc, "%Y-%m-%dT%H:%M:%SZ");
    return formatted.str();
 }
 // Command-line settings that belong to the HTTP server itself (as opposed to the
 // context / generation parameters).
 struct SDSvrParams {
    std::string listen_ip = "127.0.0.1";  // address given by -l / --listen-ip
    int listen_port       = 1234;         // port given by --listen-port
    std::string serve_html_path;          // optional HTML file given by --serve-html-path
    bool normal_exit = false;             // set when -h / --help was seen; parse_args() then exits with code 0
    bool verbose     = false;
    bool color       = false;

    // Builds the option table describing the server flags. Every entry stores a pointer
    // to (or a callback capturing) a member of this object, so the returned ArgOptions
    // must not be used after this object is destroyed.
    ArgOptions get_options() {
        ArgOptions options;
        options.string_options = {
            {"-l",
             "--listen-ip",
             "server listen ip (default: 127.0.0.1)",
             &listen_ip},
            {"",
             "--serve-html-path",
             "path to HTML file to serve at root (optional)",
             &serve_html_path}};
        options.int_options = {
            {"",
             "--listen-port",
             "server listen port (default: 1234)",
             &listen_port},
        };
        options.bool_options = {
            {"-v",
             "--verbose",
             "print extra info",
             true, &verbose},
            {"",
             "--color",
             "colors the logging tags according to level",
             true, &color},
        };
        // Marks the exit as intentional and returns -1.
        // NOTE(review): -1 presumably makes parse_options() report failure so that the
        // caller prints the usage text - confirm against parse_options().
        auto on_help_arg = [&](int argc, const char** argv, int index) {
            normal_exit = true;
            return -1;
        };
        options.manual_options = {
            {"-h",
             "--help",
             "show this help message and exit",
             on_help_arg},
        };
        return options;
    }

    // Validates the parsed values. Logs an error and returns false on the first problem:
    // empty listen ip, port outside [0, 65535], or a non-existent serve_html_path.
    bool process_and_check() {
        if (listen_ip.empty()) {
            LOG_ERROR("error: the following arguments are required: listen_ip");
            return false;
        }
        if (listen_port < 0 || listen_port > 65535) {
            LOG_ERROR("error: listen_port should be in the range [0, 65535]");
            return false;
        }
        if (!serve_html_path.empty() && !fs::exists(serve_html_path)) {
            LOG_ERROR("error: serve_html_path file does not exist: %s", serve_html_path.c_str());
            return false;
        }
        return true;
    }

    // Human-readable dump of the listen address, port and HTML path (used for debug logging).
    // String fields are quoted, the numeric port is not.
    std::string to_string() const {
        std::ostringstream oss;
        oss << "SDSvrParams {\n"
            << "  listen_ip: \"" << listen_ip << "\",\n"
            << "  listen_port: " << listen_port << ",\n"
            << "  serve_html_path: \"" << serve_html_path << "\",\n"
            << "}";
        return oss.str();
    }
 };
 void print_usage(int argc, const char* argv[], const std::vector<ArgOptions>& options_list) {
    std::cout << version_string() << "\n";
    std::cout << "Usage: " << argv[0] << " [options]\n\n";
    std::cout << "Svr Options:\n";
    options_list[0].print();
    std::cout << "\nContext Options:\n";
    options_list[1].print();
    std::cout << "\nDefault Generation Options:\n";
    options_list[2].print();
 }
 // Parses the command line into the three parameter objects and validates them.
 // On a parse failure the usage text is printed and the process exits with 0 if help was
 // requested (svr_params.normal_exit) and 1 otherwise; on a validation failure the usage
 // text is printed and the process exits with 1.
 void parse_args(int argc, const char** argv, SDSvrParams& svr_params, SDContextParams& ctx_params, SDGenerationParams& default_gen_params) {
    std::vector<ArgOptions> options_vec = {svr_params.get_options(), ctx_params.get_options(), default_gen_params.get_options()};

    const bool parsed = parse_options(argc, argv, options_vec);
    if (!parsed) {
        print_usage(argc, argv, options_vec);
        exit(svr_params.normal_exit ? 0 : 1);
    }

    // Checks run in order and stop at the first one that fails.
    const bool all_valid = svr_params.process_and_check() &&
                           ctx_params.process_and_check(IMG_GEN) &&
                           default_gen_params.process_and_check(IMG_GEN, ctx_params.lora_model_dir);
    if (!all_valid) {
        print_usage(argc, argv, options_vec);
        exit(1);
    }
 }
 // Pulls the payload of the first <sd_cpp_extra_args>...</sd_cpp_extra_args> section out of
 // `text` and strips every such section (tags included) from `text`.
 //
 // Returns the payload of the first section, or an empty string when no complete section
 // exists (in which case `text` is left untouched).
 //
 // Implemented with plain substring search instead of std::regex because:
 //   - the ECMAScript '.' does not match line terminators, so a multi-line payload was
 //     never recognised and stayed inside the prompt;
 //   - `text` comes straight from the HTTP request, and a linear scan avoids regex
 //     backtracking on arbitrarily long untrusted input as well as recompiling the pattern
 //     on every request.
 std::string extract_and_remove_sd_cpp_extra_args(std::string& text) {
    static const std::string open_tag  = "<sd_cpp_extra_args>";
    static const std::string close_tag = "</sd_cpp_extra_args>";

    std::string extracted;
    std::string remaining;
    bool found    = false;
    size_t cursor = 0;  // first character of `text` not yet copied or skipped

    while (true) {
        const size_t open_pos = text.find(open_tag, cursor);
        if (open_pos == std::string::npos)
            break;
        const size_t payload_pos = open_pos + open_tag.size();
        const size_t close_pos   = text.find(close_tag, payload_pos);
        if (close_pos == std::string::npos)
            break;  // unterminated section: leave it in the text

        if (!found) {
            extracted = text.substr(payload_pos, close_pos - payload_pos);
            found     = true;
        }
        remaining.append(text, cursor, open_pos - cursor);
        cursor = close_pos + close_tag.size();
    }

    if (found) {
        remaining.append(text, cursor, std::string::npos);
        text.swap(remaining);
    }
    return extracted;
 }
 // Encodings supported by write_image_to_vector().
 enum class ImageFormat {
    JPEG,
    PNG,
 };
 // Encodes a raw pixel buffer into an in-memory JPEG or PNG file.
 //
 //   format    target encoding
 //   image     pixel data, `width * channels` bytes per row (that value is passed to the
 //             PNG writer as the row stride)
 //   width     image width in pixels
 //   height    image height in pixels
 //   channels  number of components per pixel
 //   quality   forwarded to the JPEG writer; not used for PNG
 //
 // Returns the encoded file contents.
 // Throws std::runtime_error if `format` is not a known value or the stb writer reports failure.
 std::vector<uint8_t> write_image_to_vector(
    ImageFormat format,
    const uint8_t* image,
    int width,
    int height,
    int channels,
    int quality = 90) {
    std::vector<uint8_t> buffer;
    // stb hands the encoded output over in chunks through a plain C callback. The lambda
    // is captureless so it converts to that function pointer; the destination vector
    // travels through the opaque `context` argument.
    auto append_chunk = [](void* context, void* data, int size) {
        auto* out          = static_cast<std::vector<uint8_t>*>(context);
        const uint8_t* src = static_cast<const uint8_t*>(data);
        out->insert(out->end(), src, src + size);
    };
    int result = 0;
    switch (format) {
        case ImageFormat::JPEG:
            result = stbi_write_jpg_to_func(append_chunk, &buffer, width, height, channels, image, quality);
            break;
        case ImageFormat::PNG:
            result = stbi_write_png_to_func(append_chunk, &buffer, width, height, channels, image, width * channels);
            break;
        default:
            throw std::runtime_error("invalid image format");
    }
    if (!result) {
        throw std::runtime_error("write image to mem failed");
    }
    return buffer;
 }
 void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) {
    SDSvrParams* svr_params = (SDSvrParams*)data;
    log_print(level, log, svr_params->verbose, svr_params->color);
 }
 int main(int argc, const char** argv) {
    if (argc > 1 && std::string(argv[1]) == "--version") {
        std::cout << version_string() << "\n";
        return EXIT_SUCCESS;
    }
    SDSvrParams svr_params;
    SDContextParams ctx_params;
    SDGenerationParams default_gen_params;
    parse_args(argc, argv, svr_params, ctx_params, default_gen_params);
    sd_set_log_callback(sd_log_cb, (void*)&svr_params);
    log_verbose = svr_params.verbose;
    log_color   = svr_params.color;
    LOG_DEBUG("version: %s", version_string().c_str());
    LOG_DEBUG("%s", sd_get_system_info());
    LOG_DEBUG("%s", svr_params.to_string().c_str());
    LOG_DEBUG("%s", ctx_params.to_string().c_str());
    LOG_DEBUG("%s", default_gen_params.to_string().c_str());
    sd_ctx_params_t sd_ctx_params = ctx_params.to_sd_ctx_params_t(false, false, false);
    sd_ctx_t* sd_ctx              = new_sd_ctx(&sd_ctx_params);
    if (sd_ctx == nullptr) {
        LOG_ERROR("new_sd_ctx_t failed");
        return 1;
    }
    std::mutex sd_ctx_mutex;
    httplib::Server svr;
    svr.set_pre_routing_handler([](const httplib::Request& req, httplib::Response& res) {
        std::string origin = req.get_header_value("Origin");
        if (origin.empty()) {
            origin = "*";
        }
        res.set_header("Access-Control-Allow-Origin", origin);
        res.set_header("Access-Control-Allow-Credentials", "true");
        res.set_header("Access-Control-Allow-Methods", "*");
        res.set_header("Access-Control-Allow-Headers", "*");
        if (req.method == "OPTIONS") {
            res.status = 204;
            return httplib::Server::HandlerResponse::Handled;
        }
        return httplib::Server::HandlerResponse::Unhandled;
    });
    // health
    svr.Get("/", [&](const httplib::Request&, httplib::Response& res) {
        if (!svr_params.serve_html_path.empty()) {
            std::ifstream file(svr_params.serve_html_path);
            if (file) {
                std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
                res.set_content(content, "text/html");
            } else {
                res.status = 500;
                res.set_content("Error: Unable to read HTML file", "text/plain");
            }
        } else {
            res.set_content("Stable Diffusion Server is running", "text/plain");
        }
    });
    // models endpoint (minimal)
    svr.Get("/v1/models", [&](const httplib::Request&, httplib::Response& res) {
        json r;
        r["data"] = json::array();
        r["data"].push_back({{"id", "sd-cpp-local"}, {"object", "model"}, {"owned_by", "local"}});
        res.set_content(r.dump(), "application/json");
    });
    // core endpoint: /v1/images/generations
    svr.Post("/v1/images/generations", [&](const httplib::Request& req, httplib::Response& res) {
        try {
            if (req.body.empty()) {
                res.status = 400;
                res.set_content(R"({"error":"empty body"})", "application/json");
                return;
            }
            json j                    = json::parse(req.body);
            std::string prompt        = j.value("prompt", "");
            int n                     = std::max(1, j.value("n", 1));
            std::string size          = j.value("size", "");
            std::string output_format = j.value("output_format", "png");
            int output_compression    = j.value("output_compression", 100);
            int width                 = 512;
            int height                = 512;
            if (!size.empty()) {
                auto pos = size.find('x');
                if (pos != std::string::npos) {
                    try {
                        width  = std::stoi(size.substr(0, pos));
                        height = std::stoi(size.substr(pos + 1));
                    } catch (...) {
                    }
                }
            }
            if (prompt.empty()) {
                res.status = 400;
                res.set_content(R"({"error":"prompt required"})", "application/json");
                return;
            }
            std::string sd_cpp_extra_args_str = extract_and_remove_sd_cpp_extra_args(prompt);
            if (output_format != "png" && output_format != "jpeg") {
                res.status = 400;
                res.set_content(R"({"error":"invalid output_format, must be one of [png, jpeg]"})", "application/json");
                return;
            }
            if (n <= 0)
                n = 1;
            if (n > 8)
                n = 8;  // safety
            if (output_compression > 100) {
                output_compression = 100;
            }
            if (output_compression < 0) {
                output_compression = 0;
            }
            json out;
            out["created"]       = iso_timestamp_now();
            out["data"]          = json::array();
            out["output_format"] = output_format;
            SDGenerationParams gen_params = default_gen_params;
            gen_params.prompt             = prompt;
            gen_params.width              = width;
            gen_params.height             = height;
            gen_params.batch_count        = n;
            if (!sd_cpp_extra_args_str.empty() && !gen_params.from_json_str(sd_cpp_extra_args_str)) {
                res.status = 400;
                res.set_content(R"({"error":"invalid sd_cpp_extra_args"})", "application/json");
                return;
            }
            if (!gen_params.process_and_check(IMG_GEN, "")) {
                res.status = 400;
                res.set_content(R"({"error":"invalid params"})", "application/json");
                return;
            }
            LOG_DEBUG("%s\n", gen_params.to_string().c_str());
            sd_image_t init_image    = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr};
            sd_image_t control_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr};
            sd_image_t mask_image    = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 1, nullptr};
            std::vector<sd_image_t> pmid_images;
            sd_img_gen_params_t img_gen_params = {
                gen_params.lora_vec.data(),
                static_cast<uint32_t>(gen_params.lora_vec.size()),
                gen_params.prompt.c_str(),
                gen_params.negative_prompt.c_str(),
                gen_params.clip_skip,
                init_image,
                nullptr,
                0,
                gen_params.auto_resize_ref_image,
                gen_params.increase_ref_index,
                mask_image,
                gen_params.width,
                gen_params.height,
                gen_params.sample_params,
                gen_params.strength,
                gen_params.seed,
                gen_params.batch_count,
                control_image,
                gen_params.control_strength,
                {
                    pmid_images.data(),
                    (int)pmid_images.size(),
                    gen_params.pm_id_embed_path.c_str(),
                    gen_params.pm_style_strength,
                },  // pm_params
                ctx_params.vae_tiling_params,
                gen_params.cache_params,
            };
            sd_image_t* results = nullptr;
            int num_results     = 0;
            {
                std::lock_guard<std::mutex> lock(sd_ctx_mutex);
                results     = generate_image(sd_ctx, &img_gen_params);
                num_results = gen_params.batch_count;
            }
            for (int i = 0; i < num_results; i++) {
                if (results[i].data == nullptr) {
                    continue;
                }
                auto image_bytes = write_image_to_vector(output_format == "jpeg" ? ImageFormat::JPEG : ImageFormat::PNG,
                                                         results[i].data,
                                                         results[i].width,
                                                         results[i].height,
                                                         results[i].channel,
                                                         output_compression);
                if (image_bytes.empty()) {
                    LOG_ERROR("write image to mem failed");
                    continue;
                }
                // base64 encode
                std::string b64 = base64_encode(image_bytes);
                json item;
                item["b64_json"] = b64;
                out["data"].push_back(item);
            }
            res.set_content(out.dump(), "application/json");
            res.status = 200;
        } catch (const std::exception& e) {
            res.status = 500;
            json err;
            err["error"]   = "server_error";
            err["message"] = e.what();
            res.set_content(err.dump(), "application/json");
        }
    });
    // POST /v1/images/edits
    // Multipart-form handler. Reads `prompt`, one or more `image[]` file parts, an optional
    // `mask` file part and the optional `n`, `size`, `output_format` and `output_compression`
    // fields, runs generate_image() with the uploaded images as reference images, and replies
    // with a JSON document whose `data` array holds one base64-encoded image per result.
    // Replies 400 on malformed input and 500 when generation fails or an exception escapes.
    svr.Post("/v1/images/edits", [&](const httplib::Request& req, httplib::Response& res) {
        try {
            if (!req.is_multipart_form_data()) {
                res.status = 400;
                res.set_content(R"({"error":"Content-Type must be multipart/form-data"})", "application/json");
                return;
            }
            std::string prompt = req.form.get_field("prompt");
            if (prompt.empty()) {
                res.status = 400;
                res.set_content(R"({"error":"prompt required"})", "application/json");
                return;
            }
            std::string sd_cpp_extra_args_str = extract_and_remove_sd_cpp_extra_args(prompt);

            // ---- uploaded files -------------------------------------------------------------
            size_t image_count = req.form.get_file_count("image[]");
            if (image_count == 0) {
                res.status = 400;
                res.set_content(R"({"error":"at least one image[] required"})", "application/json");
                return;
            }
            std::vector<std::vector<uint8_t>> images_bytes;
            images_bytes.reserve(image_count);
            for (size_t i = 0; i < image_count; i++) {
                auto file = req.form.get_file("image[]", i);
                images_bytes.emplace_back(file.content.begin(), file.content.end());
            }
            std::vector<uint8_t> mask_bytes;
            // The mask is fetched with get_file(), so its presence has to be tested among the
            // file parts; testing the text fields would skip an uploaded mask.
            if (req.form.has_file("mask")) {
                auto file = req.form.get_file("mask");
                mask_bytes.assign(file.content.begin(), file.content.end());
            }

            // ---- scalar options -------------------------------------------------------------
            int n = 1;
            if (req.form.has_field("n")) {
                try {
                    n = std::stoi(req.form.get_field("n"));
                } catch (...) {
                    // best effort: an unparsable `n` keeps the default of 1
                }
            }
            n = std::clamp(n, 1, 8);
            std::string size = req.form.get_field("size");
            int width = 512, height = 512;
            if (!size.empty()) {
                auto pos = size.find('x');
                if (pos != std::string::npos) {
                    try {
                        width  = std::stoi(size.substr(0, pos));
                        height = std::stoi(size.substr(pos + 1));
                    } catch (...) {
                        // best effort: an unparsable `size` keeps whatever was parsed so far
                    }
                }
            }
            std::string output_format = "png";
            if (req.form.has_field("output_format"))
                output_format = req.form.get_field("output_format");
            if (output_format != "png" && output_format != "jpeg") {
                res.status = 400;
                res.set_content(R"({"error":"invalid output_format, must be one of [png, jpeg]"})", "application/json");
                return;
            }
            std::string output_compression_str = req.form.get_field("output_compression");
            int output_compression             = 100;
            try {
                output_compression = std::stoi(output_compression_str);
            } catch (...) {
                // missing or unparsable value keeps the default of 100
            }
            output_compression = std::clamp(output_compression, 0, 100);

            // ---- generation parameters ------------------------------------------------------
            SDGenerationParams gen_params = default_gen_params;
            gen_params.prompt             = prompt;
            gen_params.width              = width;
            gen_params.height             = height;
            gen_params.batch_count        = n;
            if (!sd_cpp_extra_args_str.empty() && !gen_params.from_json_str(sd_cpp_extra_args_str)) {
                res.status = 400;
                res.set_content(R"({"error":"invalid sd_cpp_extra_args"})", "application/json");
                return;
            }
            if (!gen_params.process_and_check(IMG_GEN, "")) {
                res.status = 400;
                res.set_content(R"({"error":"invalid params"})", "application/json");
                return;
            }
            LOG_DEBUG("%s\n", gen_params.to_string().c_str());

            // init/control images carry no pixel data in this endpoint (data stays nullptr).
            sd_image_t init_image    = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr};
            sd_image_t control_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr};
            std::vector<sd_image_t> pmid_images;
            std::vector<sd_image_t> ref_images;
            sd_image_t mask_image = {0};
            sd_image_t* results   = nullptr;
            int num_results       = 0;

            // Releases every heap buffer this request owns on ALL exit paths (normal return,
            // early error return, exception), so nothing leaks between requests.
            // NOTE(review): result buffers are released with stbi_image_free (stb's wrapper
            // around free() unless STBI_FREE is overridden), the same call the decoded input
            // buffers already use; confirm generate_image() allocates with malloc/calloc.
            struct BufferGuard {
                std::vector<sd_image_t>& refs;
                sd_image_t& mask;
                sd_image_t*& results;
                int& num_results;
                ~BufferGuard() {
                    for (auto& ref : refs) {
                        if (ref.data) {
                            stbi_image_free(ref.data);
                        }
                    }
                    if (mask.data) {
                        stbi_image_free(mask.data);
                    }
                    if (results) {
                        for (int i = 0; i < num_results; i++) {
                            if (results[i].data) {
                                stbi_image_free(results[i].data);
                            }
                        }
                        stbi_image_free(results);
                    }
                }
            } buffer_guard{ref_images, mask_image, results, num_results};

            // ---- decode uploads -------------------------------------------------------------
            ref_images.reserve(images_bytes.size());
            for (auto& bytes : images_bytes) {
                int img_w           = width;
                int img_h           = height;
                uint8_t* raw_pixels = load_image_from_memory(
                    reinterpret_cast<const char*>(bytes.data()),
                    bytes.size(),
                    img_w, img_h,
                    width, height, 3);
                if (!raw_pixels) {
                    LOG_ERROR("failed to decode an uploaded image[], skipping it");
                    continue;
                }
                sd_image_t img{(uint32_t)img_w, (uint32_t)img_h, 3, raw_pixels};
                ref_images.push_back(img);
            }
            if (ref_images.empty()) {
                // Every upload failed to decode: an "edit" with no source image is a client error.
                res.status = 400;
                res.set_content(R"({"error":"no valid image[] could be decoded"})", "application/json");
                return;
            }
            if (!mask_bytes.empty()) {
                int mask_w        = width;
                int mask_h        = height;
                uint8_t* mask_raw = load_image_from_memory(
                    reinterpret_cast<const char*>(mask_bytes.data()),
                    mask_bytes.size(),
                    mask_w, mask_h,
                    width, height, 1);
                if (!mask_raw) {
                    LOG_ERROR("failed to decode the uploaded mask, continuing without it");
                }
                mask_image = {(uint32_t)mask_w, (uint32_t)mask_h, 1, mask_raw};
            } else {
                mask_image.width   = width;
                mask_image.height  = height;
                mask_image.channel = 1;
                mask_image.data    = nullptr;
            }

            sd_img_gen_params_t img_gen_params = {
                gen_params.lora_vec.data(),
                static_cast<uint32_t>(gen_params.lora_vec.size()),
                gen_params.prompt.c_str(),
                gen_params.negative_prompt.c_str(),
                gen_params.clip_skip,
                init_image,
                ref_images.data(),
                (int)ref_images.size(),
                gen_params.auto_resize_ref_image,
                gen_params.increase_ref_index,
                mask_image,
                gen_params.width,
                gen_params.height,
                gen_params.sample_params,
                gen_params.strength,
                gen_params.seed,
                gen_params.batch_count,
                control_image,
                gen_params.control_strength,
                {
                    pmid_images.data(),
                    (int)pmid_images.size(),
                    gen_params.pm_id_embed_path.c_str(),
                    gen_params.pm_style_strength,
                },  // pm_params
                ctx_params.vae_tiling_params,
                gen_params.cache_params,
            };

            // ---- generate -------------------------------------------------------------------
            {
                // sd_ctx is shared by all handlers; only one generation may run at a time.
                std::lock_guard<std::mutex> lock(sd_ctx_mutex);
                results = generate_image(sd_ctx, &img_gen_params);
                if (results != nullptr) {
                    num_results = gen_params.batch_count;
                }
            }
            if (results == nullptr) {
                // A null result array must not be indexed below; report the failure instead.
                res.status = 500;
                res.set_content(R"({"error":"server_error","message":"image generation failed"})", "application/json");
                return;
            }

            // ---- encode response ------------------------------------------------------------
            json out;
            out["created"]       = iso_timestamp_now();
            out["data"]          = json::array();
            out["output_format"] = output_format;
            for (int i = 0; i < num_results; i++) {
                if (results[i].data == nullptr) {
                    continue;
                }
                auto image_bytes = write_image_to_vector(output_format == "jpeg" ? ImageFormat::JPEG : ImageFormat::PNG,
                                                         results[i].data,
                                                         results[i].width,
                                                         results[i].height,
                                                         results[i].channel,
                                                         output_compression);
                if (image_bytes.empty()) {
                    // Same policy as /v1/images/generations: skip images that failed to encode.
                    LOG_ERROR("write image to mem failed");
                    continue;
                }
                std::string b64 = base64_encode(image_bytes);
                json item;
                item["b64_json"] = b64;
                out["data"].push_back(item);
            }
            res.set_content(out.dump(), "application/json");
            res.status = 200;
        } catch (const std::exception& e) {
            res.status = 500;
            json err;
            err["error"]   = "server_error";
            err["message"] = e.what();
            res.set_content(err.dump(), "application/json");
        }
    });
    LOG_INFO("listening on: %s:%d\n", svr_params.listen_ip.c_str(), svr_params.listen_port);
    svr.listen(svr_params.listen_ip, svr_params.listen_port);
    // cleanup
    free_sd_ctx(sd_ctx);
    return 0;
 }
--- a/flux.hpp
+++ b/flux.hpp
@ -233,14 +233,17 @@ namespace Flux {
    __STATIC_INLINE__ struct ggml_tensor* modulate(struct ggml_context* ctx,
                                                   struct ggml_tensor* x,
                                                   struct ggml_tensor* shift,
-                                                   struct ggml_tensor* scale) {
+                                                   struct ggml_tensor* scale,
                                                   bool skip_reshape = false) {
        // x: [N, L, C]
        // scale: [N, C]
        // shift: [N, C]
-        scale = ggml_reshape_3d(ctx, scale, scale->ne[0], 1, scale->ne[1]);  // [N, 1, C]
+        if (!skip_reshape) {
-        shift = ggml_reshape_3d(ctx, shift, shift->ne[0], 1, shift->ne[1]);  // [N, 1, C]
+            scale = ggml_reshape_3d(ctx, scale, scale->ne[0], 1, scale->ne[1]);  // [N, 1, C]
-        x     = ggml_add(ctx, x, ggml_mul(ctx, x, scale));
+            shift = ggml_reshape_3d(ctx, shift, shift->ne[0], 1, shift->ne[1]);  // [N, 1, C]
-        x     = ggml_add(ctx, x, shift);
+        }
        x = ggml_add(ctx, x, ggml_mul(ctx, x, scale));
        x = ggml_add(ctx, x, shift);
        return x;
    }
@ -744,6 +747,8 @@ namespace Flux {
        int64_t nerf_mlp_ratio   = 4;
        int64_t nerf_depth       = 4;
        int64_t nerf_max_freqs   = 8;
        bool use_x0              = false;
        bool use_patch_size_32   = false;
    };
    struct FluxParams {
@ -781,7 +786,7 @@ namespace Flux {
        Flux(FluxParams params)
            : params(params) {
            if (params.version == VERSION_CHROMA_RADIANCE) {
-                std::pair<int, int> kernel_size = {(int)params.patch_size, (int)params.patch_size};
+                std::pair<int, int> kernel_size = {16, 16};
                std::pair<int, int> stride      = kernel_size;
                blocks["img_in_patch"] = std::make_shared<Conv2d>(params.in_channels,
@ -858,14 +863,14 @@ namespace Flux {
            }
        }
-        struct ggml_tensor* pad_to_patch_size(struct ggml_context* ctx,
+        struct ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx,
                                              struct ggml_tensor* x) {
            int64_t W = x->ne[0];
            int64_t H = x->ne[1];
            int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size;
            int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size;
-            x         = ggml_pad(ctx, x, pad_w, pad_h, 0, 0);  // [N, C, H + pad_h, W + pad_w]
+            x         = ggml_ext_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
            return x;
        }
@ -891,11 +896,11 @@ namespace Flux {
            return x;
        }
-        struct ggml_tensor* process_img(struct ggml_context* ctx,
+        struct ggml_tensor* process_img(GGMLRunnerContext* ctx,
                                        struct ggml_tensor* x) {
            // img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
            x = pad_to_patch_size(ctx, x);
-            x = patchify(ctx, x);
+            x = patchify(ctx->ggml_ctx, x);
            return x;
        }
@ -1044,6 +1049,15 @@ namespace Flux {
            return img;
        }
        // Builds the graph nodes for (noisy - predicted) / timesteps and returns the result.
        struct ggml_tensor* _apply_x0_residual(GGMLRunnerContext* ctx,
                                               struct ggml_tensor* predicted,
                                               struct ggml_tensor* noisy,
                                               struct ggml_tensor* timesteps) {
            auto* gctx       = ctx->ggml_ctx;
            auto* difference = ggml_sub(gctx, noisy, predicted);
            return ggml_div(gctx, difference, timesteps);
        }
        struct ggml_tensor* forward_chroma_radiance(GGMLRunnerContext* ctx,
                                                    struct ggml_tensor* x,
                                                    struct ggml_tensor* timestep,
@ -1065,9 +1079,16 @@ namespace Flux {
            int pad_h          = (patch_size - H % patch_size) % patch_size;
            int pad_w          = (patch_size - W % patch_size) % patch_size;
-            auto img      = pad_to_patch_size(ctx->ggml_ctx, x);
+            auto img      = pad_to_patch_size(ctx, x);
            auto orig_img = img;
            if (params.chroma_radiance_params.use_patch_size_32) {
                // It's supposed to be using GGML_SCALE_MODE_NEAREST, but this seems more stable
                // Maybe the implementation of nearest-neighbor interpolation in ggml behaves differently than the one in PyTorch?
                // img = F.interpolate(img, size=(H//2, W//2), mode="nearest")
                img = ggml_interpolate(ctx->ggml_ctx, img, W / 2, H / 2, C, x->ne[3], GGML_SCALE_MODE_BILINEAR);
            }
            auto img_in_patch = std::dynamic_pointer_cast<Conv2d>(blocks["img_in_patch"]);
            img = img_in_patch->forward(ctx, img);                                                       // [N, hidden_size, H/patch_size, W/patch_size]
@ -1104,6 +1125,10 @@ namespace Flux {
            out = nerf_final_layer_conv->forward(ctx, img_dct);  // [N, C, H, W]
            if (params.chroma_radiance_params.use_x0) {
                out = _apply_x0_residual(ctx, out, orig_img, timestep);
            }
            return out;
        }
@ -1128,7 +1153,7 @@ namespace Flux {
            int pad_h          = (patch_size - H % patch_size) % patch_size;
            int pad_w          = (patch_size - W % patch_size) % patch_size;
-            auto img            = process_img(ctx->ggml_ctx, x);
+            auto img            = process_img(ctx, x);
            uint64_t img_tokens = img->ne[1];
            if (params.version == VERSION_FLUX_FILL) {
@ -1136,8 +1161,8 @@ namespace Flux {
                ggml_tensor* masked = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0);
                ggml_tensor* mask   = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 8 * 8, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);
-                masked = process_img(ctx->ggml_ctx, masked);
+                masked = process_img(ctx, masked);
-                mask   = process_img(ctx->ggml_ctx, mask);
+                mask   = process_img(ctx, mask);
                img = ggml_concat(ctx->ggml_ctx, img, ggml_concat(ctx->ggml_ctx, masked, mask, 0), 0);
            } else if (params.version == VERSION_FLEX_2) {
@ -1146,21 +1171,21 @@ namespace Flux {
                ggml_tensor* mask    = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 1, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);
                ggml_tensor* control = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * (C + 1));
-                masked  = process_img(ctx->ggml_ctx, masked);
+                masked  = process_img(ctx, masked);
-                mask    = process_img(ctx->ggml_ctx, mask);
+                mask    = process_img(ctx, mask);
-                control = process_img(ctx->ggml_ctx, control);
+                control = process_img(ctx, control);
                img = ggml_concat(ctx->ggml_ctx, img, ggml_concat(ctx->ggml_ctx, ggml_concat(ctx->ggml_ctx, masked, mask, 0), control, 0), 0);
            } else if (params.version == VERSION_FLUX_CONTROLS) {
                GGML_ASSERT(c_concat != nullptr);
-                auto control = process_img(ctx->ggml_ctx, c_concat);
+                auto control = process_img(ctx, c_concat);
                img          = ggml_concat(ctx->ggml_ctx, img, control, 0);
            }
            if (ref_latents.size() > 0) {
                for (ggml_tensor* ref : ref_latents) {
-                    ref = process_img(ctx->ggml_ctx, ref);
+                    ref = process_img(ctx, ref);
                    img = ggml_concat(ctx->ggml_ctx, img, ref, 1);
                }
            }
@ -1290,6 +1315,15 @@ namespace Flux {
                    // not schnell
                    flux_params.guidance_embed = true;
                }
                if (tensor_name.find("__x0__") != std::string::npos) {
                    LOG_DEBUG("using x0 prediction");
                    flux_params.chroma_radiance_params.use_x0 = true;
                }
                if (tensor_name.find("__32x32__") != std::string::npos) {
                    LOG_DEBUG("using patch size 32 prediction");
                    flux_params.chroma_radiance_params.use_patch_size_32 = true;
                    flux_params.patch_size                               = 32;
                }
                if (tensor_name.find("distilled_guidance_layer.in_proj.weight") != std::string::npos) {
                    // Chroma
                    flux_params.is_chroma = true;
@ -1441,6 +1475,8 @@ namespace Flux {
                                            increase_ref_index,
                                            flux_params.ref_index_scale,
                                            flux_params.theta,
                                            circular_y_enabled,
                                            circular_x_enabled,
                                            flux_params.axes_dim);
            int pos_len = pe_vec.size() / flux_params.axes_dim_sum / 2;
            // LOG_DEBUG("pos_len %d", pos_len);
--- a/format-code.sh
+++ b/format-code.sh
@ -1,4 +1,4 @@
-for f in *.cpp *.h *.hpp examples/cli/*.cpp examples/cli/*.h; do
+for f in *.cpp *.h *.hpp examples/cli/*.cpp examples/common/*.hpp examples/cli/*.h examples/server/*.cpp; do
  [[ "$f" == vocab* ]] && continue
  echo "formatting '$f'"
  # if [ "$f" != "stable-diffusion.h" ]; then
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit 2d3876d554551d35c06dccc5852be50d5fd2a275
+Subproject commit 3e9f2ba3b934c20b26873b3c60dbf41b116978ff
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@ -5,6 +5,7 @@
 #include <inttypes.h>
 #include <stdarg.h>
 #include <algorithm>
 #include <atomic>
 #include <cstring>
 #include <fstream>
 #include <functional>
@ -732,34 +733,22 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_slice(struct ggml_context* ctx,
 __STATIC_INLINE__ std::vector<struct ggml_tensor*> ggml_ext_chunk(struct ggml_context* ctx,
                                                                  struct ggml_tensor* x,
                                                                  int num,
-                                                                  int64_t dim) {
+                                                                  int64_t dim,
                                                                  bool cont = true) {
    GGML_ASSERT(dim >= 0 && dim < 4);
    GGML_ASSERT(x->ne[dim] % num == 0);
    int perm[4] = {0, 1, 2, 3};
    for (int i = dim; i < 3; ++i)
        perm[i] = perm[i + 1];
    perm[3] = dim;
    int inv_perm[4];
    for (int i = 0; i < 4; ++i)
        inv_perm[perm[i]] = i;
    if (dim != 3) {
        x = ggml_ext_torch_permute(ctx, x, perm[0], perm[1], perm[2], perm[3]);
        x = ggml_cont(ctx, x);
    }
    std::vector<struct ggml_tensor*> chunks;
-    int64_t chunk_size = x->ne[3] / num;
+    int64_t chunk_size  = x->ne[dim] / num;
    int64_t stride      = chunk_size * x->nb[dim];
    int64_t chunk_ne[4] = {x->ne[0], x->ne[1], x->ne[2], x->ne[3]};
    chunk_ne[dim]       = chunk_size;
    for (int i = 0; i < num; i++) {
        auto chunk = ggml_view_4d(
            ctx, x,
-            x->ne[0], x->ne[1], x->ne[2], chunk_size,
+            chunk_ne[0], chunk_ne[1], chunk_ne[2], chunk_ne[3],
-            x->nb[1], x->nb[2], x->nb[3], x->nb[3] * i * chunk_size);
+            x->nb[1], x->nb[2], x->nb[3], stride * i);
-
+        if (cont) {
        if (dim != 3) {
            chunk = ggml_ext_torch_permute(ctx, chunk, inv_perm[0], inv_perm[1], inv_perm[2], inv_perm[3]);
            chunk = ggml_cont(ctx, chunk);
        }
        chunks.push_back(chunk);
@ -772,7 +761,7 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_silu_act(ggml_context* ctx, ggml_tensor*
    // x: [ne3, ne2, ne1, ne0]
    // return: [ne3, ne2, ne1, ne0/2]
-    auto x_vec = ggml_ext_chunk(ctx, x, 2, 0);
+    auto x_vec = ggml_ext_chunk(ctx, x, 2, 0, false);
    ggml_tensor* gate;
    if (gate_first) {
        gate = x_vec[0];
@ -781,7 +770,7 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_silu_act(ggml_context* ctx, ggml_tensor*
        x    = x_vec[0];
        gate = x_vec[1];
    }
-
+    gate = ggml_cont(ctx, gate);
    gate = ggml_silu_inplace(ctx, gate);
    x = ggml_mul(ctx, x, gate);  // [ne3, ne2, ne1, ne0/2]
@ -860,8 +849,6 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input,
    LOG_DEBUG("num tiles : %d, %d ", num_tiles_x, num_tiles_y);
    LOG_DEBUG("optimal overlap : %f, %f (targeting %f)", tile_overlap_factor_x, tile_overlap_factor_y, tile_overlap_factor);
    GGML_ASSERT(input_width % 2 == 0 && input_height % 2 == 0 && output_width % 2 == 0 && output_height % 2 == 0);  // should be multiple of 2
    int tile_overlap_x     = (int32_t)(p_tile_size_x * tile_overlap_factor_x);
    int non_tile_overlap_x = p_tile_size_x - tile_overlap_x;
@ -1007,6 +994,48 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_linear(struct ggml_context* ctx,
    return x;
 }
 // Pads `x` by the given per-dimension amounts (lpN / rpN for dim N), using
 // ggml_pad_ext_circular for the dims flagged circular and ggml_pad_ext for the rest.
 //   - circular_x and circular_y both set: a single circular pad receives all eight amounts.
 //   - only one set: that dim (0 for x, 1 for y) is padded circularly first, its amounts
 //     are zeroed, and any remaining non-zero amounts go through ggml_pad_ext.
 // Returns `x` untouched when no padding is left to apply.
 __STATIC_INLINE__ struct ggml_tensor* ggml_ext_pad_ext(struct ggml_context* ctx,
                                                       struct ggml_tensor* x,
                                                       int lp0,
                                                       int rp0,
                                                       int lp1,
                                                       int rp1,
                                                       int lp2,
                                                       int rp2,
                                                       int lp3,
                                                       int rp3,
                                                       bool circular_x = false,
                                                       bool circular_y = false) {
    if (circular_x && circular_y) {
        return ggml_pad_ext_circular(ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3);
    }

    const bool wrap_dim0 = circular_x && !(lp0 == 0 && rp0 == 0);
    if (wrap_dim0) {
        x   = ggml_pad_ext_circular(ctx, x, lp0, rp0, 0, 0, 0, 0, 0, 0);
        lp0 = 0;
        rp0 = 0;
    }

    const bool wrap_dim1 = circular_y && !(lp1 == 0 && rp1 == 0);
    if (wrap_dim1) {
        x   = ggml_pad_ext_circular(ctx, x, 0, 0, lp1, rp1, 0, 0, 0, 0);
        lp1 = 0;
        rp1 = 0;
    }

    const bool nothing_left = lp0 == 0 && rp0 == 0 &&
                              lp1 == 0 && rp1 == 0 &&
                              lp2 == 0 && rp2 == 0 &&
                              lp3 == 0 && rp3 == 0;
    if (nothing_left) {
        return x;
    }
    return ggml_pad_ext(ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3);
 }
 // Convenience form of ggml_ext_pad_ext: every `lp` argument is zero and p0..p3 are
 // forwarded as the `rp` arguments for dims 0..3, together with the circular flags.
 __STATIC_INLINE__ struct ggml_tensor* ggml_ext_pad(struct ggml_context* ctx,
                                                   struct ggml_tensor* x,
                                                   int p0,
                                                   int p1,
                                                   int p2          = 0,
                                                   int p3          = 0,
                                                   bool circular_x = false,
                                                   bool circular_y = false) {
    const int no_lp = 0;
    return ggml_ext_pad_ext(ctx, x,
                            no_lp, p0,
                            no_lp, p1,
                            no_lp, p2,
                            no_lp, p3,
                            circular_x, circular_y);
 }
 // w: [OC, IC, KH, KW]
 // x: [N, IC, IH, IW]
 // b: [OC,]
@ -1015,20 +1044,29 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_conv_2d(struct ggml_context* ctx,
                                                       struct ggml_tensor* x,
                                                       struct ggml_tensor* w,
                                                       struct ggml_tensor* b,
-                                                       int s0      = 1,
+                                                       int s0          = 1,
-                                                       int s1      = 1,
+                                                       int s1          = 1,
-                                                       int p0      = 0,
+                                                       int p0          = 0,
-                                                       int p1      = 0,
+                                                       int p1          = 0,
-                                                       int d0      = 1,
+                                                       int d0          = 1,
-                                                       int d1      = 1,
+                                                       int d1          = 1,
-                                                       bool direct = false,
+                                                       bool direct     = false,
-                                                       float scale = 1.f) {
+                                                       bool circular_x = false,
                                                       bool circular_y = false,
                                                       float scale     = 1.f) {
    if (scale != 1.f) {
        x = ggml_scale(ctx, x, scale);
    }
    if (w->ne[2] != x->ne[2] && ggml_n_dims(w) == 2) {
        w = ggml_reshape_4d(ctx, w, 1, 1, w->ne[0], w->ne[1]);
    }
    if ((p0 != 0 || p1 != 0) && (circular_x || circular_y)) {
        x  = ggml_ext_pad_ext(ctx, x, p0, p0, p1, p1, 0, 0, 0, 0, circular_x, circular_y);
        p0 = 0;
        p1 = 0;
    }
    if (direct) {
        x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1);
    } else {
@ -1282,6 +1320,9 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_attention_ext(struct ggml_context
        }
        if (mask_in != nullptr) {
            // the need for padding got removed in ggml 4767bda
            // ensure we can still use the old version for now
 #ifdef GGML_KQ_MASK_PAD
            int mask_pad = 0;
            if (mask_in->ne[1] % GGML_KQ_MASK_PAD != 0) {
                mask_pad = GGML_PAD(L_q, GGML_KQ_MASK_PAD) - mask_in->ne[1];
@ -1289,6 +1330,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_attention_ext(struct ggml_context
            if (mask_pad > 0) {
                mask_in = ggml_pad(ctx, mask_in, 0, mask_pad, 0, 0);
            }
 #endif
            mask_in = ggml_cast(ctx, mask_in, GGML_TYPE_F16);
        }
@ -1531,14 +1573,16 @@ struct WeightAdapter {
            float scale         = 1.f;
        } linear;
        struct {
-            int s0      = 1;
+            int s0          = 1;
-            int s1      = 1;
+            int s1          = 1;
-            int p0      = 0;
+            int p0          = 0;
-            int p1      = 0;
+            int p1          = 0;
-            int d0      = 1;
+            int d0          = 1;
-            int d1      = 1;
+            int d1          = 1;
-            bool direct = false;
+            bool direct     = false;
-            float scale = 1.f;
+            bool circular_x = false;
            bool circular_y = false;
            float scale     = 1.f;
        } conv2d;
    };
    virtual ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name) = 0;
@ -1556,6 +1600,8 @@ struct GGMLRunnerContext {
    ggml_context* ggml_ctx                        = nullptr;
    bool flash_attn_enabled                       = false;
    bool conv2d_direct_enabled                    = false;
    bool circular_x_enabled                       = false;
    bool circular_y_enabled                       = false;
    std::shared_ptr<WeightAdapter> weight_adapter = nullptr;
 };
@ -1592,6 +1638,8 @@ protected:
    bool flash_attn_enabled    = false;
    bool conv2d_direct_enabled = false;
    bool circular_x_enabled    = false;
    bool circular_y_enabled    = false;
    void alloc_params_ctx() {
        struct ggml_init_params params;
@ -1869,6 +1917,8 @@ public:
        runner_ctx.backend               = runtime_backend;
        runner_ctx.flash_attn_enabled    = flash_attn_enabled;
        runner_ctx.conv2d_direct_enabled = conv2d_direct_enabled;
        runner_ctx.circular_x_enabled    = circular_x_enabled;
        runner_ctx.circular_y_enabled    = circular_y_enabled;
        runner_ctx.weight_adapter        = weight_adapter;
        return runner_ctx;
    }
@ -2013,6 +2063,11 @@ public:
        conv2d_direct_enabled = enabled;
    }
    void set_circular_axes(bool circular_x, bool circular_y) {
        circular_x_enabled = circular_x;
        circular_y_enabled = circular_y;
    }
    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {
        weight_adapter = adapter;
    }
@ -2276,15 +2331,17 @@ public:
        }
        if (ctx->weight_adapter) {
            WeightAdapter::ForwardParams forward_params;
-            forward_params.op_type       = WeightAdapter::ForwardParams::op_type_t::OP_CONV2D;
+            forward_params.op_type           = WeightAdapter::ForwardParams::op_type_t::OP_CONV2D;
-            forward_params.conv2d.s0     = stride.second;
+            forward_params.conv2d.s0         = stride.second;
-            forward_params.conv2d.s1     = stride.first;
+            forward_params.conv2d.s1         = stride.first;
-            forward_params.conv2d.p0     = padding.second;
+            forward_params.conv2d.p0         = padding.second;
-            forward_params.conv2d.p1     = padding.first;
+            forward_params.conv2d.p1         = padding.first;
-            forward_params.conv2d.d0     = dilation.second;
+            forward_params.conv2d.d0         = dilation.second;
-            forward_params.conv2d.d1     = dilation.first;
+            forward_params.conv2d.d1         = dilation.first;
-            forward_params.conv2d.direct = ctx->conv2d_direct_enabled;
+            forward_params.conv2d.direct     = ctx->conv2d_direct_enabled;
-            forward_params.conv2d.scale  = scale;
+            forward_params.conv2d.circular_x = ctx->circular_x_enabled;
            forward_params.conv2d.circular_y = ctx->circular_y_enabled;
            forward_params.conv2d.scale      = scale;
            return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, x, w, b, prefix, forward_params);
        }
        return ggml_ext_conv_2d(ctx->ggml_ctx,
@ -2298,6 +2355,8 @@ public:
                                dilation.second,
                                dilation.first,
                                ctx->conv2d_direct_enabled,
                                ctx->circular_x_enabled,
                                ctx->circular_y_enabled,
                                scale);
    }
 };
--- a/lora.hpp
+++ b/lora.hpp
@ -599,6 +599,8 @@ struct LoraModel : public GGMLRunner {
                                      forward_params.conv2d.d0,
                                      forward_params.conv2d.d1,
                                      forward_params.conv2d.direct,
                                      forward_params.conv2d.circular_x,
                                      forward_params.conv2d.circular_y,
                                      forward_params.conv2d.scale);
                if (lora_mid) {
                    lx = ggml_ext_conv_2d(ctx,
@ -612,6 +614,8 @@ struct LoraModel : public GGMLRunner {
                                          1,
                                          1,
                                          forward_params.conv2d.direct,
                                          forward_params.conv2d.circular_x,
                                          forward_params.conv2d.circular_y,
                                          forward_params.conv2d.scale);
                }
                lx = ggml_ext_conv_2d(ctx,
@ -625,6 +629,8 @@ struct LoraModel : public GGMLRunner {
                                      1,
                                      1,
                                      forward_params.conv2d.direct,
                                      forward_params.conv2d.circular_x,
                                      forward_params.conv2d.circular_y,
                                      forward_params.conv2d.scale);
            }
@ -779,6 +785,8 @@ public:
                                   forward_params.conv2d.d0,
                                   forward_params.conv2d.d1,
                                   forward_params.conv2d.direct,
                                   forward_params.conv2d.circular_x,
                                   forward_params.conv2d.circular_y,
                                   forward_params.conv2d.scale);
        }
        for (auto& lora_model : lora_models) {
--- a/mmdit.hpp
+++ b/mmdit.hpp
@ -983,4 +983,4 @@ struct MMDiTRunner : public GGMLRunner {
    }
 };
-#endif
+#endif
--- a/model.cpp
+++ b/model.cpp
@ -1340,7 +1340,7 @@ std::string ModelLoader::load_umt5_tokenizer_json() {
    return json_str;
 }
-bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p) {
+bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p, bool enable_mmap) {
    int64_t process_time_ms = 0;
    std::atomic<int64_t> read_time_ms(0);
    std::atomic<int64_t> memcpy_time_ms(0);
@ -1390,6 +1390,15 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
            }
        }
        std::unique_ptr<MmapWrapper> mmapped;
        if (enable_mmap && !is_zip) {
            LOG_DEBUG("using mmap for I/O");
            mmapped = MmapWrapper::create(file_path);
            if (!mmapped) {
                LOG_WARN("failed to memory-map '%s'", file_path.c_str());
            }
        }
        int n_threads = is_zip ? 1 : std::min(num_threads_to_use, (int)file_tensors.size());
        if (n_threads < 1) {
            n_threads = 1;
@ -1411,7 +1420,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
                        failed = true;
                        return;
                    }
-                } else {
+                } else if (!mmapped) {
                    file.open(file_path, std::ios::binary);
                    if (!file.is_open()) {
                        LOG_ERROR("failed to open '%s'", file_path.c_str());
@ -1464,6 +1473,11 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
                                zip_entry_noallocread(zip, (void*)buf, n);
                            }
                            zip_entry_close(zip);
                        } else if (mmapped) {
                            if (!mmapped->copy_data(buf, n, tensor_storage.offset)) {
                                LOG_ERROR("read tensor data failed: '%s'", file_path.c_str());
                                failed = true;
                            }
                        } else {
                            file.seekg(tensor_storage.offset);
                            file.read(buf, n);
@ -1520,6 +1534,11 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
                        i64_to_i32_vec((int64_t*)read_buf, (int32_t*)target_buf, tensor_storage.nelements());
                    }
                    if (tensor_storage.type != dst_tensor->type) {
                        if (convert_buf == nullptr) {
                            LOG_ERROR("read tensor data failed: too less memory for conversion");
                            failed = true;
                            return;
                        }
                        convert_tensor((void*)target_buf,
                                       tensor_storage.type,
                                       convert_buf,
@ -1583,7 +1602,8 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
 bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
                               std::set<std::string> ignore_tensors,
-                               int n_threads) {
+                               int n_threads,
                               bool enable_mmap) {
    std::set<std::string> tensor_names_in_file;
    std::mutex tensor_names_mutex;
    auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
@ -1626,7 +1646,7 @@ bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tenso
        return true;
    };
-    bool success = load_tensors(on_new_tensor_cb, n_threads);
+    bool success = load_tensors(on_new_tensor_cb, n_threads, enable_mmap);
    if (!success) {
        LOG_ERROR("load tensors from file failed");
        return false;
@ -1732,6 +1752,13 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type
        // tensor_storage.ne[0], tensor_storage.ne[1], tensor_storage.ne[2], tensor_storage.ne[3],
        // tensor->n_dims, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
        if (!tensor->data) {
            GGML_ASSERT(ggml_nelements(tensor) == 0);
            // avoid crashing the gguf writer by setting a dummy pointer for zero-sized tensors
            LOG_DEBUG("setting dummy pointer for zero-sized tensor %s", name.c_str());
            tensor->data = ggml_get_mem_buffer(ggml_ctx);
        }
        *dst_tensor = tensor;
        gguf_add_tensor(gguf_ctx, tensor);
@ -1771,7 +1798,12 @@ int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type)
    return mem_size;
 }
-bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type, const char* tensor_type_rules) {
+bool convert(const char* input_path,
             const char* vae_path,
             const char* output_path,
             sd_type_t output_type,
             const char* tensor_type_rules,
             bool convert_name) {
    ModelLoader model_loader;
    if (!model_loader.init_from_file(input_path)) {
@ -1785,7 +1817,9 @@ bool convert(const char* input_path, const char* vae_path, const char* output_pa
            return false;
        }
    }
-    model_loader.convert_tensors_name();
+    if (convert_name) {
        model_loader.convert_tensors_name();
    }
    bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type, tensor_type_rules);
    return success;
 }
--- a/model.h
+++ b/model.h
@ -310,10 +310,11 @@ public:
    std::map<ggml_type, uint32_t> get_vae_wtype_stat();
    String2TensorStorage& get_tensor_storage_map() { return tensor_storage_map; }
    void set_wtype_override(ggml_type wtype, std::string tensor_type_rules = "");
-    bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0);
+    bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0, bool use_mmap = false);
    bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
                      std::set<std::string> ignore_tensors = {},
-                      int n_threads                        = 0);
+                      int n_threads                        = 0,
                      bool use_mmap                        = false);
    std::vector<std::string> get_tensor_names() const {
        std::vector<std::string> names;
--- a/name_conversion.cpp
+++ b/name_conversion.cpp
@ -835,6 +835,7 @@ std::string convert_sep_to_dot(std::string name) {
        "proj_out",
        "transformer_blocks",
        "single_transformer_blocks",
        "single_blocks",
        "diffusion_model",
        "cond_stage_model",
        "first_stage_model",
@ -876,7 +877,18 @@ std::string convert_sep_to_dot(std::string name) {
        "ff_context",
        "norm_added_q",
        "norm_added_v",
-        "to_add_out"};
+        "to_add_out",
        "txt_mod",
        "img_mod",
        "txt_mlp",
        "img_mlp",
        "proj_mlp",
        "wi_0",
        "wi_1",
        "norm1_context",
        "ff_context",
        "x_embedder",
    };
    // record the positions of underscores that should NOT be replaced
    std::unordered_set<size_t> protected_positions;
@ -948,6 +960,7 @@ bool is_first_stage_model_name(const std::string& name) {
 std::string convert_tensor_name(std::string name, SDVersion version) {
    bool is_lora                             = false;
    bool is_lycoris_underline                = false;
    bool is_underline                        = false;
    std::vector<std::string> lora_prefix_vec = {
        "lora.lora.",
        "lora.lora_",
@ -955,12 +968,27 @@ std::string convert_tensor_name(std::string name, SDVersion version) {
        "lora.lycoris.",
        "lora.",
    };
    std::vector<std::string> underline_lora_prefix_vec = {
        "unet_",
        "te_",
        "te1_",
        "te2_",
        "te3_",
        "vae_",
    };
    for (const auto& prefix : lora_prefix_vec) {
        if (starts_with(name, prefix)) {
            is_lora = true;
            name    = name.substr(prefix.size());
            if (contains(prefix, "lycoris_")) {
                is_lycoris_underline = true;
            } else {
                for (const auto& underline_lora_prefix : underline_lora_prefix_vec) {
                    if (starts_with(name, underline_lora_prefix)) {
                        is_underline = true;
                        break;
                    }
                }
            }
            break;
        }
@ -1020,12 +1048,14 @@ std::string convert_tensor_name(std::string name, SDVersion version) {
            }
        }
-        if (sd_version_is_unet(version) || is_lycoris_underline) {
+        // LOG_DEBUG("name %s %d", name.c_str(), version);
        if (sd_version_is_unet(version) || is_underline || is_lycoris_underline) {
            name = convert_sep_to_dot(name);
        }
    }
-    std::vector<std::pair<std::string, std::string>> prefix_map = {
+    std::unordered_map<std::string, std::string> prefix_map = {
        {"diffusion_model.", "model.diffusion_model."},
        {"unet.", "model.diffusion_model."},
        {"transformer.", "model.diffusion_model."},  // dit
@ -1040,8 +1070,13 @@ std::string convert_tensor_name(std::string name, SDVersion version) {
        // {"te2.text_model.encoder.layers.", "cond_stage_model.1.model.transformer.resblocks."},
        {"te2.", "cond_stage_model.1.transformer."},
        {"te1.", "cond_stage_model.transformer."},
        {"te3.", "text_encoders.t5xxl.transformer."},
    };
    if (sd_version_is_flux(version)) {
        prefix_map["te1."] = "text_encoders.clip_l.transformer.";
    }
    replace_with_prefix_map(name, prefix_map);
    // diffusion model
--- a/qwen_image.hpp
+++ b/qwen_image.hpp
@ -191,11 +191,16 @@ namespace Qwen {
    };
    class QwenImageTransformerBlock : public GGMLBlock {
    protected:
        bool zero_cond_t;
    public:
        QwenImageTransformerBlock(int64_t dim,
                                  int64_t num_attention_heads,
                                  int64_t attention_head_dim,
-                                  float eps = 1e-6) {
+                                  float eps        = 1e-6,
                                  bool zero_cond_t = false)
            : zero_cond_t(zero_cond_t) {
            // img_mod.0 is nn.SiLU()
            blocks["img_mod.1"] = std::shared_ptr<GGMLBlock>(new Linear(dim, 6 * dim, true));
@ -220,11 +225,37 @@ namespace Qwen {
                                                                               eps));
        }
        // Splits the modulation parameters into six tensors.
        //
        // Without `index`, mod_params is cut into 6 chunks along dim 0 and returned as-is.
        // With `index`, mod_params is flattened and cut into 12 chunks; chunk i and chunk i + 6
        // are blended element-wise as (1 - index) * chunk[i] + index * chunk[i + 6], which is the
        // arithmetic form of torch.where(index == 0, chunk[i], chunk[i + 6]).
        //
        // index: [N, n_img_token]
        // mod_params: [N, hidden_size * 12]
        std::vector<ggml_tensor*> get_mod_params_vec(ggml_context* ctx, ggml_tensor* mod_params, ggml_tensor* index = nullptr) {
            if (!index) {
                return ggml_ext_chunk(ctx, mod_params, 6, 0);
            }

            auto flat_params = ggml_reshape_1d(ctx, mod_params, ggml_nelements(mod_params));
            auto chunks      = ggml_ext_chunk(ctx, flat_params, 12, 0);

            // [N, n_img_token, 1]
            auto index_3d = ggml_reshape_3d(ctx, index, 1, index->ne[0], index->ne[1]);
            // [N, n_img_token, hidden_size]
            auto mask = ggml_repeat_4d(ctx, index_3d, chunks[0]->ne[0], index_3d->ne[1], index_3d->ne[2], index_3d->ne[3]);

            std::vector<ggml_tensor*> blended;
            for (int k = 0; k != 6; ++k) {
                ggml_tensor* when_zero = chunks[k];
                ggml_tensor* when_one  = chunks[k + 6];

                // (1 - index) * when_zero, written as repeat(when_zero) - index * when_zero
                auto zero_full   = ggml_repeat(ctx, when_zero, mask);
                auto zero_masked = ggml_mul(ctx, mask, when_zero);
                auto zero_part   = ggml_sub(ctx, zero_full, zero_masked);  // [N, n_img_token, hidden_size]

                // index * when_one
                auto one_part = ggml_mul(ctx, mask, when_one);  // [N, n_img_token, hidden_size]

                blended.push_back(ggml_add(ctx, zero_part, one_part));
            }
            return blended;
        }
        virtual std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
                                                              struct ggml_tensor* img,
                                                              struct ggml_tensor* txt,
                                                              struct ggml_tensor* t_emb,
-                                                              struct ggml_tensor* pe) {
+                                                              struct ggml_tensor* pe,
                                                              struct ggml_tensor* modulate_index = nullptr) {
            // img: [N, n_img_token, hidden_size]
            // txt: [N, n_txt_token, hidden_size]
            // pe: [n_img_token + n_txt_token, d_head/2, 2, 2]
@ -244,14 +275,18 @@ namespace Qwen {
            auto img_mod_params    = ggml_silu(ctx->ggml_ctx, t_emb);
            img_mod_params         = img_mod_1->forward(ctx, img_mod_params);
-            auto img_mod_param_vec = ggml_ext_chunk(ctx->ggml_ctx, img_mod_params, 6, 0);
+            auto img_mod_param_vec = get_mod_params_vec(ctx->ggml_ctx, img_mod_params, modulate_index);
            if (zero_cond_t) {
                t_emb = ggml_ext_chunk(ctx->ggml_ctx, t_emb, 2, 1)[0];
            }
            auto txt_mod_params    = ggml_silu(ctx->ggml_ctx, t_emb);
            txt_mod_params         = txt_mod_1->forward(ctx, txt_mod_params);
-            auto txt_mod_param_vec = ggml_ext_chunk(ctx->ggml_ctx, txt_mod_params, 6, 0);
+            auto txt_mod_param_vec = get_mod_params_vec(ctx->ggml_ctx, txt_mod_params);
            auto img_normed    = img_norm1->forward(ctx, img);
-            auto img_modulated = Flux::modulate(ctx->ggml_ctx, img_normed, img_mod_param_vec[0], img_mod_param_vec[1]);
+            auto img_modulated = Flux::modulate(ctx->ggml_ctx, img_normed, img_mod_param_vec[0], img_mod_param_vec[1], modulate_index != nullptr);
            auto img_gate1     = img_mod_param_vec[2];
            auto txt_normed    = txt_norm1->forward(ctx, txt);
@ -264,7 +299,7 @@ namespace Qwen {
            txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_attn_output, txt_gate1));
            auto img_normed2    = img_norm2->forward(ctx, img);
-            auto img_modulated2 = Flux::modulate(ctx->ggml_ctx, img_normed2, img_mod_param_vec[3], img_mod_param_vec[4]);
+            auto img_modulated2 = Flux::modulate(ctx->ggml_ctx, img_normed2, img_mod_param_vec[3], img_mod_param_vec[4], modulate_index != nullptr);
            auto img_gate2      = img_mod_param_vec[5];
            auto txt_normed2    = txt_norm2->forward(ctx, txt);
@ -325,6 +360,7 @@ namespace Qwen {
        float theta                 = 10000;
        std::vector<int> axes_dim   = {16, 56, 56};
        int64_t axes_dim_sum        = 128;
        bool zero_cond_t            = false;
    };
    class QwenImageModel : public GGMLBlock {
@ -346,7 +382,8 @@ namespace Qwen {
                auto block                                        = std::shared_ptr<GGMLBlock>(new QwenImageTransformerBlock(inner_dim,
                                                                                                                             params.num_attention_heads,
                                                                                                                             params.attention_head_dim,
-                                                                                                                             1e-6f));
+                                                                                                                             1e-6f,
                                                                                                                             params.zero_cond_t));
                blocks["transformer_blocks." + std::to_string(i)] = block;
            }
@ -354,14 +391,14 @@ namespace Qwen {
            blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, params.patch_size * params.patch_size * params.out_channels));
        }
-        struct ggml_tensor* pad_to_patch_size(struct ggml_context* ctx,
+        struct ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx,
                                              struct ggml_tensor* x) {
            // Pads x by pad_w along ne[0] (W) and pad_h along ne[1] (H), where each amount is what
            // is missing to reach the next multiple of params.patch_size.
            int64_t W = x->ne[0];
            int64_t H = x->ne[1];
            // The outer modulo yields 0 (instead of patch_size) when the size is already a multiple.
            int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size;
            int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size;
            // The new call also forwards the runner's circular_x/circular_y flags to ggml_ext_pad.
            // NOTE(review): presumably these select wrap-around padding per axis — confirm in ggml_ext_pad.
-            x         = ggml_pad(ctx, x, pad_w, pad_h, 0, 0);  // [N, C, H + pad_h, W + pad_w]
+            x         = ggml_ext_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
            return x;
        }
@ -387,10 +424,10 @@ namespace Qwen {
            return x;
        }
-        struct ggml_tensor* process_img(struct ggml_context* ctx,
+        struct ggml_tensor* process_img(GGMLRunnerContext* ctx,
                                        struct ggml_tensor* x) {
            // Two-step preprocessing: pad x to the patch size, then patchify it.
            // pad_to_patch_size takes the full runner context (it reads the circular flags),
            // while patchify only needs the underlying ggml context.
            x = pad_to_patch_size(ctx, x);
-            x = patchify(ctx, x);
+            x = patchify(ctx->ggml_ctx, x);
            return x;
        }
@ -421,7 +458,8 @@ namespace Qwen {
                                         struct ggml_tensor* x,
                                         struct ggml_tensor* timestep,
                                         struct ggml_tensor* context,
-                                         struct ggml_tensor* pe) {
+                                         struct ggml_tensor* pe,
                                         struct ggml_tensor* modulate_index = nullptr) {
            auto time_text_embed = std::dynamic_pointer_cast<QwenTimestepProjEmbeddings>(blocks["time_text_embed"]);
            auto txt_norm        = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm"]);
            auto img_in          = std::dynamic_pointer_cast<Linear>(blocks["img_in"]);
@ -430,18 +468,26 @@ namespace Qwen {
            auto proj_out        = std::dynamic_pointer_cast<Linear>(blocks["proj_out"]);
            auto t_emb = time_text_embed->forward(ctx, timestep);
-            auto img   = img_in->forward(ctx, x);
+            if (params.zero_cond_t) {
-            auto txt   = txt_norm->forward(ctx, context);
+                auto t_emb_0 = time_text_embed->forward(ctx, ggml_ext_zeros(ctx->ggml_ctx, timestep->ne[0], timestep->ne[1], timestep->ne[2], timestep->ne[3]));
-            txt        = txt_in->forward(ctx, txt);
+                t_emb        = ggml_concat(ctx->ggml_ctx, t_emb, t_emb_0, 1);
            }
            auto img = img_in->forward(ctx, x);
            auto txt = txt_norm->forward(ctx, context);
            txt      = txt_in->forward(ctx, txt);
            for (int i = 0; i < params.num_layers; i++) {
                auto block = std::dynamic_pointer_cast<QwenImageTransformerBlock>(blocks["transformer_blocks." + std::to_string(i)]);
-                auto result = block->forward(ctx, img, txt, t_emb, pe);
+                auto result = block->forward(ctx, img, txt, t_emb, pe, modulate_index);
                img         = result.first;
                txt         = result.second;
            }
            if (params.zero_cond_t) {
                t_emb = ggml_ext_chunk(ctx->ggml_ctx, t_emb, 2, 1)[0];
            }
            img = norm_out->forward(ctx, img, t_emb);
            img = proj_out->forward(ctx, img);
@ -453,7 +499,8 @@ namespace Qwen {
                                    struct ggml_tensor* timestep,
                                    struct ggml_tensor* context,
                                    struct ggml_tensor* pe,
-                                    std::vector<ggml_tensor*> ref_latents = {}) {
+                                    std::vector<ggml_tensor*> ref_latents = {},
                                    struct ggml_tensor* modulate_index    = nullptr) {
            // Forward pass of DiT.
            // x: [N, C, H, W]
            // timestep: [N,]
@ -466,12 +513,12 @@ namespace Qwen {
            int64_t C = x->ne[2];
            int64_t N = x->ne[3];
-            auto img            = process_img(ctx->ggml_ctx, x);
+            auto img            = process_img(ctx, x);
            uint64_t img_tokens = img->ne[1];
            if (ref_latents.size() > 0) {
                for (ggml_tensor* ref : ref_latents) {
-                    ref = process_img(ctx->ggml_ctx, ref);
+                    ref = process_img(ctx, ref);
                    img = ggml_concat(ctx->ggml_ctx, img, ref, 1);
                }
            }
@ -479,7 +526,7 @@ namespace Qwen {
            int64_t h_len = ((H + (params.patch_size / 2)) / params.patch_size);
            int64_t w_len = ((W + (params.patch_size / 2)) / params.patch_size);
-            auto out = forward_orig(ctx, img, timestep, context, pe);  // [N, h_len*w_len, ph*pw*C]
+            auto out = forward_orig(ctx, img, timestep, context, pe, modulate_index);  // [N, h_len*w_len, ph*pw*C]
            if (out->ne[1] > img_tokens) {
                out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, out, 0, 2, 1, 3));  // [num_tokens, N, C * patch_size * patch_size]
@ -502,19 +549,25 @@ namespace Qwen {
        QwenImageParams qwen_image_params;
        QwenImageModel qwen_image;
        std::vector<float> pe_vec;
        std::vector<float> modulate_index_vec;
        SDVersion version;
        QwenImageRunner(ggml_backend_t backend,
                        bool offload_params_to_cpu,
                        const String2TensorStorage& tensor_storage_map = {},
                        const std::string prefix                       = "",
-                        SDVersion version                              = VERSION_QWEN_IMAGE)
+                        SDVersion version                              = VERSION_QWEN_IMAGE,
                        bool zero_cond_t                               = false)
            : GGMLRunner(backend, offload_params_to_cpu) {
-            qwen_image_params.num_layers = 0;
+            qwen_image_params.num_layers  = 0;
            qwen_image_params.zero_cond_t = zero_cond_t;
            for (auto pair : tensor_storage_map) {
                std::string tensor_name = pair.first;
                if (tensor_name.find(prefix) == std::string::npos)
                    continue;
                if (tensor_name.find("__index_timestep_zero__") != std::string::npos) {
                    qwen_image_params.zero_cond_t = true;
                }
                size_t pos = tensor_name.find("transformer_blocks.");
                if (pos != std::string::npos) {
                    tensor_name = tensor_name.substr(pos);  // remove prefix
@ -529,6 +582,9 @@ namespace Qwen {
                }
            }
            LOG_INFO("qwen_image_params.num_layers: %ld", qwen_image_params.num_layers);
            if (qwen_image_params.zero_cond_t) {
                LOG_INFO("use zero_cond_t");
            }
            qwen_image = QwenImageModel(qwen_image_params);
            qwen_image.init(params_ctx, tensor_storage_map, prefix);
        }
@ -565,6 +621,8 @@ namespace Qwen {
                                                  ref_latents,
                                                  increase_ref_index,
                                                  qwen_image_params.theta,
                                                  circular_y_enabled,
                                                  circular_x_enabled,
                                                  qwen_image_params.axes_dim);
            int pos_len = pe_vec.size() / qwen_image_params.axes_dim_sum / 2;
            // LOG_DEBUG("pos_len %d", pos_len);
@ -574,6 +632,31 @@ namespace Qwen {
            // pe->data = nullptr;
            set_backend_tensor_data(pe, pe_vec.data());
            ggml_tensor* modulate_index = nullptr;
            if (qwen_image_params.zero_cond_t) {
                modulate_index_vec.clear();
                int64_t h_len          = ((x->ne[1] + (qwen_image_params.patch_size / 2)) / qwen_image_params.patch_size);
                int64_t w_len          = ((x->ne[0] + (qwen_image_params.patch_size / 2)) / qwen_image_params.patch_size);
                int64_t num_img_tokens = h_len * w_len;
                modulate_index_vec.insert(modulate_index_vec.end(), num_img_tokens, 0.f);
                int64_t num_ref_img_tokens = 0;
                for (ggml_tensor* ref : ref_latents) {
                    int64_t h_len = ((ref->ne[1] + (qwen_image_params.patch_size / 2)) / qwen_image_params.patch_size);
                    int64_t w_len = ((ref->ne[0] + (qwen_image_params.patch_size / 2)) / qwen_image_params.patch_size);
                    num_ref_img_tokens += h_len * w_len;
                }
                if (num_ref_img_tokens > 0) {
                    modulate_index_vec.insert(modulate_index_vec.end(), num_ref_img_tokens, 1.f);
                }
                modulate_index = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_F32, modulate_index_vec.size());
                set_backend_tensor_data(modulate_index, modulate_index_vec.data());
            }
            auto runner_ctx = get_context();
            struct ggml_tensor* out = qwen_image.forward(&runner_ctx,
@ -581,7 +664,8 @@ namespace Qwen {
                                                         timesteps,
                                                         context,
                                                         pe,
-                                                         ref_latents);
+                                                         ref_latents,
                                                         modulate_index);
            ggml_build_forward_expand(gf, out);
@ -684,4 +768,4 @@ namespace Qwen {
 }  // namespace name
-#endif  // __QWEN_IMAGE_HPP__
+#endif  // __QWEN_IMAGE_HPP__
--- a/rope.hpp
+++ b/rope.hpp
@ -1,6 +1,8 @@
 #ifndef __ROPE_HPP__
 #define __ROPE_HPP__
 #include <algorithm>
 #include <cmath>
 #include <vector>
 #include "ggml_extend.hpp"
@ -39,7 +41,10 @@ namespace Rope {
        return flat_vec;
    }
-    __STATIC_INLINE__ std::vector<std::vector<float>> rope(const std::vector<float>& pos, int dim, int theta) {
+    __STATIC_INLINE__ std::vector<std::vector<float>> rope(const std::vector<float>& pos,
                                                           int dim,
                                                           int theta,
                                                           const std::vector<int>& axis_wrap_dims = {}) {
        assert(dim % 2 == 0);
        int half_dim = dim / 2;
@ -47,14 +52,31 @@ namespace Rope {
        std::vector<float> omega(half_dim);
        for (int i = 0; i < half_dim; ++i) {
-            omega[i] = 1.0 / std::pow(theta, scale[i]);
+            omega[i] = 1.0f / std::pow(theta, scale[i]);
        }
        int pos_size = pos.size();
        std::vector<std::vector<float>> out(pos_size, std::vector<float>(half_dim));
        for (int i = 0; i < pos_size; ++i) {
            for (int j = 0; j < half_dim; ++j) {
-                out[i][j] = pos[i] * omega[j];
+                float angle = pos[i] * omega[j];
                if (!axis_wrap_dims.empty()) {
                    size_t wrap_size = axis_wrap_dims.size();
                    // mod batch size since we only store this for one item in the batch
                    size_t wrap_idx = wrap_size > 0 ? (i % wrap_size) : 0;
                    int wrap_dim    = axis_wrap_dims[wrap_idx];
                    if (wrap_dim > 0) {
                        constexpr float TWO_PI = 6.28318530717958647692f;
                        float cycles           = omega[j] * wrap_dim / TWO_PI;
                        // closest periodic harmonic, necessary to ensure things neatly tile
                        // without this round, things don't tile at the boundaries and you end up
                        // with the model knowing what is "center"
                        float rounded = std::round(cycles);
                        angle         = pos[i] * TWO_PI * rounded / wrap_dim;
                    }
                }
                out[i][j] = angle;
            }
        }
@ -89,16 +111,25 @@ namespace Rope {
                                                                       int patch_size,
                                                                       int bs,
                                                                       int axes_dim_num,
-                                                                       int index    = 0,
+                                                                       int index       = 0,
-                                                                       int h_offset = 0,
+                                                                       int h_offset    = 0,
-                                                                       int w_offset = 0) {
+                                                                       int w_offset    = 0,
                                                                       bool scale_rope = false) {
        int h_len = (h + (patch_size / 2)) / patch_size;
        int w_len = (w + (patch_size / 2)) / patch_size;
        std::vector<std::vector<float>> img_ids(h_len * w_len, std::vector<float>(axes_dim_num, 0.0));
-        std::vector<float> row_ids = linspace<float>(h_offset, h_len - 1 + h_offset, h_len);
+        int h_start = h_offset;
-        std::vector<float> col_ids = linspace<float>(w_offset, w_len - 1 + w_offset, w_len);
+        int w_start = w_offset;
        if (scale_rope) {
            h_start -= h_len / 2;
            w_start -= w_len / 2;
        }
        std::vector<float> row_ids = linspace<float>(h_start, h_start + h_len - 1, h_len);
        std::vector<float> col_ids = linspace<float>(w_start, w_start + w_len - 1, w_len);
        for (int i = 0; i < h_len; ++i) {
            for (int j = 0; j < w_len; ++j) {
@ -137,7 +168,8 @@ namespace Rope {
    __STATIC_INLINE__ std::vector<float> embed_nd(const std::vector<std::vector<float>>& ids,
                                                  int bs,
                                                  int theta,
-                                                  const std::vector<int>& axes_dim) {
+                                                  const std::vector<int>& axes_dim,
                                                  const std::vector<std::vector<int>>& wrap_dims = {}) {
        std::vector<std::vector<float>> trans_ids = transpose(ids);
        size_t pos_len                            = ids.size() / bs;
        int num_axes                              = axes_dim.size();
@ -152,7 +184,12 @@ namespace Rope {
        std::vector<std::vector<float>> emb(bs * pos_len, std::vector<float>(emb_dim * 2 * 2, 0.0));
        int offset = 0;
        for (int i = 0; i < num_axes; ++i) {
-            std::vector<std::vector<float>> rope_emb = rope(trans_ids[i], axes_dim[i], theta);  // [bs*pos_len, axes_dim[i]/2 * 2 * 2]
+            std::vector<int> axis_wrap_dims;
            if (!wrap_dims.empty() && i < (int)wrap_dims.size()) {
                axis_wrap_dims = wrap_dims[i];
            }
            std::vector<std::vector<float>> rope_emb =
                rope(trans_ids[i], axes_dim[i], theta, axis_wrap_dims);  // [bs*pos_len, axes_dim[i]/2 * 2 * 2]
            for (int b = 0; b < bs; ++b) {
                for (int j = 0; j < pos_len; ++j) {
                    for (int k = 0; k < rope_emb[0].size(); ++k) {
@ -171,7 +208,8 @@ namespace Rope {
                                                                   int axes_dim_num,
                                                                   const std::vector<ggml_tensor*>& ref_latents,
                                                                   bool increase_ref_index,
-                                                                   float ref_index_scale) {
+                                                                   float ref_index_scale,
                                                                   bool scale_rope) {
        std::vector<std::vector<float>> ids;
        uint64_t curr_h_offset = 0;
        uint64_t curr_w_offset = 0;
@ -185,6 +223,7 @@ namespace Rope {
                } else {
                    h_offset = curr_h_offset;
                }
                scale_rope = false;
            }
            auto ref_ids = gen_flux_img_ids(ref->ne[1],
@ -194,7 +233,8 @@ namespace Rope {
                                            axes_dim_num,
                                            static_cast<int>(index * ref_index_scale),
                                            h_offset,
-                                            w_offset);
+                                            w_offset,
                                            scale_rope);
            ids          = concat_ids(ids, ref_ids, bs);
            if (increase_ref_index) {
@ -222,7 +262,7 @@ namespace Rope {
        auto ids = concat_ids(txt_ids, img_ids, bs);
        if (ref_latents.size() > 0) {
-            auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, ref_latents, increase_ref_index, ref_index_scale);
+            auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, ref_latents, increase_ref_index, ref_index_scale, false);
            ids           = concat_ids(ids, refs_ids, bs);
        }
        return ids;
@ -239,6 +279,8 @@ namespace Rope {
                                                     bool increase_ref_index,
                                                     float ref_index_scale,
                                                     int theta,
                                                     bool circular_h,
                                                     bool circular_w,
                                                     const std::vector<int>& axes_dim) {
        std::vector<std::vector<float>> ids = gen_flux_ids(h,
                                                           w,
@ -250,7 +292,47 @@ namespace Rope {
                                                           ref_latents,
                                                           increase_ref_index,
                                                           ref_index_scale);
-        return embed_nd(ids, bs, theta, axes_dim);
+        std::vector<std::vector<int>> wrap_dims;
        if ((circular_h || circular_w) && bs > 0 && axes_dim.size() >= 3) {
            int h_len = (h + (patch_size / 2)) / patch_size;
            int w_len = (w + (patch_size / 2)) / patch_size;
            if (h_len > 0 && w_len > 0) {
                size_t pos_len = ids.size() / bs;
                wrap_dims.assign(axes_dim.size(), std::vector<int>(pos_len, 0));
                size_t cursor           = context_len;  // text first
                const size_t img_tokens = static_cast<size_t>(h_len) * static_cast<size_t>(w_len);
                for (size_t token_i = 0; token_i < img_tokens; ++token_i) {
                    if (circular_h) {
                        wrap_dims[1][cursor + token_i] = h_len;
                    }
                    if (circular_w) {
                        wrap_dims[2][cursor + token_i] = w_len;
                    }
                }
                cursor += img_tokens;
                // reference latents
                for (ggml_tensor* ref : ref_latents) {
                    if (ref == nullptr) {
                        continue;
                    }
                    int ref_h         = static_cast<int>(ref->ne[1]);
                    int ref_w         = static_cast<int>(ref->ne[0]);
                    int ref_h_l       = (ref_h + (patch_size / 2)) / patch_size;
                    int ref_w_l       = (ref_w + (patch_size / 2)) / patch_size;
                    size_t ref_tokens = static_cast<size_t>(ref_h_l) * static_cast<size_t>(ref_w_l);
                    for (size_t token_i = 0; token_i < ref_tokens; ++token_i) {
                        if (circular_h) {
                            wrap_dims[1][cursor + token_i] = ref_h_l;
                        }
                        if (circular_w) {
                            wrap_dims[2][cursor + token_i] = ref_w_l;
                        }
                    }
                    cursor += ref_tokens;
                }
            }
        }
        return embed_nd(ids, bs, theta, axes_dim, wrap_dims);
    }
    __STATIC_INLINE__ std::vector<std::vector<float>> gen_qwen_image_ids(int h,
@ -271,10 +353,10 @@ namespace Rope {
            }
        }
        int axes_dim_num = 3;
-        auto img_ids     = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num);
+        auto img_ids     = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num, 0, 0, 0, true);
        auto ids         = concat_ids(txt_ids_repeated, img_ids, bs);
        if (ref_latents.size() > 0) {
-            auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, ref_latents, increase_ref_index, 1.f);
+            auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, ref_latents, increase_ref_index, 1.f, true);
            ids           = concat_ids(ids, refs_ids, bs);
        }
        return ids;
@ -289,9 +371,57 @@ namespace Rope {
                                                           const std::vector<ggml_tensor*>& ref_latents,
                                                           bool increase_ref_index,
                                                           int theta,
                                                           bool circular_h,
                                                           bool circular_w,
                                                           const std::vector<int>& axes_dim) {
        std::vector<std::vector<float>> ids = gen_qwen_image_ids(h, w, patch_size, bs, context_len, ref_latents, increase_ref_index);
-        return embed_nd(ids, bs, theta, axes_dim);
+        std::vector<std::vector<int>> wrap_dims;
        // This logic simply stores the (pad and patch_adjusted) sizes of images so we can make sure rope correctly tiles
        if ((circular_h || circular_w) && bs > 0 && axes_dim.size() >= 3) {
            int pad_h = (patch_size - (h % patch_size)) % patch_size;
            int pad_w = (patch_size - (w % patch_size)) % patch_size;
            int h_len = (h + pad_h) / patch_size;
            int w_len = (w + pad_w) / patch_size;
            if (h_len > 0 && w_len > 0) {
                const size_t total_tokens = ids.size();
                // Track per-token wrap lengths for the row/column axes so only spatial tokens become periodic.
                wrap_dims.assign(axes_dim.size(), std::vector<int>(total_tokens / bs, 0));
                size_t cursor           = context_len;  // ignore text tokens
                const size_t img_tokens = static_cast<size_t>(h_len) * static_cast<size_t>(w_len);
                for (size_t token_i = 0; token_i < img_tokens; ++token_i) {
                    if (circular_h) {
                        wrap_dims[1][cursor + token_i] = h_len;
                    }
                    if (circular_w) {
                        wrap_dims[2][cursor + token_i] = w_len;
                    }
                }
                cursor += img_tokens;
                // For each reference image, store wrap sizes as well
                for (ggml_tensor* ref : ref_latents) {
                    if (ref == nullptr) {
                        continue;
                    }
                    int ref_h           = static_cast<int>(ref->ne[1]);
                    int ref_w           = static_cast<int>(ref->ne[0]);
                    int ref_pad_h       = (patch_size - (ref_h % patch_size)) % patch_size;
                    int ref_pad_w       = (patch_size - (ref_w % patch_size)) % patch_size;
                    int ref_h_len       = (ref_h + ref_pad_h) / patch_size;
                    int ref_w_len       = (ref_w + ref_pad_w) / patch_size;
                    size_t ref_n_tokens = static_cast<size_t>(ref_h_len) * static_cast<size_t>(ref_w_len);
                    for (size_t token_i = 0; token_i < ref_n_tokens; ++token_i) {
                        if (circular_h) {
                            wrap_dims[1][cursor + token_i] = ref_h_len;
                        }
                        if (circular_w) {
                            wrap_dims[2][cursor + token_i] = ref_w_len;
                        }
                    }
                    cursor += ref_n_tokens;
                }
            }
        }
        return embed_nd(ids, bs, theta, axes_dim, wrap_dims);
    }
    __STATIC_INLINE__ std::vector<std::vector<float>> gen_vid_ids(int t,
@ -428,9 +558,33 @@ namespace Rope {
                                                        const std::vector<ggml_tensor*>& ref_latents,
                                                        bool increase_ref_index,
                                                        int theta,
                                                        bool circular_h,
                                                        bool circular_w,
                                                        const std::vector<int>& axes_dim) {
        std::vector<std::vector<float>> ids = gen_z_image_ids(h, w, patch_size, bs, context_len, seq_multi_of, ref_latents, increase_ref_index);
-        return embed_nd(ids, bs, theta, axes_dim);
+        std::vector<std::vector<int>> wrap_dims;
        if ((circular_h || circular_w) && bs > 0 && axes_dim.size() >= 3) {
            int pad_h = (patch_size - (h % patch_size)) % patch_size;
            int pad_w = (patch_size - (w % patch_size)) % patch_size;
            int h_len = (h + pad_h) / patch_size;
            int w_len = (w + pad_w) / patch_size;
            if (h_len > 0 && w_len > 0) {
                size_t pos_len = ids.size() / bs;
                wrap_dims.assign(axes_dim.size(), std::vector<int>(pos_len, 0));
                size_t cursor     = context_len + bound_mod(context_len, seq_multi_of);  // skip text (and its padding)
                size_t img_tokens = static_cast<size_t>(h_len) * static_cast<size_t>(w_len);
                for (size_t token_i = 0; token_i < img_tokens; ++token_i) {
                    if (circular_h) {
                        wrap_dims[1][cursor + token_i] = h_len;
                    }
                    if (circular_w) {
                        wrap_dims[2][cursor + token_i] = w_len;
                    }
                }
            }
        }
        return embed_nd(ids, bs, theta, axes_dim, wrap_dims);
    }
    __STATIC_INLINE__ struct ggml_tensor* apply_rope(struct ggml_context* ctx,
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@ -60,6 +60,7 @@ enum scheduler_t {
    SGM_UNIFORM_SCHEDULER,
    SIMPLE_SCHEDULER,
    SMOOTHSTEP_SCHEDULER,
    KL_OPTIMAL_SCHEDULER,
    LCM_SCHEDULER,
    SCHEDULER_COUNT
 };
@ -168,7 +169,6 @@ typedef struct {
    const char* vae_path;
    const char* taesd_path;
    const char* control_net_path;
    const char* lora_model_dir;
    const sd_embedding_t* embeddings;
    uint32_t embedding_count;
    const char* photo_maker_path;
@ -182,6 +182,7 @@ typedef struct {
    enum prediction_t prediction;
    enum lora_apply_mode_t lora_apply_mode;
    bool offload_params_to_cpu;
    bool enable_mmap;
    bool keep_clip_on_cpu;
    bool keep_control_net_on_cpu;
    bool keep_vae_on_cpu;
@ -189,10 +190,13 @@ typedef struct {
    bool tae_preview_only;
    bool diffusion_conv_direct;
    bool vae_conv_direct;
    bool circular_x;
    bool circular_y;
    bool force_sdxl_vae_conv_scale;
    bool chroma_use_dit_mask;
    bool chroma_use_t5_mask;
    int chroma_t5_mask_pad;
    bool qwen_image_zero_cond_t;
    float flow_shift;
 } sd_ctx_params_t;
@ -225,6 +229,8 @@ typedef struct {
    int sample_steps;
    float eta;
    int shifted_timestep;
    float* custom_sigmas;
    int custom_sigmas_count;
 } sd_sample_params_t;
 typedef struct {
@ -234,12 +240,34 @@ typedef struct {
    float style_strength;
 } sd_pm_params_t;  // photo maker
 // Cache mode selector; this is the type of the `mode` field in sd_cache_params_t.
 // SD_CACHE_DISABLED is explicitly 0, the remaining enumerators take the following
 // consecutive values, so their order is part of the public ABI.
 // NOTE(review): the enumerator names presumably map to the individual caching
 // strategies (EasyCache, UCache, DBCache, TaylorSeer, Cache-DiT) - confirm against the implementation.
 enum sd_cache_mode_t {
    SD_CACHE_DISABLED = 0,
    SD_CACHE_EASYCACHE,
    SD_CACHE_UCACHE,
    SD_CACHE_DBCACHE,
    SD_CACHE_TAYLORSEER,
    SD_CACHE_CACHE_DIT,
 };
 typedef struct {
-    bool enabled;
+    enum sd_cache_mode_t mode;
    float reuse_threshold;
    float start_percent;
    float end_percent;
-} sd_easycache_params_t;
+    float error_decay_rate;
    bool use_relative_threshold;
    bool reset_error_on_compute;
    int Fn_compute_blocks;
    int Bn_compute_blocks;
    float residual_diff_threshold;
    int max_warmup_steps;
    int max_cached_steps;
    int max_continuous_cached_steps;
    int taylorseer_n_derivatives;
    int taylorseer_skip_interval;
    const char* scm_mask;
    bool scm_policy_dynamic;
 } sd_cache_params_t;
 typedef struct {
    bool is_high_noise;
@ -269,7 +297,7 @@ typedef struct {
    float control_strength;
    sd_pm_params_t pm_params;
    sd_tiling_params_t vae_tiling_params;
-    sd_easycache_params_t easycache;
+    sd_cache_params_t cache;
 } sd_img_gen_params_t;
 typedef struct {
@ -291,7 +319,7 @@ typedef struct {
    int64_t seed;
    int video_frames;
    float vace_strength;
-    sd_easycache_params_t easycache;
+    sd_cache_params_t cache;
 } sd_vid_gen_params_t;
 typedef struct sd_ctx_t sd_ctx_t;
@ -321,7 +349,7 @@ SD_API enum preview_t str_to_preview(const char* str);
 SD_API const char* sd_lora_apply_mode_name(enum lora_apply_mode_t mode);
 SD_API enum lora_apply_mode_t str_to_lora_apply_mode(const char* str);
-SD_API void sd_easycache_params_init(sd_easycache_params_t* easycache_params);
+SD_API void sd_cache_params_init(sd_cache_params_t* cache_params);
 SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params);
 SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);
@ -333,7 +361,7 @@ SD_API void sd_sample_params_init(sd_sample_params_t* sample_params);
 SD_API char* sd_sample_params_to_str(const sd_sample_params_t* sample_params);
 SD_API enum sample_method_t sd_get_default_sample_method(const sd_ctx_t* sd_ctx);
-SD_API enum scheduler_t sd_get_default_scheduler(const sd_ctx_t* sd_ctx);
+SD_API enum scheduler_t sd_get_default_scheduler(const sd_ctx_t* sd_ctx, enum sample_method_t sample_method);
 SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params);
 SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params);
@ -361,7 +389,8 @@ SD_API bool convert(const char* input_path,
                    const char* vae_path,
                    const char* output_path,
                    enum sd_type_t output_type,
-                    const char* tensor_type_rules);
+                    const char* tensor_type_rules,
                    bool convert_name);
 SD_API bool preprocess_canny(sd_image_t image,
                             float high_threshold,
--- a/tae.hpp
+++ b/tae.hpp
@ -162,6 +162,311 @@ public:
    }
 };
 // Pooling block: when stride != 1, groups of `stride` consecutive ne[3] slices are
 // folded into the ne[2] axis, then a 1x1 convolution maps channels * stride -> channels.
 class TPool : public UnaryBlock {
    int stride;
 public:
    TPool(int channels, int stride)
        : stride(stride) {
        // 1x1 kernel, no padding; the trailing `false` is presumably the bias switch - confirm against Conv2d
        std::shared_ptr<GGMLBlock> merge_conv(new Conv2d(channels * stride, channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, false));
        blocks["conv"] = merge_conv;
    }
    // x is read as a 4-d tensor; returns the convolution output.
    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
        auto merge_conv = std::dynamic_pointer_cast<UnaryBlock>(blocks["conv"]);
        if (stride == 1) {
            // nothing to fold, convolve the input as-is
            return merge_conv->forward(ctx, x);
        }
        // ne[2] grows by `stride`, ne[3] shrinks by `stride` (element count unchanged when ne[3] % stride == 0)
        auto folded = ggml_reshape_4d(ctx->ggml_ctx, x, x->ne[0], x->ne[1], x->ne[2] * stride, x->ne[3] / stride);
        return merge_conv->forward(ctx, folded);
    }
 };
 class TGrow : public UnaryBlock {
    int stride;
 public:
    TGrow(int channels, int stride)
        : stride(stride) {
        blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels * stride, {1, 1}, {1, 1}, {0, 0}, {1, 1}, false));
    }
    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
        auto conv = std::dynamic_pointer_cast<UnaryBlock>(blocks["conv"]);
        auto h    = conv->forward(ctx, x);
        if (stride != 1) {
            h = ggml_reshape_4d(ctx->ggml_ctx, h, h->ne[0], h->ne[1], h->ne[2] / stride, h->ne[3] * stride);
        }
        return h;
    }
 };
 // Residual block with a second ("past") input: x and past are concatenated along ne[2],
 // run through three 3x3 convolutions with ReLU between them, added to a skip path and
 // passed through a final ReLU.
 class MemBlock : public GGMLBlock {
    bool has_skip_conv = false;
 public:
    MemBlock(int channels, int out_channels)
        : has_skip_conv(channels != out_channels) {
        // first conv sees x and past stacked, hence channels * 2 inputs
        std::shared_ptr<GGMLBlock> first(new Conv2d(channels * 2, out_channels, {3, 3}, {1, 1}, {1, 1}));
        std::shared_ptr<GGMLBlock> second(new Conv2d(out_channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
        std::shared_ptr<GGMLBlock> third(new Conv2d(out_channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
        blocks["conv.0"] = first;
        blocks["conv.2"] = second;
        blocks["conv.4"] = third;
        if (has_skip_conv) {
            // 1x1 projection so the skip path matches out_channels
            std::shared_ptr<GGMLBlock> projection(new Conv2d(channels, out_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, false));
            blocks["skip"] = projection;
        }
    }
    // x:    current input, [n, channels, h, w]
    // past: second input concatenated with x along ne[2]
    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, struct ggml_tensor* past) {
        auto gctx   = ctx->ggml_ctx;
        auto first  = std::dynamic_pointer_cast<Conv2d>(blocks["conv.0"]);
        auto second = std::dynamic_pointer_cast<Conv2d>(blocks["conv.2"]);
        auto third  = std::dynamic_pointer_cast<Conv2d>(blocks["conv.4"]);
        // main path: concat -> conv -> relu -> conv -> relu -> conv
        auto stacked   = ggml_concat(gctx, x, past, 2);
        auto activated = ggml_relu_inplace(gctx, first->forward(ctx, stacked));
        activated      = ggml_relu_inplace(gctx, second->forward(ctx, activated));
        auto body      = third->forward(ctx, activated);
        // skip path: identity, or the 1x1 projection when the channel counts differ
        struct ggml_tensor* residual = x;
        if (has_skip_conv) {
            residual = std::dynamic_pointer_cast<Conv2d>(blocks["skip"])->forward(ctx, x);
        }
        return ggml_relu_inplace(gctx, ggml_add_inplace(gctx, body, residual));
    }
 };
 // Folds every patch_size x patch_size spatial patch of x into the ne[2] axis.
 //
 // ctx:        ggml context the new graph nodes are created in
 // x:          input tensor, layout documented below
 // patch_size: edge length of the square patch; 1 returns x untouched
 // b:          currently unused by the body; kept so the signature mirrors unpatchify()
 //
 // Declared `inline` because this is a function definition living in a header:
 // without it, including the header from a second translation unit would produce
 // a multiple-definition (ODR) error at link time.
 //
 // NOTE(review): W and H are divided by patch_size with integer division; the
 // reshapes below only preserve the element count when both are exact multiples.
 inline struct ggml_tensor* patchify(struct ggml_context* ctx,
                                    struct ggml_tensor* x,
                                    int64_t patch_size,
                                    int64_t b = 1) {
    // x: [f, b*c, h*q, w*r]
    // return: [f, b*c*r*q, h, w]
    if (patch_size == 1) {
        return x;
    }
    int64_t r = patch_size;
    int64_t q = patch_size;
    int64_t W = x->ne[0];
    int64_t H = x->ne[1];
    int64_t C = x->ne[2];
    int64_t f = x->ne[3];
    int64_t w = W / r;
    int64_t h = H / q;
    // split H into (h, q), move q outwards, then do the same for W into (w, r)
    x = ggml_reshape_4d(ctx, x, W, q, h, C * f);                         // [W, q, h, C*f]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));  // [W, h, q, C*f]
    x = ggml_reshape_4d(ctx, x, r, w, h, q * C * f);                     // [r, w, h, q*C*f]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 1, 2, 0, 3));  // [w, h, r, q*C*f]
    x = ggml_reshape_4d(ctx, x, w, h, r * q * C, f);                     // [f, b*c*r*q, h, w]
    return x;
 }
 // Inverse of patchify(): spreads the r*q patch entries stored in ne[2] back over
 // the two spatial axes.
 //
 // ctx:        ggml context the new graph nodes are created in
 // x:          input tensor, layout documented below
 // patch_size: edge length of the square patch; 1 returns x untouched
 // b:          divisor applied to ne[2] together with r and q to recover c (default 1)
 //
 // Declared `inline` because this is a function definition living in a header:
 // without it, including the header from a second translation unit would produce
 // a multiple-definition (ODR) error at link time.
 //
 // NOTE(review): c is computed with integer division; ne[2] is expected to be an
 // exact multiple of b * patch_size * patch_size.
 inline struct ggml_tensor* unpatchify(struct ggml_context* ctx,
                                      struct ggml_tensor* x,
                                      int64_t patch_size,
                                      int64_t b = 1) {
    // x: [f, b*c*r*q, h, w]
    // return: [f, b*c, h*q, w*r]
    if (patch_size == 1) {
        return x;
    }
    int64_t r = patch_size;
    int64_t q = patch_size;
    int64_t c = x->ne[2] / b / q / r;
    int64_t f = x->ne[3];
    int64_t h = x->ne[1];
    int64_t w = x->ne[0];
    // peel r off ne[2] and merge it into the width, then do the same with q for the height
    x = ggml_reshape_4d(ctx, x, w, h, r, q * c * b * f);                 // [q*c*b*f, r, h, w]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 2, 0, 1, 3));  // [r, w, h, q*c*b*f]
    x = ggml_reshape_4d(ctx, x, r * w, h, q, c * b * f);                 // [c*b*f, q, h, r*w]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));  // [r*w, q, h, c*b*f]
    x = ggml_reshape_4d(ctx, x, r * w, q * h, c * b, f);                 // [f, b*c, h*q, w*r]
    return x;
 }
 // Encoder of the tiny video autoencoder.
 // Layer stack: first 3x3 conv + ReLU, then num_layers stages of
 // (TPool, stride-2 3x3 conv, num_blocks x MemBlock), then a final 3x3 conv to z_channels.
 // Sub-blocks are keyed by their decimal position in that stack; positions that hold
 // no Conv2d/TPool/MemBlock (the ReLU) are skipped, presumably so the keys line up with
 // the layer numbering of the checkpoint - TODO confirm.
 class TinyVideoEncoder : public UnaryBlock {
    int in_channels = 3;
    int hidden      = 64;
    int z_channels  = 4;
    int num_blocks  = 3;
    int num_layers  = 3;
    int patch_size  = 1;
 public:
    // z_channels: output channel count of the last conv
    // patch_size: spatial patch edge; the first conv takes in_channels * patch_size^2 inputs
    TinyVideoEncoder(int z_channels = 4, int patch_size = 1)
        : z_channels(z_channels), patch_size(patch_size) {
        int index                       = 0;
        blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels * patch_size * patch_size, hidden, {3, 3}, {1, 1}, {1, 1}));
        index++;  // nn.ReLU()
        for (int i = 0; i < num_layers; i++) {
            // only the last stage uses TPool stride 1; the earlier stages use stride 2
            int stride                      = i == num_layers - 1 ? 1 : 2;
            blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TPool(hidden, stride));
            blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(hidden, hidden, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false));
            for (int j = 0; j < num_blocks; j++) {
                blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new MemBlock(hidden, hidden));
            }
        }
        blocks[std::to_string(index)] = std::shared_ptr<GGMLBlock>(new Conv2d(hidden, z_channels, {3, 3}, {1, 1}, {1, 1}));
    }
    // Builds the encoder graph for z and returns the output of the last conv.
    // The index walk below must mirror the constructor exactly.
    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* z) override {
        auto first_conv = std::dynamic_pointer_cast<Conv2d>(blocks["0"]);
        if (patch_size > 1) {
            z = patchify(ctx->ggml_ctx, z, patch_size, 1);
        }
        auto h = first_conv->forward(ctx, z);
        h      = ggml_relu_inplace(ctx->ggml_ctx, h);
        // key "1" is the ReLU slot skipped in the constructor, so the first stage starts at 2
        int index = 2;
        for (int i = 0; i < num_layers; i++) {
            auto pool = std::dynamic_pointer_cast<UnaryBlock>(blocks[std::to_string(index++)]);
            auto conv = std::dynamic_pointer_cast<UnaryBlock>(blocks[std::to_string(index++)]);
            h = pool->forward(ctx, h);
            h = conv->forward(ctx, h);
            for (int j = 0; j < num_blocks; j++) {
                auto block = std::dynamic_pointer_cast<MemBlock>(blocks[std::to_string(index++)]);
                // mem = h shifted by one slice along ne[3]: one slice is padded in (the 7th pad
                // argument, presumably the leading side of ne[3] - confirm against ggml_pad_ext),
                // then the view keeps only the first h->ne[3] slices of the padded tensor
                auto mem   = ggml_pad_ext(ctx->ggml_ctx, h, 0, 0, 0, 0, 0, 0, 1, 0);
                mem        = ggml_view_4d(ctx->ggml_ctx, mem, h->ne[0], h->ne[1], h->ne[2], h->ne[3], h->nb[1], h->nb[2], h->nb[3], 0);
                h          = block->forward(ctx, h, mem);
            }
        }
        auto last_conv = std::dynamic_pointer_cast<Conv2d>(blocks[std::to_string(index)]);
        h              = last_conv->forward(ctx, h);
        return h;
    }
 };
 // Decoder of the tiny video autoencoder.
 // Layer stack: clamp, first 3x3 conv + ReLU, then num_layers stages of
 // (num_blocks x MemBlock, 2x nearest upsample, TGrow, 3x3 conv), then ReLU and a final 3x3 conv.
 // Sub-blocks are keyed by their decimal position in that stack; positions without
 // a registered block (Clamp, ReLU, Upsample) are skipped, presumably so the keys line up
 // with the layer numbering of the checkpoint - TODO confirm.
 class TinyVideoDecoder : public UnaryBlock {
    int z_channels               = 4;
    int out_channels             = 3;
    int num_blocks               = 3;
    static const int num_layers  = 3;
    // channels[i] is the width of stage i; channels[num_layers] feeds the last conv
    int channels[num_layers + 1] = {256, 128, 64, 64};
    int patch_size               = 1;
 public:
    // z_channels: input channel count of the first conv
    // patch_size: spatial patch edge; the last conv emits out_channels * patch_size^2 channels
    TinyVideoDecoder(int z_channels = 4, int patch_size = 1)
        : z_channels(z_channels), patch_size(patch_size) {
        int index                       = 1;  // Clamp()
        blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(z_channels, channels[0], {3, 3}, {1, 1}, {1, 1}));
        index++;  // nn.ReLU()
        for (int i = 0; i < num_layers; i++) {
            // only the first stage uses TGrow stride 1; the later stages use stride 2
            int stride = i == 0 ? 1 : 2;
            for (int j = 0; j < num_blocks; j++) {
                blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new MemBlock(channels[i], channels[i]));
            }
            index++;  // nn.Upsample()
            blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TGrow(channels[i], stride));
            blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels[i], channels[i + 1], {3, 3}, {1, 1}, {1, 1}, {1, 1}, false));
        }
        index++;  // nn.ReLU()
        blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels[num_layers], out_channels * patch_size * patch_size, {3, 3}, {1, 1}, {1, 1}));
    }
    // Builds the decoder graph for z and returns the decoded tensor with the first
    // three ne[3] slices removed. The index walk below must mirror the constructor exactly.
    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* z) override {
        auto first_conv = std::dynamic_pointer_cast<Conv2d>(blocks["1"]);
        // Clamp()
        // soft clamp: tanh(z / 3) * 3; the non-inplace ggml_scale leaves z itself untouched
        auto h = ggml_scale_inplace(ctx->ggml_ctx,
                                    ggml_tanh_inplace(ctx->ggml_ctx,
                                                      ggml_scale(ctx->ggml_ctx, z, 1.0f / 3.0f)),
                                    3.0f);
        h         = first_conv->forward(ctx, h);
        h         = ggml_relu_inplace(ctx->ggml_ctx, h);
        // keys 0 (Clamp) and 2 (ReLU) are unregistered slots, so the first MemBlock is at 3
        int index = 3;
        for (int i = 0; i < num_layers; i++) {
            for (int j = 0; j < num_blocks; j++) {
                auto block = std::dynamic_pointer_cast<MemBlock>(blocks[std::to_string(index++)]);
                // mem = h shifted by one slice along ne[3]: one slice is padded in (the 7th pad
                // argument, presumably the leading side of ne[3] - confirm against ggml_pad_ext),
                // then the view keeps only the first h->ne[3] slices of the padded tensor
                auto mem   = ggml_pad_ext(ctx->ggml_ctx, h, 0, 0, 0, 0, 0, 0, 1, 0);
                mem        = ggml_view_4d(ctx->ggml_ctx, mem, h->ne[0], h->ne[1], h->ne[2], h->ne[3], h->nb[1], h->nb[2], h->nb[3], 0);
                h          = block->forward(ctx, h, mem);
            }
            // upsample
            index++;
            h          = ggml_upscale(ctx->ggml_ctx, h, 2, GGML_SCALE_MODE_NEAREST);
            // TGrow first, then the 3x3 conv that changes channels[i] -> channels[i + 1]
            auto block = std::dynamic_pointer_cast<UnaryBlock>(blocks[std::to_string(index++)]);
            h          = block->forward(ctx, h);
            block      = std::dynamic_pointer_cast<UnaryBlock>(blocks[std::to_string(index++)]);
            h          = block->forward(ctx, h);
        }
        h = ggml_relu_inplace(ctx->ggml_ctx, h);
        // pre-increment skips the ReLU slot, matching the `index++;  // nn.ReLU()` in the constructor
        auto last_conv = std::dynamic_pointer_cast<Conv2d>(blocks[std::to_string(++index)]);
        h              = last_conv->forward(ctx, h);
        if (patch_size > 1) {
            h = unpatchify(ctx->ggml_ctx, h, patch_size, 1);
        }
        // shape(W, H, 3, 3 + T) => shape(W, H, 3, T)
        // the view starts 3 * nb[3] bytes in, i.e. it drops the first three ne[3] slices
        h = ggml_view_4d(ctx->ggml_ctx, h, h->ne[0], h->ne[1], h->ne[2], h->ne[3] - 3, h->nb[1], h->nb[2], h->nb[3], 3 * h->nb[3]);
        return h;
    }
 };
 // Tiny video autoencoder wrapper: always owns a "decoder" block and, unless built
 // decode-only, an "encoder" block. Latent width and patch size depend on the model version.
 class TAEHV : public GGMLBlock {
 protected:
    bool decode_only;
    SDVersion version;
 public:
    TAEHV(bool decode_only = true, SDVersion version = VERSION_WAN2)
        : decode_only(decode_only), version(version) {
        // VERSION_WAN2_2_TI2V uses 48 latent channels with 2x2 patches, everything else 16 / no patching
        const bool is_ti2v   = (version == VERSION_WAN2_2_TI2V);
        const int z_channels = is_ti2v ? 48 : 16;
        const int patch      = is_ti2v ? 2 : 1;
        std::shared_ptr<GGMLBlock> decoder_block(new TinyVideoDecoder(z_channels, patch));
        blocks["decoder"] = decoder_block;
        if (!decode_only) {
            std::shared_ptr<GGMLBlock> encoder_block(new TinyVideoEncoder(z_channels, patch));
            blocks["encoder"] = encoder_block;
        }
    }
    // Runs the decoder on z. For wan versions the ne[2] and ne[3] axes are swapped
    // before the decoder and swapped back on its output.
    struct ggml_tensor* decode(GGMLRunnerContext* ctx, struct ggml_tensor* z) {
        auto decoder        = std::dynamic_pointer_cast<TinyVideoDecoder>(blocks["decoder"]);
        const bool swap_2_3 = sd_version_is_wan(version);
        struct ggml_tensor* decoder_in = swap_2_3 ? swap_axes_2_3(ctx, z) : z;
        auto result                    = decoder->forward(ctx, decoder_in);
        return swap_2_3 ? swap_axes_2_3(ctx, result) : result;
    }
    // Runs the encoder on x: swaps ne[2]/ne[3], pads ne[3] up to a multiple of 4 by
    // repeating the last slice, encodes, and swaps the axes back.
    struct ggml_tensor* encode(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        auto encoder = std::dynamic_pointer_cast<TinyVideoEncoder>(blocks["encoder"]);
        x            = swap_axes_2_3(ctx, x);
        const int64_t num_frames = x->ne[3];
        const int64_t leftover   = num_frames % 4;
        if (leftover != 0) {
            // view of the final ne[3] slice, appended once per missing frame
            auto last_frame = ggml_view_4d(ctx->ggml_ctx, x, x->ne[0], x->ne[1], x->ne[2], 1, x->nb[1], x->nb[2], x->nb[3], (num_frames - 1) * x->nb[3]);
            for (int64_t missing = 4 - leftover; missing > 0; --missing) {
                x = ggml_concat(ctx->ggml_ctx, x, last_frame, 3);
            }
        }
        return swap_axes_2_3(ctx, encoder->forward(ctx, x));
    }
 private:
    // Contiguous copy of t with its ne[2] and ne[3] axes exchanged.
    static struct ggml_tensor* swap_axes_2_3(GGMLRunnerContext* ctx, struct ggml_tensor* t) {
        return ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, t, 0, 1, 3, 2));
    }
 };
 class TAESD : public GGMLBlock {
 protected:
    bool decode_only;
@ -192,18 +497,30 @@ public:
 };
 // Abstract base shared by the tiny auto-encoder runners in this file.
 // Concrete runners provide weight loading and a compute entry point.
 struct TinyAutoEncoder : public GGMLRunner {
     // Forwards the backend and the CPU-offload flag to GGMLRunner unchanged.
     TinyAutoEncoder(ggml_backend_t backend, bool offload_params_to_cpu)
         : GGMLRunner(backend, offload_params_to_cpu) {}
     // Run the auto-encoder on `z` and hand the result back through `output`.
     // `decode_graph` selects the decode graph when true and the encode graph
     // when false. Returns true on success.
     virtual bool compute(const int n_threads,
                          struct ggml_tensor* z,
                          bool decode_graph,
                          struct ggml_tensor** output,
                          struct ggml_context* output_ctx = nullptr) = 0;
     // Load the model weights from `file_path`. Returns true on success.
     virtual bool load_from_file(const std::string& file_path, int n_threads) = 0;
 };
 struct TinyImageAutoEncoder : public TinyAutoEncoder {
    TAESD taesd;
    bool decode_only = false;
-    TinyAutoEncoder(ggml_backend_t backend,
+    TinyImageAutoEncoder(ggml_backend_t backend,
-                    bool offload_params_to_cpu,
+                         bool offload_params_to_cpu,
-                    const String2TensorStorage& tensor_storage_map,
+                         const String2TensorStorage& tensor_storage_map,
-                    const std::string prefix,
+                         const std::string prefix,
-                    bool decoder_only = true,
+                         bool decoder_only = true,
-                    SDVersion version = VERSION_SD1)
+                         SDVersion version = VERSION_SD1)
        : decode_only(decoder_only),
          taesd(decoder_only, version),
-          GGMLRunner(backend, offload_params_to_cpu) {
+          TinyAutoEncoder(backend, offload_params_to_cpu) {
        taesd.init(params_ctx, tensor_storage_map, prefix);
    }
@ -260,4 +577,73 @@ struct TinyAutoEncoder : public GGMLRunner {
    }
 };
 struct TinyVideoAutoEncoder : public TinyAutoEncoder {
    TAEHV taehv;
    bool decode_only = false;
    TinyVideoAutoEncoder(ggml_backend_t backend,
                         bool offload_params_to_cpu,
                         const String2TensorStorage& tensor_storage_map,
                         const std::string prefix,
                         bool decoder_only = true,
                         SDVersion version = VERSION_WAN2)
        : decode_only(decoder_only),
          taehv(decoder_only, version),
          TinyAutoEncoder(backend, offload_params_to_cpu) {
        taehv.init(params_ctx, tensor_storage_map, prefix);
    }
    std::string get_desc() override {
        return "taehv";
    }
    bool load_from_file(const std::string& file_path, int n_threads) {
        LOG_INFO("loading taehv from '%s', decode_only = %s", file_path.c_str(), decode_only ? "true" : "false");
        alloc_params_buffer();
        std::map<std::string, ggml_tensor*> taehv_tensors;
        taehv.get_param_tensors(taehv_tensors);
        std::set<std::string> ignore_tensors;
        if (decode_only) {
            ignore_tensors.insert("encoder.");
        }
        ModelLoader model_loader;
        if (!model_loader.init_from_file(file_path)) {
            LOG_ERROR("init taehv model loader from file failed: '%s'", file_path.c_str());
            return false;
        }
        bool success = model_loader.load_tensors(taehv_tensors, ignore_tensors, n_threads);
        if (!success) {
            LOG_ERROR("load tae tensors from model loader failed");
            return false;
        }
        LOG_INFO("taehv model loaded");
        return success;
    }
    struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) {
        struct ggml_cgraph* gf  = ggml_new_graph(compute_ctx);
        z                       = to_backend(z);
        auto runner_ctx         = get_context();
        struct ggml_tensor* out = decode_graph ? taehv.decode(&runner_ctx, z) : taehv.encode(&runner_ctx, z);
        ggml_build_forward_expand(gf, out);
        return gf;
    }
    bool compute(const int n_threads,
                 struct ggml_tensor* z,
                 bool decode_graph,
                 struct ggml_tensor** output,
                 struct ggml_context* output_ctx = nullptr) {
        auto get_graph = [&]() -> struct ggml_cgraph* {
            return build_graph(z, decode_graph);
        };
        return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
    }
 };
 #endif  // __TAE_HPP__
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@ -1,3 +1,10 @@
 - json.hpp library from: https://github.com/nlohmann/json
    - LICENSE: https://github.com/nlohmann/json/blob/develop/LICENSE.MIT
 - ZIP Library from: https://github.com/kuba--/zip
- darts.h from: https://github.com/google/sentencepiece/tree/master/third_party/darts_clone
+    - LICENSE: https://github.com/kuba--/zip/blob/master/LICENSE.txt
 - darts.h from: https://github.com/google/sentencepiece/tree/master/third_party/darts_clone
    - LICENSE: https://github.com/google/sentencepiece/blob/master/third_party/darts_clone/LICENSE
 - httplib.h from: https://github.com/yhirose/cpp-httplib/blob/master/httplib.h
    - LICENSE: https://github.com/yhirose/cpp-httplib/blob/master/LICENSE
 - stb_image.h/stb_image_resize.h/stb_image_write.h from: https://github.com/nothings/stb
    - LICENSE: https://github.com/nothings/stb/blob/master/LICENSE
--- a/thirdparty/httplib.h
+++ b/thirdparty/httplib.h
--- a/ucache.hpp
+++ b/ucache.hpp
@ -0,0 +1,404 @@
 #ifndef __UCACHE_HPP__
 #define __UCACHE_HPP__
 #include <cmath>
 #include <limits>
 #include <unordered_map>
 #include <vector>
 #include "denoiser.hpp"
 #include "ggml_extend.hpp"
 // Tunables for the UCache step-reuse heuristic implemented by UCacheState.
 struct UCacheConfig {
     // Master switch; UCacheState::init leaves the cache inactive when false.
     bool enabled                = false;
     // Base value the accumulated error estimate is compared against.
     float reuse_threshold       = 1.0f;
     // Start / end of the window in which reuse is allowed, as fractions of the
     // schedule (converted to sigmas by init / set_sigmas).
     float start_percent         = 0.15f;
     float end_percent           = 0.95f;
     // Factor applied to the previously accumulated error before the new
     // estimate is added.
     float error_decay_rate      = 1.0f;
     // When true, the threshold is multiplied by the reference output norm.
     bool use_relative_threshold = true;
     // When true, the threshold is scaled by the early/late multipliers below.
     bool adaptive_threshold     = true;
     // Multiplier used while progress is below 0.2.
     float early_step_multiplier = 0.5f;
     // Multiplier used once progress exceeds 0.8.
     float late_step_multiplier  = 1.5f;
     // When true, the accumulated error is zeroed whenever a step is computed
     // because the threshold was reached.
     bool reset_error_on_compute = true;
 };
 // Cached per-condition residual: element-wise (output - input) of the last
 // computed step, stored as a flat float buffer.
 struct UCacheCacheEntry {
     std::vector<float> diff;
 };
 struct UCacheState {
    UCacheConfig config;
    Denoiser* denoiser                  = nullptr;
    float start_sigma                   = std::numeric_limits<float>::max();
    float end_sigma                     = 0.0f;
    bool initialized                    = false;
    bool initial_step                   = true;
    bool skip_current_step              = false;
    bool step_active                    = false;
    const SDCondition* anchor_condition = nullptr;
    std::unordered_map<const SDCondition*, UCacheCacheEntry> cache_diffs;
    std::vector<float> prev_input;
    std::vector<float> prev_output;
    float output_prev_norm                = 0.0f;
    bool has_prev_input                   = false;
    bool has_prev_output                  = false;
    bool has_output_prev_norm             = false;
    bool has_relative_transformation_rate = false;
    float relative_transformation_rate    = 0.0f;
    float cumulative_change_rate          = 0.0f;
    float last_input_change               = 0.0f;
    bool has_last_input_change            = false;
    int total_steps_skipped               = 0;
    int current_step_index                = -1;
    int steps_computed_since_active       = 0;
    float accumulated_error               = 0.0f;
    float reference_output_norm           = 0.0f;
    struct BlockMetrics {
        float sum_transformation_rate = 0.0f;
        float sum_output_norm         = 0.0f;
        int sample_count              = 0;
        float min_change_rate         = std::numeric_limits<float>::max();
        float max_change_rate         = 0.0f;
        void reset() {
            sum_transformation_rate = 0.0f;
            sum_output_norm         = 0.0f;
            sample_count            = 0;
            min_change_rate         = std::numeric_limits<float>::max();
            max_change_rate         = 0.0f;
        }
        void record(float change_rate, float output_norm) {
            if (std::isfinite(change_rate) && change_rate > 0.0f) {
                sum_transformation_rate += change_rate;
                sum_output_norm += output_norm;
                sample_count++;
                if (change_rate < min_change_rate)
                    min_change_rate = change_rate;
                if (change_rate > max_change_rate)
                    max_change_rate = change_rate;
            }
        }
        float avg_transformation_rate() const {
            return (sample_count > 0) ? (sum_transformation_rate / sample_count) : 0.0f;
        }
        float avg_output_norm() const {
            return (sample_count > 0) ? (sum_output_norm / sample_count) : 0.0f;
        }
    };
    BlockMetrics block_metrics;
    int total_active_steps = 0;
    void reset_runtime() {
        initial_step      = true;
        skip_current_step = false;
        step_active       = false;
        anchor_condition  = nullptr;
        cache_diffs.clear();
        prev_input.clear();
        prev_output.clear();
        output_prev_norm                 = 0.0f;
        has_prev_input                   = false;
        has_prev_output                  = false;
        has_output_prev_norm             = false;
        has_relative_transformation_rate = false;
        relative_transformation_rate     = 0.0f;
        cumulative_change_rate           = 0.0f;
        last_input_change                = 0.0f;
        has_last_input_change            = false;
        total_steps_skipped              = 0;
        current_step_index               = -1;
        steps_computed_since_active      = 0;
        accumulated_error                = 0.0f;
        reference_output_norm            = 0.0f;
        block_metrics.reset();
        total_active_steps = 0;
    }
    void init(const UCacheConfig& cfg, Denoiser* d) {
        config      = cfg;
        denoiser    = d;
        initialized = cfg.enabled && d != nullptr;
        reset_runtime();
        if (initialized) {
            start_sigma = percent_to_sigma(config.start_percent);
            end_sigma   = percent_to_sigma(config.end_percent);
        }
    }
    void set_sigmas(const std::vector<float>& sigmas) {
        if (!initialized || sigmas.size() < 2) {
            return;
        }
        size_t n_steps = sigmas.size() - 1;
        size_t start_step = static_cast<size_t>(config.start_percent * n_steps);
        size_t end_step   = static_cast<size_t>(config.end_percent * n_steps);
        if (start_step >= n_steps)
            start_step = n_steps - 1;
        if (end_step >= n_steps)
            end_step = n_steps - 1;
        start_sigma = sigmas[start_step];
        end_sigma   = sigmas[end_step];
        if (start_sigma < end_sigma) {
            std::swap(start_sigma, end_sigma);
        }
    }
     // True when init() succeeded and the configuration has the cache switched on.
     bool enabled() const {
         return initialized && config.enabled;
     }
    float percent_to_sigma(float percent) const {
        if (!denoiser) {
            return 0.0f;
        }
        if (percent <= 0.0f) {
            return std::numeric_limits<float>::max();
        }
        if (percent >= 1.0f) {
            return 0.0f;
        }
        float t = (1.0f - percent) * (TIMESTEPS - 1);
        return denoiser->t_to_sigma(t);
    }
    void begin_step(int step_index, float sigma) {
        if (!enabled()) {
            return;
        }
        if (step_index == current_step_index) {
            return;
        }
        current_step_index    = step_index;
        skip_current_step     = false;
        has_last_input_change = false;
        step_active           = false;
        if (sigma > start_sigma) {
            return;
        }
        if (!(sigma > end_sigma)) {
            return;
        }
        step_active = true;
        total_active_steps++;
    }
     // True when the cache is enabled and the current step lies inside the
     // active sigma window.
     bool step_is_active() const {
         return enabled() && step_active;
     }
     // True when the current active step was marked to be served from the cache.
     bool is_step_skipped() const {
         return enabled() && step_active && skip_current_step;
     }
    float get_adaptive_threshold(int estimated_total_steps = 0) const {
        float base_threshold = config.reuse_threshold;
        if (!config.adaptive_threshold) {
            return base_threshold;
        }
        int effective_total = estimated_total_steps;
        if (effective_total <= 0) {
            effective_total = std::max(20, steps_computed_since_active * 2);
        }
        float progress = (effective_total > 0) ? (static_cast<float>(steps_computed_since_active) / effective_total) : 0.0f;
        float multiplier = 1.0f;
        if (progress < 0.2f) {
            multiplier = config.early_step_multiplier;
        } else if (progress > 0.8f) {
            multiplier = config.late_step_multiplier;
        }
        return base_threshold * multiplier;
    }
     // True when a non-empty residual is stored for `cond`.
     bool has_cache(const SDCondition* cond) const {
         auto it = cache_diffs.find(cond);
         return it != cache_diffs.end() && !it->second.diff.empty();
     }
    void update_cache(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) {
        UCacheCacheEntry& entry = cache_diffs[cond];
        size_t ne               = static_cast<size_t>(ggml_nelements(output));
        entry.diff.resize(ne);
        float* out_data = (float*)output->data;
        float* in_data  = (float*)input->data;
        for (size_t i = 0; i < ne; ++i) {
            entry.diff[i] = out_data[i] - in_data[i];
        }
    }
    void apply_cache(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) {
        auto it = cache_diffs.find(cond);
        if (it == cache_diffs.end() || it->second.diff.empty()) {
            return;
        }
        copy_ggml_tensor(output, input);
        float* out_data                = (float*)output->data;
        const std::vector<float>& diff = it->second.diff;
        for (size_t i = 0; i < diff.size(); ++i) {
            out_data[i] += diff[i];
        }
    }
     // Decide, before evaluating `cond`, whether this step can be served from
     // the cache. Returns true when `output` has already been filled from the
     // cached residual (the caller can skip the computation), false when the
     // caller must compute the step.
     // The first condition seen inside the active window becomes the "anchor":
     // only the anchor makes the skip decision; other conditions follow it.
     bool before_condition(const SDCondition* cond,
                           ggml_tensor* input,
                           ggml_tensor* output,
                           float sigma,
                           int step_index) {
         if (!enabled() || step_index < 0) {
             return false;
         }
         // First condition of a new step: refresh the per-step flags.
         if (step_index != current_step_index) {
             begin_step(step_index, sigma);
         }
         if (!step_active) {
             return false;
         }
         if (initial_step) {
             anchor_condition = cond;
             initial_step     = false;
         }
         bool is_anchor = (cond == anchor_condition);
         // The anchor already chose to skip this step: reuse the cache for this
         // condition too, if it has one.
         if (skip_current_step) {
             if (has_cache(cond)) {
                 apply_cache(cond, input, output);
                 return true;
             }
             return false;
         }
         if (!is_anchor) {
             return false;
         }
         // A decision needs the previous input/output and a cached residual.
         if (!has_prev_input || !has_prev_output || !has_cache(cond)) {
             return false;
         }
         size_t ne = static_cast<size_t>(ggml_nelements(input));
         if (prev_input.size() != ne) {
             return false;
         }
         // Mean absolute change of the input since the last computed step.
         float* input_data = (float*)input->data;
         last_input_change = 0.0f;
         for (size_t i = 0; i < ne; ++i) {
             last_input_change += std::fabs(input_data[i] - prev_input[i]);
         }
         if (ne > 0) {
             last_input_change /= static_cast<float>(ne);
         }
         has_last_input_change = true;
         if (has_output_prev_norm && has_relative_transformation_rate &&
             last_input_change > 0.0f && output_prev_norm > 0.0f) {
             // Estimate the relative output change from the input change and the
             // last measured output/input change ratio, then accumulate it.
             float approx_output_change_rate = (relative_transformation_rate * last_input_change) / output_prev_norm;
             accumulated_error               = accumulated_error * config.error_decay_rate + approx_output_change_rate;
             float effective_threshold = get_adaptive_threshold();
             if (config.use_relative_threshold && reference_output_norm > 0.0f) {
                 effective_threshold = effective_threshold * reference_output_norm;
             }
             if (accumulated_error < effective_threshold) {
                 // Error still small enough: skip and reuse. The accumulated
                 // error is kept so consecutive skips add up.
                 skip_current_step = true;
                 total_steps_skipped++;
                 apply_cache(cond, input, output);
                 return true;
             } else if (config.reset_error_on_compute) {
                 accumulated_error = 0.0f;
             }
         }
         return false;
     }
    void after_condition(const SDCondition* cond, ggml_tensor* input, ggml_tensor* output) {
        if (!step_is_active()) {
            return;
        }
        update_cache(cond, input, output);
        if (cond != anchor_condition) {
            return;
        }
        size_t ne      = static_cast<size_t>(ggml_nelements(input));
        float* in_data = (float*)input->data;
        prev_input.resize(ne);
        for (size_t i = 0; i < ne; ++i) {
            prev_input[i] = in_data[i];
        }
        has_prev_input = true;
        float* out_data     = (float*)output->data;
        float output_change = 0.0f;
        if (has_prev_output && prev_output.size() == ne) {
            for (size_t i = 0; i < ne; ++i) {
                output_change += std::fabs(out_data[i] - prev_output[i]);
            }
            if (ne > 0) {
                output_change /= static_cast<float>(ne);
            }
        }
        prev_output.resize(ne);
        for (size_t i = 0; i < ne; ++i) {
            prev_output[i] = out_data[i];
        }
        has_prev_output = true;
        float mean_abs = 0.0f;
        for (size_t i = 0; i < ne; ++i) {
            mean_abs += std::fabs(out_data[i]);
        }
        output_prev_norm     = (ne > 0) ? (mean_abs / static_cast<float>(ne)) : 0.0f;
        has_output_prev_norm = output_prev_norm > 0.0f;
        if (reference_output_norm == 0.0f) {
            reference_output_norm = output_prev_norm;
        }
        if (has_last_input_change && last_input_change > 0.0f && output_change > 0.0f) {
            float rate = output_change / last_input_change;
            if (std::isfinite(rate)) {
                relative_transformation_rate     = rate;
                has_relative_transformation_rate = true;
                block_metrics.record(rate, output_prev_norm);
            }
        }
        has_last_input_change = false;
    }
    void log_block_metrics() const {
        if (block_metrics.sample_count > 0) {
            LOG_INFO("UCacheBlockMetrics: samples=%d, avg_rate=%.4f, min=%.4f, max=%.4f, avg_norm=%.4f",
                     block_metrics.sample_count,
                     block_metrics.avg_transformation_rate(),
                     block_metrics.min_change_rate,
                     block_metrics.max_change_rate,
                     block_metrics.avg_output_norm());
        }
    }
 };
 #endif  // __UCACHE_HPP__
--- a/util.cpp
+++ b/util.cpp
@ -95,9 +95,71 @@ bool is_directory(const std::string& path) {
    return (attributes != INVALID_FILE_ATTRIBUTES && (attributes & FILE_ATTRIBUTE_DIRECTORY));
 }
 // Windows implementation: owns the mapped view together with the file and
 // file-mapping handles that back it.
 class MmapWrapperImpl : public MmapWrapper {
 public:
     MmapWrapperImpl(void* data, size_t size, HANDLE hfile, HANDLE hmapping)
         : MmapWrapper(data, size), hfile_(hfile), hmapping_(hmapping) {}
     // Unmap the view first, then close the mapping and file handles.
     ~MmapWrapperImpl() override {
         UnmapViewOfFile(data_);
         CloseHandle(hmapping_);
         CloseHandle(hfile_);
     }
 private:
     HANDLE hfile_;
     HANDLE hmapping_;
 };
 std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
    void* mapped_data = nullptr;
    size_t file_size  = 0;
    HANDLE file_handle = CreateFileA(
        filename.c_str(),
        GENERIC_READ,
        FILE_SHARE_READ,
        NULL,
        OPEN_EXISTING,
        FILE_ATTRIBUTE_NORMAL,
        NULL);
    if (file_handle == INVALID_HANDLE_VALUE) {
        return nullptr;
    }
    LARGE_INTEGER size;
    if (!GetFileSizeEx(file_handle, &size)) {
        CloseHandle(file_handle);
        return nullptr;
    }
    file_size = static_cast<size_t>(size.QuadPart);
    HANDLE mapping_handle = CreateFileMapping(file_handle, NULL, PAGE_READONLY, 0, 0, NULL);
    if (mapping_handle == NULL) {
        CloseHandle(file_handle);
        return nullptr;
    }
    mapped_data = MapViewOfFile(mapping_handle, FILE_MAP_READ, 0, 0, file_size);
    if (mapped_data == NULL) {
        CloseHandle(mapping_handle);
        CloseHandle(file_handle);
        return nullptr;
    }
    return std::make_unique<MmapWrapperImpl>(mapped_data, file_size, file_handle, mapping_handle);
 }
 #else  // Unix
 #include <dirent.h>
 #include <fcntl.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <unistd.h>
 bool file_exists(const std::string& filename) {
    struct stat buffer;
@ -109,8 +171,64 @@ bool is_directory(const std::string& path) {
    return (stat(path.c_str(), &buffer) == 0 && S_ISDIR(buffer.st_mode));
 }
 // Unix implementation: owns a mapping created with mmap and releases it with
 // munmap on destruction.
 class MmapWrapperImpl : public MmapWrapper {
 public:
     MmapWrapperImpl(void* data, size_t size)
         : MmapWrapper(data, size) {}
     ~MmapWrapperImpl() override {
         munmap(data_, size_);
     }
 };
 std::unique_ptr<MmapWrapper> MmapWrapper::create(const std::string& filename) {
    int file_descriptor = open(filename.c_str(), O_RDONLY);
    if (file_descriptor == -1) {
        return nullptr;
    }
    int mmap_flags = MAP_PRIVATE;
 #ifdef __linux__
    // performance flags used by llama.cpp
    // posix_fadvise(file_descriptor, 0, 0, POSIX_FADV_SEQUENTIAL);
    // mmap_flags |= MAP_POPULATE;
 #endif
    struct stat sb;
    if (fstat(file_descriptor, &sb) == -1) {
        close(file_descriptor);
        return nullptr;
    }
    size_t file_size = sb.st_size;
    void* mapped_data = mmap(NULL, file_size, PROT_READ, mmap_flags, file_descriptor, 0);
    close(file_descriptor);
    if (mapped_data == MAP_FAILED) {
        return nullptr;
    }
 #ifdef __linux__
    // performance flags used by llama.cpp
    // posix_madvise(mapped_data, file_size, POSIX_MADV_WILLNEED);
 #endif
    return std::make_unique<MmapWrapperImpl>(mapped_data, file_size);
 }
 #endif
 // Copy `n` bytes starting at `offset` of the mapping into `buf`.
 // Returns false, copying nothing, unless `offset` is inside the mapping and
 // `n` bytes are available from there.
 bool MmapWrapper::copy_data(void* buf, size_t n, size_t offset) const {
     const bool in_range = (offset < size_) && (n <= size_ - offset);
     if (!in_range) {
         return false;
     }
     std::memcpy(buf, data() + offset, n);
     return true;
 }
 // get_num_physical_cores is copy from
 // https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp
 // LICENSE: https://github.com/ggerganov/llama.cpp/blob/master/LICENSE
--- a/util.h
+++ b/util.h
@ -2,6 +2,7 @@
 #define __UTIL_H__
 #include <cstdint>
 #include <memory>
 #include <string>
 #include <vector>
@ -43,6 +44,28 @@ sd_image_f32_t resize_sd_image_f32_t(sd_image_f32_t image, int target_width, int
 sd_image_f32_t clip_preprocess(sd_image_f32_t image, int target_width, int target_height);
 // Read-only view over a memory-mapped file.
 // Instances come from create(); the platform-specific subclass releases the
 // mapping in its destructor. The object can be neither copied nor moved.
 class MmapWrapper {
 public:
     // Map `filename`; the implementations in util.cpp return nullptr on failure.
     static std::unique_ptr<MmapWrapper> create(const std::string& filename);
     virtual ~MmapWrapper() = default;
     // non-copyable, non-movable
     MmapWrapper(const MmapWrapper&)            = delete;
     MmapWrapper& operator=(const MmapWrapper&) = delete;
     MmapWrapper(MmapWrapper&&)                 = delete;
     MmapWrapper& operator=(MmapWrapper&&)      = delete;
     // Start of the mapped bytes.
     const uint8_t* data() const {
         return static_cast<const uint8_t*>(data_);
     }
     // Length of the mapping in bytes.
     size_t size() const {
         return size_;
     }
     // Bounds-checked copy out of the mapping; returns false when the requested
     // range is not fully inside it.
     bool copy_data(void* buf, size_t n, size_t offset) const;
 protected:
     MmapWrapper(void* data, size_t size)
         : data_(data), size_(size) {}
     void* data_  = nullptr;
     size_t size_ = 0;
 };
 std::string path_join(const std::string& p1, const std::string& p2);
 std::vector<std::string> split_string(const std::string& str, char delimiter);
 void pretty_progress(int step, int steps, float time);
--- a/wan.hpp
+++ b/wan.hpp
@ -75,7 +75,7 @@ namespace WAN {
                lp2 -= (int)cache_x->ne[2];
            }
-            x = ggml_pad_ext(ctx->ggml_ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, 0, 0);
+            x = ggml_ext_pad_ext(ctx->ggml_ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
            return ggml_ext_conv_3d(ctx->ggml_ctx, x, w, b, in_channels,
                                    std::get<2>(stride), std::get<1>(stride), std::get<0>(stride),
                                    0, 0, 0,
@ -206,9 +206,9 @@ namespace WAN {
                } else if (mode == "upsample3d") {
                    x = ggml_upscale(ctx->ggml_ctx, x, 2, GGML_SCALE_MODE_NEAREST);
                } else if (mode == "downsample2d") {
-                    x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0);
+                    x = ggml_ext_pad(ctx->ggml_ctx, x, 1, 1, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
                } else if (mode == "downsample3d") {
-                    x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0);
+                    x = ggml_ext_pad(ctx->ggml_ctx, x, 1, 1, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
                }
                x = resample_1->forward(ctx, x);
                x = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, x, 0, 1, 3, 2));  // (c, t, h, w)
@ -1826,7 +1826,7 @@ namespace WAN {
            }
        }
-        struct ggml_tensor* pad_to_patch_size(struct ggml_context* ctx,
+        struct ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx,
                                              struct ggml_tensor* x) {
            int64_t W = x->ne[0];
            int64_t H = x->ne[1];
@ -1835,8 +1835,7 @@ namespace WAN {
            int pad_t = (std::get<0>(params.patch_size) - T % std::get<0>(params.patch_size)) % std::get<0>(params.patch_size);
            int pad_h = (std::get<1>(params.patch_size) - H % std::get<1>(params.patch_size)) % std::get<1>(params.patch_size);
            int pad_w = (std::get<2>(params.patch_size) - W % std::get<2>(params.patch_size)) % std::get<2>(params.patch_size);
-            x         = ggml_pad(ctx, x, pad_w, pad_h, pad_t, 0);  // [N*C, T + pad_t, H + pad_h, W + pad_w]
+            x = ggml_ext_pad(ctx->ggml_ctx, x, pad_w, pad_h, pad_t, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);  // keep the padded tensor: [N*C, T + pad_t, H + pad_h, W + pad_w]
            return x;
        }
@ -1986,14 +1985,14 @@ namespace WAN {
            int64_t T = x->ne[2];
            int64_t C = x->ne[3];
-            x = pad_to_patch_size(ctx->ggml_ctx, x);
+            x = pad_to_patch_size(ctx, x);
            int64_t t_len = ((T + (std::get<0>(params.patch_size) / 2)) / std::get<0>(params.patch_size));
            int64_t h_len = ((H + (std::get<1>(params.patch_size) / 2)) / std::get<1>(params.patch_size));
            int64_t w_len = ((W + (std::get<2>(params.patch_size) / 2)) / std::get<2>(params.patch_size));
            if (time_dim_concat != nullptr) {
-                time_dim_concat = pad_to_patch_size(ctx->ggml_ctx, time_dim_concat);
+                time_dim_concat = pad_to_patch_size(ctx, time_dim_concat);
                x               = ggml_concat(ctx->ggml_ctx, x, time_dim_concat, 2);  // [N*C, (T+pad_t) + (T2+pad_t2), H + pad_h, W + pad_w]
                t_len           = ((x->ne[2] + (std::get<0>(params.patch_size) / 2)) / std::get<0>(params.patch_size));
            }
--- a/z_image.hpp
+++ b/z_image.hpp
@ -324,14 +324,14 @@ namespace ZImage {
            blocks["final_layer"] = std::make_shared<FinalLayer>(z_image_params.hidden_size, z_image_params.patch_size, z_image_params.out_channels);
        }
-        struct ggml_tensor* pad_to_patch_size(struct ggml_context* ctx,
+        struct ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx,
                                              struct ggml_tensor* x) {
            int64_t W = x->ne[0];
            int64_t H = x->ne[1];
            int pad_h = (z_image_params.patch_size - H % z_image_params.patch_size) % z_image_params.patch_size;
            int pad_w = (z_image_params.patch_size - W % z_image_params.patch_size) % z_image_params.patch_size;
-            x         = ggml_pad(ctx, x, pad_w, pad_h, 0, 0);  // [N, C, H + pad_h, W + pad_w]
+            x         = ggml_ext_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
            return x;
        }
@ -357,10 +357,10 @@ namespace ZImage {
            return x;
        }
-        struct ggml_tensor* process_img(struct ggml_context* ctx,
+        struct ggml_tensor* process_img(GGMLRunnerContext* ctx,
                                        struct ggml_tensor* x) {
            x = pad_to_patch_size(ctx, x);
-            x = patchify(ctx, x);
+            x = patchify(ctx->ggml_ctx, x);
            return x;
        }
@ -473,12 +473,12 @@ namespace ZImage {
            int64_t C = x->ne[2];
            int64_t N = x->ne[3];
-            auto img             = process_img(ctx->ggml_ctx, x);
+            auto img             = process_img(ctx, x);
            uint64_t n_img_token = img->ne[1];
            if (ref_latents.size() > 0) {
                for (ggml_tensor* ref : ref_latents) {
-                    ref = process_img(ctx->ggml_ctx, ref);
+                    ref = process_img(ctx, ref);
                    img = ggml_concat(ctx->ggml_ctx, img, ref, 1);
                }
            }
@ -552,6 +552,8 @@ namespace ZImage {
                                               ref_latents,
                                               increase_ref_index,
                                               z_image_params.theta,
+                                               circular_y_enabled,
+                                               circular_x_enabled,
                                               z_image_params.axes_dim);
            int pos_len = pe_vec.size() / z_image_params.axes_dim_sum / 2;
            // LOG_DEBUG("pos_len %d", pos_len);
Author	SHA1	Message	Date
leejet	4ff2c8c74b	refactor: simplify logic for saving results (#1149 )	2025-12-28 23:27:27 +08:00
leejet	51bd9c8004	chore: reformat named cache params description into single line	2025-12-28 22:53:07 +08:00
Wagner Bruna	d0d836ae74	feat: support mmap for model loading (#1059 )	2025-12-28 22:38:29 +08:00
leejet	a2d83dd0c8	refactor: move pmid condition logic into get_pmid_condition (#1148 )	2025-12-27 16:48:15 +08:00
Wagner Bruna	cc107714d7	fix: consistently pass 2nd-order samplers half steps as negatives (#1095 )	2025-12-27 15:54:18 +08:00
leejet	37c9860b79	fix: handle redirected UTF-8 output correctly on Windows (#1147 )	2025-12-27 15:43:19 +08:00
leejet	ccb6b0ac9d	feat: add __index_timestep_zero__ support (#1146 )	2025-12-26 22:07:40 +08:00
Weiqi Gao	df4efe26bd	feat: add png sequence output for vid_gen (#1117 )	2025-12-26 22:06:13 +08:00
leejet	860a78e248	fix: avoid crash when using taesd for preview only (#1141 )	2025-12-24 23:30:12 +08:00
leejet	a0adcfb148	feat: add support for qwen image edit 2511 (#1096 )	2025-12-24 23:00:08 +08:00
leejet	3d5fdd7b37	feat: add support for more underline loras (#1135 )	2025-12-24 22:59:23 +08:00
Weiqi Gao	3e6c428c27	chore: use Ninja on Windows to speed up build process (#1120 )	2025-12-24 22:53:17 +08:00
张春乔	96fcb13fc0	feat: add --serve-html-path option to example server (#1123 )	2025-12-24 22:43:09 +08:00
leejet	3e812460cf	fix: correct ggml_pad_ext (#1133 )	2025-12-23 21:37:07 +08:00
leejet	98916e8256	docs: update README.md	2025-12-22 23:58:28 +08:00
rmatif	298b11069f	feat: add more caching methods (#1066 )	2025-12-22 23:52:11 +08:00
leejet	30a91138f8	fix: add the missing }	2025-12-21 21:53:38 +08:00
leejet	c6937ba44a	fix: correct the parsing of --convert-name opotion	2025-12-21 21:47:50 +08:00
leejet	ca5b1969a8	feat: do not convert tensor names by default in convert mode (#1122 )	2025-12-21 18:40:10 +08:00
Phylliida Dev	50ff966445	feat: add seamless texture generation support (#914 ) * global bool * reworked circular to global flag * cleaner implementation of tiling support in sd cpp * cleaned rope * working simplified but still need wraps * Further clean of rope * resolve flux conflict * switch to pad op circular only * Set ggml to most recent * Revert ggml temp * Update ggml to most recent * Revert unneded flux change * move circular flag to the GGMLRunnerContext * Pass through circular param in all places where conv is called * fix of constant and minor cleanup * Added back --circular option * Conv2d circular in vae and various models * Fix temporal padding for qwen image and other vaes * Z Image circular tiling * x and y axis seamless only * First attempt at chroma seamless x and y * refactor into pure x and y, almost there * Fix crash on chroma * Refactor into cleaner variable choices * Removed redundant set_circular_enabled * Sync ggml * simplify circular parameter * format code * no need to perform circular pad on the clip * simplify circular_axes setting * unify function naming * remove unnecessary member variables * simplify rope --------- Co-authored-by: Phylliida <phylliidadev@gmail.com> Co-authored-by: leejet <leejet714@gmail.com>	2025-12-21 18:06:47 +08:00
leejet	88ec9d30b1	feat: add scale_rope support (#1121 )	2025-12-21 15:40:21 +08:00
stduhpf	60abda56e0	feat: select vulkan device with env variable (#629 )	2025-12-21 15:35:38 +08:00
stduhpf	23fce0bd84	feat: add support for Chroma Radiance x0 (#1091 ) * Add x0 Flux pred (+prepare for others) * Fix convert models with empty tensors * patch_32 exp support attempt * improve support for patch_32 * follow official pipeline --------- Co-authored-by: leejet <leejet714@gmail.com>	2025-12-20 00:55:57 +08:00
Wagner Bruna	7c88c4765c	chore: give feedback about cfg values smaller than 1 (#1088 )	2025-12-19 23:41:52 +08:00
Weiqi Gao	1f77545cf8	docs: document usage of tae for VRAM reduction using wan (#1108 )	2025-12-19 23:31:09 +08:00
leejet	8e9f3a4d9e	feat: add support for underline style lora of flux (#1103 ) * feat: add support for underline style lora of flux * add support for underline style lora of t5 * add more protected tokens	2025-12-18 21:44:16 +08:00
Wagner Bruna	78e15bd4af	feat: default to LCM scheduler for LCM sampling (#1109 ) * feat: default to LCM scheduler for LCM sampling * fix bug and attempt to get default scheduler for vid_gen when none is set --------- Co-authored-by: leejet <leejet714@gmail.com>	2025-12-18 21:43:39 +08:00
Daniele	97cf2efe45	feat: add KL Optimal scheduler (#1098 )	2025-12-18 21:02:55 +08:00
leejet	bda7fab9f2	chore: remove unused debug code	2025-12-17 23:43:37 +08:00
leejet	c2e18c86e8	fix: make flash attn work with high noise diffusion model (#1111 )	2025-12-17 23:28:59 +08:00
leejet	c3ad6a13e1	refactor: optimize the printing of version log (#1102 )	2025-12-16 23:11:27 +08:00
leejet	ebe9d26a72	feat: supports correct UTF-8 printing on windows (#1101 )	2025-12-16 23:00:41 +08:00
stduhpf	9fa7f415df	feat: add taehv support for Wan/Qwen (#937 )	2025-12-16 22:57:34 +08:00
akleine	a23262dfde	fix: added a clean exit in ModelLoader::load_tensors if OOM (#1097 )	2025-12-16 22:45:10 +08:00
Wagner Bruna	e687913bf1	chore: remove lora_model_dir parameter (#1100 )	2025-12-16 22:37:45 +08:00
Wagner Bruna	200cb6f2ca	fix: avoid crash with VAE tiling and certain image sizes (#1090 )	2025-12-15 23:51:40 +08:00
leejet	43a70e819b	fix: add lora info to image metadata (#1086 )	2025-12-14 01:24:15 +08:00
Kirill A. Korinsky	614f8736df	sync: update ggml (#1082 )	2025-12-14 01:23:34 +08:00
stduhpf	d96b4152d6	perf: optimize ggml_ext_chunk (#1084 )	2025-12-14 01:22:41 +08:00
rmatif	8f05f5bc6e	feat: add support for custom scheduler (#694 ) --------- Co-authored-by: leejet <leejet714@gmail.com>	2025-12-13 16:20:02 +08:00
leejet	15d0f82760	feat(server): do not parse lora fromt client-side prompts (#1083 )	2025-12-13 14:27:47 +08:00
xxnuo	6888fcb581	feat: server add default_gen_params to override default args (#1050 )	2025-12-13 14:22:32 +08:00
leejet	2aecdd57ca	feat: simple openai image generation api compatiple server (#1037 )	2025-12-13 13:53:21 +08:00
		`@ -1 +1 @@`
			`Subproject commit 2d3876d554551d35c06dccc5852be50d5fd2a275`				`Subproject commit 3e9f2ba3b934c20b26873b3c60dbf41b116978ff`