fix: avoid writable mmap for read-only weights (#1698 )

feat: support guidance_schedule (#1684 )
refactor: add Flux VAE version helper (#1696 )
2026-06-23 22:56:42 +00:00 · 2026-06-23 00:39:31 +08:00 · 2026-06-23 00:05:55 +08:00 · 2026-06-22 22:39:42 +08:00 · 2026-06-22 22:16:54 +08:00 · 2026-06-22 22:10:09 +08:00
23 changed files with 1341 additions and 41 deletions
--- a/README.md
+++ b/README.md
@ -50,12 +50,14 @@ API and command-line option may change frequently.***
    - [Ovis-Image](./docs/ovis_image.md)
    - [Anima](./docs/anima.md)
    - [ERNIE-Image](./docs/ernie_image.md)
    - [Boogu Image](./docs/boogu_image.md)
    - [HiDream-O1-Image](./docs/hidream_o1_image.md)
    - [Ideogram4](./docs/ideogram4.md)
  - Image Edit Models
    - [FLUX.1-Kontext-dev](./docs/kontext.md)
    - [Qwen Image Edit series](./docs/qwen_image_edit.md)
    - [LongCat Image Edit](./docs/longcat_image.md)
    - [Boogu Image Edit](./docs/boogu_image.md)
  - Video Models
    - [Wan2.1/Wan2.2](./docs/wan.md)
    - [LTX-2.3](./docs/ltx2.md)
--- a/assets/boogu/edit_example.png
+++ b/assets/boogu/edit_example.png
--- a/assets/boogu/example.png
+++ b/assets/boogu/example.png
--- a/docs/boogu_image.md
+++ b/docs/boogu_image.md
@ -0,0 +1,31 @@
 # How to Use
 Boogu Image uses a Boogu diffusion transformer, the FLUX VAE, and Qwen3-VL as the LLM text and vision encoder.
 ## Download weights
 - Download Boogu Image
    - safetensors: https://huggingface.co/Comfy-Org/Boogu-Image/tree/main/diffusion_models
 - Download the FLUX VAE
    - safetensors: https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
 - Download Qwen3-VL 8B
    - gguf: https://huggingface.co/unsloth/Qwen3-VL-8B-Instruct-GGUF/tree/main
        - For image editing with GGUF text encoders, also download the matching mmproj file and pass it with `--llm_vision`.
 ## Examples
 ### Boogu Image Base
 ```
 .\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\boogu_image_base_bf16.safetensors --llm ..\..\llm\Qwen3VL-8B-Instruct-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\ae.sft -p "a lovely cat" --diffusion-fa -v --offload-to-cpu
 ```
 <img width="256" alt="Boogu Image Base example" src="../assets/boogu/example.png" />
 ### Boogu Image Edit
 ```
 .\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\boogu_image_edit_bf16.safetensors --llm ..\..\llm\Qwen3VL-8B-Instruct-Q4_K_M.gguf --llm_vision ..\..\llm\mmproj-Qwen3VL-8B-Instruct-F16.gguf --vae ..\..\ComfyUI\models\vae\ae.sft --diffusion-fa -v --offload-to-cpu -r ..\assets\flux\flux1-dev-q8_0.png -p "change 'flux.cpp' to 'boogu.cpp'"
 ```
 <img width="256" alt="Boogu Image Edit example" src="../assets/boogu/edit_example.png" />
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@ -62,18 +62,22 @@ struct SDCliParams {
            {"-o",
             "--output",
             "path to write result image to. you can use printf-style %d format specifiers for image sequences (default: ./output.png) (eg. output_%03d.png). Single-file video outputs support .avi, .webm, and animated .webp",
             0,
             &output_path},
            {"",
             "--image",
             "path to the image to inspect (for metadata mode)",
             0,
             &image_path},
            {"",
             "--metadata-format",
             "metadata output format, one of [text, json] (default: text)",
             0,
             &metadata_format},
            {"",
             "--preview-path",
             "path to write preview image to (default: ./preview.png). Multi-frame previews support .avi, .webm, and animated .webp",
             0,
             &preview_path},
        };
--- a/examples/common/common.cpp
+++ b/examples/common/common.cpp
@ -6,6 +6,7 @@
 #include <cstdlib>
 #include <ctime>
 #include <filesystem>
 #include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <regex>
@ -260,8 +261,15 @@ bool parse_options(int argc, const char** argv, const std::vector<ArgOptions>& o
                        invalid_arg = true;
                        return;
                    }
-                    *option.target = argv_to_utf8(i, argv);
+                    if (option.concat && !option.target->empty()) {
-                    found_arg      = true;
+                        if (option.concat > 0 && option.concat <= 0xff) {
                            *option.target += static_cast<char>(option.concat);
                        }
                        *option.target += argv_to_utf8(i, argv);
                    } else {
                        *option.target = argv_to_utf8(i, argv);
                    }
                    found_arg = true;
                }))
                break;
@ -324,120 +332,151 @@ ArgOptions SDContextParams::get_options() {
        {"-m",
         "--model",
         "path to full model",
         0,
         &model_path},
        {"",
         "--clip_l",
-         "path to the clip-l text encoder", &clip_l_path},
+         "path to the clip-l text encoder",
         0,
         &clip_l_path},
        {"", "--clip_g",
         "path to the clip-g text encoder",
         0,
         &clip_g_path},
        {"",
         "--clip_vision",
         "path to the clip-vision encoder",
         0,
         &clip_vision_path},
        {"",
         "--t5xxl",
         "path to the t5xxl text encoder",
         0,
         &t5xxl_path},
        {"",
         "--llm",
         "path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)",
         0,
         &llm_path},
        {"",
         "--llm_vision",
         "path to the llm vit",
         0,
         &llm_vision_path},
        {"",
         "--qwen2vl",
         "alias of --llm. Deprecated.",
         0,
         &llm_path},
        {"",
         "--qwen2vl_vision",
         "alias of --llm_vision. Deprecated.",
         0,
         &llm_vision_path},
        {"",
         "--diffusion-model",
         "path to the standalone diffusion model",
         0,
         &diffusion_model_path},
        {"",
         "--high-noise-diffusion-model",
         "path to the standalone high noise diffusion model",
         0,
         &high_noise_diffusion_model_path},
        {"",
         "--uncond-diffusion-model",
         "path to the standalone unconditional diffusion model, currently used by Ideogram4 CFG",
         0,
         &uncond_diffusion_model_path},
        {"",
         "--embeddings-connectors",
         "path to LTXAV embeddings connectors",
         0,
         &embeddings_connectors_path},
        {"",
         "--vae",
         "path to standalone vae model",
         0,
         &vae_path},
        {"",
         "--vae-format",
         "VAE latent format override: auto, flux, sd3, or flux2 (default: auto)",
         0,
         &vae_format},
        {"",
         "--audio-vae",
         "path to standalone LTX audio vae model",
         0,
         &audio_vae_path},
        {"",
         "--taesd",
         "path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)",
         0,
         &taesd_path},
        {"",
         "--tae",
         "alias of --taesd",
         0,
         &taesd_path},
        {"",
         "--control-net",
         "path to control net model",
         0,
         &control_net_path},
        {"",
         "--embd-dir",
         "embeddings directory",
         0,
         &embedding_dir},
        {"",
         "--lora-model-dir",
         "lora model directory",
         0,
         &lora_model_dir},
        {"",
         "--hires-upscalers-dir",
         "highres fix upscaler model directory",
         0,
         &hires_upscalers_dir},
        {"",
         "--tensor-type-rules",
         "weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")",
         (int)',',
         &tensor_type_rules},
        {"",
         "--photo-maker",
         "path to PHOTOMAKER model",
         0,
         &photo_maker_path},
        {"",
         "--pulid-weights",
         "path to PuLID Flux weights",
         0,
         &pulid_weights_path},
        {"",
         "--upscale-model",
         "path to esrgan model.",
         0,
         &esrgan_path},
        {"",
         "--backend",
         "runtime backend assignment, e.g. cpu or clip=cpu,vae=cuda0,diffusion=vulkan0",
         (int)',',
         &backend},
        {"",
         "--params-backend",
         "parameter backend assignment, e.g. disk, cpu, or diffusion=disk,clip=cpu",
         (int)',',
         &params_backend},
        {"",
         "--rpc-servers",
         "comma-separated list of RPC servers to connect to for offloading, in the format host:port, e.g. localhost:50052,192.168.1.3:50052",
         (int)',',
         &rpc_servers},
        {"",
         "--max-vram",
         "maximum VRAM budget in GiB for graph-cut segmented execution. Accepts a single value or assignments by backend/device, e.g. 6 or cuda0=6,vulkan0=4. 0 disables graph splitting; a negative value auto-detects free VRAM, sparing the specified value",
         0,
         &max_vram},
    };
@ -458,6 +497,10 @@ ArgOptions SDContextParams::get_options() {
         "--stream-layers",
         "enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram; defaults to false)",
         true, &stream_layers},
        {"",
         "--eager-load",
         "load all params into the params backend at model-load time instead of lazily on first use (defaults to false)",
         true, &eager_load},
        {"",
         "--force-sdxl-vae-conv-scale",
         "force use of conv scale on sdxl vae",
@ -761,6 +804,7 @@ std::string SDContextParams::to_string() const {
        << "  offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n"
        << "  max_vram: \"" << max_vram << "\",\n"
        << "  stream_layers: " << (stream_layers ? "true" : "false") << ",\n"
        << "  eager_load: " << (eager_load ? "true" : "false") << ",\n"
        << "  backend: \"" << backend << "\",\n"
        << "  params_backend: \"" << params_backend << "\",\n"
        << "  enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n"
@ -840,6 +884,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool taesd_preview) {
    sd_ctx_params.vae_format                      = str_to_vae_format(vae_format);
    sd_ctx_params.max_vram                        = max_vram.c_str();
    sd_ctx_params.stream_layers                   = stream_layers;
    sd_ctx_params.eager_load                      = eager_load;
    sd_ctx_params.backend                         = effective_backend.c_str();
    sd_ctx_params.params_backend                  = effective_params_backend.c_str();
    sd_ctx_params.rpc_servers                     = rpc_servers.c_str();
@ -857,58 +902,71 @@ ArgOptions SDGenerationParams::get_options() {
        {"-p",
         "--prompt",
         "the prompt to render",
         0,
         &prompt},
        {"-n",
         "--negative-prompt",
         "the negative prompt (default: \"\")",
         0,
         &negative_prompt},
        {"-i",
         "--init-img",
         "path to the init image",
         0,
         &init_image_path},
        {"",
         "--end-img",
         "path to the end image, required by flf2v",
         0,
         &end_image_path},
        {"",
         "--mask",
         "path to the mask image",
         0,
         &mask_image_path},
        {"",
         "--control-image",
         "path to control image, control net",
         0,
         &control_image_path},
        {"",
         "--control-video",
         "path to control video frames, It must be a directory path. The video frames inside should be stored as images in "
         "lexicographical (character) order. For example, if the control video path is `frames`, the directory contain images "
         "such as 00.png, 01.png, ... etc.",
         0,
         &control_video_path},
        {"",
         "--pm-id-images-dir",
         "path to PHOTOMAKER input id images dir",
         0,
         &pm_id_images_dir},
        {"",
         "--pm-id-embed-path",
         "path to PHOTOMAKER v2 id embed",
         0,
         &pm_id_embed_path},
        {"",
         "--pulid-id-embedding",
         "path to PuLID id embedding",
         0,
         &pulid_id_embedding_path},
        {"",
         "--hires-upscaler",
         "highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent (nearest-exact), "
         "Latent (antialiased), Latent (bicubic), Latent (bicubic antialiased), or a model name "
         "under --hires-upscalers-dir (default: Latent)",
         0,
         &hires_upscaler},
        {"",
         "--extra-sample-args",
-         "extra sampler/scheduler/guidance args, key=value list. APG supports apg_eta, apg_momentum, apg_norm_threshold, apg_norm_threshold_smoothing; SLG supports slg_uncond; lcm supports noise_clip_std, noise_scale_start, noise_scale_end; ltx2 supports max_shift, base_shift, stretch, terminal; euler_ge supports gamma",
+         "extra sampler/scheduler/guidance args, key=value list. CFG supports guidance_schedule; APG supports apg_eta, apg_momentum, apg_norm_threshold, apg_norm_threshold_smoothing; SLG supports slg_uncond; lcm supports noise_clip_std, noise_scale_start, noise_scale_end; ltx2 supports max_shift, base_shift, stretch, terminal; euler_ge supports gamma;",
         (int)',',
         &extra_sample_args},
        {"",
         "--extra-tiling-args",
         "extra VAE tiling args, key=value list. LTX video VAE supports temporal_tile_frames (default: 4), temporal_tile_overlap (default: 1)",
         (int)',',
         &extra_tiling_args},
    };
@ -1364,6 +1422,42 @@ ArgOptions SDGenerationParams::get_options() {
        return 1;
    };
    auto on_prompt_file_arg = [&](int argc, const char** argv, int index) {
        if (++index >= argc) {
            return -1;
        }
        const char* arg = argv[index];
        std::ifstream f(arg, std::ios::binary);
        try {
            prompt = std::string(std::istreambuf_iterator<char>{f}, {});
        } catch (const std::ios_base::failure&) {
            f.setstate(std::ios_base::failbit);
        }
        if (f.fail()) {
            LOG_ERROR("error: failed to read prompt file '%s'\n", arg);
            return -1;
        }
        return 1;
    };
    auto on_negative_prompt_file_arg = [&](int argc, const char** argv, int index) {
        if (++index >= argc) {
            return -1;
        }
        const char* arg = argv[index];
        std::ifstream f(arg, std::ios::binary);
        try {
            negative_prompt = std::string(std::istreambuf_iterator<char>{f}, {});
        } catch (const std::ios_base::failure&) {
            f.setstate(std::ios_base::failbit);
        }
        if (f.fail()) {
            LOG_ERROR("error: failed to read negative prompt file '%s'\n", arg);
            return -1;
        }
        return 1;
    };
    options.manual_options = {
        {"-s",
         "--seed",
@ -1427,6 +1521,14 @@ ArgOptions SDGenerationParams::get_options() {
         "--vae-relative-tile-size",
         "relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)",
         on_relative_tile_size_arg},
        {"",
         "--prompt-file",
         "path to the file containing the prompt to render",
         on_prompt_file_arg},
        {"",
         "--negative-prompt-file",
         "path to the file containing the negative prompt",
         on_negative_prompt_file_arg},
    };
--- a/examples/common/common.h
+++ b/examples/common/common.h
@ -31,6 +31,7 @@ struct StringOption {
    std::string short_name;
    std::string long_name;
    std::string desc;
    int concat;
    std::string* target;
 };
@ -147,6 +148,7 @@ struct SDContextParams {
    bool offload_params_to_cpu  = false;
    std::string max_vram        = "0";
    bool stream_layers          = false;
    bool eager_load             = false;
    std::string backend;
    std::string params_backend;
    std::string rpc_servers;
--- a/examples/server/runtime.cpp
+++ b/examples/server/runtime.cpp
@ -190,8 +190,8 @@ ArgOptions SDSvrParams::get_options() {
    ArgOptions options;
    options.string_options = {
-        {"-l", "--listen-ip", "server listen ip (default: 127.0.0.1)", &listen_ip},
+        {"-l", "--listen-ip", "server listen ip (default: 127.0.0.1)", 0, &listen_ip},
-        {"", "--serve-html-path", "path to HTML file to serve at root (optional)", &serve_html_path},
+        {"", "--serve-html-path", "path to HTML file to serve at root (optional)", 0, &serve_html_path},
    };
    options.int_options = {
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@ -219,6 +219,7 @@ typedef struct {
    enum sd_vae_format_t vae_format;
    const char* max_vram;  // GiB budget or backend assignment spec for graph-cut segmented param offload (0 = disabled, -1 = auto)
    bool stream_layers;  // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram)
    bool eager_load;  // Load all params into the params backend at model-load time instead of lazily on first use
    const char* backend;
    const char* params_backend;
    const char* rpc_servers;
--- a/src/conditioning/conditioner.hpp
+++ b/src/conditioning/conditioner.hpp
@ -1518,7 +1518,7 @@ struct LLMEmbedder : public Conditioner {
            arch = LLM::LLMArch::GPT_OSS_20B;
        } else if (sd_version_is_pid(version)) {
            arch = LLM::LLMArch::GEMMA2_2B;
-        } else if (sd_version_is_ideogram4(version)) {
+        } else if (sd_version_is_ideogram4(version) || sd_version_is_boogu_image(version)) {
            arch = LLM::LLMArch::QWEN3_VL;
        } else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) {
            arch = LLM::LLMArch::QWEN3;
@ -1778,6 +1778,65 @@ struct LLMEmbedder : public Conditioner {
                prompt += "<|im_end|>\n<|im_start|>assistant\n";
            }
        } else if (sd_version_is_boogu_image(version)) {
            prompt_template_encode_start_idx = 0;
            const std::string t2i_system_prompt =
                "You are a helpful assistant that generates high-quality images based on user instructions. The instructions are as follows.";
            const std::string edit_system_prompt =
                "Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.";
            const bool has_ref_images = llm->enable_vision && conditioner_params.ref_images != nullptr && !conditioner_params.ref_images->empty();
            const bool text_empty     = conditioner_params.text.find_first_not_of(" \t\r\n") == std::string::npos;
            if (has_ref_images) {
                LOG_INFO("BooguImageEditPipeline");
                const std::string prompt_prefix = "<|im_start|>system\n" + edit_system_prompt + "<|im_end|>\n<|im_start|>user\n";
                std::string img_prompt;
                const std::string placeholder = "<|image_pad|>";
                for (int i = 0; i < conditioner_params.ref_images->size(); i++) {
                    const auto& image = (*conditioner_params.ref_images)[i];
                    double factor     = llm->config.vision.patch_size * llm->config.vision.spatial_merge_size;
                    int height        = static_cast<int>(image.shape()[1]);
                    int width         = static_cast<int>(image.shape()[0]);
                    double beta       = std::sqrt((384.0 * 384.0) / (static_cast<double>(height) * static_cast<double>(width)));
                    int h_bar         = std::max(static_cast<int>(factor),
                                                 static_cast<int>(std::round(height * beta / factor)) * static_cast<int>(factor));
                    int w_bar         = std::max(static_cast<int>(factor),
                                                 static_cast<int>(std::round(width * beta / factor)) * static_cast<int>(factor));
                    LOG_DEBUG("resize conditioner ref image %d from %dx%d to %dx%d", i, height, width, h_bar, w_bar);
                    auto resized_image = clip_preprocess(image, w_bar, h_bar);
                    auto image_embed   = llm->encode_image(n_threads, resized_image, false, true, true);
                    GGML_ASSERT(!image_embed.empty());
                    std::string image_prefix = prompt_prefix + img_prompt + "<|vision_start|>";
                    int image_embed_idx      = static_cast<int>(tokenizer->encode(image_prefix, nullptr).size());
                    image_embeds.emplace_back(image_embed_idx, image_embed);
                    img_prompt += "<|vision_start|>";
                    int64_t num_image_tokens = image_embed.shape()[1];
                    img_prompt.reserve(img_prompt.size() + static_cast<size_t>(num_image_tokens) * placeholder.size() + 32);
                    for (int j = 0; j < num_image_tokens; j++) {
                        img_prompt += placeholder;
                    }
                    img_prompt += "<|vision_end|>";
                }
                prompt                  = prompt_prefix + img_prompt;
                prompt_attn_range.first = static_cast<int>(prompt.size());
                prompt += conditioner_params.text;
                prompt_attn_range.second = static_cast<int>(prompt.size());
                prompt += "<|im_end|>\n";
            } else {
                const std::string& system_prompt = text_empty ? edit_system_prompt : t2i_system_prompt;
                prompt                           = "<|im_start|>system\n" + system_prompt + "<|im_end|>\n<|im_start|>user\n";
                prompt_attn_range.first          = static_cast<int>(prompt.size());
                prompt += conditioner_params.text;
                prompt_attn_range.second = static_cast<int>(prompt.size());
                prompt += "<|im_end|>\n";
            }
        } else if (sd_version_is_longcat(version)) {
            spell_quotes = true;
--- a/src/model.h
+++ b/src/model.h
@ -42,6 +42,7 @@ enum SDVersion {
    VERSION_LTXAV,
    VERSION_HIDREAM_O1,
    VERSION_Z_IMAGE,
    VERSION_BOOGU_IMAGE,
    VERSION_OVIS_IMAGE,
    VERSION_ERNIE_IMAGE,
    VERSION_LENS,
@ -143,6 +144,13 @@ static inline bool sd_version_is_z_image(SDVersion version) {
    return false;
 }
 // Reports whether `version` is VERSION_BOOGU_IMAGE.
 static inline bool sd_version_is_boogu_image(SDVersion version) {
    return version == VERSION_BOOGU_IMAGE;
 }
 static inline bool sd_version_is_longcat(SDVersion version) {
    if (version == VERSION_LONGCAT) {
        return true;
@ -178,6 +186,13 @@ static inline bool sd_version_is_ideogram4(SDVersion version) {
    return false;
 }
 // Reports whether `version` belongs to one of the families grouped here as FLUX-VAE users:
 // FLUX, Z-Image, Boogu Image, or LongCat.
 static inline bool sd_version_uses_flux_vae(SDVersion version) {
    return sd_version_is_flux(version) ||
           sd_version_is_z_image(version) ||
           sd_version_is_boogu_image(version) ||
           sd_version_is_longcat(version);
 }
 static inline bool sd_version_uses_flux2_vae(SDVersion version) {
    if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version) || sd_version_is_lens(version) || sd_version_is_ideogram4(version)) {
        return true;
@ -206,6 +221,7 @@ static inline bool sd_version_is_dit(SDVersion version) {
        version == VERSION_HIDREAM_O1 ||
        sd_version_is_anima(version) ||
        sd_version_is_z_image(version) ||
        sd_version_is_boogu_image(version) ||
        sd_version_is_ernie_image(version) ||
        sd_version_is_lens(version) ||
        sd_version_is_longcat(version) ||
--- a/src/model/common/rope.hpp
+++ b/src/model/common/rope.hpp
@ -899,10 +899,12 @@ namespace Rope {
        // q,k,v: [N, L, n_head, d_head]
        // pe: [L, d_head/2, 2, 2]
        // return: [N, L, n_head*d_head]
        int64_t n_head = q->ne[1];
        q = apply_rope(ctx->ggml_ctx, q, pe, rope_interleaved);  // [N*n_head, L, d_head]
        k = apply_rope(ctx->ggml_ctx, k, pe, rope_interleaved);  // [N*n_head, L, d_head]
-        auto x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, v->ne[1], mask, true, ctx->flash_attn_enabled, kv_scale);  // [N, L, n_head*d_head]
+        auto x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, mask, true, ctx->flash_attn_enabled, kv_scale);  // [N, L, n_head*d_head]
        return x;
    }
 };  // namespace Rope
--- a/src/model/diffusion/boogu.hpp
+++ b/src/model/diffusion/boogu.hpp
@ -0,0 +1,835 @@
 #ifndef __SD_MODEL_DIFFUSION_BOOGU_HPP__
 #define __SD_MODEL_DIFFUSION_BOOGU_HPP__
 #include <algorithm>
 #include <cmath>
 #include <tuple>
 #include <vector>
 #include "core/ggml_extend.hpp"
 #include "model/common/rope.hpp"
 #include "model/diffusion/dit.hpp"
 #include "model/diffusion/model.hpp"
 #include "model/diffusion/qwen_image.hpp"
 #include "model_loader.h"
 namespace Boogu {
    constexpr int BOOGU_GRAPH_SIZE = 65536;
    // Hyper-parameters of the Boogu diffusion transformer. The member initializers are the
    // defaults; detect_from_weights() overwrites those it can infer from the tensor map.
    struct BooguConfig {
        int patch_size                   = 2;
        int64_t in_channels              = 16;
        int64_t out_channels             = 16;
        int64_t hidden_size              = 3360;
        int64_t num_layers               = 32;
        int64_t num_double_stream_layers = 8;
        int64_t num_refiner_layers       = 2;
        int64_t num_attention_heads      = 28;
        int64_t num_kv_heads             = 7;
        int64_t head_dim                 = 120;
        int64_t multiple_of              = 256;
        int64_t instruction_feat_dim     = 4096;
        int64_t timestep_embed_dim       = 1024;
        int theta                        = 10000;
        float timestep_scale             = 1000.0f;
        float norm_eps                   = 1e-5f;
        std::vector<int> axes_dim        = {40, 40, 40};
        int64_t axes_dim_sum             = 120;

        // Counts indexed blocks of one kind. For every tensor whose name starts with `prefix`
        // and contains `block_prefix`, the dot-separated component right after the block name
        // is parsed with atoi; the result is the largest (index + 1) seen, or 0 if none match.
        static int64_t count_blocks(const String2TensorStorage& tensor_storage_map,
                                    const std::string& prefix,
                                    const std::string& block_prefix) {
            int64_t num_blocks = 0;
            for (const auto& [tensor_name, _] : tensor_storage_map) {
                const size_t block_pos = starts_with(tensor_name, prefix) ? tensor_name.find(block_prefix) : std::string::npos;
                if (block_pos == std::string::npos) {
                    continue;
                }
                const auto parts = split_string(tensor_name.substr(block_pos), '.');
                if (parts.size() > 1) {
                    const int64_t index_plus_one = atoi(parts[1].c_str()) + 1;
                    num_blocks                   = std::max(num_blocks, index_plus_one);
                }
            }
            return num_blocks;
        }

        // Builds a config from the shapes and names of the tensors under `prefix`.
        // Values that cannot be inferred keep the defaults declared above.
        static BooguConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) {
            BooguConfig config;
            // patch_size is not modified in this function, so the patch area is loop-invariant.
            const int64_t patch_area = config.patch_size * config.patch_size;
            int64_t head_dim_found   = 0;
            int64_t kv_dim_found     = 0;

            for (const auto& [name, tensor_storage] : tensor_storage_map) {
                if (!starts_with(name, prefix)) {
                    continue;
                }
                const bool is_vector = tensor_storage.n_dims == 1;
                const bool is_matrix = tensor_storage.n_dims == 2;
                if (is_matrix && ends_with(name, "x_embedder.weight")) {
                    config.in_channels = tensor_storage.ne[0] / patch_area;
                    config.hidden_size = tensor_storage.ne[1];
                } else if (is_matrix && ends_with(name, "time_caption_embed.caption_embedder.1.weight")) {
                    config.instruction_feat_dim = tensor_storage.ne[0];
                    config.hidden_size          = tensor_storage.ne[1];
                } else if (is_vector && ends_with(name, "single_stream_layers.0.attn.norm_q.weight")) {
                    head_dim_found = tensor_storage.ne[0];
                } else if (is_vector && ends_with(name, "double_stream_layers.0.img_self_attn.norm_q.weight")) {
                    head_dim_found = tensor_storage.ne[0];
                } else if (is_matrix && ends_with(name, "single_stream_layers.0.attn.to_k.weight")) {
                    kv_dim_found = tensor_storage.ne[1];
                } else if (is_matrix && ends_with(name, "double_stream_layers.0.img_instruct_attn.processor.img_to_k.weight")) {
                    kv_dim_found = tensor_storage.ne[1];
                } else if (is_matrix && ends_with(name, "norm_out.linear_2.weight")) {
                    config.out_channels = tensor_storage.ne[1] / patch_area;
                }
            }

            // Layer counts come from the highest block index present in the tensor names.
            const int64_t single_stream_blocks = count_blocks(tensor_storage_map, prefix, "single_stream_layers.");
            const int64_t double_stream_blocks = count_blocks(tensor_storage_map, prefix, "double_stream_layers.");
            const int64_t noise_blocks         = count_blocks(tensor_storage_map, prefix, "noise_refiner.");
            const int64_t ref_image_blocks     = count_blocks(tensor_storage_map, prefix, "ref_image_refiner.");
            const int64_t context_blocks       = count_blocks(tensor_storage_map, prefix, "context_refiner.");
            config.num_layers                  = std::max<int64_t>(1, single_stream_blocks);
            config.num_double_stream_layers    = std::max<int64_t>(0, double_stream_blocks);
            config.num_refiner_layers          = std::max<int64_t>(1, std::max({noise_blocks, ref_image_blocks, context_blocks}));

            if (head_dim_found > 0) {
                config.head_dim            = head_dim_found;
                config.num_attention_heads = config.hidden_size / config.head_dim;
                config.axes_dim_sum        = config.head_dim;
                if (kv_dim_found > 0) {
                    config.num_kv_heads = kv_dim_found / config.head_dim;
                }
                // Split the head dimension evenly over the three axes when it divides by 3
                // (120 gives {40, 40, 40}); otherwise axes_dim keeps its previous value.
                if (config.axes_dim_sum % 3 == 0) {
                    const int per_axis = static_cast<int>(config.axes_dim_sum / 3);
                    config.axes_dim    = {per_axis, per_axis, per_axis};
                }
            }

            config.timestep_embed_dim = std::min<int64_t>(config.hidden_size, 1024);
            LOG_DEBUG("boogu_image: layers=%" PRId64 ", double_stream_layers=%" PRId64 ", refiner_layers=%" PRId64 ", hidden=%" PRId64 ", heads=%" PRId64 ", kv_heads=%" PRId64 ", head_dim=%" PRId64 ", in_channels=%" PRId64 ", out_channels=%" PRId64,
                      config.num_layers,
                      config.num_double_stream_layers,
                      config.num_refiner_layers,
                      config.hidden_size,
                      config.num_attention_heads,
                      config.num_kv_heads,
                      config.head_dim,
                      config.in_channels,
                      config.out_channels);
            return config;
        }
    };
    // Returns x + x * scale. `scale` is first reshaped from [ne0, ne1] to [ne0, 1, ne1]
    // before being multiplied with x.
    __STATIC_INLINE__ ggml_tensor* scale_modulate(ggml_context* ctx, ggml_tensor* x, ggml_tensor* scale) {
        ggml_tensor* scale_3d = ggml_reshape_3d(ctx, scale, scale->ne[0], 1, scale->ne[1]);
        ggml_tensor* scaled_x = ggml_mul(ctx, x, scale_3d);
        return ggml_add(ctx, x, scaled_x);
    }
    // Returns residual + x * tanh(gate). The tanh-ed gate is reshaped from [ne0, ne1] to
    // [ne0, 1, ne1] before being multiplied with x.
    __STATIC_INLINE__ ggml_tensor* gate_residual(ggml_context* ctx, ggml_tensor* residual, ggml_tensor* x, ggml_tensor* gate) {
        ggml_tensor* squashed_gate = ggml_tanh(ctx, gate);
        ggml_tensor* gate_3d       = ggml_reshape_3d(ctx, squashed_gate, squashed_gate->ne[0], 1, squashed_gate->ne[1]);
        ggml_tensor* gated_x       = ggml_mul(ctx, x, gate_3d);
        return ggml_add(ctx, residual, gated_x);
    }
    struct LuminaCombinedTimestepCaptionEmbedding : public GGMLBlock {
        int64_t frequency_embedding_size;
        float timestep_scale;
        LuminaCombinedTimestepCaptionEmbedding(int64_t hidden_size,
                                               int64_t instruction_feat_dim,
                                               int64_t frequency_embedding_size,
                                               float norm_eps,
                                               float timestep_scale)
            : frequency_embedding_size(frequency_embedding_size),
              timestep_scale(timestep_scale) {
            blocks["timestep_embedder"]  = std::make_shared<Qwen::TimestepEmbedding>(frequency_embedding_size, std::min<int64_t>(hidden_size, 1024));
            blocks["caption_embedder.0"] = std::make_shared<RMSNorm>(instruction_feat_dim, norm_eps);
            blocks["caption_embedder.1"] = std::make_shared<Linear>(instruction_feat_dim, hidden_size, true);
        }
        std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx, ggml_tensor* timestep, ggml_tensor* text_hidden_states) {
            auto timestep_embedder  = std::dynamic_pointer_cast<Qwen::TimestepEmbedding>(blocks["timestep_embedder"]);
            auto caption_embedder_0 = std::dynamic_pointer_cast<RMSNorm>(blocks["caption_embedder.0"]);
            auto caption_embedder_1 = std::dynamic_pointer_cast<Linear>(blocks["caption_embedder.1"]);
            auto timestep_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep, static_cast<int>(frequency_embedding_size), 10000, timestep_scale);
            auto time_embed    = timestep_embedder->forward(ctx, timestep_proj);
            auto caption_embed = caption_embedder_1->forward(ctx, caption_embedder_0->forward(ctx, text_hidden_states));
            return {time_embed, caption_embed};
        }
    };
    struct LuminaRMSNormZero : public GGMLBlock {
        LuminaRMSNormZero(int64_t embedding_dim, int64_t conditioning_embedding_dim, float norm_eps) {
            blocks["linear"] = std::make_shared<Linear>(conditioning_embedding_dim, 4 * embedding_dim, true);
            blocks["norm"]   = std::make_shared<RMSNorm>(embedding_dim, norm_eps);
        }
        std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* emb) {
            auto linear = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
            auto norm   = std::dynamic_pointer_cast<RMSNorm>(blocks["norm"]);
            emb       = linear->forward(ctx, ggml_silu(ctx->ggml_ctx, emb));
            auto mods = ggml_ext_chunk(ctx->ggml_ctx, emb, 4, 0);
            auto scale_msa = mods[0];
            auto gate_msa  = mods[1];
            auto scale_mlp = mods[2];
            auto gate_mlp  = mods[3];
            x = scale_modulate(ctx->ggml_ctx, norm->forward(ctx, x), scale_msa);
            return {x, gate_msa, scale_mlp, gate_mlp};
        }
    };
    struct LuminaFeedForward : public GGMLBlock {
        LuminaFeedForward(int64_t dim, int64_t inner_dim, int64_t multiple_of) {
            inner_dim          = multiple_of * ((inner_dim + multiple_of - 1) / multiple_of);
            blocks["linear_1"] = std::make_shared<Linear>(dim, inner_dim, false);
            blocks["linear_2"] = std::make_shared<Linear>(inner_dim, dim, false);
            blocks["linear_3"] = std::make_shared<Linear>(dim, inner_dim, false);
        }
        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
            auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["linear_1"]);
            auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["linear_2"]);
            auto linear_3 = std::dynamic_pointer_cast<Linear>(blocks["linear_3"]);
            if (sd_backend_is(ctx->backend, "Vulkan")) {
                linear_2->set_force_prec_f32(true);
            }
            auto h1 = linear_1->forward(ctx, x);
            auto h2 = linear_3->forward(ctx, x);
            x       = ggml_swiglu_split(ctx->ggml_ctx, h1, h2);
            x       = linear_2->forward(ctx, x);
            return x;
        }
    };
    // Output head: LayerNorm of the hidden states, scaled by a projection of the
    // conditioning embedding (SiLU -> linear_1), then projected to out_dim by linear_2.
    struct LuminaLayerNormContinuous : public GGMLBlock {
        LuminaLayerNormContinuous(int64_t embedding_dim,
                                  int64_t conditioning_embedding_dim,
                                  int64_t out_dim) {
            blocks["linear_1"] = std::make_shared<Linear>(conditioning_embedding_dim, embedding_dim, true);
            blocks["norm"]     = std::make_shared<LayerNorm>(embedding_dim, 1e-6f, false);
            blocks["linear_2"] = std::make_shared<Linear>(embedding_dim, out_dim, true);
        }
        ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* conditioning_embedding) {
            auto cond_proj  = std::dynamic_pointer_cast<Linear>(blocks["linear_1"]);
            auto layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["norm"]);
            auto out_proj   = std::dynamic_pointer_cast<Linear>(blocks["linear_2"]);

            // Scale derived from the conditioning embedding.
            auto activated_cond = ggml_silu(ctx->ggml_ctx, conditioning_embedding);
            auto scale          = cond_proj->forward(ctx, activated_cond);

            auto normed    = layer_norm->forward(ctx, x);
            auto modulated = scale_modulate(ctx->ggml_ctx, normed, scale);
            return out_proj->forward(ctx, modulated);
        }
    };
    struct Attention : public GGMLBlock {
        int64_t dim_head;
        int64_t heads;
        int64_t kv_heads;
        Attention(int64_t query_dim, int64_t dim_head, int64_t heads, int64_t kv_heads, float eps = 1e-5f)
            : dim_head(dim_head), heads(heads), kv_heads(kv_heads) {
            blocks["to_q"]     = std::make_shared<Linear>(query_dim, heads * dim_head, false);
            blocks["to_k"]     = std::make_shared<Linear>(query_dim, kv_heads * dim_head, false);
            blocks["to_v"]     = std::make_shared<Linear>(query_dim, kv_heads * dim_head, false);
            blocks["norm_q"]   = std::make_shared<RMSNorm>(dim_head, eps);
            blocks["norm_k"]   = std::make_shared<RMSNorm>(dim_head, eps);
            blocks["to_out.0"] = std::make_shared<Linear>(heads * dim_head, query_dim, false);
        }
        ggml_tensor* forward(GGMLRunnerContext* ctx,
                             ggml_tensor* hidden_states,
                             ggml_tensor* encoder_hidden_states,
                             ggml_tensor* rotary_emb,
                             ggml_tensor* attention_mask = nullptr) {
            auto to_q     = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
            auto to_k     = std::dynamic_pointer_cast<Linear>(blocks["to_k"]);
            auto to_v     = std::dynamic_pointer_cast<Linear>(blocks["to_v"]);
            auto norm_q   = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_q"]);
            auto norm_k   = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_k"]);
            auto to_out_0 = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
            if (sd_backend_is(ctx->backend, "Vulkan")) {
                to_out_0->set_force_prec_f32(true);
            }
            int64_t N  = hidden_states->ne[2];
            int64_t Lq = hidden_states->ne[1];
            int64_t Lk = encoder_hidden_states->ne[1];
            auto q = to_q->forward(ctx, hidden_states);
            q      = ggml_reshape_4d(ctx->ggml_ctx, q, dim_head, heads, Lq, N);
            auto k = to_k->forward(ctx, encoder_hidden_states);
            k      = ggml_reshape_4d(ctx->ggml_ctx, k, dim_head, kv_heads, Lk, N);
            auto v = to_v->forward(ctx, encoder_hidden_states);
            v      = ggml_reshape_4d(ctx->ggml_ctx, v, dim_head, kv_heads, Lk, N);
            q = norm_q->forward(ctx, q);
            k = norm_k->forward(ctx, k);
            auto out = Rope::attention(ctx, q, k, v, rotary_emb, attention_mask);
            out      = to_out_0->forward(ctx, out);
            return out;
        }
    };
    // Single-stream transformer block: self-attention followed by a feed-forward network.
    // Each sub-layer output is RMS-normalised (norm2 / ffn_norm2) before being added back
    // to the residual stream.
    //
    // With `modulation` enabled, "norm1" is a LuminaRMSNormZero conditioned on `temb`,
    // which also yields the gates/scales applied around the attention and MLP sub-layers.
    // With it disabled, "norm1" is a plain RMSNorm and `temb` is not used.
    struct BooguImageTransformerBlock : public GGMLBlock {
        bool modulation;
        // dim:                  width of the hidden states.
        // num_attention_heads:  number of query heads; head width is dim / num_attention_heads.
        // num_kv_heads:         number of key/value heads.
        // multiple_of:          rounding granularity for the feed-forward inner width.
        // norm_eps:             epsilon for the block-level RMSNorm layers.
        // modulation:           whether the block is conditioned on a timestep embedding.
        BooguImageTransformerBlock(int64_t dim,
                                   int64_t num_attention_heads,
                                   int64_t num_kv_heads,
                                   int64_t multiple_of,
                                   float norm_eps,
                                   bool modulation)
            : modulation(modulation) {
            int64_t head_dim       = dim / num_attention_heads;
            blocks["attn"]         = std::make_shared<Attention>(dim, head_dim, num_attention_heads, num_kv_heads, 1e-5f);
            blocks["feed_forward"] = std::make_shared<LuminaFeedForward>(dim, 4 * dim, multiple_of);
            if (modulation) {
                // Conditioning width is capped at 1024.
                blocks["norm1"] = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);
            } else {
                blocks["norm1"] = std::make_shared<RMSNorm>(dim, norm_eps);
            }
            blocks["ffn_norm1"] = std::make_shared<RMSNorm>(dim, norm_eps);
            blocks["norm2"]     = std::make_shared<RMSNorm>(dim, norm_eps);
            blocks["ffn_norm2"] = std::make_shared<RMSNorm>(dim, norm_eps);
        }
        // Runs the block on `hidden_states` and returns the updated hidden states.
        // `temb` must be non-null when the block was built with modulation enabled.
        ggml_tensor* forward(GGMLRunnerContext* ctx,
                             ggml_tensor* hidden_states,
                             ggml_tensor* rotary_emb,
                             ggml_tensor* temb           = nullptr,
                             ggml_tensor* attention_mask = nullptr) {
            auto attn         = std::dynamic_pointer_cast<Attention>(blocks["attn"]);
            auto feed_forward = std::dynamic_pointer_cast<LuminaFeedForward>(blocks["feed_forward"]);
            auto ffn_norm1    = std::dynamic_pointer_cast<RMSNorm>(blocks["ffn_norm1"]);
            auto norm2        = std::dynamic_pointer_cast<RMSNorm>(blocks["norm2"]);
            auto ffn_norm2    = std::dynamic_pointer_cast<RMSNorm>(blocks["ffn_norm2"]);
            if (modulation) {
                // `temb` defaults to nullptr for the benefit of unmodulated blocks. Here it is
                // mandatory, so fail with a clear assertion instead of handing a null tensor
                // to LuminaRMSNormZero.
                GGML_ASSERT(temb != nullptr);
                auto norm1 = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["norm1"]);
                auto mods  = norm1->forward(ctx, hidden_states, temb);
                auto norm_hidden_states = std::get<0>(mods);
                auto gate_msa           = std::get<1>(mods);
                auto scale_mlp          = std::get<2>(mods);
                auto gate_mlp           = std::get<3>(mods);
                // Attention sub-layer: gated residual around the normalised attention output.
                auto attn_output = attn->forward(ctx, norm_hidden_states, norm_hidden_states, rotary_emb, attention_mask);
                hidden_states    = gate_residual(ctx->ggml_ctx, hidden_states, norm2->forward(ctx, attn_output), gate_msa);
                // MLP sub-layer: scale the normalised input, then gated residual on the output.
                auto mlp_input  = scale_modulate(ctx->ggml_ctx, ffn_norm1->forward(ctx, hidden_states), scale_mlp);
                auto mlp_output = feed_forward->forward(ctx, mlp_input);
                hidden_states   = gate_residual(ctx->ggml_ctx, hidden_states, ffn_norm2->forward(ctx, mlp_output), gate_mlp);
            } else {
                auto norm1 = std::dynamic_pointer_cast<RMSNorm>(blocks["norm1"]);
                auto norm_hidden_states = norm1->forward(ctx, hidden_states);
                // Same structure as above, but with plain (ungated, unscaled) residual adds.
                auto attn_output        = attn->forward(ctx, norm_hidden_states, norm_hidden_states, rotary_emb, attention_mask);
                hidden_states           = ggml_add(ctx->ggml_ctx, hidden_states, norm2->forward(ctx, attn_output));
                auto mlp_output = feed_forward->forward(ctx, ffn_norm1->forward(ctx, hidden_states));
                hidden_states   = ggml_add(ctx->ggml_ctx, hidden_states, ffn_norm2->forward(ctx, mlp_output));
            }
            return hidden_states;
        }
    };
    struct BooguImageJointAttention : public GGMLBlock {
        int64_t dim_head;
        int64_t heads;
        int64_t kv_heads;
        BooguImageJointAttention(int64_t dim, int64_t dim_head, int64_t heads, int64_t kv_heads)
            : dim_head(dim_head), heads(heads), kv_heads(kv_heads) {
            blocks["norm_q"]                  = std::make_shared<RMSNorm>(dim_head, 1e-5f);
            blocks["norm_k"]                  = std::make_shared<RMSNorm>(dim_head, 1e-5f);
            blocks["to_out.0"]                = std::make_shared<Linear>(heads * dim_head, dim, false);
            blocks["processor.img_to_q"]      = std::make_shared<Linear>(dim, heads * dim_head, false);
            blocks["processor.img_to_k"]      = std::make_shared<Linear>(dim, kv_heads * dim_head, false);
            blocks["processor.img_to_v"]      = std::make_shared<Linear>(dim, kv_heads * dim_head, false);
            blocks["processor.instruct_to_q"] = std::make_shared<Linear>(dim, heads * dim_head, false);
            blocks["processor.instruct_to_k"] = std::make_shared<Linear>(dim, kv_heads * dim_head, false);
            blocks["processor.instruct_to_v"] = std::make_shared<Linear>(dim, kv_heads * dim_head, false);
            blocks["processor.instruct_out"]  = std::make_shared<Linear>(heads * dim_head, dim, false);
            blocks["processor.img_out"]       = std::make_shared<Linear>(heads * dim_head, dim, false);
        }
        ggml_tensor* forward(GGMLRunnerContext* ctx,
                             ggml_tensor* img_hidden_states,
                             ggml_tensor* instruct_hidden_states,
                             ggml_tensor* rotary_emb,
                             ggml_tensor* attention_mask = nullptr) {
            auto norm_q        = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_q"]);
            auto norm_k        = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_k"]);
            auto to_out_0      = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
            auto img_to_q      = std::dynamic_pointer_cast<Linear>(blocks["processor.img_to_q"]);
            auto img_to_k      = std::dynamic_pointer_cast<Linear>(blocks["processor.img_to_k"]);
            auto img_to_v      = std::dynamic_pointer_cast<Linear>(blocks["processor.img_to_v"]);
            auto instruct_to_q = std::dynamic_pointer_cast<Linear>(blocks["processor.instruct_to_q"]);
            auto instruct_to_k = std::dynamic_pointer_cast<Linear>(blocks["processor.instruct_to_k"]);
            auto instruct_to_v = std::dynamic_pointer_cast<Linear>(blocks["processor.instruct_to_v"]);
            auto instruct_out  = std::dynamic_pointer_cast<Linear>(blocks["processor.instruct_out"]);
            auto img_out       = std::dynamic_pointer_cast<Linear>(blocks["processor.img_out"]);
            if (sd_backend_is(ctx->backend, "Vulkan")) {
                to_out_0->set_force_prec_f32(true);
            }
            int64_t N          = img_hidden_states->ne[2];
            int64_t L_img      = img_hidden_states->ne[1];
            int64_t L_instruct = instruct_hidden_states->ne[1];
            auto img_q = img_to_q->forward(ctx, img_hidden_states);
            img_q      = ggml_reshape_4d(ctx->ggml_ctx, img_q, dim_head, heads, L_img, N);
            auto img_k = img_to_k->forward(ctx, img_hidden_states);
            img_k      = ggml_reshape_4d(ctx->ggml_ctx, img_k, dim_head, kv_heads, L_img, N);
            auto img_v = img_to_v->forward(ctx, img_hidden_states);
            img_v      = ggml_reshape_4d(ctx->ggml_ctx, img_v, dim_head, kv_heads, L_img, N);
            auto instruct_q = instruct_to_q->forward(ctx, instruct_hidden_states);
            instruct_q      = ggml_reshape_4d(ctx->ggml_ctx, instruct_q, dim_head, heads, L_instruct, N);
            auto instruct_k = instruct_to_k->forward(ctx, instruct_hidden_states);
            instruct_k      = ggml_reshape_4d(ctx->ggml_ctx, instruct_k, dim_head, kv_heads, L_instruct, N);
            auto instruct_v = instruct_to_v->forward(ctx, instruct_hidden_states);
            instruct_v      = ggml_reshape_4d(ctx->ggml_ctx, instruct_v, dim_head, kv_heads, L_instruct, N);
            auto q = ggml_concat(ctx->ggml_ctx, instruct_q, img_q, 2);
            auto k = ggml_concat(ctx->ggml_ctx, instruct_k, img_k, 2);
            auto v = ggml_concat(ctx->ggml_ctx, instruct_v, img_v, 2);
            q      = norm_q->forward(ctx, q);
            k      = norm_k->forward(ctx, k);
            auto hidden_states = Rope::attention(ctx, q, k, v, rotary_emb, attention_mask);
            auto instruct_attn = ggml_ext_slice(ctx->ggml_ctx, hidden_states, 1, 0, L_instruct);
            auto img_attn      = ggml_ext_slice(ctx->ggml_ctx, hidden_states, 1, L_instruct, L_instruct + L_img);
            instruct_attn = instruct_out->forward(ctx, instruct_attn);
            img_attn      = img_out->forward(ctx, img_attn);
            hidden_states = ggml_concat(ctx->ggml_ctx, instruct_attn, img_attn, 1);
            hidden_states = to_out_0->forward(ctx, hidden_states);
            return hidden_states;
        }
    };
    struct BooguImageDoubleStreamBlock : public GGMLBlock {
        BooguImageDoubleStreamBlock(int64_t dim,
                                    int64_t num_attention_heads,
                                    int64_t num_kv_heads,
                                    int64_t multiple_of,
                                    float norm_eps) {
            int64_t head_dim                = dim / num_attention_heads;
            blocks["img_instruct_attn"]     = std::make_shared<BooguImageJointAttention>(dim, head_dim, num_attention_heads, num_kv_heads);
            blocks["img_self_attn"]         = std::make_shared<Attention>(dim, head_dim, num_attention_heads, num_kv_heads, 1e-5f);
            blocks["img_feed_forward"]      = std::make_shared<LuminaFeedForward>(dim, 4 * dim, multiple_of);
            blocks["instruct_feed_forward"] = std::make_shared<LuminaFeedForward>(dim, 4 * dim, multiple_of);
            blocks["img_norm1"]             = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);
            blocks["img_norm2"]             = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);
            blocks["img_norm3"]             = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);
            blocks["instruct_norm1"]        = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);
            blocks["instruct_norm2"]        = std::make_shared<LuminaRMSNormZero>(dim, std::min<int64_t>(dim, 1024), norm_eps);
            blocks["img_attn_norm"]         = std::make_shared<RMSNorm>(dim, norm_eps);
            blocks["img_self_attn_norm"]    = std::make_shared<RMSNorm>(dim, norm_eps);
            blocks["img_ffn_norm1"]         = std::make_shared<RMSNorm>(dim, norm_eps);
            blocks["img_ffn_norm2"]         = std::make_shared<RMSNorm>(dim, norm_eps);
            blocks["instruct_attn_norm"]    = std::make_shared<RMSNorm>(dim, norm_eps);
            blocks["instruct_ffn_norm1"]    = std::make_shared<RMSNorm>(dim, norm_eps);
            blocks["instruct_ffn_norm2"]    = std::make_shared<RMSNorm>(dim, norm_eps);
        }
        std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
                                                      ggml_tensor* img_hidden_states,
                                                      ggml_tensor* instruct_hidden_states,
                                                      ggml_tensor* joint_rotary_emb,
                                                      ggml_tensor* img_rotary_emb,
                                                      ggml_tensor* temb) {
            auto img_instruct_attn     = std::dynamic_pointer_cast<BooguImageJointAttention>(blocks["img_instruct_attn"]);
            auto img_self_attn         = std::dynamic_pointer_cast<Attention>(blocks["img_self_attn"]);
            auto img_feed_forward      = std::dynamic_pointer_cast<LuminaFeedForward>(blocks["img_feed_forward"]);
            auto instruct_feed_forward = std::dynamic_pointer_cast<LuminaFeedForward>(blocks["instruct_feed_forward"]);
            auto img_norm1             = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["img_norm1"]);
            auto img_norm2             = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["img_norm2"]);
            auto img_norm3             = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["img_norm3"]);
            auto instruct_norm1        = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["instruct_norm1"]);
            auto instruct_norm2        = std::dynamic_pointer_cast<LuminaRMSNormZero>(blocks["instruct_norm2"]);
            auto img_attn_norm         = std::dynamic_pointer_cast<RMSNorm>(blocks["img_attn_norm"]);
            auto img_self_attn_norm    = std::dynamic_pointer_cast<RMSNorm>(blocks["img_self_attn_norm"]);
            auto img_ffn_norm1         = std::dynamic_pointer_cast<RMSNorm>(blocks["img_ffn_norm1"]);
            auto img_ffn_norm2         = std::dynamic_pointer_cast<RMSNorm>(blocks["img_ffn_norm2"]);
            auto instruct_attn_norm    = std::dynamic_pointer_cast<RMSNorm>(blocks["instruct_attn_norm"]);
            auto instruct_ffn_norm1    = std::dynamic_pointer_cast<RMSNorm>(blocks["instruct_ffn_norm1"]);
            auto instruct_ffn_norm2    = std::dynamic_pointer_cast<RMSNorm>(blocks["instruct_ffn_norm2"]);
            int64_t L_instruct = instruct_hidden_states->ne[1];
            auto img_norm1_out_vec      = img_norm1->forward(ctx, img_hidden_states, temb);
            auto img_norm2_out_vec      = img_norm2->forward(ctx, img_hidden_states, temb);
            auto img_norm3_out_vec      = img_norm3->forward(ctx, img_hidden_states, temb);
            auto instruct_norm1_out_vec = instruct_norm1->forward(ctx, instruct_hidden_states, temb);
            auto instruct_norm2_out_vec = instruct_norm2->forward(ctx, instruct_hidden_states, temb);
            auto img_norm1_out = std::get<0>(img_norm1_out_vec);
            auto img_gate_msa  = std::get<1>(img_norm1_out_vec);
            auto img_scale_mlp = std::get<2>(img_norm1_out_vec);
            auto img_gate_mlp  = std::get<3>(img_norm1_out_vec);
            auto img_norm2_out = std::get<0>(img_norm2_out_vec);
            auto img_shift_mlp = std::get<1>(img_norm2_out_vec);
            auto img_norm3_out = std::get<0>(img_norm3_out_vec);
            auto img_gate_self = std::get<1>(img_norm3_out_vec);
            auto instruct_norm1_out = std::get<0>(instruct_norm1_out_vec);
            auto instruct_gate_msa  = std::get<1>(instruct_norm1_out_vec);
            auto instruct_scale_mlp = std::get<2>(instruct_norm1_out_vec);
            auto instruct_gate_mlp  = std::get<3>(instruct_norm1_out_vec);
            auto instruct_norm2_out = std::get<0>(instruct_norm2_out_vec);
            auto instruct_shift_mlp = std::get<1>(instruct_norm2_out_vec);
            auto joint_attn_out    = img_instruct_attn->forward(ctx, img_norm1_out, instruct_norm1_out, joint_rotary_emb);
            auto instruct_attn_out = ggml_ext_slice(ctx->ggml_ctx, joint_attn_out, 1, 0, L_instruct);
            auto img_attn_out      = ggml_ext_slice(ctx->ggml_ctx, joint_attn_out, 1, L_instruct, joint_attn_out->ne[1]);
            auto img_self_attn_out = img_self_attn->forward(ctx, img_norm3_out, img_norm3_out, img_rotary_emb);
            img_hidden_states = gate_residual(ctx->ggml_ctx, img_hidden_states, img_attn_norm->forward(ctx, img_attn_out), img_gate_msa);
            img_hidden_states = gate_residual(ctx->ggml_ctx, img_hidden_states, img_self_attn_norm->forward(ctx, img_self_attn_out), img_gate_self);
            auto img_mlp_input = scale_modulate(ctx->ggml_ctx, img_norm2_out, img_scale_mlp);
            img_shift_mlp      = ggml_reshape_3d(ctx->ggml_ctx, img_shift_mlp, img_shift_mlp->ne[0], 1, img_shift_mlp->ne[1]);
            img_mlp_input      = ggml_add(ctx->ggml_ctx, img_mlp_input, img_shift_mlp);
            auto img_mlp_out   = img_feed_forward->forward(ctx, img_ffn_norm1->forward(ctx, img_mlp_input));
            img_hidden_states  = gate_residual(ctx->ggml_ctx, img_hidden_states, img_ffn_norm2->forward(ctx, img_mlp_out), img_gate_mlp);
            instruct_hidden_states  = gate_residual(ctx->ggml_ctx, instruct_hidden_states, instruct_attn_norm->forward(ctx, instruct_attn_out), instruct_gate_msa);
            auto instruct_mlp_input = scale_modulate(ctx->ggml_ctx, instruct_norm2_out, instruct_scale_mlp);
            instruct_shift_mlp      = ggml_reshape_3d(ctx->ggml_ctx, instruct_shift_mlp, instruct_shift_mlp->ne[0], 1, instruct_shift_mlp->ne[1]);
            instruct_mlp_input      = ggml_add(ctx->ggml_ctx, instruct_mlp_input, instruct_shift_mlp);
            auto instruct_mlp_out   = instruct_feed_forward->forward(ctx, instruct_ffn_norm1->forward(ctx, instruct_mlp_input));
            instruct_hidden_states  = gate_residual(ctx->ggml_ctx, instruct_hidden_states, instruct_ffn_norm2->forward(ctx, instruct_mlp_out), instruct_gate_mlp);
            return {img_hidden_states, instruct_hidden_states};
        }
    };
    struct BooguImageModel : public GGMLBlock {
        BooguConfig config;
        // Allocates the model-level parameter "image_index_embedding": an F32 table of
        // 5 rows, each config.hidden_size wide. Both `tensor_storage_map` and `prefix`
        // are intentionally ignored.
        void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
            GGML_UNUSED(prefix);
            GGML_UNUSED(tensor_storage_map);
            const int64_t row_width         = config.hidden_size;
            params["image_index_embedding"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, row_width, 5);
        }
        BooguImageModel() = default;
        // Builds every sub-block of the model from `config`:
        //   * patch embedders for the noisy image and for reference images,
        //   * the combined timestep/caption embedder,
        //   * noise / reference-image / context refiner stacks,
        //   * the double-stream and single-stream layer stacks,
        //   * the output head "norm_out".
        BooguImageModel(BooguConfig config)
            : config(std::move(config)) {
            const BooguConfig& cfg = this->config;
            const auto patch_area  = cfg.patch_size * cfg.patch_size;

            // All transformer blocks share the same dimensions; only `modulation` varies.
            auto make_transformer_block = [&cfg](bool modulation) {
                return std::make_shared<BooguImageTransformerBlock>(cfg.hidden_size,
                                                                    cfg.num_attention_heads,
                                                                    cfg.num_kv_heads,
                                                                    cfg.multiple_of,
                                                                    cfg.norm_eps,
                                                                    modulation);
            };

            blocks["x_embedder"]               = std::make_shared<Linear>(patch_area * cfg.in_channels, cfg.hidden_size, true);
            blocks["ref_image_patch_embedder"] = std::make_shared<Linear>(patch_area * cfg.in_channels, cfg.hidden_size, true);
            blocks["time_caption_embed"]       = std::make_shared<LuminaCombinedTimestepCaptionEmbedding>(cfg.hidden_size,
                                                                                                    cfg.instruction_feat_dim,
                                                                                                    256,
                                                                                                    cfg.norm_eps,
                                                                                                    cfg.timestep_scale);

            // Refiners: the noise and reference-image refiners are modulated, the
            // context refiner is not.
            for (int i = 0; i < cfg.num_refiner_layers; ++i) {
                const std::string idx             = std::to_string(i);
                blocks["noise_refiner." + idx]     = make_transformer_block(true);
                blocks["ref_image_refiner." + idx] = make_transformer_block(true);
                blocks["context_refiner." + idx]   = make_transformer_block(false);
            }

            for (int i = 0; i < cfg.num_double_stream_layers; ++i) {
                blocks["double_stream_layers." + std::to_string(i)] = std::make_shared<BooguImageDoubleStreamBlock>(cfg.hidden_size,
                                                                                                                    cfg.num_attention_heads,
                                                                                                                    cfg.num_kv_heads,
                                                                                                                    cfg.multiple_of,
                                                                                                                    cfg.norm_eps);
            }

            for (int i = 0; i < cfg.num_layers; ++i) {
                blocks["single_stream_layers." + std::to_string(i)] = make_transformer_block(true);
            }

            blocks["norm_out"] = std::make_shared<LuminaLayerNormContinuous>(cfg.hidden_size,
                                                                             cfg.timestep_embed_dim,
                                                                             patch_area * cfg.out_channels);
        }
        // Returns row `index` of the "image_index_embedding" table as a view reshaped to
        // (hidden_size, 1, 1). `index` must be in [0, 5).
        ggml_tensor* image_index_embedding(GGMLRunnerContext* ctx, int index) {
            GGML_ASSERT(index >= 0 && index < 5);
            auto table = params["image_index_embedding"];
            // Byte offset of the requested row inside the table.
            const auto row_offset = index * config.hidden_size * ggml_element_size(table);
            auto row              = ggml_view_1d(ctx->ggml_ctx, table, config.hidden_size, row_offset);
            return ggml_reshape_3d(ctx->ggml_ctx, row, config.hidden_size, 1, 1);
        }
        // Patchifies and embeds every reference latent, adds its image-index embedding and
        // concatenates the results along axis 1 in input order.
        // References beyond the fifth all reuse index 4. Returns nullptr when there are
        // no reference latents.
        ggml_tensor* embed_refs(GGMLRunnerContext* ctx, const std::vector<ggml_tensor*>& ref_latents) {
            if (ref_latents.empty()) {
                return nullptr;
            }
            auto patch_embedder = std::dynamic_pointer_cast<Linear>(blocks["ref_image_patch_embedder"]);

            ggml_tensor* merged = nullptr;
            int ref_index       = 0;
            for (ggml_tensor* latent : ref_latents) {
                auto tokens = DiT::pad_and_patchify(ctx, latent, config.patch_size, config.patch_size, false);
                tokens      = patch_embedder->forward(ctx, tokens);
                tokens      = ggml_add(ctx->ggml_ctx, tokens, image_index_embedding(ctx, std::min(ref_index, 4)));
                if (merged == nullptr) {
                    merged = tokens;
                } else {
                    merged = ggml_concat(ctx->ggml_ctx, merged, tokens, 1);
                }
                ++ref_index;
            }
            return merged;
        }
        // Builds the Boogu transformer graph for one latent and returns the output tensor.
        //
        // ctx         - runner context providing the ggml context used for every op.
        // x           - input latent; ne[0] is read as W, ne[1] as H, ne[3] as N (must be 1).
        // timesteps   - timestep tensor; the model is fed (1 - timesteps).
        // context     - text conditioning handed to the timestep/caption embedder.
        // pe          - positional embedding whose dim 3 is laid out as
        //               [text | reference images | noise image] positions.
        // ref_latents - optional reference latents (image editing); may be empty.
        //
        // The result is unpatchified back to H x W and multiplied by -1.
        ggml_tensor* forward(GGMLRunnerContext* ctx,
                             ggml_tensor* x,
                             ggml_tensor* timesteps,
                             ggml_tensor* context,
                             ggml_tensor* pe,
                             std::vector<ggml_tensor*> ref_latents = {}) {
            int64_t W = x->ne[0];
            int64_t H = x->ne[1];
            int64_t N = x->ne[3];
            // Only a single sample per graph is supported.
            GGML_ASSERT(N == 1);
            auto x_embedder         = std::dynamic_pointer_cast<Linear>(blocks["x_embedder"]);
            auto time_caption_embed = std::dynamic_pointer_cast<LuminaCombinedTimestepCaptionEmbedding>(blocks["time_caption_embed"]);
            auto norm_out           = std::dynamic_pointer_cast<LuminaLayerNormContinuous>(blocks["norm_out"]);
            // timestep = 1 - timesteps.
            // NOTE(review): presumably maps the sampler's time convention onto the model's - confirm.
            auto timestep = ggml_sub(ctx->ggml_ctx, ggml_ext_ones_like(ctx->ggml_ctx, timesteps), timesteps);
            // The embedder returns a pair: (timestep embedding, embedded text tokens).
            auto embeds   = time_caption_embed->forward(ctx, timestep, context);
            auto temb     = embeds.first;
            auto txt      = embeds.second;
            // Patchify the noise latent; img_len is its token count (dim 1), needed
            // later to cut the image tokens back out of the joint sequence.
            auto img        = DiT::pad_and_patchify(ctx, x, config.patch_size, config.patch_size, false);
            int64_t img_len = img->ne[1];
            img             = x_embedder->forward(ctx, img);
            // ref_img is nullptr when no reference latents were supplied.
            auto ref_img    = embed_refs(ctx, ref_latents);
            int64_t ref_len = ref_img != nullptr ? ref_img->ne[1] : 0;
            int64_t txt_len = txt->ne[1];
            // pe must provide exactly one position per text, reference and image token.
            GGML_ASSERT(pe->ne[3] == txt_len + ref_len + img_len);
            auto txt_pe   = ggml_ext_slice(ctx->ggml_ctx, pe, 3, 0, txt_len);
            auto noise_pe = ggml_ext_slice(ctx->ggml_ctx, pe, 3, txt_len + ref_len, txt_len + ref_len + img_len);
            // Text refiner: runs on the text tokens only, without the timestep embedding.
            for (int i = 0; i < config.num_refiner_layers; i++) {
                auto block = std::dynamic_pointer_cast<BooguImageTransformerBlock>(blocks["context_refiner." + std::to_string(i)]);
                txt        = block->forward(ctx, txt, txt_pe);
                sd::ggml_graph_cut::mark_graph_cut(txt, "boogu.context_refiner." + std::to_string(i), "txt");
            }
            // Noise refiner: runs on the noise-image tokens, modulated by temb.
            for (int i = 0; i < config.num_refiner_layers; i++) {
                auto block = std::dynamic_pointer_cast<BooguImageTransformerBlock>(blocks["noise_refiner." + std::to_string(i)]);
                img        = block->forward(ctx, img, noise_pe, temb);
                sd::ggml_graph_cut::mark_graph_cut(img, "boogu.noise_refiner." + std::to_string(i), "img");
            }
            // Without references the "combined" image sequence is just the noise image.
            ggml_tensor* combined_img = img;
            if (ref_img != nullptr) {
                // Reference refiner: same layer count as the other refiners, own weights.
                auto ref_pe = ggml_ext_slice(ctx->ggml_ctx, pe, 3, txt_len, txt_len + ref_len);
                for (int i = 0; i < config.num_refiner_layers; i++) {
                    auto block = std::dynamic_pointer_cast<BooguImageTransformerBlock>(blocks["ref_image_refiner." + std::to_string(i)]);
                    ref_img    = block->forward(ctx, ref_img, ref_pe, temb);
                    sd::ggml_graph_cut::mark_graph_cut(ref_img, "boogu.ref_image_refiner." + std::to_string(i), "ref_img");
                }
                // Reference tokens go before the noise tokens, matching the pe layout.
                combined_img = ggml_concat(ctx->ggml_ctx, ref_img, img, 1);
            }
            // Positions for the whole image part (reference + noise tokens).
            auto img_pe = ggml_ext_slice(ctx->ggml_ctx, pe, 3, txt_len, txt_len + combined_img->ne[1]);
            // Double-stream layers update the image and text streams together; each
            // block receives both the full pe and the image-only slice.
            for (int i = 0; i < config.num_double_stream_layers; i++) {
                auto block   = std::dynamic_pointer_cast<BooguImageDoubleStreamBlock>(blocks["double_stream_layers." + std::to_string(i)]);
                auto result  = block->forward(ctx, combined_img, txt, pe, img_pe, temb);
                combined_img = result.first;
                txt          = result.second;
                sd::ggml_graph_cut::mark_graph_cut(combined_img, "boogu.double_stream_layers." + std::to_string(i), "img");
                sd::ggml_graph_cut::mark_graph_cut(txt, "boogu.double_stream_layers." + std::to_string(i), "txt");
            }
            // Single-stream layers run over one joint sequence: [text | reference | noise].
            auto hidden_states = ggml_concat(ctx->ggml_ctx, txt, combined_img, 1);
            for (int i = 0; i < config.num_layers; i++) {
                auto block    = std::dynamic_pointer_cast<BooguImageTransformerBlock>(blocks["single_stream_layers." + std::to_string(i)]);
                hidden_states = block->forward(ctx, hidden_states, pe, temb);
                sd::ggml_graph_cut::mark_graph_cut(hidden_states, "boogu.single_stream_layers." + std::to_string(i), "hidden_states");
            }
            hidden_states = norm_out->forward(ctx, hidden_states, temb);
            // Keep only the last img_len tokens along dim 1, i.e. the noise-image part
            // of the joint sequence (assuming the blocks preserve token order and count).
            hidden_states = ggml_ext_slice(ctx->ggml_ctx, hidden_states, 1, hidden_states->ne[1] - img_len, hidden_states->ne[1]);
            hidden_states = DiT::unpatchify_and_crop(ctx->ggml_ctx, hidden_states, H, W, config.patch_size, config.patch_size, false);
            // Negate the prediction.
            // NOTE(review): presumably a sign-convention fix paired with the (1 - t) input - confirm.
            hidden_states = ggml_ext_scale(ctx->ggml_ctx, hidden_states, -1.f);
            return hidden_states;
        }
    };
    // Number of patches needed to cover `size` elements once it has been padded
    // up to the next multiple of `patch_size`.
    __STATIC_INLINE__ int patched_token_count(int64_t size, int patch_size) {
        const int extent    = static_cast<int>(size);
        const int remainder = extent % patch_size;
        // Padding that brings extent up to a multiple of patch_size (0 if it already is one).
        const int padding = (patch_size - remainder) % patch_size;
        return (extent + padding) / patch_size;
    }
    // Appends position ids for an h_tokens x w_tokens grid to `ids`, once per batch entry.
    //
    // Each id is {pe_shift, row, column}; grid cells are emitted row by row. The
    // same grid is appended `bs` times back to back.
    __STATIC_INLINE__ void append_spatial_ids(std::vector<std::vector<float>>& ids,
                                              int bs,
                                              int pe_shift,
                                              int h_tokens,
                                              int w_tokens) {
        const std::vector<float> blank_id(3, 0.0f);
        std::vector<std::vector<float>> grid(h_tokens * w_tokens, blank_id);
        const float shift = static_cast<float>(pe_shift);
        auto cell         = grid.begin();
        for (int row = 0; row < h_tokens; ++row) {
            for (int col = 0; col < w_tokens; ++col, ++cell) {
                (*cell)[0] = shift;
                (*cell)[1] = static_cast<float>(row);
                (*cell)[2] = static_cast<float>(col);
            }
        }
        int remaining = bs;
        while (remaining-- > 0) {
            ids.insert(ids.end(), grid.begin(), grid.end());
        }
    }
    // Builds the rotary positional-embedding data for a Boogu forward pass.
    //
    // Position ids are emitted in this order:
    //   1. text tokens: {i, i, i} for i in [0, context_len), repeated per batch entry;
    //   2. each reference latent: a {shift, row, col} grid over its patch tokens;
    //   3. the target image: a {shift, row, col} grid over its patch tokens.
    // `shift` starts at context_len and, after each reference, advances by the
    // larger of that reference's row/column token counts. The ids are then handed
    // to Rope::embed_nd together with theta and axes_dim.
    __STATIC_INLINE__ std::vector<float> gen_boogu_pe(int h,
                                                      int w,
                                                      int patch_size,
                                                      int bs,
                                                      int context_len,
                                                      const std::vector<ggml_tensor*>& ref_latents,
                                                      int theta,
                                                      const std::vector<int>& axes_dim) {
        std::vector<std::vector<float>> position_ids;
        position_ids.reserve(static_cast<size_t>(bs) * context_len);
        for (int batch = 0; batch < bs; ++batch) {
            for (int token = 0; token < context_len; ++token) {
                const float p = static_cast<float>(token);
                position_ids.push_back(std::vector<float>(3, p));
            }
        }

        int shift = context_len;
        for (size_t r = 0; r < ref_latents.size(); ++r) {
            const ggml_tensor* ref_latent = ref_latents[r];
            const int rows                = patched_token_count(ref_latent->ne[1], patch_size);
            const int cols                = patched_token_count(ref_latent->ne[0], patch_size);
            append_spatial_ids(position_ids, bs, shift, rows, cols);
            shift += std::max(rows, cols);
        }

        const int image_rows = patched_token_count(h, patch_size);
        const int image_cols = patched_token_count(w, patch_size);
        append_spatial_ids(position_ids, bs, shift, image_rows, image_cols);
        return Rope::embed_nd(position_ids, bs, static_cast<float>(theta), axes_dim);
    }
    // Runner that exposes BooguImageModel through the DiffusionModelRunner interface:
    // it detects the model configuration from the weights, builds the compute graph
    // for one denoising call and executes it.
    struct BooguImageRunner : public DiffusionModelRunner {
        BooguConfig config;
        BooguImageModel boogu;
        // Host-side positional-embedding values produced in build_graph().
        // NOTE(review): kept as a member presumably so the buffer outlives
        // build_graph() after its pointer is handed to set_backend_tensor_data - confirm.
        std::vector<float> pe_vec;
        // Detects the configuration from the tensors under `prefix`, then constructs
        // and initialises the model in params_ctx.
        // NOTE(review): `version` is not referenced anywhere in this constructor.
        BooguImageRunner(ggml_backend_t backend,
                         const String2TensorStorage& tensor_storage_map      = {},
                         const std::string prefix                            = "",
                         SDVersion version                                   = VERSION_BOOGU_IMAGE,
                         std::shared_ptr<RunnerWeightManager> weight_manager = nullptr)
            : DiffusionModelRunner(backend, prefix, weight_manager),
              config(BooguConfig::detect_from_weights(tensor_storage_map, prefix)) {
            boogu = BooguImageModel(config);
            boogu.init(params_ctx, tensor_storage_map, prefix);
        }
        // Short identifier for this runner.
        std::string get_desc() override {
            return "boogu_image";
        }
        // Collects the model's parameter tensors into `tensors` under `prefix`.
        void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
            boogu.get_param_tensors(tensors, prefix);
        }
        // Builds the graph for one forward pass.
        //
        // x_tensor           - input latent; its ne[3] must be 1.
        // timesteps_tensor   - timesteps passed to the model.
        // context_tensor     - text conditioning; must not be empty.
        // ref_latents_tensor - optional reference latents.
        ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,
                                 const sd::Tensor<float>& timesteps_tensor,
                                 const sd::Tensor<float>& context_tensor,
                                 const std::vector<sd::Tensor<float>>& ref_latents_tensor = {}) {
            ggml_cgraph* gf        = new_graph_custom(BOOGU_GRAPH_SIZE);
            ggml_tensor* x         = make_input(x_tensor);
            ggml_tensor* timesteps = make_input(timesteps_tensor);
            GGML_ASSERT(x->ne[3] == 1);
            GGML_ASSERT(!context_tensor.empty());
            ggml_tensor* context = make_input(context_tensor);
            std::vector<ggml_tensor*> ref_latents;
            ref_latents.reserve(ref_latents_tensor.size());
            for (const auto& ref_latent_tensor : ref_latents_tensor) {
                ref_latents.push_back(make_input(ref_latent_tensor));
            }
            // Positional embedding over text, reference and image tokens; height is
            // taken from x->ne[1], width from x->ne[0], text length from context->ne[1].
            pe_vec      = gen_boogu_pe(static_cast<int>(x->ne[1]),
                                       static_cast<int>(x->ne[0]),
                                       config.patch_size,
                                       static_cast<int>(x->ne[3]),
                                       static_cast<int>(context->ne[1]),
                                       ref_latents,
                                       config.theta,
                                       config.axes_dim);
            // The pe tensor below holds 2 * 2 * (axes_dim_sum / 2) floats per position,
            // so the position count is pe_vec.size() / axes_dim_sum / 2.
            int pos_len = static_cast<int>(pe_vec.size() / config.axes_dim_sum / 2);
            auto pe     = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, config.axes_dim_sum / 2, pos_len);
            set_backend_tensor_data(pe, pe_vec.data());
            auto runner_ctx  = get_context();
            ggml_tensor* out = boogu.forward(&runner_ctx, x, timesteps, context, pe, ref_latents);
            ggml_build_forward_expand(gf, out);
            return gf;
        }
        // Runs one forward pass with explicit tensors and returns the result with
        // trailing singleton dims restored to match x.dim().
        sd::Tensor<float> compute(int n_threads,
                                  const sd::Tensor<float>& x,
                                  const sd::Tensor<float>& timesteps,
                                  const sd::Tensor<float>& context,
                                  const std::vector<sd::Tensor<float>>& ref_latents = {}) {
            // The graph is built lazily through this callback; the inputs are captured by reference.
            auto get_graph = [&]() -> ggml_cgraph* {
                return build_graph(x, timesteps, context, ref_latents);
            };
            // NOTE(review): the meaning of the three `false` flags is defined by
            // GGMLRunner::compute, which is not visible here.
            return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false, false, false), x.dim());
        }
        // DiffusionModelRunner entry point: unpacks DiffusionParams and forwards to
        // the tensor overload. x and timesteps are required; ref_latents is optional.
        sd::Tensor<float> compute(int n_threads,
                                  const DiffusionParams& diffusion_params) override {
            GGML_ASSERT(diffusion_params.x != nullptr);
            GGML_ASSERT(diffusion_params.timesteps != nullptr);
            // Shared empty fallback so a reference can be passed when no ref latents are set.
            static const std::vector<sd::Tensor<float>> empty_ref_latents;
            return compute(n_threads,
                           *diffusion_params.x,
                           *diffusion_params.timesteps,
                           tensor_or_empty(diffusion_params.context),
                           diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents);
        }
    };
 }  // namespace Boogu
 #endif  // __SD_MODEL_DIFFUSION_BOOGU_HPP__
--- a/src/model/diffusion/ernie_image.hpp
+++ b/src/model/diffusion/ernie_image.hpp
@ -162,6 +162,8 @@ namespace ErnieImage {
            int64_t S = x->ne[1];
            int64_t N = x->ne[2];
            float scale = (sd_backend_is(ctx->backend, "Vulkan") && ctx->flash_attn_enabled) ? 1.0f / 32.0f : 1.0f;
            auto q = to_q->forward(ctx, x);
            auto k = to_k->forward(ctx, x);
            auto v = to_v->forward(ctx, x);
@ -182,7 +184,7 @@ namespace ErnieImage {
            k = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, k, 0, 2, 1, 3));  // [N, heads, S, head_dim]
            k = ggml_reshape_3d(ctx->ggml_ctx, k, k->ne[0], k->ne[1], k->ne[2] * k->ne[3]);
-            x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, true, ctx->flash_attn_enabled);  // [N, S, hidden_size]
+            x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, true, ctx->flash_attn_enabled, scale);  // [N, S, hidden_size]
            x = to_out_0->forward(ctx, x);
            return x;
        }
--- a/src/model/te/llm.hpp
+++ b/src/model/te/llm.hpp
@ -79,6 +79,7 @@ namespace LLM {
        int window_size                     = 112;
        int num_position_embeddings         = 0;
        std::set<int> fullatt_block_indexes = {7, 15, 23, 31};
        bool split_patch_embed              = false;
    };
    struct LLMConfig {
@ -179,7 +180,8 @@ namespace LLM {
                config.num_experts_per_tok     = 4;
            }
-            config.num_layers = 0;
+            config.num_layers          = 0;
            int detected_vision_layers = 0;
            for (const auto& [name, tensor_storage] : tensor_storage_map) {
                if (!starts_with(name, prefix)) {
                    continue;
@ -190,6 +192,38 @@ namespace LLM {
                    if (contains(name, "attn.q_proj")) {
                        config.llama_cpp_style = true;
                    }
                    if (contains(name, "visual.patch_embed.proj.1.weight")) {
                        config.vision.split_patch_embed = true;
                    }
                    if (contains(name, "visual.patch_embed.proj.0.weight")) {
                        config.vision.patch_size  = static_cast<int>(tensor_storage.ne[0]);
                        config.vision.in_channels = tensor_storage.ne[2];
                        config.vision.hidden_size = tensor_storage.ne[3];
                    }
                    if (contains(name, "visual.patch_embed.bias")) {
                        config.vision.hidden_size = tensor_storage.ne[0];
                    }
                    if (contains(name, "visual.pos_embed.weight")) {
                        config.vision.hidden_size             = tensor_storage.ne[0];
                        config.vision.num_position_embeddings = static_cast<int>(tensor_storage.ne[1]);
                    }
                    if (contains(name, "visual.blocks.")) {
                        auto items = split_string(name.substr(pos), '.');
                        if (items.size() > 2) {
                            int block_index = atoi(items[2].c_str());
                            if (block_index + 1 > detected_vision_layers) {
                                detected_vision_layers = block_index + 1;
                            }
                        }
                    }
                    if (contains(name, "visual.blocks.0.mlp.linear_fc1.weight") ||
                        contains(name, "visual.blocks.0.mlp.gate_proj.weight")) {
                        config.vision.intermediate_size = tensor_storage.ne[1];
                    }
                    if (contains(name, "visual.merger.linear_fc2.weight") ||
                        contains(name, "visual.merger.mlp.2.weight")) {
                        config.vision.out_hidden_size = tensor_storage.ne[1];
                    }
                    continue;
                }
                pos = name.find("layers.");
@ -219,6 +253,9 @@ namespace LLM {
            if (arch == LLMArch::QWEN3 && config.num_layers == 28) {
                config.num_heads = 16;
            }
            if (detected_vision_layers > 0) {
                config.vision.num_layers = detected_vision_layers;
            }
            LOG_DEBUG("llm: num_layers = %" PRId64 ", vocab_size = %" PRId64 ", hidden_size = %" PRId64 ", intermediate_size = %" PRId64,
                      config.num_layers,
                      config.vocab_size,
@ -539,40 +576,51 @@ namespace LLM {
    struct VisionPatchEmbed : public GGMLBlock {
    protected:
-        bool llama_cpp_style;
+        bool split_patch_embed;
        bool bias;
        int patch_size;
        int temporal_patch_size;
        int64_t in_channels;
        int64_t embed_dim;
        void init_params(ggml_context* ctx,
                         const String2TensorStorage& tensor_storage_map = {},
                         const std::string prefix                       = "") override {
            GGML_UNUSED(tensor_storage_map);
            GGML_UNUSED(prefix);
            if (split_patch_embed && bias) {
                params["bias"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, embed_dim);
            }
        }
    public:
-        VisionPatchEmbed(bool llama_cpp_style,
+        VisionPatchEmbed(bool split_patch_embed,
                         LLMVisionArch arch,
                         int patch_size          = 14,
                         int temporal_patch_size = 2,
                         int64_t in_channels     = 3,
                         int64_t embed_dim       = 1152)
-            : llama_cpp_style(llama_cpp_style),
+            : split_patch_embed(split_patch_embed),
              bias(arch == LLMVisionArch::QWEN3_VL),
              patch_size(patch_size),
              temporal_patch_size(temporal_patch_size),
              in_channels(in_channels),
              embed_dim(embed_dim) {
-            bool bias = arch == LLMVisionArch::QWEN3_VL;
+            if (split_patch_embed) {
            if (llama_cpp_style) {
                blocks["proj.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels,
                                                                         embed_dim,
                                                                         {patch_size, patch_size},
                                                                         {patch_size, patch_size},
                                                                         {0, 0},
                                                                         {1, 1},
-                                                                         bias));
+                                                                         false));
                blocks["proj.1"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels,
                                                                         embed_dim,
                                                                         {patch_size, patch_size},
                                                                         {patch_size, patch_size},
                                                                         {0, 0},
                                                                         {1, 1},
-                                                                         bias));
+                                                                         false));
            } else {
                std::tuple<int, int, int> kernel_size = {(int)temporal_patch_size, (int)patch_size, (int)patch_size};
                blocks["proj"]                        = std::shared_ptr<GGMLBlock>(new Conv3d(in_channels,
@ -593,7 +641,7 @@ namespace LLM {
                                temporal_patch_size,
                                ggml_nelements(x) / (temporal_patch_size * patch_size * patch_size));
-            if (llama_cpp_style) {
+            if (split_patch_embed) {
                auto proj_0 = std::dynamic_pointer_cast<Conv2d>(blocks["proj.0"]);
                auto proj_1 = std::dynamic_pointer_cast<Conv2d>(blocks["proj.1"]);
@ -606,6 +654,10 @@ namespace LLM {
                x1      = proj_1->forward(ctx, x1);
                x = ggml_add(ctx->ggml_ctx, x0, x1);
                if (bias) {
                    auto b = ggml_reshape_4d(ctx->ggml_ctx, params["bias"], 1, 1, embed_dim, 1);
                    x      = ggml_add_inplace(ctx->ggml_ctx, x, b);
                }
            } else {
                auto proj = std::dynamic_pointer_cast<Conv3d>(blocks["proj"]);
@ -798,7 +850,7 @@ namespace LLM {
              spatial_merge_size(vision_params.spatial_merge_size),
              num_grid_per_side(vision_params.num_position_embeddings > 0 ? static_cast<int>(std::sqrt(vision_params.num_position_embeddings)) : 0),
              fullatt_block_indexes(vision_params.fullatt_block_indexes) {
-            blocks["patch_embed"] = std::shared_ptr<GGMLBlock>(new VisionPatchEmbed(llama_cpp_style,
+            blocks["patch_embed"] = std::shared_ptr<GGMLBlock>(new VisionPatchEmbed(vision_params.split_patch_embed,
                                                                                    arch_,
                                                                                    vision_params.patch_size,
                                                                                    vision_params.temporal_patch_size,
--- a/src/model/vae/auto_encoder_kl.hpp
+++ b/src/model/vae/auto_encoder_kl.hpp
@ -682,7 +682,7 @@ struct AutoEncoderKL : public VAE {
        } else if (sd_version_is_sd3(version)) {
            scale_factor = 1.5305f;
            shift_factor = 0.0609f;
-        } else if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_longcat(version)) {
+        } else if (sd_version_uses_flux_vae(version)) {
            scale_factor = 0.3611f;
            shift_factor = 0.1159f;
        } else if (sd_version_uses_flux2_vae(version)) {
--- a/src/model_loader.cpp
+++ b/src/model_loader.cpp
@ -485,6 +485,9 @@ SDVersion ModelLoader::get_sd_version() {
        if (tensor_storage.name.find("model.diffusion_model.cap_embedder.0.weight") != std::string::npos) {
            return VERSION_Z_IMAGE;
        }
        if (tensor_storage.name.find("double_stream_layers.0.img_instruct_attn.processor.img_to_q.weight") != std::string::npos) {
            return VERSION_BOOGU_IMAGE;
        }
        if (tensor_storage.name.find("model.diffusion_model.layers.0.adaLN_sa_ln.weight") != std::string::npos) {
            return VERSION_ERNIE_IMAGE;
        }
--- a/src/model_manager.cpp
+++ b/src/model_manager.cpp
@ -147,6 +147,17 @@ bool ModelManager::register_param_tensors(const std::string& desc,
    return true;
 }
 bool ModelManager::load_all_params_eagerly() {
    std::vector<TensorState*> all_states;
    all_states.reserve(tensor_states_.size());
    for (const auto& s : tensor_states_) {
        if (s != nullptr) {
            all_states.push_back(s.get());
        }
    }
    return load_tensors_to_params_backend(all_states);
 }
 bool ModelManager::validate_registered_tensors() {
    bool ok = true;
    for (const auto& state : tensor_states_) {
@ -469,7 +480,7 @@ bool ModelManager::mmap_params(const std::vector<TensorState*>& states,
        return true;
    }
-    auto mmap_store = model_loader_.mmap_tensors(mmap_candidates, {}, true);
+    auto mmap_store = model_loader_.mmap_tensors(mmap_candidates, {}, writable_mmap_);
    if (mmap_store.empty()) {
        return true;
    }
--- a/src/model_manager.h
+++ b/src/model_manager.h
@ -69,6 +69,7 @@ private:
    uint64_t current_lora_epoch_ = 0;
    int n_threads_               = 0;
    bool enable_mmap_            = false;
    bool writable_mmap_          = false;
    void finish_compute_backend_usage(const std::vector<TensorState*>& states);
    void release_all();
@ -110,6 +111,7 @@ public:
        model_loader_.set_n_threads(n_threads);
    }
    void set_enable_mmap(bool enable_mmap) { enable_mmap_ = enable_mmap; }
    void set_writable_mmap(bool writable_mmap) { writable_mmap_ = writable_mmap; }
    void set_common_ignore_tensors(std::set<std::string> ignore_tensors);
    void set_loras(std::vector<LoraSpec> loras, SDVersion version);
@ -158,6 +160,7 @@ public:
    }
    bool validate_registered_tensors();
    bool load_all_params_eagerly();
    bool prepare_params(const std::vector<ggml_tensor*>& tensors) override;
    void release_compute_backend_params(const std::vector<ggml_tensor*>& tensors) override;
--- a/src/name_conversion.cpp
+++ b/src/name_conversion.cpp
@ -184,6 +184,27 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
    return name;
 }
 // Rewrites Qwen3-VL vision tensor name fragments to the names used by this
 // code base (e.g. "v.blk." -> "blocks.") via replace_with_name_map.
 // NOTE(review): the left-hand names look like GGUF mmproj naming - confirm.
 std::string convert_qwen3_vl_vision_name(std::string name) {
    using NamePair = std::pair<std::string, std::string>;
    // "v.patch_embd.weight.1" is listed before its prefix "v.patch_embd.weight";
    // keep that order in case the helper applies entries first-to-last.
    static const std::vector<NamePair> vision_renames = {
        NamePair{"mm.0.", "merger.linear_fc1."},
        NamePair{"mm.2.", "merger.linear_fc2."},
        NamePair{"v.post_ln.", "merger.norm."},
        NamePair{"v.position_embd.weight", "pos_embed.weight"},
        NamePair{"v.patch_embd.weight.1", "patch_embed.proj.1.weight"},
        NamePair{"v.patch_embd.weight", "patch_embed.proj.0.weight"},
        NamePair{"v.patch_embd.bias", "patch_embed.bias"},
        NamePair{"v.blk.", "blocks."},
        NamePair{"attn_qkv.", "attn.qkv."},
        NamePair{"attn_out.", "attn.proj."},
        NamePair{"ffn_up.", "mlp.linear_fc1."},
        NamePair{"ffn_down.", "mlp.linear_fc2."},
        NamePair{"ln1.", "norm1."},
        NamePair{"ln2.", "norm2."},
    };
    replace_with_name_map(name, vision_renames);
    return name;
 }
 // ref: https://github.com/huggingface/diffusers/blob/main/scripts/convert_diffusers_to_original_stable_diffusion.py
 std::string convert_diffusers_unet_to_original_sd1(std::string name) {
    // (stable-diffusion, HF Diffusers)
@ -1154,6 +1175,10 @@ std::string convert_tensor_name(std::string name, SDVersion version) {
    replace_with_prefix_map(name, prefix_map);
    if (sd_version_is_boogu_image(version) && starts_with(name, "text_encoders.llm.visual.")) {
        name = convert_qwen3_vl_vision_name(std::move(name));
    }
    // diffusion model
    {
        for (const auto& prefix : diffuison_model_prefix_vec) {
--- a/src/runtime/guidance.cpp
+++ b/src/runtime/guidance.cpp
@ -3,6 +3,7 @@
 #include <algorithm>
 #include <cmath>
 #include <cstdlib>
 #include <optional>
 #include <string>
 #include <utility>
@ -63,6 +64,82 @@ namespace sd::guidance {
        return uncond;
    }
    // Expands a guidance schedule spec into one guidance value per step.
    //
    // The spec is a '+'-separated list of "<guidance>x<count>" segments, e.g.
    // "7.5x2+1x3" -> {7.5, 7.5, 1, 1, 1}. Each segment is split at its first 'x';
    // both halves must parse completely (std::stof / std::stoi) and the count must
    // be positive. Any invalid segment logs an error and yields an empty schedule.
    // An empty spec yields an empty schedule, and a single trailing '+' is accepted.
    std::vector<float> parse_guidance_schedule_from_spec(std::string spec) {
        std::vector<float> expanded;
        size_t cursor = 0;
        while (cursor < spec.size()) {
            const size_t plus_pos     = spec.find('+', cursor);
            const bool is_last        = plus_pos == std::string::npos;
            const std::string segment = is_last ? spec.substr(cursor) : spec.substr(cursor, plus_pos - cursor);

            const size_t x_pos = segment.find('x');
            if (x_pos == std::string::npos) {
                LOG_ERROR("Invalid guidance schedule segment: '%s' (expected <guidance>x<count>)", segment.c_str());
                return {};
            }
            const std::string guidance_str = segment.substr(0, x_pos);
            const std::string count_str    = segment.substr(x_pos + 1);

            // A value is valid only if the conversion succeeds and consumes the whole token.
            float guidance   = 0.0f;
            bool guidance_ok = false;
            try {
                size_t consumed = 0;
                guidance        = std::stof(guidance_str, &consumed);
                guidance_ok     = consumed == guidance_str.size();
            } catch (const std::exception&) {
                guidance_ok = false;
            }
            if (!guidance_ok) {
                LOG_ERROR("Invalid guidance value in guidance schedule: '%s'", guidance_str.c_str());
                return {};
            }

            int count     = 0;
            bool count_ok = false;
            try {
                size_t consumed = 0;
                count           = std::stoi(count_str, &consumed);
                count_ok        = consumed == count_str.size();
            } catch (const std::exception&) {
                count_ok = false;
            }
            if (!count_ok) {
                LOG_ERROR("Invalid count in guidance schedule: '%s'", count_str.c_str());
                return {};
            }
            if (count <= 0) {
                LOG_ERROR("Guidance schedule count must be positive");
                return {};
            }

            expanded.insert(expanded.end(), count, guidance);
            if (is_last) {
                break;
            }
            cursor = plus_pos + 1;
        }
        return expanded;
    }
    // Extracts the "guidance_schedule" entry from the extra sample arguments and
    // expands it into one guidance value per step.
    //
    // extra_sample_args - raw key/value argument string, parsed by parse_key_value_args.
    //
    // Returns an empty vector when the key is missing, its value is empty, or the
    // spec is rejected by parse_guidance_schedule_from_spec.
    std::vector<float> parse_guidance_schedule(const char* extra_sample_args) {
        std::string guidance_schedule_str;
        for (const auto& [key, value] : parse_key_value_args(extra_sample_args, "extra sample arg")) {
            if (key == "guidance_schedule") {
                // If the key shows up more than once, the value seen last in iteration order is kept.
                guidance_schedule_str = value;
            }
        }
        if (guidance_schedule_str.empty()) {
            return {};
        }
        return parse_guidance_schedule_from_spec(guidance_schedule_str);
    }
    ClassifierFreeGuidance::ClassifierFreeGuidance(float guidance_scale,
                                                   float image_guidance_scale)
        : guidance_scale_(guidance_scale),
@ -70,8 +147,10 @@ namespace sd::guidance {
    }
    GuiderOutput ClassifierFreeGuidance::forward(const GuidanceInput& input,
-                                                 GuiderOutput previous) const {
+                                                 GuiderOutput previous,
                                                 std::optional<float> scale_override) const {
        (void)previous;
        float guidance_scale = scale_override.value_or(guidance_scale_);
        GuiderOutput output;
        if (!has_tensor(input.pred_cond)) {
@ -86,14 +165,14 @@ namespace sd::guidance {
                const sd::Tensor<float>& pred_img_uncond = *input.pred_img_uncond;
                output.pred                              = pred_img_uncond +
                              image_guidance_scale_ * (pred_uncond - pred_img_uncond) +
-                              guidance_scale_ * (pred_cond - pred_uncond);
+                              guidance_scale * (pred_cond - pred_uncond);
            } else {
-                output.pred = pred_uncond + guidance_scale_ * (pred_cond - pred_uncond);
+                output.pred = pred_uncond + guidance_scale * (pred_cond - pred_uncond);
            }
        } else if (has_tensor(input.pred_img_uncond)) {
            const sd::Tensor<float>& pred_img_uncond = *input.pred_img_uncond;
-            output.pred                              = pred_img_uncond + guidance_scale_ * (pred_cond - pred_img_uncond);
+            output.pred                              = pred_img_uncond + guidance_scale * (pred_cond - pred_img_uncond);
        }
        return output;
@ -128,8 +207,10 @@ namespace sd::guidance {
    }
    GuiderOutput AdaptiveProjectedGuidance::forward(const GuidanceInput& input,
-                                                    GuiderOutput previous) const {
+                                                    GuiderOutput previous,
                                                    std::optional<float> scale_override) const {
        (void)previous;
        float guidance_scale = scale_override.value_or(guidance_scale_);
        GuiderOutput output;
        if (!has_tensor(input.pred_cond)) {
@ -144,13 +225,13 @@ namespace sd::guidance {
                const sd::Tensor<float>& pred_img_uncond = *input.pred_img_uncond;
                output.pred                              = pred_img_uncond +
                              image_guidance_scale_ * (pred_uncond - pred_img_uncond) +
-                              guidance_scale_ * (pred_cond - pred_uncond);
+                              guidance_scale * (pred_cond - pred_uncond);
            } else {
-                output.pred = pred_uncond + guidance_scale_ * (pred_cond - pred_uncond);
+                output.pred = pred_uncond + guidance_scale * (pred_cond - pred_uncond);
            }
        } else if (has_tensor(input.pred_img_uncond)) {
            const sd::Tensor<float>& pred_img_uncond = *input.pred_img_uncond;
-            output.pred                              = pred_img_uncond + guidance_scale_ * (pred_cond - pred_img_uncond);
+            output.pred                              = pred_img_uncond + guidance_scale * (pred_cond - pred_img_uncond);
        }
        if (!has_tensor(input.pred_uncond) && !has_tensor(input.pred_img_uncond)) {
            return output;
@ -162,7 +243,7 @@ namespace sd::guidance {
        sd::Tensor<float> deltas = calculate_guidance_delta(pred_cond,
                                                            pred_uncond,
                                                            pred_img_uncond,
-                                                            guidance_scale_,
+                                                            guidance_scale,
                                                            image_guidance_scale_);
        if (params_.momentum != 0.0f) {
            if (momentum_buffer_.shape() != deltas.shape()) {
@ -239,7 +320,8 @@ namespace sd::guidance {
    }
    GuiderOutput SkipLayerGuidance::forward(const GuidanceInput& input,
-                                            GuiderOutput output) const {
+                                            GuiderOutput output,
                                            std::optional<float> /*scale_override*/) const {
        if (scale_ == 0.0f || !is_enabled_for_step(input) || !input.predict_skip_layer) {
            return output;
        }
--- a/src/runtime/guidance.h
+++ b/src/runtime/guidance.h
@ -3,6 +3,7 @@
 #include <cstddef>
 #include <functional>
 #include <optional>
 #include <vector>
 #include "core/tensor.hpp"
@ -27,6 +28,7 @@ namespace sd::guidance {
    AdaptiveProjectedGuidanceParams parse_adaptive_projected_guidance_args(const char* extra_sample_args);
    bool is_adaptive_projected_guidance_enabled(const AdaptiveProjectedGuidanceParams& params);
    bool parse_skip_layer_guidance_uncond_arg(const char* extra_sample_args);
    std::vector<float> parse_guidance_schedule(const char* extra_sample_args);
    struct GuidanceInput {
        int step                                 = 0;
@ -40,9 +42,10 @@ namespace sd::guidance {
    class BaseGuidance {
    public:
-        virtual ~BaseGuidance()                                   = default;
+        virtual ~BaseGuidance()                                                                = default;
        virtual GuiderOutput forward(const GuidanceInput& input,
-                                     GuiderOutput previous) const = 0;
+                                     GuiderOutput previous,
                                     std::optional<float> scale_override = std::nullopt) const = 0;
    };
    class ClassifierFreeGuidance : public BaseGuidance {
@ -54,7 +57,8 @@ namespace sd::guidance {
                               float image_guidance_scale);
        GuiderOutput forward(const GuidanceInput& input,
-                             GuiderOutput previous) const override;
+                             GuiderOutput previous,
                             std::optional<float> scale_override = std::nullopt) const override;
    };
    class AdaptiveProjectedGuidance : public BaseGuidance {
@ -69,7 +73,8 @@ namespace sd::guidance {
                                  AdaptiveProjectedGuidanceParams params);
        GuiderOutput forward(const GuidanceInput& input,
-                             GuiderOutput previous) const override;
+                             GuiderOutput previous,
                             std::optional<float> scale_override = std::nullopt) const override;
    };
    class SkipLayerGuidance : public BaseGuidance {
@ -88,7 +93,8 @@ namespace sd::guidance {
        const std::vector<int>& layers() const;
        GuiderOutput forward(const GuidanceInput& input,
-                             GuiderOutput previous) const override;
+                             GuiderOutput previous,
                             std::optional<float> scale_override = std::nullopt) const override;
    };
 }  // namespace sd::guidance
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@ -20,6 +20,7 @@
 #include "extensions/generation_extension.h"
 #include "model/adapter/lora.hpp"
 #include "model/diffusion/anima.hpp"
 #include "model/diffusion/boogu.hpp"
 #include "model/diffusion/control.hpp"
 #include "model/diffusion/ernie_image.hpp"
 #include "model/diffusion/flux.hpp"
@ -87,6 +88,7 @@ const char* model_version_to_str[] = {
    "LTXAV",
    "HiDream O1",
    "Z-Image",
    "Boogu Image",
    "Ovis Image",
    "Ernie Image",
    "Lens",
@ -124,7 +126,8 @@ static bool sd_version_supports_ref_latent_img_cfg(SDVersion version) {
           sd_version_is_flux2(version) ||
           sd_version_is_qwen_image(version) ||
           sd_version_is_longcat(version) ||
-           sd_version_is_z_image(version);
+           sd_version_is_z_image(version) ||
           sd_version_is_boogu_image(version);
 }
 static bool sd_version_supports_img_cfg(SDVersion version, bool has_ref_images) {
@ -196,6 +199,7 @@ public:
    bool enable_mmap                     = false;
    sd::ggml_graph_cut::MaxVramAssignment max_vram_assignment;
    bool stream_layers = false;
    bool eager_load    = false;
    std::string backend_spec;
    std::string params_backend_spec;
@ -339,6 +343,7 @@ public:
        n_threads           = sd_ctx_params->n_threads;
        enable_mmap         = sd_ctx_params->enable_mmap;
        stream_layers       = sd_ctx_params->stream_layers;
        eager_load          = sd_ctx_params->eager_load;
        backend_spec        = SAFE_STR(sd_ctx_params->backend);
        params_backend_spec = SAFE_STR(sd_ctx_params->params_backend);
        max_vram_assignment.reset(0.f);
@ -527,7 +532,6 @@ public:
        if (wtype != GGML_TYPE_COUNT || tensor_type_rules.size() > 0) {
            model_loader.set_wtype_override(wtype, tensor_type_rules);
        }
        model_loader.process_model_files(enable_mmap, true);
        std::map<ggml_type, uint32_t> wtype_stat                 = model_loader.get_wtype_stat();
        std::map<ggml_type, uint32_t> conditioner_wtype_stat     = model_loader.get_conditioner_wtype_stat();
@ -581,9 +585,12 @@ public:
            apply_lora_immediately = false;
        }
        bool needs_writable_mmap = enable_mmap && apply_lora_immediately;
        model_manager->set_writable_mmap(needs_writable_mmap);
        if (enable_mmap && apply_lora_immediately) {
            LOG_WARN("in mode 'immediately', LoRAs will cause extra memory usage with mmap");
        }
        model_loader.process_model_files(enable_mmap, needs_writable_mmap);
        load_alphas_cumprod(model_loader);
        size_t text_encoder_params_mem_size = 0;
@ -784,6 +791,18 @@ public:
                                                                         "model.diffusion_model",
                                                                         version,
                                                                         model_manager);
            } else if (sd_version_is_boogu_image(version)) {
                cond_stage_model = std::make_shared<LLMEmbedder>(backend_for(SDBackendModule::TE),
                                                                 tensor_storage_map,
                                                                 version,
                                                                 "",
                                                                 true,
                                                                 model_manager);
                diffusion_model  = std::make_shared<Boogu::BooguImageRunner>(backend_for(SDBackendModule::DIFFUSION),
                                                                            tensor_storage_map,
                                                                            "model.diffusion_model",
                                                                            version,
                                                                            model_manager);
            } else if (sd_version_is_ernie_image(version)) {
                cond_stage_model = std::make_shared<LLMEmbedder>(backend_for(SDBackendModule::TE),
                                                                 tensor_storage_map,
@ -1138,7 +1157,15 @@ public:
            return false;
        }
-        LOG_DEBUG("model metadata validated; weights will be prepared lazily");
+        if (eager_load) {
            if (!model_manager->load_all_params_eagerly()) {
                LOG_ERROR("model params eager load failed");
                return false;
            }
            LOG_DEBUG("model metadata validated; weights pre-loaded to params backend");
        } else {
            LOG_DEBUG("model metadata validated; weights will be prepared lazily");
        }
        {
            size_t total_params_ram_size  = 0;
@ -1220,6 +1247,7 @@ public:
                           sd_version_is_anima(version) ||
                           sd_version_is_ernie_image(version) ||
                           sd_version_is_z_image(version) ||
                           sd_version_is_boogu_image(version) ||
                           sd_version_is_pid(version) ||
                           sd_version_is_ideogram4(version)) {
                    pred_type = FLOW_PRED;
@ -1231,6 +1259,8 @@ public:
                        default_flow_shift = 1.5f;
                    } else if (sd_version_is_ideogram4(version)) {
                        default_flow_shift = 1.0f;
                    } else if (sd_version_is_boogu_image(version)) {
                        default_flow_shift = 3.16f;
                    } else {
                        default_flow_shift = 3.f;
                    }
@ -1691,7 +1721,7 @@ public:
                if (sd_version_is_sd3(version)) {
                    latent_rgb_proj = sd3_latent_rgb_proj;
                    latent_rgb_bias = sd3_latent_rgb_bias;
-                } else if (sd_version_is_flux(version) || sd_version_is_z_image(version) || sd_version_is_longcat(version)) {
+                } else if (sd_version_uses_flux_vae(version)) {
                    latent_rgb_proj = flux_latent_rgb_proj;
                    latent_rgb_bias = flux_latent_rgb_bias;
                } else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
@ -1786,6 +1816,9 @@ public:
        if (sd_version_is_anima(version)) {
            return std::vector<float>{t / static_cast<float>(TIMESTEPS)};
        }
        if (sd_version_is_boogu_image(version)) {
            return std::vector<float>{t / static_cast<float>(TIMESTEPS)};
        }
        if (version == VERSION_HIDREAM_O1) {
            return std::vector<float>{1.0f - (t / static_cast<float>(TIMESTEPS))};
        }
@ -1911,6 +1944,32 @@ public:
        float slg_scale     = guidance.slg.scale;
        bool slg_uncond     = sd::guidance::parse_skip_layer_guidance_uncond_arg(extra_sample_args);
        std::vector<float> guidance_schedule = sd::guidance::parse_guidance_schedule(extra_sample_args);
        if (!guidance_schedule.empty() && guidance_schedule.size() != sigmas.size() - 1) {
            if (guidance_schedule.size() > sigmas.size()) {
                LOG_WARN("guidance_schedule length (%zu) is greater than number of steps (%zu)", guidance_schedule.size(), sigmas.size() - 1);
                LOG_WARN("truncating guidance_schedule to match step count");
                guidance_schedule.resize(sigmas.size() - 1);
            } else {
                LOG_INFO("padding guidance_schedule with cfg_scale");
                while (guidance_schedule.size() < sigmas.size() - 1) {
                    guidance_schedule.push_back(cfg_scale);
                }
            }
        }
        if (!guidance_schedule.empty()) {
            std::string schedule_str = "[";
            for (size_t i = 0; i < guidance_schedule.size(); ++i) {
                schedule_str += std::to_string(guidance_schedule[i]);
                if (i < guidance_schedule.size() - 1) {
                    schedule_str += ", ";
                }
            }
            schedule_str += "]";
            LOG_DEBUG("using guidance schedule: %s", schedule_str.c_str());
        }
        sd_sample::SampleCacheRuntime cache_runtime = sd_sample::init_sample_cache_runtime(version,
                                                                                           cache_params,
                                                                                           denoiser.get(),
@ -2151,7 +2210,7 @@ public:
            guidance_input.pred_uncond     = uncond_out.empty() ? nullptr : &uncond_out;
            guidance_input.pred_img_uncond = img_uncond_out.empty() ? nullptr : &img_uncond_out;
-            sd::guidance::GuiderOutput guided = primary_guidance.forward(guidance_input, {});
+            sd::guidance::GuiderOutput guided = guidance_schedule.empty() ? primary_guidance.forward(guidance_input, {}) : primary_guidance.forward(guidance_input, {}, guidance_schedule[guidance_schedule.size() - 1 - step]);
            if (guided.pred.empty()) {
                return {};
            }
@ -2675,6 +2734,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
    sd_ctx_params->lora_apply_mode      = LORA_APPLY_AUTO;
    sd_ctx_params->max_vram             = nullptr;
    sd_ctx_params->stream_layers        = false;
    sd_ctx_params->eager_load           = false;
    sd_ctx_params->enable_mmap          = false;
    sd_ctx_params->diffusion_flash_attn = false;
    sd_ctx_params->circular_x           = false;
@ -2721,6 +2781,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
             "prediction: %s\n"
             "max_vram: %s\n"
             "stream_layers: %s\n"
             "eager_load: %s\n"
             "backend: %s\n"
             "params_backend: %s\n"
             "flash_attn: %s\n"
@ -2756,6 +2817,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
             sd_prediction_name(sd_ctx_params->prediction),
             SAFE_STR(sd_ctx_params->max_vram),
             BOOL_STR(sd_ctx_params->stream_layers),
             BOOL_STR(sd_ctx_params->eager_load),
             SAFE_STR(sd_ctx_params->backend),
             SAFE_STR(sd_ctx_params->params_backend),
             BOOL_STR(sd_ctx_params->flash_attn),
Author	SHA1	Message	Date
leejet	f440ad9c29	fix: avoid writable mmap for read-only weights (#1698 )	2026-06-23 00:39:31 +08:00
stduhpf	41f7acbfb0	feat: support guidance_schedule (#1684 )	2026-06-23 00:05:55 +08:00
leejet	b395a6972d	refactor: add Flux VAE version helper (#1696 )	2026-06-22 22:39:42 +08:00
Alex Klinkhamer	854bebfe02	feat: add --prompt-file and --negative-prompt-file flags (#1693 )	2026-06-22 22:16:54 +08:00
fszontagh	787d229d84	perf: --eager-load to pre-load params at model-load time (#1687 )	2026-06-22 22:10:09 +08:00
leejet	b12098f5d0	feat: add boogu image support (#1688 )	2026-06-22 00:36:17 +08:00
stduhpf	2bd249c971	feat: concatenate repeated cli arg strings (#1686 )	2026-06-22 00:24:13 +08:00
Daniele	e9e952462f	fix: workaround for Ernie with Vulkan and Flash Attention (#1680 )	2026-06-22 00:21:38 +08:00