2026-06-25 15:46:40 +00:00
7 changed files with 36 additions and 66 deletions
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@ -4,12 +4,11 @@
 usage: ./bin/sd-cli  [options]
 CLI Options:
-  -o, --output <string>       path to write result image to. you can use printf-style %d format specifiers for image sequences (default:
+  -o, --output <string>       path to write result image to. you can use printf-style %d format specifiers for image sequences (default: ./output.png) (eg. output_%03d.png)
-                              ./output.png) (eg. output_%03d.png)
+  --output-begin-idx <int>    starting index for output image sequence, must be non-negative (default 0 if specified %d in output path, 1 otherwise)
  --preview-path <string>     path to write preview image to (default: ./preview.png)
  --preview-interval <int>    interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at
                              every step)
  --output-begin-idx <int>    starting index for output image sequence, must be non-negative (default 0 if specified %d in output path, 1 otherwise)
  --canny                     apply canny preprocessor (edge detection)
  --convert-name              convert tensor name (for convert mode)
  -v, --verbose               print extra info
@ -60,7 +59,6 @@ Context Options:
  --circularx                              enable circular RoPE wrapping on x-axis (width) only
  --circulary                              enable circular RoPE wrapping on y-axis (height) only
  --chroma-disable-dit-mask                disable dit mask for chroma
  --qwen-image-zero-cond-t                 enable zero_cond_t for qwen image
  --chroma-enable-t5-mask                  enable t5 mask for chroma
  --type                                   weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
                                           type of the weight file
@ -109,7 +107,7 @@ Generation Options:
                                           medium
  --skip-layer-start <float>               SLG enabling point (default: 0.01)
  --skip-layer-end <float>                 SLG disabling point (default: 0.2)
-  --eta <float>                            eta in DDIM, only for DDIM and TCD (default: 0)
+  --eta <float>                            eta in DDIM, only for DDIM/TCD/res_multistep/res_2s (default: 0)
  --flow-shift <float>                     shift value for Flow models like SD3.x or WAN (default: auto)
  --high-noise-cfg-scale <float>           (high noise) unconditional guidance scale: (default: 7.0)
  --high-noise-img-cfg-scale <float>       (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
@ -117,7 +115,7 @@ Generation Options:
  --high-noise-slg-scale <float>           (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
  --high-noise-skip-layer-start <float>    (high noise) SLG enabling point (default: 0.01)
  --high-noise-skip-layer-end <float>      (high noise) SLG disabling point (default: 0.2)
-  --high-noise-eta <float>                 (high noise) eta in DDIM, only for DDIM and TCD (default: 0)
+  --high-noise-eta <float>                 (high noise) eta in DDIM, only for DDIM/TCD/res_multistep/res_2s (default: 0)
  --strength <float>                       strength for noising/unnoising (default: 0.75)
  --pm-style-strength <float>
  --control-strength <float>               strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image
@ -126,12 +124,10 @@ Generation Options:
  --increase-ref-index                     automatically increase the indices of references images based on the order they are listed (starting with 1).
  --disable-auto-resize-ref-image          disable auto resize of ref images
  -s, --seed                               RNG seed (default: 42, use random seed for < 0)
-  --sampling-method                        sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
+  --sampling-method                        sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd,
-                                           tcd, res_multistep, res_2s] (default: euler for Flux/SD3/Wan, euler_a
+                                           res_multistep, res_2s] (default: euler for Flux/SD3/Wan, euler_a otherwise)
-                                           otherwise)
+  --high-noise-sampling-method             (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
-  --high-noise-sampling-method             (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
+                                           tcd, res_multistep, res_2s] default: euler for Flux/SD3/Wan, euler_a otherwise
                                           ddim_trailing, tcd, res_multistep, res_2s] default: euler for Flux/SD3/Wan,
                                           euler_a otherwise
  --scheduler                              denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple,
                                           kl_optimal, lcm, bong_tangent], default: discrete
  --sigmas                                 custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0").
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -5,8 +5,8 @@ usage: ./bin/sd-server  [options]
 Svr Options:
  -l, --listen-ip <string>    server listen ip (default: 127.0.0.1)
  --serve-html-path <string>    path to HTML file to serve at root (optional)
  --listen-port <int>         server listen port (default: 1234)
  --serve-html-path <string>  path to HTML file to serve at root (optional)
  -v, --verbose               print extra info
  --color                     colors the logging tags according to level
  -h, --help                  show this help message and exit
@ -39,10 +39,10 @@ Context Options:
  --vae-tiling                             process vae in tiles to reduce memory usage
  --force-sdxl-vae-conv-scale              force use of conv scale on sdxl vae
  --offload-to-cpu                         place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
  --mmap                                   whether to memory-map model
  --control-net-cpu                        keep controlnet in cpu (for low vram)
  --clip-on-cpu                            keep clip in cpu (for low vram)
  --vae-on-cpu                             keep vae in cpu (for low vram)
  --mmap                                   whether to memory-map model
  --fa                                     use flash attention
  --diffusion-fa                           use flash attention in the diffusion model only
  --diffusion-conv-direct                  use ggml_conv2d_direct in the diffusion model
@ -51,7 +51,6 @@ Context Options:
  --circularx                              enable circular RoPE wrapping on x-axis (width) only
  --circulary                              enable circular RoPE wrapping on y-axis (height) only
  --chroma-disable-dit-mask                disable dit mask for chroma
  --qwen-image-zero-cond-t                 enable zero_cond_t for qwen image
  --chroma-enable-t5-mask                  enable t5 mask for chroma
  --type                                   weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
                                           type of the weight file
@ -100,7 +99,7 @@ Default Generation Options:
                                           medium
  --skip-layer-start <float>               SLG enabling point (default: 0.01)
  --skip-layer-end <float>                 SLG disabling point (default: 0.2)
-  --eta <float>                            eta in DDIM, only for DDIM and TCD (default: 0)
+  --eta <float>                            eta in DDIM, only for DDIM/TCD/res_multistep/res_2s (default: 0)
  --flow-shift <float>                     shift value for Flow models like SD3.x or WAN (default: auto)
  --high-noise-cfg-scale <float>           (high noise) unconditional guidance scale: (default: 7.0)
  --high-noise-img-cfg-scale <float>       (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
@ -108,7 +107,7 @@ Default Generation Options:
  --high-noise-slg-scale <float>           (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
  --high-noise-skip-layer-start <float>    (high noise) SLG enabling point (default: 0.01)
  --high-noise-skip-layer-end <float>      (high noise) SLG disabling point (default: 0.2)
-  --high-noise-eta <float>                 (high noise) eta in DDIM, only for DDIM and TCD (default: 0)
+  --high-noise-eta <float>                 (high noise) eta in DDIM, only for DDIM/TCD/res_multistep/res_2s (default: 0)
  --strength <float>                       strength for noising/unnoising (default: 0.75)
  --pm-style-strength <float>
  --control-strength <float>               strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image
@ -117,12 +116,10 @@ Default Generation Options:
  --increase-ref-index                     automatically increase the indices of references images based on the order they are listed (starting with 1).
  --disable-auto-resize-ref-image          disable auto resize of ref images
  -s, --seed                               RNG seed (default: 42, use random seed for < 0)
-  --sampling-method                        sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
+  --sampling-method                        sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd,
-                                           tcd, res_multistep, res_2s] (default: euler for Flux/SD3/Wan, euler_a
+                                           res_multistep, res_2s] (default: euler for Flux/SD3/Wan, euler_a otherwise)
-                                           otherwise)
+  --high-noise-sampling-method             (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
-  --high-noise-sampling-method             (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
+                                           tcd, res_multistep, res_2s] default: euler for Flux/SD3/Wan, euler_a otherwise
                                           ddim_trailing, tcd, res_multistep, res_2s] default: euler for Flux/SD3/Wan,
                                           euler_a otherwise
  --scheduler                              denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple,
                                           kl_optimal, lcm, bong_tangent], default: discrete
  --sigmas                                 custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0").
--- a/src/flux.hpp
+++ b/src/flux.hpp
@ -103,13 +103,11 @@ namespace Flux {
            auto norm     = std::dynamic_pointer_cast<QKNorm>(blocks["norm"]);
            auto qkv         = qkv_proj->forward(ctx, x);
-            int64_t head_dim = qkv->ne[0] / 3 / num_heads;
+            auto qkv_vec     = ggml_ext_chunk(ctx->ggml_ctx, qkv, 3, 0, true);
-            auto q           = ggml_view_4d(ctx->ggml_ctx, qkv, head_dim, num_heads, qkv->ne[1], qkv->ne[2],
+            int64_t head_dim = qkv_vec[0]->ne[0] / num_heads;
-                                            qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], 0);
+            auto q           = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[0], head_dim, num_heads, qkv_vec[0]->ne[1], qkv_vec[0]->ne[2]);
-            auto k           = ggml_view_4d(ctx->ggml_ctx, qkv, head_dim, num_heads, qkv->ne[1], qkv->ne[2],
+            auto k           = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[1], head_dim, num_heads, qkv_vec[1]->ne[1], qkv_vec[1]->ne[2]);
-                                            qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], (qkv->nb[0]) * qkv->ne[0] / 3);
+            auto v           = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[2], head_dim, num_heads, qkv_vec[2]->ne[1], qkv_vec[2]->ne[2]);
            auto v           = ggml_view_4d(ctx->ggml_ctx, qkv, head_dim, num_heads, qkv->ne[1], qkv->ne[2],
                                            qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], (qkv->nb[0]) * 2 * qkv->ne[0] / 3);
            q                = norm->query_norm(ctx, q);
            k                = norm->key_norm(ctx, k);
            return {q, k, v};
@ -493,14 +491,15 @@ namespace Flux {
            auto x_mod   = Flux::modulate(ctx->ggml_ctx, pre_norm->forward(ctx, x), mod.shift, mod.scale);
            auto qkv_mlp = linear1->forward(ctx, x_mod);  // [N, n_token, hidden_size * 3 + mlp_hidden_dim*mlp_mult_factor]
            auto q = ggml_view_3d(ctx->ggml_ctx, qkv_mlp, hidden_size, qkv_mlp->ne[1], qkv_mlp->ne[2], qkv_mlp->nb[1], qkv_mlp->nb[2], 0);
            auto k = ggml_view_3d(ctx->ggml_ctx, qkv_mlp, hidden_size, qkv_mlp->ne[1], qkv_mlp->ne[2], qkv_mlp->nb[1], qkv_mlp->nb[2], hidden_size * qkv_mlp->nb[0]);
            auto v = ggml_view_3d(ctx->ggml_ctx, qkv_mlp, hidden_size, qkv_mlp->ne[1], qkv_mlp->ne[2], qkv_mlp->nb[1], qkv_mlp->nb[2], hidden_size * 2 * qkv_mlp->nb[0]);
            int64_t head_dim = hidden_size / num_heads;
-            auto q = ggml_view_4d(ctx->ggml_ctx, qkv_mlp, head_dim, num_heads, qkv_mlp->ne[1], qkv_mlp->ne[2],
+            q = ggml_reshape_4d(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, q), head_dim, num_heads, q->ne[1], q->ne[2]);  // [N, n_token, n_head, d_head]
-                                  qkv_mlp->nb[0] * head_dim, qkv_mlp->nb[1], qkv_mlp->nb[2], 0);
+            k = ggml_reshape_4d(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, k), head_dim, num_heads, k->ne[1], k->ne[2]);  // [N, n_token, n_head, d_head]
-            auto k = ggml_view_4d(ctx->ggml_ctx, qkv_mlp, head_dim, num_heads, qkv_mlp->ne[1], qkv_mlp->ne[2],
+            v = ggml_reshape_4d(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, v), head_dim, num_heads, v->ne[1], v->ne[2]);  // [N, n_token, n_head, d_head]
                                  qkv_mlp->nb[0] * head_dim, qkv_mlp->nb[1], qkv_mlp->nb[2], (qkv_mlp->nb[0]) * hidden_size);
            auto v = ggml_view_4d(ctx->ggml_ctx, qkv_mlp, head_dim, num_heads, qkv_mlp->ne[1], qkv_mlp->ne[2],
                                  qkv_mlp->nb[0] * head_dim, qkv_mlp->nb[1], qkv_mlp->nb[2], (qkv_mlp->nb[0]) * 2 * hidden_size);
            q         = norm->query_norm(ctx, q);
            k         = norm->key_norm(ctx, k);
--- a/src/ggml_extend.hpp
+++ b/src/ggml_extend.hpp
@ -1219,11 +1219,6 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_zeros(struct ggml_context* ctx,
    return ggml_ext_full(ctx, 0.f, ne0, ne1, ne2, ne3);
 }
 __STATIC_INLINE__ struct ggml_tensor* ggml_ext_zeros_like(struct ggml_context* ctx,
                                                          struct ggml_tensor* x) {
    return ggml_ext_zeros(ctx, x->ne[0], x->ne[1], x->ne[2], x->ne[3]);
 }
 __STATIC_INLINE__ struct ggml_tensor* ggml_ext_ones(struct ggml_context* ctx,
                                                    int64_t ne0,
                                                    int64_t ne1,
@ -1232,11 +1227,6 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_ones(struct ggml_context* ctx,
    return ggml_ext_full(ctx, 1.f, ne0, ne1, ne2, ne3);
 }
 __STATIC_INLINE__ struct ggml_tensor* ggml_ext_ones_like(struct ggml_context* ctx,
                                                         struct ggml_tensor* x) {
    return ggml_ext_ones(ctx, x->ne[0], x->ne[1], x->ne[2], x->ne[3]);
 }
 __STATIC_INLINE__ ggml_tensor* ggml_ext_cast_f32(ggml_context* ctx, ggml_tensor* a) {
 #ifdef SD_USE_VULKAN
    auto zero_index = ggml_get_tensor(ctx, "ggml_runner_build_in_tensor:zero_int");
--- a/src/qwen_image.hpp
+++ b/src/qwen_image.hpp
@ -404,7 +404,7 @@ namespace Qwen {
            auto t_emb = time_text_embed->forward(ctx, timestep);
            if (params.zero_cond_t) {
-                auto t_emb_0 = time_text_embed->forward(ctx, ggml_ext_zeros_like(ctx->ggml_ctx, timestep));
+                auto t_emb_0 = time_text_embed->forward(ctx, ggml_ext_zeros(ctx->ggml_ctx, timestep->ne[0], timestep->ne[1], timestep->ne[2], timestep->ne[3]));
                t_emb        = ggml_concat(ctx->ggml_ctx, t_emb, t_emb_0, 1);
            }
            auto img = img_in->forward(ctx, x);
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@ -1098,18 +1098,6 @@ public:
        cond_stage_lora_models.clear();
        diffusion_lora_models.clear();
        first_stage_lora_models.clear();
        if (cond_stage_model) {
            cond_stage_model->set_weight_adapter(nullptr);
        }
        if (diffusion_model) {
            diffusion_model->set_weight_adapter(nullptr);
        }
        if (high_noise_diffusion_model) {
            high_noise_diffusion_model->set_weight_adapter(nullptr);
        }
        if (first_stage_model) {
            first_stage_model->set_weight_adapter(nullptr);
        }
        if (lora_state.empty()) {
            return;
        }