docs: update sd-cli/sd-server docs

refactor: introduce ggml_ext_zeros_like/ggml_ext_ones_like (#1312)
perf: improved flux attention qkv unpacking (#1306)
2026-05-09 00:38:55 +00:00 · 2026-03-04 00:41:17 +08:00 · 2026-03-04 00:36:52 +08:00 · 2026-03-04 00:36:32 +08:00 · 2026-03-04 00:34:07 +08:00
7 changed files with 66 additions and 36 deletions
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@ -4,11 +4,12 @@
 usage: ./bin/sd-cli  [options]

 CLI Options:
-  -o, --output <string>       path to write result image to. you can use printf-style %d format specifiers for image sequences (default: ./output.png) (eg. output_%03d.png)
-  --output-begin-idx <int>    starting index for output image sequence, must be non-negative (default 0 if specified %d in output path, 1 otherwise)
+  -o, --output <string>       path to write result image to. you can use printf-style %d format specifiers for image sequences (default:
+                              ./output.png) (eg. output_%03d.png)
  --preview-path <string>     path to write preview image to (default: ./preview.png)
  --preview-interval <int>    interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at
                              every step)
+  --output-begin-idx <int>    starting index for output image sequence, must be non-negative (default 0 if specified %d in output path, 1 otherwise)
  --canny                     apply canny preprocessor (edge detection)
  --convert-name              convert tensor name (for convert mode)
  -v, --verbose               print extra info
@ -59,6 +60,7 @@ Context Options:
  --circularx                              enable circular RoPE wrapping on x-axis (width) only
  --circulary                              enable circular RoPE wrapping on y-axis (height) only
  --chroma-disable-dit-mask                disable dit mask for chroma
+  --qwen-image-zero-cond-t                 enable zero_cond_t for qwen image
  --chroma-enable-t5-mask                  enable t5 mask for chroma
  --type                                   weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
                                           type of the weight file
@ -107,7 +109,7 @@ Generation Options:
                                           medium
  --skip-layer-start <float>               SLG enabling point (default: 0.01)
  --skip-layer-end <float>                 SLG disabling point (default: 0.2)
-  --eta <float>                            eta in DDIM, only for DDIM/TCD/res_multistep/res_2s (default: 0)
+  --eta <float>                            eta in DDIM, only for DDIM and TCD (default: 0)
  --flow-shift <float>                     shift value for Flow models like SD3.x or WAN (default: auto)
  --high-noise-cfg-scale <float>           (high noise) unconditional guidance scale: (default: 7.0)
  --high-noise-img-cfg-scale <float>       (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
@ -115,7 +117,7 @@ Generation Options:
  --high-noise-slg-scale <float>           (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
  --high-noise-skip-layer-start <float>    (high noise) SLG enabling point (default: 0.01)
  --high-noise-skip-layer-end <float>      (high noise) SLG disabling point (default: 0.2)
-  --high-noise-eta <float>                 (high noise) eta in DDIM, only for DDIM/TCD/res_multistep/res_2s (default: 0)
+  --high-noise-eta <float>                 (high noise) eta in DDIM, only for DDIM and TCD (default: 0)
  --strength <float>                       strength for noising/unnoising (default: 0.75)
  --pm-style-strength <float>
  --control-strength <float>               strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image
@ -124,10 +126,12 @@ Generation Options:
  --increase-ref-index                     automatically increase the indices of references images based on the order they are listed (starting with 1).
  --disable-auto-resize-ref-image          disable auto resize of ref images
  -s, --seed                               RNG seed (default: 42, use random seed for < 0)
-  --sampling-method                        sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd,
-                                           res_multistep, res_2s] (default: euler for Flux/SD3/Wan, euler_a otherwise)
-  --high-noise-sampling-method             (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
-                                           tcd, res_multistep, res_2s] default: euler for Flux/SD3/Wan, euler_a otherwise
+  --sampling-method                        sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
+                                           tcd, res_multistep, res_2s] (default: euler for Flux/SD3/Wan, euler_a
+                                           otherwise)
+  --high-noise-sampling-method             (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
+                                           ddim_trailing, tcd, res_multistep, res_2s] default: euler for Flux/SD3/Wan,
+                                           euler_a otherwise
  --scheduler                              denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple,
                                           kl_optimal, lcm, bong_tangent], default: discrete
  --sigmas                                 custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0").
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -4,12 +4,12 @@
 usage: ./bin/sd-server  [options]

 Svr Options:
-  -l, --listen-ip <string>    server listen ip (default: 127.0.0.1)
-  --listen-port <int>         server listen port (default: 1234)
-  --serve-html-path <string>  path to HTML file to serve at root (optional)
-  -v, --verbose               print extra info
-  --color                     colors the logging tags according to level
-  -h, --help                  show this help message and exit
+  -l, --listen-ip <string>      server listen ip (default: 127.0.0.1)
+  --serve-html-path <string>    path to HTML file to serve at root (optional)
+  --listen-port <int>           server listen port (default: 1234)
+  -v, --verbose                 print extra info
+  --color                       colors the logging tags according to level
+  -h, --help                    show this help message and exit

 Context Options:
  -m, --model <string>                     path to full model
@ -39,10 +39,10 @@ Context Options:
  --vae-tiling                             process vae in tiles to reduce memory usage
  --force-sdxl-vae-conv-scale              force use of conv scale on sdxl vae
  --offload-to-cpu                         place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
+  --mmap                                   whether to memory-map model
  --control-net-cpu                        keep controlnet in cpu (for low vram)
  --clip-on-cpu                            keep clip in cpu (for low vram)
  --vae-on-cpu                             keep vae in cpu (for low vram)
-  --mmap                                   whether to memory-map model
  --fa                                     use flash attention
  --diffusion-fa                           use flash attention in the diffusion model only
  --diffusion-conv-direct                  use ggml_conv2d_direct in the diffusion model
@ -51,6 +51,7 @@ Context Options:
  --circularx                              enable circular RoPE wrapping on x-axis (width) only
  --circulary                              enable circular RoPE wrapping on y-axis (height) only
  --chroma-disable-dit-mask                disable dit mask for chroma
+  --qwen-image-zero-cond-t                 enable zero_cond_t for qwen image
  --chroma-enable-t5-mask                  enable t5 mask for chroma
  --type                                   weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
                                           type of the weight file
@ -99,7 +100,7 @@ Default Generation Options:
                                           medium
  --skip-layer-start <float>               SLG enabling point (default: 0.01)
  --skip-layer-end <float>                 SLG disabling point (default: 0.2)
-  --eta <float>                            eta in DDIM, only for DDIM/TCD/res_multistep/res_2s (default: 0)
+  --eta <float>                            eta in DDIM, only for DDIM and TCD (default: 0)
  --flow-shift <float>                     shift value for Flow models like SD3.x or WAN (default: auto)
  --high-noise-cfg-scale <float>           (high noise) unconditional guidance scale: (default: 7.0)
  --high-noise-img-cfg-scale <float>       (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
@ -107,7 +108,7 @@ Default Generation Options:
  --high-noise-slg-scale <float>           (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
  --high-noise-skip-layer-start <float>    (high noise) SLG enabling point (default: 0.01)
  --high-noise-skip-layer-end <float>      (high noise) SLG disabling point (default: 0.2)
-  --high-noise-eta <float>                 (high noise) eta in DDIM, only for DDIM/TCD/res_multistep/res_2s (default: 0)
+  --high-noise-eta <float>                 (high noise) eta in DDIM, only for DDIM and TCD (default: 0)
  --strength <float>                       strength for noising/unnoising (default: 0.75)
  --pm-style-strength <float>
  --control-strength <float>               strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image
@ -116,10 +117,12 @@ Default Generation Options:
  --increase-ref-index                     automatically increase the indices of references images based on the order they are listed (starting with 1).
  --disable-auto-resize-ref-image          disable auto resize of ref images
  -s, --seed                               RNG seed (default: 42, use random seed for < 0)
-  --sampling-method                        sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd,
-                                           res_multistep, res_2s] (default: euler for Flux/SD3/Wan, euler_a otherwise)
-  --high-noise-sampling-method             (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
-                                           tcd, res_multistep, res_2s] default: euler for Flux/SD3/Wan, euler_a otherwise
+  --sampling-method                        sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
+                                           tcd, res_multistep, res_2s] (default: euler for Flux/SD3/Wan, euler_a
+                                           otherwise)
+  --high-noise-sampling-method             (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
+                                           ddim_trailing, tcd, res_multistep, res_2s] default: euler for Flux/SD3/Wan,
+                                           euler_a otherwise
  --scheduler                              denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple,
                                           kl_optimal, lcm, bong_tangent], default: discrete
  --sigmas                                 custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0").
--- a/examples/server/main.cpp
+++ b/examples/server/main.cpp
@ -345,7 +345,7 @@ int main(int argc, const char** argv) {
    auto get_lora_full_path = [&](const std::string& path) -> std::string {
        std::lock_guard<std::mutex> lock(lora_mutex);
        auto it = std::find_if(lora_cache.begin(), lora_cache.end(),
-                           [&](const LoraEntry& e) { return e.path == path; });
+                               [&](const LoraEntry& e) { return e.path == path; });
        return (it != lora_cache.end()) ? it->fullpath : "";
    };

@ -567,7 +567,7 @@ int main(int argc, const char** argv) {

            std::string sd_cpp_extra_args_str = extract_and_remove_sd_cpp_extra_args(prompt);

-            size_t image_count = req.form.get_file_count("image[]");
+            size_t image_count    = req.form.get_file_count("image[]");
            bool has_legacy_image = req.form.has_file("image");
            if (image_count == 0 && !has_legacy_image) {
                res.status = 400;
--- a/src/flux.hpp
+++ b/src/flux.hpp
@ -103,11 +103,13 @@ namespace Flux {
            auto norm     = std::dynamic_pointer_cast<QKNorm>(blocks["norm"]);

            auto qkv         = qkv_proj->forward(ctx, x);
-            auto qkv_vec     = ggml_ext_chunk(ctx->ggml_ctx, qkv, 3, 0, true);
-            int64_t head_dim = qkv_vec[0]->ne[0] / num_heads;
-            auto q           = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[0], head_dim, num_heads, qkv_vec[0]->ne[1], qkv_vec[0]->ne[2]);
-            auto k           = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[1], head_dim, num_heads, qkv_vec[1]->ne[1], qkv_vec[1]->ne[2]);
-            auto v           = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[2], head_dim, num_heads, qkv_vec[2]->ne[1], qkv_vec[2]->ne[2]);
+            int64_t head_dim = qkv->ne[0] / 3 / num_heads;
+            auto q           = ggml_view_4d(ctx->ggml_ctx, qkv, head_dim, num_heads, qkv->ne[1], qkv->ne[2],
+                                            qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], 0);
+            auto k           = ggml_view_4d(ctx->ggml_ctx, qkv, head_dim, num_heads, qkv->ne[1], qkv->ne[2],
+                                            qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], (qkv->nb[0]) * qkv->ne[0] / 3);
+            auto v           = ggml_view_4d(ctx->ggml_ctx, qkv, head_dim, num_heads, qkv->ne[1], qkv->ne[2],
+                                            qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], (qkv->nb[0]) * 2 * qkv->ne[0] / 3);
            q                = norm->query_norm(ctx, q);
            k                = norm->key_norm(ctx, k);
            return {q, k, v};
@ -491,15 +493,14 @@ namespace Flux {
            auto x_mod   = Flux::modulate(ctx->ggml_ctx, pre_norm->forward(ctx, x), mod.shift, mod.scale);
            auto qkv_mlp = linear1->forward(ctx, x_mod);  // [N, n_token, hidden_size * 3 + mlp_hidden_dim*mlp_mult_factor]

-            auto q = ggml_view_3d(ctx->ggml_ctx, qkv_mlp, hidden_size, qkv_mlp->ne[1], qkv_mlp->ne[2], qkv_mlp->nb[1], qkv_mlp->nb[2], 0);
-            auto k = ggml_view_3d(ctx->ggml_ctx, qkv_mlp, hidden_size, qkv_mlp->ne[1], qkv_mlp->ne[2], qkv_mlp->nb[1], qkv_mlp->nb[2], hidden_size * qkv_mlp->nb[0]);
-            auto v = ggml_view_3d(ctx->ggml_ctx, qkv_mlp, hidden_size, qkv_mlp->ne[1], qkv_mlp->ne[2], qkv_mlp->nb[1], qkv_mlp->nb[2], hidden_size * 2 * qkv_mlp->nb[0]);
-
            int64_t head_dim = hidden_size / num_heads;

-            q = ggml_reshape_4d(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, q), head_dim, num_heads, q->ne[1], q->ne[2]);  // [N, n_token, n_head, d_head]
-            k = ggml_reshape_4d(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, k), head_dim, num_heads, k->ne[1], k->ne[2]);  // [N, n_token, n_head, d_head]
-            v = ggml_reshape_4d(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, v), head_dim, num_heads, v->ne[1], v->ne[2]);  // [N, n_token, n_head, d_head]
+            auto q = ggml_view_4d(ctx->ggml_ctx, qkv_mlp, head_dim, num_heads, qkv_mlp->ne[1], qkv_mlp->ne[2],
+                                  qkv_mlp->nb[0] * head_dim, qkv_mlp->nb[1], qkv_mlp->nb[2], 0);
+            auto k = ggml_view_4d(ctx->ggml_ctx, qkv_mlp, head_dim, num_heads, qkv_mlp->ne[1], qkv_mlp->ne[2],
+                                  qkv_mlp->nb[0] * head_dim, qkv_mlp->nb[1], qkv_mlp->nb[2], (qkv_mlp->nb[0]) * hidden_size);
+            auto v = ggml_view_4d(ctx->ggml_ctx, qkv_mlp, head_dim, num_heads, qkv_mlp->ne[1], qkv_mlp->ne[2],
+                                  qkv_mlp->nb[0] * head_dim, qkv_mlp->nb[1], qkv_mlp->nb[2], (qkv_mlp->nb[0]) * 2 * hidden_size);

            q         = norm->query_norm(ctx, q);
            k         = norm->key_norm(ctx, k);
--- a/src/ggml_extend.hpp
+++ b/src/ggml_extend.hpp
@ -1219,6 +1219,11 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_zeros(struct ggml_context* ctx,
    return ggml_ext_full(ctx, 0.f, ne0, ne1, ne2, ne3);
 }

+__STATIC_INLINE__ struct ggml_tensor* ggml_ext_zeros_like(struct ggml_context* ctx,
+                                                          struct ggml_tensor* x) {
+    return ggml_ext_zeros(ctx, x->ne[0], x->ne[1], x->ne[2], x->ne[3]);
+}
+
 __STATIC_INLINE__ struct ggml_tensor* ggml_ext_ones(struct ggml_context* ctx,
                                                    int64_t ne0,
                                                    int64_t ne1,
@ -1227,6 +1232,11 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_ones(struct ggml_context* ctx,
    return ggml_ext_full(ctx, 1.f, ne0, ne1, ne2, ne3);
 }

+__STATIC_INLINE__ struct ggml_tensor* ggml_ext_ones_like(struct ggml_context* ctx,
+                                                         struct ggml_tensor* x) {
+    return ggml_ext_ones(ctx, x->ne[0], x->ne[1], x->ne[2], x->ne[3]);
+}
+
 __STATIC_INLINE__ ggml_tensor* ggml_ext_cast_f32(ggml_context* ctx, ggml_tensor* a) {
 #ifdef SD_USE_VULKAN
    auto zero_index = ggml_get_tensor(ctx, "ggml_runner_build_in_tensor:zero_int");
--- a/src/qwen_image.hpp
+++ b/src/qwen_image.hpp
@ -404,7 +404,7 @@ namespace Qwen {

            auto t_emb = time_text_embed->forward(ctx, timestep);
            if (params.zero_cond_t) {
-                auto t_emb_0 = time_text_embed->forward(ctx, ggml_ext_zeros(ctx->ggml_ctx, timestep->ne[0], timestep->ne[1], timestep->ne[2], timestep->ne[3]));
+                auto t_emb_0 = time_text_embed->forward(ctx, ggml_ext_zeros_like(ctx->ggml_ctx, timestep));
                t_emb        = ggml_concat(ctx->ggml_ctx, t_emb, t_emb_0, 1);
            }
            auto img = img_in->forward(ctx, x);
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@ -1098,6 +1098,18 @@ public:
        cond_stage_lora_models.clear();
        diffusion_lora_models.clear();
        first_stage_lora_models.clear();
+        if (cond_stage_model) {
+            cond_stage_model->set_weight_adapter(nullptr);
+        }
+        if (diffusion_model) {
+            diffusion_model->set_weight_adapter(nullptr);
+        }
+        if (high_noise_diffusion_model) {
+            high_noise_diffusion_model->set_weight_adapter(nullptr);
+        }
+        if (first_stage_model) {
+            first_stage_model->set_weight_adapter(nullptr);
+        }
        if (lora_state.empty()) {
            return;
        }
Author	SHA1	Message	Date
leejet	aaa8a51bd8	docs: update sd-cli/sd-server docs	2026-03-04 00:41:17 +08:00
leejet	ba35dd734e	refactor: introduce ggml_ext_zeros_like/ggml_ext_ones_like (#1312)	2026-03-04 00:36:52 +08:00
bssrdf	d41f5fff69	perf: improved flux attention qkv unpacking (#1306)	2026-03-04 00:36:32 +08:00
Korsar13	810ef0cf76	fix: reset weight adapter for models if no loras in request (#1307)	2026-03-04 00:34:07 +08:00