From 15d0f82760e2d44d9bec904b277c4a7ad1f6b2ed Mon Sep 17 00:00:00 2001
From: leejet <leejet714@gmail.com>
Date: Sat, 13 Dec 2025 14:27:47 +0800
Subject: [PATCH 1/5] feat(server): do not parse lora from client-side prompts
 (#1083)

---
 examples/common/common.hpp | 3 +++
 examples/server/main.cpp   | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/examples/common/common.hpp b/examples/common/common.hpp
index 558817e..0ab5c08 100644
--- a/examples/common/common.hpp
+++ b/examples/common/common.hpp
@@ -1349,6 +1349,9 @@ struct SDGenerationParams {
     }
 
     void extract_and_remove_lora(const std::string& lora_model_dir) {
+        if (lora_model_dir.empty()) {
+            return;
+        }
         static const std::regex re(R"(<lora:([^:>]+):([^>]+)>)");
         static const std::vector<std::string> valid_ext = {".pt", ".safetensors", ".gguf"};
         std::smatch m;
diff --git a/examples/server/main.cpp b/examples/server/main.cpp
index 90cf484..f1ba0cd 100644
--- a/examples/server/main.cpp
+++ b/examples/server/main.cpp
@@ -425,7 +425,7 @@ int main(int argc, const char** argv) {
                 return;
             }
 
-            if (!gen_params.process_and_check(IMG_GEN, ctx_params.lora_model_dir)) {
+            if (!gen_params.process_and_check(IMG_GEN, "")) {
                 res.status = 400;
                 res.set_content(R"({"error":"invalid params"})", "application/json");
                 return;
@@ -605,7 +605,7 @@ int main(int argc, const char** argv) {
                 return;
             }
 
-            if (!gen_params.process_and_check(IMG_GEN, ctx_params.lora_model_dir)) {
+            if (!gen_params.process_and_check(IMG_GEN, "")) {
                 res.status = 400;
                 res.set_content(R"({"error":"invalid params"})", "application/json");
                 return;

From 8f05f5bc6ee9d6aba9d1ff2be7739a5a3cf1586d Mon Sep 17 00:00:00 2001
From: rmatif <rmatif@proton.me>
Date: Sat, 13 Dec 2025 09:20:02 +0100
Subject: [PATCH 2/5] feat: add support for custom scheduler (#694)

---------

Co-authored-by: leejet <leejet714@gmail.com>
---
 examples/cli/README.md     |  1 +
 examples/cli/main.cpp      | 12 ++++++++--
 examples/common/common.hpp | 46 ++++++++++++++++++++++++++++++++++++
 examples/server/README.md  |  1 +
 stable-diffusion.cpp       | 48 ++++++++++++++++++++++++++++++++------
 stable-diffusion.h         |  2 ++
 6 files changed, 101 insertions(+), 9 deletions(-)

diff --git a/examples/cli/README.md b/examples/cli/README.md
index f6a4278..02650f7 100644
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@@ -121,6 +121,7 @@ Generation Options:
                                            ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise
   --scheduler                              denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, lcm],
                                            default: discrete
+  --sigmas                                 custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0").
   --skip-layers                            layers to skip for SLG steps (default: [7,8,9])
   --high-noise-skip-layers                 (high noise) layers to skip for SLG steps (default: [7,8,9])
   -r, --ref-image                          reference image for Flux Kontext models (can be used multiple times)
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index eaa2591..417d211 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -258,7 +258,15 @@ std::string get_image_params(const SDCliParams& cli_params, const SDContextParam
         parameter_string += "Sampler RNG: " + std::string(sd_rng_type_name(ctx_params.sampler_rng_type)) + ", ";
     }
     parameter_string += "Sampler: " + std::string(sd_sample_method_name(gen_params.sample_params.sample_method));
-    if (gen_params.sample_params.scheduler != SCHEDULER_COUNT) {
+    if (!gen_params.custom_sigmas.empty()) {
+        parameter_string += ", Custom Sigmas: [";
+        for (size_t i = 0; i < gen_params.custom_sigmas.size(); ++i) {
+            std::ostringstream oss;
+            oss << std::fixed << std::setprecision(4) << gen_params.custom_sigmas[i];
+            parameter_string += oss.str() + (i == gen_params.custom_sigmas.size() - 1 ? "" : ", ");
+        }
+        parameter_string += "]";
+    } else if (gen_params.sample_params.scheduler != SCHEDULER_COUNT) {  // Only show schedule if not using custom sigmas
         parameter_string += " " + std::string(sd_scheduler_name(gen_params.sample_params.scheduler));
     }
     parameter_string += ", ";
@@ -806,4 +814,4 @@ int main(int argc, const char* argv[]) {
     release_all_resources();
 
     return 0;
-}
+}
\ No newline at end of file
diff --git a/examples/common/common.hpp b/examples/common/common.hpp
index 0ab5c08..ccd01ce 100644
--- a/examples/common/common.hpp
+++ b/examples/common/common.hpp
@@ -883,6 +883,8 @@ struct SDGenerationParams {
     std::vector<int> high_noise_skip_layers = {7, 8, 9};
     sd_sample_params_t high_noise_sample_params;
 
+    std::vector<float> custom_sigmas;
+
     std::string easycache_option;
     sd_easycache_params_t easycache_params;
 
@@ -1201,6 +1203,43 @@ struct SDGenerationParams {
             return 1;
         };
 
+        auto on_sigmas_arg = [&](int argc, const char** argv, int index) {  // handler for --sigmas: parses the next argument into custom_sigmas; returns 1 on success, -1 on error
+            if (++index >= argc) {  // advance to the value; fail when nothing follows the flag
+                return -1;
+            }
+            std::string sigmas_str = argv[index];
+            if (!sigmas_str.empty() && sigmas_str.front() == '[') {  // tolerate an optional leading '['
+                sigmas_str.erase(0, 1);
+            }
+            if (!sigmas_str.empty() && sigmas_str.back() == ']') {  // tolerate an optional trailing ']'
+                sigmas_str.pop_back();
+            }
+
+            std::stringstream ss(sigmas_str);
+            std::string item;
+            while (std::getline(ss, item, ',')) {  // split the remaining text on commas
+                item.erase(0, item.find_first_not_of(" \t\n\r\f\v"));  // trim leading whitespace (an all-whitespace item becomes empty)
+                item.erase(item.find_last_not_of(" \t\n\r\f\v") + 1);  // trim trailing whitespace (on an empty item npos + 1 wraps to 0, a no-op)
+                if (!item.empty()) {  // empty items, e.g. from ",,", are skipped silently
+                    try {
+                        custom_sigmas.push_back(std::stof(item));  // NOTE(review): std::stof accepts a numeric prefix, so "1.5abc" parses as 1.5 without error
+                    } catch (const std::invalid_argument& e) {  // item does not start with a number
+                        fprintf(stderr, "error: invalid float value '%s' in --sigmas\n", item.c_str());
+                        return -1;  // NOTE(review): values pushed before this item remain in custom_sigmas
+                    } catch (const std::out_of_range& e) {  // value not representable as float
+                        fprintf(stderr, "error: float value '%s' out of range in --sigmas\n", item.c_str());
+                        return -1;
+                    }
+                }
+            }
+
+            if (custom_sigmas.empty() && !sigmas_str.empty()) {  // non-empty input that produced no values (e.g. only commas/whitespace)
+                fprintf(stderr, "error: could not parse any sigma values from '%s'\n", argv[index]);
+                return -1;
+            }
+            return 1;
+        };
+
         auto on_ref_image_arg = [&](int argc, const char** argv, int index) {
             if (++index >= argc) {
                 return -1;
@@ -1260,6 +1299,10 @@ struct SDGenerationParams {
              "--scheduler",
              "denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, lcm], default: discrete",
              on_scheduler_arg},
+            {"",
+             "--sigmas",
+             "custom sigma values for the sampler, comma-separated (e.g., \"14.61,7.8,3.5,0.0\").",
+             on_sigmas_arg},
             {"",
              "--skip-layers",
              "layers to skip for SLG steps (default: [7,8,9])",
@@ -1512,6 +1555,8 @@ struct SDGenerationParams {
 
         sample_params.guidance.slg.layers                 = skip_layers.data();
         sample_params.guidance.slg.layer_count            = skip_layers.size();
+        sample_params.custom_sigmas                       = custom_sigmas.data();
+        sample_params.custom_sigmas_count                 = static_cast<int>(custom_sigmas.size());
         high_noise_sample_params.guidance.slg.layers      = high_noise_skip_layers.data();
         high_noise_sample_params.guidance.slg.layer_count = high_noise_skip_layers.size();
 
@@ -1606,6 +1651,7 @@ struct SDGenerationParams {
             << "  sample_params: " << sample_params_str << ",\n"
             << "  high_noise_skip_layers: " << vec_to_string(high_noise_skip_layers) << ",\n"
             << "  high_noise_sample_params: " << high_noise_sample_params_str << ",\n"
+            << "  custom_sigmas: " << vec_to_string(custom_sigmas) << ",\n"
             << "  easycache_option: \"" << easycache_option << "\",\n"
             << "  easycache: "
             << (easycache_params.enabled ? "enabled" : "disabled")
diff --git a/examples/server/README.md b/examples/server/README.md
index 6393d84..43c5d5f 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -115,6 +115,7 @@ Default Generation Options:
                                            ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise
   --scheduler                              denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, lcm],
                                            default: discrete
+  --sigmas                                 custom sigma values for the sampler, comma-separated (e.g., "14.61,7.8,3.5,0.0").
   --skip-layers                            layers to skip for SLG steps (default: [7,8,9])
   --high-noise-skip-layers                 (high noise) layers to skip for SLG steps (default: [7,8,9])
   -r, --ref-image                          reference image for Flux Kontext models (can be used multiple times)
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 1ef8512..2cb5882 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -2600,6 +2600,8 @@ void sd_sample_params_init(sd_sample_params_t* sample_params) {
     sample_params->scheduler                   = SCHEDULER_COUNT;
     sample_params->sample_method               = SAMPLE_METHOD_COUNT;
     sample_params->sample_steps                = 20;
+    sample_params->custom_sigmas               = nullptr;
+    sample_params->custom_sigmas_count         = 0;
 }
 
 char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) {
@@ -3194,11 +3196,21 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
     }
     LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
 
-    int sample_steps          = sd_img_gen_params->sample_params.sample_steps;
-    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps,
-                                                                 sd_ctx->sd->get_image_seq_len(height, width),
-                                                                 sd_img_gen_params->sample_params.scheduler,
-                                                                 sd_ctx->sd->version);
+    int sample_steps = sd_img_gen_params->sample_params.sample_steps;
+    std::vector<float> sigmas;
+    if (sd_img_gen_params->sample_params.custom_sigmas_count > 0) {
+        sigmas = std::vector<float>(sd_img_gen_params->sample_params.custom_sigmas,
+                                    sd_img_gen_params->sample_params.custom_sigmas + sd_img_gen_params->sample_params.custom_sigmas_count);
+        if (sample_steps != sigmas.size() - 1) {
+            sample_steps = static_cast<int>(sigmas.size()) - 1;
+            LOG_WARN("sample_steps != custom_sigmas_count - 1, set sample_steps to %d", sample_steps);
+        }
+    } else {
+        sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps,
+                                                  sd_ctx->sd->get_image_seq_len(height, width),
+                                                  sd_img_gen_params->sample_params.scheduler,
+                                                  sd_ctx->sd->version);
+    }
 
     ggml_tensor* init_latent   = nullptr;
     ggml_tensor* concat_latent = nullptr;
@@ -3461,7 +3473,29 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
     if (high_noise_sample_steps > 0) {
         total_steps += high_noise_sample_steps;
     }
-    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(total_steps, 0, sd_vid_gen_params->sample_params.scheduler, sd_ctx->sd->version);
+
+    std::vector<float> sigmas;
+    if (sd_vid_gen_params->sample_params.custom_sigmas_count > 0) {
+        sigmas = std::vector<float>(sd_vid_gen_params->sample_params.custom_sigmas,
+                                    sd_vid_gen_params->sample_params.custom_sigmas + sd_vid_gen_params->sample_params.custom_sigmas_count);
+        if (total_steps != sigmas.size() - 1) {
+            total_steps = static_cast<int>(sigmas.size()) - 1;
+            LOG_WARN("total_steps != custom_sigmas_count - 1, set total_steps to %d", total_steps);
+            if (sample_steps >= total_steps) {
+                sample_steps = total_steps;
+                LOG_WARN("total_steps != custom_sigmas_count - 1, set sample_steps to %d", sample_steps);
+            }
+            if (high_noise_sample_steps > 0) {
+                high_noise_sample_steps = total_steps - sample_steps;
+                LOG_WARN("total_steps != custom_sigmas_count - 1, set high_noise_sample_steps to %d", high_noise_sample_steps);
+            }
+        }
+    } else {
+        sigmas = sd_ctx->sd->denoiser->get_sigmas(total_steps,
+                                                  0,
+                                                  sd_vid_gen_params->sample_params.scheduler,
+                                                  sd_ctx->sd->version);
+    }
 
     if (high_noise_sample_steps < 0) {
         // timesteps ∝ sigmas for Flow models (like wan2.2 a14b)
@@ -3841,4 +3875,4 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
     LOG_INFO("generate_video completed in %.2fs", (t5 - t0) * 1.0f / 1000);
 
     return result_images;
-}
+}
\ No newline at end of file
diff --git a/stable-diffusion.h b/stable-diffusion.h
index 2da70bd..e4abc8d 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -225,6 +225,8 @@ typedef struct {
     int sample_steps;
     float eta;
     int shifted_timestep;
+    float* custom_sigmas;
+    int custom_sigmas_count;
 } sd_sample_params_t;
 
 typedef struct {

From d96b4152d692a2f28cfb1677e4939c1ca551a937 Mon Sep 17 00:00:00 2001
From: stduhpf <stephduh@live.fr>
Date: Sat, 13 Dec 2025 18:22:41 +0100
Subject: [PATCH 3/5] perf: optimize ggml_ext_chunk (#1084)

---
 common.hpp      |  4 +++-
 ggml_extend.hpp | 34 +++++++++++-----------------------
 2 files changed, 14 insertions(+), 24 deletions(-)

diff --git a/common.hpp b/common.hpp
index 33d499f..74b218a 100644
--- a/common.hpp
+++ b/common.hpp
@@ -194,10 +194,12 @@ public:
         auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
 
         x          = proj->forward(ctx, x);  // [ne3, ne2, ne1, dim_out*2]
-        auto x_vec = ggml_ext_chunk(ctx->ggml_ctx, x, 2, 0);
+        auto x_vec = ggml_ext_chunk(ctx->ggml_ctx, x, 2, 0, false);
         x          = x_vec[0];  // [ne3, ne2, ne1, dim_out]
         auto gate  = x_vec[1];  // [ne3, ne2, ne1, dim_out]
 
+        gate = ggml_cont(ctx->ggml_ctx, gate);
+
         gate = ggml_gelu_inplace(ctx->ggml_ctx, gate);
 
         x = ggml_mul(ctx->ggml_ctx, x, gate);  // [ne3, ne2, ne1, dim_out]
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 07b9bfb..26dff49 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -732,34 +732,22 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_slice(struct ggml_context* ctx,
 __STATIC_INLINE__ std::vector<struct ggml_tensor*> ggml_ext_chunk(struct ggml_context* ctx,
                                                                   struct ggml_tensor* x,
                                                                   int num,
-                                                                  int64_t dim) {
+                                                                  int64_t dim,
+                                                                  bool cont = true) {
     GGML_ASSERT(dim >= 0 && dim < 4);
     GGML_ASSERT(x->ne[dim] % num == 0);
 
-    int perm[4] = {0, 1, 2, 3};
-    for (int i = dim; i < 3; ++i)
-        perm[i] = perm[i + 1];
-    perm[3] = dim;
-
-    int inv_perm[4];
-    for (int i = 0; i < 4; ++i)
-        inv_perm[perm[i]] = i;
-
-    if (dim != 3) {
-        x = ggml_ext_torch_permute(ctx, x, perm[0], perm[1], perm[2], perm[3]);
-        x = ggml_cont(ctx, x);
-    }
-
     std::vector<struct ggml_tensor*> chunks;
-    int64_t chunk_size = x->ne[3] / num;
+    int64_t chunk_size  = x->ne[dim] / num;
+    int64_t stride      = chunk_size * x->nb[dim];
+    int64_t chunk_ne[4] = {x->ne[0], x->ne[1], x->ne[2], x->ne[3]};
+    chunk_ne[dim]       = chunk_size;
     for (int i = 0; i < num; i++) {
         auto chunk = ggml_view_4d(
             ctx, x,
-            x->ne[0], x->ne[1], x->ne[2], chunk_size,
-            x->nb[1], x->nb[2], x->nb[3], x->nb[3] * i * chunk_size);
-
-        if (dim != 3) {
-            chunk = ggml_ext_torch_permute(ctx, chunk, inv_perm[0], inv_perm[1], inv_perm[2], inv_perm[3]);
+            chunk_ne[0], chunk_ne[1], chunk_ne[2], chunk_ne[3],
+            x->nb[1], x->nb[2], x->nb[3], stride * i);
+        if (cont) {
             chunk = ggml_cont(ctx, chunk);
         }
         chunks.push_back(chunk);
@@ -772,7 +760,7 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_silu_act(ggml_context* ctx, ggml_tensor*
     // x: [ne3, ne2, ne1, ne0]
     // return: [ne3, ne2, ne1, ne0/2]
 
-    auto x_vec = ggml_ext_chunk(ctx, x, 2, 0);
+    auto x_vec = ggml_ext_chunk(ctx, x, 2, 0, false);
     ggml_tensor* gate;
     if (gate_first) {
         gate = x_vec[0];
@@ -781,7 +769,7 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_silu_act(ggml_context* ctx, ggml_tensor*
         x    = x_vec[0];
         gate = x_vec[1];
     }
-
+    gate = ggml_cont(ctx, gate);
     gate = ggml_silu_inplace(ctx, gate);
 
     x = ggml_mul(ctx, x, gate);  // [ne3, ne2, ne1, ne0/2]

From 614f8736df54bbf7a20ecb324a821d0e505c6503 Mon Sep 17 00:00:00 2001
From: "Kirill A. Korinsky" <kirill@korins.ky>
Date: Sat, 13 Dec 2025 18:23:34 +0100
Subject: [PATCH 4/5] sync: update ggml (#1082)

---
 ggml            | 2 +-
 ggml_extend.hpp | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/ggml b/ggml
index 2d3876d..f5425c0 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit 2d3876d554551d35c06dccc5852be50d5fd2a275
+Subproject commit f5425c0ee5e582a7d64411f06139870bff3e52e0
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 26dff49..28fd018 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -1270,6 +1270,9 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_attention_ext(struct ggml_context
         }
 
         if (mask_in != nullptr) {
+            // the need for padding got removed in ggml 4767bda
+            // ensure we can still use the old version for now
+#ifdef GGML_KQ_MASK_PAD
             int mask_pad = 0;
             if (mask_in->ne[1] % GGML_KQ_MASK_PAD != 0) {
                 mask_pad = GGML_PAD(L_q, GGML_KQ_MASK_PAD) - mask_in->ne[1];
@@ -1277,6 +1280,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_attention_ext(struct ggml_context
             if (mask_pad > 0) {
                 mask_in = ggml_pad(ctx, mask_in, 0, mask_pad, 0, 0);
             }
+#endif
             mask_in = ggml_cast(ctx, mask_in, GGML_TYPE_F16);
         }
 

From 43a70e819b9254dee0d017305d6992f6bb27f850 Mon Sep 17 00:00:00 2001
From: leejet <leejet714@gmail.com>
Date: Sun, 14 Dec 2025 01:24:15 +0800
Subject: [PATCH 5/5] fix: add lora info to image metadata (#1086)

---
 examples/cli/main.cpp      | 2 +-
 examples/common/common.hpp | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index 417d211..22480d7 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -232,7 +232,7 @@ static std::string sd_basename(const std::string& path) {
 }
 
 std::string get_image_params(const SDCliParams& cli_params, const SDContextParams& ctx_params, const SDGenerationParams& gen_params, int64_t seed) {
-    std::string parameter_string = gen_params.prompt + "\n";
+    std::string parameter_string = gen_params.prompt_with_lora + "\n";
     if (gen_params.negative_prompt.size() != 0) {
         parameter_string += "Negative prompt: " + gen_params.negative_prompt + "\n";
     }
diff --git a/examples/common/common.hpp b/examples/common/common.hpp
index ccd01ce..bf38379 100644
--- a/examples/common/common.hpp
+++ b/examples/common/common.hpp
@@ -863,6 +863,7 @@ static bool is_absolute_path(const std::string& p) {
 
 struct SDGenerationParams {
     std::string prompt;
+    std::string prompt_with_lora; // for metadata record only
     std::string negative_prompt;
     int clip_skip   = -1;  // <= 0 represents unspecified
     int width       = 512;
@@ -1476,6 +1477,7 @@ struct SDGenerationParams {
     }
 
     bool process_and_check(SDMode mode, const std::string& lora_model_dir) {
+        prompt_with_lora = prompt;
         if (width <= 0) {
             fprintf(stderr, "error: the width must be greater than 0\n");
             return false;