feat: add sd3 flash attn support (#815 )

feat: add SmoothStep Scheduler (#813 )
chore: add install() support in CMakeLists.txt (#540 )
2025-12-13 05:48:56 +00:00 · 2025-09-11 23:24:29 +08:00 · 2025-09-11 23:17:46 +08:00 · 2025-09-11 22:24:16 +08:00 · 2025-09-11 22:16:05 +08:00
11 changed files with 105 additions and 35 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,10 +1,10 @@
 build*/
+cmake-build-*/
 test/
 .vscode/
+.idea/
 .cache/
 *.swp
-.vscode/
-.idea/
 *.bat
 *.bin
 *.exe
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -149,3 +149,7 @@ if (SD_BUILD_EXAMPLES)
    add_subdirectory(examples)
 endif()

+set(SD_PUBLIC_HEADERS stable-diffusion.h)
+set_target_properties(${SD_LIB} PROPERTIES PUBLIC_HEADER "${SD_PUBLIC_HEADERS}")
+
+install(TARGETS ${SD_LIB} LIBRARY PUBLIC_HEADER)
--- a/denoiser.hpp
+++ b/denoiser.hpp
@ -251,6 +251,35 @@ struct KarrasSchedule : SigmaSchedule {
    }
 };

+// Sigma schedule driven by a smoothstep curve: close to the Beta schedule,
+// but incredibly simple in code.
+struct SmoothStepSchedule : SigmaSchedule {
+    // Cubic smoothstep polynomial: x^2 * (3 - 2x).
+    static constexpr float smoothstep(float x) {
+        return x * x * (3.0f - 2.0f * x);
+    }
+
+    // Returns n sigmas followed by a trailing 0 (an empty vector when n == 0).
+    // The sigma_min / sigma_max arguments are unused: every sigma is obtained by
+    // passing a rounded, smoothstep-shaped timestep to t_to_sigma.
+    std::vector<float> get_sigmas(uint32_t n, float /*sigma_min*/, float /*sigma_max*/, t_to_sigma_t t_to_sigma) override {
+        std::vector<float> sigmas;
+        sigmas.reserve(n + 1);
+
+        const int last_timestep = TIMESTEPS - 1;
+        if (n == 0) {
+            return sigmas;
+        }
+        if (n == 1) {
+            // Single step: start from the last timestep and finish at 0.
+            sigmas.push_back(t_to_sigma(static_cast<float>(last_timestep)));
+            sigmas.push_back(0.f);
+            return sigmas;
+        }
+
+        const float step_count = static_cast<float>(n);
+        for (uint32_t step = 0; step < n; ++step) {
+            // u starts at 1 and decreases towards 0 as step advances.
+            const float u        = 1.f - static_cast<float>(step) / step_count;
+            const float timestep = std::round(smoothstep(u) * last_timestep);
+            sigmas.push_back(t_to_sigma(timestep));
+        }
+
+        sigmas.push_back(0.f);
+        return sigmas;
+    }
+};
+
 struct Denoiser {
    std::shared_ptr<SigmaSchedule> scheduler                                                 = std::make_shared<DiscreteSchedule>();
    virtual float sigma_min()                                                                = 0;
--- a/diffusion_model.hpp
+++ b/diffusion_model.hpp
@ -95,8 +95,9 @@ struct MMDiTModel : public DiffusionModel {

    MMDiTModel(ggml_backend_t backend,
               bool offload_params_to_cpu,
+               bool flash_attn                     = false,
               const String2GGMLType& tensor_types = {})
-        : mmdit(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model") {
+        : mmdit(backend, offload_params_to_cpu, flash_attn, tensor_types, "model.diffusion_model") {
    }

    std::string get_desc() {
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@ -238,7 +238,7 @@ void print_usage(int argc, const char* argv[]) {
    printf("  --skip-layers LAYERS               Layers to skip for SLG steps: (default: [7,8,9])\n");
    printf("  --skip-layer-start START           SLG enabling point: (default: 0.01)\n");
    printf("  --skip-layer-end END               SLG disabling point: (default: 0.2)\n");
-    printf("  --scheduler {discrete, karras, exponential, ays, gits} Denoiser sigma scheduler (default: discrete)\n");
+    printf("  --scheduler {discrete, karras, exponential, ays, gits, smoothstep} Denoiser sigma scheduler (default: discrete)\n");
    printf("  --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n");
    printf("                                     sampling method (default: \"euler_a\")\n");
    printf("  --steps  STEPS                     number of sample steps (default: 20)\n");
@ -251,7 +251,7 @@ void print_usage(int argc, const char* argv[]) {
    printf("  --high-noise-skip-layers LAYERS    (high noise) Layers to skip for SLG steps: (default: [7,8,9])\n");
    printf("  --high-noise-skip-layer-start      (high noise) SLG enabling point: (default: 0.01)\n");
    printf("  --high-noise-skip-layer-end END    (high noise) SLG disabling point: (default: 0.2)\n");
-    printf("  --high-noise-scheduler {discrete, karras, exponential, ays, gits} Denoiser sigma scheduler (default: discrete)\n");
+    printf("  --high-noise-scheduler {discrete, karras, exponential, ays, gits, smoothstep} Denoiser sigma scheduler (default: discrete)\n");
    printf("  --high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n");
    printf("                                     (high noise) sampling method (default: \"euler_a\")\n");
    printf("  --high-noise-steps  STEPS          (high noise) number of sample steps (default: -1 = auto)\n");
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@ -56,6 +56,25 @@
 #define __STATIC_INLINE__ static inline
 #endif

+// Log callback registered through ggml_log_set(): forwards each ggml message to the
+// LOG_* macro matching its level. Levels without a dedicated case are logged at
+// debug level. The third (user data) argument is unused.
+__STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const char* text, void*) {
+    // The LOG_* macros take a printf-style format string. `text` is produced by ggml
+    // and may contain '%', so it is passed as an argument to a constant "%s" format
+    // rather than being used as the format string itself.
+    switch (level) {
+        case GGML_LOG_LEVEL_DEBUG:
+            LOG_DEBUG("%s", text);
+            break;
+        case GGML_LOG_LEVEL_INFO:
+            LOG_INFO("%s", text);
+            break;
+        case GGML_LOG_LEVEL_WARN:
+            LOG_WARN("%s", text);
+            break;
+        case GGML_LOG_LEVEL_ERROR:
+            LOG_ERROR("%s", text);
+            break;
+        default:
+            LOG_DEBUG("%s", text);
+            break;
+    }
+}
+
 static_assert(GGML_MAX_NAME >= 128, "GGML_MAX_NAME must be at least 128");

 // n-mode tensor-matrix product
@ -124,13 +143,6 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_kronecker(ggml_context* ctx, struct g
                    b);
 }

-__STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const char* text, void* user_data) {
-    (void)level;
-    (void)user_data;
-    fputs(text, stderr);
-    fflush(stderr);
-}
-
 __STATIC_INLINE__ void ggml_tensor_set_f32_randn(struct ggml_tensor* tensor, std::shared_ptr<RNG> rng) {
    uint32_t n                        = (uint32_t)ggml_nelements(tensor);
    std::vector<float> random_numbers = rng->randn(n);
--- a/mmdit.hpp
+++ b/mmdit.hpp
@ -147,14 +147,16 @@ public:
    int64_t num_heads;
    bool pre_only;
    std::string qk_norm;
+    bool flash_attn;

 public:
    SelfAttention(int64_t dim,
                  int64_t num_heads   = 8,
                  std::string qk_norm = "",
                  bool qkv_bias       = false,
-                  bool pre_only       = false)
-        : num_heads(num_heads), pre_only(pre_only), qk_norm(qk_norm) {
+                  bool pre_only       = false,
+                  bool flash_attn     = false)
+        : num_heads(num_heads), pre_only(pre_only), qk_norm(qk_norm), flash_attn(flash_attn) {
        int64_t d_head = dim / num_heads;
        blocks["qkv"]  = std::shared_ptr<GGMLBlock>(new Linear(dim, dim * 3, qkv_bias));
        if (!pre_only) {
@ -206,8 +208,8 @@ public:
                                ggml_backend_t backend,
                                struct ggml_tensor* x) {
        auto qkv = pre_attention(ctx, x);
-        x        = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads);  // [N, n_token, dim]
-        x        = post_attention(ctx, x);                                                  // [N, n_token, dim]
+        x        = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, NULL, false, false, true);  // [N, n_token, dim]
+        x        = post_attention(ctx, x);                                                                            // [N, n_token, dim]
        return x;
    }
 };
@ -232,6 +234,7 @@ public:
    int64_t num_heads;
    bool pre_only;
    bool self_attn;
+    bool flash_attn;

 public:
    DismantledBlock(int64_t hidden_size,
@ -240,16 +243,17 @@ public:
                    std::string qk_norm = "",
                    bool qkv_bias       = false,
                    bool pre_only       = false,
-                    bool self_attn      = false)
+                    bool self_attn      = false,
+                    bool flash_attn     = false)
        : num_heads(num_heads), pre_only(pre_only), self_attn(self_attn) {
        // rmsnorm is always False
        // scale_mod_only is always False
        // swiglu is always False
        blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-06f, false));
-        blocks["attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qk_norm, qkv_bias, pre_only));
+        blocks["attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qk_norm, qkv_bias, pre_only, flash_attn));

        if (self_attn) {
-            blocks["attn2"] = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qk_norm, qkv_bias, false));
+            blocks["attn2"] = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qk_norm, qkv_bias, false, flash_attn));
        }

        if (!pre_only) {
@ -435,8 +439,8 @@ public:
            auto qkv2          = std::get<1>(qkv_intermediates);
            auto intermediates = std::get<2>(qkv_intermediates);

-            auto attn_out  = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads);     // [N, n_token, dim]
-            auto attn2_out = ggml_nn_attention_ext(ctx, backend, qkv2[0], qkv2[1], qkv2[2], num_heads);  // [N, n_token, dim]
+            auto attn_out  = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, NULL, false, false, flash_attn);     // [N, n_token, dim]
+            auto attn2_out = ggml_nn_attention_ext(ctx, backend, qkv2[0], qkv2[1], qkv2[2], num_heads, NULL, false, false, flash_attn);  // [N, n_token, dim]
            x              = post_attention_x(ctx,
                                              attn_out,
                                              attn2_out,
@ -452,7 +456,7 @@ public:
            auto qkv               = qkv_intermediates.first;
            auto intermediates     = qkv_intermediates.second;

-            auto attn_out = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads);  // [N, n_token, dim]
+            auto attn_out = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, NULL, false, false, flash_attn);  // [N, n_token, dim]
            x             = post_attention(ctx,
                                           attn_out,
                                           intermediates[0],
@ -468,6 +472,7 @@ public:
 __STATIC_INLINE__ std::pair<struct ggml_tensor*, struct ggml_tensor*>
 block_mixing(struct ggml_context* ctx,
             ggml_backend_t backend,
+             bool flash_attn,
             struct ggml_tensor* context,
             struct ggml_tensor* x,
             struct ggml_tensor* c,
@ -497,8 +502,8 @@ block_mixing(struct ggml_context* ctx,
        qkv.push_back(ggml_concat(ctx, context_qkv[i], x_qkv[i], 1));
    }

-    auto attn         = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], x_block->num_heads);  // [N, n_context + n_token, hidden_size]
-    attn              = ggml_cont(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3));                              // [n_context + n_token, N, hidden_size]
+    auto attn         = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], x_block->num_heads, NULL, false, false, flash_attn);  // [N, n_context + n_token, hidden_size]
+    attn              = ggml_cont(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3));                                                              // [n_context + n_token, N, hidden_size]
    auto context_attn = ggml_view_3d(ctx,
                                     attn,
                                     attn->ne[0],
@ -556,6 +561,8 @@ block_mixing(struct ggml_context* ctx,
 }

 struct JointBlock : public GGMLBlock {
+    bool flash_attn;
+
 public:
    JointBlock(int64_t hidden_size,
               int64_t num_heads,
@ -563,9 +570,11 @@ public:
               std::string qk_norm = "",
               bool qkv_bias       = false,
               bool pre_only       = false,
-               bool self_attn_x    = false) {
-        blocks["context_block"] = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, pre_only));
-        blocks["x_block"]       = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, false, self_attn_x));
+               bool self_attn_x    = false,
+               bool flash_attn     = false)
+        : flash_attn(flash_attn) {
+        blocks["context_block"] = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, pre_only, false, flash_attn));
+        blocks["x_block"]       = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, false, self_attn_x, flash_attn));
    }

    std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx,
@ -576,7 +585,7 @@ public:
        auto context_block = std::dynamic_pointer_cast<DismantledBlock>(blocks["context_block"]);
        auto x_block       = std::dynamic_pointer_cast<DismantledBlock>(blocks["x_block"]);

-        return block_mixing(ctx, backend, context, x, c, context_block, x_block);
+        return block_mixing(ctx, backend, flash_attn, context, x, c, context_block, x_block);
    }
 };

@ -634,6 +643,7 @@ protected:
    int64_t context_embedder_out_dim = 1536;
    int64_t hidden_size;
    std::string qk_norm;
+    bool flash_attn = false;

    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") {
        enum ggml_type wtype = GGML_TYPE_F32;
@ -641,7 +651,8 @@ protected:
    }

 public:
-    MMDiT(const String2GGMLType& tensor_types = {}) {
+    MMDiT(bool flash_attn = false, const String2GGMLType& tensor_types = {})
+        : flash_attn(flash_attn) {
        // input_size is always None
        // learn_sigma is always False
        // register_length is always 0
@ -709,7 +720,8 @@ public:
                                                                                                    qk_norm,
                                                                                                    true,
                                                                                                    i == depth - 1,
-                                                                                                    i <= d_self));
+                                                                                                    i <= d_self,
+                                                                                                    flash_attn));
        }

        blocks["final_layer"] = std::shared_ptr<GGMLBlock>(new FinalLayer(hidden_size, patch_size, out_channels));
@ -856,9 +868,10 @@ struct MMDiTRunner : public GGMLRunner {

    MMDiTRunner(ggml_backend_t backend,
                bool offload_params_to_cpu,
+                bool flash_attn,
                const String2GGMLType& tensor_types = {},
                const std::string prefix            = "")
-        : GGMLRunner(backend, offload_params_to_cpu), mmdit(tensor_types) {
+        : GGMLRunner(backend, offload_params_to_cpu), mmdit(flash_attn, tensor_types) {
        mmdit.init(params_ctx, tensor_types, prefix);
    }

@ -957,7 +970,7 @@ struct MMDiTRunner : public GGMLRunner {
        // ggml_backend_t backend    = ggml_backend_cuda_init(0);
        ggml_backend_t backend             = ggml_backend_cpu_init();
        ggml_type model_data_type          = GGML_TYPE_F16;
-        std::shared_ptr<MMDiTRunner> mmdit = std::shared_ptr<MMDiTRunner>(new MMDiTRunner(backend, false));
+        std::shared_ptr<MMDiTRunner> mmdit = std::shared_ptr<MMDiTRunner>(new MMDiTRunner(backend, false, false));
        {
            LOG_INFO("loading from '%s'", file_path.c_str());

--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@ -145,7 +145,6 @@ public:
 #endif
 #ifdef SD_USE_METAL
        LOG_DEBUG("Using Metal backend");
-        ggml_log_set(ggml_log_callback_default, nullptr);
        backend = ggml_backend_metal_init();
 #endif
 #ifdef SD_USE_VULKAN
@ -192,6 +191,8 @@ public:
            rng = std::make_shared<PhiloxRNG>();
        }

+        ggml_log_set(ggml_log_callback_default, nullptr);
+
        init_backend();

        ModelLoader model_loader;
@ -349,6 +350,7 @@ public:
                                                                     model_loader.tensor_storages_types);
                diffusion_model  = std::make_shared<MMDiTModel>(backend,
                                                               offload_params_to_cpu,
+                                                               sd_ctx_params->diffusion_flash_attn,
                                                               model_loader.tensor_storages_types);
            } else if (sd_version_is_flux(version)) {
                bool is_chroma = false;
@ -750,6 +752,10 @@ public:
                denoiser->scheduler          = std::make_shared<GITSSchedule>();
                denoiser->scheduler->version = version;
                break;
+            case SMOOTHSTEP:
+                LOG_INFO("Running with SmoothStep scheduler");
+                denoiser->scheduler = std::make_shared<SmoothStepSchedule>();
+                break;
            case DEFAULT:
                // Don't touch anything.
                break;
@ -1533,6 +1539,7 @@ const char* schedule_to_str[] = {
    "exponential",
    "ays",
    "gits",
+    "smoothstep",
 };

 const char* sd_schedule_name(enum scheduler_t scheduler) {
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@ -57,6 +57,7 @@ enum scheduler_t {
    EXPONENTIAL,
    AYS,
    GITS,
+    SMOOTHSTEP,
    SCHEDULE_COUNT
 };

--- a/upscaler.cpp
+++ b/upscaler.cpp
@ -19,13 +19,13 @@ struct UpscalerGGML {

    bool load_from_file(const std::string& esrgan_path,
                        bool offload_params_to_cpu) {
+        ggml_log_set(ggml_log_callback_default, nullptr);
 #ifdef SD_USE_CUDA
        LOG_DEBUG("Using CUDA backend");
        backend = ggml_backend_cuda_init(0);
 #endif
 #ifdef SD_USE_METAL
        LOG_DEBUG("Using Metal backend");
-        ggml_log_set(ggml_log_callback_default, nullptr);
        backend = ggml_backend_metal_init();
 #endif
 #ifdef SD_USE_VULKAN
--- a/util.cpp
+++ b/util.cpp
@ -414,7 +414,10 @@ void log_printf(sd_log_level_t level, const char* file, int line, const char* fo
    if (written >= 0 && written < LOG_BUFFER_SIZE) {
        vsnprintf(log_buffer + written, LOG_BUFFER_SIZE - written, format, args);
    }
-    strncat(log_buffer, "\n", LOG_BUFFER_SIZE - strlen(log_buffer));
+    size_t len = strlen(log_buffer);
+    if (log_buffer[len - 1] != '\n') {
+        strncat(log_buffer, "\n", LOG_BUFFER_SIZE - len);
+    }

    if (sd_log_cb) {
        sd_log_cb(level, log_buffer, sd_log_cb_data);
Author	SHA1	Message	Date
leejet	fce6afcc6a	feat: add sd3 flash attn support (#815 )	2025-09-11 23:24:29 +08:00
Erik Scholz	49d6570c43	feat: add SmoothStep Scheduler (#813 )	2025-09-11 23:17:46 +08:00
clibdev	6bbaf161ad	chore: add install() support in CMakeLists.txt (#540 )	2025-09-11 22:24:16 +08:00
clibdev	87cdbd5978	feat: use log_printf to print ggml logs (#545 )	2025-09-11 22:16:05 +08:00