Merge branch 'master' into wan

2026-06-25 15:46:40 +00:00 · 2025-08-29 23:08:32 +08:00 · 2025-08-29 23:08:32 +08:00 · b05b2b29a3
commit b05b2b29a3
parent 2410ce3dee 5900ef6605
13 changed files with 167 additions and 8 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -33,6 +33,7 @@ option(SD_SYCL                       "sd: sycl backend" OFF)
 option(SD_MUSA                       "sd: musa backend" OFF)
 option(SD_FAST_SOFTMAX               "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
 option(SD_BUILD_SHARED_LIBS          "sd: build shared libs" OFF)
+option(SD_USE_SYSTEM_GGML            "sd: use system-installed GGML library" OFF)
 #option(SD_BUILD_SERVER               "sd: build server example"                           ON)

 if(SD_CUDA)
@ -118,13 +119,23 @@ endif()

 set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)

-# see https://github.com/ggerganov/ggml/pull/682
-add_definitions(-DGGML_MAX_NAME=128)
+if (NOT SD_USE_SYSTEM_GGML)
+    # see https://github.com/ggerganov/ggml/pull/682
+    add_definitions(-DGGML_MAX_NAME=128)
+endif()

 # deps
 # Only add ggml if it hasn't been added yet
 if (NOT TARGET ggml)
-    add_subdirectory(ggml)
+    if (SD_USE_SYSTEM_GGML)
+        find_package(ggml REQUIRED)
+        if (NOT ggml_FOUND)
+            message(FATAL_ERROR "System-installed GGML library not found.")
+        endif()
+        add_library(ggml ALIAS ggml::ggml)
+    else()
+        add_subdirectory(ggml)
+    endif()
 endif()

 add_subdirectory(thirdparty)
--- a/README.md
+++ b/README.md
@ -341,6 +341,10 @@ arguments:
  --diffusion-fa                     use flash attention in the diffusion model (for low vram)
                                     Might lower quality, since it implies converting k and v to f16.
                                     This might crash if it is not supported by the backend.
+  --diffusion-conv-direct            use Conv2d direct in the diffusion model
+                                     This might crash if it is not supported by the backend.
+  --vae-conv-direct                  use Conv2d direct in the vae model (should improve the performance)
+                                     This might crash if it is not supported by the backend.
  --control-net-cpu                  keep controlnet in cpu (for low vram)
  --canny                            apply canny preprocessor (edge detection)
  --color                            colors the logging tags according to level
--- a/control.hpp
+++ b/control.hpp
@ -324,6 +324,17 @@ struct ControlNet : public GGMLRunner {
        control_net.init(params_ctx, tensor_types, "");
    }

+    // Flags every block under control_net that reports itself as "Conv2d" to use the direct conv2d path.
+    void enable_conv2d_direct() {
+        std::vector<GGMLBlock*> all_blocks;
+        control_net.get_all_blocks(all_blocks);
+        for (GGMLBlock* blk : all_blocks) {
+            if (blk->get_desc() == "Conv2d") {
+                static_cast<Conv2d*>(blk)->enable_direct();
+            }
+        }
+    }
+
    ~ControlNet() {
        free_control_ctx();
    }
--- a/esrgan.hpp
+++ b/esrgan.hpp
@ -149,6 +149,17 @@ struct ESRGAN : public GGMLRunner {
        rrdb_net.init(params_ctx, tensor_types, "");
    }

+    // Flags every block under rrdb_net that reports itself as "Conv2d" to use the direct conv2d path.
+    void enable_conv2d_direct() {
+        std::vector<GGMLBlock*> all_blocks;
+        rrdb_net.get_all_blocks(all_blocks);
+        for (GGMLBlock* blk : all_blocks) {
+            if (blk->get_desc() == "Conv2d") {
+                static_cast<Conv2d*>(blk)->enable_direct();
+            }
+        }
+    }
+
    std::string get_desc() {
        return "esrgan";
    }
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@ -103,6 +103,8 @@ struct SDParams {
    bool clip_on_cpu           = false;
    bool vae_on_cpu            = false;
    bool diffusion_flash_attn  = false;
+    bool diffusion_conv_direct = false;
+    bool vae_conv_direct       = false;
    bool canny_preprocess      = false;
    bool color                 = false;
    int upscale_repeats        = 1;
@ -153,6 +155,8 @@ void print_params(SDParams params) {
    printf("    control_net_cpu:    %s\n", params.control_net_cpu ? "true" : "false");
    printf("    vae decoder on cpu:%s\n", params.vae_on_cpu ? "true" : "false");
    printf("    diffusion flash attention:%s\n", params.diffusion_flash_attn ? "true" : "false");
+    printf("    diffusion Conv2d direct:%s\n", params.diffusion_conv_direct ? "true" : "false");
+    printf("    vae Conv2d direct:%s\n", params.vae_conv_direct ? "true" : "false");
    printf("    strength(control): %.2f\n", params.control_strength);
    printf("    prompt:            %s\n", params.prompt.c_str());
    printf("    negative_prompt:   %s\n", params.negative_prompt.c_str());
@ -255,6 +259,10 @@ void print_usage(int argc, const char* argv[]) {
    printf("  --diffusion-fa                     use flash attention in the diffusion model (for low vram)\n");
    printf("                                     Might lower quality, since it implies converting k and v to f16.\n");
    printf("                                     This might crash if it is not supported by the backend.\n");
+    printf("  --diffusion-conv-direct            use Conv2d direct in the diffusion model\n");
+    printf("                                     This might crash if it is not supported by the backend.\n");
+    printf("  --vae-conv-direct                  use Conv2d direct in the vae model (should improve the performance)\n");
+    printf("                                     This might crash if it is not supported by the backend.\n");
    printf("  --control-net-cpu                  keep controlnet in cpu (for low vram)\n");
    printf("  --canny                            apply canny preprocessor (edge detection)\n");
    printf("  --color                            colors the logging tags according to level\n");
@ -495,6 +503,8 @@ void parse_args(int argc, const char** argv, SDParams& params) {
        {"", "--clip-on-cpu", "", true, &params.clip_on_cpu},
        {"", "--vae-on-cpu", "", true, &params.vae_on_cpu},
        {"", "--diffusion-fa", "", true, &params.diffusion_flash_attn},
+        {"", "--diffusion-conv-direct", "", true, &params.diffusion_conv_direct},
+        {"", "--vae-conv-direct", "", true, &params.vae_conv_direct},
        {"", "--canny", "", true, &params.canny_preprocess},
        {"-v", "--verbos", "", true, &params.verbose},
        {"", "--color", "", true, &params.color},
@ -1077,6 +1087,8 @@ int main(int argc, const char* argv[]) {
        params.control_net_cpu,
        params.vae_on_cpu,
        params.diffusion_flash_attn,
+        params.diffusion_conv_direct,
+        params.vae_conv_direct,
        params.chroma_use_dit_mask,
        params.chroma_use_t5_mask,
        params.chroma_t5_mask_pad,
@ -1184,6 +1196,7 @@ int main(int argc, const char* argv[]) {
    if (params.esrgan_path.size() > 0 && params.upscale_repeats > 0) {
        upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(),
                                                        params.offload_params_to_cpu,
+                                                        params.diffusion_conv_direct,
                                                        params.n_threads);

        if (upscaler_ctx == NULL) {
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit 089530bb72e70aa9f9ecb98137dfd891c2be20c1
+Subproject commit 9caa235fe8e7e0ed0cbb599c54ec1cf07a9b7b73
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@ -56,6 +56,8 @@
 #define __STATIC_INLINE__ static inline
 #endif

+static_assert(GGML_MAX_NAME >= 128, "GGML_MAX_NAME must be at least 128");
+
 // n-mode trensor-matrix product
 // example: 2-mode product
 // A: [ne03, k, ne01, ne00]
@ -839,6 +841,27 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx,

 // w: [OC*IC, KD, KH, KW]
 // x: [N*IC, ID, IH, IW]
+__STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d_direct(struct ggml_context* ctx,
+                                                             struct ggml_tensor* x,
+                                                             struct ggml_tensor* w,
+                                                             struct ggml_tensor* b,
+                                                             int s0 = 1,
+                                                             int s1 = 1,
+                                                             int p0 = 0,
+                                                             int p1 = 0,
+                                                             int d0 = 1,
+                                                             int d1 = 1) {
+    x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1);
+    if (b != NULL) {
+        b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
+        // b = ggml_repeat(ctx, b, x);  // NOTE(review): left disabled; presumably ggml_add broadcasts the reshaped bias over x - confirm
+        x = ggml_add(ctx, x, b);
+    }
+    return x;
+}
+
+// w: [OC, IC, KD, 1 * 1]
+// x: [N, IC, IH, IW]
 // b: [OC,]
 // result: [N*OC, OD, OH, OW]
 __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_3d(struct ggml_context* ctx,
@ -1607,6 +1630,19 @@ public:
            tensors[prefix + pair.first] = pair.second;
        }
    }
+
+    virtual std::string get_desc() {
+        return "GGMLBlock";
+    }
+
+    void get_all_blocks(std::vector<GGMLBlock*>& result) {
+        result.push_back(this);
+        for (auto& block_iter : blocks) {
+            if (block_iter.second) {
+                block_iter.second->get_all_blocks(result);
+            }
+        }
+    }
 };

 class UnaryBlock : public GGMLBlock {
@ -1703,6 +1739,7 @@ protected:
    std::pair<int, int> padding;
    std::pair<int, int> dilation;
    bool bias;
+    bool direct = false;

    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types, const std::string prefix = "") {
        enum ggml_type wtype = GGML_TYPE_F16;
@ -1729,13 +1766,25 @@ public:
          dilation(dilation),
          bias(bias) {}

+    void enable_direct() {
+        direct = true;
+    }
+
+    std::string get_desc() override {  // tag that enable_conv2d_direct() matches before downcasting to Conv2d
+        return "Conv2d";
+    }
+
    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
        struct ggml_tensor* w = params["weight"];
        struct ggml_tensor* b = NULL;
        if (bias) {
            b = params["bias"];
        }
-        return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
+        if (direct) {
+            return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
+        } else {
+            return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
+        }
    }
 };

--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@ -428,6 +428,10 @@ public:
                                                              model_loader.tensor_storages_types,
                                                              version,
                                                              sd_ctx_params->diffusion_flash_attn);
+                if (sd_ctx_params->diffusion_conv_direct) {
+                    LOG_INFO("Using Conv2d direct in the diffusion model");
+                    std::dynamic_pointer_cast<UNetModel>(diffusion_model)->unet.enable_conv2d_direct();
+                }
            }

            cond_stage_model->alloc_params_buffer();
@ -465,6 +469,10 @@ public:
                                                                    vae_decode_only,
                                                                    false,
                                                                    version);
+                if (sd_ctx_params->vae_conv_direct) {
+                    LOG_INFO("Using Conv2d direct in the vae model");
+                    first_stage_model->enable_conv2d_direct();
+                }
                first_stage_model->alloc_params_buffer();
                first_stage_model->get_param_tensors(tensors, "first_stage_model");
            } else {
@ -474,6 +482,10 @@ public:
                                                                    "decoder.layers",
                                                                    vae_decode_only,
                                                                    version);
+                if (sd_ctx_params->vae_conv_direct) {
+                    LOG_INFO("Using Conv2d direct in the tae model");
+                    tae_first_stage->enable_conv2d_direct();
+                }
            }
            // first_stage_model->get_param_tensors(tensors, "first_stage_model.");

@ -489,6 +501,10 @@ public:
                                                           offload_params_to_cpu,
                                                           model_loader.tensor_storages_types,
                                                           version);
+                if (sd_ctx_params->diffusion_conv_direct) {
+                    LOG_INFO("Using Conv2d direct in the control net");
+                    control_net->enable_conv2d_direct();
+                }
            }

            if (strstr(SAFE_STR(sd_ctx_params->stacked_id_embed_dir), "v2")) {
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@ -136,6 +136,8 @@ typedef struct {
    bool keep_control_net_on_cpu;
    bool keep_vae_on_cpu;
    bool diffusion_flash_attn;
+    bool diffusion_conv_direct;
+    bool vae_conv_direct;
    bool chroma_use_dit_mask;
    bool chroma_use_t5_mask;
    int chroma_t5_mask_pad;
@ -245,6 +247,7 @@ typedef struct upscaler_ctx_t upscaler_ctx_t;

 SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
                                        bool offload_params_to_cpu,
+                                        bool direct,
                                        int n_threads);
 SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);

--- a/tae.hpp
+++ b/tae.hpp
@ -207,6 +207,17 @@ struct TinyAutoEncoder : public GGMLRunner {
        taesd.init(params_ctx, tensor_types, prefix);
    }

+    // Flags every block under taesd that reports itself as "Conv2d" to use the direct conv2d path.
+    void enable_conv2d_direct() {
+        std::vector<GGMLBlock*> all_blocks;
+        taesd.get_all_blocks(all_blocks);
+        for (GGMLBlock* blk : all_blocks) {
+            if (blk->get_desc() == "Conv2d") {
+                static_cast<Conv2d*>(blk)->enable_direct();
+            }
+        }
+    }
+
    std::string get_desc() {
        return "taesd";
    }
--- a/unet.hpp
+++ b/unet.hpp
@ -547,6 +547,18 @@ struct UNetModelRunner : public GGMLRunner {
        unet.init(params_ctx, tensor_types, prefix);
    }

+    // Flags every block under unet that reports itself as "Conv2d" to use the direct conv2d path.
+    void enable_conv2d_direct() {
+        std::vector<GGMLBlock*> all_blocks;
+        unet.get_all_blocks(all_blocks);
+        for (GGMLBlock* blk : all_blocks) {
+            if (blk->get_desc() == "Conv2d") {
+                LOG_DEBUG("block %s", blk->get_desc().c_str());
+                static_cast<Conv2d*>(blk)->enable_direct();
+            }
+        }
+    }
+
    std::string get_desc() {
        return "unet";
    }
--- a/upscaler.cpp
+++ b/upscaler.cpp
@ -9,9 +9,12 @@ struct UpscalerGGML {
    std::shared_ptr<ESRGAN> esrgan_upscaler;
    std::string esrgan_path;
    int n_threads;
+    bool direct = false;

-    UpscalerGGML(int n_threads)
-        : n_threads(n_threads) {
+    UpscalerGGML(int n_threads,
+                 bool direct = false)
+        : n_threads(n_threads),
+          direct(direct) {
    }

    bool load_from_file(const std::string& esrgan_path,
@ -48,6 +51,9 @@ struct UpscalerGGML {
        }
        LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
        esrgan_upscaler = std::make_shared<ESRGAN>(backend, offload_params_to_cpu, model_loader.tensor_storages_types);
+        if (direct) {
+            esrgan_upscaler->enable_conv2d_direct();
+        }
        if (!esrgan_upscaler->load_from_file(esrgan_path)) {
            return false;
        }
@ -106,6 +112,7 @@ struct upscaler_ctx_t {

 upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
                                 bool offload_params_to_cpu,
+                                 bool direct,
                                 int n_threads) {
    upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t));
    if (upscaler_ctx == NULL) {
@ -113,7 +120,7 @@ upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
    }
    std::string esrgan_path(esrgan_path_c_str);

-    upscaler_ctx->upscaler = new UpscalerGGML(n_threads);
+    upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct);
    if (upscaler_ctx->upscaler == NULL) {
        return NULL;
    }
--- a/vae.hpp
+++ b/vae.hpp
@ -546,6 +546,17 @@ struct AutoEncoderKL : public VAE {
        ae.init(params_ctx, tensor_types, prefix);
    }

+    // Flags every block under ae that reports itself as "Conv2d" to use the direct conv2d path.
+    void enable_conv2d_direct() {
+        std::vector<GGMLBlock*> all_blocks;
+        ae.get_all_blocks(all_blocks);
+        for (GGMLBlock* blk : all_blocks) {
+            if (blk->get_desc() == "Conv2d") {
+                static_cast<Conv2d*>(blk)->enable_direct();
+            }
+        }
+    }
+
    std::string get_desc() {
        return "vae";
    }