From 6167e2927a0a60ec020bfd05307e2f050e76371c Mon Sep 17 00:00:00 2001
From: Seas0
Date: Sat, 2 Aug 2025 11:03:18 +0800
Subject: [PATCH 1/4] feat: support build against system installed GGML
 library (#749)

---
 CMakeLists.txt | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 06de0d5..dc8ca78 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,6 +33,7 @@ option(SD_SYCL                     "sd: sycl backend" OFF)
 option(SD_MUSA                     "sd: musa backend" OFF)
 option(SD_FAST_SOFTMAX             "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
 option(SD_BUILD_SHARED_LIBS        "sd: build shared libs" OFF)
+option(SD_USE_SYSTEM_GGML          "sd: use system-installed GGML library" OFF)
 #option(SD_BUILD_SERVER            "sd: build server example" ON)
 
 if(SD_CUDA)
@@ -124,7 +125,15 @@ add_definitions(-DGGML_MAX_NAME=128)
 # deps
 # Only add ggml if it hasn't been added yet
 if (NOT TARGET ggml)
-    add_subdirectory(ggml)
+    if (SD_USE_SYSTEM_GGML)
+        find_package(ggml REQUIRED)
+        if (NOT ggml_FOUND)
+            message(FATAL_ERROR "System-installed GGML library not found.")
+        endif()
+        add_library(ggml ALIAS ggml::ggml)
+    else()
+        add_subdirectory(ggml)
+    endif()
 endif()
 
 add_subdirectory(thirdparty)

From f7f05fb185f075e80cc22cda65b7dab64feec1ca Mon Sep 17 00:00:00 2001
From: Wagner Bruna
Date: Sat, 2 Aug 2025 14:24:40 -0300
Subject: [PATCH 2/4] chore: avoid setting GGML_MAX_NAME when building against
 external ggml (#751)

An external ggml will most likely have been built with the default
GGML_MAX_NAME value (64), which would be inconsistent with the value
set by our build (128). That would be an ODR violation, and it could
easily cause memory corruption issues due to the different
sizeof(struct ggml_tensor) values.

For now, when linking against an external ggml, we demand it has been
patched with a bigger GGML_MAX_NAME, since we can't check against a
value defined only at build time.
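
(Illustrative aside, not part of the patch: the size dependency comes
from the tensor name buffer that ggml embeds inline, roughly

    // abridged from ggml.h
    struct ggml_tensor {
        // ...
        char name[GGML_MAX_NAME];  // length fixed at compile time
        // ...
    };

so a library compiled with GGML_MAX_NAME=64 and a consumer compiled
with 128 disagree on sizeof(struct ggml_tensor) and on the offset of
every member after name; tensors handed across the library boundary
are then read and written at the wrong offsets. The static_assert
added to ggml_extend.hpp below turns that silent mismatch into a
compile-time error.)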
---
 CMakeLists.txt  | 6 ++++--
 ggml_extend.hpp | 2 ++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index dc8ca78..c0e5e15 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -119,8 +119,10 @@ endif()
 
 set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
 
-# see https://github.com/ggerganov/ggml/pull/682
-add_definitions(-DGGML_MAX_NAME=128)
+if (NOT SD_USE_SYSTEM_GGML)
+    # see https://github.com/ggerganov/ggml/pull/682
+    add_definitions(-DGGML_MAX_NAME=128)
+endif()
 
 # deps
 # Only add ggml if it hasn't been added yet
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index d4e4278..0be8d4b 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -56,6 +56,8 @@
 #define __STATIC_INLINE__ static inline
 #endif
 
+static_assert(GGML_MAX_NAME >= 128, "GGML_MAX_NAME must be at least 128");
+
 // n-mode trensor-matrix product
 // example: 2-mode product
 // A: [ne03, k, ne01, ne00]

From 5b8996f74ac4eb249de735bcbd2dce4b5d21d3c2 Mon Sep 17 00:00:00 2001
From: Daniele <57776841+daniandtheweb@users.noreply.github.com>
Date: Sat, 2 Aug 2025 17:25:17 +0000
Subject: [PATCH 3/4] Conv2D direct support (#744)

* Conv2DDirect for VAE stage

* Enable only for Vulkan, reduced duplicated code

* Cmake option to use conv2d direct

* conv2d direct always on for opencl

* conv direct as a flag

* fix merge typo

* Align conv2d behavior to flash attention's

* fix readme

* add conv2d direct for controlnet

* add conv2d direct for esrgan

* clean code, use enable_conv2d_direct/get_all_blocks

* format code

---------

Co-authored-by: leejet
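
A minimal usage sketch (illustrative, not part of the commit): callers
opt in through the two new context-parameter fields declared in the
stable-diffusion.h hunk below. The helper names here (sd_ctx_params_t,
sd_ctx_params_init, new_sd_ctx) follow the library's public C API and
are assumptions; only the two bool fields come from this patch.

    sd_ctx_params_t p;
    sd_ctx_params_init(&p);          // start from library defaults (assumed helper)
    p.diffusion_conv_direct = true;  // UNet + ControlNet Conv2d blocks
    p.vae_conv_direct       = true;  // VAE / TAESD Conv2d blocks
    sd_ctx_t* ctx = new_sd_ctx(&p);  // logs "Using Conv2d direct ..." per model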
---
 README.md             |  4 ++++
 control.hpp           | 11 ++++++++++
 esrgan.hpp            | 11 ++++++++++
 examples/cli/main.cpp | 15 +++++++++++++-
 ggml_extend.hpp       | 47 ++++++++++++++++++++++++++++++++++++++++++-
 stable-diffusion.cpp  | 16 +++++++++++++++
 stable-diffusion.h    |  5 ++++-
 tae.hpp               | 11 ++++++++++
 unet.hpp              | 12 +++++++++++
 upscaler.cpp          | 15 ++++++++++----
 vae.hpp               | 11 ++++++++++
 11 files changed, 151 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 89eb095..5a28052 100644
--- a/README.md
+++ b/README.md
@@ -341,6 +341,10 @@ arguments:
   --diffusion-fa                     use flash attention in the diffusion model (for low vram)
                                      Might lower quality, since it implies converting k and v to f16.
                                      This might crash if it is not supported by the backend.
+  --diffusion-conv-direct            use Conv2d direct in the diffusion model
+                                     This might crash if it is not supported by the backend.
+  --vae-conv-direct                  use Conv2d direct in the vae model (should improve the performance)
+                                     This might crash if it is not supported by the backend.
   --control-net-cpu                  keep controlnet in cpu (for low vram)
   --canny                            apply canny preprocessor (edge detection)
   --color                            colors the logging tags according to level
diff --git a/control.hpp b/control.hpp
index d8f81fc..63fe704 100644
--- a/control.hpp
+++ b/control.hpp
@@ -323,6 +323,17 @@ struct ControlNet : public GGMLRunner {
         control_net.init(params_ctx, tensor_types, "");
     }
 
+    void enable_conv2d_direct() {
+        std::vector<GGMLBlock*> blocks;
+        control_net.get_all_blocks(blocks);
+        for (auto block : blocks) {
+            if (block->get_desc() == "Conv2d") {
+                auto conv_block = (Conv2d*)block;
+                conv_block->enable_direct();
+            }
+        }
+    }
+
     ~ControlNet() {
         free_control_ctx();
     }
diff --git a/esrgan.hpp b/esrgan.hpp
index 4215db1..3e41a88 100644
--- a/esrgan.hpp
+++ b/esrgan.hpp
@@ -147,6 +147,17 @@ struct ESRGAN : public GGMLRunner {
         rrdb_net.init(params_ctx, tensor_types, "");
     }
 
+    void enable_conv2d_direct() {
+        std::vector<GGMLBlock*> blocks;
+        rrdb_net.get_all_blocks(blocks);
+        for (auto block : blocks) {
+            if (block->get_desc() == "Conv2d") {
+                auto conv_block = (Conv2d*)block;
+                conv_block->enable_direct();
+            }
+        }
+    }
+
     std::string get_desc() {
         return "esrgan";
     }
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index 140e384..ec04dfd 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -97,6 +97,8 @@ struct SDParams {
     bool clip_on_cpu           = false;
     bool vae_on_cpu            = false;
    bool diffusion_flash_attn  = false;
+    bool diffusion_conv_direct = false;
+    bool vae_conv_direct       = false;
     bool canny_preprocess      = false;
     bool color                 = false;
     int upscale_repeats        = 1;
@@ -142,6 +144,8 @@ void print_params(SDParams params) {
     printf("    controlnet cpu:    %s\n", params.control_net_cpu ? "true" : "false");
     printf("    vae decoder on cpu:%s\n", params.vae_on_cpu ? "true" : "false");
     printf("    diffusion flash attention:%s\n", params.diffusion_flash_attn ? "true" : "false");
+    printf("    diffusion Conv2d direct:%s\n", params.diffusion_conv_direct ? "true" : "false");
"true" : "false"); printf(" strength(control): %.2f\n", params.control_strength); printf(" prompt: %s\n", params.prompt.c_str()); printf(" negative_prompt: %s\n", params.negative_prompt.c_str()); @@ -232,6 +236,10 @@ void print_usage(int argc, const char* argv[]) { printf(" --diffusion-fa use flash attention in the diffusion model (for low vram)\n"); printf(" Might lower quality, since it implies converting k and v to f16.\n"); printf(" This might crash if it is not supported by the backend.\n"); + printf(" --diffusion-conv-direct use Conv2d direct in the diffusion model"); + printf(" This might crash if it is not supported by the backend.\n"); + printf(" --vae-conv-direct use Conv2d direct in the vae model (should improve the performance)"); + printf(" This might crash if it is not supported by the backend.\n"); printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n"); printf(" --canny apply canny preprocessor (edge detection)\n"); printf(" --color colors the logging tags according to level\n"); @@ -422,6 +430,8 @@ void parse_args(int argc, const char** argv, SDParams& params) { {"", "--clip-on-cpu", "", true, ¶ms.clip_on_cpu}, {"", "--vae-on-cpu", "", true, ¶ms.vae_on_cpu}, {"", "--diffusion-fa", "", true, ¶ms.diffusion_flash_attn}, + {"", "--diffusion-conv-direct", "", true, ¶ms.diffusion_conv_direct}, + {"", "--vae-conv-direct", "", true, ¶ms.vae_conv_direct}, {"", "--canny", "", true, ¶ms.canny_preprocess}, {"-v", "--verbos", "", true, ¶ms.verbose}, {"", "--color", "", true, ¶ms.color}, @@ -901,6 +911,8 @@ int main(int argc, const char* argv[]) { params.control_net_cpu, params.vae_on_cpu, params.diffusion_flash_attn, + params.diffusion_conv_direct, + params.vae_conv_direct, params.chroma_use_dit_mask, params.chroma_use_t5_mask, params.chroma_t5_mask_pad, @@ -1012,7 +1024,8 @@ int main(int argc, const char* argv[]) { int upscale_factor = 4; // unused for RealESRGAN_x4plus_anime_6B.pth if (params.esrgan_path.size() > 0 && params.upscale_repeats > 0) { upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(), - params.n_threads); + params.n_threads, + params.diffusion_conv_direct); if (upscaler_ctx == NULL) { printf("new_upscaler_ctx failed\n"); diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 0be8d4b..22dd88c 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -708,6 +708,25 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx, return x; } +__STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d_direct(struct ggml_context* ctx, + struct ggml_tensor* x, + struct ggml_tensor* w, + struct ggml_tensor* b, + int s0 = 1, + int s1 = 1, + int p0 = 0, + int p1 = 0, + int d0 = 1, + int d1 = 1) { + x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1); + if (b != NULL) { + b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1); + // b = ggml_repeat(ctx, b, x); + x = ggml_add(ctx, x, b); + } + return x; +} + // w: [OC,IC, KD, 1 * 1] // x: [N, IC, IH, IW] // b: [OC,] @@ -1377,6 +1396,19 @@ public: tensors[prefix + pair.first] = pair.second; } } + + virtual std::string get_desc() { + return "GGMLBlock"; + } + + void get_all_blocks(std::vector& result) { + result.push_back(this); + for (auto& block_iter : blocks) { + if (block_iter.second) { + block_iter.second->get_all_blocks(result); + } + } + } }; class UnaryBlock : public GGMLBlock { @@ -1466,6 +1498,7 @@ protected: std::pair padding; std::pair dilation; bool bias; + bool direct = false; void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types, const std::string 
prefix = "") { enum ggml_type wtype = GGML_TYPE_F16; @@ -1492,13 +1525,25 @@ public: dilation(dilation), bias(bias) {} + void enable_direct() { + direct = true; + } + + std::string get_desc() { + return "Conv2d"; + } + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* w = params["weight"]; struct ggml_tensor* b = NULL; if (bias) { b = params["bias"]; } - return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first); + if (direct) { + return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first); + } else { + return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first); + } } }; diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 2594ba2..c5448f9 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -374,6 +374,10 @@ public: model_loader.tensor_storages_types, version, sd_ctx_params->diffusion_flash_attn); + if (sd_ctx_params->diffusion_conv_direct) { + LOG_INFO("Using Conv2d direct in the diffusion model"); + std::dynamic_pointer_cast(diffusion_model)->unet.enable_conv2d_direct(); + } } cond_stage_model->alloc_params_buffer(); @@ -395,6 +399,10 @@ public: vae_decode_only, false, version); + if (sd_ctx_params->vae_conv_direct) { + LOG_INFO("Using Conv2d direct in the vae model"); + first_stage_model->enable_conv2d_direct(); + } first_stage_model->alloc_params_buffer(); first_stage_model->get_param_tensors(tensors, "first_stage_model"); } else { @@ -403,6 +411,10 @@ public: "decoder.layers", vae_decode_only, version); + if (sd_ctx_params->vae_conv_direct) { + LOG_INFO("Using Conv2d direct in the tae model"); + tae_first_stage->enable_conv2d_direct(); + } } // first_stage_model->get_param_tensors(tensors, "first_stage_model."); @@ -415,6 +427,10 @@ public: controlnet_backend = backend; } control_net = std::make_shared(controlnet_backend, model_loader.tensor_storages_types, version); + if (sd_ctx_params->diffusion_conv_direct) { + LOG_INFO("Using Conv2d direct in the control net"); + control_net->enable_conv2d_direct(); + } } if (strstr(SAFE_STR(sd_ctx_params->stacked_id_embed_dir), "v2")) { diff --git a/stable-diffusion.h b/stable-diffusion.h index a603259..e87ac2c 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -134,6 +134,8 @@ typedef struct { bool keep_control_net_on_cpu; bool keep_vae_on_cpu; bool diffusion_flash_attn; + bool diffusion_conv_direct; + bool vae_conv_direct; bool chroma_use_dit_mask; bool chroma_use_t5_mask; int chroma_t5_mask_pad; @@ -236,7 +238,8 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s typedef struct upscaler_ctx_t upscaler_ctx_t; SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path, - int n_threads); + int n_threads, + bool direct); SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx); SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor); diff --git a/tae.hpp b/tae.hpp index 51fb94f..4959bbd 100644 --- a/tae.hpp +++ b/tae.hpp @@ -206,6 +206,17 @@ struct TinyAutoEncoder : public GGMLRunner { taesd.init(params_ctx, tensor_types, prefix); } + void enable_conv2d_direct() { + std::vector blocks; + taesd.get_all_blocks(blocks); + for (auto block : blocks) { + if (block->get_desc() == "Conv2d") { + auto conv_block = (Conv2d*)block; + conv_block->enable_direct(); + } + } + } + 
     std::string get_desc() {
         return "taesd";
     }
diff --git a/unet.hpp b/unet.hpp
index 7ab4934..696bc6d 100644
--- a/unet.hpp
+++ b/unet.hpp
@@ -546,6 +546,18 @@ struct UNetModelRunner : public GGMLRunner {
         unet.init(params_ctx, tensor_types, prefix);
     }
 
+    void enable_conv2d_direct() {
+        std::vector<GGMLBlock*> blocks;
+        unet.get_all_blocks(blocks);
+        for (auto block : blocks) {
+            if (block->get_desc() == "Conv2d") {
+                LOG_DEBUG("block %s", block->get_desc().c_str());
+                auto conv_block = (Conv2d*)block;
+                conv_block->enable_direct();
+            }
+        }
+    }
+
     std::string get_desc() {
         return "unet";
     }
diff --git a/upscaler.cpp b/upscaler.cpp
index 1372134..599f263 100644
--- a/upscaler.cpp
+++ b/upscaler.cpp
@@ -9,9 +9,12 @@ struct UpscalerGGML {
     std::shared_ptr<ESRGAN> esrgan_upscaler;
     std::string esrgan_path;
     int n_threads;
+    bool direct = false;
 
-    UpscalerGGML(int n_threads)
-        : n_threads(n_threads) {
+    UpscalerGGML(int n_threads,
+                 bool direct = false)
+        : n_threads(n_threads),
+          direct(direct) {
     }
 
     bool load_from_file(const std::string& esrgan_path) {
@@ -47,6 +50,9 @@ struct UpscalerGGML {
         }
         LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
         esrgan_upscaler = std::make_shared<ESRGAN>(backend, model_loader.tensor_storages_types);
+        if (direct) {
+            esrgan_upscaler->enable_conv2d_direct();
+        }
         if (!esrgan_upscaler->load_from_file(esrgan_path)) {
             return false;
         }
@@ -104,14 +110,15 @@ struct upscaler_ctx_t {
 };
 
 upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
-                                 int n_threads) {
+                                 int n_threads,
+                                 bool direct = false) {
     upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t));
     if (upscaler_ctx == NULL) {
         return NULL;
     }
     std::string esrgan_path(esrgan_path_c_str);
 
-    upscaler_ctx->upscaler = new UpscalerGGML(n_threads);
+    upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct);
     if (upscaler_ctx->upscaler == NULL) {
         return NULL;
     }
diff --git a/vae.hpp b/vae.hpp
index 41f53ee..bdf160b 100644
--- a/vae.hpp
+++ b/vae.hpp
@@ -534,6 +534,17 @@ struct AutoEncoderKL : public GGMLRunner {
         ae.init(params_ctx, tensor_types, prefix);
     }
 
+    void enable_conv2d_direct() {
+        std::vector<GGMLBlock*> blocks;
+        ae.get_all_blocks(blocks);
+        for (auto block : blocks) {
+            if (block->get_desc() == "Conv2d") {
+                auto conv_block = (Conv2d*)block;
+                conv_block->enable_direct();
+            }
+        }
+    }
+
     std::string get_desc() {
         return "vae";
     }

From 5900ef6605c6fbf7934239f795c13c97bc993853 Mon Sep 17 00:00:00 2001
From: leejet
Date: Sun, 3 Aug 2025 01:29:40 +0800
Subject: [PATCH 4/4] sync: update ggml, make cuda im2col a little faster

---
 ggml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml b/ggml
index b96890f..7dee1d6 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit b96890f3ab5ffbdbe56bc126df5366c34bd08d39
+Subproject commit 7dee1d6a1e7611f238d09be96738388da97c88ed
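
Usage sketch for the series as a whole (illustrative, not part of any
patch): configuring against a system-wide ggml and enabling the new
direct-convolution paths from the CLI; the binary name, model file,
and prompt are assumptions.

    cmake -B build -DSD_USE_SYSTEM_GGML=ON
    cmake --build build --config Release
    ./build/bin/sd -m sd-v1-4.ckpt -p "a lovely cat" \
        --diffusion-conv-direct --vae-conv-direct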