chore: avoid warnings when building on linux

feat: add OpenCL backend support (#680 )
fix: avoid crash on sdxl loras (#658 )
2025-12-13 05:48:56 +00:00 · 2025-06-30 23:49:52 +08:00 · 2025-06-30 23:32:23 +08:00 · 2025-06-30 23:29:32 +08:00 · 2025-06-30 23:27:40 +08:00 · 2025-06-30 23:23:21 +08:00
14 changed files with 142 additions and 41 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -28,6 +28,7 @@ option(SD_CUDA                       "sd: cuda backend" OFF)
 option(SD_HIPBLAS                    "sd: rocm backend" OFF)
 option(SD_METAL                      "sd: metal backend" OFF)
 option(SD_VULKAN                     "sd: vulkan backend" OFF)
+option(SD_OPENCL                     "sd: opencl backend" OFF)
 option(SD_SYCL                       "sd: sycl backend" OFF)
 option(SD_MUSA                       "sd: musa backend" OFF)
 option(SD_FAST_SOFTMAX               "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
@ -52,6 +53,12 @@ if (SD_VULKAN)
    add_definitions(-DSD_USE_VULKAN)
 endif ()

+if (SD_OPENCL)
+    message("-- Use OpenCL as backend stable-diffusion")
+    set(GGML_OPENCL ON)
+    add_definitions(-DSD_USE_OPENCL)
+endif ()
+
 if (SD_HIPBLAS)
    message("-- Use HIPBLAS as backend stable-diffusion")
    set(GGML_HIP ON)
--- a/Dockerfile.musa
+++ b/Dockerfile.musa
@ -2,14 +2,17 @@ ARG MUSA_VERSION=rc3.1.1

 FROM mthreads/musa:${MUSA_VERSION}-devel-ubuntu22.04 as build

-RUN apt-get update && apt-get install -y cmake
+RUN apt-get update && apt-get install -y ccache cmake git

 WORKDIR /sd.cpp

 COPY . .

 RUN mkdir build && cd build && \
-    cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release && \
+    cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ \
+        -DCMAKE_C_FLAGS="${CMAKE_C_FLAGS} -fopenmp -I/usr/lib/llvm-14/lib/clang/14.0.0/include -L/usr/lib/llvm-14/lib" \
+        -DCMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fopenmp -I/usr/lib/llvm-14/lib/clang/14.0.0/include -L/usr/lib/llvm-14/lib" \
+        -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release && \
    cmake --build . --config Release

 FROM mthreads/musa:${MUSA_VERSION}-runtime-ubuntu22.04 as runtime
--- a/README.md
+++ b/README.md
@ -22,7 +22,7 @@ Inference of Stable Diffusion and Flux in pure C/C++
 - Accelerated memory-efficient CPU inference
    - Only requires ~2.3GB when using txt2img with fp16 precision to generate a 512x512 image, enabling Flash Attention just requires ~1.8GB.
 - AVX, AVX2 and AVX512 support for x86 architectures
- Full CUDA, Metal, Vulkan and SYCL backend for GPU acceleration.
+- Full CUDA, Metal, Vulkan, OpenCL and SYCL backend for GPU acceleration.
 - Can load ckpt, safetensors and diffusers models/checkpoints. Standalone VAEs models
    - No need to convert to `.ggml` or `.gguf` anymore!
 - Flash Attention for memory usage optimization
@ -160,6 +160,73 @@ cmake .. -DSD_VULKAN=ON
 cmake --build . --config Release
 ```

+##### Using OpenCL (for Adreno GPU)
+
+Currently, it supports only Adreno GPUs and is primarily optimized for Q4_0 type
+
+To build for Windows ARM please refers to [Windows 11 Arm64
+](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/OPENCL.md#windows-11-arm64)
+
+Building for Android:
+
+  Android NDK:
+       Download and install the Android NDK from the [official Android developer site](https://developer.android.com/ndk/downloads).
+
+Setup OpenCL Dependencies for NDK:
+
+You need to provide OpenCL headers and the ICD loader library to your NDK sysroot.
+
+*   OpenCL Headers:
+    ```bash
+    # In a temporary working directory
+    git clone https://github.com/KhronosGroup/OpenCL-Headers
+    cd OpenCL-Headers
+    # Replace <YOUR_NDK_PATH> with your actual NDK installation path
+    # e.g., cp -r CL /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
+    sudo cp -r CL <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
+    cd ..
+    ```
+
+*   OpenCL ICD Loader:
+    ```bash
+    # In the same temporary working directory
+    git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
+    cd OpenCL-ICD-Loader
+    mkdir build_ndk && cd build_ndk
+
+    # Replace <YOUR_NDK_PATH> in the CMAKE_TOOLCHAIN_FILE and OPENCL_ICD_LOADER_HEADERS_DIR
+    cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
+      -DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
+      -DOPENCL_ICD_LOADER_HEADERS_DIR=<YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
+      -DANDROID_ABI=arm64-v8a \
+      -DANDROID_PLATFORM=24 \
+      -DANDROID_STL=c++_shared
+
+    ninja
+    # Replace <YOUR_NDK_PATH>
+    # e.g., cp libOpenCL.so /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
+    sudo cp libOpenCL.so <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
+    cd ../..
+    ```
+
+Build `stable-diffusion.cpp` for Android with OpenCL:
+
+```bash
+mkdir build-android && cd build-android
+
+# Replace <YOUR_NDK_PATH> with your actual NDK installation path
+# e.g., -DCMAKE_TOOLCHAIN_FILE=/path/to/android-ndk-r26c/build/cmake/android.toolchain.cmake
+cmake .. -G Ninja \
+  -DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
+  -DANDROID_ABI=arm64-v8a \
+  -DANDROID_PLATFORM=android-28 \
+  -DGGML_OPENMP=OFF \
+  -DSD_OPENCL=ON
+
+ninja
+```
+*(Note: Don't forget to include `LD_LIBRARY_PATH=/vendor/lib64` in your command line before running the binary)*
+
 ##### Using SYCL

 Using SYCL makes the computation run on the Intel GPU. Please make sure you have installed the related driver and [Intel® oneAPI Base toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) before start. More details and steps can refer to [llama.cpp SYCL backend](https://github.com/ggerganov/llama.cpp/blob/master/docs/backend/SYCL.md#linux).
--- a/clip.hpp
+++ b/clip.hpp
@ -678,8 +678,8 @@ public:
    bool with_final_ln        = true;

    CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
-                  int clip_skip_value = -1,
-                  bool with_final_ln  = true)
+                  bool with_final_ln  = true,
+                  int clip_skip_value = -1)
        : version(version), with_final_ln(with_final_ln) {
        if (version == OPEN_CLIP_VIT_H_14) {
            hidden_size       = 1024;
@ -701,7 +701,7 @@ public:

    void set_clip_skip(int skip) {
        if (skip <= 0) {
-            return;
+            skip = -1;
        }
        clip_skip = skip;
    }
@ -871,9 +871,9 @@ struct CLIPTextModelRunner : public GGMLRunner {
                        std::map<std::string, enum ggml_type>& tensor_types,
                        const std::string prefix,
                        CLIPVersion version = OPENAI_CLIP_VIT_L_14,
-                        int clip_skip_value = 1,
-                        bool with_final_ln  = true)
-        : GGMLRunner(backend), model(version, clip_skip_value, with_final_ln) {
+                        bool with_final_ln  = true,
+                        int clip_skip_value = -1)
+        : GGMLRunner(backend), model(version, with_final_ln, clip_skip_value) {
        model.init(params_ctx, tensor_types, prefix);
    }

--- a/common.hpp
+++ b/common.hpp
@ -56,7 +56,7 @@ public:
        // x: [N, channels, h, w]
        auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);

-        x = ggml_upscale(ctx, x, 2);  // [N, channels, h*2, w*2]
+        x = ggml_upscale(ctx, x, 2, GGML_SCALE_MODE_NEAREST);  // [N, channels, h*2, w*2]
        x = conv->forward(ctx, x);    // [N, out_channels, h*2, w*2]
        return x;
    }
--- a/conditioner.hpp
+++ b/conditioner.hpp
@ -63,23 +63,24 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                      PMVersion pv      = PM_VERSION_1,
                                      int clip_skip     = -1)
        : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
+        if (sd_version_is_sd1(version)) {
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14);
+        } else if (sd_version_is_sd2(version)) {
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14);
+        } else if (sd_version_is_sdxl(version)) {
+            text_model  = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
+            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
+        }
+        set_clip_skip(clip_skip);
+    }
+
+    void set_clip_skip(int clip_skip) {
        if (clip_skip <= 0) {
            clip_skip = 1;
            if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) {
                clip_skip = 2;
            }
        }
-        if (sd_version_is_sd1(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip);
-        } else if (sd_version_is_sd2(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, clip_skip);
-        } else if (sd_version_is_sdxl(version)) {
-            text_model  = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
-            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
-        }
-    }
-
-    void set_clip_skip(int clip_skip) {
        text_model->set_clip_skip(clip_skip);
        if (sd_version_is_sdxl(version)) {
            text_model2->set_clip_skip(clip_skip);
@ -665,15 +666,16 @@ struct SD3CLIPEmbedder : public Conditioner {
                    std::map<std::string, enum ggml_type>& tensor_types,
                    int clip_skip = -1)
        : clip_g_tokenizer(0) {
-        if (clip_skip <= 0) {
-            clip_skip = 2;
-        }
-        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
-        clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
+        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
+        clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
        t5     = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
+        set_clip_skip(clip_skip);
    }

    void set_clip_skip(int clip_skip) {
+        if (clip_skip <= 0) {
+            clip_skip = 2;
+        }
        clip_l->set_clip_skip(clip_skip);
        clip_g->set_clip_skip(clip_skip);
    }
@ -1010,14 +1012,15 @@ struct FluxCLIPEmbedder : public Conditioner {
    FluxCLIPEmbedder(ggml_backend_t backend,
                     std::map<std::string, enum ggml_type>& tensor_types,
                     int clip_skip = -1) {
-        if (clip_skip <= 0) {
-            clip_skip = 2;
-        }
-        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, true);
+        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
        t5     = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
+        set_clip_skip(clip_skip);
    }

    void set_clip_skip(int clip_skip) {
+        if (clip_skip <= 0) {
+            clip_skip = 2;
+        }
        clip_l->set_clip_skip(clip_skip);
    }

@ -1422,4 +1425,4 @@ struct PixArtCLIPEmbedder : public Conditioner {
    }
 };

-#endif
+#endif
--- a/esrgan.hpp
+++ b/esrgan.hpp
@ -130,8 +130,8 @@ public:
        body_feat = conv_body->forward(ctx, body_feat);
        feat      = ggml_add(ctx, feat, body_feat);
        // upsample
-        feat     = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx, feat, 2)));
-        feat     = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx, feat, 2)));
+        feat     = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
+        feat     = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
        auto out = conv_last->forward(ctx, lrelu(ctx, conv_hr->forward(ctx, feat)));
        return out;
    }
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit ff9052988b76e137bcf92bb335733933ca196ac0
+Subproject commit 9e4bee1c5afc2d677a5b32ecb90cbdb483e81fff
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@ -39,6 +39,10 @@
 #include "ggml-vulkan.h"
 #endif

+#ifdef SD_USE_OPENCL
+#include "ggml-opencl.h"
+#endif
+
 #ifdef SD_USE_SYCL
 #include "ggml-sycl.h"
 #endif
@ -113,7 +117,8 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_kronecker(ggml_context* ctx, struct g
                                     a->ne[0] * b->ne[0],
                                     a->ne[1] * b->ne[1],
                                     a->ne[2] * b->ne[2],
-                                     a->ne[3] * b->ne[3]),
+                                     a->ne[3] * b->ne[3],
+                                     GGML_SCALE_MODE_NEAREST), 
                    b);
 }

--- a/lora.hpp
+++ b/lora.hpp
@ -3,7 +3,7 @@

 #include "ggml_extend.hpp"

-#define LORA_GRAPH_SIZE 10240
+#define LORA_GRAPH_SIZE 15360

 struct LoraModel : public GGMLRunner {
    enum lora_t {
--- a/model.cpp
+++ b/model.cpp
@ -26,6 +26,10 @@
 #include "ggml-vulkan.h"
 #endif

+#ifdef SD_USE_OPENCL
+#include "ggml-opencl.h"
+#endif
+
 #define ST_HEADER_SIZE_LEN 8

 uint64_t read_u64(uint8_t* buffer) {
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@ -181,6 +181,14 @@ public:
            LOG_WARN("Failed to initialize Vulkan backend");
        }
 #endif
+#ifdef SD_USE_OPENCL
+        LOG_DEBUG("Using OpenCL backend");
+        // ggml_log_set(ggml_log_callback_default, nullptr); // Optional ggml logs
+        backend = ggml_backend_opencl_init();
+        if (!backend) {
+            LOG_WARN("Failed to initialize OpenCL backend");
+        }
+#endif
 #ifdef SD_USE_SYCL
        LOG_DEBUG("Using SYCL backend");
        backend = ggml_backend_sycl_init(0);
@ -276,10 +284,10 @@ public:
            model_loader.set_wtype_override(GGML_TYPE_F32, "vae.");
        }

-        LOG_INFO("Weight type:                 %s", model_wtype != SD_TYPE_COUNT ? ggml_type_name(model_wtype) : "??");
-        LOG_INFO("Conditioner weight type:     %s", conditioner_wtype != SD_TYPE_COUNT ? ggml_type_name(conditioner_wtype) : "??");
-        LOG_INFO("Diffusion model weight type: %s", diffusion_model_wtype != SD_TYPE_COUNT ? ggml_type_name(diffusion_model_wtype) : "??");
-        LOG_INFO("VAE weight type:             %s", vae_wtype != SD_TYPE_COUNT ? ggml_type_name(vae_wtype) : "??");
+        LOG_INFO("Weight type:                 %s", model_wtype != GGML_TYPE_COUNT ? ggml_type_name(model_wtype) : "??");
+        LOG_INFO("Conditioner weight type:     %s", conditioner_wtype != GGML_TYPE_COUNT ? ggml_type_name(conditioner_wtype) : "??");
+        LOG_INFO("Diffusion model weight type: %s", diffusion_model_wtype != GGML_TYPE_COUNT ? ggml_type_name(diffusion_model_wtype) : "??");
+        LOG_INFO("VAE weight type:             %s", vae_wtype != GGML_TYPE_COUNT ? ggml_type_name(vae_wtype) : "??");

        LOG_DEBUG("ggml tensor size = %d bytes", (int)sizeof(ggml_tensor));

--- a/tae.hpp
+++ b/tae.hpp
@ -149,7 +149,7 @@ public:
                if (i == 1) {
                    h = ggml_relu_inplace(ctx, h);
                } else {
-                    h = ggml_upscale(ctx, h, 2);
+                    h = ggml_upscale(ctx, h, 2, GGML_SCALE_MODE_NEAREST);
                }
                continue;
            }
--- a/upscaler.cpp
+++ b/upscaler.cpp
@ -28,6 +28,10 @@ struct UpscalerGGML {
        LOG_DEBUG("Using Vulkan backend");
        backend = ggml_backend_vk_init(0);
 #endif
+#ifdef SD_USE_OPENCL
+        LOG_DEBUG("Using OpenCL backend");
+        backend = ggml_backend_opencl_init();
+#endif
 #ifdef SD_USE_SYCL
        LOG_DEBUG("Using SYCL backend");
        backend = ggml_backend_sycl_init(0);
Author	SHA1	Message	Date
leejet	23de7fc44a	chore: avoid warnings when building on linux	2025-06-30 23:49:52 +08:00
rmatif	d42fd59464	feat: add OpenCL backend support (#680 )	2025-06-30 23:32:23 +08:00
Wagner Bruna	0d8b39f0ba	fix: avoid crash on sdxl loras (#658 ) Some SDXL LoRAs (eg. PCM) can exceed 12k nodes.	2025-06-30 23:29:32 +08:00
R0CKSTAR	539b5b9374	fix: fix musa docker build (#662 ) Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>	2025-06-30 23:27:40 +08:00
Wagner Bruna	b1fc16b504	fix: allow resetting clip_skip to its default value (#697 )	2025-06-30 23:23:21 +08:00