feat: add OpenCL backend support (#680)

2026-02-04 19:03:35 +00:00 · 2025-06-30 17:32:23 +02:00 · 2025-06-30 17:32:23 +02:00 · d42fd59464
commit d42fd59464
parent 0d8b39f0ba
10 changed files with 102 additions and 7 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -28,6 +28,7 @@ option(SD_CUDA                       "sd: cuda backend" OFF)
 option(SD_HIPBLAS                    "sd: rocm backend" OFF)
 option(SD_METAL                      "sd: metal backend" OFF)
 option(SD_VULKAN                     "sd: vulkan backend" OFF)
 option(SD_OPENCL                     "sd: opencl backend" OFF)
 option(SD_SYCL                       "sd: sycl backend" OFF)
 option(SD_MUSA                       "sd: musa backend" OFF)
 option(SD_FAST_SOFTMAX               "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
@ -52,6 +53,12 @@ if (SD_VULKAN)
    add_definitions(-DSD_USE_VULKAN)
 endif ()
 if (SD_OPENCL)
    message("-- Use OpenCL as backend stable-diffusion")
    set(GGML_OPENCL ON)
    add_definitions(-DSD_USE_OPENCL)
 endif ()
 if (SD_HIPBLAS)
    message("-- Use HIPBLAS as backend stable-diffusion")
    set(GGML_HIP ON)
--- a/README.md
+++ b/README.md
@ -22,7 +22,7 @@ Inference of Stable Diffusion and Flux in pure C/C++
 - Accelerated memory-efficient CPU inference
    - Only requires ~2.3GB when using txt2img with fp16 precision to generate a 512x512 image, enabling Flash Attention just requires ~1.8GB.
 - AVX, AVX2 and AVX512 support for x86 architectures
- Full CUDA, Metal, Vulkan and SYCL backend for GPU acceleration.
+- Full CUDA, Metal, Vulkan, OpenCL and SYCL backend for GPU acceleration.
 - Can load ckpt, safetensors and diffusers models/checkpoints. Standalone VAEs models
    - No need to convert to `.ggml` or `.gguf` anymore!
 - Flash Attention for memory usage optimization
@ -160,6 +160,73 @@ cmake .. -DSD_VULKAN=ON
 cmake --build . --config Release
 ```
 ##### Using OpenCL (for Adreno GPU)
 Currently, it supports only Adreno GPUs and is primarily optimized for the Q4_0 type.
 To build for Windows ARM, please refer to [Windows 11 Arm64](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/OPENCL.md#windows-11-arm64).
 Building for Android:
  Android NDK:
       Download and install the Android NDK from the [official Android developer site](https://developer.android.com/ndk/downloads).
 Setup OpenCL Dependencies for NDK:
 You need to provide OpenCL headers and the ICD loader library to your NDK sysroot.
 *   OpenCL Headers:
    ```bash
    # In a temporary working directory
    git clone https://github.com/KhronosGroup/OpenCL-Headers
    cd OpenCL-Headers
    # Replace <YOUR_NDK_PATH> with your actual NDK installation path
    # e.g., cp -r CL /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
    sudo cp -r CL <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
    cd ..
    ```
 *   OpenCL ICD Loader:
    ```bash
    # In the same temporary working directory
    git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
    cd OpenCL-ICD-Loader
    mkdir build_ndk && cd build_ndk
    # Replace <YOUR_NDK_PATH> in the CMAKE_TOOLCHAIN_FILE and OPENCL_ICD_LOADER_HEADERS_DIR
    cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
      -DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
      -DOPENCL_ICD_LOADER_HEADERS_DIR=<YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
      -DANDROID_ABI=arm64-v8a \
      -DANDROID_PLATFORM=24 \
      -DANDROID_STL=c++_shared
    ninja
    # Replace <YOUR_NDK_PATH>
    # e.g., cp libOpenCL.so /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
    sudo cp libOpenCL.so <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
    cd ../..
    ```
 Build `stable-diffusion.cpp` for Android with OpenCL:
 ```bash
 mkdir build-android && cd build-android
 # Replace <YOUR_NDK_PATH> with your actual NDK installation path
 # e.g., -DCMAKE_TOOLCHAIN_FILE=/path/to/android-ndk-r26c/build/cmake/android.toolchain.cmake
 cmake .. -G Ninja \
  -DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
  -DANDROID_ABI=arm64-v8a \
  -DANDROID_PLATFORM=android-28 \
  -DGGML_OPENMP=OFF \
  -DSD_OPENCL=ON
 ninja
 ```
 *(Note: Don't forget to prefix your command with `LD_LIBRARY_PATH=/vendor/lib64` when running the binary.)*
 ##### Using SYCL
 Using SYCL makes the computation run on the Intel GPU. Please make sure you have installed the related driver and the [Intel® oneAPI Base toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) before you start. For more details and steps, refer to the [llama.cpp SYCL backend](https://github.com/ggerganov/llama.cpp/blob/master/docs/backend/SYCL.md#linux) documentation.
--- a/common.hpp
+++ b/common.hpp
@ -56,7 +56,7 @@ public:
        // x: [N, channels, h, w]
        auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
-        x = ggml_upscale(ctx, x, 2);  // [N, channels, h*2, w*2]
+        x = ggml_upscale(ctx, x, 2, GGML_SCALE_MODE_NEAREST);  // [N, channels, h*2, w*2]
        x = conv->forward(ctx, x);    // [N, out_channels, h*2, w*2]
        return x;
    }
--- a/esrgan.hpp
+++ b/esrgan.hpp
@ -130,8 +130,8 @@ public:
        body_feat = conv_body->forward(ctx, body_feat);
        feat      = ggml_add(ctx, feat, body_feat);
        // upsample
-        feat     = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx, feat, 2)));
+        feat     = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
-        feat     = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx, feat, 2)));
+        feat     = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
        auto out = conv_last->forward(ctx, lrelu(ctx, conv_hr->forward(ctx, feat)));
        return out;
    }
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit ff9052988b76e137bcf92bb335733933ca196ac0
+Subproject commit 9e4bee1c5afc2d677a5b32ecb90cbdb483e81fff
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@ -39,6 +39,10 @@
 #include "ggml-vulkan.h"
 #endif
 #ifdef SD_USE_OPENCL
 #include "ggml-opencl.h"
 #endif
 #ifdef SD_USE_SYCL
 #include "ggml-sycl.h"
 #endif
@ -113,7 +117,8 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_kronecker(ggml_context* ctx, struct g
                                     a->ne[0] * b->ne[0],
                                     a->ne[1] * b->ne[1],
                                     a->ne[2] * b->ne[2],
-                                     a->ne[3] * b->ne[3]),
+                                     a->ne[3] * b->ne[3],
                                     GGML_SCALE_MODE_NEAREST), 
                    b);
 }
--- a/model.cpp
+++ b/model.cpp
@ -26,6 +26,10 @@
 #include "ggml-vulkan.h"
 #endif
 #ifdef SD_USE_OPENCL
 #include "ggml-opencl.h"
 #endif
 #define ST_HEADER_SIZE_LEN 8
 uint64_t read_u64(uint8_t* buffer) {
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@ -181,6 +181,14 @@ public:
            LOG_WARN("Failed to initialize Vulkan backend");
        }
 #endif
 #ifdef SD_USE_OPENCL
        LOG_DEBUG("Using OpenCL backend");
        // ggml_log_set(ggml_log_callback_default, nullptr); // Optional ggml logs
        backend = ggml_backend_opencl_init();
        if (!backend) {
            LOG_WARN("Failed to initialize OpenCL backend");
        }
 #endif
 #ifdef SD_USE_SYCL
        LOG_DEBUG("Using SYCL backend");
        backend = ggml_backend_sycl_init(0);
--- a/tae.hpp
+++ b/tae.hpp
@ -149,7 +149,7 @@ public:
                if (i == 1) {
                    h = ggml_relu_inplace(ctx, h);
                } else {
-                    h = ggml_upscale(ctx, h, 2);
+                    h = ggml_upscale(ctx, h, 2, GGML_SCALE_MODE_NEAREST);
                }
                continue;
            }
--- a/upscaler.cpp
+++ b/upscaler.cpp
@ -28,6 +28,10 @@ struct UpscalerGGML {
        LOG_DEBUG("Using Vulkan backend");
        backend = ggml_backend_vk_init(0);
 #endif
 #ifdef SD_USE_OPENCL
        LOG_DEBUG("Using OpenCL backend");
        backend = ggml_backend_opencl_init();
 #endif
 #ifdef SD_USE_SYCL
        LOG_DEBUG("Using SYCL backend");
        backend = ggml_backend_sycl_init(0);
		`@ -1 +1 @@`
			`Subproject commit ff9052988b76e137bcf92bb335733933ca196ac0`				`Subproject commit 9e4bee1c5afc2d677a5b32ecb90cbdb483e81fff`