diff --git a/CMakeLists.txt b/CMakeLists.txt
index 06de0d5..c0e5e15 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,6 +33,7 @@ option(SD_SYCL "sd: sycl backend" OFF)
 option(SD_MUSA "sd: musa backend" OFF)
 option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
 option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
+option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF)
 #option(SD_BUILD_SERVER "sd: build server example" ON)
 
 if(SD_CUDA)
@@ -118,13 +119,23 @@ endif()
 
 set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
 
-# see https://github.com/ggerganov/ggml/pull/682
-add_definitions(-DGGML_MAX_NAME=128)
+if (NOT SD_USE_SYSTEM_GGML)
+    # see https://github.com/ggerganov/ggml/pull/682
+    add_definitions(-DGGML_MAX_NAME=128)
+endif()
 
 # deps
 # Only add ggml if it hasn't been added yet
 if (NOT TARGET ggml)
-    add_subdirectory(ggml)
+    if (SD_USE_SYSTEM_GGML)
+        find_package(ggml REQUIRED)
+        if (NOT ggml_FOUND)
+            message(FATAL_ERROR "System-installed GGML library not found.")
+        endif()
+        add_library(ggml ALIAS ggml::ggml)
+    else()
+        add_subdirectory(ggml)
+    endif()
 endif()
 
 add_subdirectory(thirdparty)
diff --git a/README.md b/README.md
index 295d21b..59174fe 100644
--- a/README.md
+++ b/README.md
@@ -341,6 +341,10 @@ arguments:
   --diffusion-fa                     use flash attention in the diffusion model (for low vram)
                                      Might lower quality, since it implies converting k and v to f16.
                                      This might crash if it is not supported by the backend.
+  --diffusion-conv-direct            use Conv2d direct in the diffusion model
+                                     This might crash if it is not supported by the backend.
+  --vae-conv-direct                  use Conv2d direct in the vae model (should improve the performance)
+                                     This might crash if it is not supported by the backend.
   --control-net-cpu                  keep controlnet in cpu (for low vram)
   --canny                            apply canny preprocessor (edge detection)
   --color                            colors the logging tags according to level
diff --git a/control.hpp b/control.hpp
index 19f9181..094dd12 100644
--- a/control.hpp
+++ b/control.hpp
@@ -324,6 +324,17 @@ struct ControlNet : public GGMLRunner {
         control_net.init(params_ctx, tensor_types, "");
     }
 
+    void enable_conv2d_direct() {
+        std::vector<GGMLBlock*> blocks;
+        control_net.get_all_blocks(blocks);
+        for (auto block : blocks) {
+            if (block->get_desc() == "Conv2d") {
+                auto conv_block = (Conv2d*)block;
+                conv_block->enable_direct();
+            }
+        }
+    }
+
     ~ControlNet() {
         free_control_ctx();
     }
diff --git a/esrgan.hpp b/esrgan.hpp
index 154e51b..e2003e4 100644
--- a/esrgan.hpp
+++ b/esrgan.hpp
@@ -149,6 +149,17 @@ struct ESRGAN : public GGMLRunner {
         rrdb_net.init(params_ctx, tensor_types, "");
     }
 
+    void enable_conv2d_direct() {
+        std::vector<GGMLBlock*> blocks;
+        rrdb_net.get_all_blocks(blocks);
+        for (auto block : blocks) {
+            if (block->get_desc() == "Conv2d") {
+                auto conv_block = (Conv2d*)block;
+                conv_block->enable_direct();
+            }
+        }
+    }
+
     std::string get_desc() {
         return "esrgan";
     }
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index 6fccd0e..23998ff 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -103,6 +103,8 @@ struct SDParams {
     bool clip_on_cpu           = false;
     bool vae_on_cpu            = false;
     bool diffusion_flash_attn  = false;
+    bool diffusion_conv_direct = false;
+    bool vae_conv_direct       = false;
     bool canny_preprocess      = false;
    bool color                 = false;
     int upscale_repeats        = 1;
@@ -153,6 +155,8 @@ void print_params(SDParams params) {
     printf("    control_net_cpu:   %s\n", params.control_net_cpu ? "true" : "false");
     printf("    vae decoder on cpu:%s\n", params.vae_on_cpu ? "true" : "false");
     printf("    diffusion flash attention:%s\n", params.diffusion_flash_attn ? "true" : "false");
+    printf("    diffusion Conv2d direct:%s\n", params.diffusion_conv_direct ? "true" : "false");
+    printf("    vae Conv2d direct:%s\n", params.vae_conv_direct ? "true" : "false");
     printf("    strength(control): %.2f\n", params.control_strength);
     printf("    prompt: %s\n", params.prompt.c_str());
     printf("    negative_prompt: %s\n", params.negative_prompt.c_str());
@@ -255,6 +259,10 @@ void print_usage(int argc, const char* argv[]) {
     printf("  --diffusion-fa                     use flash attention in the diffusion model (for low vram)\n");
     printf("                                     Might lower quality, since it implies converting k and v to f16.\n");
     printf("                                     This might crash if it is not supported by the backend.\n");
+    printf("  --diffusion-conv-direct            use Conv2d direct in the diffusion model\n");
+    printf("                                     This might crash if it is not supported by the backend.\n");
+    printf("  --vae-conv-direct                  use Conv2d direct in the vae model (should improve the performance)\n");
+    printf("                                     This might crash if it is not supported by the backend.\n");
     printf("  --control-net-cpu                  keep controlnet in cpu (for low vram)\n");
     printf("  --canny                            apply canny preprocessor (edge detection)\n");
     printf("  --color                            colors the logging tags according to level\n");
@@ -495,6 +503,8 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         {"", "--clip-on-cpu", "", true, &params.clip_on_cpu},
         {"", "--vae-on-cpu", "", true, &params.vae_on_cpu},
         {"", "--diffusion-fa", "", true, &params.diffusion_flash_attn},
+        {"", "--diffusion-conv-direct", "", true, &params.diffusion_conv_direct},
+        {"", "--vae-conv-direct", "", true, &params.vae_conv_direct},
         {"", "--canny", "", true, &params.canny_preprocess},
         {"-v", "--verbos", "", true, &params.verbose},
         {"", "--color", "", true, &params.color},
@@ -1077,6 +1087,8 @@ int main(int argc, const char* argv[]) {
                                       params.control_net_cpu,
                                       params.vae_on_cpu,
                                       params.diffusion_flash_attn,
+                                      params.diffusion_conv_direct,
+                                      params.vae_conv_direct,
                                       params.chroma_use_dit_mask,
                                       params.chroma_use_t5_mask,
                                       params.chroma_t5_mask_pad,
@@ -1184,6 +1196,7 @@ int main(int argc, const char* argv[]) {
     if (params.esrgan_path.size() > 0 && params.upscale_repeats > 0) {
         upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(),
                                                         params.offload_params_to_cpu,
+                                                        params.diffusion_conv_direct,
                                                         params.n_threads);
 
         if (upscaler_ctx == NULL) {
diff --git a/ggml b/ggml
index 089530b..9caa235 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit 089530bb72e70aa9f9ecb98137dfd891c2be20c1
+Subproject commit 9caa235fe8e7e0ed0cbb599c54ec1cf07a9b7b73
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 88f82bd..110bbbc 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -56,6 +56,8 @@
 #define __STATIC_INLINE__ static inline
 #endif
 
+static_assert(GGML_MAX_NAME >= 128, "GGML_MAX_NAME must be at least 128");
+
 // n-mode trensor-matrix product
 // example: 2-mode product
 // A: [ne03, k, ne01, ne00]
@@ -839,6 +841,27 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx,
 
 // w: [OC*IC, KD, KH, KW]
 // x: [N*IC, ID, IH, IW]
+__STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d_direct(struct ggml_context* ctx,
+                                                             struct ggml_tensor* x,
+                                                             struct ggml_tensor* w,
+                                                             struct ggml_tensor* b,
+                                                             int s0 = 1,
+                                                             int s1 = 1,
+                                                             int p0 = 0,
+                                                             int p1 = 0,
+                                                             int d0 = 1,
+                                                             int d1 = 1) {
+    x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1);
+    if (b != NULL) {
+        b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
+        // b = ggml_repeat(ctx, b, x);
+        x = ggml_add(ctx, x, b);
+    }
+    return x;
+}
+
+// w: [OC,IC, KD, 1 * 1]
+// x: [N, IC, IH, IW]
 // b: [OC,]
 // result: [N*OC, OD, OH, OW]
 __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_3d(struct ggml_context* ctx,
@@ -1607,6 +1630,19 @@ public:
             tensors[prefix + pair.first] = pair.second;
         }
     }
+
+    virtual std::string get_desc() {
+        return "GGMLBlock";
+    }
+
+    void get_all_blocks(std::vector<GGMLBlock*>& result) {
+        result.push_back(this);
+        for (auto& block_iter : blocks) {
+            if (block_iter.second) {
+                block_iter.second->get_all_blocks(result);
+            }
+        }
+    }
 };
 
 class UnaryBlock : public GGMLBlock {
@@ -1703,6 +1739,7 @@ protected:
     std::pair<int, int> padding;
     std::pair<int, int> dilation;
     bool bias;
+    bool direct = false;
 
     void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types, const std::string prefix = "") {
         enum ggml_type wtype = GGML_TYPE_F16;
@@ -1729,13 +1766,25 @@ public:
           dilation(dilation),
           bias(bias) {}
 
+    void enable_direct() {
+        direct = true;
+    }
+
+    std::string get_desc() {
+        return "Conv2d";
+    }
+
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
         struct ggml_tensor* w = params["weight"];
         struct ggml_tensor* b = NULL;
         if (bias) {
             b = params["bias"];
         }
-        return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
+        if (direct) {
+            return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
+        } else {
+            return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
+        }
     }
 };
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 513775d..3e6110c 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -428,6 +428,10 @@ public:
                                                            model_loader.tensor_storages_types,
                                                            version,
                                                            sd_ctx_params->diffusion_flash_attn);
+            if (sd_ctx_params->diffusion_conv_direct) {
+                LOG_INFO("Using Conv2d direct in the diffusion model");
+                std::dynamic_pointer_cast<UNetModel>(diffusion_model)->unet.enable_conv2d_direct();
+            }
         }
 
         cond_stage_model->alloc_params_buffer();
@@ -465,6 +469,10 @@ public:
                                                                  vae_decode_only,
                                                                  false,
                                                                  version);
+            if (sd_ctx_params->vae_conv_direct) {
+                LOG_INFO("Using Conv2d direct in the vae model");
+                first_stage_model->enable_conv2d_direct();
+            }
             first_stage_model->alloc_params_buffer();
             first_stage_model->get_param_tensors(tensors, "first_stage_model");
         } else {
@@ -474,6 +482,10 @@ public:
                                                                "decoder.layers",
                                                                vae_decode_only,
                                                                version);
+            if (sd_ctx_params->vae_conv_direct) {
+                LOG_INFO("Using Conv2d direct in the tae model");
+                tae_first_stage->enable_conv2d_direct();
+            }
         }
 
         // first_stage_model->get_param_tensors(tensors, "first_stage_model.");
@@ -489,6 +501,10 @@ public:
                                                          offload_params_to_cpu,
                                                          model_loader.tensor_storages_types,
                                                          version);
+            if (sd_ctx_params->diffusion_conv_direct) {
+                LOG_INFO("Using Conv2d direct in the control net");
+                control_net->enable_conv2d_direct();
+            }
         }
 
         if (strstr(SAFE_STR(sd_ctx_params->stacked_id_embed_dir), "v2")) {
diff --git a/stable-diffusion.h b/stable-diffusion.h
index 32c0a94..b26c557 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -136,6 +136,8 @@ typedef struct {
     bool keep_control_net_on_cpu;
     bool keep_vae_on_cpu;
     bool diffusion_flash_attn;
+    bool diffusion_conv_direct;
+    bool vae_conv_direct;
     bool chroma_use_dit_mask;
     bool chroma_use_t5_mask;
     int chroma_t5_mask_pad;
@@ -245,6 +247,7 @@ typedef struct upscaler_ctx_t upscaler_ctx_t;
 
 SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
                                         bool offload_params_to_cpu,
+                                        bool direct,
                                         int n_threads);
 
 SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
diff --git a/tae.hpp b/tae.hpp
index da5aa56..1ae1257 100644
--- a/tae.hpp
+++ b/tae.hpp
@@ -207,6 +207,17 @@ struct TinyAutoEncoder : public GGMLRunner {
         taesd.init(params_ctx, tensor_types, prefix);
     }
 
+    void enable_conv2d_direct() {
+        std::vector<GGMLBlock*> blocks;
+        taesd.get_all_blocks(blocks);
+        for (auto block : blocks) {
+            if (block->get_desc() == "Conv2d") {
+                auto conv_block = (Conv2d*)block;
+                conv_block->enable_direct();
+            }
+        }
+    }
+
     std::string get_desc() {
         return "taesd";
     }
diff --git a/unet.hpp b/unet.hpp
index 847911d..7e7b227 100644
--- a/unet.hpp
+++ b/unet.hpp
@@ -547,6 +547,18 @@ struct UNetModelRunner : public GGMLRunner {
         unet.init(params_ctx, tensor_types, prefix);
     }
 
+    void enable_conv2d_direct() {
+        std::vector<GGMLBlock*> blocks;
+        unet.get_all_blocks(blocks);
+        for (auto block : blocks) {
+            if (block->get_desc() == "Conv2d") {
+                LOG_DEBUG("block %s", block->get_desc().c_str());
+                auto conv_block = (Conv2d*)block;
+                conv_block->enable_direct();
+            }
+        }
+    }
+
     std::string get_desc() {
         return "unet";
     }
diff --git a/upscaler.cpp b/upscaler.cpp
index c7fb305..4ab0b73 100644
--- a/upscaler.cpp
+++ b/upscaler.cpp
@@ -9,9 +9,12 @@ struct UpscalerGGML {
     std::shared_ptr<ESRGAN> esrgan_upscaler;
     std::string esrgan_path;
     int n_threads;
+    bool direct = false;
 
-    UpscalerGGML(int n_threads)
-        : n_threads(n_threads) {
+    UpscalerGGML(int n_threads,
+                 bool direct = false)
+        : n_threads(n_threads),
+          direct(direct) {
     }
 
     bool load_from_file(const std::string& esrgan_path,
@@ -48,6 +51,9 @@ struct UpscalerGGML {
         }
         LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
         esrgan_upscaler = std::make_shared<ESRGAN>(backend, offload_params_to_cpu, model_loader.tensor_storages_types);
+        if (direct) {
+            esrgan_upscaler->enable_conv2d_direct();
+        }
         if (!esrgan_upscaler->load_from_file(esrgan_path)) {
             return false;
         }
@@ -106,6 +112,7 @@ struct upscaler_ctx_t {
 
 upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
                                  bool offload_params_to_cpu,
+                                 bool direct,
                                  int n_threads) {
     upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t));
     if (upscaler_ctx == NULL) {
@@ -113,7 +120,7 @@ upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
     }
     std::string esrgan_path(esrgan_path_c_str);
 
-    upscaler_ctx->upscaler = new UpscalerGGML(n_threads);
+    upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct);
     if (upscaler_ctx->upscaler == NULL) {
         return NULL;
    }
diff --git a/vae.hpp b/vae.hpp
index dc44dde..d3793b3 100644
--- a/vae.hpp
+++ b/vae.hpp
@@ -546,6 +546,17 @@ struct AutoEncoderKL : public VAE {
         ae.init(params_ctx, tensor_types, prefix);
     }
 
+    void enable_conv2d_direct() {
+        std::vector<GGMLBlock*> blocks;
+        ae.get_all_blocks(blocks);
+        for (auto block : blocks) {
+            if (block->get_desc() == "Conv2d") {
+                auto conv_block = (Conv2d*)block;
+                conv_block->enable_direct();
+            }
+        }
+    }
+
     std::string get_desc() {
         return "vae";
     }
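
Usage sketch, assuming the patch above has been applied: the only public API change is the new `direct` parameter of `new_upscaler_ctx()` (the `--diffusion-conv-direct` / `--vae-conv-direct` CLI flags cover the rest). A minimal caller of the updated upscaler API might look like the snippet below; the model path and thread count are placeholder values, not taken from the patch.

    #include "stable-diffusion.h"

    int main() {
        // The third argument is the new `direct` flag: request Conv2d direct
        // for the ESRGAN network. This may not be supported by every backend.
        upscaler_ctx_t* ctx = new_upscaler_ctx("esrgan_model.pth",   /* placeholder path */
                                               false,                /* offload_params_to_cpu */
                                               true,                 /* direct */
                                               8);                   /* n_threads */
        if (ctx == NULL) {
            return 1;
        }
        // ... run the upscaler here, then release the context ...
        free_upscaler_ctx(ctx);
        return 0;
    }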