From 1d13041aa231bc29faa0577927bb784bffc0c017 Mon Sep 17 00:00:00 2001 From: leejet Date: Tue, 14 Oct 2025 23:12:39 +0800 Subject: [PATCH] fix: resolve precision issues in SDXL VAE under fp16 --- README.md | 1 - conditioner.hpp | 2 +- ggml_extend.hpp | 70 +++++++++++++++++++++++--------------------- qwen_image.hpp | 2 +- stable-diffusion.cpp | 12 ++++---- vae.hpp | 12 ++++++++ 6 files changed, 55 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index 0a27bc1..516b719 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,6 @@ API and command-line option may change frequently.*** - Image Models - SD1.x, SD2.x, [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo) - SDXL, [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo) - - !!!The VAE in SDXL encounters NaN issues under FP16, but unfortunately, the ggml_conv_2d only operates under FP16. Hence, a parameter is needed to specify the VAE that has fixed the FP16 NaN issue. You can find it here: [SDXL VAE FP16 Fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors). - [SD3/SD3.5](./docs/sd3.md) - [Flux-dev/Flux-schnell](./docs/flux.md) - [Chroma](./docs/chroma.md) diff --git a/conditioner.hpp b/conditioner.hpp index abd6dbc..4f9efb8 100644 --- a/conditioner.hpp +++ b/conditioner.hpp @@ -1457,7 +1457,7 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner { const ConditionerParams& conditioner_params) { std::string prompt; std::vector> image_embeds; - size_t system_prompt_length = 0; + size_t system_prompt_length = 0; int prompt_template_encode_start_idx = 34; if (qwenvl->enable_vision && conditioner_params.ref_images.size() > 0) { LOG_INFO("QwenImageEditPlusPipeline"); diff --git a/ggml_extend.hpp b/ggml_extend.hpp index ca91121..d8df0d8 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -975,38 +975,28 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* w, struct ggml_tensor* b, - int s0 = 1, - int s1 = 1, - int p0 = 0, - int p1 = 0, - int d0 = 1, - int d1 = 1) { - x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1); - if (b != NULL) { - b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1); - // b = ggml_repeat(ctx, b, x); - x = ggml_add_inplace(ctx, x, b); + int s0 = 1, + int s1 = 1, + int p0 = 0, + int p1 = 0, + int d0 = 1, + int d1 = 1, + bool direct = false, + float scale = 1.f) { + if (scale != 1.f) { + x = ggml_scale(ctx, x, scale); + } + if (direct) { + x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1); + } else { + x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1); + } + if (scale != 1.f) { + x = ggml_scale(ctx, x, 1.f / scale); } - return x; -} - -// w: [OC*IC, KD, KH, KW] -// x: [N*IC, ID, IH, IW] -__STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d_direct(struct ggml_context* ctx, - struct ggml_tensor* x, - struct ggml_tensor* w, - struct ggml_tensor* b, - int s0 = 1, - int s1 = 1, - int p0 = 0, - int p1 = 0, - int d0 = 1, - int d1 = 1) { - x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1); if (b != NULL) { b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1); - // b = ggml_repeat(ctx, b, x); - x = ggml_add(ctx, x, b); + x = ggml_add_inplace(ctx, x, b); } return x; } @@ -2067,6 +2057,7 @@ protected: std::pair dilation; bool bias; bool direct = false; + float scale = 1.f; void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types, const std::string prefix = "") { enum ggml_type wtype = GGML_TYPE_F16; @@ -2097,6 +2088,10 @@ public: direct = true; } + void set_scale(float scale_value) { + scale = scale_value; + } + std::string get_desc() { return "Conv2d"; } @@ -2107,11 +2102,18 @@ public: if (bias) { b = params["bias"]; } - if (direct) { - return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first); - } else { - return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first); - } + return ggml_nn_conv_2d(ctx, + x, + w, + b, + stride.second, + stride.first, + padding.second, + padding.first, + dilation.second, + dilation.first, + direct, + scale); } }; diff --git a/qwen_image.hpp b/qwen_image.hpp index 630e553..ce4e62d 100644 --- a/qwen_image.hpp +++ b/qwen_image.hpp @@ -535,7 +535,7 @@ namespace Qwen { } } LOG_ERROR("qwen_image_params.num_layers: %ld", qwen_image_params.num_layers); - qwen_image = QwenImageModel(qwen_image_params); + qwen_image = QwenImageModel(qwen_image_params); qwen_image.init(params_ctx, tensor_types, prefix); } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 4291280..3de9314 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -330,13 +330,6 @@ public: if (sd_version_is_sdxl(version)) { scale_factor = 0.13025f; - if (strlen(SAFE_STR(sd_ctx_params->vae_path)) == 0 && strlen(SAFE_STR(sd_ctx_params->taesd_path)) == 0) { - LOG_WARN( - "!!!It looks like you are using SDXL model. " - "If you find that the generated images are completely black, " - "try specifying SDXL VAE FP16 Fix with the --vae parameter. " - "You can find it here: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors"); - } } else if (sd_version_is_sd3(version)) { scale_factor = 1.5305f; } else if (sd_version_is_flux(version)) { @@ -517,6 +510,11 @@ public: LOG_INFO("Using Conv2d direct in the vae model"); first_stage_model->enable_conv2d_direct(); } + if (version == VERSION_SDXL && strlen(SAFE_STR(sd_ctx_params->vae_path)) == 0) { + float vae_conv_2d_scale = 1.f / 32.f; + LOG_WARN("No VAE specified with --vae, using Conv2D scale %.3f", vae_conv_2d_scale); + first_stage_model->set_conv2d_scale(vae_conv_2d_scale); + } first_stage_model->alloc_params_buffer(); first_stage_model->get_param_tensors(tensors, "first_stage_model"); } else { diff --git a/vae.hpp b/vae.hpp index 622b8bb..20d97a2 100644 --- a/vae.hpp +++ b/vae.hpp @@ -530,6 +530,7 @@ struct VAE : public GGMLRunner { struct ggml_context* output_ctx) = 0; virtual void get_param_tensors(std::map& tensors, const std::string prefix) = 0; virtual void enable_conv2d_direct(){}; + virtual void set_conv2d_scale(float scale) { SD_UNUSED(scale); }; }; struct AutoEncoderKL : public VAE { @@ -558,6 +559,17 @@ struct AutoEncoderKL : public VAE { } } + void set_conv2d_scale(float scale) { + std::vector blocks; + ae.get_all_blocks(blocks); + for (auto block : blocks) { + if (block->get_desc() == "Conv2d") { + auto conv_block = (Conv2d*)block; + conv_block->set_scale(scale); + } + } + } + std::string get_desc() { return "vae"; }