From 98d6e71492b518b317a3a0545880a24b09e606eb Mon Sep 17 00:00:00 2001
From: leejet
Date: Sun, 12 Oct 2025 15:41:40 +0800
Subject: [PATCH] fix the issue that occurs when using CUDA with k-quants
 weights

---
 common.hpp      | 12 +++++++++---
 ggml_extend.hpp |  4 ++++
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/common.hpp b/common.hpp
index a197e8f..04da712 100644
--- a/common.hpp
+++ b/common.hpp
@@ -245,7 +245,7 @@ public:
                 Activation activation = Activation::GEGLU,
                 bool force_prec_f32   = false) {
         int64_t inner_dim = dim * mult;
-
+        SD_UNUSED(force_prec_f32);
         if (activation == Activation::GELU) {
             blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GELU(dim, inner_dim));
         } else {
@@ -253,7 +253,7 @@ public:
         }
 
         // net_1 is nn.Dropout(), skip for inference
-        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, force_prec_f32));
+        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out));
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
@@ -264,7 +264,13 @@ public:
         auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);
 
         x = net_0->forward(ctx, x);  // [ne3, ne2, ne1, inner_dim]
-        x = net_2->forward(ctx, x);  // [ne3, ne2, ne1, dim_out]
+        // The purpose of the scale here is to prevent NaN issues in certain situations.
+        // For example, when using Vulkan without enabling force_prec_f32,
+        // or when using CUDA but the weights are k-quants.
+        float scale = 1.f / 128.f;
+        x = ggml_scale(ctx, x, scale);
+        x = net_2->forward(ctx, x);  // [ne3, ne2, ne1, dim_out]
+        x = ggml_scale(ctx, x, 1.f / scale);
         return x;
     }
 };
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index b64dc85..e5012cd 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -56,6 +56,10 @@
 #define __STATIC_INLINE__ static inline
 #endif
 
+#ifndef SD_UNUSED
+#define SD_UNUSED(x) (void)(x)
+#endif
+
 __STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const char* text, void*) {
     switch (level) {
         case GGML_LOG_LEVEL_DEBUG:
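
Note: the program below is a minimal standalone sketch, not part of the patch and not the
project's actual code. It illustrates the scale-down / scale-up trick the patch applies around
net.2: if a backend effectively accumulates the matrix product at fp16-like precision (which is
presumably what happens on the affected Vulkan path without force_prec_f32, or on CUDA with
k-quant weights), large activations can push partial sums past the fp16 maximum of about 65504
and turn into Inf/NaN. Pre-scaling the input by 1/128 keeps the intermediate values in range,
and rescaling by 128 afterwards restores the magnitude. The dot_fp16_like helper and all the
numbers are invented for this example.

// Sketch of why pre-scaling before a low-precision matmul avoids overflow.
#include <cmath>
#include <cstdio>
#include <vector>

// Emulate an fp16-precision dot product by turning any partial sum that
// leaves the fp16 dynamic range into Inf, roughly what a real half-precision
// accumulator would do. This is a simplification, not a real kernel.
static float dot_fp16_like(const std::vector<float>& a, const std::vector<float>& b) {
    const float FP16_MAX = 65504.0f;
    float acc = 0.0f;
    for (size_t i = 0; i < a.size(); ++i) {
        acc += a[i] * b[i];
        if (std::fabs(acc) > FP16_MAX) {
            acc = std::copysign(INFINITY, acc);  // overflow, as fp16 would
        }
    }
    return acc;
}

int main() {
    // Hypothetical activation row and weight column whose raw dot product
    // (4096 * 8 * 4 = 131072) exceeds the fp16 range.
    std::vector<float> x(4096, 8.0f);
    std::vector<float> w(4096, 4.0f);

    float raw = dot_fp16_like(x, w);  // overflows to inf

    // The trick from the patch: scale the input down, run the layer, scale back.
    const float scale = 1.0f / 128.0f;
    std::vector<float> x_scaled = x;
    for (float& v : x_scaled) v *= scale;
    float safe = dot_fp16_like(x_scaled, w) * (1.0f / scale);  // stays in range

    std::printf("without pre-scaling: %f\n", raw);   // inf
    std::printf("with pre-scaling:    %f\n", safe);  // 131072.000000
    return 0;
}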