From 98d6e71492b518b317a3a0545880a24b09e606eb Mon Sep 17 00:00:00 2001
From: leejet
Date: Sun, 12 Oct 2025 15:41:40 +0800
Subject: [PATCH] fix the issue that occurs when using CUDA with k-quants
 weights

---
 common.hpp      | 12 +++++++++---
 ggml_extend.hpp |  4 ++++
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/common.hpp b/common.hpp
index a197e8f..04da712 100644
--- a/common.hpp
+++ b/common.hpp
@@ -245,7 +245,7 @@ public:
                 Activation activation = Activation::GEGLU,
                 bool force_prec_f32   = false) {
         int64_t inner_dim = dim * mult;
-
+        SD_UNUSED(force_prec_f32);
         if (activation == Activation::GELU) {
             blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GELU(dim, inner_dim));
         } else {
@@ -253,7 +253,7 @@ public:
         }
 
         // net_1 is nn.Dropout(), skip for inference
-        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, force_prec_f32));
+        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out));
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
@@ -264,7 +264,13 @@ public:
         auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);
 
         x = net_0->forward(ctx, x);  // [ne3, ne2, ne1, inner_dim]
-        x = net_2->forward(ctx, x);  // [ne3, ne2, ne1, dim_out]
+        // The purpose of the scale here is to prevent NaN issues in certain situations.
+        // For example, when using Vulkan without enabling force_prec_f32,
+        // or when using CUDA but the weights are k-quants.
+        float scale = 1.f / 128.f;
+        x = ggml_scale(ctx, x, scale);
+        x = net_2->forward(ctx, x);  // [ne3, ne2, ne1, dim_out]
+        x = ggml_scale(ctx, x, 1.f / scale);
         return x;
     }
 };
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index b64dc85..e5012cd 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -56,6 +56,10 @@
 #define __STATIC_INLINE__ static inline
 #endif
 
+#ifndef SD_UNUSED
+#define SD_UNUSED(x) (void)(x)
+#endif
+
 __STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const char* text, void*) {
     switch (level) {
         case GGML_LOG_LEVEL_DEBUG:
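
Note: the program below is a minimal standalone sketch, not part of the patch and not the
project's actual code. It illustrates the scale-down / scale-up trick the patch applies around
net.2: if a backend effectively accumulates the matrix product at fp16-like precision (which is
presumably what happens on the affected Vulkan path without force_prec_f32, or on CUDA with
k-quant weights), large activations can push partial sums past the fp16 maximum of about 65504
and turn into Inf/NaN. Pre-scaling the input by 1/128 keeps the intermediate values in range,
and rescaling by 128 afterwards restores the magnitude. The dot_fp16_like helper and all the
numbers are invented for this example.

// Sketch of why pre-scaling before a low-precision matmul avoids overflow.
#include <cmath>
#include <cstdio>
#include <vector>

// Emulate an fp16-precision dot product by turning any partial sum that
// leaves the fp16 dynamic range into Inf, roughly what a real half-precision
// accumulator would do. This is a simplification, not a real kernel.
static float dot_fp16_like(const std::vector<float>& a, const std::vector<float>& b) {
    const float FP16_MAX = 65504.0f;
    float acc = 0.0f;
    for (size_t i = 0; i < a.size(); ++i) {
        acc += a[i] * b[i];
        if (std::fabs(acc) > FP16_MAX) {
            acc = std::copysign(INFINITY, acc);  // overflow, as fp16 would
        }
    }
    return acc;
}

int main() {
    // Hypothetical activation row and weight column whose raw dot product
    // (4096 * 8 * 4 = 131072) exceeds the fp16 range.
    std::vector<float> x(4096, 8.0f);
    std::vector<float> w(4096, 4.0f);

    float raw = dot_fp16_like(x, w);  // overflows to inf

    // The trick from the patch: scale the input down, run the layer, scale back.
    const float scale = 1.0f / 128.0f;
    std::vector<float> x_scaled = x;
    for (float& v : x_scaled) v *= scale;
    float safe = dot_fp16_like(x_scaled, w) * (1.0f / scale);  // stays in range

    std::printf("without pre-scaling: %f\n", raw);   // inf
    std::printf("with pre-scaling:    %f\n", safe);  // 131072.000000
    return 0;
}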