From cc064a0530a433f2eabcc24fdb34c40699340981 Mon Sep 17 00:00:00 2001
From: leejet
Date: Sun, 12 Oct 2025 16:36:55 +0800
Subject: [PATCH] optimize the handling of the FeedForward precision fix

---
 common.hpp      | 20 ++++++++++----------
 ggml_extend.hpp | 18 ++++++++++++++----
 2 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/common.hpp b/common.hpp
index 04da712..d321671 100644
--- a/common.hpp
+++ b/common.hpp
@@ -243,9 +243,8 @@ public:
                 int64_t dim_out,
                 int64_t mult          = 4,
                 Activation activation = Activation::GEGLU,
-                bool force_prec_f32   = false) {
+                bool precision_fix    = false) {
         int64_t inner_dim = dim * mult;
-        SD_UNUSED(force_prec_f32);
         if (activation == Activation::GELU) {
             blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GELU(dim, inner_dim));
         } else {
@@ -253,7 +252,14 @@ public:
         }
 
         // net_1 is nn.Dropout(), skip for inference
-        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out));
+        float scale = 1.f;
+        if (precision_fix) {
+            scale = 1.f / 128.f;
+        }
+        // The purpose of the scale here is to prevent NaN issues in certain situations.
+        // For example, when using Vulkan without enabling force_prec_f32,
+        // or when using CUDA but the weights are k-quants.
+        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, false, scale));
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
@@ -264,13 +270,7 @@ public:
         auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);
 
         x = net_0->forward(ctx, x);  // [ne3, ne2, ne1, inner_dim]
-        // The purpose of the scale here is to prevent NaN issues in certain situations.
-        // For example, when using Vulkan without enabling force_prec_f32,
-        // or when using CUDA but the weights are k-quants.
-        float scale = 1.f / 128.f;
-        x           = ggml_scale(ctx, x, scale);
-        x = net_2->forward(ctx, x);  // [ne3, ne2, ne1, dim_out]
-        x = ggml_scale(ctx, x, 1.f / scale);
+        x = net_2->forward(ctx, x);  // [ne3, ne2, ne1, dim_out]
         return x;
     }
 };
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index e5012cd..9f7d0b3 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -944,11 +944,18 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_linear(struct ggml_context* ctx,
                                                      struct ggml_tensor* x,
                                                      struct ggml_tensor* w,
                                                      struct ggml_tensor* b,
-                                                     bool force_prec_f32 = false) {
+                                                     bool force_prec_f32 = false,
+                                                     float scale         = 1.f) {
+    if (scale != 1.f) {
+        x = ggml_scale(ctx, x, scale);
+    }
     x = ggml_mul_mat(ctx, w, x);
     if (force_prec_f32) {
         ggml_mul_mat_set_prec(x, GGML_PREC_F32);
     }
+    if (scale != 1.f) {
+        x = ggml_scale(ctx, x, 1.f / scale);
+    }
     if (b != NULL) {
         x = ggml_add_inplace(ctx, x, b);
     }
@@ -1962,6 +1969,7 @@ protected:
     bool bias;
     bool force_f32;
     bool force_prec_f32;
+    float scale;
 
     void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
         enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32);
@@ -1980,12 +1988,14 @@ public:
            int64_t out_features,
           bool bias           = true,
           bool force_f32      = false,
-           bool force_prec_f32 = false)
+           bool force_prec_f32 = false,
+           float scale         = 1.f)
         : in_features(in_features),
           out_features(out_features),
           bias(bias),
          force_f32(force_f32),
-          force_prec_f32(force_prec_f32) {}
+          force_prec_f32(force_prec_f32),
+          scale(scale) {}
 
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
         struct ggml_tensor* w = params["weight"];
@@ -1993,7 +2003,7 @@ public:
         if (bias) {
             b = params["bias"];
         }
-        return ggml_nn_linear(ctx, x, w, b, force_prec_f32);
+        return ggml_nn_linear(ctx, x, w, b, force_prec_f32, scale);
     }
 };
 
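
Below is a minimal sketch, not part of the patch, that isolates the scale/unscale pattern ggml_nn_linear now applies whenever scale != 1.f; the helper name scaled_linear_sketch is hypothetical. Because matrix multiplication is linear, pre-scaling the input by 1/128 and multiplying the output by 128 leaves the result mathematically unchanged (up to rounding), while the smaller intermediate values stay inside f16 range and avoid the NaN issues the comment describes.

#include "ggml.h"

// Hypothetical helper illustrating the scaled-matmul pattern from the patch.
static struct ggml_tensor* scaled_linear_sketch(struct ggml_context* ctx,
                                                struct ggml_tensor* x,
                                                struct ggml_tensor* w,
                                                float scale) {  // e.g. 1.f / 128.f
    x = ggml_scale(ctx, x, scale);        // shrink the activations before the matmul
    x = ggml_mul_mat(ctx, w, x);          // accumulation runs on the smaller values
    x = ggml_scale(ctx, x, 1.f / scale);  // restore the original magnitude afterwards
    return x;
}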