Compare commits

No commits in common. "b4b5b4c1537fc6d13ef428941a307c73accab5c5" and "47c0f8e4bd6916442d04b0a4412554cf3a043e8d" have entirely different histories.

3 changed files with 8 additions and 31 deletions

View File

@@ -243,8 +243,9 @@ public:
                 int64_t dim_out,
                 int64_t mult = 4,
                 Activation activation = Activation::GEGLU,
-                bool precision_fix = false) {
+                bool force_prec_f32 = false) {
         int64_t inner_dim = dim * mult;
         if (activation == Activation::GELU) {
             blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GELU(dim, inner_dim));
         } else {
@@ -252,14 +253,7 @@ public:
         }
         // net_1 is nn.Dropout(), skip for inference
-        float scale = 1.f;
-        if (precision_fix) {
-            scale = 1.f / 128.f;
-        }
-        // The purpose of the scale here is to prevent NaN issues in certain situations.
-        // For example, when using Vulkan without enabling force_prec_f32,
-        // or when using CUDA but the weights are k-quants.
-        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, false, scale));
+        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, force_prec_f32));
     }
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
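
For context, the removed path and its replacement differ only in how the net.2 matmul is kept numerically stable. A minimal sketch of both, using the same ggml calls that appear in this diff (the tensor names and the 128.f factor mirror the removed precision_fix path; they are illustrative, not copied from the tree):

// Removed workaround: shrink the activations before the f16 matmul so the
// accumulator cannot overflow to Inf/NaN, then undo the scale afterwards.
x = ggml_scale(ctx, x, 1.f / 128.f);  // pre-scale input
x = ggml_mul_mat(ctx, w, x);          // matmul on the shrunken values
x = ggml_scale(ctx, x, 128.f);        // restore the original magnitude

// Retained approach: leave the values untouched and request f32 accumulation
// for this one matmul instead (force_prec_f32 is now threaded through Linear).
x = ggml_mul_mat(ctx, w, x);
ggml_mul_mat_set_prec(x, GGML_PREC_F32);

Because the old code divided the scale back out right after the matmul, the visible output was unchanged either way; the difference is only in which intermediate values can overflow in half precision.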

View File

@@ -56,10 +56,6 @@
 #define __STATIC_INLINE__ static inline
 #endif
-#ifndef SD_UNUSED
-#define SD_UNUSED(x) (void)(x)
-#endif
 __STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const char* text, void*) {
     switch (level) {
         case GGML_LOG_LEVEL_DEBUG:
@@ -941,18 +937,11 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_linear(struct ggml_context* ctx,
                                                      struct ggml_tensor* x,
                                                      struct ggml_tensor* w,
                                                      struct ggml_tensor* b,
-                                                     bool force_prec_f32 = false,
-                                                     float scale = 1.f) {
-    if (scale != 1.f) {
-        x = ggml_scale(ctx, x, scale);
-    }
+                                                     bool force_prec_f32 = false) {
     x = ggml_mul_mat(ctx, w, x);
     if (force_prec_f32) {
         ggml_mul_mat_set_prec(x, GGML_PREC_F32);
     }
-    if (scale != 1.f) {
-        x = ggml_scale(ctx, x, 1.f / scale);
-    }
     if (b != NULL) {
         x = ggml_add_inplace(ctx, x, b);
     }
@@ -1966,7 +1955,6 @@ protected:
     bool bias;
     bool force_f32;
     bool force_prec_f32;
-    float scale;
     void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
         enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32);
@@ -1985,14 +1973,12 @@ public:
            int64_t out_features,
            bool bias = true,
            bool force_f32 = false,
-           bool force_prec_f32 = false,
-           float scale = 1.f)
+           bool force_prec_f32 = false)
         : in_features(in_features),
           out_features(out_features),
           bias(bias),
           force_f32(force_f32),
-          force_prec_f32(force_prec_f32),
-          scale(scale) {}
+          force_prec_f32(force_prec_f32) {}
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
         struct ggml_tensor* w = params["weight"];
@@ -2000,7 +1986,7 @@ public:
         if (bias) {
             b = params["bias"];
         }
-        return ggml_nn_linear(ctx, x, w, b, force_prec_f32, scale);
+        return ggml_nn_linear(ctx, x, w, b, force_prec_f32);
     }
 };
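
Pieced together from the hunks above, the simplified helper should end up roughly like this (a reconstruction from the diff, not a copy of the file; only the trailing return statement, which sits outside the shown hunk, is assumed):

__STATIC_INLINE__ struct ggml_tensor* ggml_nn_linear(struct ggml_context* ctx,
                                                     struct ggml_tensor* x,
                                                     struct ggml_tensor* w,
                                                     struct ggml_tensor* b,
                                                     bool force_prec_f32 = false) {
    // y = W * x, optionally accumulated in f32 to avoid f16 overflow
    x = ggml_mul_mat(ctx, w, x);
    if (force_prec_f32) {
        ggml_mul_mat_set_prec(x, GGML_PREC_F32);
    }
    // add the bias if the layer has one
    if (b != NULL) {
        x = ggml_add_inplace(ctx, x, b);
    }
    return x;  // assumed: not visible in the hunk
}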

View File

@@ -97,10 +97,7 @@ namespace Qwen {
         blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_dim, out_bias));
         // to_out.1 is nn.Dropout
-        float scale = 1.f / 32.f;
-        // The purpose of the scale here is to prevent NaN issues in certain situations.
-        // For example when using CUDA but the weights are k-quants (not all prompts).
-        blocks["to_add_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_context_dim, out_bias, false, false, scale));
+        blocks["to_add_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_context_dim, out_bias));
     }
     std::pair<ggml_tensor*, ggml_tensor*> forward(struct ggml_context* ctx,
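
With the scale parameter gone, the Linear constructor reduces to (in_features, out_features, bias, force_f32, force_prec_f32), so callers choose precision only through the trailing flag. The two call sites touched in this compare illustrate both options (taken from the new side of the hunks above):

// Default precision, no scaling workaround (Qwen to_add_out)
blocks["to_add_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_context_dim, out_bias));
// Opt into f32 accumulation for the matmul when requested (FeedForward net.2)
blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, force_prec_f32));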