diff --git a/common.hpp b/common.hpp index a197e8f..d321671 100644 --- a/common.hpp +++ b/common.hpp @@ -243,9 +243,8 @@ public: int64_t dim_out, int64_t mult = 4, Activation activation = Activation::GEGLU, - bool force_prec_f32 = false) { + bool precision_fix = false) { int64_t inner_dim = dim * mult; - if (activation == Activation::GELU) { blocks["net.0"] = std::shared_ptr(new GELU(dim, inner_dim)); } else { @@ -253,7 +252,14 @@ public: } // net_1 is nn.Dropout(), skip for inference - blocks["net.2"] = std::shared_ptr(new Linear(inner_dim, dim_out, true, false, force_prec_f32)); + float scale = 1.f; + if (precision_fix) { + scale = 1.f / 128.f; + } + // The purpose of the scale here is to prevent NaN issues in certain situations. + // For example, when using Vulkan without enabling force_prec_f32, + // or when using CUDA but the weights are k-quants. + blocks["net.2"] = std::shared_ptr(new Linear(inner_dim, dim_out, true, false, false, scale)); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 8d48341..19f1055 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -56,6 +56,10 @@ #define __STATIC_INLINE__ static inline #endif +#ifndef SD_UNUSED +#define SD_UNUSED(x) (void)(x) +#endif + __STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const char* text, void*) { switch (level) { case GGML_LOG_LEVEL_DEBUG: @@ -937,11 +941,18 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_linear(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* w, struct ggml_tensor* b, - bool force_prec_f32 = false) { + bool force_prec_f32 = false, + float scale = 1.f) { + if (scale != 1.f) { + x = ggml_scale(ctx, x, scale); + } x = ggml_mul_mat(ctx, w, x); if (force_prec_f32) { ggml_mul_mat_set_prec(x, GGML_PREC_F32); } + if (scale != 1.f) { + x = ggml_scale(ctx, x, 1.f / scale); + } if (b != NULL) { x = ggml_add_inplace(ctx, x, b); } @@ -1955,6 +1966,7 @@ protected: bool bias; bool force_f32; bool force_prec_f32; + float scale; void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32); @@ -1973,12 +1985,14 @@ public: int64_t out_features, bool bias = true, bool force_f32 = false, - bool force_prec_f32 = false) + bool force_prec_f32 = false, + float scale = 1.f) : in_features(in_features), out_features(out_features), bias(bias), force_f32(force_f32), - force_prec_f32(force_prec_f32) {} + force_prec_f32(force_prec_f32), + scale(scale) {} struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* w = params["weight"]; @@ -1986,7 +2000,7 @@ public: if (bias) { b = params["bias"]; } - return ggml_nn_linear(ctx, x, w, b, force_prec_f32); + return ggml_nn_linear(ctx, x, w, b, force_prec_f32, scale); } };