Compare commits


No commits in common. "b4b5b4c1537fc6d13ef428941a307c73accab5c5" and "47c0f8e4bd6916442d04b0a4412554cf3a043e8d" have entirely different histories.

3 changed files with 8 additions and 31 deletions

View File

@@ -243,8 +243,9 @@ public:
                 int64_t dim_out,
                 int64_t mult = 4,
                 Activation activation = Activation::GEGLU,
-                bool precision_fix = false) {
+                bool force_prec_f32 = false) {
         int64_t inner_dim = dim * mult;
         if (activation == Activation::GELU) {
             blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GELU(dim, inner_dim));
         } else {
@@ -252,14 +253,7 @@ public:
         }
         // net_1 is nn.Dropout(), skip for inference
-        float scale = 1.f;
-        if (precision_fix) {
-            scale = 1.f / 128.f;
-        }
-        // The purpose of the scale here is to prevent NaN issues in certain situations.
-        // For example, when using Vulkan without enabling force_prec_f32,
-        // or when using CUDA but the weights are k-quants.
-        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, false, scale));
+        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, force_prec_f32));
     }

     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {

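Context on the workaround removed above: the deleted comments tie the NaNs to specific backends and quantized weights, which is consistent with half-precision matmul accumulation overflowing its largest finite value (65504) and the resulting inf turning into NaN further down the graph. Because a matmul is linear, scaling the input by 1/128 and the output by 128 leaves the result unchanged in exact arithmetic while keeping intermediates in range. A small standalone sketch, with invented magnitudes, illustrates the arithmetic; nothing in it comes from either commit.

// Illustrative arithmetic only: the magnitudes below are invented, and f16
// overflow is simulated by comparing against the f16 maximum instead of using
// a real half-precision type.
#include <cstdio>

int main() {
    const float FP16_MAX = 65504.0f;      // largest finite half-precision value
    const float scale    = 1.0f / 128.0f; // the factor the removed code applied

    float term        = 1200.0f * 90.0f;           // hypothetical accumulation term: 108000
    float term_scaled = (1200.0f * scale) * 90.0f; // same term with the input pre-scaled: 843.75

    std::printf("unscaled:   %.2f (overflows f16: %s)\n", term, term > FP16_MAX ? "yes" : "no");
    std::printf("pre-scaled: %.2f (fits in f16)\n", term_scaled);
    std::printf("rescaled:   %.2f (equals the unscaled value)\n", term_scaled / scale);
    return 0;
}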
View File

@@ -56,10 +56,6 @@
 #define __STATIC_INLINE__ static inline
 #endif

-#ifndef SD_UNUSED
-#define SD_UNUSED(x) (void)(x)
-#endif
-
 __STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const char* text, void*) {
     switch (level) {
         case GGML_LOG_LEVEL_DEBUG:
@@ -941,18 +937,11 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_linear(struct ggml_context* ctx,
                                                      struct ggml_tensor* x,
                                                      struct ggml_tensor* w,
                                                      struct ggml_tensor* b,
-                                                     bool force_prec_f32 = false,
-                                                     float scale = 1.f) {
-    if (scale != 1.f) {
-        x = ggml_scale(ctx, x, scale);
-    }
+                                                     bool force_prec_f32 = false) {
     x = ggml_mul_mat(ctx, w, x);
     if (force_prec_f32) {
         ggml_mul_mat_set_prec(x, GGML_PREC_F32);
     }
-    if (scale != 1.f) {
-        x = ggml_scale(ctx, x, 1.f / scale);
-    }
     if (b != NULL) {
         x = ggml_add_inplace(ctx, x, b);
     }
@@ -1966,7 +1955,6 @@ protected:
     bool bias;
     bool force_f32;
     bool force_prec_f32;
-    float scale;

     void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
         enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32);
@@ -1985,14 +1973,12 @@ public:
           int64_t out_features,
           bool bias = true,
           bool force_f32 = false,
-          bool force_prec_f32 = false,
-          float scale = 1.f)
+          bool force_prec_f32 = false)
         : in_features(in_features),
          out_features(out_features),
          bias(bias),
          force_f32(force_f32),
-          force_prec_f32(force_prec_f32),
-          scale(scale) {}
+          force_prec_f32(force_prec_f32) {}

     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
         struct ggml_tensor* w = params["weight"];
@@ -2000,7 +1986,7 @@ public:
         if (bias) {
             b = params["bias"];
         }
-        return ggml_nn_linear(ctx, x, w, b, force_prec_f32, scale);
+        return ggml_nn_linear(ctx, x, w, b, force_prec_f32);
     }
 };

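Since the updated Linear constructor above ends in two similar-looking booleans, here is a hedged usage sketch of the new call shape. The block name "proj" and the choice to enable force_prec_f32 are illustrative only; the argument order mirrors the FeedForward change in the first file.

// Hypothetical caller after this change: instead of threading a scale factor
// through Linear, a block that needs extra numeric headroom requests f32
// matmul precision via the last constructor argument.
blocks["proj"] = std::shared_ptr<GGMLBlock>(
    new Linear(inner_dim, dim_out,
               /*bias=*/true,
               /*force_f32=*/false,        // existing flag, unchanged by this diff
               /*force_prec_f32=*/true));  // replaces the old trailing `scale` argument
// Old call shape for comparison (removed): new Linear(inner_dim, dim_out, true, false, false, scale)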
View File

@@ -97,10 +97,7 @@ namespace Qwen {
         blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_dim, out_bias));
         // to_out.1 is nn.Dropout
-        float scale = 1.f / 32.f;
-        // The purpose of the scale here is to prevent NaN issues in certain situations.
-        // For example when using CUDA but the weights are k-quants (not all prompts).
-        blocks["to_add_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_context_dim, out_bias, false, false, scale));
+        blocks["to_add_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_context_dim, out_bias));
     }

     std::pair<ggml_tensor*, ggml_tensor*> forward(struct ggml_context* ctx,