From 4edc3ad2ad89feb8b0cac427b2f945535fb01fcb Mon Sep 17 00:00:00 2001
From: leejet
Date: Mon, 13 Oct 2025 23:02:24 +0800
Subject: [PATCH] to_out.0 precision fix

---
 qwen_image.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/qwen_image.hpp b/qwen_image.hpp
index 3ac32de..630e553 100644
--- a/qwen_image.hpp
+++ b/qwen_image.hpp
@@ -94,12 +94,12 @@ namespace Qwen {
             blocks["norm_added_q"] = std::shared_ptr<GGMLBlock>(new RMSNorm(dim_head, eps));
             blocks["norm_added_k"] = std::shared_ptr<GGMLBlock>(new RMSNorm(dim_head, eps));
 
-            blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_dim, out_bias));
-            // to_out.1 is nn.Dropout
-
             float scale = 1.f / 32.f;
             // The purpose of the scale here is to prevent NaN issues in certain situations.
             // For example when using CUDA but the weights are k-quants (not all prompts).
+            blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_dim, out_bias, false, false, scale));
+            // to_out.1 is nn.Dropout
+
             blocks["to_add_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_context_dim, out_bias, false, false, scale));
         }
 
@@ -159,7 +159,7 @@ namespace Qwen {
             auto k = ggml_concat(ctx, txt_k, img_k, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
             auto v = ggml_concat(ctx, txt_v, img_v, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
 
-            auto attn = Rope::attention(ctx, backend, q, k, v, pe, mask, flash_attn, (1.0f / 256.f));  // [N, n_txt_token + n_img_token, n_head*d_head]
+            auto attn = Rope::attention(ctx, backend, q, k, v, pe, mask, flash_attn, (1.0f / 128.f));  // [N, n_txt_token + n_img_token, n_head*d_head]
 
             attn = ggml_cont(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3));  // [n_txt_token + n_img_token, N, hidden_size]
             auto txt_attn_out = ggml_view_3d(ctx, attn,