From e9e952462f2be9242dc2d24e914b3b7d6221f654 Mon Sep 17 00:00:00 2001
From: Daniele <57776841+daniandtheweb@users.noreply.github.com>
Date: Sun, 21 Jun 2026 18:21:38 +0200
Subject: [PATCH] fix: workaround for Ernie with Vulkan and Flash Attention (#1680)

---
 src/model/diffusion/ernie_image.hpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/model/diffusion/ernie_image.hpp b/src/model/diffusion/ernie_image.hpp
index 12fcada5..0427b3b3 100644
--- a/src/model/diffusion/ernie_image.hpp
+++ b/src/model/diffusion/ernie_image.hpp
@@ -162,6 +162,8 @@ namespace ErnieImage {
             int64_t S = x->ne[1];
             int64_t N = x->ne[2];
 
+            float scale = (sd_backend_is(ctx->backend, "Vulkan") && ctx->flash_attn_enabled) ? 1.0f / 32.0f : 1.0f;
+
             auto q = to_q->forward(ctx, x);
             auto k = to_k->forward(ctx, x);
             auto v = to_v->forward(ctx, x);
@@ -182,7 +184,7 @@ namespace ErnieImage {
             k = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, k, 0, 2, 1, 3));  // [N, heads, S, head_dim]
             k = ggml_reshape_3d(ctx->ggml_ctx, k, k->ne[0], k->ne[1], k->ne[2] * k->ne[3]);
 
-            x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, true, ctx->flash_attn_enabled);  // [N, S, hidden_size]
+            x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, true, ctx->flash_attn_enabled, scale);  // [N, S, hidden_size]
             x = to_out_0->forward(ctx, x);
             return x;
         }