From e2600bd442bf6c588b5e03e15dcc3df3e32f3bf0 Mon Sep 17 00:00:00 2001 From: leejet Date: Sun, 25 Jan 2026 00:25:06 +0800 Subject: [PATCH] make qwen image a little faster --- ggml_extend.hpp | 15 +++++++++++++++ qwen_image.hpp | 15 +++++++-------- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 692ba85..f61f7c6 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -690,6 +690,18 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_slice(struct ggml_context* ctx, int64_t end, bool cont = true) { GGML_ASSERT(dim >= 0 && dim < 4); + if (x->ne[dim] == 1) { + return x; + } + while (start < 0) { + start = x->ne[dim] + start; + } + while (end < 0) { + end = x->ne[dim] + end; + } + GGML_ASSERT(end > start); + GGML_ASSERT(start >= 0 && start < x->ne[dim]); + GGML_ASSERT(end > start && end <= x->ne[dim]); int64_t slice_size = end - start; int64_t slice_ne[4] = {x->ne[0], x->ne[1], x->ne[2], x->ne[3]}; @@ -944,6 +956,9 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_linear(struct ggml_context* ctx, bool force_prec_f32 = false, float scale = 1.f) { if (scale != 1.f) { + if (!ggml_is_contiguous(x)) { + x = ggml_cont(ctx, x); + } x = ggml_scale(ctx, x, scale); } if (x->ne[2] * x->ne[3] > 1024) { diff --git a/qwen_image.hpp b/qwen_image.hpp index ec2231b..dfa5397 100644 --- a/qwen_image.hpp +++ b/qwen_image.hpp @@ -162,26 +162,25 @@ namespace Qwen { auto k = ggml_concat(ctx->ggml_ctx, txt_k, img_k, 2); // [N, n_txt_token + n_img_token, n_head, d_head] auto v = ggml_concat(ctx->ggml_ctx, txt_v, img_v, 2); // [N, n_txt_token + n_img_token, n_head, d_head] - auto attn = Rope::attention(ctx, q, k, v, pe, mask, (1.0f / 128.f)); // [N, n_txt_token + n_img_token, n_head*d_head] - attn = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, attn, 0, 2, 1, 3)); // [n_txt_token + n_img_token, N, hidden_size] + auto attn = Rope::attention(ctx, q, k, v, pe, mask, (1.0f / 128.f)); // [N, n_txt_token + n_img_token, n_head*d_head] auto 
txt_attn_out = ggml_view_3d(ctx->ggml_ctx, attn, attn->ne[0], - attn->ne[1], txt->ne[1], + attn->ne[2], attn->nb[1], attn->nb[2], - 0); // [n_txt_token, N, hidden_size] - txt_attn_out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, txt_attn_out, 0, 2, 1, 3)); // [N, n_txt_token, hidden_size] + 0); // [N, n_txt_token, n_head*d_head] auto img_attn_out = ggml_view_3d(ctx->ggml_ctx, attn, attn->ne[0], - attn->ne[1], img->ne[1], + attn->ne[2], attn->nb[1], attn->nb[2], - attn->nb[2] * txt->ne[1]); // [n_img_token, N, hidden_size] - img_attn_out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, img_attn_out, 0, 2, 1, 3)); // [N, n_img_token, hidden_size] + txt->ne[1] * attn->nb[1]); // [N, n_img_token, n_head*d_head] + img_attn_out = ggml_cont(ctx->ggml_ctx, img_attn_out); + txt_attn_out = ggml_cont(ctx->ggml_ctx, txt_attn_out); img_attn_out = to_out_0->forward(ctx, img_attn_out); txt_attn_out = to_add_out->forward(ctx, txt_attn_out);