make qwen image a little faster

2026-06-25 15:46:40 +00:00 · 2026-01-25 00:25:06 +08:00 · 2026-01-25 00:25:06 +08:00 · e2600bd442
commit e2600bd442
parent 6f4b49239c
2 changed files with 22 additions and 8 deletions
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@ -690,6 +690,18 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_slice(struct ggml_context* ctx,
                                                     int64_t end,
                                                     bool cont = true) {
    GGML_ASSERT(dim >= 0 && dim < 4);
+    if (x->ne[dim] == 1) {
+        return x;
+    }
+    while (start < 0) {
+        start = x->ne[dim] + start;
+    }
+    while (end < 0) {
+        end = x->ne[dim] + end;
+    }
+    GGML_ASSERT(end > start);
+    GGML_ASSERT(start >= 0 && start < x->ne[dim]);
+    GGML_ASSERT(end > start && end <= x->ne[dim]);

    int64_t slice_size  = end - start;
    int64_t slice_ne[4] = {x->ne[0], x->ne[1], x->ne[2], x->ne[3]};
@ -944,6 +956,9 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_linear(struct ggml_context* ctx,
                                                      bool force_prec_f32 = false,
                                                      float scale         = 1.f) {
    if (scale != 1.f) {
+        if (!ggml_is_contiguous(x)) {
+            x = ggml_cont(ctx, x);
+        }
        x = ggml_scale(ctx, x, scale);
    }
    if (x->ne[2] * x->ne[3] > 1024) {
--- a/qwen_image.hpp
+++ b/qwen_image.hpp
@ -163,25 +163,24 @@ namespace Qwen {
            auto v = ggml_concat(ctx->ggml_ctx, txt_v, img_v, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]

            auto attn         = Rope::attention(ctx, q, k, v, pe, mask, (1.0f / 128.f));  // [N, n_txt_token + n_img_token, n_head*d_head]
-            attn              = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, attn, 0, 2, 1, 3));  // [n_txt_token + n_img_token, N, hidden_size]
            auto txt_attn_out = ggml_view_3d(ctx->ggml_ctx,
                                             attn,
                                             attn->ne[0],
-                                             attn->ne[1],
                                             txt->ne[1],
+                                             attn->ne[2],
                                             attn->nb[1],
                                             attn->nb[2],
-                                             0);                                                                  // [n_txt_token, N, hidden_size]
-            txt_attn_out      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, txt_attn_out, 0, 2, 1, 3));  // [N, n_txt_token, hidden_size]
+                                             0);  // [N, n_txt_token, n_head*d_head]
            auto img_attn_out = ggml_view_3d(ctx->ggml_ctx,
                                             attn,
                                             attn->ne[0],
-                                             attn->ne[1],
                                             img->ne[1],
+                                             attn->ne[2],
                                             attn->nb[1],
                                             attn->nb[2],
-                                             attn->nb[2] * txt->ne[1]);                                           // [n_img_token, N, hidden_size]
-            img_attn_out      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, img_attn_out, 0, 2, 1, 3));  // [N, n_img_token, hidden_size]
+                                             txt->ne[1] * attn->nb[1]);  // [N, n_img_token, n_head*d_head]
+            img_attn_out      = ggml_cont(ctx->ggml_ctx, img_attn_out);
+            txt_attn_out      = ggml_cont(ctx->ggml_ctx, txt_attn_out);

            img_attn_out = to_out_0->forward(ctx, img_attn_out);
            txt_attn_out = to_add_out->forward(ctx, txt_attn_out);