add qwen image model

2026-06-25 15:46:40 +00:00 · 2025-09-21 23:37:13 +08:00 · 2025-09-21 23:37:13 +08:00 · d232509b6e
commit d232509b6e
parent d8d4c268dc
8 changed files with 725 additions and 25 deletions
--- a/common.hpp
+++ b/common.hpp
@ -177,7 +177,7 @@ public:
    }
 };

-class GEGLU : public GGMLBlock {
+class GEGLU : public UnaryBlock {
 protected:
    int64_t dim_in;
    int64_t dim_out;
@ -216,14 +216,41 @@ public:
    }
 };

+class GELU : public UnaryBlock {
+public:
+    GELU(int64_t dim_in, int64_t dim_out, bool bias = true) {
+        blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out, bias));
+    }
+
+    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+        // x: [ne3, ne2, ne1, dim_in]
+        // return: [ne3, ne2, ne1, dim_out]
+        auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
+
+        x = proj->forward(ctx, x);
+        x = ggml_gelu_inplace(ctx, x);
+        return x;
+    }
+};
+
 class FeedForward : public GGMLBlock {
 public:
+    enum class Activation {
+        GEGLU,
+        GELU
+    };
    FeedForward(int64_t dim,
                int64_t dim_out,
-                int64_t mult = 4) {
+                int64_t mult          = 4,
+                Activation activation = Activation::GEGLU) {
        int64_t inner_dim = dim * mult;

-        blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GEGLU(dim, inner_dim));
+        if (activation == Activation::GELU) {
+            blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GELU(dim, inner_dim));
+        } else {
+            blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GEGLU(dim, inner_dim));
+        }
+
        // net_1 is nn.Dropout(), skip for inference
        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out));
    }
@ -232,7 +259,7 @@ public:
        // x: [ne3, ne2, ne1, dim]
        // return: [ne3, ne2, ne1, dim_out]

-        auto net_0 = std::dynamic_pointer_cast<GEGLU>(blocks["net.0"]);
+        auto net_0 = std::dynamic_pointer_cast<UnaryBlock>(blocks["net.0"]);
        auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);

        x = net_0->forward(ctx, x);  // [ne3, ne2, ne1, inner_dim]
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@ -27,7 +27,7 @@

 #include "avi_writer.h"

-#include "qwen.hpp"
+#include "qwen_image.hpp"

 #if defined(_WIN32)
 #define NOMINMAX
@ -1142,7 +1142,7 @@ int main(int argc, const char* argv[]) {
    SDParams params;
    params.verbose = true;
    sd_set_log_callback(sd_log_cb, (void*)&params);
-    Qwen::Qwen2_5_VLEmbedder::load_from_file_and_test(argv[1]);
+    Qwen::QwenImageRunner::load_from_file_and_test(argv[1]);
    exit(1);
    parse_args(argc, argv, params);
    params.sample_params.guidance.slg.layers                 = params.skip_layers.data();
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@ -1353,15 +1353,13 @@ __STATIC_INLINE__ std::vector<float> arange(float start, float end, float step =
 // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
 __STATIC_INLINE__ std::vector<float> timestep_embedding(std::vector<float> timesteps,
                                                        int dim,
-                                                        int max_period = 10000) {
+                                                        int max_period       = 10000,
+                                                        bool flip_sin_to_cos = true,
+                                                        float scale          = 1.f) {
    // timesteps: [N,]
    // embedding: [N, dim]
-    size_t N        = timesteps.size();
-    int acutual_dim = dim;
-    if (dim % 2 != 0) {
-        acutual_dim = dim + 1;
-    }
-    std::vector<float> embedding(N * acutual_dim, 0.f);
+    size_t N = timesteps.size();
+    std::vector<float> embedding(N * dim, 0.f);
    int half = dim / 2;
    std::vector<float> freqs(half);
    for (int i = 0; i < half; ++i) {
@ -1369,9 +1367,14 @@ __STATIC_INLINE__ std::vector<float> timestep_embedding(std::vector<float> times
    }
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < half; ++j) {
-            float arg                             = timesteps[i] * freqs[j];
-            embedding[i * acutual_dim + j]        = std::cos(arg);
-            embedding[i * acutual_dim + j + half] = std::sin(arg);
+            float arg = timesteps[i] * freqs[j] * scale;
+            if (flip_sin_to_cos) {
+                embedding[i * dim + j]        = std::cos(arg);
+                embedding[i * dim + j + half] = std::sin(arg);
+            } else {
+                embedding[i * dim + j]        = std::sin(arg);
+                embedding[i * dim + j + half] = std::cos(arg);
+            }
        }
    }
    return embedding;
@ -1392,11 +1395,7 @@ __STATIC_INLINE__ struct ggml_tensor* new_timestep_embedding(struct ggml_context
    // timesteps: [N,]
    // embedding: [N, dim]
    std::vector<float> embedding_vec = timestep_embedding(timesteps, dim, max_period);
-    int acutual_dim                  = dim;
-    if (dim % 2 != 0) {
-        acutual_dim = dim + 1;
-    }
-    struct ggml_tensor* embedding = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, acutual_dim, timesteps.size());
+    struct ggml_tensor* embedding    = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, timesteps.size());
    if (embedding->data != NULL) {
        memcpy(((char*)embedding->data), ((char*)embedding_vec.data()), ggml_nbytes(embedding));
    } else {
--- a/model.cpp
+++ b/model.cpp
@ -728,6 +728,7 @@ void preprocess_tensor(TensorStorage tensor_storage,

    // convert unet transformer linear to conv2d 1x1
    if (starts_with(new_name, "model.diffusion_model.") &&
+        !starts_with(new_name, "model.diffusion_model.proj_out.") &&
        (ends_with(new_name, "proj_in.weight") || ends_with(new_name, "proj_out.weight"))) {
        tensor_storage.unsqueeze();
    }
--- a/qwen_image.hpp
+++ b/qwen_image.hpp
@ -0,0 +1,642 @@
+#ifndef __QWEN_IMAGE_HPP__
+#define __QWEN_IMAGE_HPP__
+
+#include "common.hpp"
+#include "flux.hpp"
+#include "ggml_extend.hpp"
+
+namespace Qwen {
+    constexpr int QWEN_IMAGE_GRAPH_SIZE = 20480;
+
+    struct TimestepEmbedding : public GGMLBlock {
+    public:
+        TimestepEmbedding(int64_t in_channels,
+                          int64_t time_embed_dim,
+                          int64_t out_dim       = 0,
+                          int64_t cond_proj_dim = 0,
+                          bool sample_proj_bias = true) {
+            blocks["linear_1"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, time_embed_dim, sample_proj_bias));
+            if (cond_proj_dim > 0) {
+                blocks["cond_proj"] = std::shared_ptr<GGMLBlock>(new Linear(cond_proj_dim, in_channels, false));
+            }
+            if (out_dim <= 0) {
+                out_dim = time_embed_dim;
+            }
+            blocks["linear_2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, out_dim, sample_proj_bias));
+        }
+
+        struct ggml_tensor* forward(struct ggml_context* ctx,
+                                    struct ggml_tensor* sample,
+                                    struct ggml_tensor* condition = nullptr) {
+            if (condition != nullptr) {
+                auto cond_proj = std::dynamic_pointer_cast<Linear>(blocks["cond_proj"]);
+                sample         = ggml_add(ctx, sample, cond_proj->forward(ctx, condition));
+            }
+            auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["linear_1"]);
+            auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["linear_2"]);
+
+            sample = linear_1->forward(ctx, sample);
+            sample = ggml_silu_inplace(ctx, sample);
+            sample = linear_2->forward(ctx, sample);
+            return sample;
+        }
+    };
+
+    struct QwenTimestepProjEmbeddings : public GGMLBlock {
+    public:
+        QwenTimestepProjEmbeddings(int64_t embedding_dim) {
+            blocks["timestep_embedder"] = std::shared_ptr<GGMLBlock>(new TimestepEmbedding(256, embedding_dim));
+        }
+
+        struct ggml_tensor* forward(struct ggml_context* ctx,
+                                    struct ggml_tensor* timesteps) {
+            // timesteps: [N,]
+            // return: [N, embedding_dim]
+            auto timestep_embedder = std::dynamic_pointer_cast<TimestepEmbedding>(blocks["timestep_embedder"]);
+
+            auto timesteps_proj = ggml_nn_timestep_embedding(ctx, timesteps, 256, 10000, 1000.f);
+            auto timesteps_emb  = timestep_embedder->forward(ctx, timesteps_proj);
+            return timesteps_emb;
+        }
+    };
+
+    struct QwenImageAttention : public GGMLBlock {
+    protected:
+        int64_t dim_head;
+        bool flash_attn;
+
+    public:
+        QwenImageAttention(int64_t query_dim,
+                           int64_t dim_head,
+                           int64_t num_heads,
+                           int64_t out_dim         = 0,
+                           int64_t out_context_dim = 0,
+                           bool bias               = true,
+                           bool out_bias           = true,
+                           float eps               = 1e-6,
+                           bool flash_attn         = false)
+            : dim_head(dim_head), flash_attn(flash_attn) {
+            int64_t inner_dim = out_dim > 0 ? out_dim : dim_head * num_heads;
+            out_dim           = out_dim > 0 ? out_dim : query_dim;
+            out_context_dim   = out_context_dim > 0 ? out_context_dim : query_dim;
+
+            blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, bias));
+            blocks["to_k"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, bias));
+            blocks["to_v"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, bias));
+
+            blocks["norm_q"] = std::shared_ptr<GGMLBlock>(new RMSNorm(dim_head, eps));
+            blocks["norm_k"] = std::shared_ptr<GGMLBlock>(new RMSNorm(dim_head, eps));
+
+            blocks["add_q_proj"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, bias));
+            blocks["add_k_proj"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, bias));
+            blocks["add_v_proj"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, bias));
+
+            blocks["norm_added_q"] = std::shared_ptr<GGMLBlock>(new RMSNorm(dim_head, eps));
+            blocks["norm_added_k"] = std::shared_ptr<GGMLBlock>(new RMSNorm(dim_head, eps));
+
+            blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_dim, out_bias));
+            // to_out.1 is nn.Dropout
+
+            blocks["to_add_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_context_dim, out_bias));
+        }
+
+        std::pair<ggml_tensor*, ggml_tensor*> forward(struct ggml_context* ctx,
+                                                      ggml_backend_t backend,
+                                                      struct ggml_tensor* img,
+                                                      struct ggml_tensor* txt,
+                                                      struct ggml_tensor* pe,
+                                                      struct ggml_tensor* mask = nullptr) {
+            // img: [N, n_img_token, hidden_size]
+            // txt: [N, n_txt_token, hidden_size]
+            // pe: [n_img_token + n_txt_token, d_head/2, 2, 2]
+            // return: ([N, n_img_token, hidden_size], [N, n_txt_token, hidden_size])
+
+            auto norm_q = std::dynamic_pointer_cast<UnaryBlock>(blocks["norm_q"]);
+            auto norm_k = std::dynamic_pointer_cast<UnaryBlock>(blocks["norm_k"]);
+
+            auto to_q     = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
+            auto to_k     = std::dynamic_pointer_cast<Linear>(blocks["to_k"]);
+            auto to_v     = std::dynamic_pointer_cast<Linear>(blocks["to_v"]);
+            auto to_out_0 = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
+
+            auto norm_added_q = std::dynamic_pointer_cast<UnaryBlock>(blocks["norm_added_q"]);
+            auto norm_added_k = std::dynamic_pointer_cast<UnaryBlock>(blocks["norm_added_k"]);
+
+            auto add_q_proj = std::dynamic_pointer_cast<Linear>(blocks["add_q_proj"]);
+            auto add_k_proj = std::dynamic_pointer_cast<Linear>(blocks["add_k_proj"]);
+            auto add_v_proj = std::dynamic_pointer_cast<Linear>(blocks["add_v_proj"]);
+            auto to_add_out = std::dynamic_pointer_cast<Linear>(blocks["to_add_out"]);
+
+            int64_t N           = img->ne[2];
+            int64_t n_img_token = img->ne[1];
+            int64_t n_txt_token = txt->ne[1];
+
+            auto img_q        = to_q->forward(ctx, img);
+            int64_t num_heads = img_q->ne[0] / dim_head;
+            img_q             = ggml_reshape_4d(ctx, img_q, dim_head, num_heads, n_img_token, N);  // [N, n_img_token, n_head, d_head]
+            auto img_k        = to_k->forward(ctx, img);
+            img_k             = ggml_reshape_4d(ctx, img_k, dim_head, num_heads, n_img_token, N);  // [N, n_img_token, n_head, d_head]
+            auto img_v        = to_v->forward(ctx, img);
+            img_v             = ggml_reshape_4d(ctx, img_v, dim_head, num_heads, n_img_token, N);  // [N, n_img_token, n_head, d_head]
+
+            img_q = norm_q->forward(ctx, img_q);
+            img_k = norm_k->forward(ctx, img_k);
+
+            auto txt_q = add_q_proj->forward(ctx, txt);
+            txt_q      = ggml_reshape_4d(ctx, txt_q, dim_head, num_heads, n_txt_token, N);  // [N, n_txt_token, n_head, d_head]
+            auto txt_k = add_k_proj->forward(ctx, txt);
+            txt_k      = ggml_reshape_4d(ctx, txt_k, dim_head, num_heads, n_txt_token, N);  // [N, n_txt_token, n_head, d_head]
+            auto txt_v = add_v_proj->forward(ctx, txt);
+            txt_v      = ggml_reshape_4d(ctx, txt_v, dim_head, num_heads, n_txt_token, N);  // [N, n_txt_token, n_head, d_head]
+
+            txt_q = norm_added_q->forward(ctx, txt_q);
+            txt_k = norm_added_k->forward(ctx, txt_k);
+
+            auto q = ggml_concat(ctx, txt_q, img_q, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
+            auto k = ggml_concat(ctx, txt_k, img_k, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
+            auto v = ggml_concat(ctx, txt_v, img_v, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
+
+            auto attn         = Flux::attention(ctx, backend, q, k, v, pe, mask, flash_attn);  // [N, n_txt_token + n_img_token, n_head*d_head]
+            attn              = ggml_cont(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3));           // [n_txt_token + n_img_token, N, hidden_size]
+            auto txt_attn_out = ggml_view_3d(ctx,
+                                             attn,
+                                             attn->ne[0],
+                                             attn->ne[1],
+                                             txt->ne[1],
+                                             attn->nb[1],
+                                             attn->nb[2],
+                                             0);                                              // [n_txt_token, N, hidden_size]
+            txt_attn_out      = ggml_cont(ctx, ggml_permute(ctx, txt_attn_out, 0, 2, 1, 3));  // [N, n_txt_token, hidden_size]
+            auto img_attn_out = ggml_view_3d(ctx,
+                                             attn,
+                                             attn->ne[0],
+                                             attn->ne[1],
+                                             img->ne[1],
+                                             attn->nb[1],
+                                             attn->nb[2],
+                                             attn->nb[2] * txt->ne[1]);                       // [n_img_token, N, hidden_size]
+            img_attn_out      = ggml_cont(ctx, ggml_permute(ctx, img_attn_out, 0, 2, 1, 3));  // [N, n_img_token, hidden_size]
+
+            img_attn_out = to_out_0->forward(ctx, img_attn_out);
+            txt_attn_out = to_add_out->forward(ctx, txt_attn_out);
+
+            return {img_attn_out, txt_attn_out};
+        }
+    };
+
+    class QwenImageTransformerBlock : public GGMLBlock {
+    public:
+        QwenImageTransformerBlock(int64_t dim,
+                                  int64_t num_attention_heads,
+                                  int64_t attention_head_dim,
+                                  float eps       = 1e-6,
+                                  bool flash_attn = false) {
+            // img_mod.0 is nn.SiLU()
+            blocks["img_mod.1"] = std::shared_ptr<GGMLBlock>(new Linear(dim, 6 * dim, true));
+
+            blocks["img_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim, eps, false));
+            blocks["img_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim, eps, false));
+            blocks["img_mlp"]   = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim, 4, FeedForward::Activation::GELU));
+
+            // txt_mod.0 is nn.SiLU()
+            blocks["txt_mod.1"] = std::shared_ptr<GGMLBlock>(new Linear(dim, 6 * dim, true));
+
+            blocks["txt_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim, eps, false));
+            blocks["txt_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim, eps, false));
+            blocks["txt_mlp"]   = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim, 4, FeedForward::Activation::GELU));
+
+            blocks["attn"] = std::shared_ptr<GGMLBlock>(new QwenImageAttention(dim,
+                                                                               attention_head_dim,
+                                                                               num_attention_heads,
+                                                                               0,     // out_dim
+                                                                               0,     // out_context-dim
+                                                                               true,  // bias
+                                                                               true,  // out_bias
+                                                                               eps,
+                                                                               flash_attn));
+        }
+
+        virtual std::pair<ggml_tensor*, ggml_tensor*> forward(struct ggml_context* ctx,
+                                                              ggml_backend_t backend,
+                                                              struct ggml_tensor* img,
+                                                              struct ggml_tensor* txt,
+                                                              struct ggml_tensor* t_emb,
+                                                              struct ggml_tensor* pe) {
+            // img: [N, n_img_token, hidden_size]
+            // txt: [N, n_txt_token, hidden_size]
+            // pe: [n_img_token + n_txt_token, d_head/2, 2, 2]
+            // return: ([N, n_img_token, hidden_size], [N, n_txt_token, hidden_size])
+
+            auto img_mod_1 = std::dynamic_pointer_cast<Linear>(blocks["img_mod.1"]);
+            auto img_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["img_norm1"]);
+            auto img_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["img_norm2"]);
+            auto img_mlp   = std::dynamic_pointer_cast<FeedForward>(blocks["img_mlp"]);
+
+            auto txt_mod_1 = std::dynamic_pointer_cast<Linear>(blocks["txt_mod.1"]);
+            auto txt_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["txt_norm1"]);
+            auto txt_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["txt_norm2"]);
+            auto txt_mlp   = std::dynamic_pointer_cast<FeedForward>(blocks["txt_mlp"]);
+
+            auto attn = std::dynamic_pointer_cast<QwenImageAttention>(blocks["attn"]);
+
+            auto img_mod_params    = ggml_silu(ctx, t_emb);
+            img_mod_params         = img_mod_1->forward(ctx, img_mod_params);
+            auto img_mod_param_vec = ggml_chunk(ctx, img_mod_params, 6, 0);
+
+            auto txt_mod_params    = ggml_silu(ctx, t_emb);
+            txt_mod_params         = txt_mod_1->forward(ctx, txt_mod_params);
+            auto txt_mod_param_vec = ggml_chunk(ctx, txt_mod_params, 6, 0);
+
+            auto img_normed    = img_norm1->forward(ctx, img);
+            auto img_modulated = Flux::modulate(ctx, img_normed, img_mod_param_vec[0], img_mod_param_vec[1]);
+            auto img_gate1     = img_mod_param_vec[2];
+
+            auto txt_normed    = txt_norm1->forward(ctx, txt);
+            auto txt_modulated = Flux::modulate(ctx, txt_normed, txt_mod_param_vec[0], txt_mod_param_vec[1]);
+            auto txt_gate1     = txt_mod_param_vec[2];
+
+            auto [img_attn_output, txt_attn_output] = attn->forward(ctx, backend, img_modulated, txt_modulated, pe);
+
+            img = ggml_add(ctx, img, ggml_mul(ctx, img_attn_output, img_gate1));
+            txt = ggml_add(ctx, txt, ggml_mul(ctx, txt_attn_output, txt_gate1));
+
+            auto img_normed2    = img_norm2->forward(ctx, img);
+            auto img_modulated2 = Flux::modulate(ctx, img_normed2, img_mod_param_vec[3], img_mod_param_vec[4]);
+            auto img_gate2      = img_mod_param_vec[5];
+
+            auto txt_normed2    = txt_norm2->forward(ctx, txt);
+            auto txt_modulated2 = Flux::modulate(ctx, txt_normed2, txt_mod_param_vec[3], txt_mod_param_vec[4]);
+            auto txt_gate2      = txt_mod_param_vec[5];
+
+            auto img_mlp_out = img_mlp->forward(ctx, img_modulated2);
+            auto txt_mlp_out = txt_mlp->forward(ctx, txt_modulated2);
+
+            img = ggml_add(ctx, img, ggml_mul(ctx, img_mlp_out, img_gate2));
+            txt = ggml_add(ctx, txt, ggml_mul(ctx, txt_mlp_out, txt_gate2));
+
+            return {img, txt};
+        }
+    };
+
+    struct AdaLayerNormContinuous : public GGMLBlock {
+    public:
+        AdaLayerNormContinuous(int64_t embedding_dim,
+                               int64_t conditioning_embedding_dim,
+                               bool elementwise_affine = true,
+                               float eps               = 1e-5f,
+                               bool bias               = true) {
+            blocks["norm"]   = std::shared_ptr<GGMLBlock>(new LayerNorm(conditioning_embedding_dim, eps, elementwise_affine, bias));
+            blocks["linear"] = std::shared_ptr<GGMLBlock>(new Linear(conditioning_embedding_dim, embedding_dim * 2, bias));
+        }
+
+        struct ggml_tensor* forward(struct ggml_context* ctx,
+                                    struct ggml_tensor* x,
+                                    struct ggml_tensor* c) {
+            // x: [N, n_token, hidden_size]
+            // c: [N, hidden_size]
+            // return: [N, n_token, patch_size * patch_size * out_channels]
+
+            auto norm   = std::dynamic_pointer_cast<LayerNorm>(blocks["norm"]);
+            auto linear = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
+
+            auto emb   = linear->forward(ctx, ggml_silu(ctx, c));
+            auto mods  = ggml_chunk(ctx, emb, 2, 0);
+            auto scale = mods[0];
+            auto shift = mods[1];
+
+            x = norm->forward(ctx, x);
+            x = Flux::modulate(ctx, x, shift, scale);
+
+            return x;
+        }
+    };
+
+    struct QwenImageParams {
+        int64_t patch_size          = 2;
+        int64_t in_channels         = 64;
+        int64_t out_channels        = 16;
+        int64_t num_layers          = 60;
+        int64_t attention_head_dim  = 128;
+        int64_t num_attention_heads = 24;
+        int64_t joint_attention_dim = 3584;
+        float theta                 = 10000;
+        std::vector<int> axes_dim   = {16, 56, 56};
+        int64_t axes_dim_sum        = 128;
+        bool flash_attn             = false;
+    };
+
+    class QwenImageModel : public GGMLBlock {
+    protected:
+        QwenImageParams params;
+
+    public:
+        QwenImageModel() {}
+        QwenImageModel(QwenImageParams params)
+            : params(params) {
+            int64_t inner_dim         = params.num_attention_heads * params.attention_head_dim;
+            blocks["time_text_embed"] = std::shared_ptr<GGMLBlock>(new QwenTimestepProjEmbeddings(inner_dim));
+            blocks["txt_norm"]        = std::shared_ptr<GGMLBlock>(new RMSNorm(params.joint_attention_dim, 1e-6f));
+            blocks["img_in"]          = std::shared_ptr<GGMLBlock>(new Linear(params.in_channels, inner_dim));
+            blocks["txt_in"]          = std::shared_ptr<GGMLBlock>(new Linear(params.joint_attention_dim, inner_dim));
+
+            // blocks
+            for (int i = 0; i < params.num_layers; i++) {
+                auto block                                        = std::shared_ptr<GGMLBlock>(new QwenImageTransformerBlock(inner_dim,
+                                                                                                                             params.num_attention_heads,
+                                                                                                                             params.attention_head_dim,
+                                                                                                                             1e-6f,
+                                                                                                                             params.flash_attn));
+                blocks["transformer_blocks." + std::to_string(i)] = block;
+            }
+
+            blocks["norm_out"] = std::shared_ptr<GGMLBlock>(new AdaLayerNormContinuous(inner_dim, inner_dim, false, 1e-6f));
+            blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, params.patch_size * params.patch_size * params.out_channels));
+        }
+
+        struct ggml_tensor* pad_to_patch_size(struct ggml_context* ctx,
+                                              struct ggml_tensor* x) {
+            int64_t W = x->ne[0];
+            int64_t H = x->ne[1];
+
+            int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size;
+            int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size;
+            x         = ggml_pad(ctx, x, pad_w, pad_h, 0, 0);  // [N, C, H + pad_h, W + pad_w]
+            return x;
+        }
+
+        struct ggml_tensor* patchify(struct ggml_context* ctx,
+                                     struct ggml_tensor* x) {
+            // x: [N, C, H, W]
+            // return: [N, h*w, C * patch_size * patch_size]
+            int64_t N = x->ne[3];
+            int64_t C = x->ne[2];
+            int64_t H = x->ne[1];
+            int64_t W = x->ne[0];
+            int64_t p = params.patch_size;
+            int64_t h = H / params.patch_size;
+            int64_t w = W / params.patch_size;
+
+            GGML_ASSERT(h * p == H && w * p == W);
+
+            x = ggml_reshape_4d(ctx, x, p, w, p, h * C * N);       // [N*C*h, p, w, p]
+            x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // [N*C*h, w, p, p]
+            x = ggml_reshape_4d(ctx, x, p * p, w * h, C, N);       // [N, C, h*w, p*p]
+            x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // [N, h*w, C, p*p]
+            x = ggml_reshape_3d(ctx, x, p * p * C, w * h, N);      // [N, h*w, C*p*p]
+            return x;
+        }
+
+        struct ggml_tensor* unpatchify(struct ggml_context* ctx,
+                                       struct ggml_tensor* x,
+                                       int64_t h,
+                                       int64_t w) {
+            // x: [N, h*w, C*patch_size*patch_size]
+            // return: [N, C, H, W]
+            int64_t N = x->ne[2];
+            int64_t C = x->ne[0] / params.patch_size / params.patch_size;
+            int64_t H = h * params.patch_size;
+            int64_t W = w * params.patch_size;
+            int64_t p = params.patch_size;
+
+            GGML_ASSERT(C * p * p == x->ne[0]);
+
+            x = ggml_reshape_4d(ctx, x, p * p, C, w * h, N);       // [N, h*w, C, p*p]
+            x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // [N, C, h*w, p*p]
+            x = ggml_reshape_4d(ctx, x, p, p, w, h * C * N);       // [N*C*h, w, p, p]
+            x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // [N*C*h, p, w, p]
+            x = ggml_reshape_4d(ctx, x, W, H, C, N);               // [N, C, h*p, w*p]
+
+            return x;
+        }
+
+        struct ggml_tensor* forward_orig(struct ggml_context* ctx,
+                                         ggml_backend_t backend,
+                                         struct ggml_tensor* x,
+                                         struct ggml_tensor* timestep,
+                                         struct ggml_tensor* context,
+                                         struct ggml_tensor* pe) {
+            auto time_text_embed = std::dynamic_pointer_cast<QwenTimestepProjEmbeddings>(blocks["time_text_embed"]);
+            auto txt_norm        = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm"]);
+            auto img_in          = std::dynamic_pointer_cast<Linear>(blocks["img_in"]);
+            auto txt_in          = std::dynamic_pointer_cast<Linear>(blocks["txt_in"]);
+            auto norm_out        = std::dynamic_pointer_cast<AdaLayerNormContinuous>(blocks["norm_out"]);
+            auto proj_out        = std::dynamic_pointer_cast<Linear>(blocks["proj_out"]);
+
+            auto t_emb = time_text_embed->forward(ctx, timestep);
+            LOG_DEBUG("xxx");
+            auto img = img_in->forward(ctx, x);
+            LOG_DEBUG("xxx");
+            auto txt = txt_norm->forward(ctx, context);
+            LOG_DEBUG("xxx");
+            txt = txt_in->forward(ctx, txt);
+            LOG_DEBUG("xxx");
+
+            for (int i = 0; i < params.num_layers; i++) {
+                auto block = std::dynamic_pointer_cast<QwenImageTransformerBlock>(blocks["transformer_blocks." + std::to_string(i)]);
+
+                auto result = block->forward(ctx, backend, img, txt, t_emb, pe);
+                img         = result.first;
+                txt         = result.second;
+            }
+
+            img = norm_out->forward(ctx, img, t_emb);
+            img = proj_out->forward(ctx, img);
+
+            return img;
+        }
+
+        struct ggml_tensor* forward(struct ggml_context* ctx,
+                                    ggml_backend_t backend,
+                                    struct ggml_tensor* x,
+                                    struct ggml_tensor* timestep,
+                                    struct ggml_tensor* context,
+                                    struct ggml_tensor* pe) {
+            // Forward pass of DiT.
+            // x: [N, C, H, W]
+            // timestep: [N,]
+            // context: [N, L, D]
+            // pe: [L, d_head/2, 2, 2]
+            // return: [N, C, H, W]
+
+            int64_t W = x->ne[0];
+            int64_t H = x->ne[1];
+            int64_t C = x->ne[2];
+            int64_t N = x->ne[3];
+
+            x = pad_to_patch_size(ctx, x);
+            x = patchify(ctx, x);
+
+            int64_t h_len = ((H + (params.patch_size / 2)) / params.patch_size);
+            int64_t w_len = ((W + (params.patch_size / 2)) / params.patch_size);
+
+            auto out = forward_orig(ctx, backend, x, timestep, context, pe);  // [N, h_len*w_len, ph*pw*C]
+
+            out = unpatchify(ctx, out, h_len, w_len);  // [N, C, H + pad_h, W + pad_w]
+
+            // slice
+            out = ggml_slice(ctx, out, 1, 0, H);  // [N, C, H, W + pad_w]
+            out = ggml_slice(ctx, out, 0, 0, W);  // [N, C, H, W]
+
+            return out;
+        }
+    };
+
+    struct QwenImageRunner : public GGMLRunner {
+    public:
+        QwenImageParams qwen_image_params;
+        QwenImageModel qwen_image;
+        std::vector<float> pe_vec;
+        SDVersion version;
+
+        QwenImageRunner(ggml_backend_t backend,
+                        bool offload_params_to_cpu,
+                        const String2GGMLType& tensor_types = {},
+                        const std::string prefix            = "",
+                        SDVersion version                   = VERSION_FLUX,
+                        bool flash_attn                     = false)
+            : GGMLRunner(backend, offload_params_to_cpu) {
+            qwen_image_params.flash_attn = flash_attn;
+            qwen_image                   = QwenImageModel(qwen_image_params);
+            qwen_image.init(params_ctx, tensor_types, prefix);
+        }
+
+        std::string get_desc() {
+            return "qwen_image";
+        }
+
+        void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
+            qwen_image.get_param_tensors(tensors, prefix);
+        }
+
+        struct ggml_cgraph* build_graph(struct ggml_tensor* x,
+                                        struct ggml_tensor* timesteps,
+                                        struct ggml_tensor* context) {
+            GGML_ASSERT(x->ne[3] == 1);
+            struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, QWEN_IMAGE_GRAPH_SIZE, false);
+
+            x         = to_backend(x);
+            context   = to_backend(context);
+            timesteps = to_backend(timesteps);
+
+            pe_vec      = Rope::gen_qwen_image_pe(x->ne[1],
+                                                  x->ne[0],
+                                                  qwen_image_params.patch_size,
+                                                  x->ne[3],
+                                                  context->ne[1],
+                                                  qwen_image_params.theta,
+                                                  qwen_image_params.axes_dim);
+            int pos_len = pe_vec.size() / qwen_image_params.axes_dim_sum / 2;
+            // LOG_DEBUG("pos_len %d", pos_len);
+            auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, qwen_image_params.axes_dim_sum / 2, pos_len);
+            // pe->data = pe_vec.data();
+            // print_ggml_tensor(pe);
+            // pe->data = NULL;
+            set_backend_tensor_data(pe, pe_vec.data());
+
+            struct ggml_tensor* out = qwen_image.forward(compute_ctx,
+                                                         runtime_backend,
+                                                         x,
+                                                         timesteps,
+                                                         context,
+                                                         pe);
+
+            ggml_build_forward_expand(gf, out);
+
+            return gf;
+        }
+
+        void compute(int n_threads,
+                     struct ggml_tensor* x,
+                     struct ggml_tensor* timesteps,
+                     struct ggml_tensor* context,
+                     struct ggml_tensor** output     = NULL,
+                     struct ggml_context* output_ctx = NULL) {
+            // x: [N, in_channels, h, w]
+            // timesteps: [N, ]
+            // context: [N, max_position, hidden_size]
+            auto get_graph = [&]() -> struct ggml_cgraph* {
+                return build_graph(x, timesteps, context);
+            };
+
+            GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+        }
+
+        void test() {
+            struct ggml_init_params params;
+            params.mem_size   = static_cast<size_t>(1024 * 1024) * 1024;  // 1GB
+            params.mem_buffer = NULL;
+            params.no_alloc   = false;
+
+            struct ggml_context* work_ctx = ggml_init(params);
+            GGML_ASSERT(work_ctx != NULL);
+
+            {
+                // cpu f16:
+                // auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 16, 16, 16, 1);
+                // ggml_set_f32(x, 0.01f);
+                auto x = load_tensor_from_file(work_ctx, "./qwen_image_x.bin");
+                print_ggml_tensor(x);
+
+                std::vector<float> timesteps_vec(1, 1.f);
+                auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
+
+                // auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 3584, 256, 1);
+                // ggml_set_f32(context, 0.01f);
+                auto context = load_tensor_from_file(work_ctx, "./qwen_image_context.bin");
+                print_ggml_tensor(context);
+
+                struct ggml_tensor* out = NULL;
+
+                int t0 = ggml_time_ms();
+                compute(8, x, timesteps, context, &out, work_ctx);
+                int t1 = ggml_time_ms();
+
+                print_ggml_tensor(out);
+                LOG_DEBUG("qwen_image test done in %dms", t1 - t0);
+            }
+        }
+
+        static void load_from_file_and_test(const std::string& file_path) {
+            // cuda q8: pass
+            // ggml_backend_t backend    = ggml_backend_cuda_init(0);
+            ggml_backend_t backend    = ggml_backend_cpu_init();
+            ggml_type model_data_type = GGML_TYPE_Q8_0;
+
+            ModelLoader model_loader;
+            if (!model_loader.init_from_file(file_path, "model.diffusion_model.")) {
+                LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
+                return;
+            }
+
+            auto tensor_types = model_loader.tensor_storages_types;
+            for (auto& item : tensor_types) {
+                // LOG_DEBUG("%s %u", item.first.c_str(), item.second);
+                if (ends_with(item.first, "weight")) {
+                    item.second = model_data_type;
+                }
+            }
+
+            std::shared_ptr<QwenImageRunner> qwen_image = std::shared_ptr<QwenImageRunner>(new QwenImageRunner(backend,
+                                                                                                               false,
+                                                                                                               tensor_types,
+                                                                                                               "model.diffusion_model"));
+
+            qwen_image->alloc_params_buffer();
+            std::map<std::string, ggml_tensor*> tensors;
+            qwen_image->get_param_tensors(tensors, "model.diffusion_model");
+
+            bool success = model_loader.load_tensors(tensors);
+
+            if (!success) {
+                LOG_ERROR("load tensors from model loader failed");
+                return;
+            }
+
+            LOG_INFO("qwen_image model loaded");
+            qwen_image->test();
+        }
+    };
+
+}  // namespace name
+
+#endif  // __QWEN_IMAGE_HPP__
--- a/qwenvl.hpp
+++ b/qwenvl.hpp
@ -1,8 +1,6 @@
 #ifndef __QWENVL_HPP__
 #define __QWENVL_HPP__

-#include "ggml_extend.hpp"
-
 #include <algorithm>
 #include <fstream>
 #include <iostream>
@ -15,6 +13,7 @@
 #include <vector>

 #include "clip.hpp"
+#include "ggml_extend.hpp"
 #include "json.hpp"
 #include "tokenize_util.h"

@ -360,7 +359,7 @@ namespace Qwen {
        }
    };

-    class Qwen2_5_VLAttention : public GGMLBlock {
+    struct Qwen2_5_VLAttention : public GGMLBlock {
    protected:
        int64_t head_dim;
        int64_t num_heads;
--- a/rope.hpp
+++ b/rope.hpp
@ -203,6 +203,38 @@ struct Rope {
        return embed_nd(ids, bs, theta, axes_dim);
    }

+    static std::vector<std::vector<float>> gen_qwen_image_ids(int h,
+                                                              int w,
+                                                              int patch_size,
+                                                              int bs,
+                                                              int context_len) {
+        int h_len        = (h + (patch_size / 2)) / patch_size;
+        int w_len        = (w + (patch_size / 2)) / patch_size;
+        int txt_id_start = std::max(h_len, w_len);
+        auto txt_ids     = linspace<float>(txt_id_start, context_len + txt_id_start, context_len);
+        std::vector<std::vector<float>> txt_ids_repeated(bs * context_len, std::vector<float>(3));
+        for (int i = 0; i < bs; ++i) {
+            for (int j = 0; j < txt_ids.size(); ++j) {
+                txt_ids_repeated[i * txt_ids.size() + j] = {txt_ids[j], txt_ids[j], txt_ids[j]};
+            }
+        }
+        auto img_ids = gen_img_ids(h, w, patch_size, bs);
+        auto ids     = concat_ids(txt_ids_repeated, img_ids, bs);
+        return ids;
+    }
+
+    // Generate qwen_image positional embeddings
+    static std::vector<float> gen_qwen_image_pe(int h,
+                                                int w,
+                                                int patch_size,
+                                                int bs,
+                                                int context_len,
+                                                int theta,
+                                                const std::vector<int>& axes_dim) {
+        std::vector<std::vector<float>> ids = gen_qwen_image_ids(h, w, patch_size, bs, context_len);
+        return embed_nd(ids, bs, theta, axes_dim);
+    }
+
    static std::vector<std::vector<float>> gen_vid_ids(int t,
                                                       int h,
                                                       int w,
--- a/wan.hpp
+++ b/wan.hpp
@ -1833,7 +1833,7 @@ namespace WAN {
                                              struct ggml_tensor* x) {
            int64_t W = x->ne[0];
            int64_t H = x->ne[1];
-            int64_t T = x->ne[1];
+            int64_t T = x->ne[2];

            int pad_t = (std::get<0>(params.patch_size) - T % std::get<0>(params.patch_size)) % std::get<0>(params.patch_size);
            int pad_h = (std::get<1>(params.patch_size) - H % std::get<1>(params.patch_size)) % std::get<1>(params.patch_size);