diff --git a/common.hpp b/common.hpp
index bf4da24..9c8aba1 100644
--- a/common.hpp
+++ b/common.hpp
@@ -177,7 +177,7 @@ public:
     }
 };
 
-class GEGLU : public GGMLBlock {
+class GEGLU : public UnaryBlock {
 protected:
     int64_t dim_in;
     int64_t dim_out;
@@ -216,14 +216,41 @@ public:
     }
 };
 
+class GELU : public UnaryBlock {
+public:
+    GELU(int64_t dim_in, int64_t dim_out, bool bias = true) {
+        blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out, bias));
+    }
+
+    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+        // x: [ne3, ne2, ne1, dim_in]
+        // return: [ne3, ne2, ne1, dim_out]
+        auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
+
+        x = proj->forward(ctx, x);
+        x = ggml_gelu_inplace(ctx, x);
+        return x;
+    }
+};
+
 class FeedForward : public GGMLBlock {
 public:
+    enum class Activation {
+        GEGLU,
+        GELU
+    };
     FeedForward(int64_t dim,
                 int64_t dim_out,
-                int64_t mult = 4) {
+                int64_t mult          = 4,
+                Activation activation = Activation::GEGLU) {
         int64_t inner_dim = dim * mult;
 
-        blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GEGLU(dim, inner_dim));
+        if (activation == Activation::GELU) {
+            blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GELU(dim, inner_dim));
+        } else {
+            blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GEGLU(dim, inner_dim));
+        }
+        // net_1 is nn.Dropout(), skip for inference
         blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out));
     }
 
@@ -232,7 +259,7 @@ public:
         // x: [ne3, ne2, ne1, dim]
         // return: [ne3, ne2, ne1, dim_out]
 
-        auto net_0 = std::dynamic_pointer_cast<GEGLU>(blocks["net.0"]);
+        auto net_0 = std::dynamic_pointer_cast<UnaryBlock>(blocks["net.0"]);
         auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);
 
         x = net_0->forward(ctx, x);  // [ne3, ne2, ne1, inner_dim]
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index 423d3b9..7762ff5 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -27,7 +27,7 @@
 
 #include "avi_writer.h"
 
-#include "qwen.hpp"
+#include "qwen_image.hpp"
 
 #if defined(_WIN32)
 #define NOMINMAX
@@ -1142,7 +1142,7 @@ int main(int argc, const char* argv[]) {
     SDParams params;
     params.verbose = true;
     sd_set_log_callback(sd_log_cb, (void*)&params);
-    Qwen::Qwen2_5_VLEmbedder::load_from_file_and_test(argv[1]);
+    Qwen::QwenImageRunner::load_from_file_and_test(argv[1]);
     exit(1);
     parse_args(argc, argv, params);
     params.sample_params.guidance.slg.layers = params.skip_layers.data();
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index e26472a..99e53bf 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -1353,15 +1353,13 @@ __STATIC_INLINE__ std::vector<float> arange(float start, float end, float step =
 // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
 __STATIC_INLINE__ std::vector<float> timestep_embedding(std::vector<float> timesteps,
                                                         int dim,
-                                                        int max_period = 10000) {
+                                                        int max_period       = 10000,
+                                                        bool flip_sin_to_cos = true,
+                                                        float scale          = 1.f) {
     // timesteps: [N,]
     // embedding: [N, dim]
-    size_t N        = timesteps.size();
-    int acutual_dim = dim;
-    if (dim % 2 != 0) {
-        acutual_dim = dim + 1;
-    }
-    std::vector<float> embedding(N * acutual_dim, 0.f);
+    size_t N = timesteps.size();
+    std::vector<float> embedding(N * dim, 0.f);
     int half = dim / 2;
     std::vector<float> freqs(half);
     for (int i = 0; i < half; ++i) {
@@ -1369,9 +1367,14 @@ __STATIC_INLINE__ std::vector<float> timestep_embedding(std::vector<float> times
     }
     for (int i = 0; i < N; ++i) {
         for (int j = 0; j < half; ++j) {
-            float arg = timesteps[i] * freqs[j];
-            embedding[i * acutual_dim + j]        = std::cos(arg);
-            embedding[i * acutual_dim + j + half] = std::sin(arg);
+            float arg = timesteps[i] * freqs[j] * scale;
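+            // flip_sin_to_cos picks the [cos | sin] half-order used by diffusers'
+            // get_timestep_embedding (the DDPM convention is [sin | cos]); scale
+            // multiplies the argument before the sinusoid (Qwen-Image passes 1000.f)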
+            if (flip_sin_to_cos) {
+                embedding[i * dim + j]        = std::cos(arg);
+                embedding[i * dim + j + half] = std::sin(arg);
+            } else {
+                embedding[i * dim + j]        = std::sin(arg);
+                embedding[i * dim + j + half] = std::cos(arg);
+            }
         }
     }
     return embedding;
@@ -1392,11 +1395,7 @@ __STATIC_INLINE__ struct ggml_tensor* new_timestep_embedding(struct ggml_context
     // timesteps: [N,]
     // embedding: [N, dim]
     std::vector<float> embedding_vec = timestep_embedding(timesteps, dim, max_period);
-    int acutual_dim = dim;
-    if (dim % 2 != 0) {
-        acutual_dim = dim + 1;
-    }
-    struct ggml_tensor* embedding = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, acutual_dim, timesteps.size());
+    struct ggml_tensor* embedding = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, timesteps.size());
     if (embedding->data != NULL) {
         memcpy(((char*)embedding->data), ((char*)embedding_vec.data()), ggml_nbytes(embedding));
     } else {
diff --git a/model.cpp b/model.cpp
index 0d9574a..66ae574 100644
--- a/model.cpp
+++ b/model.cpp
@@ -728,6 +728,7 @@ void preprocess_tensor(TensorStorage tensor_storage,
 
     // convert unet transformer linear to conv2d 1x1
     if (starts_with(new_name, "model.diffusion_model.") &&
+        !starts_with(new_name, "model.diffusion_model.proj_out.") &&
         (ends_with(new_name, "proj_in.weight") || ends_with(new_name, "proj_out.weight"))) {
         tensor_storage.unsqueeze();
     }
diff --git a/qwen_image.hpp b/qwen_image.hpp
new file mode 100644
index 0000000..f2dc3de
--- /dev/null
+++ b/qwen_image.hpp
@@ -0,0 +1,642 @@
+#ifndef __QWEN_IMAGE_HPP__
+#define __QWEN_IMAGE_HPP__
+
+#include "common.hpp"
+#include "flux.hpp"
+#include "ggml_extend.hpp"
+
+namespace Qwen {
+    constexpr int QWEN_IMAGE_GRAPH_SIZE = 20480;
+
+    struct TimestepEmbedding : public GGMLBlock {
+    public:
+        TimestepEmbedding(int64_t in_channels,
+                          int64_t time_embed_dim,
+                          int64_t out_dim       = 0,
+                          int64_t cond_proj_dim = 0,
+                          bool sample_proj_bias = true) {
+            blocks["linear_1"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, time_embed_dim, sample_proj_bias));
+            if (cond_proj_dim > 0) {
+                blocks["cond_proj"] = std::shared_ptr<GGMLBlock>(new Linear(cond_proj_dim, in_channels, false));
+            }
+            if (out_dim <= 0) {
+                out_dim = time_embed_dim;
+            }
+            blocks["linear_2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, out_dim, sample_proj_bias));
+        }
+
+        struct ggml_tensor* forward(struct ggml_context* ctx,
+                                    struct ggml_tensor* sample,
+                                    struct ggml_tensor* condition = nullptr) {
+            if (condition != nullptr) {
+                auto cond_proj = std::dynamic_pointer_cast<Linear>(blocks["cond_proj"]);
+                sample         = ggml_add(ctx, sample, cond_proj->forward(ctx, condition));
+            }
+            auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["linear_1"]);
+            auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["linear_2"]);
+
+            sample = linear_1->forward(ctx, sample);
+            sample = ggml_silu_inplace(ctx, sample);
+            sample = linear_2->forward(ctx, sample);
+            return sample;
+        }
+    };
+
+    struct QwenTimestepProjEmbeddings : public GGMLBlock {
+    public:
+        QwenTimestepProjEmbeddings(int64_t embedding_dim) {
+            blocks["timestep_embedder"] = std::shared_ptr<GGMLBlock>(new TimestepEmbedding(256, embedding_dim));
+        }
+
+        struct ggml_tensor* forward(struct ggml_context* ctx,
+                                    struct ggml_tensor* timesteps) {
+            // timesteps: [N,]
+            // return: [N, embedding_dim]
+            auto timestep_embedder = std::dynamic_pointer_cast<TimestepEmbedding>(blocks["timestep_embedder"]);
+
+            auto timesteps_proj = ggml_nn_timestep_embedding(ctx, timesteps, 256, 10000, 1000.f);
+            auto timesteps_emb  = timestep_embedder->forward(ctx, timesteps_proj);
+            return timesteps_emb;
+        }
+    };
+
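+    // Joint self-attention over the concatenated [txt, img] token stream
+    // (MMDiT-style, as in Flux): image and text tokens get separate q/k/v
+    // projections and QK RMSNorm, attend together under a shared RoPE, then
+    // are split back out through separate output projections.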
+    struct QwenImageAttention : public GGMLBlock {
+    protected:
+        int64_t dim_head;
+        bool flash_attn;
+
+    public:
+        QwenImageAttention(int64_t query_dim,
+                           int64_t dim_head,
+                           int64_t num_heads,
+                           int64_t out_dim         = 0,
+                           int64_t out_context_dim = 0,
+                           bool bias               = true,
+                           bool out_bias           = true,
+                           float eps               = 1e-6,
+                           bool flash_attn         = false)
+            : dim_head(dim_head), flash_attn(flash_attn) {
+            int64_t inner_dim = out_dim > 0 ? out_dim : dim_head * num_heads;
+            out_dim           = out_dim > 0 ? out_dim : query_dim;
+            out_context_dim   = out_context_dim > 0 ? out_context_dim : query_dim;
+
+            blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, bias));
+            blocks["to_k"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, bias));
+            blocks["to_v"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, bias));
+
+            blocks["norm_q"] = std::shared_ptr<GGMLBlock>(new RMSNorm(dim_head, eps));
+            blocks["norm_k"] = std::shared_ptr<GGMLBlock>(new RMSNorm(dim_head, eps));
+
+            blocks["add_q_proj"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, bias));
+            blocks["add_k_proj"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, bias));
+            blocks["add_v_proj"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, bias));
+
+            blocks["norm_added_q"] = std::shared_ptr<GGMLBlock>(new RMSNorm(dim_head, eps));
+            blocks["norm_added_k"] = std::shared_ptr<GGMLBlock>(new RMSNorm(dim_head, eps));
+
+            blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_dim, out_bias));
+            // to_out.1 is nn.Dropout
+
+            blocks["to_add_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_context_dim, out_bias));
+        }
+
+        std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx,
+                                                                    ggml_backend_t backend,
+                                                                    struct ggml_tensor* img,
+                                                                    struct ggml_tensor* txt,
+                                                                    struct ggml_tensor* pe,
+                                                                    struct ggml_tensor* mask = nullptr) {
+            // img: [N, n_img_token, hidden_size]
+            // txt: [N, n_txt_token, hidden_size]
+            // pe: [n_img_token + n_txt_token, d_head/2, 2, 2]
+            // return: ([N, n_img_token, hidden_size], [N, n_txt_token, hidden_size])
+
+            auto norm_q = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_q"]);
+            auto norm_k = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_k"]);
+
+            auto to_q     = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
+            auto to_k     = std::dynamic_pointer_cast<Linear>(blocks["to_k"]);
+            auto to_v     = std::dynamic_pointer_cast<Linear>(blocks["to_v"]);
+            auto to_out_0 = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
+
+            auto norm_added_q = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_added_q"]);
+            auto norm_added_k = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_added_k"]);
+
+            auto add_q_proj = std::dynamic_pointer_cast<Linear>(blocks["add_q_proj"]);
+            auto add_k_proj = std::dynamic_pointer_cast<Linear>(blocks["add_k_proj"]);
+            auto add_v_proj = std::dynamic_pointer_cast<Linear>(blocks["add_v_proj"]);
+            auto to_add_out = std::dynamic_pointer_cast<Linear>(blocks["to_add_out"]);
+
+            int64_t N           = img->ne[2];
+            int64_t n_img_token = img->ne[1];
+            int64_t n_txt_token = txt->ne[1];
+
+            auto img_q        = to_q->forward(ctx, img);
+            int64_t num_heads = img_q->ne[0] / dim_head;
+            img_q             = ggml_reshape_4d(ctx, img_q, dim_head, num_heads, n_img_token, N);  // [N, n_img_token, n_head, d_head]
+            auto img_k        = to_k->forward(ctx, img);
+            img_k             = ggml_reshape_4d(ctx, img_k, dim_head, num_heads, n_img_token, N);  // [N, n_img_token, n_head, d_head]
+            auto img_v        = to_v->forward(ctx, img);
+            img_v             = ggml_reshape_4d(ctx, img_v, dim_head, num_heads, n_img_token, N);  // [N, n_img_token, n_head, d_head]
+
+            img_q = norm_q->forward(ctx, img_q);
+            img_k = norm_k->forward(ctx, img_k);
+
+            auto txt_q = add_q_proj->forward(ctx, txt);
+            txt_q      = ggml_reshape_4d(ctx, txt_q, dim_head, num_heads, n_txt_token, N);  // [N, n_txt_token, n_head, d_head]
+            auto txt_k = add_k_proj->forward(ctx, txt);
+            txt_k      = ggml_reshape_4d(ctx, txt_k, dim_head, num_heads, n_txt_token, N);  // [N, n_txt_token, n_head, d_head]
+            auto txt_v = add_v_proj->forward(ctx, txt);
+            txt_v      = ggml_reshape_4d(ctx, txt_v, dim_head, num_heads, n_txt_token, N);  // [N, n_txt_token, n_head, d_head]
+
+            txt_q = norm_added_q->forward(ctx, txt_q);
+            txt_k = norm_added_k->forward(ctx, txt_k);
+
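+            // text tokens go first in the fused sequence; Rope::gen_qwen_image_ids
+            // emits position ids in the same txt-then-img order, keeping pe aligned
+            // with the concatenated q/k below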
+            auto q = ggml_concat(ctx, txt_q, img_q, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
+            auto k = ggml_concat(ctx, txt_k, img_k, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
+            auto v = ggml_concat(ctx, txt_v, img_v, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
+
+            auto attn = Flux::attention(ctx, backend, q, k, v, pe, mask, flash_attn);  // [N, n_txt_token + n_img_token, n_head*d_head]
+            attn      = ggml_cont(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3));           // [n_txt_token + n_img_token, N, hidden_size]
+            auto txt_attn_out = ggml_view_3d(ctx,
+                                             attn,
+                                             attn->ne[0],
+                                             attn->ne[1],
+                                             txt->ne[1],
+                                             attn->nb[1],
+                                             attn->nb[2],
+                                             0);  // [n_txt_token, N, hidden_size]
+            txt_attn_out = ggml_cont(ctx, ggml_permute(ctx, txt_attn_out, 0, 2, 1, 3));  // [N, n_txt_token, hidden_size]
+            auto img_attn_out = ggml_view_3d(ctx,
+                                             attn,
+                                             attn->ne[0],
+                                             attn->ne[1],
+                                             img->ne[1],
+                                             attn->nb[1],
+                                             attn->nb[2],
+                                             attn->nb[2] * txt->ne[1]);  // [n_img_token, N, hidden_size]
+            img_attn_out = ggml_cont(ctx, ggml_permute(ctx, img_attn_out, 0, 2, 1, 3));  // [N, n_img_token, hidden_size]
+
+            img_attn_out = to_out_0->forward(ctx, img_attn_out);
+            txt_attn_out = to_add_out->forward(ctx, txt_attn_out);
+
+            return {img_attn_out, txt_attn_out};
+        }
+    };
+
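+    // Dual-stream DiT block with adaLN-style modulation: SiLU + Linear on the
+    // timestep embedding produces 6 parameters per stream (shift/scale/gate for
+    // attention, then shift/scale/gate for the MLP), applied around the shared
+    // joint-attention call.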
+    class QwenImageTransformerBlock : public GGMLBlock {
+    public:
+        QwenImageTransformerBlock(int64_t dim,
+                                  int64_t num_attention_heads,
+                                  int64_t attention_head_dim,
+                                  float eps       = 1e-6,
+                                  bool flash_attn = false) {
+            // img_mod.0 is nn.SiLU()
+            blocks["img_mod.1"] = std::shared_ptr<GGMLBlock>(new Linear(dim, 6 * dim, true));
+
+            blocks["img_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim, eps, false));
+            blocks["img_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim, eps, false));
+            blocks["img_mlp"]   = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim, 4, FeedForward::Activation::GELU));
+
+            // txt_mod.0 is nn.SiLU()
+            blocks["txt_mod.1"] = std::shared_ptr<GGMLBlock>(new Linear(dim, 6 * dim, true));
+
+            blocks["txt_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim, eps, false));
+            blocks["txt_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim, eps, false));
+            blocks["txt_mlp"]   = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim, 4, FeedForward::Activation::GELU));
+
+            blocks["attn"] = std::shared_ptr<GGMLBlock>(new QwenImageAttention(dim,
+                                                                               attention_head_dim,
+                                                                               num_attention_heads,
+                                                                               0,     // out_dim
+                                                                               0,     // out_context_dim
+                                                                               true,  // bias
+                                                                               true,  // out_bias
+                                                                               eps,
+                                                                               flash_attn));
+        }
+
+        virtual std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx,
+                                                                            ggml_backend_t backend,
+                                                                            struct ggml_tensor* img,
+                                                                            struct ggml_tensor* txt,
+                                                                            struct ggml_tensor* t_emb,
+                                                                            struct ggml_tensor* pe) {
+            // img: [N, n_img_token, hidden_size]
+            // txt: [N, n_txt_token, hidden_size]
+            // pe: [n_img_token + n_txt_token, d_head/2, 2, 2]
+            // return: ([N, n_img_token, hidden_size], [N, n_txt_token, hidden_size])
+
+            auto img_mod_1 = std::dynamic_pointer_cast<Linear>(blocks["img_mod.1"]);
+            auto img_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["img_norm1"]);
+            auto img_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["img_norm2"]);
+            auto img_mlp   = std::dynamic_pointer_cast<FeedForward>(blocks["img_mlp"]);
+
+            auto txt_mod_1 = std::dynamic_pointer_cast<Linear>(blocks["txt_mod.1"]);
+            auto txt_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["txt_norm1"]);
+            auto txt_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["txt_norm2"]);
+            auto txt_mlp   = std::dynamic_pointer_cast<FeedForward>(blocks["txt_mlp"]);
+
+            auto attn = std::dynamic_pointer_cast<QwenImageAttention>(blocks["attn"]);
+
+            auto img_mod_params    = ggml_silu(ctx, t_emb);
+            img_mod_params         = img_mod_1->forward(ctx, img_mod_params);
+            auto img_mod_param_vec = ggml_chunk(ctx, img_mod_params, 6, 0);
+
+            auto txt_mod_params    = ggml_silu(ctx, t_emb);
+            txt_mod_params         = txt_mod_1->forward(ctx, txt_mod_params);
+            auto txt_mod_param_vec = ggml_chunk(ctx, txt_mod_params, 6, 0);
+
+            auto img_normed    = img_norm1->forward(ctx, img);
+            auto img_modulated = Flux::modulate(ctx, img_normed, img_mod_param_vec[0], img_mod_param_vec[1]);
+            auto img_gate1     = img_mod_param_vec[2];
+
+            auto txt_normed    = txt_norm1->forward(ctx, txt);
+            auto txt_modulated = Flux::modulate(ctx, txt_normed, txt_mod_param_vec[0], txt_mod_param_vec[1]);
+            auto txt_gate1     = txt_mod_param_vec[2];
+
+            auto [img_attn_output, txt_attn_output] = attn->forward(ctx, backend, img_modulated, txt_modulated, pe);
+
+            img = ggml_add(ctx, img, ggml_mul(ctx, img_attn_output, img_gate1));
+            txt = ggml_add(ctx, txt, ggml_mul(ctx, txt_attn_output, txt_gate1));
+
+            auto img_normed2    = img_norm2->forward(ctx, img);
+            auto img_modulated2 = Flux::modulate(ctx, img_normed2, img_mod_param_vec[3], img_mod_param_vec[4]);
+            auto img_gate2      = img_mod_param_vec[5];
+
+            auto txt_normed2    = txt_norm2->forward(ctx, txt);
+            auto txt_modulated2 = Flux::modulate(ctx, txt_normed2, txt_mod_param_vec[3], txt_mod_param_vec[4]);
+            auto txt_gate2      = txt_mod_param_vec[5];
+
+            auto img_mlp_out = img_mlp->forward(ctx, img_modulated2);
+            auto txt_mlp_out = txt_mlp->forward(ctx, txt_modulated2);
+
+            img = ggml_add(ctx, img, ggml_mul(ctx, img_mlp_out, img_gate2));
+            txt = ggml_add(ctx, txt, ggml_mul(ctx, txt_mlp_out, txt_gate2));
+
+            return {img, txt};
+        }
+    };
+
+    struct AdaLayerNormContinuous : public GGMLBlock {
+    public:
+        AdaLayerNormContinuous(int64_t embedding_dim,
+                               int64_t conditioning_embedding_dim,
+                               bool elementwise_affine = true,
+                               float eps               = 1e-5f,
+                               bool bias               = true) {
+            blocks["norm"]   = std::shared_ptr<GGMLBlock>(new LayerNorm(conditioning_embedding_dim, eps, elementwise_affine, bias));
+            blocks["linear"] = std::shared_ptr<GGMLBlock>(new Linear(conditioning_embedding_dim, embedding_dim * 2, bias));
+        }
+
+        struct ggml_tensor* forward(struct ggml_context* ctx,
+                                    struct ggml_tensor* x,
+                                    struct ggml_tensor* c) {
+            // x: [N, n_token, hidden_size]
+            // c: [N, hidden_size]
+            // return: [N, n_token, hidden_size]
+
+            auto norm   = std::dynamic_pointer_cast<LayerNorm>(blocks["norm"]);
+            auto linear = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
+
+            auto emb   = linear->forward(ctx, ggml_silu(ctx, c));
+            auto mods  = ggml_chunk(ctx, emb, 2, 0);
+            auto scale = mods[0];
+            auto shift = mods[1];
+
+            x = norm->forward(ctx, x);
+            x = Flux::modulate(ctx, x, shift, scale);
+
+            return x;
+        }
+    };
+
+    struct QwenImageParams {
+        int64_t patch_size          = 2;
+        int64_t in_channels         = 64;
+        int64_t out_channels        = 16;
+        int64_t num_layers          = 60;
+        int64_t attention_head_dim  = 128;
+        int64_t num_attention_heads = 24;
+        int64_t joint_attention_dim = 3584;
+        float theta                 = 10000;
+        std::vector<int> axes_dim   = {16, 56, 56};
+        int64_t axes_dim_sum        = 128;
+        bool flash_attn             = false;
+    };
+
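+    // Defaults match the Qwen-Image transformer config: 60 layers of
+    // 24 heads x 128 head dim (hidden size 3072), a 3584-wide text stream
+    // (Qwen2.5-VL hidden size), and 3-axis RoPE whose axes_dim sums to the
+    // head dim (16 + 56 + 56 = 128).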
+    class QwenImageModel : public GGMLBlock {
+    protected:
+        QwenImageParams params;
+
+    public:
+        QwenImageModel() {}
+        QwenImageModel(QwenImageParams params)
+            : params(params) {
+            int64_t inner_dim = params.num_attention_heads * params.attention_head_dim;
+
+            blocks["time_text_embed"] = std::shared_ptr<GGMLBlock>(new QwenTimestepProjEmbeddings(inner_dim));
+            blocks["txt_norm"]        = std::shared_ptr<GGMLBlock>(new RMSNorm(params.joint_attention_dim, 1e-6f));
+            blocks["img_in"]          = std::shared_ptr<GGMLBlock>(new Linear(params.in_channels, inner_dim));
+            blocks["txt_in"]          = std::shared_ptr<GGMLBlock>(new Linear(params.joint_attention_dim, inner_dim));
+
+            // blocks
+            for (int i = 0; i < params.num_layers; i++) {
+                auto block = std::shared_ptr<GGMLBlock>(new QwenImageTransformerBlock(inner_dim,
+                                                                                      params.num_attention_heads,
+                                                                                      params.attention_head_dim,
+                                                                                      1e-6f,
+                                                                                      params.flash_attn));
+
+                blocks["transformer_blocks." + std::to_string(i)] = block;
+            }
+
+            blocks["norm_out"] = std::shared_ptr<GGMLBlock>(new AdaLayerNormContinuous(inner_dim, inner_dim, false, 1e-6f));
+            blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, params.patch_size * params.patch_size * params.out_channels));
+        }
+
+        struct ggml_tensor* pad_to_patch_size(struct ggml_context* ctx,
+                                              struct ggml_tensor* x) {
+            int64_t W = x->ne[0];
+            int64_t H = x->ne[1];
+
+            int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size;
+            int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size;
+            x         = ggml_pad(ctx, x, pad_w, pad_h, 0, 0);  // [N, C, H + pad_h, W + pad_w]
+            return x;
+        }
+
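+        // patchify flattens non-overlapping patch_size x patch_size patches into
+        // tokens, e.g. with p = 2 a [1, 16, 4, 4] latent becomes 4 tokens of
+        // length C * p * p = 64, which is exactly img_in's in_channels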
LOG_DEBUG("xxx"); + auto img = img_in->forward(ctx, x); + LOG_DEBUG("xxx"); + auto txt = txt_norm->forward(ctx, context); + LOG_DEBUG("xxx"); + txt = txt_in->forward(ctx, txt); + LOG_DEBUG("xxx"); + + for (int i = 0; i < params.num_layers; i++) { + auto block = std::dynamic_pointer_cast(blocks["transformer_blocks." + std::to_string(i)]); + + auto result = block->forward(ctx, backend, img, txt, t_emb, pe); + img = result.first; + txt = result.second; + } + + img = norm_out->forward(ctx, img, t_emb); + img = proj_out->forward(ctx, img); + + return img; + } + + struct ggml_tensor* forward(struct ggml_context* ctx, + ggml_backend_t backend, + struct ggml_tensor* x, + struct ggml_tensor* timestep, + struct ggml_tensor* context, + struct ggml_tensor* pe) { + // Forward pass of DiT. + // x: [N, C, H, W] + // timestep: [N,] + // context: [N, L, D] + // pe: [L, d_head/2, 2, 2] + // return: [N, C, H, W] + + int64_t W = x->ne[0]; + int64_t H = x->ne[1]; + int64_t C = x->ne[2]; + int64_t N = x->ne[3]; + + x = pad_to_patch_size(ctx, x); + x = patchify(ctx, x); + + int64_t h_len = ((H + (params.patch_size / 2)) / params.patch_size); + int64_t w_len = ((W + (params.patch_size / 2)) / params.patch_size); + + auto out = forward_orig(ctx, backend, x, timestep, context, pe); // [N, h_len*w_len, ph*pw*C] + + out = unpatchify(ctx, out, h_len, w_len); // [N, C, H + pad_h, W + pad_w] + + // slice + out = ggml_slice(ctx, out, 1, 0, H); // [N, C, H, W + pad_w] + out = ggml_slice(ctx, out, 0, 0, W); // [N, C, H, W] + + return out; + } + }; + + struct QwenImageRunner : public GGMLRunner { + public: + QwenImageParams qwen_image_params; + QwenImageModel qwen_image; + std::vector pe_vec; + SDVersion version; + + QwenImageRunner(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2GGMLType& tensor_types = {}, + const std::string prefix = "", + SDVersion version = VERSION_FLUX, + bool flash_attn = false) + : GGMLRunner(backend, offload_params_to_cpu) { + qwen_image_params.flash_attn = flash_attn; + qwen_image = QwenImageModel(qwen_image_params); + qwen_image.init(params_ctx, tensor_types, prefix); + } + + std::string get_desc() { + return "qwen_image"; + } + + void get_param_tensors(std::map& tensors, const std::string prefix) { + qwen_image.get_param_tensors(tensors, prefix); + } + + struct ggml_cgraph* build_graph(struct ggml_tensor* x, + struct ggml_tensor* timesteps, + struct ggml_tensor* context) { + GGML_ASSERT(x->ne[3] == 1); + struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, QWEN_IMAGE_GRAPH_SIZE, false); + + x = to_backend(x); + context = to_backend(context); + timesteps = to_backend(timesteps); + + pe_vec = Rope::gen_qwen_image_pe(x->ne[1], + x->ne[0], + qwen_image_params.patch_size, + x->ne[3], + context->ne[1], + qwen_image_params.theta, + qwen_image_params.axes_dim); + int pos_len = pe_vec.size() / qwen_image_params.axes_dim_sum / 2; + // LOG_DEBUG("pos_len %d", pos_len); + auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, qwen_image_params.axes_dim_sum / 2, pos_len); + // pe->data = pe_vec.data(); + // print_ggml_tensor(pe); + // pe->data = NULL; + set_backend_tensor_data(pe, pe_vec.data()); + + struct ggml_tensor* out = qwen_image.forward(compute_ctx, + runtime_backend, + x, + timesteps, + context, + pe); + + ggml_build_forward_expand(gf, out); + + return gf; + } + + void compute(int n_threads, + struct ggml_tensor* x, + struct ggml_tensor* timesteps, + struct ggml_tensor* context, + struct ggml_tensor** output = NULL, + struct ggml_context* output_ctx = NULL) { + // 
+            pe_vec = Rope::gen_qwen_image_pe(x->ne[1],
+                                             x->ne[0],
+                                             qwen_image_params.patch_size,
+                                             x->ne[3],
+                                             context->ne[1],
+                                             qwen_image_params.theta,
+                                             qwen_image_params.axes_dim);
+            int pos_len = pe_vec.size() / qwen_image_params.axes_dim_sum / 2;
+            // LOG_DEBUG("pos_len %d", pos_len);
+            auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, qwen_image_params.axes_dim_sum / 2, pos_len);
+            // pe->data = pe_vec.data();
+            // print_ggml_tensor(pe);
+            // pe->data = NULL;
+            set_backend_tensor_data(pe, pe_vec.data());
+
+            struct ggml_tensor* out = qwen_image.forward(compute_ctx,
+                                                         runtime_backend,
+                                                         x,
+                                                         timesteps,
+                                                         context,
+                                                         pe);
+
+            ggml_build_forward_expand(gf, out);
+
+            return gf;
+        }
+
+        void compute(int n_threads,
+                     struct ggml_tensor* x,
+                     struct ggml_tensor* timesteps,
+                     struct ggml_tensor* context,
+                     struct ggml_tensor** output     = NULL,
+                     struct ggml_context* output_ctx = NULL) {
+            // x: [N, in_channels, h, w]
+            // timesteps: [N,]
+            // context: [N, max_position, hidden_size]
+            auto get_graph = [&]() -> struct ggml_cgraph* {
+                return build_graph(x, timesteps, context);
+            };
+
+            GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+        }
+
+        void test() {
+            struct ggml_init_params params;
+            params.mem_size   = static_cast<size_t>(1024 * 1024) * 1024;  // 1GB
+            params.mem_buffer = NULL;
+            params.no_alloc   = false;
+
+            struct ggml_context* work_ctx = ggml_init(params);
+            GGML_ASSERT(work_ctx != NULL);
+
+            {
+                // cpu f16:
+                // auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 16, 16, 16, 1);
+                // ggml_set_f32(x, 0.01f);
+                auto x = load_tensor_from_file(work_ctx, "./qwen_image_x.bin");
+                print_ggml_tensor(x);
+
+                std::vector<float> timesteps_vec(1, 1.f);
+                auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
+
+                // auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 3584, 256, 1);
+                // ggml_set_f32(context, 0.01f);
+                auto context = load_tensor_from_file(work_ctx, "./qwen_image_context.bin");
+                print_ggml_tensor(context);
+
+                struct ggml_tensor* out = NULL;
+
+                int t0 = ggml_time_ms();
+                compute(8, x, timesteps, context, &out, work_ctx);
+                int t1 = ggml_time_ms();
+
+                print_ggml_tensor(out);
+                LOG_DEBUG("qwen_image test done in %dms", t1 - t0);
+            }
+        }
+
+        static void load_from_file_and_test(const std::string& file_path) {
+            // cuda q8: pass
+            // ggml_backend_t backend = ggml_backend_cuda_init(0);
+            ggml_backend_t backend    = ggml_backend_cpu_init();
+            ggml_type model_data_type = GGML_TYPE_Q8_0;
+
+            ModelLoader model_loader;
+            if (!model_loader.init_from_file(file_path, "model.diffusion_model.")) {
+                LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
+                return;
+            }
+
+            auto tensor_types = model_loader.tensor_storages_types;
+            for (auto& item : tensor_types) {
+                // LOG_DEBUG("%s %u", item.first.c_str(), item.second);
+                if (ends_with(item.first, "weight")) {
+                    item.second = model_data_type;
+                }
+            }
+
+            std::shared_ptr<QwenImageRunner> qwen_image = std::shared_ptr<QwenImageRunner>(new QwenImageRunner(backend,
+                                                                                                               false,
+                                                                                                               tensor_types,
+                                                                                                               "model.diffusion_model"));
+
+            qwen_image->alloc_params_buffer();
+            std::map<std::string, struct ggml_tensor*> tensors;
+            qwen_image->get_param_tensors(tensors, "model.diffusion_model");
+
+            bool success = model_loader.load_tensors(tensors);
+
+            if (!success) {
+                LOG_ERROR("load tensors from model loader failed");
+                return;
+            }
+
+            LOG_INFO("qwen_image model loaded");
+            qwen_image->test();
+        }
+    };
+
+}  // namespace Qwen
+
+#endif  // __QWEN_IMAGE_HPP__
\ No newline at end of file
diff --git a/qwenvl.hpp b/qwenvl.hpp
index 46ae7f7..bb2b0e0 100644
--- a/qwenvl.hpp
+++ b/qwenvl.hpp
@@ -1,8 +1,6 @@
 #ifndef __QWENVL_HPP__
 #define __QWENVL_HPP__
 
-#include "ggml_extend.hpp"
-
 #include
 #include
 #include
@@ -15,6 +13,7 @@
 #include
 
 #include "clip.hpp"
+#include "ggml_extend.hpp"
 #include "json.hpp"
 #include "tokenize_util.h"
 
@@ -360,7 +359,7 @@ namespace Qwen {
         }
     };
 
-    class Qwen2_5_VLAttention : public GGMLBlock {
+    struct Qwen2_5_VLAttention : public GGMLBlock {
     protected:
         int64_t head_dim;
         int64_t num_heads;
diff --git a/rope.hpp b/rope.hpp
index bde075a..5e3aaf9 100644
--- a/rope.hpp
+++ b/rope.hpp
@@ -203,6 +203,38 @@ struct Rope {
         return embed_nd(ids, bs, theta, axes_dim);
     }
 
+    static std::vector<std::vector<float>> gen_qwen_image_ids(int h,
+                                                              int w,
+                                                              int patch_size,
+                                                              int bs,
+                                                              int context_len) {
+        int h_len        = (h + (patch_size / 2)) / patch_size;
+        int w_len        = (w + (patch_size / 2)) / patch_size;
+        int txt_id_start = std::max(h_len, w_len);
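+        // text token ids sit on the diagonal (id, id, id) starting at
+        // max(h_len, w_len), just past the image grid, so text and image
+        // RoPE positions never collide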
+        auto txt_ids = linspace(txt_id_start, context_len + txt_id_start, context_len);
+        std::vector<std::vector<float>> txt_ids_repeated(bs * context_len, std::vector<float>(3));
+        for (int i = 0; i < bs; ++i) {
+            for (size_t j = 0; j < txt_ids.size(); ++j) {
+                txt_ids_repeated[i * txt_ids.size() + j] = {txt_ids[j], txt_ids[j], txt_ids[j]};
+            }
+        }
+        auto img_ids = gen_img_ids(h, w, patch_size, bs);
+        auto ids     = concat_ids(txt_ids_repeated, img_ids, bs);
+        return ids;
+    }
+
+    // Generate qwen_image positional embeddings
+    static std::vector<float> gen_qwen_image_pe(int h,
+                                                int w,
+                                                int patch_size,
+                                                int bs,
+                                                int context_len,
+                                                int theta,
+                                                const std::vector<int>& axes_dim) {
+        std::vector<std::vector<float>> ids = gen_qwen_image_ids(h, w, patch_size, bs, context_len);
+        return embed_nd(ids, bs, theta, axes_dim);
+    }
+
     static std::vector<std::vector<float>> gen_vid_ids(int t,
                                                        int h,
                                                        int w,
diff --git a/wan.hpp b/wan.hpp
index 7e3510a..af829b1 100644
--- a/wan.hpp
+++ b/wan.hpp
@@ -1833,7 +1833,7 @@ namespace WAN {
                                             struct ggml_tensor* x) {
             int64_t W = x->ne[0];
             int64_t H = x->ne[1];
-            int64_t T = x->ne[1];
+            int64_t T = x->ne[2];
 
             int pad_t = (std::get<0>(params.patch_size) - T % std::get<0>(params.patch_size)) % std::get<0>(params.patch_size);
             int pad_h = (std::get<1>(params.patch_size) - H % std::get<1>(params.patch_size)) % std::get<1>(params.patch_size);