#ifndef __SD_MODEL_DIFFUSION_IDEOGRAM4_HPP__ #define __SD_MODEL_DIFFUSION_IDEOGRAM4_HPP__ #include #include #include #include #include #include #include "core/ggml_extend.hpp" #include "core/ggml_graph_cut.h" #include "model/common/rope.hpp" #include "model/diffusion/model.hpp" namespace Ideogram4 { constexpr int IDEOGRAM4_GRAPH_SIZE = 65536; constexpr int OUTPUT_IMAGE_INDICATOR = 2; constexpr int IMAGE_POSITION_OFFSET = 65536; constexpr int DEFAULT_MROPE_SECTION_T = 24; constexpr int DEFAULT_MROPE_SECTION_H = 20; constexpr int DEFAULT_MROPE_SECTION_W = 20; constexpr int TIMESTEP_MAX_PERIOD = 10000; constexpr int LLM_HIDDEN_STATE_LAYERS = 13; struct Ideogram4Config { int64_t emb_dim = 4608; int64_t num_layers = 34; int64_t num_heads = 18; int64_t intermediate_size = 12288; int64_t adanln_dim = 512; int64_t in_channels = 128; int64_t llm_features_dim = 53248; int64_t rope_theta = 5000000; float norm_eps = 1e-5f; int patch_size = 2; int ae_channels = 32; std::vector mrope_section = {DEFAULT_MROPE_SECTION_T, DEFAULT_MROPE_SECTION_H, DEFAULT_MROPE_SECTION_W}; static Ideogram4Config detect_from_weights(const String2TensorStorage& tensor_storage_map, const std::string& prefix) { Ideogram4Config config; int64_t detected_layers = 0; std::string layer_prefix = prefix.empty() ? "layers." : prefix + ".layers."; for (const auto& [name, _] : tensor_storage_map) { if (name.find(layer_prefix) != 0) { continue; } std::string tail = name.substr(layer_prefix.size()); size_t dot = tail.find('.'); if (dot == std::string::npos) { continue; } int layer_idx = std::atoi(tail.substr(0, dot).c_str()); detected_layers = std::max(detected_layers, layer_idx + 1); } if (detected_layers > 0) { config.num_layers = detected_layers; LOG_DEBUG("ideogram4: num_layers = %" PRId64 ", emb_dim = %" PRId64 ", num_heads = %" PRId64 ", intermediate_size = %" PRId64, config.num_layers, config.emb_dim, config.num_heads, config.intermediate_size); } return config; } }; __STATIC_INLINE__ ggml_tensor* timestep_embedding_sin_cos(ggml_context* ctx, ggml_tensor* timesteps, int dim) { GGML_ASSERT(dim % 2 == 0); auto embedding = ggml_ext_timestep_embedding(ctx, timesteps, dim, TIMESTEP_MAX_PERIOD, 10.f); auto chunks = ggml_ext_chunk(ctx, embedding, 2, 0); return ggml_concat(ctx, chunks[1], chunks[0], 0); } __STATIC_INLINE__ ggml_tensor* to_token_modulation(ggml_context* ctx, ggml_tensor* x) { // [N, C] -> [N, 1, C] in PyTorch layout. if (ggml_n_dims(x) < 3 || x->ne[1] != 1) { x = ggml_reshape_3d(ctx, x, x->ne[0], 1, x->ne[1]); } return x; } __STATIC_INLINE__ ggml_tensor* interleave_hidden_state_layers(ggml_context* ctx, ggml_tensor* x) { // Match upstream stack(...).permute(1, 2, 3, 0).reshape(...): // [layers * hidden, tokens, batch] -> [hidden * layers, tokens, batch]. GGML_ASSERT(x->ne[0] % LLM_HIDDEN_STATE_LAYERS == 0); const int64_t hidden_size = x->ne[0] / LLM_HIDDEN_STATE_LAYERS; const int64_t token_count = x->ne[1]; const int64_t batch_count = x->ne[2]; x = ggml_reshape_4d(ctx, x, hidden_size, LLM_HIDDEN_STATE_LAYERS, token_count, batch_count); x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3)); return ggml_reshape_3d(ctx, x, hidden_size * LLM_HIDDEN_STATE_LAYERS, token_count, batch_count); } __STATIC_INLINE__ ggml_tensor* modulate(ggml_context* ctx, ggml_tensor* x, ggml_tensor* scale) { scale = to_token_modulation(ctx, scale); return ggml_add(ctx, x, ggml_mul(ctx, x, scale)); } __STATIC_INLINE__ ggml_tensor* patchify(ggml_context* ctx, ggml_tensor* x, const Ideogram4Config& config) { // x: [N, 128, H, W] with channel order [ae, ph, pw]. // return: [N, H*W, 128] with token channel order [ph, pw, ae]. const int64_t W = x->ne[0]; const int64_t H = x->ne[1]; const int64_t C = x->ne[2]; const int64_t N = x->ne[3]; GGML_ASSERT(N == 1); GGML_ASSERT(C == config.ae_channels * config.patch_size * config.patch_size); x = ggml_cont(ctx, x); x = ggml_reshape_4d(ctx, x, W * H, config.patch_size, config.patch_size, config.ae_channels); x = ggml_cont(ctx, ggml_permute(ctx, x, 3, 1, 2, 0)); x = ggml_reshape_3d(ctx, x, C, W * H, N); return x; } __STATIC_INLINE__ ggml_tensor* unpatchify(ggml_context* ctx, ggml_tensor* x, int64_t H, int64_t W, const Ideogram4Config& config) { const int64_t C = x->ne[0]; const int64_t N = x->ne[2]; GGML_ASSERT(N == 1); GGML_ASSERT(C == config.ae_channels * config.patch_size * config.patch_size); GGML_ASSERT(x->ne[1] == H * W); x = ggml_reshape_4d(ctx, x, config.ae_channels, config.patch_size, config.patch_size, H * W); x = ggml_cont(ctx, ggml_permute(ctx, x, 3, 1, 2, 0)); x = ggml_reshape_4d(ctx, x, W, H, C, N); return x; } __STATIC_INLINE__ std::shared_ptr make_linear(int64_t in_features, int64_t out_features, bool bias = true) { return std::make_shared(in_features, out_features, bias, false, false, 1.f, true); } __STATIC_INLINE__ std::vector gen_ideogram4_pe(int grid_h, int grid_w, int bs, int context_len, int head_dim, int rope_theta, const std::vector& mrope_section, bool circular_x = false, bool circular_y = false) { GGML_ASSERT(bs == 1); std::vector> ids(static_cast(bs) * (context_len + grid_h * grid_w), std::vector(3, 0.f)); for (int i = 0; i < context_len; ++i) { ids[i] = {static_cast(i), static_cast(i), static_cast(i)}; } int cursor = context_len; for (int y = 0; y < grid_h; ++y) { for (int x = 0; x < grid_w; ++x) { ids[cursor++] = {static_cast(IMAGE_POSITION_OFFSET), static_cast(IMAGE_POSITION_OFFSET + y), static_cast(IMAGE_POSITION_OFFSET + x)}; } } std::vector> axis_wrap_dims(3); if (circular_y || circular_x) { size_t total_len = static_cast(bs) * (context_len + grid_h * grid_w); axis_wrap_dims[1].assign(total_len, 0); axis_wrap_dims[2].assign(total_len, 0); if (circular_y) { for (size_t idx = static_cast(context_len); idx < total_len; ++idx) { axis_wrap_dims[1][idx] = grid_h; } } if (circular_x) { for (size_t idx = static_cast(context_len); idx < total_len; ++idx) { axis_wrap_dims[2][idx] = grid_w; } } } return Rope::embed_interleaved_mrope(ids, bs, static_cast(rope_theta), head_dim, mrope_section, axis_wrap_dims); } class Ideogram4Attention : public GGMLBlock { protected: int64_t hidden_size; int64_t num_heads; int64_t head_dim; public: Ideogram4Attention(int64_t hidden_size, int64_t num_heads, float eps) : hidden_size(hidden_size), num_heads(num_heads), head_dim(hidden_size / num_heads) { GGML_ASSERT(hidden_size % num_heads == 0); blocks["qkv"] = make_linear(hidden_size, hidden_size * 3, false); blocks["norm_q"] = std::make_shared(head_dim, eps); blocks["norm_k"] = std::make_shared(head_dim, eps); blocks["o"] = make_linear(hidden_size, hidden_size, false); } ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* pe, ggml_tensor* mask = nullptr) { int64_t n_token = x->ne[1]; int64_t N = x->ne[2]; auto qkv_proj = std::dynamic_pointer_cast(blocks["qkv"]); auto norm_q = std::dynamic_pointer_cast(blocks["norm_q"]); auto norm_k = std::dynamic_pointer_cast(blocks["norm_k"]); auto out_proj = std::dynamic_pointer_cast(blocks["o"]); auto qkv = qkv_proj->forward(ctx, x); auto qkv_vec = split_qkv(ctx->ggml_ctx, qkv); auto q = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[0], head_dim, num_heads, n_token, N); auto k = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[1], head_dim, num_heads, n_token, N); auto v = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[2], head_dim, num_heads, n_token, N); q = norm_q->forward(ctx, q); k = norm_k->forward(ctx, k); x = Rope::attention(ctx, q, k, v, pe, mask, 1.f / 128.f, false); x = out_proj->forward(ctx, x); return x; } }; class Ideogram4MLP : public GGMLBlock { public: Ideogram4MLP(int64_t dim, int64_t hidden_dim) { blocks["w1"] = make_linear(dim, hidden_dim, false); blocks["w2"] = make_linear(hidden_dim, dim, false); blocks["w3"] = make_linear(dim, hidden_dim, false); } ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { auto w1 = std::dynamic_pointer_cast(blocks["w1"]); auto w2 = std::dynamic_pointer_cast(blocks["w2"]); auto w3 = std::dynamic_pointer_cast(blocks["w3"]); auto x1 = ggml_silu(ctx->ggml_ctx, w1->forward(ctx, x)); auto x3 = w3->forward(ctx, x); x = ggml_mul(ctx->ggml_ctx, x1, x3); x = w2->forward(ctx, x); return x; } }; class Ideogram4TransformerBlock : public GGMLBlock { public: Ideogram4TransformerBlock(const Ideogram4Config& config) { blocks["attention"] = std::make_shared(config.emb_dim, config.num_heads, config.norm_eps); blocks["feed_forward"] = std::make_shared(config.emb_dim, config.intermediate_size); blocks["attention_norm1"] = std::make_shared(config.emb_dim, config.norm_eps); blocks["ffn_norm1"] = std::make_shared(config.emb_dim, config.norm_eps); blocks["attention_norm2"] = std::make_shared(config.emb_dim, config.norm_eps); blocks["ffn_norm2"] = std::make_shared(config.emb_dim, config.norm_eps); blocks["adaln_modulation"] = make_linear(config.adanln_dim, 4 * config.emb_dim, true); } ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* pe, ggml_tensor* adaln_input, ggml_tensor* mask = nullptr) { auto attention = std::dynamic_pointer_cast(blocks["attention"]); auto feed_forward = std::dynamic_pointer_cast(blocks["feed_forward"]); auto attention_norm1 = std::dynamic_pointer_cast(blocks["attention_norm1"]); auto ffn_norm1 = std::dynamic_pointer_cast(blocks["ffn_norm1"]); auto attention_norm2 = std::dynamic_pointer_cast(blocks["attention_norm2"]); auto ffn_norm2 = std::dynamic_pointer_cast(blocks["ffn_norm2"]); auto adaln_modulation = std::dynamic_pointer_cast(blocks["adaln_modulation"]); auto mod = adaln_modulation->forward(ctx, adaln_input); auto mods = ggml_ext_chunk(ctx->ggml_ctx, mod, 4, 0); auto scale_msa = mods[0]; auto gate_msa = to_token_modulation(ctx->ggml_ctx, ggml_tanh(ctx->ggml_ctx, mods[1])); auto scale_mlp = mods[2]; auto gate_mlp = to_token_modulation(ctx->ggml_ctx, ggml_tanh(ctx->ggml_ctx, mods[3])); auto attn_out = attention_norm1->forward(ctx, x); attn_out = modulate(ctx->ggml_ctx, attn_out, scale_msa); attn_out = attention->forward(ctx, attn_out, pe, mask); attn_out = attention_norm2->forward(ctx, attn_out); x = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, attn_out, gate_msa)); auto ffn_out = ffn_norm1->forward(ctx, x); ffn_out = modulate(ctx->ggml_ctx, ffn_out, scale_mlp); ffn_out = feed_forward->forward(ctx, ffn_out); ffn_out = ffn_norm2->forward(ctx, ffn_out); x = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, ffn_out, gate_mlp)); return x; } }; class Ideogram4EmbedScalar : public GGMLBlock { protected: int64_t dim; public: Ideogram4EmbedScalar(int64_t dim) : dim(dim) { blocks["mlp_in"] = make_linear(dim, dim, true); blocks["mlp_out"] = make_linear(dim, dim, true); } ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { auto mlp_in = std::dynamic_pointer_cast(blocks["mlp_in"]); auto mlp_out = std::dynamic_pointer_cast(blocks["mlp_out"]); x = timestep_embedding_sin_cos(ctx->ggml_ctx, x, static_cast(dim)); x = ggml_silu(ctx->ggml_ctx, mlp_in->forward(ctx, x)); x = mlp_out->forward(ctx, x); return x; } }; class Ideogram4FinalLayer : public GGMLBlock { public: Ideogram4FinalLayer(const Ideogram4Config& config) { blocks["norm_final"] = std::make_shared(config.emb_dim, 1e-6f, false); blocks["linear"] = make_linear(config.emb_dim, config.in_channels, true); blocks["adaln_modulation"] = make_linear(config.adanln_dim, config.emb_dim, true); } ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* c) { auto norm_final = std::dynamic_pointer_cast(blocks["norm_final"]); auto linear = std::dynamic_pointer_cast(blocks["linear"]); auto adaln_modulation = std::dynamic_pointer_cast(blocks["adaln_modulation"]); auto scale = adaln_modulation->forward(ctx, ggml_silu(ctx->ggml_ctx, c)); x = norm_final->forward(ctx, x); x = modulate(ctx->ggml_ctx, x, scale); x = linear->forward(ctx, x); return x; } }; class Ideogram4Transformer : public GGMLBlock { protected: Ideogram4Config config; public: Ideogram4Transformer() = default; explicit Ideogram4Transformer(Ideogram4Config config) : config(std::move(config)) { blocks["input_proj"] = make_linear(this->config.in_channels, this->config.emb_dim, true); blocks["llm_cond_norm"] = std::make_shared(this->config.llm_features_dim, 1e-6f); blocks["llm_cond_proj"] = make_linear(this->config.llm_features_dim, this->config.emb_dim, true); blocks["t_embedding"] = std::make_shared(this->config.emb_dim); blocks["adaln_proj"] = make_linear(this->config.emb_dim, this->config.adanln_dim, true); blocks["embed_image_indicator"] = std::make_shared(2, this->config.emb_dim); for (int i = 0; i < this->config.num_layers; ++i) { blocks["layers." + std::to_string(i)] = std::make_shared(this->config); } blocks["final_layer"] = std::make_shared(this->config); } ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* timestep, ggml_tensor* context, ggml_tensor* pe, ggml_tensor* image_indicator_ids) { int64_t W = x->ne[0]; int64_t H = x->ne[1]; int64_t N = x->ne[3]; GGML_ASSERT(N == 1); auto input_proj = std::dynamic_pointer_cast(blocks["input_proj"]); auto llm_cond_norm = std::dynamic_pointer_cast(blocks["llm_cond_norm"]); auto llm_cond_proj = std::dynamic_pointer_cast(blocks["llm_cond_proj"]); auto t_embedding = std::dynamic_pointer_cast(blocks["t_embedding"]); auto adaln_proj = std::dynamic_pointer_cast(blocks["adaln_proj"]); auto embed_image_indicator = std::dynamic_pointer_cast(blocks["embed_image_indicator"]); auto final_layer = std::dynamic_pointer_cast(blocks["final_layer"]); auto img = patchify(ctx->ggml_ctx, x, config); img = input_proj->forward(ctx, img); ggml_tensor* h = img; int64_t context_len = 0; if (context != nullptr) { if (ggml_n_dims(context) < 3) { context = ggml_reshape_3d(ctx->ggml_ctx, context, context->ne[0], context->ne[1], 1); } context = interleave_hidden_state_layers(ctx->ggml_ctx, context); context_len = context->ne[1]; auto txt = llm_cond_norm->forward(ctx, context); txt = llm_cond_proj->forward(ctx, txt); h = ggml_concat(ctx->ggml_ctx, txt, img, 1); } auto indicator_embedding = embed_image_indicator->forward(ctx, image_indicator_ids); h = ggml_add(ctx->ggml_ctx, h, indicator_embedding); auto t_cond = t_embedding->forward(ctx, timestep); auto adaln_input = ggml_silu(ctx->ggml_ctx, adaln_proj->forward(ctx, t_cond)); for (int i = 0; i < config.num_layers; ++i) { auto block = std::dynamic_pointer_cast(blocks["layers." + std::to_string(i)]); h = block->forward(ctx, h, pe, adaln_input, nullptr); sd::ggml_graph_cut::mark_graph_cut(h, "ideogram4.layers." + std::to_string(i), "hidden"); } h = final_layer->forward(ctx, h, adaln_input); if (context_len > 0) { h = ggml_ext_slice(ctx->ggml_ctx, h, 1, context_len, h->ne[1]); } h = unpatchify(ctx->ggml_ctx, h, H, W, config); h = ggml_ext_scale(ctx->ggml_ctx, h, -1.f); return h; } }; class Ideogram4Runner : public DiffusionModelRunner { protected: bool should_use_uncond_model(const DiffusionParams& diffusion_params) const { return has_uncond_model && diffusion_params.context == nullptr && diffusion_params.y != nullptr && !diffusion_params.y->empty(); } public: Ideogram4Config config; Ideogram4Transformer model; Ideogram4Transformer uncond_model; bool has_uncond_model = false; std::string uncond_prefix; std::vector pe_vec; std::vector image_indicator_vec; Ideogram4Runner(ggml_backend_t backend, ggml_backend_t params_backend, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") : DiffusionModelRunner(backend, params_backend, prefix), config(Ideogram4Config::detect_from_weights(tensor_storage_map, prefix)), uncond_prefix(prefix + ".uncond") { model = Ideogram4Transformer(config); model.init(params_ctx, tensor_storage_map, prefix); for (const auto& pair : tensor_storage_map) { const std::string& name = pair.first; if (starts_with(name, uncond_prefix)) { has_uncond_model = true; break; } } if (has_uncond_model) { LOG_DEBUG("using uncond model"); uncond_model = Ideogram4Transformer(config); uncond_model.init(params_ctx, tensor_storage_map, uncond_prefix); } } std::string get_desc() override { return "ideogram4"; } void get_param_tensors(std::map& tensors, const std::string& prefix) override { model.get_param_tensors(tensors, prefix); if (has_uncond_model) { uncond_model.get_param_tensors(tensors, this->uncond_prefix); } } ggml_cgraph* build_graph(const sd::Tensor& x_tensor, const sd::Tensor& timesteps_tensor, const sd::Tensor& context_tensor, bool use_uncond_model = false) { ggml_cgraph* gf = new_graph_custom(IDEOGRAM4_GRAPH_SIZE); ggml_tensor* x = make_input(x_tensor); ggml_tensor* timesteps = make_input(timesteps_tensor); GGML_ASSERT(x->ne[3] == 1); Ideogram4Transformer& active_model = use_uncond_model ? uncond_model : model; ggml_tensor* context = nullptr; int64_t context_len = 0; if (!context_tensor.empty()) { context = make_input(context_tensor); context_len = context->ne[1]; } int64_t grid_w = x->ne[0]; int64_t grid_h = x->ne[1]; int64_t pos_len = context_len + grid_h * grid_w; int64_t head_dim = config.emb_dim / config.num_heads; auto runner_ctx = get_context(); pe_vec = gen_ideogram4_pe(static_cast(grid_h), static_cast(grid_w), static_cast(x->ne[3]), static_cast(context_len), static_cast(head_dim), static_cast(config.rope_theta), config.mrope_section, runner_ctx.circular_x_enabled, runner_ctx.circular_y_enabled); auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, head_dim / 2, pos_len); set_backend_tensor_data(pe, pe_vec.data()); image_indicator_vec.assign(static_cast(pos_len), 1); for (int64_t i = 0; i < context_len; ++i) { image_indicator_vec[static_cast(i)] = 0; } auto indicator = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_I32, pos_len, x->ne[3]); set_backend_tensor_data(indicator, image_indicator_vec.data()); ggml_tensor* out = active_model.forward(&runner_ctx, x, timesteps, context, pe, indicator); ggml_build_forward_expand(gf, out); return gf; } sd::Tensor compute(int n_threads, const sd::Tensor& x, const sd::Tensor& timesteps, const sd::Tensor& context, bool use_uncond_model = false) { auto get_graph = [&]() -> ggml_cgraph* { return build_graph(x, timesteps, context, use_uncond_model); }; return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false, false, false), x.dim()); } sd::Tensor compute(int n_threads, const DiffusionParams& diffusion_params) override { GGML_ASSERT(diffusion_params.x != nullptr); GGML_ASSERT(diffusion_params.timesteps != nullptr); bool use_uncond_model = should_use_uncond_model(diffusion_params); return compute(n_threads, *diffusion_params.x, *diffusion_params.timesteps, tensor_or_empty(diffusion_params.context), use_uncond_model); } }; } // namespace Ideogram4 #endif // __SD_MODEL_DIFFUSION_IDEOGRAM4_HPP__