feat: add ideogram4 support

This commit is contained in:
leejet 2026-06-04 22:37:02 +08:00
parent 1f9ee88e09
commit a8ccc808d4
13 changed files with 680 additions and 20 deletions

View File

@ -41,6 +41,8 @@ Context Options:
--qwen2vl_vision <string> alias of --llm_vision. Deprecated. --qwen2vl_vision <string> alias of --llm_vision. Deprecated.
--diffusion-model <string> path to the standalone diffusion model --diffusion-model <string> path to the standalone diffusion model
--high-noise-diffusion-model <string> path to the standalone high noise diffusion model --high-noise-diffusion-model <string> path to the standalone high noise diffusion model
--uncond-diffusion-model <string> path to the standalone unconditional diffusion model, currently used by
Ideogram4 CFG
--vae <string> path to standalone vae model --vae <string> path to standalone vae model
--taesd <string> path to taesd. Using Tiny AutoEncoder for fast decoding (low quality) --taesd <string> path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
--tae <string> alias of --taesd --tae <string> alias of --taesd

View File

@ -356,6 +356,10 @@ ArgOptions SDContextParams::get_options() {
"--high-noise-diffusion-model", "--high-noise-diffusion-model",
"path to the standalone high noise diffusion model", "path to the standalone high noise diffusion model",
&high_noise_diffusion_model_path}, &high_noise_diffusion_model_path},
{"",
"--uncond-diffusion-model",
"path to the standalone unconditional diffusion model, currently used by Ideogram4 CFG",
&uncond_diffusion_model_path},
{"", {"",
"--embeddings-connectors", "--embeddings-connectors",
"path to LTXAV embeddings connectors", "path to LTXAV embeddings connectors",
@ -706,6 +710,7 @@ std::string SDContextParams::to_string() const {
<< " llm_vision_path: \"" << llm_vision_path << "\",\n" << " llm_vision_path: \"" << llm_vision_path << "\",\n"
<< " diffusion_model_path: \"" << diffusion_model_path << "\",\n" << " diffusion_model_path: \"" << diffusion_model_path << "\",\n"
<< " high_noise_diffusion_model_path: \"" << high_noise_diffusion_model_path << "\",\n" << " high_noise_diffusion_model_path: \"" << high_noise_diffusion_model_path << "\",\n"
<< " uncond_diffusion_model_path: \"" << uncond_diffusion_model_path << "\",\n"
<< " embeddings_connectors_path: \"" << embeddings_connectors_path << "\",\n" << " embeddings_connectors_path: \"" << embeddings_connectors_path << "\",\n"
<< " vae_path: \"" << vae_path << "\",\n" << " vae_path: \"" << vae_path << "\",\n"
<< " vae_format: \"" << vae_format << "\",\n" << " vae_format: \"" << vae_format << "\",\n"
@ -769,6 +774,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
llm_vision_path.c_str(), llm_vision_path.c_str(),
diffusion_model_path.c_str(), diffusion_model_path.c_str(),
high_noise_diffusion_model_path.c_str(), high_noise_diffusion_model_path.c_str(),
uncond_diffusion_model_path.c_str(),
embeddings_connectors_path.c_str(), embeddings_connectors_path.c_str(),
vae_path.c_str(), vae_path.c_str(),
audio_vae_path.c_str(), audio_vae_path.c_str(),
@ -2519,6 +2525,7 @@ std::string build_sdcpp_image_metadata_json(const SDContextParams& ctx_params,
set_json_basename_if_not_empty(models, "llm_vision", ctx_params.llm_vision_path); set_json_basename_if_not_empty(models, "llm_vision", ctx_params.llm_vision_path);
set_json_basename_if_not_empty(models, "diffusion_model", ctx_params.diffusion_model_path); set_json_basename_if_not_empty(models, "diffusion_model", ctx_params.diffusion_model_path);
set_json_basename_if_not_empty(models, "high_noise_diffusion_model", ctx_params.high_noise_diffusion_model_path); set_json_basename_if_not_empty(models, "high_noise_diffusion_model", ctx_params.high_noise_diffusion_model_path);
set_json_basename_if_not_empty(models, "uncond_diffusion_model", ctx_params.uncond_diffusion_model_path);
set_json_basename_if_not_empty(models, "vae", ctx_params.vae_path); set_json_basename_if_not_empty(models, "vae", ctx_params.vae_path);
set_json_basename_if_not_empty(models, "taesd", ctx_params.taesd_path); set_json_basename_if_not_empty(models, "taesd", ctx_params.taesd_path);
set_json_basename_if_not_empty(models, "control_net", ctx_params.control_net_path); set_json_basename_if_not_empty(models, "control_net", ctx_params.control_net_path);
@ -2686,6 +2693,9 @@ std::string get_image_params(const SDContextParams& ctx_params,
if (!ctx_params.diffusion_model_path.empty()) { if (!ctx_params.diffusion_model_path.empty()) {
parameter_string += "Unet: " + sd_basename(ctx_params.diffusion_model_path) + ", "; parameter_string += "Unet: " + sd_basename(ctx_params.diffusion_model_path) + ", ";
} }
if (!ctx_params.uncond_diffusion_model_path.empty()) {
parameter_string += "Uncond Unet: " + sd_basename(ctx_params.uncond_diffusion_model_path) + ", ";
}
if (!ctx_params.vae_path.empty()) { if (!ctx_params.vae_path.empty()) {
parameter_string += "VAE: " + sd_basename(ctx_params.vae_path) + ", "; parameter_string += "VAE: " + sd_basename(ctx_params.vae_path) + ", ";
} }

View File

@ -92,6 +92,7 @@ struct SDContextParams {
std::string llm_vision_path; std::string llm_vision_path;
std::string diffusion_model_path; std::string diffusion_model_path;
std::string high_noise_diffusion_model_path; std::string high_noise_diffusion_model_path;
std::string uncond_diffusion_model_path;
std::string embeddings_connectors_path; std::string embeddings_connectors_path;
std::string vae_path; std::string vae_path;
std::string vae_format = "auto"; std::string vae_format = "auto";

View File

@ -143,6 +143,8 @@ Context Options:
--qwen2vl_vision <string> alias of --llm_vision. Deprecated. --qwen2vl_vision <string> alias of --llm_vision. Deprecated.
--diffusion-model <string> path to the standalone diffusion model --diffusion-model <string> path to the standalone diffusion model
--high-noise-diffusion-model <string> path to the standalone high noise diffusion model --high-noise-diffusion-model <string> path to the standalone high noise diffusion model
--uncond-diffusion-model <string> path to the standalone unconditional diffusion model, currently used by
Ideogram4 CFG
--vae <string> path to standalone vae model --vae <string> path to standalone vae model
--taesd <string> path to taesd. Using Tiny AutoEncoder for fast decoding (low quality) --taesd <string> path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
--tae <string> alias of --taesd --tae <string> alias of --taesd

View File

@ -186,6 +186,7 @@ typedef struct {
const char* llm_vision_path; const char* llm_vision_path;
const char* diffusion_model_path; const char* diffusion_model_path;
const char* high_noise_diffusion_model_path; const char* high_noise_diffusion_model_path;
const char* uncond_diffusion_model_path;
const char* embeddings_connectors_path; const char* embeddings_connectors_path;
const char* vae_path; const char* vae_path;
const char* audio_vae_path; const char* audio_vae_path;

View File

@ -1759,6 +1759,8 @@ struct LLMEmbedder : public Conditioner {
arch = LLM::LLMArch::GPT_OSS_20B; arch = LLM::LLMArch::GPT_OSS_20B;
} else if (sd_version_is_pid(version)) { } else if (sd_version_is_pid(version)) {
arch = LLM::LLMArch::GEMMA2_2B; arch = LLM::LLMArch::GEMMA2_2B;
} else if (sd_version_is_ideogram4(version)) {
arch = LLM::LLMArch::QWEN3_VL;
} else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) { } else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) {
arch = LLM::LLMArch::QWEN3; arch = LLM::LLMArch::QWEN3;
} }
@ -2101,6 +2103,14 @@ struct LLMEmbedder : public Conditioner {
prompt_attn_range.second = static_cast<int>(prompt.size()); prompt_attn_range.second = static_cast<int>(prompt.size());
prompt += "[/INST]"; prompt += "[/INST]";
} else if (sd_version_is_ideogram4(version)) {
prompt_template_encode_start_idx = 0;
out_layers = {1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 36};
prompt = "<|im_start|>user\n";
prompt += conditioner_params.text;
prompt += "<|im_end|>\n<|im_start|>assistant\n";
prompt_attn_range = {0, 0};
} else if (sd_version_is_ernie_image(version)) { } else if (sd_version_is_ernie_image(version)) {
prompt_template_encode_start_idx = 0; prompt_template_encode_start_idx = 0;
out_layers = {25}; // -2 out_layers = {25}; // -2

View File

@ -3318,11 +3318,14 @@ protected:
bool bias; bool bias;
bool force_f32; bool force_f32;
bool force_prec_f32; bool force_prec_f32;
bool allow_weight_scale;
bool has_weight_scale = false;
float scale; float scale;
std::string prefix; std::string prefix;
void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
this->prefix = prefix; this->prefix = prefix;
has_weight_scale = false;
enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32); enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32);
if (in_features % ggml_blck_size(wtype) != 0 || force_f32) { if (in_features % ggml_blck_size(wtype) != 0 || force_f32) {
wtype = GGML_TYPE_F32; wtype = GGML_TYPE_F32;
@ -3332,20 +3335,26 @@ protected:
enum ggml_type wtype = GGML_TYPE_F32; enum ggml_type wtype = GGML_TYPE_F32;
params["bias"] = ggml_new_tensor_1d(ctx, wtype, out_features); params["bias"] = ggml_new_tensor_1d(ctx, wtype, out_features);
} }
if (allow_weight_scale && tensor_storage_map.find(prefix + "weight_scale") != tensor_storage_map.end()) {
params["weight_scale"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_features);
has_weight_scale = true;
}
} }
public: public:
Linear(int64_t in_features, Linear(int64_t in_features,
int64_t out_features, int64_t out_features,
bool bias = true, bool bias = true,
bool force_f32 = false, bool force_f32 = false,
bool force_prec_f32 = false, bool force_prec_f32 = false,
float scale = 1.f) float scale = 1.f,
bool allow_weight_scale = false)
: in_features(in_features), : in_features(in_features),
out_features(out_features), out_features(out_features),
bias(bias), bias(bias),
force_f32(force_f32), force_f32(force_f32),
force_prec_f32(force_prec_f32), force_prec_f32(force_prec_f32),
allow_weight_scale(allow_weight_scale),
scale(scale) {} scale(scale) {}
void set_scale(float scale_) { void set_scale(float scale_) {
@ -3362,14 +3371,24 @@ public:
if (bias) { if (bias) {
b = params["bias"]; b = params["bias"];
} }
ggml_tensor* linear_bias = has_weight_scale ? nullptr : b;
ggml_tensor* out = nullptr;
if (ctx->weight_adapter) { if (ctx->weight_adapter) {
WeightAdapter::ForwardParams forward_params; WeightAdapter::ForwardParams forward_params;
forward_params.op_type = WeightAdapter::ForwardParams::op_type_t::OP_LINEAR; forward_params.op_type = WeightAdapter::ForwardParams::op_type_t::OP_LINEAR;
forward_params.linear.force_prec_f32 = force_prec_f32; forward_params.linear.force_prec_f32 = force_prec_f32;
forward_params.linear.scale = scale; forward_params.linear.scale = scale;
return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, ctx->backend, x, w, b, prefix, forward_params); out = ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, ctx->backend, x, w, linear_bias, prefix, forward_params);
} else {
// Honour the layer's configured `scale` (constructor argument / set_scale()),
// exactly as the weight-adapter path does via forward_params.linear.scale.
// A hard-coded literal here would silently ignore the configured value.
out = ggml_ext_linear(ctx->ggml_ctx, x, w, linear_bias, force_prec_f32, scale);
} }
return ggml_ext_linear(ctx->ggml_ctx, x, w, b, force_prec_f32, scale); if (has_weight_scale) {
out = ggml_mul(ctx->ggml_ctx, out, params["weight_scale"]);
if (b != nullptr) {
out = ggml_add_inplace(ctx->ggml_ctx, out, b);
}
}
return out;
} }
}; };

527
src/ideogram4.hpp Normal file
View File

@ -0,0 +1,527 @@
#ifndef __IDEOGRAM4_HPP__
#define __IDEOGRAM4_HPP__
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <memory>
#include <string>
#include <vector>
#include "diffusion_model.hpp"
#include "ggml_extend.hpp"
#include "ggml_graph_cut.h"
#include "rope.hpp"
namespace Ideogram4 {
// Size passed to new_graph_custom() when Ideogram4Runner::build_graph creates the compute graph.
constexpr int IDEOGRAM4_GRAPH_SIZE = 65536;
// NOTE(review): not referenced in the visible part of this file; build_graph tags
// image tokens with the literal 1 — confirm which indicator value is intended.
constexpr int OUTPUT_IMAGE_INDICATOR = 2;
// Base value added to the position ids of image tokens in gen_ideogram4_pe().
constexpr int IMAGE_POSITION_OFFSET = 65536;
// Default entries of Ideogram4Config::mrope_section, listed in the order {T, H, W}.
constexpr int DEFAULT_MROPE_SECTION_T = 24;
constexpr int DEFAULT_MROPE_SECTION_H = 20;
constexpr int DEFAULT_MROPE_SECTION_W = 20;
// Fourth argument handed to ggml_ext_timestep_embedding() (presumably its max period — confirm).
constexpr int TIMESTEP_MAX_PERIOD = 10000;
// Number of LLM hidden-state layers that interleave_hidden_state_layers() expects
// to find packed into the leading dimension of the context tensor.
constexpr int LLM_HIDDEN_STATE_LAYERS = 13;
// Hyper-parameters of the Ideogram4 transformer. The values below are the
// defaults; Ideogram4Runner overrides num_layers when it can detect the layer
// count from the loaded tensor names.
struct Ideogram4Config {
    int64_t emb_dim{4608};            // width of the token stream
    int64_t num_layers{34};           // number of transformer blocks
    int64_t num_heads{18};            // attention heads per block
    int64_t intermediate_size{12288}; // hidden width of the gated MLP
    int64_t adanln_dim{512};          // width of the adaLN conditioning vector
    int64_t in_channels{128};         // channels per image token (input and output)
    int64_t llm_features_dim{53248};  // feature width of the LLM conditioning
    int64_t rope_theta{5000000};      // theta forwarded to the rotary embedding
    float norm_eps{1e-5f};            // epsilon for the per-block RMSNorm layers
    int patch_size{2};
    int ae_channels{32};
    std::vector<int> mrope_section{DEFAULT_MROPE_SECTION_T,
                                   DEFAULT_MROPE_SECTION_H,
                                   DEFAULT_MROPE_SECTION_W};
};
// Builds a timestep embedding of width `dim` and returns it with its two halves
// (split along dim 0) swapped, i.e. second half first.
// NOTE(review): presumably this turns the helper's cos|sin layout into sin|cos —
// confirm against ggml_ext_timestep_embedding.
__STATIC_INLINE__ ggml_tensor* timestep_embedding_sin_cos(ggml_context* ctx,
                                                          ggml_tensor* timesteps,
                                                          int dim) {
    GGML_ASSERT(dim % 2 == 0);  // the embedding must split into two equal halves
    auto raw_embedding = ggml_ext_timestep_embedding(ctx, timesteps, dim, TIMESTEP_MAX_PERIOD, 10.f);
    auto halves        = ggml_ext_chunk(ctx, raw_embedding, 2, 0);
    auto first_half    = halves[0];
    auto second_half   = halves[1];
    return ggml_concat(ctx, second_half, first_half, 0);
}
// Gives a modulation tensor a singleton token axis so it can broadcast over
// tokens: [N, C] -> [N, 1, C] in PyTorch layout. Tensors that already have at
// least three dims and ne[1] == 1 are returned untouched.
__STATIC_INLINE__ ggml_tensor* to_token_modulation(ggml_context* ctx, ggml_tensor* x) {
    const bool already_token_shaped = ggml_n_dims(x) >= 3 && x->ne[1] == 1;
    if (already_token_shaped) {
        return x;
    }
    return ggml_reshape_3d(ctx, x, x->ne[0], 1, x->ne[1]);
}
// Reorders the packed LLM hidden states so the layer index becomes the
// fastest-varying part of the feature dimension.
// Match upstream stack(...).permute(1, 2, 3, 0).reshape(...):
// [layers * hidden, tokens, batch] -> [hidden * layers, tokens, batch].
__STATIC_INLINE__ ggml_tensor* interleave_hidden_state_layers(ggml_context* ctx, ggml_tensor* x) {
    const int64_t packed_dim = x->ne[0];
    GGML_ASSERT(packed_dim % LLM_HIDDEN_STATE_LAYERS == 0);

    const int64_t hidden   = packed_dim / LLM_HIDDEN_STATE_LAYERS;
    const int64_t n_tokens = x->ne[1];
    const int64_t n_batch  = x->ne[2];

    // Split the packed dim into (hidden, layers), then swap those two axes.
    auto split   = ggml_reshape_4d(ctx, x, hidden, LLM_HIDDEN_STATE_LAYERS, n_tokens, n_batch);
    auto swapped = ggml_cont(ctx, ggml_permute(ctx, split, 1, 0, 2, 3));
    return ggml_reshape_3d(ctx, swapped, hidden * LLM_HIDDEN_STATE_LAYERS, n_tokens, n_batch);
}
// Applies adaLN-style scaling: returns x + x * scale, with `scale` first given
// a singleton token axis via to_token_modulation().
__STATIC_INLINE__ ggml_tensor* modulate(ggml_context* ctx, ggml_tensor* x, ggml_tensor* scale) {
    ggml_tensor* token_scale = to_token_modulation(ctx, scale);
    ggml_tensor* scaled_x    = ggml_mul(ctx, x, token_scale);
    return ggml_add(ctx, x, scaled_x);
}
// Turns a latent image into a token sequence.
// x: [N, 128, H, W] with channel order [ae, ph, pw].
// return: [N, H*W, 128] with token channel order [ph, pw, ae].
// Only batch size 1 is supported (asserted).
__STATIC_INLINE__ ggml_tensor* patchify(ggml_context* ctx, ggml_tensor* x, const Ideogram4Config& config) {
    const int64_t width    = x->ne[0];
    const int64_t height   = x->ne[1];
    const int64_t channels = x->ne[2];
    const int64_t batch    = x->ne[3];
    const int64_t n_tokens = width * height;

    GGML_ASSERT(batch == 1);
    GGML_ASSERT(channels == config.ae_channels * config.patch_size * config.patch_size);

    auto contiguous = ggml_cont(ctx, x);
    auto split      = ggml_reshape_4d(ctx, contiguous, n_tokens, config.patch_size, config.patch_size, config.ae_channels);
    // Swap the spatial axis with the ae axis so the channels become the fastest dimension.
    auto swapped = ggml_cont(ctx, ggml_permute(ctx, split, 3, 1, 2, 0));
    return ggml_reshape_3d(ctx, swapped, channels, n_tokens, batch);
}
// Inverse of patchify(): converts a token sequence with ne = [C, H*W, N] back
// into an image tensor with ne = [W, H, C, N]. Only batch size 1 is supported
// (asserted), and the token count must equal H * W.
__STATIC_INLINE__ ggml_tensor* unpatchify(ggml_context* ctx,
                                          ggml_tensor* x,
                                          int64_t H,
                                          int64_t W,
                                          const Ideogram4Config& config) {
    const int64_t channels = x->ne[0];
    const int64_t batch    = x->ne[2];
    const int64_t n_tokens = H * W;

    GGML_ASSERT(batch == 1);
    GGML_ASSERT(channels == config.ae_channels * config.patch_size * config.patch_size);
    GGML_ASSERT(x->ne[1] == n_tokens);

    auto split = ggml_reshape_4d(ctx, x, config.ae_channels, config.patch_size, config.patch_size, n_tokens);
    // Swap the ae axis with the spatial axis so the spatial index becomes the fastest dimension.
    auto swapped = ggml_cont(ctx, ggml_permute(ctx, split, 3, 1, 2, 0));
    return ggml_reshape_4d(ctx, swapped, W, H, channels, batch);
}
// Factory for the Linear layers used throughout this model: no forced f32
// storage or precision, unit scale, and the optional "weight_scale" tensor enabled.
__STATIC_INLINE__ std::shared_ptr<Linear> make_linear(int64_t in_features,
                                                      int64_t out_features,
                                                      bool bias = true) {
    const bool force_f32          = false;
    const bool force_prec_f32     = false;
    const float scale             = 1.f;
    const bool allow_weight_scale = true;
    return std::make_shared<Linear>(in_features,
                                    out_features,
                                    bias,
                                    force_f32,
                                    force_prec_f32,
                                    scale,
                                    allow_weight_scale);
}
// Generates the rotary position-embedding table for a sequence made of
// `context_len` context tokens followed by grid_h * grid_w image tokens.
// Context token i gets position ids (i, i, i); the image token at (y, x) gets
// (IMAGE_POSITION_OFFSET, IMAGE_POSITION_OFFSET + y, IMAGE_POSITION_OFFSET + x).
// The ids are then handed to Rope::embed_interleaved_mrope. Only bs == 1 is
// supported (asserted).
__STATIC_INLINE__ std::vector<float> gen_ideogram4_pe(int grid_h,
                                                      int grid_w,
                                                      int bs,
                                                      int context_len,
                                                      int head_dim,
                                                      int rope_theta,
                                                      const std::vector<int>& mrope_section) {
    GGML_ASSERT(bs == 1);

    const size_t total_positions = static_cast<size_t>(bs) * (context_len + grid_h * grid_w);
    std::vector<std::vector<float>> ids(total_positions, std::vector<float>(3, 0.f));

    // Context tokens: all three axes share the running token index.
    for (int token = 0; token < context_len; ++token) {
        ids[token].assign(3, static_cast<float>(token));
    }

    // Image tokens, row-major, placed right after the context tokens.
    const float base = static_cast<float>(IMAGE_POSITION_OFFSET);
    for (int row = 0; row < grid_h; ++row) {
        for (int col = 0; col < grid_w; ++col) {
            const int slot = context_len + row * grid_w + col;
            ids[slot]      = {base,
                              static_cast<float>(IMAGE_POSITION_OFFSET + row),
                              static_cast<float>(IMAGE_POSITION_OFFSET + col)};
        }
    }

    return Rope::embed_interleaved_mrope(ids, bs, static_cast<float>(rope_theta), head_dim, mrope_section);
}
// Self-attention block: fused QKV projection, per-head RMSNorm on Q and K,
// rotary attention via Rope::attention, then an output projection.
class Ideogram4Attention : public GGMLBlock {
protected:
    int64_t hidden_size;
    int64_t num_heads;
    int64_t head_dim;

public:
    // hidden_size must be a positive multiple of num_heads; eps is the RMSNorm epsilon.
    Ideogram4Attention(int64_t hidden_size, int64_t num_heads, float eps)
        : hidden_size(hidden_size),
          num_heads(num_heads),
          // Guard the division: the member initializer runs before the asserts
          // below, so an unguarded divide by num_heads == 0 would be undefined
          // behaviour instead of a clean assertion failure.
          head_dim(num_heads > 0 ? hidden_size / num_heads : 0) {
        GGML_ASSERT(num_heads > 0);
        GGML_ASSERT(hidden_size % num_heads == 0);
        blocks["qkv"]    = make_linear(hidden_size, hidden_size * 3, false);
        blocks["norm_q"] = std::make_shared<RMSNorm>(head_dim, eps);
        blocks["norm_k"] = std::make_shared<RMSNorm>(head_dim, eps);
        blocks["o"]      = make_linear(hidden_size, hidden_size, false);
    }

    // x:    token stream; ne[1] is read as the token count and ne[2] as the batch size.
    // pe:   rotary position table forwarded to Rope::attention.
    // mask: optional attention mask (may be nullptr).
    // Returns the attention output after the "o" projection.
    ggml_tensor* forward(GGMLRunnerContext* ctx,
                         ggml_tensor* x,
                         ggml_tensor* pe,
                         ggml_tensor* mask = nullptr) {
        int64_t n_token = x->ne[1];
        int64_t N       = x->ne[2];

        auto qkv_proj = std::dynamic_pointer_cast<Linear>(blocks["qkv"]);
        auto norm_q   = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_q"]);
        auto norm_k   = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_k"]);
        auto out_proj = std::dynamic_pointer_cast<Linear>(blocks["o"]);

        // One fused projection, then split into the Q, K and V streams.
        auto qkv     = qkv_proj->forward(ctx, x);
        auto qkv_vec = split_qkv(ctx->ggml_ctx, qkv);

        // Expose the head axis: [head_dim, num_heads, n_token, N] in ggml order.
        auto q = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[0], head_dim, num_heads, n_token, N);
        auto k = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[1], head_dim, num_heads, n_token, N);
        auto v = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[2], head_dim, num_heads, n_token, N);

        // QK-norm is applied per head, before the rotary embedding.
        q = norm_q->forward(ctx, q);
        k = norm_k->forward(ctx, k);

        x = Rope::attention(ctx, q, k, v, pe, mask, 1.f / std::sqrt(static_cast<float>(head_dim)), false);
        x = out_proj->forward(ctx, x);
        return x;
    }
};
// Gated feed-forward block: w2( silu(w1(x)) * w3(x) ). None of the three
// projections has a bias.
class Ideogram4MLP : public GGMLBlock {
public:
    Ideogram4MLP(int64_t dim, int64_t hidden_dim) {
        blocks["w1"] = make_linear(dim, hidden_dim, false);
        blocks["w2"] = make_linear(hidden_dim, dim, false);
        blocks["w3"] = make_linear(dim, hidden_dim, false);
    }

    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
        auto gate_proj = std::dynamic_pointer_cast<Linear>(blocks["w1"]);
        auto down_proj = std::dynamic_pointer_cast<Linear>(blocks["w2"]);
        auto up_proj   = std::dynamic_pointer_cast<Linear>(blocks["w3"]);

        ggml_tensor* gate  = ggml_silu(ctx->ggml_ctx, gate_proj->forward(ctx, x));
        ggml_tensor* up    = up_proj->forward(ctx, x);
        ggml_tensor* gated = ggml_mul(ctx->ggml_ctx, gate, up);
        return down_proj->forward(ctx, gated);
    }
};
class Ideogram4TransformerBlock : public GGMLBlock {
public:
Ideogram4TransformerBlock(const Ideogram4Config& config) {
blocks["attention"] = std::make_shared<Ideogram4Attention>(config.emb_dim, config.num_heads, config.norm_eps);
blocks["feed_forward"] = std::make_shared<Ideogram4MLP>(config.emb_dim, config.intermediate_size);
blocks["attention_norm1"] = std::make_shared<RMSNorm>(config.emb_dim, config.norm_eps);
blocks["ffn_norm1"] = std::make_shared<RMSNorm>(config.emb_dim, config.norm_eps);
blocks["attention_norm2"] = std::make_shared<RMSNorm>(config.emb_dim, config.norm_eps);
blocks["ffn_norm2"] = std::make_shared<RMSNorm>(config.emb_dim, config.norm_eps);
blocks["adaln_modulation"] = make_linear(config.adanln_dim, 4 * config.emb_dim, true);
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* x,
ggml_tensor* pe,
ggml_tensor* adaln_input,
ggml_tensor* mask = nullptr) {
auto attention = std::dynamic_pointer_cast<Ideogram4Attention>(blocks["attention"]);
auto feed_forward = std::dynamic_pointer_cast<Ideogram4MLP>(blocks["feed_forward"]);
auto attention_norm1 = std::dynamic_pointer_cast<RMSNorm>(blocks["attention_norm1"]);
auto ffn_norm1 = std::dynamic_pointer_cast<RMSNorm>(blocks["ffn_norm1"]);
auto attention_norm2 = std::dynamic_pointer_cast<RMSNorm>(blocks["attention_norm2"]);
auto ffn_norm2 = std::dynamic_pointer_cast<RMSNorm>(blocks["ffn_norm2"]);
auto adaln_modulation = std::dynamic_pointer_cast<Linear>(blocks["adaln_modulation"]);
auto mod = adaln_modulation->forward(ctx, adaln_input);
auto mods = ggml_ext_chunk(ctx->ggml_ctx, mod, 4, 0);
auto scale_msa = mods[0];
auto gate_msa = to_token_modulation(ctx->ggml_ctx, ggml_tanh(ctx->ggml_ctx, mods[1]));
auto scale_mlp = mods[2];
auto gate_mlp = to_token_modulation(ctx->ggml_ctx, ggml_tanh(ctx->ggml_ctx, mods[3]));
auto attn_out = attention_norm1->forward(ctx, x);
attn_out = modulate(ctx->ggml_ctx, attn_out, scale_msa);
attn_out = attention->forward(ctx, attn_out, pe, mask);
attn_out = attention_norm2->forward(ctx, attn_out);
x = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, attn_out, gate_msa));
auto ffn_out = ffn_norm1->forward(ctx, x);
ffn_out = modulate(ctx->ggml_ctx, ffn_out, scale_mlp);
ffn_out = feed_forward->forward(ctx, ffn_out);
ffn_out = ffn_norm2->forward(ctx, ffn_out);
x = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, ffn_out, gate_mlp));
return x;
}
};
// Embeds a scalar (used for the timestep): sinusoidal embedding of width `dim`
// followed by a two-layer MLP with a SiLU in between.
class Ideogram4EmbedScalar : public GGMLBlock {
protected:
    int64_t dim;

public:
    Ideogram4EmbedScalar(int64_t dim)
        : dim(dim) {
        blocks["mlp_in"]  = make_linear(dim, dim, true);
        blocks["mlp_out"] = make_linear(dim, dim, true);
    }

    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
        auto first_layer  = std::dynamic_pointer_cast<Linear>(blocks["mlp_in"]);
        auto second_layer = std::dynamic_pointer_cast<Linear>(blocks["mlp_out"]);

        ggml_tensor* embedded  = timestep_embedding_sin_cos(ctx->ggml_ctx, x, static_cast<int>(dim));
        ggml_tensor* activated = ggml_silu(ctx->ggml_ctx, first_layer->forward(ctx, embedded));
        return second_layer->forward(ctx, activated);
    }
};
// Output head: LayerNorm, scale modulation derived from silu(c), then a linear
// projection from emb_dim to in_channels.
class Ideogram4FinalLayer : public GGMLBlock {
public:
    Ideogram4FinalLayer(const Ideogram4Config& config) {
        blocks["norm_final"]       = std::make_shared<LayerNorm>(config.emb_dim, 1e-6f, false);
        blocks["linear"]           = make_linear(config.emb_dim, config.in_channels, true);
        blocks["adaln_modulation"] = make_linear(config.adanln_dim, config.emb_dim, true);
    }

    // x: token stream; c: conditioning vector.
    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x, ggml_tensor* c) {
        auto final_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["norm_final"]);
        auto out_proj   = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
        auto mod_proj   = std::dynamic_pointer_cast<Linear>(blocks["adaln_modulation"]);

        ggml_tensor* activated_c = ggml_silu(ctx->ggml_ctx, c);
        ggml_tensor* scale       = mod_proj->forward(ctx, activated_c);

        ggml_tensor* normed    = final_norm->forward(ctx, x);
        ggml_tensor* modulated = modulate(ctx->ggml_ctx, normed, scale);
        return out_proj->forward(ctx, modulated);
    }
};
// Full Ideogram4 diffusion transformer: patchify the latent, optionally prepend
// projected LLM conditioning tokens, add a per-token indicator embedding, run
// the transformer stack, and map the image tokens back to a latent image.
class Ideogram4Transformer : public GGMLBlock {
protected:
    Ideogram4Config config;

public:
    Ideogram4Transformer() = default;

    explicit Ideogram4Transformer(Ideogram4Config config)
        : config(std::move(config)) {
        const Ideogram4Config& cfg = this->config;

        blocks["input_proj"]            = make_linear(cfg.in_channels, cfg.emb_dim, true);
        blocks["llm_cond_norm"]         = std::make_shared<RMSNorm>(cfg.llm_features_dim, 1e-6f);
        blocks["llm_cond_proj"]         = make_linear(cfg.llm_features_dim, cfg.emb_dim, true);
        blocks["t_embedding"]           = std::make_shared<Ideogram4EmbedScalar>(cfg.emb_dim);
        blocks["adaln_proj"]            = make_linear(cfg.emb_dim, cfg.adanln_dim, true);
        blocks["embed_image_indicator"] = std::make_shared<Embedding>(2, cfg.emb_dim);
        for (int layer = 0; layer < cfg.num_layers; ++layer) {
            blocks["layers." + std::to_string(layer)] = std::make_shared<Ideogram4TransformerBlock>(cfg);
        }
        blocks["final_layer"] = std::make_shared<Ideogram4FinalLayer>(cfg);
    }

    // x:                   latent image, ne = [W, H, C, N]; N must be 1
    // timestep:            diffusion timestep(s)
    // context:             LLM hidden states, or nullptr to run without text tokens
    // pe:                  rotary position table covering context + image tokens
    // image_indicator_ids: per-token ids looked up in "embed_image_indicator"
    // Returns the prediction in the same [W, H, C, N] layout, multiplied by -1.
    ggml_tensor* forward(GGMLRunnerContext* ctx,
                         ggml_tensor* x,
                         ggml_tensor* timestep,
                         ggml_tensor* context,
                         ggml_tensor* pe,
                         ggml_tensor* image_indicator_ids) {
        ggml_context* gctx = ctx->ggml_ctx;

        const int64_t width  = x->ne[0];
        const int64_t height = x->ne[1];
        const int64_t batch  = x->ne[3];
        GGML_ASSERT(batch == 1);

        auto input_proj    = std::dynamic_pointer_cast<Linear>(blocks["input_proj"]);
        auto cond_norm     = std::dynamic_pointer_cast<RMSNorm>(blocks["llm_cond_norm"]);
        auto cond_proj     = std::dynamic_pointer_cast<Linear>(blocks["llm_cond_proj"]);
        auto time_embedder = std::dynamic_pointer_cast<Ideogram4EmbedScalar>(blocks["t_embedding"]);
        auto adaln_proj    = std::dynamic_pointer_cast<Linear>(blocks["adaln_proj"]);
        auto indicator_emb = std::dynamic_pointer_cast<Embedding>(blocks["embed_image_indicator"]);
        auto head          = std::dynamic_pointer_cast<Ideogram4FinalLayer>(blocks["final_layer"]);

        // Image tokens.
        auto image_tokens = patchify(gctx, x, config);
        image_tokens      = input_proj->forward(ctx, image_tokens);

        // Sequence layout is [context tokens | image tokens] along dim 1.
        ggml_tensor* tokens = image_tokens;
        int64_t n_context   = 0;
        if (context != nullptr) {
            if (ggml_n_dims(context) < 3) {
                context = ggml_reshape_3d(gctx, context, context->ne[0], context->ne[1], 1);
            }
            context   = interleave_hidden_state_layers(gctx, context);
            n_context = context->ne[1];

            auto text_tokens = cond_norm->forward(ctx, context);
            text_tokens      = cond_proj->forward(ctx, text_tokens);
            tokens           = ggml_concat(gctx, text_tokens, image_tokens, 1);
        }

        auto indicator = indicator_emb->forward(ctx, image_indicator_ids);
        tokens         = ggml_add(gctx, tokens, indicator);

        // Timestep conditioning shared by every block and by the final layer.
        auto time_cond   = time_embedder->forward(ctx, timestep);
        auto adaln_input = ggml_silu(gctx, adaln_proj->forward(ctx, time_cond));

        for (int layer = 0; layer < config.num_layers; ++layer) {
            const std::string layer_id = std::to_string(layer);
            auto block                 = std::dynamic_pointer_cast<Ideogram4TransformerBlock>(blocks["layers." + layer_id]);

            tokens = block->forward(ctx, tokens, pe, adaln_input, nullptr);
            sd::ggml_graph_cut::mark_graph_cut(tokens, "ideogram4.layers." + layer_id, "hidden");
        }

        tokens = head->forward(ctx, tokens, adaln_input);
        if (n_context > 0) {
            // Drop the context tokens; only the image tokens are decoded.
            tokens = ggml_ext_slice(gctx, tokens, 1, n_context, tokens->ne[1]);
        }

        auto image = unpatchify(gctx, tokens, height, width, config);
        return ggml_ext_scale(gctx, image, -1.f);
    }
};
class Ideogram4Runner : public DiffusionModelRunner {
protected:
static int64_t detect_num_layers(const String2TensorStorage& tensor_storage_map,
const std::string& prefix) {
int64_t detected_layers = 0;
std::string layer_prefix = prefix.empty() ? "layers." : prefix + ".layers.";
for (const auto& pair : tensor_storage_map) {
const std::string& name = pair.first;
if (name.find(layer_prefix) != 0) {
continue;
}
std::string tail = name.substr(layer_prefix.size());
size_t dot = tail.find('.');
if (dot == std::string::npos) {
continue;
}
int layer_idx = std::atoi(tail.substr(0, dot).c_str());
detected_layers = std::max<int64_t>(detected_layers, layer_idx + 1);
}
return detected_layers;
}
bool should_use_uncond_model(const DiffusionParams& diffusion_params) const {
return has_uncond_model &&
diffusion_params.context == nullptr &&
diffusion_params.y != nullptr &&
!diffusion_params.y->empty();
}
public:
Ideogram4Config config;
Ideogram4Transformer model;
Ideogram4Transformer uncond_model;
bool has_uncond_model = false;
std::string uncond_prefix;
std::vector<float> pe_vec;
std::vector<int32_t> image_indicator_vec;
Ideogram4Runner(ggml_backend_t backend,
ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "")
: DiffusionModelRunner(backend, params_backend, prefix),
uncond_prefix(prefix + ".uncond") {
int64_t detected_layers = detect_num_layers(tensor_storage_map, prefix);
if (detected_layers > 0) {
config.num_layers = detected_layers;
}
model = Ideogram4Transformer(config);
model.init(params_ctx, tensor_storage_map, prefix);
for (const auto& pair : tensor_storage_map) {
const std::string& name = pair.first;
if (starts_with(name, uncond_prefix)) {
has_uncond_model = true;
break;
}
}
if (has_uncond_model) {
LOG_DEBUG("using uncond model");
uncond_model = Ideogram4Transformer(config);
uncond_model.init(params_ctx, tensor_storage_map, uncond_prefix);
}
}
std::string get_desc() override {
return "ideogram4";
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
model.get_param_tensors(tensors, prefix);
if (has_uncond_model) {
uncond_model.get_param_tensors(tensors, this->uncond_prefix);
}
}
ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,
const sd::Tensor<float>& timesteps_tensor,
const sd::Tensor<float>& context_tensor,
bool use_uncond_model = false) {
ggml_cgraph* gf = new_graph_custom(IDEOGRAM4_GRAPH_SIZE);
ggml_tensor* x = make_input(x_tensor);
ggml_tensor* timesteps = make_input(timesteps_tensor);
GGML_ASSERT(x->ne[3] == 1);
Ideogram4Transformer& active_model = use_uncond_model ? uncond_model : model;
ggml_tensor* context = nullptr;
int64_t context_len = 0;
if (!context_tensor.empty()) {
context = make_input(context_tensor);
context_len = context->ne[1];
}
int64_t grid_w = x->ne[0];
int64_t grid_h = x->ne[1];
int64_t pos_len = context_len + grid_h * grid_w;
int64_t head_dim = config.emb_dim / config.num_heads;
pe_vec = gen_ideogram4_pe(static_cast<int>(grid_h),
static_cast<int>(grid_w),
static_cast<int>(x->ne[3]),
static_cast<int>(context_len),
static_cast<int>(head_dim),
static_cast<int>(config.rope_theta),
config.mrope_section);
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, head_dim / 2, pos_len);
set_backend_tensor_data(pe, pe_vec.data());
image_indicator_vec.assign(static_cast<size_t>(pos_len), 1);
for (int64_t i = 0; i < context_len; ++i) {
image_indicator_vec[static_cast<size_t>(i)] = 0;
}
auto indicator = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_I32, pos_len, x->ne[3]);
set_backend_tensor_data(indicator, image_indicator_vec.data());
auto runner_ctx = get_context();
ggml_tensor* out = active_model.forward(&runner_ctx, x, timesteps, context, pe, indicator);
ggml_build_forward_expand(gf, out);
return gf;
}
sd::Tensor<float> compute(int n_threads,
const sd::Tensor<float>& x,
const sd::Tensor<float>& timesteps,
const sd::Tensor<float>& context,
bool use_uncond_model = false) {
auto get_graph = [&]() -> ggml_cgraph* {
return build_graph(x, timesteps, context, use_uncond_model);
};
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
}
sd::Tensor<float> compute(int n_threads,
const DiffusionParams& diffusion_params) override {
GGML_ASSERT(diffusion_params.x != nullptr);
GGML_ASSERT(diffusion_params.timesteps != nullptr);
bool use_uncond_model = should_use_uncond_model(diffusion_params);
return compute(n_threads,
*diffusion_params.x,
*diffusion_params.timesteps,
tensor_or_empty(diffusion_params.context),
use_uncond_model);
}
};
} // namespace Ideogram4
#endif // __IDEOGRAM4_HPP__

View File

@ -1460,13 +1460,18 @@ namespace LLM {
params.num_kv_heads = 8; params.num_kv_heads = 8;
params.qkv_bias = false; params.qkv_bias = false;
params.rms_norm_eps = 1e-5f; params.rms_norm_eps = 1e-5f;
} else if (arch == LLMArch::QWEN3) { } else if (arch == LLMArch::QWEN3 || arch == LLMArch::QWEN3_VL) {
params.head_dim = 128; params.head_dim = 128;
params.num_heads = 32; params.num_heads = 32;
params.num_kv_heads = 8; params.num_kv_heads = 8;
params.qkv_bias = false; params.qkv_bias = false;
params.qk_norm = true; params.qk_norm = true;
params.rms_norm_eps = 1e-6f; params.rms_norm_eps = 1e-6f;
if (arch == LLMArch::QWEN3_VL) {
params.max_position_embeddings = 262144;
params.rope_thetas = {5000000.f};
params.vision.arch = LLMVisionArch::QWEN3_VL;
}
} else if (arch == LLMArch::GEMMA3_12B) { } else if (arch == LLMArch::GEMMA3_12B) {
params.head_dim = 256; params.head_dim = 256;
params.num_heads = 16; params.num_heads = 16;

View File

@ -435,6 +435,9 @@ SDVersion ModelLoader::get_sd_version() {
if (tensor_storage.name.find("model.diffusion_model.net.lq_proj.latent_proj.0.weight") != std::string::npos) { if (tensor_storage.name.find("model.diffusion_model.net.lq_proj.latent_proj.0.weight") != std::string::npos) {
return VERSION_PID; return VERSION_PID;
} }
if (tensor_storage.name.find("embed_image_indicator.weight") != std::string::npos) {
return VERSION_IDEOGRAM4;
}
if (tensor_storage.name.find("model.diffusion_model.nerf_final_layer_conv.") != std::string::npos) { if (tensor_storage.name.find("model.diffusion_model.nerf_final_layer_conv.") != std::string::npos) {
return VERSION_CHROMA_RADIANCE; return VERSION_CHROMA_RADIANCE;
} }
@ -1254,6 +1257,8 @@ bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage
// Pass, do not convert // Pass, do not convert
} else if (ends_with(name, ".scale")) { } else if (ends_with(name, ".scale")) {
// Pass, do not convert // Pass, do not convert
} else if (ends_with(name, ".weight_scale")) {
// Pass, do not convert
} else if (contains(name, "img_in.") || } else if (contains(name, "img_in.") ||
contains(name, "txt_in.") || contains(name, "txt_in.") ||
contains(name, "time_in.") || contains(name, "time_in.") ||

View File

@ -50,6 +50,7 @@ enum SDVersion {
VERSION_LENS, VERSION_LENS,
VERSION_LONGCAT, VERSION_LONGCAT,
VERSION_PID, VERSION_PID,
VERSION_IDEOGRAM4,
VERSION_COUNT, VERSION_COUNT,
}; };
@ -172,8 +173,15 @@ static inline bool sd_version_is_pid(SDVersion version) {
return false; return false;
} }
// Returns true exactly when `version` is VERSION_IDEOGRAM4.
static inline bool sd_version_is_ideogram4(SDVersion version) {
    return version == VERSION_IDEOGRAM4;
}
static inline bool sd_version_uses_flux2_vae(SDVersion version) { static inline bool sd_version_uses_flux2_vae(SDVersion version) {
if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version) || sd_version_is_lens(version)) { if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version) || sd_version_is_lens(version) || sd_version_is_ideogram4(version)) {
return true; return true;
} }
return false; return false;
@ -203,7 +211,8 @@ static inline bool sd_version_is_dit(SDVersion version) {
sd_version_is_ernie_image(version) || sd_version_is_ernie_image(version) ||
sd_version_is_lens(version) || sd_version_is_lens(version) ||
sd_version_is_longcat(version) || sd_version_is_longcat(version) ||
sd_version_is_pid(version)) { sd_version_is_pid(version) ||
sd_version_is_ideogram4(version)) {
return true; return true;
} }
return false; return false;

View File

@ -249,6 +249,40 @@ namespace Rope {
return embed_nd(ids, bs, axis_thetas, axes_dim, wrap_dims, layout); return embed_nd(ids, bs, axis_thetas, axes_dim, wrap_dims, layout);
} }
// Builds a flattened rotary-embedding table in which three position axes are
// interleaved across the frequency slots.
//
// ids           : one row per position; each row must carry at least 3 axis
//                 coordinates (only the first three are used).
// bs            : batch size, must be > 0. ids.size() / bs positions per batch
//                 entry are processed.
// theta         : base passed through unchanged to rope().
// head_dim      : must be even; head_dim / 2 frequency slots are produced.
// mrope_section : needs at least 3 entries. Entries 1 and 2 bound how many
//                 slots axes 1 and 2 may claim (entry * 3, capped at
//                 head_dim / 2). Entry 0 is not read here.
//
// Axis 0 fills every slot. Axis a (a = 1, 2) then overwrites the slots
// freq_idx = a, a + 3, a + 6, ... that lie below its bound. Each slot is a run
// of 4 consecutive floats in a row of the table returned by rope().
//
// Returns flatten() of the merged per-position table.
__STATIC_INLINE__ std::vector<float> embed_interleaved_mrope(const std::vector<std::vector<float>>& ids,
                                                             int bs,
                                                             float theta,
                                                             int head_dim,
                                                             const std::vector<int>& mrope_section) {
    GGML_ASSERT(bs > 0);
    GGML_ASSERT(head_dim % 2 == 0);
    GGML_ASSERT(mrope_section.size() >= 3);
    // Reject empty input up front instead of handing it to transpose().
    GGML_ASSERT(!ids.empty());

    const std::vector<std::vector<float>> trans_ids = transpose(ids);
    // trans_ids[0..2] are indexed below; fail loudly rather than read out of bounds.
    GGML_ASSERT(trans_ids.size() >= 3);

    const size_t pos_len  = ids.size() / bs;
    const size_t n_pos    = static_cast<size_t>(bs) * pos_len;
    const int    half_dim = head_dim / 2;

    // Start from the axis-0 table; the other axes only patch selected slots,
    // so there is no need to keep all three tables alive or to copy one.
    // NOTE(review): relies on rope() being a pure function of its arguments — confirm.
    std::vector<std::vector<float>> emb = rope(trans_ids[0], head_dim, theta);

    for (int axis = 1; axis < 3; ++axis) {
        const std::vector<std::vector<float>> axis_emb = rope(trans_ids[axis], head_dim, theta);
        const int length = std::min<int>(mrope_section[axis] * 3, half_dim);
        for (int freq_idx = axis; freq_idx < length; freq_idx += 3) {
            const int base = 4 * freq_idx;
            for (size_t pos_idx = 0; pos_idx < n_pos; ++pos_idx) {
                for (int k = 0; k < 4; ++k) {
                    emb[pos_idx][base + k] = axis_emb[pos_idx][base + k];
                }
            }
        }
    }

    return flatten(emb);
}
__STATIC_INLINE__ std::vector<float> embed_2d_interleaved(int height, __STATIC_INLINE__ std::vector<float> embed_2d_interleaved(int height,
int width, int width,
int dim, int dim,

View File

@ -23,6 +23,7 @@
#include "flux.hpp" #include "flux.hpp"
#include "guidance.h" #include "guidance.h"
#include "hidream_o1.hpp" #include "hidream_o1.hpp"
#include "ideogram4.hpp"
#include "lens.hpp" #include "lens.hpp"
#include "lora.hpp" #include "lora.hpp"
#include "ltx_audio_vae.h" #include "ltx_audio_vae.h"
@ -84,6 +85,7 @@ const char* model_version_to_str[] = {
"Lens", "Lens",
"Longcat-Image", "Longcat-Image",
"PiD", "PiD",
"Ideogram 4",
}; };
const char* sampling_methods_str[] = { const char* sampling_methods_str[] = {
@ -315,6 +317,13 @@ public:
} }
} }
if (strlen(SAFE_STR(sd_ctx_params->uncond_diffusion_model_path)) > 0) {
LOG_INFO("loading unconditional diffusion model from '%s'", sd_ctx_params->uncond_diffusion_model_path);
if (!model_loader.init_from_file(sd_ctx_params->uncond_diffusion_model_path, "model.diffusion_model.uncond.")) {
LOG_WARN("loading unconditional diffusion model from '%s' failed", sd_ctx_params->uncond_diffusion_model_path);
}
}
bool is_unet = sd_version_is_unet(model_loader.get_sd_version()); bool is_unet = sd_version_is_unet(model_loader.get_sd_version());
if (strlen(SAFE_STR(sd_ctx_params->clip_l_path)) > 0) { if (strlen(SAFE_STR(sd_ctx_params->clip_l_path)) > 0) {
@ -547,6 +556,17 @@ public:
params_backend_for(SDBackendModule::DIFFUSION), params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map, tensor_storage_map,
"model.diffusion_model.net"); "model.diffusion_model.net");
} else if (sd_version_is_ideogram4(version)) {
cond_stage_model = std::make_shared<LLMEmbedder>(backend_for(SDBackendModule::TE),
params_backend_for(SDBackendModule::TE),
tensor_storage_map,
version,
"",
false);
diffusion_model = std::make_shared<Ideogram4::Ideogram4Runner>(backend_for(SDBackendModule::DIFFUSION),
params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map,
"model.diffusion_model");
} else if (sd_version_is_flux(version)) { } else if (sd_version_is_flux(version)) {
bool is_chroma = false; bool is_chroma = false;
for (auto pair : tensor_storage_map) { for (auto pair : tensor_storage_map) {
@ -1024,6 +1044,12 @@ public:
ignore_tensors.insert("text_encoders.llm.model.layers.0.mlp.experts.gate_up_proj.weight_scale_2"); ignore_tensors.insert("text_encoders.llm.model.layers.0.mlp.experts.gate_up_proj.weight_scale_2");
ignore_tensors.insert("text_encoders.llm.model.layers.0.mlp.experts.down_proj.weight_scale_2"); ignore_tensors.insert("text_encoders.llm.model.layers.0.mlp.experts.down_proj.weight_scale_2");
} }
if (sd_version_is_ideogram4(version)) {
ignore_tensors.insert("text_encoders.llm.lm_head.");
ignore_tensors.insert("text_encoders.llm.visual.");
ignore_tensors.insert("text_encoders.llm.vision_model.");
ignore_tensors.insert("text_encoders.llm.tokenizer_json");
}
if (version == VERSION_HIDREAM_O1) { if (version == VERSION_HIDREAM_O1) {
ignore_tensors.insert("lm_head."); ignore_tensors.insert("lm_head.");
ignore_tensors.insert("model.visual.deepstack_merger_list."); ignore_tensors.insert("model.visual.deepstack_merger_list.");
@ -1199,7 +1225,8 @@ public:
sd_version_is_anima(version) || sd_version_is_anima(version) ||
sd_version_is_ernie_image(version) || sd_version_is_ernie_image(version) ||
sd_version_is_z_image(version) || sd_version_is_z_image(version) ||
sd_version_is_pid(version)) { sd_version_is_pid(version) ||
sd_version_is_ideogram4(version)) {
pred_type = FLOW_PRED; pred_type = FLOW_PRED;
if (sd_version_is_wan(version)) { if (sd_version_is_wan(version)) {
default_flow_shift = 5.f; default_flow_shift = 5.f;
@ -1207,6 +1234,8 @@ public:
default_flow_shift = 4.f; default_flow_shift = 4.f;
} else if (sd_version_is_pid(version)) { } else if (sd_version_is_pid(version)) {
default_flow_shift = 1.5f; default_flow_shift = 1.5f;
} else if (sd_version_is_ideogram4(version)) {
default_flow_shift = 1.0f;
} else { } else {
default_flow_shift = 3.f; default_flow_shift = 3.f;
} }
@ -1869,7 +1898,7 @@ public:
if (version == VERSION_HIDREAM_O1) { if (version == VERSION_HIDREAM_O1) {
return std::vector<float>{1.0f - (t / static_cast<float>(TIMESTEPS))}; return std::vector<float>{1.0f - (t / static_cast<float>(TIMESTEPS))};
} }
if (sd_version_is_z_image(version)) { if (sd_version_is_z_image(version) || sd_version_is_ideogram4(version)) {
return std::vector<float>{1000.f - t}; return std::vector<float>{1000.f - t};
} }
return std::vector<float>{t}; return std::vector<float>{t};
@ -2771,6 +2800,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
"llm_vision_path: %s\n" "llm_vision_path: %s\n"
"diffusion_model_path: %s\n" "diffusion_model_path: %s\n"
"high_noise_diffusion_model_path: %s\n" "high_noise_diffusion_model_path: %s\n"
"uncond_diffusion_model_path: %s\n"
"embeddings_connectors_path: %s\n" "embeddings_connectors_path: %s\n"
"vae_path: %s\n" "vae_path: %s\n"
"audio_vae_path: %s\n" "audio_vae_path: %s\n"
@ -2810,6 +2840,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
SAFE_STR(sd_ctx_params->llm_vision_path), SAFE_STR(sd_ctx_params->llm_vision_path),
SAFE_STR(sd_ctx_params->diffusion_model_path), SAFE_STR(sd_ctx_params->diffusion_model_path),
SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path), SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path),
SAFE_STR(sd_ctx_params->uncond_diffusion_model_path),
SAFE_STR(sd_ctx_params->embeddings_connectors_path), SAFE_STR(sd_ctx_params->embeddings_connectors_path),
SAFE_STR(sd_ctx_params->vae_path), SAFE_STR(sd_ctx_params->vae_path),
SAFE_STR(sd_ctx_params->audio_vae_path), SAFE_STR(sd_ctx_params->audio_vae_path),
@ -4178,16 +4209,20 @@ static std::optional<ImageGenerationEmbeds> prepare_image_generation_embeds(sd_c
SDCondition uncond; SDCondition uncond;
if (request->use_uncond || request->use_high_noise_uncond) { if (request->use_uncond || request->use_high_noise_uncond) {
bool zero_out_masked = false; if (sd_version_is_ideogram4(sd_ctx->sd->version)) {
if (sd_version_is_sdxl(sd_ctx->sd->version) && uncond.c_vector = sd::Tensor<float>::from_vector({1.0f});
request->negative_prompt.empty() && } else {
!sd_ctx->sd->is_using_edm_v_parameterization) { bool zero_out_masked = false;
zero_out_masked = true; if (sd_version_is_sdxl(sd_ctx->sd->version) &&
request->negative_prompt.empty() &&
!sd_ctx->sd->is_using_edm_v_parameterization) {
zero_out_masked = true;
}
condition_params.text = request->negative_prompt;
condition_params.zero_out_masked = zero_out_masked;
uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(sd_ctx->sd->n_threads,
condition_params);
} }
condition_params.text = request->negative_prompt;
condition_params.zero_out_masked = zero_out_masked;
uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(sd_ctx->sd->n_threads,
condition_params);
if (uncond.c_concat.empty()) { if (uncond.c_concat.empty()) {
uncond.c_concat = latents->concat_latent; // TODO: optimize uncond.c_concat = latents->concat_latent; // TODO: optimize
} }