stable-diffusion.cpp/src/hidream_o1.hpp

680 lines
33 KiB
C++

#ifndef __SD_HIDREAM_O1_H__
#define __SD_HIDREAM_O1_H__
#include <algorithm>
#include <array>
#include <cmath>
#include <cstring>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "common_dit.hpp"
#include "conditioner.hpp"
#include "diffusion_model.hpp"
#include "llm.hpp"
#include "util.h"
namespace HiDreamO1 {
// Size argument passed to new_graph_custom() for every graph built in this file.
constexpr int HIDREAM_O1_GRAPH_SIZE = 32768;
// Patch edge length in pixels; image extents are rounded down to multiples of it
// and one sequence token is counted per PATCH_SIZE x PATCH_SIZE tile.
constexpr int PATCH_SIZE = 32;
// Number of trailing prompt tokens counted as part of the generation segment
// (token type 1) when the conditioner builds token types.
constexpr int TIMESTEP_TOKEN_NUM = 1;
// Token id that marks image positions in the padded id sequence
// (presumably the tokenizer id of "<|image_pad|>" -- TODO confirm).
constexpr int IMAGE_TOKEN_ID = 151655;
// Token id pushed in front of each run of IMAGE_TOKEN_ID appended by the conditioner
// (presumably the tokenizer id of "<|vision_start|>" -- TODO confirm).
constexpr int VISION_START_TOKEN_ID = 151652;
// Returns `token` concatenated `count` times.
//
// A non-positive `count` (or an empty `token`) yields an empty string. The guard
// matters because a negative count cast to size_t would otherwise ask reserve()
// for an enormous capacity and throw std::length_error.
static inline std::string repeat_special_token(const std::string& token, int64_t count) {
    std::string out;
    if (count <= 0 || token.empty()) {
        return out;
    }
    out.reserve(static_cast<size_t>(count) * token.size());
    for (int64_t i = 0; i < count; ++i) {
        out += token;
    }
    return out;
}
// Computes a patch-aligned {width, height} whose area is roughly max_size^2 and
// whose aspect ratio is roughly `ratio` (width divided by height).
//
// Both extents are truncated to a multiple of PATCH_SIZE and then raised to at
// least PATCH_SIZE, so neither is ever zero.
static inline std::pair<int, int> calculate_dimensions(int max_size, double ratio) {
    // Form the area in double: the int product `max_size * max_size` overflows
    // (undefined behaviour) once max_size exceeds 46340.
    const double area = static_cast<double>(max_size) * static_cast<double>(max_size);
    int width         = static_cast<int>(std::sqrt(area * ratio));
    int height        = static_cast<int>(width / ratio);
    width             = (width / PATCH_SIZE) * PATCH_SIZE;
    height            = (height / PATCH_SIZE) * PATCH_SIZE;
    width             = std::max(width, PATCH_SIZE);
    height            = std::max(height, PATCH_SIZE);
    return {width, height};
}
// Rescales `image` to a patch-aligned size whose area does not exceed
// image_size * image_size, then crops the overshooting axis around its centre.
//
// shape()[0] is treated as width and shape()[1] as height; the remaining two
// dimensions are passed through to the interpolation unchanged.
static inline sd::Tensor<float> resize_to_area(const sd::Tensor<float>& image, int image_size) {
    const int64_t src_w    = image.shape()[0];
    const int64_t src_h    = image.shape()[1];
    const int64_t area_cap = static_cast<int64_t>(image_size) * image_size;
    const double factor    = std::sqrt(static_cast<double>(area_cap) / static_cast<double>(src_w * src_h));

    // Truncate a pixel length to a whole number of patches.
    const auto snap = [](int64_t length) -> int64_t {
        return (length / PATCH_SIZE) * PATCH_SIZE;
    };
    const int64_t w_round = snap(static_cast<int64_t>(std::llround(src_w * factor)));
    const int64_t w_floor = snap(static_cast<int64_t>(std::floor(src_w * factor)));
    const int64_t h_round = snap(static_cast<int64_t>(std::llround(src_h * factor)));
    const int64_t h_floor = snap(static_cast<int64_t>(std::floor(src_h * factor)));

    // Every rounded/floored combination, ordered from largest to smallest area.
    std::vector<std::pair<int64_t, int64_t>> candidates = {
        {w_round, h_round},
        {w_round, h_floor},
        {w_floor, h_round},
        {w_floor, h_floor},
    };
    std::sort(candidates.begin(), candidates.end(), [](const auto& lhs, const auto& rhs) {
        return lhs.first * lhs.second > rhs.first * rhs.second;
    });

    // Largest non-degenerate candidate within the area cap; otherwise the smallest one.
    const auto fits = std::find_if(candidates.begin(), candidates.end(), [area_cap](const auto& c) {
        return c.first > 0 && c.second > 0 && c.first * c.second <= area_cap;
    });
    const std::pair<int64_t, int64_t> target = (fits != candidates.end()) ? *fits : candidates.back();

    const double shrink_w = static_cast<double>(src_w) / static_cast<double>(target.first);
    const double shrink_h = static_cast<double>(src_h) / static_cast<double>(target.second);

    if (shrink_w < shrink_h) {
        // Width lands exactly on target; height overshoots and is cropped centrally.
        const int64_t scaled_h = static_cast<int64_t>(std::llround(src_h / shrink_w));
        sd::Tensor<float> scaled = sd::ops::interpolate(image,
                                                        {target.first, scaled_h, image.shape()[2], image.shape()[3]},
                                                        sd::ops::InterpolateMode::Bicubic);
        const int64_t top = (scaled_h - target.second) / 2;
        return sd::ops::slice(scaled, 1, top, top + target.second);
    }

    // Height lands exactly on target; width overshoots and is cropped centrally.
    const int64_t scaled_w = static_cast<int64_t>(std::llround(src_w / shrink_h));
    sd::Tensor<float> scaled = sd::ops::interpolate(image,
                                                    {scaled_w, target.second, image.shape()[2], image.shape()[3]},
                                                    sd::ops::InterpolateMode::Bicubic);
    const int64_t left = (scaled_w - target.first) / 2;
    return sd::ops::slice(scaled, 0, left, left + target.first);
}
// Builds the flattened 4-stream rotary position ids for `input_ids`.
//
// The sequence is walked as alternating text spans and image runs. A run starts
// at the first IMAGE_TOKEN_ID after a text span and consumes
// grid[0] * grid[1] * grid[2] positions, where `grid` is the next entry of
// `image_grids`.
//   - Text tokens get the same increasing index on the t, h and w streams.
//   - Image tokens get base + (t, h, w) grid coordinates.
//   - When skip_vision_start_token[k] is non-zero, that many tokens directly
//     before run k are counted as part of the run rather than as text. The first
//     such run is anchored at position 4096; later ones start at the running
//     text position.
//
// Returns a vector of 4 * input_ids.size() ids laid out as [t..., h..., w..., e...],
// with the fourth stream left at zero.
//
// Asserts that `image_grids` and `skip_vision_start_token` hold an entry for
// every image run found, and that the emitted positions exactly cover `input_ids`.
static inline std::vector<int32_t> build_position_ids(const std::vector<int32_t>& input_ids,
                                                      const std::vector<std::array<int32_t, 3>>& image_grids,
                                                      const std::vector<int32_t>& skip_vision_start_token) {
    const size_t seq_len = input_ids.size();
    const int n_tokens   = static_cast<int>(seq_len);

    std::vector<int32_t> out_t;
    std::vector<int32_t> out_h;
    std::vector<int32_t> out_w;
    out_t.reserve(seq_len);
    out_h.reserve(seq_len);
    out_w.reserve(seq_len);

    // One past the largest t position emitted so far (0 while nothing is emitted).
    // Tracking it incrementally replaces a full max_element scan per span.
    int32_t next_pos = 0;
    const auto emit  = [&](int32_t t, int32_t h, int32_t w) {
        out_t.push_back(t);
        out_h.push_back(h);
        out_w.push_back(w);
        next_pos = std::max(next_pos, t + 1);
    };

    size_t image_index = 0;
    int st             = 0;
    int fix_point      = 4096;
    while (st < n_tokens) {
        // Find the end of the text span: the next image token or the end of input.
        int ed = st;
        while (ed < n_tokens && input_ids[ed] != IMAGE_TOKEN_ID) {
            ed++;
        }
        const int st_idx = next_pos;
        if (ed >= n_tokens) {
            // Trailing text with no image after it.
            for (int i = 0; i < n_tokens - st; ++i) {
                emit(st_idx + i, st_idx + i, st_idx + i);
            }
            break;
        }

        // Every image run needs matching metadata; indexing past the end would be
        // an out-of-bounds read.
        GGML_ASSERT(image_index < image_grids.size());
        GGML_ASSERT(image_index < skip_vision_start_token.size());
        const int32_t skip = skip_vision_start_token[image_index];

        const int text_len = std::max(0, ed - st - skip);
        for (int i = 0; i < text_len; ++i) {
            emit(st_idx + i, st_idx + i, st_idx + i);
        }

        const auto& grid = image_grids[image_index];
        int base;
        if (skip) {
            if (fix_point > 0) {
                base      = fix_point;
                fix_point = 0;
            } else {
                base = st_idx;
            }
        } else {
            base = text_len + st_idx;
        }
        for (int32_t ti = 0; ti < grid[0]; ++ti) {
            for (int32_t hi = 0; hi < grid[1]; ++hi) {
                for (int32_t wi = 0; wi < grid[2]; ++wi) {
                    emit(base + ti, base + hi, base + wi);
                }
            }
        }
        st = ed + grid[0] * grid[1] * grid[2];
        image_index++;
    }
    GGML_ASSERT(out_t.size() == seq_len);

    std::vector<int32_t> position_ids(4 * seq_len, 0);
    for (size_t i = 0; i < seq_len; ++i) {
        // ggml IMROPE consumes 4 flattened position streams:
        // [t, h, w, e]
        // llama.cpp's generic Qwen-VL fallback expands text positions as
        // [pos, pos, pos, 0]. Keep the extra stream zeroed here too.
        position_ids[i]               = out_t[i];
        position_ids[seq_len + i]     = out_h[i];
        position_ids[seq_len * 2 + i] = out_w[i];
        position_ids[seq_len * 3 + i] = 0;
    }
    return position_ids;
}
// Turns a timestep tensor into a hidden-size embedding:
// ggml_ext_timestep_embedding -> Linear "mlp.0" -> SiLU -> Linear "mlp.2".
struct TimestepEmbedder : public GGMLBlock {
    // Width of the embedding produced by ggml_ext_timestep_embedding and fed to "mlp.0".
    int frequency_embedding_size = 256;

    TimestepEmbedder(int64_t hidden_size) {
        blocks["mlp.0"] = std::make_shared<Linear>(frequency_embedding_size, hidden_size, true);
        blocks["mlp.2"] = std::make_shared<Linear>(hidden_size, hidden_size, true);
    }

    // Returns the embedding for timestep tensor `t`.
    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* t) {
        auto fc_in  = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]);
        auto fc_out = std::dynamic_pointer_cast<Linear>(blocks["mlp.2"]);

        auto freq   = ggml_ext_timestep_embedding(ctx->ggml_ctx, t, frequency_embedding_size, 10000, 1000.0f);
        auto hidden = ggml_silu_inplace(ctx->ggml_ctx, fc_in->forward(ctx, freq));
        return fc_out->forward(ctx, hidden);
    }
};
// Two-stage patch embedding: Linear "proj1" (in_dim -> pca_dim) followed by
// Linear "proj2" (pca_dim -> embed_dim), with no activation in between.
struct BottleneckPatchEmbed : public GGMLBlock {
    BottleneckPatchEmbed(int64_t in_dim, int64_t pca_dim, int64_t embed_dim) {
        blocks["proj1"] = std::make_shared<Linear>(in_dim, pca_dim, false);
        blocks["proj2"] = std::make_shared<Linear>(pca_dim, embed_dim, true);
    }

    // Applies proj1 then proj2 to `x` and returns the result.
    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
        auto reduce = std::dynamic_pointer_cast<Linear>(blocks["proj1"]);
        auto expand = std::dynamic_pointer_cast<Linear>(blocks["proj2"]);

        auto bottleneck = reduce->forward(ctx, x);
        return expand->forward(ctx, bottleneck);
    }
};
// Output head: a single Linear "linear" mapping hidden_size to out_dim.
struct FinalLayer : public GGMLBlock {
    FinalLayer(int64_t hidden_size, int64_t out_dim) {
        blocks["linear"] = std::make_shared<Linear>(hidden_size, out_dim, true);
    }

    // Projects `x` through the "linear" block.
    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
        return std::dynamic_pointer_cast<Linear>(blocks["linear"])->forward(ctx, x);
    }
};
// Hyper-parameters for the HiDream-O1 model.
struct HiDreamO1Params {
    // Language-model configuration, including the nested `vision` settings.
    LLM::LLMParams llm;
    // Patch edge length in pixels; sizes the patch embedder input and final layer output.
    int patch_size = PATCH_SIZE;
};
// Returns the fixed HiDream-O1 configuration: a QWEN3_VL text model plus the
// matching QWEN3_VL vision settings. patch_size keeps its default.
static inline HiDreamO1Params make_hidream_o1_params() {
    HiDreamO1Params cfg;

    // Text model.
    auto& lm             = cfg.llm;
    lm.arch              = LLM::LLMArch::QWEN3_VL;
    lm.hidden_size       = 4096;
    lm.intermediate_size = 12288;
    lm.num_layers        = 36;
    lm.num_heads         = 32;
    lm.num_kv_heads      = 8;
    lm.head_dim          = 128;
    lm.qkv_bias          = false;
    lm.qk_norm           = true;
    lm.vocab_size        = 151936;
    lm.rms_norm_eps      = 1e-6f;

    // Vision model.
    auto& vit                   = lm.vision;
    vit.arch                    = LLM::LLMVisionArch::QWEN3_VL;
    vit.num_layers              = 27;
    vit.hidden_size             = 1152;
    vit.intermediate_size       = 4304;
    vit.num_heads               = 16;
    vit.out_hidden_size         = 4096;
    vit.patch_size              = 16;
    vit.spatial_merge_size      = 2;
    vit.temporal_patch_size     = 2;
    vit.num_position_embeddings = 2304;

    return cfg;
}
// Container block for the HiDream-O1 sub-modules: the language model, the
// timestep embedder, the patch embedder and the final projection.
struct HiDreamO1Model : public GGMLBlock {
    HiDreamO1Params params;

    HiDreamO1Model() = default;

    // Registers the four sub-blocks, sized from `params`.
    explicit HiDreamO1Model(HiDreamO1Params params)
        : params(std::move(params)) {
        auto& cfg            = this->params;
        const auto hidden    = cfg.llm.hidden_size;
        // Flattened size of one RGB patch: patch_size * patch_size * 3.
        const auto patch_dim = cfg.patch_size * cfg.patch_size * 3;

        blocks["language_model"] = std::make_shared<LLM::TextModel>(cfg.llm);
        blocks["t_embedder1"]    = std::make_shared<TimestepEmbedder>(hidden);
        blocks["x_embedder"]     = std::make_shared<BottleneckPatchEmbed>(patch_dim, hidden / 4, hidden);
        blocks["final_layer2"]   = std::make_shared<FinalLayer>(hidden, patch_dim);
    }

    // Typed accessors for the registered sub-blocks.
    std::shared_ptr<LLM::TextModel> text_model() {
        return std::dynamic_pointer_cast<LLM::TextModel>(blocks["language_model"]);
    }

    std::shared_ptr<TimestepEmbedder> timestep_embedder() {
        return std::dynamic_pointer_cast<TimestepEmbedder>(blocks["t_embedder1"]);
    }

    std::shared_ptr<BottleneckPatchEmbed> patch_embedder() {
        return std::dynamic_pointer_cast<BottleneckPatchEmbed>(blocks["x_embedder"]);
    }

    std::shared_ptr<FinalLayer> final_layer() {
        return std::dynamic_pointer_cast<FinalLayer>(blocks["final_layer2"]);
    }
};
// Runner that encodes an image tensor with the LLM::VisionModel configured by
// make_hidream_o1_params().
struct HiDreamO1VisionRunner : public GGMLRunner {
    HiDreamO1Params params;
    std::shared_ptr<LLM::VisionModel> model;

    // Buffers handed by reference to LLM::LLMRunner::encode_image_common.
    std::vector<int> window_index_vec;
    std::vector<int> window_inverse_index_vec;
    std::vector<float> window_mask_vec;
    std::vector<float> pe_vec;
    std::array<std::vector<int32_t>, 4> pos_embed_idx_data_;
    std::array<std::vector<float>, 4> pos_embed_weight_data_;

    HiDreamO1VisionRunner(ggml_backend_t backend,
                          ggml_backend_t params_backend,
                          const String2TensorStorage& tensor_storage_map = {},
                          const std::string& prefix                      = "model.visual")
        : GGMLRunner(backend, params_backend),
          params(make_hidream_o1_params()),
          model(std::make_shared<LLM::VisionModel>(false, params.llm.vision)) {
        model->init(params_ctx, tensor_storage_map, prefix);
    }

    std::string get_desc() override {
        return "hidream_o1_vision";
    }

    // Collects the vision model's parameter tensors under `prefix`.
    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix = "model.visual") {
        model->get_param_tensors(tensors, prefix);
    }

    // Delegates to the shared image-encoding routine using this runner's buffers.
    ggml_tensor* encode_image(GGMLRunnerContext* runner_ctx, ggml_tensor* image) {
        return LLM::LLMRunner::encode_image_common(this,
                                                   compute_ctx,
                                                   runner_ctx,
                                                   image,
                                                   params.llm.vision,
                                                   model,
                                                   window_index_vec,
                                                   window_inverse_index_vec,
                                                   window_mask_vec,
                                                   pe_vec,
                                                   pos_embed_idx_data_,
                                                   pos_embed_weight_data_);
    }

    // Builds the graph whose output is the image embedding of `image_tensor`.
    ggml_cgraph* build_graph(const sd::Tensor<float>& image_tensor) {
        ggml_cgraph* graph  = new_graph_custom(HIDREAM_O1_GRAPH_SIZE);
        ggml_tensor* pixels = make_input(image_tensor);
        auto ctx            = get_context();
        ggml_build_forward_expand(graph, encode_image(&ctx, pixels));
        return graph;
    }

    // Runs the encoder; returns an empty tensor when the computation yields no value.
    sd::Tensor<float> compute(int n_threads, const sd::Tensor<float>& image) {
        auto make_graph = [&]() {
            return build_graph(image);
        };
        auto embedding = GGMLRunner::compute<float>(make_graph, n_threads, false);
        if (!embedding.has_value()) {
            return sd::Tensor<float>();
        }
        return std::move(embedding.value());
    }
};
// Diffusion-model runner that builds and executes the HiDream-O1 graph:
// prompt embeddings (with the timestep embedding spliced in) and patch embeddings
// are concatenated, run through the text model, projected back to patches and
// converted into the returned (x - x_pred) / sigma tensor.
struct HiDreamO1Runner : public DiffusionModelRunner {
    HiDreamO1Params params;
    HiDreamO1Model model;
    // Host-side storage for the attention mask; kept as a member so the data
    // pointer given to set_backend_tensor_data() outlives build_graph().
    std::vector<float> attention_mask_vec;
    HiDreamO1Runner(ggml_backend_t backend,
    ggml_backend_t params_backend,
    const String2TensorStorage& tensor_storage_map = {},
    const std::string& prefix = "model")
    : DiffusionModelRunner(backend, params_backend, prefix),
    params(make_hidream_o1_params()) {
        model = HiDreamO1Model(params);
        model.init(params_ctx, tensor_storage_map, prefix);
    }
    std::string get_desc() override {
        return "hidream_o1";
    }
    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
        model.get_param_tensors(tensors, prefix);
    }
    // Builds the compute graph for one denoising evaluation.
    //   x_tensor            - current image tensor; it is patchified into sequence tokens.
    //   timestep_tensor     - timestep; element 0 is also read on the host to derive sigma.
    //   input_ids_tensor    - prompt token ids (text part only).
    //   input_pos_tensor    - position ids forwarded to the text model.
    //   token_types_tensor  - one entry per sequence position; > 0 marks generation tokens.
    //   vinput_mask_tensor  - optional; its first non-zero entry locates the predicted patches.
    //   image_embeds_tensor - (token offset, embedding) pairs spliced into the prompt embeddings.
    //   ref_images          - reference images appended after the target patches.
    ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,
    const sd::Tensor<float>& timestep_tensor,
    const sd::Tensor<int32_t>& input_ids_tensor,
    const sd::Tensor<int32_t>& input_pos_tensor,
    const sd::Tensor<int32_t>& token_types_tensor,
    const sd::Tensor<int32_t>& vinput_mask_tensor,
    const std::vector<std::pair<int, sd::Tensor<float>>>& image_embeds_tensor,
    const std::vector<sd::Tensor<float>>& ref_images) {
        ggml_cgraph* gf = new_graph_custom(HIDREAM_O1_GRAPH_SIZE);
        ggml_tensor* x = make_input(x_tensor);
        ggml_tensor* timestep = make_input(timestep_tensor);
        ggml_tensor* input_ids = make_input(input_ids_tensor);
        ggml_tensor* input_pos = make_input(input_pos_tensor);
        auto text_model = model.text_model();
        auto t_embedder1 = model.timestep_embedder();
        auto x_embedder = model.patch_embedder();
        auto final_layer2 = model.final_layer();
        std::vector<ggml_tensor*> ref_image_tensors;
        for (const auto& image : ref_images) {
            ref_image_tensors.push_back(make_input(image));
        }
        // Additive attention mask, indexed [query * total_seq_len + key].
        // Queries with token type 0 get -INFINITY on every later key (causal);
        // queries with token type > 0 keep 0 everywhere (attend to all positions).
        attention_mask_vec = std::vector<float>(static_cast<size_t>(token_types_tensor.shape()[0] * token_types_tensor.shape()[0]), 0.0f);
        int64_t total_seq_len = token_types_tensor.shape()[0];
        for (int64_t query = 0; query < total_seq_len; ++query) {
            bool is_gen = token_types_tensor.values()[static_cast<size_t>(query)] > 0;
            for (int64_t key = 0; key < total_seq_len; ++key) {
                if (!is_gen && key > query) {
                    attention_mask_vec[static_cast<size_t>(query * total_seq_len + key)] = -INFINITY;
                }
            }
        }
        auto attention_mask = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_F32, total_seq_len, total_seq_len);
        set_backend_tensor_data(attention_mask, attention_mask_vec.data());
        auto runner_ctx = get_context();
        auto txt = text_model->embed(&runner_ctx, input_ids);
        std::vector<std::pair<int, ggml_tensor*>> image_embeds;
        image_embeds.reserve(image_embeds_tensor.size());
        for (const auto& image_embed : image_embeds_tensor) {
            image_embeds.emplace_back(image_embed.first, make_input(image_embed.second));
        }
        txt = LLM::splice_image_embeds(&runner_ctx, txt, image_embeds);
        auto t_emb = t_embedder1->forward(&runner_ctx, timestep);
        // The last prompt position carries the timestep: keep the first
        // txt_seq_len - 1 embeddings and append t_emb in place of the final one.
        int64_t txt_seq_len = input_ids->ne[0];
        if (txt_seq_len > 1) {
            auto prefix = ggml_ext_slice(compute_ctx, txt, 1, 0, txt_seq_len - 1);
            txt = ggml_concat(compute_ctx, prefix, ggml_reshape_3d(compute_ctx, t_emb, t_emb->ne[0], 1, 1), 1);
        } else {
            txt = ggml_reshape_3d(compute_ctx, t_emb, t_emb->ne[0], 1, 1);
        }
        // Target patches come first, followed by the patches of each reference image.
        auto vinputs = DiT::pad_and_patchify(&runner_ctx, x, PATCH_SIZE, PATCH_SIZE);
        int64_t target_tokens = vinputs->ne[1];
        for (ggml_tensor* ref_image : ref_image_tensors) {
            auto ref = DiT::pad_and_patchify(&runner_ctx, ref_image, PATCH_SIZE, PATCH_SIZE);
            vinputs = ggml_concat(compute_ctx, vinputs, ref, 1);
        }
        auto vis = x_embedder->forward(&runner_ctx, vinputs);
        auto inputs_embeds = ggml_concat(compute_ctx, txt, vis, 1);
        auto hidden_states = text_model->forward_embeds(&runner_ctx, inputs_embeds, input_pos, attention_mask, {});
        auto x_pred_all = final_layer2->forward(&runner_ctx, hidden_states);
        // Predicted target patches start right after the prompt, unless a vinput
        // mask is supplied, in which case they start at its first non-zero entry.
        int64_t x_pred_start = txt_seq_len;
        if (!vinput_mask_tensor.empty()) {
            int64_t seq_len = static_cast<int64_t>(vinput_mask_tensor.shape()[0]);
            int64_t first_vinput = 0;
            while (first_vinput < seq_len && vinput_mask_tensor.values()[static_cast<size_t>(first_vinput)] == 0) {
                first_vinput++;
            }
            x_pred_start = first_vinput;
        }
        auto x_pred = ggml_ext_slice(compute_ctx, x_pred_all, 1, x_pred_start, x_pred_start + target_tokens);
        x_pred = DiT::unpatchify_and_crop(compute_ctx, x_pred, x->ne[1], x->ne[0], PATCH_SIZE, PATCH_SIZE);
        // sigma = 1 - timestep[0], floored at 1e-6 so the scale below stays finite.
        // The value is read on the host, so it is fixed when the graph is built.
        float sigma = 1.0f - timestep_tensor.values()[0];
        sigma = std::max(1e-6f, sigma);
        auto out = ggml_scale(compute_ctx, ggml_sub(compute_ctx, x, x_pred), 1.0f / sigma);
        ggml_build_forward_expand(gf, out);
        return gf;
    }
    // Builds the graph from explicit tensors, runs it, and passes the result
    // through restore_trailing_singleton_dims() together with x.dim().
    sd::Tensor<float> compute(int n_threads,
    const sd::Tensor<float>& x,
    const sd::Tensor<float>& timestep,
    const sd::Tensor<int32_t>& input_ids,
    const sd::Tensor<int32_t>& input_pos,
    const sd::Tensor<int32_t>& token_types,
    const sd::Tensor<int32_t>& vinput_mask,
    const std::vector<std::pair<int, sd::Tensor<float>>>& image_embeds,
    const std::vector<sd::Tensor<float>>& ref_images) {
        auto get_graph = [&]() {
            return build_graph(x, timestep, input_ids, input_pos, token_types, vinput_mask, image_embeds, ref_images);
        };
        return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
    }
    // DiffusionModelRunner entry point. Requires x, timesteps and a
    // HiDreamO1DiffusionExtra carrying input_ids, input_pos and token_types.
    // The vinput mask, image embeddings and reference latents are optional and
    // fall back to empty values.
    sd::Tensor<float> compute(int n_threads,
    const DiffusionParams& diffusion_params) override {
        GGML_ASSERT(diffusion_params.x != nullptr);
        GGML_ASSERT(diffusion_params.timesteps != nullptr);
        const auto* extra = diffusion_extra_as<HiDreamO1DiffusionExtra>(diffusion_params);
        GGML_ASSERT(extra != nullptr);
        GGML_ASSERT(extra->input_ids != nullptr);
        GGML_ASSERT(extra->input_pos != nullptr);
        GGML_ASSERT(extra->token_types != nullptr);
        static const std::vector<sd::Tensor<float>> empty_images;
        static const std::vector<std::pair<int, sd::Tensor<float>>> empty_image_embeds;
        return compute(n_threads,
        *diffusion_params.x,
        *diffusion_params.timesteps,
        *extra->input_ids,
        *extra->input_pos,
        *extra->token_types,
        tensor_or_empty(extra->vinput_mask),
        extra->image_embeds ? *extra->image_embeds : empty_image_embeds,
        diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_images);
    }
};
// Conditioner for HiDream-O1. It tokenizes the chat-formatted prompt, lays out
// the padded token sequence (prompt, target-image run, reference-image runs),
// derives position ids / token types / vinput mask, and encodes reference images
// with the vision runner.
struct HiDreamO1Conditioner : public Conditioner {
    Qwen2Tokenizer tokenizer;
    std::shared_ptr<HiDreamO1VisionRunner> vision_runner;

    HiDreamO1Conditioner(ggml_backend_t backend,
                         ggml_backend_t params_backend,
                         const String2TensorStorage& tensor_storage_map = {})
        : vision_runner(std::make_shared<HiDreamO1VisionRunner>(backend, params_backend, tensor_storage_map)) {}

    // The overrides below forward to the vision runner, the only component here
    // that owns parameters.
    void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
        vision_runner->get_param_tensors(tensors);
    }

    bool alloc_params_buffer() override {
        if (!vision_runner->alloc_params_buffer()) {
            return false;
        }
        return true;
    }

    void free_params_buffer() override {
        vision_runner->free_params_buffer();
    }

    size_t get_params_buffer_size() override {
        return vision_runner->get_params_buffer_size();
    }

    void set_max_graph_vram_bytes(size_t max_graph_vram_bytes) override {
        vision_runner->set_max_graph_vram_bytes(max_graph_vram_bytes);
    }

    void set_flash_attention_enabled(bool enabled) override {
        vision_runner->set_flash_attention_enabled(enabled);
    }

    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
        vision_runner->set_weight_adapter(adapter);
    }

    // Builds the conditioning for one generation request.
    //
    // Fills c_input_ids (prompt tokens only), c_position_ids, c_token_types and
    // c_vinput_mask (all sized for the padded sequence). When reference images
    // are supplied it also fills c_ref_images and c_image_embeds.
    //
    // Returns a default-constructed SDCondition after logging an error when:
    //   - the target width or height is smaller than one patch,
    //   - a resized reference image is smaller than one patch, or
    //   - the vision encoder fails.
    // The first two cases previously reached vector::insert with a count of -1.
    SDCondition get_learned_condition(int n_threads,
                                      const ConditionerParams& conditioner_params) override {
        SDCondition result;
        int width  = conditioner_params.width;
        int height = conditioner_params.height;

        const int32_t target_grid_w = static_cast<int32_t>(width / PATCH_SIZE);
        const int32_t target_grid_h = static_cast<int32_t>(height / PATCH_SIZE);
        if (target_grid_w < 1 || target_grid_h < 1) {
            LOG_ERROR("hidream_o1 conditioner: target width and height must each cover at least one patch");
            return SDCondition();
        }
        const int64_t target_image_len = static_cast<int64_t>(target_grid_w) * static_cast<int64_t>(target_grid_h);

        std::vector<sd::Tensor<float>> ref_images;
        if (conditioner_params.ref_images != nullptr) {
            ref_images = *conditioner_params.ref_images;
        }
        const int ref_count     = static_cast<int>(ref_images.size());
        const int max_size      = reference_max_size(ref_count, std::max(height, width));
        const int cond_img_size = reference_cond_size(ref_count);

        std::vector<std::pair<int, sd::Tensor<float>>> vlm_images;
        std::vector<std::array<int32_t, 3>> image_grids;
        std::vector<int32_t> skip_vision_start;
        std::string prompt = "<|im_start|>user\n";

        // With no reference images this loop does nothing and the prompt is text only.
        for (const auto& ref_image : ref_images) {
            auto resized_ref = resize_to_area(ref_image, max_size);
            if (resized_ref.shape()[0] < PATCH_SIZE || resized_ref.shape()[1] < PATCH_SIZE) {
                LOG_ERROR("hidream_o1 conditioner: resized reference image is smaller than one patch");
                return SDCondition();
            }
            resized_ref = sd::ops::clamp(resized_ref, 0.0f, 1.0f);
            // VLM image: Qwen3-VL expects mean=[0.5]/std=[0.5] (i.e. range [-1,1]),
            // not CLIP normalization. Resize the already-resized ref directly to
            // (cond_w, cond_h) to match the Python pipeline's pil_r.resize().
            auto dims = calculate_dimensions(cond_img_size,
                                             static_cast<double>(resized_ref.shape()[0]) / static_cast<double>(resized_ref.shape()[1]));
            sd::Tensor<float> vlm_image = sd::ops::interpolate(
                resized_ref,
                {dims.first, dims.second, resized_ref.shape()[2], resized_ref.shape()[3]});
            vlm_image            = vlm_image * 2.0f - 1.0f;
            int64_t image_tokens = static_cast<int64_t>(dims.first / PATCH_SIZE) * static_cast<int64_t>(dims.second / PATCH_SIZE);

            // The patch-level copy of the reference is also mapped to [-1, 1].
            auto patch_img = resized_ref * 2.0f - 1.0f;
            result.c_ref_images.push_back(std::move(patch_img));

            // The embedding offset is the token count up to and including
            // <|vision_start|>, i.e. the index of the first image pad token.
            prompt += "<|vision_start|>";
            int64_t prompt_start = static_cast<int64_t>(tokenizer.encode(prompt, nullptr).size());
            prompt += repeat_special_token("<|image_pad|>", image_tokens);
            prompt += "<|vision_end|>";

            vlm_images.emplace_back(static_cast<int>(prompt_start), std::move(vlm_image));
            image_grids.push_back({1, dims.second / PATCH_SIZE, dims.first / PATCH_SIZE});
            skip_vision_start.push_back(0);
        }

        prompt += conditioner_params.text;
        prompt += "<|im_end|>\n<|im_start|>assistant\n<|boi_token|><|tms_token|>";
        auto input_ids                     = tokenizer.encode(prompt, nullptr);
        std::vector<int32_t> input_ids_pad = input_ids;

        // Appends one generated-image run: a vision-start token followed by
        // n_tokens - 1 image tokens, so the run occupies n_tokens positions.
        // Callers guarantee n_tokens >= 1.
        const auto append_image_run = [&](int64_t n_tokens, int32_t grid_h, int32_t grid_w) {
            input_ids_pad.push_back(VISION_START_TOKEN_ID);
            input_ids_pad.insert(input_ids_pad.end(), static_cast<size_t>(n_tokens - 1), IMAGE_TOKEN_ID);
            image_grids.push_back({1, grid_h, grid_w});
            skip_vision_start.push_back(1);
        };

        // The target image comes first, then one run per reference image.
        append_image_run(target_image_len, target_grid_h, target_grid_w);
        for (const auto& ref_image : result.c_ref_images) {
            const int32_t ref_grid_w = static_cast<int32_t>(ref_image.shape()[0] / PATCH_SIZE);
            const int32_t ref_grid_h = static_cast<int32_t>(ref_image.shape()[1] / PATCH_SIZE);
            append_image_run(static_cast<int64_t>(ref_grid_w) * static_cast<int64_t>(ref_grid_h), ref_grid_h, ref_grid_w);
        }

        const int txt_seq_len = static_cast<int>(input_ids.size());
        const int total_len   = static_cast<int>(input_ids_pad.size());

        // Token type 1 covers the trailing timestep token(s) and everything after them.
        std::vector<int32_t> token_types(input_ids_pad.size(), 0);
        for (int i = std::max(0, txt_seq_len - TIMESTEP_TOKEN_NUM); i < total_len; ++i) {
            token_types[static_cast<size_t>(i)] = 1;
        }

        // The vinput mask covers only the positions appended after the prompt.
        std::vector<int32_t> vinput_mask(input_ids_pad.size(), 0);
        for (int i = txt_seq_len; i < total_len; ++i) {
            vinput_mask[static_cast<size_t>(i)] = 1;
        }

        std::vector<int64_t> input_shape{static_cast<int64_t>(input_ids.size())};
        std::vector<int64_t> position_shape{static_cast<int64_t>(input_ids_pad.size() * 4)};
        std::vector<int64_t> token_type_shape{static_cast<int64_t>(token_types.size())};
        std::vector<int64_t> vinput_mask_shape{static_cast<int64_t>(vinput_mask.size())};

        result.c_input_ids    = sd::Tensor<int32_t>(input_shape, std::move(input_ids));
        result.c_position_ids = sd::Tensor<int32_t>(position_shape, build_position_ids(input_ids_pad, image_grids, skip_vision_start));
        result.c_token_types  = sd::Tensor<int32_t>(token_type_shape, std::move(token_types));
        result.c_vinput_mask  = sd::Tensor<int32_t>(vinput_mask_shape, std::move(vinput_mask));

        result.c_image_embeds.reserve(vlm_images.size());
        for (const auto& vlm_image : vlm_images) {
            auto image_embed = vision_runner->compute(n_threads, vlm_image.second);
            if (image_embed.empty()) {
                LOG_ERROR("hidream_o1 conditioner: encode VLM image failed");
                return SDCondition();
            }
            result.c_image_embeds.emplace_back(vlm_image.first, std::move(image_embed));
        }
        return result;
    }

private:
    // Side-length budget passed to resize_to_area() for each reference image;
    // it shrinks as the number of reference images grows.
    static int reference_max_size(int ref_count, int longest_side) {
        if (ref_count == 1) {
            return longest_side;
        }
        if (ref_count == 2) {
            return longest_side * 48 / 64;
        }
        if (ref_count <= 4) {
            return longest_side / 2;
        }
        if (ref_count <= 8) {
            return longest_side * 24 / 64;
        }
        return longest_side / 4;
    }

    // Side-length budget passed to calculate_dimensions() for the vision-encoder
    // copy of each reference image.
    static int reference_cond_size(int ref_count) {
        if (ref_count <= 4) {
            return 384;
        }
        if (ref_count <= 8) {
            return 384 * 48 / 64;
        }
        return 384 / 2;
    }
};
} // namespace HiDreamO1
#endif // __SD_HIDREAM_O1_H__