diff --git a/README.md b/README.md
index 80d98c30..33c272e9 100644
--- a/README.md
+++ b/README.md
@@ -58,6 +58,7 @@ API and command-line option may change frequently.***
- [Ovis-Image](./docs/ovis_image.md)
- [Anima](./docs/anima.md)
- [ERNIE-Image](./docs/ernie_image.md)
+ - [HiDream-O1-Image](./docs/hidream_o1_image.md)
- Image Edit Models
- [FLUX.1-Kontext-dev](./docs/kontext.md)
- [Qwen Image Edit series](./docs/qwen_image_edit.md)
@@ -148,6 +149,7 @@ If you want to improve performance or reduce VRAM/RAM usage, please refer to [pe
- [Ovis-Image](./docs/ovis_image.md)
- [Anima](./docs/anima.md)
- [ERNIE-Image](./docs/ernie_image.md)
+- [HiDream-O1-Image](./docs/hidream_o1_image.md)
- [LoRA](./docs/lora.md)
- [LCM/LCM-LoRA](./docs/lcm.md)
- [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)
diff --git a/assets/hidream-o1/dev_example.png b/assets/hidream-o1/dev_example.png
new file mode 100644
index 00000000..e7ab12bb
Binary files /dev/null and b/assets/hidream-o1/dev_example.png differ
diff --git a/docs/hidream_o1_image.md b/docs/hidream_o1_image.md
new file mode 100644
index 00000000..771d4f29
--- /dev/null
+++ b/docs/hidream_o1_image.md
@@ -0,0 +1,20 @@
+# How to Use
+
+## Download weights
+
+- Download HiDream-O1-Image-Dev
+ - safetensors: https://huggingface.co/Comfy-Org/HiDream-O1-Image/tree/main/checkpoints
+- Download HiDream-O1-Image
+ - safetensors: https://huggingface.co/Comfy-Org/HiDream-O1-Image/tree/main/checkpoints
+
+## Examples
+
+### HiDream-O1-Image-Dev
+
+```
+.\bin\Release\sd-cli.exe -m ..\..\ComfyUI\models\diffusion_models\hidream_o1_image_dev_bf16.safetensors -p "a lovely cat holding a sign says
+'hidream o1 cpp'" --cfg-scale 1.0 -v -H 1024 -W 1024
+```
+
+<img src="../assets/hidream-o1/dev_example.png" alt="HiDream-O1-Image-Dev example" />
+
diff --git a/examples/cli/README.md b/examples/cli/README.md
index 5fbeec39..b5475794 100644
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@@ -103,6 +103,8 @@ Generation Options:
--hires-upscaler highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent
(nearest-exact), Latent (antialiased), Latent (bicubic), Latent (bicubic
antialiased), or a model name under --hires-upscalers-dir (default: Latent)
+ --extra-sample-args extra sampler args, key=value list. Currently lcm supports noise_clip_std,
+ noise_scale_start, noise_scale_end
-H, --height image height, in pixel space (default: 512)
-W, --width image width, in pixel space (default: 512)
--steps number of sample steps (default: 20)
diff --git a/examples/common/common.cpp b/examples/common/common.cpp
index 8ca7a2dc..28deecfa 100644
--- a/examples/common/common.cpp
+++ b/examples/common/common.cpp
@@ -807,6 +807,10 @@ ArgOptions SDGenerationParams::get_options() {
"Latent (antialiased), Latent (bicubic), Latent (bicubic antialiased), or a model name "
"under --hires-upscalers-dir (default: Latent)",
&hires_upscaler},
+ {"",
+ "--extra-sample-args",
+ "extra sampler args, key=value list. Currently lcm supports noise_clip_std, noise_scale_start, noise_scale_end",
+ &extra_sample_args},
};
options.int_options = {
@@ -1607,6 +1611,7 @@ bool SDGenerationParams::from_json_str(
auto parse_sample_params_json = [&](const json& sample_json,
sd_sample_params_t& target_params,
+ std::string& target_extra_sample_args,
std::vector<int>& target_skip_layers,
std::vector<float>* target_custom_sigmas) {
if (sample_json.contains("sample_steps") && sample_json["sample_steps"].is_number_integer()) {
@@ -1621,6 +1626,9 @@ bool SDGenerationParams::from_json_str(
if (sample_json.contains("flow_shift") && sample_json["flow_shift"].is_number()) {
target_params.flow_shift = sample_json["flow_shift"];
}
+ if (sample_json.contains("extra_sample_args") && sample_json["extra_sample_args"].is_string()) {
+ target_extra_sample_args = sample_json["extra_sample_args"].get<std::string>();
+ }
if (target_custom_sigmas != nullptr &&
sample_json.contains("custom_sigmas") &&
sample_json["custom_sigmas"].is_array()) {
@@ -1668,11 +1676,12 @@ bool SDGenerationParams::from_json_str(
};
if (j.contains("sample_params") && j["sample_params"].is_object()) {
- parse_sample_params_json(j["sample_params"], sample_params, skip_layers, &custom_sigmas);
+ parse_sample_params_json(j["sample_params"], sample_params, extra_sample_args, skip_layers, &custom_sigmas);
}
if (j.contains("high_noise_sample_params") && j["high_noise_sample_params"].is_object()) {
parse_sample_params_json(j["high_noise_sample_params"],
high_noise_sample_params,
+ high_noise_extra_sample_args,
high_noise_skip_layers,
nullptr);
}
@@ -2099,6 +2108,8 @@ sd_img_gen_params_t SDGenerationParams::to_sd_img_gen_params_t() {
high_noise_sample_params.guidance.slg.layer_count = high_noise_skip_layers.size();
sample_params.custom_sigmas = custom_sigmas.empty() ? nullptr : custom_sigmas.data();
sample_params.custom_sigmas_count = static_cast<int>(custom_sigmas.size());
+ sample_params.extra_sample_args = extra_sample_args.empty() ? nullptr : extra_sample_args.c_str();
+ high_noise_sample_params.extra_sample_args = high_noise_extra_sample_args.empty() ? nullptr : high_noise_extra_sample_args.c_str();
cache_params.scm_mask = scm_mask.empty() ? nullptr : scm_mask.c_str();
sd_pm_params_t pm_params = {
@@ -2168,6 +2179,8 @@ sd_vid_gen_params_t SDGenerationParams::to_sd_vid_gen_params_t() {
high_noise_sample_params.guidance.slg.layer_count = high_noise_skip_layers.size();
sample_params.custom_sigmas = custom_sigmas.empty() ? nullptr : custom_sigmas.data();
sample_params.custom_sigmas_count = static_cast<int>(custom_sigmas.size());
+ sample_params.extra_sample_args = extra_sample_args.empty() ? nullptr : extra_sample_args.c_str();
+ high_noise_sample_params.extra_sample_args = high_noise_extra_sample_args.empty() ? nullptr : high_noise_extra_sample_args.c_str();
cache_params.scm_mask = scm_mask.empty() ? nullptr : scm_mask.c_str();
params.loras = lora_vec.empty() ? nullptr : lora_vec.data();
@@ -2306,6 +2319,7 @@ static json build_sampling_metadata_json(const sd_sample_params_t& sample_params
{"eta", sample_params.eta},
{"shifted_timestep", sample_params.shifted_timestep},
{"flow_shift", sample_params.flow_shift},
+ {"extra_sample_args", safe_json_string(sample_params.extra_sample_args)},
{"guidance",
{
{"txt_cfg", sample_params.guidance.txt_cfg},
@@ -2497,6 +2511,9 @@ std::string get_image_params(const SDContextParams& ctx_params,
}
parameter_string += "Guidance: " + std::to_string(gen_params.sample_params.guidance.distilled_guidance) + ", ";
parameter_string += "Eta: " + std::to_string(gen_params.sample_params.eta) + ", ";
+ if (!gen_params.extra_sample_args.empty()) {
+ parameter_string += "Extra sample args: " + gen_params.extra_sample_args + ", ";
+ }
parameter_string += "Seed: " + std::to_string(seed) + ", ";
parameter_string += "Size: " + std::to_string(gen_params.get_resolved_width()) + "x" + std::to_string(gen_params.get_resolved_height()) + ", ";
parameter_string += "Model: " + sd_basename(ctx_params.model_path) + ", ";
diff --git a/examples/common/common.h b/examples/common/common.h
index f87293f3..badaa875 100644
--- a/examples/common/common.h
+++ b/examples/common/common.h
@@ -168,6 +168,8 @@ struct SDGenerationParams {
sd_sample_params_t sample_params;
sd_sample_params_t high_noise_sample_params;
+ std::string extra_sample_args;
+ std::string high_noise_extra_sample_args;
std::vector<int> skip_layers = {7, 8, 9};
std::vector<int> high_noise_skip_layers = {7, 8, 9};
diff --git a/examples/server/README.md b/examples/server/README.md
index ead185cf..a2160203 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -205,6 +205,8 @@ Default Generation Options:
--hires-upscaler highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent
(nearest-exact), Latent (antialiased), Latent (bicubic), Latent (bicubic
antialiased), or a model name under --hires-upscalers-dir (default: Latent)
+ --extra-sample-args extra sampler args, key=value list. Currently lcm supports noise_clip_std,
+ noise_scale_start, noise_scale_end
-H, --height image height, in pixel space (default: 512)
-W, --width image width, in pixel space (default: 512)
--steps number of sample steps (default: 20)
diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
index 7f87d669..d906f856 100644
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@@ -240,6 +240,7 @@ typedef struct {
float* custom_sigmas;
int custom_sigmas_count;
float flow_shift;
+ const char* extra_sample_args;
} sd_sample_params_t;
typedef struct {
diff --git a/src/conditioner.hpp b/src/conditioner.hpp
index 4907938b..5050eeff 100644
--- a/src/conditioner.hpp
+++ b/src/conditioner.hpp
@@ -14,6 +14,12 @@ struct SDCondition {
sd::Tensor c_concat;
sd::Tensor c_t5_ids;
sd::Tensor c_t5_weights;
+ sd::Tensor c_input_ids;
+ sd::Tensor c_position_ids;
+ sd::Tensor c_token_types;
+ sd::Tensor c_vinput_mask;
+ std::vector>> c_image_embeds;
+ std::vector> c_ref_images;
std::vector> extra_c_crossattns;
@@ -26,10 +32,24 @@ struct SDCondition {
bool empty() const {
if (!c_crossattn.empty() || !c_vector.empty() || !c_concat.empty() ||
- !c_t5_ids.empty() || !c_t5_weights.empty()) {
+ !c_t5_ids.empty() || !c_t5_weights.empty() ||
+ !c_input_ids.empty() || !c_position_ids.empty() ||
+ !c_token_types.empty() || !c_vinput_mask.empty()) {
return false;
}
+ for (const auto& image_embed : c_image_embeds) {
+ if (!image_embed.second.empty()) {
+ return false;
+ }
+ }
+
+ for (const auto& tensor : c_ref_images) {
+ if (!tensor.empty()) {
+ return false;
+ }
+ }
+
for (const auto& tensor : extra_c_crossattns) {
if (!tensor.empty()) {
return false;
diff --git a/src/denoiser.hpp b/src/denoiser.hpp
index 3742f53b..0b0f8201 100644
--- a/src/denoiser.hpp
+++ b/src/denoiser.hpp
@@ -2,6 +2,7 @@
#define __DENOISER_HPP__
#include
+#include
#include
#include "ggml_extend.hpp"
@@ -1148,7 +1149,80 @@ static sd::Tensor sample_lcm(denoise_cb_t model,
sd::Tensor x,
const std::vector<float>& sigmas,
std::shared_ptr rng,
- bool is_flow_denoiser) {
+ bool is_flow_denoiser,
+ const char* extra_sample_args = nullptr) {
+ struct LCMSampleArgs {
+ float noise_clip_std = 0.0f;
+ float noise_scale_start = 1.0f;
+ float noise_scale_end = 1.0f;
+ };
+
+ auto trim = [](std::string value) -> std::string {
+ const char* whitespace = " \t\r\n";
+ size_t begin = value.find_first_not_of(whitespace);
+ if (begin == std::string::npos) {
+ return "";
+ }
+ size_t end = value.find_last_not_of(whitespace);
+ return value.substr(begin, end - begin + 1);
+ };
+
+ LCMSampleArgs args;
+ if (extra_sample_args != nullptr && extra_sample_args[0] != '\0') {
+ std::string raw(extra_sample_args);
+ size_t start = 0;
+ bool noise_scale_end_was_set = false;
+ bool noise_scale_start_was_set = false;
+ auto parse_arg = [&](const std::string& item) {
+ std::string token = trim(item);
+ if (token.empty()) {
+ return;
+ }
+ size_t eq = token.find('=');
+ if (eq == std::string::npos) {
+ LOG_WARN("ignoring invalid lcm extra sample arg '%s'", token.c_str());
+ return;
+ }
+
+ std::string key = trim(token.substr(0, eq));
+ std::string value = trim(token.substr(eq + 1));
+ float parsed = 0.0f;
+ try {
+ size_t consumed = 0;
+ parsed = std::stof(value, &consumed);
+ if (trim(value.substr(consumed)).size() != 0) {
+ LOG_WARN("ignoring invalid lcm extra sample arg '%s'", token.c_str());
+ return;
+ }
+ } catch (const std::exception&) {
+ LOG_WARN("ignoring invalid lcm extra sample arg '%s'", token.c_str());
+ return;
+ }
+
+ if (key == "noise_clip_std") {
+ args.noise_clip_std = parsed;
+ } else if (key == "noise_scale_start") {
+ args.noise_scale_start = parsed;
+ noise_scale_start_was_set = true;
+ } else if (key == "noise_scale_end") {
+ args.noise_scale_end = parsed;
+ noise_scale_end_was_set = true;
+ } else {
+ LOG_WARN("ignoring unknown lcm extra sample arg '%s'", key.c_str());
+ }
+ };
+
+ for (size_t pos = 0; pos <= raw.size(); ++pos) {
+ if (pos == raw.size() || raw[pos] == ',' || raw[pos] == ';') {
+ parse_arg(raw.substr(start, pos - start));
+ start = pos + 1;
+ }
+ }
+ if (noise_scale_start_was_set && !noise_scale_end_was_set) {
+ args.noise_scale_end = args.noise_scale_start;
+ }
+ }
+
int steps = static_cast<int>(sigmas.size()) - 1;
for (int i = 0; i < steps; i++) {
auto denoised_opt = model(x, sigmas[i], i + 1, nullptr);
@@ -1160,7 +1234,27 @@ static sd::Tensor sample_lcm(denoise_cb_t model,
if (is_flow_denoiser) {
x *= (1 - sigmas[i + 1]);
}
- x += sd::Tensor::randn_like(x, rng) * sigmas[i + 1];
+ auto noise = sd::Tensor::randn_like(x, rng);
+ if (args.noise_clip_std > 0.0f && noise.numel() > 0) {
+ double mean = 0.0;
+ for (int64_t j = 0; j < noise.numel(); ++j) {
+ mean += static_cast(noise[j]);
+ }
+ mean /= static_cast(noise.numel());
+
+ double variance = 0.0;
+ for (int64_t j = 0; j < noise.numel(); ++j) {
+ double centered = static_cast(noise[j]) - mean;
+ variance += centered * centered;
+ }
+ variance /= static_cast(noise.numel());
+
+ float clip_val = args.noise_clip_std * static_cast(std::sqrt(variance));
+ noise = sd::ops::clamp(noise, -clip_val, clip_val);
+ }
+ float t = steps > 1 ? static_cast(i) / static_cast(steps - 1) : 0.0f;
+ float noise_scale = args.noise_scale_start + (args.noise_scale_end - args.noise_scale_start) * t;
+ x += noise * (sigmas[i + 1] * noise_scale);
}
}
return x;
@@ -1656,15 +1750,15 @@ static sd::Tensor sample_euler_cfg_pp(denoise_cb_t model,
for (int i = 0; i < steps; i++) {
float sigma = sigmas[i];
sd::Tensor uncond_denoised;
-
+
auto denoised_opt = model(x, sigma, i + 1, &uncond_denoised);
if (denoised_opt.empty() || uncond_denoised.empty()) {
return {};
}
-
+
sd::Tensor denoised = std::move(denoised_opt);
- sd::Tensor d = (x - uncond_denoised) / sigma;
-
+ sd::Tensor d = (x - uncond_denoised) / sigma;
+
x = denoised + d * sigmas[i + 1];
}
return x;
@@ -1679,19 +1773,19 @@ static sd::Tensor sample_euler_ancestral_cfg_pp(denoise_cb_t model,
for (int i = 0; i < steps; i++) {
float sigma = sigmas[i];
sd::Tensor uncond_denoised;
-
+
auto denoised_opt = model(x, sigma, i + 1, &uncond_denoised);
if (denoised_opt.empty() || uncond_denoised.empty()) {
return {};
}
-
+
sd::Tensor denoised = std::move(denoised_opt);
- sd::Tensor d = (x - uncond_denoised) / sigma;
-
+ sd::Tensor d = (x - uncond_denoised) / sigma;
+
auto [sigma_down, sigma_up] = get_ancestral_step(sigmas[i], sigmas[i + 1], eta);
-
+
x = denoised + d * sigma_down;
-
+
if (sigmas[i + 1] > 0) {
x += sd::Tensor::randn_like(x, rng) * sigma_up;
}
@@ -1706,7 +1800,8 @@ static sd::Tensor sample_k_diffusion(sample_method_t method,
std::vector<float> sigmas,
std::shared_ptr rng,
float eta,
- bool is_flow_denoiser) {
+ bool is_flow_denoiser,
+ const char* extra_sample_args) {
switch (method) {
case EULER_A_SAMPLE_METHOD:
if (is_flow_denoiser)
@@ -1729,7 +1824,7 @@ static sd::Tensor sample_k_diffusion(sample_method_t method,
case DPMPP2Mv2_SAMPLE_METHOD:
return sample_dpmpp_2m_v2(model, std::move(x), sigmas);
case LCM_SAMPLE_METHOD:
- return sample_lcm(model, std::move(x), sigmas, rng, is_flow_denoiser);
+ return sample_lcm(model, std::move(x), sigmas, rng, is_flow_denoiser, extra_sample_args);
case IPNDM_SAMPLE_METHOD:
return sample_ipndm(model, std::move(x), sigmas);
case IPNDM_V_SAMPLE_METHOD:
diff --git a/src/diffusion_model.hpp b/src/diffusion_model.hpp
index 1a202a1a..26021ef2 100644
--- a/src/diffusion_model.hpp
+++ b/src/diffusion_model.hpp
@@ -5,6 +5,7 @@
#include "anima.hpp"
#include "ernie_image.hpp"
#include "flux.hpp"
+#include "hidream_o1.hpp"
#include "mmdit.hpp"
#include "qwen_image.hpp"
#include "tensor_ggml.hpp"
@@ -13,22 +14,28 @@
#include "z_image.hpp"
struct DiffusionParams {
- const sd::Tensor* x = nullptr;
- const sd::Tensor* timesteps = nullptr;
- const sd::Tensor* context = nullptr;
- const sd::Tensor* c_concat = nullptr;
- const sd::Tensor* y = nullptr;
- const sd::Tensor* t5_ids = nullptr;
- const sd::Tensor* t5_weights = nullptr;
- const sd::Tensor* guidance = nullptr;
- const std::vector>* ref_latents = nullptr;
- bool increase_ref_index = false;
- int num_video_frames = -1;
- const std::vector>* controls = nullptr;
- float control_strength = 0.f;
- const sd::Tensor* vace_context = nullptr;
- float vace_strength = 1.f;
- const std::vector* skip_layers = nullptr;
+ const sd::Tensor* x = nullptr;
+ const sd::Tensor* timesteps = nullptr;
+ const sd::Tensor* context = nullptr;
+ const sd::Tensor* c_concat = nullptr;
+ const sd::Tensor* y = nullptr;
+ const sd::Tensor* t5_ids = nullptr;
+ const sd::Tensor* t5_weights = nullptr;
+ const sd::Tensor* guidance = nullptr;
+ const std::vector>* ref_latents = nullptr;
+ const sd::Tensor* input_ids = nullptr;
+ const sd::Tensor* input_pos = nullptr;
+ const sd::Tensor* token_types = nullptr;
+ const sd::Tensor* vinput_mask = nullptr;
+ const std::vector>* vlm_images = nullptr;
+ const std::vector>>* image_embeds = nullptr;
+ bool increase_ref_index = false;
+ int num_video_frames = -1;
+ const std::vector>* controls = nullptr;
+ float control_strength = 0.f;
+ const sd::Tensor* vace_context = nullptr;
+ float vace_strength = 1.f;
+ const std::vector* skip_layers = nullptr;
};
template
@@ -476,6 +483,82 @@ struct QwenImageModel : public DiffusionModel {
}
};
+struct HiDreamO1Model : public DiffusionModel {
+ std::string prefix;
+ HiDreamO1::HiDreamO1Runner hidream_o1;
+
+ HiDreamO1Model(ggml_backend_t backend,
+ bool offload_params_to_cpu,
+ const String2TensorStorage& tensor_storage_map = {},
+ const std::string& prefix = "model")
+ : prefix(prefix), hidream_o1(backend, offload_params_to_cpu, tensor_storage_map, prefix) {
+ }
+
+ std::string get_desc() override {
+ return hidream_o1.get_desc();
+ }
+
+ void alloc_params_buffer() override {
+ hidream_o1.alloc_params_buffer();
+ }
+
+ void free_params_buffer() override {
+ hidream_o1.free_params_buffer();
+ }
+
+ void free_compute_buffer() override {
+ hidream_o1.free_compute_buffer();
+ }
+
+ void get_param_tensors(std::map& tensors) override {
+ hidream_o1.get_param_tensors(tensors, prefix);
+ }
+
+ size_t get_params_buffer_size() override {
+ return hidream_o1.get_params_buffer_size();
+ }
+
+ void set_weight_adapter(const std::shared_ptr& adapter) override {
+ hidream_o1.set_weight_adapter(adapter);
+ }
+
+ int64_t get_adm_in_channels() override {
+ return 0;
+ }
+
+ void set_flash_attention_enabled(bool enabled) {
+ hidream_o1.set_flash_attention_enabled(enabled);
+ }
+
+ void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
+ hidream_o1.set_max_graph_vram_bytes(max_vram_bytes);
+ }
+
+ void set_circular_axes(bool circular_x, bool circular_y) override {
+ hidream_o1.set_circular_axes(circular_x, circular_y);
+ }
+
+ sd::Tensor compute(int n_threads,
+ const DiffusionParams& diffusion_params) override {
+ GGML_ASSERT(diffusion_params.x != nullptr);
+ GGML_ASSERT(diffusion_params.timesteps != nullptr);
+ GGML_ASSERT(diffusion_params.input_ids != nullptr);
+ GGML_ASSERT(diffusion_params.input_pos != nullptr);
+ GGML_ASSERT(diffusion_params.token_types != nullptr);
+ static const std::vector> empty_images;
+ static const std::vector>> empty_image_embeds;
+ return hidream_o1.compute(n_threads,
+ *diffusion_params.x,
+ *diffusion_params.timesteps,
+ *diffusion_params.input_ids,
+ *diffusion_params.input_pos,
+ *diffusion_params.token_types,
+ tensor_or_empty(diffusion_params.vinput_mask),
+ diffusion_params.image_embeds ? *diffusion_params.image_embeds : empty_image_embeds,
+ diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_images);
+ }
+};
+
struct ZImageModel : public DiffusionModel {
std::string prefix;
ZImage::ZImageRunner z_image;
diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp
index f88eeb60..c6cd1c3a 100644
--- a/src/ggml_extend.hpp
+++ b/src/ggml_extend.hpp
@@ -280,6 +280,9 @@ __STATIC_INLINE__ void print_sd_tensor(const sd::Tensor& tensor, bool shape_o
if (shape_only) {
return;
}
+ if (tensor.empty()) {
+ return;
+ }
int range = 3;
std::vector shape = tensor.shape();
while (shape.size() < 4) {
@@ -1698,13 +1701,41 @@ struct WeightAdapter {
};
struct GGMLRunnerContext {
- ggml_backend_t backend = nullptr;
- ggml_context* ggml_ctx = nullptr;
- bool flash_attn_enabled = false;
- bool conv2d_direct_enabled = false;
- bool circular_x_enabled = false;
- bool circular_y_enabled = false;
- std::shared_ptr weight_adapter = nullptr;
+ ggml_backend_t backend = nullptr;
+ ggml_context* ggml_ctx = nullptr;
+ bool flash_attn_enabled = false;
+ bool conv2d_direct_enabled = false;
+ bool circular_x_enabled = false;
+ bool circular_y_enabled = false;
+ std::shared_ptr weight_adapter = nullptr;
+ std::vector<std::pair<ggml_tensor*, std::string>>* debug_tensors = nullptr;
+ std::function<ggml_tensor*(const std::string&)> get_cache_tensor;
+ std::function<void(const std::string&, ggml_tensor*)> cache_tensor;
+
+ void capture_tensor(const std::string& name, ggml_tensor* tensor) {
+ if (debug_tensors == nullptr || tensor == nullptr) {
+ return;
+ }
+ ggml_tensor* snapshot = ggml_cont(ggml_ctx, tensor);
+ ggml_tensor* dst = ggml_dup_tensor(ggml_ctx, snapshot);
+ snapshot = ggml_cpy(ggml_ctx, snapshot, dst);
+ ggml_set_output(snapshot);
+ debug_tensors->push_back({snapshot, name});
+ }
+
+ ggml_tensor* load_cache_tensor(const std::string& name) const {
+ if (!get_cache_tensor) {
+ return nullptr;
+ }
+ return get_cache_tensor(name);
+ }
+
+ void persist_cache_tensor(const std::string& name, ggml_tensor* tensor) const {
+ if (!cache_tensor || tensor == nullptr) {
+ return;
+ }
+ cache_tensor(name, tensor);
+ }
};
struct GGMLRunner {
@@ -1743,6 +1774,7 @@ protected:
std::map backend_tensor_data_map;
std::map cache_tensor_map; // name -> tensor
+ std::vector<std::pair<ggml_tensor*, std::string>> debug_tensors;
const std::string final_result_name = "ggml_runner_final_result_tensor";
bool flash_attn_enabled = false;
@@ -1838,6 +1870,7 @@ protected:
}
void free_compute_ctx() {
+ debug_tensors.clear();
if (compute_ctx != nullptr) {
ggml_free(compute_ctx);
compute_ctx = nullptr;
@@ -1884,6 +1917,16 @@ protected:
auto result = ggml_graph_node(gf, -1);
ggml_set_name(result, final_result_name.c_str());
}
+ for (const auto& entry : debug_tensors) {
+ if (entry.first != nullptr) {
+ ggml_build_forward_expand(gf, entry.first);
+ }
+ }
+ for (const auto& entry : cache_tensor_map) {
+ if (entry.second != nullptr) {
+ ggml_build_forward_expand(gf, entry.second);
+ }
+ }
prepare_build_in_tensor_after(gf);
return gf;
}
@@ -1981,9 +2024,13 @@ protected:
ggml_backend_buffer_t src_buf = sd::ggml_graph_cut::tensor_buffer(src);
ggml_backend_buffer_t dst_buf = sd::ggml_graph_cut::tensor_buffer(dst);
if (src_buf == nullptr || dst_buf == nullptr) {
- LOG_ERROR("%s cache copy tensor buffer missing: name=%s src_buffer=%p src_view_src=%p src_view_src_buffer=%p dst_buffer=%p",
+ LOG_ERROR("%s cache copy tensor buffer missing: name=%s op=%s src0=%p src0_name=%s src0_buffer=%p src_buffer=%p src_view_src=%p src_view_src_buffer=%p dst_buffer=%p",
get_desc().c_str(),
src && src->name[0] != '\0' ? src->name : "",
+ src ? ggml_op_name(src->op) : "",
+ src ? src->src[0] : nullptr,
+ (src && src->src[0] && src->src[0]->name[0] != '\0') ? src->src[0]->name : "",
+ (src && src->src[0]) ? sd::ggml_graph_cut::tensor_buffer(src->src[0]) : nullptr,
src ? src->buffer : nullptr,
src ? src->view_src : nullptr,
(src && src->view_src) ? src->view_src->buffer : nullptr,
@@ -2015,6 +2062,42 @@ protected:
return true;
}
+ template <typename T>
+ std::optional<sd::Tensor<T>> read_graph_tensor(ggml_tensor* tensor, const char* label) {
+ if (tensor == nullptr) {
+ LOG_ERROR("%s %s tensor is null", get_desc().c_str(), label);
+ return std::nullopt;
+ }
+ if (tensor->type != sd::GGMLTypeTraits<T>::type) {
+ LOG_ERROR("%s %s tensor type mismatch: got %s",
+ get_desc().c_str(),
+ label,
+ ggml_type_name(tensor->type));
+ return std::nullopt;
+ }
+ ggml_backend_buffer_t buf = sd::ggml_graph_cut::tensor_buffer(tensor);
+ if (buf == nullptr) {
+ LOG_ERROR("%s %s tensor buffer missing: name=%s op=%s buffer=%p view_src=%p view_src_buffer=%p data=%p",
+ get_desc().c_str(),
+ label,
+ tensor->name[0] != '\0' ? tensor->name : "",
+ ggml_op_name(tensor->op),
+ tensor->buffer,
+ tensor->view_src,
+ tensor->view_src ? tensor->view_src->buffer : nullptr,
+ tensor->data);
+ return std::nullopt;
+ }
+
+ sd::Tensor<T> result(sd::shape_from_ggml(tensor));
+ if (tensor->view_src != nullptr || !ggml_is_contiguous(tensor) || tensor->buffer == nullptr) {
+ ggml_backend_tensor_get(tensor, result.data(), 0, ggml_nbytes(tensor));
+ } else {
+ ggml_backend_tensor_get(tensor, result.data(), 0, ggml_nbytes(tensor));
+ }
+ return result;
+ }
+
void copy_data_to_backend_tensor(ggml_cgraph* gf, bool clear_after_copy = true) {
GGML_ASSERT(gf != nullptr);
std::unordered_set<ggml_tensor*> graph_tensor_set;
@@ -2031,10 +2114,24 @@ protected:
for (auto& kv : backend_tensor_data_map) {
auto tensor = kv.first;
auto data = kv.second;
-
+ if (tensor == nullptr || data == nullptr) {
+ continue;
+ }
+ const char* name = ggml_get_name(tensor);
if (graph_tensor_set.find(tensor) == graph_tensor_set.end()) {
continue;
}
+ if (tensor->buffer == nullptr) {
+ LOG_WARN("%s skip backend tensor copy: tensor buffer not set, name='%s', ne=[%lld,%lld,%lld,%lld], type=%s",
+ get_desc().c_str(),
+ name != nullptr ? name : "",
+ (long long)tensor->ne[0],
+ (long long)tensor->ne[1],
+ (long long)tensor->ne[2],
+ (long long)tensor->ne[3],
+ ggml_type_name(tensor->type));
+ continue;
+ }
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
if (buf == nullptr) {
@@ -2421,6 +2518,43 @@ protected:
return std::nullopt;
}
+ std::unordered_set<ggml_tensor*> debug_graph_tensor_set;
+ const int n_debug_leafs = sd::ggml_graph_cut::leaf_count(gf);
+ const int n_debug_nodes = ggml_graph_n_nodes(gf);
+ debug_graph_tensor_set.reserve(static_cast<size_t>(n_debug_leafs + n_debug_nodes));
+ for (int i = 0; i < n_debug_leafs; ++i) {
+ debug_graph_tensor_set.insert(sd::ggml_graph_cut::leaf_tensor(gf, i));
+ }
+ for (int i = 0; i < n_debug_nodes; ++i) {
+ debug_graph_tensor_set.insert(ggml_graph_node(gf, i));
+ }
+
+ for (const auto& entry : debug_tensors) {
+ auto tensor = entry.first;
+ if (tensor == nullptr) {
+ continue;
+ }
+ if (debug_graph_tensor_set.find(tensor) == debug_graph_tensor_set.end()) {
+ continue;
+ }
+ ggml_backend_buffer_t tensor_buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+ if (tensor_buf == nullptr) {
+ LOG_WARN("%s skip debug tensor '%s': tensor buffer not set",
+ get_desc().c_str(),
+ entry.second.c_str());
+ continue;
+ }
+ if (tensor->type != GGML_TYPE_F32) {
+ LOG_WARN("%s skip debug tensor '%s': only GGML_TYPE_F32 is supported, got %s",
+ get_desc().c_str(),
+ entry.second.c_str(),
+ ggml_type_name(tensor->type));
+ continue;
+ }
+ auto debug_tensor = sd::make_sd_tensor_from_ggml(tensor);
+ print_sd_tensor(debug_tensor, false, entry.second.c_str());
+ }
+
int64_t t_cache_begin = ggml_time_ms();
if (!copy_cache_tensors_to_cache_buffer(cache_keep_names)) {
if (free_compute_buffer_immediately) {
@@ -2434,7 +2568,15 @@ protected:
auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str());
std::optional<sd::Tensor<float>> output;
if (!no_return) {
- output = sd::make_sd_tensor_from_ggml(result);
+ output = read_graph_tensor<float>(result, "output");
+ if (!output.has_value()) {
+ if (free_compute_buffer_immediately) {
+ free_compute_buffer();
+ } else if (use_partial_param_offload) {
+ restore_partial_params();
+ }
+ return std::nullopt;
+ }
} else {
output = sd::Tensor();
}
@@ -2557,6 +2699,13 @@ public:
runner_ctx.circular_x_enabled = circular_x_enabled;
runner_ctx.circular_y_enabled = circular_y_enabled;
runner_ctx.weight_adapter = weight_adapter;
+ runner_ctx.debug_tensors = &debug_tensors;
+ runner_ctx.get_cache_tensor = [this](const std::string& name) {
+ return this->get_cache_tensor_by_name(name);
+ };
+ runner_ctx.cache_tensor = [this](const std::string& name, ggml_tensor* tensor) {
+ this->cache(name, tensor);
+ };
return runner_ctx;
}
@@ -2676,6 +2825,9 @@ public:
}
void cache(const std::string name, ggml_tensor* tensor) {
+ if (tensor != nullptr && tensor->view_src != nullptr) {
+ tensor = ggml_cont(compute_ctx, tensor);
+ }
cache_tensor_map[name] = tensor;
}
diff --git a/src/ggml_graph_cut.cpp b/src/ggml_graph_cut.cpp
index f206f2d2..0958d888 100644
--- a/src/ggml_graph_cut.cpp
+++ b/src/ggml_graph_cut.cpp
@@ -45,6 +45,21 @@ namespace sd::ggml_graph_cut {
return params_tensor_set.find(tensor) != params_tensor_set.end();
}
+ static int graph_node_index_by_name(ggml_cgraph* gf, const char* name) {
+ GGML_ASSERT(gf != nullptr);
+ if (name == nullptr || name[0] == '\0') {
+ return -1;
+ }
+ const int n_nodes = ggml_graph_n_nodes(gf);
+ for (int i = 0; i < n_nodes; ++i) {
+ ggml_tensor* node = ggml_graph_node(gf, i);
+ if (node != nullptr && std::strcmp(node->name, name) == 0) {
+ return i;
+ }
+ }
+ return -1;
+ }
+
static Plan::InputShape input_shape(const ggml_tensor* tensor) {
Plan::InputShape shape;
if (tensor == nullptr) {
@@ -244,6 +259,11 @@ namespace sd::ggml_graph_cut {
if (tensor == nullptr) {
return nullptr;
}
+ if (tensor_buffer(tensor) == nullptr && tensor->src[0] != nullptr &&
+ ggml_nelements(tensor->src[0]) == ggml_nelements(tensor) &&
+ ggml_nbytes(tensor->src[0]) == ggml_nbytes(tensor)) {
+ return cache_source_tensor(tensor->src[0]);
+ }
return tensor->view_src ? tensor->view_src : tensor;
}
@@ -503,11 +523,15 @@ namespace sd::ggml_graph_cut {
log_desc);
}
- ggml_tensor* final_output = ggml_graph_node(gf, -1);
- if (final_output != nullptr && available_cut_output_node_indices.find(n_nodes - 1) == available_cut_output_node_indices.end()) {
+ int final_output_index = graph_node_index_by_name(gf, "ggml_runner_final_result_tensor");
+ if (final_output_index < 0) {
+ final_output_index = n_nodes - 1;
+ }
+ ggml_tensor* final_output = final_output_index >= 0 ? ggml_graph_node(gf, final_output_index) : nullptr;
+ if (final_output != nullptr && available_cut_output_node_indices.find(final_output_index) == available_cut_output_node_indices.end()) {
Segment final_segment;
final_segment.group_name = "ggml_runner.final";
- final_segment.output_node_indices.push_back(n_nodes - 1);
+ final_segment.output_node_indices.push_back(final_output_index);
build_segment(gf,
plan,
final_segment,
diff --git a/src/hidream_o1.hpp b/src/hidream_o1.hpp
new file mode 100644
index 00000000..908f2de3
--- /dev/null
+++ b/src/hidream_o1.hpp
@@ -0,0 +1,653 @@
+#ifndef __SD_HIDREAM_O1_H__
+#define __SD_HIDREAM_O1_H__
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "common_dit.hpp"
+#include "conditioner.hpp"
+#include "llm.hpp"
+#include "util.h"
+
+namespace HiDreamO1 {
+ constexpr int HIDREAM_O1_GRAPH_SIZE = 32768;
+ constexpr int PATCH_SIZE = 32;
+ constexpr int TIMESTEP_TOKEN_NUM = 1;
+ constexpr int IMAGE_TOKEN_ID = 151655;
+ constexpr int VISION_START_TOKEN_ID = 151652;
+
+ static inline std::string repeat_special_token(const std::string& token, int64_t count) {
+ std::string out;
+ out.reserve(static_cast(count) * token.size());
+ for (int64_t i = 0; i < count; ++i) {
+ out += token;
+ }
+ return out;
+ }
+
+ static inline std::pair calculate_dimensions(int max_size, double ratio) {
+ int width = static_cast(std::sqrt(max_size * max_size * ratio));
+ int height = static_cast(width / ratio);
+ width = (width / PATCH_SIZE) * PATCH_SIZE;
+ height = (height / PATCH_SIZE) * PATCH_SIZE;
+ width = std::max(width, PATCH_SIZE);
+ height = std::max(height, PATCH_SIZE);
+ return {width, height};
+ }
+
+ static inline sd::Tensor resize_to_area(const sd::Tensor& image, int image_size) {
+ int64_t width = image.shape()[0];
+ int64_t height = image.shape()[1];
+ int64_t s_max = static_cast(image_size) * image_size;
+ double scale = std::sqrt(static_cast(s_max) / static_cast(width * height));
+
+ std::vector> sizes = {
+ {(static_cast(std::llround(width * scale)) / PATCH_SIZE) * PATCH_SIZE, (static_cast(std::llround(height * scale)) / PATCH_SIZE) * PATCH_SIZE},
+ {(static_cast(std::llround(width * scale)) / PATCH_SIZE) * PATCH_SIZE, (static_cast(std::floor(height * scale)) / PATCH_SIZE) * PATCH_SIZE},
+ {(static_cast(std::floor(width * scale)) / PATCH_SIZE) * PATCH_SIZE, (static_cast(std::llround(height * scale)) / PATCH_SIZE) * PATCH_SIZE},
+ {(static_cast(std::floor(width * scale)) / PATCH_SIZE) * PATCH_SIZE, (static_cast(std::floor(height * scale)) / PATCH_SIZE) * PATCH_SIZE},
+ };
+ std::sort(sizes.begin(), sizes.end(), [](const auto& a, const auto& b) {
+ return a.first * a.second > b.first * b.second;
+ });
+
+ std::pair new_size = sizes.back();
+ for (const auto& size : sizes) {
+ if (size.first > 0 && size.second > 0 && size.first * size.second <= s_max) {
+ new_size = size;
+ break;
+ }
+ }
+
+ double s1 = static_cast(width) / static_cast(new_size.first);
+ double s2 = static_cast(height) / static_cast(new_size.second);
+ sd::Tensor resized;
+ if (s1 < s2) {
+ int64_t resized_h = static_cast(std::llround(height / s1));
+ resized = sd::ops::interpolate(image,
+ {new_size.first, resized_h, image.shape()[2], image.shape()[3]},
+ sd::ops::InterpolateMode::Bicubic);
+ int64_t top = (resized_h - new_size.second) / 2;
+ resized = sd::ops::slice(resized, 1, top, top + new_size.second);
+ } else {
+ int64_t resized_w = static_cast(std::llround(width / s2));
+ resized = sd::ops::interpolate(image,
+ {resized_w, new_size.second, image.shape()[2], image.shape()[3]},
+ sd::ops::InterpolateMode::Bicubic);
+ int64_t left = (resized_w - new_size.first) / 2;
+ resized = sd::ops::slice(resized, 0, left, left + new_size.first);
+ }
+ return resized;
+ }
+
+ static inline std::vector build_position_ids(const std::vector& input_ids,
+ const std::vector>& image_grids,
+ const std::vector& skip_vision_start_token) {
+ std::vector position_ids(4 * input_ids.size(), 0);
+ int image_index = 0;
+ int st = 0;
+ int fix_point = 4096;
+ std::vector out_t;
+ std::vector out_h;
+ std::vector out_w;
+
+ while (st < static_cast(input_ids.size())) {
+ int ed = st;
+ while (ed < static_cast(input_ids.size()) && input_ids[ed] != IMAGE_TOKEN_ID) {
+ ed++;
+ }
+
+ if (ed >= static_cast(input_ids.size())) {
+ int st_idx = out_t.empty() ? 0 : (*std::max_element(out_t.begin(), out_t.end()) + 1);
+ for (int i = 0; i < static_cast(input_ids.size()) - st; ++i) {
+ out_t.push_back(st_idx + i);
+ out_h.push_back(st_idx + i);
+ out_w.push_back(st_idx + i);
+ }
+ break;
+ }
+
+ int text_len = std::max(0, ed - st - skip_vision_start_token[image_index]);
+ int st_idx = out_t.empty() ? 0 : (*std::max_element(out_t.begin(), out_t.end()) + 1);
+ for (int i = 0; i < text_len; ++i) {
+ out_t.push_back(st_idx + i);
+ out_h.push_back(st_idx + i);
+ out_w.push_back(st_idx + i);
+ }
+
+ auto grid = image_grids[image_index];
+ int base;
+ if (skip_vision_start_token[image_index]) {
+ if (fix_point > 0) {
+ base = fix_point;
+ fix_point = 0;
+ } else {
+ base = st_idx;
+ }
+ } else {
+ base = text_len + st_idx;
+ }
+ for (int32_t ti = 0; ti < grid[0]; ++ti) {
+ for (int32_t hi = 0; hi < grid[1]; ++hi) {
+ for (int32_t wi = 0; wi < grid[2]; ++wi) {
+ out_t.push_back(base + ti);
+ out_h.push_back(base + hi);
+ out_w.push_back(base + wi);
+ }
+ }
+ }
+
+ st = ed + grid[0] * grid[1] * grid[2];
+ image_index++;
+ }
+
+ GGML_ASSERT(out_t.size() == input_ids.size());
+ for (size_t i = 0; i < input_ids.size(); ++i) {
+ // ggml IMROPE consumes 4 flattened position streams:
+ // [t, h, w, e]
+ // llama.cpp's generic Qwen-VL fallback expands text positions as
+ // [pos, pos, pos, 0]. Keep the extra stream zeroed here too.
+ position_ids[i] = out_t[i];
+ position_ids[input_ids.size() + i] = out_h[i];
+ position_ids[input_ids.size() * 2 + i] = out_w[i];
+ position_ids[input_ids.size() * 3 + i] = 0;
+ }
+ return position_ids;
+ }
+
+ struct TimestepEmbedder : public GGMLBlock {
+ int frequency_embedding_size = 256;
+
+ TimestepEmbedder(int64_t hidden_size) {
+ blocks["mlp.0"] = std::make_shared(frequency_embedding_size, hidden_size, true);
+ blocks["mlp.2"] = std::make_shared(hidden_size, hidden_size, true);
+ }
+
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* t) {
+ auto mlp_0 = std::dynamic_pointer_cast(blocks["mlp.0"]);
+ auto mlp_2 = std::dynamic_pointer_cast(blocks["mlp.2"]);
+ auto emb = ggml_ext_timestep_embedding(ctx->ggml_ctx, t, frequency_embedding_size, 10000, 1000.0f);
+ emb = mlp_0->forward(ctx, emb);
+ emb = ggml_silu_inplace(ctx->ggml_ctx, emb);
+ emb = mlp_2->forward(ctx, emb);
+ return emb;
+ }
+ };
+
+ struct BottleneckPatchEmbed : public GGMLBlock {
+ BottleneckPatchEmbed(int64_t in_dim, int64_t pca_dim, int64_t embed_dim) {
+ blocks["proj1"] = std::make_shared(in_dim, pca_dim, false);
+ blocks["proj2"] = std::make_shared(pca_dim, embed_dim, true);
+ }
+
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
+ auto proj1 = std::dynamic_pointer_cast(blocks["proj1"]);
+ auto proj2 = std::dynamic_pointer_cast(blocks["proj2"]);
+ return proj2->forward(ctx, proj1->forward(ctx, x));
+ }
+ };
+
+ struct FinalLayer : public GGMLBlock {
+ FinalLayer(int64_t hidden_size, int64_t out_dim) {
+ blocks["linear"] = std::make_shared(hidden_size, out_dim, true);
+ }
+
+ ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
+ auto linear = std::dynamic_pointer_cast(blocks["linear"]);
+ return linear->forward(ctx, x);
+ }
+ };
+
+ struct HiDreamO1Params {
+ LLM::LLMParams llm;
+ int patch_size = PATCH_SIZE;
+ };
+
+ static inline HiDreamO1Params make_hidream_o1_params() {
+ HiDreamO1Params params;
+ params.llm.arch = LLM::LLMArch::QWEN3_VL;
+ params.llm.hidden_size = 4096;
+ params.llm.intermediate_size = 12288;
+ params.llm.num_layers = 36;
+ params.llm.num_heads = 32;
+ params.llm.num_kv_heads = 8;
+ params.llm.head_dim = 128;
+ params.llm.qkv_bias = false;
+ params.llm.qk_norm = true;
+ params.llm.vocab_size = 151936;
+ params.llm.rms_norm_eps = 1e-6f;
+ params.llm.vision.arch = LLM::LLMVisionArch::QWEN3_VL;
+ params.llm.vision.num_layers = 27;
+ params.llm.vision.hidden_size = 1152;
+ params.llm.vision.intermediate_size = 4304;
+ params.llm.vision.num_heads = 16;
+ params.llm.vision.out_hidden_size = 4096;
+ params.llm.vision.patch_size = 16;
+ params.llm.vision.spatial_merge_size = 2;
+ params.llm.vision.temporal_patch_size = 2;
+ params.llm.vision.num_position_embeddings = 2304;
+ return params;
+ }
+
+ struct HiDreamO1Model : public GGMLBlock {
+ HiDreamO1Params params;
+
+ HiDreamO1Model() = default;
+ explicit HiDreamO1Model(HiDreamO1Params params)
+ : params(std::move(params)) {
+ blocks["language_model"] = std::make_shared(this->params.llm);
+ blocks["t_embedder1"] = std::make_shared(this->params.llm.hidden_size);
+ blocks["x_embedder"] = std::make_shared(this->params.patch_size * this->params.patch_size * 3,
+ this->params.llm.hidden_size / 4,
+ this->params.llm.hidden_size);
+ blocks["final_layer2"] = std::make_shared(this->params.llm.hidden_size,
+ this->params.patch_size * this->params.patch_size * 3);
+ }
+
+ std::shared_ptr text_model() {
+ return std::dynamic_pointer_cast(blocks["language_model"]);
+ }
+
+ std::shared_ptr timestep_embedder() {
+ return std::dynamic_pointer_cast(blocks["t_embedder1"]);
+ }
+
+ std::shared_ptr patch_embedder() {
+ return std::dynamic_pointer_cast(blocks["x_embedder"]);
+ }
+
+ std::shared_ptr final_layer() {
+ return std::dynamic_pointer_cast(blocks["final_layer2"]);
+ }
+ };
+
+ struct HiDreamO1VisionRunner : public GGMLRunner {
+ HiDreamO1Params params;
+ std::shared_ptr model;
+
+ std::vector window_index_vec;
+ std::vector window_inverse_index_vec;
+ std::vector window_mask_vec;
+ std::vector pe_vec;
+ std::array, 4> pos_embed_idx_data_;
+ std::array, 4> pos_embed_weight_data_;
+
+ HiDreamO1VisionRunner(ggml_backend_t backend,
+ bool offload_params_to_cpu,
+ const String2TensorStorage& tensor_storage_map = {},
+ const std::string& prefix = "model.visual")
+ : GGMLRunner(backend, offload_params_to_cpu),
+ params(make_hidream_o1_params()),
+ model(std::make_shared(false, params.llm.vision)) {
+ model->init(params_ctx, tensor_storage_map, prefix);
+ }
+
+ std::string get_desc() override {
+ return "hidream_o1_vision";
+ }
+
+ void get_param_tensors(std::map& tensors, const std::string& prefix = "model.visual") {
+ model->get_param_tensors(tensors, prefix);
+ }
+
+ ggml_tensor* encode_image(GGMLRunnerContext* runner_ctx, ggml_tensor* image) {
+ return LLM::LLMRunner::encode_image_common(this,
+ compute_ctx,
+ runner_ctx,
+ image,
+ params.llm.vision,
+ model,
+ window_index_vec,
+ window_inverse_index_vec,
+ window_mask_vec,
+ pe_vec,
+ pos_embed_idx_data_,
+ pos_embed_weight_data_);
+ }
+
+ ggml_cgraph* build_graph(const sd::Tensor& image_tensor) {
+ ggml_cgraph* gf = new_graph_custom(HIDREAM_O1_GRAPH_SIZE);
+ ggml_tensor* image = make_input(image_tensor);
+ auto runner_ctx = get_context();
+ auto image_embeds = encode_image(&runner_ctx, image);
+ ggml_build_forward_expand(gf, image_embeds);
+ return gf;
+ }
+
+ sd::Tensor compute(int n_threads, const sd::Tensor& image) {
+ auto get_graph = [&]() {
+ return build_graph(image);
+ };
+ auto output = GGMLRunner::compute(get_graph, n_threads, false);
+ return output.has_value() ? std::move(output.value()) : sd::Tensor();
+ }
+ };
+
+ struct HiDreamO1Runner : public GGMLRunner {
+ HiDreamO1Params params;
+ HiDreamO1Model model;
+
+ std::vector attention_mask_vec;
+
+ HiDreamO1Runner(ggml_backend_t backend,
+ bool offload_params_to_cpu,
+ const String2TensorStorage& tensor_storage_map = {},
+ const std::string& prefix = "model")
+ : GGMLRunner(backend, offload_params_to_cpu),
+ params(make_hidream_o1_params()) {
+ model = HiDreamO1Model(params);
+ model.init(params_ctx, tensor_storage_map, prefix);
+ }
+
+ std::string get_desc() override {
+ return "hidream_o1";
+ }
+
+ void get_param_tensors(std::map& tensors, const std::string& prefix) {
+ model.get_param_tensors(tensors, prefix);
+ }
+
+ ggml_cgraph* build_graph(const sd::Tensor& x_tensor,
+ const sd::Tensor& timestep_tensor,
+ const sd::Tensor& input_ids_tensor,
+ const sd::Tensor& input_pos_tensor,
+ const sd::Tensor& token_types_tensor,
+ const sd::Tensor& vinput_mask_tensor,
+ const std::vector>>& image_embeds_tensor,
+ const std::vector>& ref_images) {
+ ggml_cgraph* gf = new_graph_custom(HIDREAM_O1_GRAPH_SIZE);
+ ggml_tensor* x = make_input(x_tensor);
+ ggml_tensor* timestep = make_input(timestep_tensor);
+ ggml_tensor* input_ids = make_input(input_ids_tensor);
+ ggml_tensor* input_pos = make_input(input_pos_tensor);
+
+ auto text_model = model.text_model();
+ auto t_embedder1 = model.timestep_embedder();
+ auto x_embedder = model.patch_embedder();
+ auto final_layer2 = model.final_layer();
+
+ std::vector ref_image_tensors;
+ for (const auto& image : ref_images) {
+ ref_image_tensors.push_back(make_input(image));
+ }
+
+ attention_mask_vec = std::vector(static_cast(token_types_tensor.shape()[0] * token_types_tensor.shape()[0]), 0.0f);
+ int64_t total_seq_len = token_types_tensor.shape()[0];
+ for (int64_t query = 0; query < total_seq_len; ++query) {
+ bool is_gen = token_types_tensor.values()[static_cast(query)] > 0;
+ for (int64_t key = 0; key < total_seq_len; ++key) {
+ if (!is_gen && key > query) {
+ attention_mask_vec[static_cast(query * total_seq_len + key)] = -INFINITY;
+ }
+ }
+ }
+ auto attention_mask = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_F32, total_seq_len, total_seq_len);
+ set_backend_tensor_data(attention_mask, attention_mask_vec.data());
+
+ auto runner_ctx = get_context();
+ auto txt = text_model->embed(&runner_ctx, input_ids);
+ std::vector> image_embeds;
+ image_embeds.reserve(image_embeds_tensor.size());
+ for (const auto& image_embed : image_embeds_tensor) {
+ image_embeds.emplace_back(image_embed.first, make_input(image_embed.second));
+ }
+ txt = LLM::splice_image_embeds(&runner_ctx, txt, image_embeds);
+
+ auto t_emb = t_embedder1->forward(&runner_ctx, timestep);
+ int64_t txt_seq_len = input_ids->ne[0];
+ if (txt_seq_len > 1) {
+ auto prefix = ggml_ext_slice(compute_ctx, txt, 1, 0, txt_seq_len - 1);
+ txt = ggml_concat(compute_ctx, prefix, ggml_reshape_3d(compute_ctx, t_emb, t_emb->ne[0], 1, 1), 1);
+ } else {
+ txt = ggml_reshape_3d(compute_ctx, t_emb, t_emb->ne[0], 1, 1);
+ }
+
+ auto vinputs = DiT::pad_and_patchify(&runner_ctx, x, PATCH_SIZE, PATCH_SIZE);
+ int64_t target_tokens = vinputs->ne[1];
+ for (ggml_tensor* ref_image : ref_image_tensors) {
+ auto ref = DiT::pad_and_patchify(&runner_ctx, ref_image, PATCH_SIZE, PATCH_SIZE);
+ vinputs = ggml_concat(compute_ctx, vinputs, ref, 1);
+ }
+ auto vis = x_embedder->forward(&runner_ctx, vinputs);
+
+ auto inputs_embeds = ggml_concat(compute_ctx, txt, vis, 1);
+ auto hidden_states = text_model->forward_embeds(&runner_ctx, inputs_embeds, input_pos, attention_mask, {});
+ auto x_pred_all = final_layer2->forward(&runner_ctx, hidden_states);
+
+ int64_t x_pred_start = txt_seq_len;
+ if (!vinput_mask_tensor.empty()) {
+ int64_t seq_len = static_cast(vinput_mask_tensor.shape()[0]);
+ int64_t first_vinput = 0;
+ while (first_vinput < seq_len && vinput_mask_tensor.values()[static_cast(first_vinput)] == 0) {
+ first_vinput++;
+ }
+ x_pred_start = first_vinput;
+ }
+ auto x_pred = ggml_ext_slice(compute_ctx, x_pred_all, 1, x_pred_start, x_pred_start + target_tokens);
+ x_pred = DiT::unpatchify_and_crop(compute_ctx, x_pred, x->ne[1], x->ne[0], PATCH_SIZE, PATCH_SIZE);
+
+ float sigma = 1.0f - timestep_tensor.values()[0];
+ sigma = std::max(1e-6f, sigma);
+ auto out = ggml_scale(compute_ctx, ggml_sub(compute_ctx, x, x_pred), 1.0f / sigma);
+
+ ggml_build_forward_expand(gf, out);
+ return gf;
+ }
+
+ sd::Tensor compute(int n_threads,
+ const sd::Tensor& x,
+ const sd::Tensor& timestep,
+ const sd::Tensor& input_ids,
+ const sd::Tensor& input_pos,
+ const sd::Tensor& token_types,
+ const sd::Tensor& vinput_mask,
+ const std::vector>>& image_embeds,
+ const std::vector>& ref_images) {
+ auto get_graph = [&]() {
+ return build_graph(x, timestep, input_ids, input_pos, token_types, vinput_mask, image_embeds, ref_images);
+ };
+ return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim());
+ }
+ };
+
+ struct HiDreamO1Conditioner : public Conditioner {
+ Qwen2Tokenizer tokenizer;
+ std::shared_ptr vision_runner;
+
+ HiDreamO1Conditioner(ggml_backend_t backend,
+ bool offload_params_to_cpu,
+ const String2TensorStorage& tensor_storage_map = {})
+ : vision_runner(std::make_shared(backend, offload_params_to_cpu, tensor_storage_map)) {}
+
+ void get_param_tensors(std::map& tensors) override {
+ vision_runner->get_param_tensors(tensors);
+ }
+
+ void alloc_params_buffer() override {
+ vision_runner->alloc_params_buffer();
+ }
+
+ void free_params_buffer() override {
+ vision_runner->free_params_buffer();
+ }
+
+ size_t get_params_buffer_size() override {
+ return vision_runner->get_params_buffer_size();
+ }
+
+ void set_max_graph_vram_bytes(size_t max_graph_vram_bytes) override {
+ vision_runner->set_max_graph_vram_bytes(max_graph_vram_bytes);
+ }
+
+ void set_flash_attention_enabled(bool enabled) override {
+ vision_runner->set_flash_attention_enabled(enabled);
+ }
+
+ void set_weight_adapter(const std::shared_ptr& adapter) override {
+ vision_runner->set_weight_adapter(adapter);
+ }
+
+ SDCondition get_learned_condition(int n_threads,
+ const ConditionerParams& conditioner_params) override {
+ SDCondition result;
+
+ int width = conditioner_params.width;
+ int height = conditioner_params.height;
+ int64_t target_image_len = static_cast(width / PATCH_SIZE) * static_cast(height / PATCH_SIZE);
+
+ std::vector> ref_images;
+ if (conditioner_params.ref_images != nullptr) {
+ ref_images = *conditioner_params.ref_images;
+ }
+
+ std::vector>> vlm_images;
+ std::vector> image_grids;
+ std::vector skip_vision_start;
+
+ std::string prompt = "<|im_start|>user\n";
+
+ if (ref_images.empty()) {
+ prompt += conditioner_params.text;
+ prompt += "<|im_end|>\n<|im_start|>assistant\n<|boi_token|><|tms_token|>";
+ auto input_ids = tokenizer.encode(prompt, nullptr);
+
+ std::vector input_ids_pad = input_ids;
+ input_ids_pad.push_back(VISION_START_TOKEN_ID);
+ input_ids_pad.insert(input_ids_pad.end(), target_image_len - 1, IMAGE_TOKEN_ID);
+
+ image_grids.push_back({1, static_cast(height / PATCH_SIZE), static_cast(width / PATCH_SIZE)});
+ skip_vision_start.push_back(1);
+
+ std::vector token_types(input_ids_pad.size(), 0);
+ int txt_seq_len = static_cast(input_ids.size());
+ int bgn = txt_seq_len - TIMESTEP_TOKEN_NUM;
+ for (int i = bgn; i < static_cast(token_types.size()); ++i) {
+ token_types[i] = 1;
+ }
+
+ auto position_ids = build_position_ids(input_ids_pad, image_grids, skip_vision_start);
+
+ std::vector input_shape{static_cast(input_ids.size())};
+ std::vector position_shape{static_cast(input_ids_pad.size() * 4)};
+ std::vector token_type_shape{static_cast(token_types.size())};
+ std::vector