feat: add hidream o1 image support (#1485)

leejet 2026-05-15 00:40:21 +08:00 committed by GitHub
parent eeac950b44
commit 0665a7f8bf
20 changed files with 1703 additions and 334 deletions


@@ -58,6 +58,7 @@ API and command-line option may change frequently.***
- [Ovis-Image](./docs/ovis_image.md)
- [Anima](./docs/anima.md)
- [ERNIE-Image](./docs/ernie_image.md)
- [HiDream-O1-Image](./docs/hidream_o1_image.md)
- Image Edit Models
- [FLUX.1-Kontext-dev](./docs/kontext.md)
- [Qwen Image Edit series](./docs/qwen_image_edit.md)
@@ -148,6 +149,7 @@ If you want to improve performance or reduce VRAM/RAM usage, please refer to [pe
- [Ovis-Image](./docs/ovis_image.md)
- [Anima](./docs/anima.md)
- [ERNIE-Image](./docs/ernie_image.md)
- [HiDream-O1-Image](./docs/hidream_o1_image.md)
- [LoRA](./docs/lora.md)
- [LCM/LCM-LoRA](./docs/lcm.md)
- [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)

assets/hidream-o1/dev_example.png (binary, 2.2 MiB)

docs/hidream_o1_image.md (new file, 20 lines)

@@ -0,0 +1,20 @@
# How to Use
## Download weights
- Download HiDream-O1-Image-Dev
- safetensors: https://huggingface.co/Comfy-Org/HiDream-O1-Image/tree/main/checkpoints
- Download HiDream-O1-Image
- safetensors: https://huggingface.co/Comfy-Org/HiDream-O1-Image/tree/main/checkpoints
## Examples
### HiDream-O1-Image-Dev
```
.\bin\Release\sd-cli.exe -m ..\..\ComfyUI\models\diffusion_models\hidream_o1_image_dev_bf16.safetensors -p "a lovely cat holding a sign says 'hidream o1 cpp'" --cfg-scale 1.0 -v -H 1024 -W 1024
```
<img width="256" alt="HiDream-O1-Image-Dev example" src="../assets/hidream-o1/dev_example.png" />


@@ -103,6 +103,8 @@ Generation Options:
--hires-upscaler <string> highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent
                          (nearest-exact), Latent (antialiased), Latent (bicubic), Latent (bicubic
                          antialiased), or a model name under --hires-upscalers-dir (default: Latent)
--extra-sample-args <string> extra sampler args, key=value list. Currently lcm supports noise_clip_std,
                          noise_scale_start, noise_scale_end
-H, --height <int> image height, in pixel space (default: 512)
-W, --width <int> image width, in pixel space (default: 512)
--steps <int> number of sample steps (default: 20)
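
For example, the LCM noise schedule could be tweaked like this (a hypothetical invocation; the model path and values are placeholders, and the sampler is assumed to be selected via `--sampling-method lcm`):
```
sd-cli -m model.safetensors -p "a lovely cat" --sampling-method lcm --steps 8 --extra-sample-args "noise_clip_std=2.5,noise_scale_start=1.0,noise_scale_end=0.5"
```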


@@ -807,6 +807,10 @@ ArgOptions SDGenerationParams::get_options() {
"Latent (antialiased), Latent (bicubic), Latent (bicubic antialiased), or a model name "
"under --hires-upscalers-dir (default: Latent)",
&hires_upscaler},
{"",
"--extra-sample-args",
"extra sampler args, key=value list. Currently lcm supports noise_clip_std, noise_scale_start, noise_scale_end",
&extra_sample_args},
};
options.int_options = {
@@ -1607,6 +1611,7 @@ bool SDGenerationParams::from_json_str(
auto parse_sample_params_json = [&](const json& sample_json,
sd_sample_params_t& target_params,
std::string& target_extra_sample_args,
std::vector<int>& target_skip_layers,
std::vector<float>* target_custom_sigmas) {
if (sample_json.contains("sample_steps") && sample_json["sample_steps"].is_number_integer()) {
@@ -1621,6 +1626,9 @@ bool SDGenerationParams::from_json_str(
if (sample_json.contains("flow_shift") && sample_json["flow_shift"].is_number()) {
target_params.flow_shift = sample_json["flow_shift"];
}
if (sample_json.contains("extra_sample_args") && sample_json["extra_sample_args"].is_string()) {
target_extra_sample_args = sample_json["extra_sample_args"].get<std::string>();
}
if (target_custom_sigmas != nullptr &&
sample_json.contains("custom_sigmas") &&
sample_json["custom_sigmas"].is_array()) {
@@ -1668,11 +1676,12 @@ bool SDGenerationParams::from_json_str(
};
if (j.contains("sample_params") && j["sample_params"].is_object()) {
parse_sample_params_json(j["sample_params"], sample_params, extra_sample_args, skip_layers, &custom_sigmas);
}
if (j.contains("high_noise_sample_params") && j["high_noise_sample_params"].is_object()) {
parse_sample_params_json(j["high_noise_sample_params"],
high_noise_sample_params,
high_noise_extra_sample_args,
high_noise_skip_layers,
nullptr);
}
@@ -2099,6 +2108,8 @@ sd_img_gen_params_t SDGenerationParams::to_sd_img_gen_params_t() {
high_noise_sample_params.guidance.slg.layer_count = high_noise_skip_layers.size();
sample_params.custom_sigmas = custom_sigmas.empty() ? nullptr : custom_sigmas.data();
sample_params.custom_sigmas_count = static_cast<int>(custom_sigmas.size());
sample_params.extra_sample_args = extra_sample_args.empty() ? nullptr : extra_sample_args.c_str();
high_noise_sample_params.extra_sample_args = high_noise_extra_sample_args.empty() ? nullptr : high_noise_extra_sample_args.c_str();
cache_params.scm_mask = scm_mask.empty() ? nullptr : scm_mask.c_str();
sd_pm_params_t pm_params = {
@@ -2168,6 +2179,8 @@ sd_vid_gen_params_t SDGenerationParams::to_sd_vid_gen_params_t() {
high_noise_sample_params.guidance.slg.layer_count = high_noise_skip_layers.size();
sample_params.custom_sigmas = custom_sigmas.empty() ? nullptr : custom_sigmas.data();
sample_params.custom_sigmas_count = static_cast<int>(custom_sigmas.size());
sample_params.extra_sample_args = extra_sample_args.empty() ? nullptr : extra_sample_args.c_str();
high_noise_sample_params.extra_sample_args = high_noise_extra_sample_args.empty() ? nullptr : high_noise_extra_sample_args.c_str();
cache_params.scm_mask = scm_mask.empty() ? nullptr : scm_mask.c_str();
params.loras = lora_vec.empty() ? nullptr : lora_vec.data();
@@ -2306,6 +2319,7 @@ static json build_sampling_metadata_json(const sd_sample_params_t& sample_params
{"eta", sample_params.eta},
{"shifted_timestep", sample_params.shifted_timestep},
{"flow_shift", sample_params.flow_shift},
{"extra_sample_args", safe_json_string(sample_params.extra_sample_args)},
{"guidance",
{
{"txt_cfg", sample_params.guidance.txt_cfg},
@@ -2497,6 +2511,9 @@ std::string get_image_params(const SDContextParams& ctx_params,
}
parameter_string += "Guidance: " + std::to_string(gen_params.sample_params.guidance.distilled_guidance) + ", ";
parameter_string += "Eta: " + std::to_string(gen_params.sample_params.eta) + ", ";
if (!gen_params.extra_sample_args.empty()) {
parameter_string += "Extra sample args: " + gen_params.extra_sample_args + ", ";
}
parameter_string += "Seed: " + std::to_string(seed) + ", ";
parameter_string += "Size: " + std::to_string(gen_params.get_resolved_width()) + "x" + std::to_string(gen_params.get_resolved_height()) + ", ";
parameter_string += "Model: " + sd_basename(ctx_params.model_path) + ", ";


@@ -168,6 +168,8 @@ struct SDGenerationParams {
sd_sample_params_t sample_params;
sd_sample_params_t high_noise_sample_params;
std::string extra_sample_args;
std::string high_noise_extra_sample_args;
std::vector<int> skip_layers = {7, 8, 9};
std::vector<int> high_noise_skip_layers = {7, 8, 9};


@@ -205,6 +205,8 @@ Default Generation Options:
--hires-upscaler <string> highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent
                          (nearest-exact), Latent (antialiased), Latent (bicubic), Latent (bicubic
                          antialiased), or a model name under --hires-upscalers-dir (default: Latent)
--extra-sample-args <string> extra sampler args, key=value list. Currently lcm supports noise_clip_std,
                          noise_scale_start, noise_scale_end
-H, --height <int> image height, in pixel space (default: 512)
-W, --width <int> image width, in pixel space (default: 512)
--steps <int> number of sample steps (default: 20)


@@ -240,6 +240,7 @@ typedef struct {
float* custom_sigmas;
int custom_sigmas_count;
float flow_shift;
const char* extra_sample_args;
} sd_sample_params_t;
typedef struct {


@@ -14,6 +14,12 @@ struct SDCondition {
sd::Tensor<float> c_concat;
sd::Tensor<int32_t> c_t5_ids;
sd::Tensor<float> c_t5_weights;
sd::Tensor<int32_t> c_input_ids;
sd::Tensor<int32_t> c_position_ids;
sd::Tensor<int32_t> c_token_types;
sd::Tensor<int32_t> c_vinput_mask;
std::vector<std::pair<int, sd::Tensor<float>>> c_image_embeds;
std::vector<sd::Tensor<float>> c_ref_images;
std::vector<sd::Tensor<float>> extra_c_crossattns;
@@ -26,10 +32,24 @@ struct SDCondition {
bool empty() const {
if (!c_crossattn.empty() || !c_vector.empty() || !c_concat.empty() ||
!c_t5_ids.empty() || !c_t5_weights.empty() ||
!c_input_ids.empty() || !c_position_ids.empty() ||
!c_token_types.empty() || !c_vinput_mask.empty()) {
return false;
}
for (const auto& image_embed : c_image_embeds) {
if (!image_embed.second.empty()) {
return false;
}
}
for (const auto& tensor : c_ref_images) {
if (!tensor.empty()) {
return false;
}
}
for (const auto& tensor : extra_c_crossattns) {
if (!tensor.empty()) {
return false;


@@ -2,6 +2,7 @@
#define __DENOISER_HPP__
#include <cmath>
#include <string>
#include <utility>
#include "ggml_extend.hpp"
@@ -1148,7 +1149,80 @@ static sd::Tensor<float> sample_lcm(denoise_cb_t model,
sd::Tensor<float> x,
const std::vector<float>& sigmas,
std::shared_ptr<RNG> rng,
bool is_flow_denoiser,
const char* extra_sample_args = nullptr) {
struct LCMSampleArgs {
float noise_clip_std = 0.0f;
float noise_scale_start = 1.0f;
float noise_scale_end = 1.0f;
};
auto trim = [](std::string value) -> std::string {
const char* whitespace = " \t\r\n";
size_t begin = value.find_first_not_of(whitespace);
if (begin == std::string::npos) {
return "";
}
size_t end = value.find_last_not_of(whitespace);
return value.substr(begin, end - begin + 1);
};
LCMSampleArgs args;
if (extra_sample_args != nullptr && extra_sample_args[0] != '\0') {
std::string raw(extra_sample_args);
size_t start = 0;
bool noise_scale_end_was_set = false;
bool noise_scale_start_was_set = false;
auto parse_arg = [&](const std::string& item) {
std::string token = trim(item);
if (token.empty()) {
return;
}
size_t eq = token.find('=');
if (eq == std::string::npos) {
LOG_WARN("ignoring invalid lcm extra sample arg '%s'", token.c_str());
return;
}
std::string key = trim(token.substr(0, eq));
std::string value = trim(token.substr(eq + 1));
float parsed = 0.0f;
try {
size_t consumed = 0;
parsed = std::stof(value, &consumed);
if (trim(value.substr(consumed)).size() != 0) {
LOG_WARN("ignoring invalid lcm extra sample arg '%s'", token.c_str());
return;
}
} catch (const std::exception&) {
LOG_WARN("ignoring invalid lcm extra sample arg '%s'", token.c_str());
return;
}
if (key == "noise_clip_std") {
args.noise_clip_std = parsed;
} else if (key == "noise_scale_start") {
args.noise_scale_start = parsed;
noise_scale_start_was_set = true;
} else if (key == "noise_scale_end") {
args.noise_scale_end = parsed;
noise_scale_end_was_set = true;
} else {
LOG_WARN("ignoring unknown lcm extra sample arg '%s'", key.c_str());
}
};
for (size_t pos = 0; pos <= raw.size(); ++pos) {
if (pos == raw.size() || raw[pos] == ',' || raw[pos] == ';') {
parse_arg(raw.substr(start, pos - start));
start = pos + 1;
}
}
if (noise_scale_start_was_set && !noise_scale_end_was_set) {
args.noise_scale_end = args.noise_scale_start;
}
}
int steps = static_cast<int>(sigmas.size()) - 1;
for (int i = 0; i < steps; i++) {
auto denoised_opt = model(x, sigmas[i], i + 1, nullptr);
@@ -1160,7 +1234,27 @@ static sd::Tensor<float> sample_lcm(denoise_cb_t model,
if (is_flow_denoiser) {
x *= (1 - sigmas[i + 1]);
}
auto noise = sd::Tensor<float>::randn_like(x, rng);
if (args.noise_clip_std > 0.0f && noise.numel() > 0) {
double mean = 0.0;
for (int64_t j = 0; j < noise.numel(); ++j) {
mean += static_cast<double>(noise[j]);
}
mean /= static_cast<double>(noise.numel());
double variance = 0.0;
for (int64_t j = 0; j < noise.numel(); ++j) {
double centered = static_cast<double>(noise[j]) - mean;
variance += centered * centered;
}
variance /= static_cast<double>(noise.numel());
float clip_val = args.noise_clip_std * static_cast<float>(std::sqrt(variance));
noise = sd::ops::clamp(noise, -clip_val, clip_val);
}
float t = steps > 1 ? static_cast<float>(i) / static_cast<float>(steps - 1) : 0.0f;
float noise_scale = args.noise_scale_start + (args.noise_scale_end - args.noise_scale_start) * t;
x += noise * (sigmas[i + 1] * noise_scale);
}
}
return x;
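
To make the two schedule knobs concrete, here is a standalone sketch (illustrative values only) of the linear noise-scale ramp that the loop above applies between `noise_scale_start` and `noise_scale_end`:
```
// Illustrative: the same linear ramp sample_lcm computes per step.
#include <cstdio>

int main() {
    const float noise_scale_start = 1.0f;
    const float noise_scale_end = 0.5f;
    const int steps = 4;
    for (int i = 0; i < steps; i++) {
        float t = steps > 1 ? static_cast<float>(i) / static_cast<float>(steps - 1) : 0.0f;
        float noise_scale = noise_scale_start + (noise_scale_end - noise_scale_start) * t;
        printf("step %d: noise_scale = %.3f\n", i, noise_scale); // 1.000, 0.833, 0.667, 0.500
    }
    return 0;
}
```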
@@ -1663,7 +1757,7 @@ static sd::Tensor<float> sample_euler_cfg_pp(denoise_cb_t model,
}
sd::Tensor<float> denoised = std::move(denoised_opt);
sd::Tensor<float> d = (x - uncond_denoised) / sigma;
x = denoised + d * sigmas[i + 1];
}
@@ -1686,7 +1780,7 @@ static sd::Tensor<float> sample_euler_ancestral_cfg_pp(denoise_cb_t model,
}
sd::Tensor<float> denoised = std::move(denoised_opt);
sd::Tensor<float> d = (x - uncond_denoised) / sigma;
auto [sigma_down, sigma_up] = get_ancestral_step(sigmas[i], sigmas[i + 1], eta);
@@ -1706,7 +1800,8 @@ static sd::Tensor<float> sample_k_diffusion(sample_method_t method,
std::vector<float> sigmas,
std::shared_ptr<RNG> rng,
float eta,
bool is_flow_denoiser,
const char* extra_sample_args) {
switch (method) {
case EULER_A_SAMPLE_METHOD:
if (is_flow_denoiser)
@@ -1729,7 +1824,7 @@ static sd::Tensor<float> sample_k_diffusion(sample_method_t method,
case DPMPP2Mv2_SAMPLE_METHOD:
return sample_dpmpp_2m_v2(model, std::move(x), sigmas);
case LCM_SAMPLE_METHOD:
return sample_lcm(model, std::move(x), sigmas, rng, is_flow_denoiser, extra_sample_args);
case IPNDM_SAMPLE_METHOD:
return sample_ipndm(model, std::move(x), sigmas);
case IPNDM_V_SAMPLE_METHOD:


@@ -5,6 +5,7 @@
#include "anima.hpp"
#include "ernie_image.hpp"
#include "flux.hpp"
#include "hidream_o1.hpp"
#include "mmdit.hpp"
#include "qwen_image.hpp"
#include "tensor_ggml.hpp"
@@ -13,22 +14,28 @@
#include "z_image.hpp"
struct DiffusionParams {
const sd::Tensor<float>* x = nullptr;
const sd::Tensor<float>* timesteps = nullptr;
const sd::Tensor<float>* context = nullptr;
const sd::Tensor<float>* c_concat = nullptr;
const sd::Tensor<float>* y = nullptr;
const sd::Tensor<int32_t>* t5_ids = nullptr;
const sd::Tensor<float>* t5_weights = nullptr;
const sd::Tensor<float>* guidance = nullptr;
const std::vector<sd::Tensor<float>>* ref_latents = nullptr;
const sd::Tensor<int32_t>* input_ids = nullptr;
const sd::Tensor<int32_t>* input_pos = nullptr;
const sd::Tensor<int32_t>* token_types = nullptr;
const sd::Tensor<int32_t>* vinput_mask = nullptr;
const std::vector<sd::Tensor<float>>* vlm_images = nullptr;
const std::vector<std::pair<int, sd::Tensor<float>>>* image_embeds = nullptr;
bool increase_ref_index = false;
int num_video_frames = -1;
const std::vector<sd::Tensor<float>>* controls = nullptr;
float control_strength = 0.f;
const sd::Tensor<float>* vace_context = nullptr;
float vace_strength = 1.f;
const std::vector<int>* skip_layers = nullptr;
};
template <typename T>
@@ -476,6 +483,82 @@ struct QwenImageModel : public DiffusionModel {
}
};
struct HiDreamO1Model : public DiffusionModel {
std::string prefix;
HiDreamO1::HiDreamO1Runner hidream_o1;
HiDreamO1Model(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
const std::string& prefix = "model")
: prefix(prefix), hidream_o1(backend, offload_params_to_cpu, tensor_storage_map, prefix) {
}
std::string get_desc() override {
return hidream_o1.get_desc();
}
void alloc_params_buffer() override {
hidream_o1.alloc_params_buffer();
}
void free_params_buffer() override {
hidream_o1.free_params_buffer();
}
void free_compute_buffer() override {
hidream_o1.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
hidream_o1.get_param_tensors(tensors, prefix);
}
size_t get_params_buffer_size() override {
return hidream_o1.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
hidream_o1.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return 0;
}
void set_flash_attention_enabled(bool enabled) {
hidream_o1.set_flash_attention_enabled(enabled);
}
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
hidream_o1.set_max_graph_vram_bytes(max_vram_bytes);
}
void set_circular_axes(bool circular_x, bool circular_y) override {
hidream_o1.set_circular_axes(circular_x, circular_y);
}
sd::Tensor<float> compute(int n_threads,
const DiffusionParams& diffusion_params) override {
GGML_ASSERT(diffusion_params.x != nullptr);
GGML_ASSERT(diffusion_params.timesteps != nullptr);
GGML_ASSERT(diffusion_params.input_ids != nullptr);
GGML_ASSERT(diffusion_params.input_pos != nullptr);
GGML_ASSERT(diffusion_params.token_types != nullptr);
static const std::vector<sd::Tensor<float>> empty_images;
static const std::vector<std::pair<int, sd::Tensor<float>>> empty_image_embeds;
return hidream_o1.compute(n_threads,
*diffusion_params.x,
*diffusion_params.timesteps,
*diffusion_params.input_ids,
*diffusion_params.input_pos,
*diffusion_params.token_types,
tensor_or_empty(diffusion_params.vinput_mask),
diffusion_params.image_embeds ? *diffusion_params.image_embeds : empty_image_embeds,
diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_images);
}
};
struct ZImageModel : public DiffusionModel {
std::string prefix;
ZImage::ZImageRunner z_image;


@@ -280,6 +280,9 @@ __STATIC_INLINE__ void print_sd_tensor(const sd::Tensor<T>& tensor, bool shape_o
if (shape_only) {
return;
}
if (tensor.empty()) {
return;
}
int range = 3;
std::vector<int64_t> shape = tensor.shape();
while (shape.size() < 4) {
@@ -1698,13 +1701,41 @@ struct WeightAdapter {
};
struct GGMLRunnerContext {
ggml_backend_t backend = nullptr;
ggml_context* ggml_ctx = nullptr;
bool flash_attn_enabled = false;
bool conv2d_direct_enabled = false;
bool circular_x_enabled = false;
bool circular_y_enabled = false;
std::shared_ptr<WeightAdapter> weight_adapter = nullptr;
std::vector<std::pair<ggml_tensor*, std::string>>* debug_tensors = nullptr;
std::function<ggml_tensor*(const std::string&)> get_cache_tensor;
std::function<void(const std::string&, ggml_tensor*)> cache_tensor;
void capture_tensor(const std::string& name, ggml_tensor* tensor) {
if (debug_tensors == nullptr || tensor == nullptr) {
return;
}
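// Copy the node into a fresh tensor flagged as a graph output so its data
// survives scheduling; the snapshot is printed once the graph has run.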
ggml_tensor* snapshot = ggml_cont(ggml_ctx, tensor);
ggml_tensor* dst = ggml_dup_tensor(ggml_ctx, snapshot);
snapshot = ggml_cpy(ggml_ctx, snapshot, dst);
ggml_set_output(snapshot);
debug_tensors->push_back({snapshot, name});
}
ggml_tensor* load_cache_tensor(const std::string& name) const {
if (!get_cache_tensor) {
return nullptr;
}
return get_cache_tensor(name);
}
void persist_cache_tensor(const std::string& name, ggml_tensor* tensor) const {
if (!cache_tensor || tensor == nullptr) {
return;
}
cache_tensor(name, tensor);
}
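// Hypothetical usage from inside a block's forward() (names illustrative):
//   ggml_tensor* freqs = ctx->load_cache_tensor("rope_freqs");
//   if (freqs == nullptr) {
//       freqs = build_rope_freqs(ctx->ggml_ctx);        // hypothetical helper
//       ctx->persist_cache_tensor("rope_freqs", freqs); // reused by the next graph
//   }
//   ctx->capture_tensor("rope_freqs", freqs);           // debug print after compute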
};
struct GGMLRunner {
@@ -1743,6 +1774,7 @@ protected:
std::map<ggml_tensor*, const void*> backend_tensor_data_map;
std::map<std::string, ggml_tensor*> cache_tensor_map; // name -> tensor
std::vector<std::pair<ggml_tensor*, std::string>> debug_tensors;
const std::string final_result_name = "ggml_runner_final_result_tensor";
bool flash_attn_enabled = false;
@@ -1838,6 +1870,7 @@ protected:
}
void free_compute_ctx() {
debug_tensors.clear();
if (compute_ctx != nullptr) {
ggml_free(compute_ctx);
compute_ctx = nullptr;
@@ -1884,6 +1917,16 @@ protected:
auto result = ggml_graph_node(gf, -1);
ggml_set_name(result, final_result_name.c_str());
}
for (const auto& entry : debug_tensors) {
if (entry.first != nullptr) {
ggml_build_forward_expand(gf, entry.first);
}
}
for (const auto& entry : cache_tensor_map) {
if (entry.second != nullptr) {
ggml_build_forward_expand(gf, entry.second);
}
}
prepare_build_in_tensor_after(gf);
return gf;
}
@@ -1981,9 +2024,13 @@ protected:
ggml_backend_buffer_t src_buf = sd::ggml_graph_cut::tensor_buffer(src);
ggml_backend_buffer_t dst_buf = sd::ggml_graph_cut::tensor_buffer(dst);
if (src_buf == nullptr || dst_buf == nullptr) {
LOG_ERROR("%s cache copy tensor buffer missing: name=%s src_buffer=%p src_view_src=%p src_view_src_buffer=%p dst_buffer=%p", LOG_ERROR("%s cache copy tensor buffer missing: name=%s op=%s src0=%p src0_name=%s src0_buffer=%p src_buffer=%p src_view_src=%p src_view_src_buffer=%p dst_buffer=%p",
get_desc().c_str(), get_desc().c_str(),
src && src->name[0] != '\0' ? src->name : "<unnamed>", src && src->name[0] != '\0' ? src->name : "<unnamed>",
src ? ggml_op_name(src->op) : "<null>",
src ? src->src[0] : nullptr,
(src && src->src[0] && src->src[0]->name[0] != '\0') ? src->src[0]->name : "<unnamed>",
(src && src->src[0]) ? sd::ggml_graph_cut::tensor_buffer(src->src[0]) : nullptr,
src ? src->buffer : nullptr,
src ? src->view_src : nullptr,
(src && src->view_src) ? src->view_src->buffer : nullptr,
@@ -2015,6 +2062,42 @@ protected:
return true;
}
template <typename T>
std::optional<sd::Tensor<T>> read_graph_tensor(ggml_tensor* tensor, const char* label) {
if (tensor == nullptr) {
LOG_ERROR("%s %s tensor is null", get_desc().c_str(), label);
return std::nullopt;
}
if (tensor->type != sd::GGMLTypeTraits<T>::type) {
LOG_ERROR("%s %s tensor type mismatch: got %s",
get_desc().c_str(),
label,
ggml_type_name(tensor->type));
return std::nullopt;
}
ggml_backend_buffer_t buf = sd::ggml_graph_cut::tensor_buffer(tensor);
if (buf == nullptr) {
LOG_ERROR("%s %s tensor buffer missing: name=%s op=%s buffer=%p view_src=%p view_src_buffer=%p data=%p",
get_desc().c_str(),
label,
tensor->name[0] != '\0' ? tensor->name : "<unnamed>",
ggml_op_name(tensor->op),
tensor->buffer,
tensor->view_src,
tensor->view_src ? tensor->view_src->buffer : nullptr,
tensor->data);
return std::nullopt;
}
sd::Tensor<T> result(sd::shape_from_ggml(tensor));
ggml_backend_tensor_get(tensor, result.data(), 0, ggml_nbytes(tensor));
return result;
}
void copy_data_to_backend_tensor(ggml_cgraph* gf, bool clear_after_copy = true) {
GGML_ASSERT(gf != nullptr);
std::unordered_set<const ggml_tensor*> graph_tensor_set;
@@ -2031,10 +2114,24 @@ protected:
for (auto& kv : backend_tensor_data_map) {
auto tensor = kv.first;
auto data = kv.second;
if (tensor == nullptr || data == nullptr) {
continue;
}
const char* name = ggml_get_name(tensor);
if (graph_tensor_set.find(tensor) == graph_tensor_set.end()) {
continue;
}
if (tensor->buffer == nullptr) {
LOG_WARN("%s skip backend tensor copy: tensor buffer not set, name='%s', ne=[%lld,%lld,%lld,%lld], type=%s",
get_desc().c_str(),
name != nullptr ? name : "",
(long long)tensor->ne[0],
(long long)tensor->ne[1],
(long long)tensor->ne[2],
(long long)tensor->ne[3],
ggml_type_name(tensor->type));
continue;
}
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
if (buf == nullptr) {
@@ -2421,6 +2518,43 @@ protected:
return std::nullopt;
}
std::unordered_set<const ggml_tensor*> debug_graph_tensor_set;
const int n_debug_leafs = sd::ggml_graph_cut::leaf_count(gf);
const int n_debug_nodes = ggml_graph_n_nodes(gf);
debug_graph_tensor_set.reserve(static_cast<size_t>(n_debug_leafs + n_debug_nodes));
for (int i = 0; i < n_debug_leafs; ++i) {
debug_graph_tensor_set.insert(sd::ggml_graph_cut::leaf_tensor(gf, i));
}
for (int i = 0; i < n_debug_nodes; ++i) {
debug_graph_tensor_set.insert(ggml_graph_node(gf, i));
}
for (const auto& entry : debug_tensors) {
auto tensor = entry.first;
if (tensor == nullptr) {
continue;
}
if (debug_graph_tensor_set.find(tensor) == debug_graph_tensor_set.end()) {
continue;
}
ggml_backend_buffer_t tensor_buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
if (tensor_buf == nullptr) {
LOG_WARN("%s skip debug tensor '%s': tensor buffer not set",
get_desc().c_str(),
entry.second.c_str());
continue;
}
if (tensor->type != GGML_TYPE_F32) {
LOG_WARN("%s skip debug tensor '%s': only GGML_TYPE_F32 is supported, got %s",
get_desc().c_str(),
entry.second.c_str(),
ggml_type_name(tensor->type));
continue;
}
auto debug_tensor = sd::make_sd_tensor_from_ggml<float>(tensor);
print_sd_tensor(debug_tensor, false, entry.second.c_str());
}
int64_t t_cache_begin = ggml_time_ms();
if (!copy_cache_tensors_to_cache_buffer(cache_keep_names)) {
if (free_compute_buffer_immediately) {
@@ -2434,7 +2568,15 @@ protected:
auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str());
std::optional<sd::Tensor<T>> output;
if (!no_return) {
output = read_graph_tensor<T>(result, "output");
if (!output.has_value()) {
if (free_compute_buffer_immediately) {
free_compute_buffer();
} else if (use_partial_param_offload) {
restore_partial_params();
}
return std::nullopt;
}
} else {
output = sd::Tensor<T>();
}
@@ -2557,6 +2699,13 @@ public:
runner_ctx.circular_x_enabled = circular_x_enabled;
runner_ctx.circular_y_enabled = circular_y_enabled;
runner_ctx.weight_adapter = weight_adapter;
runner_ctx.debug_tensors = &debug_tensors;
runner_ctx.get_cache_tensor = [this](const std::string& name) {
return this->get_cache_tensor_by_name(name);
};
runner_ctx.cache_tensor = [this](const std::string& name, ggml_tensor* tensor) {
this->cache(name, tensor);
};
return runner_ctx;
}
@@ -2676,6 +2825,9 @@ public:
}
void cache(const std::string name, ggml_tensor* tensor) {
if (tensor != nullptr && tensor->view_src != nullptr) {
tensor = ggml_cont(compute_ctx, tensor);
}
cache_tensor_map[name] = tensor;
}


@@ -45,6 +45,21 @@ namespace sd::ggml_graph_cut {
return params_tensor_set.find(tensor) != params_tensor_set.end();
}
static int graph_node_index_by_name(ggml_cgraph* gf, const char* name) {
GGML_ASSERT(gf != nullptr);
if (name == nullptr || name[0] == '\0') {
return -1;
}
const int n_nodes = ggml_graph_n_nodes(gf);
for (int i = 0; i < n_nodes; ++i) {
ggml_tensor* node = ggml_graph_node(gf, i);
if (node != nullptr && std::strcmp(node->name, name) == 0) {
return i;
}
}
return -1;
}
static Plan::InputShape input_shape(const ggml_tensor* tensor) {
Plan::InputShape shape;
if (tensor == nullptr) {
@@ -244,6 +259,11 @@ namespace sd::ggml_graph_cut {
if (tensor == nullptr) {
return nullptr;
}
if (tensor_buffer(tensor) == nullptr && tensor->src[0] != nullptr &&
ggml_nelements(tensor->src[0]) == ggml_nelements(tensor) &&
ggml_nbytes(tensor->src[0]) == ggml_nbytes(tensor)) {
return cache_source_tensor(tensor->src[0]);
}
return tensor->view_src ? tensor->view_src : tensor;
}
@@ -503,11 +523,15 @@ namespace sd::ggml_graph_cut {
log_desc);
}
int final_output_index = graph_node_index_by_name(gf, "ggml_runner_final_result_tensor");
if (final_output_index < 0) {
final_output_index = n_nodes - 1;
}
ggml_tensor* final_output = final_output_index >= 0 ? ggml_graph_node(gf, final_output_index) : nullptr;
if (final_output != nullptr && available_cut_output_node_indices.find(final_output_index) == available_cut_output_node_indices.end()) {
Segment final_segment;
final_segment.group_name = "ggml_runner.final";
final_segment.output_node_indices.push_back(final_output_index);
build_segment(gf,
plan,
final_segment,

src/hidream_o1.hpp (new file, 653 lines)

@@ -0,0 +1,653 @@
#ifndef __SD_HIDREAM_O1_H__
#define __SD_HIDREAM_O1_H__
#include <algorithm>
#include <array>
#include <cmath>
#include <cstring>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "common_dit.hpp"
#include "conditioner.hpp"
#include "llm.hpp"
#include "util.h"
namespace HiDreamO1 {
constexpr int HIDREAM_O1_GRAPH_SIZE = 32768;
constexpr int PATCH_SIZE = 32;
constexpr int TIMESTEP_TOKEN_NUM = 1;
constexpr int IMAGE_TOKEN_ID = 151655;
constexpr int VISION_START_TOKEN_ID = 151652;
static inline std::string repeat_special_token(const std::string& token, int64_t count) {
std::string out;
out.reserve(static_cast<size_t>(count) * token.size());
for (int64_t i = 0; i < count; ++i) {
out += token;
}
return out;
}
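// e.g. max_size = 384, ratio = 1.5: width = sqrt(384 * 384 * 1.5) ~ 470 -> 448,
// height = 470 / 1.5 ~ 313 -> 288 (both snapped down to PATCH_SIZE multiples).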
static inline std::pair<int, int> calculate_dimensions(int max_size, double ratio) {
int width = static_cast<int>(std::sqrt(max_size * max_size * ratio));
int height = static_cast<int>(width / ratio);
width = (width / PATCH_SIZE) * PATCH_SIZE;
height = (height / PATCH_SIZE) * PATCH_SIZE;
width = std::max(width, PATCH_SIZE);
height = std::max(height, PATCH_SIZE);
return {width, height};
}
static inline sd::Tensor<float> resize_to_area(const sd::Tensor<float>& image, int image_size) {
int64_t width = image.shape()[0];
int64_t height = image.shape()[1];
int64_t s_max = static_cast<int64_t>(image_size) * image_size;
double scale = std::sqrt(static_cast<double>(s_max) / static_cast<double>(width * height));
std::vector<std::pair<int64_t, int64_t>> sizes = {
{(static_cast<int64_t>(std::llround(width * scale)) / PATCH_SIZE) * PATCH_SIZE, (static_cast<int64_t>(std::llround(height * scale)) / PATCH_SIZE) * PATCH_SIZE},
{(static_cast<int64_t>(std::llround(width * scale)) / PATCH_SIZE) * PATCH_SIZE, (static_cast<int64_t>(std::floor(height * scale)) / PATCH_SIZE) * PATCH_SIZE},
{(static_cast<int64_t>(std::floor(width * scale)) / PATCH_SIZE) * PATCH_SIZE, (static_cast<int64_t>(std::llround(height * scale)) / PATCH_SIZE) * PATCH_SIZE},
{(static_cast<int64_t>(std::floor(width * scale)) / PATCH_SIZE) * PATCH_SIZE, (static_cast<int64_t>(std::floor(height * scale)) / PATCH_SIZE) * PATCH_SIZE},
};
std::sort(sizes.begin(), sizes.end(), [](const auto& a, const auto& b) {
return a.first * a.second > b.first * b.second;
});
std::pair<int64_t, int64_t> new_size = sizes.back();
for (const auto& size : sizes) {
if (size.first > 0 && size.second > 0 && size.first * size.second <= s_max) {
new_size = size;
break;
}
}
double s1 = static_cast<double>(width) / static_cast<double>(new_size.first);
double s2 = static_cast<double>(height) / static_cast<double>(new_size.second);
sd::Tensor<float> resized;
if (s1 < s2) {
int64_t resized_h = static_cast<int64_t>(std::llround(height / s1));
resized = sd::ops::interpolate(image,
{new_size.first, resized_h, image.shape()[2], image.shape()[3]},
sd::ops::InterpolateMode::Bicubic);
int64_t top = (resized_h - new_size.second) / 2;
resized = sd::ops::slice(resized, 1, top, top + new_size.second);
} else {
int64_t resized_w = static_cast<int64_t>(std::llround(width / s2));
resized = sd::ops::interpolate(image,
{resized_w, new_size.second, image.shape()[2], image.shape()[3]},
sd::ops::InterpolateMode::Bicubic);
int64_t left = (resized_w - new_size.first) / 2;
resized = sd::ops::slice(resized, 0, left, left + new_size.first);
}
return resized;
}
static inline std::vector<int32_t> build_position_ids(const std::vector<int32_t>& input_ids,
const std::vector<std::array<int32_t, 3>>& image_grids,
const std::vector<int32_t>& skip_vision_start_token) {
std::vector<int32_t> position_ids(4 * input_ids.size(), 0);
int image_index = 0;
int st = 0;
int fix_point = 4096;
std::vector<int32_t> out_t;
std::vector<int32_t> out_h;
std::vector<int32_t> out_w;
while (st < static_cast<int>(input_ids.size())) {
int ed = st;
while (ed < static_cast<int>(input_ids.size()) && input_ids[ed] != IMAGE_TOKEN_ID) {
ed++;
}
if (ed >= static_cast<int>(input_ids.size())) {
int st_idx = out_t.empty() ? 0 : (*std::max_element(out_t.begin(), out_t.end()) + 1);
for (int i = 0; i < static_cast<int>(input_ids.size()) - st; ++i) {
out_t.push_back(st_idx + i);
out_h.push_back(st_idx + i);
out_w.push_back(st_idx + i);
}
break;
}
int text_len = std::max(0, ed - st - skip_vision_start_token[image_index]);
int st_idx = out_t.empty() ? 0 : (*std::max_element(out_t.begin(), out_t.end()) + 1);
for (int i = 0; i < text_len; ++i) {
out_t.push_back(st_idx + i);
out_h.push_back(st_idx + i);
out_w.push_back(st_idx + i);
}
auto grid = image_grids[image_index];
int base;
if (skip_vision_start_token[image_index]) {
if (fix_point > 0) {
base = fix_point;
fix_point = 0;
} else {
base = st_idx;
}
} else {
base = text_len + st_idx;
}
for (int32_t ti = 0; ti < grid[0]; ++ti) {
for (int32_t hi = 0; hi < grid[1]; ++hi) {
for (int32_t wi = 0; wi < grid[2]; ++wi) {
out_t.push_back(base + ti);
out_h.push_back(base + hi);
out_w.push_back(base + wi);
}
}
}
st = ed + grid[0] * grid[1] * grid[2];
image_index++;
}
GGML_ASSERT(out_t.size() == input_ids.size());
for (size_t i = 0; i < input_ids.size(); ++i) {
// ggml IMROPE consumes 4 flattened position streams:
// [t, h, w, e]
// llama.cpp's generic Qwen-VL fallback expands text positions as
// [pos, pos, pos, 0]. Keep the extra stream zeroed here too.
position_ids[i] = out_t[i];
position_ids[input_ids.size() + i] = out_h[i];
position_ids[input_ids.size() * 2 + i] = out_w[i];
position_ids[input_ids.size() * 3 + i] = 0;
}
return position_ids;
}
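// Worked example (illustrative): input_ids = {T, T, IMG, IMG, IMG, IMG} with one
// 1x2x2 grid and skip_vision_start_token = {0} yields
//   t = {0, 1, 2, 2, 2, 2}, h = {0, 1, 2, 2, 3, 3}, w = {0, 1, 2, 3, 2, 3},
// flattened as [t | h | w | zeros] per the IMROPE layout noted above.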
struct TimestepEmbedder : public GGMLBlock {
int frequency_embedding_size = 256;
TimestepEmbedder(int64_t hidden_size) {
blocks["mlp.0"] = std::make_shared<Linear>(frequency_embedding_size, hidden_size, true);
blocks["mlp.2"] = std::make_shared<Linear>(hidden_size, hidden_size, true);
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* t) {
auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]);
auto mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["mlp.2"]);
auto emb = ggml_ext_timestep_embedding(ctx->ggml_ctx, t, frequency_embedding_size, 10000, 1000.0f);
emb = mlp_0->forward(ctx, emb);
emb = ggml_silu_inplace(ctx->ggml_ctx, emb);
emb = mlp_2->forward(ctx, emb);
return emb;
}
};
struct BottleneckPatchEmbed : public GGMLBlock {
BottleneckPatchEmbed(int64_t in_dim, int64_t pca_dim, int64_t embed_dim) {
blocks["proj1"] = std::make_shared<Linear>(in_dim, pca_dim, false);
blocks["proj2"] = std::make_shared<Linear>(pca_dim, embed_dim, true);
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
auto proj1 = std::dynamic_pointer_cast<Linear>(blocks["proj1"]);
auto proj2 = std::dynamic_pointer_cast<Linear>(blocks["proj2"]);
return proj2->forward(ctx, proj1->forward(ctx, x));
}
};
struct FinalLayer : public GGMLBlock {
FinalLayer(int64_t hidden_size, int64_t out_dim) {
blocks["linear"] = std::make_shared<Linear>(hidden_size, out_dim, true);
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
auto linear = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
return linear->forward(ctx, x);
}
};
struct HiDreamO1Params {
LLM::LLMParams llm;
int patch_size = PATCH_SIZE;
};
static inline HiDreamO1Params make_hidream_o1_params() {
HiDreamO1Params params;
params.llm.arch = LLM::LLMArch::QWEN3_VL;
params.llm.hidden_size = 4096;
params.llm.intermediate_size = 12288;
params.llm.num_layers = 36;
params.llm.num_heads = 32;
params.llm.num_kv_heads = 8;
params.llm.head_dim = 128;
params.llm.qkv_bias = false;
params.llm.qk_norm = true;
params.llm.vocab_size = 151936;
params.llm.rms_norm_eps = 1e-6f;
params.llm.vision.arch = LLM::LLMVisionArch::QWEN3_VL;
params.llm.vision.num_layers = 27;
params.llm.vision.hidden_size = 1152;
params.llm.vision.intermediate_size = 4304;
params.llm.vision.num_heads = 16;
params.llm.vision.out_hidden_size = 4096;
params.llm.vision.patch_size = 16;
params.llm.vision.spatial_merge_size = 2;
params.llm.vision.temporal_patch_size = 2;
params.llm.vision.num_position_embeddings = 2304;
return params;
}
struct HiDreamO1Model : public GGMLBlock {
HiDreamO1Params params;
HiDreamO1Model() = default;
explicit HiDreamO1Model(HiDreamO1Params params)
: params(std::move(params)) {
blocks["language_model"] = std::make_shared<LLM::TextModel>(this->params.llm);
blocks["t_embedder1"] = std::make_shared<TimestepEmbedder>(this->params.llm.hidden_size);
blocks["x_embedder"] = std::make_shared<BottleneckPatchEmbed>(this->params.patch_size * this->params.patch_size * 3,
this->params.llm.hidden_size / 4,
this->params.llm.hidden_size);
blocks["final_layer2"] = std::make_shared<FinalLayer>(this->params.llm.hidden_size,
this->params.patch_size * this->params.patch_size * 3);
}
std::shared_ptr<LLM::TextModel> text_model() {
return std::dynamic_pointer_cast<LLM::TextModel>(blocks["language_model"]);
}
std::shared_ptr<TimestepEmbedder> timestep_embedder() {
return std::dynamic_pointer_cast<TimestepEmbedder>(blocks["t_embedder1"]);
}
std::shared_ptr<BottleneckPatchEmbed> patch_embedder() {
return std::dynamic_pointer_cast<BottleneckPatchEmbed>(blocks["x_embedder"]);
}
std::shared_ptr<FinalLayer> final_layer() {
return std::dynamic_pointer_cast<FinalLayer>(blocks["final_layer2"]);
}
};
struct HiDreamO1VisionRunner : public GGMLRunner {
HiDreamO1Params params;
std::shared_ptr<LLM::VisionModel> model;
std::vector<int> window_index_vec;
std::vector<int> window_inverse_index_vec;
std::vector<float> window_mask_vec;
std::vector<float> pe_vec;
std::array<std::vector<int32_t>, 4> pos_embed_idx_data_;
std::array<std::vector<float>, 4> pos_embed_weight_data_;
HiDreamO1VisionRunner(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
const std::string& prefix = "model.visual")
: GGMLRunner(backend, offload_params_to_cpu),
params(make_hidream_o1_params()),
model(std::make_shared<LLM::VisionModel>(false, params.llm.vision)) {
model->init(params_ctx, tensor_storage_map, prefix);
}
std::string get_desc() override {
return "hidream_o1_vision";
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix = "model.visual") {
model->get_param_tensors(tensors, prefix);
}
ggml_tensor* encode_image(GGMLRunnerContext* runner_ctx, ggml_tensor* image) {
return LLM::LLMRunner::encode_image_common(this,
compute_ctx,
runner_ctx,
image,
params.llm.vision,
model,
window_index_vec,
window_inverse_index_vec,
window_mask_vec,
pe_vec,
pos_embed_idx_data_,
pos_embed_weight_data_);
}
ggml_cgraph* build_graph(const sd::Tensor<float>& image_tensor) {
ggml_cgraph* gf = new_graph_custom(HIDREAM_O1_GRAPH_SIZE);
ggml_tensor* image = make_input(image_tensor);
auto runner_ctx = get_context();
auto image_embeds = encode_image(&runner_ctx, image);
ggml_build_forward_expand(gf, image_embeds);
return gf;
}
sd::Tensor<float> compute(int n_threads, const sd::Tensor<float>& image) {
auto get_graph = [&]() {
return build_graph(image);
};
auto output = GGMLRunner::compute<float>(get_graph, n_threads, false);
return output.has_value() ? std::move(output.value()) : sd::Tensor<float>();
}
};
struct HiDreamO1Runner : public GGMLRunner {
HiDreamO1Params params;
HiDreamO1Model model;
std::vector<float> attention_mask_vec;
HiDreamO1Runner(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
const std::string& prefix = "model")
: GGMLRunner(backend, offload_params_to_cpu),
params(make_hidream_o1_params()) {
model = HiDreamO1Model(params);
model.init(params_ctx, tensor_storage_map, prefix);
}
std::string get_desc() override {
return "hidream_o1";
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) {
model.get_param_tensors(tensors, prefix);
}
ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor,
const sd::Tensor<float>& timestep_tensor,
const sd::Tensor<int32_t>& input_ids_tensor,
const sd::Tensor<int32_t>& input_pos_tensor,
const sd::Tensor<int32_t>& token_types_tensor,
const sd::Tensor<int32_t>& vinput_mask_tensor,
const std::vector<std::pair<int, sd::Tensor<float>>>& image_embeds_tensor,
const std::vector<sd::Tensor<float>>& ref_images) {
ggml_cgraph* gf = new_graph_custom(HIDREAM_O1_GRAPH_SIZE);
ggml_tensor* x = make_input(x_tensor);
ggml_tensor* timestep = make_input(timestep_tensor);
ggml_tensor* input_ids = make_input(input_ids_tensor);
ggml_tensor* input_pos = make_input(input_pos_tensor);
auto text_model = model.text_model();
auto t_embedder1 = model.timestep_embedder();
auto x_embedder = model.patch_embedder();
auto final_layer2 = model.final_layer();
std::vector<ggml_tensor*> ref_image_tensors;
for (const auto& image : ref_images) {
ref_image_tensors.push_back(make_input(image));
}
attention_mask_vec = std::vector<float>(static_cast<size_t>(token_types_tensor.shape()[0] * token_types_tensor.shape()[0]), 0.0f);
int64_t total_seq_len = token_types_tensor.shape()[0];
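// Text-side queries (token_type == 0) attend causally (future keys get -inf);
// generation tokens (token_type > 0) attend to the entire sequence.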
for (int64_t query = 0; query < total_seq_len; ++query) {
bool is_gen = token_types_tensor.values()[static_cast<size_t>(query)] > 0;
for (int64_t key = 0; key < total_seq_len; ++key) {
if (!is_gen && key > query) {
attention_mask_vec[static_cast<size_t>(query * total_seq_len + key)] = -INFINITY;
}
}
}
auto attention_mask = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_F32, total_seq_len, total_seq_len);
set_backend_tensor_data(attention_mask, attention_mask_vec.data());
auto runner_ctx = get_context();
auto txt = text_model->embed(&runner_ctx, input_ids);
std::vector<std::pair<int, ggml_tensor*>> image_embeds;
image_embeds.reserve(image_embeds_tensor.size());
for (const auto& image_embed : image_embeds_tensor) {
image_embeds.emplace_back(image_embed.first, make_input(image_embed.second));
}
txt = LLM::splice_image_embeds(&runner_ctx, txt, image_embeds);
auto t_emb = t_embedder1->forward(&runner_ctx, timestep);
int64_t txt_seq_len = input_ids->ne[0];
if (txt_seq_len > 1) {
auto prefix = ggml_ext_slice(compute_ctx, txt, 1, 0, txt_seq_len - 1);
txt = ggml_concat(compute_ctx, prefix, ggml_reshape_3d(compute_ctx, t_emb, t_emb->ne[0], 1, 1), 1);
} else {
txt = ggml_reshape_3d(compute_ctx, t_emb, t_emb->ne[0], 1, 1);
}
auto vinputs = DiT::pad_and_patchify(&runner_ctx, x, PATCH_SIZE, PATCH_SIZE);
int64_t target_tokens = vinputs->ne[1];
for (ggml_tensor* ref_image : ref_image_tensors) {
auto ref = DiT::pad_and_patchify(&runner_ctx, ref_image, PATCH_SIZE, PATCH_SIZE);
vinputs = ggml_concat(compute_ctx, vinputs, ref, 1);
}
auto vis = x_embedder->forward(&runner_ctx, vinputs);
auto inputs_embeds = ggml_concat(compute_ctx, txt, vis, 1);
auto hidden_states = text_model->forward_embeds(&runner_ctx, inputs_embeds, input_pos, attention_mask, {});
auto x_pred_all = final_layer2->forward(&runner_ctx, hidden_states);
int64_t x_pred_start = txt_seq_len;
if (!vinput_mask_tensor.empty()) {
int64_t seq_len = static_cast<int64_t>(vinput_mask_tensor.shape()[0]);
int64_t first_vinput = 0;
while (first_vinput < seq_len && vinput_mask_tensor.values()[static_cast<size_t>(first_vinput)] == 0) {
first_vinput++;
}
x_pred_start = first_vinput;
}
auto x_pred = ggml_ext_slice(compute_ctx, x_pred_all, 1, x_pred_start, x_pred_start + target_tokens);
x_pred = DiT::unpatchify_and_crop(compute_ctx, x_pred, x->ne[1], x->ne[0], PATCH_SIZE, PATCH_SIZE);
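// Convert the x0 prediction into a flow velocity: assuming x = (1 - sigma) * x0
// + sigma * noise, (x - x0) / sigma = noise - x0.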
float sigma = 1.0f - timestep_tensor.values()[0];
sigma = std::max(1e-6f, sigma);
auto out = ggml_scale(compute_ctx, ggml_sub(compute_ctx, x, x_pred), 1.0f / sigma);
ggml_build_forward_expand(gf, out);
return gf;
}
sd::Tensor<float> compute(int n_threads,
const sd::Tensor<float>& x,
const sd::Tensor<float>& timestep,
const sd::Tensor<int32_t>& input_ids,
const sd::Tensor<int32_t>& input_pos,
const sd::Tensor<int32_t>& token_types,
const sd::Tensor<int32_t>& vinput_mask,
const std::vector<std::pair<int, sd::Tensor<float>>>& image_embeds,
const std::vector<sd::Tensor<float>>& ref_images) {
auto get_graph = [&]() {
return build_graph(x, timestep, input_ids, input_pos, token_types, vinput_mask, image_embeds, ref_images);
};
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
}
};
struct HiDreamO1Conditioner : public Conditioner {
Qwen2Tokenizer tokenizer;
std::shared_ptr<HiDreamO1VisionRunner> vision_runner;
HiDreamO1Conditioner(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {})
: vision_runner(std::make_shared<HiDreamO1VisionRunner>(backend, offload_params_to_cpu, tensor_storage_map)) {}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
vision_runner->get_param_tensors(tensors);
}
void alloc_params_buffer() override {
vision_runner->alloc_params_buffer();
}
void free_params_buffer() override {
vision_runner->free_params_buffer();
}
size_t get_params_buffer_size() override {
return vision_runner->get_params_buffer_size();
}
void set_max_graph_vram_bytes(size_t max_graph_vram_bytes) override {
vision_runner->set_max_graph_vram_bytes(max_graph_vram_bytes);
}
void set_flash_attention_enabled(bool enabled) override {
vision_runner->set_flash_attention_enabled(enabled);
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
vision_runner->set_weight_adapter(adapter);
}
SDCondition get_learned_condition(int n_threads,
const ConditionerParams& conditioner_params) override {
SDCondition result;
int width = conditioner_params.width;
int height = conditioner_params.height;
int64_t target_image_len = static_cast<int64_t>(width / PATCH_SIZE) * static_cast<int64_t>(height / PATCH_SIZE);
std::vector<sd::Tensor<float>> ref_images;
if (conditioner_params.ref_images != nullptr) {
ref_images = *conditioner_params.ref_images;
}
std::vector<std::pair<int, sd::Tensor<float>>> vlm_images;
std::vector<std::array<int32_t, 3>> image_grids;
std::vector<int32_t> skip_vision_start;
std::string prompt = "<|im_start|>user\n";
if (ref_images.empty()) {
prompt += conditioner_params.text;
prompt += "<|im_end|>\n<|im_start|>assistant\n<|boi_token|><|tms_token|>";
auto input_ids = tokenizer.encode(prompt, nullptr);
std::vector<int32_t> input_ids_pad = input_ids;
input_ids_pad.push_back(VISION_START_TOKEN_ID);
input_ids_pad.insert(input_ids_pad.end(), target_image_len - 1, IMAGE_TOKEN_ID);
image_grids.push_back({1, static_cast<int32_t>(height / PATCH_SIZE), static_cast<int32_t>(width / PATCH_SIZE)});
skip_vision_start.push_back(1);
std::vector<int32_t> token_types(input_ids_pad.size(), 0);
int txt_seq_len = static_cast<int>(input_ids.size());
int bgn = txt_seq_len - TIMESTEP_TOKEN_NUM;
for (int i = bgn; i < static_cast<int>(token_types.size()); ++i) {
token_types[i] = 1;
}
auto position_ids = build_position_ids(input_ids_pad, image_grids, skip_vision_start);
std::vector<int64_t> input_shape{static_cast<int64_t>(input_ids.size())};
std::vector<int64_t> position_shape{static_cast<int64_t>(input_ids_pad.size() * 4)};
std::vector<int64_t> token_type_shape{static_cast<int64_t>(token_types.size())};
std::vector<int32_t> vinput_mask(token_types.size(), 0);
for (int64_t i = txt_seq_len; i < static_cast<int64_t>(vinput_mask.size()); ++i) {
vinput_mask[static_cast<size_t>(i)] = 1;
}
std::vector<int64_t> vinput_mask_shape{static_cast<int64_t>(vinput_mask.size())};
result.c_input_ids = sd::Tensor<int32_t>(input_shape, std::move(input_ids));
result.c_position_ids = sd::Tensor<int32_t>(position_shape, position_ids);
result.c_token_types = sd::Tensor<int32_t>(token_type_shape, std::move(token_types));
result.c_vinput_mask = sd::Tensor<int32_t>(vinput_mask_shape, std::move(vinput_mask));
return result;
}
int K = static_cast<int>(ref_images.size());
int max_size;
if (K == 1) {
max_size = std::max(height, width);
} else if (K == 2) {
max_size = std::max(height, width) * 48 / 64;
} else if (K <= 4) {
max_size = std::max(height, width) / 2;
} else if (K <= 8) {
max_size = std::max(height, width) * 24 / 64;
} else {
max_size = std::max(height, width) / 4;
}
int cond_img_size;
if (K <= 4) {
cond_img_size = 384;
} else if (K <= 8) {
cond_img_size = 384 * 48 / 64;
} else {
cond_img_size = 384 / 2;
}
for (const auto& ref_image : ref_images) {
auto resized_ref = resize_to_area(ref_image, max_size);
resized_ref = sd::ops::clamp(resized_ref, 0.0f, 1.0f);
// VLM image: Qwen3-VL expects mean=[0.5]/std=[0.5] (i.e. range [-1,1]),
// not CLIP normalization. Resize the already-resized ref directly to
// (cond_w, cond_h) to match the Python pipeline's pil_r.resize().
auto dims = calculate_dimensions(cond_img_size,
static_cast<double>(resized_ref.shape()[0]) / static_cast<double>(resized_ref.shape()[1]));
sd::Tensor<float> vlm_image = sd::ops::interpolate(
resized_ref,
{dims.first, dims.second, resized_ref.shape()[2], resized_ref.shape()[3]});
vlm_image = vlm_image * 2.0f - 1.0f;
int64_t image_tokens = static_cast<int64_t>(dims.first / PATCH_SIZE) * static_cast<int64_t>(dims.second / PATCH_SIZE);
auto patch_img = resized_ref * 2.0f - 1.0f;
result.c_ref_images.push_back(std::move(patch_img));
int64_t prompt_start = static_cast<int64_t>(tokenizer.encode(prompt + "<|vision_start|>", nullptr).size());
prompt += "<|vision_start|>";
prompt += repeat_special_token("<|image_pad|>", image_tokens);
prompt += "<|vision_end|>";
vlm_images.emplace_back(static_cast<int>(prompt_start), std::move(vlm_image));
image_grids.push_back({1, dims.second / PATCH_SIZE, dims.first / PATCH_SIZE});
skip_vision_start.push_back(0);
}
prompt += conditioner_params.text;
prompt += "<|im_end|>\n<|im_start|>assistant\n<|boi_token|><|tms_token|>";
auto input_ids = tokenizer.encode(prompt, nullptr);
std::vector<int32_t> input_ids_pad = input_ids;
input_ids_pad.push_back(VISION_START_TOKEN_ID);
input_ids_pad.insert(input_ids_pad.end(), target_image_len - 1, IMAGE_TOKEN_ID);
image_grids.push_back({1, static_cast<int32_t>(height / PATCH_SIZE), static_cast<int32_t>(width / PATCH_SIZE)});
skip_vision_start.push_back(1);
for (const auto& ref_image : result.c_ref_images) {
int64_t ref_len = static_cast<int64_t>(ref_image.shape()[0] / PATCH_SIZE) * static_cast<int64_t>(ref_image.shape()[1] / PATCH_SIZE);
input_ids_pad.push_back(VISION_START_TOKEN_ID);
input_ids_pad.insert(input_ids_pad.end(), ref_len - 1, IMAGE_TOKEN_ID);
image_grids.push_back({1, static_cast<int32_t>(ref_image.shape()[1] / PATCH_SIZE), static_cast<int32_t>(ref_image.shape()[0] / PATCH_SIZE)});
skip_vision_start.push_back(1);
}
std::vector<int32_t> token_types(input_ids_pad.size(), 0);
int txt_seq_len = static_cast<int>(input_ids.size());
int bgn = txt_seq_len - TIMESTEP_TOKEN_NUM;
for (int i = bgn; i < static_cast<int>(token_types.size()); ++i) {
token_types[i] = 1;
}
std::vector<int64_t> input_shape{static_cast<int64_t>(input_ids.size())};
std::vector<int64_t> position_shape{static_cast<int64_t>(input_ids_pad.size() * 4)};
std::vector<int64_t> token_type_shape{static_cast<int64_t>(token_types.size())};
std::vector<int32_t> vinput_mask(token_types.size(), 0);
for (int i = txt_seq_len; i < static_cast<int>(vinput_mask.size()); ++i) {
vinput_mask[static_cast<size_t>(i)] = 1;
}
std::vector<int64_t> vinput_mask_shape{static_cast<int64_t>(vinput_mask.size())};
result.c_input_ids = sd::Tensor<int32_t>(input_shape, std::move(input_ids));
result.c_position_ids = sd::Tensor<int32_t>(position_shape, build_position_ids(input_ids_pad, image_grids, skip_vision_start));
result.c_token_types = sd::Tensor<int32_t>(token_type_shape, std::move(token_types));
result.c_vinput_mask = sd::Tensor<int32_t>(vinput_mask_shape, std::move(vinput_mask));
result.c_image_embeds.reserve(vlm_images.size());
for (const auto& vlm_image : vlm_images) {
auto image_embed = vision_runner->compute(n_threads, vlm_image.second);
if (image_embed.empty()) {
LOG_ERROR("hidream_o1 conditioner: encode VLM image failed");
return SDCondition();
}
result.c_image_embeds.emplace_back(vlm_image.first, std::move(image_embed));
}
return result;
}
};
} // namespace HiDreamO1
#endif // __SD_HIDREAM_O1_H__
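For readers following the token bookkeeping in get_learned_condition() above, here is a minimal standalone sketch of the padded layout. The token ids, patch size, and image size below are made-up illustration values, not the project's real constants:

#include <cstdio>
#include <vector>

int main() {
    const int VISION_START_TOKEN_ID = 1001;  // hypothetical id
    const int IMAGE_TOKEN_ID        = 1002;  // hypothetical id
    const int PATCH                 = 16;    // assumed patch size, for arithmetic only

    std::vector<int> text_ids = {1, 2, 3, 4, 5};          // pretend tokenized prompt
    long target_image_len = (64 / PATCH) * (64 / PATCH);  // (W/patch)*(H/patch) for a 64x64 target

    // Same scheme as the conditioner: one vision-start token, then
    // target_image_len - 1 image placeholders appended after the text.
    std::vector<int> padded = text_ids;
    padded.push_back(VISION_START_TOKEN_ID);
    padded.insert(padded.end(), target_image_len - 1, IMAGE_TOKEN_ID);

    std::printf("text tokens: %zu, padded length: %zu\n", text_ids.size(), padded.size());
    return 0;
}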

View File

@ -2,7 +2,10 @@
#define __LLM_HPP__ #define __LLM_HPP__
#include <algorithm> #include <algorithm>
#include <array>
#include <cmath>
#include <fstream> #include <fstream>
#include <functional>
#include <iostream> #include <iostream>
#include <map> #include <map>
#include <memory> #include <memory>
@ -27,6 +30,7 @@ namespace LLM {
enum class LLMArch { enum class LLMArch {
QWEN2_5_VL, QWEN2_5_VL,
QWEN3, QWEN3,
QWEN3_VL,
MISTRAL_SMALL_3_2, MISTRAL_SMALL_3_2,
MINISTRAL_3_3B, MINISTRAL_3_3B,
ARCH_COUNT, ARCH_COUNT,
@ -35,11 +39,18 @@ namespace LLM {
static const char* llm_arch_to_str[] = { static const char* llm_arch_to_str[] = {
"qwen2.5vl", "qwen2.5vl",
"qwen3", "qwen3",
"qwen3vl",
"mistral_small3.2", "mistral_small3.2",
"ministral3.3b", "ministral3.3b",
}; };
enum class LLMVisionArch {
QWEN2_5_VL,
QWEN3_VL,
};
struct LLMVisionParams { struct LLMVisionParams {
LLMVisionArch arch = LLMVisionArch::QWEN2_5_VL;
int num_layers = 32; int num_layers = 32;
int64_t hidden_size = 1280; int64_t hidden_size = 1280;
int64_t intermediate_size = 3420; int64_t intermediate_size = 3420;
@ -50,6 +61,7 @@ namespace LLM {
int patch_size = 14; int patch_size = 14;
int spatial_merge_size = 2; int spatial_merge_size = 2;
int window_size = 112; int window_size = 112;
int num_position_embeddings = 0;
std::set<int> fullatt_block_indexes = {7, 15, 23, 31}; std::set<int> fullatt_block_indexes = {7, 15, 23, 31};
}; };
@ -90,6 +102,84 @@ namespace LLM {
} }
}; };
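// Splice precomputed image embeddings into the text embedding stream.
// image_embeds[i].first is the token offset where image i begins, so the
// output interleaves [txt][img_0][txt][img_1]...[txt tail] while keeping the
// total sequence length unchanged (asserted below).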
static ggml_tensor* splice_image_embeds(GGMLRunnerContext* ctx,
ggml_tensor* x,
const std::vector<std::pair<int, ggml_tensor*>>& image_embeds) {
if (image_embeds.empty()) {
return x;
}
GGML_ASSERT(x->ne[2] == 1); // N == 1
auto raw_x = ggml_cast(ctx->ggml_ctx, x, image_embeds[0].second->type);
int64_t txt_token_start = 0;
int64_t txt_token_end = 0;
ggml_tensor* input_embed = nullptr;
for (int i = 0; i < image_embeds.size(); i++) {
if (i == 0) {
txt_token_start = 0;
} else {
txt_token_start = image_embeds[i - 1].first + image_embeds[i - 1].second->ne[1];
}
txt_token_end = image_embeds[i].first;
auto txt_embed = ggml_ext_slice(ctx->ggml_ctx, raw_x, 1, txt_token_start, txt_token_end);
if (input_embed == nullptr) {
input_embed = txt_embed;
} else {
input_embed = ggml_concat(ctx->ggml_ctx, input_embed, txt_embed, 1);
}
input_embed = ggml_concat(ctx->ggml_ctx, input_embed, image_embeds[i].second, 1);
}
txt_token_start = image_embeds[image_embeds.size() - 1].first + image_embeds[image_embeds.size() - 1].second->ne[1];
txt_token_end = raw_x->ne[1];
auto final_txt_embed = ggml_ext_slice(ctx->ggml_ctx, raw_x, 1, txt_token_start, txt_token_end);
input_embed = ggml_concat(ctx->ggml_ctx, input_embed, final_txt_embed, 1);
GGML_ASSERT(raw_x->ne[1] == input_embed->ne[1]);
return input_embed;
}
struct VisionMLP : public GGMLBlock {
protected:
LLMVisionArch arch_;
public:
VisionMLP(LLMVisionArch arch, int64_t hidden_size, int64_t intermediate_size)
: arch_(arch) {
if (arch_ == LLMVisionArch::QWEN3_VL) {
blocks["linear_fc1"] = std::make_shared<Linear>(hidden_size, intermediate_size, true);
blocks["linear_fc2"] = std::make_shared<Linear>(intermediate_size, hidden_size, true);
} else {
blocks["gate_proj"] = std::make_shared<Linear>(hidden_size, intermediate_size, true);
blocks["up_proj"] = std::make_shared<Linear>(hidden_size, intermediate_size, true);
blocks["down_proj"] = std::make_shared<Linear>(intermediate_size, hidden_size, true);
}
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
if (arch_ == LLMVisionArch::QWEN3_VL) {
auto linear_fc1 = std::dynamic_pointer_cast<Linear>(blocks["linear_fc1"]);
auto linear_fc2 = std::dynamic_pointer_cast<Linear>(blocks["linear_fc2"]);
x = linear_fc1->forward(ctx, x);
x = ggml_ext_gelu(ctx->ggml_ctx, x);
x = linear_fc2->forward(ctx, x);
} else {
auto gate_proj = std::dynamic_pointer_cast<Linear>(blocks["gate_proj"]);
auto up_proj = std::dynamic_pointer_cast<Linear>(blocks["up_proj"]);
auto down_proj = std::dynamic_pointer_cast<Linear>(blocks["down_proj"]);
auto h = gate_proj->forward(ctx, x);
h = ggml_silu_inplace(ctx->ggml_ctx, h);
h = ggml_mul_inplace(ctx->ggml_ctx, h, up_proj->forward(ctx, x));
x = down_proj->forward(ctx, h);
}
return x;
}
};
struct VisionPatchEmbed : public GGMLBlock { struct VisionPatchEmbed : public GGMLBlock {
protected: protected:
bool llama_cpp_style; bool llama_cpp_style;
@ -100,6 +190,7 @@ namespace LLM {
public: public:
VisionPatchEmbed(bool llama_cpp_style, VisionPatchEmbed(bool llama_cpp_style,
LLMVisionArch arch,
int patch_size = 14, int patch_size = 14,
int temporal_patch_size = 2, int temporal_patch_size = 2,
int64_t in_channels = 3, int64_t in_channels = 3,
@ -109,36 +200,35 @@ namespace LLM {
temporal_patch_size(temporal_patch_size), temporal_patch_size(temporal_patch_size),
in_channels(in_channels), in_channels(in_channels),
embed_dim(embed_dim) { embed_dim(embed_dim) {
bool bias = arch == LLMVisionArch::QWEN3_VL;
if (llama_cpp_style) { if (llama_cpp_style) {
blocks["proj.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, blocks["proj.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels,
embed_dim, embed_dim,
{patch_size, patch_size}, {patch_size, patch_size},
{patch_size, patch_size}, // stride
{0, 0}, // padding
{1, 1}, // dilation
bias));
blocks["proj.1"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, blocks["proj.1"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels,
embed_dim, embed_dim,
{patch_size, patch_size}, {patch_size, patch_size},
{patch_size, patch_size}, // stride
{0, 0}, // padding
{1, 1}, // dilation
bias));
} else { } else {
std::tuple<int, int, int> kernel_size = {(int)temporal_patch_size, (int)patch_size, (int)patch_size}; std::tuple<int, int, int> kernel_size = {(int)temporal_patch_size, (int)patch_size, (int)patch_size};
blocks["proj"] = std::shared_ptr<GGMLBlock>(new Conv3d(in_channels, blocks["proj"] = std::shared_ptr<GGMLBlock>(new Conv3d(in_channels,
embed_dim, embed_dim,
kernel_size, kernel_size,
kernel_size, // stride
{0, 0, 0}, // padding
{1, 1, 1}, // dilation
bias));
} }
} }
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
// x: [N*grid_t*grid_h*grid_w, in_channels, temporal_patch_size*patch_size*patch_size]
// return: [N*grid_t*grid_h*grid_w, embed_dim]
x = ggml_reshape_4d(ctx->ggml_ctx, x = ggml_reshape_4d(ctx->ggml_ctx,
x, x,
patch_size, patch_size,
@ -170,22 +260,43 @@ namespace LLM {
} }
}; };
struct VisionPatchMerger : public GGMLBlock {
protected:
    LLMVisionArch arch_;
    int64_t hidden_size;

public:
    VisionPatchMerger(LLMVisionArch arch,
                      int64_t dim,
                      int64_t context_dim,
                      int64_t spatial_merge_size)
        : arch_(arch),
          hidden_size(context_dim * spatial_merge_size * spatial_merge_size) {
        if (arch_ == LLMVisionArch::QWEN3_VL) {
            blocks["norm"]       = std::make_shared<LayerNorm>(context_dim, 1e-6f);
            blocks["linear_fc1"] = std::make_shared<Linear>(hidden_size, hidden_size, true);
            blocks["linear_fc2"] = std::make_shared<Linear>(hidden_size, dim, true);
        } else {
            blocks["ln_q"]  = std::make_shared<RMSNorm>(context_dim, 1e-6f);
            blocks["mlp.0"] = std::make_shared<Linear>(hidden_size, hidden_size);
            // mlp.1 is nn.GELU()
            blocks["mlp.2"] = std::make_shared<Linear>(hidden_size, dim);
        }
    }

    ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) {
        if (arch_ == LLMVisionArch::QWEN3_VL) {
            auto norm       = std::dynamic_pointer_cast<LayerNorm>(blocks["norm"]);
            auto linear_fc1 = std::dynamic_pointer_cast<Linear>(blocks["linear_fc1"]);
            auto linear_fc2 = std::dynamic_pointer_cast<Linear>(blocks["linear_fc2"]);
            x = norm->forward(ctx, x);
            x = ggml_reshape_2d(ctx->ggml_ctx, x, hidden_size, ggml_nelements(x) / hidden_size);
            x = linear_fc1->forward(ctx, x);
            x = ggml_gelu_erf(ctx->ggml_ctx, x);
            x = linear_fc2->forward(ctx, x);
            return x;
        }
        auto ln_q  = std::dynamic_pointer_cast<RMSNorm>(blocks["ln_q"]);
        auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]);
        auto mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["mlp.2"]);
@ -260,16 +371,35 @@ namespace LLM {
}; };
struct VisionBlock : public GGMLBlock { struct VisionBlock : public GGMLBlock {
protected:
LLMVisionArch arch_;
ggml_tensor* forward_norm(GGMLRunnerContext* ctx, const std::string& name, ggml_tensor* x) {
if (arch_ == LLMVisionArch::QWEN3_VL) {
auto norm = std::dynamic_pointer_cast<LayerNorm>(blocks[name]);
return norm->forward(ctx, x);
}
auto norm = std::dynamic_pointer_cast<RMSNorm>(blocks[name]);
return norm->forward(ctx, x);
}
public: public:
VisionBlock(bool llama_cpp_style,
            LLMVisionArch arch,
            int64_t hidden_size,
            int64_t intermediate_size,
            int num_heads,
            float eps = 1e-6f)
    : arch_(arch) {
    blocks["attn"] = std::shared_ptr<GGMLBlock>(new VisionAttention(llama_cpp_style, hidden_size, num_heads));
    blocks["mlp"]  = std::shared_ptr<GGMLBlock>(new VisionMLP(arch_, hidden_size, intermediate_size));
    if (arch_ == LLMVisionArch::QWEN3_VL) {
        blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, eps));
        blocks["norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, eps));
    } else {
        blocks["norm1"] = std::shared_ptr<GGMLBlock>(new RMSNorm(hidden_size, eps));
        blocks["norm2"] = std::shared_ptr<GGMLBlock>(new RMSNorm(hidden_size, eps));
    }
}
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
@ -277,18 +407,16 @@ namespace LLM {
ggml_tensor* pe, ggml_tensor* pe,
ggml_tensor* mask = nullptr) { ggml_tensor* mask = nullptr) {
// x: [N, n_token, hidden_size]
auto attn = std::dynamic_pointer_cast<VisionAttention>(blocks["attn"]);
auto mlp  = std::dynamic_pointer_cast<VisionMLP>(blocks["mlp"]);

auto residual = x;
x = forward_norm(ctx, "norm1", x);
x = attn->forward(ctx, x, pe, mask);
x = ggml_add_inplace(ctx->ggml_ctx, x, residual);

residual = x;
x = forward_norm(ctx, "norm2", x);
x = mlp->forward(ctx, x);
x = ggml_add_inplace(ctx->ggml_ctx, x, residual);
@ -298,38 +426,58 @@ namespace LLM {
struct VisionModel : public GGMLBlock {
protected:
    LLMVisionArch arch_;
    int num_layers;
    int spatial_merge_size;
    int num_grid_per_side;
    std::set<int> fullatt_block_indexes;

public:
    VisionModel(bool llama_cpp_style,
                const LLMVisionParams& vision_params,
                float eps = 1e-6f)
        : arch_(vision_params.arch),
          num_layers(vision_params.num_layers),
          spatial_merge_size(vision_params.spatial_merge_size),
          num_grid_per_side(vision_params.num_position_embeddings > 0 ? static_cast<int>(std::sqrt(vision_params.num_position_embeddings)) : 0),
          fullatt_block_indexes(vision_params.fullatt_block_indexes) {
        blocks["patch_embed"] = std::shared_ptr<GGMLBlock>(new VisionPatchEmbed(llama_cpp_style,
                                                                                arch_,
                                                                                vision_params.patch_size,
                                                                                vision_params.temporal_patch_size,
                                                                                vision_params.in_channels,
                                                                                vision_params.hidden_size));
        if (vision_params.num_position_embeddings > 0) {
            blocks["pos_embed"] = std::make_shared<Embedding>(vision_params.num_position_embeddings, vision_params.hidden_size);
        }
        for (int i = 0; i < num_layers; i++) {
            blocks["blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new VisionBlock(llama_cpp_style,
                                                                                               arch_,
                                                                                               vision_params.hidden_size,
                                                                                               vision_params.intermediate_size,
                                                                                               vision_params.num_heads,
                                                                                               eps));
        }
        blocks["merger"] = std::shared_ptr<GGMLBlock>(new VisionPatchMerger(arch_,
                                                                            vision_params.out_hidden_size,
                                                                            vision_params.hidden_size,
                                                                            spatial_merge_size));
    }
std::shared_ptr<Embedding> pos_embedder() {
auto it = blocks.find("pos_embed");
if (it == blocks.end()) {
return nullptr;
}
return std::dynamic_pointer_cast<Embedding>(it->second);
}
int get_num_grid_per_side() const {
return num_grid_per_side;
}
int get_spatial_merge_size() const {
return spatial_merge_size;
} }
ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* forward(GGMLRunnerContext* ctx,
@ -337,20 +485,26 @@ namespace LLM {
ggml_tensor* pe,
ggml_tensor* window_index,
ggml_tensor* window_inverse_index,
ggml_tensor* window_mask,
ggml_tensor* pos_embeds = nullptr) {
    // pixel_values: [grid_t*(H/mh/ph)*(W/mw/pw)*mh*mw, C*pt*ph*pw]
    // window_index: [grid_t*(H/mh/ph)*(W/mw/pw)]
    // window_inverse_index: [grid_t*(H/mh/ph)*(W/mw/pw)]
    // window_mask: [grid_h*grid_w, grid_h*grid_w]
    auto patch_embed = std::dynamic_pointer_cast<VisionPatchEmbed>(blocks["patch_embed"]);
    auto merger      = std::dynamic_pointer_cast<VisionPatchMerger>(blocks["merger"]);

    auto x = patch_embed->forward(ctx, pixel_values);
    sd::ggml_graph_cut::mark_graph_cut(x, "llm.vision.prelude", "x");

    if (pos_embeds != nullptr) {
        x = ggml_add(ctx->ggml_ctx, x, pos_embeds);
    }

    if (window_index != nullptr) {
        x = ggml_reshape_4d(ctx->ggml_ctx, x, x->ne[0] * spatial_merge_size * spatial_merge_size, x->ne[1] / spatial_merge_size / spatial_merge_size, x->ne[2], x->ne[3]);
        x = ggml_get_rows(ctx->ggml_ctx, x, window_index);
        x = ggml_reshape_4d(ctx->ggml_ctx, x, x->ne[0] / spatial_merge_size / spatial_merge_size, x->ne[1] * spatial_merge_size * spatial_merge_size, x->ne[2], x->ne[3]);
    }
for (int i = 0; i < num_layers; i++) { for (int i = 0; i < num_layers; i++) {
auto block = std::dynamic_pointer_cast<VisionBlock>(blocks["blocks." + std::to_string(i)]); auto block = std::dynamic_pointer_cast<VisionBlock>(blocks["blocks." + std::to_string(i)]);
@ -360,13 +514,17 @@ namespace LLM {
mask = nullptr; mask = nullptr;
} }
x = block->forward(ctx, x, pe, mask);
sd::ggml_graph_cut::mark_graph_cut(x, "llm.vision.blocks." + std::to_string(i), "x");
}

x = merger->forward(ctx, x);
sd::ggml_graph_cut::mark_graph_cut(x, "llm.vision.final", "x");
if (window_inverse_index != nullptr) {
    x = ggml_get_rows(ctx->ggml_ctx, x, window_inverse_index);
}
return x;
}
@ -430,6 +588,10 @@ namespace LLM {
} else if (arch == LLMArch::QWEN3) { } else if (arch == LLMArch::QWEN3) {
q = ggml_rope_ext(ctx->ggml_ctx, q, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f); q = ggml_rope_ext(ctx->ggml_ctx, q, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
k = ggml_rope_ext(ctx->ggml_ctx, k, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f); k = ggml_rope_ext(ctx->ggml_ctx, k, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
} else if (arch == LLMArch::QWEN3_VL) {
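// Qwen3-VL uses interleaved multimodal RoPE (IMROPE) with a {24, 20, 20, 0}
// section split, a larger rope base (5e6), and a longer context (262144)
// than the Qwen2.5-VL M-RoPE branch below.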
int sections[4] = {24, 20, 20, 0};
q = ggml_rope_multi(ctx->ggml_ctx, q, input_pos, nullptr, head_dim, sections, GGML_ROPE_TYPE_IMROPE, 262144, 5000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
k = ggml_rope_multi(ctx->ggml_ctx, k, input_pos, nullptr, head_dim, sections, GGML_ROPE_TYPE_IMROPE, 262144, 5000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
} else { } else {
int sections[4] = {16, 24, 24, 0}; int sections[4] = {16, 24, 24, 0};
q = ggml_rope_multi(ctx->ggml_ctx, q, input_pos, nullptr, head_dim, sections, GGML_ROPE_TYPE_MROPE, 128000, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f); q = ggml_rope_multi(ctx->ggml_ctx, q, input_pos, nullptr, head_dim, sections, GGML_ROPE_TYPE_MROPE, 128000, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
@ -485,10 +647,11 @@ namespace LLM {
struct TextModel : public GGMLBlock { struct TextModel : public GGMLBlock {
protected: protected:
int64_t num_layers; int64_t num_layers;
LLMParams params;
public: public:
TextModel(const LLMParams& params) TextModel(const LLMParams& params)
: num_layers(params.num_layers), params(params) {
blocks["embed_tokens"] = std::shared_ptr<GGMLBlock>(new Embedding(params.vocab_size, params.hidden_size)); blocks["embed_tokens"] = std::shared_ptr<GGMLBlock>(new Embedding(params.vocab_size, params.hidden_size));
for (int i = 0; i < num_layers; i++) { for (int i = 0; i < num_layers; i++) {
blocks["layers." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new TransformerBlock(params)); blocks["layers." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new TransformerBlock(params));
@ -496,62 +659,22 @@ namespace LLM {
blocks["norm"] = std::shared_ptr<GGMLBlock>(new RMSNorm(params.hidden_size, params.rms_norm_eps)); blocks["norm"] = std::shared_ptr<GGMLBlock>(new RMSNorm(params.hidden_size, params.rms_norm_eps));
} }
ggml_tensor* embed(GGMLRunnerContext* ctx,
                   ggml_tensor* input_ids) {
    auto embed_tokens = std::dynamic_pointer_cast<Embedding>(blocks["embed_tokens"]);
    auto x = embed_tokens->forward(ctx, input_ids);
    return x;
}

ggml_tensor* forward_embeds(GGMLRunnerContext* ctx,
                            ggml_tensor* x,
                            ggml_tensor* input_pos,
                            ggml_tensor* attention_mask,
                            std::set<int> out_layers) {
    auto norm = std::dynamic_pointer_cast<RMSNorm>(blocks["norm"]);

    std::vector<ggml_tensor*> intermediate_outputs;
    sd::ggml_graph_cut::mark_graph_cut(x, "llm.text.prelude", "x");

    for (int i = 0; i < num_layers; i++) {
        auto block = std::dynamic_pointer_cast<TransformerBlock>(blocks["layers." + std::to_string(i)]);
@ -570,10 +693,23 @@
        for (int i = 1; i < intermediate_outputs.size(); i++) {
            x = ggml_concat(ctx->ggml_ctx, x, intermediate_outputs[i], 0);
        }
        return x;
    }

    return norm->forward(ctx, x);
}
ggml_tensor* forward(GGMLRunnerContext* ctx,
ggml_tensor* input_ids,
ggml_tensor* input_pos,
ggml_tensor* attention_mask,
std::vector<std::pair<int, ggml_tensor*>> image_embeds,
std::set<int> out_layers) {
// input_ids: [N, n_token]
// return: [N, n_token, hidden_size]
auto x = embed(ctx, input_ids);
x = splice_image_embeds(ctx, x, image_embeds);
return forward_embeds(ctx, x, input_pos, attention_mask, std::move(out_layers));
} }
}; };
@ -587,18 +723,7 @@ namespace LLM {
: enable_vision(enable_vision), params(params) { : enable_vision(enable_vision), params(params) {
blocks["model"] = std::shared_ptr<GGMLBlock>(new TextModel(params)); blocks["model"] = std::shared_ptr<GGMLBlock>(new TextModel(params));
if (enable_vision) { if (enable_vision) {
blocks["visual"] = std::shared_ptr<GGMLBlock>(new VisionModel(llama_cpp_style, blocks["visual"] = std::shared_ptr<GGMLBlock>(new VisionModel(llama_cpp_style, params.vision));
params.vision.num_layers,
params.vision.in_channels,
params.vision.hidden_size,
params.vision.out_hidden_size,
params.vision.intermediate_size,
params.vision.num_heads,
params.vision.spatial_merge_size,
params.vision.patch_size,
params.vision.temporal_patch_size,
params.vision.window_size,
params.vision.fullatt_block_indexes));
} }
} }
@ -615,15 +740,20 @@ namespace LLM {
return x; return x;
} }
std::shared_ptr<VisionModel> vision_model() {
GGML_ASSERT(enable_vision);
return std::dynamic_pointer_cast<VisionModel>(blocks["visual"]);
}
ggml_tensor* vision_forward(GGMLRunnerContext* ctx, ggml_tensor* vision_forward(GGMLRunnerContext* ctx,
ggml_tensor* pixel_values, ggml_tensor* pixel_values,
ggml_tensor* pe, ggml_tensor* pe,
ggml_tensor* window_index, ggml_tensor* window_index,
ggml_tensor* window_inverse_index, ggml_tensor* window_inverse_index,
ggml_tensor* window_mask,
ggml_tensor* pos_embeds = nullptr) {
GGML_ASSERT(enable_vision);
return vision_model()->forward(ctx, pixel_values, pe, window_index, window_inverse_index, window_mask, pos_embeds);
} }
}; };
@ -638,7 +768,215 @@ namespace LLM {
std::vector<int> window_index_vec; std::vector<int> window_index_vec;
std::vector<int> window_inverse_index_vec; std::vector<int> window_inverse_index_vec;
std::vector<float> pe_vec; std::vector<float> pe_vec;
std::array<std::vector<int32_t>, 4> pos_embed_idx_data_;
std::array<std::vector<float>, 4> pos_embed_weight_data_;
static ggml_tensor* process_image_common(ggml_context* ctx,
ggml_tensor* image,
const LLMVisionParams& vision_params) {
// image: [C, H, W]
// return: [grid_t*(H/mh/ph)*(W/mw/pw)*mh*mw, C*pt*ph*pw], grid_t == 1
int64_t C = image->ne[2];
int64_t H = image->ne[1];
int64_t W = image->ne[0];
int64_t mh = vision_params.spatial_merge_size;
int64_t mw = vision_params.spatial_merge_size;
int64_t pt = vision_params.temporal_patch_size;
int64_t ph = vision_params.patch_size;
int64_t pw = vision_params.patch_size;
image = ggml_reshape_4d(ctx, image, pw, mw, (W / mw / pw), H * C); // [C*H, (W/mw/pw), mw, pw]
image = ggml_cont(ctx, ggml_ext_torch_permute(ctx, image, 0, 2, 3, 1)); // [mw, C*H, (W/mw/pw), pw]
image = ggml_reshape_4d(ctx, image, pw * (W / mw / pw), H, C, mw); // [mw, C, H, (W/mw/pw)*pw]
image = ggml_cont(ctx, ggml_ext_torch_permute(ctx, image, 0, 2, 3, 1)); // [H, mw, C, (W/mw/pw)*pw]
image = ggml_reshape_4d(ctx, image, pw, (W / mw / pw) * C * mw, ph, mh * (H / mh / ph)); // [(H/mh/ph)*mh, ph, mw*C*(W/mw/pw), pw]
image = ggml_cont(ctx, ggml_ext_torch_permute(ctx, image, 0, 2, 1, 3)); // [(H/mh/ph)*mh, mw*C*(W/mw/pw), ph, pw]
image = ggml_reshape_4d(ctx, image, pw * ph, (W / mw / pw), C, mw * mh * (H / mh / ph)); // [(H/mh/ph)*mh*mw, C, (W/mw/pw), ph*pw]
image = ggml_concat(ctx, image, image, 0); // [(H/mh/ph)*mh*mw, C, (W/mw/pw), pt*ph*pw]
image = ggml_cont(ctx, ggml_ext_torch_permute(ctx, image, 0, 2, 1, 3)); // [(H/mh/ph)*mh*mw, (W/mw/pw), C, pt*ph*pw]
image = ggml_reshape_4d(ctx, image, pw * ph * pt * C, (W / mw / pw), mw * mh, (H / mh / ph)); // [(H/mh/ph), mh*mw, (W/mw/pw), C*pt*ph*pw]
image = ggml_cont(ctx, ggml_ext_torch_permute(ctx, image, 0, 2, 1, 3)); // [(H/mh/ph), (W/mw/pw), mh*mw, C*pt*ph*pw]
image = ggml_reshape_2d(ctx, image, pw * ph * pt * C, mw * mh * (W / mw / pw) * (H / mh / ph)); // [(H/mh/ph)*(W/mw/pw)*mh*mw, C*pt*ph*pw]
return image;
}
static ggml_tensor* build_patch_pos_embeds_common(GGMLRunner* runner,
ggml_context* compute_ctx,
GGMLRunnerContext* runner_ctx,
std::shared_ptr<VisionModel> vision,
int grid_h,
int grid_w,
std::array<std::vector<int32_t>, 4>& pos_embed_idx_data,
std::array<std::vector<float>, 4>& pos_embed_weight_data) {
auto pos_embed = vision->pos_embedder();
GGML_ASSERT(pos_embed != nullptr);
for (int i = 0; i < 4; ++i) {
pos_embed_idx_data[i].clear();
pos_embed_weight_data[i].clear();
pos_embed_idx_data[i].reserve(static_cast<size_t>(grid_h * grid_w));
pos_embed_weight_data[i].reserve(static_cast<size_t>(grid_h * grid_w));
}
int num_grid_per_side = vision->get_num_grid_per_side();
double max_index = static_cast<double>(num_grid_per_side - 1);
int merge_size = vision->get_spatial_merge_size();
GGML_ASSERT(grid_h % merge_size == 0);
GGML_ASSERT(grid_w % merge_size == 0);
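// Bilinearly resample the learned (num_grid_per_side x num_grid_per_side)
// position table onto the actual (grid_h x grid_w) patch grid: each patch
// blends its four surrounding table entries with weights (1-dh)(1-dw),
// (1-dh)dw, dh(1-dw) and dh*dw.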
for (int bh = 0; bh < grid_h / merge_size; ++bh) {
for (int bw = 0; bw < grid_w / merge_size; ++bw) {
for (int ih = 0; ih < merge_size; ++ih) {
int h = bh * merge_size + ih;
double h_pos = grid_h == 1 ? 0.0 : max_index * h / static_cast<double>(grid_h - 1);
int h_floor = static_cast<int>(std::floor(h_pos));
int h_ceil = std::min(h_floor + 1, num_grid_per_side - 1);
double dh = h_pos - h_floor;
for (int iw = 0; iw < merge_size; ++iw) {
int w = bw * merge_size + iw;
double w_pos = grid_w == 1 ? 0.0 : max_index * w / static_cast<double>(grid_w - 1);
int w_floor = static_cast<int>(std::floor(w_pos));
int w_ceil = std::min(w_floor + 1, num_grid_per_side - 1);
double dw = w_pos - w_floor;
pos_embed_idx_data[0].push_back(h_floor * num_grid_per_side + w_floor);
pos_embed_idx_data[1].push_back(h_floor * num_grid_per_side + w_ceil);
pos_embed_idx_data[2].push_back(h_ceil * num_grid_per_side + w_floor);
pos_embed_idx_data[3].push_back(h_ceil * num_grid_per_side + w_ceil);
pos_embed_weight_data[0].push_back(static_cast<float>((1.0 - dh) * (1.0 - dw)));
pos_embed_weight_data[1].push_back(static_cast<float>((1.0 - dh) * dw));
pos_embed_weight_data[2].push_back(static_cast<float>(dh * (1.0 - dw)));
pos_embed_weight_data[3].push_back(static_cast<float>(dh * dw));
}
}
}
}
ggml_tensor* patch_pos_embeds = nullptr;
for (int i = 0; i < 4; ++i) {
auto idx_tensor = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, static_cast<int64_t>(pos_embed_idx_data[i].size()));
runner->set_backend_tensor_data(idx_tensor, pos_embed_idx_data[i].data());
auto embed = pos_embed->forward(runner_ctx, idx_tensor);
auto weight_tensor = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_F32, 1, static_cast<int64_t>(pos_embed_weight_data[i].size()));
runner->set_backend_tensor_data(weight_tensor, pos_embed_weight_data[i].data());
embed = ggml_mul(compute_ctx, embed, weight_tensor);
patch_pos_embeds = patch_pos_embeds == nullptr ? embed : ggml_add(compute_ctx, patch_pos_embeds, embed);
}
return patch_pos_embeds;
}
static ggml_tensor* encode_image_common(GGMLRunner* runner,
ggml_context* compute_ctx,
GGMLRunnerContext* runner_ctx,
ggml_tensor* image,
const LLMVisionParams& vision_params,
std::shared_ptr<VisionModel> vision_model,
std::vector<int>& window_index_vec,
std::vector<int>& window_inverse_index_vec,
std::vector<float>& window_mask_vec,
std::vector<float>& pe_vec,
std::array<std::vector<int32_t>, 4>& pos_embed_idx_data,
std::array<std::vector<float>, 4>& pos_embed_weight_data) {
GGML_ASSERT(image->ne[1] % (vision_params.patch_size * vision_params.spatial_merge_size) == 0);
GGML_ASSERT(image->ne[0] % (vision_params.patch_size * vision_params.spatial_merge_size) == 0);
int grid_h = static_cast<int>(image->ne[1]) / vision_params.patch_size;
int grid_w = static_cast<int>(image->ne[0]) / vision_params.patch_size;
auto pixel_values = process_image_common(compute_ctx, image, vision_params);
int head_dim = static_cast<int>(vision_params.hidden_size / vision_params.num_heads);
if (vision_params.arch == LLMVisionArch::QWEN3_VL) {
auto pos_embeds = build_patch_pos_embeds_common(runner,
compute_ctx,
runner_ctx,
vision_model,
grid_h,
grid_w,
pos_embed_idx_data,
pos_embed_weight_data);
window_index_vec.resize(static_cast<size_t>((grid_h / vision_params.spatial_merge_size) * (grid_w / vision_params.spatial_merge_size)));
for (int i = 0; i < static_cast<int>(window_index_vec.size()); ++i) {
window_index_vec[static_cast<size_t>(i)] = i;
}
pe_vec = Rope::gen_qwen2vl_pe(grid_h,
grid_w,
vision_params.spatial_merge_size,
window_index_vec,
10000,
{head_dim / 2, head_dim / 2});
int pos_len = static_cast<int>(pe_vec.size() / head_dim / 2);
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, head_dim / 2, pos_len);
runner->set_backend_tensor_data(pe, pe_vec.data());
return vision_model->forward(runner_ctx, pixel_values, pe, nullptr, nullptr, nullptr, pos_embeds);
}
int llm_grid_h = grid_h / vision_params.spatial_merge_size;
int llm_grid_w = grid_w / vision_params.spatial_merge_size;
int vit_merger_window_size = vision_params.window_size / vision_params.patch_size / vision_params.spatial_merge_size;
int inverse_index = 0;
window_index_vec.resize(llm_grid_h * llm_grid_w);
window_inverse_index_vec.resize(llm_grid_h * llm_grid_w);
std::vector<int> seqlens;
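// Partition the merged grid into vit_merger_window_size x
// vit_merger_window_size windows (clipped at the right/bottom edges) and
// record the permutation into window order, its inverse, and each window's
// sequence length.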
for (int ih = 0; ih < llm_grid_h; ih += vit_merger_window_size) {
for (int iw = 0; iw < llm_grid_w; iw += vit_merger_window_size) {
int win_h = std::min(vit_merger_window_size, llm_grid_h - ih);
int win_w = std::min(vit_merger_window_size, llm_grid_w - iw);
for (int iy = 0; iy < win_h; iy++) {
for (int ix = 0; ix < win_w; ix++) {
int index = (ih + iy) * llm_grid_w + iw + ix;
window_index_vec[inverse_index] = index;
window_inverse_index_vec[index] = inverse_index;
inverse_index++;
}
}
seqlens.push_back(win_h * win_w * vision_params.spatial_merge_size * vision_params.spatial_merge_size);
}
}
auto window_index = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, llm_grid_h * llm_grid_w);
auto window_inverse_index = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, llm_grid_h * llm_grid_w);
runner->set_backend_tensor_data(window_index, window_index_vec.data());
runner->set_backend_tensor_data(window_inverse_index, window_inverse_index_vec.data());
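// Block-diagonal attention mask over the un-merged patch sequence: a token
// may attend only within its own window (0.0f); everything else is -INFINITY.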
window_mask_vec.resize((grid_h * grid_w) * (grid_h * grid_w));
int window_start_index = 0;
for (int seq_index = 0; seq_index < seqlens.size(); seq_index++) {
int window_end_index = window_start_index + seqlens[seq_index];
GGML_ASSERT(window_end_index <= grid_h * grid_w);
for (int i = window_start_index; i < window_end_index; i++) {
for (int j = 0; j < grid_h * grid_w; j++) {
float mask_value = -INFINITY;
if (j >= window_start_index && j < window_end_index) {
mask_value = 0;
}
GGML_ASSERT((i * (grid_h * grid_w) + j) < window_mask_vec.size());
window_mask_vec[i * (grid_h * grid_w) + j] = mask_value;
}
}
window_start_index = window_end_index;
}
auto window_mask = ggml_new_tensor_2d(compute_ctx,
GGML_TYPE_F32,
grid_h * grid_w,
grid_h * grid_w);
runner->set_backend_tensor_data(window_mask, window_mask_vec.data());
pe_vec = Rope::gen_qwen2vl_pe(grid_h,
grid_w,
vision_params.spatial_merge_size,
window_inverse_index_vec,
10000,
{head_dim / 2, head_dim / 2});
int pos_len = static_cast<int>(pe_vec.size() / head_dim / 2);
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, head_dim / 2, pos_len);
runner->set_backend_tensor_data(pe, pe_vec.data());
return vision_model->forward(runner_ctx, pixel_values, pe, window_index, window_inverse_index, window_mask);
}
public:
LLMRunner(LLMArch arch, LLMRunner(LLMArch arch,
ggml_backend_t backend, ggml_backend_t backend,
bool offload_params_to_cpu, bool offload_params_to_cpu,
@ -740,8 +1078,9 @@ namespace LLM {
ggml_tensor* input_pos, ggml_tensor* input_pos,
ggml_tensor* window_index, ggml_tensor* window_index,
ggml_tensor* window_inverse_index, ggml_tensor* window_inverse_index,
ggml_tensor* window_mask,
ggml_tensor* pos_embeds = nullptr) {
auto hidden_states = model.vision_forward(ctx, pixel_values, input_pos, window_index, window_inverse_index, window_mask, pos_embeds);
return hidden_states; return hidden_states;
} }
@ -827,30 +1166,36 @@ namespace LLM {
} }
ggml_tensor* process_image(ggml_context* ctx, ggml_tensor* image) {
    return process_image_common(ctx, image, params.vision);
}

ggml_tensor* build_patch_pos_embeds(GGMLRunnerContext* runner_ctx,
                                    std::shared_ptr<VisionModel> vision,
                                    int grid_h,
                                    int grid_w) {
    return build_patch_pos_embeds_common(this,
                                         compute_ctx,
                                         runner_ctx,
                                         vision,
                                         grid_h,
                                         grid_w,
                                         pos_embed_idx_data_,
                                         pos_embed_weight_data_);
}

ggml_tensor* encode_image(GGMLRunnerContext* runner_ctx, ggml_tensor* image) {
    return encode_image_common(this,
                               compute_ctx,
                               runner_ctx,
                               image,
                               params.vision,
                               model.vision_model(),
                               window_index_vec,
                               window_inverse_index_vec,
                               window_mask_vec,
                               pe_vec,
                               pos_embed_idx_data_,
                               pos_embed_weight_data_);
}
ggml_cgraph* build_encode_image_graph(const sd::Tensor<float>& image_tensor) { ggml_cgraph* build_encode_image_graph(const sd::Tensor<float>& image_tensor) {
@ -860,116 +1205,8 @@ namespace LLM {
GGML_ASSERT(image->ne[1] % (params.vision.patch_size * params.vision.spatial_merge_size) == 0); GGML_ASSERT(image->ne[1] % (params.vision.patch_size * params.vision.spatial_merge_size) == 0);
GGML_ASSERT(image->ne[0] % (params.vision.patch_size * params.vision.spatial_merge_size) == 0); GGML_ASSERT(image->ne[0] % (params.vision.patch_size * params.vision.spatial_merge_size) == 0);
int grid_t = 1;
int grid_h = static_cast<int>(image->ne[1]) / params.vision.patch_size;
int grid_w = static_cast<int>(image->ne[0]) / params.vision.patch_size;
int llm_grid_h = grid_h / params.vision.spatial_merge_size;
int llm_grid_w = grid_w / params.vision.spatial_merge_size;
int vit_merger_window_size = params.vision.window_size / params.vision.patch_size / params.vision.spatial_merge_size;
auto pixel_values = process_image(compute_ctx, image);
// window index
int inverse_index = 0;
window_index_vec.resize(llm_grid_h * llm_grid_w);
window_inverse_index_vec.resize(llm_grid_h * llm_grid_w);
std::vector<int> seqlens;
for (int ih = 0; ih < llm_grid_h; ih += vit_merger_window_size) {
for (int iw = 0; iw < llm_grid_w; iw += vit_merger_window_size) {
int win_h = std::min(vit_merger_window_size, llm_grid_h - ih);
int win_w = std::min(vit_merger_window_size, llm_grid_w - iw);
for (int iy = 0; iy < win_h; iy++) {
for (int ix = 0; ix < win_w; ix++) {
int index = (ih + iy) * llm_grid_w + iw + ix;
window_index_vec[inverse_index] = index;
window_inverse_index_vec[index] = inverse_index;
inverse_index++;
}
}
seqlens.push_back(win_h * win_w * params.vision.spatial_merge_size * params.vision.spatial_merge_size);
}
}
// printf("window_index: ");
// for (int i : window_index_vec) {
// printf("%d ", i);
// }
// printf("\n");
// printf("window_inverse_index: ");
// for (int i : window_inverse_index_vec) {
// printf("%d ", i);
// }
// printf("\n");
// printf("seqlens: ");
// for (int i : seqlens) {
// printf("%d ", i);
// }
// printf("\n");
auto window_index = ggml_new_tensor_1d(compute_ctx,
GGML_TYPE_I32,
llm_grid_h * llm_grid_w);
auto window_inverse_index = ggml_new_tensor_1d(compute_ctx,
GGML_TYPE_I32,
llm_grid_h * llm_grid_w);
set_backend_tensor_data(window_index, window_index_vec.data());
set_backend_tensor_data(window_inverse_index, window_inverse_index_vec.data());
// window mask
int seq_window_size = (vit_merger_window_size * params.vision.spatial_merge_size) * (vit_merger_window_size * params.vision.spatial_merge_size);
window_mask_vec.resize((grid_h * grid_w) * (grid_h * grid_w));
int window_start_index = 0;
for (int seq_index = 0; seq_index < seqlens.size(); seq_index++) {
int window_end_index = window_start_index + seqlens[seq_index];
// LOG_DEBUG("%d %d", window_start_index, window_end_index);
GGML_ASSERT(window_end_index <= grid_h * grid_w);
for (int i = window_start_index; i < window_end_index; i++) {
for (int j = 0; j < grid_h * grid_w; j++) {
float mask_value = -INFINITY;
if (j >= window_start_index && j < window_end_index) {
mask_value = 0;
}
GGML_ASSERT((i * (grid_h * grid_w) + j) < window_mask_vec.size());
window_mask_vec[i * (grid_h * grid_w) + j] = mask_value;
}
}
window_start_index = window_end_index;
// printf("\n");
}
// printf("window_mask: \n");
// for (int i = 0; i < grid_h*grid_w; i++) {
// for (int j = 0; j < grid_h*grid_w; j++) {
// printf("%f ", window_mask_vec[i * (grid_h * grid_w) + j]);
// }
// printf("\n");
// }
auto window_mask = ggml_new_tensor_2d(compute_ctx,
GGML_TYPE_F32,
grid_h * grid_w,
grid_h * grid_w);
set_backend_tensor_data(window_mask, window_mask_vec.data());
// pe
int head_dim = static_cast<int>(params.vision.hidden_size / params.vision.num_heads);
pe_vec = Rope::gen_qwen2vl_pe(grid_h,
grid_w,
params.vision.spatial_merge_size,
window_inverse_index_vec,
10000,
{head_dim / 2, head_dim / 2});
int pos_len = static_cast<int>(pe_vec.size() / head_dim / 2);
// LOG_DEBUG("pos_len %d", pos_len);
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, head_dim / 2, pos_len);
// pe->data = pe_vec.data();
// print_ggml_tensor(pe);
// pe->data = nullptr;
set_backend_tensor_data(pe, pe_vec.data());
auto runner_ctx = get_context();
ggml_tensor* hidden_states = encode_image(&runner_ctx, image);
ggml_build_forward_expand(gf, hidden_states); ggml_build_forward_expand(gf, hidden_states);
return gf; return gf;

View File

@ -437,6 +437,10 @@ SDVersion ModelLoader::get_sd_version() {
if (tensor_storage.name.find("model.diffusion_model.joint_blocks.") != std::string::npos) { if (tensor_storage.name.find("model.diffusion_model.joint_blocks.") != std::string::npos) {
return VERSION_SD3; return VERSION_SD3;
} }
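// HiDream O1 checkpoints bundle the DiT (model.x_embedder.*) with a language
// model (model.language_model.*), so require both markers.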
if (tensor_storage.name.find("model.x_embedder.proj1.weight") != std::string::npos &&
tensor_storage_map.find("model.language_model.layers.0.self_attn.q_proj.weight") != tensor_storage_map.end()) {
return VERSION_HIDREAM_O1;
}
if (tensor_storage.name.find("model.diffusion_model.transformer_blocks.0.img_mod.1.weight") != std::string::npos) { if (tensor_storage.name.find("model.diffusion_model.transformer_blocks.0.img_mod.1.weight") != std::string::npos) {
return VERSION_QWEN_IMAGE; return VERSION_QWEN_IMAGE;
} }

View File

@ -42,6 +42,7 @@ enum SDVersion {
VERSION_ANIMA, VERSION_ANIMA,
VERSION_FLUX2, VERSION_FLUX2,
VERSION_FLUX2_KLEIN, VERSION_FLUX2_KLEIN,
VERSION_HIDREAM_O1,
VERSION_Z_IMAGE, VERSION_Z_IMAGE,
VERSION_OVIS_IMAGE, VERSION_OVIS_IMAGE,
VERSION_ERNIE_IMAGE, VERSION_ERNIE_IMAGE,
@ -163,6 +164,7 @@ static inline bool sd_version_is_dit(SDVersion version) {
sd_version_is_sd3(version) || sd_version_is_sd3(version) ||
sd_version_is_wan(version) || sd_version_is_wan(version) ||
sd_version_is_qwen_image(version) || sd_version_is_qwen_image(version) ||
version == VERSION_HIDREAM_O1 ||
sd_version_is_anima(version) || sd_version_is_anima(version) ||
sd_version_is_z_image(version) || sd_version_is_z_image(version) ||
sd_version_is_ernie_image(version)) { sd_version_is_ernie_image(version)) {

View File

@ -52,6 +52,7 @@ const char* model_version_to_str[] = {
"Anima", "Anima",
"Flux.2", "Flux.2",
"Flux.2 klein", "Flux.2 klein",
"HiDream O1",
"Z-Image", "Z-Image",
"Ovis Image", "Ovis Image",
"Ernie Image", "Ernie Image",
@ -538,6 +539,14 @@ public:
"model.diffusion_model", "model.diffusion_model",
version, version,
sd_ctx_params->qwen_image_zero_cond_t); sd_ctx_params->qwen_image_zero_cond_t);
} else if (version == VERSION_HIDREAM_O1) {
cond_stage_model = std::make_shared<HiDreamO1::HiDreamO1Conditioner>(clip_backend,
offload_params_to_cpu,
tensor_storage_map);
diffusion_model = std::make_shared<HiDreamO1Model>(backend,
offload_params_to_cpu,
tensor_storage_map,
"model");
} else if (sd_version_is_anima(version)) { } else if (sd_version_is_anima(version)) {
cond_stage_model = std::make_shared<AnimaConditioner>(clip_backend, cond_stage_model = std::make_shared<AnimaConditioner>(clip_backend,
offload_params_to_cpu, offload_params_to_cpu,
@ -671,7 +680,7 @@ public:
bool force_vae_cpu = sd_ctx_params->keep_vae_on_cpu; bool force_vae_cpu = sd_ctx_params->keep_vae_on_cpu;
if (version == VERSION_CHROMA_RADIANCE || version == VERSION_HIDREAM_O1) { LOG_INFO("using FakeVAE");
LOG_INFO("using FakeVAE"); LOG_INFO("using FakeVAE");
first_stage_model = std::make_shared<FakeVAE>(version, first_stage_model = std::make_shared<FakeVAE>(version,
vae_backend, vae_backend,
@ -835,6 +844,10 @@ public:
ignore_tensors.insert("text_encoders.llm.vision_tower."); ignore_tensors.insert("text_encoders.llm.vision_tower.");
ignore_tensors.insert("text_encoders.llm.multi_modal_projector."); ignore_tensors.insert("text_encoders.llm.multi_modal_projector.");
} }
if (version == VERSION_HIDREAM_O1) {
ignore_tensors.insert("lm_head.");
ignore_tensors.insert("model.visual.deepstack_merger_list.");
}
if (enable_mmap_tensors) { if (enable_mmap_tensors) {
if (mmap_able_tensors.empty()) { if (mmap_able_tensors.empty()) {
@ -972,6 +985,7 @@ public:
} else if (sd_version_is_sd3(version) || } else if (sd_version_is_sd3(version) ||
sd_version_is_wan(version) || sd_version_is_wan(version) ||
sd_version_is_qwen_image(version) || sd_version_is_qwen_image(version) ||
version == VERSION_HIDREAM_O1 ||
sd_version_is_anima(version) || sd_version_is_anima(version) ||
sd_version_is_ernie_image(version) || sd_version_is_ernie_image(version) ||
sd_version_is_z_image(version)) { sd_version_is_z_image(version)) {
@ -1569,6 +1583,9 @@ public:
if (sd_version_is_anima(version)) { if (sd_version_is_anima(version)) {
return std::vector<float>{t / static_cast<float>(TIMESTEPS)}; return std::vector<float>{t / static_cast<float>(TIMESTEPS)};
} }
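// HiDream O1 takes flow time in [0, 1] running in the opposite direction to
// the scheduler's t: t == TIMESTEPS maps to 0.0 and t == 0 maps to 1.0.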
if (version == VERSION_HIDREAM_O1) {
return std::vector<float>{1.0f - (t / static_cast<float>(TIMESTEPS))};
}
if (sd_version_is_z_image(version)) { if (sd_version_is_z_image(version)) {
return std::vector<float>{1000.f - t}; return std::vector<float>{1000.f - t};
} }
@ -1657,6 +1674,7 @@ public:
int shifted_timestep, int shifted_timestep,
sample_method_t method, sample_method_t method,
bool is_flow_denoiser, bool is_flow_denoiser,
const char* extra_sample_args,
const std::vector<float>& sigmas, const std::vector<float>& sigmas,
int start_merge_step, int start_merge_step,
const std::vector<sd::Tensor<float>>& ref_latents, const std::vector<sd::Tensor<float>>& ref_latents,
@ -1683,13 +1701,17 @@ public:
} }
} }
size_t steps = sigmas.size() - 1; size_t steps = sigmas.size() - 1;
bool has_skiplayer = slg_scale != 0.0f && !skip_layers.empty(); bool has_skiplayer = slg_scale != 0.0f && !skip_layers.empty();
if (has_skiplayer && !sd_version_is_dit(version)) { if (has_skiplayer && !sd_version_is_dit(version)) {
has_skiplayer = false; has_skiplayer = false;
LOG_WARN("SLG is incompatible with this model type"); LOG_WARN("SLG is incompatible with this model type");
} }
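// For HiDream O1, eta is repurposed as a scale on the initial noise; see
// resolve_eta() below, which defaults it to 8.0 for this model.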
if (version == VERSION_HIDREAM_O1 && !noise.empty()) {
noise *= eta;
}
int64_t t0 = ggml_time_us(); int64_t t0 = ggml_time_us();
sd::Tensor<float> x_t = !noise.empty() sd::Tensor<float> x_t = !noise.empty()
? denoiser->noise_scaling(sigmas[0], noise, init_latent) ? denoiser->noise_scaling(sigmas[0], noise, init_latent)
@ -1764,12 +1786,18 @@ public:
auto run_condition = [&](const SDCondition& condition, auto run_condition = [&](const SDCondition& condition,
const sd::Tensor<float>* c_concat_override = nullptr, const sd::Tensor<float>* c_concat_override = nullptr,
const std::vector<int>* local_skip_layers = nullptr) -> sd::Tensor<float> { const std::vector<int>* local_skip_layers = nullptr) -> sd::Tensor<float> {
diffusion_params.context = condition.c_crossattn.empty() ? nullptr : &condition.c_crossattn; diffusion_params.context = condition.c_crossattn.empty() ? nullptr : &condition.c_crossattn;
diffusion_params.c_concat = c_concat_override != nullptr ? c_concat_override : (condition.c_concat.empty() ? nullptr : &condition.c_concat); diffusion_params.c_concat = c_concat_override != nullptr ? c_concat_override : (condition.c_concat.empty() ? nullptr : &condition.c_concat);
diffusion_params.y = condition.c_vector.empty() ? nullptr : &condition.c_vector; diffusion_params.y = condition.c_vector.empty() ? nullptr : &condition.c_vector;
diffusion_params.t5_ids = condition.c_t5_ids.empty() ? nullptr : &condition.c_t5_ids; diffusion_params.t5_ids = condition.c_t5_ids.empty() ? nullptr : &condition.c_t5_ids;
diffusion_params.t5_weights = condition.c_t5_weights.empty() ? nullptr : &condition.c_t5_weights; diffusion_params.t5_weights = condition.c_t5_weights.empty() ? nullptr : &condition.c_t5_weights;
diffusion_params.input_ids = condition.c_input_ids.empty() ? nullptr : &condition.c_input_ids;
diffusion_params.input_pos = condition.c_position_ids.empty() ? nullptr : &condition.c_position_ids;
diffusion_params.token_types = condition.c_token_types.empty() ? nullptr : &condition.c_token_types;
diffusion_params.vinput_mask = condition.c_vinput_mask.empty() ? nullptr : &condition.c_vinput_mask;
diffusion_params.image_embeds = condition.c_image_embeds.empty() ? nullptr : &condition.c_image_embeds;
diffusion_params.ref_latents = condition.c_ref_images.empty() ? &ref_latents : &condition.c_ref_images;
diffusion_params.skip_layers = local_skip_layers;
sd::Tensor<float> cached_output; sd::Tensor<float> cached_output;
if (step_cache.before_condition(&condition, noised_input, &cached_output)) { if (step_cache.before_condition(&condition, noised_input, &cached_output)) {
@ -1855,7 +1883,7 @@ public:
denoised = latent_result * c_out + x * c_skip; denoised = latent_result * c_out + x * c_skip;
if (out_uncond_denoised != nullptr) { if (out_uncond_denoised != nullptr) {
sd::Tensor<float> base_uncond = !uncond_out.empty() ? uncond_out : cond_out; sd::Tensor<float> base_uncond = !uncond_out.empty() ? uncond_out : cond_out;
*out_uncond_denoised = base_uncond * c_out + x * c_skip; *out_uncond_denoised = base_uncond * c_out + x * c_skip;
} }
if (cache_runtime.spectrum_enabled) { if (cache_runtime.spectrum_enabled) {
cache_runtime.spectrum.update(denoised); cache_runtime.spectrum.update(denoised);
@ -1870,7 +1898,7 @@ public:
return denoised; return denoised;
}; };
auto x0_opt = sample_k_diffusion(method, denoise, x_t, sigmas, sampler_rng, eta, is_flow_denoiser, extra_sample_args);
if (x0_opt.empty()) { if (x0_opt.empty()) {
LOG_ERROR("Diffusion model sampling failed"); LOG_ERROR("Diffusion model sampling failed");
if (control_net) { if (control_net) {
@ -1920,6 +1948,8 @@ public:
if (sd_version_is_dit(version)) { if (sd_version_is_dit(version)) {
if (version == VERSION_WAN2_2_TI2V) { if (version == VERSION_WAN2_2_TI2V) {
latent_channel = 48; latent_channel = 48;
} else if (version == VERSION_HIDREAM_O1) {
latent_channel = 3;
} else if (version == VERSION_CHROMA_RADIANCE) { } else if (version == VERSION_CHROMA_RADIANCE) {
latent_channel = 3; latent_channel = 3;
} else if (sd_version_uses_flux2_vae(version)) { } else if (sd_version_uses_flux2_vae(version)) {
@ -2361,6 +2391,7 @@ void sd_sample_params_init(sd_sample_params_t* sample_params) {
sample_params->custom_sigmas = nullptr; sample_params->custom_sigmas = nullptr;
sample_params->custom_sigmas_count = 0; sample_params->custom_sigmas_count = 0;
sample_params->flow_shift = INFINITY; sample_params->flow_shift = INFINITY;
sample_params->extra_sample_args = nullptr;
} }
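The new extra_sample_args field travels through the API as a raw C string and is handed to sample_k_diffusion() unparsed in the code shown here. As a rough sketch only, assuming a comma-separated key=value format (the helper name and parsing rules below are illustrative, not the project's actual implementation):

#include <cstdio>
#include <map>
#include <sstream>
#include <string>

// Hypothetical helper: split "a=1,b=2" into {"a": "1", "b": "2"}.
static std::map<std::string, std::string> parse_extra_sample_args(const char* args) {
    std::map<std::string, std::string> kv;
    if (args == nullptr) {
        return kv;
    }
    std::stringstream ss(args);
    std::string item;
    while (std::getline(ss, item, ',')) {
        size_t eq = item.find('=');
        if (eq != std::string::npos) {
            kv[item.substr(0, eq)] = item.substr(eq + 1);
        }
    }
    return kv;
}

int main() {
    for (const auto& [k, v] : parse_extra_sample_args("foo=1.5,bar=0")) {
        std::printf("%s -> %s\n", k.c_str(), v.c_str());
    }
    return 0;
}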
char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) { char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) {
@ -2382,7 +2413,8 @@ char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) {
"sample_steps: %d, " "sample_steps: %d, "
"eta: %.2f, " "eta: %.2f, "
"shifted_timestep: %d, " "shifted_timestep: %d, "
"flow_shift: %.2f)", "flow_shift: %.2f, "
"extra_sample_args: %s)",
sample_params->guidance.txt_cfg, sample_params->guidance.txt_cfg,
std::isfinite(sample_params->guidance.img_cfg) std::isfinite(sample_params->guidance.img_cfg)
? sample_params->guidance.img_cfg ? sample_params->guidance.img_cfg
@ -2397,7 +2429,8 @@ char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) {
sample_params->sample_steps, sample_params->sample_steps,
sample_params->eta, sample_params->eta,
sample_params->shifted_timestep, sample_params->shifted_timestep,
sample_params->flow_shift); sample_params->flow_shift,
SAFE_STR(sample_params->extra_sample_args));
return buf; return buf;
} }
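`extra_sample_args` travels through the API as a raw, nullable C string of key=value pairs (hence the `SAFE_STR` guard when printing). The parser is not shown in this diff; below is a minimal, self-contained sketch of how such a string could be decoded, assuming comma-separated pairs with float values:

```cpp
#include <map>
#include <sstream>
#include <string>

// Sketch only, not the repo's parser: decode "a=1,b=2.5" into a map.
// The comma separator and float-valued entries are assumptions.
static std::map<std::string, float> parse_extra_sample_args(const char* args) {
    std::map<std::string, float> out;
    if (args == nullptr) {
        return out;  // field defaults to nullptr (see sd_sample_params_init)
    }
    std::stringstream ss(args);
    std::string pair;
    while (std::getline(ss, pair, ',')) {
        size_t eq = pair.find('=');
        if (eq == std::string::npos || eq == 0) {
            continue;  // skip malformed entries
        }
        out[pair.substr(0, eq)] = std::stof(pair.substr(eq + 1));
    }
    return out;
}
```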
@@ -2609,6 +2642,9 @@ static float resolve_eta(sd_ctx_t* sd_ctx,
                          float eta,
                          enum sample_method_t sample_method) {
     if (eta == INFINITY) {
+        if (sd_ctx->sd->version == VERSION_HIDREAM_O1) {
+            return 8.f;
+        }
         switch (sample_method) {
             case DDIM_TRAILING_SAMPLE_METHOD:
             case TCD_SAMPLE_METHOD:
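`INFINITY` acts as the "unset" sentinel for eta, and the new early return gives HiDream O1 a model-level default of 8.0 before any per-sampler default applies. A compact restatement of that resolution order (only the 8.f HiDream-O1 default is taken from this diff; the fallback value is a placeholder):

```cpp
#include <cmath>  // INFINITY

// Sketch of the resolution order implied above.
float resolve_eta_sketch(float eta, bool is_hidream_o1) {
    if (eta != INFINITY) return eta;  // caller set eta explicitly
    if (is_hidream_o1) return 8.f;    // model-specific default added by this commit
    return 0.f;                       // per-sampler defaults elided (placeholder)
}
```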
@@ -2828,6 +2864,8 @@ struct GenerationRequest {
 struct SamplePlan {
     enum sample_method_t sample_method = SAMPLE_METHOD_COUNT;
     enum sample_method_t high_noise_sample_method = SAMPLE_METHOD_COUNT;
+    const char* extra_sample_args = nullptr;
+    const char* high_noise_extra_sample_args = nullptr;
     float eta = 0.f;
     float high_noise_eta = 0.f;
     int sample_steps = 0;
@@ -2840,22 +2878,25 @@ struct SamplePlan {
     SamplePlan(sd_ctx_t* sd_ctx,
               const sd_img_gen_params_t* sd_img_gen_params,
               const GenerationRequest& request) {
         sample_method = sd_img_gen_params->sample_params.sample_method;
-        eta = sd_img_gen_params->sample_params.eta;
-        sample_steps = sd_img_gen_params->sample_params.sample_steps;
+        extra_sample_args = sd_img_gen_params->sample_params.extra_sample_args;
+        eta = sd_img_gen_params->sample_params.eta;
+        sample_steps = sd_img_gen_params->sample_params.sample_steps;
         resolve(sd_ctx, &request, &sd_img_gen_params->sample_params);
     }

     SamplePlan(sd_ctx_t* sd_ctx,
               const sd_vid_gen_params_t* sd_vid_gen_params,
               const GenerationRequest& request) {
         sample_method = sd_vid_gen_params->sample_params.sample_method;
-        eta = sd_vid_gen_params->sample_params.eta;
-        sample_steps = sd_vid_gen_params->sample_params.sample_steps;
+        extra_sample_args = sd_vid_gen_params->sample_params.extra_sample_args;
+        eta = sd_vid_gen_params->sample_params.eta;
+        sample_steps = sd_vid_gen_params->sample_params.sample_steps;
         if (sd_ctx->sd->high_noise_diffusion_model) {
             high_noise_sample_steps = sd_vid_gen_params->high_noise_sample_params.sample_steps;
             high_noise_sample_method = sd_vid_gen_params->high_noise_sample_params.sample_method;
-            high_noise_eta = sd_vid_gen_params->high_noise_sample_params.eta;
+            high_noise_extra_sample_args = sd_vid_gen_params->high_noise_sample_params.extra_sample_args;
+            high_noise_eta = sd_vid_gen_params->high_noise_sample_params.eta;
         }
         moe_boundary = sd_vid_gen_params->moe_boundary;
         resolve(sd_ctx, &request, &sd_vid_gen_params->sample_params);
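From the caller's perspective the new field simply rides along in `sd_sample_params_t` and is picked up by `SamplePlan` above. A hedged usage sketch (the struct and init function are from this diff; the key name is purely hypothetical and the generation boilerplate is elided):

```cpp
sd_img_gen_params_t params = {};                 // remaining fields set elsewhere
sd_sample_params_init(&params.sample_params);    // leaves extra_sample_args == nullptr
params.sample_params.extra_sample_args = "some_key=0.5";  // hypothetical key=value pair
// generate_image(sd_ctx, &params) then threads the string through SamplePlan
// into sample_k_diffusion unchanged.
```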
@@ -3101,6 +3142,9 @@ static std::optional<ImageGenerationLatents> prepare_image_generation_latents(sd
     std::vector<sd::Tensor<float>> ref_latents;
     for (size_t i = 0; i < ref_images.size(); i++) {
+        if (sd_ctx->sd->version == VERSION_HIDREAM_O1) {
+            continue;
+        }
         sd::Tensor<float> ref_latent;
         if (request->auto_resize_ref_image) {
             LOG_DEBUG("auto resize ref images");
@@ -3511,6 +3555,7 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
                               request.shifted_timestep,
                               plan.sample_method,
                               sd_ctx->sd->is_flow_denoiser(),
+                              plan.extra_sample_args,
                               plan.sigmas,
                               plan.start_merge_step,
                               latents.ref_latents,
@@ -3636,6 +3681,7 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
                               request.shifted_timestep,
                               plan.sample_method,
                               sd_ctx->sd->is_flow_denoiser(),
+                              plan.extra_sample_args,
                               hires_sigma_sched,
                               plan.start_merge_step,
                               latents.ref_latents,
@@ -4000,6 +4046,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
                               request.shifted_timestep,
                               plan.high_noise_sample_method,
                               sd_ctx->sd->is_flow_denoiser(),
+                              plan.high_noise_extra_sample_args,
                               high_noise_sigmas,
                               -1,
                               std::vector<sd::Tensor<float>>{},
@@ -4042,6 +4089,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
                               sd_vid_gen_params->sample_params.shifted_timestep,
                               plan.sample_method,
                               sd_ctx->sd->is_flow_denoiser(),
+                              plan.extra_sample_args,
                               plan.sigmas,
                               -1,
                               std::vector<sd::Tensor<float>>{},

View File

@@ -81,6 +81,11 @@ Qwen2Tokenizer::Qwen2Tokenizer(const std::string& merges_utf8_str) {
         "</tool_response>",
         "<think>",
         "</think>",
+        "<|boi_token|>",
+        "<|bor_token|>",
+        "<|eor_token|>",
+        "<|bot_token|>",
+        "<|tms_token|>",
     };
     if (merges_utf8_str.size() > 0) {
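The five new `<|...|>` entries extend the tokenizer's fixed special-token list. Special tokens are conventionally matched atomically before BPE merges run so they can never be split apart; the sketch below illustrates that pre-splitting idea and is not the repo's implementation:

```cpp
#include <string>
#include <vector>

// Illustrative only: cut `text` around the first occurrence of one special
// token so it survives BPE intact. Real tokenizers scan for every special
// token and recurse over the remainder.
static std::vector<std::string> split_on_special(const std::string& text,
                                                 const std::string& special) {
    std::vector<std::string> parts;
    size_t pos = text.find(special);
    if (pos == std::string::npos) {
        parts.push_back(text);
        return parts;
    }
    if (pos > 0) parts.push_back(text.substr(0, pos));  // plain text before the token
    parts.push_back(special);                           // kept as one atomic piece
    std::string rest = text.substr(pos + special.size());
    if (!rest.empty()) parts.push_back(rest);           // would be processed recursively
    return parts;
}
```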

View File

@@ -71,7 +71,7 @@ public:
             scale_factor = 16;
         } else if (sd_version_uses_flux2_vae(version)) {
             scale_factor = 16;
-        } else if (version == VERSION_CHROMA_RADIANCE) {
+        } else if (version == VERSION_CHROMA_RADIANCE || version == VERSION_HIDREAM_O1) {
             scale_factor = 1;
         }
         return scale_factor;
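`scale_factor` is the spatial ratio between pixel space and the model's working space, so extending the Chroma Radiance branch puts HiDream O1 at 1:1 with the image. A quick worked example of what that means for buffer shapes:

```cpp
// For a 1024x1024 request:
//   scale_factor 16 (flux2-style VAE)             -> 64x64 working resolution
//   scale_factor 1  (HiDream O1, Chroma Radiance) -> 1024x1024, i.e. raw pixels
int W = 1024, H = 1024;
int scale_factor = 1;            // HiDream O1 / Chroma Radiance
int work_w = W / scale_factor;   // 1024
int work_h = H / scale_factor;   // 1024
```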