diff --git a/README.md b/README.md index 80d98c30..33c272e9 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,7 @@ API and command-line option may change frequently.*** - [Ovis-Image](./docs/ovis_image.md) - [Anima](./docs/anima.md) - [ERNIE-Image](./docs/ernie_image.md) + - [HiDream-O1-Image](./docs/hidream_o1_image.md) - Image Edit Models - [FLUX.1-Kontext-dev](./docs/kontext.md) - [Qwen Image Edit series](./docs/qwen_image_edit.md) @@ -148,6 +149,7 @@ If you want to improve performance or reduce VRAM/RAM usage, please refer to [pe - [Ovis-Image](./docs/ovis_image.md) - [Anima](./docs/anima.md) - [ERNIE-Image](./docs/ernie_image.md) +- [HiDream-O1-Image](./docs/hidream_o1_image.md) - [LoRA](./docs/lora.md) - [LCM/LCM-LoRA](./docs/lcm.md) - [Using PhotoMaker to personalize image generation](./docs/photo_maker.md) diff --git a/assets/hidream-o1/dev_example.png b/assets/hidream-o1/dev_example.png new file mode 100644 index 00000000..e7ab12bb Binary files /dev/null and b/assets/hidream-o1/dev_example.png differ diff --git a/docs/hidream_o1_image.md b/docs/hidream_o1_image.md new file mode 100644 index 00000000..771d4f29 --- /dev/null +++ b/docs/hidream_o1_image.md @@ -0,0 +1,20 @@ +# How to Use + +## Download weights + +- Download HiDream-O1-Image-Dev + - safetensors: https://huggingface.co/Comfy-Org/HiDream-O1-Image/tree/main/checkpoints +- Download HiDream-O1-Image + - safetensors: https://huggingface.co/Comfy-Org/HiDream-O1-Image/tree/main/checkpoints + +## Examples + +### HiDream-O1-Image-Dev + +``` +.\bin\Release\sd-cli.exe -m ..\..\ComfyUI\models\diffusion_models\hidream_o1_image_dev_bf16.safetensors -p "a lovely cat holding a sign says +'hidream o1 cpp'" --cfg-scale 1.0 -v -H 1024 -W 1024 +``` + +HiDream-O1-Image-Dev example + diff --git a/examples/cli/README.md b/examples/cli/README.md index 5fbeec39..b5475794 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -103,6 +103,8 @@ Generation Options: --hires-upscaler highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent (nearest-exact), Latent (antialiased), Latent (bicubic), Latent (bicubic antialiased), or a model name under --hires-upscalers-dir (default: Latent) + --extra-sample-args extra sampler args, key=value list. Currently lcm supports noise_clip_std, + noise_scale_start, noise_scale_end -H, --height image height, in pixel space (default: 512) -W, --width image width, in pixel space (default: 512) --steps number of sample steps (default: 20) diff --git a/examples/common/common.cpp b/examples/common/common.cpp index 8ca7a2dc..28deecfa 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -807,6 +807,10 @@ ArgOptions SDGenerationParams::get_options() { "Latent (antialiased), Latent (bicubic), Latent (bicubic antialiased), or a model name " "under --hires-upscalers-dir (default: Latent)", &hires_upscaler}, + {"", + "--extra-sample-args", + "extra sampler args, key=value list. Currently lcm supports noise_clip_std, noise_scale_start, noise_scale_end", + &extra_sample_args}, }; options.int_options = { @@ -1607,6 +1611,7 @@ bool SDGenerationParams::from_json_str( auto parse_sample_params_json = [&](const json& sample_json, sd_sample_params_t& target_params, + std::string& target_extra_sample_args, std::vector& target_skip_layers, std::vector* target_custom_sigmas) { if (sample_json.contains("sample_steps") && sample_json["sample_steps"].is_number_integer()) { @@ -1621,6 +1626,9 @@ bool SDGenerationParams::from_json_str( if (sample_json.contains("flow_shift") && sample_json["flow_shift"].is_number()) { target_params.flow_shift = sample_json["flow_shift"]; } + if (sample_json.contains("extra_sample_args") && sample_json["extra_sample_args"].is_string()) { + target_extra_sample_args = sample_json["extra_sample_args"].get(); + } if (target_custom_sigmas != nullptr && sample_json.contains("custom_sigmas") && sample_json["custom_sigmas"].is_array()) { @@ -1668,11 +1676,12 @@ bool SDGenerationParams::from_json_str( }; if (j.contains("sample_params") && j["sample_params"].is_object()) { - parse_sample_params_json(j["sample_params"], sample_params, skip_layers, &custom_sigmas); + parse_sample_params_json(j["sample_params"], sample_params, extra_sample_args, skip_layers, &custom_sigmas); } if (j.contains("high_noise_sample_params") && j["high_noise_sample_params"].is_object()) { parse_sample_params_json(j["high_noise_sample_params"], high_noise_sample_params, + high_noise_extra_sample_args, high_noise_skip_layers, nullptr); } @@ -2099,6 +2108,8 @@ sd_img_gen_params_t SDGenerationParams::to_sd_img_gen_params_t() { high_noise_sample_params.guidance.slg.layer_count = high_noise_skip_layers.size(); sample_params.custom_sigmas = custom_sigmas.empty() ? nullptr : custom_sigmas.data(); sample_params.custom_sigmas_count = static_cast(custom_sigmas.size()); + sample_params.extra_sample_args = extra_sample_args.empty() ? nullptr : extra_sample_args.c_str(); + high_noise_sample_params.extra_sample_args = high_noise_extra_sample_args.empty() ? nullptr : high_noise_extra_sample_args.c_str(); cache_params.scm_mask = scm_mask.empty() ? nullptr : scm_mask.c_str(); sd_pm_params_t pm_params = { @@ -2168,6 +2179,8 @@ sd_vid_gen_params_t SDGenerationParams::to_sd_vid_gen_params_t() { high_noise_sample_params.guidance.slg.layer_count = high_noise_skip_layers.size(); sample_params.custom_sigmas = custom_sigmas.empty() ? nullptr : custom_sigmas.data(); sample_params.custom_sigmas_count = static_cast(custom_sigmas.size()); + sample_params.extra_sample_args = extra_sample_args.empty() ? nullptr : extra_sample_args.c_str(); + high_noise_sample_params.extra_sample_args = high_noise_extra_sample_args.empty() ? nullptr : high_noise_extra_sample_args.c_str(); cache_params.scm_mask = scm_mask.empty() ? nullptr : scm_mask.c_str(); params.loras = lora_vec.empty() ? nullptr : lora_vec.data(); @@ -2306,6 +2319,7 @@ static json build_sampling_metadata_json(const sd_sample_params_t& sample_params {"eta", sample_params.eta}, {"shifted_timestep", sample_params.shifted_timestep}, {"flow_shift", sample_params.flow_shift}, + {"extra_sample_args", safe_json_string(sample_params.extra_sample_args)}, {"guidance", { {"txt_cfg", sample_params.guidance.txt_cfg}, @@ -2497,6 +2511,9 @@ std::string get_image_params(const SDContextParams& ctx_params, } parameter_string += "Guidance: " + std::to_string(gen_params.sample_params.guidance.distilled_guidance) + ", "; parameter_string += "Eta: " + std::to_string(gen_params.sample_params.eta) + ", "; + if (!gen_params.extra_sample_args.empty()) { + parameter_string += "Extra sample args: " + gen_params.extra_sample_args + ", "; + } parameter_string += "Seed: " + std::to_string(seed) + ", "; parameter_string += "Size: " + std::to_string(gen_params.get_resolved_width()) + "x" + std::to_string(gen_params.get_resolved_height()) + ", "; parameter_string += "Model: " + sd_basename(ctx_params.model_path) + ", "; diff --git a/examples/common/common.h b/examples/common/common.h index f87293f3..badaa875 100644 --- a/examples/common/common.h +++ b/examples/common/common.h @@ -168,6 +168,8 @@ struct SDGenerationParams { sd_sample_params_t sample_params; sd_sample_params_t high_noise_sample_params; + std::string extra_sample_args; + std::string high_noise_extra_sample_args; std::vector skip_layers = {7, 8, 9}; std::vector high_noise_skip_layers = {7, 8, 9}; diff --git a/examples/server/README.md b/examples/server/README.md index ead185cf..a2160203 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -205,6 +205,8 @@ Default Generation Options: --hires-upscaler highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent (nearest-exact), Latent (antialiased), Latent (bicubic), Latent (bicubic antialiased), or a model name under --hires-upscalers-dir (default: Latent) + --extra-sample-args extra sampler args, key=value list. Currently lcm supports noise_clip_std, + noise_scale_start, noise_scale_end -H, --height image height, in pixel space (default: 512) -W, --width image width, in pixel space (default: 512) --steps number of sample steps (default: 20) diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index 7f87d669..d906f856 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -240,6 +240,7 @@ typedef struct { float* custom_sigmas; int custom_sigmas_count; float flow_shift; + const char* extra_sample_args; } sd_sample_params_t; typedef struct { diff --git a/src/conditioner.hpp b/src/conditioner.hpp index 4907938b..5050eeff 100644 --- a/src/conditioner.hpp +++ b/src/conditioner.hpp @@ -14,6 +14,12 @@ struct SDCondition { sd::Tensor c_concat; sd::Tensor c_t5_ids; sd::Tensor c_t5_weights; + sd::Tensor c_input_ids; + sd::Tensor c_position_ids; + sd::Tensor c_token_types; + sd::Tensor c_vinput_mask; + std::vector>> c_image_embeds; + std::vector> c_ref_images; std::vector> extra_c_crossattns; @@ -26,10 +32,24 @@ struct SDCondition { bool empty() const { if (!c_crossattn.empty() || !c_vector.empty() || !c_concat.empty() || - !c_t5_ids.empty() || !c_t5_weights.empty()) { + !c_t5_ids.empty() || !c_t5_weights.empty() || + !c_input_ids.empty() || !c_position_ids.empty() || + !c_token_types.empty() || !c_vinput_mask.empty()) { return false; } + for (const auto& image_embed : c_image_embeds) { + if (!image_embed.second.empty()) { + return false; + } + } + + for (const auto& tensor : c_ref_images) { + if (!tensor.empty()) { + return false; + } + } + for (const auto& tensor : extra_c_crossattns) { if (!tensor.empty()) { return false; diff --git a/src/denoiser.hpp b/src/denoiser.hpp index 3742f53b..0b0f8201 100644 --- a/src/denoiser.hpp +++ b/src/denoiser.hpp @@ -2,6 +2,7 @@ #define __DENOISER_HPP__ #include +#include #include #include "ggml_extend.hpp" @@ -1148,7 +1149,80 @@ static sd::Tensor sample_lcm(denoise_cb_t model, sd::Tensor x, const std::vector& sigmas, std::shared_ptr rng, - bool is_flow_denoiser) { + bool is_flow_denoiser, + const char* extra_sample_args = nullptr) { + struct LCMSampleArgs { + float noise_clip_std = 0.0f; + float noise_scale_start = 1.0f; + float noise_scale_end = 1.0f; + }; + + auto trim = [](std::string value) -> std::string { + const char* whitespace = " \t\r\n"; + size_t begin = value.find_first_not_of(whitespace); + if (begin == std::string::npos) { + return ""; + } + size_t end = value.find_last_not_of(whitespace); + return value.substr(begin, end - begin + 1); + }; + + LCMSampleArgs args; + if (extra_sample_args != nullptr && extra_sample_args[0] != '\0') { + std::string raw(extra_sample_args); + size_t start = 0; + bool noise_scale_end_was_set = false; + bool noise_scale_start_was_set = false; + auto parse_arg = [&](const std::string& item) { + std::string token = trim(item); + if (token.empty()) { + return; + } + size_t eq = token.find('='); + if (eq == std::string::npos) { + LOG_WARN("ignoring invalid lcm extra sample arg '%s'", token.c_str()); + return; + } + + std::string key = trim(token.substr(0, eq)); + std::string value = trim(token.substr(eq + 1)); + float parsed = 0.0f; + try { + size_t consumed = 0; + parsed = std::stof(value, &consumed); + if (trim(value.substr(consumed)).size() != 0) { + LOG_WARN("ignoring invalid lcm extra sample arg '%s'", token.c_str()); + return; + } + } catch (const std::exception&) { + LOG_WARN("ignoring invalid lcm extra sample arg '%s'", token.c_str()); + return; + } + + if (key == "noise_clip_std") { + args.noise_clip_std = parsed; + } else if (key == "noise_scale_start") { + args.noise_scale_start = parsed; + noise_scale_start_was_set = true; + } else if (key == "noise_scale_end") { + args.noise_scale_end = parsed; + noise_scale_end_was_set = true; + } else { + LOG_WARN("ignoring unknown lcm extra sample arg '%s'", key.c_str()); + } + }; + + for (size_t pos = 0; pos <= raw.size(); ++pos) { + if (pos == raw.size() || raw[pos] == ',' || raw[pos] == ';') { + parse_arg(raw.substr(start, pos - start)); + start = pos + 1; + } + } + if (noise_scale_start_was_set && !noise_scale_end_was_set) { + args.noise_scale_end = args.noise_scale_start; + } + } + int steps = static_cast(sigmas.size()) - 1; for (int i = 0; i < steps; i++) { auto denoised_opt = model(x, sigmas[i], i + 1, nullptr); @@ -1160,7 +1234,27 @@ static sd::Tensor sample_lcm(denoise_cb_t model, if (is_flow_denoiser) { x *= (1 - sigmas[i + 1]); } - x += sd::Tensor::randn_like(x, rng) * sigmas[i + 1]; + auto noise = sd::Tensor::randn_like(x, rng); + if (args.noise_clip_std > 0.0f && noise.numel() > 0) { + double mean = 0.0; + for (int64_t j = 0; j < noise.numel(); ++j) { + mean += static_cast(noise[j]); + } + mean /= static_cast(noise.numel()); + + double variance = 0.0; + for (int64_t j = 0; j < noise.numel(); ++j) { + double centered = static_cast(noise[j]) - mean; + variance += centered * centered; + } + variance /= static_cast(noise.numel()); + + float clip_val = args.noise_clip_std * static_cast(std::sqrt(variance)); + noise = sd::ops::clamp(noise, -clip_val, clip_val); + } + float t = steps > 1 ? static_cast(i) / static_cast(steps - 1) : 0.0f; + float noise_scale = args.noise_scale_start + (args.noise_scale_end - args.noise_scale_start) * t; + x += noise * (sigmas[i + 1] * noise_scale); } } return x; @@ -1656,15 +1750,15 @@ static sd::Tensor sample_euler_cfg_pp(denoise_cb_t model, for (int i = 0; i < steps; i++) { float sigma = sigmas[i]; sd::Tensor uncond_denoised; - + auto denoised_opt = model(x, sigma, i + 1, &uncond_denoised); if (denoised_opt.empty() || uncond_denoised.empty()) { return {}; } - + sd::Tensor denoised = std::move(denoised_opt); - sd::Tensor d = (x - uncond_denoised) / sigma; - + sd::Tensor d = (x - uncond_denoised) / sigma; + x = denoised + d * sigmas[i + 1]; } return x; @@ -1679,19 +1773,19 @@ static sd::Tensor sample_euler_ancestral_cfg_pp(denoise_cb_t model, for (int i = 0; i < steps; i++) { float sigma = sigmas[i]; sd::Tensor uncond_denoised; - + auto denoised_opt = model(x, sigma, i + 1, &uncond_denoised); if (denoised_opt.empty() || uncond_denoised.empty()) { return {}; } - + sd::Tensor denoised = std::move(denoised_opt); - sd::Tensor d = (x - uncond_denoised) / sigma; - + sd::Tensor d = (x - uncond_denoised) / sigma; + auto [sigma_down, sigma_up] = get_ancestral_step(sigmas[i], sigmas[i + 1], eta); - + x = denoised + d * sigma_down; - + if (sigmas[i + 1] > 0) { x += sd::Tensor::randn_like(x, rng) * sigma_up; } @@ -1706,7 +1800,8 @@ static sd::Tensor sample_k_diffusion(sample_method_t method, std::vector sigmas, std::shared_ptr rng, float eta, - bool is_flow_denoiser) { + bool is_flow_denoiser, + const char* extra_sample_args) { switch (method) { case EULER_A_SAMPLE_METHOD: if (is_flow_denoiser) @@ -1729,7 +1824,7 @@ static sd::Tensor sample_k_diffusion(sample_method_t method, case DPMPP2Mv2_SAMPLE_METHOD: return sample_dpmpp_2m_v2(model, std::move(x), sigmas); case LCM_SAMPLE_METHOD: - return sample_lcm(model, std::move(x), sigmas, rng, is_flow_denoiser); + return sample_lcm(model, std::move(x), sigmas, rng, is_flow_denoiser, extra_sample_args); case IPNDM_SAMPLE_METHOD: return sample_ipndm(model, std::move(x), sigmas); case IPNDM_V_SAMPLE_METHOD: diff --git a/src/diffusion_model.hpp b/src/diffusion_model.hpp index 1a202a1a..26021ef2 100644 --- a/src/diffusion_model.hpp +++ b/src/diffusion_model.hpp @@ -5,6 +5,7 @@ #include "anima.hpp" #include "ernie_image.hpp" #include "flux.hpp" +#include "hidream_o1.hpp" #include "mmdit.hpp" #include "qwen_image.hpp" #include "tensor_ggml.hpp" @@ -13,22 +14,28 @@ #include "z_image.hpp" struct DiffusionParams { - const sd::Tensor* x = nullptr; - const sd::Tensor* timesteps = nullptr; - const sd::Tensor* context = nullptr; - const sd::Tensor* c_concat = nullptr; - const sd::Tensor* y = nullptr; - const sd::Tensor* t5_ids = nullptr; - const sd::Tensor* t5_weights = nullptr; - const sd::Tensor* guidance = nullptr; - const std::vector>* ref_latents = nullptr; - bool increase_ref_index = false; - int num_video_frames = -1; - const std::vector>* controls = nullptr; - float control_strength = 0.f; - const sd::Tensor* vace_context = nullptr; - float vace_strength = 1.f; - const std::vector* skip_layers = nullptr; + const sd::Tensor* x = nullptr; + const sd::Tensor* timesteps = nullptr; + const sd::Tensor* context = nullptr; + const sd::Tensor* c_concat = nullptr; + const sd::Tensor* y = nullptr; + const sd::Tensor* t5_ids = nullptr; + const sd::Tensor* t5_weights = nullptr; + const sd::Tensor* guidance = nullptr; + const std::vector>* ref_latents = nullptr; + const sd::Tensor* input_ids = nullptr; + const sd::Tensor* input_pos = nullptr; + const sd::Tensor* token_types = nullptr; + const sd::Tensor* vinput_mask = nullptr; + const std::vector>* vlm_images = nullptr; + const std::vector>>* image_embeds = nullptr; + bool increase_ref_index = false; + int num_video_frames = -1; + const std::vector>* controls = nullptr; + float control_strength = 0.f; + const sd::Tensor* vace_context = nullptr; + float vace_strength = 1.f; + const std::vector* skip_layers = nullptr; }; template @@ -476,6 +483,82 @@ struct QwenImageModel : public DiffusionModel { } }; +struct HiDreamO1Model : public DiffusionModel { + std::string prefix; + HiDreamO1::HiDreamO1Runner hidream_o1; + + HiDreamO1Model(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2TensorStorage& tensor_storage_map = {}, + const std::string& prefix = "model") + : prefix(prefix), hidream_o1(backend, offload_params_to_cpu, tensor_storage_map, prefix) { + } + + std::string get_desc() override { + return hidream_o1.get_desc(); + } + + void alloc_params_buffer() override { + hidream_o1.alloc_params_buffer(); + } + + void free_params_buffer() override { + hidream_o1.free_params_buffer(); + } + + void free_compute_buffer() override { + hidream_o1.free_compute_buffer(); + } + + void get_param_tensors(std::map& tensors) override { + hidream_o1.get_param_tensors(tensors, prefix); + } + + size_t get_params_buffer_size() override { + return hidream_o1.get_params_buffer_size(); + } + + void set_weight_adapter(const std::shared_ptr& adapter) override { + hidream_o1.set_weight_adapter(adapter); + } + + int64_t get_adm_in_channels() override { + return 0; + } + + void set_flash_attention_enabled(bool enabled) { + hidream_o1.set_flash_attention_enabled(enabled); + } + + void set_max_graph_vram_bytes(size_t max_vram_bytes) override { + hidream_o1.set_max_graph_vram_bytes(max_vram_bytes); + } + + void set_circular_axes(bool circular_x, bool circular_y) override { + hidream_o1.set_circular_axes(circular_x, circular_y); + } + + sd::Tensor compute(int n_threads, + const DiffusionParams& diffusion_params) override { + GGML_ASSERT(diffusion_params.x != nullptr); + GGML_ASSERT(diffusion_params.timesteps != nullptr); + GGML_ASSERT(diffusion_params.input_ids != nullptr); + GGML_ASSERT(diffusion_params.input_pos != nullptr); + GGML_ASSERT(diffusion_params.token_types != nullptr); + static const std::vector> empty_images; + static const std::vector>> empty_image_embeds; + return hidream_o1.compute(n_threads, + *diffusion_params.x, + *diffusion_params.timesteps, + *diffusion_params.input_ids, + *diffusion_params.input_pos, + *diffusion_params.token_types, + tensor_or_empty(diffusion_params.vinput_mask), + diffusion_params.image_embeds ? *diffusion_params.image_embeds : empty_image_embeds, + diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_images); + } +}; + struct ZImageModel : public DiffusionModel { std::string prefix; ZImage::ZImageRunner z_image; diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index f88eeb60..c6cd1c3a 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -280,6 +280,9 @@ __STATIC_INLINE__ void print_sd_tensor(const sd::Tensor& tensor, bool shape_o if (shape_only) { return; } + if (tensor.empty()) { + return; + } int range = 3; std::vector shape = tensor.shape(); while (shape.size() < 4) { @@ -1698,13 +1701,41 @@ struct WeightAdapter { }; struct GGMLRunnerContext { - ggml_backend_t backend = nullptr; - ggml_context* ggml_ctx = nullptr; - bool flash_attn_enabled = false; - bool conv2d_direct_enabled = false; - bool circular_x_enabled = false; - bool circular_y_enabled = false; - std::shared_ptr weight_adapter = nullptr; + ggml_backend_t backend = nullptr; + ggml_context* ggml_ctx = nullptr; + bool flash_attn_enabled = false; + bool conv2d_direct_enabled = false; + bool circular_x_enabled = false; + bool circular_y_enabled = false; + std::shared_ptr weight_adapter = nullptr; + std::vector>* debug_tensors = nullptr; + std::function get_cache_tensor; + std::function cache_tensor; + + void capture_tensor(const std::string& name, ggml_tensor* tensor) { + if (debug_tensors == nullptr || tensor == nullptr) { + return; + } + ggml_tensor* snapshot = ggml_cont(ggml_ctx, tensor); + ggml_tensor* dst = ggml_dup_tensor(ggml_ctx, snapshot); + snapshot = ggml_cpy(ggml_ctx, snapshot, dst); + ggml_set_output(snapshot); + debug_tensors->push_back({snapshot, name}); + } + + ggml_tensor* load_cache_tensor(const std::string& name) const { + if (!get_cache_tensor) { + return nullptr; + } + return get_cache_tensor(name); + } + + void persist_cache_tensor(const std::string& name, ggml_tensor* tensor) const { + if (!cache_tensor || tensor == nullptr) { + return; + } + cache_tensor(name, tensor); + } }; struct GGMLRunner { @@ -1743,6 +1774,7 @@ protected: std::map backend_tensor_data_map; std::map cache_tensor_map; // name -> tensor + std::vector> debug_tensors; const std::string final_result_name = "ggml_runner_final_result_tensor"; bool flash_attn_enabled = false; @@ -1838,6 +1870,7 @@ protected: } void free_compute_ctx() { + debug_tensors.clear(); if (compute_ctx != nullptr) { ggml_free(compute_ctx); compute_ctx = nullptr; @@ -1884,6 +1917,16 @@ protected: auto result = ggml_graph_node(gf, -1); ggml_set_name(result, final_result_name.c_str()); } + for (const auto& entry : debug_tensors) { + if (entry.first != nullptr) { + ggml_build_forward_expand(gf, entry.first); + } + } + for (const auto& entry : cache_tensor_map) { + if (entry.second != nullptr) { + ggml_build_forward_expand(gf, entry.second); + } + } prepare_build_in_tensor_after(gf); return gf; } @@ -1981,9 +2024,13 @@ protected: ggml_backend_buffer_t src_buf = sd::ggml_graph_cut::tensor_buffer(src); ggml_backend_buffer_t dst_buf = sd::ggml_graph_cut::tensor_buffer(dst); if (src_buf == nullptr || dst_buf == nullptr) { - LOG_ERROR("%s cache copy tensor buffer missing: name=%s src_buffer=%p src_view_src=%p src_view_src_buffer=%p dst_buffer=%p", + LOG_ERROR("%s cache copy tensor buffer missing: name=%s op=%s src0=%p src0_name=%s src0_buffer=%p src_buffer=%p src_view_src=%p src_view_src_buffer=%p dst_buffer=%p", get_desc().c_str(), src && src->name[0] != '\0' ? src->name : "", + src ? ggml_op_name(src->op) : "", + src ? src->src[0] : nullptr, + (src && src->src[0] && src->src[0]->name[0] != '\0') ? src->src[0]->name : "", + (src && src->src[0]) ? sd::ggml_graph_cut::tensor_buffer(src->src[0]) : nullptr, src ? src->buffer : nullptr, src ? src->view_src : nullptr, (src && src->view_src) ? src->view_src->buffer : nullptr, @@ -2015,6 +2062,42 @@ protected: return true; } + template + std::optional> read_graph_tensor(ggml_tensor* tensor, const char* label) { + if (tensor == nullptr) { + LOG_ERROR("%s %s tensor is null", get_desc().c_str(), label); + return std::nullopt; + } + if (tensor->type != sd::GGMLTypeTraits::type) { + LOG_ERROR("%s %s tensor type mismatch: got %s", + get_desc().c_str(), + label, + ggml_type_name(tensor->type)); + return std::nullopt; + } + ggml_backend_buffer_t buf = sd::ggml_graph_cut::tensor_buffer(tensor); + if (buf == nullptr) { + LOG_ERROR("%s %s tensor buffer missing: name=%s op=%s buffer=%p view_src=%p view_src_buffer=%p data=%p", + get_desc().c_str(), + label, + tensor->name[0] != '\0' ? tensor->name : "", + ggml_op_name(tensor->op), + tensor->buffer, + tensor->view_src, + tensor->view_src ? tensor->view_src->buffer : nullptr, + tensor->data); + return std::nullopt; + } + + sd::Tensor result(sd::shape_from_ggml(tensor)); + if (tensor->view_src != nullptr || !ggml_is_contiguous(tensor) || tensor->buffer == nullptr) { + ggml_backend_tensor_get(tensor, result.data(), 0, ggml_nbytes(tensor)); + } else { + ggml_backend_tensor_get(tensor, result.data(), 0, ggml_nbytes(tensor)); + } + return result; + } + void copy_data_to_backend_tensor(ggml_cgraph* gf, bool clear_after_copy = true) { GGML_ASSERT(gf != nullptr); std::unordered_set graph_tensor_set; @@ -2031,10 +2114,24 @@ protected: for (auto& kv : backend_tensor_data_map) { auto tensor = kv.first; auto data = kv.second; - + if (tensor == nullptr || data == nullptr) { + continue; + } + const char* name = ggml_get_name(tensor); if (graph_tensor_set.find(tensor) == graph_tensor_set.end()) { continue; } + if (tensor->buffer == nullptr) { + LOG_WARN("%s skip backend tensor copy: tensor buffer not set, name='%s', ne=[%lld,%lld,%lld,%lld], type=%s", + get_desc().c_str(), + name != nullptr ? name : "", + (long long)tensor->ne[0], + (long long)tensor->ne[1], + (long long)tensor->ne[2], + (long long)tensor->ne[3], + ggml_type_name(tensor->type)); + continue; + } ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; if (buf == nullptr) { @@ -2421,6 +2518,43 @@ protected: return std::nullopt; } + std::unordered_set debug_graph_tensor_set; + const int n_debug_leafs = sd::ggml_graph_cut::leaf_count(gf); + const int n_debug_nodes = ggml_graph_n_nodes(gf); + debug_graph_tensor_set.reserve(static_cast(n_debug_leafs + n_debug_nodes)); + for (int i = 0; i < n_debug_leafs; ++i) { + debug_graph_tensor_set.insert(sd::ggml_graph_cut::leaf_tensor(gf, i)); + } + for (int i = 0; i < n_debug_nodes; ++i) { + debug_graph_tensor_set.insert(ggml_graph_node(gf, i)); + } + + for (const auto& entry : debug_tensors) { + auto tensor = entry.first; + if (tensor == nullptr) { + continue; + } + if (debug_graph_tensor_set.find(tensor) == debug_graph_tensor_set.end()) { + continue; + } + ggml_backend_buffer_t tensor_buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; + if (tensor_buf == nullptr) { + LOG_WARN("%s skip debug tensor '%s': tensor buffer not set", + get_desc().c_str(), + entry.second.c_str()); + continue; + } + if (tensor->type != GGML_TYPE_F32) { + LOG_WARN("%s skip debug tensor '%s': only GGML_TYPE_F32 is supported, got %s", + get_desc().c_str(), + entry.second.c_str(), + ggml_type_name(tensor->type)); + continue; + } + auto debug_tensor = sd::make_sd_tensor_from_ggml(tensor); + print_sd_tensor(debug_tensor, false, entry.second.c_str()); + } + int64_t t_cache_begin = ggml_time_ms(); if (!copy_cache_tensors_to_cache_buffer(cache_keep_names)) { if (free_compute_buffer_immediately) { @@ -2434,7 +2568,15 @@ protected: auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str()); std::optional> output; if (!no_return) { - output = sd::make_sd_tensor_from_ggml(result); + output = read_graph_tensor(result, "output"); + if (!output.has_value()) { + if (free_compute_buffer_immediately) { + free_compute_buffer(); + } else if (use_partial_param_offload) { + restore_partial_params(); + } + return std::nullopt; + } } else { output = sd::Tensor(); } @@ -2557,6 +2699,13 @@ public: runner_ctx.circular_x_enabled = circular_x_enabled; runner_ctx.circular_y_enabled = circular_y_enabled; runner_ctx.weight_adapter = weight_adapter; + runner_ctx.debug_tensors = &debug_tensors; + runner_ctx.get_cache_tensor = [this](const std::string& name) { + return this->get_cache_tensor_by_name(name); + }; + runner_ctx.cache_tensor = [this](const std::string& name, ggml_tensor* tensor) { + this->cache(name, tensor); + }; return runner_ctx; } @@ -2676,6 +2825,9 @@ public: } void cache(const std::string name, ggml_tensor* tensor) { + if (tensor != nullptr && tensor->view_src != nullptr) { + tensor = ggml_cont(compute_ctx, tensor); + } cache_tensor_map[name] = tensor; } diff --git a/src/ggml_graph_cut.cpp b/src/ggml_graph_cut.cpp index f206f2d2..0958d888 100644 --- a/src/ggml_graph_cut.cpp +++ b/src/ggml_graph_cut.cpp @@ -45,6 +45,21 @@ namespace sd::ggml_graph_cut { return params_tensor_set.find(tensor) != params_tensor_set.end(); } + static int graph_node_index_by_name(ggml_cgraph* gf, const char* name) { + GGML_ASSERT(gf != nullptr); + if (name == nullptr || name[0] == '\0') { + return -1; + } + const int n_nodes = ggml_graph_n_nodes(gf); + for (int i = 0; i < n_nodes; ++i) { + ggml_tensor* node = ggml_graph_node(gf, i); + if (node != nullptr && std::strcmp(node->name, name) == 0) { + return i; + } + } + return -1; + } + static Plan::InputShape input_shape(const ggml_tensor* tensor) { Plan::InputShape shape; if (tensor == nullptr) { @@ -244,6 +259,11 @@ namespace sd::ggml_graph_cut { if (tensor == nullptr) { return nullptr; } + if (tensor_buffer(tensor) == nullptr && tensor->src[0] != nullptr && + ggml_nelements(tensor->src[0]) == ggml_nelements(tensor) && + ggml_nbytes(tensor->src[0]) == ggml_nbytes(tensor)) { + return cache_source_tensor(tensor->src[0]); + } return tensor->view_src ? tensor->view_src : tensor; } @@ -503,11 +523,15 @@ namespace sd::ggml_graph_cut { log_desc); } - ggml_tensor* final_output = ggml_graph_node(gf, -1); - if (final_output != nullptr && available_cut_output_node_indices.find(n_nodes - 1) == available_cut_output_node_indices.end()) { + int final_output_index = graph_node_index_by_name(gf, "ggml_runner_final_result_tensor"); + if (final_output_index < 0) { + final_output_index = n_nodes - 1; + } + ggml_tensor* final_output = final_output_index >= 0 ? ggml_graph_node(gf, final_output_index) : nullptr; + if (final_output != nullptr && available_cut_output_node_indices.find(final_output_index) == available_cut_output_node_indices.end()) { Segment final_segment; final_segment.group_name = "ggml_runner.final"; - final_segment.output_node_indices.push_back(n_nodes - 1); + final_segment.output_node_indices.push_back(final_output_index); build_segment(gf, plan, final_segment, diff --git a/src/hidream_o1.hpp b/src/hidream_o1.hpp new file mode 100644 index 00000000..908f2de3 --- /dev/null +++ b/src/hidream_o1.hpp @@ -0,0 +1,653 @@ +#ifndef __SD_HIDREAM_O1_H__ +#define __SD_HIDREAM_O1_H__ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common_dit.hpp" +#include "conditioner.hpp" +#include "llm.hpp" +#include "util.h" + +namespace HiDreamO1 { + constexpr int HIDREAM_O1_GRAPH_SIZE = 32768; + constexpr int PATCH_SIZE = 32; + constexpr int TIMESTEP_TOKEN_NUM = 1; + constexpr int IMAGE_TOKEN_ID = 151655; + constexpr int VISION_START_TOKEN_ID = 151652; + + static inline std::string repeat_special_token(const std::string& token, int64_t count) { + std::string out; + out.reserve(static_cast(count) * token.size()); + for (int64_t i = 0; i < count; ++i) { + out += token; + } + return out; + } + + static inline std::pair calculate_dimensions(int max_size, double ratio) { + int width = static_cast(std::sqrt(max_size * max_size * ratio)); + int height = static_cast(width / ratio); + width = (width / PATCH_SIZE) * PATCH_SIZE; + height = (height / PATCH_SIZE) * PATCH_SIZE; + width = std::max(width, PATCH_SIZE); + height = std::max(height, PATCH_SIZE); + return {width, height}; + } + + static inline sd::Tensor resize_to_area(const sd::Tensor& image, int image_size) { + int64_t width = image.shape()[0]; + int64_t height = image.shape()[1]; + int64_t s_max = static_cast(image_size) * image_size; + double scale = std::sqrt(static_cast(s_max) / static_cast(width * height)); + + std::vector> sizes = { + {(static_cast(std::llround(width * scale)) / PATCH_SIZE) * PATCH_SIZE, (static_cast(std::llround(height * scale)) / PATCH_SIZE) * PATCH_SIZE}, + {(static_cast(std::llround(width * scale)) / PATCH_SIZE) * PATCH_SIZE, (static_cast(std::floor(height * scale)) / PATCH_SIZE) * PATCH_SIZE}, + {(static_cast(std::floor(width * scale)) / PATCH_SIZE) * PATCH_SIZE, (static_cast(std::llround(height * scale)) / PATCH_SIZE) * PATCH_SIZE}, + {(static_cast(std::floor(width * scale)) / PATCH_SIZE) * PATCH_SIZE, (static_cast(std::floor(height * scale)) / PATCH_SIZE) * PATCH_SIZE}, + }; + std::sort(sizes.begin(), sizes.end(), [](const auto& a, const auto& b) { + return a.first * a.second > b.first * b.second; + }); + + std::pair new_size = sizes.back(); + for (const auto& size : sizes) { + if (size.first > 0 && size.second > 0 && size.first * size.second <= s_max) { + new_size = size; + break; + } + } + + double s1 = static_cast(width) / static_cast(new_size.first); + double s2 = static_cast(height) / static_cast(new_size.second); + sd::Tensor resized; + if (s1 < s2) { + int64_t resized_h = static_cast(std::llround(height / s1)); + resized = sd::ops::interpolate(image, + {new_size.first, resized_h, image.shape()[2], image.shape()[3]}, + sd::ops::InterpolateMode::Bicubic); + int64_t top = (resized_h - new_size.second) / 2; + resized = sd::ops::slice(resized, 1, top, top + new_size.second); + } else { + int64_t resized_w = static_cast(std::llround(width / s2)); + resized = sd::ops::interpolate(image, + {resized_w, new_size.second, image.shape()[2], image.shape()[3]}, + sd::ops::InterpolateMode::Bicubic); + int64_t left = (resized_w - new_size.first) / 2; + resized = sd::ops::slice(resized, 0, left, left + new_size.first); + } + return resized; + } + + static inline std::vector build_position_ids(const std::vector& input_ids, + const std::vector>& image_grids, + const std::vector& skip_vision_start_token) { + std::vector position_ids(4 * input_ids.size(), 0); + int image_index = 0; + int st = 0; + int fix_point = 4096; + std::vector out_t; + std::vector out_h; + std::vector out_w; + + while (st < static_cast(input_ids.size())) { + int ed = st; + while (ed < static_cast(input_ids.size()) && input_ids[ed] != IMAGE_TOKEN_ID) { + ed++; + } + + if (ed >= static_cast(input_ids.size())) { + int st_idx = out_t.empty() ? 0 : (*std::max_element(out_t.begin(), out_t.end()) + 1); + for (int i = 0; i < static_cast(input_ids.size()) - st; ++i) { + out_t.push_back(st_idx + i); + out_h.push_back(st_idx + i); + out_w.push_back(st_idx + i); + } + break; + } + + int text_len = std::max(0, ed - st - skip_vision_start_token[image_index]); + int st_idx = out_t.empty() ? 0 : (*std::max_element(out_t.begin(), out_t.end()) + 1); + for (int i = 0; i < text_len; ++i) { + out_t.push_back(st_idx + i); + out_h.push_back(st_idx + i); + out_w.push_back(st_idx + i); + } + + auto grid = image_grids[image_index]; + int base; + if (skip_vision_start_token[image_index]) { + if (fix_point > 0) { + base = fix_point; + fix_point = 0; + } else { + base = st_idx; + } + } else { + base = text_len + st_idx; + } + for (int32_t ti = 0; ti < grid[0]; ++ti) { + for (int32_t hi = 0; hi < grid[1]; ++hi) { + for (int32_t wi = 0; wi < grid[2]; ++wi) { + out_t.push_back(base + ti); + out_h.push_back(base + hi); + out_w.push_back(base + wi); + } + } + } + + st = ed + grid[0] * grid[1] * grid[2]; + image_index++; + } + + GGML_ASSERT(out_t.size() == input_ids.size()); + for (size_t i = 0; i < input_ids.size(); ++i) { + // ggml IMROPE consumes 4 flattened position streams: + // [t, h, w, e] + // llama.cpp's generic Qwen-VL fallback expands text positions as + // [pos, pos, pos, 0]. Keep the extra stream zeroed here too. + position_ids[i] = out_t[i]; + position_ids[input_ids.size() + i] = out_h[i]; + position_ids[input_ids.size() * 2 + i] = out_w[i]; + position_ids[input_ids.size() * 3 + i] = 0; + } + return position_ids; + } + + struct TimestepEmbedder : public GGMLBlock { + int frequency_embedding_size = 256; + + TimestepEmbedder(int64_t hidden_size) { + blocks["mlp.0"] = std::make_shared(frequency_embedding_size, hidden_size, true); + blocks["mlp.2"] = std::make_shared(hidden_size, hidden_size, true); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* t) { + auto mlp_0 = std::dynamic_pointer_cast(blocks["mlp.0"]); + auto mlp_2 = std::dynamic_pointer_cast(blocks["mlp.2"]); + auto emb = ggml_ext_timestep_embedding(ctx->ggml_ctx, t, frequency_embedding_size, 10000, 1000.0f); + emb = mlp_0->forward(ctx, emb); + emb = ggml_silu_inplace(ctx->ggml_ctx, emb); + emb = mlp_2->forward(ctx, emb); + return emb; + } + }; + + struct BottleneckPatchEmbed : public GGMLBlock { + BottleneckPatchEmbed(int64_t in_dim, int64_t pca_dim, int64_t embed_dim) { + blocks["proj1"] = std::make_shared(in_dim, pca_dim, false); + blocks["proj2"] = std::make_shared(pca_dim, embed_dim, true); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { + auto proj1 = std::dynamic_pointer_cast(blocks["proj1"]); + auto proj2 = std::dynamic_pointer_cast(blocks["proj2"]); + return proj2->forward(ctx, proj1->forward(ctx, x)); + } + }; + + struct FinalLayer : public GGMLBlock { + FinalLayer(int64_t hidden_size, int64_t out_dim) { + blocks["linear"] = std::make_shared(hidden_size, out_dim, true); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { + auto linear = std::dynamic_pointer_cast(blocks["linear"]); + return linear->forward(ctx, x); + } + }; + + struct HiDreamO1Params { + LLM::LLMParams llm; + int patch_size = PATCH_SIZE; + }; + + static inline HiDreamO1Params make_hidream_o1_params() { + HiDreamO1Params params; + params.llm.arch = LLM::LLMArch::QWEN3_VL; + params.llm.hidden_size = 4096; + params.llm.intermediate_size = 12288; + params.llm.num_layers = 36; + params.llm.num_heads = 32; + params.llm.num_kv_heads = 8; + params.llm.head_dim = 128; + params.llm.qkv_bias = false; + params.llm.qk_norm = true; + params.llm.vocab_size = 151936; + params.llm.rms_norm_eps = 1e-6f; + params.llm.vision.arch = LLM::LLMVisionArch::QWEN3_VL; + params.llm.vision.num_layers = 27; + params.llm.vision.hidden_size = 1152; + params.llm.vision.intermediate_size = 4304; + params.llm.vision.num_heads = 16; + params.llm.vision.out_hidden_size = 4096; + params.llm.vision.patch_size = 16; + params.llm.vision.spatial_merge_size = 2; + params.llm.vision.temporal_patch_size = 2; + params.llm.vision.num_position_embeddings = 2304; + return params; + } + + struct HiDreamO1Model : public GGMLBlock { + HiDreamO1Params params; + + HiDreamO1Model() = default; + explicit HiDreamO1Model(HiDreamO1Params params) + : params(std::move(params)) { + blocks["language_model"] = std::make_shared(this->params.llm); + blocks["t_embedder1"] = std::make_shared(this->params.llm.hidden_size); + blocks["x_embedder"] = std::make_shared(this->params.patch_size * this->params.patch_size * 3, + this->params.llm.hidden_size / 4, + this->params.llm.hidden_size); + blocks["final_layer2"] = std::make_shared(this->params.llm.hidden_size, + this->params.patch_size * this->params.patch_size * 3); + } + + std::shared_ptr text_model() { + return std::dynamic_pointer_cast(blocks["language_model"]); + } + + std::shared_ptr timestep_embedder() { + return std::dynamic_pointer_cast(blocks["t_embedder1"]); + } + + std::shared_ptr patch_embedder() { + return std::dynamic_pointer_cast(blocks["x_embedder"]); + } + + std::shared_ptr final_layer() { + return std::dynamic_pointer_cast(blocks["final_layer2"]); + } + }; + + struct HiDreamO1VisionRunner : public GGMLRunner { + HiDreamO1Params params; + std::shared_ptr model; + + std::vector window_index_vec; + std::vector window_inverse_index_vec; + std::vector window_mask_vec; + std::vector pe_vec; + std::array, 4> pos_embed_idx_data_; + std::array, 4> pos_embed_weight_data_; + + HiDreamO1VisionRunner(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2TensorStorage& tensor_storage_map = {}, + const std::string& prefix = "model.visual") + : GGMLRunner(backend, offload_params_to_cpu), + params(make_hidream_o1_params()), + model(std::make_shared(false, params.llm.vision)) { + model->init(params_ctx, tensor_storage_map, prefix); + } + + std::string get_desc() override { + return "hidream_o1_vision"; + } + + void get_param_tensors(std::map& tensors, const std::string& prefix = "model.visual") { + model->get_param_tensors(tensors, prefix); + } + + ggml_tensor* encode_image(GGMLRunnerContext* runner_ctx, ggml_tensor* image) { + return LLM::LLMRunner::encode_image_common(this, + compute_ctx, + runner_ctx, + image, + params.llm.vision, + model, + window_index_vec, + window_inverse_index_vec, + window_mask_vec, + pe_vec, + pos_embed_idx_data_, + pos_embed_weight_data_); + } + + ggml_cgraph* build_graph(const sd::Tensor& image_tensor) { + ggml_cgraph* gf = new_graph_custom(HIDREAM_O1_GRAPH_SIZE); + ggml_tensor* image = make_input(image_tensor); + auto runner_ctx = get_context(); + auto image_embeds = encode_image(&runner_ctx, image); + ggml_build_forward_expand(gf, image_embeds); + return gf; + } + + sd::Tensor compute(int n_threads, const sd::Tensor& image) { + auto get_graph = [&]() { + return build_graph(image); + }; + auto output = GGMLRunner::compute(get_graph, n_threads, false); + return output.has_value() ? std::move(output.value()) : sd::Tensor(); + } + }; + + struct HiDreamO1Runner : public GGMLRunner { + HiDreamO1Params params; + HiDreamO1Model model; + + std::vector attention_mask_vec; + + HiDreamO1Runner(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2TensorStorage& tensor_storage_map = {}, + const std::string& prefix = "model") + : GGMLRunner(backend, offload_params_to_cpu), + params(make_hidream_o1_params()) { + model = HiDreamO1Model(params); + model.init(params_ctx, tensor_storage_map, prefix); + } + + std::string get_desc() override { + return "hidream_o1"; + } + + void get_param_tensors(std::map& tensors, const std::string& prefix) { + model.get_param_tensors(tensors, prefix); + } + + ggml_cgraph* build_graph(const sd::Tensor& x_tensor, + const sd::Tensor& timestep_tensor, + const sd::Tensor& input_ids_tensor, + const sd::Tensor& input_pos_tensor, + const sd::Tensor& token_types_tensor, + const sd::Tensor& vinput_mask_tensor, + const std::vector>>& image_embeds_tensor, + const std::vector>& ref_images) { + ggml_cgraph* gf = new_graph_custom(HIDREAM_O1_GRAPH_SIZE); + ggml_tensor* x = make_input(x_tensor); + ggml_tensor* timestep = make_input(timestep_tensor); + ggml_tensor* input_ids = make_input(input_ids_tensor); + ggml_tensor* input_pos = make_input(input_pos_tensor); + + auto text_model = model.text_model(); + auto t_embedder1 = model.timestep_embedder(); + auto x_embedder = model.patch_embedder(); + auto final_layer2 = model.final_layer(); + + std::vector ref_image_tensors; + for (const auto& image : ref_images) { + ref_image_tensors.push_back(make_input(image)); + } + + attention_mask_vec = std::vector(static_cast(token_types_tensor.shape()[0] * token_types_tensor.shape()[0]), 0.0f); + int64_t total_seq_len = token_types_tensor.shape()[0]; + for (int64_t query = 0; query < total_seq_len; ++query) { + bool is_gen = token_types_tensor.values()[static_cast(query)] > 0; + for (int64_t key = 0; key < total_seq_len; ++key) { + if (!is_gen && key > query) { + attention_mask_vec[static_cast(query * total_seq_len + key)] = -INFINITY; + } + } + } + auto attention_mask = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_F32, total_seq_len, total_seq_len); + set_backend_tensor_data(attention_mask, attention_mask_vec.data()); + + auto runner_ctx = get_context(); + auto txt = text_model->embed(&runner_ctx, input_ids); + std::vector> image_embeds; + image_embeds.reserve(image_embeds_tensor.size()); + for (const auto& image_embed : image_embeds_tensor) { + image_embeds.emplace_back(image_embed.first, make_input(image_embed.second)); + } + txt = LLM::splice_image_embeds(&runner_ctx, txt, image_embeds); + + auto t_emb = t_embedder1->forward(&runner_ctx, timestep); + int64_t txt_seq_len = input_ids->ne[0]; + if (txt_seq_len > 1) { + auto prefix = ggml_ext_slice(compute_ctx, txt, 1, 0, txt_seq_len - 1); + txt = ggml_concat(compute_ctx, prefix, ggml_reshape_3d(compute_ctx, t_emb, t_emb->ne[0], 1, 1), 1); + } else { + txt = ggml_reshape_3d(compute_ctx, t_emb, t_emb->ne[0], 1, 1); + } + + auto vinputs = DiT::pad_and_patchify(&runner_ctx, x, PATCH_SIZE, PATCH_SIZE); + int64_t target_tokens = vinputs->ne[1]; + for (ggml_tensor* ref_image : ref_image_tensors) { + auto ref = DiT::pad_and_patchify(&runner_ctx, ref_image, PATCH_SIZE, PATCH_SIZE); + vinputs = ggml_concat(compute_ctx, vinputs, ref, 1); + } + auto vis = x_embedder->forward(&runner_ctx, vinputs); + + auto inputs_embeds = ggml_concat(compute_ctx, txt, vis, 1); + auto hidden_states = text_model->forward_embeds(&runner_ctx, inputs_embeds, input_pos, attention_mask, {}); + auto x_pred_all = final_layer2->forward(&runner_ctx, hidden_states); + + int64_t x_pred_start = txt_seq_len; + if (!vinput_mask_tensor.empty()) { + int64_t seq_len = static_cast(vinput_mask_tensor.shape()[0]); + int64_t first_vinput = 0; + while (first_vinput < seq_len && vinput_mask_tensor.values()[static_cast(first_vinput)] == 0) { + first_vinput++; + } + x_pred_start = first_vinput; + } + auto x_pred = ggml_ext_slice(compute_ctx, x_pred_all, 1, x_pred_start, x_pred_start + target_tokens); + x_pred = DiT::unpatchify_and_crop(compute_ctx, x_pred, x->ne[1], x->ne[0], PATCH_SIZE, PATCH_SIZE); + + float sigma = 1.0f - timestep_tensor.values()[0]; + sigma = std::max(1e-6f, sigma); + auto out = ggml_scale(compute_ctx, ggml_sub(compute_ctx, x, x_pred), 1.0f / sigma); + + ggml_build_forward_expand(gf, out); + return gf; + } + + sd::Tensor compute(int n_threads, + const sd::Tensor& x, + const sd::Tensor& timestep, + const sd::Tensor& input_ids, + const sd::Tensor& input_pos, + const sd::Tensor& token_types, + const sd::Tensor& vinput_mask, + const std::vector>>& image_embeds, + const std::vector>& ref_images) { + auto get_graph = [&]() { + return build_graph(x, timestep, input_ids, input_pos, token_types, vinput_mask, image_embeds, ref_images); + }; + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, false), x.dim()); + } + }; + + struct HiDreamO1Conditioner : public Conditioner { + Qwen2Tokenizer tokenizer; + std::shared_ptr vision_runner; + + HiDreamO1Conditioner(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2TensorStorage& tensor_storage_map = {}) + : vision_runner(std::make_shared(backend, offload_params_to_cpu, tensor_storage_map)) {} + + void get_param_tensors(std::map& tensors) override { + vision_runner->get_param_tensors(tensors); + } + + void alloc_params_buffer() override { + vision_runner->alloc_params_buffer(); + } + + void free_params_buffer() override { + vision_runner->free_params_buffer(); + } + + size_t get_params_buffer_size() override { + return vision_runner->get_params_buffer_size(); + } + + void set_max_graph_vram_bytes(size_t max_graph_vram_bytes) override { + vision_runner->set_max_graph_vram_bytes(max_graph_vram_bytes); + } + + void set_flash_attention_enabled(bool enabled) override { + vision_runner->set_flash_attention_enabled(enabled); + } + + void set_weight_adapter(const std::shared_ptr& adapter) override { + vision_runner->set_weight_adapter(adapter); + } + + SDCondition get_learned_condition(int n_threads, + const ConditionerParams& conditioner_params) override { + SDCondition result; + + int width = conditioner_params.width; + int height = conditioner_params.height; + int64_t target_image_len = static_cast(width / PATCH_SIZE) * static_cast(height / PATCH_SIZE); + + std::vector> ref_images; + if (conditioner_params.ref_images != nullptr) { + ref_images = *conditioner_params.ref_images; + } + + std::vector>> vlm_images; + std::vector> image_grids; + std::vector skip_vision_start; + + std::string prompt = "<|im_start|>user\n"; + + if (ref_images.empty()) { + prompt += conditioner_params.text; + prompt += "<|im_end|>\n<|im_start|>assistant\n<|boi_token|><|tms_token|>"; + auto input_ids = tokenizer.encode(prompt, nullptr); + + std::vector input_ids_pad = input_ids; + input_ids_pad.push_back(VISION_START_TOKEN_ID); + input_ids_pad.insert(input_ids_pad.end(), target_image_len - 1, IMAGE_TOKEN_ID); + + image_grids.push_back({1, static_cast(height / PATCH_SIZE), static_cast(width / PATCH_SIZE)}); + skip_vision_start.push_back(1); + + std::vector token_types(input_ids_pad.size(), 0); + int txt_seq_len = static_cast(input_ids.size()); + int bgn = txt_seq_len - TIMESTEP_TOKEN_NUM; + for (int i = bgn; i < static_cast(token_types.size()); ++i) { + token_types[i] = 1; + } + + auto position_ids = build_position_ids(input_ids_pad, image_grids, skip_vision_start); + + std::vector input_shape{static_cast(input_ids.size())}; + std::vector position_shape{static_cast(input_ids_pad.size() * 4)}; + std::vector token_type_shape{static_cast(token_types.size())}; + std::vector vinput_mask(token_types.size(), 0); + for (int64_t i = txt_seq_len; i < static_cast(vinput_mask.size()); ++i) { + vinput_mask[static_cast(i)] = 1; + } + std::vector vinput_mask_shape{static_cast(vinput_mask.size())}; + + result.c_input_ids = sd::Tensor(input_shape, std::move(input_ids)); + result.c_position_ids = sd::Tensor(position_shape, position_ids); + result.c_token_types = sd::Tensor(token_type_shape, std::move(token_types)); + result.c_vinput_mask = sd::Tensor(vinput_mask_shape, std::move(vinput_mask)); + return result; + } + + int K = static_cast(ref_images.size()); + int max_size; + if (K == 1) { + max_size = std::max(height, width); + } else if (K == 2) { + max_size = std::max(height, width) * 48 / 64; + } else if (K <= 4) { + max_size = std::max(height, width) / 2; + } else if (K <= 8) { + max_size = std::max(height, width) * 24 / 64; + } else { + max_size = std::max(height, width) / 4; + } + + int cond_img_size; + if (K <= 4) { + cond_img_size = 384; + } else if (K <= 8) { + cond_img_size = 384 * 48 / 64; + } else { + cond_img_size = 384 / 2; + } + + for (const auto& ref_image : ref_images) { + auto resized_ref = resize_to_area(ref_image, max_size); + resized_ref = sd::ops::clamp(resized_ref, 0.0f, 1.0f); + + // VLM image: Qwen3-VL expects mean=[0.5]/std=[0.5] (i.e. range [-1,1]), + // not CLIP normalization. Resize the already-resized ref directly to + // (cond_w, cond_h) to match the Python pipeline's pil_r.resize(). + auto dims = calculate_dimensions(cond_img_size, + static_cast(resized_ref.shape()[0]) / static_cast(resized_ref.shape()[1])); + sd::Tensor vlm_image = sd::ops::interpolate( + resized_ref, + {dims.first, dims.second, resized_ref.shape()[2], resized_ref.shape()[3]}); + vlm_image = vlm_image * 2.0f - 1.0f; + int64_t image_tokens = static_cast(dims.first / PATCH_SIZE) * static_cast(dims.second / PATCH_SIZE); + + auto patch_img = resized_ref * 2.0f - 1.0f; + result.c_ref_images.push_back(std::move(patch_img)); + int64_t prompt_start = static_cast(tokenizer.encode(prompt + "<|vision_start|>", nullptr).size()); + prompt += "<|vision_start|>"; + prompt += repeat_special_token("<|image_pad|>", image_tokens); + prompt += "<|vision_end|>"; + vlm_images.emplace_back(static_cast(prompt_start), std::move(vlm_image)); + image_grids.push_back({1, dims.second / PATCH_SIZE, dims.first / PATCH_SIZE}); + skip_vision_start.push_back(0); + } + + prompt += conditioner_params.text; + prompt += "<|im_end|>\n<|im_start|>assistant\n<|boi_token|><|tms_token|>"; + auto input_ids = tokenizer.encode(prompt, nullptr); + + std::vector input_ids_pad = input_ids; + input_ids_pad.push_back(VISION_START_TOKEN_ID); + input_ids_pad.insert(input_ids_pad.end(), target_image_len - 1, IMAGE_TOKEN_ID); + image_grids.push_back({1, static_cast(height / PATCH_SIZE), static_cast(width / PATCH_SIZE)}); + skip_vision_start.push_back(1); + + for (const auto& ref_image : result.c_ref_images) { + int64_t ref_len = static_cast(ref_image.shape()[0] / PATCH_SIZE) * static_cast(ref_image.shape()[1] / PATCH_SIZE); + input_ids_pad.push_back(VISION_START_TOKEN_ID); + input_ids_pad.insert(input_ids_pad.end(), ref_len - 1, IMAGE_TOKEN_ID); + image_grids.push_back({1, static_cast(ref_image.shape()[1] / PATCH_SIZE), static_cast(ref_image.shape()[0] / PATCH_SIZE)}); + skip_vision_start.push_back(1); + } + + std::vector token_types(input_ids_pad.size(), 0); + int txt_seq_len = static_cast(input_ids.size()); + int bgn = txt_seq_len - TIMESTEP_TOKEN_NUM; + for (int i = bgn; i < static_cast(token_types.size()); ++i) { + token_types[i] = 1; + } + + std::vector input_shape{static_cast(input_ids.size())}; + std::vector position_shape{static_cast(input_ids_pad.size() * 4)}; + std::vector token_type_shape{static_cast(token_types.size())}; + std::vector vinput_mask(token_types.size(), 0); + for (int i = txt_seq_len; i < static_cast(vinput_mask.size()); ++i) { + vinput_mask[static_cast(i)] = 1; + } + std::vector vinput_mask_shape{static_cast(vinput_mask.size())}; + + result.c_input_ids = sd::Tensor(input_shape, std::move(input_ids)); + result.c_position_ids = sd::Tensor(position_shape, build_position_ids(input_ids_pad, image_grids, skip_vision_start)); + result.c_token_types = sd::Tensor(token_type_shape, std::move(token_types)); + result.c_vinput_mask = sd::Tensor(vinput_mask_shape, std::move(vinput_mask)); + result.c_image_embeds.reserve(vlm_images.size()); + for (const auto& vlm_image : vlm_images) { + auto image_embed = vision_runner->compute(n_threads, vlm_image.second); + if (image_embed.empty()) { + LOG_ERROR("hidream_o1 conditioner: encode VLM image failed"); + return SDCondition(); + } + result.c_image_embeds.emplace_back(vlm_image.first, std::move(image_embed)); + } + return result; + } + }; +} // namespace HiDreamO1 + +#endif // __SD_HIDREAM_O1_H__ diff --git a/src/llm.hpp b/src/llm.hpp index a67b4ebf..8509b6b7 100644 --- a/src/llm.hpp +++ b/src/llm.hpp @@ -2,7 +2,10 @@ #define __LLM_HPP__ #include +#include +#include #include +#include #include #include #include @@ -27,6 +30,7 @@ namespace LLM { enum class LLMArch { QWEN2_5_VL, QWEN3, + QWEN3_VL, MISTRAL_SMALL_3_2, MINISTRAL_3_3B, ARCH_COUNT, @@ -35,11 +39,18 @@ namespace LLM { static const char* llm_arch_to_str[] = { "qwen2.5vl", "qwen3", + "qwen3vl", "mistral_small3.2", "ministral3.3b", }; + enum class LLMVisionArch { + QWEN2_5_VL, + QWEN3_VL, + }; + struct LLMVisionParams { + LLMVisionArch arch = LLMVisionArch::QWEN2_5_VL; int num_layers = 32; int64_t hidden_size = 1280; int64_t intermediate_size = 3420; @@ -50,6 +61,7 @@ namespace LLM { int patch_size = 14; int spatial_merge_size = 2; int window_size = 112; + int num_position_embeddings = 0; std::set fullatt_block_indexes = {7, 15, 23, 31}; }; @@ -90,6 +102,84 @@ namespace LLM { } }; + static ggml_tensor* splice_image_embeds(GGMLRunnerContext* ctx, + ggml_tensor* x, + const std::vector>& image_embeds) { + if (image_embeds.empty()) { + return x; + } + + GGML_ASSERT(x->ne[2] == 1); // N == 1 + + auto raw_x = ggml_cast(ctx->ggml_ctx, x, image_embeds[0].second->type); + int64_t txt_token_start = 0; + int64_t txt_token_end = 0; + ggml_tensor* input_embed = nullptr; + + for (int i = 0; i < image_embeds.size(); i++) { + if (i == 0) { + txt_token_start = 0; + } else { + txt_token_start = image_embeds[i - 1].first + image_embeds[i - 1].second->ne[1]; + } + txt_token_end = image_embeds[i].first; + + auto txt_embed = ggml_ext_slice(ctx->ggml_ctx, raw_x, 1, txt_token_start, txt_token_end); + if (input_embed == nullptr) { + input_embed = txt_embed; + } else { + input_embed = ggml_concat(ctx->ggml_ctx, input_embed, txt_embed, 1); + } + + input_embed = ggml_concat(ctx->ggml_ctx, input_embed, image_embeds[i].second, 1); + } + + txt_token_start = image_embeds[image_embeds.size() - 1].first + image_embeds[image_embeds.size() - 1].second->ne[1]; + txt_token_end = raw_x->ne[1]; + + auto final_txt_embed = ggml_ext_slice(ctx->ggml_ctx, raw_x, 1, txt_token_start, txt_token_end); + input_embed = ggml_concat(ctx->ggml_ctx, input_embed, final_txt_embed, 1); + GGML_ASSERT(raw_x->ne[1] == input_embed->ne[1]); + return input_embed; + } + + struct VisionMLP : public GGMLBlock { + protected: + LLMVisionArch arch_; + + public: + VisionMLP(LLMVisionArch arch, int64_t hidden_size, int64_t intermediate_size) + : arch_(arch) { + if (arch_ == LLMVisionArch::QWEN3_VL) { + blocks["linear_fc1"] = std::make_shared(hidden_size, intermediate_size, true); + blocks["linear_fc2"] = std::make_shared(intermediate_size, hidden_size, true); + } else { + blocks["gate_proj"] = std::make_shared(hidden_size, intermediate_size, true); + blocks["up_proj"] = std::make_shared(hidden_size, intermediate_size, true); + blocks["down_proj"] = std::make_shared(intermediate_size, hidden_size, true); + } + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { + if (arch_ == LLMVisionArch::QWEN3_VL) { + auto linear_fc1 = std::dynamic_pointer_cast(blocks["linear_fc1"]); + auto linear_fc2 = std::dynamic_pointer_cast(blocks["linear_fc2"]); + x = linear_fc1->forward(ctx, x); + x = ggml_ext_gelu(ctx->ggml_ctx, x); + x = linear_fc2->forward(ctx, x); + } else { + auto gate_proj = std::dynamic_pointer_cast(blocks["gate_proj"]); + auto up_proj = std::dynamic_pointer_cast(blocks["up_proj"]); + auto down_proj = std::dynamic_pointer_cast(blocks["down_proj"]); + auto h = gate_proj->forward(ctx, x); + h = ggml_silu_inplace(ctx->ggml_ctx, h); + h = ggml_mul_inplace(ctx->ggml_ctx, h, up_proj->forward(ctx, x)); + x = down_proj->forward(ctx, h); + } + return x; + } + }; + struct VisionPatchEmbed : public GGMLBlock { protected: bool llama_cpp_style; @@ -100,6 +190,7 @@ namespace LLM { public: VisionPatchEmbed(bool llama_cpp_style, + LLMVisionArch arch, int patch_size = 14, int temporal_patch_size = 2, int64_t in_channels = 3, @@ -109,36 +200,35 @@ namespace LLM { temporal_patch_size(temporal_patch_size), in_channels(in_channels), embed_dim(embed_dim) { + bool bias = arch == LLMVisionArch::QWEN3_VL; if (llama_cpp_style) { blocks["proj.0"] = std::shared_ptr(new Conv2d(in_channels, embed_dim, {patch_size, patch_size}, - {patch_size, patch_size}, // stride - {0, 0}, // padding - {1, 1}, // dilation - false)); + {patch_size, patch_size}, + {0, 0}, + {1, 1}, + bias)); blocks["proj.1"] = std::shared_ptr(new Conv2d(in_channels, embed_dim, {patch_size, patch_size}, - {patch_size, patch_size}, // stride - {0, 0}, // padding - {1, 1}, // dilation - false)); + {patch_size, patch_size}, + {0, 0}, + {1, 1}, + bias)); } else { std::tuple kernel_size = {(int)temporal_patch_size, (int)patch_size, (int)patch_size}; blocks["proj"] = std::shared_ptr(new Conv3d(in_channels, embed_dim, kernel_size, - kernel_size, // stride - {0, 0, 0}, // padding - {1, 1, 1}, // dilation - false)); + kernel_size, + {0, 0, 0}, + {1, 1, 1}, + bias)); } } ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { - // x: [N*grid_t*grid_h*grid_w, in_channels, temporal_patch_size*patch_size*patch_size] - // return: [N*grid_t*grid_h*grid_w, embed_dim] x = ggml_reshape_4d(ctx->ggml_ctx, x, patch_size, @@ -170,22 +260,43 @@ namespace LLM { } }; - struct PatchMerger : public GGMLBlock { + struct VisionPatchMerger : public GGMLBlock { protected: + LLMVisionArch arch_; int64_t hidden_size; public: - PatchMerger(int64_t dim, - int64_t context_dim, - int64_t spatial_merge_size) { - hidden_size = context_dim * spatial_merge_size * spatial_merge_size; - blocks["ln_q"] = std::shared_ptr(new RMSNorm(context_dim, 1e-6f)); - blocks["mlp.0"] = std::shared_ptr(new Linear(hidden_size, hidden_size)); - // mlp.1 is nn.GELU() - blocks["mlp.2"] = std::shared_ptr(new Linear(hidden_size, dim)); + VisionPatchMerger(LLMVisionArch arch, + int64_t dim, + int64_t context_dim, + int64_t spatial_merge_size) + : arch_(arch), + hidden_size(context_dim * spatial_merge_size * spatial_merge_size) { + if (arch_ == LLMVisionArch::QWEN3_VL) { + blocks["norm"] = std::make_shared(context_dim, 1e-6f); + blocks["linear_fc1"] = std::make_shared(hidden_size, hidden_size, true); + blocks["linear_fc2"] = std::make_shared(hidden_size, dim, true); + } else { + blocks["ln_q"] = std::make_shared(context_dim, 1e-6f); + blocks["mlp.0"] = std::make_shared(hidden_size, hidden_size); + blocks["mlp.2"] = std::make_shared(hidden_size, dim); + } } ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { + if (arch_ == LLMVisionArch::QWEN3_VL) { + auto norm = std::dynamic_pointer_cast(blocks["norm"]); + auto linear_fc1 = std::dynamic_pointer_cast(blocks["linear_fc1"]); + auto linear_fc2 = std::dynamic_pointer_cast(blocks["linear_fc2"]); + + x = norm->forward(ctx, x); + x = ggml_reshape_2d(ctx->ggml_ctx, x, hidden_size, ggml_nelements(x) / hidden_size); + x = linear_fc1->forward(ctx, x); + x = ggml_gelu_erf(ctx->ggml_ctx, x); + x = linear_fc2->forward(ctx, x); + return x; + } + auto ln_q = std::dynamic_pointer_cast(blocks["ln_q"]); auto mlp_0 = std::dynamic_pointer_cast(blocks["mlp.0"]); auto mlp_2 = std::dynamic_pointer_cast(blocks["mlp.2"]); @@ -260,16 +371,35 @@ namespace LLM { }; struct VisionBlock : public GGMLBlock { + protected: + LLMVisionArch arch_; + + ggml_tensor* forward_norm(GGMLRunnerContext* ctx, const std::string& name, ggml_tensor* x) { + if (arch_ == LLMVisionArch::QWEN3_VL) { + auto norm = std::dynamic_pointer_cast(blocks[name]); + return norm->forward(ctx, x); + } + auto norm = std::dynamic_pointer_cast(blocks[name]); + return norm->forward(ctx, x); + } + public: VisionBlock(bool llama_cpp_style, + LLMVisionArch arch, int64_t hidden_size, int64_t intermediate_size, int num_heads, - float eps = 1e-6f) { - blocks["attn"] = std::shared_ptr(new VisionAttention(llama_cpp_style, hidden_size, num_heads)); - blocks["mlp"] = std::shared_ptr(new MLP(hidden_size, intermediate_size, true)); - blocks["norm1"] = std::shared_ptr(new RMSNorm(hidden_size, eps)); - blocks["norm2"] = std::shared_ptr(new RMSNorm(hidden_size, eps)); + float eps = 1e-6f) + : arch_(arch) { + blocks["attn"] = std::shared_ptr(new VisionAttention(llama_cpp_style, hidden_size, num_heads)); + blocks["mlp"] = std::shared_ptr(new VisionMLP(arch_, hidden_size, intermediate_size)); + if (arch_ == LLMVisionArch::QWEN3_VL) { + blocks["norm1"] = std::shared_ptr(new LayerNorm(hidden_size, eps)); + blocks["norm2"] = std::shared_ptr(new LayerNorm(hidden_size, eps)); + } else { + blocks["norm1"] = std::shared_ptr(new RMSNorm(hidden_size, eps)); + blocks["norm2"] = std::shared_ptr(new RMSNorm(hidden_size, eps)); + } } ggml_tensor* forward(GGMLRunnerContext* ctx, @@ -277,18 +407,16 @@ namespace LLM { ggml_tensor* pe, ggml_tensor* mask = nullptr) { // x: [N, n_token, hidden_size] - auto attn = std::dynamic_pointer_cast(blocks["attn"]); - auto mlp = std::dynamic_pointer_cast(blocks["mlp"]); - auto norm1 = std::dynamic_pointer_cast(blocks["norm1"]); - auto norm2 = std::dynamic_pointer_cast(blocks["norm2"]); + auto attn = std::dynamic_pointer_cast(blocks["attn"]); + auto mlp = std::dynamic_pointer_cast(blocks["mlp"]); auto residual = x; - x = norm1->forward(ctx, x); + x = forward_norm(ctx, "norm1", x); x = attn->forward(ctx, x, pe, mask); x = ggml_add_inplace(ctx->ggml_ctx, x, residual); residual = x; - x = norm2->forward(ctx, x); + x = forward_norm(ctx, "norm2", x); x = mlp->forward(ctx, x); x = ggml_add_inplace(ctx->ggml_ctx, x, residual); @@ -298,38 +426,58 @@ namespace LLM { struct VisionModel : public GGMLBlock { protected: + LLMVisionArch arch_; int num_layers; int spatial_merge_size; + int num_grid_per_side; std::set fullatt_block_indexes; public: VisionModel(bool llama_cpp_style, - int num_layers, - int64_t in_channels, - int64_t hidden_size, - int64_t out_hidden_size, - int64_t intermediate_size, - int num_heads, - int spatial_merge_size, - int patch_size, - int temporal_patch_size, - int window_size, - std::set fullatt_block_indexes = {7, 15, 23, 31}, - float eps = 1e-6f) - : num_layers(num_layers), fullatt_block_indexes(std::move(fullatt_block_indexes)), spatial_merge_size(spatial_merge_size) { + const LLMVisionParams& vision_params, + float eps = 1e-6f) + : arch_(vision_params.arch), + num_layers(vision_params.num_layers), + spatial_merge_size(vision_params.spatial_merge_size), + num_grid_per_side(vision_params.num_position_embeddings > 0 ? static_cast(std::sqrt(vision_params.num_position_embeddings)) : 0), + fullatt_block_indexes(vision_params.fullatt_block_indexes) { blocks["patch_embed"] = std::shared_ptr(new VisionPatchEmbed(llama_cpp_style, - patch_size, - temporal_patch_size, - in_channels, - hidden_size)); + arch_, + vision_params.patch_size, + vision_params.temporal_patch_size, + vision_params.in_channels, + vision_params.hidden_size)); + if (vision_params.num_position_embeddings > 0) { + blocks["pos_embed"] = std::make_shared(vision_params.num_position_embeddings, vision_params.hidden_size); + } for (int i = 0; i < num_layers; i++) { blocks["blocks." + std::to_string(i)] = std::shared_ptr(new VisionBlock(llama_cpp_style, - hidden_size, - intermediate_size, - num_heads, + arch_, + vision_params.hidden_size, + vision_params.intermediate_size, + vision_params.num_heads, eps)); } - blocks["merger"] = std::shared_ptr(new PatchMerger(out_hidden_size, hidden_size, spatial_merge_size)); + blocks["merger"] = std::shared_ptr(new VisionPatchMerger(arch_, + vision_params.out_hidden_size, + vision_params.hidden_size, + spatial_merge_size)); + } + + std::shared_ptr pos_embedder() { + auto it = blocks.find("pos_embed"); + if (it == blocks.end()) { + return nullptr; + } + return std::dynamic_pointer_cast(it->second); + } + + int get_num_grid_per_side() const { + return num_grid_per_side; + } + + int get_spatial_merge_size() const { + return spatial_merge_size; } ggml_tensor* forward(GGMLRunnerContext* ctx, @@ -337,20 +485,26 @@ namespace LLM { ggml_tensor* pe, ggml_tensor* window_index, ggml_tensor* window_inverse_index, - ggml_tensor* window_mask) { + ggml_tensor* window_mask, + ggml_tensor* pos_embeds = nullptr) { // pixel_values: [grid_t*(H/mh/ph)*(W/mw/pw)*mh*mw, C*pt*ph*pw] // window_index: [grid_t*(H/mh/ph)*(W/mw/pw)] // window_inverse_index: [grid_t*(H/mh/ph)*(W/mw/pw)] // window_mask: [grid_h*grid_w, grid_h*grid_w] auto patch_embed = std::dynamic_pointer_cast(blocks["patch_embed"]); - auto merger = std::dynamic_pointer_cast(blocks["merger"]); + auto merger = std::dynamic_pointer_cast(blocks["merger"]); auto x = patch_embed->forward(ctx, pixel_values); sd::ggml_graph_cut::mark_graph_cut(x, "llm.vision.prelude", "x"); + if (pos_embeds != nullptr) { + x = ggml_add(ctx->ggml_ctx, x, pos_embeds); + } - x = ggml_reshape_4d(ctx->ggml_ctx, x, x->ne[0] * spatial_merge_size * spatial_merge_size, x->ne[1] / spatial_merge_size / spatial_merge_size, x->ne[2], x->ne[3]); - x = ggml_get_rows(ctx->ggml_ctx, x, window_index); - x = ggml_reshape_4d(ctx->ggml_ctx, x, x->ne[0] / spatial_merge_size / spatial_merge_size, x->ne[1] * spatial_merge_size * spatial_merge_size, x->ne[2], x->ne[3]); + if (window_index != nullptr) { + x = ggml_reshape_4d(ctx->ggml_ctx, x, x->ne[0] * spatial_merge_size * spatial_merge_size, x->ne[1] / spatial_merge_size / spatial_merge_size, x->ne[2], x->ne[3]); + x = ggml_get_rows(ctx->ggml_ctx, x, window_index); + x = ggml_reshape_4d(ctx->ggml_ctx, x, x->ne[0] / spatial_merge_size / spatial_merge_size, x->ne[1] * spatial_merge_size * spatial_merge_size, x->ne[2], x->ne[3]); + } for (int i = 0; i < num_layers; i++) { auto block = std::dynamic_pointer_cast(blocks["blocks." + std::to_string(i)]); @@ -360,13 +514,17 @@ namespace LLM { mask = nullptr; } x = block->forward(ctx, x, pe, mask); + if (i == 0) { + } sd::ggml_graph_cut::mark_graph_cut(x, "llm.vision.blocks." + std::to_string(i), "x"); } x = merger->forward(ctx, x); sd::ggml_graph_cut::mark_graph_cut(x, "llm.vision.final", "x"); - x = ggml_get_rows(ctx->ggml_ctx, x, window_inverse_index); + if (window_inverse_index != nullptr) { + x = ggml_get_rows(ctx->ggml_ctx, x, window_inverse_index); + } return x; } @@ -430,6 +588,10 @@ namespace LLM { } else if (arch == LLMArch::QWEN3) { q = ggml_rope_ext(ctx->ggml_ctx, q, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f); k = ggml_rope_ext(ctx->ggml_ctx, k, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f); + } else if (arch == LLMArch::QWEN3_VL) { + int sections[4] = {24, 20, 20, 0}; + q = ggml_rope_multi(ctx->ggml_ctx, q, input_pos, nullptr, head_dim, sections, GGML_ROPE_TYPE_IMROPE, 262144, 5000000.f, 1.f, 0.f, 1.f, 32.f, 1.f); + k = ggml_rope_multi(ctx->ggml_ctx, k, input_pos, nullptr, head_dim, sections, GGML_ROPE_TYPE_IMROPE, 262144, 5000000.f, 1.f, 0.f, 1.f, 32.f, 1.f); } else { int sections[4] = {16, 24, 24, 0}; q = ggml_rope_multi(ctx->ggml_ctx, q, input_pos, nullptr, head_dim, sections, GGML_ROPE_TYPE_MROPE, 128000, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f); @@ -485,10 +647,11 @@ namespace LLM { struct TextModel : public GGMLBlock { protected: int64_t num_layers; + LLMParams params; public: TextModel(const LLMParams& params) - : num_layers(params.num_layers) { + : num_layers(params.num_layers), params(params) { blocks["embed_tokens"] = std::shared_ptr(new Embedding(params.vocab_size, params.hidden_size)); for (int i = 0; i < num_layers; i++) { blocks["layers." + std::to_string(i)] = std::shared_ptr(new TransformerBlock(params)); @@ -496,62 +659,22 @@ namespace LLM { blocks["norm"] = std::shared_ptr(new RMSNorm(params.hidden_size, params.rms_norm_eps)); } - ggml_tensor* forward(GGMLRunnerContext* ctx, - ggml_tensor* input_ids, - ggml_tensor* input_pos, - ggml_tensor* attention_mask, - std::vector> image_embeds, - std::set out_layers) { - // input_ids: [N, n_token] - // return: [N, n_token, hidden_size] - + ggml_tensor* embed(GGMLRunnerContext* ctx, + ggml_tensor* input_ids) { auto embed_tokens = std::dynamic_pointer_cast(blocks["embed_tokens"]); - auto norm = std::dynamic_pointer_cast(blocks["norm"]); - - auto x = embed_tokens->forward(ctx, input_ids); - sd::ggml_graph_cut::mark_graph_cut(x, "llm.text.prelude", "x"); + auto x = embed_tokens->forward(ctx, input_ids); + return x; + } + ggml_tensor* forward_embeds(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* input_pos, + ggml_tensor* attention_mask, + std::set out_layers) { + auto norm = std::dynamic_pointer_cast(blocks["norm"]); std::vector intermediate_outputs; - if (image_embeds.size() > 0) { - GGML_ASSERT(x->ne[2] == 1); // N == 1 - - auto raw_x = ggml_cast(ctx->ggml_ctx, x, image_embeds[0].second->type); - int64_t txt_token_start = 0; - int64_t txt_token_end = 0; - - ggml_tensor* input_embed = nullptr; - - for (int i = 0; i < image_embeds.size(); i++) { - if (i == 0) { - txt_token_start = 0; - } else { - txt_token_start = image_embeds[i - 1].first + image_embeds[i - 1].second->ne[1]; - } - txt_token_end = image_embeds[i].first; - - auto txt_embed = ggml_ext_slice(ctx->ggml_ctx, raw_x, 1, txt_token_start, txt_token_end); - if (input_embed == nullptr) { - input_embed = txt_embed; - } else { - input_embed = ggml_concat(ctx->ggml_ctx, input_embed, txt_embed, 1); - } - - auto image_embed = image_embeds[i].second; - input_embed = ggml_concat(ctx->ggml_ctx, input_embed, image_embed, 1); - } - - txt_token_start = image_embeds[image_embeds.size() - 1].first + image_embeds[image_embeds.size() - 1].second->ne[1]; - txt_token_end = raw_x->ne[1]; - - auto final_txt_embed = ggml_ext_slice(ctx->ggml_ctx, raw_x, 1, txt_token_start, txt_token_end); - - input_embed = ggml_concat(ctx->ggml_ctx, input_embed, final_txt_embed, 1); - GGML_ASSERT(raw_x->ne[1] == input_embed->ne[1]); - - x = input_embed; - } - + sd::ggml_graph_cut::mark_graph_cut(x, "llm.text.prelude", "x"); for (int i = 0; i < num_layers; i++) { auto block = std::dynamic_pointer_cast(blocks["layers." + std::to_string(i)]); @@ -570,10 +693,23 @@ namespace LLM { for (int i = 1; i < intermediate_outputs.size(); i++) { x = ggml_concat(ctx->ggml_ctx, x, intermediate_outputs[i], 0); } - } else { - x = norm->forward(ctx, x); + return x; } - return x; + + return norm->forward(ctx, x); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* input_ids, + ggml_tensor* input_pos, + ggml_tensor* attention_mask, + std::vector> image_embeds, + std::set out_layers) { + // input_ids: [N, n_token] + // return: [N, n_token, hidden_size] + auto x = embed(ctx, input_ids); + x = splice_image_embeds(ctx, x, image_embeds); + return forward_embeds(ctx, x, input_pos, attention_mask, std::move(out_layers)); } }; @@ -587,18 +723,7 @@ namespace LLM { : enable_vision(enable_vision), params(params) { blocks["model"] = std::shared_ptr(new TextModel(params)); if (enable_vision) { - blocks["visual"] = std::shared_ptr(new VisionModel(llama_cpp_style, - params.vision.num_layers, - params.vision.in_channels, - params.vision.hidden_size, - params.vision.out_hidden_size, - params.vision.intermediate_size, - params.vision.num_heads, - params.vision.spatial_merge_size, - params.vision.patch_size, - params.vision.temporal_patch_size, - params.vision.window_size, - params.vision.fullatt_block_indexes)); + blocks["visual"] = std::shared_ptr(new VisionModel(llama_cpp_style, params.vision)); } } @@ -615,15 +740,20 @@ namespace LLM { return x; } + std::shared_ptr vision_model() { + GGML_ASSERT(enable_vision); + return std::dynamic_pointer_cast(blocks["visual"]); + } + ggml_tensor* vision_forward(GGMLRunnerContext* ctx, ggml_tensor* pixel_values, ggml_tensor* pe, ggml_tensor* window_index, ggml_tensor* window_inverse_index, - ggml_tensor* window_mask) { + ggml_tensor* window_mask, + ggml_tensor* pos_embeds = nullptr) { GGML_ASSERT(enable_vision); - auto vision_model = std::dynamic_pointer_cast(blocks["visual"]); - return vision_model->forward(ctx, pixel_values, pe, window_index, window_inverse_index, window_mask); + return vision_model()->forward(ctx, pixel_values, pe, window_index, window_inverse_index, window_mask, pos_embeds); } }; @@ -638,7 +768,215 @@ namespace LLM { std::vector window_index_vec; std::vector window_inverse_index_vec; std::vector pe_vec; + std::array, 4> pos_embed_idx_data_; + std::array, 4> pos_embed_weight_data_; + static ggml_tensor* process_image_common(ggml_context* ctx, + ggml_tensor* image, + const LLMVisionParams& vision_params) { + // image: [C, H, W] + // return: [grid_t*(H/mh/ph)*(W/mw/pw)*mh*mw, C*pt*ph*pw], grid_t == 1 + int64_t C = image->ne[2]; + int64_t H = image->ne[1]; + int64_t W = image->ne[0]; + int64_t mh = vision_params.spatial_merge_size; + int64_t mw = vision_params.spatial_merge_size; + int64_t pt = vision_params.temporal_patch_size; + int64_t ph = vision_params.patch_size; + int64_t pw = vision_params.patch_size; + + image = ggml_reshape_4d(ctx, image, pw, mw, (W / mw / pw), H * C); // [C*H, (W/mw/pw), mw, pw] + image = ggml_cont(ctx, ggml_ext_torch_permute(ctx, image, 0, 2, 3, 1)); // [mw, C*H, (W/mw/pw), pw] + image = ggml_reshape_4d(ctx, image, pw * (W / mw / pw), H, C, mw); // [mw, C, H, (W/mw/pw)*pw] + image = ggml_cont(ctx, ggml_ext_torch_permute(ctx, image, 0, 2, 3, 1)); // [H, mw, C, (W/mw/pw)*pw] + image = ggml_reshape_4d(ctx, image, pw, (W / mw / pw) * C * mw, ph, mh * (H / mh / ph)); // [(H/mh/ph)*mh, ph, mw*C*(W/mw/pw), pw] + image = ggml_cont(ctx, ggml_ext_torch_permute(ctx, image, 0, 2, 1, 3)); // [(H/mh/ph)*mh, mw*C*(W/mw/pw), ph, pw] + image = ggml_reshape_4d(ctx, image, pw * ph, (W / mw / pw), C, mw * mh * (H / mh / ph)); // [(H/mh/ph)*mh*mw, C, (W/mw/pw), ph*pw] + image = ggml_concat(ctx, image, image, 0); // [(H/mh/ph)*mh*mw, C, (W/mw/pw), pt*ph*pw] + image = ggml_cont(ctx, ggml_ext_torch_permute(ctx, image, 0, 2, 1, 3)); // [(H/mh/ph)*mh*mw, (W/mw/pw), C, pt*ph*pw] + image = ggml_reshape_4d(ctx, image, pw * ph * pt * C, (W / mw / pw), mw * mh, (H / mh / ph)); // [(H/mh/ph), mh*mw, (W/mw/pw), C*pt*ph*pw] + image = ggml_cont(ctx, ggml_ext_torch_permute(ctx, image, 0, 2, 1, 3)); // [(H/mh/ph), (W/mw/pw), mh*mw, C*pt*ph*pw] + image = ggml_reshape_2d(ctx, image, pw * ph * pt * C, mw * mh * (W / mw / pw) * (H / mh / ph)); // [(H/mh/ph)*(W/mw/pw)*mh*mw, C*pt*ph*pw] + return image; + } + + static ggml_tensor* build_patch_pos_embeds_common(GGMLRunner* runner, + ggml_context* compute_ctx, + GGMLRunnerContext* runner_ctx, + std::shared_ptr vision, + int grid_h, + int grid_w, + std::array, 4>& pos_embed_idx_data, + std::array, 4>& pos_embed_weight_data) { + auto pos_embed = vision->pos_embedder(); + GGML_ASSERT(pos_embed != nullptr); + for (int i = 0; i < 4; ++i) { + pos_embed_idx_data[i].clear(); + pos_embed_weight_data[i].clear(); + pos_embed_idx_data[i].reserve(static_cast(grid_h * grid_w)); + pos_embed_weight_data[i].reserve(static_cast(grid_h * grid_w)); + } + + int num_grid_per_side = vision->get_num_grid_per_side(); + double max_index = static_cast(num_grid_per_side - 1); + int merge_size = vision->get_spatial_merge_size(); + GGML_ASSERT(grid_h % merge_size == 0); + GGML_ASSERT(grid_w % merge_size == 0); + for (int bh = 0; bh < grid_h / merge_size; ++bh) { + for (int bw = 0; bw < grid_w / merge_size; ++bw) { + for (int ih = 0; ih < merge_size; ++ih) { + int h = bh * merge_size + ih; + double h_pos = grid_h == 1 ? 0.0 : max_index * h / static_cast(grid_h - 1); + int h_floor = static_cast(std::floor(h_pos)); + int h_ceil = std::min(h_floor + 1, num_grid_per_side - 1); + double dh = h_pos - h_floor; + for (int iw = 0; iw < merge_size; ++iw) { + int w = bw * merge_size + iw; + double w_pos = grid_w == 1 ? 0.0 : max_index * w / static_cast(grid_w - 1); + int w_floor = static_cast(std::floor(w_pos)); + int w_ceil = std::min(w_floor + 1, num_grid_per_side - 1); + double dw = w_pos - w_floor; + + pos_embed_idx_data[0].push_back(h_floor * num_grid_per_side + w_floor); + pos_embed_idx_data[1].push_back(h_floor * num_grid_per_side + w_ceil); + pos_embed_idx_data[2].push_back(h_ceil * num_grid_per_side + w_floor); + pos_embed_idx_data[3].push_back(h_ceil * num_grid_per_side + w_ceil); + + pos_embed_weight_data[0].push_back(static_cast((1.0 - dh) * (1.0 - dw))); + pos_embed_weight_data[1].push_back(static_cast((1.0 - dh) * dw)); + pos_embed_weight_data[2].push_back(static_cast(dh * (1.0 - dw))); + pos_embed_weight_data[3].push_back(static_cast(dh * dw)); + } + } + } + } + + ggml_tensor* patch_pos_embeds = nullptr; + for (int i = 0; i < 4; ++i) { + auto idx_tensor = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, static_cast(pos_embed_idx_data[i].size())); + runner->set_backend_tensor_data(idx_tensor, pos_embed_idx_data[i].data()); + auto embed = pos_embed->forward(runner_ctx, idx_tensor); + auto weight_tensor = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_F32, 1, static_cast(pos_embed_weight_data[i].size())); + runner->set_backend_tensor_data(weight_tensor, pos_embed_weight_data[i].data()); + embed = ggml_mul(compute_ctx, embed, weight_tensor); + patch_pos_embeds = patch_pos_embeds == nullptr ? embed : ggml_add(compute_ctx, patch_pos_embeds, embed); + } + return patch_pos_embeds; + } + + static ggml_tensor* encode_image_common(GGMLRunner* runner, + ggml_context* compute_ctx, + GGMLRunnerContext* runner_ctx, + ggml_tensor* image, + const LLMVisionParams& vision_params, + std::shared_ptr vision_model, + std::vector& window_index_vec, + std::vector& window_inverse_index_vec, + std::vector& window_mask_vec, + std::vector& pe_vec, + std::array, 4>& pos_embed_idx_data, + std::array, 4>& pos_embed_weight_data) { + GGML_ASSERT(image->ne[1] % (vision_params.patch_size * vision_params.spatial_merge_size) == 0); + GGML_ASSERT(image->ne[0] % (vision_params.patch_size * vision_params.spatial_merge_size) == 0); + + int grid_h = static_cast(image->ne[1]) / vision_params.patch_size; + int grid_w = static_cast(image->ne[0]) / vision_params.patch_size; + + auto pixel_values = process_image_common(compute_ctx, image, vision_params); + int head_dim = static_cast(vision_params.hidden_size / vision_params.num_heads); + + if (vision_params.arch == LLMVisionArch::QWEN3_VL) { + auto pos_embeds = build_patch_pos_embeds_common(runner, + compute_ctx, + runner_ctx, + vision_model, + grid_h, + grid_w, + pos_embed_idx_data, + pos_embed_weight_data); + window_index_vec.resize(static_cast((grid_h / vision_params.spatial_merge_size) * (grid_w / vision_params.spatial_merge_size))); + for (int i = 0; i < static_cast(window_index_vec.size()); ++i) { + window_index_vec[static_cast(i)] = i; + } + pe_vec = Rope::gen_qwen2vl_pe(grid_h, + grid_w, + vision_params.spatial_merge_size, + window_index_vec, + 10000, + {head_dim / 2, head_dim / 2}); + int pos_len = static_cast(pe_vec.size() / head_dim / 2); + auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, head_dim / 2, pos_len); + runner->set_backend_tensor_data(pe, pe_vec.data()); + return vision_model->forward(runner_ctx, pixel_values, pe, nullptr, nullptr, nullptr, pos_embeds); + } + + int llm_grid_h = grid_h / vision_params.spatial_merge_size; + int llm_grid_w = grid_w / vision_params.spatial_merge_size; + int vit_merger_window_size = vision_params.window_size / vision_params.patch_size / vision_params.spatial_merge_size; + + int inverse_index = 0; + window_index_vec.resize(llm_grid_h * llm_grid_w); + window_inverse_index_vec.resize(llm_grid_h * llm_grid_w); + std::vector seqlens; + for (int ih = 0; ih < llm_grid_h; ih += vit_merger_window_size) { + for (int iw = 0; iw < llm_grid_w; iw += vit_merger_window_size) { + int win_h = std::min(vit_merger_window_size, llm_grid_h - ih); + int win_w = std::min(vit_merger_window_size, llm_grid_w - iw); + for (int iy = 0; iy < win_h; iy++) { + for (int ix = 0; ix < win_w; ix++) { + int index = (ih + iy) * llm_grid_w + iw + ix; + window_index_vec[inverse_index] = index; + window_inverse_index_vec[index] = inverse_index; + inverse_index++; + } + } + seqlens.push_back(win_h * win_w * vision_params.spatial_merge_size * vision_params.spatial_merge_size); + } + } + auto window_index = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, llm_grid_h * llm_grid_w); + auto window_inverse_index = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, llm_grid_h * llm_grid_w); + runner->set_backend_tensor_data(window_index, window_index_vec.data()); + runner->set_backend_tensor_data(window_inverse_index, window_inverse_index_vec.data()); + + window_mask_vec.resize((grid_h * grid_w) * (grid_h * grid_w)); + int window_start_index = 0; + for (int seq_index = 0; seq_index < seqlens.size(); seq_index++) { + int window_end_index = window_start_index + seqlens[seq_index]; + GGML_ASSERT(window_end_index <= grid_h * grid_w); + for (int i = window_start_index; i < window_end_index; i++) { + for (int j = 0; j < grid_h * grid_w; j++) { + float mask_value = -INFINITY; + if (j >= window_start_index && j < window_end_index) { + mask_value = 0; + } + GGML_ASSERT((i * (grid_h * grid_w) + j) < window_mask_vec.size()); + window_mask_vec[i * (grid_h * grid_w) + j] = mask_value; + } + } + window_start_index = window_end_index; + } + + auto window_mask = ggml_new_tensor_2d(compute_ctx, + GGML_TYPE_F32, + grid_h * grid_w, + grid_h * grid_w); + runner->set_backend_tensor_data(window_mask, window_mask_vec.data()); + + pe_vec = Rope::gen_qwen2vl_pe(grid_h, + grid_w, + vision_params.spatial_merge_size, + window_inverse_index_vec, + 10000, + {head_dim / 2, head_dim / 2}); + int pos_len = static_cast(pe_vec.size() / head_dim / 2); + + auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, head_dim / 2, pos_len); + runner->set_backend_tensor_data(pe, pe_vec.data()); + + return vision_model->forward(runner_ctx, pixel_values, pe, window_index, window_inverse_index, window_mask); + } + + public: LLMRunner(LLMArch arch, ggml_backend_t backend, bool offload_params_to_cpu, @@ -740,8 +1078,9 @@ namespace LLM { ggml_tensor* input_pos, ggml_tensor* window_index, ggml_tensor* window_inverse_index, - ggml_tensor* window_mask) { - auto hidden_states = model.vision_forward(ctx, pixel_values, input_pos, window_index, window_inverse_index, window_mask); + ggml_tensor* window_mask, + ggml_tensor* pos_embeds = nullptr) { + auto hidden_states = model.vision_forward(ctx, pixel_values, input_pos, window_index, window_inverse_index, window_mask, pos_embeds); return hidden_states; } @@ -827,30 +1166,36 @@ namespace LLM { } ggml_tensor* process_image(ggml_context* ctx, ggml_tensor* image) { - // image: [C, H, W] - // return: [grid_t*(H/mh/ph)*(W/mw/pw)*mh*mw, C*pt*ph*pw], grid_t == 1 - int64_t C = image->ne[2]; - int64_t H = image->ne[1]; - int64_t W = image->ne[0]; - int64_t mh = params.vision.spatial_merge_size; - int64_t mw = params.vision.spatial_merge_size; - int64_t pt = params.vision.temporal_patch_size; - int64_t ph = params.vision.patch_size; - int64_t pw = params.vision.patch_size; + return process_image_common(ctx, image, params.vision); + } - image = ggml_reshape_4d(ctx, image, pw, mw, (W / mw / pw), H * C); // [C*H, (W/mw/pw), mw, pw] - image = ggml_cont(ctx, ggml_ext_torch_permute(ctx, image, 0, 2, 3, 1)); // [mw, C*H, (W/mw/pw), pw] - image = ggml_reshape_4d(ctx, image, pw * (W / mw / pw), H, C, mw); // [mw, C, H, (W/mw/pw)*pw] - image = ggml_cont(ctx, ggml_ext_torch_permute(ctx, image, 0, 2, 3, 1)); // [H, mw, C, (W/mw/pw)*pw] - image = ggml_reshape_4d(ctx, image, pw, (W / mw / pw) * C * mw, ph, mh * (H / mh / ph)); // [(H/mh/ph)*mh, ph, mw*C*(W/mw/pw), pw] - image = ggml_cont(ctx, ggml_ext_torch_permute(ctx, image, 0, 2, 1, 3)); // [(H/mh/ph)*mh, mw*C*(W/mw/pw), ph, pw] - image = ggml_reshape_4d(ctx, image, pw * ph, (W / mw / pw), C, mw * mh * (H / mh / ph)); // [(H/mh/ph)*mh*mw, C, (W/mw/pw), ph*pw] - image = ggml_concat(ctx, image, image, 0); // [(H/mh/ph)*mh*mw, C, (W/mw/pw), pt*ph*pw] - image = ggml_cont(ctx, ggml_ext_torch_permute(ctx, image, 0, 2, 1, 3)); // [(H/mh/ph)*mh*mw, (W/mw/pw), C, pt*ph*pw] - image = ggml_reshape_4d(ctx, image, pw * ph * pt * C, (W / mw / pw), mw * mh, (H / mh / ph)); // [(H/mh/ph), mh*mw, (W/mw/pw), C*pt*ph*pw] - image = ggml_cont(ctx, ggml_ext_torch_permute(ctx, image, 0, 2, 1, 3)); // [(H/mh/ph), (W/mw/pw), mh*mw, C*pt*ph*pw] - image = ggml_reshape_2d(ctx, image, pw * ph * pt * C, mw * mh * (W / mw / pw) * (H / mh / ph)); // [(H/mh/ph)*(W/mw/pw)*mh*mw, C*pt*ph*pw] - return image; + ggml_tensor* build_patch_pos_embeds(GGMLRunnerContext* runner_ctx, + std::shared_ptr vision, + int grid_h, + int grid_w) { + return build_patch_pos_embeds_common(this, + compute_ctx, + runner_ctx, + vision, + grid_h, + grid_w, + pos_embed_idx_data_, + pos_embed_weight_data_); + } + + ggml_tensor* encode_image(GGMLRunnerContext* runner_ctx, ggml_tensor* image) { + return encode_image_common(this, + compute_ctx, + runner_ctx, + image, + params.vision, + model.vision_model(), + window_index_vec, + window_inverse_index_vec, + window_mask_vec, + pe_vec, + pos_embed_idx_data_, + pos_embed_weight_data_); } ggml_cgraph* build_encode_image_graph(const sd::Tensor& image_tensor) { @@ -860,116 +1205,8 @@ namespace LLM { GGML_ASSERT(image->ne[1] % (params.vision.patch_size * params.vision.spatial_merge_size) == 0); GGML_ASSERT(image->ne[0] % (params.vision.patch_size * params.vision.spatial_merge_size) == 0); - int grid_t = 1; - int grid_h = static_cast(image->ne[1]) / params.vision.patch_size; - int grid_w = static_cast(image->ne[0]) / params.vision.patch_size; - int llm_grid_h = grid_h / params.vision.spatial_merge_size; - int llm_grid_w = grid_w / params.vision.spatial_merge_size; - int vit_merger_window_size = params.vision.window_size / params.vision.patch_size / params.vision.spatial_merge_size; - - auto pixel_values = process_image(compute_ctx, image); - - // window index - int inverse_index = 0; - window_index_vec.resize(llm_grid_h * llm_grid_w); - window_inverse_index_vec.resize(llm_grid_h * llm_grid_w); - std::vector seqlens; - for (int ih = 0; ih < llm_grid_h; ih += vit_merger_window_size) { - for (int iw = 0; iw < llm_grid_w; iw += vit_merger_window_size) { - int win_h = std::min(vit_merger_window_size, llm_grid_h - ih); - int win_w = std::min(vit_merger_window_size, llm_grid_w - iw); - for (int iy = 0; iy < win_h; iy++) { - for (int ix = 0; ix < win_w; ix++) { - int index = (ih + iy) * llm_grid_w + iw + ix; - window_index_vec[inverse_index] = index; - window_inverse_index_vec[index] = inverse_index; - inverse_index++; - } - } - seqlens.push_back(win_h * win_w * params.vision.spatial_merge_size * params.vision.spatial_merge_size); - } - } - // printf("window_index: "); - // for (int i : window_index_vec) { - // printf("%d ", i); - // } - // printf("\n"); - // printf("window_inverse_index: "); - // for (int i : window_inverse_index_vec) { - // printf("%d ", i); - // } - // printf("\n"); - // printf("seqlens: "); - // for (int i : seqlens) { - // printf("%d ", i); - // } - // printf("\n"); - auto window_index = ggml_new_tensor_1d(compute_ctx, - GGML_TYPE_I32, - llm_grid_h * llm_grid_w); - auto window_inverse_index = ggml_new_tensor_1d(compute_ctx, - GGML_TYPE_I32, - llm_grid_h * llm_grid_w); - set_backend_tensor_data(window_index, window_index_vec.data()); - set_backend_tensor_data(window_inverse_index, window_inverse_index_vec.data()); - - // window mask - int seq_window_size = (vit_merger_window_size * params.vision.spatial_merge_size) * (vit_merger_window_size * params.vision.spatial_merge_size); - window_mask_vec.resize((grid_h * grid_w) * (grid_h * grid_w)); - int window_start_index = 0; - for (int seq_index = 0; seq_index < seqlens.size(); seq_index++) { - int window_end_index = window_start_index + seqlens[seq_index]; - // LOG_DEBUG("%d %d", window_start_index, window_end_index); - GGML_ASSERT(window_end_index <= grid_h * grid_w); - for (int i = window_start_index; i < window_end_index; i++) { - for (int j = 0; j < grid_h * grid_w; j++) { - float mask_value = -INFINITY; - if (j >= window_start_index && j < window_end_index) { - mask_value = 0; - } - GGML_ASSERT((i * (grid_h * grid_w) + j) < window_mask_vec.size()); - window_mask_vec[i * (grid_h * grid_w) + j] = mask_value; - } - } - window_start_index = window_end_index; - // printf("\n"); - } - // printf("window_mask: \n"); - // for (int i = 0; i < grid_h*grid_w; i++) { - // for (int j = 0; j < grid_h*grid_w; j++) { - // printf("%f ", window_mask_vec[i * (grid_h * grid_w) + j]); - // } - // printf("\n"); - // } - auto window_mask = ggml_new_tensor_2d(compute_ctx, - GGML_TYPE_F32, - grid_h * grid_w, - grid_h * grid_w); - set_backend_tensor_data(window_mask, window_mask_vec.data()); - - // pe - int head_dim = static_cast(params.vision.hidden_size / params.vision.num_heads); - pe_vec = Rope::gen_qwen2vl_pe(grid_h, - grid_w, - params.vision.spatial_merge_size, - window_inverse_index_vec, - 10000, - {head_dim / 2, head_dim / 2}); - int pos_len = static_cast(pe_vec.size() / head_dim / 2); - // LOG_DEBUG("pos_len %d", pos_len); - auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, head_dim / 2, pos_len); - // pe->data = pe_vec.data(); - // print_ggml_tensor(pe); - // pe->data = nullptr; - set_backend_tensor_data(pe, pe_vec.data()); - auto runnter_ctx = get_context(); - ggml_tensor* hidden_states = vision_forward(&runnter_ctx, - pixel_values, - pe, - window_index, - window_inverse_index, - window_mask); + ggml_tensor* hidden_states = encode_image(&runnter_ctx, image); ggml_build_forward_expand(gf, hidden_states); return gf; diff --git a/src/model.cpp b/src/model.cpp index 0f13a02b..9d7a9233 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -437,6 +437,10 @@ SDVersion ModelLoader::get_sd_version() { if (tensor_storage.name.find("model.diffusion_model.joint_blocks.") != std::string::npos) { return VERSION_SD3; } + if (tensor_storage.name.find("model.x_embedder.proj1.weight") != std::string::npos && + tensor_storage_map.find("model.language_model.layers.0.self_attn.q_proj.weight") != tensor_storage_map.end()) { + return VERSION_HIDREAM_O1; + } if (tensor_storage.name.find("model.diffusion_model.transformer_blocks.0.img_mod.1.weight") != std::string::npos) { return VERSION_QWEN_IMAGE; } diff --git a/src/model.h b/src/model.h index 340a29ae..550274ef 100644 --- a/src/model.h +++ b/src/model.h @@ -42,6 +42,7 @@ enum SDVersion { VERSION_ANIMA, VERSION_FLUX2, VERSION_FLUX2_KLEIN, + VERSION_HIDREAM_O1, VERSION_Z_IMAGE, VERSION_OVIS_IMAGE, VERSION_ERNIE_IMAGE, @@ -163,6 +164,7 @@ static inline bool sd_version_is_dit(SDVersion version) { sd_version_is_sd3(version) || sd_version_is_wan(version) || sd_version_is_qwen_image(version) || + version == VERSION_HIDREAM_O1 || sd_version_is_anima(version) || sd_version_is_z_image(version) || sd_version_is_ernie_image(version)) { diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 8459e877..ccd52bd9 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -52,6 +52,7 @@ const char* model_version_to_str[] = { "Anima", "Flux.2", "Flux.2 klein", + "HiDream O1", "Z-Image", "Ovis Image", "Ernie Image", @@ -538,6 +539,14 @@ public: "model.diffusion_model", version, sd_ctx_params->qwen_image_zero_cond_t); + } else if (version == VERSION_HIDREAM_O1) { + cond_stage_model = std::make_shared(clip_backend, + offload_params_to_cpu, + tensor_storage_map); + diffusion_model = std::make_shared(backend, + offload_params_to_cpu, + tensor_storage_map, + "model"); } else if (sd_version_is_anima(version)) { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, @@ -671,7 +680,7 @@ public: bool force_vae_cpu = sd_ctx_params->keep_vae_on_cpu; - if (version == VERSION_CHROMA_RADIANCE) { + if (version == VERSION_CHROMA_RADIANCE || version == VERSION_HIDREAM_O1) { LOG_INFO("using FakeVAE"); first_stage_model = std::make_shared(version, vae_backend, @@ -835,6 +844,10 @@ public: ignore_tensors.insert("text_encoders.llm.vision_tower."); ignore_tensors.insert("text_encoders.llm.multi_modal_projector."); } + if (version == VERSION_HIDREAM_O1) { + ignore_tensors.insert("lm_head."); + ignore_tensors.insert("model.visual.deepstack_merger_list."); + } if (enable_mmap_tensors) { if (mmap_able_tensors.empty()) { @@ -972,6 +985,7 @@ public: } else if (sd_version_is_sd3(version) || sd_version_is_wan(version) || sd_version_is_qwen_image(version) || + version == VERSION_HIDREAM_O1 || sd_version_is_anima(version) || sd_version_is_ernie_image(version) || sd_version_is_z_image(version)) { @@ -1569,6 +1583,9 @@ public: if (sd_version_is_anima(version)) { return std::vector{t / static_cast(TIMESTEPS)}; } + if (version == VERSION_HIDREAM_O1) { + return std::vector{1.0f - (t / static_cast(TIMESTEPS))}; + } if (sd_version_is_z_image(version)) { return std::vector{1000.f - t}; } @@ -1657,6 +1674,7 @@ public: int shifted_timestep, sample_method_t method, bool is_flow_denoiser, + const char* extra_sample_args, const std::vector& sigmas, int start_merge_step, const std::vector>& ref_latents, @@ -1683,13 +1701,17 @@ public: } } - size_t steps = sigmas.size() - 1; - bool has_skiplayer = slg_scale != 0.0f && !skip_layers.empty(); + size_t steps = sigmas.size() - 1; + bool has_skiplayer = slg_scale != 0.0f && !skip_layers.empty(); if (has_skiplayer && !sd_version_is_dit(version)) { has_skiplayer = false; LOG_WARN("SLG is incompatible with this model type"); } + if (version == VERSION_HIDREAM_O1 && !noise.empty()) { + noise *= eta; + } + int64_t t0 = ggml_time_us(); sd::Tensor x_t = !noise.empty() ? denoiser->noise_scaling(sigmas[0], noise, init_latent) @@ -1764,12 +1786,18 @@ public: auto run_condition = [&](const SDCondition& condition, const sd::Tensor* c_concat_override = nullptr, const std::vector* local_skip_layers = nullptr) -> sd::Tensor { - diffusion_params.context = condition.c_crossattn.empty() ? nullptr : &condition.c_crossattn; - diffusion_params.c_concat = c_concat_override != nullptr ? c_concat_override : (condition.c_concat.empty() ? nullptr : &condition.c_concat); - diffusion_params.y = condition.c_vector.empty() ? nullptr : &condition.c_vector; - diffusion_params.t5_ids = condition.c_t5_ids.empty() ? nullptr : &condition.c_t5_ids; - diffusion_params.t5_weights = condition.c_t5_weights.empty() ? nullptr : &condition.c_t5_weights; - diffusion_params.skip_layers = local_skip_layers; + diffusion_params.context = condition.c_crossattn.empty() ? nullptr : &condition.c_crossattn; + diffusion_params.c_concat = c_concat_override != nullptr ? c_concat_override : (condition.c_concat.empty() ? nullptr : &condition.c_concat); + diffusion_params.y = condition.c_vector.empty() ? nullptr : &condition.c_vector; + diffusion_params.t5_ids = condition.c_t5_ids.empty() ? nullptr : &condition.c_t5_ids; + diffusion_params.t5_weights = condition.c_t5_weights.empty() ? nullptr : &condition.c_t5_weights; + diffusion_params.input_ids = condition.c_input_ids.empty() ? nullptr : &condition.c_input_ids; + diffusion_params.input_pos = condition.c_position_ids.empty() ? nullptr : &condition.c_position_ids; + diffusion_params.token_types = condition.c_token_types.empty() ? nullptr : &condition.c_token_types; + diffusion_params.vinput_mask = condition.c_vinput_mask.empty() ? nullptr : &condition.c_vinput_mask; + diffusion_params.image_embeds = condition.c_image_embeds.empty() ? nullptr : &condition.c_image_embeds; + diffusion_params.ref_latents = condition.c_ref_images.empty() ? &ref_latents : &condition.c_ref_images; + diffusion_params.skip_layers = local_skip_layers; sd::Tensor cached_output; if (step_cache.before_condition(&condition, noised_input, &cached_output)) { @@ -1855,7 +1883,7 @@ public: denoised = latent_result * c_out + x * c_skip; if (out_uncond_denoised != nullptr) { sd::Tensor base_uncond = !uncond_out.empty() ? uncond_out : cond_out; - *out_uncond_denoised = base_uncond * c_out + x * c_skip; + *out_uncond_denoised = base_uncond * c_out + x * c_skip; } if (cache_runtime.spectrum_enabled) { cache_runtime.spectrum.update(denoised); @@ -1870,7 +1898,7 @@ public: return denoised; }; - auto x0_opt = sample_k_diffusion(method, denoise, x_t, sigmas, sampler_rng, eta, is_flow_denoiser); + auto x0_opt = sample_k_diffusion(method, denoise, x_t, sigmas, sampler_rng, eta, is_flow_denoiser, extra_sample_args); if (x0_opt.empty()) { LOG_ERROR("Diffusion model sampling failed"); if (control_net) { @@ -1920,6 +1948,8 @@ public: if (sd_version_is_dit(version)) { if (version == VERSION_WAN2_2_TI2V) { latent_channel = 48; + } else if (version == VERSION_HIDREAM_O1) { + latent_channel = 3; } else if (version == VERSION_CHROMA_RADIANCE) { latent_channel = 3; } else if (sd_version_uses_flux2_vae(version)) { @@ -2361,6 +2391,7 @@ void sd_sample_params_init(sd_sample_params_t* sample_params) { sample_params->custom_sigmas = nullptr; sample_params->custom_sigmas_count = 0; sample_params->flow_shift = INFINITY; + sample_params->extra_sample_args = nullptr; } char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) { @@ -2382,7 +2413,8 @@ char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) { "sample_steps: %d, " "eta: %.2f, " "shifted_timestep: %d, " - "flow_shift: %.2f)", + "flow_shift: %.2f, " + "extra_sample_args: %s)", sample_params->guidance.txt_cfg, std::isfinite(sample_params->guidance.img_cfg) ? sample_params->guidance.img_cfg @@ -2397,7 +2429,8 @@ char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) { sample_params->sample_steps, sample_params->eta, sample_params->shifted_timestep, - sample_params->flow_shift); + sample_params->flow_shift, + SAFE_STR(sample_params->extra_sample_args)); return buf; } @@ -2609,6 +2642,9 @@ static float resolve_eta(sd_ctx_t* sd_ctx, float eta, enum sample_method_t sample_method) { if (eta == INFINITY) { + if (sd_ctx->sd->version == VERSION_HIDREAM_O1) { + return 8.f; + } switch (sample_method) { case DDIM_TRAILING_SAMPLE_METHOD: case TCD_SAMPLE_METHOD: @@ -2828,6 +2864,8 @@ struct GenerationRequest { struct SamplePlan { enum sample_method_t sample_method = SAMPLE_METHOD_COUNT; enum sample_method_t high_noise_sample_method = SAMPLE_METHOD_COUNT; + const char* extra_sample_args = nullptr; + const char* high_noise_extra_sample_args = nullptr; float eta = 0.f; float high_noise_eta = 0.f; int sample_steps = 0; @@ -2840,22 +2878,25 @@ struct SamplePlan { SamplePlan(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params, const GenerationRequest& request) { - sample_method = sd_img_gen_params->sample_params.sample_method; - eta = sd_img_gen_params->sample_params.eta; - sample_steps = sd_img_gen_params->sample_params.sample_steps; + sample_method = sd_img_gen_params->sample_params.sample_method; + extra_sample_args = sd_img_gen_params->sample_params.extra_sample_args; + eta = sd_img_gen_params->sample_params.eta; + sample_steps = sd_img_gen_params->sample_params.sample_steps; resolve(sd_ctx, &request, &sd_img_gen_params->sample_params); } SamplePlan(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params, const GenerationRequest& request) { - sample_method = sd_vid_gen_params->sample_params.sample_method; - eta = sd_vid_gen_params->sample_params.eta; - sample_steps = sd_vid_gen_params->sample_params.sample_steps; + sample_method = sd_vid_gen_params->sample_params.sample_method; + extra_sample_args = sd_vid_gen_params->sample_params.extra_sample_args; + eta = sd_vid_gen_params->sample_params.eta; + sample_steps = sd_vid_gen_params->sample_params.sample_steps; if (sd_ctx->sd->high_noise_diffusion_model) { - high_noise_sample_steps = sd_vid_gen_params->high_noise_sample_params.sample_steps; - high_noise_sample_method = sd_vid_gen_params->high_noise_sample_params.sample_method; - high_noise_eta = sd_vid_gen_params->high_noise_sample_params.eta; + high_noise_sample_steps = sd_vid_gen_params->high_noise_sample_params.sample_steps; + high_noise_sample_method = sd_vid_gen_params->high_noise_sample_params.sample_method; + high_noise_extra_sample_args = sd_vid_gen_params->high_noise_sample_params.extra_sample_args; + high_noise_eta = sd_vid_gen_params->high_noise_sample_params.eta; } moe_boundary = sd_vid_gen_params->moe_boundary; resolve(sd_ctx, &request, &sd_vid_gen_params->sample_params); @@ -3101,6 +3142,9 @@ static std::optional prepare_image_generation_latents(sd std::vector> ref_latents; for (size_t i = 0; i < ref_images.size(); i++) { + if (sd_ctx->sd->version == VERSION_HIDREAM_O1) { + continue; + } sd::Tensor ref_latent; if (request->auto_resize_ref_image) { LOG_DEBUG("auto resize ref images"); @@ -3511,6 +3555,7 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s request.shifted_timestep, plan.sample_method, sd_ctx->sd->is_flow_denoiser(), + plan.extra_sample_args, plan.sigmas, plan.start_merge_step, latents.ref_latents, @@ -3636,6 +3681,7 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s request.shifted_timestep, plan.sample_method, sd_ctx->sd->is_flow_denoiser(), + plan.extra_sample_args, hires_sigma_sched, plan.start_merge_step, latents.ref_latents, @@ -4000,6 +4046,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s request.shifted_timestep, plan.high_noise_sample_method, sd_ctx->sd->is_flow_denoiser(), + plan.high_noise_extra_sample_args, high_noise_sigmas, -1, std::vector>{}, @@ -4042,6 +4089,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s sd_vid_gen_params->sample_params.shifted_timestep, plan.sample_method, sd_ctx->sd->is_flow_denoiser(), + plan.extra_sample_args, plan.sigmas, -1, std::vector>{}, diff --git a/src/tokenizers/qwen2_tokenizer.cpp b/src/tokenizers/qwen2_tokenizer.cpp index 9929ea38..46ee3117 100644 --- a/src/tokenizers/qwen2_tokenizer.cpp +++ b/src/tokenizers/qwen2_tokenizer.cpp @@ -81,6 +81,11 @@ Qwen2Tokenizer::Qwen2Tokenizer(const std::string& merges_utf8_str) { "", "", "", + "<|boi_token|>", + "<|bor_token|>", + "<|eor_token|>", + "<|bot_token|>", + "<|tms_token|>", }; if (merges_utf8_str.size() > 0) { diff --git a/src/vae.hpp b/src/vae.hpp index 54bd88ab..35e73e41 100644 --- a/src/vae.hpp +++ b/src/vae.hpp @@ -71,7 +71,7 @@ public: scale_factor = 16; } else if (sd_version_uses_flux2_vae(version)) { scale_factor = 16; - } else if (version == VERSION_CHROMA_RADIANCE) { + } else if (version == VERSION_CHROMA_RADIANCE || version == VERSION_HIDREAM_O1) { scale_factor = 1; } return scale_factor;