refactor: simplify diffusion model runner params (#1569)

This commit is contained in:
leejet 2026-05-28 00:12:35 +08:00 committed by GitHub
parent 8eded497e5
commit 55c2aed52c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 417 additions and 955 deletions

View File

@ -7,6 +7,7 @@
#include <vector>
#include "common_block.hpp"
#include "diffusion_model.hpp"
#include "flux.hpp"
#include "rope.hpp"
@ -518,7 +519,7 @@ namespace Anima {
}
};
struct AnimaRunner : public GGMLRunner {
struct AnimaRunner : public DiffusionModelRunner {
public:
std::vector<float> image_pe_vec;
std::vector<float> adapter_q_pe_vec;
@ -529,7 +530,7 @@ namespace Anima {
ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model")
: GGMLRunner(backend, params_backend) {
: DiffusionModelRunner(backend, params_backend, prefix) {
int64_t num_layers = 0;
std::string layer_tag = prefix + ".net.blocks.";
for (const auto& kv : tensor_storage_map) {
@ -559,7 +560,7 @@ namespace Anima {
return "anima";
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
// Collects the parameter tensors of the wrapped `net` into `tensors`,
// registering them under "<prefix>.net".
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
    const std::string net_prefix = prefix + ".net";
    net.get_param_tensors(tensors, net_prefix);
}
@ -684,6 +685,19 @@ namespace Anima {
};
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
}
// Generic entry point: unpacks a DiffusionParams bundle and forwards to the
// Anima-specific compute overload.
//   - x and timesteps must be set (asserted).
//   - diffusion_params.extra must hold an AnimaDiffusionExtra
//     (diffusion_extra_as asserts on any other alternative).
//   - context, t5_ids and t5_weights may be null; an empty tensor is passed instead.
sd::Tensor<float> compute(int n_threads,
                          const DiffusionParams& diffusion_params) override {
    GGML_ASSERT(diffusion_params.x != nullptr);
    GGML_ASSERT(diffusion_params.timesteps != nullptr);
    const AnimaDiffusionExtra* extra = diffusion_extra_as<AnimaDiffusionExtra>(diffusion_params);

    const auto& latent         = *diffusion_params.x;
    const auto& step_values    = *diffusion_params.timesteps;
    const auto& text_context   = tensor_or_empty(diffusion_params.context);
    const auto& t5_token_ids   = tensor_or_empty(extra->t5_ids);
    const auto& t5_token_scale = tensor_or_empty(extra->t5_weights);

    return compute(n_threads, latent, step_values, text_context, t5_token_ids, t5_token_scale);
}
};
} // namespace Anima

View File

@ -102,7 +102,6 @@ struct ConditionerParams {
int clip_skip = -1;
int width = -1;
int height = -1;
int adm_in_channels = -1;
bool zero_out_masked = false;
int num_input_imgs = 0; // for photomaker
const std::vector<sd::Tensor<float>>* ref_images = nullptr; // for qwen image edit
@ -502,7 +501,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
int clip_skip,
int width,
int height,
int adm_in_channels = -1,
bool zero_out_masked = false) {
int64_t t0 = ggml_time_ms();
sd::Tensor<float> hidden_states; // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2]
@ -589,6 +587,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
sd::Tensor<float> vec;
if (sd_version_is_sdxl(version)) {
int out_dim = 256;
int adm_in_channels = 2816;
GGML_ASSERT(!pooled.empty());
vec = sd::Tensor<float>({adm_in_channels});
vec.fill_(0.0f);
@ -647,7 +646,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
conditioner_params.clip_skip,
conditioner_params.width,
conditioner_params.height,
conditioner_params.adm_in_channels,
conditioner_params.zero_out_masked);
return std::make_tuple(cond, clsm);
}
@ -674,7 +672,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
conditioner_params.clip_skip,
conditioner_params.width,
conditioner_params.height,
conditioner_params.adm_in_channels,
conditioner_params.zero_out_masked);
}
};

View File

@ -1,842 +1,107 @@
#ifndef __DIFFUSION_MODEL_H__
#define __DIFFUSION_MODEL_H__
#include <optional>
#include "anima.hpp"
#include "ernie_image.hpp"
#include "flux.hpp"
#include "hidream_o1.hpp"
#include "lens.hpp"
#include "ltxv.hpp"
#include "mmdit.hpp"
#include "qwen_image.hpp"
#include "tensor_ggml.hpp"
#include "unet.hpp"
#include "wan.hpp"
#include "z_image.hpp"
#include <string>
#include <utility>
#include <variant>
struct DiffusionParams {
const sd::Tensor<float>* x = nullptr;
const sd::Tensor<float>* timesteps = nullptr;
const sd::Tensor<float>* audio_x = nullptr;
const sd::Tensor<float>* audio_timesteps = nullptr;
const sd::Tensor<float>* context = nullptr;
const sd::Tensor<float>* c_concat = nullptr;
const sd::Tensor<float>* y = nullptr;
#include "ggml_extend.hpp"
#include "tensor_ggml.hpp"
// Model-specific inputs for the UNet path; one alternative of DiffusionExtraParams.
// The struct only stores a raw pointer and does not manage the pointee's lifetime.
struct UNetDiffusionExtra {
// Defaults to -1. NOTE(review): presumably "not a video batch" — confirm with the UNet runner.
int num_video_frames = -1;
// Optional control tensors; nullptr by default.
const std::vector<sd::Tensor<float>>* controls = nullptr;
// Strength applied to `controls`; defaults to 0.
float control_strength = 0.f;
};
// Extra inputs consisting only of an optional skip-layer list; one alternative
// of DiffusionExtraParams.
struct SkipLayerDiffusionExtra {
// nullptr by default. NOTE(review): presumably indices of layers to skip — confirm with the consuming runner.
const std::vector<int>* skip_layers = nullptr;
};
// Model-specific inputs read by FluxRunner::compute(int, const DiffusionParams&);
// one alternative of DiffusionExtraParams. Both members are optional raw pointers.
struct FluxDiffusionExtra {
// Optional guidance tensor; a null pointer is replaced by an empty tensor by the Flux runner.
const sd::Tensor<float>* guidance = nullptr;
// Optional skip-layer list; a null pointer is replaced by an empty list by the Flux runner.
const std::vector<int>* skip_layers = nullptr;
};
// Model-specific inputs for the Anima runner; one alternative of DiffusionExtraParams.
// All members are optional raw pointers that default to nullptr.
struct AnimaDiffusionExtra {
// Read by AnimaRunner::compute; null is replaced by an empty tensor there.
const sd::Tensor<int32_t>* t5_ids = nullptr;
// Read by AnimaRunner::compute; null is replaced by an empty tensor there.
const sd::Tensor<float>* t5_weights = nullptr;
// NOTE(review): the two members below are not read by the Anima compute path
// visible in this change; they may be leftovers of the diff rendering — confirm.
const sd::Tensor<float>* guidance = nullptr;
const std::vector<sd::Tensor<float>>* ref_latents = nullptr;
};
// Model-specific inputs for the Wan path; one alternative of DiffusionExtraParams.
struct WanDiffusionExtra {
// Optional VACE context tensor; nullptr by default.
const sd::Tensor<float>* vace_context = nullptr;
// Strength paired with `vace_context`; defaults to 1.
float vace_strength = 1.f;
};
// Model-specific inputs for the HiDream-O1 runner; one alternative of DiffusionExtraParams.
// Pointer members default to nullptr and are not owned by this struct.
struct HiDreamO1DiffusionExtra {
// input_ids, input_pos and token_types are asserted non-null by the HiDream-O1 compute path.
const sd::Tensor<int32_t>* input_ids = nullptr;
const sd::Tensor<int32_t>* input_pos = nullptr;
const sd::Tensor<int32_t>* token_types = nullptr;
// Optional; a null pointer is replaced by an empty tensor by the runner.
const sd::Tensor<int32_t>* vinput_mask = nullptr;
const std::vector<sd::Tensor<float>>* vlm_images = nullptr;
// Optional list of (int, tensor) pairs.
// NOTE(review): the meaning of the int key is not visible here — confirm.
const std::vector<std::pair<int, sd::Tensor<float>>>* image_embeds = nullptr;
// NOTE(review): the members below duplicate fields of UNetDiffusionExtra,
// WanDiffusionExtra and DiffusionParams; they look like removed-side lines of
// the diff rather than real HiDream-O1 inputs — confirm before relying on them.
bool increase_ref_index = false;
int num_video_frames = -1;
const std::vector<sd::Tensor<float>>* controls = nullptr;
float control_strength = 0.f;
const sd::Tensor<float>* vace_context = nullptr;
float vace_strength = 1.f;
};
// Model-specific inputs for the LTX audio/video path; one alternative of DiffusionExtraParams.
// Pointer members default to nullptr and are not owned by this struct.
struct LTXAVDiffusionExtra {
// Optional audio tensor and its timesteps.
const sd::Tensor<float>* audio_x = nullptr;
const sd::Tensor<float>* audio_timesteps = nullptr;
// Defaults to 0. NOTE(review): unit (frames/samples/tokens) is not visible here — confirm.
int audio_length = 0;
// Defaults to 24.
float frame_rate = 24.f;
const sd::Tensor<float>* video_positions = nullptr;
// NOTE(review): may be a removed-side leftover of the diff; confirm it belongs to this struct.
const std::vector<int>* skip_layers = nullptr;
};
// Tagged union of the per-model extra-input structs carried by DiffusionParams::extra.
// std::monostate is the first alternative, so a default-constructed value holds
// "no extras". diffusion_extra_as<T>() retrieves a specific alternative.
using DiffusionExtraParams = std::variant<std::monostate,
UNetDiffusionExtra,
SkipLayerDiffusionExtra,
FluxDiffusionExtra,
AnimaDiffusionExtra,
WanDiffusionExtra,
HiDreamO1DiffusionExtra,
LTXAVDiffusionExtra>;
// Input bundle for DiffusionModelRunner::compute(int, const DiffusionParams&).
// Holds raw pointers only (all nullptr by default); the struct does not manage
// the pointees' lifetime. The compute overrides visible in this change assert
// that x and timesteps are set and substitute empty tensors/lists for the
// other pointers when they are null.
struct DiffusionParams {
// Required by the compute overrides (asserted non-null).
const sd::Tensor<float>* x = nullptr;
// Required by the compute overrides (asserted non-null).
const sd::Tensor<float>* timesteps = nullptr;
// Optional inputs shared by several models.
const sd::Tensor<float>* context = nullptr;
const sd::Tensor<float>* c_concat = nullptr;
const sd::Tensor<float>* y = nullptr;
const std::vector<sd::Tensor<float>>* ref_latents = nullptr;
// Forwarded alongside ref_latents by the Flux runner.
bool increase_ref_index = false;
// Model-specific inputs; defaults to std::monostate (none).
DiffusionExtraParams extra = std::monostate{};
};
// Fetches the alternative of type T held in params.extra.
// GGML_ASSERT fails when params.extra holds any other alternative (including
// std::monostate); callers in this change dereference the result without a
// further null check.
template <typename T>
static inline const T* diffusion_extra_as(const DiffusionParams& params) {
    const T* extra = std::get_if<T>(&params.extra);
    GGML_ASSERT(extra != nullptr);
    return extra;
}
// Null-tolerant dereference: yields *tensor when the pointer is set, otherwise
// a reference to a shared default-constructed tensor of the same element type.
// The returned reference is never dangling for the null case because the
// fallback has static storage duration.
template <typename T>
static inline const sd::Tensor<T>& tensor_or_empty(const sd::Tensor<T>* tensor) {
    static const sd::Tensor<T> kEmpty;
    if (tensor == nullptr) {
        return kEmpty;
    }
    return *tensor;
}
struct DiffusionModel {
virtual std::string get_desc() = 0;
struct DiffusionModelRunner : public GGMLRunner {
protected:
std::string prefix;
public:
DiffusionModelRunner(ggml_backend_t backend,
ggml_backend_t params_backend,
const std::string& prefix)
: GGMLRunner(backend, params_backend),
prefix(prefix) {}
virtual sd::Tensor<float> compute(int n_threads,
const DiffusionParams& diffusion_params) = 0;
virtual void alloc_params_buffer() = 0;
virtual void free_params_buffer() = 0;
virtual void free_compute_buffer() = 0;
virtual void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) = 0;
virtual size_t get_params_buffer_size() = 0;
virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter){};
virtual int64_t get_adm_in_channels() = 0;
virtual void set_flash_attention_enabled(bool enabled) = 0;
virtual void set_max_graph_vram_bytes(size_t max_vram_bytes) = 0;
virtual void set_circular_axes(bool circular_x, bool circular_y) = 0;
};
struct UNetModel : public DiffusionModel {
UNetModelRunner unet;
UNetModel(ggml_backend_t backend,
ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
SDVersion version = VERSION_SD1)
: unet(backend, params_backend, tensor_storage_map, "model.diffusion_model", version) {
}
std::string get_desc() override {
return unet.get_desc();
}
void alloc_params_buffer() override {
unet.alloc_params_buffer();
}
void free_params_buffer() override {
unet.free_params_buffer();
}
void free_compute_buffer() override {
unet.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
unet.get_param_tensors(tensors, "model.diffusion_model");
}
size_t get_params_buffer_size() override {
return unet.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
unet.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return unet.unet.adm_in_channels;
}
void set_flash_attention_enabled(bool enabled) {
unet.set_flash_attention_enabled(enabled);
}
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
unet.set_max_graph_vram_bytes(max_vram_bytes);
}
void set_circular_axes(bool circular_x, bool circular_y) override {
unet.set_circular_axes(circular_x, circular_y);
}
sd::Tensor<float> compute(int n_threads,
const DiffusionParams& diffusion_params) override {
GGML_ASSERT(diffusion_params.x != nullptr);
GGML_ASSERT(diffusion_params.timesteps != nullptr);
static const std::vector<sd::Tensor<float>> empty_controls;
return unet.compute(n_threads,
*diffusion_params.x,
*diffusion_params.timesteps,
tensor_or_empty(diffusion_params.context),
tensor_or_empty(diffusion_params.c_concat),
tensor_or_empty(diffusion_params.y),
diffusion_params.num_video_frames,
diffusion_params.controls ? *diffusion_params.controls : empty_controls,
diffusion_params.control_strength);
}
};
struct MMDiTModel : public DiffusionModel {
MMDiTRunner mmdit;
MMDiTModel(ggml_backend_t backend,
ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {})
: mmdit(backend, params_backend, tensor_storage_map, "model.diffusion_model") {
}
std::string get_desc() override {
return mmdit.get_desc();
}
void alloc_params_buffer() override {
mmdit.alloc_params_buffer();
}
void free_params_buffer() override {
mmdit.free_params_buffer();
}
void free_compute_buffer() override {
mmdit.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
mmdit.get_param_tensors(tensors, "model.diffusion_model");
}
size_t get_params_buffer_size() override {
return mmdit.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
mmdit.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return 768 + 1280;
}
void set_flash_attention_enabled(bool enabled) {
mmdit.set_flash_attention_enabled(enabled);
}
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
mmdit.set_max_graph_vram_bytes(max_vram_bytes);
}
void set_circular_axes(bool circular_x, bool circular_y) override {
mmdit.set_circular_axes(circular_x, circular_y);
}
sd::Tensor<float> compute(int n_threads,
const DiffusionParams& diffusion_params) override {
GGML_ASSERT(diffusion_params.x != nullptr);
GGML_ASSERT(diffusion_params.timesteps != nullptr);
static const std::vector<int> empty_skip_layers;
return mmdit.compute(n_threads,
*diffusion_params.x,
*diffusion_params.timesteps,
tensor_or_empty(diffusion_params.context),
tensor_or_empty(diffusion_params.y),
diffusion_params.skip_layers ? *diffusion_params.skip_layers : empty_skip_layers);
}
};
struct FluxModel : public DiffusionModel {
Flux::FluxRunner flux;
FluxModel(ggml_backend_t backend,
ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
SDVersion version = VERSION_FLUX,
bool use_mask = false)
: flux(backend, params_backend, tensor_storage_map, "model.diffusion_model", version, use_mask) {
}
std::string get_desc() override {
return flux.get_desc();
}
void alloc_params_buffer() override {
flux.alloc_params_buffer();
}
void free_params_buffer() override {
flux.free_params_buffer();
}
void free_compute_buffer() override {
flux.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
flux.get_param_tensors(tensors, "model.diffusion_model");
}
size_t get_params_buffer_size() override {
return flux.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
flux.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return 768;
}
void set_flash_attention_enabled(bool enabled) {
flux.set_flash_attention_enabled(enabled);
}
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
flux.set_max_graph_vram_bytes(max_vram_bytes);
}
void set_circular_axes(bool circular_x, bool circular_y) override {
flux.set_circular_axes(circular_x, circular_y);
}
sd::Tensor<float> compute(int n_threads,
const DiffusionParams& diffusion_params) override {
GGML_ASSERT(diffusion_params.x != nullptr);
GGML_ASSERT(diffusion_params.timesteps != nullptr);
static const std::vector<sd::Tensor<float>> empty_ref_latents;
static const std::vector<int> empty_skip_layers;
return flux.compute(n_threads,
*diffusion_params.x,
*diffusion_params.timesteps,
tensor_or_empty(diffusion_params.context),
tensor_or_empty(diffusion_params.c_concat),
tensor_or_empty(diffusion_params.y),
tensor_or_empty(diffusion_params.guidance),
diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents,
diffusion_params.increase_ref_index,
diffusion_params.skip_layers ? *diffusion_params.skip_layers : empty_skip_layers);
}
};
struct AnimaModel : public DiffusionModel {
std::string prefix;
Anima::AnimaRunner anima;
AnimaModel(ggml_backend_t backend,
ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model")
: prefix(prefix), anima(backend, params_backend, tensor_storage_map, prefix) {
}
std::string get_desc() override {
return anima.get_desc();
}
void alloc_params_buffer() override {
anima.alloc_params_buffer();
}
void free_params_buffer() override {
anima.free_params_buffer();
}
void free_compute_buffer() override {
anima.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
anima.get_param_tensors(tensors, prefix);
}
size_t get_params_buffer_size() override {
return anima.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
anima.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return 768;
}
void set_flash_attention_enabled(bool enabled) {
anima.set_flash_attention_enabled(enabled);
}
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
anima.set_max_graph_vram_bytes(max_vram_bytes);
}
void set_circular_axes(bool circular_x, bool circular_y) override {
anima.set_circular_axes(circular_x, circular_y);
}
sd::Tensor<float> compute(int n_threads,
const DiffusionParams& diffusion_params) override {
GGML_ASSERT(diffusion_params.x != nullptr);
GGML_ASSERT(diffusion_params.timesteps != nullptr);
return anima.compute(n_threads,
*diffusion_params.x,
*diffusion_params.timesteps,
tensor_or_empty(diffusion_params.context),
tensor_or_empty(diffusion_params.t5_ids),
tensor_or_empty(diffusion_params.t5_weights));
}
};
struct WanModel : public DiffusionModel {
std::string prefix;
WAN::WanRunner wan;
WanModel(ggml_backend_t backend,
ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model",
SDVersion version = VERSION_WAN2)
: prefix(prefix), wan(backend, params_backend, tensor_storage_map, prefix, version) {
}
std::string get_desc() override {
return wan.get_desc();
}
void alloc_params_buffer() override {
wan.alloc_params_buffer();
}
void free_params_buffer() override {
wan.free_params_buffer();
}
void free_compute_buffer() override {
wan.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
wan.get_param_tensors(tensors, prefix);
}
size_t get_params_buffer_size() override {
return wan.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
wan.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return 768;
}
void set_flash_attention_enabled(bool enabled) {
wan.set_flash_attention_enabled(enabled);
}
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
wan.set_max_graph_vram_bytes(max_vram_bytes);
}
void set_circular_axes(bool circular_x, bool circular_y) override {
wan.set_circular_axes(circular_x, circular_y);
}
sd::Tensor<float> compute(int n_threads,
const DiffusionParams& diffusion_params) override {
GGML_ASSERT(diffusion_params.x != nullptr);
GGML_ASSERT(diffusion_params.timesteps != nullptr);
return wan.compute(n_threads,
*diffusion_params.x,
*diffusion_params.timesteps,
tensor_or_empty(diffusion_params.context),
tensor_or_empty(diffusion_params.y),
tensor_or_empty(diffusion_params.c_concat),
sd::Tensor<float>(),
tensor_or_empty(diffusion_params.vace_context),
diffusion_params.vace_strength);
}
};
struct QwenImageModel : public DiffusionModel {
std::string prefix;
Qwen::QwenImageRunner qwen_image;
QwenImageModel(ggml_backend_t backend,
ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model",
SDVersion version = VERSION_QWEN_IMAGE,
bool zero_cond_t = false)
: prefix(prefix), qwen_image(backend, params_backend, tensor_storage_map, prefix, version, zero_cond_t) {
}
std::string get_desc() override {
return qwen_image.get_desc();
}
void alloc_params_buffer() override {
qwen_image.alloc_params_buffer();
}
void free_params_buffer() override {
qwen_image.free_params_buffer();
}
void free_compute_buffer() override {
qwen_image.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
qwen_image.get_param_tensors(tensors, prefix);
}
size_t get_params_buffer_size() override {
return qwen_image.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
qwen_image.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return 768;
}
void set_flash_attention_enabled(bool enabled) {
qwen_image.set_flash_attention_enabled(enabled);
}
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
qwen_image.set_max_graph_vram_bytes(max_vram_bytes);
}
void set_circular_axes(bool circular_x, bool circular_y) override {
qwen_image.set_circular_axes(circular_x, circular_y);
}
sd::Tensor<float> compute(int n_threads,
const DiffusionParams& diffusion_params) override {
GGML_ASSERT(diffusion_params.x != nullptr);
GGML_ASSERT(diffusion_params.timesteps != nullptr);
static const std::vector<sd::Tensor<float>> empty_ref_latents;
return qwen_image.compute(n_threads,
*diffusion_params.x,
*diffusion_params.timesteps,
tensor_or_empty(diffusion_params.context),
diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents,
true);
}
};
struct HiDreamO1Model : public DiffusionModel {
std::string prefix;
HiDreamO1::HiDreamO1Runner hidream_o1;
HiDreamO1Model(ggml_backend_t backend,
ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string& prefix = "model")
: prefix(prefix), hidream_o1(backend, params_backend, tensor_storage_map, prefix) {
}
std::string get_desc() override {
return hidream_o1.get_desc();
}
void alloc_params_buffer() override {
hidream_o1.alloc_params_buffer();
}
void free_params_buffer() override {
hidream_o1.free_params_buffer();
}
void free_compute_buffer() override {
hidream_o1.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
hidream_o1.get_param_tensors(tensors, prefix);
}
size_t get_params_buffer_size() override {
return hidream_o1.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
hidream_o1.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return 0;
}
void set_flash_attention_enabled(bool enabled) {
hidream_o1.set_flash_attention_enabled(enabled);
}
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
hidream_o1.set_max_graph_vram_bytes(max_vram_bytes);
}
void set_circular_axes(bool circular_x, bool circular_y) override {
hidream_o1.set_circular_axes(circular_x, circular_y);
}
sd::Tensor<float> compute(int n_threads,
const DiffusionParams& diffusion_params) override {
GGML_ASSERT(diffusion_params.x != nullptr);
GGML_ASSERT(diffusion_params.timesteps != nullptr);
GGML_ASSERT(diffusion_params.input_ids != nullptr);
GGML_ASSERT(diffusion_params.input_pos != nullptr);
GGML_ASSERT(diffusion_params.token_types != nullptr);
static const std::vector<sd::Tensor<float>> empty_images;
static const std::vector<std::pair<int, sd::Tensor<float>>> empty_image_embeds;
return hidream_o1.compute(n_threads,
*diffusion_params.x,
*diffusion_params.timesteps,
*diffusion_params.input_ids,
*diffusion_params.input_pos,
*diffusion_params.token_types,
tensor_or_empty(diffusion_params.vinput_mask),
diffusion_params.image_embeds ? *diffusion_params.image_embeds : empty_image_embeds,
diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_images);
}
};
struct ZImageModel : public DiffusionModel {
std::string prefix;
ZImage::ZImageRunner z_image;
ZImageModel(ggml_backend_t backend,
ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model",
SDVersion version = VERSION_Z_IMAGE)
: prefix(prefix), z_image(backend, params_backend, tensor_storage_map, prefix, version) {
}
std::string get_desc() override {
return z_image.get_desc();
}
void alloc_params_buffer() override {
z_image.alloc_params_buffer();
}
void free_params_buffer() override {
z_image.free_params_buffer();
}
void free_compute_buffer() override {
z_image.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
z_image.get_param_tensors(tensors, prefix);
}
size_t get_params_buffer_size() override {
return z_image.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
z_image.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return 768;
}
void set_flash_attention_enabled(bool enabled) {
z_image.set_flash_attention_enabled(enabled);
}
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
z_image.set_max_graph_vram_bytes(max_vram_bytes);
}
void set_circular_axes(bool circular_x, bool circular_y) override {
z_image.set_circular_axes(circular_x, circular_y);
}
sd::Tensor<float> compute(int n_threads,
const DiffusionParams& diffusion_params) override {
GGML_ASSERT(diffusion_params.x != nullptr);
GGML_ASSERT(diffusion_params.timesteps != nullptr);
static const std::vector<sd::Tensor<float>> empty_ref_latents;
return z_image.compute(n_threads,
*diffusion_params.x,
*diffusion_params.timesteps,
tensor_or_empty(diffusion_params.context),
diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents,
true);
}
};
struct ErnieImageModel : public DiffusionModel {
std::string prefix;
ErnieImage::ErnieImageRunner ernie_image;
ErnieImageModel(ggml_backend_t backend,
ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model")
: prefix(prefix), ernie_image(backend, params_backend, tensor_storage_map, prefix) {
}
std::string get_desc() override {
return ernie_image.get_desc();
}
void alloc_params_buffer() override {
ernie_image.alloc_params_buffer();
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) {
get_param_tensors(tensors, prefix);
}
void free_params_buffer() override {
ernie_image.free_params_buffer();
}
void free_compute_buffer() override {
ernie_image.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
ernie_image.get_param_tensors(tensors, prefix);
}
size_t get_params_buffer_size() override {
return ernie_image.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
ernie_image.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return 768;
}
void set_flash_attention_enabled(bool enabled) {
ernie_image.set_flash_attention_enabled(enabled);
}
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
ernie_image.set_max_graph_vram_bytes(max_vram_bytes);
}
void set_circular_axes(bool circular_x, bool circular_y) override {
ernie_image.set_circular_axes(circular_x, circular_y);
}
sd::Tensor<float> compute(int n_threads,
const DiffusionParams& diffusion_params) override {
GGML_ASSERT(diffusion_params.x != nullptr);
GGML_ASSERT(diffusion_params.timesteps != nullptr);
return ernie_image.compute(n_threads,
*diffusion_params.x,
*diffusion_params.timesteps,
tensor_or_empty(diffusion_params.context));
}
};
struct LensModel : public DiffusionModel {
std::string prefix;
Lens::LensRunner lens;
LensModel(ggml_backend_t backend,
ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model")
: prefix(prefix), lens(backend, params_backend, tensor_storage_map, prefix) {
}
std::string get_desc() override {
return lens.get_desc();
}
void alloc_params_buffer() override {
lens.alloc_params_buffer();
}
void free_params_buffer() override {
lens.free_params_buffer();
}
void free_compute_buffer() override {
lens.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
lens.get_param_tensors(tensors, prefix);
}
size_t get_params_buffer_size() override {
return lens.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
lens.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return 768;
}
void set_flash_attention_enabled(bool enabled) {
lens.set_flash_attention_enabled(enabled);
}
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
lens.set_max_graph_vram_bytes(max_vram_bytes);
}
void set_circular_axes(bool circular_x, bool circular_y) override {
lens.set_circular_axes(circular_x, circular_y);
}
sd::Tensor<float> compute(int n_threads,
const DiffusionParams& diffusion_params) override {
GGML_ASSERT(diffusion_params.x != nullptr);
GGML_ASSERT(diffusion_params.timesteps != nullptr);
return lens.compute(n_threads,
*diffusion_params.x,
*diffusion_params.timesteps,
tensor_or_empty(diffusion_params.context));
}
};
struct LTXAVModel : public DiffusionModel {
std::string prefix;
LTXV::LTXAVRunner ltxav;
LTXAVModel(ggml_backend_t backend,
ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model")
: prefix(prefix), ltxav(backend, params_backend, tensor_storage_map, prefix) {
}
std::string get_desc() override {
return ltxav.get_desc();
}
void alloc_params_buffer() override {
ltxav.alloc_params_buffer();
}
void free_params_buffer() override {
ltxav.free_params_buffer();
}
void free_compute_buffer() override {
ltxav.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
ltxav.get_param_tensors(tensors, prefix);
}
size_t get_params_buffer_size() override {
return ltxav.get_params_buffer_size();
}
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
ltxav.set_weight_adapter(adapter);
}
int64_t get_adm_in_channels() override {
return 0;
}
void set_flash_attention_enabled(bool enabled) override {
ltxav.set_flash_attention_enabled(enabled);
}
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
ltxav.set_max_graph_vram_bytes(max_vram_bytes);
}
void set_circular_axes(bool circular_x, bool circular_y) override {
ltxav.set_circular_axes(circular_x, circular_y);
}
sd::Tensor<float> compute(int n_threads,
const DiffusionParams& diffusion_params) override {
GGML_ASSERT(diffusion_params.x != nullptr);
GGML_ASSERT(diffusion_params.timesteps != nullptr);
return ltxav.compute(n_threads,
*diffusion_params.x,
*diffusion_params.timesteps,
tensor_or_empty(diffusion_params.context),
tensor_or_empty(diffusion_params.audio_x),
tensor_or_empty(diffusion_params.audio_timesteps),
diffusion_params.audio_length,
diffusion_params.frame_rate,
tensor_or_empty(diffusion_params.video_positions));
}
virtual void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors,
const std::string& prefix) = 0;
};
#endif

View File

@ -5,6 +5,7 @@
#include <vector>
#include "common_dit.hpp"
#include "diffusion_model.hpp"
#include "flux.hpp"
#include "qwen_image.hpp"
#include "rope.hpp"
@ -325,7 +326,7 @@ namespace ErnieImage {
}
};
struct ErnieImageRunner : public GGMLRunner {
struct ErnieImageRunner : public DiffusionModelRunner {
ErnieImageParams ernie_params;
ErnieImageModel ernie_image;
std::vector<float> pe_vec;
@ -334,7 +335,7 @@ namespace ErnieImage {
ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "")
: GGMLRunner(backend, params_backend) {
: DiffusionModelRunner(backend, params_backend, prefix) {
ernie_params.num_layers = 0;
for (const auto& [name, tensor_storage] : tensor_storage_map) {
if (!starts_with(name, prefix)) {
@ -393,7 +394,7 @@ namespace ErnieImage {
return "ernie_image";
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
// Collects the parameter tensors of the wrapped `ernie_image` model into
// `tensors`, forwarding `prefix` unchanged.
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
ernie_image.get_param_tensors(tensors, prefix);
}
@ -435,6 +436,16 @@ namespace ErnieImage {
};
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
}
// Generic entry point: unpacks a DiffusionParams bundle and forwards to the
// Ernie-specific compute overload. x and timesteps must be set (asserted);
// a null context is replaced by an empty tensor. diffusion_params.extra is
// not consulted.
sd::Tensor<float> compute(int n_threads,
                          const DiffusionParams& diffusion_params) override {
    GGML_ASSERT(diffusion_params.x != nullptr);
    GGML_ASSERT(diffusion_params.timesteps != nullptr);

    const auto& latent       = *diffusion_params.x;
    const auto& step_values  = *diffusion_params.timesteps;
    const auto& text_context = tensor_or_empty(diffusion_params.context);

    return compute(n_threads, latent, step_values, text_context);
}
};
} // namespace ErnieImage

View File

@ -5,6 +5,7 @@
#include <vector>
#include "common_dit.hpp"
#include "diffusion_model.hpp"
#include "model.h"
#include "rope.hpp"
@ -1176,7 +1177,7 @@ namespace Flux {
}
};
struct FluxRunner : public GGMLRunner {
struct FluxRunner : public DiffusionModelRunner {
public:
FluxParams flux_params;
Flux flux;
@ -1193,7 +1194,7 @@ namespace Flux {
const std::string prefix = "",
SDVersion version = VERSION_FLUX,
bool use_mask = false)
: GGMLRunner(backend, params_backend), version(version), use_mask(use_mask) {
: DiffusionModelRunner(backend, params_backend, prefix), version(version), use_mask(use_mask) {
flux_params.version = version;
flux_params.guidance_embed = false;
flux_params.depth = 0;
@ -1308,7 +1309,7 @@ namespace Flux {
return "flux";
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
// Collects the parameter tensors of the wrapped `flux` model into `tensors`,
// forwarding `prefix` unchanged.
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
flux.get_param_tensors(tensors, prefix);
}
@ -1490,6 +1491,25 @@ namespace Flux {
return result;
}
// Generic entry point: unpacks a DiffusionParams bundle and forwards to the
// Flux-specific compute overload.
//   - x and timesteps must be set (asserted).
//   - diffusion_params.extra must hold a FluxDiffusionExtra
//     (diffusion_extra_as asserts on any other alternative).
//   - Null optional tensors are replaced by empty tensors; null ref_latents
//     and skip_layers are replaced by empty lists.
sd::Tensor<float> compute(int n_threads,
                          const DiffusionParams& diffusion_params) override {
    GGML_ASSERT(diffusion_params.x != nullptr);
    GGML_ASSERT(diffusion_params.timesteps != nullptr);
    const FluxDiffusionExtra* extra = diffusion_extra_as<FluxDiffusionExtra>(diffusion_params);
    static const std::vector<sd::Tensor<float>> empty_ref_latents;
    static const std::vector<int> empty_skip_layers;

    // Resolve the optional lists to something dereferenceable up front.
    const std::vector<sd::Tensor<float>>* ref_list = &empty_ref_latents;
    if (diffusion_params.ref_latents) {
        ref_list = diffusion_params.ref_latents;
    }
    const std::vector<int>* skip_list = &empty_skip_layers;
    if (extra->skip_layers) {
        skip_list = extra->skip_layers;
    }

    return compute(n_threads,
                   *diffusion_params.x,
                   *diffusion_params.timesteps,
                   tensor_or_empty(diffusion_params.context),
                   tensor_or_empty(diffusion_params.c_concat),
                   tensor_or_empty(diffusion_params.y),
                   tensor_or_empty(extra->guidance),
                   *ref_list,
                   diffusion_params.increase_ref_index,
                   *skip_list);
}
void test() {
ggml_init_params params;
params.mem_size = static_cast<size_t>(1024 * 1024) * 1024; // 1GB

View File

@ -12,6 +12,7 @@
#include "common_dit.hpp"
#include "conditioner.hpp"
#include "diffusion_model.hpp"
#include "llm.hpp"
#include "util.h"
@ -329,7 +330,7 @@ namespace HiDreamO1 {
}
};
struct HiDreamO1Runner : public GGMLRunner {
struct HiDreamO1Runner : public DiffusionModelRunner {
HiDreamO1Params params;
HiDreamO1Model model;
@ -339,7 +340,7 @@ namespace HiDreamO1 {
ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string& prefix = "model")
: GGMLRunner(backend, params_backend),
: DiffusionModelRunner(backend, params_backend, prefix),
params(make_hidream_o1_params()) {
model = HiDreamO1Model(params);
model.init(params_ctx, tensor_storage_map, prefix);
@ -349,7 +350,7 @@ namespace HiDreamO1 {
return "hidream_o1";
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) {
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
model.get_param_tensors(tensors, prefix);
}
@ -454,6 +455,28 @@ namespace HiDreamO1 {
};
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
}
// Generic DiffusionModelRunner entry point for HiDream-O1.
// Validates the mandatory inputs (x, timesteps, and the input_ids /
// input_pos / token_types carried by HiDreamO1DiffusionExtra), substitutes
// empty containers for the optional image_embeds / ref_latents, and forwards
// everything to the model-specific compute overload.
sd::Tensor<float> compute(int n_threads,
                          const DiffusionParams& diffusion_params) override {
    GGML_ASSERT(diffusion_params.x != nullptr);
    GGML_ASSERT(diffusion_params.timesteps != nullptr);

    const auto* extra = diffusion_extra_as<HiDreamO1DiffusionExtra>(diffusion_params);
    GGML_ASSERT(extra != nullptr);
    GGML_ASSERT(extra->input_ids != nullptr);
    GGML_ASSERT(extra->input_pos != nullptr);
    GGML_ASSERT(extra->token_types != nullptr);

    // Shared fallbacks used when the caller leaves the optional lists unset.
    static const std::vector<std::pair<int, sd::Tensor<float>>> no_image_embeds;
    static const std::vector<sd::Tensor<float>> no_ref_images;

    const auto& image_embeds = extra->image_embeds ? *extra->image_embeds : no_image_embeds;
    const auto& ref_images   = diffusion_params.ref_latents ? *diffusion_params.ref_latents : no_ref_images;

    return compute(n_threads,
                   *diffusion_params.x,
                   *diffusion_params.timesteps,
                   *extra->input_ids,
                   *extra->input_pos,
                   *extra->token_types,
                   tensor_or_empty(extra->vinput_mask),
                   image_embeds,
                   ref_images);
}
};
struct HiDreamO1Conditioner : public Conditioner {

View File

@ -5,6 +5,7 @@
#include <vector>
#include "common_block.hpp"
#include "diffusion_model.hpp"
#include "flux.hpp"
#include "qwen_image.hpp"
#include "rope.hpp"
@ -298,7 +299,7 @@ namespace Lens {
}
};
struct LensRunner : public GGMLRunner {
struct LensRunner : public DiffusionModelRunner {
LensParams lens_params;
LensModel lens;
std::vector<float> pe_vec;
@ -307,7 +308,7 @@ namespace Lens {
ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "")
: GGMLRunner(backend, params_backend) {
: DiffusionModelRunner(backend, params_backend, prefix) {
lens_params.num_layers = 0;
for (const auto& [name, tensor_storage] : tensor_storage_map) {
if (!starts_with(name, prefix)) {
@ -361,7 +362,7 @@ namespace Lens {
return "lens";
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
lens.get_param_tensors(tensors, prefix);
}
@ -402,6 +403,16 @@ namespace Lens {
};
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
}
// Generic DiffusionModelRunner entry point for Lens.
// Only x, timesteps and the (optional) context are consumed; no extra
// payload is read from diffusion_params.
sd::Tensor<float> compute(int n_threads,
                          const DiffusionParams& diffusion_params) override {
    GGML_ASSERT(diffusion_params.x != nullptr);
    GGML_ASSERT(diffusion_params.timesteps != nullptr);

    const auto& latent = *diffusion_params.x;
    const auto& steps  = *diffusion_params.timesteps;
    return compute(n_threads, latent, steps, tensor_or_empty(diffusion_params.context));
}
};
} // namespace Lens

View File

@ -10,6 +10,7 @@
#include <vector>
#include "common_block.hpp"
#include "diffusion_model.hpp"
#include "flux.hpp"
#include "rope.hpp"
@ -1534,8 +1535,7 @@ namespace LTXV {
}
};
struct LTXAVRunner : public GGMLRunner {
std::string prefix;
struct LTXAVRunner : public DiffusionModelRunner {
LTXAVParams params;
LTXAVModelBlock model;
std::vector<float> video_pe_vec;
@ -1561,8 +1561,7 @@ namespace LTXV {
ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string& prefix = "model.diffusion_model")
: GGMLRunner(backend, params_backend),
prefix(prefix),
: DiffusionModelRunner(backend, params_backend, prefix),
params(),
model(params) {
auto patchify_proj_iter = tensor_storage_map.find(prefix + ".patchify_proj.weight");
@ -1673,7 +1672,7 @@ namespace LTXV {
return "ltxav";
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) {
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
model.get_param_tensors(tensors, prefix);
}
@ -1917,6 +1916,22 @@ namespace LTXV {
return out;
}
// Generic DiffusionModelRunner entry point for LTX audio/video.
// Unpacks the DiffusionParams bundle and forwards to the LTXAV-specific
// compute overload.
//   - x and timesteps are mandatory (asserted).
//   - An LTXAVDiffusionExtra payload is mandatory: it carries audio_x,
//     audio_timesteps, audio_length, frame_rate and video_positions.
//   - Pointer-valued tensors are routed through tensor_or_empty (defined
//     elsewhere; presumably yields an empty tensor for null -- confirm).
sd::Tensor<float> compute(int n_threads,
                          const DiffusionParams& diffusion_params) override {
    GGML_ASSERT(diffusion_params.x != nullptr);
    GGML_ASSERT(diffusion_params.timesteps != nullptr);
    const auto* extra = diffusion_extra_as<LTXAVDiffusionExtra>(diffusion_params);
    // extra is dereferenced below; abort with a clear message instead of a
    // null dereference if the caller supplied a different extra payload.
    GGML_ASSERT(extra != nullptr);
    return compute(n_threads,
                   *diffusion_params.x,
                   *diffusion_params.timesteps,
                   tensor_or_empty(diffusion_params.context),
                   tensor_or_empty(extra->audio_x),
                   tensor_or_empty(extra->audio_timesteps),
                   extra->audio_length,
                   extra->frame_rate,
                   tensor_or_empty(extra->video_positions));
}
void test(const std::string& x_path,
const std::string& timesteps_path = "",
const std::string& context_path = "",

View File

@ -3,6 +3,7 @@
#include <memory>
#include "diffusion_model.hpp"
#include "ggml_extend.hpp"
#include "model.h"
@ -824,14 +825,14 @@ public:
return x;
}
};
struct MMDiTRunner : public GGMLRunner {
struct MMDiTRunner : public DiffusionModelRunner {
MMDiT mmdit;
MMDiTRunner(ggml_backend_t backend,
ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "")
: GGMLRunner(backend, params_backend), mmdit(tensor_storage_map) {
: DiffusionModelRunner(backend, params_backend, prefix), mmdit(tensor_storage_map) {
mmdit.init(params_ctx, tensor_storage_map, prefix);
}
@ -839,7 +840,7 @@ struct MMDiTRunner : public GGMLRunner {
return "mmdit";
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
mmdit.get_param_tensors(tensors, prefix);
}
@ -885,6 +886,20 @@ struct MMDiTRunner : public GGMLRunner {
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
}
// Generic DiffusionModelRunner entry point for MMDiT.
// Unpacks the DiffusionParams bundle and forwards to the MMDiT-specific
// compute overload.
//   - x and timesteps are mandatory (asserted).
//   - A SkipLayerDiffusionExtra payload is mandatory; its skip_layers pointer
//     may be null, in which case an empty list is used.
//   - context and y are routed through tensor_or_empty (defined elsewhere;
//     presumably yields an empty tensor for null -- confirm).
sd::Tensor<float> compute(int n_threads,
                          const DiffusionParams& diffusion_params) override {
    GGML_ASSERT(diffusion_params.x != nullptr);
    GGML_ASSERT(diffusion_params.timesteps != nullptr);
    const auto* extra = diffusion_extra_as<SkipLayerDiffusionExtra>(diffusion_params);
    // extra is dereferenced below; abort with a clear message instead of a
    // null dereference if the caller supplied a different extra payload.
    GGML_ASSERT(extra != nullptr);
    static const std::vector<int> empty_skip_layers;
    return compute(n_threads,
                   *diffusion_params.x,
                   *diffusion_params.timesteps,
                   tensor_or_empty(diffusion_params.context),
                   tensor_or_empty(diffusion_params.y),
                   extra->skip_layers ? *extra->skip_layers : empty_skip_layers);
}
void test() {
ggml_init_params params;
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB

View File

@ -4,6 +4,7 @@
#include <memory>
#include "common_block.hpp"
#include "diffusion_model.hpp"
#include "flux.hpp"
namespace Qwen {
@ -479,7 +480,7 @@ namespace Qwen {
}
};
struct QwenImageRunner : public GGMLRunner {
struct QwenImageRunner : public DiffusionModelRunner {
public:
QwenImageParams qwen_image_params;
QwenImageModel qwen_image;
@ -493,7 +494,7 @@ namespace Qwen {
const std::string prefix = "",
SDVersion version = VERSION_QWEN_IMAGE,
bool zero_cond_t = false)
: GGMLRunner(backend, params_backend) {
: DiffusionModelRunner(backend, params_backend, prefix) {
qwen_image_params.num_layers = 0;
qwen_image_params.zero_cond_t = zero_cond_t;
for (auto pair : tensor_storage_map) {
@ -528,7 +529,7 @@ namespace Qwen {
return "qwen_image";
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
qwen_image.get_param_tensors(tensors, prefix);
}
@ -624,6 +625,19 @@ namespace Qwen {
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
}
// Generic DiffusionModelRunner entry point for Qwen-Image.
// Consumes x, timesteps, the optional context, the optional reference
// latents and the increase_ref_index flag; no extra payload is read.
sd::Tensor<float> compute(int n_threads,
                          const DiffusionParams& diffusion_params) override {
    GGML_ASSERT(diffusion_params.x != nullptr);
    GGML_ASSERT(diffusion_params.timesteps != nullptr);

    // Shared fallback for callers that provide no reference latents.
    static const std::vector<sd::Tensor<float>> no_ref_latents;
    const auto& ref_latents = diffusion_params.ref_latents ? *diffusion_params.ref_latents : no_ref_latents;

    return compute(n_threads,
                   *diffusion_params.x,
                   *diffusion_params.timesteps,
                   tensor_or_empty(diffusion_params.context),
                   ref_latents,
                   diffusion_params.increase_ref_index);
}
void test() {
ggml_init_params params;
params.mem_size = static_cast<size_t>(1024 * 1024) * 1024; // 1GB

View File

@ -8,22 +8,33 @@
#include "stable-diffusion.h"
#include "util.h"
#include "anima.hpp"
#include "auto_encoder_kl.hpp"
#include "conditioner.hpp"
#include "control.hpp"
#include "denoiser.hpp"
#include "diffusion_model.hpp"
#include "ernie_image.hpp"
#include "esrgan.hpp"
#include "flux.hpp"
#include "guidance.h"
#include "hidream_o1.hpp"
#include "lens.hpp"
#include "lora.hpp"
#include "ltx_audio_vae.h"
#include "ltx_latent_upscaler.hpp"
#include "ltx_vae.hpp"
#include "ltxv.hpp"
#include "mmdit.hpp"
#include "pmid.hpp"
#include "qwen_image.hpp"
#include "sample-cache.h"
#include "tae.hpp"
#include "unet.hpp"
#include "upscaler.h"
#include "vae.hpp"
#include "wan.hpp"
#include "z_image.hpp"
#include "latent-preview.h"
#include "name_conversion.h"
@ -138,8 +149,8 @@ public:
std::shared_ptr<Conditioner> cond_stage_model;
std::shared_ptr<FrozenCLIPVisionEmbedder> clip_vision; // for svd or wan2.1 i2v
std::shared_ptr<DiffusionModel> diffusion_model;
std::shared_ptr<DiffusionModel> high_noise_diffusion_model;
std::shared_ptr<DiffusionModelRunner> diffusion_model;
std::shared_ptr<DiffusionModelRunner> high_noise_diffusion_model;
std::shared_ptr<VAE> first_stage_model;
std::shared_ptr<VAE> preview_vae;
std::shared_ptr<LTXV::LTXAudioVAERunner> audio_vae_model;
@ -486,9 +497,10 @@ public:
cond_stage_model = std::make_shared<SD3CLIPEmbedder>(backend_for(SDBackendModule::TE),
params_backend_for(SDBackendModule::TE),
tensor_storage_map);
diffusion_model = std::make_shared<MMDiTModel>(backend_for(SDBackendModule::DIFFUSION),
diffusion_model = std::make_shared<MMDiTRunner>(backend_for(SDBackendModule::DIFFUSION),
params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map);
tensor_storage_map,
"model.diffusion_model");
} else if (sd_version_is_flux(version)) {
bool is_chroma = false;
for (auto pair : tensor_storage_map) {
@ -524,9 +536,10 @@ public:
params_backend_for(SDBackendModule::TE),
tensor_storage_map);
}
diffusion_model = std::make_shared<FluxModel>(backend_for(SDBackendModule::DIFFUSION),
diffusion_model = std::make_shared<Flux::FluxRunner>(backend_for(SDBackendModule::DIFFUSION),
params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map,
"model.diffusion_model",
version,
sd_ctx_params->chroma_use_dit_mask);
} else if (sd_version_is_flux2(version)) {
@ -535,16 +548,17 @@ public:
params_backend_for(SDBackendModule::TE),
tensor_storage_map,
version);
diffusion_model = std::make_shared<FluxModel>(backend_for(SDBackendModule::DIFFUSION),
diffusion_model = std::make_shared<Flux::FluxRunner>(backend_for(SDBackendModule::DIFFUSION),
params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map,
"model.diffusion_model",
version,
sd_ctx_params->chroma_use_dit_mask);
} else if (sd_version_is_ltxav(version)) {
cond_stage_model = std::make_shared<LTXAVEmbedder>(backend_for(SDBackendModule::TE),
params_backend_for(SDBackendModule::TE),
tensor_storage_map);
diffusion_model = std::make_shared<LTXAVModel>(backend_for(SDBackendModule::DIFFUSION),
diffusion_model = std::make_shared<LTXV::LTXAVRunner>(backend_for(SDBackendModule::DIFFUSION),
params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map,
"model.diffusion_model");
@ -555,13 +569,13 @@ public:
true,
0,
true);
diffusion_model = std::make_shared<WanModel>(backend_for(SDBackendModule::DIFFUSION),
diffusion_model = std::make_shared<WAN::WanRunner>(backend_for(SDBackendModule::DIFFUSION),
params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map,
"model.diffusion_model",
version);
if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) {
high_noise_diffusion_model = std::make_shared<WanModel>(backend_for(SDBackendModule::DIFFUSION),
high_noise_diffusion_model = std::make_shared<WAN::WanRunner>(backend_for(SDBackendModule::DIFFUSION),
params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map,
"model.high_noise_diffusion_model",
@ -590,7 +604,7 @@ public:
version,
"",
enable_vision);
diffusion_model = std::make_shared<QwenImageModel>(backend_for(SDBackendModule::DIFFUSION),
diffusion_model = std::make_shared<Qwen::QwenImageRunner>(backend_for(SDBackendModule::DIFFUSION),
params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map,
"model.diffusion_model",
@ -607,16 +621,17 @@ public:
version,
"",
enable_vision);
diffusion_model = std::make_shared<FluxModel>(backend_for(SDBackendModule::DIFFUSION),
diffusion_model = std::make_shared<Flux::FluxRunner>(backend_for(SDBackendModule::DIFFUSION),
params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map,
"model.diffusion_model",
version,
sd_ctx_params->chroma_use_dit_mask);
} else if (version == VERSION_HIDREAM_O1) {
cond_stage_model = std::make_shared<HiDreamO1::HiDreamO1Conditioner>(backend_for(SDBackendModule::TE),
params_backend_for(SDBackendModule::TE),
tensor_storage_map);
diffusion_model = std::make_shared<HiDreamO1Model>(backend_for(SDBackendModule::DIFFUSION),
diffusion_model = std::make_shared<HiDreamO1::HiDreamO1Runner>(backend_for(SDBackendModule::DIFFUSION),
params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map,
"model");
@ -624,7 +639,7 @@ public:
cond_stage_model = std::make_shared<AnimaConditioner>(backend_for(SDBackendModule::TE),
params_backend_for(SDBackendModule::TE),
tensor_storage_map);
diffusion_model = std::make_shared<AnimaModel>(backend_for(SDBackendModule::DIFFUSION),
diffusion_model = std::make_shared<Anima::AnimaRunner>(backend_for(SDBackendModule::DIFFUSION),
params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map,
"model.diffusion_model");
@ -633,7 +648,7 @@ public:
params_backend_for(SDBackendModule::TE),
tensor_storage_map,
version);
diffusion_model = std::make_shared<ZImageModel>(backend_for(SDBackendModule::DIFFUSION),
diffusion_model = std::make_shared<ZImage::ZImageRunner>(backend_for(SDBackendModule::DIFFUSION),
params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map,
"model.diffusion_model",
@ -643,7 +658,7 @@ public:
params_backend_for(SDBackendModule::TE),
tensor_storage_map,
version);
diffusion_model = std::make_shared<ErnieImageModel>(backend_for(SDBackendModule::DIFFUSION),
diffusion_model = std::make_shared<ErnieImage::ErnieImageRunner>(backend_for(SDBackendModule::DIFFUSION),
params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map,
"model.diffusion_model");
@ -652,7 +667,7 @@ public:
params_backend_for(SDBackendModule::TE),
tensor_storage_map,
version);
diffusion_model = std::make_shared<LensModel>(backend_for(SDBackendModule::DIFFUSION),
diffusion_model = std::make_shared<Lens::LensRunner>(backend_for(SDBackendModule::DIFFUSION),
params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map,
"model.diffusion_model");
@ -675,13 +690,14 @@ public:
embbeding_map,
version);
}
diffusion_model = std::make_shared<UNetModel>(backend_for(SDBackendModule::DIFFUSION),
diffusion_model = std::make_shared<UNetModelRunner>(backend_for(SDBackendModule::DIFFUSION),
params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map,
"model.diffusion_model",
version);
if (sd_ctx_params->diffusion_conv_direct) {
LOG_INFO("Using Conv2d direct in the diffusion model");
std::dynamic_pointer_cast<UNetModel>(diffusion_model)->unet.set_conv2d_direct_enabled(true);
diffusion_model->set_conv2d_direct_enabled(true);
}
}
@ -1221,6 +1237,7 @@ public:
diffusion_params.x = &x_t;
diffusion_params.timesteps = &steps;
diffusion_params.context = &c;
diffusion_params.extra = UNetDiffusionExtra{};
if (!concat.empty()) {
diffusion_params.c_concat = &concat;
}
@ -1855,7 +1872,7 @@ public:
*controls = std::move(*control_result);
}
sd::Tensor<float> sample(const std::shared_ptr<DiffusionModel>& work_diffusion_model,
sd::Tensor<float> sample(const std::shared_ptr<DiffusionModelRunner>& work_diffusion_model,
bool inverse_noise_scaling,
const sd::Tensor<float>& init_latent,
sd::Tensor<float> noise,
@ -1982,18 +1999,7 @@ public:
DiffusionParams diffusion_params;
diffusion_params.x = &noised_input;
diffusion_params.timesteps = &timesteps_tensor;
diffusion_params.audio_timesteps = audio_timesteps_tensor.empty() ? nullptr : &audio_timesteps_tensor;
diffusion_params.guidance = &guidance_tensor;
diffusion_params.ref_latents = &ref_latents;
diffusion_params.increase_ref_index = increase_ref_index;
diffusion_params.controls = &controls;
diffusion_params.control_strength = control_strength;
diffusion_params.vace_context = vace_context.empty() ? nullptr : &vace_context;
diffusion_params.vace_strength = vace_strength;
diffusion_params.audio_length = audio_length;
diffusion_params.frame_rate = frame_rate;
diffusion_params.video_positions = video_positions.empty() ? nullptr : &video_positions;
diffusion_params.skip_layers = nullptr;
compute_sample_controls(control_image,
noised_input,
@ -2007,15 +2013,38 @@ public:
diffusion_params.context = condition.c_crossattn.empty() ? nullptr : &condition.c_crossattn;
diffusion_params.c_concat = c_concat_override != nullptr ? c_concat_override : (condition.c_concat.empty() ? nullptr : &condition.c_concat);
diffusion_params.y = condition.c_vector.empty() ? nullptr : &condition.c_vector;
diffusion_params.t5_ids = condition.c_t5_ids.empty() ? nullptr : &condition.c_t5_ids;
diffusion_params.t5_weights = condition.c_t5_weights.empty() ? nullptr : &condition.c_t5_weights;
diffusion_params.input_ids = condition.c_input_ids.empty() ? nullptr : &condition.c_input_ids;
diffusion_params.input_pos = condition.c_position_ids.empty() ? nullptr : &condition.c_position_ids;
diffusion_params.token_types = condition.c_token_types.empty() ? nullptr : &condition.c_token_types;
diffusion_params.vinput_mask = condition.c_vinput_mask.empty() ? nullptr : &condition.c_vinput_mask;
diffusion_params.image_embeds = condition.c_image_embeds.empty() ? nullptr : &condition.c_image_embeds;
diffusion_params.ref_latents = condition.c_ref_images.empty() ? &ref_latents : &condition.c_ref_images;
diffusion_params.skip_layers = local_skip_layers;
if (sd_version_is_unet(version)) {
diffusion_params.extra = UNetDiffusionExtra{-1, &controls, control_strength};
} else if (sd_version_is_sd3(version)) {
diffusion_params.extra = SkipLayerDiffusionExtra{local_skip_layers};
} else if (sd_version_is_flux(version) || sd_version_is_flux2(version) || sd_version_is_longcat(version)) {
diffusion_params.extra = FluxDiffusionExtra{&guidance_tensor,
local_skip_layers};
} else if (sd_version_is_anima(version)) {
diffusion_params.extra = AnimaDiffusionExtra{condition.c_t5_ids.empty() ? nullptr : &condition.c_t5_ids,
condition.c_t5_weights.empty() ? nullptr : &condition.c_t5_weights};
} else if (sd_version_is_wan(version)) {
diffusion_params.extra = WanDiffusionExtra{vace_context.empty() ? nullptr : &vace_context,
vace_strength};
} else if (version == VERSION_HIDREAM_O1) {
diffusion_params.extra = HiDreamO1DiffusionExtra{
condition.c_input_ids.empty() ? nullptr : &condition.c_input_ids,
condition.c_position_ids.empty() ? nullptr : &condition.c_position_ids,
condition.c_token_types.empty() ? nullptr : &condition.c_token_types,
condition.c_vinput_mask.empty() ? nullptr : &condition.c_vinput_mask,
condition.c_image_embeds.empty() ? nullptr : &condition.c_image_embeds};
} else if (sd_version_is_ltxav(version)) {
diffusion_params.extra = LTXAVDiffusionExtra{
nullptr,
audio_timesteps_tensor.empty() ? nullptr : &audio_timesteps_tensor,
audio_length,
frame_rate,
video_positions.empty() ? nullptr : &video_positions};
} else {
diffusion_params.extra = std::monostate{};
}
sd::Tensor<float> cached_output;
if (step_cache.before_condition(&condition, noised_input, &cached_output)) {
@ -3914,7 +3943,7 @@ static std::optional<ImageGenerationLatents> prepare_image_generation_latents(sd
concat_latent = sd::ops::interpolate<float>(ref_latents[0], init_latent.shape());
uncond_concat_latent = sd::Tensor<float>::zeros_like(concat_latent);
}
if (sd_version_is_control(sd_ctx->sd->version)) {
if (sd_ctx->sd->version == VERSION_FLUX_CONTROLS) {
if (!control_latent.empty()) {
concat_latent = control_latent;
} else {
@ -3958,7 +3987,6 @@ static std::optional<ImageGenerationEmbeds> prepare_image_generation_embeds(sd_c
condition_params.width = request->width;
condition_params.height = request->height;
condition_params.ref_images = &latents->ref_images;
condition_params.adm_in_channels = static_cast<int>(sd_ctx->sd->diffusion_model->get_adm_in_channels());
auto id_cond = sd_ctx->sd->get_pmid_conditon(request->pm_params, condition_params);
int64_t prepare_start_ms = ggml_time_ms();

View File

@ -2,6 +2,7 @@
#define __UNET_HPP__
#include "common_block.hpp"
#include "diffusion_model.hpp"
#include "model.h"
/*==================================================== UnetModel =====================================================*/
@ -599,7 +600,7 @@ public:
}
};
struct UNetModelRunner : public GGMLRunner {
struct UNetModelRunner : public DiffusionModelRunner {
UnetModelBlock unet;
UNetModelRunner(ggml_backend_t backend,
@ -607,7 +608,7 @@ struct UNetModelRunner : public GGMLRunner {
const String2TensorStorage& tensor_storage_map,
const std::string prefix,
SDVersion version = VERSION_SD1)
: GGMLRunner(backend, params_backend), unet(version, tensor_storage_map) {
: DiffusionModelRunner(backend, params_backend, prefix), unet(version, tensor_storage_map) {
unet.init(params_ctx, tensor_storage_map, prefix);
}
@ -615,7 +616,7 @@ struct UNetModelRunner : public GGMLRunner {
return "unet";
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
unet.get_param_tensors(tensors, prefix);
}
@ -682,6 +683,23 @@ struct UNetModelRunner : public GGMLRunner {
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
}
// Generic DiffusionModelRunner entry point for the UNet.
// Unpacks the DiffusionParams bundle and forwards to the UNet-specific
// compute overload.
//   - x and timesteps are mandatory (asserted).
//   - A UNetDiffusionExtra payload is mandatory: it carries num_video_frames,
//     controls and control_strength. A null controls pointer is replaced by
//     an empty list.
//   - context, c_concat and y are routed through tensor_or_empty (defined
//     elsewhere; presumably yields an empty tensor for null -- confirm).
sd::Tensor<float> compute(int n_threads,
                          const DiffusionParams& diffusion_params) override {
    GGML_ASSERT(diffusion_params.x != nullptr);
    GGML_ASSERT(diffusion_params.timesteps != nullptr);
    const auto* extra = diffusion_extra_as<UNetDiffusionExtra>(diffusion_params);
    // extra is dereferenced below; abort with a clear message instead of a
    // null dereference if the caller supplied a different extra payload.
    GGML_ASSERT(extra != nullptr);
    static const std::vector<sd::Tensor<float>> empty_controls;
    return compute(n_threads,
                   *diffusion_params.x,
                   *diffusion_params.timesteps,
                   tensor_or_empty(diffusion_params.context),
                   tensor_or_empty(diffusion_params.c_concat),
                   tensor_or_empty(diffusion_params.y),
                   extra->num_video_frames,
                   extra->controls ? *extra->controls : empty_controls,
                   extra->control_strength);
}
void test() {
ggml_init_params params;
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB

View File

@ -6,6 +6,7 @@
#include <utility>
#include "common_block.hpp"
#include "diffusion_model.hpp"
#include "flux.hpp"
#include "rope.hpp"
#include "vae.hpp"
@ -2085,7 +2086,7 @@ namespace WAN {
}
};
struct WanRunner : public GGMLRunner {
struct WanRunner : public DiffusionModelRunner {
public:
std::string desc = "wan";
WanParams wan_params;
@ -2098,7 +2099,7 @@ namespace WAN {
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "",
SDVersion version = VERSION_WAN2)
: GGMLRunner(backend, params_backend) {
: DiffusionModelRunner(backend, params_backend, prefix) {
wan_params.num_layers = 0;
for (auto pair : tensor_storage_map) {
std::string tensor_name = pair.first;
@ -2208,7 +2209,7 @@ namespace WAN {
return desc;
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
wan.get_param_tensors(tensors, prefix);
}
@ -2284,6 +2285,22 @@ namespace WAN {
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
}
// Generic DiffusionModelRunner entry point for Wan.
// Unpacks the DiffusionParams bundle and forwards to the Wan-specific
// compute overload.
//   - x and timesteps are mandatory (asserted).
//   - A WanDiffusionExtra payload is mandatory: it carries vace_context and
//     vace_strength.
//   - Pointer-valued tensors are routed through tensor_or_empty (defined
//     elsewhere; presumably yields an empty tensor for null -- confirm).
sd::Tensor<float> compute(int n_threads,
                          const DiffusionParams& diffusion_params) override {
    GGML_ASSERT(diffusion_params.x != nullptr);
    GGML_ASSERT(diffusion_params.timesteps != nullptr);
    const auto* extra = diffusion_extra_as<WanDiffusionExtra>(diffusion_params);
    // extra is dereferenced below; abort with a clear message instead of a
    // null dereference if the caller supplied a different extra payload.
    GGML_ASSERT(extra != nullptr);
    // NOTE(review): y is passed before c_concat here, unlike the UNet/Flux
    // runners, and the following slot is always an empty tensor; presumably
    // this matches the Wan overload's parameter order -- confirm against it.
    return compute(n_threads,
                   *diffusion_params.x,
                   *diffusion_params.timesteps,
                   tensor_or_empty(diffusion_params.context),
                   tensor_or_empty(diffusion_params.y),
                   tensor_or_empty(diffusion_params.c_concat),
                   sd::Tensor<float>(),
                   tensor_or_empty(extra->vace_context),
                   extra->vace_strength);
}
void test() {
ggml_init_params params;
params.mem_size = static_cast<size_t>(200 * 1024 * 1024); // 200 MB

View File

@ -3,6 +3,7 @@
#include <algorithm>
#include "diffusion_model.hpp"
#include "flux.hpp"
#include "ggml_extend.hpp"
#include "mmdit.hpp"
@ -464,7 +465,7 @@ namespace ZImage {
}
};
struct ZImageRunner : public GGMLRunner {
struct ZImageRunner : public DiffusionModelRunner {
public:
ZImageParams z_image_params;
ZImageModel z_image;
@ -477,7 +478,7 @@ namespace ZImage {
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "",
SDVersion version = VERSION_Z_IMAGE)
: GGMLRunner(backend, params_backend) {
: DiffusionModelRunner(backend, params_backend, prefix) {
z_image = ZImageModel(z_image_params);
z_image.init(params_ctx, tensor_storage_map, prefix);
}
@ -486,7 +487,7 @@ namespace ZImage {
return "z_image";
}
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
z_image.get_param_tensors(tensors, prefix);
}
@ -556,6 +557,19 @@ namespace ZImage {
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
}
// Generic DiffusionModelRunner entry point for Z-Image.
// Consumes x, timesteps, the optional context, the optional reference
// latents and the increase_ref_index flag; no extra payload is read.
sd::Tensor<float> compute(int n_threads,
                          const DiffusionParams& diffusion_params) override {
    GGML_ASSERT(diffusion_params.x != nullptr);
    GGML_ASSERT(diffusion_params.timesteps != nullptr);

    // Shared fallback for callers that provide no reference latents.
    static const std::vector<sd::Tensor<float>> no_ref_latents;
    const auto& ref_latents = diffusion_params.ref_latents ? *diffusion_params.ref_latents : no_ref_latents;

    return compute(n_threads,
                   *diffusion_params.x,
                   *diffusion_params.timesteps,
                   tensor_or_empty(diffusion_params.context),
                   ref_latents,
                   diffusion_params.increase_ref_index);
}
void test() {
ggml_init_params params;
params.mem_size = static_cast<size_t>(1024 * 1024) * 1024; // 1GB