mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2026-06-09 15:56:39 +00:00
refactor: simplify diffusion model runner params (#1569)
This commit is contained in:
parent
8eded497e5
commit
55c2aed52c
@ -7,6 +7,7 @@
|
||||
#include <vector>
|
||||
|
||||
#include "common_block.hpp"
|
||||
#include "diffusion_model.hpp"
|
||||
#include "flux.hpp"
|
||||
#include "rope.hpp"
|
||||
|
||||
@ -518,7 +519,7 @@ namespace Anima {
|
||||
}
|
||||
};
|
||||
|
||||
struct AnimaRunner : public GGMLRunner {
|
||||
struct AnimaRunner : public DiffusionModelRunner {
|
||||
public:
|
||||
std::vector<float> image_pe_vec;
|
||||
std::vector<float> adapter_q_pe_vec;
|
||||
@ -529,7 +530,7 @@ namespace Anima {
|
||||
ggml_backend_t params_backend,
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string prefix = "model.diffusion_model")
|
||||
: GGMLRunner(backend, params_backend) {
|
||||
: DiffusionModelRunner(backend, params_backend, prefix) {
|
||||
int64_t num_layers = 0;
|
||||
std::string layer_tag = prefix + ".net.blocks.";
|
||||
for (const auto& kv : tensor_storage_map) {
|
||||
@ -559,7 +560,7 @@ namespace Anima {
|
||||
return "anima";
|
||||
}
|
||||
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
|
||||
net.get_param_tensors(tensors, prefix + ".net");
|
||||
}
|
||||
|
||||
@ -684,6 +685,19 @@ namespace Anima {
|
||||
};
|
||||
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
|
||||
}
|
||||
|
||||
// Generic entry point: unpacks DiffusionParams and forwards to the tensor-based
// compute overload of this runner.
//
// - `x` and `timesteps` are mandatory and asserted non-null.
// - `extra` must hold an AnimaDiffusionExtra; diffusion_extra_as asserts otherwise.
// - `context`, `t5_ids` and `t5_weights` are optional; a null pointer is replaced
//   by an empty tensor via tensor_or_empty.
sd::Tensor<float> compute(int n_threads,
                          const DiffusionParams& diffusion_params) override {
    GGML_ASSERT(diffusion_params.x != nullptr);
    GGML_ASSERT(diffusion_params.timesteps != nullptr);

    const AnimaDiffusionExtra* anima_extra = diffusion_extra_as<AnimaDiffusionExtra>(diffusion_params);

    const auto& latent       = *diffusion_params.x;
    const auto& steps        = *diffusion_params.timesteps;
    const auto& text_context = tensor_or_empty(diffusion_params.context);
    const auto& ids          = tensor_or_empty(anima_extra->t5_ids);
    const auto& weights      = tensor_or_empty(anima_extra->t5_weights);

    return compute(n_threads, latent, steps, text_context, ids, weights);
}
|
||||
};
|
||||
} // namespace Anima
|
||||
|
||||
|
||||
@ -102,7 +102,6 @@ struct ConditionerParams {
|
||||
int clip_skip = -1;
|
||||
int width = -1;
|
||||
int height = -1;
|
||||
int adm_in_channels = -1;
|
||||
bool zero_out_masked = false;
|
||||
int num_input_imgs = 0; // for photomaker
|
||||
const std::vector<sd::Tensor<float>>* ref_images = nullptr; // for qwen image edit
|
||||
@ -502,7 +501,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
|
||||
int clip_skip,
|
||||
int width,
|
||||
int height,
|
||||
int adm_in_channels = -1,
|
||||
bool zero_out_masked = false) {
|
||||
int64_t t0 = ggml_time_ms();
|
||||
sd::Tensor<float> hidden_states; // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2]
|
||||
@ -588,7 +586,8 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
|
||||
|
||||
sd::Tensor<float> vec;
|
||||
if (sd_version_is_sdxl(version)) {
|
||||
int out_dim = 256;
|
||||
int out_dim = 256;
|
||||
int adm_in_channels = 2816;
|
||||
GGML_ASSERT(!pooled.empty());
|
||||
vec = sd::Tensor<float>({adm_in_channels});
|
||||
vec.fill_(0.0f);
|
||||
@ -647,7 +646,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
|
||||
conditioner_params.clip_skip,
|
||||
conditioner_params.width,
|
||||
conditioner_params.height,
|
||||
conditioner_params.adm_in_channels,
|
||||
conditioner_params.zero_out_masked);
|
||||
return std::make_tuple(cond, clsm);
|
||||
}
|
||||
@ -674,7 +672,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
|
||||
conditioner_params.clip_skip,
|
||||
conditioner_params.width,
|
||||
conditioner_params.height,
|
||||
conditioner_params.adm_in_channels,
|
||||
conditioner_params.zero_out_masked);
|
||||
}
|
||||
};
|
||||
|
||||
@ -1,842 +1,107 @@
|
||||
#ifndef __DIFFUSION_MODEL_H__
|
||||
#define __DIFFUSION_MODEL_H__
|
||||
|
||||
#include <optional>
|
||||
#include "anima.hpp"
|
||||
#include "ernie_image.hpp"
|
||||
#include "flux.hpp"
|
||||
#include "hidream_o1.hpp"
|
||||
#include "lens.hpp"
|
||||
#include "ltxv.hpp"
|
||||
#include "mmdit.hpp"
|
||||
#include "qwen_image.hpp"
|
||||
#include "tensor_ggml.hpp"
|
||||
#include "unet.hpp"
|
||||
#include "wan.hpp"
|
||||
#include "z_image.hpp"
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <variant>
|
||||
|
||||
struct DiffusionParams {
|
||||
const sd::Tensor<float>* x = nullptr;
|
||||
const sd::Tensor<float>* timesteps = nullptr;
|
||||
const sd::Tensor<float>* audio_x = nullptr;
|
||||
const sd::Tensor<float>* audio_timesteps = nullptr;
|
||||
const sd::Tensor<float>* context = nullptr;
|
||||
const sd::Tensor<float>* c_concat = nullptr;
|
||||
const sd::Tensor<float>* y = nullptr;
|
||||
const sd::Tensor<int32_t>* t5_ids = nullptr;
|
||||
const sd::Tensor<float>* t5_weights = nullptr;
|
||||
const sd::Tensor<float>* guidance = nullptr;
|
||||
const std::vector<sd::Tensor<float>>* ref_latents = nullptr;
|
||||
#include "ggml_extend.hpp"
|
||||
#include "tensor_ggml.hpp"
|
||||
|
||||
struct UNetDiffusionExtra {
|
||||
int num_video_frames = -1;
|
||||
const std::vector<sd::Tensor<float>>* controls = nullptr;
|
||||
float control_strength = 0.f;
|
||||
};
|
||||
|
||||
// Model-specific inputs stored in DiffusionParams::extra for runners that only
// need a skip-layer list.
// NOTE(review): the pre-refactor MMDiTModel::compute passed `skip_layers` this way
// (null became an empty vector) — presumably this is the MMDiT payload; confirm.
struct SkipLayerDiffusionExtra {
    // Optional, borrowed list of layer indices; nullptr when absent.
    const std::vector<int>* skip_layers{nullptr};
};
|
||||
|
||||
struct FluxDiffusionExtra {
|
||||
const sd::Tensor<float>* guidance = nullptr;
|
||||
const std::vector<int>* skip_layers = nullptr;
|
||||
};
|
||||
|
||||
struct AnimaDiffusionExtra {
|
||||
const sd::Tensor<int32_t>* t5_ids = nullptr;
|
||||
const sd::Tensor<float>* t5_weights = nullptr;
|
||||
};
|
||||
|
||||
struct WanDiffusionExtra {
|
||||
const sd::Tensor<float>* vace_context = nullptr;
|
||||
float vace_strength = 1.f;
|
||||
};
|
||||
|
||||
struct HiDreamO1DiffusionExtra {
|
||||
const sd::Tensor<int32_t>* input_ids = nullptr;
|
||||
const sd::Tensor<int32_t>* input_pos = nullptr;
|
||||
const sd::Tensor<int32_t>* token_types = nullptr;
|
||||
const sd::Tensor<int32_t>* vinput_mask = nullptr;
|
||||
const std::vector<sd::Tensor<float>>* vlm_images = nullptr;
|
||||
const std::vector<std::pair<int, sd::Tensor<float>>>* image_embeds = nullptr;
|
||||
bool increase_ref_index = false;
|
||||
int num_video_frames = -1;
|
||||
const std::vector<sd::Tensor<float>>* controls = nullptr;
|
||||
float control_strength = 0.f;
|
||||
const sd::Tensor<float>* vace_context = nullptr;
|
||||
float vace_strength = 1.f;
|
||||
int audio_length = 0;
|
||||
float frame_rate = 24.f;
|
||||
const sd::Tensor<float>* video_positions = nullptr;
|
||||
const std::vector<int>* skip_layers = nullptr;
|
||||
};
|
||||
|
||||
struct LTXAVDiffusionExtra {
|
||||
const sd::Tensor<float>* audio_x = nullptr;
|
||||
const sd::Tensor<float>* audio_timesteps = nullptr;
|
||||
int audio_length = 0;
|
||||
float frame_rate = 24.f;
|
||||
const sd::Tensor<float>* video_positions = nullptr;
|
||||
};
|
||||
|
||||
using DiffusionExtraParams = std::variant<std::monostate,
|
||||
UNetDiffusionExtra,
|
||||
SkipLayerDiffusionExtra,
|
||||
FluxDiffusionExtra,
|
||||
AnimaDiffusionExtra,
|
||||
WanDiffusionExtra,
|
||||
HiDreamO1DiffusionExtra,
|
||||
LTXAVDiffusionExtra>;
|
||||
|
||||
struct DiffusionParams {
|
||||
const sd::Tensor<float>* x = nullptr;
|
||||
const sd::Tensor<float>* timesteps = nullptr;
|
||||
const sd::Tensor<float>* context = nullptr;
|
||||
const sd::Tensor<float>* c_concat = nullptr;
|
||||
const sd::Tensor<float>* y = nullptr;
|
||||
const std::vector<sd::Tensor<float>>* ref_latents = nullptr;
|
||||
bool increase_ref_index = false;
|
||||
DiffusionExtraParams extra = std::monostate{};
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
static inline const T* diffusion_extra_as(const DiffusionParams& params) {
|
||||
const auto* extra = std::get_if<T>(¶ms.extra);
|
||||
GGML_ASSERT(extra != nullptr);
|
||||
return extra;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static inline const sd::Tensor<T>& tensor_or_empty(const sd::Tensor<T>* tensor) {
|
||||
static const sd::Tensor<T> kEmpty;
|
||||
return tensor != nullptr ? *tensor : kEmpty;
|
||||
}
|
||||
|
||||
struct DiffusionModel {
|
||||
virtual std::string get_desc() = 0;
|
||||
struct DiffusionModelRunner : public GGMLRunner {
|
||||
protected:
|
||||
std::string prefix;
|
||||
|
||||
public:
|
||||
DiffusionModelRunner(ggml_backend_t backend,
|
||||
ggml_backend_t params_backend,
|
||||
const std::string& prefix)
|
||||
: GGMLRunner(backend, params_backend),
|
||||
prefix(prefix) {}
|
||||
|
||||
virtual sd::Tensor<float> compute(int n_threads,
|
||||
const DiffusionParams& diffusion_params) = 0;
|
||||
virtual void alloc_params_buffer() = 0;
|
||||
virtual void free_params_buffer() = 0;
|
||||
virtual void free_compute_buffer() = 0;
|
||||
virtual void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) = 0;
|
||||
virtual size_t get_params_buffer_size() = 0;
|
||||
virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter){};
|
||||
virtual int64_t get_adm_in_channels() = 0;
|
||||
virtual void set_flash_attention_enabled(bool enabled) = 0;
|
||||
virtual void set_max_graph_vram_bytes(size_t max_vram_bytes) = 0;
|
||||
virtual void set_circular_axes(bool circular_x, bool circular_y) = 0;
|
||||
};
|
||||
const DiffusionParams& diffusion_params) = 0;
|
||||
|
||||
struct UNetModel : public DiffusionModel {
|
||||
UNetModelRunner unet;
|
||||
|
||||
UNetModel(ggml_backend_t backend,
|
||||
ggml_backend_t params_backend,
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
SDVersion version = VERSION_SD1)
|
||||
: unet(backend, params_backend, tensor_storage_map, "model.diffusion_model", version) {
|
||||
}
|
||||
|
||||
std::string get_desc() override {
|
||||
return unet.get_desc();
|
||||
}
|
||||
|
||||
void alloc_params_buffer() override {
|
||||
unet.alloc_params_buffer();
|
||||
}
|
||||
|
||||
void free_params_buffer() override {
|
||||
unet.free_params_buffer();
|
||||
}
|
||||
|
||||
void free_compute_buffer() override {
|
||||
unet.free_compute_buffer();
|
||||
}
|
||||
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
|
||||
unet.get_param_tensors(tensors, "model.diffusion_model");
|
||||
}
|
||||
|
||||
size_t get_params_buffer_size() override {
|
||||
return unet.get_params_buffer_size();
|
||||
}
|
||||
|
||||
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
|
||||
unet.set_weight_adapter(adapter);
|
||||
}
|
||||
|
||||
int64_t get_adm_in_channels() override {
|
||||
return unet.unet.adm_in_channels;
|
||||
}
|
||||
|
||||
void set_flash_attention_enabled(bool enabled) {
|
||||
unet.set_flash_attention_enabled(enabled);
|
||||
}
|
||||
|
||||
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
|
||||
unet.set_max_graph_vram_bytes(max_vram_bytes);
|
||||
}
|
||||
|
||||
void set_circular_axes(bool circular_x, bool circular_y) override {
|
||||
unet.set_circular_axes(circular_x, circular_y);
|
||||
}
|
||||
|
||||
sd::Tensor<float> compute(int n_threads,
|
||||
const DiffusionParams& diffusion_params) override {
|
||||
GGML_ASSERT(diffusion_params.x != nullptr);
|
||||
GGML_ASSERT(diffusion_params.timesteps != nullptr);
|
||||
static const std::vector<sd::Tensor<float>> empty_controls;
|
||||
return unet.compute(n_threads,
|
||||
*diffusion_params.x,
|
||||
*diffusion_params.timesteps,
|
||||
tensor_or_empty(diffusion_params.context),
|
||||
tensor_or_empty(diffusion_params.c_concat),
|
||||
tensor_or_empty(diffusion_params.y),
|
||||
diffusion_params.num_video_frames,
|
||||
diffusion_params.controls ? *diffusion_params.controls : empty_controls,
|
||||
diffusion_params.control_strength);
|
||||
}
|
||||
};
|
||||
|
||||
struct MMDiTModel : public DiffusionModel {
|
||||
MMDiTRunner mmdit;
|
||||
|
||||
MMDiTModel(ggml_backend_t backend,
|
||||
ggml_backend_t params_backend,
|
||||
const String2TensorStorage& tensor_storage_map = {})
|
||||
: mmdit(backend, params_backend, tensor_storage_map, "model.diffusion_model") {
|
||||
}
|
||||
|
||||
std::string get_desc() override {
|
||||
return mmdit.get_desc();
|
||||
}
|
||||
|
||||
void alloc_params_buffer() override {
|
||||
mmdit.alloc_params_buffer();
|
||||
}
|
||||
|
||||
void free_params_buffer() override {
|
||||
mmdit.free_params_buffer();
|
||||
}
|
||||
|
||||
void free_compute_buffer() override {
|
||||
mmdit.free_compute_buffer();
|
||||
}
|
||||
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
|
||||
mmdit.get_param_tensors(tensors, "model.diffusion_model");
|
||||
}
|
||||
|
||||
size_t get_params_buffer_size() override {
|
||||
return mmdit.get_params_buffer_size();
|
||||
}
|
||||
|
||||
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
|
||||
mmdit.set_weight_adapter(adapter);
|
||||
}
|
||||
|
||||
int64_t get_adm_in_channels() override {
|
||||
return 768 + 1280;
|
||||
}
|
||||
|
||||
void set_flash_attention_enabled(bool enabled) {
|
||||
mmdit.set_flash_attention_enabled(enabled);
|
||||
}
|
||||
|
||||
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
|
||||
mmdit.set_max_graph_vram_bytes(max_vram_bytes);
|
||||
}
|
||||
|
||||
void set_circular_axes(bool circular_x, bool circular_y) override {
|
||||
mmdit.set_circular_axes(circular_x, circular_y);
|
||||
}
|
||||
|
||||
sd::Tensor<float> compute(int n_threads,
|
||||
const DiffusionParams& diffusion_params) override {
|
||||
GGML_ASSERT(diffusion_params.x != nullptr);
|
||||
GGML_ASSERT(diffusion_params.timesteps != nullptr);
|
||||
static const std::vector<int> empty_skip_layers;
|
||||
return mmdit.compute(n_threads,
|
||||
*diffusion_params.x,
|
||||
*diffusion_params.timesteps,
|
||||
tensor_or_empty(diffusion_params.context),
|
||||
tensor_or_empty(diffusion_params.y),
|
||||
diffusion_params.skip_layers ? *diffusion_params.skip_layers : empty_skip_layers);
|
||||
}
|
||||
};
|
||||
|
||||
struct FluxModel : public DiffusionModel {
|
||||
Flux::FluxRunner flux;
|
||||
|
||||
FluxModel(ggml_backend_t backend,
|
||||
ggml_backend_t params_backend,
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
SDVersion version = VERSION_FLUX,
|
||||
bool use_mask = false)
|
||||
: flux(backend, params_backend, tensor_storage_map, "model.diffusion_model", version, use_mask) {
|
||||
}
|
||||
|
||||
std::string get_desc() override {
|
||||
return flux.get_desc();
|
||||
}
|
||||
|
||||
void alloc_params_buffer() override {
|
||||
flux.alloc_params_buffer();
|
||||
}
|
||||
|
||||
void free_params_buffer() override {
|
||||
flux.free_params_buffer();
|
||||
}
|
||||
|
||||
void free_compute_buffer() override {
|
||||
flux.free_compute_buffer();
|
||||
}
|
||||
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
|
||||
flux.get_param_tensors(tensors, "model.diffusion_model");
|
||||
}
|
||||
|
||||
size_t get_params_buffer_size() override {
|
||||
return flux.get_params_buffer_size();
|
||||
}
|
||||
|
||||
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
|
||||
flux.set_weight_adapter(adapter);
|
||||
}
|
||||
|
||||
int64_t get_adm_in_channels() override {
|
||||
return 768;
|
||||
}
|
||||
|
||||
void set_flash_attention_enabled(bool enabled) {
|
||||
flux.set_flash_attention_enabled(enabled);
|
||||
}
|
||||
|
||||
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
|
||||
flux.set_max_graph_vram_bytes(max_vram_bytes);
|
||||
}
|
||||
|
||||
void set_circular_axes(bool circular_x, bool circular_y) override {
|
||||
flux.set_circular_axes(circular_x, circular_y);
|
||||
}
|
||||
|
||||
sd::Tensor<float> compute(int n_threads,
|
||||
const DiffusionParams& diffusion_params) override {
|
||||
GGML_ASSERT(diffusion_params.x != nullptr);
|
||||
GGML_ASSERT(diffusion_params.timesteps != nullptr);
|
||||
static const std::vector<sd::Tensor<float>> empty_ref_latents;
|
||||
static const std::vector<int> empty_skip_layers;
|
||||
return flux.compute(n_threads,
|
||||
*diffusion_params.x,
|
||||
*diffusion_params.timesteps,
|
||||
tensor_or_empty(diffusion_params.context),
|
||||
tensor_or_empty(diffusion_params.c_concat),
|
||||
tensor_or_empty(diffusion_params.y),
|
||||
tensor_or_empty(diffusion_params.guidance),
|
||||
diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents,
|
||||
diffusion_params.increase_ref_index,
|
||||
diffusion_params.skip_layers ? *diffusion_params.skip_layers : empty_skip_layers);
|
||||
}
|
||||
};
|
||||
|
||||
struct AnimaModel : public DiffusionModel {
|
||||
std::string prefix;
|
||||
Anima::AnimaRunner anima;
|
||||
|
||||
AnimaModel(ggml_backend_t backend,
|
||||
ggml_backend_t params_backend,
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string prefix = "model.diffusion_model")
|
||||
: prefix(prefix), anima(backend, params_backend, tensor_storage_map, prefix) {
|
||||
}
|
||||
|
||||
std::string get_desc() override {
|
||||
return anima.get_desc();
|
||||
}
|
||||
|
||||
void alloc_params_buffer() override {
|
||||
anima.alloc_params_buffer();
|
||||
}
|
||||
|
||||
void free_params_buffer() override {
|
||||
anima.free_params_buffer();
|
||||
}
|
||||
|
||||
void free_compute_buffer() override {
|
||||
anima.free_compute_buffer();
|
||||
}
|
||||
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
|
||||
anima.get_param_tensors(tensors, prefix);
|
||||
}
|
||||
|
||||
size_t get_params_buffer_size() override {
|
||||
return anima.get_params_buffer_size();
|
||||
}
|
||||
|
||||
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
|
||||
anima.set_weight_adapter(adapter);
|
||||
}
|
||||
|
||||
int64_t get_adm_in_channels() override {
|
||||
return 768;
|
||||
}
|
||||
|
||||
void set_flash_attention_enabled(bool enabled) {
|
||||
anima.set_flash_attention_enabled(enabled);
|
||||
}
|
||||
|
||||
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
|
||||
anima.set_max_graph_vram_bytes(max_vram_bytes);
|
||||
}
|
||||
|
||||
void set_circular_axes(bool circular_x, bool circular_y) override {
|
||||
anima.set_circular_axes(circular_x, circular_y);
|
||||
}
|
||||
|
||||
sd::Tensor<float> compute(int n_threads,
|
||||
const DiffusionParams& diffusion_params) override {
|
||||
GGML_ASSERT(diffusion_params.x != nullptr);
|
||||
GGML_ASSERT(diffusion_params.timesteps != nullptr);
|
||||
return anima.compute(n_threads,
|
||||
*diffusion_params.x,
|
||||
*diffusion_params.timesteps,
|
||||
tensor_or_empty(diffusion_params.context),
|
||||
tensor_or_empty(diffusion_params.t5_ids),
|
||||
tensor_or_empty(diffusion_params.t5_weights));
|
||||
}
|
||||
};
|
||||
|
||||
struct WanModel : public DiffusionModel {
|
||||
std::string prefix;
|
||||
WAN::WanRunner wan;
|
||||
|
||||
WanModel(ggml_backend_t backend,
|
||||
ggml_backend_t params_backend,
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string prefix = "model.diffusion_model",
|
||||
SDVersion version = VERSION_WAN2)
|
||||
: prefix(prefix), wan(backend, params_backend, tensor_storage_map, prefix, version) {
|
||||
}
|
||||
|
||||
std::string get_desc() override {
|
||||
return wan.get_desc();
|
||||
}
|
||||
|
||||
void alloc_params_buffer() override {
|
||||
wan.alloc_params_buffer();
|
||||
}
|
||||
|
||||
void free_params_buffer() override {
|
||||
wan.free_params_buffer();
|
||||
}
|
||||
|
||||
void free_compute_buffer() override {
|
||||
wan.free_compute_buffer();
|
||||
}
|
||||
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
|
||||
wan.get_param_tensors(tensors, prefix);
|
||||
}
|
||||
|
||||
size_t get_params_buffer_size() override {
|
||||
return wan.get_params_buffer_size();
|
||||
}
|
||||
|
||||
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
|
||||
wan.set_weight_adapter(adapter);
|
||||
}
|
||||
|
||||
int64_t get_adm_in_channels() override {
|
||||
return 768;
|
||||
}
|
||||
|
||||
void set_flash_attention_enabled(bool enabled) {
|
||||
wan.set_flash_attention_enabled(enabled);
|
||||
}
|
||||
|
||||
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
|
||||
wan.set_max_graph_vram_bytes(max_vram_bytes);
|
||||
}
|
||||
|
||||
void set_circular_axes(bool circular_x, bool circular_y) override {
|
||||
wan.set_circular_axes(circular_x, circular_y);
|
||||
}
|
||||
|
||||
sd::Tensor<float> compute(int n_threads,
|
||||
const DiffusionParams& diffusion_params) override {
|
||||
GGML_ASSERT(diffusion_params.x != nullptr);
|
||||
GGML_ASSERT(diffusion_params.timesteps != nullptr);
|
||||
return wan.compute(n_threads,
|
||||
*diffusion_params.x,
|
||||
*diffusion_params.timesteps,
|
||||
tensor_or_empty(diffusion_params.context),
|
||||
tensor_or_empty(diffusion_params.y),
|
||||
tensor_or_empty(diffusion_params.c_concat),
|
||||
sd::Tensor<float>(),
|
||||
tensor_or_empty(diffusion_params.vace_context),
|
||||
diffusion_params.vace_strength);
|
||||
}
|
||||
};
|
||||
|
||||
struct QwenImageModel : public DiffusionModel {
|
||||
std::string prefix;
|
||||
Qwen::QwenImageRunner qwen_image;
|
||||
|
||||
QwenImageModel(ggml_backend_t backend,
|
||||
ggml_backend_t params_backend,
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string prefix = "model.diffusion_model",
|
||||
SDVersion version = VERSION_QWEN_IMAGE,
|
||||
bool zero_cond_t = false)
|
||||
: prefix(prefix), qwen_image(backend, params_backend, tensor_storage_map, prefix, version, zero_cond_t) {
|
||||
}
|
||||
|
||||
std::string get_desc() override {
|
||||
return qwen_image.get_desc();
|
||||
}
|
||||
|
||||
void alloc_params_buffer() override {
|
||||
qwen_image.alloc_params_buffer();
|
||||
}
|
||||
|
||||
void free_params_buffer() override {
|
||||
qwen_image.free_params_buffer();
|
||||
}
|
||||
|
||||
void free_compute_buffer() override {
|
||||
qwen_image.free_compute_buffer();
|
||||
}
|
||||
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
|
||||
qwen_image.get_param_tensors(tensors, prefix);
|
||||
}
|
||||
|
||||
size_t get_params_buffer_size() override {
|
||||
return qwen_image.get_params_buffer_size();
|
||||
}
|
||||
|
||||
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
|
||||
qwen_image.set_weight_adapter(adapter);
|
||||
}
|
||||
|
||||
int64_t get_adm_in_channels() override {
|
||||
return 768;
|
||||
}
|
||||
|
||||
void set_flash_attention_enabled(bool enabled) {
|
||||
qwen_image.set_flash_attention_enabled(enabled);
|
||||
}
|
||||
|
||||
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
|
||||
qwen_image.set_max_graph_vram_bytes(max_vram_bytes);
|
||||
}
|
||||
|
||||
void set_circular_axes(bool circular_x, bool circular_y) override {
|
||||
qwen_image.set_circular_axes(circular_x, circular_y);
|
||||
}
|
||||
|
||||
sd::Tensor<float> compute(int n_threads,
|
||||
const DiffusionParams& diffusion_params) override {
|
||||
GGML_ASSERT(diffusion_params.x != nullptr);
|
||||
GGML_ASSERT(diffusion_params.timesteps != nullptr);
|
||||
static const std::vector<sd::Tensor<float>> empty_ref_latents;
|
||||
return qwen_image.compute(n_threads,
|
||||
*diffusion_params.x,
|
||||
*diffusion_params.timesteps,
|
||||
tensor_or_empty(diffusion_params.context),
|
||||
diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents,
|
||||
true);
|
||||
}
|
||||
};
|
||||
|
||||
struct HiDreamO1Model : public DiffusionModel {
|
||||
std::string prefix;
|
||||
HiDreamO1::HiDreamO1Runner hidream_o1;
|
||||
|
||||
HiDreamO1Model(ggml_backend_t backend,
|
||||
ggml_backend_t params_backend,
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string& prefix = "model")
|
||||
: prefix(prefix), hidream_o1(backend, params_backend, tensor_storage_map, prefix) {
|
||||
}
|
||||
|
||||
std::string get_desc() override {
|
||||
return hidream_o1.get_desc();
|
||||
}
|
||||
|
||||
void alloc_params_buffer() override {
|
||||
hidream_o1.alloc_params_buffer();
|
||||
}
|
||||
|
||||
void free_params_buffer() override {
|
||||
hidream_o1.free_params_buffer();
|
||||
}
|
||||
|
||||
void free_compute_buffer() override {
|
||||
hidream_o1.free_compute_buffer();
|
||||
}
|
||||
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
|
||||
hidream_o1.get_param_tensors(tensors, prefix);
|
||||
}
|
||||
|
||||
size_t get_params_buffer_size() override {
|
||||
return hidream_o1.get_params_buffer_size();
|
||||
}
|
||||
|
||||
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
|
||||
hidream_o1.set_weight_adapter(adapter);
|
||||
}
|
||||
|
||||
int64_t get_adm_in_channels() override {
|
||||
return 0;
|
||||
}
|
||||
|
||||
void set_flash_attention_enabled(bool enabled) {
|
||||
hidream_o1.set_flash_attention_enabled(enabled);
|
||||
}
|
||||
|
||||
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
|
||||
hidream_o1.set_max_graph_vram_bytes(max_vram_bytes);
|
||||
}
|
||||
|
||||
void set_circular_axes(bool circular_x, bool circular_y) override {
|
||||
hidream_o1.set_circular_axes(circular_x, circular_y);
|
||||
}
|
||||
|
||||
sd::Tensor<float> compute(int n_threads,
|
||||
const DiffusionParams& diffusion_params) override {
|
||||
GGML_ASSERT(diffusion_params.x != nullptr);
|
||||
GGML_ASSERT(diffusion_params.timesteps != nullptr);
|
||||
GGML_ASSERT(diffusion_params.input_ids != nullptr);
|
||||
GGML_ASSERT(diffusion_params.input_pos != nullptr);
|
||||
GGML_ASSERT(diffusion_params.token_types != nullptr);
|
||||
static const std::vector<sd::Tensor<float>> empty_images;
|
||||
static const std::vector<std::pair<int, sd::Tensor<float>>> empty_image_embeds;
|
||||
return hidream_o1.compute(n_threads,
|
||||
*diffusion_params.x,
|
||||
*diffusion_params.timesteps,
|
||||
*diffusion_params.input_ids,
|
||||
*diffusion_params.input_pos,
|
||||
*diffusion_params.token_types,
|
||||
tensor_or_empty(diffusion_params.vinput_mask),
|
||||
diffusion_params.image_embeds ? *diffusion_params.image_embeds : empty_image_embeds,
|
||||
diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_images);
|
||||
}
|
||||
};
|
||||
|
||||
struct ZImageModel : public DiffusionModel {
|
||||
std::string prefix;
|
||||
ZImage::ZImageRunner z_image;
|
||||
|
||||
ZImageModel(ggml_backend_t backend,
|
||||
ggml_backend_t params_backend,
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string prefix = "model.diffusion_model",
|
||||
SDVersion version = VERSION_Z_IMAGE)
|
||||
: prefix(prefix), z_image(backend, params_backend, tensor_storage_map, prefix, version) {
|
||||
}
|
||||
|
||||
std::string get_desc() override {
|
||||
return z_image.get_desc();
|
||||
}
|
||||
|
||||
void alloc_params_buffer() override {
|
||||
z_image.alloc_params_buffer();
|
||||
}
|
||||
|
||||
void free_params_buffer() override {
|
||||
z_image.free_params_buffer();
|
||||
}
|
||||
|
||||
void free_compute_buffer() override {
|
||||
z_image.free_compute_buffer();
|
||||
}
|
||||
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
|
||||
z_image.get_param_tensors(tensors, prefix);
|
||||
}
|
||||
|
||||
size_t get_params_buffer_size() override {
|
||||
return z_image.get_params_buffer_size();
|
||||
}
|
||||
|
||||
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
|
||||
z_image.set_weight_adapter(adapter);
|
||||
}
|
||||
|
||||
int64_t get_adm_in_channels() override {
|
||||
return 768;
|
||||
}
|
||||
|
||||
void set_flash_attention_enabled(bool enabled) {
|
||||
z_image.set_flash_attention_enabled(enabled);
|
||||
}
|
||||
|
||||
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
|
||||
z_image.set_max_graph_vram_bytes(max_vram_bytes);
|
||||
}
|
||||
|
||||
void set_circular_axes(bool circular_x, bool circular_y) override {
|
||||
z_image.set_circular_axes(circular_x, circular_y);
|
||||
}
|
||||
|
||||
sd::Tensor<float> compute(int n_threads,
|
||||
const DiffusionParams& diffusion_params) override {
|
||||
GGML_ASSERT(diffusion_params.x != nullptr);
|
||||
GGML_ASSERT(diffusion_params.timesteps != nullptr);
|
||||
static const std::vector<sd::Tensor<float>> empty_ref_latents;
|
||||
return z_image.compute(n_threads,
|
||||
*diffusion_params.x,
|
||||
*diffusion_params.timesteps,
|
||||
tensor_or_empty(diffusion_params.context),
|
||||
diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents,
|
||||
true);
|
||||
}
|
||||
};
|
||||
|
||||
struct ErnieImageModel : public DiffusionModel {
|
||||
std::string prefix;
|
||||
ErnieImage::ErnieImageRunner ernie_image;
|
||||
|
||||
ErnieImageModel(ggml_backend_t backend,
|
||||
ggml_backend_t params_backend,
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string prefix = "model.diffusion_model")
|
||||
: prefix(prefix), ernie_image(backend, params_backend, tensor_storage_map, prefix) {
|
||||
}
|
||||
|
||||
std::string get_desc() override {
|
||||
return ernie_image.get_desc();
|
||||
}
|
||||
|
||||
void alloc_params_buffer() override {
|
||||
ernie_image.alloc_params_buffer();
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) {
|
||||
get_param_tensors(tensors, prefix);
|
||||
}
|
||||
|
||||
void free_params_buffer() override {
|
||||
ernie_image.free_params_buffer();
|
||||
}
|
||||
|
||||
void free_compute_buffer() override {
|
||||
ernie_image.free_compute_buffer();
|
||||
}
|
||||
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
|
||||
ernie_image.get_param_tensors(tensors, prefix);
|
||||
}
|
||||
|
||||
size_t get_params_buffer_size() override {
|
||||
return ernie_image.get_params_buffer_size();
|
||||
}
|
||||
|
||||
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
|
||||
ernie_image.set_weight_adapter(adapter);
|
||||
}
|
||||
|
||||
int64_t get_adm_in_channels() override {
|
||||
return 768;
|
||||
}
|
||||
|
||||
void set_flash_attention_enabled(bool enabled) {
|
||||
ernie_image.set_flash_attention_enabled(enabled);
|
||||
}
|
||||
|
||||
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
|
||||
ernie_image.set_max_graph_vram_bytes(max_vram_bytes);
|
||||
}
|
||||
|
||||
void set_circular_axes(bool circular_x, bool circular_y) override {
|
||||
ernie_image.set_circular_axes(circular_x, circular_y);
|
||||
}
|
||||
|
||||
sd::Tensor<float> compute(int n_threads,
|
||||
const DiffusionParams& diffusion_params) override {
|
||||
GGML_ASSERT(diffusion_params.x != nullptr);
|
||||
GGML_ASSERT(diffusion_params.timesteps != nullptr);
|
||||
return ernie_image.compute(n_threads,
|
||||
*diffusion_params.x,
|
||||
*diffusion_params.timesteps,
|
||||
tensor_or_empty(diffusion_params.context));
|
||||
}
|
||||
};
|
||||
|
||||
struct LensModel : public DiffusionModel {
|
||||
std::string prefix;
|
||||
Lens::LensRunner lens;
|
||||
|
||||
LensModel(ggml_backend_t backend,
|
||||
ggml_backend_t params_backend,
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string prefix = "model.diffusion_model")
|
||||
: prefix(prefix), lens(backend, params_backend, tensor_storage_map, prefix) {
|
||||
}
|
||||
|
||||
std::string get_desc() override {
|
||||
return lens.get_desc();
|
||||
}
|
||||
|
||||
void alloc_params_buffer() override {
|
||||
lens.alloc_params_buffer();
|
||||
}
|
||||
|
||||
void free_params_buffer() override {
|
||||
lens.free_params_buffer();
|
||||
}
|
||||
|
||||
void free_compute_buffer() override {
|
||||
lens.free_compute_buffer();
|
||||
}
|
||||
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
|
||||
lens.get_param_tensors(tensors, prefix);
|
||||
}
|
||||
|
||||
size_t get_params_buffer_size() override {
|
||||
return lens.get_params_buffer_size();
|
||||
}
|
||||
|
||||
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
|
||||
lens.set_weight_adapter(adapter);
|
||||
}
|
||||
|
||||
int64_t get_adm_in_channels() override {
|
||||
return 768;
|
||||
}
|
||||
|
||||
void set_flash_attention_enabled(bool enabled) {
|
||||
lens.set_flash_attention_enabled(enabled);
|
||||
}
|
||||
|
||||
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
|
||||
lens.set_max_graph_vram_bytes(max_vram_bytes);
|
||||
}
|
||||
|
||||
void set_circular_axes(bool circular_x, bool circular_y) override {
|
||||
lens.set_circular_axes(circular_x, circular_y);
|
||||
}
|
||||
|
||||
sd::Tensor<float> compute(int n_threads,
|
||||
const DiffusionParams& diffusion_params) override {
|
||||
GGML_ASSERT(diffusion_params.x != nullptr);
|
||||
GGML_ASSERT(diffusion_params.timesteps != nullptr);
|
||||
return lens.compute(n_threads,
|
||||
*diffusion_params.x,
|
||||
*diffusion_params.timesteps,
|
||||
tensor_or_empty(diffusion_params.context));
|
||||
}
|
||||
};
|
||||
|
||||
struct LTXAVModel : public DiffusionModel {
|
||||
std::string prefix;
|
||||
LTXV::LTXAVRunner ltxav;
|
||||
|
||||
LTXAVModel(ggml_backend_t backend,
|
||||
ggml_backend_t params_backend,
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string prefix = "model.diffusion_model")
|
||||
: prefix(prefix), ltxav(backend, params_backend, tensor_storage_map, prefix) {
|
||||
}
|
||||
|
||||
std::string get_desc() override {
|
||||
return ltxav.get_desc();
|
||||
}
|
||||
|
||||
void alloc_params_buffer() override {
|
||||
ltxav.alloc_params_buffer();
|
||||
}
|
||||
|
||||
void free_params_buffer() override {
|
||||
ltxav.free_params_buffer();
|
||||
}
|
||||
|
||||
void free_compute_buffer() override {
|
||||
ltxav.free_compute_buffer();
|
||||
}
|
||||
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) override {
|
||||
ltxav.get_param_tensors(tensors, prefix);
|
||||
}
|
||||
|
||||
size_t get_params_buffer_size() override {
|
||||
return ltxav.get_params_buffer_size();
|
||||
}
|
||||
|
||||
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
|
||||
ltxav.set_weight_adapter(adapter);
|
||||
}
|
||||
|
||||
int64_t get_adm_in_channels() override {
|
||||
return 0;
|
||||
}
|
||||
|
||||
void set_flash_attention_enabled(bool enabled) override {
|
||||
ltxav.set_flash_attention_enabled(enabled);
|
||||
}
|
||||
|
||||
void set_max_graph_vram_bytes(size_t max_vram_bytes) override {
|
||||
ltxav.set_max_graph_vram_bytes(max_vram_bytes);
|
||||
}
|
||||
|
||||
void set_circular_axes(bool circular_x, bool circular_y) override {
|
||||
ltxav.set_circular_axes(circular_x, circular_y);
|
||||
}
|
||||
|
||||
sd::Tensor<float> compute(int n_threads,
|
||||
const DiffusionParams& diffusion_params) override {
|
||||
GGML_ASSERT(diffusion_params.x != nullptr);
|
||||
GGML_ASSERT(diffusion_params.timesteps != nullptr);
|
||||
return ltxav.compute(n_threads,
|
||||
*diffusion_params.x,
|
||||
*diffusion_params.timesteps,
|
||||
tensor_or_empty(diffusion_params.context),
|
||||
tensor_or_empty(diffusion_params.audio_x),
|
||||
tensor_or_empty(diffusion_params.audio_timesteps),
|
||||
diffusion_params.audio_length,
|
||||
diffusion_params.frame_rate,
|
||||
tensor_or_empty(diffusion_params.video_positions));
|
||||
}
|
||||
virtual void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors,
|
||||
const std::string& prefix) = 0;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
@ -5,6 +5,7 @@
|
||||
#include <vector>
|
||||
|
||||
#include "common_dit.hpp"
|
||||
#include "diffusion_model.hpp"
|
||||
#include "flux.hpp"
|
||||
#include "qwen_image.hpp"
|
||||
#include "rope.hpp"
|
||||
@ -325,7 +326,7 @@ namespace ErnieImage {
|
||||
}
|
||||
};
|
||||
|
||||
struct ErnieImageRunner : public GGMLRunner {
|
||||
struct ErnieImageRunner : public DiffusionModelRunner {
|
||||
ErnieImageParams ernie_params;
|
||||
ErnieImageModel ernie_image;
|
||||
std::vector<float> pe_vec;
|
||||
@ -334,7 +335,7 @@ namespace ErnieImage {
|
||||
ggml_backend_t params_backend,
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string prefix = "")
|
||||
: GGMLRunner(backend, params_backend) {
|
||||
: DiffusionModelRunner(backend, params_backend, prefix) {
|
||||
ernie_params.num_layers = 0;
|
||||
for (const auto& [name, tensor_storage] : tensor_storage_map) {
|
||||
if (!starts_with(name, prefix)) {
|
||||
@ -393,7 +394,7 @@ namespace ErnieImage {
|
||||
return "ernie_image";
|
||||
}
|
||||
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
|
||||
ernie_image.get_param_tensors(tensors, prefix);
|
||||
}
|
||||
|
||||
@ -435,6 +436,16 @@ namespace ErnieImage {
|
||||
};
|
||||
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
|
||||
}
|
||||
|
||||
sd::Tensor<float> compute(int n_threads,
|
||||
const DiffusionParams& diffusion_params) override {
|
||||
GGML_ASSERT(diffusion_params.x != nullptr);
|
||||
GGML_ASSERT(diffusion_params.timesteps != nullptr);
|
||||
return compute(n_threads,
|
||||
*diffusion_params.x,
|
||||
*diffusion_params.timesteps,
|
||||
tensor_or_empty(diffusion_params.context));
|
||||
}
|
||||
};
|
||||
} // namespace ErnieImage
|
||||
|
||||
|
||||
26
src/flux.hpp
26
src/flux.hpp
@ -5,6 +5,7 @@
|
||||
#include <vector>
|
||||
|
||||
#include "common_dit.hpp"
|
||||
#include "diffusion_model.hpp"
|
||||
#include "model.h"
|
||||
#include "rope.hpp"
|
||||
|
||||
@ -1176,7 +1177,7 @@ namespace Flux {
|
||||
}
|
||||
};
|
||||
|
||||
struct FluxRunner : public GGMLRunner {
|
||||
struct FluxRunner : public DiffusionModelRunner {
|
||||
public:
|
||||
FluxParams flux_params;
|
||||
Flux flux;
|
||||
@ -1193,7 +1194,7 @@ namespace Flux {
|
||||
const std::string prefix = "",
|
||||
SDVersion version = VERSION_FLUX,
|
||||
bool use_mask = false)
|
||||
: GGMLRunner(backend, params_backend), version(version), use_mask(use_mask) {
|
||||
: DiffusionModelRunner(backend, params_backend, prefix), version(version), use_mask(use_mask) {
|
||||
flux_params.version = version;
|
||||
flux_params.guidance_embed = false;
|
||||
flux_params.depth = 0;
|
||||
@ -1308,7 +1309,7 @@ namespace Flux {
|
||||
return "flux";
|
||||
}
|
||||
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
|
||||
flux.get_param_tensors(tensors, prefix);
|
||||
}
|
||||
|
||||
@ -1490,6 +1491,25 @@ namespace Flux {
|
||||
return result;
|
||||
}
|
||||
|
||||
sd::Tensor<float> compute(int n_threads,
|
||||
const DiffusionParams& diffusion_params) override {
|
||||
GGML_ASSERT(diffusion_params.x != nullptr);
|
||||
GGML_ASSERT(diffusion_params.timesteps != nullptr);
|
||||
const auto* extra = diffusion_extra_as<FluxDiffusionExtra>(diffusion_params);
|
||||
static const std::vector<sd::Tensor<float>> empty_ref_latents;
|
||||
static const std::vector<int> empty_skip_layers;
|
||||
return compute(n_threads,
|
||||
*diffusion_params.x,
|
||||
*diffusion_params.timesteps,
|
||||
tensor_or_empty(diffusion_params.context),
|
||||
tensor_or_empty(diffusion_params.c_concat),
|
||||
tensor_or_empty(diffusion_params.y),
|
||||
tensor_or_empty(extra->guidance),
|
||||
diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents,
|
||||
diffusion_params.increase_ref_index,
|
||||
extra->skip_layers ? *extra->skip_layers : empty_skip_layers);
|
||||
}
|
||||
|
||||
void test() {
|
||||
ggml_init_params params;
|
||||
params.mem_size = static_cast<size_t>(1024 * 1024) * 1024; // 1GB
|
||||
|
||||
@ -12,6 +12,7 @@
|
||||
|
||||
#include "common_dit.hpp"
|
||||
#include "conditioner.hpp"
|
||||
#include "diffusion_model.hpp"
|
||||
#include "llm.hpp"
|
||||
#include "util.h"
|
||||
|
||||
@ -329,7 +330,7 @@ namespace HiDreamO1 {
|
||||
}
|
||||
};
|
||||
|
||||
struct HiDreamO1Runner : public GGMLRunner {
|
||||
struct HiDreamO1Runner : public DiffusionModelRunner {
|
||||
HiDreamO1Params params;
|
||||
HiDreamO1Model model;
|
||||
|
||||
@ -339,7 +340,7 @@ namespace HiDreamO1 {
|
||||
ggml_backend_t params_backend,
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string& prefix = "model")
|
||||
: GGMLRunner(backend, params_backend),
|
||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
||||
params(make_hidream_o1_params()) {
|
||||
model = HiDreamO1Model(params);
|
||||
model.init(params_ctx, tensor_storage_map, prefix);
|
||||
@ -349,7 +350,7 @@ namespace HiDreamO1 {
|
||||
return "hidream_o1";
|
||||
}
|
||||
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) {
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
|
||||
model.get_param_tensors(tensors, prefix);
|
||||
}
|
||||
|
||||
@ -454,6 +455,28 @@ namespace HiDreamO1 {
|
||||
};
|
||||
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
|
||||
}
|
||||
|
||||
sd::Tensor<float> compute(int n_threads,
|
||||
const DiffusionParams& diffusion_params) override {
|
||||
GGML_ASSERT(diffusion_params.x != nullptr);
|
||||
GGML_ASSERT(diffusion_params.timesteps != nullptr);
|
||||
const auto* extra = diffusion_extra_as<HiDreamO1DiffusionExtra>(diffusion_params);
|
||||
GGML_ASSERT(extra != nullptr);
|
||||
GGML_ASSERT(extra->input_ids != nullptr);
|
||||
GGML_ASSERT(extra->input_pos != nullptr);
|
||||
GGML_ASSERT(extra->token_types != nullptr);
|
||||
static const std::vector<sd::Tensor<float>> empty_images;
|
||||
static const std::vector<std::pair<int, sd::Tensor<float>>> empty_image_embeds;
|
||||
return compute(n_threads,
|
||||
*diffusion_params.x,
|
||||
*diffusion_params.timesteps,
|
||||
*extra->input_ids,
|
||||
*extra->input_pos,
|
||||
*extra->token_types,
|
||||
tensor_or_empty(extra->vinput_mask),
|
||||
extra->image_embeds ? *extra->image_embeds : empty_image_embeds,
|
||||
diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_images);
|
||||
}
|
||||
};
|
||||
|
||||
struct HiDreamO1Conditioner : public Conditioner {
|
||||
|
||||
17
src/lens.hpp
17
src/lens.hpp
@ -5,6 +5,7 @@
|
||||
#include <vector>
|
||||
|
||||
#include "common_block.hpp"
|
||||
#include "diffusion_model.hpp"
|
||||
#include "flux.hpp"
|
||||
#include "qwen_image.hpp"
|
||||
#include "rope.hpp"
|
||||
@ -298,7 +299,7 @@ namespace Lens {
|
||||
}
|
||||
};
|
||||
|
||||
struct LensRunner : public GGMLRunner {
|
||||
struct LensRunner : public DiffusionModelRunner {
|
||||
LensParams lens_params;
|
||||
LensModel lens;
|
||||
std::vector<float> pe_vec;
|
||||
@ -307,7 +308,7 @@ namespace Lens {
|
||||
ggml_backend_t params_backend,
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string prefix = "")
|
||||
: GGMLRunner(backend, params_backend) {
|
||||
: DiffusionModelRunner(backend, params_backend, prefix) {
|
||||
lens_params.num_layers = 0;
|
||||
for (const auto& [name, tensor_storage] : tensor_storage_map) {
|
||||
if (!starts_with(name, prefix)) {
|
||||
@ -361,7 +362,7 @@ namespace Lens {
|
||||
return "lens";
|
||||
}
|
||||
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
|
||||
lens.get_param_tensors(tensors, prefix);
|
||||
}
|
||||
|
||||
@ -402,6 +403,16 @@ namespace Lens {
|
||||
};
|
||||
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
|
||||
}
|
||||
|
||||
sd::Tensor<float> compute(int n_threads,
|
||||
const DiffusionParams& diffusion_params) override {
|
||||
GGML_ASSERT(diffusion_params.x != nullptr);
|
||||
GGML_ASSERT(diffusion_params.timesteps != nullptr);
|
||||
return compute(n_threads,
|
||||
*diffusion_params.x,
|
||||
*diffusion_params.timesteps,
|
||||
tensor_or_empty(diffusion_params.context));
|
||||
}
|
||||
};
|
||||
} // namespace Lens
|
||||
|
||||
|
||||
25
src/ltxv.hpp
25
src/ltxv.hpp
@ -10,6 +10,7 @@
|
||||
#include <vector>
|
||||
|
||||
#include "common_block.hpp"
|
||||
#include "diffusion_model.hpp"
|
||||
#include "flux.hpp"
|
||||
#include "rope.hpp"
|
||||
|
||||
@ -1534,8 +1535,7 @@ namespace LTXV {
|
||||
}
|
||||
};
|
||||
|
||||
struct LTXAVRunner : public GGMLRunner {
|
||||
std::string prefix;
|
||||
struct LTXAVRunner : public DiffusionModelRunner {
|
||||
LTXAVParams params;
|
||||
LTXAVModelBlock model;
|
||||
std::vector<float> video_pe_vec;
|
||||
@ -1561,8 +1561,7 @@ namespace LTXV {
|
||||
ggml_backend_t params_backend,
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string& prefix = "model.diffusion_model")
|
||||
: GGMLRunner(backend, params_backend),
|
||||
prefix(prefix),
|
||||
: DiffusionModelRunner(backend, params_backend, prefix),
|
||||
params(),
|
||||
model(params) {
|
||||
auto patchify_proj_iter = tensor_storage_map.find(prefix + ".patchify_proj.weight");
|
||||
@ -1673,7 +1672,7 @@ namespace LTXV {
|
||||
return "ltxav";
|
||||
}
|
||||
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) {
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
|
||||
model.get_param_tensors(tensors, prefix);
|
||||
}
|
||||
|
||||
@ -1917,6 +1916,22 @@ namespace LTXV {
|
||||
return out;
|
||||
}
|
||||
|
||||
sd::Tensor<float> compute(int n_threads,
|
||||
const DiffusionParams& diffusion_params) override {
|
||||
GGML_ASSERT(diffusion_params.x != nullptr);
|
||||
GGML_ASSERT(diffusion_params.timesteps != nullptr);
|
||||
const auto* extra = diffusion_extra_as<LTXAVDiffusionExtra>(diffusion_params);
|
||||
return compute(n_threads,
|
||||
*diffusion_params.x,
|
||||
*diffusion_params.timesteps,
|
||||
tensor_or_empty(diffusion_params.context),
|
||||
tensor_or_empty(extra->audio_x),
|
||||
tensor_or_empty(extra->audio_timesteps),
|
||||
extra->audio_length,
|
||||
extra->frame_rate,
|
||||
tensor_or_empty(extra->video_positions));
|
||||
}
|
||||
|
||||
void test(const std::string& x_path,
|
||||
const std::string& timesteps_path = "",
|
||||
const std::string& context_path = "",
|
||||
|
||||
@ -3,6 +3,7 @@
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "diffusion_model.hpp"
|
||||
#include "ggml_extend.hpp"
|
||||
#include "model.h"
|
||||
|
||||
@ -824,14 +825,14 @@ public:
|
||||
return x;
|
||||
}
|
||||
};
|
||||
struct MMDiTRunner : public GGMLRunner {
|
||||
struct MMDiTRunner : public DiffusionModelRunner {
|
||||
MMDiT mmdit;
|
||||
|
||||
MMDiTRunner(ggml_backend_t backend,
|
||||
ggml_backend_t params_backend,
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string prefix = "")
|
||||
: GGMLRunner(backend, params_backend), mmdit(tensor_storage_map) {
|
||||
: DiffusionModelRunner(backend, params_backend, prefix), mmdit(tensor_storage_map) {
|
||||
mmdit.init(params_ctx, tensor_storage_map, prefix);
|
||||
}
|
||||
|
||||
@ -839,7 +840,7 @@ struct MMDiTRunner : public GGMLRunner {
|
||||
return "mmdit";
|
||||
}
|
||||
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
|
||||
mmdit.get_param_tensors(tensors, prefix);
|
||||
}
|
||||
|
||||
@ -885,6 +886,20 @@ struct MMDiTRunner : public GGMLRunner {
|
||||
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
|
||||
}
|
||||
|
||||
sd::Tensor<float> compute(int n_threads,
|
||||
const DiffusionParams& diffusion_params) override {
|
||||
GGML_ASSERT(diffusion_params.x != nullptr);
|
||||
GGML_ASSERT(diffusion_params.timesteps != nullptr);
|
||||
const auto* extra = diffusion_extra_as<SkipLayerDiffusionExtra>(diffusion_params);
|
||||
static const std::vector<int> empty_skip_layers;
|
||||
return compute(n_threads,
|
||||
*diffusion_params.x,
|
||||
*diffusion_params.timesteps,
|
||||
tensor_or_empty(diffusion_params.context),
|
||||
tensor_or_empty(diffusion_params.y),
|
||||
extra->skip_layers ? *extra->skip_layers : empty_skip_layers);
|
||||
}
|
||||
|
||||
void test() {
|
||||
ggml_init_params params;
|
||||
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB
|
||||
|
||||
@ -4,6 +4,7 @@
|
||||
#include <memory>
|
||||
|
||||
#include "common_block.hpp"
|
||||
#include "diffusion_model.hpp"
|
||||
#include "flux.hpp"
|
||||
|
||||
namespace Qwen {
|
||||
@ -479,7 +480,7 @@ namespace Qwen {
|
||||
}
|
||||
};
|
||||
|
||||
struct QwenImageRunner : public GGMLRunner {
|
||||
struct QwenImageRunner : public DiffusionModelRunner {
|
||||
public:
|
||||
QwenImageParams qwen_image_params;
|
||||
QwenImageModel qwen_image;
|
||||
@ -493,7 +494,7 @@ namespace Qwen {
|
||||
const std::string prefix = "",
|
||||
SDVersion version = VERSION_QWEN_IMAGE,
|
||||
bool zero_cond_t = false)
|
||||
: GGMLRunner(backend, params_backend) {
|
||||
: DiffusionModelRunner(backend, params_backend, prefix) {
|
||||
qwen_image_params.num_layers = 0;
|
||||
qwen_image_params.zero_cond_t = zero_cond_t;
|
||||
for (auto pair : tensor_storage_map) {
|
||||
@ -528,7 +529,7 @@ namespace Qwen {
|
||||
return "qwen_image";
|
||||
}
|
||||
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
|
||||
qwen_image.get_param_tensors(tensors, prefix);
|
||||
}
|
||||
|
||||
@ -624,6 +625,19 @@ namespace Qwen {
|
||||
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
|
||||
}
|
||||
|
||||
sd::Tensor<float> compute(int n_threads,
|
||||
const DiffusionParams& diffusion_params) override {
|
||||
GGML_ASSERT(diffusion_params.x != nullptr);
|
||||
GGML_ASSERT(diffusion_params.timesteps != nullptr);
|
||||
static const std::vector<sd::Tensor<float>> empty_ref_latents;
|
||||
return compute(n_threads,
|
||||
*diffusion_params.x,
|
||||
*diffusion_params.timesteps,
|
||||
tensor_or_empty(diffusion_params.context),
|
||||
diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents,
|
||||
diffusion_params.increase_ref_index);
|
||||
}
|
||||
|
||||
void test() {
|
||||
ggml_init_params params;
|
||||
params.mem_size = static_cast<size_t>(1024 * 1024) * 1024; // 1GB
|
||||
|
||||
@ -8,22 +8,33 @@
|
||||
#include "stable-diffusion.h"
|
||||
#include "util.h"
|
||||
|
||||
#include "anima.hpp"
|
||||
#include "auto_encoder_kl.hpp"
|
||||
#include "conditioner.hpp"
|
||||
#include "control.hpp"
|
||||
#include "denoiser.hpp"
|
||||
#include "diffusion_model.hpp"
|
||||
#include "ernie_image.hpp"
|
||||
#include "esrgan.hpp"
|
||||
#include "flux.hpp"
|
||||
#include "guidance.h"
|
||||
#include "hidream_o1.hpp"
|
||||
#include "lens.hpp"
|
||||
#include "lora.hpp"
|
||||
#include "ltx_audio_vae.h"
|
||||
#include "ltx_latent_upscaler.hpp"
|
||||
#include "ltx_vae.hpp"
|
||||
#include "ltxv.hpp"
|
||||
#include "mmdit.hpp"
|
||||
#include "pmid.hpp"
|
||||
#include "qwen_image.hpp"
|
||||
#include "sample-cache.h"
|
||||
#include "tae.hpp"
|
||||
#include "unet.hpp"
|
||||
#include "upscaler.h"
|
||||
#include "vae.hpp"
|
||||
#include "wan.hpp"
|
||||
#include "z_image.hpp"
|
||||
|
||||
#include "latent-preview.h"
|
||||
#include "name_conversion.h"
|
||||
@ -138,8 +149,8 @@ public:
|
||||
|
||||
std::shared_ptr<Conditioner> cond_stage_model;
|
||||
std::shared_ptr<FrozenCLIPVisionEmbedder> clip_vision; // for svd or wan2.1 i2v
|
||||
std::shared_ptr<DiffusionModel> diffusion_model;
|
||||
std::shared_ptr<DiffusionModel> high_noise_diffusion_model;
|
||||
std::shared_ptr<DiffusionModelRunner> diffusion_model;
|
||||
std::shared_ptr<DiffusionModelRunner> high_noise_diffusion_model;
|
||||
std::shared_ptr<VAE> first_stage_model;
|
||||
std::shared_ptr<VAE> preview_vae;
|
||||
std::shared_ptr<LTXV::LTXAudioVAERunner> audio_vae_model;
|
||||
@ -486,9 +497,10 @@ public:
|
||||
cond_stage_model = std::make_shared<SD3CLIPEmbedder>(backend_for(SDBackendModule::TE),
|
||||
params_backend_for(SDBackendModule::TE),
|
||||
tensor_storage_map);
|
||||
diffusion_model = std::make_shared<MMDiTModel>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map);
|
||||
diffusion_model = std::make_shared<MMDiTRunner>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
"model.diffusion_model");
|
||||
} else if (sd_version_is_flux(version)) {
|
||||
bool is_chroma = false;
|
||||
for (auto pair : tensor_storage_map) {
|
||||
@ -524,30 +536,32 @@ public:
|
||||
params_backend_for(SDBackendModule::TE),
|
||||
tensor_storage_map);
|
||||
}
|
||||
diffusion_model = std::make_shared<FluxModel>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
version,
|
||||
sd_ctx_params->chroma_use_dit_mask);
|
||||
diffusion_model = std::make_shared<Flux::FluxRunner>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
"model.diffusion_model",
|
||||
version,
|
||||
sd_ctx_params->chroma_use_dit_mask);
|
||||
} else if (sd_version_is_flux2(version)) {
|
||||
bool is_chroma = false;
|
||||
cond_stage_model = std::make_shared<LLMEmbedder>(backend_for(SDBackendModule::TE),
|
||||
params_backend_for(SDBackendModule::TE),
|
||||
tensor_storage_map,
|
||||
version);
|
||||
diffusion_model = std::make_shared<FluxModel>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
version,
|
||||
sd_ctx_params->chroma_use_dit_mask);
|
||||
diffusion_model = std::make_shared<Flux::FluxRunner>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
"model.diffusion_model",
|
||||
version,
|
||||
sd_ctx_params->chroma_use_dit_mask);
|
||||
} else if (sd_version_is_ltxav(version)) {
|
||||
cond_stage_model = std::make_shared<LTXAVEmbedder>(backend_for(SDBackendModule::TE),
|
||||
params_backend_for(SDBackendModule::TE),
|
||||
tensor_storage_map);
|
||||
diffusion_model = std::make_shared<LTXAVModel>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
"model.diffusion_model");
|
||||
diffusion_model = std::make_shared<LTXV::LTXAVRunner>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
"model.diffusion_model");
|
||||
} else if (sd_version_is_wan(version)) {
|
||||
cond_stage_model = std::make_shared<T5CLIPEmbedder>(backend_for(SDBackendModule::TE),
|
||||
params_backend_for(SDBackendModule::TE),
|
||||
@ -555,17 +569,17 @@ public:
|
||||
true,
|
||||
0,
|
||||
true);
|
||||
diffusion_model = std::make_shared<WanModel>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
"model.diffusion_model",
|
||||
version);
|
||||
diffusion_model = std::make_shared<WAN::WanRunner>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
"model.diffusion_model",
|
||||
version);
|
||||
if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) {
|
||||
high_noise_diffusion_model = std::make_shared<WanModel>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
"model.high_noise_diffusion_model",
|
||||
version);
|
||||
high_noise_diffusion_model = std::make_shared<WAN::WanRunner>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
"model.high_noise_diffusion_model",
|
||||
version);
|
||||
}
|
||||
if (diffusion_model->get_desc() == "Wan2.1-I2V-14B" ||
|
||||
diffusion_model->get_desc() == "Wan2.1-FLF2V-14B" ||
|
||||
@ -590,12 +604,12 @@ public:
|
||||
version,
|
||||
"",
|
||||
enable_vision);
|
||||
diffusion_model = std::make_shared<QwenImageModel>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
"model.diffusion_model",
|
||||
version,
|
||||
sd_ctx_params->qwen_image_zero_cond_t);
|
||||
diffusion_model = std::make_shared<Qwen::QwenImageRunner>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
"model.diffusion_model",
|
||||
version,
|
||||
sd_ctx_params->qwen_image_zero_cond_t);
|
||||
} else if (sd_version_is_longcat(version)) {
|
||||
bool enable_vision = false;
|
||||
if (!vae_decode_only) {
|
||||
@ -607,55 +621,56 @@ public:
|
||||
version,
|
||||
"",
|
||||
enable_vision);
|
||||
diffusion_model = std::make_shared<FluxModel>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
version,
|
||||
sd_ctx_params->chroma_use_dit_mask);
|
||||
diffusion_model = std::make_shared<Flux::FluxRunner>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
"model.diffusion_model",
|
||||
version,
|
||||
sd_ctx_params->chroma_use_dit_mask);
|
||||
} else if (version == VERSION_HIDREAM_O1) {
|
||||
cond_stage_model = std::make_shared<HiDreamO1::HiDreamO1Conditioner>(backend_for(SDBackendModule::TE),
|
||||
params_backend_for(SDBackendModule::TE),
|
||||
tensor_storage_map);
|
||||
diffusion_model = std::make_shared<HiDreamO1Model>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
"model");
|
||||
diffusion_model = std::make_shared<HiDreamO1::HiDreamO1Runner>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
"model");
|
||||
} else if (sd_version_is_anima(version)) {
|
||||
cond_stage_model = std::make_shared<AnimaConditioner>(backend_for(SDBackendModule::TE),
|
||||
params_backend_for(SDBackendModule::TE),
|
||||
tensor_storage_map);
|
||||
diffusion_model = std::make_shared<AnimaModel>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
"model.diffusion_model");
|
||||
diffusion_model = std::make_shared<Anima::AnimaRunner>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
"model.diffusion_model");
|
||||
} else if (sd_version_is_z_image(version)) {
|
||||
cond_stage_model = std::make_shared<LLMEmbedder>(backend_for(SDBackendModule::TE),
|
||||
params_backend_for(SDBackendModule::TE),
|
||||
tensor_storage_map,
|
||||
version);
|
||||
diffusion_model = std::make_shared<ZImageModel>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
"model.diffusion_model",
|
||||
version);
|
||||
diffusion_model = std::make_shared<ZImage::ZImageRunner>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
"model.diffusion_model",
|
||||
version);
|
||||
} else if (sd_version_is_ernie_image(version)) {
|
||||
cond_stage_model = std::make_shared<LLMEmbedder>(backend_for(SDBackendModule::TE),
|
||||
params_backend_for(SDBackendModule::TE),
|
||||
tensor_storage_map,
|
||||
version);
|
||||
diffusion_model = std::make_shared<ErnieImageModel>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
"model.diffusion_model");
|
||||
diffusion_model = std::make_shared<ErnieImage::ErnieImageRunner>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
"model.diffusion_model");
|
||||
} else if (sd_version_is_lens(version)) {
|
||||
cond_stage_model = std::make_shared<LLMEmbedder>(backend_for(SDBackendModule::TE),
|
||||
params_backend_for(SDBackendModule::TE),
|
||||
tensor_storage_map,
|
||||
version);
|
||||
diffusion_model = std::make_shared<LensModel>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
"model.diffusion_model");
|
||||
diffusion_model = std::make_shared<Lens::LensRunner>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
"model.diffusion_model");
|
||||
} else { // SD1.x SD2.x SDXL
|
||||
std::map<std::string, std::string> embbeding_map;
|
||||
for (uint32_t i = 0; i < sd_ctx_params->embedding_count; i++) {
|
||||
@ -675,13 +690,14 @@ public:
|
||||
embbeding_map,
|
||||
version);
|
||||
}
|
||||
diffusion_model = std::make_shared<UNetModel>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
version);
|
||||
diffusion_model = std::make_shared<UNetModelRunner>(backend_for(SDBackendModule::DIFFUSION),
|
||||
params_backend_for(SDBackendModule::DIFFUSION),
|
||||
tensor_storage_map,
|
||||
"model.diffusion_model",
|
||||
version);
|
||||
if (sd_ctx_params->diffusion_conv_direct) {
|
||||
LOG_INFO("Using Conv2d direct in the diffusion model");
|
||||
std::dynamic_pointer_cast<UNetModel>(diffusion_model)->unet.set_conv2d_direct_enabled(true);
|
||||
diffusion_model->set_conv2d_direct_enabled(true);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1221,6 +1237,7 @@ public:
|
||||
diffusion_params.x = &x_t;
|
||||
diffusion_params.timesteps = &steps;
|
||||
diffusion_params.context = &c;
|
||||
diffusion_params.extra = UNetDiffusionExtra{};
|
||||
if (!concat.empty()) {
|
||||
diffusion_params.c_concat = &concat;
|
||||
}
|
||||
@ -1855,7 +1872,7 @@ public:
|
||||
*controls = std::move(*control_result);
|
||||
}
|
||||
|
||||
sd::Tensor<float> sample(const std::shared_ptr<DiffusionModel>& work_diffusion_model,
|
||||
sd::Tensor<float> sample(const std::shared_ptr<DiffusionModelRunner>& work_diffusion_model,
|
||||
bool inverse_noise_scaling,
|
||||
const sd::Tensor<float>& init_latent,
|
||||
sd::Tensor<float> noise,
|
||||
@ -1982,18 +1999,7 @@ public:
|
||||
DiffusionParams diffusion_params;
|
||||
diffusion_params.x = &noised_input;
|
||||
diffusion_params.timesteps = &timesteps_tensor;
|
||||
diffusion_params.audio_timesteps = audio_timesteps_tensor.empty() ? nullptr : &audio_timesteps_tensor;
|
||||
diffusion_params.guidance = &guidance_tensor;
|
||||
diffusion_params.ref_latents = &ref_latents;
|
||||
diffusion_params.increase_ref_index = increase_ref_index;
|
||||
diffusion_params.controls = &controls;
|
||||
diffusion_params.control_strength = control_strength;
|
||||
diffusion_params.vace_context = vace_context.empty() ? nullptr : &vace_context;
|
||||
diffusion_params.vace_strength = vace_strength;
|
||||
diffusion_params.audio_length = audio_length;
|
||||
diffusion_params.frame_rate = frame_rate;
|
||||
diffusion_params.video_positions = video_positions.empty() ? nullptr : &video_positions;
|
||||
diffusion_params.skip_layers = nullptr;
|
||||
|
||||
compute_sample_controls(control_image,
|
||||
noised_input,
|
||||
@ -2004,18 +2010,41 @@ public:
|
||||
auto run_condition = [&](const SDCondition& condition,
|
||||
const sd::Tensor<float>* c_concat_override = nullptr,
|
||||
const std::vector<int>* local_skip_layers = nullptr) -> sd::Tensor<float> {
|
||||
diffusion_params.context = condition.c_crossattn.empty() ? nullptr : &condition.c_crossattn;
|
||||
diffusion_params.c_concat = c_concat_override != nullptr ? c_concat_override : (condition.c_concat.empty() ? nullptr : &condition.c_concat);
|
||||
diffusion_params.y = condition.c_vector.empty() ? nullptr : &condition.c_vector;
|
||||
diffusion_params.t5_ids = condition.c_t5_ids.empty() ? nullptr : &condition.c_t5_ids;
|
||||
diffusion_params.t5_weights = condition.c_t5_weights.empty() ? nullptr : &condition.c_t5_weights;
|
||||
diffusion_params.input_ids = condition.c_input_ids.empty() ? nullptr : &condition.c_input_ids;
|
||||
diffusion_params.input_pos = condition.c_position_ids.empty() ? nullptr : &condition.c_position_ids;
|
||||
diffusion_params.token_types = condition.c_token_types.empty() ? nullptr : &condition.c_token_types;
|
||||
diffusion_params.vinput_mask = condition.c_vinput_mask.empty() ? nullptr : &condition.c_vinput_mask;
|
||||
diffusion_params.image_embeds = condition.c_image_embeds.empty() ? nullptr : &condition.c_image_embeds;
|
||||
diffusion_params.ref_latents = condition.c_ref_images.empty() ? &ref_latents : &condition.c_ref_images;
|
||||
diffusion_params.skip_layers = local_skip_layers;
|
||||
diffusion_params.context = condition.c_crossattn.empty() ? nullptr : &condition.c_crossattn;
|
||||
diffusion_params.c_concat = c_concat_override != nullptr ? c_concat_override : (condition.c_concat.empty() ? nullptr : &condition.c_concat);
|
||||
diffusion_params.y = condition.c_vector.empty() ? nullptr : &condition.c_vector;
|
||||
diffusion_params.ref_latents = condition.c_ref_images.empty() ? &ref_latents : &condition.c_ref_images;
|
||||
|
||||
if (sd_version_is_unet(version)) {
|
||||
diffusion_params.extra = UNetDiffusionExtra{-1, &controls, control_strength};
|
||||
} else if (sd_version_is_sd3(version)) {
|
||||
diffusion_params.extra = SkipLayerDiffusionExtra{local_skip_layers};
|
||||
} else if (sd_version_is_flux(version) || sd_version_is_flux2(version) || sd_version_is_longcat(version)) {
|
||||
diffusion_params.extra = FluxDiffusionExtra{&guidance_tensor,
|
||||
local_skip_layers};
|
||||
} else if (sd_version_is_anima(version)) {
|
||||
diffusion_params.extra = AnimaDiffusionExtra{condition.c_t5_ids.empty() ? nullptr : &condition.c_t5_ids,
|
||||
condition.c_t5_weights.empty() ? nullptr : &condition.c_t5_weights};
|
||||
} else if (sd_version_is_wan(version)) {
|
||||
diffusion_params.extra = WanDiffusionExtra{vace_context.empty() ? nullptr : &vace_context,
|
||||
vace_strength};
|
||||
} else if (version == VERSION_HIDREAM_O1) {
|
||||
diffusion_params.extra = HiDreamO1DiffusionExtra{
|
||||
condition.c_input_ids.empty() ? nullptr : &condition.c_input_ids,
|
||||
condition.c_position_ids.empty() ? nullptr : &condition.c_position_ids,
|
||||
condition.c_token_types.empty() ? nullptr : &condition.c_token_types,
|
||||
condition.c_vinput_mask.empty() ? nullptr : &condition.c_vinput_mask,
|
||||
condition.c_image_embeds.empty() ? nullptr : &condition.c_image_embeds};
|
||||
} else if (sd_version_is_ltxav(version)) {
|
||||
diffusion_params.extra = LTXAVDiffusionExtra{
|
||||
nullptr,
|
||||
audio_timesteps_tensor.empty() ? nullptr : &audio_timesteps_tensor,
|
||||
audio_length,
|
||||
frame_rate,
|
||||
video_positions.empty() ? nullptr : &video_positions};
|
||||
} else {
|
||||
diffusion_params.extra = std::monostate{};
|
||||
}
|
||||
|
||||
sd::Tensor<float> cached_output;
|
||||
if (step_cache.before_condition(&condition, noised_input, &cached_output)) {
|
||||
@ -3914,7 +3943,7 @@ static std::optional<ImageGenerationLatents> prepare_image_generation_latents(sd
|
||||
concat_latent = sd::ops::interpolate<float>(ref_latents[0], init_latent.shape());
|
||||
uncond_concat_latent = sd::Tensor<float>::zeros_like(concat_latent);
|
||||
}
|
||||
if (sd_version_is_control(sd_ctx->sd->version)) {
|
||||
if (sd_ctx->sd->version == VERSION_FLUX_CONTROLS) {
|
||||
if (!control_latent.empty()) {
|
||||
concat_latent = control_latent;
|
||||
} else {
|
||||
@ -3953,12 +3982,11 @@ static std::optional<ImageGenerationEmbeds> prepare_image_generation_embeds(sd_c
|
||||
SamplePlan* plan,
|
||||
ImageGenerationLatents* latents) {
|
||||
ConditionerParams condition_params;
|
||||
condition_params.text = request->prompt;
|
||||
condition_params.clip_skip = request->clip_skip;
|
||||
condition_params.width = request->width;
|
||||
condition_params.height = request->height;
|
||||
condition_params.ref_images = &latents->ref_images;
|
||||
condition_params.adm_in_channels = static_cast<int>(sd_ctx->sd->diffusion_model->get_adm_in_channels());
|
||||
condition_params.text = request->prompt;
|
||||
condition_params.clip_skip = request->clip_skip;
|
||||
condition_params.width = request->width;
|
||||
condition_params.height = request->height;
|
||||
condition_params.ref_images = &latents->ref_images;
|
||||
|
||||
auto id_cond = sd_ctx->sd->get_pmid_conditon(request->pm_params, condition_params);
|
||||
int64_t prepare_start_ms = ggml_time_ms();
|
||||
|
||||
24
src/unet.hpp
24
src/unet.hpp
@ -2,6 +2,7 @@
|
||||
#define __UNET_HPP__
|
||||
|
||||
#include "common_block.hpp"
|
||||
#include "diffusion_model.hpp"
|
||||
#include "model.h"
|
||||
|
||||
/*==================================================== UnetModel =====================================================*/
|
||||
@ -599,7 +600,7 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
struct UNetModelRunner : public GGMLRunner {
|
||||
struct UNetModelRunner : public DiffusionModelRunner {
|
||||
UnetModelBlock unet;
|
||||
|
||||
UNetModelRunner(ggml_backend_t backend,
|
||||
@ -607,7 +608,7 @@ struct UNetModelRunner : public GGMLRunner {
|
||||
const String2TensorStorage& tensor_storage_map,
|
||||
const std::string prefix,
|
||||
SDVersion version = VERSION_SD1)
|
||||
: GGMLRunner(backend, params_backend), unet(version, tensor_storage_map) {
|
||||
: DiffusionModelRunner(backend, params_backend, prefix), unet(version, tensor_storage_map) {
|
||||
unet.init(params_ctx, tensor_storage_map, prefix);
|
||||
}
|
||||
|
||||
@ -615,7 +616,7 @@ struct UNetModelRunner : public GGMLRunner {
|
||||
return "unet";
|
||||
}
|
||||
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
|
||||
unet.get_param_tensors(tensors, prefix);
|
||||
}
|
||||
|
||||
@ -682,6 +683,23 @@ struct UNetModelRunner : public GGMLRunner {
|
||||
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
|
||||
}
|
||||
|
||||
// Adapter overload: unpacks a DiffusionParams bundle and forwards its fields
// to the explicit-argument compute() overload of this runner.
//
// n_threads         forwarded unchanged to the explicit-argument overload.
// diffusion_params  x and timesteps must be non-null (asserted below);
//                   context, c_concat and y are routed through tensor_or_empty().
// Returns whatever the explicit-argument compute() overload returns.
sd::Tensor<float> compute(int n_threads,
|
||||
const DiffusionParams& diffusion_params) override {
|
||||
GGML_ASSERT(diffusion_params.x != nullptr);
|
||||
GGML_ASSERT(diffusion_params.timesteps != nullptr);
|
||||
// NOTE(review): `extra` is dereferenced below without a null check; this
// assumes diffusion_extra_as<> never yields nullptr here — confirm against
// its definition (not visible in this chunk).
const auto* extra = diffusion_extra_as<UNetDiffusionExtra>(diffusion_params);
|
||||
// Fallback vector used when extra->controls is null, so the callee always
// receives a valid object.
static const std::vector<sd::Tensor<float>> empty_controls;
|
||||
return compute(n_threads,
|
||||
*diffusion_params.x,
|
||||
*diffusion_params.timesteps,
|
||||
// presumably tensor_or_empty() maps a null pointer to an empty tensor — TODO confirm
tensor_or_empty(diffusion_params.context),
|
||||
tensor_or_empty(diffusion_params.c_concat),
|
||||
tensor_or_empty(diffusion_params.y),
|
||||
extra->num_video_frames,
|
||||
extra->controls ? *extra->controls : empty_controls,
|
||||
extra->control_strength);
|
||||
}
|
||||
|
||||
void test() {
|
||||
ggml_init_params params;
|
||||
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB
|
||||
|
||||
23
src/wan.hpp
23
src/wan.hpp
@ -6,6 +6,7 @@
|
||||
#include <utility>
|
||||
|
||||
#include "common_block.hpp"
|
||||
#include "diffusion_model.hpp"
|
||||
#include "flux.hpp"
|
||||
#include "rope.hpp"
|
||||
#include "vae.hpp"
|
||||
@ -2085,7 +2086,7 @@ namespace WAN {
|
||||
}
|
||||
};
|
||||
|
||||
struct WanRunner : public GGMLRunner {
|
||||
struct WanRunner : public DiffusionModelRunner {
|
||||
public:
|
||||
std::string desc = "wan";
|
||||
WanParams wan_params;
|
||||
@ -2098,7 +2099,7 @@ namespace WAN {
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string prefix = "",
|
||||
SDVersion version = VERSION_WAN2)
|
||||
: GGMLRunner(backend, params_backend) {
|
||||
: DiffusionModelRunner(backend, params_backend, prefix) {
|
||||
wan_params.num_layers = 0;
|
||||
for (auto pair : tensor_storage_map) {
|
||||
std::string tensor_name = pair.first;
|
||||
@ -2208,7 +2209,7 @@ namespace WAN {
|
||||
return desc;
|
||||
}
|
||||
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
|
||||
wan.get_param_tensors(tensors, prefix);
|
||||
}
|
||||
|
||||
@ -2284,6 +2285,22 @@ namespace WAN {
|
||||
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
|
||||
}
|
||||
|
||||
// Adapter overload: unpacks a DiffusionParams bundle and forwards its fields
// to the explicit-argument compute() overload of this runner.
//
// n_threads         forwarded unchanged to the explicit-argument overload.
// diffusion_params  x and timesteps must be non-null (asserted below);
//                   context, y and c_concat are routed through tensor_or_empty().
// Returns whatever the explicit-argument compute() overload returns.
sd::Tensor<float> compute(int n_threads,
|
||||
const DiffusionParams& diffusion_params) override {
|
||||
GGML_ASSERT(diffusion_params.x != nullptr);
|
||||
GGML_ASSERT(diffusion_params.timesteps != nullptr);
|
||||
// NOTE(review): `extra` is dereferenced below without a null check; this
// assumes diffusion_extra_as<> never yields nullptr here — confirm against
// its definition (not visible in this chunk).
const auto* extra = diffusion_extra_as<WanDiffusionExtra>(diffusion_params);
|
||||
return compute(n_threads,
|
||||
*diffusion_params.x,
|
||||
*diffusion_params.timesteps,
|
||||
tensor_or_empty(diffusion_params.context),
|
||||
// NOTE(review): y is passed before c_concat here — verify this matches the
// parameter order of the explicit-argument overload (not visible in this chunk).
tensor_or_empty(diffusion_params.y),
|
||||
tensor_or_empty(diffusion_params.c_concat),
|
||||
// A default-constructed tensor is always passed for this argument; its
// meaning is defined by the explicit-argument overload.
sd::Tensor<float>(),
|
||||
tensor_or_empty(extra->vace_context),
|
||||
extra->vace_strength);
|
||||
}
|
||||
|
||||
void test() {
|
||||
ggml_init_params params;
|
||||
params.mem_size = static_cast<size_t>(200 * 1024 * 1024); // 200 MB
|
||||
|
||||
@ -3,6 +3,7 @@
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include "diffusion_model.hpp"
|
||||
#include "flux.hpp"
|
||||
#include "ggml_extend.hpp"
|
||||
#include "mmdit.hpp"
|
||||
@ -464,7 +465,7 @@ namespace ZImage {
|
||||
}
|
||||
};
|
||||
|
||||
struct ZImageRunner : public GGMLRunner {
|
||||
struct ZImageRunner : public DiffusionModelRunner {
|
||||
public:
|
||||
ZImageParams z_image_params;
|
||||
ZImageModel z_image;
|
||||
@ -477,7 +478,7 @@ namespace ZImage {
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string prefix = "",
|
||||
SDVersion version = VERSION_Z_IMAGE)
|
||||
: GGMLRunner(backend, params_backend) {
|
||||
: DiffusionModelRunner(backend, params_backend, prefix) {
|
||||
z_image = ZImageModel(z_image_params);
|
||||
z_image.init(params_ctx, tensor_storage_map, prefix);
|
||||
}
|
||||
@ -486,7 +487,7 @@ namespace ZImage {
|
||||
return "z_image";
|
||||
}
|
||||
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string prefix) {
|
||||
void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors, const std::string& prefix) override {
|
||||
z_image.get_param_tensors(tensors, prefix);
|
||||
}
|
||||
|
||||
@ -556,6 +557,19 @@ namespace ZImage {
|
||||
return restore_trailing_singleton_dims(GGMLRunner::compute<float>(get_graph, n_threads, false), x.dim());
|
||||
}
|
||||
|
||||
// Adapter overload: unpacks a DiffusionParams bundle and forwards its fields
// to the explicit-argument compute() overload of this runner. This overload
// reads only the common DiffusionParams fields; no model-specific `extra`
// payload is consulted.
//
// n_threads         forwarded unchanged to the explicit-argument overload.
// diffusion_params  x and timesteps must be non-null (asserted below).
// Returns whatever the explicit-argument compute() overload returns.
sd::Tensor<float> compute(int n_threads,
|
||||
const DiffusionParams& diffusion_params) override {
|
||||
GGML_ASSERT(diffusion_params.x != nullptr);
|
||||
GGML_ASSERT(diffusion_params.timesteps != nullptr);
|
||||
// Fallback vector used when diffusion_params.ref_latents is null, so the
// callee always receives a valid object.
static const std::vector<sd::Tensor<float>> empty_ref_latents;
|
||||
return compute(n_threads,
|
||||
*diffusion_params.x,
|
||||
*diffusion_params.timesteps,
|
||||
// presumably tensor_or_empty() maps a null pointer to an empty tensor — TODO confirm
tensor_or_empty(diffusion_params.context),
|
||||
diffusion_params.ref_latents ? *diffusion_params.ref_latents : empty_ref_latents,
|
||||
diffusion_params.increase_ref_index);
|
||||
}
|
||||
|
||||
void test() {
|
||||
ggml_init_params params;
|
||||
params.mem_size = static_cast<size_t>(1024 * 1024) * 1024; // 1GB
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user